Add atomic data initialization lock to prevent race conditions

This commit is contained in:
yangdx
2025-03-09 17:33:15 +08:00
parent 020a6b5ae0
commit e47883d872
3 changed files with 49 additions and 29 deletions

View File

@@ -15,6 +15,7 @@ from lightrag.utils import (
from .shared_storage import ( from .shared_storage import (
get_namespace_data, get_namespace_data,
get_storage_lock, get_storage_lock,
get_data_init_lock,
try_initialize_namespace, try_initialize_namespace,
) )
@@ -27,21 +28,22 @@ class JsonDocStatusStorage(DocStatusStorage):
def __post_init__(self): def __post_init__(self):
working_dir = self.global_config["working_dir"] working_dir = self.global_config["working_dir"]
self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json") self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
self._storage_lock = get_storage_lock()
self._data = None self._data = None
async def initialize(self): async def initialize(self):
"""Initialize storage data""" """Initialize storage data"""
# check need_init must before get_namespace_data self._storage_lock = get_storage_lock()
need_init = await try_initialize_namespace(self.namespace)
self._data = await get_namespace_data(self.namespace) self._data = await get_namespace_data(self.namespace)
if need_init: async with get_data_init_lock():
loaded_data = load_json(self._file_name) or {} # check need_init must before get_namespace_data
async with self._storage_lock: need_init = await try_initialize_namespace(self.namespace)
self._data.update(loaded_data) if need_init:
logger.info( loaded_data = load_json(self._file_name) or {}
f"Process {os.getpid()} doc status load {self.namespace} with {len(loaded_data)} records" async with self._storage_lock:
) self._data.update(loaded_data)
logger.info(
f"Process {os.getpid()} doc status load {self.namespace} with {len(loaded_data)} records"
)
async def filter_keys(self, keys: set[str]) -> set[str]: async def filter_keys(self, keys: set[str]) -> set[str]:
"""Return keys that should be processed (not in storage or not successfully processed)""" """Return keys that should be processed (not in storage or not successfully processed)"""

View File

@@ -13,6 +13,7 @@ from lightrag.utils import (
from .shared_storage import ( from .shared_storage import (
get_namespace_data, get_namespace_data,
get_storage_lock, get_storage_lock,
get_data_init_lock,
try_initialize_namespace, try_initialize_namespace,
) )
@@ -23,29 +24,30 @@ class JsonKVStorage(BaseKVStorage):
def __post_init__(self): def __post_init__(self):
working_dir = self.global_config["working_dir"] working_dir = self.global_config["working_dir"]
self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json") self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
self._storage_lock = get_storage_lock()
self._data = None self._data = None
async def initialize(self): async def initialize(self):
"""Initialize storage data""" """Initialize storage data"""
# check need_init must before get_namespace_data self._storage_lock = get_storage_lock()
need_init = await try_initialize_namespace(self.namespace)
self._data = await get_namespace_data(self.namespace) self._data = await get_namespace_data(self.namespace)
if need_init: async with get_data_init_lock():
loaded_data = load_json(self._file_name) or {} # check need_init must before get_namespace_data
async with self._storage_lock: need_init = await try_initialize_namespace(self.namespace)
self._data.update(loaded_data) if need_init:
loaded_data = load_json(self._file_name) or {}
async with self._storage_lock:
self._data.update(loaded_data)
# Calculate data count based on namespace # Calculate data count based on namespace
if self.namespace.endswith("cache"): if self.namespace.endswith("cache"):
# For cache namespaces, sum the cache entries across all cache types # For cache namespaces, sum the cache entries across all cache types
data_count = sum(len(first_level_dict) for first_level_dict in loaded_data.values() data_count = sum(len(first_level_dict) for first_level_dict in loaded_data.values()
if isinstance(first_level_dict, dict)) if isinstance(first_level_dict, dict))
else: else:
# For non-cache namespaces, use the original count method # For non-cache namespaces, use the original count method
data_count = len(loaded_data) data_count = len(loaded_data)
logger.info(f"Process {os.getpid()} KV load {self.namespace} with {data_count} records") logger.info(f"Process {os.getpid()} KV load {self.namespace} with {data_count} records")
async def index_done_callback(self) -> None: async def index_done_callback(self) -> None:
async with self._storage_lock: async with self._storage_lock:

View File

@@ -39,6 +39,7 @@ _storage_lock: Optional[LockType] = None
_internal_lock: Optional[LockType] = None _internal_lock: Optional[LockType] = None
_pipeline_status_lock: Optional[LockType] = None _pipeline_status_lock: Optional[LockType] = None
_graph_db_lock: Optional[LockType] = None _graph_db_lock: Optional[LockType] = None
_data_init_lock: Optional[LockType] = None
class UnifiedLock(Generic[T]): class UnifiedLock(Generic[T]):
@@ -188,6 +189,16 @@ def get_graph_db_lock(enable_logging: bool = False) -> UnifiedLock:
) )
def get_data_init_lock(enable_logging: bool = False) -> UnifiedLock:
"""return unified data initialization lock for ensuring atomic data initialization"""
return UnifiedLock(
lock=_data_init_lock,
is_async=not is_multiprocess,
name="data_init_lock",
enable_logging=enable_logging,
)
def initialize_share_data(workers: int = 1): def initialize_share_data(workers: int = 1):
""" """
Initialize shared storage data for single or multi-process mode. Initialize shared storage data for single or multi-process mode.
@@ -214,6 +225,7 @@ def initialize_share_data(workers: int = 1):
_internal_lock, \ _internal_lock, \
_pipeline_status_lock, \ _pipeline_status_lock, \
_graph_db_lock, \ _graph_db_lock, \
_data_init_lock, \
_shared_dicts, \ _shared_dicts, \
_init_flags, \ _init_flags, \
_initialized, \ _initialized, \
@@ -226,15 +238,16 @@ def initialize_share_data(workers: int = 1):
) )
return return
_manager = Manager()
_workers = workers _workers = workers
if workers > 1: if workers > 1:
is_multiprocess = True is_multiprocess = True
_manager = Manager()
_internal_lock = _manager.Lock() _internal_lock = _manager.Lock()
_storage_lock = _manager.Lock() _storage_lock = _manager.Lock()
_pipeline_status_lock = _manager.Lock() _pipeline_status_lock = _manager.Lock()
_graph_db_lock = _manager.Lock() _graph_db_lock = _manager.Lock()
_data_init_lock = _manager.Lock()
_shared_dicts = _manager.dict() _shared_dicts = _manager.dict()
_init_flags = _manager.dict() _init_flags = _manager.dict()
_update_flags = _manager.dict() _update_flags = _manager.dict()
@@ -247,6 +260,7 @@ def initialize_share_data(workers: int = 1):
_storage_lock = asyncio.Lock() _storage_lock = asyncio.Lock()
_pipeline_status_lock = asyncio.Lock() _pipeline_status_lock = asyncio.Lock()
_graph_db_lock = asyncio.Lock() _graph_db_lock = asyncio.Lock()
_data_init_lock = asyncio.Lock()
_shared_dicts = {} _shared_dicts = {}
_init_flags = {} _init_flags = {}
_update_flags = {} _update_flags = {}
@@ -415,6 +429,7 @@ def finalize_share_data():
_internal_lock, \ _internal_lock, \
_pipeline_status_lock, \ _pipeline_status_lock, \
_graph_db_lock, \ _graph_db_lock, \
_data_init_lock, \
_shared_dicts, \ _shared_dicts, \
_init_flags, \ _init_flags, \
_initialized, \ _initialized, \
@@ -481,6 +496,7 @@ def finalize_share_data():
_internal_lock = None _internal_lock = None
_pipeline_status_lock = None _pipeline_status_lock = None
_graph_db_lock = None _graph_db_lock = None
_data_init_lock = None
_update_flags = None _update_flags = None
direct_log(f"Process {os.getpid()} storage data finalization complete") direct_log(f"Process {os.getpid()} storage data finalization complete")