Add atomic data initialization lock to prevent race conditions

This commit is contained in:
yangdx
2025-03-09 17:33:15 +08:00
parent 020a6b5ae0
commit e47883d872
3 changed files with 49 additions and 29 deletions

View File

@@ -13,6 +13,7 @@ from lightrag.utils import (
from .shared_storage import (
get_namespace_data,
get_storage_lock,
get_data_init_lock,
try_initialize_namespace,
)
@@ -23,29 +24,30 @@ class JsonKVStorage(BaseKVStorage):
def __post_init__(self):
working_dir = self.global_config["working_dir"]
self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
self._storage_lock = get_storage_lock()
self._data = None
async def initialize(self):
"""Initialize storage data"""
# check need_init must before get_namespace_data
need_init = await try_initialize_namespace(self.namespace)
self._storage_lock = get_storage_lock()
self._data = await get_namespace_data(self.namespace)
if need_init:
loaded_data = load_json(self._file_name) or {}
async with self._storage_lock:
self._data.update(loaded_data)
# Calculate data count based on namespace
if self.namespace.endswith("cache"):
# For cache namespaces, sum the cache entries across all cache types
data_count = sum(len(first_level_dict) for first_level_dict in loaded_data.values()
if isinstance(first_level_dict, dict))
else:
# For non-cache namespaces, use the original count method
data_count = len(loaded_data)
logger.info(f"Process {os.getpid()} KV load {self.namespace} with {data_count} records")
async with get_data_init_lock():
# check need_init must before get_namespace_data
need_init = await try_initialize_namespace(self.namespace)
if need_init:
loaded_data = load_json(self._file_name) or {}
async with self._storage_lock:
self._data.update(loaded_data)
# Calculate data count based on namespace
if self.namespace.endswith("cache"):
# For cache namespaces, sum the cache entries across all cache types
data_count = sum(len(first_level_dict) for first_level_dict in loaded_data.values()
if isinstance(first_level_dict, dict))
else:
# For non-cache namespaces, use the original count method
data_count = len(loaded_data)
logger.info(f"Process {os.getpid()} KV load {self.namespace} with {data_count} records")
async def index_done_callback(self) -> None:
async with self._storage_lock: