Add atomic data initialization lock to prevent race conditions

2025-03-09 17:33:15 +08:00
parent 020a6b5ae0
commit e47883d872
3 changed files with 49 additions and 29 deletions
--- a/lightrag/kg/json_kv_impl.py
+++ b/lightrag/kg/json_kv_impl.py
@@ -13,6 +13,7 @@ from lightrag.utils import (
 from .shared_storage import (
    get_namespace_data,
    get_storage_lock,
+    get_data_init_lock,
    try_initialize_namespace,
 )

@@ -23,29 +24,30 @@ class JsonKVStorage(BaseKVStorage):
    def __post_init__(self):
        working_dir = self.global_config["working_dir"]
        self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
-        self._storage_lock = get_storage_lock()
        self._data = None

    async def initialize(self):
        """Initialize storage data"""
-        # check need_init must before get_namespace_data
-        need_init = await try_initialize_namespace(self.namespace)
+        self._storage_lock = get_storage_lock()
        self._data = await get_namespace_data(self.namespace)
-        if need_init:
-            loaded_data = load_json(self._file_name) or {}
-            async with self._storage_lock:
-                self._data.update(loaded_data)
-                
-                # Calculate data count based on namespace
-                if self.namespace.endswith("cache"):
-                    # For cache namespaces, sum the cache entries across all cache types
-                    data_count = sum(len(first_level_dict) for first_level_dict in loaded_data.values() 
-                                    if isinstance(first_level_dict, dict))
-                else:
-                    # For non-cache namespaces, use the original count method
-                    data_count = len(loaded_data)
-                
-                logger.info(f"Process {os.getpid()} KV load {self.namespace} with {data_count} records")
+        async with get_data_init_lock():
+            # check need_init must before get_namespace_data
+            need_init = await try_initialize_namespace(self.namespace)
+            if need_init:
+                loaded_data = load_json(self._file_name) or {}
+                async with self._storage_lock:
+                    self._data.update(loaded_data)
+                    
+                    # Calculate data count based on namespace
+                    if self.namespace.endswith("cache"):
+                        # For cache namespaces, sum the cache entries across all cache types
+                        data_count = sum(len(first_level_dict) for first_level_dict in loaded_data.values() 
+                                        if isinstance(first_level_dict, dict))
+                    else:
+                        # For non-cache namespaces, use the original count method
+                        data_count = len(loaded_data)
+                    
+                    logger.info(f"Process {os.getpid()} KV load {self.namespace} with {data_count} records")

    async def index_done_callback(self) -> None:
        async with self._storage_lock: