feat(storage): Add shared memory support for file-based storage implementations

This commit adds multiprocessing shared memory support to file-based storage implementations:
- JsonDocStatusStorage
- JsonKVStorage
- NanoVectorDBStorage
- NetworkXStorage

Each storage module now uses module-level global variables with multiprocessing.Manager() to ensure data consistency across multiple uvicorn workers. All processes will see updates immediately when data is modified through the ainsert function.
This commit is contained in:
yangdx
2025-02-25 11:10:13 +08:00
parent 7262f61b0e
commit 087d5770b0
4 changed files with 176 additions and 17 deletions

View File

@@ -2,6 +2,8 @@ import asyncio
import os
from dataclasses import dataclass
from typing import Any, final
import threading
from multiprocessing import Manager
from lightrag.base import (
BaseKVStorage,
@@ -12,6 +14,25 @@ from lightrag.utils import (
write_json,
)
# Global variables for shared memory management.
# All three are module-level state; _manager and _shared_kv_data are filled
# in lazily by _get_manager() the first time it is called.

# threading.Lock taken around manager creation in _get_manager() and around
# per-namespace initialisation in JsonKVStorage.__post_init__.
_init_lock = threading.Lock()
# The multiprocessing Manager instance; None until _get_manager() creates it.
_manager = None
# Dict obtained from _manager.dict(); JsonKVStorage stores one entry per
# namespace in it. None until _get_manager() has run.
_shared_kv_data = None
def _get_manager():
    """Return the module-level Manager, creating it on first use.

    On the first call this starts a ``multiprocessing.Manager`` and creates
    the manager dict stored in ``_shared_kv_data``. Later calls return the
    existing manager unchanged. Creation happens while holding
    ``_init_lock``.

    Returns:
        The Manager instance held in the module-level ``_manager``.

    Raises:
        RuntimeError: If the manager or its dict cannot be created. The
            original exception is chained as ``__cause__``.
    """
    global _manager, _shared_kv_data
    with _init_lock:
        if _manager is None:
            try:
                new_manager = Manager()
                new_shared_data = new_manager.dict()
            except Exception as e:
                logger.error(f"Failed to initialize shared memory manager: {e}")
                raise RuntimeError(f"Shared memory initialization failed: {e}") from e
            # Publish both globals only after both objects exist, so a failure
            # in dict() cannot leave _manager set while _shared_kv_data is
            # still None (which would make every later call skip the retry).
            _manager = new_manager
            _shared_kv_data = new_shared_data
    return _manager
@final
@dataclass
@@ -19,9 +40,28 @@ class JsonKVStorage(BaseKVStorage):
def __post_init__(self):
    """Attach this storage instance to the module-level data for its namespace.

    Computes the JSON file path for the namespace, creates the instance's
    asyncio lock, makes sure the module-level manager exists, and seeds
    ``_shared_kv_data[self.namespace]`` from the JSON file (or an empty
    dict) if the namespace has no entry yet. ``self._data`` is then read
    back from ``_shared_kv_data``.

    Raises:
        RuntimeError: If seeding the namespace entry or reading it back
            fails. The original exception is chained as ``__cause__``.
    """
    working_dir = self.global_config["working_dir"]
    self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
    self._lock = asyncio.Lock()

    # Ensure manager is initialized
    _get_manager()

    # Get or create namespace data. The membership test is repeated after
    # taking _init_lock so that only one thread seeds a given namespace.
    if self.namespace not in _shared_kv_data:
        with _init_lock:
            if self.namespace not in _shared_kv_data:
                try:
                    initial_data = load_json(self._file_name) or {}
                    _shared_kv_data[self.namespace] = initial_data
                except Exception as e:
                    logger.error(f"Failed to initialize shared data for namespace {self.namespace}: {e}")
                    raise RuntimeError(f"Shared data initialization failed: {e}") from e

    # NOTE(review): load_json presumably returns a plain dict. Per the
    # multiprocessing documentation, a plain dict stored inside a manager
    # dict is handed back as a copy, so in-place changes made through
    # self._data may not reach the manager. Confirm whether a nested proxy
    # is needed; write_json would then need a plain-dict copy of it.
    try:
        self._data: dict[str, Any] = _shared_kv_data[self.namespace]
        logger.info(f"Load KV {self.namespace} with {len(self._data)} data")
    except Exception as e:
        logger.error(f"Failed to access shared memory: {e}")
        raise RuntimeError(f"Cannot access shared memory: {e}") from e
async def index_done_callback(self) -> None:
write_json(self._data, self._file_name)