Fix linting

This commit is contained in:
yangdx
2025-02-27 19:05:51 +08:00
parent 946095ef80
commit 64f22966a3
8 changed files with 196 additions and 112 deletions

View File

@@ -10,7 +10,11 @@ from lightrag.utils import (
logger,
write_json,
)
from .shared_storage import get_namespace_data, get_storage_lock, try_initialize_namespace
from .shared_storage import (
get_namespace_data,
get_storage_lock,
try_initialize_namespace,
)
@final
@@ -20,7 +24,7 @@ class JsonKVStorage(BaseKVStorage):
working_dir = self.global_config["working_dir"]
self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
self._storage_lock = get_storage_lock()
# need_init check must happen before get_namespace_data
need_init = try_initialize_namespace(self.namespace)
self._data = get_namespace_data(self.namespace)

View File

@@ -11,7 +11,12 @@ from lightrag.utils import (
)
import pipmaster as pm
from lightrag.base import BaseVectorStorage
from .shared_storage import get_storage_lock, get_namespace_object, is_multiprocess, try_initialize_namespace
from .shared_storage import (
get_storage_lock,
get_namespace_object,
is_multiprocess,
try_initialize_namespace,
)
if not pm.is_installed("nano-vectordb"):
pm.install("nano-vectordb")

View File

@@ -6,7 +6,12 @@ import numpy as np
from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
from lightrag.utils import logger
from lightrag.base import BaseGraphStorage
from .shared_storage import get_storage_lock, get_namespace_object, is_multiprocess, try_initialize_namespace
from .shared_storage import (
get_storage_lock,
get_namespace_object,
is_multiprocess,
try_initialize_namespace,
)
import pipmaster as pm
@@ -74,16 +79,14 @@ class NetworkXStorage(BaseGraphStorage):
self.global_config["working_dir"], f"graph_{self.namespace}.graphml"
)
self._storage_lock = get_storage_lock()
# need_init check must happen before get_namespace_object
need_init = try_initialize_namespace(self.namespace)
self._graph = get_namespace_object(self.namespace)
if need_init:
if is_multiprocess:
preloaded_graph = NetworkXStorage.load_nx_graph(
self._graphml_xml_file
)
preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file)
self._graph.value = preloaded_graph or nx.Graph()
if preloaded_graph:
logger.info(
@@ -92,9 +95,7 @@ class NetworkXStorage(BaseGraphStorage):
else:
logger.info("Created new empty graph")
else:
preloaded_graph = NetworkXStorage.load_nx_graph(
self._graphml_xml_file
)
preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file)
self._graph = preloaded_graph or nx.Graph()
if preloaded_graph:
logger.info(

View File

@@ -4,16 +4,17 @@ from multiprocessing.synchronize import Lock as ProcessLock
from threading import Lock as ThreadLock
from multiprocessing import Manager
from typing import Any, Dict, Optional, Union
from lightrag.utils import logger
# Define a direct print function for critical logs that must be visible in all processes
def direct_log(message, level="INFO"):
    """
    Log a message directly to stderr to ensure visibility in all processes,
    including the Gunicorn master process.

    Args:
        message: Text to emit.
        level: Severity label prefixed to the message (default "INFO").
    """
    # stderr + flush=True so the line is not lost in a forked worker's buffer
    print(f"{level}: {message}", file=sys.stderr, flush=True)
LockType = Union[ProcessLock, ThreadLock]
_manager = None
@@ -31,39 +32,53 @@ _global_lock: Optional[LockType] = None
def initialize_share_data(workers: int = 1):
"""
Initialize shared storage data for single or multi-process mode.
When used with Gunicorn's preload feature, this function is called once in the
master process before forking worker processes, allowing all workers to share
the same initialized data.
In single-process mode, this function is called during LightRAG object initialization.
The function determines whether to use cross-process shared variables for data storage
based on the number of workers. If workers=1, it uses thread locks and local dictionaries.
If workers>1, it uses process locks and shared dictionaries managed by multiprocessing.Manager.
Args:
workers (int): Number of worker processes. If 1, single-process mode is used.
If > 1, multi-process mode with shared memory is used.
"""
global _manager, is_multiprocess, is_multiprocess, _global_lock, _shared_dicts, _share_objects, _init_flags, _initialized
global \
_manager, \
is_multiprocess, \
is_multiprocess, \
_global_lock, \
_shared_dicts, \
_share_objects, \
_init_flags, \
_initialized
# Check if already initialized
if _initialized:
direct_log(f"Process {os.getpid()} Shared-Data already initialized (multiprocess={is_multiprocess})")
direct_log(
f"Process {os.getpid()} Shared-Data already initialized (multiprocess={is_multiprocess})"
)
return
_manager = Manager()
# Force multi-process mode if workers > 1
if workers > 1:
is_multiprocess = True
_global_lock = _manager.Lock()
_global_lock = _manager.Lock()
# Create shared dictionaries with manager
_shared_dicts = _manager.dict()
_share_objects = _manager.dict()
_init_flags = _manager.dict() # Use shared dictionary to store initialization flags
direct_log(f"Process {os.getpid()} Shared-Data created for Multiple Process (workers={workers})")
_init_flags = (
_manager.dict()
) # Use shared dictionary to store initialization flags
direct_log(
f"Process {os.getpid()} Shared-Data created for Multiple Process (workers={workers})"
)
else:
is_multiprocess = False
_global_lock = ThreadLock()
@@ -75,6 +90,7 @@ def initialize_share_data(workers: int = 1):
# Mark as initialized
_initialized = True
def try_initialize_namespace(namespace: str) -> bool:
"""
Try to initialize a namespace. Returns True if the current process gets initialization permission.
@@ -83,8 +99,11 @@ def try_initialize_namespace(namespace: str) -> bool:
global _init_flags, _manager
if _init_flags is None:
direct_log(f"Error: try to create nanmespace before Shared-Data is initialized, pid={os.getpid()}", level="ERROR")
raise ValueError("Shared dictionaries not initialized")
direct_log(
f"Error: try to create nanmespace before Shared-Data is initialized, pid={os.getpid()}",
level="ERROR",
)
raise ValueError("Shared dictionaries not initialized")
if namespace not in _init_flags:
_init_flags[namespace] = True
@@ -112,7 +131,10 @@ def get_namespace_object(namespace: str) -> Any:
"""Get an object for specific namespace"""
if _share_objects is None:
direct_log(f"Error: try to getnanmespace before Shared-Data is initialized, pid={os.getpid()}", level="ERROR")
direct_log(
f"Error: try to getnanmespace before Shared-Data is initialized, pid={os.getpid()}",
level="ERROR",
)
raise ValueError("Shared dictionaries not initialized")
lock = _get_global_lock()
@@ -123,14 +145,20 @@ def get_namespace_object(namespace: str) -> Any:
_share_objects[namespace] = _manager.Value("O", None)
else:
_share_objects[namespace] = None
direct_log(f"Created namespace({namespace}): type={type(_share_objects[namespace])}, pid={os.getpid()}")
direct_log(
f"Created namespace({namespace}): type={type(_share_objects[namespace])}, pid={os.getpid()}"
)
return _share_objects[namespace]
def get_namespace_data(namespace: str) -> Dict[str, Any]:
"""get storage space for specific storage type(namespace)"""
if _shared_dicts is None:
direct_log(f"Error: try to getnanmespace before Shared-Data is initialized, pid={os.getpid()}", level="ERROR")
direct_log(
f"Error: try to getnanmespace before Shared-Data is initialized, pid={os.getpid()}",
level="ERROR",
)
raise ValueError("Shared dictionaries not initialized")
lock = _get_global_lock()
@@ -140,8 +168,10 @@ def get_namespace_data(namespace: str) -> Dict[str, Any]:
_shared_dicts[namespace] = _manager.dict()
else:
_shared_dicts[namespace] = {}
direct_log(f"Created namespace({namespace}): type={type(_shared_dicts[namespace])}, pid={os.getpid()}")
direct_log(
f"Created namespace({namespace}): type={type(_shared_dicts[namespace])}, pid={os.getpid()}"
)
return _shared_dicts[namespace]
@@ -153,22 +183,33 @@ def get_scan_progress() -> Dict[str, Any]:
def finalize_share_data():
"""
Release shared resources and clean up.
This function should be called when the application is shutting down
to properly release shared resources and avoid memory leaks.
In multi-process mode, it shuts down the Manager and releases all shared objects.
In single-process mode, it simply resets the global variables.
"""
global _manager, is_multiprocess, _global_lock, _shared_dicts, _share_objects, _init_flags, _initialized
global \
_manager, \
is_multiprocess, \
_global_lock, \
_shared_dicts, \
_share_objects, \
_init_flags, \
_initialized
# Check if already initialized
if not _initialized:
direct_log(f"Process {os.getpid()} storage data not initialized, nothing to finalize")
direct_log(
f"Process {os.getpid()} storage data not initialized, nothing to finalize"
)
return
direct_log(f"Process {os.getpid()} finalizing storage data (multiprocess={is_multiprocess})")
direct_log(
f"Process {os.getpid()} finalizing storage data (multiprocess={is_multiprocess})"
)
# In multi-process mode, shut down the Manager
if is_multiprocess and _manager is not None:
try:
@@ -179,13 +220,15 @@ def finalize_share_data():
_share_objects.clear()
if _init_flags is not None:
_init_flags.clear()
# Shut down the Manager
_manager.shutdown()
direct_log(f"Process {os.getpid()} Manager shutdown complete")
except Exception as e:
direct_log(f"Process {os.getpid()} Error shutting down Manager: {e}", level="ERROR")
direct_log(
f"Process {os.getpid()} Error shutting down Manager: {e}", level="ERROR"
)
# Reset global variables
_manager = None
_initialized = None
@@ -194,5 +237,5 @@ def finalize_share_data():
_share_objects = None
_init_flags = None
_global_lock = None
direct_log(f"Process {os.getpid()} storage data finalization complete")