Implement Gunicorn+Uvicorn integration for shared data preloading

- Create run_with_gunicorn.py script to properly initialize shared data in the
  main process before forking worker processes
- Revert uvicorn to single-process mode only, and let Gunicorn handle all multi-process management
This commit is contained in:
yangdx
2025-02-27 13:25:22 +08:00
parent 7c237920b1
commit 7aec78833c
4 changed files with 365 additions and 23 deletions

View File

@@ -483,17 +483,28 @@ def main():
display_splash_screen(args)
# Check if running under Gunicorn
if 'GUNICORN_CMD_ARGS' in os.environ:
# If started with Gunicorn, return directly as Gunicorn will call get_application
print("Running under Gunicorn - worker management handled by Gunicorn")
return
# If not running under Gunicorn, initialize shared data here
from lightrag.kg.shared_storage import initialize_share_data
initialize_share_data(args.workers)
print("Starting in single-process mode")
initialize_share_data(1) # Force single process mode
# Create application instance directly instead of using factory function
app = create_app(args)
# Start Uvicorn in single process mode
uvicorn_config = {
"app": "lightrag.api.lightrag_server:get_application",
"factory": True,
"app": app, # Pass application instance directly instead of string path
"host": args.host,
"port": args.port,
"workers": args.workers,
"log_config": None, # Disable default config
}
if args.ssl:
uvicorn_config.update(
{
@@ -501,6 +512,8 @@ def main():
"ssl_keyfile": args.ssl_keyfile,
}
)
print(f"Starting Uvicorn server in single-process mode on {args.host}:{args.port}")
uvicorn.run(**uvicorn_config)

View File

@@ -1,10 +1,19 @@
import os
import sys
from multiprocessing.synchronize import Lock as ProcessLock
from threading import Lock as ThreadLock
from multiprocessing import Manager
from typing import Any, Dict, Optional, Union
from lightrag.utils import logger
# Define a direct print function for critical logs that must be visible in all processes
def direct_log(message, level="INFO"):
    """
    Write *message* straight to stderr, bypassing the logging framework.

    Intended for critical diagnostics that must remain visible in every
    process — including the Gunicorn master — where the normal logging
    handlers may not be configured yet.
    """
    sys.stderr.write(f"{level}: {message}\n")
    sys.stderr.flush()
LockType = Union[ProcessLock, ThreadLock]
_manager = None
@@ -21,41 +30,60 @@ _global_lock: Optional[LockType] = None
def initialize_share_data(workers: int = 1):
# NOTE(review): this span is a unified-diff hunk rendered without +/- markers,
# so pre-commit (removed) and post-commit (added) lines are interleaved and the
# text is NOT valid standalone Python. Leading indentation was also stripped by
# the diff viewer. The comments below mark the apparent old/new split.
"""Initialize storage data"""
# ^ apparent pre-commit one-line docstring, superseded by the detailed one below.
global _manager, _is_multiprocess, is_multiprocess, _global_lock, _shared_dicts, _share_objects, _init_flags, _initialized
"""
Initialize shared storage data for single or multi-process mode.
When used with Gunicorn's preload feature, this function is called once in the
master process before forking worker processes, allowing all workers to share
the same initialized data.
In single-process mode, this function is called during LightRAG object initialization.
The function determines whether to use cross-process shared variables for data storage
based on the number of workers. If workers=1, it uses thread locks and local dictionaries.
If workers>1, it uses process locks and shared dictionaries managed by multiprocessing.Manager.
Args:
workers (int): Number of worker processes. If 1, single-process mode is used.
If > 1, multi-process mode with shared memory is used.
"""
# The duplicated `global` statement above/below is a diff artifact: the same
# line appears once for the old version and once for the new one.
global _manager, _is_multiprocess, is_multiprocess, _global_lock, _shared_dicts, _share_objects, _init_flags, _initialized
# Check if already initialized
if _initialized and _initialized.value:
# Apparent pre-commit early-return path (logged via `logger`):
is_multiprocess = _is_multiprocess.value
if _is_multiprocess.value:
logger.info(f"Process {os.getpid()} storage data already initialized!")
return
# Apparent post-commit early-return path (logged via direct_log so it is
# visible from every process, including the Gunicorn master):
direct_log(f"Process {os.getpid()} storage data already initialized (multiprocess={_is_multiprocess.value})!")
return
_manager = Manager()
# Manager-backed flags so forked workers can observe the initialization state.
_initialized = _manager.Value("b", False)
_is_multiprocess = _manager.Value("b", False)
# Apparent pre-commit branching: single-process case tested first.
if workers == 1:
_is_multiprocess.value = False
_global_lock = ThreadLock()
_shared_dicts = {}
_share_objects = {}
_init_flags = {}
logger.info(f"Process {os.getpid()} storage data created for Single Process")
else:
# Force multi-process mode if workers > 1
if workers > 1:
_is_multiprocess.value = True
_global_lock = _manager.Lock()
# Create shared dictionaries with manager
_shared_dicts = _manager.dict()
_share_objects = _manager.dict()
_init_flags = _manager.dict() # Use shared dictionary to store initialization flags
logger.info(f"Process {os.getpid()} storage data created for Multiple Process")
_init_flags = _manager.dict() # Use shared dictionary to store initialization flags
direct_log(f"Process {os.getpid()} storage data created for Multiple Process (workers={workers})")
else:
_is_multiprocess.value = False
_global_lock = ThreadLock()
_shared_dicts = {}
_share_objects = {}
_init_flags = {}
direct_log(f"Process {os.getpid()} storage data created for Single Process")
# Mark as initialized
_initialized.value = True
# Plain-bool mirror of the shared flag for cheap same-process checks.
is_multiprocess = _is_multiprocess.value
def try_initialize_namespace(namespace: str) -> bool:
"""
尝试初始化命名空间。返回True表示当前进程获得了初始化权限。
使用共享字典的原子操作确保只有一个进程能成功初始化。
Try to initialize a namespace. Returns True if the current process gets initialization permission.
Uses atomic operations on shared dictionaries to ensure only one process can successfully initialize.
"""
global _init_flags, _manager
@@ -126,3 +154,52 @@ def get_namespace_data(namespace: str) -> Dict[str, Any]:
def get_scan_progress() -> Dict[str, Any]:
"""get storage space for document scanning progress data"""
return get_namespace_data("scan_progress")
def finalize_share_data():
    """
    Tear down shared storage and release its resources.

    Meant to run at application shutdown. In multi-process mode the
    multiprocessing Manager is stopped after its shared containers are
    emptied; in single-process mode only the module-level globals are
    reset. Safe to call when nothing was ever initialized.
    """
    global _manager, _is_multiprocess, is_multiprocess, _global_lock, \
        _shared_dicts, _share_objects, _init_flags, _initialized

    # Bail out early unless initialize_share_data() completed successfully.
    if not (_initialized and _initialized.value):
        direct_log(f"Process {os.getpid()} storage data not initialized, nothing to finalize")
        return

    direct_log(f"Process {os.getpid()} finalizing storage data (multiprocess={_is_multiprocess.value})")

    # Multi-process mode owns a Manager that must be shut down explicitly.
    if _is_multiprocess.value and _manager is not None:
        try:
            # Empty the managed containers before stopping the Manager.
            for container in (_shared_dicts, _share_objects, _init_flags):
                if container is not None:
                    container.clear()
            _manager.shutdown()
            direct_log(f"Process {os.getpid()} Manager shutdown complete")
        except Exception as e:
            direct_log(f"Process {os.getpid()} Error shutting down Manager: {e}", level="ERROR")

    # Drop every module-level reference so the objects can be collected.
    _manager = None
    _initialized = None
    _is_multiprocess = None
    is_multiprocess = None
    _shared_dicts = None
    _share_objects = None
    _init_flags = None
    _global_lock = None

    direct_log(f"Process {os.getpid()} storage data finalization complete")