From 462c27c1672c30f54313f832fac182d1587e8092 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 3 Mar 2025 23:18:41 +0800 Subject: [PATCH 1/5] Refactor logging setup and simplify Gunicorn configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Move logging setup code to utils.py • Provide setup_logger for standalone LightRAG logger initialization --- lightrag/api/gunicorn_config.py | 53 ++------- lightrag/api/lightrag_server.py | 3 + lightrag/lightrag.py | 3 - lightrag/utils.py | 96 +++++++++++++++ run_with_gunicorn.py | 203 -------------------------------- 5 files changed, 112 insertions(+), 246 deletions(-) delete mode 100755 run_with_gunicorn.py diff --git a/lightrag/api/gunicorn_config.py b/lightrag/api/gunicorn_config.py index 7f9b4d58..0594ceae 100644 --- a/lightrag/api/gunicorn_config.py +++ b/lightrag/api/gunicorn_config.py @@ -2,12 +2,15 @@ import os import logging from lightrag.kg.shared_storage import finalize_share_data -from lightrag.api.lightrag_server import LightragPathFilter +from lightrag.utils import setup_logger # Get log directory path from environment variable log_dir = os.getenv("LOG_DIR", os.getcwd()) log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log")) +# Ensure log directory exists +os.makedirs(os.path.dirname(log_file_path), exist_ok=True) + # Get log file max size and backup count from environment variables log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760)) # Default 10MB log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5)) # Default 5 backups @@ -108,6 +111,9 @@ def on_starting(server): except ImportError: print("psutil not installed, skipping memory usage reporting") + # Log the location of the LightRAG log file + print(f"LightRAG log file: {log_file_path}\n") + print("Gunicorn initialization complete, forking workers...\n") @@ -134,51 +140,18 @@ def post_fork(server, worker): Executed after a worker has been forked. 
This is a good place to set up worker-specific configurations. """ - # Configure formatters - detailed_formatter = logging.Formatter( - "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - ) - simple_formatter = logging.Formatter("%(levelname)s: %(message)s") - - def setup_logger(logger_name: str, level: str = "INFO", add_filter: bool = False): - """Set up a logger with console and file handlers""" - logger_instance = logging.getLogger(logger_name) - logger_instance.setLevel(level) - logger_instance.handlers = [] # Clear existing handlers - logger_instance.propagate = False - - # Add console handler - console_handler = logging.StreamHandler() - console_handler.setFormatter(simple_formatter) - console_handler.setLevel(level) - logger_instance.addHandler(console_handler) - - # Add file handler - file_handler = logging.handlers.RotatingFileHandler( - filename=log_file_path, - maxBytes=log_max_bytes, - backupCount=log_backup_count, - encoding="utf-8", - ) - file_handler.setFormatter(detailed_formatter) - file_handler.setLevel(level) - logger_instance.addHandler(file_handler) - - # Add path filter if requested - if add_filter: - path_filter = LightragPathFilter() - logger_instance.addFilter(path_filter) - # Set up main loggers log_level = loglevel.upper() if loglevel else "INFO" - setup_logger("uvicorn", log_level) - setup_logger("uvicorn.access", log_level, add_filter=True) - setup_logger("lightrag", log_level, add_filter=True) + setup_logger("uvicorn", log_level, add_filter=False, log_file_path=log_file_path) + setup_logger( + "uvicorn.access", log_level, add_filter=True, log_file_path=log_file_path + ) + setup_logger("lightrag", log_level, add_filter=True, log_file_path=log_file_path) # Set up lightrag submodule loggers for name in logging.root.manager.loggerDict: if name.startswith("lightrag."): - setup_logger(name, log_level, add_filter=True) + setup_logger(name, log_level, add_filter=True, log_file_path=log_file_path) # Disable uvicorn.error logger 
uvicorn_error_logger = logging.getLogger("uvicorn.error") diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 5f2c437f..693c6a9f 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -437,6 +437,9 @@ def configure_logging(): log_dir = os.getenv("LOG_DIR", os.getcwd()) log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log")) + print(f"\nLightRAG log file: {log_file_path}\n") + os.makedirs(os.path.dirname(log_dir), exist_ok=True) + # Get log file max size and backup count from environment variables log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760)) # Default 10MB log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5)) # Default 5 backups diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 208bdf3e..adcb1029 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -266,9 +266,6 @@ class LightRAG: _storages_status: StoragesStatus = field(default=StoragesStatus.NOT_CREATED) def __post_init__(self): - os.makedirs(os.path.dirname(self.log_file_path), exist_ok=True) - logger.info(f"Logger initialized for working directory: {self.working_dir}") - from lightrag.kg.shared_storage import ( initialize_share_data, ) diff --git a/lightrag/utils.py b/lightrag/utils.py index c86ad8c0..bb1d6fae 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -6,6 +6,7 @@ import io import csv import json import logging +import logging.handlers import os import re from dataclasses import dataclass @@ -68,6 +69,101 @@ logger.setLevel(logging.INFO) logging.getLogger("httpx").setLevel(logging.WARNING) +class LightragPathFilter(logging.Filter): + """Filter for lightrag logger to filter out frequent path access logs""" + + def __init__(self): + super().__init__() + # Define paths to be filtered + self.filtered_paths = ["/documents", "/health", "/webui/"] + + def filter(self, record): + try: + # Check if record has the required attributes for an access log + if not hasattr(record, "args") or 
not isinstance(record.args, tuple): + return True + if len(record.args) < 5: + return True + + # Extract method, path and status from the record args + method = record.args[1] + path = record.args[2] + status = record.args[4] + + # Filter out successful GET requests to filtered paths + if ( + method == "GET" + and (status == 200 or status == 304) + and path in self.filtered_paths + ): + return False + + return True + except Exception: + # In case of any error, let the message through + return True + + +def setup_logger( + logger_name: str, + level: str = "INFO", + add_filter: bool = False, + log_file_path: str = None, +): + """Set up a logger with console and file handlers + + Args: + logger_name: Name of the logger to set up + level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + add_filter: Whether to add LightragPathFilter to the logger + log_file_path: Path to the log file. If None, will use current directory/lightrag.log + """ + # Configure formatters + detailed_formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + simple_formatter = logging.Formatter("%(levelname)s: %(message)s") + + # Get log file path + if log_file_path is None: + log_dir = os.getenv("LOG_DIR", os.getcwd()) + log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log")) + + # Ensure log directory exists + os.makedirs(os.path.dirname(log_file_path), exist_ok=True) + + # Get log file max size and backup count from environment variables + log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760)) # Default 10MB + log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5)) # Default 5 backups + + logger_instance = logging.getLogger(logger_name) + logger_instance.setLevel(level) + logger_instance.handlers = [] # Clear existing handlers + logger_instance.propagate = False + + # Add console handler + console_handler = logging.StreamHandler() + console_handler.setFormatter(simple_formatter) + console_handler.setLevel(level) + 
logger_instance.addHandler(console_handler) + + # Add file handler + file_handler = logging.handlers.RotatingFileHandler( + filename=log_file_path, + maxBytes=log_max_bytes, + backupCount=log_backup_count, + encoding="utf-8", + ) + file_handler.setFormatter(detailed_formatter) + file_handler.setLevel(level) + logger_instance.addHandler(file_handler) + + # Add path filter if requested + if add_filter: + path_filter = LightragPathFilter() + logger_instance.addFilter(path_filter) + + class UnlimitedSemaphore: """A context manager that allows unlimited access.""" diff --git a/run_with_gunicorn.py b/run_with_gunicorn.py deleted file mode 100755 index 2e4e3cf7..00000000 --- a/run_with_gunicorn.py +++ /dev/null @@ -1,203 +0,0 @@ -#!/usr/bin/env python -""" -Start LightRAG server with Gunicorn -""" - -import os -import sys -import signal -import pipmaster as pm -from lightrag.api.utils_api import parse_args, display_splash_screen -from lightrag.kg.shared_storage import initialize_share_data, finalize_share_data - - -def check_and_install_dependencies(): - """Check and install required dependencies""" - required_packages = [ - "gunicorn", - "tiktoken", - "psutil", - # Add other required packages here - ] - - for package in required_packages: - if not pm.is_installed(package): - print(f"Installing {package}...") - pm.install(package) - print(f"{package} installed successfully") - - -# Signal handler for graceful shutdown -def signal_handler(sig, frame): - print("\n\n" + "=" * 80) - print("RECEIVED TERMINATION SIGNAL") - print(f"Process ID: {os.getpid()}") - print("=" * 80 + "\n") - - # Release shared resources - finalize_share_data() - - # Exit with success status - sys.exit(0) - - -def main(): - # Check and install dependencies - check_and_install_dependencies() - - # Register signal handlers for graceful shutdown - signal.signal(signal.SIGINT, signal_handler) # Ctrl+C - signal.signal(signal.SIGTERM, signal_handler) # kill command - - # Parse all arguments using parse_args 
- args = parse_args(is_uvicorn_mode=False) - - # Display startup information - display_splash_screen(args) - - print("🚀 Starting LightRAG with Gunicorn") - print(f"🔄 Worker management: Gunicorn (workers={args.workers})") - print("🔍 Preloading app: Enabled") - print("📝 Note: Using Gunicorn's preload feature for shared data initialization") - print("\n\n" + "=" * 80) - print("MAIN PROCESS INITIALIZATION") - print(f"Process ID: {os.getpid()}") - print(f"Workers setting: {args.workers}") - print("=" * 80 + "\n") - - # Import Gunicorn's StandaloneApplication - from gunicorn.app.base import BaseApplication - - # Define a custom application class that loads our config - class GunicornApp(BaseApplication): - def __init__(self, app, options=None): - self.options = options or {} - self.application = app - super().__init__() - - def load_config(self): - # Define valid Gunicorn configuration options - valid_options = { - "bind", - "workers", - "worker_class", - "timeout", - "keepalive", - "preload_app", - "errorlog", - "accesslog", - "loglevel", - "certfile", - "keyfile", - "limit_request_line", - "limit_request_fields", - "limit_request_field_size", - "graceful_timeout", - "max_requests", - "max_requests_jitter", - } - - # Special hooks that need to be set separately - special_hooks = { - "on_starting", - "on_reload", - "on_exit", - "pre_fork", - "post_fork", - "pre_exec", - "pre_request", - "post_request", - "worker_init", - "worker_exit", - "nworkers_changed", - "child_exit", - } - - # Import and configure the gunicorn_config module - import gunicorn_config - - # Set configuration variables in gunicorn_config, prioritizing command line arguments - gunicorn_config.workers = ( - args.workers if args.workers else int(os.getenv("WORKERS", 1)) - ) - - # Bind configuration prioritizes command line arguments - host = args.host if args.host != "0.0.0.0" else os.getenv("HOST", "0.0.0.0") - port = args.port if args.port != 9621 else int(os.getenv("PORT", 9621)) - gunicorn_config.bind 
= f"{host}:{port}" - - # Log level configuration prioritizes command line arguments - gunicorn_config.loglevel = ( - args.log_level.lower() - if args.log_level - else os.getenv("LOG_LEVEL", "info") - ) - - # Timeout configuration prioritizes command line arguments - gunicorn_config.timeout = ( - args.timeout if args.timeout else int(os.getenv("TIMEOUT", 150)) - ) - - # Keepalive configuration - gunicorn_config.keepalive = int(os.getenv("KEEPALIVE", 5)) - - # SSL configuration prioritizes command line arguments - if args.ssl or os.getenv("SSL", "").lower() in ( - "true", - "1", - "yes", - "t", - "on", - ): - gunicorn_config.certfile = ( - args.ssl_certfile - if args.ssl_certfile - else os.getenv("SSL_CERTFILE") - ) - gunicorn_config.keyfile = ( - args.ssl_keyfile if args.ssl_keyfile else os.getenv("SSL_KEYFILE") - ) - - # Set configuration options from the module - for key in dir(gunicorn_config): - if key in valid_options: - value = getattr(gunicorn_config, key) - # Skip functions like on_starting and None values - if not callable(value) and value is not None: - self.cfg.set(key, value) - # Set special hooks - elif key in special_hooks: - value = getattr(gunicorn_config, key) - if callable(value): - self.cfg.set(key, value) - - if hasattr(gunicorn_config, "logconfig_dict"): - self.cfg.set( - "logconfig_dict", getattr(gunicorn_config, "logconfig_dict") - ) - - def load(self): - # Import the application - from lightrag.api.lightrag_server import get_application - - return get_application(args) - - # Create the application - app = GunicornApp("") - - # Force workers to be an integer and greater than 1 for multi-process mode - workers_count = int(args.workers) - if workers_count > 1: - # Set a flag to indicate we're in the main process - os.environ["LIGHTRAG_MAIN_PROCESS"] = "1" - initialize_share_data(workers_count) - else: - initialize_share_data(1) - - # Run the application - print("\nStarting Gunicorn with direct Python API...") - app.run() - - -if __name__ == 
"__main__": - main() From b26a574f40253ce6a32380b0f29b6d42d75ab0d6 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 4 Mar 2025 01:07:34 +0800 Subject: [PATCH 2/5] Deprecate log_level and log_file_path in LightRAG. - Remove log_level from API initialization - Add warnings for deprecated logging params --- README.md | 18 +++++++++++++++--- lightrag/api/lightrag_server.py | 2 -- lightrag/lightrag.py | 25 ++++++++++++++++++++----- 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index abc2f8b3..5e8c5a94 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,9 @@ import asyncio from lightrag import LightRAG, QueryParam from lightrag.llm.openai import gpt_4o_mini_complete, gpt_4o_complete, openai_embed from lightrag.kg.shared_storage import initialize_pipeline_status +from lightrag.utils import setup_logger + +setup_logger("lightrag", level="INFO") async def initialize_rag(): rag = LightRAG( @@ -344,6 +347,10 @@ from lightrag.llm.llama_index_impl import llama_index_complete_if_cache, llama_i from llama_index.embeddings.openai import OpenAIEmbedding from llama_index.llms.openai import OpenAI from lightrag.kg.shared_storage import initialize_pipeline_status +from lightrag.utils import setup_logger + +# Setup log handler for LightRAG +setup_logger("lightrag", level="INFO") async def initialize_rag(): rag = LightRAG( @@ -640,6 +647,9 @@ export NEO4J_URI="neo4j://localhost:7687" export NEO4J_USERNAME="neo4j" export NEO4J_PASSWORD="password" +# Setup logger for LightRAG +setup_logger("lightrag", level="INFO") + # When you launch the project be sure to override the default KG: NetworkX # by specifying kg="Neo4JStorage". 
@@ -649,8 +659,12 @@ rag = LightRAG( working_dir=WORKING_DIR, llm_model_func=gpt_4o_mini_complete, # Use gpt_4o_mini_complete LLM model graph_storage="Neo4JStorage", #<-----------override KG default - log_level="DEBUG" #<-----------override log_level default ) + +# Initialize database connections +await rag.initialize_storages() +# Initialize pipeline status for document processing +await initialize_pipeline_status() ``` see test_neo4j.py for a working example. @@ -859,7 +873,6 @@ Valid modes are: | **kv\_storage** | `str` | Storage type for documents and text chunks. Supported types: `JsonKVStorage`, `OracleKVStorage` | `JsonKVStorage` | | **vector\_storage** | `str` | Storage type for embedding vectors. Supported types: `NanoVectorDBStorage`, `OracleVectorDBStorage` | `NanoVectorDBStorage` | | **graph\_storage** | `str` | Storage type for graph edges and nodes. Supported types: `NetworkXStorage`, `Neo4JStorage`, `OracleGraphStorage` | `NetworkXStorage` | -| **log\_level** | | Log level for application runtime | `logging.DEBUG` | | **chunk\_token\_size** | `int` | Maximum token size per chunk when splitting documents | `1200` | | **chunk\_overlap\_token\_size** | `int` | Overlap token size between two chunks when splitting documents | `100` | | **tiktoken\_model\_name** | `str` | Model name for the Tiktoken encoder used to calculate token numbers | `gpt-4o-mini` | @@ -881,7 +894,6 @@ Valid modes are: | **addon\_params** | `dict` | Additional parameters, e.g., `{"example_number": 1, "language": "Simplified Chinese", "entity_types": ["organization", "person", "geo", "event"], "insert_batch_size": 10}`: sets example limit, output language, and batch size for document processing | `example_number: all examples, language: English, insert_batch_size: 10` | | **convert\_response\_to\_json\_func** | `callable` | Not used | `convert_response_to_json` | | **embedding\_cache\_config** | `dict` | Configuration for question-answer caching. Contains three parameters:
- `enabled`: Boolean value to enable/disable cache lookup functionality. When enabled, the system will check cached responses before generating new answers.
- `similarity_threshold`: Float value (0-1), similarity threshold. When a new question's similarity with a cached question exceeds this threshold, the cached answer will be returned directly without calling the LLM.
- `use_llm_check`: Boolean value to enable/disable LLM similarity verification. When enabled, LLM will be used as a secondary check to verify the similarity between questions before returning cached answers. | Default: `{"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}` | -|**log\_dir** | `str` | Directory to store logs. | `./` | diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 693c6a9f..c91f693f 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -329,7 +329,6 @@ def create_app(args): "similarity_threshold": 0.95, "use_llm_check": False, }, - log_level=args.log_level, namespace_prefix=args.namespace_prefix, auto_manage_storages_states=False, ) @@ -359,7 +358,6 @@ def create_app(args): "similarity_threshold": 0.95, "use_llm_check": False, }, - log_level=args.log_level, namespace_prefix=args.namespace_prefix, auto_manage_storages_states=False, ) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 4dacac08..114b5735 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio import configparser import os +import warnings from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial @@ -85,14 +86,10 @@ class LightRAG: doc_status_storage: str = field(default="JsonDocStatusStorage") """Storage type for tracking document processing statuses.""" - # Logging + # Logging (Deprecated, use setup_logger in utils.py instead) # --- - log_level: int = field(default=logger.level) - """Logging level for the system (e.g., 'DEBUG', 'INFO', 'WARNING').""" - log_file_path: str = field(default=os.path.join(os.getcwd(), "lightrag.log")) - """Log file path.""" # Entity extraction # --- @@ -270,6 +267,24 @@ class LightRAG: initialize_share_data, ) + # Handle deprecated parameters + kwargs = self.__dict__ + if "log_level" in kwargs: + warnings.warn( + "WARNING: log_level 
parameter is deprecated, use setup_logger in utils.py instead", + UserWarning, + stacklevel=2, + ) + # Remove the attribute to prevent its use + delattr(self, "log_level") + if "log_file_path" in kwargs: + warnings.warn( + "WARNING: log_file_path parameter is deprecated, use setup_logger in utils.py instead", + UserWarning, + stacklevel=2, + ) + delattr(self, "log_file_path") + initialize_share_data() if not os.path.exists(self.working_dir): From 905699429281c576f9abbd29ae8c247b64bcda29 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 4 Mar 2025 01:28:08 +0800 Subject: [PATCH 3/5] Deprecate and remove logging parameters in LightRAG. - Set log_level and log_file_path to None by default - Issue warnings if deprecated parameters are used - Maintain backward compatibility with warnings --- lightrag/lightrag.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 114b5735..21688b7d 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -88,8 +88,8 @@ class LightRAG: # Logging (Deprecated, use setup_logger in utils.py instead) # --- - log_level: int = field(default=logger.level) - log_file_path: str = field(default=os.path.join(os.getcwd(), "lightrag.log")) + log_level: int | None = field(default=None) + log_file_path: str | None = field(default=None) # Entity extraction # --- @@ -268,21 +268,23 @@ class LightRAG: ) # Handle deprecated parameters - kwargs = self.__dict__ - if "log_level" in kwargs: + if self.log_level is not None: warnings.warn( "WARNING: log_level parameter is deprecated, use setup_logger in utils.py instead", UserWarning, stacklevel=2, ) - # Remove the attribute to prevent its use - delattr(self, "log_level") - if "log_file_path" in kwargs: + if self.log_file_path is not None: warnings.warn( "WARNING: log_file_path parameter is deprecated, use setup_logger in utils.py instead", UserWarning, stacklevel=2, ) + + # Remove these attributes to prevent their use + if 
hasattr(self, "log_level"): + delattr(self, "log_level") + if hasattr(self, "log_file_path"): delattr(self, "log_file_path") initialize_share_data() From 0af774a28f92b0fd6c2ba9ebcd8ce49f697a3eab Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 4 Mar 2025 01:28:39 +0800 Subject: [PATCH 4/5] Fix linting --- lightrag/lightrag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 21688b7d..a2d806b6 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -280,7 +280,7 @@ class LightRAG: UserWarning, stacklevel=2, ) - + # Remove these attributes to prevent their use if hasattr(self, "log_level"): delattr(self, "log_level") From bc9905a06177961b6f0e78f1da967e1b45ecf8cf Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 4 Mar 2025 02:28:09 +0800 Subject: [PATCH 5/5] Fix gensim not compatible with numpy and scipy problem - Replace numpy with gensim in requirements.txt - Let gensim choose a correct version of numpy and scipy --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a1a1157e..d9a5c68e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ configparser future # Basic modules -numpy +gensim pipmaster pydantic python-dotenv