From c8ecfa2d68614d47ce55caea73004032cc848c4d Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Tue, 6 May 2025 22:00:43 +0800
Subject: [PATCH] feat: Centralize configuration and update defaults

This commit introduces `lightrag/constants.py` to centralize default values for various configurations across the API and core components.

Key changes:
- Added `constants.py` to centralize default values
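- Improved the `get_env_value` function in `api/config.py`: a new opt-in `special_none` flag makes it return `None` when the variable is set to the string "None", and `TypeError` is now caught alongside `ValueError` during value conversion (falling back to the default)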
- Updated the default `SUMMARY_LANGUAGE` to "English" (the API previously defaulted to "en")
- Set default `WORKERS` to 2 (previously 1)
---
 lightrag/api/config.py            | 31 ++++++++++++++++++-------------
 lightrag/api/gunicorn_config.py   | 17 ++++++++++-------
 lightrag/api/lightrag_server.py   | 12 +++++++++---
 lightrag/api/run_with_gunicorn.py | 19 +++++++++++++------
 lightrag/api/utils_api.py         | 14 +++++++++++---
 lightrag/constants.py             | 18 ++++++++++++++++++
 lightrag/lightrag.py              | 15 ++++++++++++---
 lightrag/utils.py                 | 12 +++++++++---
 8 files changed, 100 insertions(+), 38 deletions(-)
 create mode 100644 lightrag/constants.py

diff --git a/lightrag/api/config.py b/lightrag/api/config.py
index 268b41cb..696c1d48 100644
--- a/lightrag/api/config.py
+++ b/lightrag/api/config.py
@@ -7,6 +7,11 @@ import argparse
 import logging
 from dotenv import load_dotenv
 
+from lightrag.constants import (
+    DEFAULT_WOKERS,
+    DEFAULT_TIMEOUT,
+)
+
 # use the .env that is inside the current folder
 # allows to use different .env file for each lightrag instance
 # the OS environment variables take precedence over the .env file
@@ -45,7 +50,9 @@ def get_default_host(binding_type: str) -> str:
     )  # fallback to ollama if unknown
 
 
-def get_env_value(env_key: str, default: any, value_type: type = str) -> any:
+def get_env_value(
+    env_key: str, default: any, value_type: type = str, special_none: bool = False
+) -> any:
     """
     Get value from environment variable with type conversion
 
@@ -53,6 +60,7 @@ def get_env_value(env_key: str, default: any, value_type: type = str) -> any:
         env_key (str): Environment variable key
         default (any): Default value if env variable is not set
         value_type (type): Type to convert the value to
+        special_none (bool): If True, return None when value is "None"
 
     Returns:
         any: Converted value from environment or default
@@ -61,11 +69,15 @@ def get_env_value(env_key: str, default: any, value_type: type = str) -> any:
     if value is None:
         return default
 
+    # Handle special case for "None" string
+    if special_none and value == "None":
+        return None
+
     if value_type is bool:
         return value.lower() in ("true", "1", "yes", "t", "on")
     try:
         return value_type(value)
-    except ValueError:
+    except (ValueError, TypeError):
         return default
 
 
@@ -109,17 +121,10 @@ def parse_args() -> argparse.Namespace:
         help="Directory containing input documents (default: from env or ./inputs)",
     )
 
-    def timeout_type(value):
-        if value is None:
-            return 150
-        if value is None or value == "None":
-            return None
-        return int(value)
-
     parser.add_argument(
         "--timeout",
-        default=get_env_value("TIMEOUT", None, timeout_type),
-        type=timeout_type,
+        default=get_env_value("TIMEOUT", DEFAULT_TIMEOUT, int, special_none=True),
+        type=int,
         help="Timeout in seconds (useful when using slow AI). Use None for infinite timeout",
     )
 
@@ -226,7 +231,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--workers",
         type=int,
-        default=get_env_value("WORKERS", 1, int),
+        default=get_env_value("WORKERS", DEFAULT_WOKERS, int),
         help="Number of worker processes (default: from env or 1)",
     )
 
@@ -307,7 +312,7 @@ def parse_args() -> argparse.Namespace:
 
     # Add environment variables that were previously read directly
     args.cors_origins = get_env_value("CORS_ORIGINS", "*")
-    args.summary_language = get_env_value("SUMMARY_LANGUAGE", "en")
+    args.summary_language = get_env_value("SUMMARY_LANGUAGE", "English")
     args.whitelist_paths = get_env_value("WHITELIST_PATHS", "/health,/api/*")
 
     # For JWT Auth
diff --git a/lightrag/api/gunicorn_config.py b/lightrag/api/gunicorn_config.py
index 0aef108e..25c9e48f 100644
--- a/lightrag/api/gunicorn_config.py
+++ b/lightrag/api/gunicorn_config.py
@@ -3,17 +3,24 @@ import os
 import logging
 from lightrag.kg.shared_storage import finalize_share_data
 from lightrag.utils import setup_logger
+from lightrag.api.config import get_env_value
+from lightrag.constants import (
+    DEFAULT_LOG_MAX_BYTES,
+    DEFAULT_LOG_BACKUP_COUNT,
+    DEFAULT_LOG_FILENAME,
+)
+
 
 # Get log directory path from environment variable
 log_dir = os.getenv("LOG_DIR", os.getcwd())
-log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))
+log_file_path = os.path.abspath(os.path.join(log_dir, DEFAULT_LOG_FILENAME))
 
 # Ensure log directory exists
 os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
 
 # Get log file max size and backup count from environment variables
-log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
-log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
+log_max_bytes = get_env_value("LOG_MAX_BYTES", DEFAULT_LOG_MAX_BYTES, int)
+log_backup_count = get_env_value("LOG_BACKUP_COUNT", DEFAULT_LOG_BACKUP_COUNT, int)
 
 # These variables will be set by run_with_gunicorn.py
 workers = None
@@ -29,10 +36,6 @@ preload_app = True
 worker_class = "uvicorn.workers.UvicornWorker"
 
 # Other Gunicorn configurations
-timeout = int(
-    os.getenv("TIMEOUT", 150 * 2)
-)  # Default 150s *2 to match run_with_gunicorn.py
-keepalive = int(os.getenv("KEEPALIVE", 5))  # Default 5s
 
 # Logging configuration
 errorlog = os.getenv("ERROR_LOG", log_file_path)  # Default write to lightrag.log
diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
index 42a45daa..6c90d35f 100644
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@@ -26,12 +26,18 @@ from .config import (
     global_args,
     update_uvicorn_mode_config,
     get_default_host,
+    get_env_value,
 )
 import sys
 from lightrag import LightRAG, __version__ as core_version
 from lightrag.api import __api_version__
 from lightrag.types import GPTKeywordExtractionFormat
 from lightrag.utils import EmbeddingFunc
+from lightrag.constants import (
+    DEFAULT_LOG_MAX_BYTES,
+    DEFAULT_LOG_BACKUP_COUNT,
+    DEFAULT_LOG_FILENAME,
+)
 from lightrag.api.routers.document_routes import (
     DocumentManager,
     create_document_routes,
@@ -514,14 +520,14 @@ def configure_logging():
 
     # Get log directory path from environment variable
     log_dir = os.getenv("LOG_DIR", os.getcwd())
-    log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))
+    log_file_path = os.path.abspath(os.path.join(log_dir, DEFAULT_LOG_FILENAME))
 
     print(f"\nLightRAG log file: {log_file_path}\n")
     os.makedirs(os.path.dirname(log_dir), exist_ok=True)
 
     # Get log file max size and backup count from environment variables
-    log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
-    log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
+    log_max_bytes = get_env_value("LOG_MAX_BYTES", DEFAULT_LOG_MAX_BYTES, int)
+    log_backup_count = get_env_value("LOG_BACKUP_COUNT", DEFAULT_LOG_BACKUP_COUNT, int)
 
     logging.config.dictConfig(
         {
diff --git a/lightrag/api/run_with_gunicorn.py b/lightrag/api/run_with_gunicorn.py
index 5b41af8a..11df0801 100644
--- a/lightrag/api/run_with_gunicorn.py
+++ b/lightrag/api/run_with_gunicorn.py
@@ -8,8 +8,13 @@ import sys
 import signal
 import pipmaster as pm
 from lightrag.api.utils_api import display_splash_screen, check_env_file
+from lightrag.api.config import global_args, get_env_value
 from lightrag.kg.shared_storage import initialize_share_data, finalize_share_data
-from .config import global_args
+
+from lightrag.constants import (
+    DEFAULT_WOKERS,
+    DEFAULT_TIMEOUT,
+)
 
 
 def check_and_install_dependencies():
@@ -122,7 +127,7 @@ def main():
             gunicorn_config.workers = (
                 global_args.workers
                 if global_args.workers
-                else int(os.getenv("WORKERS", 1))
+                else get_env_value("WORKERS", DEFAULT_WOKERS, int)
             )
 
             # Bind configuration prioritizes command line arguments
@@ -134,7 +139,7 @@ def main():
             port = (
                 global_args.port
                 if global_args.port != 9621
-                else int(os.getenv("PORT", 9621))
+                else get_env_value("PORT", 9621, int)
             )
             gunicorn_config.bind = f"{host}:{port}"
 
@@ -149,11 +154,13 @@ def main():
             gunicorn_config.timeout = (
                 global_args.timeout * 2
                 if global_args.timeout is not None
-                else int(os.getenv("TIMEOUT", 150 * 2))
+                else get_env_value(
+                    "TIMEOUT", DEFAULT_TIMEOUT + 30, int, special_none=True
+                )
             )
 
             # Keepalive configuration
-            gunicorn_config.keepalive = int(os.getenv("KEEPALIVE", 5))
+            gunicorn_config.keepalive = get_env_value("KEEPALIVE", 5, int)
 
             # SSL configuration prioritizes command line arguments
             if global_args.ssl or os.getenv("SSL", "").lower() in (
@@ -202,7 +209,7 @@ def main():
     app = GunicornApp("")
 
     # Force workers to be an integer and greater than 1 for multi-process mode
-    workers_count = int(global_args.workers)
+    workers_count = global_args.workers
     if workers_count > 1:
         # Set a flag to indicate we're in the main process
         os.environ["LIGHTRAG_MAIN_PROCESS"] = "1"
diff --git a/lightrag/api/utils_api.py b/lightrag/api/utils_api.py
index a1dade88..13ec4ddd 100644
--- a/lightrag/api/utils_api.py
+++ b/lightrag/api/utils_api.py
@@ -9,11 +9,15 @@ import sys
 from ascii_colors import ASCIIColors
 from lightrag.api import __api_version__ as api_version
 from lightrag import __version__ as core_version
+from lightrag.constants import (
+    DEFAULT_MAX_TOKEN_SUMMARY,
+    DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
+)
 from fastapi import HTTPException, Security, Request, status
 from fastapi.security import APIKeyHeader, OAuth2PasswordBearer
 from starlette.status import HTTP_403_FORBIDDEN
 from .auth import auth_handler
-from .config import ollama_server_infos, global_args
+from .config import ollama_server_infos, global_args, get_env_value
 
 
 def check_env_file():
@@ -264,9 +268,13 @@ def display_splash_screen(args: argparse.Namespace) -> None:
     ASCIIColors.white("    ├─ Top-K: ", end="")
     ASCIIColors.yellow(f"{args.top_k}")
     ASCIIColors.white("    ├─ Max Token Summary: ", end="")
-    ASCIIColors.yellow(f"{int(os.getenv('MAX_TOKEN_SUMMARY', 500))}")
+    ASCIIColors.yellow(
+        f"{get_env_value('MAX_TOKEN_SUMMARY', DEFAULT_MAX_TOKEN_SUMMARY, int)}"
+    )
     ASCIIColors.white("    └─ Force LLM Summary on Merge: ", end="")
-    ASCIIColors.yellow(f"{int(os.getenv('FORCE_LLM_SUMMARY_ON_MERGE', 6))}")
+    ASCIIColors.yellow(
+        f"{get_env_value('FORCE_LLM_SUMMARY_ON_MERGE', DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE, int)}"
+    )
 
     # System Configuration
     ASCIIColors.magenta("\n💾 Storage Configuration:")
diff --git a/lightrag/constants.py b/lightrag/constants.py
new file mode 100644
index 00000000..787e1c49
--- /dev/null
+++ b/lightrag/constants.py
@@ -0,0 +1,18 @@
+"""
+Centralized configuration constants for LightRAG.
+
+This module defines default values for configuration constants used across
+different parts of the LightRAG system. Centralizing these values ensures
+consistency and makes maintenance easier.
+"""
+
+# Default values for environment variables
+DEFAULT_MAX_TOKEN_SUMMARY = 500
+DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 6
+DEFAULT_WOKERS = 2
+DEFAULT_TIMEOUT = 150
+
+# Logging configuration defaults
+DEFAULT_LOG_MAX_BYTES = 10485760  # Default 10MB
+DEFAULT_LOG_BACKUP_COUNT = 5  # Default 5 backups
+DEFAULT_LOG_FILENAME = "lightrag.log"  # Default log filename
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 17d36166..452941a6 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -20,6 +20,11 @@ from typing import (
     List,
     Dict,
 )
+from lightrag.constants import (
+    DEFAULT_MAX_TOKEN_SUMMARY,
+    DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
+)
+from lightrag.api.config import get_env_value
 
 from lightrag.kg import (
     STORAGES,
@@ -119,10 +124,14 @@ class LightRAG:
     entity_extract_max_gleaning: int = field(default=1)
     """Maximum number of entity extraction attempts for ambiguous content."""
 
-    summary_to_max_tokens: int = field(default=int(os.getenv("MAX_TOKEN_SUMMARY", 500)))
+    summary_to_max_tokens: int = field(
+        default=get_env_value("MAX_TOKEN_SUMMARY", DEFAULT_MAX_TOKEN_SUMMARY, int)
+    )
 
     force_llm_summary_on_merge: int = field(
-        default=int(os.getenv("FORCE_LLM_SUMMARY_ON_MERGE", 6))
+        default=get_env_value(
+            "FORCE_LLM_SUMMARY_ON_MERGE", DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE, int
+        )
     )
 
     # Text chunking
@@ -245,7 +254,7 @@ class LightRAG:
 
     addon_params: dict[str, Any] = field(
         default_factory=lambda: {
-            "language": os.getenv("SUMMARY_LANGUAGE", PROMPTS["DEFAULT_LANGUAGE"])
+            "language": get_env_value("SUMMARY_LANGUAGE", "English", str)
         }
     )
 
diff --git a/lightrag/utils.py b/lightrag/utils.py
index 9af97c7a..720f859e 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -17,6 +17,12 @@ import xml.etree.ElementTree as ET
 import numpy as np
 from lightrag.prompt import PROMPTS
 from dotenv import load_dotenv
+from lightrag.constants import (
+    DEFAULT_LOG_MAX_BYTES,
+    DEFAULT_LOG_BACKUP_COUNT,
+    DEFAULT_LOG_FILENAME,
+)
+from lightrag.api.config import get_env_value
 
 # Use TYPE_CHECKING to avoid circular imports
 if TYPE_CHECKING:
@@ -152,14 +158,14 @@ def setup_logger(
         # Get log file path
         if log_file_path is None:
             log_dir = os.getenv("LOG_DIR", os.getcwd())
-            log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))
+            log_file_path = os.path.abspath(os.path.join(log_dir, DEFAULT_LOG_FILENAME))
 
         # Ensure log directory exists
         os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
 
         # Get log file max size and backup count from environment variables
-        log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
-        log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
+        log_max_bytes = get_env_value("LOG_MAX_BYTES", DEFAULT_LOG_MAX_BYTES, int)
+        log_backup_count = get_env_value("LOG_BACKUP_COUNT", DEFAULT_LOG_BACKUP_COUNT, int)
 
         try:
             # Add file handler