Merge branch 'HKUDS:main' into main

Authored by Saifeddine ALOUI on 2025-03-04 08:27:53 +01:00, committed by GitHub
23 changed files with 563 additions and 177 deletions

View File

@@ -2,12 +2,15 @@
 import os
 import logging
 from lightrag.kg.shared_storage import finalize_share_data
-from lightrag.api.lightrag_server import LightragPathFilter
+from lightrag.utils import setup_logger

 # Get log directory path from environment variable
 log_dir = os.getenv("LOG_DIR", os.getcwd())
 log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))
+
+# Ensure log directory exists
+os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

 # Get log file max size and backup count from environment variables
 log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
 log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
@@ -108,6 +111,9 @@ def on_starting(server):
     except ImportError:
         print("psutil not installed, skipping memory usage reporting")

+    # Log the location of the LightRAG log file
+    print(f"LightRAG log file: {log_file_path}\n")
+
     print("Gunicorn initialization complete, forking workers...\n")
@@ -134,51 +140,18 @@ def post_fork(server, worker):
     Executed after a worker has been forked.
     This is a good place to set up worker-specific configurations.
     """
-    # Configure formatters
-    detailed_formatter = logging.Formatter(
-        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-    )
-    simple_formatter = logging.Formatter("%(levelname)s: %(message)s")
-
-    def setup_logger(logger_name: str, level: str = "INFO", add_filter: bool = False):
-        """Set up a logger with console and file handlers"""
-        logger_instance = logging.getLogger(logger_name)
-        logger_instance.setLevel(level)
-        logger_instance.handlers = []  # Clear existing handlers
-        logger_instance.propagate = False
-
-        # Add console handler
-        console_handler = logging.StreamHandler()
-        console_handler.setFormatter(simple_formatter)
-        console_handler.setLevel(level)
-        logger_instance.addHandler(console_handler)
-
-        # Add file handler
-        file_handler = logging.handlers.RotatingFileHandler(
-            filename=log_file_path,
-            maxBytes=log_max_bytes,
-            backupCount=log_backup_count,
-            encoding="utf-8",
-        )
-        file_handler.setFormatter(detailed_formatter)
-        file_handler.setLevel(level)
-        logger_instance.addHandler(file_handler)
-
-        # Add path filter if requested
-        if add_filter:
-            path_filter = LightragPathFilter()
-            logger_instance.addFilter(path_filter)
-
     # Set up main loggers
     log_level = loglevel.upper() if loglevel else "INFO"
-    setup_logger("uvicorn", log_level)
-    setup_logger("uvicorn.access", log_level, add_filter=True)
-    setup_logger("lightrag", log_level, add_filter=True)
+    setup_logger("uvicorn", log_level, add_filter=False, log_file_path=log_file_path)
+    setup_logger(
+        "uvicorn.access", log_level, add_filter=True, log_file_path=log_file_path
+    )
+    setup_logger("lightrag", log_level, add_filter=True, log_file_path=log_file_path)

     # Set up lightrag submodule loggers
     for name in logging.root.manager.loggerDict:
         if name.startswith("lightrag."):
-            setup_logger(name, log_level, add_filter=True)
+            setup_logger(name, log_level, add_filter=True, log_file_path=log_file_path)

     # Disable uvicorn.error logger
     uvicorn_error_logger = logging.getLogger("uvicorn.error")
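With the worker-local helper gone, post_fork relies on the setup_logger imported from lightrag.utils. Judging from the deleted body and the new call sites, the shared helper presumably looks something like this sketch (signature and rotation-parameter handling inferred, not confirmed):

    import logging
    import logging.handlers

    def setup_logger(
        logger_name: str,
        level: str = "INFO",
        add_filter: bool = False,
        log_file_path: str = "lightrag.log",
        log_max_bytes: int = 10 * 1024 * 1024,
        log_backup_count: int = 5,
    ):
        """Attach a console handler and a rotating file handler to one logger."""
        logger_instance = logging.getLogger(logger_name)
        logger_instance.setLevel(level)
        logger_instance.handlers = []  # clear existing handlers
        logger_instance.propagate = False

        # Console handler with a short format
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
        console_handler.setLevel(level)
        logger_instance.addHandler(console_handler)

        # Size-rotated file handler with a detailed format
        file_handler = logging.handlers.RotatingFileHandler(
            filename=log_file_path,
            maxBytes=log_max_bytes,
            backupCount=log_backup_count,
            encoding="utf-8",
        )
        file_handler.setFormatter(
            logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        )
        file_handler.setLevel(level)
        logger_instance.addHandler(file_handler)

        # The real helper also attaches LightragPathFilter when add_filter=True;
        # omitted here because that class lives in lightrag.api.lightrag_server.

Centralizing the helper removes the duplicated handler setup between the Gunicorn config and the server module.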

View File

@@ -6,7 +6,6 @@ from fastapi import (
     FastAPI,
     Depends,
 )
-from fastapi.responses import FileResponse
 import asyncio
 import os
 import logging
@@ -331,7 +330,6 @@ def create_app(args):
             "similarity_threshold": 0.95,
             "use_llm_check": False,
         },
-        log_level=args.log_level,
         namespace_prefix=args.namespace_prefix,
         auto_manage_storages_states=False,
     )
@@ -361,7 +359,6 @@ def create_app(args):
             "similarity_threshold": 0.95,
             "use_llm_check": False,
         },
-        log_level=args.log_level,
         namespace_prefix=args.namespace_prefix,
         auto_manage_storages_states=False,
     )
@@ -412,10 +409,6 @@ def create_app(args):
             name="webui",
         )

-        @app.get("/webui/")
-        async def webui_root():
-            return FileResponse(static_dir / "index.html")
-
     return app
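The dedicated /webui/ route (and the FileResponse import removed above) goes away, presumably because the static mount can serve index.html on its own. A minimal sketch of that pattern, assuming the mount is created with html=True (directory path illustrative):

    from pathlib import Path

    from fastapi import FastAPI
    from fastapi.staticfiles import StaticFiles

    app = FastAPI()
    static_dir = Path(__file__).parent / "webui"  # illustrative location

    # html=True makes StaticFiles answer /webui/ with index.html directly,
    # so no separate FileResponse route is needed.
    app.mount("/webui", StaticFiles(directory=static_dir, html=True), name="webui")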
@@ -439,6 +432,9 @@ def configure_logging():
     log_dir = os.getenv("LOG_DIR", os.getcwd())
     log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))

+    print(f"\nLightRAG log file: {log_file_path}\n")
+    os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
+
     # Get log file max size and backup count from environment variables
     log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
     log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
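All three logging knobs are plain environment variables, so a deployment can override them before the server process starts. A small usage sketch with illustrative values:

    import os

    # Directory where lightrag.log is written (defaults to the current working directory)
    os.environ["LOG_DIR"] = "/var/log/lightrag"
    # Rotate the log at 20 MB instead of the 10 MB default
    os.environ["LOG_MAX_BYTES"] = str(20 * 1024 * 1024)
    # Keep three rotated backups instead of five
    os.environ["LOG_BACKUP_COUNT"] = "3"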

View File

@@ -215,9 +215,29 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
                 | ".scss"
                 | ".less"
             ):
-                content = file.decode("utf-8")
+                try:
+                    # Try to decode as UTF-8
+                    content = file.decode("utf-8")
+
+                    # Validate content
+                    if not content or len(content.strip()) == 0:
+                        logger.error(f"Empty content in file: {file_path.name}")
+                        return False
+
+                    # Check if content looks like binary data string representation
+                    if content.startswith("b'") or content.startswith('b"'):
+                        logger.error(
+                            f"File {file_path.name} appears to contain binary data representation instead of text"
+                        )
+                        return False
+
+                except UnicodeDecodeError:
+                    logger.error(
+                        f"File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing."
+                    )
+                    return False
             case ".pdf":
-                if not pm.is_installed("pypdf2"):
+                if not pm.is_installed("pypdf2"):  # type: ignore
                     pm.install("pypdf2")
                 from PyPDF2 import PdfReader  # type: ignore
                 from io import BytesIO
@@ -227,18 +247,18 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
                 for page in reader.pages:
                     content += page.extract_text() + "\n"
             case ".docx":
-                if not pm.is_installed("docx"):
+                if not pm.is_installed("python-docx"):  # type: ignore
                     pm.install("docx")
-                from docx import Document
+                from docx import Document  # type: ignore
                 from io import BytesIO

                 docx_file = BytesIO(file)
                 doc = Document(docx_file)
                 content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
             case ".pptx":
-                if not pm.is_installed("pptx"):
+                if not pm.is_installed("python-pptx"):  # type: ignore
                     pm.install("pptx")
-                from pptx import Presentation
+                from pptx import Presentation  # type: ignore
                 from io import BytesIO

                 pptx_file = BytesIO(file)
@@ -248,9 +268,9 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
                         if hasattr(shape, "text"):
                             content += shape.text + "\n"
             case ".xlsx":
-                if not pm.is_installed("openpyxl"):
+                if not pm.is_installed("openpyxl"):  # type: ignore
                     pm.install("openpyxl")
-                from openpyxl import load_workbook
+                from openpyxl import load_workbook  # type: ignore
                 from io import BytesIO

                 xlsx_file = BytesIO(file)
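For illustration, here are the new decode-and-validate steps from the text-file branch pulled out into a standalone helper; the name decode_text_file is hypothetical, and the route keeps this logic inline:

    import logging

    logger = logging.getLogger("lightrag")

    def decode_text_file(raw: bytes, name: str) -> str | None:
        """Decode bytes as UTF-8, rejecting empty or binary-looking content."""
        try:
            content = raw.decode("utf-8")
        except UnicodeDecodeError:
            logger.error(f"File {name} is not valid UTF-8 encoded text.")
            return None

        # Reject empty or whitespace-only files
        if not content.strip():
            logger.error(f"Empty content in file: {name}")
            return None

        # Reject text that is actually the repr() of a bytes object, e.g. b'...'
        if content.startswith(("b'", 'b"')):
            logger.error(f"File {name} contains a binary data representation, not text")
            return None

        return content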

View File

@@ -16,12 +16,32 @@ def create_graph_routes(rag, api_key: Optional[str] = None):
     @router.get("/graph/label/list", dependencies=[Depends(optional_api_key)])
     async def get_graph_labels():
-        """Get all graph labels"""
+        """
+        Get all graph labels
+
+        Returns:
+            List[str]: List of graph labels
+        """
         return await rag.get_graph_labels()

     @router.get("/graphs", dependencies=[Depends(optional_api_key)])
     async def get_knowledge_graph(label: str, max_depth: int = 3):
-        """Get knowledge graph for a specific label"""
+        """
+        Retrieve a connected subgraph of nodes whose labels include the specified label.
+        The maximum number of nodes is constrained by the environment variable
+        `MAX_GRAPH_NODES` (default: 1000). When the limit is exceeded, nodes are
+        prioritized as follows:
+            1. Nodes matching the label take precedence
+            2. Followed by nodes directly connected to the matching nodes
+            3. Finally, nodes with the highest degree
+
+        Args:
+            label (str): Label to get knowledge graph for
+            max_depth (int, optional): Maximum depth of graph. Defaults to 3.
+
+        Returns:
+            Dict[str, List[str]]: Knowledge graph for label
+        """
         return await rag.get_knowledge_graph(node_label=label, max_depth=max_depth)

     return router
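A short usage sketch for both endpoints, assuming a LightRAG server on localhost:9621 and an X-API-Key header (port and header name are assumptions; adjust to your deployment):

    import httpx

    BASE = "http://localhost:9621"
    headers = {"X-API-Key": "your-api-key"}  # only needed when an API key is configured

    with httpx.Client(base_url=BASE, headers=headers) as client:
        # List every label present in the knowledge graph
        labels = client.get("/graph/label/list").json()

        # Fetch the subgraph around the first label, capped at depth 2
        graph = client.get("/graphs", params={"label": labels[0], "max_depth": 2}).json()
        print(graph)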