Merge branch 'HKUDS:main' into main

Authored by Saifeddine ALOUI on 2025-03-04 08:27:53 +01:00, committed by GitHub
23 changed files with 563 additions and 177 deletions

View File

@@ -2,12 +2,15 @@
 import os
 import logging
 from lightrag.kg.shared_storage import finalize_share_data
-from lightrag.api.lightrag_server import LightragPathFilter
+from lightrag.utils import setup_logger

 # Get log directory path from environment variable
 log_dir = os.getenv("LOG_DIR", os.getcwd())
 log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))
+
+# Ensure log directory exists
+os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

 # Get log file max size and backup count from environment variables
 log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
 log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
@@ -108,6 +111,9 @@ def on_starting(server):
     except ImportError:
         print("psutil not installed, skipping memory usage reporting")

+    # Log the location of the LightRAG log file
+    print(f"LightRAG log file: {log_file_path}\n")
+
     print("Gunicorn initialization complete, forking workers...\n")
@@ -134,51 +140,18 @@ def post_fork(server, worker):
     Executed after a worker has been forked.
     This is a good place to set up worker-specific configurations.
     """
-    # Configure formatters
-    detailed_formatter = logging.Formatter(
-        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-    )
-    simple_formatter = logging.Formatter("%(levelname)s: %(message)s")
-
-    def setup_logger(logger_name: str, level: str = "INFO", add_filter: bool = False):
-        """Set up a logger with console and file handlers"""
-        logger_instance = logging.getLogger(logger_name)
-        logger_instance.setLevel(level)
-        logger_instance.handlers = []  # Clear existing handlers
-        logger_instance.propagate = False
-
-        # Add console handler
-        console_handler = logging.StreamHandler()
-        console_handler.setFormatter(simple_formatter)
-        console_handler.setLevel(level)
-        logger_instance.addHandler(console_handler)
-
-        # Add file handler
-        file_handler = logging.handlers.RotatingFileHandler(
-            filename=log_file_path,
-            maxBytes=log_max_bytes,
-            backupCount=log_backup_count,
-            encoding="utf-8",
-        )
-        file_handler.setFormatter(detailed_formatter)
-        file_handler.setLevel(level)
-        logger_instance.addHandler(file_handler)
-
-        # Add path filter if requested
-        if add_filter:
-            path_filter = LightragPathFilter()
-            logger_instance.addFilter(path_filter)
-
     # Set up main loggers
     log_level = loglevel.upper() if loglevel else "INFO"
-    setup_logger("uvicorn", log_level)
-    setup_logger("uvicorn.access", log_level, add_filter=True)
-    setup_logger("lightrag", log_level, add_filter=True)
+    setup_logger("uvicorn", log_level, add_filter=False, log_file_path=log_file_path)
+    setup_logger(
+        "uvicorn.access", log_level, add_filter=True, log_file_path=log_file_path
+    )
+    setup_logger("lightrag", log_level, add_filter=True, log_file_path=log_file_path)

     # Set up lightrag submodule loggers
     for name in logging.root.manager.loggerDict:
         if name.startswith("lightrag."):
-            setup_logger(name, log_level, add_filter=True)
+            setup_logger(name, log_level, add_filter=True, log_file_path=log_file_path)

     # Disable uvicorn.error logger
     uvicorn_error_logger = logging.getLogger("uvicorn.error")
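With the worker-local helper gone, post_fork relies on the setup_logger imported from lightrag.utils. Judging from the deleted body and the new call sites, the shared helper presumably looks something like this sketch (signature and rotation-parameter handling inferred, not confirmed):

    import logging
    import logging.handlers

    def setup_logger(
        logger_name: str,
        level: str = "INFO",
        add_filter: bool = False,
        log_file_path: str = "lightrag.log",
        log_max_bytes: int = 10 * 1024 * 1024,
        log_backup_count: int = 5,
    ):
        """Attach a console handler and a rotating file handler to one logger."""
        logger_instance = logging.getLogger(logger_name)
        logger_instance.setLevel(level)
        logger_instance.handlers = []  # clear existing handlers
        logger_instance.propagate = False

        # Console handler with a short format
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
        console_handler.setLevel(level)
        logger_instance.addHandler(console_handler)

        # Size-rotated file handler with a detailed format
        file_handler = logging.handlers.RotatingFileHandler(
            filename=log_file_path,
            maxBytes=log_max_bytes,
            backupCount=log_backup_count,
            encoding="utf-8",
        )
        file_handler.setFormatter(
            logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        )
        file_handler.setLevel(level)
        logger_instance.addHandler(file_handler)

        # The real helper also attaches LightragPathFilter when add_filter=True;
        # omitted here because that class lives in lightrag.api.lightrag_server.

Centralizing the helper removes the duplicated handler setup between the Gunicorn config and the server module.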

View File

@@ -6,7 +6,6 @@ from fastapi import (
     FastAPI,
     Depends,
 )
-from fastapi.responses import FileResponse
 import asyncio
 import os
 import logging
@@ -331,7 +330,6 @@ def create_app(args):
             "similarity_threshold": 0.95,
             "use_llm_check": False,
         },
-        log_level=args.log_level,
         namespace_prefix=args.namespace_prefix,
         auto_manage_storages_states=False,
     )
@@ -361,7 +359,6 @@ def create_app(args):
             "similarity_threshold": 0.95,
             "use_llm_check": False,
         },
-        log_level=args.log_level,
         namespace_prefix=args.namespace_prefix,
         auto_manage_storages_states=False,
     )
@@ -412,10 +409,6 @@ def create_app(args):
             name="webui",
         )

-        @app.get("/webui/")
-        async def webui_root():
-            return FileResponse(static_dir / "index.html")
-
     return app
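The dedicated /webui/ route (and the FileResponse import removed above) goes away, presumably because the static mount can serve index.html on its own. A minimal sketch of that pattern, assuming the mount is created with html=True (directory path illustrative):

    from pathlib import Path

    from fastapi import FastAPI
    from fastapi.staticfiles import StaticFiles

    app = FastAPI()
    static_dir = Path(__file__).parent / "webui"  # illustrative location

    # html=True makes StaticFiles answer /webui/ with index.html directly,
    # so no separate FileResponse route is needed.
    app.mount("/webui", StaticFiles(directory=static_dir, html=True), name="webui")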
@@ -439,6 +432,9 @@ def configure_logging():
     log_dir = os.getenv("LOG_DIR", os.getcwd())
     log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))

+    print(f"\nLightRAG log file: {log_file_path}\n")
+    os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
+
     # Get log file max size and backup count from environment variables
     log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
     log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
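All three logging knobs are plain environment variables, so a deployment can override them before the server process starts. A small usage sketch with illustrative values:

    import os

    # Directory where lightrag.log is written (defaults to the current working directory)
    os.environ["LOG_DIR"] = "/var/log/lightrag"
    # Rotate the log at 20 MB instead of the 10 MB default
    os.environ["LOG_MAX_BYTES"] = str(20 * 1024 * 1024)
    # Keep three rotated backups instead of five
    os.environ["LOG_BACKUP_COUNT"] = "3"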

View File

@@ -215,9 +215,29 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
                 | ".scss"
                 | ".less"
             ):
-                content = file.decode("utf-8")
+                try:
+                    # Try to decode as UTF-8
+                    content = file.decode("utf-8")
+
+                    # Validate content
+                    if not content or len(content.strip()) == 0:
+                        logger.error(f"Empty content in file: {file_path.name}")
+                        return False
+
+                    # Check if content looks like binary data string representation
+                    if content.startswith("b'") or content.startswith('b"'):
+                        logger.error(
+                            f"File {file_path.name} appears to contain binary data representation instead of text"
+                        )
+                        return False
+
+                except UnicodeDecodeError:
+                    logger.error(
+                        f"File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing."
+                    )
+                    return False
             case ".pdf":
-                if not pm.is_installed("pypdf2"):
+                if not pm.is_installed("pypdf2"):  # type: ignore
                     pm.install("pypdf2")
                 from PyPDF2 import PdfReader  # type: ignore
                 from io import BytesIO
@@ -227,18 +247,18 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
                 for page in reader.pages:
                     content += page.extract_text() + "\n"
             case ".docx":
-                if not pm.is_installed("docx"):
+                if not pm.is_installed("python-docx"):  # type: ignore
                     pm.install("docx")
-                from docx import Document
+                from docx import Document  # type: ignore
                 from io import BytesIO

                 docx_file = BytesIO(file)
                 doc = Document(docx_file)
                 content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
             case ".pptx":
-                if not pm.is_installed("pptx"):
+                if not pm.is_installed("python-pptx"):  # type: ignore
                     pm.install("pptx")
-                from pptx import Presentation
+                from pptx import Presentation  # type: ignore
                 from io import BytesIO

                 pptx_file = BytesIO(file)
@@ -248,9 +268,9 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
                         if hasattr(shape, "text"):
                             content += shape.text + "\n"
             case ".xlsx":
-                if not pm.is_installed("openpyxl"):
+                if not pm.is_installed("openpyxl"):  # type: ignore
                     pm.install("openpyxl")
-                from openpyxl import load_workbook
+                from openpyxl import load_workbook  # type: ignore
                 from io import BytesIO

                 xlsx_file = BytesIO(file)
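For illustration, here are the new decode-and-validate steps from the text-file branch pulled out into a standalone helper; the name decode_text_file is hypothetical, and the route keeps this logic inline:

    import logging

    logger = logging.getLogger("lightrag")

    def decode_text_file(raw: bytes, name: str) -> str | None:
        """Decode bytes as UTF-8, rejecting empty or binary-looking content."""
        try:
            content = raw.decode("utf-8")
        except UnicodeDecodeError:
            logger.error(f"File {name} is not valid UTF-8 encoded text.")
            return None

        # Reject empty or whitespace-only files
        if not content.strip():
            logger.error(f"Empty content in file: {name}")
            return None

        # Reject text that is actually the repr() of a bytes object, e.g. b'...'
        if content.startswith(("b'", 'b"')):
            logger.error(f"File {name} contains a binary data representation, not text")
            return None

        return content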

View File

@@ -16,12 +16,32 @@ def create_graph_routes(rag, api_key: Optional[str] = None):
     @router.get("/graph/label/list", dependencies=[Depends(optional_api_key)])
     async def get_graph_labels():
-        """Get all graph labels"""
+        """
+        Get all graph labels
+
+        Returns:
+            List[str]: List of graph labels
+        """
         return await rag.get_graph_labels()

     @router.get("/graphs", dependencies=[Depends(optional_api_key)])
     async def get_knowledge_graph(label: str, max_depth: int = 3):
-        """Get knowledge graph for a specific label"""
+        """
+        Retrieve a connected subgraph of nodes whose labels include the specified label.
+        The maximum number of nodes is constrained by the environment variable
+        `MAX_GRAPH_NODES` (default: 1000). When the limit is exceeded, nodes are
+        prioritized as follows:
+            1. Nodes matching the label take precedence
+            2. Followed by nodes directly connected to the matching nodes
+            3. Finally, nodes with the highest degree
+
+        Args:
+            label (str): Label to get knowledge graph for
+            max_depth (int, optional): Maximum depth of graph. Defaults to 3.
+
+        Returns:
+            Dict[str, List[str]]: Knowledge graph for label
+        """
         return await rag.get_knowledge_graph(node_label=label, max_depth=max_depth)

     return router
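A short usage sketch for both endpoints, assuming a LightRAG server on localhost:9621 and an X-API-Key header (port and header name are assumptions; adjust to your deployment):

    import httpx

    BASE = "http://localhost:9621"
    headers = {"X-API-Key": "your-api-key"}  # only needed when an API key is configured

    with httpx.Client(base_url=BASE, headers=headers) as client:
        # List every label present in the knowledge graph
        labels = client.get("/graph/label/list").json()

        # Fetch the subgraph around the first label, capped at depth 2
        graph = client.get("/graphs", params={"label": labels[0], "max_depth": 2}).json()
        print(graph)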