Merge branch 'main' into neo4j-add-min-degree

This commit is contained in:
zrguo
2025-03-10 22:24:06 +08:00
committed by GitHub
4 changed files with 92 additions and 42 deletions

View File

@@ -16,7 +16,11 @@ from pydantic import BaseModel, Field, field_validator
from lightrag import LightRAG from lightrag import LightRAG
from lightrag.base import DocProcessingStatus, DocStatus from lightrag.base import DocProcessingStatus, DocStatus
from ..utils_api import get_api_key_dependency, get_auth_dependency from lightrag.api.utils_api import (
get_api_key_dependency,
global_args,
get_auth_dependency,
)
router = APIRouter( router = APIRouter(
prefix="/documents", prefix="/documents",
@@ -240,6 +244,15 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
) )
return False return False
case ".pdf": case ".pdf":
if global_args["main_args"].document_loading_engine == "DOCLING":
if not pm.is_installed("docling"): # type: ignore
pm.install("docling")
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("pypdf2"): # type: ignore if not pm.is_installed("pypdf2"): # type: ignore
pm.install("pypdf2") pm.install("pypdf2")
from PyPDF2 import PdfReader # type: ignore from PyPDF2 import PdfReader # type: ignore
@@ -250,6 +263,15 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
for page in reader.pages: for page in reader.pages:
content += page.extract_text() + "\n" content += page.extract_text() + "\n"
case ".docx": case ".docx":
if global_args["main_args"].document_loading_engine == "DOCLING":
if not pm.is_installed("docling"): # type: ignore
pm.install("docling")
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("python-docx"): # type: ignore if not pm.is_installed("python-docx"): # type: ignore
pm.install("docx") pm.install("docx")
from docx import Document # type: ignore from docx import Document # type: ignore
@@ -257,8 +279,19 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
docx_file = BytesIO(file) docx_file = BytesIO(file)
doc = Document(docx_file) doc = Document(docx_file)
content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) content = "\n".join(
[paragraph.text for paragraph in doc.paragraphs]
)
case ".pptx": case ".pptx":
if global_args["main_args"].document_loading_engine == "DOCLING":
if not pm.is_installed("docling"): # type: ignore
pm.install("docling")
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("python-pptx"): # type: ignore if not pm.is_installed("python-pptx"): # type: ignore
pm.install("pptx") pm.install("pptx")
from pptx import Presentation # type: ignore from pptx import Presentation # type: ignore
@@ -271,6 +304,15 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
if hasattr(shape, "text"): if hasattr(shape, "text"):
content += shape.text + "\n" content += shape.text + "\n"
case ".xlsx": case ".xlsx":
if global_args["main_args"].document_loading_engine == "DOCLING":
if not pm.is_installed("docling"): # type: ignore
pm.install("docling")
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("openpyxl"): # type: ignore if not pm.is_installed("openpyxl"): # type: ignore
pm.install("openpyxl") pm.install("openpyxl")
from openpyxl import load_workbook # type: ignore from openpyxl import load_workbook # type: ignore
@@ -283,7 +325,8 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
for row in sheet.iter_rows(values_only=True): for row in sheet.iter_rows(values_only=True):
content += ( content += (
"\t".join( "\t".join(
str(cell) if cell is not None else "" for cell in row str(cell) if cell is not None else ""
for cell in row
) )
+ "\n" + "\n"
) )

View File

@@ -11,7 +11,7 @@ import asyncio
from ascii_colors import trace_exception from ascii_colors import trace_exception
from lightrag import LightRAG, QueryParam from lightrag import LightRAG, QueryParam
from lightrag.utils import encode_string_by_tiktoken from lightrag.utils import encode_string_by_tiktoken
from ..utils_api import ollama_server_infos from lightrag.api.utils_api import ollama_server_infos
# query mode according to query prefix (bypass is not LightRAG query mode) # query mode according to query prefix (bypass is not LightRAG query mode)

View File

@@ -18,6 +18,8 @@ from .auth import auth_handler
# Load environment variables # Load environment variables
load_dotenv(override=True) load_dotenv(override=True)
global_args = {"main_args": None}
class OllamaServerInfos: class OllamaServerInfos:
# Constants for emulated Ollama model information # Constants for emulated Ollama model information
@@ -365,8 +367,12 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace:
"ENABLE_LLM_CACHE_FOR_EXTRACT", False, bool "ENABLE_LLM_CACHE_FOR_EXTRACT", False, bool
) )
# Select Document loading tool (DOCLING, DEFAULT)
args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name
global_args["main_args"] = args
return args return args

View File

@@ -2084,6 +2084,7 @@ class LightRAG:
cast(StorageNameSpace, storage_inst).index_done_callback() cast(StorageNameSpace, storage_inst).index_done_callback()
for storage_inst in [ # type: ignore for storage_inst in [ # type: ignore
self.entities_vdb, self.entities_vdb,
self.relationships_vdb,
self.chunk_entity_relation_graph, self.chunk_entity_relation_graph,
] ]
] ]