Merge branch 'main' into neo4j-add-min-degree

This commit is contained in:
zrguo
2025-03-10 22:24:06 +08:00
committed by GitHub
4 changed files with 92 additions and 42 deletions

View File

@@ -16,7 +16,11 @@ from pydantic import BaseModel, Field, field_validator
from lightrag import LightRAG from lightrag import LightRAG
from lightrag.base import DocProcessingStatus, DocStatus from lightrag.base import DocProcessingStatus, DocStatus
from ..utils_api import get_api_key_dependency, get_auth_dependency from lightrag.api.utils_api import (
get_api_key_dependency,
global_args,
get_auth_dependency,
)
router = APIRouter( router = APIRouter(
prefix="/documents", prefix="/documents",
@@ -240,6 +244,15 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
) )
return False return False
case ".pdf": case ".pdf":
if global_args["main_args"].document_loading_engine == "DOCLING":
if not pm.is_installed("docling"): # type: ignore
pm.install("docling")
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("pypdf2"): # type: ignore if not pm.is_installed("pypdf2"): # type: ignore
pm.install("pypdf2") pm.install("pypdf2")
from PyPDF2 import PdfReader # type: ignore from PyPDF2 import PdfReader # type: ignore
@@ -250,6 +263,15 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
for page in reader.pages: for page in reader.pages:
content += page.extract_text() + "\n" content += page.extract_text() + "\n"
case ".docx": case ".docx":
if global_args["main_args"].document_loading_engine == "DOCLING":
if not pm.is_installed("docling"): # type: ignore
pm.install("docling")
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("python-docx"): # type: ignore if not pm.is_installed("python-docx"): # type: ignore
pm.install("docx") pm.install("docx")
from docx import Document # type: ignore from docx import Document # type: ignore
@@ -257,8 +279,19 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
docx_file = BytesIO(file) docx_file = BytesIO(file)
doc = Document(docx_file) doc = Document(docx_file)
content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) content = "\n".join(
[paragraph.text for paragraph in doc.paragraphs]
)
case ".pptx": case ".pptx":
if global_args["main_args"].document_loading_engine == "DOCLING":
if not pm.is_installed("docling"): # type: ignore
pm.install("docling")
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("python-pptx"): # type: ignore if not pm.is_installed("python-pptx"): # type: ignore
pm.install("pptx") pm.install("pptx")
from pptx import Presentation # type: ignore from pptx import Presentation # type: ignore
@@ -271,6 +304,15 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
if hasattr(shape, "text"): if hasattr(shape, "text"):
content += shape.text + "\n" content += shape.text + "\n"
case ".xlsx": case ".xlsx":
if global_args["main_args"].document_loading_engine == "DOCLING":
if not pm.is_installed("docling"): # type: ignore
pm.install("docling")
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("openpyxl"): # type: ignore if not pm.is_installed("openpyxl"): # type: ignore
pm.install("openpyxl") pm.install("openpyxl")
from openpyxl import load_workbook # type: ignore from openpyxl import load_workbook # type: ignore
@@ -283,7 +325,8 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
for row in sheet.iter_rows(values_only=True): for row in sheet.iter_rows(values_only=True):
content += ( content += (
"\t".join( "\t".join(
str(cell) if cell is not None else "" for cell in row str(cell) if cell is not None else ""
for cell in row
) )
+ "\n" + "\n"
) )

View File

@@ -11,7 +11,7 @@ import asyncio
from ascii_colors import trace_exception from ascii_colors import trace_exception
from lightrag import LightRAG, QueryParam from lightrag import LightRAG, QueryParam
from lightrag.utils import encode_string_by_tiktoken from lightrag.utils import encode_string_by_tiktoken
from ..utils_api import ollama_server_infos from lightrag.api.utils_api import ollama_server_infos
# query mode according to query prefix (bypass is not LightRAG query mode) # query mode according to query prefix (bypass is not LightRAG query mode)

View File

@@ -18,6 +18,8 @@ from .auth import auth_handler
# Load environment variables # Load environment variables
load_dotenv(override=True) load_dotenv(override=True)
global_args = {"main_args": None}
class OllamaServerInfos: class OllamaServerInfos:
# Constants for emulated Ollama model information # Constants for emulated Ollama model information
@@ -365,8 +367,12 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace:
"ENABLE_LLM_CACHE_FOR_EXTRACT", False, bool "ENABLE_LLM_CACHE_FOR_EXTRACT", False, bool
) )
# Select Document loading tool (DOCLING, DEFAULT)
args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name
global_args["main_args"] = args
return args return args

View File

@@ -2084,6 +2084,7 @@ class LightRAG:
cast(StorageNameSpace, storage_inst).index_done_callback() cast(StorageNameSpace, storage_inst).index_done_callback()
for storage_inst in [ # type: ignore for storage_inst in [ # type: ignore
self.entities_vdb, self.entities_vdb,
self.relationships_vdb,
self.chunk_entity_relation_graph, self.chunk_entity_relation_graph,
] ]
] ]