diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 3e51fa4d..c1666192 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -16,7 +16,11 @@ from pydantic import BaseModel, Field, field_validator from lightrag import LightRAG from lightrag.base import DocProcessingStatus, DocStatus -from ..utils_api import get_api_key_dependency, get_auth_dependency +from lightrag.api.utils_api import ( + get_api_key_dependency, + global_args, + get_auth_dependency, +) router = APIRouter( prefix="/documents", @@ -240,54 +244,93 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: ) return False case ".pdf": - if not pm.is_installed("pypdf2"): # type: ignore - pm.install("pypdf2") - from PyPDF2 import PdfReader # type: ignore - from io import BytesIO + if global_args["main_args"].document_loading_engine == "DOCLING": + if not pm.is_installed("docling"): # type: ignore + pm.install("docling") + from docling.document_converter import DocumentConverter - pdf_file = BytesIO(file) - reader = PdfReader(pdf_file) - for page in reader.pages: - content += page.extract_text() + "\n" + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + else: + if not pm.is_installed("pypdf2"): # type: ignore + pm.install("pypdf2") + from PyPDF2 import PdfReader # type: ignore + from io import BytesIO + + pdf_file = BytesIO(file) + reader = PdfReader(pdf_file) + for page in reader.pages: + content += page.extract_text() + "\n" case ".docx": - if not pm.is_installed("python-docx"): # type: ignore - pm.install("docx") - from docx import Document # type: ignore - from io import BytesIO + if global_args["main_args"].document_loading_engine == "DOCLING": + if not pm.is_installed("docling"): # type: ignore + pm.install("docling") + from docling.document_converter import DocumentConverter - docx_file = BytesIO(file) - doc = Document(docx_file) - content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + else: + if not pm.is_installed("python-docx"): # type: ignore + pm.install("docx") + from docx import Document # type: ignore + from io import BytesIO + + docx_file = BytesIO(file) + doc = Document(docx_file) + content = "\n".join( + [paragraph.text for paragraph in doc.paragraphs] + ) case ".pptx": - if not pm.is_installed("python-pptx"): # type: ignore - pm.install("pptx") - from pptx import Presentation # type: ignore - from io import BytesIO + if global_args["main_args"].document_loading_engine == "DOCLING": + if not pm.is_installed("docling"): # type: ignore + pm.install("docling") + from docling.document_converter import DocumentConverter - pptx_file = BytesIO(file) - prs = Presentation(pptx_file) - for slide in prs.slides: - for shape in slide.shapes: - if hasattr(shape, "text"): - content += shape.text + "\n" + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + else: + if not pm.is_installed("python-pptx"): # type: ignore + pm.install("pptx") + from pptx import Presentation # type: ignore + from io import BytesIO + + pptx_file = BytesIO(file) + prs = Presentation(pptx_file) + for slide in prs.slides: + for shape in slide.shapes: + if hasattr(shape, "text"): + content += shape.text + "\n" case ".xlsx": - if not pm.is_installed("openpyxl"): # type: ignore - pm.install("openpyxl") - from openpyxl import load_workbook # type: ignore - from io import BytesIO + if global_args["main_args"].document_loading_engine == "DOCLING": + if not pm.is_installed("docling"): # type: ignore + pm.install("docling") + from docling.document_converter import DocumentConverter - xlsx_file = BytesIO(file) - wb = load_workbook(xlsx_file) - for sheet in wb: - content += f"Sheet: {sheet.title}\n" - for row in sheet.iter_rows(values_only=True): - content += ( - "\t".join( - str(cell) if cell is not None else "" for cell in row + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + else: + if not pm.is_installed("openpyxl"): # type: ignore + pm.install("openpyxl") + from openpyxl import load_workbook # type: ignore + from io import BytesIO + + xlsx_file = BytesIO(file) + wb = load_workbook(xlsx_file) + for sheet in wb: + content += f"Sheet: {sheet.title}\n" + for row in sheet.iter_rows(values_only=True): + content += ( + "\t".join( + str(cell) if cell is not None else "" + for cell in row + ) + + "\n" ) - + "\n" - ) - content += "\n" + content += "\n" case _: logger.error( f"Unsupported file type: {file_path.name} (extension {ext})" diff --git a/lightrag/api/routers/ollama_api.py b/lightrag/api/routers/ollama_api.py index 9688d073..37d7354e 100644 --- a/lightrag/api/routers/ollama_api.py +++ b/lightrag/api/routers/ollama_api.py @@ -11,7 +11,7 @@ import asyncio from ascii_colors import trace_exception from lightrag import LightRAG, QueryParam from lightrag.utils import encode_string_by_tiktoken -from ..utils_api import ollama_server_infos +from lightrag.api.utils_api import ollama_server_infos # query mode according to query prefix (bypass is not LightRAG quer mode) diff --git a/lightrag/api/utils_api.py b/lightrag/api/utils_api.py index ffe63abd..1f75db9c 100644 --- a/lightrag/api/utils_api.py +++ b/lightrag/api/utils_api.py @@ -18,6 +18,8 @@ from .auth import auth_handler # Load environment variables load_dotenv(override=True) +global_args = {"main_args": None} + class OllamaServerInfos: # Constants for emulated Ollama model information @@ -365,8 +367,12 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace: "ENABLE_LLM_CACHE_FOR_EXTRACT", False, bool ) + # Select Document loading tool (DOCLING, DEFAULT) + args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT") + ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name + global_args["main_args"] = args return args diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 3cd379b6..3a7d340a 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -2084,6 +2084,7 @@ class LightRAG: cast(StorageNameSpace, storage_inst).index_done_callback() for storage_inst in [ # type: ignore self.entities_vdb, + self.relationships_vdb, self.chunk_entity_relation_graph, ] ]