From aaa8194423e18db4503b4b04fa5543bd63980b41 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 5 Mar 2025 15:32:39 +0100 Subject: [PATCH 1/9] Update document_routes.py --- lightrag/api/routers/document_routes.py | 114 +++++++++++++++--------- 1 file changed, 73 insertions(+), 41 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index d9dfe913..9d161f6c 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -16,7 +16,7 @@ from pydantic import BaseModel, Field, field_validator from lightrag import LightRAG from lightrag.base import DocProcessingStatus, DocStatus -from ..utils_api import get_api_key_dependency +from lightrag.api.utils_api import get_api_key_dependency, global_args router = APIRouter(prefix="/documents", tags=["documents"]) @@ -237,54 +237,86 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: ) return False case ".pdf": - if not pm.is_installed("pypdf2"): # type: ignore - pm.install("pypdf2") - from PyPDF2 import PdfReader # type: ignore - from io import BytesIO + if global_args["main_args"].document_loading_tool=="DOCLING": + if not pm.is_installed("docling"): # type: ignore + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + else: + if not pm.is_installed("pypdf2"): # type: ignore + pm.install("pypdf2") + from PyPDF2 import PdfReader # type: ignore + from io import BytesIO - pdf_file = BytesIO(file) - reader = PdfReader(pdf_file) - for page in reader.pages: - content += page.extract_text() + "\n" + pdf_file = BytesIO(file) + reader = PdfReader(pdf_file) + for page in reader.pages: + content += page.extract_text() + "\n" case ".docx": - if not pm.is_installed("python-docx"): # type: ignore - pm.install("docx") - from docx import Document # type: ignore - from 
io import BytesIO + if global_args["main_args"].document_loading_tool=="DOCLING": + if not pm.is_installed("docling"): # type: ignore + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + else: + if not pm.is_installed("python-docx"): # type: ignore + pm.install("docx") + from docx import Document # type: ignore + from io import BytesIO - docx_file = BytesIO(file) - doc = Document(docx_file) - content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) + docx_file = BytesIO(file) + doc = Document(docx_file) + content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) case ".pptx": - if not pm.is_installed("python-pptx"): # type: ignore - pm.install("pptx") - from pptx import Presentation # type: ignore - from io import BytesIO + if global_args["main_args"].document_loading_tool=="DOCLING": + if not pm.is_installed("docling"): # type: ignore + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + else: + if not pm.is_installed("python-pptx"): # type: ignore + pm.install("pptx") + from pptx import Presentation # type: ignore + from io import BytesIO - pptx_file = BytesIO(file) - prs = Presentation(pptx_file) - for slide in prs.slides: - for shape in slide.shapes: - if hasattr(shape, "text"): - content += shape.text + "\n" + pptx_file = BytesIO(file) + prs = Presentation(pptx_file) + for slide in prs.slides: + for shape in slide.shapes: + if hasattr(shape, "text"): + content += shape.text + "\n" case ".xlsx": - if not pm.is_installed("openpyxl"): # type: ignore - pm.install("openpyxl") - from openpyxl import load_workbook # type: ignore - from io import BytesIO + if global_args["main_args"].document_loading_tool=="DOCLING": + if not 
pm.is_installed("docling"): # type: ignore + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + else: + if not pm.is_installed("openpyxl"): # type: ignore + pm.install("openpyxl") + from openpyxl import load_workbook # type: ignore + from io import BytesIO - xlsx_file = BytesIO(file) - wb = load_workbook(xlsx_file) - for sheet in wb: - content += f"Sheet: {sheet.title}\n" - for row in sheet.iter_rows(values_only=True): - content += ( - "\t".join( - str(cell) if cell is not None else "" for cell in row + xlsx_file = BytesIO(file) + wb = load_workbook(xlsx_file) + for sheet in wb: + content += f"Sheet: {sheet.title}\n" + for row in sheet.iter_rows(values_only=True): + content += ( + "\t".join( + str(cell) if cell is not None else "" for cell in row + ) + + "\n" ) - + "\n" - ) - content += "\n" + content += "\n" case _: logger.error( f"Unsupported file type: {file_path.name} (extension {ext})" From 95a6a274ca7d0588e72e76f8eb445870a128f868 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 5 Mar 2025 15:33:06 +0100 Subject: [PATCH 2/9] Update ollama_api.py --- lightrag/api/routers/ollama_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/api/routers/ollama_api.py b/lightrag/api/routers/ollama_api.py index 9688d073..37d7354e 100644 --- a/lightrag/api/routers/ollama_api.py +++ b/lightrag/api/routers/ollama_api.py @@ -11,7 +11,7 @@ import asyncio from ascii_colors import trace_exception from lightrag import LightRAG, QueryParam from lightrag.utils import encode_string_by_tiktoken -from ..utils_api import ollama_server_infos +from lightrag.api.utils_api import ollama_server_infos # query mode according to query prefix (bypass is not LightRAG quer mode) From c62422eadee4ac9f666c55fa04706ce52812fd32 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 5 Mar 2025 15:33:54 +0100 
Subject: [PATCH 3/9] Update utils_api.py --- lightrag/api/utils_api.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lightrag/api/utils_api.py b/lightrag/api/utils_api.py index ed1250d4..39b2950f 100644 --- a/lightrag/api/utils_api.py +++ b/lightrag/api/utils_api.py @@ -17,6 +17,10 @@ from starlette.status import HTTP_403_FORBIDDEN # Load environment variables load_dotenv(override=True) +global_args={ + "main_args":None +} + class OllamaServerInfos: # Constants for emulated Ollama model information @@ -340,9 +344,13 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace: # Inject chunk configuration args.chunk_size = get_env_value("CHUNK_SIZE", 1200, int) args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int) + + # Select Document loading tool + args.document_loading_tool = get_env_value("DOCUMENT_LOADING_TOOL", "DOCLING") ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name + global_args["main_args"]= args return args From 39c24f4a597c9e82e45975e322ee28156a6fb202 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 5 Mar 2025 15:36:17 +0100 Subject: [PATCH 4/9] Update utils_api.py --- lightrag/api/utils_api.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lightrag/api/utils_api.py b/lightrag/api/utils_api.py index 39b2950f..8ba4565f 100644 --- a/lightrag/api/utils_api.py +++ b/lightrag/api/utils_api.py @@ -17,9 +17,7 @@ from starlette.status import HTTP_403_FORBIDDEN # Load environment variables load_dotenv(override=True) -global_args={ - "main_args":None -} +global_args = {"main_args": None} class OllamaServerInfos: @@ -344,13 +342,13 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace: # Inject chunk configuration args.chunk_size = get_env_value("CHUNK_SIZE", 1200, int) args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int) - + # Select Document loading tool args.document_loading_tool = get_env_value("DOCUMENT_LOADING_TOOL", "DOCLING") 
ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name - global_args["main_args"]= args + global_args["main_args"] = args return args From 6e4daea056940b17f6773c59e492bd8a5eb5d308 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 5 Mar 2025 15:36:47 +0100 Subject: [PATCH 5/9] Linting --- lightrag/api/routers/document_routes.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 9d161f6c..a6830389 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -237,10 +237,11 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: ) return False case ".pdf": - if global_args["main_args"].document_loading_tool=="DOCLING": + if global_args["main_args"].document_loading_tool == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() result = converter.convert(file_path) content = result.document.export_to_markdown() @@ -255,10 +256,11 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: for page in reader.pages: content += page.extract_text() + "\n" case ".docx": - if global_args["main_args"].document_loading_tool=="DOCLING": + if global_args["main_args"].document_loading_tool == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() result = converter.convert(file_path) content = result.document.export_to_markdown() @@ -270,12 +272,15 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: docx_file = BytesIO(file) doc = Document(docx_file) - content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) + content = "\n".join( + [paragraph.text for paragraph in doc.paragraphs] + ) case 
".pptx": - if global_args["main_args"].document_loading_tool=="DOCLING": + if global_args["main_args"].document_loading_tool == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() result = converter.convert(file_path) content = result.document.export_to_markdown() @@ -292,10 +297,11 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: if hasattr(shape, "text"): content += shape.text + "\n" case ".xlsx": - if global_args["main_args"].document_loading_tool=="DOCLING": + if global_args["main_args"].document_loading_tool == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() result = converter.convert(file_path) content = result.document.export_to_markdown() @@ -312,7 +318,8 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: for row in sheet.iter_rows(values_only=True): content += ( "\t".join( - str(cell) if cell is not None else "" for cell in row + str(cell) if cell is not None else "" + for cell in row ) + "\n" ) From 00f3c6c6ddce60687d25c4c7022efb5bba1e4b5d Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Thu, 6 Mar 2025 01:11:48 +0100 Subject: [PATCH 6/9] Upgraded document loading engine --- lightrag/api/routers/document_routes.py | 8 ++++---- lightrag/api/utils_api.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index a6830389..dcb8f961 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -237,7 +237,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: ) return False case ".pdf": - if global_args["main_args"].document_loading_tool == "DOCLING": + if 
global_args["main_args"].document_loading_engine == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter @@ -256,7 +256,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: for page in reader.pages: content += page.extract_text() + "\n" case ".docx": - if global_args["main_args"].document_loading_tool == "DOCLING": + if global_args["main_args"].document_loading_engine == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter @@ -276,7 +276,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: [paragraph.text for paragraph in doc.paragraphs] ) case ".pptx": - if global_args["main_args"].document_loading_tool == "DOCLING": + if global_args["main_args"].document_loading_engine == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter @@ -297,7 +297,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: if hasattr(shape, "text"): content += shape.text + "\n" case ".xlsx": - if global_args["main_args"].document_loading_tool == "DOCLING": + if global_args["main_args"].document_loading_engine == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter diff --git a/lightrag/api/utils_api.py b/lightrag/api/utils_api.py index 8ba4565f..ae674968 100644 --- a/lightrag/api/utils_api.py +++ b/lightrag/api/utils_api.py @@ -344,7 +344,7 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace: args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int) # Select Document loading tool - args.document_loading_tool = get_env_value("DOCUMENT_LOADING_TOOL", "DOCLING") + args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DOCLING") 
ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name From 53cfb72db48500297a152301987a8bdbc88a930a Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Sat, 8 Mar 2025 23:07:51 +0100 Subject: [PATCH 7/9] linted --- lightrag/api/routers/document_routes.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 2f1d4d03..c1666192 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -16,7 +16,11 @@ from pydantic import BaseModel, Field, field_validator from lightrag import LightRAG from lightrag.base import DocProcessingStatus, DocStatus -from lightrag.api.utils_api import get_api_key_dependency, global_args +from lightrag.api.utils_api import ( + get_api_key_dependency, + global_args, + get_auth_dependency, +) router = APIRouter( prefix="/documents", From 04862033d6a0f442691572297ee0b6bdc7f30bdf Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Sun, 9 Mar 2025 13:14:39 +0100 Subject: [PATCH 8/9] Made the default mode non docling --- lightrag/api/utils_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/api/utils_api.py b/lightrag/api/utils_api.py index 55a81c61..581d5f8c 100644 --- a/lightrag/api/utils_api.py +++ b/lightrag/api/utils_api.py @@ -362,8 +362,8 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace: args.chunk_size = get_env_value("CHUNK_SIZE", 1200, int) args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int) - # Select Document loading tool - args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DOCLING") + # Select Document loading tool (DOCLING, DEFAULT) + args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT") ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name From ad13009cffe43026e432f6844fcb3d4baa39ec08 Mon Sep 17 00:00:00 2001 From: zrguo Date: Mon, 10 Mar
2025 19:07:19 +0800 Subject: [PATCH 9/9] fix edit_entity --- lightrag/lightrag.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 41216825..7fb24eee 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -2144,6 +2144,7 @@ class LightRAG: cast(StorageNameSpace, storage_inst).index_done_callback() for storage_inst in [ # type: ignore self.entities_vdb, + self.relationships_vdb, self.chunk_entity_relation_graph, ] ]