From e24a0a86e55c62711b2c8954a4160ee52f2c7c1f Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Fri, 31 Jan 2025 17:12:10 +0100 Subject: [PATCH 1/4] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index d8f5612f..31eb7183 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ pipmaster pydantic # File manipulation libraries +docling PyPDF2 python-docx python-dotenv From ef35f9a4e4533379dde094636709ff8eb0080ee1 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Sat, 1 Feb 2025 00:56:43 +0100 Subject: [PATCH 2/4] Introduced docling instead of other tools for loading files --- lightrag/api/lightrag_server.py | 116 ++++++++++++++------------------ requirements.txt | 3 - 2 files changed, 51 insertions(+), 68 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index e1b24731..cc6c4b83 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -556,7 +556,7 @@ class DocumentManager: def __init__( self, input_dir: str, - supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx"), + supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx", "xlsx"), ): self.input_dir = Path(input_dir) self.supported_extensions = supported_extensions @@ -974,37 +974,36 @@ def create_app(args): content = await f.read() case ".pdf": - if not pm.is_installed("pypdf2"): - pm.install("pypdf2") - from PyPDF2 import PdfReader + if not pm.is_installed("docling"): + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() - # PDF handling - reader = PdfReader(str(file_path)) - content = "" - for page in reader.pages: - content += page.extract_text() + "\n" case ".docx": - if not pm.is_installed("python-docx"): - pm.install("python-docx") - from docx import Document - - # Word document handling - doc = Document(file_path) - content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) + if not pm.is_installed("docling"): + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() case ".pptx": - if not pm.is_installed("pptx"): - pm.install("pptx") - from pptx import Presentation # type: ignore - - # PowerPoint handling - prs = Presentation(file_path) - content = "" - for slide in prs.slides: - for shape in slide.shapes: - if hasattr(shape, "text"): - content += shape.text + "\n" + if not pm.is_installed("docling"): + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + case ".xlsx": + if not pm.is_installed("docling"): + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() case _: raise ValueError(f"Unsupported file format: {ext}") @@ -1283,49 +1282,36 @@ def create_app(args): content = text_content.decode("utf-8") case ".pdf": - if not pm.is_installed("pypdf2"): - pm.install("pypdf2") - from PyPDF2 import PdfReader - from io import BytesIO + if not pm.is_installed("docling"): + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() - # Read PDF from memory - pdf_content = await file.read() - pdf_file = BytesIO(pdf_content) - reader = PdfReader(pdf_file) - content = "" - for page in reader.pages: - content += page.extract_text() + "\n" case ".docx": - if not pm.is_installed("python-docx"): - pm.install("python-docx") - from docx import Document - from io import BytesIO - - # Read DOCX from memory - docx_content = await file.read() - docx_file = BytesIO(docx_content) - doc = Document(docx_file) - content = "\n".join( - [paragraph.text for paragraph in doc.paragraphs] - ) + if not pm.is_installed("docling"): + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() case ".pptx": - if not pm.is_installed("pptx"): - pm.install("pptx") - from pptx import Presentation # type: ignore - from io import BytesIO - - # Read PPTX from memory - pptx_content = await file.read() - pptx_file = BytesIO(pptx_content) - prs = Presentation(pptx_file) - content = "" - for slide in prs.slides: - for shape in slide.shapes: - if hasattr(shape, "text"): - content += shape.text + "\n" - + if not pm.is_installed("docling"): + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + case ".xlsx": + if not pm.is_installed("docling"): + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() case _: raise HTTPException( status_code=400, diff --git a/requirements.txt b/requirements.txt index 31eb7183..9f9660d9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,10 +14,7 @@ pydantic # File manipulation libraries docling -PyPDF2 -python-docx python-dotenv -python-pptx setuptools tenacity From e09cb85f3743345fc88abefbf97963f34af5e144 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Sat, 1 Feb 2025 01:15:06 +0100 Subject: [PATCH 3/4] fixed linting as well as file path --- lightrag/api/lightrag_server.py | 20 +++++++++++++------- requirements.txt | 6 +++--- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index cc6c4b83..3a8a072b 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -977,15 +977,16 @@ def create_app(args): if not pm.is_installed("docling"): pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() result = converter.convert(file_path) content = result.document.export_to_markdown() - case ".docx": if not pm.is_installed("docling"): pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() result = converter.convert(file_path) content = result.document.export_to_markdown() @@ -994,6 +995,7 @@ def create_app(args): if not pm.is_installed("docling"): pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() result = converter.convert(file_path) content = result.document.export_to_markdown() @@ -1001,6 +1003,7 @@ def create_app(args): if not pm.is_installed("docling"): pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() result = converter.convert(file_path) content = result.document.export_to_markdown() @@ -1285,32 +1288,35 @@ def create_app(args): if not pm.is_installed("docling"): pm.install("docling") from docling.document_converter import DocumentConverter - converter = DocumentConverter() - result = converter.convert(file_path) - content = result.document.export_to_markdown() + converter = DocumentConverter() + result = converter.convert(file.filename) + content = result.document.export_to_markdown() case ".docx": if not pm.is_installed("docling"): pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() - result = converter.convert(file_path) + result = converter.convert(file.filename) content = result.document.export_to_markdown() case ".pptx": if not pm.is_installed("docling"): pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() - result = converter.convert(file_path) + result = converter.convert(file.filename) content = result.document.export_to_markdown() case ".xlsx": if not pm.is_installed("docling"): pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() - result = converter.convert(file_path) + result = converter.convert(file.filename) content = result.document.export_to_markdown() case _: raise HTTPException( diff --git a/requirements.txt b/requirements.txt index 9f9660d9..c5ccac23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,9 @@ accelerate aiofiles aiohttp configparser + +# File manipulation libraries +docling graspologic # database packages @@ -11,9 +14,6 @@ networkx numpy pipmaster pydantic - -# File manipulation libraries -docling python-dotenv setuptools From 3a40772d301c93576631a2482edb8b317381d408 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Sat, 1 Feb 2025 01:19:32 +0100 Subject: [PATCH 4/4] Simplified file loading --- lightrag/api/lightrag_server.py | 75 +++++++-------------------------- 1 file changed, 15 insertions(+), 60 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 3a8a072b..5e3c9585 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -973,33 +973,7 @@ def create_app(args): async with aiofiles.open(file_path, "r", encoding="utf-8") as f: content = await f.read() - case ".pdf": - if not pm.is_installed("docling"): - pm.install("docling") - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - result = converter.convert(file_path) - content = result.document.export_to_markdown() - - case ".docx": - if not pm.is_installed("docling"): - pm.install("docling") - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - result = converter.convert(file_path) - content = result.document.export_to_markdown() - - case ".pptx": - if not pm.is_installed("docling"): - pm.install("docling") - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - result = converter.convert(file_path) - content = result.document.export_to_markdown() - case ".xlsx": + case ".pdf" | ".docx" | ".pptx" | ".xlsx": if not pm.is_installed("docling"): pm.install("docling") from docling.document_converter import DocumentConverter @@ -1284,45 +1258,26 @@ def create_app(args): text_content = await file.read() content = text_content.decode("utf-8") - case ".pdf": + case ".pdf" | ".docx" | ".pptx" | ".xlsx": if not pm.is_installed("docling"): pm.install("docling") from docling.document_converter import DocumentConverter - converter = DocumentConverter() - result = converter.convert(file.filename) - content = result.document.export_to_markdown() + # Create a temporary file to save the uploaded content + temp_path = Path("temp") / file.filename + temp_path.parent.mkdir(exist_ok=True) - case ".docx": - if not pm.is_installed("docling"): - pm.install("docling") - from docling.document_converter import DocumentConverter + # Save the uploaded file + with temp_path.open("wb") as f: + f.write(await file.read()) - converter = DocumentConverter() - result = converter.convert(file.filename) - content = result.document.export_to_markdown() - - case ".pptx": - if not pm.is_installed("docling"): - pm.install("docling") - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - result = converter.convert(file.filename) - content = result.document.export_to_markdown() - case ".xlsx": - if not pm.is_installed("docling"): - pm.install("docling") - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - result = converter.convert(file.filename) - content = result.document.export_to_markdown() - case _: - raise HTTPException( - status_code=400, - detail=f"Unsupported file type. Supported types: {doc_manager.supported_extensions}", - ) + try: + converter = DocumentConverter() + result = converter.convert(str(temp_path)) + content = result.document.export_to_markdown() + finally: + # Clean up the temporary file + temp_path.unlink() # Insert content into RAG system if content: