diff --git a/examples/lightrag_zhipu_postgres_demo.py b/examples/lightrag_zhipu_postgres_demo.py index 4ed88602..f2066d09 100644 --- a/examples/lightrag_zhipu_postgres_demo.py +++ b/examples/lightrag_zhipu_postgres_demo.py @@ -6,7 +6,8 @@ from dotenv import load_dotenv from lightrag import LightRAG, QueryParam from lightrag.kg.postgres_impl import PostgreSQLDB -from lightrag.llm.zhipu import ollama_embedding, zhipu_complete +from lightrag.llm.zhipu import zhipu_complete +from lightrag.llm.ollama import ollama_embedding from lightrag.utils import EmbeddingFunc load_dotenv() diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index b1039335..f15996ce 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -557,7 +557,7 @@ class DocumentManager: def __init__( self, input_dir: str, - supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx"), + supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx"), ): self.input_dir = Path(input_dir) self.supported_extensions = supported_extensions @@ -986,38 +986,14 @@ def create_app(args): async with aiofiles.open(file_path, "r", encoding="utf-8") as f: content = await f.read() - case ".pdf": - if not pm.is_installed("pypdf2"): - pm.install("pypdf2") - from PyPDF2 import PdfReader + case ".pdf" | ".docx" | ".pptx" | ".xlsx": + if not pm.is_installed("docling"): + pm.install("docling") + from docling.document_converter import DocumentConverter - # PDF handling - reader = PdfReader(str(file_path)) - content = "" - for page in reader.pages: - content += page.extract_text() + "\n" - - case ".docx": - if not pm.is_installed("python-docx"): - pm.install("python-docx") - from docx import Document - - # Word document handling - doc = Document(file_path) - content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) - - case ".pptx": - if not pm.is_installed("pptx"): - pm.install("pptx") - from pptx import Presentation # type: ignore - - # 
PowerPoint handling - prs = Presentation(file_path) - content = "" - for slide in prs.slides: - for shape in slide.shapes: - if hasattr(shape, "text"): - content += shape.text + "\n" + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() case _: raise ValueError(f"Unsupported file format: {ext}") @@ -1295,55 +1271,26 @@ def create_app(args): text_content = await file.read() content = text_content.decode("utf-8") - case ".pdf": - if not pm.is_installed("pypdf2"): - pm.install("pypdf2") - from PyPDF2 import PdfReader - from io import BytesIO + case ".pdf" | ".docx" | ".pptx" | ".xlsx": + if not pm.is_installed("docling"): + pm.install("docling") + from docling.document_converter import DocumentConverter - # Read PDF from memory - pdf_content = await file.read() - pdf_file = BytesIO(pdf_content) - reader = PdfReader(pdf_file) - content = "" - for page in reader.pages: - content += page.extract_text() + "\n" + # Create a temporary file to save the uploaded content + temp_path = Path("temp") / file.filename + temp_path.parent.mkdir(exist_ok=True) - case ".docx": - if not pm.is_installed("python-docx"): - pm.install("python-docx") - from docx import Document - from io import BytesIO + # Save the uploaded file + with temp_path.open("wb") as f: + f.write(await file.read()) - # Read DOCX from memory - docx_content = await file.read() - docx_file = BytesIO(docx_content) - doc = Document(docx_file) - content = "\n".join( - [paragraph.text for paragraph in doc.paragraphs] - ) - - case ".pptx": - if not pm.is_installed("pptx"): - pm.install("pptx") - from pptx import Presentation # type: ignore - from io import BytesIO - - # Read PPTX from memory - pptx_content = await file.read() - pptx_file = BytesIO(pptx_content) - prs = Presentation(pptx_file) - content = "" - for slide in prs.slides: - for shape in slide.shapes: - if hasattr(shape, "text"): - content += shape.text + "\n" - - case _: - raise HTTPException( 
- status_code=400, - detail=f"Unsupported file type. Supported types: {doc_manager.supported_extensions}", - ) + try: + converter = DocumentConverter() + result = converter.convert(str(temp_path)) + content = result.document.export_to_markdown() + finally: + # Clean up the temporary file + temp_path.unlink() # Insert content into RAG system if content: diff --git a/requirements.txt b/requirements.txt index d8f5612f..c5ccac23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,9 @@ accelerate aiofiles aiohttp configparser + +# File manipulation libraries +docling graspologic # database packages @@ -11,12 +14,7 @@ networkx numpy pipmaster pydantic - -# File manipulation libraries -PyPDF2 -python-docx python-dotenv -python-pptx setuptools tenacity