Merge pull request #689 from ParisNeo/main

Use docling for enhanced file loading
This commit is contained in:
zrguo
2025-02-01 21:39:45 +08:00
committed by GitHub
2 changed files with 28 additions and 83 deletions

View File

@@ -556,7 +556,7 @@ class DocumentManager:
def __init__( def __init__(
self, self,
input_dir: str, input_dir: str,
supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx"), supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx", "xlsx"),
): ):
self.input_dir = Path(input_dir) self.input_dir = Path(input_dir)
self.supported_extensions = supported_extensions self.supported_extensions = supported_extensions
@@ -973,38 +973,14 @@ def create_app(args):
async with aiofiles.open(file_path, "r", encoding="utf-8") as f: async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
content = await f.read() content = await f.read()
case ".pdf": case ".pdf" | ".docx" | ".pptx" | ".xlsx":
if not pm.is_installed("pypdf2"): if not pm.is_installed("docling"):
pm.install("pypdf2") pm.install("docling")
from PyPDF2 import PdfReader from docling.document_converter import DocumentConverter
# PDF handling converter = DocumentConverter()
reader = PdfReader(str(file_path)) result = converter.convert(file_path)
content = "" content = result.document.export_to_markdown()
for page in reader.pages:
content += page.extract_text() + "\n"
case ".docx":
if not pm.is_installed("python-docx"):
pm.install("python-docx")
from docx import Document
# Word document handling
doc = Document(file_path)
content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
case ".pptx":
if not pm.is_installed("pptx"):
pm.install("pptx")
from pptx import Presentation # type: ignore
# PowerPoint handling
prs = Presentation(file_path)
content = ""
for slide in prs.slides:
for shape in slide.shapes:
if hasattr(shape, "text"):
content += shape.text + "\n"
case _: case _:
raise ValueError(f"Unsupported file format: {ext}") raise ValueError(f"Unsupported file format: {ext}")
@@ -1282,55 +1258,26 @@ def create_app(args):
text_content = await file.read() text_content = await file.read()
content = text_content.decode("utf-8") content = text_content.decode("utf-8")
case ".pdf": case ".pdf" | ".docx" | ".pptx" | ".xlsx":
if not pm.is_installed("pypdf2"): if not pm.is_installed("docling"):
pm.install("pypdf2") pm.install("docling")
from PyPDF2 import PdfReader from docling.document_converter import DocumentConverter
from io import BytesIO
# Read PDF from memory # Create a temporary file to save the uploaded content
pdf_content = await file.read() temp_path = Path("temp") / file.filename
pdf_file = BytesIO(pdf_content) temp_path.parent.mkdir(exist_ok=True)
reader = PdfReader(pdf_file)
content = ""
for page in reader.pages:
content += page.extract_text() + "\n"
case ".docx": # Save the uploaded file
if not pm.is_installed("python-docx"): with temp_path.open("wb") as f:
pm.install("python-docx") f.write(await file.read())
from docx import Document
from io import BytesIO
# Read DOCX from memory try:
docx_content = await file.read() converter = DocumentConverter()
docx_file = BytesIO(docx_content) result = converter.convert(str(temp_path))
doc = Document(docx_file) content = result.document.export_to_markdown()
content = "\n".join( finally:
[paragraph.text for paragraph in doc.paragraphs] # Clean up the temporary file
) temp_path.unlink()
case ".pptx":
if not pm.is_installed("pptx"):
pm.install("pptx")
from pptx import Presentation # type: ignore
from io import BytesIO
# Read PPTX from memory
pptx_content = await file.read()
pptx_file = BytesIO(pptx_content)
prs = Presentation(pptx_file)
content = ""
for slide in prs.slides:
for shape in slide.shapes:
if hasattr(shape, "text"):
content += shape.text + "\n"
case _:
raise HTTPException(
status_code=400,
detail=f"Unsupported file type. Supported types: {doc_manager.supported_extensions}",
)
# Insert content into RAG system # Insert content into RAG system
if content: if content:

View File

@@ -2,6 +2,9 @@ accelerate
aiofiles aiofiles
aiohttp aiohttp
configparser configparser
# File manipulation libraries
docling
graspologic graspologic
# database packages # database packages
@@ -11,12 +14,7 @@ networkx
numpy numpy
pipmaster pipmaster
pydantic pydantic
# File manipulation libraries
PyPDF2
python-docx
python-dotenv python-dotenv
python-pptx
setuptools setuptools
tenacity tenacity