Use docling instead of PyPDF2, python-docx and python-pptx for loading files, and add .xlsx support

This commit is contained in:
Saifeddine ALOUI
2025-02-01 00:56:43 +01:00
parent e24a0a86e5
commit ef35f9a4e4
2 changed files with 51 additions and 68 deletions

View File

@@ -556,7 +556,7 @@ class DocumentManager:
def __init__( def __init__(
self, self,
input_dir: str, input_dir: str,
supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx"), supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx", "xlsx"),
): ):
self.input_dir = Path(input_dir) self.input_dir = Path(input_dir)
self.supported_extensions = supported_extensions self.supported_extensions = supported_extensions
@@ -974,37 +974,36 @@ def create_app(args):
content = await f.read() content = await f.read()
case ".pdf": case ".pdf":
if not pm.is_installed("pypdf2"): if not pm.is_installed("docling"):
pm.install("pypdf2") pm.install("docling")
from PyPDF2 import PdfReader from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
# PDF handling
reader = PdfReader(str(file_path))
content = ""
for page in reader.pages:
content += page.extract_text() + "\n"
case ".docx": case ".docx":
if not pm.is_installed("python-docx"): if not pm.is_installed("docling"):
pm.install("python-docx") pm.install("docling")
from docx import Document from docling.document_converter import DocumentConverter
converter = DocumentConverter()
# Word document handling result = converter.convert(file_path)
doc = Document(file_path) content = result.document.export_to_markdown()
content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
case ".pptx": case ".pptx":
if not pm.is_installed("pptx"): if not pm.is_installed("docling"):
pm.install("pptx") pm.install("docling")
from pptx import Presentation # type: ignore from docling.document_converter import DocumentConverter
converter = DocumentConverter()
# PowerPoint handling result = converter.convert(file_path)
prs = Presentation(file_path) content = result.document.export_to_markdown()
content = "" case ".xlsx":
for slide in prs.slides: if not pm.is_installed("docling"):
for shape in slide.shapes: pm.install("docling")
if hasattr(shape, "text"): from docling.document_converter import DocumentConverter
content += shape.text + "\n" converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
case _: case _:
raise ValueError(f"Unsupported file format: {ext}") raise ValueError(f"Unsupported file format: {ext}")
@@ -1283,49 +1282,36 @@ def create_app(args):
content = text_content.decode("utf-8") content = text_content.decode("utf-8")
case ".pdf": case ".pdf":
if not pm.is_installed("pypdf2"): if not pm.is_installed("docling"):
pm.install("pypdf2") pm.install("docling")
from PyPDF2 import PdfReader from docling.document_converter import DocumentConverter
from io import BytesIO converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
# Read PDF from memory
pdf_content = await file.read()
pdf_file = BytesIO(pdf_content)
reader = PdfReader(pdf_file)
content = ""
for page in reader.pages:
content += page.extract_text() + "\n"
case ".docx": case ".docx":
if not pm.is_installed("python-docx"): if not pm.is_installed("docling"):
pm.install("python-docx") pm.install("docling")
from docx import Document from docling.document_converter import DocumentConverter
from io import BytesIO converter = DocumentConverter()
result = converter.convert(file_path)
# Read DOCX from memory content = result.document.export_to_markdown()
docx_content = await file.read()
docx_file = BytesIO(docx_content)
doc = Document(docx_file)
content = "\n".join(
[paragraph.text for paragraph in doc.paragraphs]
)
case ".pptx": case ".pptx":
if not pm.is_installed("pptx"): if not pm.is_installed("docling"):
pm.install("pptx") pm.install("docling")
from pptx import Presentation # type: ignore from docling.document_converter import DocumentConverter
from io import BytesIO converter = DocumentConverter()
result = converter.convert(file_path)
# Read PPTX from memory content = result.document.export_to_markdown()
pptx_content = await file.read() case ".xlsx":
pptx_file = BytesIO(pptx_content) if not pm.is_installed("docling"):
prs = Presentation(pptx_file) pm.install("docling")
content = "" from docling.document_converter import DocumentConverter
for slide in prs.slides: converter = DocumentConverter()
for shape in slide.shapes: result = converter.convert(file_path)
if hasattr(shape, "text"): content = result.document.export_to_markdown()
content += shape.text + "\n"
case _: case _:
raise HTTPException( raise HTTPException(
status_code=400, status_code=400,

View File

@@ -14,10 +14,7 @@ pydantic
# File manipulation libraries # File manipulation libraries
docling docling
PyPDF2
python-docx
python-dotenv python-dotenv
python-pptx
setuptools setuptools
tenacity tenacity