Simplified file loading

This commit is contained in:
Saifeddine ALOUI
2025-02-01 01:19:32 +01:00
parent e09cb85f37
commit 3a40772d30

View File

@@ -973,33 +973,7 @@ def create_app(args):
async with aiofiles.open(file_path, "r", encoding="utf-8") as f: async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
content = await f.read() content = await f.read()
case ".pdf": case ".pdf" | ".docx" | ".pptx" | ".xlsx":
if not pm.is_installed("docling"):
pm.install("docling")
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
case ".docx":
if not pm.is_installed("docling"):
pm.install("docling")
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
case ".pptx":
if not pm.is_installed("docling"):
pm.install("docling")
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
case ".xlsx":
if not pm.is_installed("docling"): if not pm.is_installed("docling"):
pm.install("docling") pm.install("docling")
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
@@ -1284,45 +1258,26 @@ def create_app(args):
text_content = await file.read() text_content = await file.read()
content = text_content.decode("utf-8") content = text_content.decode("utf-8")
case ".pdf": case ".pdf" | ".docx" | ".pptx" | ".xlsx":
if not pm.is_installed("docling"): if not pm.is_installed("docling"):
pm.install("docling") pm.install("docling")
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
converter = DocumentConverter() # Create a temporary file to save the uploaded content
result = converter.convert(file.filename) temp_path = Path("temp") / file.filename
content = result.document.export_to_markdown() temp_path.parent.mkdir(exist_ok=True)
case ".docx": # Save the uploaded file
if not pm.is_installed("docling"): with temp_path.open("wb") as f:
pm.install("docling") f.write(await file.read())
from docling.document_converter import DocumentConverter
converter = DocumentConverter() try:
result = converter.convert(file.filename) converter = DocumentConverter()
content = result.document.export_to_markdown() result = converter.convert(str(temp_path))
content = result.document.export_to_markdown()
case ".pptx": finally:
if not pm.is_installed("docling"): # Clean up the temporary file
pm.install("docling") temp_path.unlink()
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file.filename)
content = result.document.export_to_markdown()
case ".xlsx":
if not pm.is_installed("docling"):
pm.install("docling")
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file.filename)
content = result.document.export_to_markdown()
case _:
raise HTTPException(
status_code=400,
detail=f"Unsupported file type. Supported types: {doc_manager.supported_extensions}",
)
# Insert content into RAG system # Insert content into RAG system
if content: if content: