Introduced docling instead of PyPDF2, python-docx, and python-pptx for loading files (also adds .xlsx handling)
This commit is contained in:
@@ -556,7 +556,7 @@ class DocumentManager:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
input_dir: str,
|
input_dir: str,
|
||||||
supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx"),
|
supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx", "xlsx"),
|
||||||
):
|
):
|
||||||
self.input_dir = Path(input_dir)
|
self.input_dir = Path(input_dir)
|
||||||
self.supported_extensions = supported_extensions
|
self.supported_extensions = supported_extensions
|
||||||
@@ -974,37 +974,36 @@ def create_app(args):
|
|||||||
content = await f.read()
|
content = await f.read()
|
||||||
|
|
||||||
case ".pdf":
|
case ".pdf":
|
||||||
if not pm.is_installed("pypdf2"):
|
if not pm.is_installed("docling"):
|
||||||
pm.install("pypdf2")
|
pm.install("docling")
|
||||||
from PyPDF2 import PdfReader
|
from docling.document_converter import DocumentConverter
|
||||||
|
converter = DocumentConverter()
|
||||||
|
result = converter.convert(file_path)
|
||||||
|
content = result.document.export_to_markdown()
|
||||||
|
|
||||||
# PDF handling
|
|
||||||
reader = PdfReader(str(file_path))
|
|
||||||
content = ""
|
|
||||||
for page in reader.pages:
|
|
||||||
content += page.extract_text() + "\n"
|
|
||||||
|
|
||||||
case ".docx":
|
case ".docx":
|
||||||
if not pm.is_installed("python-docx"):
|
if not pm.is_installed("docling"):
|
||||||
pm.install("python-docx")
|
pm.install("docling")
|
||||||
from docx import Document
|
from docling.document_converter import DocumentConverter
|
||||||
|
converter = DocumentConverter()
|
||||||
# Word document handling
|
result = converter.convert(file_path)
|
||||||
doc = Document(file_path)
|
content = result.document.export_to_markdown()
|
||||||
content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
|
|
||||||
|
|
||||||
case ".pptx":
|
case ".pptx":
|
||||||
if not pm.is_installed("pptx"):
|
if not pm.is_installed("docling"):
|
||||||
pm.install("pptx")
|
pm.install("docling")
|
||||||
from pptx import Presentation # type: ignore
|
from docling.document_converter import DocumentConverter
|
||||||
|
converter = DocumentConverter()
|
||||||
# PowerPoint handling
|
result = converter.convert(file_path)
|
||||||
prs = Presentation(file_path)
|
content = result.document.export_to_markdown()
|
||||||
content = ""
|
case ".xlsx":
|
||||||
for slide in prs.slides:
|
if not pm.is_installed("docling"):
|
||||||
for shape in slide.shapes:
|
pm.install("docling")
|
||||||
if hasattr(shape, "text"):
|
from docling.document_converter import DocumentConverter
|
||||||
content += shape.text + "\n"
|
converter = DocumentConverter()
|
||||||
|
result = converter.convert(file_path)
|
||||||
|
content = result.document.export_to_markdown()
|
||||||
|
|
||||||
case _:
|
case _:
|
||||||
raise ValueError(f"Unsupported file format: {ext}")
|
raise ValueError(f"Unsupported file format: {ext}")
|
||||||
@@ -1283,49 +1282,36 @@ def create_app(args):
|
|||||||
content = text_content.decode("utf-8")
|
content = text_content.decode("utf-8")
|
||||||
|
|
||||||
case ".pdf":
|
case ".pdf":
|
||||||
if not pm.is_installed("pypdf2"):
|
if not pm.is_installed("docling"):
|
||||||
pm.install("pypdf2")
|
pm.install("docling")
|
||||||
from PyPDF2 import PdfReader
|
from docling.document_converter import DocumentConverter
|
||||||
from io import BytesIO
|
converter = DocumentConverter()
|
||||||
|
result = converter.convert(file_path)
|
||||||
|
content = result.document.export_to_markdown()
|
||||||
|
|
||||||
# Read PDF from memory
|
|
||||||
pdf_content = await file.read()
|
|
||||||
pdf_file = BytesIO(pdf_content)
|
|
||||||
reader = PdfReader(pdf_file)
|
|
||||||
content = ""
|
|
||||||
for page in reader.pages:
|
|
||||||
content += page.extract_text() + "\n"
|
|
||||||
|
|
||||||
case ".docx":
|
case ".docx":
|
||||||
if not pm.is_installed("python-docx"):
|
if not pm.is_installed("docling"):
|
||||||
pm.install("python-docx")
|
pm.install("docling")
|
||||||
from docx import Document
|
from docling.document_converter import DocumentConverter
|
||||||
from io import BytesIO
|
converter = DocumentConverter()
|
||||||
|
result = converter.convert(file_path)
|
||||||
# Read DOCX from memory
|
content = result.document.export_to_markdown()
|
||||||
docx_content = await file.read()
|
|
||||||
docx_file = BytesIO(docx_content)
|
|
||||||
doc = Document(docx_file)
|
|
||||||
content = "\n".join(
|
|
||||||
[paragraph.text for paragraph in doc.paragraphs]
|
|
||||||
)
|
|
||||||
|
|
||||||
case ".pptx":
|
case ".pptx":
|
||||||
if not pm.is_installed("pptx"):
|
if not pm.is_installed("docling"):
|
||||||
pm.install("pptx")
|
pm.install("docling")
|
||||||
from pptx import Presentation # type: ignore
|
from docling.document_converter import DocumentConverter
|
||||||
from io import BytesIO
|
converter = DocumentConverter()
|
||||||
|
result = converter.convert(file_path)
|
||||||
# Read PPTX from memory
|
content = result.document.export_to_markdown()
|
||||||
pptx_content = await file.read()
|
case ".xlsx":
|
||||||
pptx_file = BytesIO(pptx_content)
|
if not pm.is_installed("docling"):
|
||||||
prs = Presentation(pptx_file)
|
pm.install("docling")
|
||||||
content = ""
|
from docling.document_converter import DocumentConverter
|
||||||
for slide in prs.slides:
|
converter = DocumentConverter()
|
||||||
for shape in slide.shapes:
|
result = converter.convert(file_path)
|
||||||
if hasattr(shape, "text"):
|
content = result.document.export_to_markdown()
|
||||||
content += shape.text + "\n"
|
|
||||||
|
|
||||||
case _:
|
case _:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=400,
|
status_code=400,
|
||||||
|
@@ -14,10 +14,7 @@ pydantic
|
|||||||
|
|
||||||
# File manipulation libraries
|
# File manipulation libraries
|
||||||
docling
|
docling
|
||||||
PyPDF2
|
|
||||||
python-docx
|
|
||||||
python-dotenv
|
python-dotenv
|
||||||
python-pptx
|
|
||||||
|
|
||||||
setuptools
|
setuptools
|
||||||
tenacity
|
tenacity
|
||||||
|
Reference in New Issue
Block a user