Update document_routes.py

This commit is contained in:
Saifeddine ALOUI
2025-03-05 15:32:39 +01:00
committed by GitHub
parent 20f2e57260
commit aaa8194423

View File

@@ -16,7 +16,7 @@ from pydantic import BaseModel, Field, field_validator
from lightrag import LightRAG from lightrag import LightRAG
from lightrag.base import DocProcessingStatus, DocStatus from lightrag.base import DocProcessingStatus, DocStatus
from ..utils_api import get_api_key_dependency from lightrag.api.utils_api import get_api_key_dependency, global_args
router = APIRouter(prefix="/documents", tags=["documents"]) router = APIRouter(prefix="/documents", tags=["documents"])
@@ -237,54 +237,86 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
) )
return False return False
case ".pdf": case ".pdf":
if not pm.is_installed("pypdf2"): # type: ignore if global_args["main_args"].document_loading_tool=="DOCLING":
pm.install("pypdf2") if not pm.is_installed("docling"): # type: ignore
from PyPDF2 import PdfReader # type: ignore pm.install("docling")
from io import BytesIO from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("pypdf2"): # type: ignore
pm.install("pypdf2")
from PyPDF2 import PdfReader # type: ignore
from io import BytesIO
pdf_file = BytesIO(file) pdf_file = BytesIO(file)
reader = PdfReader(pdf_file) reader = PdfReader(pdf_file)
for page in reader.pages: for page in reader.pages:
content += page.extract_text() + "\n" content += page.extract_text() + "\n"
case ".docx": case ".docx":
if not pm.is_installed("python-docx"): # type: ignore if global_args["main_args"].document_loading_tool=="DOCLING":
pm.install("docx") if not pm.is_installed("docling"): # type: ignore
from docx import Document # type: ignore pm.install("docling")
from io import BytesIO from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("python-docx"): # type: ignore
pm.install("docx")
from docx import Document # type: ignore
from io import BytesIO
docx_file = BytesIO(file) docx_file = BytesIO(file)
doc = Document(docx_file) doc = Document(docx_file)
content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
case ".pptx": case ".pptx":
if not pm.is_installed("python-pptx"): # type: ignore if global_args["main_args"].document_loading_tool=="DOCLING":
pm.install("pptx") if not pm.is_installed("docling"): # type: ignore
from pptx import Presentation # type: ignore pm.install("docling")
from io import BytesIO from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("python-pptx"): # type: ignore
pm.install("pptx")
from pptx import Presentation # type: ignore
from io import BytesIO
pptx_file = BytesIO(file) pptx_file = BytesIO(file)
prs = Presentation(pptx_file) prs = Presentation(pptx_file)
for slide in prs.slides: for slide in prs.slides:
for shape in slide.shapes: for shape in slide.shapes:
if hasattr(shape, "text"): if hasattr(shape, "text"):
content += shape.text + "\n" content += shape.text + "\n"
case ".xlsx": case ".xlsx":
if not pm.is_installed("openpyxl"): # type: ignore if global_args["main_args"].document_loading_tool=="DOCLING":
pm.install("openpyxl") if not pm.is_installed("docling"): # type: ignore
from openpyxl import load_workbook # type: ignore pm.install("docling")
from io import BytesIO from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("openpyxl"): # type: ignore
pm.install("openpyxl")
from openpyxl import load_workbook # type: ignore
from io import BytesIO
xlsx_file = BytesIO(file) xlsx_file = BytesIO(file)
wb = load_workbook(xlsx_file) wb = load_workbook(xlsx_file)
for sheet in wb: for sheet in wb:
content += f"Sheet: {sheet.title}\n" content += f"Sheet: {sheet.title}\n"
for row in sheet.iter_rows(values_only=True): for row in sheet.iter_rows(values_only=True):
content += ( content += (
"\t".join( "\t".join(
str(cell) if cell is not None else "" for cell in row str(cell) if cell is not None else "" for cell in row
)
+ "\n"
) )
+ "\n" content += "\n"
)
content += "\n"
case _: case _:
logger.error( logger.error(
f"Unsupported file type: {file_path.name} (extension {ext})" f"Unsupported file type: {file_path.name} (extension {ext})"