This commit is contained in:
Saifeddine ALOUI
2025-03-05 15:36:47 +01:00
committed by GitHub
parent 39c24f4a59
commit 6e4daea056

View File

@@ -237,10 +237,11 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
) )
return False return False
case ".pdf": case ".pdf":
if global_args["main_args"].document_loading_tool=="DOCLING": if global_args["main_args"].document_loading_tool == "DOCLING":
if not pm.is_installed("docling"): # type: ignore if not pm.is_installed("docling"): # type: ignore
pm.install("docling") pm.install("docling")
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
converter = DocumentConverter() converter = DocumentConverter()
result = converter.convert(file_path) result = converter.convert(file_path)
content = result.document.export_to_markdown() content = result.document.export_to_markdown()
@@ -255,10 +256,11 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
for page in reader.pages: for page in reader.pages:
content += page.extract_text() + "\n" content += page.extract_text() + "\n"
case ".docx": case ".docx":
if global_args["main_args"].document_loading_tool=="DOCLING": if global_args["main_args"].document_loading_tool == "DOCLING":
if not pm.is_installed("docling"): # type: ignore if not pm.is_installed("docling"): # type: ignore
pm.install("docling") pm.install("docling")
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
converter = DocumentConverter() converter = DocumentConverter()
result = converter.convert(file_path) result = converter.convert(file_path)
content = result.document.export_to_markdown() content = result.document.export_to_markdown()
@@ -270,12 +272,15 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
docx_file = BytesIO(file) docx_file = BytesIO(file)
doc = Document(docx_file) doc = Document(docx_file)
content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) content = "\n".join(
[paragraph.text for paragraph in doc.paragraphs]
)
case ".pptx": case ".pptx":
if global_args["main_args"].document_loading_tool=="DOCLING": if global_args["main_args"].document_loading_tool == "DOCLING":
if not pm.is_installed("docling"): # type: ignore if not pm.is_installed("docling"): # type: ignore
pm.install("docling") pm.install("docling")
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
converter = DocumentConverter() converter = DocumentConverter()
result = converter.convert(file_path) result = converter.convert(file_path)
content = result.document.export_to_markdown() content = result.document.export_to_markdown()
@@ -292,10 +297,11 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
if hasattr(shape, "text"): if hasattr(shape, "text"):
content += shape.text + "\n" content += shape.text + "\n"
case ".xlsx": case ".xlsx":
if global_args["main_args"].document_loading_tool=="DOCLING": if global_args["main_args"].document_loading_tool == "DOCLING":
if not pm.is_installed("docling"): # type: ignore if not pm.is_installed("docling"): # type: ignore
pm.install("docling") pm.install("docling")
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
converter = DocumentConverter() converter = DocumentConverter()
result = converter.convert(file_path) result = converter.convert(file_path)
content = result.document.export_to_markdown() content = result.document.export_to_markdown()
@@ -312,7 +318,8 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
for row in sheet.iter_rows(values_only=True): for row in sheet.iter_rows(values_only=True):
content += ( content += (
"\t".join( "\t".join(
str(cell) if cell is not None else "" for cell in row str(cell) if cell is not None else ""
for cell in row
) )
+ "\n" + "\n"
) )