Merge pull request #844 from danielaskdd/add-duplicate-check

Fix office file indexing problem
This commit is contained in:
Yannick Stephan
2025-02-19 08:34:40 +01:00
committed by GitHub

View File

@@ -1166,8 +1166,7 @@ def create_app(args):
from docx import Document from docx import Document
from io import BytesIO from io import BytesIO
docx_content = await file.read() docx_file = BytesIO(file)
docx_file = BytesIO(docx_content)
doc = Document(docx_file) doc = Document(docx_file)
content = "\n".join( content = "\n".join(
[paragraph.text for paragraph in doc.paragraphs] [paragraph.text for paragraph in doc.paragraphs]
@@ -1178,13 +1177,31 @@ def create_app(args):
from pptx import Presentation # type: ignore from pptx import Presentation # type: ignore
from io import BytesIO from io import BytesIO
pptx_content = await file.read() pptx_file = BytesIO(file)
pptx_file = BytesIO(pptx_content)
prs = Presentation(pptx_file) prs = Presentation(pptx_file)
for slide in prs.slides: for slide in prs.slides:
for shape in slide.shapes: for shape in slide.shapes:
if hasattr(shape, "text"): if hasattr(shape, "text"):
content += shape.text + "\n" content += shape.text + "\n"
case ".xlsx":
if not pm.is_installed("openpyxl"):
pm.install("openpyxl")
from openpyxl import load_workbook # type: ignore
from io import BytesIO
xlsx_file = BytesIO(file)
wb = load_workbook(xlsx_file)
for sheet in wb:
content += f"Sheet: {sheet.title}\n"
for row in sheet.iter_rows(values_only=True):
content += (
"\t".join(
str(cell) if cell is not None else ""
for cell in row
)
+ "\n"
)
content += "\n"
case _: case _:
logging.error( logging.error(
f"Unsupported file type: {file_path.name} (extension {ext})" f"Unsupported file type: {file_path.name} (extension {ext})"
@@ -1195,7 +1212,7 @@ def create_app(args):
if content: if content:
await rag.apipeline_enqueue_documents(content) await rag.apipeline_enqueue_documents(content)
logging.info( logging.info(
f"Successfully processed and enqueued file: {file_path.name}" f"Successfully fetched and enqueued file: {file_path.name}"
) )
return True return True
else: else: