Merge pull request #844 from danielaskdd/add-duplicate-check
Fix office file indexing problem
This commit is contained in:
@@ -1166,8 +1166,7 @@ def create_app(args):
|
|||||||
from docx import Document
|
from docx import Document
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
docx_content = await file.read()
|
docx_file = BytesIO(file)
|
||||||
docx_file = BytesIO(docx_content)
|
|
||||||
doc = Document(docx_file)
|
doc = Document(docx_file)
|
||||||
content = "\n".join(
|
content = "\n".join(
|
||||||
[paragraph.text for paragraph in doc.paragraphs]
|
[paragraph.text for paragraph in doc.paragraphs]
|
||||||
@@ -1178,13 +1177,31 @@ def create_app(args):
|
|||||||
from pptx import Presentation # type: ignore
|
from pptx import Presentation # type: ignore
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
pptx_content = await file.read()
|
pptx_file = BytesIO(file)
|
||||||
pptx_file = BytesIO(pptx_content)
|
|
||||||
prs = Presentation(pptx_file)
|
prs = Presentation(pptx_file)
|
||||||
for slide in prs.slides:
|
for slide in prs.slides:
|
||||||
for shape in slide.shapes:
|
for shape in slide.shapes:
|
||||||
if hasattr(shape, "text"):
|
if hasattr(shape, "text"):
|
||||||
content += shape.text + "\n"
|
content += shape.text + "\n"
|
||||||
|
case ".xlsx":
|
||||||
|
if not pm.is_installed("openpyxl"):
|
||||||
|
pm.install("openpyxl")
|
||||||
|
from openpyxl import load_workbook # type: ignore
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
xlsx_file = BytesIO(file)
|
||||||
|
wb = load_workbook(xlsx_file)
|
||||||
|
for sheet in wb:
|
||||||
|
content += f"Sheet: {sheet.title}\n"
|
||||||
|
for row in sheet.iter_rows(values_only=True):
|
||||||
|
content += (
|
||||||
|
"\t".join(
|
||||||
|
str(cell) if cell is not None else ""
|
||||||
|
for cell in row
|
||||||
|
)
|
||||||
|
+ "\n"
|
||||||
|
)
|
||||||
|
content += "\n"
|
||||||
case _:
|
case _:
|
||||||
logging.error(
|
logging.error(
|
||||||
f"Unsupported file type: {file_path.name} (extension {ext})"
|
f"Unsupported file type: {file_path.name} (extension {ext})"
|
||||||
@@ -1195,7 +1212,7 @@ def create_app(args):
|
|||||||
if content:
|
if content:
|
||||||
await rag.apipeline_enqueue_documents(content)
|
await rag.apipeline_enqueue_documents(content)
|
||||||
logging.info(
|
logging.info(
|
||||||
f"Successfully processed and enqueued file: {file_path.name}"
|
f"Successfully fetched and enqueued file: {file_path.name}"
|
||||||
)
|
)
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
|
Reference in New Issue
Block a user