diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 39314233..d9dfe913 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -215,7 +215,27 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: | ".scss" | ".less" ): - content = file.decode("utf-8") + try: + # Try to decode as UTF-8 + content = file.decode("utf-8") + + # Validate content + if not content or len(content.strip()) == 0: + logger.error(f"Empty content in file: {file_path.name}") + return False + + # Check if content looks like binary data string representation + if content.startswith("b'") or content.startswith('b"'): + logger.error( + f"File {file_path.name} appears to contain binary data representation instead of text" + ) + return False + + except UnicodeDecodeError: + logger.error( + f"File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing." + ) + return False case ".pdf": if not pm.is_installed("pypdf2"): # type: ignore pm.install("pypdf2") @@ -229,7 +249,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: case ".docx": if not pm.is_installed("python-docx"): # type: ignore pm.install("docx") - from docx import Document + from docx import Document # type: ignore from io import BytesIO docx_file = BytesIO(file) @@ -238,7 +258,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: case ".pptx": if not pm.is_installed("python-pptx"): # type: ignore pm.install("pptx") - from pptx import Presentation + from pptx import Presentation # type: ignore from io import BytesIO pptx_file = BytesIO(file) @@ -250,7 +270,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: case ".xlsx": if not pm.is_installed("openpyxl"): # type: ignore pm.install("openpyxl") - from openpyxl import load_workbook + from openpyxl import load_workbook # type: ignore from io import BytesIO xlsx_file = BytesIO(file) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index b2e9845e..6f42003d 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -685,8 +685,24 @@ class LightRAG: all_new_doc_ids = set(new_docs.keys()) # Exclude IDs of documents that are already in progress unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids) + + # Log ignored document IDs + ignored_ids = [ + doc_id for doc_id in unique_new_doc_ids if doc_id not in new_docs + ] + if ignored_ids: + logger.warning( + f"Ignoring {len(ignored_ids)} document IDs not found in new_docs" + ) + for doc_id in ignored_ids: + logger.warning(f"Ignored document ID: {doc_id}") + # Filter new_docs to only include documents with unique IDs - new_docs = {doc_id: new_docs[doc_id] for doc_id in unique_new_doc_ids} + new_docs = { + doc_id: new_docs[doc_id] + for doc_id in unique_new_doc_ids + if doc_id in new_docs + } if not new_docs: logger.info("No new unique documents were found.")