diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..44c3aff1 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +recursive-include lightrag/api/webui * diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index c91f693f..eddeaa5c 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -6,7 +6,6 @@ from fastapi import ( FastAPI, Depends, ) -from fastapi.responses import FileResponse import asyncio import os import logging @@ -408,10 +407,6 @@ def create_app(args): name="webui", ) - @app.get("/webui/") - async def webui_root(): - return FileResponse(static_dir / "index.html") - return app diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index ab5aff96..d9dfe913 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -215,9 +215,29 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: | ".scss" | ".less" ): - content = file.decode("utf-8") + try: + # Try to decode as UTF-8 + content = file.decode("utf-8") + + # Validate content + if not content or len(content.strip()) == 0: + logger.error(f"Empty content in file: {file_path.name}") + return False + + # Check if content looks like binary data string representation + if content.startswith("b'") or content.startswith('b"'): + logger.error( + f"File {file_path.name} appears to contain binary data representation instead of text" + ) + return False + + except UnicodeDecodeError: + logger.error( + f"File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing." + ) + return False case ".pdf": - if not pm.is_installed("pypdf2"): + if not pm.is_installed("pypdf2"): # type: ignore pm.install("pypdf2") from PyPDF2 import PdfReader # type: ignore from io import BytesIO @@ -227,18 +247,18 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: for page in reader.pages: content += page.extract_text() + "\n" case ".docx": - if not pm.is_installed("docx"): + if not pm.is_installed("python-docx"): # type: ignore pm.install("docx") - from docx import Document + from docx import Document # type: ignore from io import BytesIO docx_file = BytesIO(file) doc = Document(docx_file) content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) case ".pptx": - if not pm.is_installed("pptx"): + if not pm.is_installed("python-pptx"): # type: ignore pm.install("pptx") - from pptx import Presentation + from pptx import Presentation # type: ignore from io import BytesIO pptx_file = BytesIO(file) @@ -248,9 +268,9 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: if hasattr(shape, "text"): content += shape.text + "\n" case ".xlsx": - if not pm.is_installed("openpyxl"): + if not pm.is_installed("openpyxl"): # type: ignore pm.install("openpyxl") - from openpyxl import load_workbook + from openpyxl import load_workbook # type: ignore from io import BytesIO xlsx_file = BytesIO(file) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index b2e9845e..6f42003d 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -685,8 +685,24 @@ class LightRAG: all_new_doc_ids = set(new_docs.keys()) # Exclude IDs of documents that are already in progress unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids) + + # Log ignored document IDs + ignored_ids = [ + doc_id for doc_id in unique_new_doc_ids if doc_id not in new_docs + ] + if ignored_ids: + logger.warning( + f"Ignoring {len(ignored_ids)} document IDs not found in new_docs" + ) + for doc_id in ignored_ids: + logger.warning(f"Ignored document ID: {doc_id}") + # Filter new_docs to only include documents with unique IDs - new_docs = {doc_id: new_docs[doc_id] for doc_id in unique_new_doc_ids} + new_docs = { + doc_id: new_docs[doc_id] + for doc_id in unique_new_doc_ids + if doc_id in new_docs + } if not new_docs: logger.info("No new unique documents were found.")