Fix refactoring error in document handling

- Fix refactoring error in pipeline_index_file
- Delete unused func: scan_directory
- Add type hints for rag for better maintainability
- Refine comments for better understanding
yangdx
2025-02-20 14:30:41 +08:00
parent 62e1fe5df2
commit 82a4cb3e79


@@ -16,6 +16,7 @@ from typing import Dict, List, Optional, Any
from fastapi import APIRouter, BackgroundTasks, Depends, File, HTTPException, UploadFile
from pydantic import BaseModel, Field, field_validator
from lightrag import LightRAG
from lightrag.base import DocProcessingStatus, DocStatus
from ..utils_api import get_api_key_dependency
@@ -76,6 +77,20 @@ class DocStatusResponse(BaseModel):
return dt
return dt.isoformat()
"""Response model for document status
Attributes:
id: Document identifier
content_summary: Summary of document content
content_length: Length of document content
status: Current processing status
created_at: Creation timestamp (ISO format string)
updated_at: Last update timestamp (ISO format string)
chunks_count: Number of chunks (optional)
error: Error message if any (optional)
metadata: Additional metadata (optional)
"""
id: str
content_summary: str
content_length: int
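
For orientation, a minimal sketch of populating the model described by this docstring; the field values (and the DocStatus.PROCESSED member) are illustrative assumptions, not taken from the diff:

# Illustrative only: the values below are invented, and DocStatus.PROCESSED
# is assumed to be a member of the DocStatus enum imported above.
from datetime import datetime

doc = DocStatusResponse(
    id="doc-0001",
    content_summary="Short summary of the document",
    content_length=10240,
    status=DocStatus.PROCESSED,
    created_at=datetime.now().isoformat(),
    updated_at=datetime.now().isoformat(),
    chunks_count=12,
    error=None,
    metadata={"source": "upload"},
)
print(doc.model_dump_json())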
@@ -112,6 +127,7 @@ class DocumentManager:
self.input_dir.mkdir(parents=True, exist_ok=True)
def scan_directory_for_new_files(self) -> List[Path]:
"""Scan input directory for new files"""
new_files = []
for ext in self.supported_extensions:
logging.info(f"Scanning for {ext} files in {self.input_dir}")
@@ -120,12 +136,12 @@ class DocumentManager:
new_files.append(file_path)
return new_files
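
As a standalone sketch of the rglob-based scan above (the directory path and extension tuple are assumptions for illustration; the extensions mirror the formats handled later in this diff):

from pathlib import Path

input_dir = Path("./inputs")  # assumed location
supported_extensions = (".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx")  # assumed set

# Collect every file under input_dir matching a supported extension.
new_files = [
    path
    for ext in supported_extensions
    for path in input_dir.rglob(f"*{ext}")
]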
def scan_directory(self) -> List[Path]:
new_files = []
for ext in self.supported_extensions:
for file_path in self.input_dir.rglob(f"*{ext}"):
new_files.append(file_path)
return new_files
# def scan_directory(self) -> List[Path]:
# new_files = []
# for ext in self.supported_extensions:
# for file_path in self.input_dir.rglob(f"*{ext}"):
# new_files.append(file_path)
# return new_files
def mark_as_indexed(self, file_path: Path):
self.indexed_files.add(file_path)
@@ -134,7 +150,16 @@ class DocumentManager:
return any(filename.lower().endswith(ext) for ext in self.supported_extensions)
async def pipeline_enqueue_file(rag, file_path: Path) -> bool:
async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
"""Add a file to the queue for processing
Args:
rag: LightRAG instance
file_path: Path to the saved file
Returns:
bool: True if the file was successfully enqueued, False otherwise
"""
try:
content = ""
ext = file_path.suffix.lower()
@@ -165,7 +190,9 @@ async def pipeline_enqueue_file(rag, file_path: Path) -> bool:
docx_file = BytesIO(file)
doc = Document(docx_file)
content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
content = "\n".join(
[paragraph.text for paragraph in doc.paragraphs]
)
case ".pptx":
if not pm.is_installed("pptx"):
pm.install("pptx")
@@ -205,13 +232,19 @@ async def pipeline_enqueue_file(rag, file_path: Path) -> bool:
# Insert into the RAG queue
if content:
await rag.apipeline_enqueue_documents(content)
logging.info(f"Successfully fetched and enqueued file: {file_path.name}")
logging.info(
f"Successfully fetched and enqueued file: {file_path.name}"
)
return True
else:
logging.error(f"No content could be extracted from file: {file_path.name}")
logging.error(
f"No content could be extracted from file: {file_path.name}"
)
except Exception as e:
logging.error(f"Error processing or enqueueing file {file_path.name}: {str(e)}")
logging.error(
f"Error processing or enqueueing file {file_path.name}: {str(e)}"
)
logging.error(traceback.format_exc())
finally:
if file_path.name.startswith(temp_prefix):
@@ -222,7 +255,7 @@ async def pipeline_enqueue_file(rag, file_path: Path) -> bool:
return False
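
The heart of pipeline_enqueue_file is the match on the file extension. A compact, self-contained sketch of that dispatch, with only the text branch spelled out (the helper name extract_text is hypothetical):

from pathlib import Path

def extract_text(file_path: Path, raw: bytes) -> str:
    # Dispatch on extension, mirroring the match statement above.
    match file_path.suffix.lower():
        case ".txt" | ".md":
            return raw.decode("utf-8")
        case ".pdf" | ".docx" | ".pptx" | ".xlsx":
            raise NotImplementedError("binary formats handled per-library above")
        case _:
            raise ValueError(f"Unsupported file type: {file_path.name}")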
async def pipeline_index_file(rag, file_path: Path):
async def pipeline_index_file(rag: LightRAG, file_path: Path):
"""Index a file
Args:
@@ -230,90 +263,26 @@ async def pipeline_index_file(rag, file_path: Path):
file_path: Path to the saved file
"""
try:
content = ""
ext = file_path.suffix.lower()
file = None
async with aiofiles.open(file_path, "rb") as f:
file = await f.read()
# Process based on file type
match ext:
case ".txt" | ".md":
content = file.decode("utf-8")
case ".pdf":
if not pm.is_installed("pypdf2"):
pm.install("pypdf2")
from PyPDF2 import PdfReader # type: ignore
from io import BytesIO
pdf_file = BytesIO(file)
reader = PdfReader(pdf_file)
for page in reader.pages:
content += page.extract_text() + "\n"
case ".docx":
if not pm.is_installed("docx"):
pm.install("docx")
from docx import Document
from io import BytesIO
docx_file = BytesIO(file)
doc = Document(docx_file)
content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
case ".pptx":
if not pm.is_installed("pptx"):
pm.install("pptx")
from pptx import Presentation
from io import BytesIO
pptx_file = BytesIO(file)
prs = Presentation(pptx_file)
for slide in prs.slides:
for shape in slide.shapes:
if hasattr(shape, "text"):
content += shape.text + "\n"
case ".xlsx":
if not pm.is_installed("openpyxl"):
pm.install("openpyxl")
from openpyxl import load_workbook
from io import BytesIO
xlsx_file = BytesIO(file)
wb = load_workbook(xlsx_file)
for sheet in wb:
content += f"Sheet: {sheet.title}\n"
for row in sheet.iter_rows(values_only=True):
content += (
"\t".join(
str(cell) if cell is not None else "" for cell in row
)
+ "\n"
)
content += "\n"
case _:
logging.error(
f"Unsupported file type: {file_path.name} (extension {ext})"
)
return
# Insert into the RAG queue
if content:
await rag.apipeline_enqueue_documents(content)
if await pipeline_enqueue_file(rag, file_path):
await rag.apipeline_process_enqueue_documents()
logging.info(f"Successfully indexed file: {file_path.name}")
else:
logging.error(f"No content could be extracted from file: {file_path.name}")
except Exception as e:
logging.error(f"Error indexing file {file_path.name}: {str(e)}")
logging.error(traceback.format_exc())
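
After this fix, indexing reduces to enqueue-then-process. A usage sketch, assuming an already-initialized LightRAG instance named rag (construction is out of scope for this diff):

import asyncio
from pathlib import Path

async def index_one(rag: LightRAG, path: Path) -> None:
    # Mirrors the fixed pipeline_index_file: enqueue, then drain the queue.
    if await pipeline_enqueue_file(rag, path):
        await rag.apipeline_process_enqueue_documents()

# asyncio.run(index_one(rag, Path("./inputs/report.pdf")))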
async def pipeline_index_files(rag, file_paths: List[Path]):
async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
"""Index multiple files concurrently
Args:
rag: LightRAG instance
file_paths: Paths to the files to index
"""
if not file_paths:
return
try:
enqueued = False
if len(file_paths) == 1:
enqueued = await pipeline_enqueue_file(rag, file_paths[0])
else:
@@ -327,7 +296,13 @@ async def pipeline_index_files(rag, file_paths: List[Path]):
logging.error(traceback.format_exc())
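
The multi-file branch is elided by the hunk above; one plausible shape for it, hedged as an assumption rather than the actual implementation, is concurrent enqueueing with asyncio.gather:

import asyncio
from pathlib import Path
from typing import List

async def enqueue_many(rag: LightRAG, paths: List[Path]) -> bool:
    # Enqueue all files concurrently; report whether any succeeded.
    results = await asyncio.gather(*(pipeline_enqueue_file(rag, p) for p in paths))
    return any(results)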
async def pipeline_index_texts(rag, texts: List[str]):
async def pipeline_index_texts(rag: LightRAG, texts: List[str]):
"""Index a list of texts
Args:
rag: LightRAG instance
texts: The texts to index
"""
if not texts:
return
await rag.apipeline_enqueue_documents(texts)
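
A usage sketch for pipeline_index_texts, again assuming an initialized rag; the sample strings are illustrative:

async def index_sample_texts(rag: LightRAG) -> None:
    texts = ["First sample document.", "Second sample document."]  # illustrative
    await pipeline_index_texts(rag, texts)

# run with asyncio.run(index_sample_texts(rag)) from synchronous code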
@@ -335,16 +310,29 @@ async def pipeline_index_texts(rag, texts: List[str]):
async def save_temp_file(input_dir: Path, file: UploadFile = File(...)) -> Path:
"""Save the uploaded file to a temporary location
Args:
input_dir: Directory under which the temp subdirectory and file are created
file: The uploaded file
Returns:
Path: The path to the saved file
"""
# Generate unique filename to avoid conflicts
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
unique_filename = f"{temp_prefix}{timestamp}_{file.filename}"
# Create a temporary file to save the uploaded content
temp_path = input_dir / "temp" / unique_filename
temp_path.parent.mkdir(exist_ok=True)
# Save the file
with open(temp_path, "wb") as buffer:
shutil.copyfileobj(file.file, buffer)
return temp_path
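
The uniqueness of the temp filename comes from the timestamp plus the module-level temp_prefix; its value is not shown in this hunk, so the one below is an assumption:

from datetime import datetime
from pathlib import Path

temp_prefix = "__tmp_"  # assumed value; defined elsewhere in the module
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
unique_filename = f"{temp_prefix}{timestamp}_upload.pdf"
print(Path("./inputs") / "temp" / unique_filename)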
async def run_scanning_process(rag, doc_manager: DocumentManager):
async def run_scanning_process(rag: LightRAG, doc_manager: DocumentManager):
"""Background task to scan and index documents"""
try:
new_files = doc_manager.scan_directory_for_new_files()
@@ -375,7 +363,7 @@ async def run_scanning_process(rag, doc_manager: DocumentManager):
def create_document_routes(
rag, doc_manager: DocumentManager, api_key: Optional[str] = None
rag: LightRAG, doc_manager: DocumentManager, api_key: Optional[str] = None
):
optional_api_key = get_api_key_dependency(api_key)
@@ -388,9 +376,6 @@ def create_document_routes(
and processes them. If a scanning process is already running, it returns a status indicating
that fact.
Args:
background_tasks (BackgroundTasks): FastAPI background tasks handler
Returns:
dict: A dictionary containing the scanning status
"""
@@ -402,6 +387,7 @@ def create_document_routes(
scan_progress["indexed_count"] = 0
scan_progress["progress"] = 0
# Start the scanning process in the background
background_tasks.add_task(run_scanning_process, rag, doc_manager)
return {"status": "scanning_started"}
@@ -453,6 +439,7 @@ def create_document_routes(
with open(file_path, "wb") as buffer:
shutil.copyfileobj(file.file, buffer)
# Add to background tasks
background_tasks.add_task(pipeline_index_file, rag, file_path)
return InsertResponse(
@@ -562,6 +549,8 @@ def create_document_routes(
)
temp_path = await save_temp_file(doc_manager.input_dir, file)
# Add to background tasks
background_tasks.add_task(pipeline_index_file, rag, temp_path)
return InsertResponse(
@@ -606,6 +595,7 @@ def create_document_routes(
for file in files:
if doc_manager.is_supported_file(file.filename):
# Create a temporary file to save the uploaded content
temp_files.append(await save_temp_file(doc_manager.input_dir, file))
inserted_count += 1
else:
@@ -614,6 +604,7 @@ def create_document_routes(
if temp_files:
background_tasks.add_task(pipeline_index_files, rag, temp_files)
# Prepare status message
if inserted_count == len(files):
status = "success"
status_message = f"Successfully inserted all {inserted_count} documents"