From d03192a3bdfe1411e31a8961754b11e8b96415bd Mon Sep 17 00:00:00 2001 From: iridium-soda Date: Sat, 11 Jan 2025 09:27:53 +0000 Subject: [PATCH 01/11] fix: Resolve 500 error caused by missing `len()` for `LightRAG` --- lightrag/api/lollms_lightrag_server.py | 2 +- lightrag/api/ollama_lightrag_server.py | 2 +- lightrag/api/openai_lightrag_server.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lightrag/api/lollms_lightrag_server.py b/lightrag/api/lollms_lightrag_server.py index 8a2804a0..50a47ec1 100644 --- a/lightrag/api/lollms_lightrag_server.py +++ b/lightrag/api/lollms_lightrag_server.py @@ -376,7 +376,7 @@ def create_app(args): return InsertResponse( status="success", message="Text successfully inserted", - document_count=len(rag), + document_count=1, ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) diff --git a/lightrag/api/ollama_lightrag_server.py b/lightrag/api/ollama_lightrag_server.py index b3140aba..66b272d8 100644 --- a/lightrag/api/ollama_lightrag_server.py +++ b/lightrag/api/ollama_lightrag_server.py @@ -375,7 +375,7 @@ def create_app(args): return InsertResponse( status="success", message="Text successfully inserted", - document_count=len(rag), + document_count=1, ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) diff --git a/lightrag/api/openai_lightrag_server.py b/lightrag/api/openai_lightrag_server.py index 349c09da..d65eaa34 100644 --- a/lightrag/api/openai_lightrag_server.py +++ b/lightrag/api/openai_lightrag_server.py @@ -390,7 +390,7 @@ def create_app(args): return InsertResponse( status="success", message="Text successfully inserted", - document_count=len(rag), + document_count=1, ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) From 7a56f2924629ca99cde1fe952738dfe0a701e47d Mon Sep 17 00:00:00 2001 From: iridium-soda Date: Sat, 11 Jan 2025 09:38:54 +0000 Subject: [PATCH 02/11] fix --- lightrag/api/openai_lightrag_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/api/openai_lightrag_server.py b/lightrag/api/openai_lightrag_server.py index d65eaa34..349c09da 100644 --- a/lightrag/api/openai_lightrag_server.py +++ b/lightrag/api/openai_lightrag_server.py @@ -390,7 +390,7 @@ def create_app(args): return InsertResponse( status="success", message="Text successfully inserted", - document_count=1, + document_count=len(rag), ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) From fd5683f6ad48189395ac21b936ef9085a4cf077d Mon Sep 17 00:00:00 2001 From: iridium-soda Date: Sat, 11 Jan 2025 09:39:52 +0000 Subject: [PATCH 03/11] Revert "fix" This reverts commit 7a56f2924629ca99cde1fe952738dfe0a701e47d. --- lightrag/api/openai_lightrag_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/api/openai_lightrag_server.py b/lightrag/api/openai_lightrag_server.py index 349c09da..d65eaa34 100644 --- a/lightrag/api/openai_lightrag_server.py +++ b/lightrag/api/openai_lightrag_server.py @@ -390,7 +390,7 @@ def create_app(args): return InsertResponse( status="success", message="Text successfully inserted", - document_count=len(rag), + document_count=1, ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) From 5c679384671d716efc24bcddd5bade5b384f6e9a Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Sun, 12 Jan 2025 12:46:23 +0100 Subject: [PATCH 04/11] Resolve 500 error caused by missing len() for LightRAG's API insert_text endpoint --- lightrag/api/lightrag_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 644e622d..d29b8b56 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -487,7 +487,7 @@ def create_app(args): return InsertResponse( status="success", message="Text successfully inserted", - document_count=len(rag), + document_count=1, ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) From 7aaab219eed17b2e8790fe502b58bb129a35606d Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Sun, 12 Jan 2025 12:56:08 +0100 Subject: [PATCH 05/11] Fixed awaiting insert --- lightrag/api/lightrag_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index d29b8b56..5bcb149c 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -483,7 +483,7 @@ def create_app(args): ) async def insert_text(request: InsertTextRequest): try: - rag.insert(request.text) + await rag.ainsert(request.text) return InsertResponse( status="success", message="Text successfully inserted", From 057e23c4e9dcac9cc21e12647560ae550898ad51 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Mon, 13 Jan 2025 10:13:01 +0800 Subject: [PATCH 06/11] Update __init__.py --- lightrag/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index b8037813..7a26a282 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam -__version__ = "1.1.0" +__version__ = "1.1.1" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" From 867475fd1f394dc69da1da47298c7af9ab5682d5 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Mon, 13 Jan 2025 10:28:19 +0800 Subject: [PATCH 07/11] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e8401a3d..90c3ec04 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News +- [x] [2025.01.13]🎯📢Our team has launched [MiniRAG](https://github.com/HKUDS/MiniRAG) for small models. - [x] [2025.01.06]🎯📢You can now [use PostgreSQL for Storage](#using-postgresql-for-storage). - [x] [2024.12.31]🎯📢LightRAG now supports [deletion by document ID](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete). - [x] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise. From f28b90b2b397400361caa874e0ddc9db31c0aeb1 Mon Sep 17 00:00:00 2001 From: bingo Date: Mon, 13 Jan 2025 07:06:01 +0000 Subject: [PATCH 08/11] 1. add os env NEO4J_MAX_CONNECTION_POOL_SIZE to for neo4j ; 2. fix https://github.com/HKUDS/LightRAG/issues/580 issue for mongoDB document 16MB limit. --- lightrag/kg/mongo_impl.py | 32 +++++++++++++++++++++++++++----- lightrag/kg/neo4j_impl.py | 3 ++- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index 61222357..5aab9c07 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -2,7 +2,7 @@ import os from tqdm.asyncio import tqdm as tqdm_async from dataclasses import dataclass from pymongo import MongoClient - +from typing import Union from lightrag.utils import logger from lightrag.base import BaseKVStorage @@ -41,11 +41,33 @@ class MongoKVStorage(BaseKVStorage): return set([s for s in data if s not in existing_ids]) async def upsert(self, data: dict[str, dict]): - for k, v in tqdm_async(data.items(), desc="Upserting"): - self._data.update_one({"_id": k}, {"$set": v}, upsert=True) - data[k]["_id"] = k + if self.namespace == "llm_response_cache": + for mode, items in data.items(): + for k, v in tqdm_async(items.items(), desc="Upserting"): + key = f"{mode}_{k}" + result = self._data.update_one({"_id": key}, {"$setOnInsert": v}, upsert=True) + if result.upserted_id: + logger.debug(f"\nInserted new document with key: {key}") + data[mode][k]["_id"] = key + else: + for k, v in tqdm_async(data.items(), desc="Upserting"): + self._data.update_one({"_id": k}, {"$set": v}, upsert=True) + data[k]["_id"] = k return data - + + async def get_by_mode_and_id(self, mode: str, id: str) -> Union[dict, None]: + if "llm_response_cache" == self.namespace: + res = {} + v = self._data.find_one({"_id": mode+"_"+id}) + if v: + res[id] = v + print(f"find one by:{id}") + return res + else: + return None + else: + return None + async def drop(self): """ """ pass diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 884fcb40..96247c05 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -39,6 +39,7 @@ class Neo4JStorage(BaseGraphStorage): URI = os.environ["NEO4J_URI"] USERNAME = os.environ["NEO4J_USERNAME"] PASSWORD = os.environ["NEO4J_PASSWORD"] + MAX_CONNECTION_POOL_SIZE = os.environ.get("NEO4J_MAX_CONNECTION_POOL_SIZE", 800) DATABASE = os.environ.get( "NEO4J_DATABASE" ) # If this param is None, the home database will be used. If it is not None, the specified database will be used. @@ -47,7 +48,7 @@ class Neo4JStorage(BaseGraphStorage): URI, auth=(USERNAME, PASSWORD) ) _database_name = "home database" if DATABASE is None else f"database {DATABASE}" - with GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD)) as _sync_driver: + with GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD), max_connection_pool_size=MAX_CONNECTION_POOL_SIZE) as _sync_driver: try: with _sync_driver.session(database=DATABASE) as session: try: From 1984da0fd6ee17d3f187a13e423ce13aaac9945f Mon Sep 17 00:00:00 2001 From: bingo Date: Mon, 13 Jan 2025 07:27:30 +0000 Subject: [PATCH 09/11] add logger.debug for mongo_impl get_by_mode_and_id() --- lightrag/kg/mongo_impl.py | 12 +++++++----- lightrag/kg/neo4j_impl.py | 6 +++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index 5aab9c07..fbbae8c2 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -45,7 +45,9 @@ class MongoKVStorage(BaseKVStorage): for mode, items in data.items(): for k, v in tqdm_async(items.items(), desc="Upserting"): key = f"{mode}_{k}" - result = self._data.update_one({"_id": key}, {"$setOnInsert": v}, upsert=True) + result = self._data.update_one( + {"_id": key}, {"$setOnInsert": v}, upsert=True + ) if result.upserted_id: logger.debug(f"\nInserted new document with key: {key}") data[mode][k]["_id"] = key @@ -54,20 +56,20 @@ class MongoKVStorage(BaseKVStorage): self._data.update_one({"_id": k}, {"$set": v}, upsert=True) data[k]["_id"] = k return data - + async def get_by_mode_and_id(self, mode: str, id: str) -> Union[dict, None]: if "llm_response_cache" == self.namespace: res = {} - v = self._data.find_one({"_id": mode+"_"+id}) + v = self._data.find_one({"_id": mode + "_" + id}) if v: res[id] = v - print(f"find one by:{id}") + logger.debug(f"llm_response_cache find one by:{id}") return res else: return None else: return None - + async def drop(self): """ """ pass diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 96247c05..8c2afb5d 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -48,7 +48,11 @@ class Neo4JStorage(BaseGraphStorage): URI, auth=(USERNAME, PASSWORD) ) _database_name = "home database" if DATABASE is None else f"database {DATABASE}" - with GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD), max_connection_pool_size=MAX_CONNECTION_POOL_SIZE) as _sync_driver: + with GraphDatabase.driver( + URI, + auth=(USERNAME, PASSWORD), + max_connection_pool_size=MAX_CONNECTION_POOL_SIZE, + ) as _sync_driver: try: with _sync_driver.session(database=DATABASE) as session: try: From c3aba5423f995be628df8dbcb22702d00c9476d9 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Tue, 14 Jan 2025 23:08:39 +0100 Subject: [PATCH 10/11] Added more file types support --- lightrag/api/lightrag_server.py | 300 +++++++++++++++++++++++++++----- lightrag/api/requirements.txt | 1 + 2 files changed, 260 insertions(+), 41 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 5bcb149c..d9f7bf06 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -9,7 +9,7 @@ from lightrag.llm import openai_complete_if_cache, openai_embedding from lightrag.llm import azure_openai_complete_if_cache, azure_openai_embedding from lightrag.utils import EmbeddingFunc -from typing import Optional, List +from typing import Optional, List, Union from enum import Enum from pathlib import Path import shutil @@ -22,6 +22,7 @@ from fastapi.security import APIKeyHeader from fastapi.middleware.cors import CORSMiddleware from starlette.status import HTTP_403_FORBIDDEN +import pipmaster as pm def get_default_host(binding_type: str) -> str: @@ -174,7 +175,7 @@ def parse_args(): class DocumentManager: """Handles document operations and tracking""" - def __init__(self, input_dir: str, supported_extensions: tuple = (".txt", ".md")): + def __init__(self, input_dir: str, supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx")): self.input_dir = Path(input_dir) self.supported_extensions = supported_extensions self.indexed_files = set() @@ -289,7 +290,7 @@ def create_app(args): + "(With authentication)" if api_key else "", - version="1.0.1", + version="1.0.2", openapi_tags=[{"name": "api"}], ) @@ -356,6 +357,85 @@ def create_app(args): ), ) + + + async def index_file(file_path: Union[str, Path]) -> None: + """ Index all files inside the folder with support for multiple file formats + + Args: + file_path: Path to the file to be indexed (str or Path object) + + Raises: + ValueError: If file format is not supported + FileNotFoundError: If file doesn't exist + """ + if not pm.is_installed("aiofiles"): + pm.install("aiofiles") + import aiofiles + + + # Convert to Path object if string + file_path = Path(file_path) + + # Check if file exists + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + content = "" + # Get file extension in lowercase + ext = file_path.suffix.lower() + + match ext: + case ".txt" | ".md": + # Text files handling + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + content = await f.read() + + case ".pdf": + if not pm.is_installed("pypdf2"): + pm.install("pypdf2") + from pypdf2 import PdfReader + # PDF handling + reader = PdfReader(str(file_path)) + content = "" + for page in reader.pages: + content += page.extract_text() + "\n" + + case ".docx": + if not pm.is_installed("docx"): + pm.install("docx") + from docx import Document + + # Word document handling + doc = Document(file_path) + content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) + + case ".pptx": + if not pm.is_installed("pptx"): + pm.install("pptx") + from pptx import Presentation + # PowerPoint handling + prs = Presentation(file_path) + content = "" + for slide in prs.slides: + for shape in slide.shapes: + if hasattr(shape, "text"): + content += shape.text + "\n" + + case _: + raise ValueError(f"Unsupported file format: {ext}") + + # Insert content into RAG system + if content: + await rag.ainsert(content) + doc_manager.mark_as_indexed(file_path) + logging.info(f"Successfully indexed file: {file_path}") + else: + logging.warning(f"No content extracted from file: {file_path}") + + + + @app.on_event("startup") async def startup_event(): """Index all files in input directory during startup""" @@ -363,13 +443,7 @@ def create_app(args): new_files = doc_manager.scan_directory() for file_path in new_files: try: - # Use async file reading - async with aiofiles.open(file_path, "r", encoding="utf-8") as f: - content = await f.read() - # Use the async version of insert directly - await rag.ainsert(content) - doc_manager.mark_as_indexed(file_path) - logging.info(f"Indexed file: {file_path}") + await index_file(file_path) except Exception as e: trace_exception(e) logging.error(f"Error indexing file {file_path}: {str(e)}") @@ -388,11 +462,8 @@ def create_app(args): for file_path in new_files: try: - with open(file_path, "r", encoding="utf-8") as f: - content = f.read() - await rag.ainsert(content) - doc_manager.mark_as_indexed(file_path) - indexed_count += 1 + await index_file(file_path) + indexed_count += 1 except Exception as e: logging.error(f"Error indexing file {file_path}: {str(e)}") @@ -419,10 +490,7 @@ def create_app(args): shutil.copyfileobj(file.file, buffer) # Immediately index the uploaded file - with open(file_path, "r", encoding="utf-8") as f: - content = f.read() - await rag.ainsert(content) - doc_manager.mark_as_indexed(file_path) + await index_file(file_path) return { "status": "success", @@ -491,69 +559,219 @@ def create_app(args): ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) - @app.post( "/documents/file", response_model=InsertResponse, dependencies=[Depends(optional_api_key)], ) async def insert_file(file: UploadFile = File(...), description: str = Form(None)): + """Insert a file directly into the RAG system + + Args: + file: Uploaded file + description: Optional description of the file + + Returns: + InsertResponse: Status of the insertion operation + + Raises: + HTTPException: For unsupported file types or processing errors + """ try: - content = await file.read() + content = "" + # Get file extension in lowercase + ext = Path(file.filename).suffix.lower() + + match ext: + case ".txt" | ".md": + # Text files handling + text_content = await file.read() + content = text_content.decode("utf-8") + + case ".pdf": + if not pm.is_installed("pypdf2"): + pm.install("pypdf2") + from pypdf2 import PdfReader + from io import BytesIO + + # Read PDF from memory + pdf_content = await file.read() + pdf_file = BytesIO(pdf_content) + reader = PdfReader(pdf_file) + content = "" + for page in reader.pages: + content += page.extract_text() + "\n" + + case ".docx": + if not pm.is_installed("docx"): + pm.install("docx") + from docx import Document + from io import BytesIO + + # Read DOCX from memory + docx_content = await file.read() + docx_file = BytesIO(docx_content) + doc = Document(docx_file) + content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) + + case ".pptx": + if not pm.is_installed("pptx"): + pm.install("pptx") + from pptx import Presentation + from io import BytesIO + + # Read PPTX from memory + pptx_content = await file.read() + pptx_file = BytesIO(pptx_content) + prs = Presentation(pptx_file) + content = "" + for slide in prs.slides: + for shape in slide.shapes: + if hasattr(shape, "text"): + content += shape.text + "\n" + + case _: + raise HTTPException( + status_code=400, + detail=f"Unsupported file type. Supported types: {doc_manager.supported_extensions}", + ) - if file.filename.endswith((".txt", ".md")): - text = content.decode("utf-8") - await rag.ainsert(text) + # Insert content into RAG system + if content: + # Add description if provided + if description: + content = f"{description}\n\n{content}" + + await rag.ainsert(content) + logging.info(f"Successfully indexed file: {file.filename}") + + return InsertResponse( + status="success", + message=f"File '{file.filename}' successfully inserted", + document_count=1, + ) else: raise HTTPException( status_code=400, - detail="Unsupported file type. Only .txt and .md files are supported", + detail="No content could be extracted from the file", ) - return InsertResponse( - status="success", - message=f"File '{file.filename}' successfully inserted", - document_count=1, - ) except UnicodeDecodeError: raise HTTPException(status_code=400, detail="File encoding not supported") except Exception as e: + logging.error(f"Error processing file {file.filename}: {str(e)}") raise HTTPException(status_code=500, detail=str(e)) - @app.post( "/documents/batch", response_model=InsertResponse, dependencies=[Depends(optional_api_key)], ) async def insert_batch(files: List[UploadFile] = File(...)): + """Process multiple files in batch mode + + Args: + files: List of files to process + + Returns: + InsertResponse: Status of the batch insertion operation + + Raises: + HTTPException: For processing errors + """ try: inserted_count = 0 failed_files = [] for file in files: try: - content = await file.read() - if file.filename.endswith((".txt", ".md")): - text = content.decode("utf-8") - await rag.ainsert(text) + content = "" + ext = Path(file.filename).suffix.lower() + + match ext: + case ".txt" | ".md": + text_content = await file.read() + content = text_content.decode("utf-8") + + case ".pdf": + if not pm.is_installed("pypdf2"): + pm.install("pypdf2") + from pypdf2 import PdfReader + from io import BytesIO + + pdf_content = await file.read() + pdf_file = BytesIO(pdf_content) + reader = PdfReader(pdf_file) + for page in reader.pages: + content += page.extract_text() + "\n" + + case ".docx": + if not pm.is_installed("docx"): + pm.install("docx") + from docx import Document + from io import BytesIO + + docx_content = await file.read() + docx_file = BytesIO(docx_content) + doc = Document(docx_file) + content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) + + case ".pptx": + if not pm.is_installed("pptx"): + pm.install("pptx") + from pptx import Presentation + from io import BytesIO + + pptx_content = await file.read() + pptx_file = BytesIO(pptx_content) + prs = Presentation(pptx_file) + for slide in prs.slides: + for shape in slide.shapes: + if hasattr(shape, "text"): + content += shape.text + "\n" + + case _: + failed_files.append(f"{file.filename} (unsupported type)") + continue + + if content: + await rag.ainsert(content) inserted_count += 1 + logging.info(f"Successfully indexed file: {file.filename}") else: - failed_files.append(f"{file.filename} (unsupported type)") + failed_files.append(f"{file.filename} (no content extracted)") + + except UnicodeDecodeError: + failed_files.append(f"{file.filename} (encoding error)") except Exception as e: failed_files.append(f"{file.filename} ({str(e)})") + logging.error(f"Error processing file {file.filename}: {str(e)}") - status_message = f"Successfully inserted {inserted_count} documents" - if failed_files: - status_message += f". Failed files: {', '.join(failed_files)}" + # Prepare status message + if inserted_count == len(files): + status = "success" + status_message = f"Successfully inserted all {inserted_count} documents" + elif inserted_count > 0: + status = "partial_success" + status_message = f"Successfully inserted {inserted_count} out of {len(files)} documents" + if failed_files: + status_message += f". Failed files: {', '.join(failed_files)}" + else: + status = "failure" + status_message = "No documents were successfully inserted" + if failed_files: + status_message += f". Failed files: {', '.join(failed_files)}" return InsertResponse( - status="success" if inserted_count > 0 else "partial_success", + status=status, message=status_message, - document_count=len(files), + document_count=inserted_count, ) + except Exception as e: + logging.error(f"Batch processing error: {str(e)}") raise HTTPException(status_code=500, detail=str(e)) + @app.delete( "/documents", response_model=InsertResponse, diff --git a/lightrag/api/requirements.txt b/lightrag/api/requirements.txt index 221d7f40..b8fc41b2 100644 --- a/lightrag/api/requirements.txt +++ b/lightrag/api/requirements.txt @@ -15,3 +15,4 @@ torch tqdm transformers uvicorn +pipmaster \ No newline at end of file From 29661c92da1a9828e320f6238deeb2861d61532f Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Tue, 14 Jan 2025 23:11:23 +0100 Subject: [PATCH 11/11] fixed linting --- lightrag/api/lightrag_server.py | 94 +++++++++++++++++---------------- lightrag/api/requirements.txt | 2 +- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index d9f7bf06..0d154b38 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -175,7 +175,11 @@ def parse_args(): class DocumentManager: """Handles document operations and tracking""" - def __init__(self, input_dir: str, supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx")): + def __init__( + self, + input_dir: str, + supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx"), + ): self.input_dir = Path(input_dir) self.supported_extensions = supported_extensions self.indexed_files = set() @@ -357,26 +361,22 @@ def create_app(args): ), ) - - async def index_file(file_path: Union[str, Path]) -> None: - """ Index all files inside the folder with support for multiple file formats - + """Index all files inside the folder with support for multiple file formats + Args: file_path: Path to the file to be indexed (str or Path object) - + Raises: ValueError: If file format is not supported FileNotFoundError: If file doesn't exist """ if not pm.is_installed("aiofiles"): pm.install("aiofiles") - import aiofiles - - + # Convert to Path object if string file_path = Path(file_path) - + # Check if file exists if not file_path.exists(): raise FileNotFoundError(f"File not found: {file_path}") @@ -384,23 +384,24 @@ def create_app(args): content = "" # Get file extension in lowercase ext = file_path.suffix.lower() - + match ext: case ".txt" | ".md": # Text files handling async with aiofiles.open(file_path, "r", encoding="utf-8") as f: content = await f.read() - + case ".pdf": if not pm.is_installed("pypdf2"): pm.install("pypdf2") from pypdf2 import PdfReader + # PDF handling reader = PdfReader(str(file_path)) content = "" for page in reader.pages: content += page.extract_text() + "\n" - + case ".docx": if not pm.is_installed("docx"): pm.install("docx") @@ -409,11 +410,12 @@ def create_app(args): # Word document handling doc = Document(file_path) content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) - + case ".pptx": if not pm.is_installed("pptx"): pm.install("pptx") from pptx import Presentation + # PowerPoint handling prs = Presentation(file_path) content = "" @@ -421,7 +423,7 @@ def create_app(args): for shape in slide.shapes: if hasattr(shape, "text"): content += shape.text + "\n" - + case _: raise ValueError(f"Unsupported file format: {ext}") @@ -433,9 +435,6 @@ def create_app(args): else: logging.warning(f"No content extracted from file: {file_path}") - - - @app.on_event("startup") async def startup_event(): """Index all files in input directory during startup""" @@ -559,6 +558,7 @@ def create_app(args): ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) + @app.post( "/documents/file", response_model=InsertResponse, @@ -566,14 +566,14 @@ def create_app(args): ) async def insert_file(file: UploadFile = File(...), description: str = Form(None)): """Insert a file directly into the RAG system - + Args: file: Uploaded file description: Optional description of the file - + Returns: InsertResponse: Status of the insertion operation - + Raises: HTTPException: For unsupported file types or processing errors """ @@ -581,19 +581,19 @@ def create_app(args): content = "" # Get file extension in lowercase ext = Path(file.filename).suffix.lower() - + match ext: case ".txt" | ".md": # Text files handling text_content = await file.read() content = text_content.decode("utf-8") - + case ".pdf": if not pm.is_installed("pypdf2"): pm.install("pypdf2") from pypdf2 import PdfReader from io import BytesIO - + # Read PDF from memory pdf_content = await file.read() pdf_file = BytesIO(pdf_content) @@ -601,25 +601,27 @@ def create_app(args): content = "" for page in reader.pages: content += page.extract_text() + "\n" - + case ".docx": if not pm.is_installed("docx"): pm.install("docx") from docx import Document from io import BytesIO - + # Read DOCX from memory docx_content = await file.read() docx_file = BytesIO(docx_content) doc = Document(docx_file) - content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) - + content = "\n".join( + [paragraph.text for paragraph in doc.paragraphs] + ) + case ".pptx": if not pm.is_installed("pptx"): pm.install("pptx") from pptx import Presentation from io import BytesIO - + # Read PPTX from memory pptx_content = await file.read() pptx_file = BytesIO(pptx_content) @@ -629,7 +631,7 @@ def create_app(args): for shape in slide.shapes: if hasattr(shape, "text"): content += shape.text + "\n" - + case _: raise HTTPException( status_code=400, @@ -641,10 +643,10 @@ def create_app(args): # Add description if provided if description: content = f"{description}\n\n{content}" - + await rag.ainsert(content) logging.info(f"Successfully indexed file: {file.filename}") - + return InsertResponse( status="success", message=f"File '{file.filename}' successfully inserted", @@ -661,6 +663,7 @@ def create_app(args): except Exception as e: logging.error(f"Error processing file {file.filename}: {str(e)}") raise HTTPException(status_code=500, detail=str(e)) + @app.post( "/documents/batch", response_model=InsertResponse, @@ -668,13 +671,13 @@ def create_app(args): ) async def insert_batch(files: List[UploadFile] = File(...)): """Process multiple files in batch mode - + Args: files: List of files to process - + Returns: InsertResponse: Status of the batch insertion operation - + Raises: HTTPException: For processing errors """ @@ -686,41 +689,43 @@ def create_app(args): try: content = "" ext = Path(file.filename).suffix.lower() - + match ext: case ".txt" | ".md": text_content = await file.read() content = text_content.decode("utf-8") - + case ".pdf": if not pm.is_installed("pypdf2"): pm.install("pypdf2") from pypdf2 import PdfReader from io import BytesIO - + pdf_content = await file.read() pdf_file = BytesIO(pdf_content) reader = PdfReader(pdf_file) for page in reader.pages: content += page.extract_text() + "\n" - + case ".docx": if not pm.is_installed("docx"): pm.install("docx") from docx import Document from io import BytesIO - + docx_content = await file.read() docx_file = BytesIO(docx_content) doc = Document(docx_file) - content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) - + content = "\n".join( + [paragraph.text for paragraph in doc.paragraphs] + ) + case ".pptx": if not pm.is_installed("pptx"): pm.install("pptx") from pptx import Presentation from io import BytesIO - + pptx_content = await file.read() pptx_file = BytesIO(pptx_content) prs = Presentation(pptx_file) @@ -728,7 +733,7 @@ def create_app(args): for shape in slide.shapes: if hasattr(shape, "text"): content += shape.text + "\n" - + case _: failed_files.append(f"{file.filename} (unsupported type)") continue @@ -771,7 +776,6 @@ def create_app(args): logging.error(f"Batch processing error: {str(e)}") raise HTTPException(status_code=500, detail=str(e)) - @app.delete( "/documents", response_model=InsertResponse, diff --git a/lightrag/api/requirements.txt b/lightrag/api/requirements.txt index b8fc41b2..9154809c 100644 --- a/lightrag/api/requirements.txt +++ b/lightrag/api/requirements.txt @@ -7,6 +7,7 @@ nest_asyncio numpy ollama openai +pipmaster python-dotenv python-multipart tenacity @@ -15,4 +16,3 @@ torch tqdm transformers uvicorn -pipmaster \ No newline at end of file