From 1a25a78e8a92a8601e04852464f0fe9504816af8 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Sun, 30 Mar 2025 10:25:49 +0800
Subject: [PATCH 001/116] feat: remove check_storage_env_vars and add TODOs

- Remove unused check_storage_env_vars method
- Add TODO to check if has_edge works on reverse relation
- Add TODO about entities_vdb.client_storage local storage limitation
---
 lightrag/lightrag.py | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index dece78b4..0797ec2f 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -1494,6 +1494,7 @@ class LightRAG:
             target_entity: Name of the target entity
         """
         try:
+            # TODO: check if has_edge function works on reverse relation
             # Check if the relation exists
             edge_exists = await self.chunk_entity_relation_graph.has_edge(
                 source_entity, target_entity
             )
@@ -1586,6 +1587,8 @@ class LightRAG:
             chunk_ids = set(related_chunks.keys())
             logger.debug(f"Found {len(chunk_ids)} chunks to delete")
 
+            # TODO: self.entities_vdb.client_storage only works for local storage, need to fix this
+
             # 3. Before deleting, check the related entities and relationships for these chunks
             for chunk_id in chunk_ids:
                 # Check entities
@@ -1857,24 +1860,6 @@ class LightRAG:
 
         return result
 
-    def check_storage_env_vars(self, storage_name: str) -> None:
-        """Check if all required environment variables for storage implementation exist
-
-        Args:
-            storage_name: Storage implementation name
-
-        Raises:
-            ValueError: If required environment variables are missing
-        """
-        required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, [])
-        missing_vars = [var for var in required_vars if var not in os.environ]
-
-        if missing_vars:
-            raise ValueError(
-                f"Storage implementation '{storage_name}' requires the following "
-                f"environment variables: {', '.join(missing_vars)}"
-            )
-
     async def aclear_cache(self, modes: list[str] | None = None) -> None:
         """Clear cache data from the LLM response cache storage.

From 1df4b777d7bbdcef0161f9b84cdf594a7c3f2fd1 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Sun, 30 Mar 2025 15:17:57 +0800
Subject: [PATCH 002/116] Add drop functions to storage implementations

---
 lightrag/base.py                    | 25 ++++++++
 lightrag/kg/chroma_impl.py          | 28 ++++++++-
 lightrag/kg/faiss_impl.py           | 35 +++++++++++
 lightrag/kg/json_doc_status_impl.py | 28 +++++++--
 lightrag/kg/json_kv_impl.py         | 23 +++++++
 lightrag/kg/milvus_impl.py          | 30 ++++++++-
 lightrag/kg/nano_vector_db_impl.py  | 36 +++++++++++
 lightrag/kg/neo4j_impl.py           | 23 +++++++
 lightrag/kg/networkx_impl.py        | 31 ++++++++++
 lightrag/kg/qdrant_impl.py          | 95 +++++++++++++++++++++++++++--
 10 files changed, 339 insertions(+), 15 deletions(-)

diff --git a/lightrag/base.py b/lightrag/base.py
index ad41fc58..bff92b34 100644
--- a/lightrag/base.py
+++ b/lightrag/base.py
@@ -111,6 +111,31 @@ class StorageNameSpace(ABC):
     @abstractmethod
     async def index_done_callback(self) -> None:
         """Commit the storage operations after indexing"""
+
+    @abstractmethod
+    async def drop(self) -> dict[str, str]:
+        """Drop all data from storage and clean up resources
+
+        This abstract method defines the contract for dropping all data from a storage implementation.
+        Each storage type must implement this method to:
+        1. Clear all data from memory and/or external storage
+        2. Remove any associated storage files if applicable
+        3. Reset the storage to its initial state
+        4. Handle cleanup of any resources
+        5. 
Notify other processes if necessary + + Returns: + dict[str, str]: Operation status and message with the following format: + { + "status": str, # "success" or "error" + "message": str # "data dropped" on success, error details on failure + } + + Implementation specific: + - On success: return {"status": "success", "message": "data dropped"} + - On failure: return {"status": "error", "message": ""} + - If not supported: return {"status": "error", "message": "unsupported"} + """ @dataclass diff --git a/lightrag/kg/chroma_impl.py b/lightrag/kg/chroma_impl.py index 84d43326..052088d4 100644 --- a/lightrag/kg/chroma_impl.py +++ b/lightrag/kg/chroma_impl.py @@ -1,4 +1,5 @@ import asyncio +import os from dataclasses import dataclass from typing import Any, final import numpy as np @@ -10,8 +11,8 @@ import pipmaster as pm if not pm.is_installed("chromadb"): pm.install("chromadb") -from chromadb import HttpClient, PersistentClient -from chromadb.config import Settings +from chromadb import HttpClient, PersistentClient # type: ignore +from chromadb.config import Settings # type: ignore @final @@ -335,3 +336,26 @@ class ChromaVectorDBStorage(BaseVectorStorage): except Exception as e: logger.error(f"Error retrieving vector data for IDs {ids}: {e}") return [] + + async def drop(self) -> dict[str, str]: + """Drop all vector data from storage and clean up resources + + This method will delete all documents from the ChromaDB collection. + + Returns: + dict[str, str]: Operation status and message + - On success: {"status": "success", "message": "data dropped"} + - On failure: {"status": "error", "message": ""} + """ + try: + # Get all IDs in the collection + result = self._collection.get(include=[]) + if result and result["ids"] and len(result["ids"]) > 0: + # Delete all documents + self._collection.delete(ids=result["ids"]) + + logger.info(f"Process {os.getpid()} drop ChromaDB collection {self.namespace}") + return {"status": "success", "message": "data dropped"} + except Exception as e: + logger.error(f"Error dropping ChromaDB collection {self.namespace}: {e}") + return {"status": "error", "message": str(e)} diff --git a/lightrag/kg/faiss_impl.py b/lightrag/kg/faiss_impl.py index b8176037..42133090 100644 --- a/lightrag/kg/faiss_impl.py +++ b/lightrag/kg/faiss_impl.py @@ -429,3 +429,38 @@ class FaissVectorDBStorage(BaseVectorStorage): results.append({**metadata, "id": metadata.get("__id__")}) return results + + async def drop(self) -> dict[str, str]: + """Drop all vector data from storage and clean up resources + + This method will remove all vectors from the Faiss index and delete the storage files. 
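+        The index is re-created empty in memory, and the cross-process update
+        flags are raised so that sibling workers reload the emptied index
+        instead of keeping a stale snapshot.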
+
+        Returns:
+            dict[str, str]: Operation status and message
+            - On success: {"status": "success", "message": "data dropped"}
+            - On failure: {"status": "error", "message": ""}
+        """
+        try:
+            async with self._storage_lock:
+                # Reset the index
+                self._index = faiss.IndexFlatIP(self._dim)
+                self._id_to_meta = {}
+
+                # Remove storage files if they exist
+                if os.path.exists(self._faiss_index_file):
+                    os.remove(self._faiss_index_file)
+                if os.path.exists(self._meta_file):
+                    os.remove(self._meta_file)
+
+                self._id_to_meta = {}
+                self._load_faiss_index()
+
+                # Notify other processes
+                await set_all_update_flags(self.namespace)
+                self.storage_updated.value = False
+
+            logger.info(f"Process {os.getpid()} drop FAISS index {self.namespace}")
+            return {"status": "success", "message": "data dropped"}
+        except Exception as e:
+            logger.error(f"Error dropping FAISS index {self.namespace}: {e}")
+            return {"status": "error", "message": str(e)}
diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py
index 22da07b5..0a3f5470 100644
--- a/lightrag/kg/json_doc_status_impl.py
+++ b/lightrag/kg/json_doc_status_impl.py
@@ -129,9 +129,25 @@ class JsonDocStatusStorage(DocStatusStorage):
         await set_all_update_flags(self.namespace)
         await self.index_done_callback()
 
-    async def drop(self) -> None:
-        """Drop the storage"""
-        async with self._storage_lock:
-            self._data.clear()
-            await set_all_update_flags(self.namespace)
-            await self.index_done_callback()
+    async def drop(self) -> dict[str, str]:
+        """Drop all document status data from storage and clean up resources
+
+        This method will:
+        1. Clear all document status data from memory
+        2. Update flags to notify other processes
+        3. Trigger index_done_callback to save the empty state
+
+        Returns:
+            dict[str, str]: Operation status and message
+            - On success: {"status": "success", "message": "data dropped"}
+            - On failure: {"status": "error", "message": ""}
+        """
+        try:
+            async with self._storage_lock:
+                self._data.clear()
+            await self.index_done_callback()
+            logger.info(f"Process {os.getpid()} drop {self.namespace}")
+            return {"status": "success", "message": "data dropped"}
+        except Exception as e:
+            logger.error(f"Error dropping {self.namespace}: {e}")
+            return {"status": "error", "message": str(e)}
diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py
index e7deaf15..2ca9c03e 100644
--- a/lightrag/kg/json_kv_impl.py
+++ b/lightrag/kg/json_kv_impl.py
@@ -127,3 +127,26 @@ class JsonKVStorage(BaseKVStorage):
             self._data.pop(doc_id, None)
             await set_all_update_flags(self.namespace)
             await self.index_done_callback()
+
+    async def drop(self) -> dict[str, str]:
+        """Drop all data from storage and clean up resources
+
+        This method will:
+        1. Clear all data from memory
+        2. Update flags to notify other processes
+        3. Trigger index_done_callback to save the empty state
+
+        Returns:
+            dict[str, str]: Operation status and message
+            - On success: {"status": "success", "message": "data dropped"}
+            - On failure: {"status": "error", "message": ""}
+        """
+        try:
+            async with self._storage_lock:
+                self._data.clear()
+            await self.index_done_callback()
+            logger.info(f"Process {os.getpid()} drop {self.namespace}")
+            return {"status": "success", "message": "data dropped"}
+        except Exception as e:
+            logger.error(f"Error dropping {self.namespace}: {e}")
+            return {"status": "error", "message": str(e)}
diff --git a/lightrag/kg/milvus_impl.py b/lightrag/kg/milvus_impl.py
index 4b4577ca..74cf416a 100644
--- a/lightrag/kg/milvus_impl.py
+++ b/lightrag/kg/milvus_impl.py
@@ -15,7 +15,7 @@ if not pm.is_installed("pymilvus"):
     pm.install("pymilvus")
 
 import configparser
-from pymilvus import MilvusClient
+from pymilvus import MilvusClient  # type: ignore
 
 config = configparser.ConfigParser()
 config.read("config.ini", "utf-8")
@@ -287,3 +287,31 @@ class MilvusVectorDBStorage(BaseVectorStorage):
         except Exception as e:
             logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
             return []
+
+    async def drop(self) -> dict[str, str]:
+        """Drop all vector data from storage and clean up resources
+
+        This method will delete all data from the Milvus collection.
+
+        Returns:
+            dict[str, str]: Operation status and message
+            - On success: {"status": "success", "message": "data dropped"}
+            - On failure: {"status": "error", "message": ""}
+        """
+        try:
+            # Drop the collection and recreate it
+            if self._client.has_collection(self.namespace):
+                self._client.drop_collection(self.namespace)
+
+            # Recreate the collection
+            MilvusVectorDBStorage.create_collection_if_not_exist(
+                self._client,
+                self.namespace,
+                dimension=self.embedding_func.embedding_dim,
+            )
+
+            logger.info(f"Process {os.getpid()} drop Milvus collection {self.namespace}")
+            return {"status": "success", "message": "data dropped"}
+        except Exception as e:
+            logger.error(f"Error dropping Milvus collection {self.namespace}: {e}")
+            return {"status": "error", "message": str(e)}
diff --git a/lightrag/kg/nano_vector_db_impl.py b/lightrag/kg/nano_vector_db_impl.py
index 553ba0b2..0f907a42 100644
--- a/lightrag/kg/nano_vector_db_impl.py
+++ b/lightrag/kg/nano_vector_db_impl.py
@@ -280,3 +280,39 @@ class NanoVectorDBStorage(BaseVectorStorage):
 
         client = await self._get_client()
         return client.get(ids)
+
+    async def drop(self) -> dict[str, str]:
+        """Drop all vector data from storage and clean up resources
+
+        This method will:
+        1. Remove the vector database storage file if it exists
+        2. Reinitialize the vector database client
+        3. Update flags to notify other processes
+        4. 
Trigger index_done_callback to save the empty state + + Returns: + dict[str, str]: Operation status and message + - On success: {"status": "success", "message": "data dropped"} + - On failure: {"status": "error", "message": ""} + """ + try: + async with self._storage_lock: + # delete _client_file_name + if os.path.exists(self._client_file_name): + os.remove(self._client_file_name) + + self._client = NanoVectorDB( + self.embedding_func.embedding_dim, + storage_file=self._client_file_name, + ) + + # Notify other processes that data has been updated + await set_all_update_flags(self.namespace) + # Reset own update flag to avoid self-reloading + self.storage_updated.value = False + + logger.info(f"Process {os.getpid()} drop {self.namespace}(file:{self._client_file_name})") + return {"status": "success", "message": "data dropped"} + except Exception as e: + logger.error(f"Error dropping {self.namespace}: {e}") + return {"status": "error", "message": str(e)} diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 2df420df..3f2545a7 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -1028,3 +1028,26 @@ class Neo4JStorage(BaseGraphStorage): self, algorithm: str ) -> tuple[np.ndarray[Any, Any], list[str]]: raise NotImplementedError + + async def drop(self) -> dict[str, str]: + """Drop all data from storage and clean up resources + + This method will delete all nodes and relationships in the Neo4j database. + + Returns: + dict[str, str]: Operation status and message + - On success: {"status": "success", "message": "data dropped"} + - On failure: {"status": "error", "message": ""} + """ + try: + async with self._driver.session(database=self._DATABASE) as session: + # Delete all nodes and relationships + query = "MATCH (n) DETACH DELETE n" + result = await session.run(query) + await result.consume() # Ensure result is fully consumed + + logger.info(f"Process {os.getpid()} drop Neo4j database {self._DATABASE}") + return {"status": "success", "message": "data dropped"} + except Exception as e: + logger.error(f"Error dropping Neo4j database {self._DATABASE}: {e}") + return {"status": "error", "message": str(e)} diff --git a/lightrag/kg/networkx_impl.py b/lightrag/kg/networkx_impl.py index 324fe7af..99e0e223 100644 --- a/lightrag/kg/networkx_impl.py +++ b/lightrag/kg/networkx_impl.py @@ -42,6 +42,7 @@ class NetworkXStorage(BaseGraphStorage): ) nx.write_graphml(graph, file_name) + # TODO:deprecated, remove later @staticmethod def _stabilize_graph(graph: nx.Graph) -> nx.Graph: """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py @@ -424,3 +425,33 @@ class NetworkXStorage(BaseGraphStorage): return False # Return error return True + + async def drop(self) -> dict[str, str]: + """Drop all graph data from storage and clean up resources + + This method will: + 1. Remove the graph storage file if it exists + 2. Reset the graph to an empty state + 3. Update flags to notify other processes + 4. 
Trigger index_done_callback to save the empty state + + Returns: + dict[str, str]: Operation status and message + - On success: {"status": "success", "message": "data dropped"} + - On failure: {"status": "error", "message": ""} + """ + try: + async with self._storage_lock: + # delete _client_file_name + if os.path.exists(self._graphml_xml_file): + os.remove(self._graphml_xml_file) + self._graph = nx.Graph() + # Notify other processes that data has been updated + await set_all_update_flags(self.namespace) + # Reset own update flag to avoid self-reloading + self.storage_updated.value = False + logger.info(f"Process {os.getpid()} drop graph {self.namespace} (file:{self._graphml_xml_file})") + return {"status": "success", "message": "data dropped"} + except Exception as e: + logger.error(f"Error dropping graph {self.namespace}: {e}") + return {"status": "error", "message": str(e)} diff --git a/lightrag/kg/qdrant_impl.py b/lightrag/kg/qdrant_impl.py index e32c4335..855b98ae 100644 --- a/lightrag/kg/qdrant_impl.py +++ b/lightrag/kg/qdrant_impl.py @@ -8,18 +8,15 @@ import uuid from ..utils import logger from ..base import BaseVectorStorage import configparser - - -config = configparser.ConfigParser() -config.read("config.ini", "utf-8") - import pipmaster as pm if not pm.is_installed("qdrant-client"): pm.install("qdrant-client") -from qdrant_client import QdrantClient, models +from qdrant_client import QdrantClient, models # type: ignore +config = configparser.ConfigParser() +config.read("config.ini", "utf-8") def compute_mdhash_id_for_qdrant( content: str, prefix: str = "", style: str = "simple" @@ -275,3 +272,89 @@ class QdrantVectorDBStorage(BaseVectorStorage): except Exception as e: logger.error(f"Error searching for prefix '{prefix}': {e}") return [] + + async def get_by_id(self, id: str) -> dict[str, Any] | None: + """Get vector data by its ID + + Args: + id: The unique identifier of the vector + + Returns: + The vector data if found, or None if not found + """ + try: + # Convert to Qdrant compatible ID + qdrant_id = compute_mdhash_id_for_qdrant(id) + + # Retrieve the point by ID + result = self._client.retrieve( + collection_name=self.namespace, + ids=[qdrant_id], + with_payload=True, + ) + + if not result: + return None + + return result[0].payload + except Exception as e: + logger.error(f"Error retrieving vector data for ID {id}: {e}") + return None + + async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]: + """Get multiple vector data by their IDs + + Args: + ids: List of unique identifiers + + Returns: + List of vector data objects that were found + """ + if not ids: + return [] + + try: + # Convert to Qdrant compatible IDs + qdrant_ids = [compute_mdhash_id_for_qdrant(id) for id in ids] + + # Retrieve the points by IDs + results = self._client.retrieve( + collection_name=self.namespace, + ids=qdrant_ids, + with_payload=True, + ) + + return [point.payload for point in results] + except Exception as e: + logger.error(f"Error retrieving vector data for IDs {ids}: {e}") + return [] + + async def drop(self) -> dict[str, str]: + """Drop all vector data from storage and clean up resources + + This method will delete all data from the Qdrant collection. 
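+        The collection is dropped and immediately recreated with the same
+        vector configuration (embedding dimension and cosine distance), so
+        subsequent upserts keep working without re-initialization.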
+ + Returns: + dict[str, str]: Operation status and message + - On success: {"status": "success", "message": "data dropped"} + - On failure: {"status": "error", "message": ""} + """ + try: + # Delete the collection and recreate it + if self._client.collection_exists(self.namespace): + self._client.delete_collection(self.namespace) + + # Recreate the collection + QdrantVectorDBStorage.create_collection_if_not_exist( + self._client, + self.namespace, + vectors_config=models.VectorParams( + size=self.embedding_func.embedding_dim, distance=models.Distance.COSINE + ), + ) + + logger.info(f"Process {os.getpid()} drop Qdrant collection {self.namespace}") + return {"status": "success", "message": "data dropped"} + except Exception as e: + logger.error(f"Error dropping Qdrant collection {self.namespace}: {e}") + return {"status": "error", "message": str(e)} From 53ab5e015c2e55c0b82740fce1bb63271277d67f Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 30 Mar 2025 15:25:04 +0800 Subject: [PATCH 003/116] remove check_storage_env_vars from lightrag.py --- lightrag/lightrag.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 0797ec2f..62e1b279 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -13,7 +13,6 @@ import pandas as pd from lightrag.kg import ( - STORAGE_ENV_REQUIREMENTS, STORAGES, verify_storage_implementation, ) @@ -1588,7 +1587,7 @@ class LightRAG: logger.debug(f"Found {len(chunk_ids)} chunks to delete") # TODO: self.entities_vdb.client_storage only works for local storage, need to fix this - + # 3. Before deleting, check the related entities and relationships for these chunks for chunk_id in chunk_ids: # Check entities From 949a3904a9dd56242db52bb74f53438605a30d55 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 30 Mar 2025 16:30:41 +0800 Subject: [PATCH 004/116] feat(api): Enhance document clearing functionality - Use storage drop methods to properly clean up all data - Add file deletion from input directory - Add pipeline status checking and locking mechanism - Improve error handling with detailed logging and pipeline message tracking --- lightrag/api/routers/document_routes.py | 144 ++++++++++++++++++++++-- 1 file changed, 136 insertions(+), 8 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 445008ec..e683b30b 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -443,6 +443,7 @@ async def pipeline_index_texts(rag: LightRAG, texts: List[str]): await rag.apipeline_process_enqueue_documents() +# TODO: deprecate after /insert_file is removed async def save_temp_file(input_dir: Path, file: UploadFile = File(...)) -> Path: """Save the uploaded file to a temporary location @@ -645,6 +646,7 @@ def create_document_routes( logger.error(traceback.format_exc()) raise HTTPException(status_code=500, detail=str(e)) + # TODO: deprecated, use /upload instead @router.post( "/file", response_model=InsertResponse, dependencies=[Depends(combined_auth)] ) @@ -688,6 +690,7 @@ def create_document_routes( logger.error(traceback.format_exc()) raise HTTPException(status_code=500, detail=str(e)) + # TODO: deprecated, use /upload instead @router.post( "/file_batch", response_model=InsertResponse, @@ -758,26 +761,151 @@ def create_document_routes( """ Clear all documents from the RAG system. 
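+        The operation is mutually exclusive with the indexing pipeline: it
+        fails fast with an error response while the pipeline is busy.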
- This endpoint deletes all text chunks, entities vector database, and relationships - vector database, effectively clearing all documents from the RAG system. + This endpoint deletes all documents, entities, relationships, and files from the system. + It uses the storage drop methods to properly clean up all data and removes all files + from the input directory. Returns: InsertResponse: A response object containing the status and message. Raises: - HTTPException: If an error occurs during the clearing process (500). + HTTPException: If an error occurs during the clearing process (500) or if + the pipeline is busy (400). """ + from lightrag.kg.shared_storage import get_namespace_data, get_pipeline_status_lock + + # Get pipeline status and lock + pipeline_status = await get_namespace_data("pipeline_status") + pipeline_status_lock = get_pipeline_status_lock() + + # Check and set status with lock + async with pipeline_status_lock: + if pipeline_status.get("busy", False): + return InsertResponse( + status="error", + message="Cannot clear documents while pipeline is busy" + ) + # Set busy to true + pipeline_status["busy"] = True + pipeline_status["job_name"] = "Clearing Documents" + pipeline_status["latest_message"] = "Starting document clearing process" + if "history_messages" in pipeline_status: + pipeline_status["history_messages"].append("Starting document clearing process") + try: - rag.text_chunks = [] - rag.entities_vdb = None - rag.relationships_vdb = None + # Use drop method to clear all data + drop_tasks = [] + storages = [ + rag.text_chunks, + rag.full_docs, + rag.entities_vdb, + rag.relationships_vdb, + rag.chunks_vdb, + rag.chunk_entity_relation_graph, + rag.doc_status + ] + + # Log storage drop start + if "history_messages" in pipeline_status: + pipeline_status["history_messages"].append("Starting to drop storage components") + + for storage in storages: + if storage is not None: + drop_tasks.append(storage.drop()) + + # Wait for all drop tasks to complete + drop_results = await asyncio.gather(*drop_tasks, return_exceptions=True) + + # Check for errors and log results + errors = [] + storage_success_count = 0 + storage_error_count = 0 + + for i, result in enumerate(drop_results): + storage_name = storages[i].__class__.__name__ + if isinstance(result, Exception): + error_msg = f"Error dropping {storage_name}: {str(result)}" + errors.append(error_msg) + logger.error(error_msg) + storage_error_count += 1 + else: + logger.info(f"Successfully dropped {storage_name}") + storage_success_count += 1 + + # Log storage drop results + if "history_messages" in pipeline_status: + if storage_error_count > 0: + pipeline_status["history_messages"].append( + f"Dropped {storage_success_count} storage components with {storage_error_count} errors" + ) + else: + pipeline_status["history_messages"].append( + f"Successfully dropped all {storage_success_count} storage components" + ) + + # Log file deletion start + if "history_messages" in pipeline_status: + pipeline_status["history_messages"].append("Starting to delete files in input directory") + + # Delete all files in input_dir + deleted_files_count = 0 + file_errors_count = 0 + + for file_path in doc_manager.input_dir.glob("**/*"): + if file_path.is_file(): + try: + file_path.unlink() + deleted_files_count += 1 + except Exception as e: + logger.error(f"Error deleting file {file_path}: {str(e)}") + file_errors_count += 1 + + # Log file deletion results + if "history_messages" in pipeline_status: + if file_errors_count > 0: + 
pipeline_status["history_messages"].append( + f"Deleted {deleted_files_count} files with {file_errors_count} errors" + ) + errors.append(f"Failed to delete {file_errors_count} files") + else: + pipeline_status["history_messages"].append( + f"Successfully deleted {deleted_files_count} files" + ) + + # Prepare final result message + final_message = "" + if errors: + final_message = f"Cleared documents with some errors. Deleted {deleted_files_count} files." + status = "partial_success" + else: + final_message = f"All documents cleared successfully. Deleted {deleted_files_count} files." + status = "success" + + + # Log final result + if "history_messages" in pipeline_status: + pipeline_status["history_messages"].append(final_message) + + # Return response based on results return InsertResponse( - status="success", message="All documents cleared successfully" + status=status, + message=final_message ) except Exception as e: - logger.error(f"Error DELETE /documents: {str(e)}") + error_msg = f"Error clearing documents: {str(e)}" + logger.error(error_msg) logger.error(traceback.format_exc()) + if "history_messages" in pipeline_status: + pipeline_status["history_messages"].append(error_msg) raise HTTPException(status_code=500, detail=str(e)) + finally: + # Reset busy status after completion + async with pipeline_status_lock: + pipeline_status["busy"] = False + completion_msg = "Document clearing process completed" + pipeline_status["latest_message"] = completion_msg + if "history_messages" in pipeline_status: + pipeline_status["history_messages"].append(completion_msg) @router.get( "/pipeline_status", From 8b125488e9cbf47a66005006e8290f0970b1df61 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 30 Mar 2025 16:31:57 +0800 Subject: [PATCH 005/116] Bump api version to 1.2.9 --- lightrag/api/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/api/__init__.py b/lightrag/api/__init__.py index ec1959de..ed0d5292 100644 --- a/lightrag/api/__init__.py +++ b/lightrag/api/__init__.py @@ -1 +1 @@ -__api_version__ = "1.2.8" +__api_version__ = "1.2.9" From 637d37eec49510da8dc4037909cc62697ec07cc1 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 01:03:41 +0800 Subject: [PATCH 006/116] Update drop implementation for all storage type of PostgreSQL --- lightrag/kg/postgres_impl.py | 110 +++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 51 deletions(-) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 4ff34e13..02fd68a0 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -380,10 +380,20 @@ class PGKVStorage(BaseKVStorage): # PG handles persistence automatically pass - async def drop(self) -> None: + async def drop(self) -> dict[str, str]: """Drop the storage""" - drop_sql = SQL_TEMPLATES["drop_all"] - await self.db.execute(drop_sql) + try: + table_name = namespace_to_table_name(self.namespace) + if not table_name: + return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} + + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( + table_name=table_name + ) + await self.db.execute(drop_sql, {"workspace": self.db.workspace}) + return {"status": "success", "message": "data dropped"} + except Exception as e: + return {"status": "error", "message": str(e)} @final @@ -690,6 +700,21 @@ class PGVectorStorage(BaseVectorStorage): logger.error(f"Error retrieving vector data for IDs {ids}: {e}") return [] + async def drop(self) -> dict[str, str]: + """Drop the storage""" + 
try: + table_name = namespace_to_table_name(self.namespace) + if not table_name: + return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} + + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( + table_name=table_name + ) + await self.db.execute(drop_sql, {"workspace": self.db.workspace}) + return {"status": "success", "message": "data dropped"} + except Exception as e: + return {"status": "error", "message": str(e)} + @final @dataclass @@ -846,10 +871,20 @@ class PGDocStatusStorage(DocStatusStorage): }, ) - async def drop(self) -> None: + async def drop(self) -> dict[str, str]: """Drop the storage""" - drop_sql = SQL_TEMPLATES["drop_doc_full"] - await self.db.execute(drop_sql) + try: + table_name = namespace_to_table_name(self.namespace) + if not table_name: + return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} + + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( + table_name=table_name + ) + await self.db.execute(drop_sql, {"workspace": self.db.workspace}) + return {"status": "success", "message": "data dropped"} + except Exception as e: + return {"status": "error", "message": str(e)} class PGGraphQueryException(Exception): @@ -1530,12 +1565,19 @@ class PGGraphStorage(BaseGraphStorage): return kg - async def drop(self) -> None: + async def drop(self) -> dict[str, str]: """Drop the storage""" - drop_sql = SQL_TEMPLATES["drop_vdb_entity"] - await self.db.execute(drop_sql) - drop_sql = SQL_TEMPLATES["drop_vdb_relation"] - await self.db.execute(drop_sql) + try: + drop_query = f"""SELECT * FROM cypher('{self.graph_name}', $$ + MATCH (n) + DETACH DELETE n + $$) AS (result agtype)""" + + await self._query(drop_query, readonly=False) + return {"status": "success", "message": "graph data dropped"} + except Exception as e: + logger.error(f"Error dropping graph: {e}") + return {"status": "error", "message": str(e)} NAMESPACE_TABLE_MAP = { @@ -1693,6 +1735,7 @@ SQL_TEMPLATES = { file_path=EXCLUDED.file_path, update_time = CURRENT_TIMESTAMP """, + # SQL for VectorStorage "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, content_vector, chunk_ids, file_path) VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7) @@ -1715,46 +1758,7 @@ SQL_TEMPLATES = { chunk_ids=EXCLUDED.chunk_ids, file_path=EXCLUDED.file_path, update_time = CURRENT_TIMESTAMP - """, - # SQL for VectorStorage - # "entities": """SELECT entity_name FROM - # (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - # FROM LIGHTRAG_VDB_ENTITY where workspace=$1) - # WHERE distance>$2 ORDER BY distance DESC LIMIT $3 - # """, - # "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM - # (SELECT id, source_id,target_id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - # FROM LIGHTRAG_VDB_RELATION where workspace=$1) - # WHERE distance>$2 ORDER BY distance DESC LIMIT $3 - # """, - # "chunks": """SELECT id FROM - # (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - # FROM LIGHTRAG_DOC_CHUNKS where workspace=$1) - # WHERE distance>$2 ORDER BY distance DESC LIMIT $3 - # """, - # DROP tables - "drop_all": """ - DROP TABLE IF EXISTS LIGHTRAG_DOC_FULL CASCADE; - DROP TABLE IF EXISTS LIGHTRAG_DOC_CHUNKS CASCADE; - DROP TABLE IF EXISTS LIGHTRAG_LLM_CACHE CASCADE; - DROP TABLE IF EXISTS LIGHTRAG_VDB_ENTITY CASCADE; - DROP TABLE IF EXISTS LIGHTRAG_VDB_RELATION CASCADE; - """, - "drop_doc_full": """ - DROP TABLE IF EXISTS 
LIGHTRAG_DOC_FULL CASCADE; - """, - "drop_doc_chunks": """ - DROP TABLE IF EXISTS LIGHTRAG_DOC_CHUNKS CASCADE; - """, - "drop_llm_cache": """ - DROP TABLE IF EXISTS LIGHTRAG_LLM_CACHE CASCADE; - """, - "drop_vdb_entity": """ - DROP TABLE IF EXISTS LIGHTRAG_VDB_ENTITY CASCADE; - """, - "drop_vdb_relation": """ - DROP TABLE IF EXISTS LIGHTRAG_VDB_RELATION CASCADE; - """, + """, "relationships": """ WITH relevant_chunks AS ( SELECT id as chunk_id @@ -1806,4 +1810,8 @@ SQL_TEMPLATES = { ORDER BY distance DESC LIMIT $3 """, + # DROP tables + "drop_specifiy_table_workspace": """ + DELETE FROM {table_name} WHERE workspace=$1 + """, } From 0538217c3edb69bdd29b885f0c31b60331e59cf0 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 01:03:58 +0800 Subject: [PATCH 007/116] Update env.example --- env.example | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/env.example b/env.example index 20d80d43..ea6d7773 100644 --- a/env.example +++ b/env.example @@ -117,7 +117,7 @@ ORACLE_PASSWORD='your_password' ORACLE_CONFIG_DIR=/path/to/oracle/config #ORACLE_WALLET_LOCATION=/path/to/wallet #ORACLE_WALLET_PASSWORD='your_password' -### separating all data from difference Lightrag instances(deprecating, use NAMESPACE_PREFIX in future) +### separating all data from difference Lightrag instances(deprecating) #ORACLE_WORKSPACE=default ### TiDB Configuration @@ -126,8 +126,8 @@ TIDB_PORT=4000 TIDB_USER=your_username TIDB_PASSWORD='your_password' TIDB_DATABASE=your_database -### separating all data from difference Lightrag instances(deprecating, use NAMESPACE_PREFIX in future) -#TIDB_WORKSPACE=default +### separating all data from difference Lightrag instances(deprecating) +# TIDB_WORKSPACE=default ### PostgreSQL Configuration POSTGRES_HOST=localhost @@ -135,8 +135,8 @@ POSTGRES_PORT=5432 POSTGRES_USER=your_username POSTGRES_PASSWORD='your_password' POSTGRES_DATABASE=your_database -### separating all data from difference Lightrag instances(deprecating, use NAMESPACE_PREFIX in future) -#POSTGRES_WORKSPACE=default +### separating all data from difference Lightrag instances(deprecating) +# POSTGRES_WORKSPACE=default ### Independent AGM Configuration(not for AMG embedded in PostreSQL) AGE_POSTGRES_DB= @@ -145,8 +145,8 @@ AGE_POSTGRES_PASSWORD= AGE_POSTGRES_HOST= # AGE_POSTGRES_PORT=8529 -### separating all data from difference Lightrag instances(deprecating, use NAMESPACE_PREFIX in future) # AGE Graph Name(apply to PostgreSQL and independent AGM) +### AGE_GRAPH_NAME is precated # AGE_GRAPH_NAME=lightrag ### Neo4j Configuration @@ -157,7 +157,7 @@ NEO4J_PASSWORD='your_password' ### MongoDB Configuration MONGO_URI=mongodb://root:root@localhost:27017/ MONGO_DATABASE=LightRAG -### separating all data from difference Lightrag instances(deprecating, use NAMESPACE_PREFIX in future) +### separating all data from difference Lightrag instances(deprecating) # MONGODB_GRAPH=false ### Milvus Configuration From ee2719896f83b3612f36701938f9a9e3048b63a6 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 01:09:30 +0800 Subject: [PATCH 008/116] Add drop support for AGE storage --- lightrag/kg/age_impl.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/lightrag/kg/age_impl.py b/lightrag/kg/age_impl.py index 22951554..8530e12d 100644 --- a/lightrag/kg/age_impl.py +++ b/lightrag/kg/age_impl.py @@ -34,9 +34,9 @@ if not pm.is_installed("psycopg-pool"): if not pm.is_installed("asyncpg"): pm.install("asyncpg") -import psycopg -from psycopg.rows import 
namedtuple_row -from psycopg_pool import AsyncConnectionPool, PoolTimeout +import psycopg # type: ignore +from psycopg.rows import namedtuple_row # type: ignore +from psycopg_pool import AsyncConnectionPool, PoolTimeout # type: ignore class AGEQueryException(Exception): @@ -871,3 +871,21 @@ class AGEStorage(BaseGraphStorage): async def index_done_callback(self) -> None: # AGES handles persistence automatically pass + + async def drop(self) -> dict[str, str]: + """Drop the storage by removing all nodes and relationships in the graph. + + Returns: + dict[str, str]: Status of the operation with keys 'status' and 'message' + """ + try: + query = """ + MATCH (n) + DETACH DELETE n + """ + await self._query(query) + logger.info(f"Successfully dropped all data from graph {self.graph_name}") + return {"status": "success", "message": "graph data dropped"} + except Exception as e: + logger.error(f"Error dropping graph {self.graph_name}: {e}") + return {"status": "error", "message": str(e)} From 0a82356d0860600689a5ea03f2e49b1f470f2754 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 01:16:56 +0800 Subject: [PATCH 009/116] Add drop support for Gremlin Graph --- lightrag/kg/gremlin_impl.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/lightrag/kg/gremlin_impl.py b/lightrag/kg/gremlin_impl.py index ddb7559f..d616a409 100644 --- a/lightrag/kg/gremlin_impl.py +++ b/lightrag/kg/gremlin_impl.py @@ -24,9 +24,9 @@ from ..base import BaseGraphStorage if not pm.is_installed("gremlinpython"): pm.install("gremlinpython") -from gremlin_python.driver import client, serializer -from gremlin_python.driver.aiohttp.transport import AiohttpTransport -from gremlin_python.driver.protocol import GremlinServerError +from gremlin_python.driver import client, serializer # type: ignore +from gremlin_python.driver.aiohttp.transport import AiohttpTransport # type: ignore +from gremlin_python.driver.protocol import GremlinServerError # type: ignore @final @@ -695,3 +695,24 @@ class GremlinStorage(BaseGraphStorage): except Exception as e: logger.error(f"Error during edge deletion: {str(e)}") raise + + async def drop(self) -> dict[str, str]: + """Drop the storage by removing all nodes and relationships in the graph. + + This function deletes all nodes with the specified graph name property, + which automatically removes all associated edges. 
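+        Only vertices carrying this graph's name property are affected, so
+        other graphs hosted on the same Gremlin server are left untouched.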
+ + Returns: + dict[str, str]: Status of the operation with keys 'status' and 'message' + """ + try: + query = f"""g + .V().has('graph', {self.graph_name}) + .drop() + """ + await self._query(query) + logger.info(f"Successfully dropped all data from graph {self.graph_name}") + return {"status": "success", "message": "graph data dropped"} + except Exception as e: + logger.error(f"Error dropping graph {self.graph_name}: {e}") + return {"status": "error", "message": str(e)} From 77bc9594cf73b5bd472e9d2f57226ecb6e50f2a7 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 01:34:41 +0800 Subject: [PATCH 010/116] Remove delete_entity and delete_entity_relation from RediskKVStorage --- lightrag/kg/redis_impl.py | 68 ++------------------------------------- 1 file changed, 2 insertions(+), 66 deletions(-) diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index 3feb4985..435adee1 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -8,8 +8,8 @@ if not pm.is_installed("redis"): pm.install("redis") # aioredis is a depricated library, replaced with redis -from redis.asyncio import Redis -from lightrag.utils import logger, compute_mdhash_id +from redis.asyncio import Redis # type: ignore +from lightrag.utils import logger from lightrag.base import BaseKVStorage import json @@ -83,67 +83,3 @@ class RedisKVStorage(BaseKVStorage): logger.info( f"Deleted {deleted_count} of {len(ids)} entries from {self.namespace}" ) - - async def delete_entity(self, entity_name: str) -> None: - """Delete an entity by name - - Args: - entity_name: Name of the entity to delete - """ - - try: - entity_id = compute_mdhash_id(entity_name, prefix="ent-") - logger.debug( - f"Attempting to delete entity {entity_name} with ID {entity_id}" - ) - - # Delete the entity - result = await self._redis.delete(f"{self.namespace}:{entity_id}") - - if result: - logger.debug(f"Successfully deleted entity {entity_name}") - else: - logger.debug(f"Entity {entity_name} not found in storage") - except Exception as e: - logger.error(f"Error deleting entity {entity_name}: {e}") - - async def delete_entity_relation(self, entity_name: str) -> None: - """Delete all relations associated with an entity - - Args: - entity_name: Name of the entity whose relations should be deleted - """ - try: - # Get all keys in this namespace - cursor = 0 - relation_keys = [] - pattern = f"{self.namespace}:*" - - while True: - cursor, keys = await self._redis.scan(cursor, match=pattern) - - # For each key, get the value and check if it's related to entity_name - for key in keys: - value = await self._redis.get(key) - if value: - data = json.loads(value) - # Check if this is a relation involving the entity - if ( - data.get("src_id") == entity_name - or data.get("tgt_id") == entity_name - ): - relation_keys.append(key) - - # Exit loop when cursor returns to 0 - if cursor == 0: - break - - # Delete the relation keys - if relation_keys: - deleted = await self._redis.delete(*relation_keys) - logger.debug(f"Deleted {deleted} relations for {entity_name}") - else: - logger.debug(f"No relations found for entity {entity_name}") - - except Exception as e: - logger.error(f"Error deleting relations for {entity_name}: {e}") From b411ce2fedb96427551287a8e159510e53018741 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 01:40:14 +0800 Subject: [PATCH 011/116] Add drop support for RedisKVStorage --- lightrag/kg/redis_impl.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/lightrag/kg/redis_impl.py 
b/lightrag/kg/redis_impl.py index 435adee1..9ff50008 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -83,3 +83,29 @@ class RedisKVStorage(BaseKVStorage): logger.info( f"Deleted {deleted_count} of {len(ids)} entries from {self.namespace}" ) + + async def drop(self) -> dict[str, str]: + """Drop the storage by removing all keys under the current namespace. + + Returns: + dict[str, str]: Status of the operation with keys 'status' and 'message' + """ + try: + keys = await self._redis.keys(f"{self.namespace}:*") + + if keys: + pipe = self._redis.pipeline() + for key in keys: + pipe.delete(key) + results = await pipe.execute() + deleted_count = sum(results) + + logger.info(f"Dropped {deleted_count} keys from {self.namespace}") + return {"status": "success", "message": f"{deleted_count} keys dropped"} + else: + logger.info(f"No keys found to drop in {self.namespace}") + return {"status": "success", "message": "no keys to drop"} + + except Exception as e: + logger.error(f"Error dropping keys from {self.namespace}: {e}") + return {"status": "error", "message": str(e)} From 078cee390c69e8229e55bdfaf871faf72b15fc69 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 02:10:58 +0800 Subject: [PATCH 012/116] Add drop support for all storage type implementation for Mongo DB --- lightrag/kg/mongo_impl.py | 74 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index e4ae0a8d..8a9f1f3a 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -25,13 +25,13 @@ if not pm.is_installed("pymongo"): if not pm.is_installed("motor"): pm.install("motor") -from motor.motor_asyncio import ( +from motor.motor_asyncio import ( # type: ignore AsyncIOMotorClient, AsyncIOMotorDatabase, AsyncIOMotorCollection, ) -from pymongo.operations import SearchIndexModel -from pymongo.errors import PyMongoError +from pymongo.operations import SearchIndexModel # type: ignore +from pymongo.errors import PyMongoError # type: ignore config = configparser.ConfigParser() config.read("config.ini", "utf-8") @@ -149,6 +149,22 @@ class MongoKVStorage(BaseKVStorage): async def index_done_callback(self) -> None: # Mongo handles persistence automatically pass + + async def drop(self) -> dict[str, str]: + """Drop the storage by removing all documents in the collection. + + Returns: + dict[str, str]: Status of the operation with keys 'status' and 'message' + """ + try: + result = await self._data.delete_many({}) + deleted_count = result.deleted_count + + logger.info(f"Dropped {deleted_count} documents from doc status {self._collection_name}") + return {"status": "success", "message": f"{deleted_count} documents dropped"} + except PyMongoError as e: + logger.error(f"Error dropping doc status {self._collection_name}: {e}") + return {"status": "error", "message": str(e)} @final @@ -229,6 +245,22 @@ class MongoDocStatusStorage(DocStatusStorage): async def index_done_callback(self) -> None: # Mongo handles persistence automatically pass + + async def drop(self) -> dict[str, str]: + """Drop the storage by removing all documents in the collection. 
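+        The collection itself is preserved; only its documents are deleted,
+        so no indexes have to be rebuilt afterwards.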
+ + Returns: + dict[str, str]: Status of the operation with keys 'status' and 'message' + """ + try: + result = await self._data.delete_many({}) + deleted_count = result.deleted_count + + logger.info(f"Dropped {deleted_count} documents from doc status {self._collection_name}") + return {"status": "success", "message": f"{deleted_count} documents dropped"} + except PyMongoError as e: + logger.error(f"Error dropping doc status {self._collection_name}: {e}") + return {"status": "error", "message": str(e)} @final @@ -840,6 +872,22 @@ class MongoGraphStorage(BaseGraphStorage): logger.debug(f"Successfully deleted edges: {edges}") + async def drop(self) -> dict[str, str]: + """Drop the storage by removing all documents in the collection. + + Returns: + dict[str, str]: Status of the operation with keys 'status' and 'message' + """ + try: + result = await self.collection.delete_many({}) + deleted_count = result.deleted_count + + logger.info(f"Dropped {deleted_count} documents from graph {self._collection_name}") + return {"status": "success", "message": f"{deleted_count} documents dropped"} + except PyMongoError as e: + logger.error(f"Error dropping graph {self._collection_name}: {e}") + return {"status": "error", "message": str(e)} + @final @dataclass @@ -1126,6 +1174,26 @@ class MongoVectorDBStorage(BaseVectorStorage): except Exception as e: logger.error(f"Error retrieving vector data for IDs {ids}: {e}") return [] + + async def drop(self) -> dict[str, str]: + """Drop the storage by removing all documents in the collection and recreating vector index. + + Returns: + dict[str, str]: Status of the operation with keys 'status' and 'message' + """ + try: + # Delete all documents + result = await self._data.delete_many({}) + deleted_count = result.deleted_count + + # Recreate vector index + await self.create_vector_index_if_not_exists() + + logger.info(f"Dropped {deleted_count} documents from vector storage {self._collection_name} and recreated vector index") + return {"status": "success", "message": f"{deleted_count} documents dropped and vector index recreated"} + except PyMongoError as e: + logger.error(f"Error dropping vector storage {self._collection_name}: {e}") + return {"status": "error", "message": str(e)} async def get_or_create_collection(db: AsyncIOMotorDatabase, collection_name: str): From 5b7cd500058e50d41614c9b4b38c3039f465905b Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 02:14:16 +0800 Subject: [PATCH 013/116] Add delete support for MongoKVStorage --- lightrag/kg/mongo_impl.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index 8a9f1f3a..d5832af0 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -150,6 +150,21 @@ class MongoKVStorage(BaseKVStorage): # Mongo handles persistence automatically pass + async def delete(self, ids: list[str]) -> None: + """Delete documents with specified IDs + + Args: + ids: List of document IDs to be deleted + """ + if not ids: + return + + try: + result = await self._data.delete_many({"_id": {"$in": ids}}) + logger.info(f"Deleted {result.deleted_count} documents from {self.namespace}") + except PyMongoError as e: + logger.error(f"Error deleting documents from {self.namespace}: {e}") + async def drop(self) -> dict[str, str]: """Drop the storage by removing all documents in the collection. 
From 6a51f38cae4ebaa13552199f79c535d543164d2a Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 02:59:44 +0800 Subject: [PATCH 014/116] Remove namespace_prefix from PostgreSQL, maintain consistency with other storage implementation --- env.example | 1 - lightrag/api/lightrag_server.py | 4 ++-- lightrag/kg/postgres_impl.py | 14 +++++--------- lightrag/lightrag.py | 1 + 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/env.example b/env.example index ea6d7773..fb603279 100644 --- a/env.example +++ b/env.example @@ -5,7 +5,6 @@ # PORT=9621 # WORKERS=2 ### separating data from difference Lightrag instances -# NAMESPACE_PREFIX=lightrag ### Max nodes return from grap retrieval # MAX_GRAPH_NODES=1000 # CORS_ORIGINS=http://localhost:3000,http://localhost:8080 diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 8110d6d4..e30d34e3 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -315,7 +315,7 @@ def create_app(args): "similarity_threshold": 0.95, "use_llm_check": False, }, - namespace_prefix=args.namespace_prefix, + # namespace_prefix=args.namespace_prefix, auto_manage_storages_states=False, max_parallel_insert=args.max_parallel_insert, ) @@ -345,7 +345,7 @@ def create_app(args): "similarity_threshold": 0.95, "use_llm_check": False, }, - namespace_prefix=args.namespace_prefix, + # namespace_prefix=args.namespace_prefix, auto_manage_storages_states=False, max_parallel_insert=args.max_parallel_insert, ) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 02fd68a0..ff90d14b 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -254,8 +254,6 @@ class PGKVStorage(BaseKVStorage): db: PostgreSQLDB = field(default=None) def __post_init__(self): - namespace_prefix = self.global_config.get("namespace_prefix") - self.base_namespace = self.namespace.replace(namespace_prefix, "") self._max_batch_size = self.global_config["embedding_batch_num"] async def initialize(self): @@ -271,7 +269,7 @@ class PGKVStorage(BaseKVStorage): async def get_by_id(self, id: str) -> dict[str, Any] | None: """Get doc_full data by id.""" - sql = SQL_TEMPLATES["get_by_id_" + self.base_namespace] + sql = SQL_TEMPLATES["get_by_id_" + self.namespace] params = {"workspace": self.db.workspace, "id": id} if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE): array_res = await self.db.query(sql, params, multirows=True) @@ -285,7 +283,7 @@ class PGKVStorage(BaseKVStorage): async def get_by_mode_and_id(self, mode: str, id: str) -> Union[dict, None]: """Specifically for llm_response_cache.""" - sql = SQL_TEMPLATES["get_by_mode_id_" + self.base_namespace] + sql = SQL_TEMPLATES["get_by_mode_id_" + self.namespace] params = {"workspace": self.db.workspace, mode: mode, "id": id} if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE): array_res = await self.db.query(sql, params, multirows=True) @@ -299,7 +297,7 @@ class PGKVStorage(BaseKVStorage): # Query by id async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]: """Get doc_chunks data by id""" - sql = SQL_TEMPLATES["get_by_ids_" + self.base_namespace].format( + sql = SQL_TEMPLATES["get_by_ids_" + self.namespace].format( ids=",".join([f"'{id}'" for id in ids]) ) params = {"workspace": self.db.workspace} @@ -320,7 +318,7 @@ class PGKVStorage(BaseKVStorage): async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: """Specifically for llm_response_cache.""" - SQL = 
SQL_TEMPLATES["get_by_status_" + self.base_namespace] + SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] params = {"workspace": self.db.workspace, "status": status} return await self.db.query(SQL, params, multirows=True) @@ -403,8 +401,6 @@ class PGVectorStorage(BaseVectorStorage): def __post_init__(self): self._max_batch_size = self.global_config["embedding_batch_num"] - namespace_prefix = self.global_config.get("namespace_prefix") - self.base_namespace = self.namespace.replace(namespace_prefix, "") config = self.global_config.get("vector_db_storage_cls_kwargs", {}) cosine_threshold = config.get("cosine_better_than_threshold") if cosine_threshold is None: @@ -533,7 +529,7 @@ class PGVectorStorage(BaseVectorStorage): else: formatted_ids = "NULL" - sql = SQL_TEMPLATES[self.base_namespace].format( + sql = SQL_TEMPLATES[self.namespace].format( embedding_string=embedding_string, doc_ids=formatted_ids ) params = { diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 62e1b279..283bd4a4 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -229,6 +229,7 @@ class LightRAG: vector_db_storage_cls_kwargs: dict[str, Any] = field(default_factory=dict) """Additional parameters for vector database storage.""" + # TODO:deprecated, remove in the future, use WORKSPACE instead namespace_prefix: str = field(default="") """Prefix for namespacing stored data across different environments.""" From 9959ea90a83f57175bf24b94ffe13c6a29b1c638 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 03:19:06 +0800 Subject: [PATCH 015/116] Add drop support for Oracle --- lightrag/kg/oracle_impl.py | 49 +++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index c42f0f76..0477ea03 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -27,7 +27,7 @@ if not pm.is_installed("oracledb"): pm.install("oracledb") from graspologic import embed -import oracledb +import oracledb # type: ignore class OracleDB: @@ -392,6 +392,21 @@ class OracleKVStorage(BaseKVStorage): # Oracle handles persistence automatically pass + async def drop(self) -> dict[str, str]: + """Drop the storage""" + try: + table_name = namespace_to_table_name(self.namespace) + if not table_name: + return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} + + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( + table_name=table_name + ) + await self.db.execute(drop_sql, {"workspace": self.db.workspace}) + return {"status": "success", "message": "data dropped"} + except Exception as e: + return {"status": "error", "message": str(e)} + @final @dataclass @@ -605,6 +620,21 @@ class OracleVectorDBStorage(BaseVectorStorage): logger.error(f"Error retrieving vector data for IDs {ids}: {e}") return [] + async def drop(self) -> dict[str, str]: + """Drop the storage""" + try: + table_name = namespace_to_table_name(self.namespace) + if not table_name: + return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} + + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( + table_name=table_name + ) + await self.db.execute(drop_sql, {"workspace": self.db.workspace}) + return {"status": "success", "message": "data dropped"} + except Exception as e: + return {"status": "error", "message": str(e)} + @final @dataclass @@ -933,6 +963,21 @@ class OracleGraphStorage(BaseGraphStorage): logger.error(f"Error retrieving entity types: {e}") return [] + async def drop(self) -> 
dict[str, str]:
+        """Drop the storage"""
+        try:
+            # Use graph queries to delete all nodes and relationships
+            delete_edges_sql = """DELETE FROM LIGHTRAG_GRAPH_EDGES WHERE workspace=:workspace"""
+            await self.db.execute(delete_edges_sql, {"workspace": self.db.workspace})
+
+            delete_nodes_sql = """DELETE FROM LIGHTRAG_GRAPH_NODES WHERE workspace=:workspace"""
+            await self.db.execute(delete_nodes_sql, {"workspace": self.db.workspace})
+
+            return {"status": "success", "message": "graph data dropped"}
+        except Exception as e:
+            logger.error(f"Error dropping graph: {e}")
+            return {"status": "error", "message": str(e)}
+
     async def get_knowledge_graph(
         self, node_label: str, max_depth: int = 5
     ) -> KnowledgeGraph:
@@ -1343,4 +1388,6 @@ SQL_TEMPLATES = {
     MATCH (a)
       WHERE a.workspace=:workspace AND a.name=:node_id
     ACTION DELETE a)""",
+    # Drop tables
+    "drop_specifiy_table_workspace": "DELETE FROM {table_name} WHERE workspace=:workspace",
 }

From 088fc19318430aa43ef9c7ff76ad61296f8c8666 Mon Sep 17 00:00:00 2001
From: Milin
Date: Mon, 31 Mar 2025 11:19:47 +0800
Subject: [PATCH 016/116] feat(config): Refactor configuration management

- Optimize JWT Auth module to load configuration via `global_args`.
- Decouple configuration-related code from `utils_api.py`, and add a new
  `config.py` file for unified configuration management.
- Adjust configuration import in `lightrag_server.py`, `auth.py`, and
  `document_routes.py` to be introduced through `global_args`.
---
 env.example                             |   4 +-
 lightrag/api/auth.py                    |  17 +-
 lightrag/api/config.py                  | 324 +++++++++++++++++++++++
 lightrag/api/lightrag_server.py         |  30 ++-
 lightrag/api/routers/document_routes.py |  18 +-
 lightrag/api/utils_api.py               | 329 ++----------------------
 6 files changed, 376 insertions(+), 346 deletions(-)
 create mode 100644 lightrag/api/config.py

diff --git a/env.example b/env.example
index 8e54c812..4bf9a340 100644
--- a/env.example
+++ b/env.example
@@ -160,7 +160,9 @@ REDIS_URI=redis://localhost:6379
 ### For JWT Auth
 AUTH_ACCOUNTS='admin:admin123,user1:pass456' # username:password,username:password
 TOKEN_SECRET=Your-Key-For-LightRAG-API-Server # JWT key
-TOKEN_EXPIRE_HOURS=4 # expire duration
+#TOKEN_EXPIRE_HOURS=4 # Expire duration, default 4
+#GUEST_TOKEN_EXPIRE_HOURS=2 # Guest expire duration, default 2
+#JWT_ALGORITHM=HS256 # JWT encode algorithm, default HS256
 
 ### API-Key to access LightRAG Server API
 # LIGHTRAG_API_KEY=your-secure-api-key-here
diff --git a/lightrag/api/auth.py b/lightrag/api/auth.py
index 50648504..de02721b 100644
--- a/lightrag/api/auth.py
+++ b/lightrag/api/auth.py
@@ -1,9 +1,11 @@
-import os
 from datetime import datetime, timedelta
+
 import jwt
+from dotenv import load_dotenv
 from fastapi import HTTPException, status
 from pydantic import BaseModel
-from dotenv import load_dotenv
+
+from .config import global_args
 
 load_dotenv()
@@ -17,13 +19,12 @@ class TokenPayload(BaseModel):
 
 class AuthHandler:
     def __init__(self):
-        self.secret = os.getenv("TOKEN_SECRET", "4f85ds4f56dsf46")
-        self.algorithm = "HS256"
-        self.expire_hours = int(os.getenv("TOKEN_EXPIRE_HOURS", 4))
-        self.guest_expire_hours = int(os.getenv("GUEST_TOKEN_EXPIRE_HOURS", 2))
-
+        self.secret = global_args.token_secret
+        self.algorithm = global_args.jwt_algorithm
+        self.expire_hours = global_args.token_expire_hours
+        self.guest_expire_hours = global_args.guest_token_expire_hours
         self.accounts = {}
-        auth_accounts = os.getenv("AUTH_ACCOUNTS")
+        auth_accounts = global_args.auth_accounts
         if auth_accounts:
             for account in auth_accounts.split(","):
                 username, password = account.split(":", 1)
diff --git 
a/lightrag/api/config.py b/lightrag/api/config.py new file mode 100644 index 00000000..bacf1b7f --- /dev/null +++ b/lightrag/api/config.py @@ -0,0 +1,324 @@ +""" +Configs for the LightRAG API. +""" + +import os +import argparse +import logging + + +class OllamaServerInfos: + # Constants for emulated Ollama model information + LIGHTRAG_NAME = "lightrag" + LIGHTRAG_TAG = os.getenv("OLLAMA_EMULATING_MODEL_TAG", "latest") + LIGHTRAG_MODEL = f"{LIGHTRAG_NAME}:{LIGHTRAG_TAG}" + LIGHTRAG_SIZE = 7365960935 # it's a dummy value + LIGHTRAG_CREATED_AT = "2024-01-15T00:00:00Z" + LIGHTRAG_DIGEST = "sha256:lightrag" + + +ollama_server_infos = OllamaServerInfos() + + +class DefaultRAGStorageConfig: + KV_STORAGE = "JsonKVStorage" + VECTOR_STORAGE = "NanoVectorDBStorage" + GRAPH_STORAGE = "NetworkXStorage" + DOC_STATUS_STORAGE = "JsonDocStatusStorage" + + +def get_default_host(binding_type: str) -> str: + default_hosts = { + "ollama": os.getenv("LLM_BINDING_HOST", "http://localhost:11434"), + "lollms": os.getenv("LLM_BINDING_HOST", "http://localhost:9600"), + "azure_openai": os.getenv("AZURE_OPENAI_ENDPOINT", "https://api.openai.com/v1"), + "openai": os.getenv("LLM_BINDING_HOST", "https://api.openai.com/v1"), + } + return default_hosts.get( + binding_type, os.getenv("LLM_BINDING_HOST", "http://localhost:11434") + ) # fallback to ollama if unknown + + +def get_env_value(env_key: str, default: any, value_type: type = str) -> any: + """ + Get value from environment variable with type conversion + + Args: + env_key (str): Environment variable key + default (any): Default value if env variable is not set + value_type (type): Type to convert the value to + + Returns: + any: Converted value from environment or default + """ + value = os.getenv(env_key) + if value is None: + return default + + if value_type is bool: + return value.lower() in ("true", "1", "yes", "t", "on") + try: + return value_type(value) + except ValueError: + return default + + +def parse_args() -> argparse.Namespace: + """ + Parse command line arguments with environment variable fallback + + Args: + is_uvicorn_mode: Whether running under uvicorn mode + + Returns: + argparse.Namespace: Parsed arguments + """ + + parser = argparse.ArgumentParser( + description="LightRAG FastAPI Server with separate working and input directories" + ) + + # Server configuration + parser.add_argument( + "--host", + default=get_env_value("HOST", "0.0.0.0"), + help="Server host (default: from env or 0.0.0.0)", + ) + parser.add_argument( + "--port", + type=int, + default=get_env_value("PORT", 9621, int), + help="Server port (default: from env or 9621)", + ) + + # Directory configuration + parser.add_argument( + "--working-dir", + default=get_env_value("WORKING_DIR", "./rag_storage"), + help="Working directory for RAG storage (default: from env or ./rag_storage)", + ) + parser.add_argument( + "--input-dir", + default=get_env_value("INPUT_DIR", "./inputs"), + help="Directory containing input documents (default: from env or ./inputs)", + ) + + def timeout_type(value): + if value is None: + return 150 + if value is None or value == "None": + return None + return int(value) + + parser.add_argument( + "--timeout", + default=get_env_value("TIMEOUT", None, timeout_type), + type=timeout_type, + help="Timeout in seconds (useful when using slow AI). 
Use None for infinite timeout", + ) + + # RAG configuration + parser.add_argument( + "--max-async", + type=int, + default=get_env_value("MAX_ASYNC", 4, int), + help="Maximum async operations (default: from env or 4)", + ) + parser.add_argument( + "--max-tokens", + type=int, + default=get_env_value("MAX_TOKENS", 32768, int), + help="Maximum token size (default: from env or 32768)", + ) + + # Logging configuration + parser.add_argument( + "--log-level", + default=get_env_value("LOG_LEVEL", "INFO"), + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="Logging level (default: from env or INFO)", + ) + parser.add_argument( + "--verbose", + action="store_true", + default=get_env_value("VERBOSE", False, bool), + help="Enable verbose debug output(only valid for DEBUG log-level)", + ) + + parser.add_argument( + "--key", + type=str, + default=get_env_value("LIGHTRAG_API_KEY", None), + help="API key for authentication. This protects lightrag server against unauthorized access", + ) + + # Optional https parameters + parser.add_argument( + "--ssl", + action="store_true", + default=get_env_value("SSL", False, bool), + help="Enable HTTPS (default: from env or False)", + ) + parser.add_argument( + "--ssl-certfile", + default=get_env_value("SSL_CERTFILE", None), + help="Path to SSL certificate file (required if --ssl is enabled)", + ) + parser.add_argument( + "--ssl-keyfile", + default=get_env_value("SSL_KEYFILE", None), + help="Path to SSL private key file (required if --ssl is enabled)", + ) + + parser.add_argument( + "--history-turns", + type=int, + default=get_env_value("HISTORY_TURNS", 3, int), + help="Number of conversation history turns to include (default: from env or 3)", + ) + + # Search parameters + parser.add_argument( + "--top-k", + type=int, + default=get_env_value("TOP_K", 60, int), + help="Number of most similar results to return (default: from env or 60)", + ) + parser.add_argument( + "--cosine-threshold", + type=float, + default=get_env_value("COSINE_THRESHOLD", 0.2, float), + help="Cosine similarity threshold (default: from env or 0.4)", + ) + + # Ollama model name + parser.add_argument( + "--simulated-model-name", + type=str, + default=get_env_value( + "SIMULATED_MODEL_NAME", ollama_server_infos.LIGHTRAG_MODEL + ), + help="Number of conversation history turns to include (default: from env or 3)", + ) + + # Namespace + parser.add_argument( + "--namespace-prefix", + type=str, + default=get_env_value("NAMESPACE_PREFIX", ""), + help="Prefix of the namespace", + ) + + parser.add_argument( + "--auto-scan-at-startup", + action="store_true", + default=False, + help="Enable automatic scanning when the program starts", + ) + + # Server workers configuration + parser.add_argument( + "--workers", + type=int, + default=get_env_value("WORKERS", 1, int), + help="Number of worker processes (default: from env or 1)", + ) + + # LLM and embedding bindings + parser.add_argument( + "--llm-binding", + type=str, + default=get_env_value("LLM_BINDING", "ollama"), + choices=["lollms", "ollama", "openai", "openai-ollama", "azure_openai"], + help="LLM binding type (default: from env or ollama)", + ) + parser.add_argument( + "--embedding-binding", + type=str, + default=get_env_value("EMBEDDING_BINDING", "ollama"), + choices=["lollms", "ollama", "openai", "azure_openai"], + help="Embedding binding type (default: from env or ollama)", + ) + + args = parser.parse_args() + + # convert relative path to absolute path + args.working_dir = os.path.abspath(args.working_dir) + args.input_dir = 
os.path.abspath(args.input_dir) + + # Inject storage configuration from environment variables + args.kv_storage = get_env_value( + "LIGHTRAG_KV_STORAGE", DefaultRAGStorageConfig.KV_STORAGE + ) + args.doc_status_storage = get_env_value( + "LIGHTRAG_DOC_STATUS_STORAGE", DefaultRAGStorageConfig.DOC_STATUS_STORAGE + ) + args.graph_storage = get_env_value( + "LIGHTRAG_GRAPH_STORAGE", DefaultRAGStorageConfig.GRAPH_STORAGE + ) + args.vector_storage = get_env_value( + "LIGHTRAG_VECTOR_STORAGE", DefaultRAGStorageConfig.VECTOR_STORAGE + ) + + # Get MAX_PARALLEL_INSERT from environment + args.max_parallel_insert = get_env_value("MAX_PARALLEL_INSERT", 2, int) + + # Handle openai-ollama special case + if args.llm_binding == "openai-ollama": + args.llm_binding = "openai" + args.embedding_binding = "ollama" + + args.llm_binding_host = get_env_value( + "LLM_BINDING_HOST", get_default_host(args.llm_binding) + ) + args.embedding_binding_host = get_env_value( + "EMBEDDING_BINDING_HOST", get_default_host(args.embedding_binding) + ) + args.llm_binding_api_key = get_env_value("LLM_BINDING_API_KEY", None) + args.embedding_binding_api_key = get_env_value("EMBEDDING_BINDING_API_KEY", "") + + # Inject model configuration + args.llm_model = get_env_value("LLM_MODEL", "mistral-nemo:latest") + args.embedding_model = get_env_value("EMBEDDING_MODEL", "bge-m3:latest") + args.embedding_dim = get_env_value("EMBEDDING_DIM", 1024, int) + args.max_embed_tokens = get_env_value("MAX_EMBED_TOKENS", 8192, int) + + # Inject chunk configuration + args.chunk_size = get_env_value("CHUNK_SIZE", 1200, int) + args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int) + + # Inject LLM cache configuration + args.enable_llm_cache_for_extract = get_env_value( + "ENABLE_LLM_CACHE_FOR_EXTRACT", True, bool + ) + + # Inject LLM temperature configuration + args.temperature = get_env_value("TEMPERATURE", 0.5, float) + + # Select Document loading tool (DOCLING, DEFAULT) + args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT") + + # For JWT Auth + args.auth_accounts = get_env_value("AUTH_ACCOUNTS", "") + args.token_secret = get_env_value("TOKEN_SECRET", "lightrag-jwt-default-secret") + args.token_expire_hours = get_env_value("TOKEN_EXPIRE_HOURS", 4, int) + args.guest_token_expire_hours = get_env_value("GUEST_TOKEN_EXPIRE_HOURS", 2, int) + args.jwt_algorithm = get_env_value("JWT_ALGORITHM", "HS256") + + ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name + + return args + + +def update_uvicorn_mode_config(): + # If in uvicorn mode and workers > 1, force it to 1 and log warning + if global_args.workers > 1: + original_workers = global_args.workers + global_args.workers = 1 + # Log warning directly here + logging.warning( + f"In uvicorn mode, workers parameter was set to {original_workers}. 
Forcing workers=1" + ) + + +global_args = parse_args() diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index fe0e416e..a00eab6c 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -19,11 +19,14 @@ from contextlib import asynccontextmanager from dotenv import load_dotenv from lightrag.api.utils_api import ( get_combined_auth_dependency, - parse_args, - get_default_host, display_splash_screen, check_env_file, ) +from .config import ( + global_args, + update_uvicorn_mode_config, + get_default_host, +) import sys from lightrag import LightRAG, __version__ as core_version from lightrag.api import __api_version__ @@ -489,7 +492,7 @@ def create_app(args): def get_application(args=None): """Factory function for creating the FastAPI application""" if args is None: - args = parse_args() + args = global_args return create_app(args) @@ -610,30 +613,31 @@ def main(): # Configure logging before parsing args configure_logging() - - args = parse_args(is_uvicorn_mode=True) - display_splash_screen(args) + update_uvicorn_mode_config() + display_splash_screen(global_args) # Create application instance directly instead of using factory function - app = create_app(args) + app = create_app(global_args) # Start Uvicorn in single process mode uvicorn_config = { "app": app, # Pass application instance directly instead of string path - "host": args.host, - "port": args.port, + "host": global_args.host, + "port": global_args.port, "log_config": None, # Disable default config } - if args.ssl: + if global_args.ssl: uvicorn_config.update( { - "ssl_certfile": args.ssl_certfile, - "ssl_keyfile": args.ssl_keyfile, + "ssl_certfile": global_args.ssl_certfile, + "ssl_keyfile": global_args.ssl_keyfile, } ) - print(f"Starting Uvicorn server in single-process mode on {args.host}:{args.port}") + print( + f"Starting Uvicorn server in single-process mode on {global_args.host}:{global_args.port}" + ) uvicorn.run(**uvicorn_config) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index d6434e08..2a7873b8 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -16,10 +16,8 @@ from pydantic import BaseModel, Field, field_validator from lightrag import LightRAG from lightrag.base import DocProcessingStatus, DocStatus -from lightrag.api.utils_api import ( - get_combined_auth_dependency, - global_args, -) +from lightrag.api.utils_api import get_combined_auth_dependency +from ..config import global_args router = APIRouter( prefix="/documents", @@ -276,7 +274,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: ) return False case ".pdf": - if global_args["main_args"].document_loading_engine == "DOCLING": + if global_args.document_loading_engine == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter # type: ignore @@ -295,7 +293,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: for page in reader.pages: content += page.extract_text() + "\n" case ".docx": - if global_args["main_args"].document_loading_engine == "DOCLING": + if global_args.document_loading_engine == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter # type: ignore @@ -315,7 +313,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: [paragraph.text for paragraph in 
doc.paragraphs] ) case ".pptx": - if global_args["main_args"].document_loading_engine == "DOCLING": + if global_args.document_loading_engine == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter # type: ignore @@ -336,7 +334,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: if hasattr(shape, "text"): content += shape.text + "\n" case ".xlsx": - if global_args["main_args"].document_loading_engine == "DOCLING": + if global_args.document_loading_engine == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter # type: ignore @@ -476,8 +474,8 @@ async def run_scanning_process(rag: LightRAG, doc_manager: DocumentManager): if not new_files: return - # Get MAX_PARALLEL_INSERT from global_args["main_args"] - max_parallel = global_args["main_args"].max_parallel_insert + # Get MAX_PARALLEL_INSERT from global_args + max_parallel = global_args.max_parallel_insert # Calculate batch size as 2 * MAX_PARALLEL_INSERT batch_size = 2 * max_parallel diff --git a/lightrag/api/utils_api.py b/lightrag/api/utils_api.py index 32a453bf..62ceecce 100644 --- a/lightrag/api/utils_api.py +++ b/lightrag/api/utils_api.py @@ -2,18 +2,20 @@ Utility functions for the LightRAG API. """ -import os import argparse -from typing import Optional, List, Tuple +import os import sys +from typing import Optional, List, Tuple + from ascii_colors import ASCIIColors -import logging -from lightrag.api import __api_version__ -from fastapi import HTTPException, Security, Request, status from dotenv import load_dotenv +from fastapi import HTTPException, Security, Request, status from fastapi.security import APIKeyHeader, OAuth2PasswordBearer from starlette.status import HTTP_403_FORBIDDEN + +from lightrag.api import __api_version__ from .auth import auth_handler +from .config import ollama_server_infos from ..prompt import PROMPTS @@ -40,8 +42,6 @@ def check_env_file(): # Load environment variables load_dotenv() -global_args = {"main_args": None} - # Get whitelist paths from environment variable, only once during initialization default_whitelist = "/health,/api/*" whitelist_paths = os.getenv("WHITELIST_PATHS", default_whitelist).split(",") @@ -62,19 +62,6 @@ for path in whitelist_paths: auth_configured = bool(auth_handler.accounts) -class OllamaServerInfos: - # Constants for emulated Ollama model information - LIGHTRAG_NAME = "lightrag" - LIGHTRAG_TAG = os.getenv("OLLAMA_EMULATING_MODEL_TAG", "latest") - LIGHTRAG_MODEL = f"{LIGHTRAG_NAME}:{LIGHTRAG_TAG}" - LIGHTRAG_SIZE = 7365960935 # it's a dummy value - LIGHTRAG_CREATED_AT = "2024-01-15T00:00:00Z" - LIGHTRAG_DIGEST = "sha256:lightrag" - - -ollama_server_infos = OllamaServerInfos() - - def get_combined_auth_dependency(api_key: Optional[str] = None): """ Create a combined authentication dependency that implements authentication logic @@ -185,299 +172,6 @@ def get_combined_auth_dependency(api_key: Optional[str] = None): return combined_dependency -class DefaultRAGStorageConfig: - KV_STORAGE = "JsonKVStorage" - VECTOR_STORAGE = "NanoVectorDBStorage" - GRAPH_STORAGE = "NetworkXStorage" - DOC_STATUS_STORAGE = "JsonDocStatusStorage" - - -def get_default_host(binding_type: str) -> str: - default_hosts = { - "ollama": os.getenv("LLM_BINDING_HOST", "http://localhost:11434"), - "lollms": os.getenv("LLM_BINDING_HOST", "http://localhost:9600"), - "azure_openai": os.getenv("AZURE_OPENAI_ENDPOINT", 
"https://api.openai.com/v1"), - "openai": os.getenv("LLM_BINDING_HOST", "https://api.openai.com/v1"), - } - return default_hosts.get( - binding_type, os.getenv("LLM_BINDING_HOST", "http://localhost:11434") - ) # fallback to ollama if unknown - - -def get_env_value(env_key: str, default: any, value_type: type = str) -> any: - """ - Get value from environment variable with type conversion - - Args: - env_key (str): Environment variable key - default (any): Default value if env variable is not set - value_type (type): Type to convert the value to - - Returns: - any: Converted value from environment or default - """ - value = os.getenv(env_key) - if value is None: - return default - - if value_type is bool: - return value.lower() in ("true", "1", "yes", "t", "on") - try: - return value_type(value) - except ValueError: - return default - - -def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace: - """ - Parse command line arguments with environment variable fallback - - Args: - is_uvicorn_mode: Whether running under uvicorn mode - - Returns: - argparse.Namespace: Parsed arguments - """ - - parser = argparse.ArgumentParser( - description="LightRAG FastAPI Server with separate working and input directories" - ) - - # Server configuration - parser.add_argument( - "--host", - default=get_env_value("HOST", "0.0.0.0"), - help="Server host (default: from env or 0.0.0.0)", - ) - parser.add_argument( - "--port", - type=int, - default=get_env_value("PORT", 9621, int), - help="Server port (default: from env or 9621)", - ) - - # Directory configuration - parser.add_argument( - "--working-dir", - default=get_env_value("WORKING_DIR", "./rag_storage"), - help="Working directory for RAG storage (default: from env or ./rag_storage)", - ) - parser.add_argument( - "--input-dir", - default=get_env_value("INPUT_DIR", "./inputs"), - help="Directory containing input documents (default: from env or ./inputs)", - ) - - def timeout_type(value): - if value is None: - return 150 - if value is None or value == "None": - return None - return int(value) - - parser.add_argument( - "--timeout", - default=get_env_value("TIMEOUT", None, timeout_type), - type=timeout_type, - help="Timeout in seconds (useful when using slow AI). Use None for infinite timeout", - ) - - # RAG configuration - parser.add_argument( - "--max-async", - type=int, - default=get_env_value("MAX_ASYNC", 4, int), - help="Maximum async operations (default: from env or 4)", - ) - parser.add_argument( - "--max-tokens", - type=int, - default=get_env_value("MAX_TOKENS", 32768, int), - help="Maximum token size (default: from env or 32768)", - ) - - # Logging configuration - parser.add_argument( - "--log-level", - default=get_env_value("LOG_LEVEL", "INFO"), - choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], - help="Logging level (default: from env or INFO)", - ) - parser.add_argument( - "--verbose", - action="store_true", - default=get_env_value("VERBOSE", False, bool), - help="Enable verbose debug output(only valid for DEBUG log-level)", - ) - - parser.add_argument( - "--key", - type=str, - default=get_env_value("LIGHTRAG_API_KEY", None), - help="API key for authentication. 
This protects lightrag server against unauthorized access", - ) - - # Optional https parameters - parser.add_argument( - "--ssl", - action="store_true", - default=get_env_value("SSL", False, bool), - help="Enable HTTPS (default: from env or False)", - ) - parser.add_argument( - "--ssl-certfile", - default=get_env_value("SSL_CERTFILE", None), - help="Path to SSL certificate file (required if --ssl is enabled)", - ) - parser.add_argument( - "--ssl-keyfile", - default=get_env_value("SSL_KEYFILE", None), - help="Path to SSL private key file (required if --ssl is enabled)", - ) - - parser.add_argument( - "--history-turns", - type=int, - default=get_env_value("HISTORY_TURNS", 3, int), - help="Number of conversation history turns to include (default: from env or 3)", - ) - - # Search parameters - parser.add_argument( - "--top-k", - type=int, - default=get_env_value("TOP_K", 60, int), - help="Number of most similar results to return (default: from env or 60)", - ) - parser.add_argument( - "--cosine-threshold", - type=float, - default=get_env_value("COSINE_THRESHOLD", 0.2, float), - help="Cosine similarity threshold (default: from env or 0.4)", - ) - - # Ollama model name - parser.add_argument( - "--simulated-model-name", - type=str, - default=get_env_value( - "SIMULATED_MODEL_NAME", ollama_server_infos.LIGHTRAG_MODEL - ), - help="Number of conversation history turns to include (default: from env or 3)", - ) - - # Namespace - parser.add_argument( - "--namespace-prefix", - type=str, - default=get_env_value("NAMESPACE_PREFIX", ""), - help="Prefix of the namespace", - ) - - parser.add_argument( - "--auto-scan-at-startup", - action="store_true", - default=False, - help="Enable automatic scanning when the program starts", - ) - - # Server workers configuration - parser.add_argument( - "--workers", - type=int, - default=get_env_value("WORKERS", 1, int), - help="Number of worker processes (default: from env or 1)", - ) - - # LLM and embedding bindings - parser.add_argument( - "--llm-binding", - type=str, - default=get_env_value("LLM_BINDING", "ollama"), - choices=["lollms", "ollama", "openai", "openai-ollama", "azure_openai"], - help="LLM binding type (default: from env or ollama)", - ) - parser.add_argument( - "--embedding-binding", - type=str, - default=get_env_value("EMBEDDING_BINDING", "ollama"), - choices=["lollms", "ollama", "openai", "azure_openai"], - help="Embedding binding type (default: from env or ollama)", - ) - - args = parser.parse_args() - - # If in uvicorn mode and workers > 1, force it to 1 and log warning - if is_uvicorn_mode and args.workers > 1: - original_workers = args.workers - args.workers = 1 - # Log warning directly here - logging.warning( - f"In uvicorn mode, workers parameter was set to {original_workers}. 
Forcing workers=1" - ) - - # convert relative path to absolute path - args.working_dir = os.path.abspath(args.working_dir) - args.input_dir = os.path.abspath(args.input_dir) - - # Inject storage configuration from environment variables - args.kv_storage = get_env_value( - "LIGHTRAG_KV_STORAGE", DefaultRAGStorageConfig.KV_STORAGE - ) - args.doc_status_storage = get_env_value( - "LIGHTRAG_DOC_STATUS_STORAGE", DefaultRAGStorageConfig.DOC_STATUS_STORAGE - ) - args.graph_storage = get_env_value( - "LIGHTRAG_GRAPH_STORAGE", DefaultRAGStorageConfig.GRAPH_STORAGE - ) - args.vector_storage = get_env_value( - "LIGHTRAG_VECTOR_STORAGE", DefaultRAGStorageConfig.VECTOR_STORAGE - ) - - # Get MAX_PARALLEL_INSERT from environment - args.max_parallel_insert = get_env_value("MAX_PARALLEL_INSERT", 2, int) - - # Handle openai-ollama special case - if args.llm_binding == "openai-ollama": - args.llm_binding = "openai" - args.embedding_binding = "ollama" - - args.llm_binding_host = get_env_value( - "LLM_BINDING_HOST", get_default_host(args.llm_binding) - ) - args.embedding_binding_host = get_env_value( - "EMBEDDING_BINDING_HOST", get_default_host(args.embedding_binding) - ) - args.llm_binding_api_key = get_env_value("LLM_BINDING_API_KEY", None) - args.embedding_binding_api_key = get_env_value("EMBEDDING_BINDING_API_KEY", "") - - # Inject model configuration - args.llm_model = get_env_value("LLM_MODEL", "mistral-nemo:latest") - args.embedding_model = get_env_value("EMBEDDING_MODEL", "bge-m3:latest") - args.embedding_dim = get_env_value("EMBEDDING_DIM", 1024, int) - args.max_embed_tokens = get_env_value("MAX_EMBED_TOKENS", 8192, int) - - # Inject chunk configuration - args.chunk_size = get_env_value("CHUNK_SIZE", 1200, int) - args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int) - - # Inject LLM cache configuration - args.enable_llm_cache_for_extract = get_env_value( - "ENABLE_LLM_CACHE_FOR_EXTRACT", True, bool - ) - - # Inject LLM temperature configuration - args.temperature = get_env_value("TEMPERATURE", 0.5, float) - - # Select Document loading tool (DOCLING, DEFAULT) - args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT") - - ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name - - global_args["main_args"] = args - return args - - def display_splash_screen(args: argparse.Namespace) -> None: """ Display a colorful splash screen showing LightRAG server configuration @@ -518,8 +212,10 @@ def display_splash_screen(args: argparse.Namespace) -> None: ASCIIColors.yellow(f"{args.verbose}") ASCIIColors.white(" ├─ History Turns: ", end="") ASCIIColors.yellow(f"{args.history_turns}") - ASCIIColors.white(" └─ API Key: ", end="") + ASCIIColors.white(" ├─ API Key: ", end="") ASCIIColors.yellow("Set" if args.key else "Not Set") + ASCIIColors.white(" └─ JWT Auth: ", end="") + ASCIIColors.yellow("Enabled" if args.auth_accounts else "Disabled") # Directory Configuration ASCIIColors.magenta("\n📂 Directory Configuration:") @@ -652,6 +348,11 @@ def display_splash_screen(args: argparse.Namespace) -> None: ASCIIColors.white(""" API Key authentication is enabled. Make sure to include the X-API-Key header in all your requests. """) + if args.auth_accounts: + ASCIIColors.yellow("\n⚠️ Security Notice:") + ASCIIColors.white(""" JWT authentication is enabled. + Make sure to login before making the request, and include the 'Authorization' in the header. 
+ """) # Ensure splash output flush to system log sys.stdout.flush() From 787d91cb26fe5cafbf7548956fe0594a82b823ea Mon Sep 17 00:00:00 2001 From: Milin Date: Mon, 31 Mar 2025 11:34:49 +0800 Subject: [PATCH 017/116] refactor(env): optimize env.example --- env.example | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/env.example b/env.example index b81d82b9..adbd82e2 100644 --- a/env.example +++ b/env.example @@ -175,11 +175,11 @@ QDRANT_URL=http://localhost:16333 REDIS_URI=redis://localhost:6379 ### For JWT Auth -#AUTH_ACCOUNTS='admin:admin123,user1:pass456' -#TOKEN_SECRET=Your-Key-For-LightRAG-API-Server -#TOKEN_EXPIRE_HOURS=4 -#GUEST_TOKEN_EXPIRE_HOURS=2 -#JWT_ALGORITHM=HS256 +# AUTH_ACCOUNTS='admin:admin123,user1:pass456' +# TOKEN_SECRET=Your-Key-For-LightRAG-API-Server +# TOKEN_EXPIRE_HOURS=4 +# GUEST_TOKEN_EXPIRE_HOURS=2 +# JWT_ALGORITHM=HS256 ### API-Key to access LightRAG Server API # LIGHTRAG_API_KEY=your-secure-api-key-here From 795b69b275cc882678d0dd6b1dcf4e5331d032ab Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 12:11:21 +0800 Subject: [PATCH 018/116] Add drop support for TiDB --- lightrag/kg/tidb_impl.py | 46 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index 0982c914..a7dc0039 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -20,7 +20,7 @@ if not pm.is_installed("pymysql"): if not pm.is_installed("sqlalchemy"): pm.install("sqlalchemy") -from sqlalchemy import create_engine, text +from sqlalchemy import create_engine, text # type: ignore class TiDB: @@ -278,6 +278,21 @@ class TiDBKVStorage(BaseKVStorage): # Ti handles persistence automatically pass + async def drop(self) -> dict[str, str]: + """Drop the storage""" + try: + table_name = namespace_to_table_name(self.namespace) + if not table_name: + return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} + + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( + table_name=table_name + ) + await self.db.execute(drop_sql, {"workspace": self.db.workspace}) + return {"status": "success", "message": "data dropped"} + except Exception as e: + return {"status": "error", "message": str(e)} + @final @dataclass @@ -416,6 +431,21 @@ class TiDBVectorDBStorage(BaseVectorStorage): # Ti handles persistence automatically pass + async def drop(self) -> dict[str, str]: + """Drop the storage""" + try: + table_name = namespace_to_table_name(self.namespace) + if not table_name: + return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} + + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( + table_name=table_name + ) + await self.db.execute(drop_sql, {"workspace": self.db.workspace}) + return {"status": "success", "message": "data dropped"} + except Exception as e: + return {"status": "error", "message": str(e)} + async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]: """Search for records with IDs starting with a specific prefix. 
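Note for reviewers on the drop() contract used throughout this series: implementations report failure through the returned status dict instead of raising, so a caller can sweep heterogeneous backends and tally the outcomes. A minimal sketch of such a caller follows; drop_all is illustrative and not part of these patches, and it only assumes each storage implements the StorageNameSpace drop() -> dict[str, str] contract:

    async def drop_all(storages) -> dict[str, int]:
        # Tally per-storage outcomes; no exception handling is needed
        # because drop() folds errors into its return value.
        counts = {"success": 0, "error": 0}
        for storage in storages:
            result = await storage.drop()
            counts["success" if result.get("status") == "success" else "error"] += 1
        return counts
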
@@ -710,6 +740,18 @@ class TiDBGraphStorage(BaseGraphStorage):
         # Ti handles persistence automatically
         pass

+    async def drop(self) -> dict[str, str]:
+        """Drop the storage"""
+        try:
+            drop_sql = """
+                DELETE FROM LIGHTRAG_GRAPH_EDGES WHERE workspace = :workspace;
+                DELETE FROM LIGHTRAG_GRAPH_NODES WHERE workspace = :workspace;
+            """
+            await self.db.execute(drop_sql, {"workspace": self.db.workspace})
+            return {"status": "success", "message": "graph data dropped"}
+        except Exception as e:
+            return {"status": "error", "message": str(e)}
+
     async def delete_node(self, node_id: str) -> None:
         """Delete a node and all its related edges

@@ -1129,4 +1171,6 @@ SQL_TEMPLATES = {
         FROM LIGHTRAG_DOC_CHUNKS
         WHERE chunk_id LIKE :prefix_pattern AND workspace = :workspace
     """,
+    # Drop tables
+    "drop_specifiy_table_workspace": "DELETE FROM {table_name} WHERE workspace = :workspace",
 }

From fe270a9ebecc960fe1119874f9961fea49393514 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Mon, 31 Mar 2025 12:18:53 +0800
Subject: [PATCH 019/116] Remove some graph storage support: MongoDB, TiDB and
 Oracle
---
 lightrag/kg/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lightrag/kg/__init__.py b/lightrag/kg/__init__.py
index 4943fc1d..91d42d81 100644
--- a/lightrag/kg/__init__.py
+++ b/lightrag/kg/__init__.py
@@ -14,12 +14,12 @@ STORAGE_IMPLEMENTATIONS = {
         "implementations": [
             "NetworkXStorage",
             "Neo4JStorage",
-            "MongoGraphStorage",
-            "TiDBGraphStorage",
+            # "MongoGraphStorage",
+            # "TiDBGraphStorage",
             "AGEStorage",
             "GremlinStorage",
             "PGGraphStorage",
-            "OracleGraphStorage",
+            # "OracleGraphStorage",
         ],
         "required_methods": ["upsert_node", "upsert_edge"],
     },

From bbc770d1ed70e98838a88c6a5b6551f5c37d01b5 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Mon, 31 Mar 2025 13:01:52 +0800
Subject: [PATCH 020/116] feat(api): enhance document clearing error handling
 and status reporting

- Change pipeline busy status from "error" to "busy"
- Improve error handling documentation
---
 lightrag/api/routers/document_routes.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index e683b30b..6f7c3e48 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -767,10 +767,16 @@ def create_document_routes(

         Returns:
             InsertResponse: A response object containing the status and message.
+            - status="success": All documents and files were successfully cleared.
+            - status="partial_success": Document clear job exited with some errors.
+            - status="busy": Operation could not be completed because the pipeline is busy.
+            - status="fail": All storage drop operations failed; see message for details.
+            - message: Detailed information about the operation results, including counts
+              of deleted files and any errors encountered.

         Raises:
-            HTTPException: If an error occurs during the clearing process (500) or if
-                the pipeline is busy (400).
+            HTTPException: Raised when a serious error occurs during the clearing process,
+                with status code 500 and error details in the detail field.
""" from lightrag.kg.shared_storage import get_namespace_data, get_pipeline_status_lock @@ -782,7 +788,7 @@ def create_document_routes( async with pipeline_status_lock: if pipeline_status.get("busy", False): return InsertResponse( - status="error", + status="busy", message="Cannot clear documents while pipeline is busy" ) # Set busy to true @@ -843,6 +849,17 @@ def create_document_routes( f"Successfully dropped all {storage_success_count} storage components" ) + # If all storage operations failed, return error status and don't proceed with file deletion + if storage_success_count == 0 and storage_error_count > 0: + error_message = "All storage drop operations failed. Aborting document clearing process." + logger.error(error_message) + if "history_messages" in pipeline_status: + pipeline_status["history_messages"].append(error_message) + return InsertResponse( + status="fail", + message=error_message + ) + # Log file deletion start if "history_messages" in pipeline_status: pipeline_status["history_messages"].append("Starting to delete files in input directory") From 81f887ebaba49103f4c6828fd6722243efaa957c Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 14:14:32 +0800 Subject: [PATCH 021/116] feat: Remove immediate persistence in delete operation - Enhance delete implementation in JsonKVStorage by removing immediate persistence in delete operation - Update documentation for drop method to clarify persistence behavior - Add abstract delete method to BaseKVStorage --- lightrag/base.py | 17 +++++++++++++++++ lightrag/kg/json_kv_impl.py | 15 ++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/lightrag/base.py b/lightrag/base.py index bff92b34..f35440c1 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -123,6 +123,7 @@ class StorageNameSpace(ABC): 3. Reset the storage to its initial state 4. Handle cleanup of any resources 5. Notify other processes if necessary + 6. This action should persistent the data to disk immediately. Returns: dict[str, str]: Operation status and message with the following format: @@ -207,6 +208,22 @@ class BaseKVStorage(StorageNameSpace, ABC): async def upsert(self, data: dict[str, dict[str, Any]]) -> None: """Upsert data""" + @abstractmethod + async def delete(self, ids: list[str]) -> None: + """Delete specific records from storage by their IDs + + This method will: + 1. Remove the specified records from in-memory storage + 2. For in-memory DB, update flags to notify other processes that data persistence is needed + 3. For in-memory DB, changes will be persisted to disk during the next index_done_callback + + Args: + ids (list[str]): List of document IDs to be deleted from storage + + Returns: + None + """ + @dataclass class BaseGraphStorage(StorageNameSpace, ABC): diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 2ca9c03e..ea4fb51b 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -122,14 +122,27 @@ class JsonKVStorage(BaseKVStorage): await set_all_update_flags(self.namespace) async def delete(self, ids: list[str]) -> None: + """Delete specific records from storage by their IDs + + This method will: + 1. Remove the specified records from in-memory storage + 2. Update flags to notify other processes that data persistence is needed + 3. 
The changes will be persisted to disk during the next index_done_callback + + Args: + ids (list[str]): List of document IDs to be deleted from storage + + Returns: + None + """ async with self._storage_lock: for doc_id in ids: self._data.pop(doc_id, None) await set_all_update_flags(self.namespace) - await self.index_done_callback() async def drop(self) -> dict[str, str]: """Drop all data from storage and clean up resources + This action will persistent the data to disk immediately. This method will: 1. Clear all data from memory From 2cb64ad2803ec2a546c0b701134a5c1791f87fc1 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 14:46:36 +0800 Subject: [PATCH 022/116] feat: Remove immediate persistence in delete operation for JsonDocStatusStorage --- lightrag/kg/json_doc_status_impl.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py index 0a3f5470..bbd0cd8e 100644 --- a/lightrag/kg/json_doc_status_impl.py +++ b/lightrag/kg/json_doc_status_impl.py @@ -123,11 +123,23 @@ class JsonDocStatusStorage(DocStatusStorage): return self._data.get(id) async def delete(self, doc_ids: list[str]): + """Delete specific records from storage by their IDs + + This method will: + 1. Remove the specified records from in-memory storage + 2. Update flags to notify other processes that data persistence is needed + 3. The changes will be persisted to disk during the next index_done_callback + + Args: + ids (list[str]): List of document IDs to be deleted from storage + + Returns: + None + """ async with self._storage_lock: for doc_id in doc_ids: self._data.pop(doc_id, None) await set_all_update_flags(self.namespace) - await self.index_done_callback() async def drop(self) -> dict[str, str]: """Drop all document status data from storage and clean up resources From 1772e7a8870fece1c8c2ad058ebc636babb5c671 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 16:21:20 +0800 Subject: [PATCH 023/116] Add delete support to all storage implementation --- lightrag/base.py | 60 +++++++++++++++++--- lightrag/kg/faiss_impl.py | 22 +++++++- lightrag/kg/json_doc_status_impl.py | 12 ++-- lightrag/kg/json_kv_impl.py | 12 ++-- lightrag/kg/nano_vector_db_impl.py | 30 +++++++++- lightrag/kg/networkx_impl.py | 31 +++++++++- lightrag/kg/oracle_impl.py | 27 +++++++++ lightrag/kg/tidb_impl.py | 88 ++++++++++++++++++++++++++++- lightrag/lightrag.py | 6 ++ 9 files changed, 266 insertions(+), 22 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index f35440c1..6b9163df 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -153,15 +153,33 @@ class BaseVectorStorage(StorageNameSpace, ABC): @abstractmethod async def upsert(self, data: dict[str, dict[str, Any]]) -> None: - """Insert or update vectors in the storage.""" + """Insert or update vectors in the storage. + + Importance notes for in-memory storage: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + """ @abstractmethod async def delete_entity(self, entity_name: str) -> None: - """Delete a single entity by its name.""" + """Delete a single entity by its name. + + Importance notes for in-memory storage: + 1. Changes will be persisted to disk during the next index_done_callback + 2. 
Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + """ @abstractmethod async def delete_entity_relation(self, entity_name: str) -> None: - """Delete relations for a given entity.""" + """Delete relations for a given entity. + + Importance notes for in-memory storage: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + """ @abstractmethod async def get_by_id(self, id: str) -> dict[str, Any] | None: @@ -187,6 +205,19 @@ class BaseVectorStorage(StorageNameSpace, ABC): """ pass + @abstractmethod + async def delete(self, ids: list[str]): + """Delete vectors with specified IDs + + Importance notes for in-memory storage: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + + Args: + ids: List of vector IDs to be deleted + """ + @dataclass class BaseKVStorage(StorageNameSpace, ABC): @@ -206,16 +237,20 @@ class BaseKVStorage(StorageNameSpace, ABC): @abstractmethod async def upsert(self, data: dict[str, dict[str, Any]]) -> None: - """Upsert data""" + """Upsert data + + Importance notes for in-memory storage: + 1. Changes will be persisted to disk during the next index_done_callback + 2. update flags to notify other processes that data persistence is needed + """ @abstractmethod async def delete(self, ids: list[str]) -> None: """Delete specific records from storage by their IDs - This method will: - 1. Remove the specified records from in-memory storage - 2. For in-memory DB, update flags to notify other processes that data persistence is needed - 3. For in-memory DB, changes will be persisted to disk during the next index_done_callback + Importance notes for in-memory storage: + 1. Changes will be persisted to disk during the next index_done_callback + 2. update flags to notify other processes that data persistence is needed Args: ids (list[str]): List of document IDs to be deleted from storage @@ -267,7 +302,14 @@ class BaseGraphStorage(StorageNameSpace, ABC): async def upsert_edge( self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] ) -> None: - """Delete a node from the graph.""" + """Delete a node from the graph. + + Importance notes for in-memory storage: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + """ + @abstractmethod async def delete_node(self, node_id: str) -> None: diff --git a/lightrag/kg/faiss_impl.py b/lightrag/kg/faiss_impl.py index 42133090..1e0659cb 100644 --- a/lightrag/kg/faiss_impl.py +++ b/lightrag/kg/faiss_impl.py @@ -217,6 +217,11 @@ class FaissVectorDBStorage(BaseVectorStorage): async def delete(self, ids: list[str]): """ Delete vectors for the provided custom IDs. + + Importance notes: + 1. Changes will be persisted to disk during the next index_done_callback + 2. 
Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption """ logger.info(f"Deleting {len(ids)} vectors from {self.namespace}") to_remove = [] @@ -232,13 +237,22 @@ class FaissVectorDBStorage(BaseVectorStorage): ) async def delete_entity(self, entity_name: str) -> None: + """ + Importance notes: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + """ entity_id = compute_mdhash_id(entity_name, prefix="ent-") logger.debug(f"Attempting to delete entity {entity_name} with ID {entity_id}") await self.delete([entity_id]) async def delete_entity_relation(self, entity_name: str) -> None: """ - Delete relations for a given entity by scanning metadata. + Importance notes: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption """ logger.debug(f"Searching relations for entity {entity_name}") relations = [] @@ -433,6 +447,12 @@ class FaissVectorDBStorage(BaseVectorStorage): async def drop(self) -> dict[str, str]: """Drop all vector data from storage and clean up resources + This method will: + 1. Remove the vector database storage file if it exists + 2. Reinitialize the vector database client + 3. Update flags to notify other processes + 4. Changes is persisted to disk immediately + This method will remove all vectors from the Faiss index and delete the storage files. Returns: diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py index bbd0cd8e..33b61eea 100644 --- a/lightrag/kg/json_doc_status_impl.py +++ b/lightrag/kg/json_doc_status_impl.py @@ -109,6 +109,11 @@ class JsonDocStatusStorage(DocStatusStorage): await clear_all_update_flags(self.namespace) async def upsert(self, data: dict[str, dict[str, Any]]) -> None: + """ + Importance notes for in-memory storage: + 1. Changes will be persisted to disk during the next index_done_callback + 2. update flags to notify other processes that data persistence is needed + """ if not data: return logger.info(f"Inserting {len(data)} records to {self.namespace}") @@ -125,10 +130,9 @@ class JsonDocStatusStorage(DocStatusStorage): async def delete(self, doc_ids: list[str]): """Delete specific records from storage by their IDs - This method will: - 1. Remove the specified records from in-memory storage - 2. Update flags to notify other processes that data persistence is needed - 3. The changes will be persisted to disk during the next index_done_callback + Importance notes for in-memory storage: + 1. Changes will be persisted to disk during the next index_done_callback + 2. update flags to notify other processes that data persistence is needed Args: ids (list[str]): List of document IDs to be deleted from storage diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index ea4fb51b..4972bf6a 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -114,6 +114,11 @@ class JsonKVStorage(BaseKVStorage): return set(keys) - set(self._data.keys()) async def upsert(self, data: dict[str, dict[str, Any]]) -> None: + """ + Importance notes for in-memory storage: + 1. Changes will be persisted to disk during the next index_done_callback + 2. 
update flags to notify other processes that data persistence is needed + """ if not data: return logger.info(f"Inserting {len(data)} records to {self.namespace}") @@ -124,10 +129,9 @@ class JsonKVStorage(BaseKVStorage): async def delete(self, ids: list[str]) -> None: """Delete specific records from storage by their IDs - This method will: - 1. Remove the specified records from in-memory storage - 2. Update flags to notify other processes that data persistence is needed - 3. The changes will be persisted to disk during the next index_done_callback + Importance notes for in-memory storage: + 1. Changes will be persisted to disk during the next index_done_callback + 2. update flags to notify other processes that data persistence is needed Args: ids (list[str]): List of document IDs to be deleted from storage diff --git a/lightrag/kg/nano_vector_db_impl.py b/lightrag/kg/nano_vector_db_impl.py index 0f907a42..8c00437d 100644 --- a/lightrag/kg/nano_vector_db_impl.py +++ b/lightrag/kg/nano_vector_db_impl.py @@ -78,6 +78,13 @@ class NanoVectorDBStorage(BaseVectorStorage): return self._client async def upsert(self, data: dict[str, dict[str, Any]]) -> None: + """ + Importance notes: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + """ + logger.info(f"Inserting {len(data)} to {self.namespace}") if not data: return @@ -146,6 +153,11 @@ class NanoVectorDBStorage(BaseVectorStorage): async def delete(self, ids: list[str]): """Delete vectors with specified IDs + Importance notes: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + Args: ids: List of vector IDs to be deleted """ @@ -159,6 +171,13 @@ class NanoVectorDBStorage(BaseVectorStorage): logger.error(f"Error while deleting vectors from {self.namespace}: {e}") async def delete_entity(self, entity_name: str) -> None: + """ + Importance notes: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + """ + try: entity_id = compute_mdhash_id(entity_name, prefix="ent-") logger.debug( @@ -176,6 +195,13 @@ class NanoVectorDBStorage(BaseVectorStorage): logger.error(f"Error deleting entity {entity_name}: {e}") async def delete_entity_relation(self, entity_name: str) -> None: + """ + Importance notes: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + """ + try: client = await self._get_client() storage = getattr(client, "_NanoVectorDB__storage") @@ -288,7 +314,9 @@ class NanoVectorDBStorage(BaseVectorStorage): 1. Remove the vector database storage file if it exists 2. Reinitialize the vector database client 3. Update flags to notify other processes - 4. Trigger index_done_callback to save the empty state + 4. 
Changes is persisted to disk immediately + + This method is intended for use in scenarios where all data needs to be removed, Returns: dict[str, str]: Operation status and message diff --git a/lightrag/kg/networkx_impl.py b/lightrag/kg/networkx_impl.py index 99e0e223..0baa72a3 100644 --- a/lightrag/kg/networkx_impl.py +++ b/lightrag/kg/networkx_impl.py @@ -156,16 +156,34 @@ class NetworkXStorage(BaseGraphStorage): return None async def upsert_node(self, node_id: str, node_data: dict[str, str]) -> None: + """ + Importance notes: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + """ graph = await self._get_graph() graph.add_node(node_id, **node_data) async def upsert_edge( self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] ) -> None: + """ + Importance notes: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + """ graph = await self._get_graph() graph.add_edge(source_node_id, target_node_id, **edge_data) async def delete_node(self, node_id: str) -> None: + """ + Importance notes: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + """ graph = await self._get_graph() if graph.has_node(node_id): graph.remove_node(node_id) @@ -173,6 +191,7 @@ class NetworkXStorage(BaseGraphStorage): else: logger.warning(f"Node {node_id} not found in the graph for deletion.") + # TODO: NOT USED async def embed_nodes( self, algorithm: str ) -> tuple[np.ndarray[Any, Any], list[str]]: @@ -193,6 +212,11 @@ class NetworkXStorage(BaseGraphStorage): async def remove_nodes(self, nodes: list[str]): """Delete multiple nodes + Importance notes: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + Args: nodes: List of node IDs to be deleted """ @@ -204,6 +228,11 @@ class NetworkXStorage(BaseGraphStorage): async def remove_edges(self, edges: list[tuple[str, str]]): """Delete multiple edges + Importance notes: + 1. Changes will be persisted to disk during the next index_done_callback + 2. Only one process should updating the storage at a time before index_done_callback, + KG-storage-log should be used to avoid data corruption + Args: edges: List of edges to be deleted, each edge is a (source, target) tuple """ @@ -433,7 +462,7 @@ class NetworkXStorage(BaseGraphStorage): 1. Remove the graph storage file if it exists 2. Reset the graph to an empty state 3. Update flags to notify other processes - 4. Trigger index_done_callback to save the empty state + 4. Changes is persisted to disk immediately Returns: dict[str, str]: Operation status and message diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index 0477ea03..2560502b 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -392,6 +392,33 @@ class OracleKVStorage(BaseKVStorage): # Oracle handles persistence automatically pass + async def delete(self, ids: list[str]) -> dict[str, str]: + """Delete records with specified IDs from the storage. 
+ + Args: + ids: List of record IDs to be deleted + + Returns: + Dictionary with status and message + """ + if not ids: + return {"status": "success", "message": "No IDs provided for deletion"} + + try: + table_name = namespace_to_table_name(self.namespace) + if not table_name: + return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} + + ids_list = ",".join([f"'{id}'" for id in ids]) + delete_sql = f"DELETE FROM {table_name} WHERE workspace=:workspace AND id IN ({ids_list})" + + await self.db.execute(delete_sql, {"workspace": self.db.workspace}) + logger.info(f"Successfully deleted {len(ids)} records from {self.namespace}") + return {"status": "success", "message": f"Successfully deleted {len(ids)} records"} + except Exception as e: + logger.error(f"Error deleting records from {self.namespace}: {e}") + return {"status": "error", "message": str(e)} + async def drop(self) -> dict[str, str]: """Drop the storage""" try: diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index a7dc0039..895e5ebb 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -278,6 +278,35 @@ class TiDBKVStorage(BaseKVStorage): # Ti handles persistence automatically pass + async def delete(self, ids: list[str]) -> dict[str, str]: + """Delete records with specified IDs from the storage. + + Args: + ids: List of record IDs to be deleted + + Returns: + Dictionary with status and message + """ + if not ids: + return {"status": "success", "message": "No IDs provided for deletion"} + + try: + table_name = namespace_to_table_name(self.namespace) + id_field = namespace_to_id(self.namespace) + + if not table_name or not id_field: + return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} + + ids_list = ",".join([f"'{id}'" for id in ids]) + delete_sql = f"DELETE FROM {table_name} WHERE workspace = :workspace AND {id_field} IN ({ids_list})" + + await self.db.execute(delete_sql, {"workspace": self.db.workspace}) + logger.info(f"Successfully deleted {len(ids)} records from {self.namespace}") + return {"status": "success", "message": f"Successfully deleted {len(ids)} records"} + except Exception as e: + logger.error(f"Error deleting records from {self.namespace}: {e}") + return {"status": "error", "message": str(e)} + async def drop(self) -> dict[str, str]: """Drop the storage""" try: @@ -421,11 +450,66 @@ class TiDBVectorDBStorage(BaseVectorStorage): params = {"workspace": self.db.workspace, "status": status} return await self.db.query(SQL, params, multirows=True) + async def delete(self, ids: list[str]) -> None: + """Delete vectors with specified IDs from the storage. + + Args: + ids: List of vector IDs to be deleted + """ + if not ids: + return + + table_name = namespace_to_table_name(self.namespace) + id_field = namespace_to_id(self.namespace) + + if not table_name or not id_field: + logger.error(f"Unknown namespace for vector deletion: {self.namespace}") + return + + ids_list = ",".join([f"'{id}'" for id in ids]) + delete_sql = f"DELETE FROM {table_name} WHERE workspace = :workspace AND {id_field} IN ({ids_list})" + + try: + await self.db.execute(delete_sql, {"workspace": self.db.workspace}) + logger.debug(f"Successfully deleted {len(ids)} vectors from {self.namespace}") + except Exception as e: + logger.error(f"Error while deleting vectors from {self.namespace}: {e}") + async def delete_entity(self, entity_name: str) -> None: - raise NotImplementedError + """Delete an entity by its name from the vector storage. 
+
+        Args:
+            entity_name: The name of the entity to delete
+        """
+        try:
+            # Construct SQL to delete the entity
+            delete_sql = """DELETE FROM LIGHTRAG_GRAPH_NODES
+                            WHERE workspace = :workspace AND name = :entity_name"""
+
+            await self.db.execute(
+                delete_sql, {"workspace": self.db.workspace, "entity_name": entity_name}
+            )
+            logger.debug(f"Successfully deleted entity {entity_name}")
+        except Exception as e:
+            logger.error(f"Error deleting entity {entity_name}: {e}")

     async def delete_entity_relation(self, entity_name: str) -> None:
-        raise NotImplementedError
+        """Delete all relations associated with an entity.
+
+        Args:
+            entity_name: The name of the entity whose relations should be deleted
+        """
+        try:
+            # Delete relations where the entity is either the source or target
+            delete_sql = """DELETE FROM LIGHTRAG_GRAPH_EDGES
+                            WHERE workspace = :workspace
+                            AND (source_name = :entity_name OR target_name = :entity_name)"""
+
+            await self.db.execute(
+                delete_sql, {"workspace": self.db.workspace, "entity_name": entity_name}
+            )
+            logger.debug(f"Successfully deleted relations for entity {entity_name}")
+        except Exception as e:
+            logger.error(f"Error deleting relations for entity {entity_name}: {e}")

     async def index_done_callback(self) -> None:
         # Ti handles persistence automatically
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 283bd4a4..81797385 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -1449,6 +1449,7 @@ class LightRAG:
         loop = always_get_an_event_loop()
         return loop.run_until_complete(self.adelete_by_entity(entity_name))

+    # TODO: Lock all KG-related DBs to ensure consistency across multiple processes
     async def adelete_by_entity(self, entity_name: str) -> None:
         try:
             await self.entities_vdb.delete_entity(entity_name)
@@ -1486,6 +1487,7 @@ class LightRAG:
             self.adelete_by_relation(source_entity, target_entity)
         )

+    # TODO: Lock all KG-related DBs to ensure consistency across multiple processes
     async def adelete_by_relation(self, source_entity: str, target_entity: str) -> None:
         """Asynchronously delete a relation between two entities.
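The locking TODOs being added in these hunks flag a real gap: each delete touches the vector stores and the graph with no cross-storage guard, so a concurrent reader can observe a half-deleted entity. One possible shape for the fix, sketched under the assumption of a single-process deployment (a multi-process setup would need the shared-storage lock primitives instead; the lock and helper below are illustrative, not part of the patch, with attribute names following LightRAG.adelete_by_entity):

    import asyncio

    _kg_write_lock = asyncio.Lock()  # assumed module-level guard

    async def adelete_entity_locked(rag, entity_name: str) -> None:
        # Hold one lock across all three mutations so no other coroutine
        # sees the entity gone from one store but present in another.
        async with _kg_write_lock:
            await rag.entities_vdb.delete_entity(entity_name)
            await rag.relationships_vdb.delete_entity_relation(entity_name)
            await rag.chunk_entity_relation_graph.delete_node(entity_name)
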
@@ -1555,6 +1557,7 @@ class LightRAG:
         """
         return await self.doc_status.get_docs_by_status(status)

+    # TODO: Lock all KG-related DBs to ensure consistency across multiple processes
     async def adelete_by_doc_id(self, doc_id: str) -> None:
         """Delete a document and all its related data

@@ -1907,6 +1910,7 @@ class LightRAG:
         """Synchronous version of aclear_cache."""
         return always_get_an_event_loop().run_until_complete(self.aclear_cache(modes))

+    # TODO: Lock all KG-related DBs to ensure consistency across multiple processes
     async def aedit_entity(
         self, entity_name: str, updated_data: dict[str, str], allow_rename: bool = True
     ) -> dict[str, Any]:
@@ -2119,6 +2123,7 @@ class LightRAG:
             ]
         )

+    # TODO: Lock all KG-related DBs to ensure consistency across multiple processes
     async def aedit_relation(
         self, source_entity: str, target_entity: str, updated_data: dict[str, Any]
     ) -> dict[str, Any]:
@@ -2433,6 +2438,7 @@ class LightRAG:
             self.acreate_relation(source_entity, target_entity, relation_data)
         )

+    # TODO: Lock all KG-related DBs to ensure consistency across multiple processes
     async def amerge_entities(
         self,
         source_entities: list[str],

From 04967b33cce47d98b81f785978765f1f48caaf42 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Mon, 31 Mar 2025 19:13:27 +0800
Subject: [PATCH 024/116] feat(api): Add dedicated ClearDocumentsResponse class
 for document deletion endpoint

---
 lightrag/api/routers/document_routes.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 6f7c3e48..987695f7 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -59,6 +59,11 @@ class InsertResponse(BaseModel):
     message: str = Field(description="Message describing the operation result")


+class ClearDocumentsResponse(BaseModel):
+    status: str = Field(description="Status of the clear operation: success/partial_success/busy/fail")
+    message: str = Field(description="Message describing the operation result")
+
+
 class DocStatusResponse(BaseModel):
     @staticmethod
     def format_datetime(dt: Any) -> Optional[str]:
@@ -755,7 +760,7 @@ def create_document_routes(
             raise HTTPException(status_code=500, detail=str(e))

     @router.delete(
-        "", response_model=InsertResponse, dependencies=[Depends(combined_auth)]
+        "", response_model=ClearDocumentsResponse, dependencies=[Depends(combined_auth)]
     )
     async def clear_documents():
         """
@@ -766,7 +771,7 @@ def create_document_routes(
         from the input directory.

         Returns:
-            InsertResponse: A response object containing the status and message.
+            ClearDocumentsResponse: A response object containing the status and message.
             - status="success": All documents and files were successfully cleared.
             - status="partial_success": Document clear job exited with some errors.
             - status="busy": Operation could not be completed because the pipeline is busy.
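A sketch of how a caller might branch on these statuses, assuming a local server and the default document route prefix (URL and port are illustrative):

import requests

resp = requests.delete("http://localhost:9621/documents", timeout=300)
body = resp.json()

if body["status"] == "busy":
    print("Pipeline busy, retry later:", body["message"])
elif body["status"] in ("success", "partial_success"):
    print("Cleared:", body["message"])
else:  # "fail"
    raise RuntimeError(body["message"])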
@@ -787,7 +792,7 @@ def create_document_routes(
         # Check and set status with lock
         async with pipeline_status_lock:
             if pipeline_status.get("busy", False):
-                return InsertResponse(
+                return ClearDocumentsResponse(
                     status="busy", message="Cannot clear documents while pipeline is busy"
                 )

@@ -855,7 +860,7 @@ def create_document_routes(
                 logger.error(error_message)
                 if "history_messages" in pipeline_status:
                     pipeline_status["history_messages"].append(error_message)
-                return InsertResponse(
+                return ClearDocumentsResponse(
                     status="fail", message=error_message
                 )

@@ -904,7 +909,7 @@ def create_document_routes(
             pipeline_status["history_messages"].append(final_message)

         # Return response based on results
-        return InsertResponse(
+        return ClearDocumentsResponse(
             status=status, message=final_message
         )

From 3d4f8f67c947c8dfc8ba2cfcdc19d1ea2a78cd61 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Mon, 31 Mar 2025 23:10:21 +0800
Subject: [PATCH 025/116] Add drop_cache_by_modes to all KV storage
 implementations

---
 lightrag/base.py                    | 19 +++++++-
 lightrag/kg/json_doc_status_impl.py |  2 +-
 lightrag/kg/json_kv_impl.py         | 24 ++++++++++
 lightrag/kg/mongo_impl.py           | 22 ++++++++++
 lightrag/kg/oracle_impl.py          | 47 ++++++++++++++++----
 lightrag/kg/postgres_impl.py        | 68 ++++++++++++++++++++++++++---
 lightrag/kg/redis_impl.py           | 22 ++++++++++
 lightrag/kg/tidb_impl.py            | 47 ++++++++++++++++----
 8 files changed, 228 insertions(+), 23 deletions(-)

diff --git a/lightrag/base.py b/lightrag/base.py
index 6b9163df..05f30d3c 100644
--- a/lightrag/base.py
+++ b/lightrag/base.py
@@ -259,6 +259,20 @@ class BaseKVStorage(StorageNameSpace, ABC):
             None
         """

+    async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool:
+        """Delete specific records from storage by cache mode
+
+        Important notes for in-memory storage:
+        1. Changes will be persisted to disk during the next index_done_callback
+        2.
update flags to notify other processes that data persistence is needed
+
+        Args:
+            modes (list[str]): List of cache modes to be dropped from storage
+
+        Returns:
+            True: if the cache was dropped successfully
+            False: if the cache drop failed, or the cache mode is not supported
+        """

 @dataclass
 class BaseGraphStorage(StorageNameSpace, ABC):
@@ -310,7 +324,6 @@ class BaseGraphStorage(StorageNameSpace, ABC):
         KG-storage-log should be used to avoid data corruption
         """

-    @abstractmethod
     async def delete_node(self, node_id: str) -> None:
         """Embed nodes using an algorithm."""

@@ -381,6 +394,10 @@ class DocStatusStorage(BaseKVStorage, ABC):
     ) -> dict[str, DocProcessingStatus]:
         """Get all documents with a specific status"""

+    async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool:
+        """Drop cache is not supported for Doc Status storage"""
+        return False
+

 class StoragesStatus(str, Enum):
     """Storages status"""

diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py
index 33b61eea..003a6733 100644
--- a/lightrag/kg/json_doc_status_impl.py
+++ b/lightrag/kg/json_doc_status_impl.py
@@ -127,7 +127,7 @@ class JsonDocStatusStorage(DocStatusStorage):
         async with self._storage_lock:
             return self._data.get(id)

-    async def delete(self, doc_ids: list[str]):
+    async def delete(self, doc_ids: list[str]) -> None:
         """Delete specific records from storage by their IDs

         Important notes for in-memory storage:
diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py
index 4972bf6a..8857aa9a 100644
--- a/lightrag/kg/json_kv_impl.py
+++ b/lightrag/kg/json_kv_impl.py
@@ -144,6 +144,30 @@ class JsonKVStorage(BaseKVStorage):
                 self._data.pop(doc_id, None)
             await set_all_update_flags(self.namespace)

+    async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool:
+        """Delete specific records from storage by cache mode
+
+        Important notes for in-memory storage:
+        1. Changes will be persisted to disk during the next index_done_callback
+        2. update flags to notify other processes that data persistence is needed
+
+        Args:
+            modes (list[str]): List of cache modes to be dropped from storage
+
+        Returns:
+            True: if the cache was dropped successfully
+            False: if the cache drop failed
+        """
+        if not modes:
+            return False
+
+        try:
+            await self.delete(modes)
+            return True
+        except Exception:
+            return False
+
+
     async def drop(self) -> dict[str, str]:
         """Drop all data from storage and clean up resources
         This action persists the data to disk immediately.
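In the JSON implementation the LLM response cache is keyed by mode at the top level, so dropping a mode reduces to popping a key. A stand-alone illustration of that contract (toy dict, not the real storage class):

cache = {
    "default": {"hash-1": {"return": "..."}},
    "local": {"hash-2": {"return": "..."}},
}

def drop_cache_by_modes(data: dict, modes: list[str] | None) -> bool:
    if not modes:
        return False
    for mode in modes:
        data.pop(mode, None)  # missing modes are ignored, like delete()
    return True

assert drop_cache_by_modes(cache, ["local"]) and "local" not in cache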
diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py
index d5832af0..24c215be 100644
--- a/lightrag/kg/mongo_impl.py
+++ b/lightrag/kg/mongo_impl.py
@@ -165,6 +165,28 @@ class MongoKVStorage(BaseKVStorage):
         except PyMongoError as e:
             logger.error(f"Error deleting documents from {self.namespace}: {e}")

+    async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool:
+        """Delete specific records from storage by cache mode
+
+        Args:
+            modes (list[str]): List of cache modes to be dropped from storage
+
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        if not modes:
+            return False
+
+        try:
+            # Build regex pattern to match documents with the specified modes
+            pattern = f"^({'|'.join(modes)})_"
+            result = await self._data.delete_many({"_id": {"$regex": pattern}})
+            logger.info(f"Deleted {result.deleted_count} documents by modes: {modes}")
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting cache by modes {modes}: {e}")
+            return False
+
     async def drop(self) -> dict[str, str]:
         """Drop the storage by removing all documents in the collection.

diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py
index 2560502b..6d3e2e8d 100644
--- a/lightrag/kg/oracle_impl.py
+++ b/lightrag/kg/oracle_impl.py
@@ -392,32 +392,63 @@ class OracleKVStorage(BaseKVStorage):
         # Oracle handles persistence automatically
         pass

-    async def delete(self, ids: list[str]) -> dict[str, str]:
+    async def delete(self, ids: list[str]) -> None:
         """Delete records with specified IDs from the storage.

         Args:
             ids: List of record IDs to be deleted
-
-        Returns:
-            Dictionary with status and message
         """
         if not ids:
-            return {"status": "success", "message": "No IDs provided for deletion"}
+            return

         try:
             table_name = namespace_to_table_name(self.namespace)
             if not table_name:
-                return {"status": "error", "message": f"Unknown namespace: {self.namespace}"}
+                logger.error(f"Unknown namespace for deletion: {self.namespace}")
+                return

             ids_list = ",".join([f"'{id}'" for id in ids])
             delete_sql = f"DELETE FROM {table_name} WHERE workspace=:workspace AND id IN ({ids_list})"

             await self.db.execute(delete_sql, {"workspace": self.db.workspace})
             logger.info(f"Successfully deleted {len(ids)} records from {self.namespace}")
-            return {"status": "success", "message": f"Successfully deleted {len(ids)} records"}
         except Exception as e:
             logger.error(f"Error deleting records from {self.namespace}: {e}")
-            return {"status": "error", "message": str(e)}
+
+    async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool:
+        """Delete specific records from storage by cache mode
+
+        Args:
+            modes (list[str]): List of cache modes to be dropped from storage
+
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        if not modes:
+            return False
+
+        try:
+            table_name = namespace_to_table_name(self.namespace)
+            if not table_name:
+                return False
+
+            if table_name != "LIGHTRAG_LLM_CACHE":
+                return False
+
+            # Build an Oracle-style IN clause
+            modes_list = ", ".join([f"'{mode}'" for mode in modes])
+            sql = f"""
+            DELETE FROM {table_name}
+            WHERE workspace = :workspace
+            AND cache_mode IN ({modes_list})
+            """
+
+            logger.info(f"Deleting cache by modes: {modes}")
+            await self.db.execute(sql, {"workspace": self.db.workspace})
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting cache by modes {modes}: {e}")
+            return False

     async def drop(self) -> dict[str, str]:
         """Drop the storage"""
         try:
diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py
index ff90d14b..34268e32 100644
---
a/lightrag/kg/postgres_impl.py
+++ b/lightrag/kg/postgres_impl.py
@@ -378,6 +378,67 @@ class PGKVStorage(BaseKVStorage):
         # PG handles persistence automatically
         pass

+    async def delete(self, ids: list[str]) -> None:
+        """Delete specific records from storage by their IDs
+
+        Args:
+            ids (list[str]): List of document IDs to be deleted from storage
+
+        Returns:
+            None
+        """
+        if not ids:
+            return
+
+        table_name = namespace_to_table_name(self.namespace)
+        if not table_name:
+            logger.error(f"Unknown namespace for deletion: {self.namespace}")
+            return
+
+        delete_sql = f"DELETE FROM {table_name} WHERE workspace=$1 AND id = ANY($2)"
+
+        try:
+            await self.db.execute(delete_sql, {"workspace": self.db.workspace, "ids": ids})
+            logger.debug(f"Successfully deleted {len(ids)} records from {self.namespace}")
+        except Exception as e:
+            logger.error(f"Error while deleting records from {self.namespace}: {e}")
+
+    async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool:
+        """Delete specific records from storage by cache mode
+
+        Args:
+            modes (list[str]): List of cache modes to be dropped from storage
+
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        if not modes:
+            return False
+
+        try:
+            table_name = namespace_to_table_name(self.namespace)
+            if not table_name:
+                return False
+
+            if table_name != "LIGHTRAG_LLM_CACHE":
+                return False
+
+            sql = f"""
+            DELETE FROM {table_name}
+            WHERE workspace = $1 AND mode = ANY($2)
+            """
+            params = {
+                "workspace": self.db.workspace,
+                "modes": modes
+            }
+
+            logger.info(f"Deleting cache by modes: {modes}")
+            await self.db.execute(sql, params)
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting cache by modes {modes}: {e}")
+            return False
+
     async def drop(self) -> dict[str, str]:
         """Drop the storage"""
         try:
@@ -558,13 +619,10 @@ class PGVectorStorage(BaseVectorStorage):
             logger.error(f"Unknown namespace for vector deletion: {self.namespace}")
             return

-        ids_list = ",".join([f"'{id}'" for id in ids])
-        delete_sql = (
-            f"DELETE FROM {table_name} WHERE workspace=$1 AND id IN ({ids_list})"
-        )
+        delete_sql = f"DELETE FROM {table_name} WHERE workspace=$1 AND id = ANY($2)"

         try:
-            await self.db.execute(delete_sql, {"workspace": self.db.workspace})
+            await self.db.execute(delete_sql, {"workspace": self.db.workspace, "ids": ids})
             logger.debug(
                 f"Successfully deleted {len(ids)} vectors from {self.namespace}"
             )
diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py
index 9ff50008..964b0ad7 100644
--- a/lightrag/kg/redis_impl.py
+++ b/lightrag/kg/redis_impl.py
@@ -84,6 +84,28 @@ class RedisKVStorage(BaseKVStorage):
             f"Deleted {deleted_count} of {len(ids)} entries from {self.namespace}"
         )

+    async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool:
+        """Delete specific records from storage by cache mode
+
+        Important notes for Redis storage:
+        1. This will immediately delete the specified cache modes from Redis
+
+        Args:
+            modes (list[str]): List of cache modes to be dropped from storage
+
+        Returns:
+            True: if the cache was dropped successfully
+            False: if the cache drop failed
+        """
+        if not modes:
+            return False
+
+        try:
+            await self.delete(modes)
+            return True
+        except Exception:
+            return False
+
     async def drop(self) -> dict[str, str]:
         """Drop the storage by removing all keys under the current namespace.
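The Postgres deletes above switch from interpolating quoted IDs into an IN (...) list to binding the whole list as one array parameter with = ANY($2). A stand-alone asyncpg equivalent (DSN and table name are illustrative):

import asyncio
import asyncpg

async def delete_ids(dsn: str, workspace: str, ids: list[str]) -> None:
    conn = await asyncpg.connect(dsn)
    try:
        # The list travels as a single array parameter; no manual quoting
        await conn.execute(
            "DELETE FROM lightrag_doc_chunks WHERE workspace = $1 AND id = ANY($2)",
            workspace, ids,
        )
    finally:
        await conn.close()

# asyncio.run(delete_ids("postgresql://user:pass@localhost/lightrag", "default", ["doc-1"]))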
diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py
index 895e5ebb..3d06ad0a 100644
--- a/lightrag/kg/tidb_impl.py
+++ b/lightrag/kg/tidb_impl.py
@@ -278,34 +278,65 @@ class TiDBKVStorage(BaseKVStorage):
         # Ti handles persistence automatically
         pass

-    async def delete(self, ids: list[str]) -> dict[str, str]:
+    async def delete(self, ids: list[str]) -> None:
         """Delete records with specified IDs from the storage.

         Args:
             ids: List of record IDs to be deleted
-
-        Returns:
-            Dictionary with status and message
         """
         if not ids:
-            return {"status": "success", "message": "No IDs provided for deletion"}
+            return

         try:
             table_name = namespace_to_table_name(self.namespace)
             id_field = namespace_to_id(self.namespace)

             if not table_name or not id_field:
-                return {"status": "error", "message": f"Unknown namespace: {self.namespace}"}
+                logger.error(f"Unknown namespace for deletion: {self.namespace}")
+                return

             ids_list = ",".join([f"'{id}'" for id in ids])
             delete_sql = f"DELETE FROM {table_name} WHERE workspace = :workspace AND {id_field} IN ({ids_list})"

             await self.db.execute(delete_sql, {"workspace": self.db.workspace})
             logger.info(f"Successfully deleted {len(ids)} records from {self.namespace}")
-            return {"status": "success", "message": f"Successfully deleted {len(ids)} records"}
         except Exception as e:
             logger.error(f"Error deleting records from {self.namespace}: {e}")
-            return {"status": "error", "message": str(e)}
+
+    async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool:
+        """Delete specific records from storage by cache mode
+
+        Args:
+            modes (list[str]): List of cache modes to be dropped from storage
+
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        if not modes:
+            return False
+
+        try:
+            table_name = namespace_to_table_name(self.namespace)
+            if not table_name:
+                return False
+
+            if table_name != "LIGHTRAG_LLM_CACHE":
+                return False
+
+            # Build a MySQL-style IN clause
+            modes_list = ", ".join([f"'{mode}'" for mode in modes])
+            sql = f"""
+            DELETE FROM {table_name}
+            WHERE workspace = :workspace
+            AND mode IN ({modes_list})
+            """
+
+            logger.info(f"Deleting cache by modes: {modes}")
+            await self.db.execute(sql, {"workspace": self.db.workspace})
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting cache by modes {modes}: {e}")
+            return False

     async def drop(self) -> dict[str, str]:
         """Drop the storage"""

From 95a8ee27ed707ab2e0ee4aa440fae3fafbb655a5 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Mon, 31 Mar 2025 23:22:27 +0800
Subject: [PATCH 026/116] Fix linting

---
 lightrag/api/routers/document_routes.py | 64 ++++++++++---------
 lightrag/base.py                        | 29 ++++-----
 lightrag/kg/age_impl.py                 | 10 +--
 lightrag/kg/chroma_impl.py              | 16 ++---
 lightrag/kg/faiss_impl.py               | 10 +--
 lightrag/kg/gremlin_impl.py             | 12 ++--
 lightrag/kg/json_doc_status_impl.py     | 14 ++---
 lightrag/kg/json_kv_impl.py             | 23 ++++---
 lightrag/kg/milvus_impl.py              | 16 ++---
 lightrag/kg/mongo_impl.py               | 82 ++++++++++++++++---------
 lightrag/kg/nano_vector_db_impl.py      |  8 ++-
 lightrag/kg/neo4j_impl.py               | 12 ++--
 lightrag/kg/networkx_impl.py            |  8 ++-
 lightrag/kg/oracle_impl.py              | 50 +++++++++------
 lightrag/kg/postgres_impl.py            | 58 ++++++++++-------
 lightrag/kg/qdrant_impl.py              | 36 ++++++-----
 lightrag/kg/redis_impl.py               | 24 ++++----
 lightrag/kg/tidb_impl.py                | 46 ++++++++------
 18 files changed, 296 insertions(+), 222 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 987695f7..144b1274 100644
--- a/lightrag/api/routers/document_routes.py
+++ 
b/lightrag/api/routers/document_routes.py @@ -60,7 +60,9 @@ class InsertResponse(BaseModel): class ClearDocumentsResponse(BaseModel): - status: str = Field(description="Status of the clear operation: success/partial_success/busy/fail") + status: str = Field( + description="Status of the clear operation: success/partial_success/busy/fail" + ) message: str = Field(description="Message describing the operation result") @@ -448,7 +450,7 @@ async def pipeline_index_texts(rag: LightRAG, texts: List[str]): await rag.apipeline_process_enqueue_documents() -# TODO: deprecate after /insert_file is removed +# TODO: deprecate after /insert_file is removed async def save_temp_file(input_dir: Path, file: UploadFile = File(...)) -> Path: """Save the uploaded file to a temporary location @@ -783,7 +785,10 @@ def create_document_routes( HTTPException: Raised when a serious error occurs during the clearing process, with status code 500 and error details in the detail field. """ - from lightrag.kg.shared_storage import get_namespace_data, get_pipeline_status_lock + from lightrag.kg.shared_storage import ( + get_namespace_data, + get_pipeline_status_lock, + ) # Get pipeline status and lock pipeline_status = await get_namespace_data("pipeline_status") @@ -794,14 +799,16 @@ def create_document_routes( if pipeline_status.get("busy", False): return ClearDocumentsResponse( status="busy", - message="Cannot clear documents while pipeline is busy" + message="Cannot clear documents while pipeline is busy", ) # Set busy to true pipeline_status["busy"] = True pipeline_status["job_name"] = "Clearing Documents" pipeline_status["latest_message"] = "Starting document clearing process" if "history_messages" in pipeline_status: - pipeline_status["history_messages"].append("Starting document clearing process") + pipeline_status["history_messages"].append( + "Starting document clearing process" + ) try: # Use drop method to clear all data @@ -813,25 +820,27 @@ def create_document_routes( rag.relationships_vdb, rag.chunks_vdb, rag.chunk_entity_relation_graph, - rag.doc_status + rag.doc_status, ] - + # Log storage drop start if "history_messages" in pipeline_status: - pipeline_status["history_messages"].append("Starting to drop storage components") - + pipeline_status["history_messages"].append( + "Starting to drop storage components" + ) + for storage in storages: if storage is not None: drop_tasks.append(storage.drop()) - + # Wait for all drop tasks to complete drop_results = await asyncio.gather(*drop_tasks, return_exceptions=True) - + # Check for errors and log results errors = [] storage_success_count = 0 storage_error_count = 0 - + for i, result in enumerate(drop_results): storage_name = storages[i].__class__.__name__ if isinstance(result, Exception): @@ -842,7 +851,7 @@ def create_document_routes( else: logger.info(f"Successfully dropped {storage_name}") storage_success_count += 1 - + # Log storage drop results if "history_messages" in pipeline_status: if storage_error_count > 0: @@ -853,26 +862,25 @@ def create_document_routes( pipeline_status["history_messages"].append( f"Successfully dropped all {storage_success_count} storage components" ) - + # If all storage operations failed, return error status and don't proceed with file deletion if storage_success_count == 0 and storage_error_count > 0: error_message = "All storage drop operations failed. Aborting document clearing process." 
logger.error(error_message) if "history_messages" in pipeline_status: pipeline_status["history_messages"].append(error_message) - return ClearDocumentsResponse( - status="fail", - message=error_message - ) - + return ClearDocumentsResponse(status="fail", message=error_message) + # Log file deletion start if "history_messages" in pipeline_status: - pipeline_status["history_messages"].append("Starting to delete files in input directory") - + pipeline_status["history_messages"].append( + "Starting to delete files in input directory" + ) + # Delete all files in input_dir deleted_files_count = 0 file_errors_count = 0 - + for file_path in doc_manager.input_dir.glob("**/*"): if file_path.is_file(): try: @@ -881,7 +889,7 @@ def create_document_routes( except Exception as e: logger.error(f"Error deleting file {file_path}: {str(e)}") file_errors_count += 1 - + # Log file deletion results if "history_messages" in pipeline_status: if file_errors_count > 0: @@ -893,7 +901,7 @@ def create_document_routes( pipeline_status["history_messages"].append( f"Successfully deleted {deleted_files_count} files" ) - + # Prepare final result message final_message = "" if errors: @@ -903,16 +911,12 @@ def create_document_routes( final_message = f"All documents cleared successfully. Deleted {deleted_files_count} files." status = "success" - # Log final result if "history_messages" in pipeline_status: pipeline_status["history_messages"].append(final_message) - + # Return response based on results - return ClearDocumentsResponse( - status=status, - message=final_message - ) + return ClearDocumentsResponse(status=status, message=final_message) except Exception as e: error_msg = f"Error clearing documents: {str(e)}" logger.error(error_msg) diff --git a/lightrag/base.py b/lightrag/base.py index 05f30d3c..223cc7c9 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -111,11 +111,11 @@ class StorageNameSpace(ABC): @abstractmethod async def index_done_callback(self) -> None: """Commit the storage operations after indexing""" - + @abstractmethod async def drop(self) -> dict[str, str]: """Drop all data from storage and clean up resources - + This abstract method defines the contract for dropping all data from a storage implementation. Each storage type must implement this method to: 1. Clear all data from memory and/or external storage @@ -124,14 +124,14 @@ class StorageNameSpace(ABC): 4. Handle cleanup of any resources 5. Notify other processes if necessary 6. This action should persistent the data to disk immediately. - + Returns: dict[str, str]: Operation status and message with the following format: { "status": str, # "success" or "error" "message": str # "data dropped" on success, error details on failure } - + Implementation specific: - On success: return {"status": "success", "message": "data dropped"} - On failure: return {"status": "error", "message": ""} @@ -238,42 +238,43 @@ class BaseKVStorage(StorageNameSpace, ABC): @abstractmethod async def upsert(self, data: dict[str, dict[str, Any]]) -> None: """Upsert data - + Importance notes for in-memory storage: 1. Changes will be persisted to disk during the next index_done_callback - 2. update flags to notify other processes that data persistence is needed + 2. update flags to notify other processes that data persistence is needed """ @abstractmethod async def delete(self, ids: list[str]) -> None: """Delete specific records from storage by their IDs - + Importance notes for in-memory storage: 1. Changes will be persisted to disk during the next index_done_callback 2. 
update flags to notify other processes that data persistence is needed - + Args: ids (list[str]): List of document IDs to be deleted from storage - + Returns: None """ - async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool: + async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool: """Delete specific records from storage by cache mode - + Importance notes for in-memory storage: 1. Changes will be persisted to disk during the next index_done_callback 2. update flags to notify other processes that data persistence is needed - + Args: modes (list[str]): List of cache modes to be dropped from storage - + Returns: True: if the cache drop successfully False: if the cache drop failed, or the cache mode is not supported """ + @dataclass class BaseGraphStorage(StorageNameSpace, ABC): embedding_func: EmbeddingFunc @@ -394,7 +395,7 @@ class DocStatusStorage(BaseKVStorage, ABC): ) -> dict[str, DocProcessingStatus]: """Get all documents with a specific status""" - async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool: + async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool: """Drop cache is not supported for Doc Status storage""" return False diff --git a/lightrag/kg/age_impl.py b/lightrag/kg/age_impl.py index 8530e12d..b744ae1e 100644 --- a/lightrag/kg/age_impl.py +++ b/lightrag/kg/age_impl.py @@ -34,9 +34,9 @@ if not pm.is_installed("psycopg-pool"): if not pm.is_installed("asyncpg"): pm.install("asyncpg") -import psycopg # type: ignore -from psycopg.rows import namedtuple_row # type: ignore -from psycopg_pool import AsyncConnectionPool, PoolTimeout # type: ignore +import psycopg # type: ignore +from psycopg.rows import namedtuple_row # type: ignore +from psycopg_pool import AsyncConnectionPool, PoolTimeout # type: ignore class AGEQueryException(Exception): @@ -871,10 +871,10 @@ class AGEStorage(BaseGraphStorage): async def index_done_callback(self) -> None: # AGES handles persistence automatically pass - + async def drop(self) -> dict[str, str]: """Drop the storage by removing all nodes and relationships in the graph. - + Returns: dict[str, str]: Status of the operation with keys 'status' and 'message' """ diff --git a/lightrag/kg/chroma_impl.py b/lightrag/kg/chroma_impl.py index 052088d4..020e358f 100644 --- a/lightrag/kg/chroma_impl.py +++ b/lightrag/kg/chroma_impl.py @@ -11,8 +11,8 @@ import pipmaster as pm if not pm.is_installed("chromadb"): pm.install("chromadb") -from chromadb import HttpClient, PersistentClient # type: ignore -from chromadb.config import Settings # type: ignore +from chromadb import HttpClient, PersistentClient # type: ignore +from chromadb.config import Settings # type: ignore @final @@ -336,12 +336,12 @@ class ChromaVectorDBStorage(BaseVectorStorage): except Exception as e: logger.error(f"Error retrieving vector data for IDs {ids}: {e}") return [] - + async def drop(self) -> dict[str, str]: """Drop all vector data from storage and clean up resources - + This method will delete all documents from the ChromaDB collection. 
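# The collection-level pattern behind this ChromaDB drop, stand-alone (path
# and collection name are illustrative, not from the patch):
import chromadb

client = chromadb.PersistentClient(path="./chroma_demo")
collection = client.get_or_create_collection("demo")

ids_only = collection.get(include=[])  # fetch IDs without embeddings/documents
if ids_only["ids"]:
    collection.delete(ids=ids_only["ids"])  # Chroma has no truncate primitive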
- + Returns: dict[str, str]: Operation status and message - On success: {"status": "success", "message": "data dropped"} @@ -353,8 +353,10 @@ class ChromaVectorDBStorage(BaseVectorStorage): if result and result["ids"] and len(result["ids"]) > 0: # Delete all documents self._collection.delete(ids=result["ids"]) - - logger.info(f"Process {os.getpid()} drop ChromaDB collection {self.namespace}") + + logger.info( + f"Process {os.getpid()} drop ChromaDB collection {self.namespace}" + ) return {"status": "success", "message": "data dropped"} except Exception as e: logger.error(f"Error dropping ChromaDB collection {self.namespace}: {e}") diff --git a/lightrag/kg/faiss_impl.py b/lightrag/kg/faiss_impl.py index 1e0659cb..cf870b3a 100644 --- a/lightrag/kg/faiss_impl.py +++ b/lightrag/kg/faiss_impl.py @@ -443,10 +443,10 @@ class FaissVectorDBStorage(BaseVectorStorage): results.append({**metadata, "id": metadata.get("__id__")}) return results - + async def drop(self) -> dict[str, str]: """Drop all vector data from storage and clean up resources - + This method will: 1. Remove the vector database storage file if it exists 2. Reinitialize the vector database client @@ -454,7 +454,7 @@ class FaissVectorDBStorage(BaseVectorStorage): 4. Changes is persisted to disk immediately This method will remove all vectors from the Faiss index and delete the storage files. - + Returns: dict[str, str]: Operation status and message - On success: {"status": "success", "message": "data dropped"} @@ -465,7 +465,7 @@ class FaissVectorDBStorage(BaseVectorStorage): # Reset the index self._index = faiss.IndexFlatIP(self._dim) self._id_to_meta = {} - + # Remove storage files if they exist if os.path.exists(self._faiss_index_file): os.remove(self._faiss_index_file) @@ -478,7 +478,7 @@ class FaissVectorDBStorage(BaseVectorStorage): # Notify other processes await set_all_update_flags(self.namespace) self.storage_updated.value = False - + logger.info(f"Process {os.getpid()} drop FAISS index {self.namespace}") return {"status": "success", "message": "data dropped"} except Exception as e: diff --git a/lightrag/kg/gremlin_impl.py b/lightrag/kg/gremlin_impl.py index d616a409..e27c561e 100644 --- a/lightrag/kg/gremlin_impl.py +++ b/lightrag/kg/gremlin_impl.py @@ -24,9 +24,9 @@ from ..base import BaseGraphStorage if not pm.is_installed("gremlinpython"): pm.install("gremlinpython") -from gremlin_python.driver import client, serializer # type: ignore -from gremlin_python.driver.aiohttp.transport import AiohttpTransport # type: ignore -from gremlin_python.driver.protocol import GremlinServerError # type: ignore +from gremlin_python.driver import client, serializer # type: ignore +from gremlin_python.driver.aiohttp.transport import AiohttpTransport # type: ignore +from gremlin_python.driver.protocol import GremlinServerError # type: ignore @final @@ -695,13 +695,13 @@ class GremlinStorage(BaseGraphStorage): except Exception as e: logger.error(f"Error during edge deletion: {str(e)}") raise - + async def drop(self) -> dict[str, str]: """Drop the storage by removing all nodes and relationships in the graph. - + This function deletes all nodes with the specified graph name property, which automatically removes all associated edges. 
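# An illustrative stand-alone version of that Gremlin vertex-drop query
# (endpoint, serializer choice, and property name are assumptions):
from gremlin_python.driver import client, serializer

c = client.Client(
    "ws://localhost:8182/gremlin",
    "g",
    message_serializer=serializer.GraphSONSerializersV3d0(),
)
# Dropping the matching vertices detaches their incident edges as well
c.submit("g.V().has('graph', graph_name).drop()", {"graph_name": "LightRAG"}).all().result()
c.close()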
- + Returns: dict[str, str]: Status of the operation with keys 'status' and 'message' """ diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py index 003a6733..a1d10a62 100644 --- a/lightrag/kg/json_doc_status_impl.py +++ b/lightrag/kg/json_doc_status_impl.py @@ -112,7 +112,7 @@ class JsonDocStatusStorage(DocStatusStorage): """ Importance notes for in-memory storage: 1. Changes will be persisted to disk during the next index_done_callback - 2. update flags to notify other processes that data persistence is needed + 2. update flags to notify other processes that data persistence is needed """ if not data: return @@ -129,14 +129,14 @@ class JsonDocStatusStorage(DocStatusStorage): async def delete(self, doc_ids: list[str]) -> None: """Delete specific records from storage by their IDs - + Importance notes for in-memory storage: 1. Changes will be persisted to disk during the next index_done_callback - 2. update flags to notify other processes that data persistence is needed - + 2. update flags to notify other processes that data persistence is needed + Args: ids (list[str]): List of document IDs to be deleted from storage - + Returns: None """ @@ -147,12 +147,12 @@ class JsonDocStatusStorage(DocStatusStorage): async def drop(self) -> dict[str, str]: """Drop all document status data from storage and clean up resources - + This method will: 1. Clear all document status data from memory 2. Update flags to notify other processes 3. Trigger index_done_callback to save the empty state - + Returns: dict[str, str]: Operation status and message - On success: {"status": "success", "message": "data dropped"} diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 8857aa9a..79a043c8 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -117,7 +117,7 @@ class JsonKVStorage(BaseKVStorage): """ Importance notes for in-memory storage: 1. Changes will be persisted to disk during the next index_done_callback - 2. update flags to notify other processes that data persistence is needed + 2. update flags to notify other processes that data persistence is needed """ if not data: return @@ -128,14 +128,14 @@ class JsonKVStorage(BaseKVStorage): async def delete(self, ids: list[str]) -> None: """Delete specific records from storage by their IDs - + Importance notes for in-memory storage: 1. Changes will be persisted to disk during the next index_done_callback 2. update flags to notify other processes that data persistence is needed - + Args: ids (list[str]): List of document IDs to be deleted from storage - + Returns: None """ @@ -144,39 +144,38 @@ class JsonKVStorage(BaseKVStorage): self._data.pop(doc_id, None) await set_all_update_flags(self.namespace) - async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool: + async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool: """Delete specific records from storage by by cache mode - + Importance notes for in-memory storage: 1. Changes will be persisted to disk during the next index_done_callback 2. 
update flags to notify other processes that data persistence is needed - + Args: ids (list[str]): List of cache mode to be drop from storage - + Returns: True: if the cache drop successfully False: if the cache drop failed """ if not modes: return False - + try: await self.delete(modes) return True except Exception: return False - async def drop(self) -> dict[str, str]: """Drop all data from storage and clean up resources This action will persistent the data to disk immediately. - + This method will: 1. Clear all data from memory 2. Update flags to notify other processes 3. Trigger index_done_callback to save the empty state - + Returns: dict[str, str]: Operation status and message - On success: {"status": "success", "message": "data dropped"} diff --git a/lightrag/kg/milvus_impl.py b/lightrag/kg/milvus_impl.py index 74cf416a..2cff0079 100644 --- a/lightrag/kg/milvus_impl.py +++ b/lightrag/kg/milvus_impl.py @@ -15,7 +15,7 @@ if not pm.is_installed("pymilvus"): pm.install("pymilvus") import configparser -from pymilvus import MilvusClient # type: ignore +from pymilvus import MilvusClient # type: ignore config = configparser.ConfigParser() config.read("config.ini", "utf-8") @@ -287,12 +287,12 @@ class MilvusVectorDBStorage(BaseVectorStorage): except Exception as e: logger.error(f"Error retrieving vector data for IDs {ids}: {e}") return [] - + async def drop(self) -> dict[str, str]: """Drop all vector data from storage and clean up resources - + This method will delete all data from the Milvus collection. - + Returns: dict[str, str]: Operation status and message - On success: {"status": "success", "message": "data dropped"} @@ -302,15 +302,17 @@ class MilvusVectorDBStorage(BaseVectorStorage): # Drop the collection and recreate it if self._client.has_collection(self.namespace): self._client.drop_collection(self.namespace) - + # Recreate the collection MilvusVectorDBStorage.create_collection_if_not_exist( self._client, self.namespace, dimension=self.embedding_func.embedding_dim, ) - - logger.info(f"Process {os.getpid()} drop Milvus collection {self.namespace}") + + logger.info( + f"Process {os.getpid()} drop Milvus collection {self.namespace}" + ) return {"status": "success", "message": "data dropped"} except Exception as e: logger.error(f"Error dropping Milvus collection {self.namespace}: {e}") diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index 24c215be..dd4f7447 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -25,13 +25,13 @@ if not pm.is_installed("pymongo"): if not pm.is_installed("motor"): pm.install("motor") -from motor.motor_asyncio import ( # type: ignore +from motor.motor_asyncio import ( # type: ignore AsyncIOMotorClient, AsyncIOMotorDatabase, AsyncIOMotorCollection, ) -from pymongo.operations import SearchIndexModel # type: ignore -from pymongo.errors import PyMongoError # type: ignore +from pymongo.operations import SearchIndexModel # type: ignore +from pymongo.errors import PyMongoError # type: ignore config = configparser.ConfigParser() config.read("config.ini", "utf-8") @@ -149,34 +149,36 @@ class MongoKVStorage(BaseKVStorage): async def index_done_callback(self) -> None: # Mongo handles persistence automatically pass - + async def delete(self, ids: list[str]) -> None: """Delete documents with specified IDs - + Args: ids: List of document IDs to be deleted """ if not ids: return - + try: result = await self._data.delete_many({"_id": {"$in": ids}}) - logger.info(f"Deleted {result.deleted_count} documents from {self.namespace}") + 
logger.info( + f"Deleted {result.deleted_count} documents from {self.namespace}" + ) except PyMongoError as e: logger.error(f"Error deleting documents from {self.namespace}: {e}") - + async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool: """Delete specific records from storage by cache mode - + Args: modes (list[str]): List of cache modes to be dropped from storage - + Returns: bool: True if successful, False otherwise """ if not modes: return False - + try: # Build regex pattern to match documents with the specified modes pattern = f"^({'|'.join(modes)})_" @@ -189,16 +191,21 @@ class MongoKVStorage(BaseKVStorage): async def drop(self) -> dict[str, str]: """Drop the storage by removing all documents in the collection. - + Returns: dict[str, str]: Status of the operation with keys 'status' and 'message' """ try: result = await self._data.delete_many({}) deleted_count = result.deleted_count - - logger.info(f"Dropped {deleted_count} documents from doc status {self._collection_name}") - return {"status": "success", "message": f"{deleted_count} documents dropped"} + + logger.info( + f"Dropped {deleted_count} documents from doc status {self._collection_name}" + ) + return { + "status": "success", + "message": f"{deleted_count} documents dropped", + } except PyMongoError as e: logger.error(f"Error dropping doc status {self._collection_name}: {e}") return {"status": "error", "message": str(e)} @@ -282,19 +289,24 @@ class MongoDocStatusStorage(DocStatusStorage): async def index_done_callback(self) -> None: # Mongo handles persistence automatically pass - + async def drop(self) -> dict[str, str]: """Drop the storage by removing all documents in the collection. - + Returns: dict[str, str]: Status of the operation with keys 'status' and 'message' """ try: result = await self._data.delete_many({}) deleted_count = result.deleted_count - - logger.info(f"Dropped {deleted_count} documents from doc status {self._collection_name}") - return {"status": "success", "message": f"{deleted_count} documents dropped"} + + logger.info( + f"Dropped {deleted_count} documents from doc status {self._collection_name}" + ) + return { + "status": "success", + "message": f"{deleted_count} documents dropped", + } except PyMongoError as e: logger.error(f"Error dropping doc status {self._collection_name}: {e}") return {"status": "error", "message": str(e)} @@ -911,16 +923,21 @@ class MongoGraphStorage(BaseGraphStorage): async def drop(self) -> dict[str, str]: """Drop the storage by removing all documents in the collection. - + Returns: dict[str, str]: Status of the operation with keys 'status' and 'message' """ try: result = await self.collection.delete_many({}) deleted_count = result.deleted_count - - logger.info(f"Dropped {deleted_count} documents from graph {self._collection_name}") - return {"status": "success", "message": f"{deleted_count} documents dropped"} + + logger.info( + f"Dropped {deleted_count} documents from graph {self._collection_name}" + ) + return { + "status": "success", + "message": f"{deleted_count} documents dropped", + } except PyMongoError as e: logger.error(f"Error dropping graph {self._collection_name}: {e}") return {"status": "error", "message": str(e)} @@ -1211,10 +1228,10 @@ class MongoVectorDBStorage(BaseVectorStorage): except Exception as e: logger.error(f"Error retrieving vector data for IDs {ids}: {e}") return [] - + async def drop(self) -> dict[str, str]: """Drop the storage by removing all documents in the collection and recreating vector index. 
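# Every Mongo drop in this series bottoms out in delete_many({}) on the
# namespaced collection; the stand-alone core (URI and names illustrative):
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient

async def clear_collection(uri: str, db: str, name: str) -> int:
    coll = AsyncIOMotorClient(uri)[db][name]
    result = await coll.delete_many({})  # empty filter matches every document
    return result.deleted_count

# asyncio.run(clear_collection("mongodb://localhost:27017", "LightRAG", "chunks"))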
- + Returns: dict[str, str]: Status of the operation with keys 'status' and 'message' """ @@ -1222,12 +1239,17 @@ class MongoVectorDBStorage(BaseVectorStorage): # Delete all documents result = await self._data.delete_many({}) deleted_count = result.deleted_count - + # Recreate vector index await self.create_vector_index_if_not_exists() - - logger.info(f"Dropped {deleted_count} documents from vector storage {self._collection_name} and recreated vector index") - return {"status": "success", "message": f"{deleted_count} documents dropped and vector index recreated"} + + logger.info( + f"Dropped {deleted_count} documents from vector storage {self._collection_name} and recreated vector index" + ) + return { + "status": "success", + "message": f"{deleted_count} documents dropped and vector index recreated", + } except PyMongoError as e: logger.error(f"Error dropping vector storage {self._collection_name}: {e}") return {"status": "error", "message": str(e)} diff --git a/lightrag/kg/nano_vector_db_impl.py b/lightrag/kg/nano_vector_db_impl.py index 8c00437d..56a52b92 100644 --- a/lightrag/kg/nano_vector_db_impl.py +++ b/lightrag/kg/nano_vector_db_impl.py @@ -309,7 +309,7 @@ class NanoVectorDBStorage(BaseVectorStorage): async def drop(self) -> dict[str, str]: """Drop all vector data from storage and clean up resources - + This method will: 1. Remove the vector database storage file if it exists 2. Reinitialize the vector database client @@ -317,7 +317,7 @@ class NanoVectorDBStorage(BaseVectorStorage): 4. Changes is persisted to disk immediately This method is intended for use in scenarios where all data needs to be removed, - + Returns: dict[str, str]: Operation status and message - On success: {"status": "success", "message": "data dropped"} @@ -339,7 +339,9 @@ class NanoVectorDBStorage(BaseVectorStorage): # Reset own update flag to avoid self-reloading self.storage_updated.value = False - logger.info(f"Process {os.getpid()} drop {self.namespace}(file:{self._client_file_name})") + logger.info( + f"Process {os.getpid()} drop {self.namespace}(file:{self._client_file_name})" + ) return {"status": "success", "message": "data dropped"} except Exception as e: logger.error(f"Error dropping {self.namespace}: {e}") diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 3f2545a7..4ee88da2 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -1028,12 +1028,12 @@ class Neo4JStorage(BaseGraphStorage): self, algorithm: str ) -> tuple[np.ndarray[Any, Any], list[str]]: raise NotImplementedError - + async def drop(self) -> dict[str, str]: """Drop all data from storage and clean up resources - + This method will delete all nodes and relationships in the Neo4j database. 
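# A stand-alone sketch of the same DETACH DELETE, using the async driver
# (URI, credentials, and database name are placeholders):
import asyncio
from neo4j import AsyncGraphDatabase

async def drop_all(uri: str, auth: tuple[str, str], database: str) -> None:
    driver = AsyncGraphDatabase.driver(uri, auth=auth)
    async with driver.session(database=database) as session:
        result = await session.run("MATCH (n) DETACH DELETE n")
        await result.consume()  # force completion before the session closes
    await driver.close()

# asyncio.run(drop_all("bolt://localhost:7687", ("neo4j", "password"), "neo4j"))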
- + Returns: dict[str, str]: Operation status and message - On success: {"status": "success", "message": "data dropped"} @@ -1045,8 +1045,10 @@ class Neo4JStorage(BaseGraphStorage): query = "MATCH (n) DETACH DELETE n" result = await session.run(query) await result.consume() # Ensure result is fully consumed - - logger.info(f"Process {os.getpid()} drop Neo4j database {self._DATABASE}") + + logger.info( + f"Process {os.getpid()} drop Neo4j database {self._DATABASE}" + ) return {"status": "success", "message": "data dropped"} except Exception as e: logger.error(f"Error dropping Neo4j database {self._DATABASE}: {e}") diff --git a/lightrag/kg/networkx_impl.py b/lightrag/kg/networkx_impl.py index 0baa72a3..7a9cb203 100644 --- a/lightrag/kg/networkx_impl.py +++ b/lightrag/kg/networkx_impl.py @@ -457,13 +457,13 @@ class NetworkXStorage(BaseGraphStorage): async def drop(self) -> dict[str, str]: """Drop all graph data from storage and clean up resources - + This method will: 1. Remove the graph storage file if it exists 2. Reset the graph to an empty state 3. Update flags to notify other processes 4. Changes is persisted to disk immediately - + Returns: dict[str, str]: Operation status and message - On success: {"status": "success", "message": "data dropped"} @@ -479,7 +479,9 @@ class NetworkXStorage(BaseGraphStorage): await set_all_update_flags(self.namespace) # Reset own update flag to avoid self-reloading self.storage_updated.value = False - logger.info(f"Process {os.getpid()} drop graph {self.namespace} (file:{self._graphml_xml_file})") + logger.info( + f"Process {os.getpid()} drop graph {self.namespace} (file:{self._graphml_xml_file})" + ) return {"status": "success", "message": "data dropped"} except Exception as e: logger.error(f"Error dropping graph {self.namespace}: {e}") diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index 6d3e2e8d..7ba9f428 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -27,7 +27,7 @@ if not pm.is_installed("oracledb"): pm.install("oracledb") from graspologic import embed -import oracledb # type: ignore +import oracledb # type: ignore class OracleDB: @@ -406,43 +406,45 @@ class OracleKVStorage(BaseKVStorage): if not table_name: logger.error(f"Unknown namespace for deletion: {self.namespace}") return - + ids_list = ",".join([f"'{id}'" for id in ids]) delete_sql = f"DELETE FROM {table_name} WHERE workspace=:workspace AND id IN ({ids_list})" - + await self.db.execute(delete_sql, {"workspace": self.db.workspace}) - logger.info(f"Successfully deleted {len(ids)} records from {self.namespace}") + logger.info( + f"Successfully deleted {len(ids)} records from {self.namespace}" + ) except Exception as e: logger.error(f"Error deleting records from {self.namespace}: {e}") async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool: """Delete specific records from storage by cache mode - + Args: modes (list[str]): List of cache modes to be dropped from storage - + Returns: bool: True if successful, False otherwise """ if not modes: return False - + try: table_name = namespace_to_table_name(self.namespace) if not table_name: return False - + if table_name != "LIGHTRAG_LLM_CACHE": return False - + # 构建Oracle风格的IN查询 modes_list = ", ".join([f"'{mode}'" for mode in modes]) sql = f""" DELETE FROM {table_name} - WHERE workspace = :workspace + WHERE workspace = :workspace AND cache_mode IN ({modes_list}) """ - + logger.info(f"Deleting cache by modes: {modes}") await self.db.execute(sql, {"workspace": self.db.workspace}) return 
True @@ -455,8 +457,11 @@ class OracleKVStorage(BaseKVStorage): try: table_name = namespace_to_table_name(self.namespace) if not table_name: - return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} - + return { + "status": "error", + "message": f"Unknown namespace: {self.namespace}", + } + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( table_name=table_name ) @@ -683,8 +688,11 @@ class OracleVectorDBStorage(BaseVectorStorage): try: table_name = namespace_to_table_name(self.namespace) if not table_name: - return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} - + return { + "status": "error", + "message": f"Unknown namespace: {self.namespace}", + } + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( table_name=table_name ) @@ -1025,12 +1033,16 @@ class OracleGraphStorage(BaseGraphStorage): """Drop the storage""" try: # 使用图形查询删除所有节点和关系 - delete_edges_sql = """DELETE FROM LIGHTRAG_GRAPH_EDGES WHERE workspace=:workspace""" + delete_edges_sql = ( + """DELETE FROM LIGHTRAG_GRAPH_EDGES WHERE workspace=:workspace""" + ) await self.db.execute(delete_edges_sql, {"workspace": self.db.workspace}) - - delete_nodes_sql = """DELETE FROM LIGHTRAG_GRAPH_NODES WHERE workspace=:workspace""" + + delete_nodes_sql = ( + """DELETE FROM LIGHTRAG_GRAPH_NODES WHERE workspace=:workspace""" + ) await self.db.execute(delete_nodes_sql, {"workspace": self.db.workspace}) - + return {"status": "success", "message": "graph data dropped"} except Exception as e: logger.error(f"Error dropping graph: {e}") diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 34268e32..9e4f20bd 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -380,10 +380,10 @@ class PGKVStorage(BaseKVStorage): async def delete(self, ids: list[str]) -> None: """Delete specific records from storage by their IDs - + Args: ids (list[str]): List of document IDs to be deleted from storage - + Returns: None """ @@ -398,40 +398,41 @@ class PGKVStorage(BaseKVStorage): delete_sql = f"DELETE FROM {table_name} WHERE workspace=$1 AND id = ANY($2)" try: - await self.db.execute(delete_sql, {"workspace": self.db.workspace, "ids": ids}) - logger.debug(f"Successfully deleted {len(ids)} records from {self.namespace}") + await self.db.execute( + delete_sql, {"workspace": self.db.workspace, "ids": ids} + ) + logger.debug( + f"Successfully deleted {len(ids)} records from {self.namespace}" + ) except Exception as e: logger.error(f"Error while deleting records from {self.namespace}: {e}") async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool: """Delete specific records from storage by cache mode - + Args: modes (list[str]): List of cache modes to be dropped from storage - + Returns: bool: True if successful, False otherwise """ if not modes: return False - + try: table_name = namespace_to_table_name(self.namespace) if not table_name: return False - + if table_name != "LIGHTRAG_LLM_CACHE": return False - + sql = f""" DELETE FROM {table_name} WHERE workspace = $1 AND mode = ANY($2) """ - params = { - "workspace": self.db.workspace, - "modes": modes - } - + params = {"workspace": self.db.workspace, "modes": modes} + logger.info(f"Deleting cache by modes: {modes}") await self.db.execute(sql, params) return True @@ -444,8 +445,11 @@ class PGKVStorage(BaseKVStorage): try: table_name = namespace_to_table_name(self.namespace) if not table_name: - return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} - + return 
{ + "status": "error", + "message": f"Unknown namespace: {self.namespace}", + } + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( table_name=table_name ) @@ -622,7 +626,9 @@ class PGVectorStorage(BaseVectorStorage): delete_sql = f"DELETE FROM {table_name} WHERE workspace=$1 AND id = ANY($2)" try: - await self.db.execute(delete_sql, {"workspace": self.db.workspace, "ids": ids}) + await self.db.execute( + delete_sql, {"workspace": self.db.workspace, "ids": ids} + ) logger.debug( f"Successfully deleted {len(ids)} vectors from {self.namespace}" ) @@ -759,8 +765,11 @@ class PGVectorStorage(BaseVectorStorage): try: table_name = namespace_to_table_name(self.namespace) if not table_name: - return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} - + return { + "status": "error", + "message": f"Unknown namespace: {self.namespace}", + } + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( table_name=table_name ) @@ -930,8 +939,11 @@ class PGDocStatusStorage(DocStatusStorage): try: table_name = namespace_to_table_name(self.namespace) if not table_name: - return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} - + return { + "status": "error", + "message": f"Unknown namespace: {self.namespace}", + } + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( table_name=table_name ) @@ -1626,7 +1638,7 @@ class PGGraphStorage(BaseGraphStorage): MATCH (n) DETACH DELETE n $$) AS (result agtype)""" - + await self._query(drop_query, readonly=False) return {"status": "success", "message": "graph data dropped"} except Exception as e: @@ -1812,7 +1824,7 @@ SQL_TEMPLATES = { chunk_ids=EXCLUDED.chunk_ids, file_path=EXCLUDED.file_path, update_time = CURRENT_TIMESTAMP - """, + """, "relationships": """ WITH relevant_chunks AS ( SELECT id as chunk_id diff --git a/lightrag/kg/qdrant_impl.py b/lightrag/kg/qdrant_impl.py index 855b98ae..d758ca5c 100644 --- a/lightrag/kg/qdrant_impl.py +++ b/lightrag/kg/qdrant_impl.py @@ -13,11 +13,12 @@ import pipmaster as pm if not pm.is_installed("qdrant-client"): pm.install("qdrant-client") -from qdrant_client import QdrantClient, models # type: ignore +from qdrant_client import QdrantClient, models # type: ignore config = configparser.ConfigParser() config.read("config.ini", "utf-8") + def compute_mdhash_id_for_qdrant( content: str, prefix: str = "", style: str = "simple" ) -> str: @@ -272,7 +273,7 @@ class QdrantVectorDBStorage(BaseVectorStorage): except Exception as e: logger.error(f"Error searching for prefix '{prefix}': {e}") return [] - + async def get_by_id(self, id: str) -> dict[str, Any] | None: """Get vector data by its ID @@ -285,22 +286,22 @@ class QdrantVectorDBStorage(BaseVectorStorage): try: # Convert to Qdrant compatible ID qdrant_id = compute_mdhash_id_for_qdrant(id) - + # Retrieve the point by ID result = self._client.retrieve( collection_name=self.namespace, ids=[qdrant_id], with_payload=True, ) - + if not result: return None - + return result[0].payload except Exception as e: logger.error(f"Error retrieving vector data for ID {id}: {e}") return None - + async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]: """Get multiple vector data by their IDs @@ -312,28 +313,28 @@ class QdrantVectorDBStorage(BaseVectorStorage): """ if not ids: return [] - + try: # Convert to Qdrant compatible IDs qdrant_ids = [compute_mdhash_id_for_qdrant(id) for id in ids] - + # Retrieve the points by IDs results = self._client.retrieve( collection_name=self.namespace, ids=qdrant_ids, 
with_payload=True, ) - + return [point.payload for point in results] except Exception as e: logger.error(f"Error retrieving vector data for IDs {ids}: {e}") return [] - + async def drop(self) -> dict[str, str]: """Drop all vector data from storage and clean up resources - + This method will delete all data from the Qdrant collection. - + Returns: dict[str, str]: Operation status and message - On success: {"status": "success", "message": "data dropped"} @@ -343,17 +344,20 @@ class QdrantVectorDBStorage(BaseVectorStorage): # Delete the collection and recreate it if self._client.collection_exists(self.namespace): self._client.delete_collection(self.namespace) - + # Recreate the collection QdrantVectorDBStorage.create_collection_if_not_exist( self._client, self.namespace, vectors_config=models.VectorParams( - size=self.embedding_func.embedding_dim, distance=models.Distance.COSINE + size=self.embedding_func.embedding_dim, + distance=models.Distance.COSINE, ), ) - - logger.info(f"Process {os.getpid()} drop Qdrant collection {self.namespace}") + + logger.info( + f"Process {os.getpid()} drop Qdrant collection {self.namespace}" + ) return {"status": "success", "message": "data dropped"} except Exception as e: logger.error(f"Error dropping Qdrant collection {self.namespace}: {e}") diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index 964b0ad7..4452d55f 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -8,7 +8,7 @@ if not pm.is_installed("redis"): pm.install("redis") # aioredis is a depricated library, replaced with redis -from redis.asyncio import Redis # type: ignore +from redis.asyncio import Redis # type: ignore from lightrag.utils import logger from lightrag.base import BaseKVStorage import json @@ -83,51 +83,51 @@ class RedisKVStorage(BaseKVStorage): logger.info( f"Deleted {deleted_count} of {len(ids)} entries from {self.namespace}" ) - - async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool: + + async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool: """Delete specific records from storage by by cache mode - + Importance notes for Redis storage: 1. This will immediately delete the specified cache modes from Redis - + Args: modes (list[str]): List of cache mode to be drop from storage - + Returns: True: if the cache drop successfully False: if the cache drop failed """ if not modes: return False - + try: await self.delete(modes) return True except Exception: return False - + async def drop(self) -> dict[str, str]: """Drop the storage by removing all keys under the current namespace. 
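# The drop body below relies on KEYS, which scans the whole keyspace in one
# blocking call; a cursor-based alternative, assuming a reachable local Redis
# (sketch only, not part of the patch):
import asyncio
from redis.asyncio import Redis

async def drop_namespace(redis: Redis, namespace: str) -> int:
    deleted = 0
    # scan_iter walks the keyspace incrementally instead of blocking the server
    async for key in redis.scan_iter(match=f"{namespace}:*", count=500):
        deleted += await redis.delete(key)
    return deleted

# asyncio.run(drop_namespace(Redis.from_url("redis://localhost:6379"), "llm_response_cache"))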
diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py
index 3d06ad0a..e57357de 100644
--- a/lightrag/kg/tidb_impl.py
+++ b/lightrag/kg/tidb_impl.py
@@ -20,7 +20,7 @@ if not pm.is_installed("pymysql"):
 if not pm.is_installed("sqlalchemy"):
     pm.install("sqlalchemy")
 
-from sqlalchemy import create_engine, text  # type: ignore
+from sqlalchemy import create_engine, text  # type: ignore
 
 
 class TiDB:
@@ -290,47 +290,49 @@ class TiDBKVStorage(BaseKVStorage):
         try:
             table_name = namespace_to_table_name(self.namespace)
             id_field = namespace_to_id(self.namespace)
-
+
             if not table_name or not id_field:
                 logger.error(f"Unknown namespace for deletion: {self.namespace}")
                 return
-
+
             ids_list = ",".join([f"'{id}'" for id in ids])
             delete_sql = f"DELETE FROM {table_name} WHERE workspace = :workspace AND {id_field} IN ({ids_list})"
-
+
             await self.db.execute(delete_sql, {"workspace": self.db.workspace})
-            logger.info(f"Successfully deleted {len(ids)} records from {self.namespace}")
+            logger.info(
+                f"Successfully deleted {len(ids)} records from {self.namespace}"
+            )
         except Exception as e:
             logger.error(f"Error deleting records from {self.namespace}: {e}")
 
     async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool:
         """Delete specific records from storage by cache mode
-
+
         Args:
             modes (list[str]): List of cache modes to be dropped from storage
-
+
         Returns:
             bool: True if successful, False otherwise
         """
         if not modes:
             return False
-
+
         try:
             table_name = namespace_to_table_name(self.namespace)
             if not table_name:
                 return False
-
+
             if table_name != "LIGHTRAG_LLM_CACHE":
                 return False
-
+
             # Build a MySQL-style IN clause
             modes_list = ", ".join([f"'{mode}'" for mode in modes])
             sql = f"""
             DELETE FROM {table_name}
-            WHERE workspace = :workspace
+            WHERE workspace = :workspace
             AND mode IN ({modes_list})
             """
-
+
             logger.info(f"Deleting cache by modes: {modes}")
             await self.db.execute(sql, {"workspace": self.db.workspace})
             return True
@@ -343,8 +345,11 @@ class TiDBKVStorage(BaseKVStorage):
         try:
             table_name = namespace_to_table_name(self.namespace)
             if not table_name:
-                return {"status": "error", "message": f"Unknown namespace: {self.namespace}"}
-
+                return {
+                    "status": "error",
+                    "message": f"Unknown namespace: {self.namespace}",
+                }
+
             drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format(
                 table_name=table_name
             )
@@ -492,7 +497,7 @@ class TiDBVectorDBStorage(BaseVectorStorage):
 
         table_name = namespace_to_table_name(self.namespace)
         id_field = namespace_to_id(self.namespace)
-
+
         if not table_name or not id_field:
             logger.error(f"Unknown namespace for vector deletion: {self.namespace}")
             return
@@ -502,7 +507,9 @@ class TiDBVectorDBStorage(BaseVectorStorage):
 
         try:
             await self.db.execute(delete_sql, {"workspace": self.db.workspace})
-            logger.debug(f"Successfully deleted {len(ids)} vectors from {self.namespace}")
+            logger.debug(
+                f"Successfully 
deleted {len(ids)} vectors from {self.namespace}" + ) except Exception as e: logger.error(f"Error while deleting vectors from {self.namespace}: {e}") @@ -551,8 +558,11 @@ class TiDBVectorDBStorage(BaseVectorStorage): try: table_name = namespace_to_table_name(self.namespace) if not table_name: - return {"status": "error", "message": f"Unknown namespace: {self.namespace}"} - + return { + "status": "error", + "message": f"Unknown namespace: {self.namespace}", + } + drop_sql = SQL_TEMPLATES["drop_specifiy_table_workspace"].format( table_name=table_name ) From 0981ee1af9cbbeb72d42d4e830c997304bba5a32 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 23:23:31 +0800 Subject: [PATCH 027/116] Replace delete() with drop_cache_by_modes() method to implement cache clearing operations --- lightrag/lightrag.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 81797385..7077f94d 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1894,12 +1894,18 @@ class LightRAG: try: # Reset the cache storage for specified mode if modes: - await self.llm_response_cache.delete(modes) - logger.info(f"Cleared cache for modes: {modes}") + success = await self.llm_response_cache.drop_cache_by_modes(modes) + if success: + logger.info(f"Cleared cache for modes: {modes}") + else: + logger.warning(f"Failed to clear cache for modes: {modes}") else: # Clear all modes - await self.llm_response_cache.delete(valid_modes) - logger.info("Cleared all cache") + success = await self.llm_response_cache.drop_cache_by_modes(valid_modes) + if success: + logger.info("Cleared all cache") + else: + logger.warning("Failed to clear all cache") await self.llm_response_cache.index_done_callback() From 8845779ed78b0dd38e2b2c4a09804d7481d20a35 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 23:37:03 +0800 Subject: [PATCH 028/116] Add clear cache API endpoint --- lightrag/api/routers/document_routes.py | 65 +++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 144b1274..11a0fedb 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -66,6 +66,18 @@ class ClearDocumentsResponse(BaseModel): message: str = Field(description="Message describing the operation result") +class ClearCacheRequest(BaseModel): + modes: Optional[List[str]] = Field( + default=None, + description="Modes of cache to clear. Options: ['default', 'naive', 'local', 'global', 'hybrid', 'mix']. If None, clears all cache.", + ) + + +class ClearCacheResponse(BaseModel): + status: str = Field(description="Status of the clear operation: success/fail") + message: str = Field(description="Message describing the operation result") + + class DocStatusResponse(BaseModel): @staticmethod def format_datetime(dt: Any) -> Optional[str]: @@ -1062,4 +1074,57 @@ def create_document_routes( logger.error(traceback.format_exc()) raise HTTPException(status_code=500, detail=str(e)) + @router.post( + "/cache_clear", + response_model=ClearCacheResponse, + dependencies=[Depends(combined_auth)], + ) + async def clear_cache(request: ClearCacheRequest): + """ + Clear cache data from the LLM response cache storage. + + This endpoint allows clearing specific modes of cache or all cache if no modes are specified. + Valid modes include: "default", "naive", "local", "global", "hybrid", "mix". + - "default" represents extraction cache. 
+ - Other modes correspond to different query modes. + + Args: + request (ClearCacheRequest): The request body containing optional modes to clear. + + Returns: + ClearCacheResponse: A response object containing the status and message. + + Raises: + HTTPException: If an error occurs during cache clearing (400 for invalid modes, 500 for other errors). + """ + try: + # Validate modes if provided + valid_modes = ["default", "naive", "local", "global", "hybrid", "mix"] + if request.modes and not all(mode in valid_modes for mode in request.modes): + invalid_modes = [ + mode for mode in request.modes if mode not in valid_modes + ] + raise HTTPException( + status_code=400, + detail=f"Invalid mode(s): {invalid_modes}. Valid modes are: {valid_modes}", + ) + + # Call the aclear_cache method + await rag.aclear_cache(request.modes) + + # Prepare success message + if request.modes: + message = f"Successfully cleared cache for modes: {request.modes}" + else: + message = "Successfully cleared all cache" + + return ClearCacheResponse(status="success", message=message) + except HTTPException: + # Re-raise HTTP exceptions + raise + except Exception as e: + logger.error(f"Error clearing cache: {str(e)}") + logger.error(traceback.format_exc()) + raise HTTPException(status_code=500, detail=str(e)) + return router From d54bda8d3646f4af6cf6a0b324c4d6ca8c81a038 Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 31 Mar 2025 23:53:14 +0800 Subject: [PATCH 029/116] feat(api): Add Pydantic models for all endpoints in document_routes.py --- lightrag/api/routers/document_routes.py | 258 ++++++++++++++++++++---- 1 file changed, 222 insertions(+), 36 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 11a0fedb..ab172a2c 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -10,7 +10,7 @@ import traceback import pipmaster as pm from datetime import datetime from pathlib import Path -from typing import Dict, List, Optional, Any +from typing import Dict, List, Optional, Any, Literal from fastapi import APIRouter, BackgroundTasks, Depends, File, HTTPException, UploadFile from pydantic import BaseModel, Field, field_validator @@ -30,7 +30,37 @@ router = APIRouter( temp_prefix = "__tmp__" +class ScanResponse(BaseModel): + """Response model for document scanning operation + + Attributes: + status: Status of the scanning operation + message: Optional message with additional details + """ + + status: Literal["scanning_started"] = Field( + description="Status of the scanning operation" + ) + message: Optional[str] = Field( + default=None, description="Additional details about the scanning operation" + ) + + class Config: + json_schema_extra = { + "example": { + "status": "scanning_started", + "message": "Scanning process has been initiated in the background", + } + } + + class InsertTextRequest(BaseModel): + """Request model for inserting a single text document + + Attributes: + text: The text content to be inserted into the RAG system + """ + text: str = Field( min_length=1, description="The text to insert", @@ -41,8 +71,21 @@ class InsertTextRequest(BaseModel): def strip_after(cls, text: str) -> str: return text.strip() + class Config: + json_schema_extra = { + "example": { + "text": "This is a sample text to be inserted into the RAG system." 
+ } + } + class InsertTextsRequest(BaseModel): + """Request model for inserting multiple text documents + + Attributes: + texts: List of text contents to be inserted into the RAG system + """ + texts: list[str] = Field( min_length=1, description="The texts to insert", @@ -53,30 +96,116 @@ class InsertTextsRequest(BaseModel): def strip_after(cls, texts: list[str]) -> list[str]: return [text.strip() for text in texts] + class Config: + json_schema_extra = { + "example": { + "texts": [ + "This is the first text to be inserted.", + "This is the second text to be inserted.", + ] + } + } + class InsertResponse(BaseModel): - status: str = Field(description="Status of the operation") + """Response model for document insertion operations + + Attributes: + status: Status of the operation (success, duplicated, partial_success, failure) + message: Detailed message describing the operation result + """ + + status: Literal["success", "duplicated", "partial_success", "failure"] = Field( + description="Status of the operation" + ) message: str = Field(description="Message describing the operation result") + class Config: + json_schema_extra = { + "example": { + "status": "success", + "message": "File 'document.pdf' uploaded successfully. Processing will continue in background.", + } + } + class ClearDocumentsResponse(BaseModel): - status: str = Field( - description="Status of the clear operation: success/partial_success/busy/fail" + """Response model for document clearing operation + + Attributes: + status: Status of the clear operation + message: Detailed message describing the operation result + """ + + status: Literal["success", "partial_success", "busy", "fail"] = Field( + description="Status of the clear operation" ) message: str = Field(description="Message describing the operation result") + class Config: + json_schema_extra = { + "example": { + "status": "success", + "message": "All documents cleared successfully. Deleted 15 files.", + } + } + class ClearCacheRequest(BaseModel): - modes: Optional[List[str]] = Field( + """Request model for clearing cache + + Attributes: + modes: Optional list of cache modes to clear + """ + + modes: Optional[ + List[Literal["default", "naive", "local", "global", "hybrid", "mix"]] + ] = Field( default=None, - description="Modes of cache to clear. Options: ['default', 'naive', 'local', 'global', 'hybrid', 'mix']. If None, clears all cache.", + description="Modes of cache to clear. 
If None, clears all cache.", ) + class Config: + json_schema_extra = {"example": {"modes": ["default", "naive"]}} + class ClearCacheResponse(BaseModel): - status: str = Field(description="Status of the clear operation: success/fail") + """Response model for cache clearing operation + + Attributes: + status: Status of the clear operation + message: Detailed message describing the operation result + """ + + status: Literal["success", "fail"] = Field( + description="Status of the clear operation" + ) message: str = Field(description="Message describing the operation result") + class Config: + json_schema_extra = { + "example": { + "status": "success", + "message": "Successfully cleared cache for modes: ['default', 'naive']", + } + } + + +"""Response model for document status + +Attributes: + id: Document identifier + content_summary: Summary of document content + content_length: Length of document content + status: Current processing status + created_at: Creation timestamp (ISO format string) + updated_at: Last update timestamp (ISO format string) + chunks_count: Number of chunks (optional) + error: Error message if any (optional) + metadata: Additional metadata (optional) + file_path: Path to the document file +""" + class DocStatusResponse(BaseModel): @staticmethod @@ -87,34 +216,82 @@ class DocStatusResponse(BaseModel): return dt return dt.isoformat() - """Response model for document status + id: str = Field(description="Document identifier") + content_summary: str = Field(description="Summary of document content") + content_length: int = Field(description="Length of document content in characters") + status: DocStatus = Field(description="Current processing status") + created_at: str = Field(description="Creation timestamp (ISO format string)") + updated_at: str = Field(description="Last update timestamp (ISO format string)") + chunks_count: Optional[int] = Field( + default=None, description="Number of chunks the document was split into" + ) + error: Optional[str] = Field( + default=None, description="Error message if processing failed" + ) + metadata: Optional[dict[str, Any]] = Field( + default=None, description="Additional metadata about the document" + ) + file_path: str = Field(description="Path to the document file") - Attributes: - id: Document identifier - content_summary: Summary of document content - content_length: Length of document content - status: Current processing status - created_at: Creation timestamp (ISO format string) - updated_at: Last update timestamp (ISO format string) - chunks_count: Number of chunks (optional) - error: Error message if any (optional) - metadata: Additional metadata (optional) - """ - - id: str - content_summary: str - content_length: int - status: DocStatus - created_at: str - updated_at: str - chunks_count: Optional[int] = None - error: Optional[str] = None - metadata: Optional[dict[str, Any]] = None - file_path: str + class Config: + json_schema_extra = { + "example": { + "id": "doc_123456", + "content_summary": "Research paper on machine learning", + "content_length": 15240, + "status": "PROCESSED", + "created_at": "2025-03-31T12:34:56", + "updated_at": "2025-03-31T12:35:30", + "chunks_count": 12, + "error": None, + "metadata": {"author": "John Doe", "year": 2025}, + "file_path": "research_paper.pdf", + } + } class DocsStatusesResponse(BaseModel): - statuses: Dict[DocStatus, List[DocStatusResponse]] = {} + """Response model for document statuses + + Attributes: + statuses: Dictionary mapping document status to lists of document status 
responses + """ + + statuses: Dict[DocStatus, List[DocStatusResponse]] = Field( + default_factory=dict, + description="Dictionary mapping document status to lists of document status responses", + ) + + class Config: + json_schema_extra = { + "example": { + "statuses": { + "PENDING": [ + { + "id": "doc_123", + "content_summary": "Pending document", + "content_length": 5000, + "status": "PENDING", + "created_at": "2025-03-31T10:00:00", + "updated_at": "2025-03-31T10:00:00", + "file_path": "pending_doc.pdf", + } + ], + "PROCESSED": [ + { + "id": "doc_456", + "content_summary": "Processed document", + "content_length": 8000, + "status": "PROCESSED", + "created_at": "2025-03-31T09:00:00", + "updated_at": "2025-03-31T09:05:00", + "chunks_count": 8, + "file_path": "processed_doc.pdf", + } + ], + } + } + } class PipelineStatusResponse(BaseModel): @@ -529,7 +706,9 @@ def create_document_routes( # Create combined auth dependency for document routes combined_auth = get_combined_auth_dependency(api_key) - @router.post("/scan", dependencies=[Depends(combined_auth)]) + @router.post( + "/scan", response_model=ScanResponse, dependencies=[Depends(combined_auth)] + ) async def scan_for_new_documents(background_tasks: BackgroundTasks): """ Trigger the scanning process for new documents. @@ -539,13 +718,18 @@ def create_document_routes( that fact. Returns: - dict: A dictionary containing the scanning status + ScanResponse: A response object containing the scanning status """ # Start the scanning process in the background background_tasks.add_task(run_scanning_process, rag, doc_manager) - return {"status": "scanning_started"} + return ScanResponse( + status="scanning_started", + message="Scanning process has been initiated in the background", + ) - @router.post("/upload", dependencies=[Depends(combined_auth)]) + @router.post( + "/upload", response_model=InsertResponse, dependencies=[Depends(combined_auth)] + ) async def upload_to_input_dir( background_tasks: BackgroundTasks, file: UploadFile = File(...) ): @@ -1016,7 +1200,9 @@ def create_document_routes( logger.error(traceback.format_exc()) raise HTTPException(status_code=500, detail=str(e)) - @router.get("", dependencies=[Depends(combined_auth)]) + @router.get( + "", response_model=DocsStatusesResponse, dependencies=[Depends(combined_auth)] + ) async def documents() -> DocsStatusesResponse: """ Get the status of all documents in the system. 
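Taken together, patches 028 and 029 give the cache-clearing flow a fully typed request/response surface. A hedged client-side sketch of the endpoint follows; it assumes the API server listens on LightRAG's default port 9621 with no API key configured (both assumptions), and uses the /documents/clear_cache path that the next patch settles on:

    import requests  # any HTTP client works; requests is used purely for illustration

    BASE_URL = "http://localhost:9621"  # assumed server address

    # Clear only the extraction cache ("default") and the naive-query cache;
    # omit "modes" entirely (or send null) to clear every cache mode.
    resp = requests.post(
        f"{BASE_URL}/documents/clear_cache",
        json={"modes": ["default", "naive"]},
        timeout=30,
    )
    resp.raise_for_status()  # invalid modes are rejected with HTTP 400
    print(resp.json())
    # e.g. {"status": "success", "message": "Successfully cleared cache for modes: ['default', 'naive']"}

Because ClearCacheRequest now constrains modes with Literal values, an invalid mode fails Pydantic validation at the edge rather than deep inside aclear_cache().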
From cd94e842670e807b71896519105528e6df03b4e3 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 1 Apr 2025 10:36:28 +0800
Subject: [PATCH 030/116] Update clear cache endpoint path

---
 lightrag/api/routers/document_routes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index ab172a2c..1b8ab345 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1261,7 +1261,7 @@ def create_document_routes(
         raise HTTPException(status_code=500, detail=str(e))
 
     @router.post(
-        "/cache_clear",
+        "/clear_cache",
         response_model=ClearCacheResponse,
         dependencies=[Depends(combined_auth)],
     )

From 4c9c60047270e1abdad290bab25718854306b41c Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 1 Apr 2025 13:31:14 +0800
Subject: [PATCH 031/116] Refactoring clear document UI

---
 lightrag_webui/src/api/lightrag.ts            |   8 ++
 .../documents/ClearDocumentsDialog.tsx        | 121 ++++++++++++++++--
 lightrag_webui/src/locales/ar.json            |   9 ++
 lightrag_webui/src/locales/en.json            |   9 ++
 lightrag_webui/src/locales/fr.json            |   9 ++
 lightrag_webui/src/locales/zh.json            |   9 ++
 6 files changed, 155 insertions(+), 10 deletions(-)

diff --git a/lightrag_webui/src/api/lightrag.ts b/lightrag_webui/src/api/lightrag.ts
index 364ecb44..3dbf52e9 100644
--- a/lightrag_webui/src/api/lightrag.ts
+++ b/lightrag_webui/src/api/lightrag.ts
@@ -382,6 +382,14 @@ export const clearDocuments = async (): Promise => {
   return response.data
 }
 
+export const clearCache = async (modes?: string[]): Promise<{
+  status: 'success' | 'fail'
+  message: string
+}> => {
+  const response = await axiosInstance.post('/documents/clear_cache', { modes })
+  return response.data
+}
+
 export const getAuthStatus = async (): Promise => {
   try {
     // Add a timeout to the request to prevent hanging
diff --git a/lightrag_webui/src/components/documents/ClearDocumentsDialog.tsx b/lightrag_webui/src/components/documents/ClearDocumentsDialog.tsx
index cc11ac5d..878ba030 100644
--- a/lightrag_webui/src/components/documents/ClearDocumentsDialog.tsx
+++ b/lightrag_webui/src/components/documents/ClearDocumentsDialog.tsx
@@ -1,4 +1,4 @@
-import { useState, useCallback } from 'react'
+import { useState, useCallback, useEffect } from 'react'
 import Button from '@/components/ui/Button'
 import {
   Dialog,
@@ -6,24 +6,79 @@ import {
   DialogDescription,
   DialogHeader,
   DialogTitle,
-  DialogTrigger
+  DialogTrigger,
+  DialogFooter
 } from '@/components/ui/Dialog'
+import Input from '@/components/ui/Input'
+import Checkbox from '@/components/ui/Checkbox'
 import { toast } from 'sonner'
 import { errorMessage } from '@/lib/utils'
-import { clearDocuments } from '@/api/lightrag'
+import { clearDocuments, clearCache, getDocuments } from '@/api/lightrag'
+import { useBackendState } from '@/stores/state'
 
-import { EraserIcon } from 'lucide-react'
+import { EraserIcon, AlertTriangleIcon } from 'lucide-react'
 import { useTranslation } from 'react-i18next'
 
+// Simple Label component
+const Label = ({
+  htmlFor,
+  className,
+  children,
+  ...props
+}: React.LabelHTMLAttributes<HTMLLabelElement>) => (
+  <label htmlFor={htmlFor} className={className} {...props}>
+    {children}
+  </label>
+)
+
 export default function ClearDocumentsDialog() {
   const { t } = useTranslation()
   const [open, setOpen] = useState(false)
+  const [confirmText, setConfirmText] = useState('')
+  const [clearCacheOption, setClearCacheOption] = useState(false)
+  const isConfirmEnabled = confirmText.toLowerCase() === 'yes'
+  const check = useBackendState.use.check()
+
+  // Reset state when the dialog closes
+  useEffect(() => {
+    if (!open) {
+      setConfirmText('')
+      setClearCacheOption(false)
+    }
+  }, [open])
 
   const handleClear = useCallback(async () => {
+    if (!isConfirmEnabled) return
+
     try {
+      // Clear all documents
       const result = await clearDocuments()
+
+      // If the cache option is selected, clear the cache as well
+      if (clearCacheOption) {
+        try {
+          await clearCache()
+          toast.success(t('documentPanel.clearDocuments.cacheCleared'))
+        } catch (cacheErr) {
+          toast.error(t('documentPanel.clearDocuments.cacheClearFailed', { error: errorMessage(cacheErr) }))
+        }
+      }
+
       if (result.status === 'success') {
         toast.success(t('documentPanel.clearDocuments.success'))
+
+        // Refresh the document list and backend state
+        try {
+          await getDocuments()
+          check()
+        } catch (refreshErr) {
+          console.error('Error refreshing documents:', refreshErr)
+        }
+
         setOpen(false)
       } else {
         toast.error(t('documentPanel.clearDocuments.failed', { message: result.message }))
@@ -31,7 +86,7 @@ export default function ClearDocumentsDialog() {
     } catch (err) {
       toast.error(t('documentPanel.clearDocuments.error', { error: errorMessage(err) }))
     }
-  }, [setOpen, t])
+  }, [isConfirmEnabled, clearCacheOption, setOpen, t, check])
 
   return (
     <Dialog open={open} onOpenChange={setOpen}>
@@ -42,12 +97,58 @@ export default function ClearDocumentsDialog() {
       </DialogTrigger>
       <DialogContent onCloseAutoFocus={(e) => e.preventDefault()}>
-        <DialogTitle>{t('documentPanel.clearDocuments.title')}</DialogTitle>
-        <DialogDescription>{t('documentPanel.clearDocuments.confirm')}</DialogDescription>
+        <DialogHeader>
+          <DialogTitle>
+            <AlertTriangleIcon />
+            {t('documentPanel.clearDocuments.title')}
+          </DialogTitle>
+          <DialogDescription>
+            <div>
+              {t('documentPanel.clearDocuments.warning')}
+            </div>
+            <div>
+              {t('documentPanel.clearDocuments.confirm')}
+            </div>
+          </DialogDescription>
+        </DialogHeader>
-        <Button variant="outline" onClick={handleClear}>
+        <div>
+          <Label htmlFor="confirm-input">{t('documentPanel.clearDocuments.confirmPrompt')}</Label>
+          <Input
+            id="confirm-input"
+            value={confirmText}
+            onChange={(e: React.ChangeEvent<HTMLInputElement>) => setConfirmText(e.target.value)}
+            placeholder={t('documentPanel.clearDocuments.confirmPlaceholder')}
+            className="w-full"
+          />
+        </div>
+        <div>
+          <Checkbox
+            id="clear-cache-option"
+            checked={clearCacheOption}
+            onCheckedChange={(checked) => setClearCacheOption(checked === true)}
+          />
+          <Label htmlFor="clear-cache-option">{t('documentPanel.clearDocuments.clearCache')}</Label>
+        </div>
+        <DialogFooter>
+          <Button variant="outline" onClick={() => setOpen(false)}>
+            {t('common.cancel')}
+          </Button>
+          <Button
+            variant="destructive"
+            onClick={handleClear}
+            disabled={!isConfirmEnabled}
+          >
+            {t('documentPanel.clearDocuments.confirmButton')}
+          </Button>
+        </DialogFooter>
+      </DialogContent>
+    </Dialog>
) diff --git a/lightrag_webui/src/locales/ar.json b/lightrag_webui/src/locales/ar.json index d7cff8e3..83f78698 100644 --- a/lightrag_webui/src/locales/ar.json +++ b/lightrag_webui/src/locales/ar.json @@ -32,14 +32,23 @@ "authDisabled": "تم تعطيل المصادقة. استخدام وضع بدون تسجيل دخول.", "guestMode": "وضع بدون تسجيل دخول" }, + "common": { + "cancel": "إلغاء" + }, "documentPanel": { "clearDocuments": { "button": "مسح", "tooltip": "مسح المستندات", "title": "مسح المستندات", + "warning": "تحذير: سيؤدي هذا الإجراء إلى حذف جميع المستندات بشكل دائم ولا يمكن التراجع عنه!", "confirm": "هل تريد حقًا مسح جميع المستندات؟", + "confirmPrompt": "اكتب 'yes' لتأكيد هذا الإجراء", + "confirmPlaceholder": "اكتب yes للتأكيد", + "clearCache": "مسح ذاكرة التخزين المؤقت أيضًا", "confirmButton": "نعم", "success": "تم مسح المستندات بنجاح", + "cacheCleared": "تم مسح ذاكرة التخزين المؤقت بنجاح", + "cacheClearFailed": "فشل مسح ذاكرة التخزين المؤقت:\n{{error}}", "failed": "فشل مسح المستندات:\n{{message}}", "error": "فشل مسح المستندات:\n{{error}}" }, diff --git a/lightrag_webui/src/locales/en.json b/lightrag_webui/src/locales/en.json index d7c68c73..a5a1c267 100644 --- a/lightrag_webui/src/locales/en.json +++ b/lightrag_webui/src/locales/en.json @@ -32,14 +32,23 @@ "authDisabled": "Authentication is disabled. Using login free mode.", "guestMode": "Login Free" }, + "common": { + "cancel": "Cancel" + }, "documentPanel": { "clearDocuments": { "button": "Clear", "tooltip": "Clear documents", "title": "Clear Documents", + "warning": "WARNING: This action will permanently delete all documents and cannot be undone!", "confirm": "Do you really want to clear all documents?", + "confirmPrompt": "Type 'yes' to confirm this action", + "confirmPlaceholder": "Type yes to confirm", + "clearCache": "Also clear cache", "confirmButton": "YES", "success": "Documents cleared successfully", + "cacheCleared": "Cache cleared successfully", + "cacheClearFailed": "Failed to clear cache:\n{{error}}", "failed": "Clear Documents Failed:\n{{message}}", "error": "Clear Documents Failed:\n{{error}}" }, diff --git a/lightrag_webui/src/locales/fr.json b/lightrag_webui/src/locales/fr.json index f40d43f6..ebe5556b 100644 --- a/lightrag_webui/src/locales/fr.json +++ b/lightrag_webui/src/locales/fr.json @@ -32,14 +32,23 @@ "authDisabled": "L'authentification est désactivée. 
Utilisation du mode sans connexion.",
     "guestMode": "Mode sans connexion"
   },
+  "common": {
+    "cancel": "Annuler"
+  },
   "documentPanel": {
     "clearDocuments": {
       "button": "Effacer",
       "tooltip": "Effacer les documents",
       "title": "Effacer les documents",
+      "warning": "ATTENTION : Cette action supprimera définitivement tous les documents et ne peut pas être annulée !",
       "confirm": "Voulez-vous vraiment effacer tous les documents ?",
+      "confirmPrompt": "Tapez 'yes' pour confirmer cette action",
+      "confirmPlaceholder": "Tapez yes pour confirmer",
+      "clearCache": "Effacer également le cache",
       "confirmButton": "OUI",
       "success": "Documents effacés avec succès",
+      "cacheCleared": "Cache effacé avec succès",
+      "cacheClearFailed": "Échec de l'effacement du cache :\n{{error}}",
       "failed": "Échec de l'effacement des documents :\n{{message}}",
       "error": "Échec de l'effacement des documents :\n{{error}}"
     },
diff --git a/lightrag_webui/src/locales/zh.json b/lightrag_webui/src/locales/zh.json
index bd1a1841..a2152ae1 100644
--- a/lightrag_webui/src/locales/zh.json
+++ b/lightrag_webui/src/locales/zh.json
@@ -32,14 +32,23 @@
     "authDisabled": "认证已禁用,使用无需登陆模式。",
     "guestMode": "无需登陆"
   },
+  "common": {
+    "cancel": "取消"
+  },
   "documentPanel": {
     "clearDocuments": {
       "button": "清空",
       "tooltip": "清空文档",
       "title": "清空文档",
+      "warning": "警告:此操作将永久删除所有文档,无法恢复!",
       "confirm": "确定要清空所有文档吗?",
+      "confirmPrompt": "请输入 yes 确认操作",
+      "confirmPlaceholder": "输入 yes 确认",
+      "clearCache": "同时清空缓存",
       "confirmButton": "确定",
       "success": "文档清空成功",
+      "cacheCleared": "缓存清空成功",
+      "cacheClearFailed": "清空缓存失败:\n{{error}}",
       "failed": "清空文档失败:\n{{message}}",
       "error": "清空文档失败:\n{{error}}"
     },

From e83f6c0c631dc3eb926642ca78416ad66863769a Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 1 Apr 2025 13:32:05 +0800
Subject: [PATCH 032/116] Fix linting

---
 .../documents/ClearDocumentsDialog.tsx        | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/lightrag_webui/src/components/documents/ClearDocumentsDialog.tsx b/lightrag_webui/src/components/documents/ClearDocumentsDialog.tsx
index 878ba030..2f27867a 100644
--- a/lightrag_webui/src/components/documents/ClearDocumentsDialog.tsx
+++ b/lightrag_webui/src/components/documents/ClearDocumentsDialog.tsx
@@ -20,15 +20,15 @@ import { EraserIcon, AlertTriangleIcon } from 'lucide-react'
 import { useTranslation } from 'react-i18next'
 
 // Simple Label component
-const Label = ({
-  htmlFor,
-  className,
-  children,
-  ...props
+const Label = ({
+  htmlFor,
+  className,
+  children,
+  ...props
 }: React.LabelHTMLAttributes<HTMLLabelElement>) => (
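Across this series, every storage backend converges on the same two cleanup hooks: drop() returns a {"status", "message"} dict, and drop_cache_by_modes() returns a bool. A minimal sketch of that contract against a plain in-memory dict (DictKVStore and _data are illustrative names, not part of LightRAG):

    from lightrag.utils import logger  # same logger the storage modules above use

    class DictKVStore:
        """Toy KV store showing the drop()/drop_cache_by_modes() contract."""

        def __init__(self, namespace: str):
            self.namespace = namespace
            self._data: dict[str, dict] = {}

        async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool:
            # Mirrors the Redis/TiDB implementations: no modes means nothing to do
            if not modes:
                return False
            try:
                for mode in modes:
                    self._data.pop(mode, None)
                return True
            except Exception:
                return False

        async def drop(self) -> dict[str, str]:
            try:
                # Reset the store to its initial, empty state
                self._data.clear()
                return {"status": "success", "message": "data dropped"}
            except Exception as e:
                logger.error(f"Error dropping {self.namespace}: {e}")
                return {"status": "error", "message": str(e)}

This is the shape aclear_cache() relies on in patch 027: it calls drop_cache_by_modes() on the LLM response cache and only logs success when the storage reports True.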