From fd9f71e0eee26189f19448d04678ff5dc0254524 Mon Sep 17 00:00:00 2001
From: zrguo
Date: Tue, 4 Mar 2025 13:22:33 +0800
Subject: [PATCH] fix delete_by_doc_id

---
Review notes (free text placed here, after the three-dash separator and
before the diff, is not part of the commit message):

* This patch text had been flattened onto two physical lines. The line
  breaks below were reconstructed from the hunk headers' line counts and
  the diffstat; the indentation was reconstructed from Python syntax.
  Confirm the context-line indentation against the target tree before
  applying.
* get_all() is added to JsonKVStorage and TiDBKVStorage with identical
  bodies: both enter self._storage_lock and return dict(self._data), i.e.
  a new top-level dict built from the stored mapping.
* NOTE(review): nothing in this patch shows TiDBKVStorage defining
  _storage_lock or _data (its visible context only touches self.db).
  Confirm those attributes exist on that class; otherwise get_all() will
  raise AttributeError there.
* The deletion code in lightrag.py now selects chunks by comparing each
  stored chunk's "full_doc_id" with doc_id, instead of deriving a single
  chunk id via doc_id.replace("doc", "chunk"). Entries that are not dicts
  are skipped by the isinstance() guard.
* text_chunks.get_all() is called once before deletion and once more in
  the verification step; each call returns every stored chunk, which is
  then filtered in Python.
* When no chunk matches, the code now logs a warning before returning.
* NOTE(review): only two storage classes gain get_all() here; whether the
  other text_chunks backends provide it is not visible in this patch.

 lightrag/kg/json_kv_impl.py |  9 +++++++++
 lightrag/kg/tidb_impl.py    |  8 ++++++++
 lightrag/lightrag.py        | 33 +++++++++++++++++++++++++--------
 3 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py
index 8d707899..c0b61a63 100644
--- a/lightrag/kg/json_kv_impl.py
+++ b/lightrag/kg/json_kv_impl.py
@@ -44,6 +44,15 @@ class JsonKVStorage(BaseKVStorage):
             )
             write_json(data_dict, self._file_name)
 
+    async def get_all(self) -> dict[str, Any]:
+        """Get all data from storage
+
+        Returns:
+            Dictionary containing all stored data
+        """
+        async with self._storage_lock:
+            return dict(self._data)
+
     async def get_by_id(self, id: str) -> dict[str, Any] | None:
         async with self._storage_lock:
             return self._data.get(id)
diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py
index 4adb0141..51d1c365 100644
--- a/lightrag/kg/tidb_impl.py
+++ b/lightrag/kg/tidb_impl.py
@@ -174,6 +174,14 @@ class TiDBKVStorage(BaseKVStorage):
             self.db = None
 
     ################ QUERY METHODS ################
+    async def get_all(self) -> dict[str, Any]:
+        """Get all data from storage
+
+        Returns:
+            Dictionary containing all stored data
+        """
+        async with self._storage_lock:
+            return dict(self._data)
 
     async def get_by_id(self, id: str) -> dict[str, Any] | None:
         """Fetch doc_full data by id."""
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index a5d3c94b..b2e9845e 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -1431,14 +1431,22 @@ class LightRAG:
 
             logger.debug(f"Starting deletion for document {doc_id}")
 
-            doc_to_chunk_id = doc_id.replace("doc", "chunk")
+            # 2. Get all chunks related to this document
+            # Find all chunks where full_doc_id equals the current doc_id
+            all_chunks = await self.text_chunks.get_all()
+            related_chunks = {
+                chunk_id: chunk_data
+                for chunk_id, chunk_data in all_chunks.items()
+                if isinstance(chunk_data, dict)
+                and chunk_data.get("full_doc_id") == doc_id
+            }
 
-            # 2. Get all related chunks
-            chunks = await self.text_chunks.get_by_id(doc_to_chunk_id)
-            if not chunks:
+            if not related_chunks:
+                logger.warning(f"No chunks found for document {doc_id}")
                 return
 
-            chunk_ids = {chunks["full_doc_id"].replace("doc", "chunk")}
+            # Get all related chunk IDs
+            chunk_ids = set(related_chunks.keys())
             logger.debug(f"Found {len(chunk_ids)} chunks to delete")
 
             # 3. Before deleting, check the related entities and relationships for these chunks
@@ -1626,9 +1634,18 @@ class LightRAG:
                     logger.warning(f"Document {doc_id} still exists in full_docs")
 
                 # Verify if chunks have been deleted
-                remaining_chunks = await self.text_chunks.get_by_id(doc_to_chunk_id)
-                if remaining_chunks:
-                    logger.warning(f"Found {len(remaining_chunks)} remaining chunks")
+                all_remaining_chunks = await self.text_chunks.get_all()
+                remaining_related_chunks = {
+                    chunk_id: chunk_data
+                    for chunk_id, chunk_data in all_remaining_chunks.items()
+                    if isinstance(chunk_data, dict)
+                    and chunk_data.get("full_doc_id") == doc_id
+                }
+
+                if remaining_related_chunks:
+                    logger.warning(
+                        f"Found {len(remaining_related_chunks)} remaining chunks"
+                    )
 
                 # Verify entities and relationships
                 for chunk_id in chunk_ids: