fix delete_by_doc_id

This commit is contained in:
zrguo
2025-03-04 13:22:33 +08:00
parent 0f430ca1a7
commit fd9f71e0ee
3 changed files with 42 additions and 8 deletions

View File

@@ -44,6 +44,15 @@ class JsonKVStorage(BaseKVStorage):
)
write_json(data_dict, self._file_name)
async def get_all(self) -> dict[str, Any]:
    """Return a snapshot of everything currently held in storage.

    The copy is taken while holding ``self._storage_lock``; the lock is
    released before the result is handed back to the caller.

    Returns:
        A new top-level dictionary built from ``self._data``. The copy is
        shallow: the values are the same objects the storage holds.
    """
    async with self._storage_lock:
        snapshot = dict(self._data)
    return snapshot
async def get_by_id(self, id: str) -> dict[str, Any] | None:
async with self._storage_lock:
return self._data.get(id)

View File

@@ -174,6 +174,14 @@ class TiDBKVStorage(BaseKVStorage):
self.db = None
################ QUERY METHODS ################
async def get_all(self) -> dict[str, Any]:
    """Return a snapshot of everything currently held in storage.

    Returns:
        A new top-level dictionary built from ``self._data`` (shallow copy),
        taken while holding ``self._storage_lock``.
    """
    # NOTE(review): this reads ``self._storage_lock`` and ``self._data``, but
    # the only state visible for this class here is ``self.db``. Presumably
    # this body was carried over from the JSON-backed storage -- confirm that
    # both attributes exist on this backend, otherwise this will raise
    # AttributeError at call time and should query the database instead.
    async with self._storage_lock:
        snapshot = dict(self._data)
    return snapshot
async def get_by_id(self, id: str) -> dict[str, Any] | None:
"""Fetch doc_full data by id."""

View File

@@ -1431,14 +1431,22 @@ class LightRAG:
logger.debug(f"Starting deletion for document {doc_id}")
doc_to_chunk_id = doc_id.replace("doc", "chunk")
# 2. Get all chunks related to this document
# Find all chunks where full_doc_id equals the current doc_id
all_chunks = await self.text_chunks.get_all()
related_chunks = {
chunk_id: chunk_data
for chunk_id, chunk_data in all_chunks.items()
if isinstance(chunk_data, dict)
and chunk_data.get("full_doc_id") == doc_id
}
# 2. Get all related chunks
chunks = await self.text_chunks.get_by_id(doc_to_chunk_id)
if not chunks:
if not related_chunks:
logger.warning(f"No chunks found for document {doc_id}")
return
chunk_ids = {chunks["full_doc_id"].replace("doc", "chunk")}
# Get all related chunk IDs
chunk_ids = set(related_chunks.keys())
logger.debug(f"Found {len(chunk_ids)} chunks to delete")
# 3. Before deleting, check the related entities and relationships for these chunks
@@ -1626,9 +1634,18 @@ class LightRAG:
logger.warning(f"Document {doc_id} still exists in full_docs")
# Verify if chunks have been deleted
remaining_chunks = await self.text_chunks.get_by_id(doc_to_chunk_id)
if remaining_chunks:
logger.warning(f"Found {len(remaining_chunks)} remaining chunks")
all_remaining_chunks = await self.text_chunks.get_all()
remaining_related_chunks = {
chunk_id: chunk_data
for chunk_id, chunk_data in all_remaining_chunks.items()
if isinstance(chunk_data, dict)
and chunk_data.get("full_doc_id") == doc_id
}
if remaining_related_chunks:
logger.warning(
f"Found {len(remaining_related_chunks)} remaining chunks"
)
# Verify entities and relationships
for chunk_id in chunk_ids: