fix linting errors

This commit is contained in:
zrguo
2024-12-31 17:32:04 +08:00
parent 71e9267f4b
commit d489d9dec0
2 changed files with 105 additions and 59 deletions

View File

@@ -731,30 +731,34 @@ class LightRAG:
if not doc_status: if not doc_status:
logger.warning(f"Document {doc_id} not found") logger.warning(f"Document {doc_id} not found")
return return
logger.debug(f"Starting deletion for document {doc_id}") logger.debug(f"Starting deletion for document {doc_id}")
# 2. Get all related chunks # 2. Get all related chunks
chunks = await self.text_chunks.filter(lambda x: x.get("full_doc_id") == doc_id) chunks = await self.text_chunks.filter(
lambda x: x.get("full_doc_id") == doc_id
)
chunk_ids = list(chunks.keys()) chunk_ids = list(chunks.keys())
logger.debug(f"Found {len(chunk_ids)} chunks to delete") logger.debug(f"Found {len(chunk_ids)} chunks to delete")
# 3. Before deleting, check the related entities and relationships for these chunks # 3. Before deleting, check the related entities and relationships for these chunks
for chunk_id in chunk_ids: for chunk_id in chunk_ids:
# Check entities # Check entities
entities = [ entities = [
dp for dp in self.entities_vdb.client_storage["data"] dp
for dp in self.entities_vdb.client_storage["data"]
if dp.get("source_id") == chunk_id if dp.get("source_id") == chunk_id
] ]
logger.debug(f"Chunk {chunk_id} has {len(entities)} related entities") logger.debug(f"Chunk {chunk_id} has {len(entities)} related entities")
# Check relationships # Check relationships
relations = [ relations = [
dp for dp in self.relationships_vdb.client_storage["data"] dp
for dp in self.relationships_vdb.client_storage["data"]
if dp.get("source_id") == chunk_id if dp.get("source_id") == chunk_id
] ]
logger.debug(f"Chunk {chunk_id} has {len(relations)} related relations") logger.debug(f"Chunk {chunk_id} has {len(relations)} related relations")
# Continue with the original deletion process... # Continue with the original deletion process...
# 4. Delete chunks from vector database # 4. Delete chunks from vector database
@@ -775,31 +779,39 @@ class LightRAG:
# Process entities # Process entities
for node, data in nodes: for node, data in nodes:
if 'source_id' in data: if "source_id" in data:
# Split source_id using GRAPH_FIELD_SEP # Split source_id using GRAPH_FIELD_SEP
sources = set(data['source_id'].split(GRAPH_FIELD_SEP)) sources = set(data["source_id"].split(GRAPH_FIELD_SEP))
sources.difference_update(chunk_ids) sources.difference_update(chunk_ids)
if not sources: if not sources:
entities_to_delete.add(node) entities_to_delete.add(node)
logger.debug(f"Entity {node} marked for deletion - no remaining sources") logger.debug(
f"Entity {node} marked for deletion - no remaining sources"
)
else: else:
new_source_id = GRAPH_FIELD_SEP.join(sources) new_source_id = GRAPH_FIELD_SEP.join(sources)
entities_to_update[node] = new_source_id entities_to_update[node] = new_source_id
logger.debug(f"Entity {node} will be updated with new source_id: {new_source_id}") logger.debug(
f"Entity {node} will be updated with new source_id: {new_source_id}"
)
# Process relationships # Process relationships
for src, tgt, data in edges: for src, tgt, data in edges:
if 'source_id' in data: if "source_id" in data:
# Split source_id using GRAPH_FIELD_SEP # Split source_id using GRAPH_FIELD_SEP
sources = set(data['source_id'].split(GRAPH_FIELD_SEP)) sources = set(data["source_id"].split(GRAPH_FIELD_SEP))
sources.difference_update(chunk_ids) sources.difference_update(chunk_ids)
if not sources: if not sources:
relationships_to_delete.add((src, tgt)) relationships_to_delete.add((src, tgt))
logger.debug(f"Relationship {src}-{tgt} marked for deletion - no remaining sources") logger.debug(
f"Relationship {src}-{tgt} marked for deletion - no remaining sources"
)
else: else:
new_source_id = GRAPH_FIELD_SEP.join(sources) new_source_id = GRAPH_FIELD_SEP.join(sources)
relationships_to_update[(src, tgt)] = new_source_id relationships_to_update[(src, tgt)] = new_source_id
logger.debug(f"Relationship {src}-{tgt} will be updated with new source_id: {new_source_id}") logger.debug(
f"Relationship {src}-{tgt} will be updated with new source_id: {new_source_id}"
)
# Delete entities # Delete entities
if entities_to_delete: if entities_to_delete:
@@ -812,9 +824,11 @@ class LightRAG:
# Update entities # Update entities
for entity, new_source_id in entities_to_update.items(): for entity, new_source_id in entities_to_update.items():
node_data = self.chunk_entity_relation_graph._graph.nodes[entity] node_data = self.chunk_entity_relation_graph._graph.nodes[entity]
node_data['source_id'] = new_source_id node_data["source_id"] = new_source_id
await self.chunk_entity_relation_graph.upsert_node(entity, node_data) await self.chunk_entity_relation_graph.upsert_node(entity, node_data)
logger.debug(f"Updated entity {entity} with new source_id: {new_source_id}") logger.debug(
f"Updated entity {entity} with new source_id: {new_source_id}"
)
# Delete relationships # Delete relationships
if relationships_to_delete: if relationships_to_delete:
@@ -823,15 +837,21 @@ class LightRAG:
rel_id_1 = compute_mdhash_id(tgt + src, prefix="rel-") rel_id_1 = compute_mdhash_id(tgt + src, prefix="rel-")
await self.relationships_vdb.delete([rel_id_0, rel_id_1]) await self.relationships_vdb.delete([rel_id_0, rel_id_1])
logger.debug(f"Deleted relationship {src}-{tgt} from vector DB") logger.debug(f"Deleted relationship {src}-{tgt} from vector DB")
self.chunk_entity_relation_graph.remove_edges(list(relationships_to_delete)) self.chunk_entity_relation_graph.remove_edges(
logger.debug(f"Deleted {len(relationships_to_delete)} relationships from graph") list(relationships_to_delete)
)
logger.debug(
f"Deleted {len(relationships_to_delete)} relationships from graph"
)
# Update relationships # Update relationships
for (src, tgt), new_source_id in relationships_to_update.items(): for (src, tgt), new_source_id in relationships_to_update.items():
edge_data = self.chunk_entity_relation_graph._graph.edges[src, tgt] edge_data = self.chunk_entity_relation_graph._graph.edges[src, tgt]
edge_data['source_id'] = new_source_id edge_data["source_id"] = new_source_id
await self.chunk_entity_relation_graph.upsert_edge(src, tgt, edge_data) await self.chunk_entity_relation_graph.upsert_edge(src, tgt, edge_data)
logger.debug(f"Updated relationship {src}-{tgt} with new source_id: {new_source_id}") logger.debug(
f"Updated relationship {src}-{tgt} with new source_id: {new_source_id}"
)
# 6. Delete original document and status # 6. Delete original document and status
await self.full_docs.delete([doc_id]) await self.full_docs.delete([doc_id])
@@ -851,31 +871,39 @@ class LightRAG:
# Verify if the document has been deleted # Verify if the document has been deleted
if await self.full_docs.get_by_id(doc_id): if await self.full_docs.get_by_id(doc_id):
logger.error(f"Document {doc_id} still exists in full_docs") logger.error(f"Document {doc_id} still exists in full_docs")
# Verify if chunks have been deleted # Verify if chunks have been deleted
remaining_chunks = await self.text_chunks.filter( remaining_chunks = await self.text_chunks.filter(
lambda x: x.get("full_doc_id") == doc_id lambda x: x.get("full_doc_id") == doc_id
) )
if remaining_chunks: if remaining_chunks:
logger.error(f"Found {len(remaining_chunks)} remaining chunks") logger.error(f"Found {len(remaining_chunks)} remaining chunks")
# Verify entities and relationships # Verify entities and relationships
for chunk_id in chunk_ids: for chunk_id in chunk_ids:
# Check entities # Check entities
entities_with_chunk = [ entities_with_chunk = [
dp for dp in self.entities_vdb.client_storage["data"] dp
if chunk_id in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) for dp in self.entities_vdb.client_storage["data"]
if chunk_id
in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
] ]
if entities_with_chunk: if entities_with_chunk:
logger.error(f"Found {len(entities_with_chunk)} entities still referencing chunk {chunk_id}") logger.error(
f"Found {len(entities_with_chunk)} entities still referencing chunk {chunk_id}"
)
# Check relationships # Check relationships
relations_with_chunk = [ relations_with_chunk = [
dp for dp in self.relationships_vdb.client_storage["data"] dp
if chunk_id in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) for dp in self.relationships_vdb.client_storage["data"]
if chunk_id
in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
] ]
if relations_with_chunk: if relations_with_chunk:
logger.error(f"Found {len(relations_with_chunk)} relations still referencing chunk {chunk_id}") logger.error(
f"Found {len(relations_with_chunk)} relations still referencing chunk {chunk_id}"
)
await verify_deletion() await verify_deletion()
@@ -886,13 +914,15 @@ class LightRAG:
"""Synchronous version of adelete""" """Synchronous version of adelete"""
return asyncio.run(self.adelete_by_doc_id(doc_id)) return asyncio.run(self.adelete_by_doc_id(doc_id))
async def get_entity_info(self, entity_name: str, include_vector_data: bool = False): async def get_entity_info(
self, entity_name: str, include_vector_data: bool = False
):
"""Get detailed information of an entity """Get detailed information of an entity
Args: Args:
entity_name: Entity name (no need for quotes) entity_name: Entity name (no need for quotes)
include_vector_data: Whether to include data from the vector database include_vector_data: Whether to include data from the vector database
Returns: Returns:
dict: A dictionary containing entity information, including: dict: A dictionary containing entity information, including:
- entity_name: Entity name - entity_name: Entity name
@@ -901,47 +931,50 @@ class LightRAG:
- vector_data: (optional) Data from the vector database - vector_data: (optional) Data from the vector database
""" """
entity_name = f'"{entity_name.upper()}"' entity_name = f'"{entity_name.upper()}"'
# Get information from the graph # Get information from the graph
node_data = await self.chunk_entity_relation_graph.get_node(entity_name) node_data = await self.chunk_entity_relation_graph.get_node(entity_name)
source_id = node_data.get('source_id') if node_data else None source_id = node_data.get("source_id") if node_data else None
result = { result = {
"entity_name": entity_name, "entity_name": entity_name,
"source_id": source_id, "source_id": source_id,
"graph_data": node_data, "graph_data": node_data,
} }
# Optional: Get vector database information # Optional: Get vector database information
if include_vector_data: if include_vector_data:
entity_id = compute_mdhash_id(entity_name, prefix="ent-") entity_id = compute_mdhash_id(entity_name, prefix="ent-")
vector_data = self.entities_vdb._client.get([entity_id]) vector_data = self.entities_vdb._client.get([entity_id])
result["vector_data"] = vector_data[0] if vector_data else None result["vector_data"] = vector_data[0] if vector_data else None
return result return result
def get_entity_info_sync(self, entity_name: str, include_vector_data: bool = False): def get_entity_info_sync(self, entity_name: str, include_vector_data: bool = False):
"""Synchronous version of getting entity information """Synchronous version of getting entity information
Args: Args:
entity_name: Entity name (no need for quotes) entity_name: Entity name (no need for quotes)
include_vector_data: Whether to include data from the vector database include_vector_data: Whether to include data from the vector database
""" """
try: try:
import tracemalloc import tracemalloc
tracemalloc.start() tracemalloc.start()
return asyncio.run(self.get_entity_info(entity_name, include_vector_data)) return asyncio.run(self.get_entity_info(entity_name, include_vector_data))
finally: finally:
tracemalloc.stop() tracemalloc.stop()
async def get_relation_info(self, src_entity: str, tgt_entity: str, include_vector_data: bool = False): async def get_relation_info(
self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
):
"""Get detailed information of a relationship """Get detailed information of a relationship
Args: Args:
src_entity: Source entity name (no need for quotes) src_entity: Source entity name (no need for quotes)
tgt_entity: Target entity name (no need for quotes) tgt_entity: Target entity name (no need for quotes)
include_vector_data: Whether to include data from the vector database include_vector_data: Whether to include data from the vector database
Returns: Returns:
dict: A dictionary containing relationship information, including: dict: A dictionary containing relationship information, including:
- src_entity: Source entity name - src_entity: Source entity name
@@ -952,29 +985,33 @@ class LightRAG:
""" """
src_entity = f'"{src_entity.upper()}"' src_entity = f'"{src_entity.upper()}"'
tgt_entity = f'"{tgt_entity.upper()}"' tgt_entity = f'"{tgt_entity.upper()}"'
# Get information from the graph # Get information from the graph
edge_data = await self.chunk_entity_relation_graph.get_edge(src_entity, tgt_entity) edge_data = await self.chunk_entity_relation_graph.get_edge(
source_id = edge_data.get('source_id') if edge_data else None src_entity, tgt_entity
)
source_id = edge_data.get("source_id") if edge_data else None
result = { result = {
"src_entity": src_entity, "src_entity": src_entity,
"tgt_entity": tgt_entity, "tgt_entity": tgt_entity,
"source_id": source_id, "source_id": source_id,
"graph_data": edge_data, "graph_data": edge_data,
} }
# Optional: Get vector database information # Optional: Get vector database information
if include_vector_data: if include_vector_data:
rel_id = compute_mdhash_id(src_entity + tgt_entity, prefix="rel-") rel_id = compute_mdhash_id(src_entity + tgt_entity, prefix="rel-")
vector_data = self.relationships_vdb._client.get([rel_id]) vector_data = self.relationships_vdb._client.get([rel_id])
result["vector_data"] = vector_data[0] if vector_data else None result["vector_data"] = vector_data[0] if vector_data else None
return result return result
def get_relation_info_sync(self, src_entity: str, tgt_entity: str, include_vector_data: bool = False): def get_relation_info_sync(
self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
):
"""Synchronous version of getting relationship information """Synchronous version of getting relationship information
Args: Args:
src_entity: Source entity name (no need for quotes) src_entity: Source entity name (no need for quotes)
tgt_entity: Target entity name (no need for quotes) tgt_entity: Target entity name (no need for quotes)
@@ -982,8 +1019,10 @@ class LightRAG:
""" """
try: try:
import tracemalloc import tracemalloc
tracemalloc.start() tracemalloc.start()
return asyncio.run(self.get_relation_info(src_entity, tgt_entity, include_vector_data)) return asyncio.run(
self.get_relation_info(src_entity, tgt_entity, include_vector_data)
)
finally: finally:
tracemalloc.stop() tracemalloc.stop()

View File

@@ -188,14 +188,18 @@ class NanoVectorDBStorage(BaseVectorStorage):
""" """
try: try:
self._client.delete(ids) self._client.delete(ids)
logger.info(f"Successfully deleted {len(ids)} vectors from {self.namespace}") logger.info(
f"Successfully deleted {len(ids)} vectors from {self.namespace}"
)
except Exception as e: except Exception as e:
logger.error(f"Error while deleting vectors from {self.namespace}: {e}") logger.error(f"Error while deleting vectors from {self.namespace}: {e}")
async def delete_entity(self, entity_name: str): async def delete_entity(self, entity_name: str):
try: try:
entity_id = compute_mdhash_id(entity_name, prefix="ent-") entity_id = compute_mdhash_id(entity_name, prefix="ent-")
logger.debug(f"Attempting to delete entity {entity_name} with ID {entity_id}") logger.debug(
f"Attempting to delete entity {entity_name} with ID {entity_id}"
)
# Check if the entity exists # Check if the entity exists
if self._client.get([entity_id]): if self._client.get([entity_id]):
await self.delete([entity_id]) await self.delete([entity_id])
@@ -208,15 +212,18 @@ class NanoVectorDBStorage(BaseVectorStorage):
async def delete_entity_relation(self, entity_name: str): async def delete_entity_relation(self, entity_name: str):
try: try:
relations = [ relations = [
dp for dp in self.client_storage["data"] dp
for dp in self.client_storage["data"]
if dp["src_id"] == entity_name or dp["tgt_id"] == entity_name if dp["src_id"] == entity_name or dp["tgt_id"] == entity_name
] ]
logger.debug(f"Found {len(relations)} relations for entity {entity_name}") logger.debug(f"Found {len(relations)} relations for entity {entity_name}")
ids_to_delete = [relation["__id__"] for relation in relations] ids_to_delete = [relation["__id__"] for relation in relations]
if ids_to_delete: if ids_to_delete:
await self.delete(ids_to_delete) await self.delete(ids_to_delete)
logger.debug(f"Deleted {len(ids_to_delete)} relations for {entity_name}") logger.debug(
f"Deleted {len(ids_to_delete)} relations for {entity_name}"
)
else: else:
logger.debug(f"No relations found for entity {entity_name}") logger.debug(f"No relations found for entity {entity_name}")
except Exception as e: except Exception as e:
@@ -446,4 +453,4 @@ class JsonDocStatusStorage(DocStatusStorage):
"""Delete document status by IDs""" """Delete document status by IDs"""
for doc_id in doc_ids: for doc_id in doc_ids:
self._data.pop(doc_id, None) self._data.pop(doc_id, None)
await self.index_done_callback() await self.index_done_callback()