From d489d9dec0a0da719ffa29eda13fed19a3ce0b82 Mon Sep 17 00:00:00 2001 From: zrguo Date: Tue, 31 Dec 2024 17:32:04 +0800 Subject: [PATCH] fix linting errors --- lightrag/lightrag.py | 145 +++++++++++++++++++++++++++---------------- lightrag/storage.py | 19 ++++-- 2 files changed, 105 insertions(+), 59 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 03922556..22ea7117 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -731,30 +731,34 @@ class LightRAG: if not doc_status: logger.warning(f"Document {doc_id} not found") return - + logger.debug(f"Starting deletion for document {doc_id}") - + # 2. Get all related chunks - chunks = await self.text_chunks.filter(lambda x: x.get("full_doc_id") == doc_id) + chunks = await self.text_chunks.filter( + lambda x: x.get("full_doc_id") == doc_id + ) chunk_ids = list(chunks.keys()) logger.debug(f"Found {len(chunk_ids)} chunks to delete") - + # 3. Before deleting, check the related entities and relationships for these chunks for chunk_id in chunk_ids: # Check entities entities = [ - dp for dp in self.entities_vdb.client_storage["data"] + dp + for dp in self.entities_vdb.client_storage["data"] if dp.get("source_id") == chunk_id ] logger.debug(f"Chunk {chunk_id} has {len(entities)} related entities") - + # Check relationships relations = [ - dp for dp in self.relationships_vdb.client_storage["data"] + dp + for dp in self.relationships_vdb.client_storage["data"] if dp.get("source_id") == chunk_id ] logger.debug(f"Chunk {chunk_id} has {len(relations)} related relations") - + # Continue with the original deletion process... # 4. Delete chunks from vector database @@ -775,31 +779,39 @@ class LightRAG: # Process entities for node, data in nodes: - if 'source_id' in data: + if "source_id" in data: # Split source_id using GRAPH_FIELD_SEP - sources = set(data['source_id'].split(GRAPH_FIELD_SEP)) + sources = set(data["source_id"].split(GRAPH_FIELD_SEP)) sources.difference_update(chunk_ids) if not sources: entities_to_delete.add(node) - logger.debug(f"Entity {node} marked for deletion - no remaining sources") + logger.debug( + f"Entity {node} marked for deletion - no remaining sources" + ) else: new_source_id = GRAPH_FIELD_SEP.join(sources) entities_to_update[node] = new_source_id - logger.debug(f"Entity {node} will be updated with new source_id: {new_source_id}") + logger.debug( + f"Entity {node} will be updated with new source_id: {new_source_id}" + ) # Process relationships for src, tgt, data in edges: - if 'source_id' in data: + if "source_id" in data: # Split source_id using GRAPH_FIELD_SEP - sources = set(data['source_id'].split(GRAPH_FIELD_SEP)) + sources = set(data["source_id"].split(GRAPH_FIELD_SEP)) sources.difference_update(chunk_ids) if not sources: relationships_to_delete.add((src, tgt)) - logger.debug(f"Relationship {src}-{tgt} marked for deletion - no remaining sources") + logger.debug( + f"Relationship {src}-{tgt} marked for deletion - no remaining sources" + ) else: new_source_id = GRAPH_FIELD_SEP.join(sources) relationships_to_update[(src, tgt)] = new_source_id - logger.debug(f"Relationship {src}-{tgt} will be updated with new source_id: {new_source_id}") + logger.debug( + f"Relationship {src}-{tgt} will be updated with new source_id: {new_source_id}" + ) # Delete entities if entities_to_delete: @@ -812,9 +824,11 @@ class LightRAG: # Update entities for entity, new_source_id in entities_to_update.items(): node_data = self.chunk_entity_relation_graph._graph.nodes[entity] - node_data['source_id'] = new_source_id + node_data["source_id"] = new_source_id await self.chunk_entity_relation_graph.upsert_node(entity, node_data) - logger.debug(f"Updated entity {entity} with new source_id: {new_source_id}") + logger.debug( + f"Updated entity {entity} with new source_id: {new_source_id}" + ) # Delete relationships if relationships_to_delete: @@ -823,15 +837,21 @@ class LightRAG: rel_id_1 = compute_mdhash_id(tgt + src, prefix="rel-") await self.relationships_vdb.delete([rel_id_0, rel_id_1]) logger.debug(f"Deleted relationship {src}-{tgt} from vector DB") - self.chunk_entity_relation_graph.remove_edges(list(relationships_to_delete)) - logger.debug(f"Deleted {len(relationships_to_delete)} relationships from graph") + self.chunk_entity_relation_graph.remove_edges( + list(relationships_to_delete) + ) + logger.debug( + f"Deleted {len(relationships_to_delete)} relationships from graph" + ) # Update relationships for (src, tgt), new_source_id in relationships_to_update.items(): edge_data = self.chunk_entity_relation_graph._graph.edges[src, tgt] - edge_data['source_id'] = new_source_id + edge_data["source_id"] = new_source_id await self.chunk_entity_relation_graph.upsert_edge(src, tgt, edge_data) - logger.debug(f"Updated relationship {src}-{tgt} with new source_id: {new_source_id}") + logger.debug( + f"Updated relationship {src}-{tgt} with new source_id: {new_source_id}" + ) # 6. Delete original document and status await self.full_docs.delete([doc_id]) @@ -851,31 +871,39 @@ class LightRAG: # Verify if the document has been deleted if await self.full_docs.get_by_id(doc_id): logger.error(f"Document {doc_id} still exists in full_docs") - + # Verify if chunks have been deleted remaining_chunks = await self.text_chunks.filter( lambda x: x.get("full_doc_id") == doc_id ) if remaining_chunks: logger.error(f"Found {len(remaining_chunks)} remaining chunks") - + # Verify entities and relationships for chunk_id in chunk_ids: # Check entities entities_with_chunk = [ - dp for dp in self.entities_vdb.client_storage["data"] - if chunk_id in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) + dp + for dp in self.entities_vdb.client_storage["data"] + if chunk_id + in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) ] if entities_with_chunk: - logger.error(f"Found {len(entities_with_chunk)} entities still referencing chunk {chunk_id}") - + logger.error( + f"Found {len(entities_with_chunk)} entities still referencing chunk {chunk_id}" + ) + # Check relationships relations_with_chunk = [ - dp for dp in self.relationships_vdb.client_storage["data"] - if chunk_id in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) + dp + for dp in self.relationships_vdb.client_storage["data"] + if chunk_id + in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) ] if relations_with_chunk: - logger.error(f"Found {len(relations_with_chunk)} relations still referencing chunk {chunk_id}") + logger.error( + f"Found {len(relations_with_chunk)} relations still referencing chunk {chunk_id}" + ) await verify_deletion() @@ -886,13 +914,15 @@ class LightRAG: """Synchronous version of adelete""" return asyncio.run(self.adelete_by_doc_id(doc_id)) - async def get_entity_info(self, entity_name: str, include_vector_data: bool = False): + async def get_entity_info( + self, entity_name: str, include_vector_data: bool = False + ): """Get detailed information of an entity - + Args: entity_name: Entity name (no need for quotes) include_vector_data: Whether to include data from the vector database - + Returns: dict: A dictionary containing entity information, including: - entity_name: Entity name @@ -901,47 +931,50 @@ class LightRAG: - vector_data: (optional) Data from the vector database """ entity_name = f'"{entity_name.upper()}"' - + # Get information from the graph node_data = await self.chunk_entity_relation_graph.get_node(entity_name) - source_id = node_data.get('source_id') if node_data else None - + source_id = node_data.get("source_id") if node_data else None + result = { "entity_name": entity_name, "source_id": source_id, "graph_data": node_data, } - + # Optional: Get vector database information if include_vector_data: entity_id = compute_mdhash_id(entity_name, prefix="ent-") vector_data = self.entities_vdb._client.get([entity_id]) result["vector_data"] = vector_data[0] if vector_data else None - + return result def get_entity_info_sync(self, entity_name: str, include_vector_data: bool = False): """Synchronous version of getting entity information - + Args: entity_name: Entity name (no need for quotes) include_vector_data: Whether to include data from the vector database """ try: import tracemalloc + tracemalloc.start() return asyncio.run(self.get_entity_info(entity_name, include_vector_data)) finally: tracemalloc.stop() - async def get_relation_info(self, src_entity: str, tgt_entity: str, include_vector_data: bool = False): + async def get_relation_info( + self, src_entity: str, tgt_entity: str, include_vector_data: bool = False + ): """Get detailed information of a relationship - + Args: src_entity: Source entity name (no need for quotes) tgt_entity: Target entity name (no need for quotes) include_vector_data: Whether to include data from the vector database - + Returns: dict: A dictionary containing relationship information, including: - src_entity: Source entity name @@ -952,29 +985,33 @@ class LightRAG: """ src_entity = f'"{src_entity.upper()}"' tgt_entity = f'"{tgt_entity.upper()}"' - + # Get information from the graph - edge_data = await self.chunk_entity_relation_graph.get_edge(src_entity, tgt_entity) - source_id = edge_data.get('source_id') if edge_data else None - + edge_data = await self.chunk_entity_relation_graph.get_edge( + src_entity, tgt_entity + ) + source_id = edge_data.get("source_id") if edge_data else None + result = { "src_entity": src_entity, "tgt_entity": tgt_entity, "source_id": source_id, "graph_data": edge_data, } - + # Optional: Get vector database information if include_vector_data: rel_id = compute_mdhash_id(src_entity + tgt_entity, prefix="rel-") vector_data = self.relationships_vdb._client.get([rel_id]) result["vector_data"] = vector_data[0] if vector_data else None - + return result - def get_relation_info_sync(self, src_entity: str, tgt_entity: str, include_vector_data: bool = False): + def get_relation_info_sync( + self, src_entity: str, tgt_entity: str, include_vector_data: bool = False + ): """Synchronous version of getting relationship information - + Args: src_entity: Source entity name (no need for quotes) tgt_entity: Target entity name (no need for quotes) @@ -982,8 +1019,10 @@ class LightRAG: """ try: import tracemalloc + tracemalloc.start() - return asyncio.run(self.get_relation_info(src_entity, tgt_entity, include_vector_data)) + return asyncio.run( + self.get_relation_info(src_entity, tgt_entity, include_vector_data) + ) finally: tracemalloc.stop() - diff --git a/lightrag/storage.py b/lightrag/storage.py index a3138568..dd265007 100644 --- a/lightrag/storage.py +++ b/lightrag/storage.py @@ -188,14 +188,18 @@ class NanoVectorDBStorage(BaseVectorStorage): """ try: self._client.delete(ids) - logger.info(f"Successfully deleted {len(ids)} vectors from {self.namespace}") + logger.info( + f"Successfully deleted {len(ids)} vectors from {self.namespace}" + ) except Exception as e: logger.error(f"Error while deleting vectors from {self.namespace}: {e}") async def delete_entity(self, entity_name: str): try: entity_id = compute_mdhash_id(entity_name, prefix="ent-") - logger.debug(f"Attempting to delete entity {entity_name} with ID {entity_id}") + logger.debug( + f"Attempting to delete entity {entity_name} with ID {entity_id}" + ) # Check if the entity exists if self._client.get([entity_id]): await self.delete([entity_id]) @@ -208,15 +212,18 @@ class NanoVectorDBStorage(BaseVectorStorage): async def delete_entity_relation(self, entity_name: str): try: relations = [ - dp for dp in self.client_storage["data"] + dp + for dp in self.client_storage["data"] if dp["src_id"] == entity_name or dp["tgt_id"] == entity_name ] logger.debug(f"Found {len(relations)} relations for entity {entity_name}") ids_to_delete = [relation["__id__"] for relation in relations] - + if ids_to_delete: await self.delete(ids_to_delete) - logger.debug(f"Deleted {len(ids_to_delete)} relations for {entity_name}") + logger.debug( + f"Deleted {len(ids_to_delete)} relations for {entity_name}" + ) else: logger.debug(f"No relations found for entity {entity_name}") except Exception as e: @@ -446,4 +453,4 @@ class JsonDocStatusStorage(DocStatusStorage): """Delete document status by IDs""" for doc_id in doc_ids: self._data.pop(doc_id, None) - await self.index_done_callback() \ No newline at end of file + await self.index_done_callback()