From cee5b2fbb0ad6f6ffa244d3335874f0a1830f6e8 Mon Sep 17 00:00:00 2001 From: zrguo Date: Tue, 31 Dec 2024 17:15:57 +0800 Subject: [PATCH 1/4] add delete by doc id --- lightrag/lightrag.py | 273 ++++++++++++++++++++++++++++++++++++++++++- lightrag/storage.py | 107 ++++++++++++++--- 2 files changed, 360 insertions(+), 20 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 9a7ebeb8..03922556 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -43,6 +43,8 @@ from .storage import ( JsonDocStatusStorage, ) +from .prompt import GRAPH_FIELD_SEP + # future KG integrations # from .kg.ArangoDB_impl import ( @@ -672,7 +674,7 @@ class LightRAG: try: await self.entities_vdb.delete_entity(entity_name) - await self.relationships_vdb.delete_relation(entity_name) + await self.relationships_vdb.delete_entity_relation(entity_name) await self.chunk_entity_relation_graph.delete_node(entity_name) logger.info( @@ -716,3 +718,272 @@ class LightRAG: Dict with counts for each status """ return await self.doc_status.get_status_counts() + + async def adelete_by_doc_id(self, doc_id: str): + """Delete a document and all its related data + + Args: + doc_id: Document ID to delete + """ + try: + # 1. Get the document status and related data + doc_status = await self.doc_status.get(doc_id) + if not doc_status: + logger.warning(f"Document {doc_id} not found") + return + + logger.debug(f"Starting deletion for document {doc_id}") + + # 2. Get all related chunks + chunks = await self.text_chunks.filter(lambda x: x.get("full_doc_id") == doc_id) + chunk_ids = list(chunks.keys()) + logger.debug(f"Found {len(chunk_ids)} chunks to delete") + + # 3. Before deleting, check the related entities and relationships for these chunks + for chunk_id in chunk_ids: + # Check entities + entities = [ + dp for dp in self.entities_vdb.client_storage["data"] + if dp.get("source_id") == chunk_id + ] + logger.debug(f"Chunk {chunk_id} has {len(entities)} related entities") + + # Check relationships + relations = [ + dp for dp in self.relationships_vdb.client_storage["data"] + if dp.get("source_id") == chunk_id + ] + logger.debug(f"Chunk {chunk_id} has {len(relations)} related relations") + + # Continue with the original deletion process... + + # 4. Delete chunks from vector database + if chunk_ids: + await self.chunks_vdb.delete(chunk_ids) + await self.text_chunks.delete(chunk_ids) + + # 5. Find and process entities and relationships that have these chunks as source + # Get all nodes in the graph + nodes = self.chunk_entity_relation_graph._graph.nodes(data=True) + edges = self.chunk_entity_relation_graph._graph.edges(data=True) + + # Track which entities and relationships need to be deleted or updated + entities_to_delete = set() + entities_to_update = {} # entity_name -> new_source_id + relationships_to_delete = set() + relationships_to_update = {} # (src, tgt) -> new_source_id + + # Process entities + for node, data in nodes: + if 'source_id' in data: + # Split source_id using GRAPH_FIELD_SEP + sources = set(data['source_id'].split(GRAPH_FIELD_SEP)) + sources.difference_update(chunk_ids) + if not sources: + entities_to_delete.add(node) + logger.debug(f"Entity {node} marked for deletion - no remaining sources") + else: + new_source_id = GRAPH_FIELD_SEP.join(sources) + entities_to_update[node] = new_source_id + logger.debug(f"Entity {node} will be updated with new source_id: {new_source_id}") + + # Process relationships + for src, tgt, data in edges: + if 'source_id' in data: + # Split source_id using GRAPH_FIELD_SEP + sources = set(data['source_id'].split(GRAPH_FIELD_SEP)) + sources.difference_update(chunk_ids) + if not sources: + relationships_to_delete.add((src, tgt)) + logger.debug(f"Relationship {src}-{tgt} marked for deletion - no remaining sources") + else: + new_source_id = GRAPH_FIELD_SEP.join(sources) + relationships_to_update[(src, tgt)] = new_source_id + logger.debug(f"Relationship {src}-{tgt} will be updated with new source_id: {new_source_id}") + + # Delete entities + if entities_to_delete: + for entity in entities_to_delete: + await self.entities_vdb.delete_entity(entity) + logger.debug(f"Deleted entity {entity} from vector DB") + self.chunk_entity_relation_graph.remove_nodes(list(entities_to_delete)) + logger.debug(f"Deleted {len(entities_to_delete)} entities from graph") + + # Update entities + for entity, new_source_id in entities_to_update.items(): + node_data = self.chunk_entity_relation_graph._graph.nodes[entity] + node_data['source_id'] = new_source_id + await self.chunk_entity_relation_graph.upsert_node(entity, node_data) + logger.debug(f"Updated entity {entity} with new source_id: {new_source_id}") + + # Delete relationships + if relationships_to_delete: + for src, tgt in relationships_to_delete: + rel_id_0 = compute_mdhash_id(src + tgt, prefix="rel-") + rel_id_1 = compute_mdhash_id(tgt + src, prefix="rel-") + await self.relationships_vdb.delete([rel_id_0, rel_id_1]) + logger.debug(f"Deleted relationship {src}-{tgt} from vector DB") + self.chunk_entity_relation_graph.remove_edges(list(relationships_to_delete)) + logger.debug(f"Deleted {len(relationships_to_delete)} relationships from graph") + + # Update relationships + for (src, tgt), new_source_id in relationships_to_update.items(): + edge_data = self.chunk_entity_relation_graph._graph.edges[src, tgt] + edge_data['source_id'] = new_source_id + await self.chunk_entity_relation_graph.upsert_edge(src, tgt, edge_data) + logger.debug(f"Updated relationship {src}-{tgt} with new source_id: {new_source_id}") + + # 6. Delete original document and status + await self.full_docs.delete([doc_id]) + await self.doc_status.delete([doc_id]) + + # 7. Ensure all indexes are updated + await self._insert_done() + + logger.info( + f"Successfully deleted document {doc_id} and related data. " + f"Deleted {len(entities_to_delete)} entities and {len(relationships_to_delete)} relationships. " + f"Updated {len(entities_to_update)} entities and {len(relationships_to_update)} relationships." + ) + + # Add verification step + async def verify_deletion(): + # Verify if the document has been deleted + if await self.full_docs.get_by_id(doc_id): + logger.error(f"Document {doc_id} still exists in full_docs") + + # Verify if chunks have been deleted + remaining_chunks = await self.text_chunks.filter( + lambda x: x.get("full_doc_id") == doc_id + ) + if remaining_chunks: + logger.error(f"Found {len(remaining_chunks)} remaining chunks") + + # Verify entities and relationships + for chunk_id in chunk_ids: + # Check entities + entities_with_chunk = [ + dp for dp in self.entities_vdb.client_storage["data"] + if chunk_id in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) + ] + if entities_with_chunk: + logger.error(f"Found {len(entities_with_chunk)} entities still referencing chunk {chunk_id}") + + # Check relationships + relations_with_chunk = [ + dp for dp in self.relationships_vdb.client_storage["data"] + if chunk_id in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) + ] + if relations_with_chunk: + logger.error(f"Found {len(relations_with_chunk)} relations still referencing chunk {chunk_id}") + + await verify_deletion() + + except Exception as e: + logger.error(f"Error while deleting document {doc_id}: {e}") + + def delete_by_doc_id(self, doc_id: str): + """Synchronous version of adelete""" + return asyncio.run(self.adelete_by_doc_id(doc_id)) + + async def get_entity_info(self, entity_name: str, include_vector_data: bool = False): + """Get detailed information of an entity + + Args: + entity_name: Entity name (no need for quotes) + include_vector_data: Whether to include data from the vector database + + Returns: + dict: A dictionary containing entity information, including: + - entity_name: Entity name + - source_id: Source document ID + - graph_data: Complete node data from the graph database + - vector_data: (optional) Data from the vector database + """ + entity_name = f'"{entity_name.upper()}"' + + # Get information from the graph + node_data = await self.chunk_entity_relation_graph.get_node(entity_name) + source_id = node_data.get('source_id') if node_data else None + + result = { + "entity_name": entity_name, + "source_id": source_id, + "graph_data": node_data, + } + + # Optional: Get vector database information + if include_vector_data: + entity_id = compute_mdhash_id(entity_name, prefix="ent-") + vector_data = self.entities_vdb._client.get([entity_id]) + result["vector_data"] = vector_data[0] if vector_data else None + + return result + + def get_entity_info_sync(self, entity_name: str, include_vector_data: bool = False): + """Synchronous version of getting entity information + + Args: + entity_name: Entity name (no need for quotes) + include_vector_data: Whether to include data from the vector database + """ + try: + import tracemalloc + tracemalloc.start() + return asyncio.run(self.get_entity_info(entity_name, include_vector_data)) + finally: + tracemalloc.stop() + + async def get_relation_info(self, src_entity: str, tgt_entity: str, include_vector_data: bool = False): + """Get detailed information of a relationship + + Args: + src_entity: Source entity name (no need for quotes) + tgt_entity: Target entity name (no need for quotes) + include_vector_data: Whether to include data from the vector database + + Returns: + dict: A dictionary containing relationship information, including: + - src_entity: Source entity name + - tgt_entity: Target entity name + - source_id: Source document ID + - graph_data: Complete edge data from the graph database + - vector_data: (optional) Data from the vector database + """ + src_entity = f'"{src_entity.upper()}"' + tgt_entity = f'"{tgt_entity.upper()}"' + + # Get information from the graph + edge_data = await self.chunk_entity_relation_graph.get_edge(src_entity, tgt_entity) + source_id = edge_data.get('source_id') if edge_data else None + + result = { + "src_entity": src_entity, + "tgt_entity": tgt_entity, + "source_id": source_id, + "graph_data": edge_data, + } + + # Optional: Get vector database information + if include_vector_data: + rel_id = compute_mdhash_id(src_entity + tgt_entity, prefix="rel-") + vector_data = self.relationships_vdb._client.get([rel_id]) + result["vector_data"] = vector_data[0] if vector_data else None + + return result + + def get_relation_info_sync(self, src_entity: str, tgt_entity: str, include_vector_data: bool = False): + """Synchronous version of getting relationship information + + Args: + src_entity: Source entity name (no need for quotes) + tgt_entity: Target entity name (no need for quotes) + include_vector_data: Whether to include data from the vector database + """ + try: + import tracemalloc + tracemalloc.start() + return asyncio.run(self.get_relation_info(src_entity, tgt_entity, include_vector_data)) + finally: + tracemalloc.stop() + diff --git a/lightrag/storage.py b/lightrag/storage.py index ac8a95d3..a3138568 100644 --- a/lightrag/storage.py +++ b/lightrag/storage.py @@ -32,6 +32,7 @@ class JsonKVStorage(BaseKVStorage): working_dir = self.global_config["working_dir"] self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json") self._data = load_json(self._file_name) or {} + self._lock = asyncio.Lock() logger.info(f"Load KV {self.namespace} with {len(self._data)} data") async def all_keys(self) -> list[str]: @@ -66,6 +67,35 @@ class JsonKVStorage(BaseKVStorage): async def drop(self): self._data = {} + async def filter(self, filter_func): + """Filter key-value pairs based on a filter function + + Args: + filter_func: The filter function, which takes a value as an argument and returns a boolean value + + Returns: + Dict: Key-value pairs that meet the condition + """ + result = {} + async with self._lock: + for key, value in self._data.items(): + if filter_func(value): + result[key] = value + return result + + async def delete(self, ids: list[str]): + """Delete data with specified IDs + + Args: + ids: List of IDs to delete + """ + async with self._lock: + for id in ids: + if id in self._data: + del self._data[id] + await self.index_done_callback() + logger.info(f"Successfully deleted {len(ids)} items from {self.namespace}") + @dataclass class NanoVectorDBStorage(BaseVectorStorage): @@ -150,38 +180,47 @@ class NanoVectorDBStorage(BaseVectorStorage): def client_storage(self): return getattr(self._client, "_NanoVectorDB__storage") + async def delete(self, ids: list[str]): + """Delete vectors with specified IDs + + Args: + ids: List of vector IDs to be deleted + """ + try: + self._client.delete(ids) + logger.info(f"Successfully deleted {len(ids)} vectors from {self.namespace}") + except Exception as e: + logger.error(f"Error while deleting vectors from {self.namespace}: {e}") + async def delete_entity(self, entity_name: str): try: - entity_id = [compute_mdhash_id(entity_name, prefix="ent-")] - - if self._client.get(entity_id): - self._client.delete(entity_id) - logger.info(f"Entity {entity_name} have been deleted.") + entity_id = compute_mdhash_id(entity_name, prefix="ent-") + logger.debug(f"Attempting to delete entity {entity_name} with ID {entity_id}") + # Check if the entity exists + if self._client.get([entity_id]): + await self.delete([entity_id]) + logger.debug(f"Successfully deleted entity {entity_name}") else: - logger.info(f"No entity found with name {entity_name}.") + logger.debug(f"Entity {entity_name} not found in storage") except Exception as e: - logger.error(f"Error while deleting entity {entity_name}: {e}") + logger.error(f"Error deleting entity {entity_name}: {e}") - async def delete_relation(self, entity_name: str): + async def delete_entity_relation(self, entity_name: str): try: relations = [ - dp - for dp in self.client_storage["data"] + dp for dp in self.client_storage["data"] if dp["src_id"] == entity_name or dp["tgt_id"] == entity_name ] + logger.debug(f"Found {len(relations)} relations for entity {entity_name}") ids_to_delete = [relation["__id__"] for relation in relations] - + if ids_to_delete: - self._client.delete(ids_to_delete) - logger.info( - f"All relations related to entity {entity_name} have been deleted." - ) + await self.delete(ids_to_delete) + logger.debug(f"Deleted {len(ids_to_delete)} relations for {entity_name}") else: - logger.info(f"No relations found for entity {entity_name}.") + logger.debug(f"No relations found for entity {entity_name}") except Exception as e: - logger.error( - f"Error while deleting relations for entity {entity_name}: {e}" - ) + logger.error(f"Error deleting relations for {entity_name}: {e}") async def index_done_callback(self): self._client.save() @@ -329,6 +368,26 @@ class NetworkXStorage(BaseGraphStorage): nodes_ids = [self._graph.nodes[node_id]["id"] for node_id in nodes] return embeddings, nodes_ids + def remove_nodes(self, nodes: list[str]): + """Delete multiple nodes + + Args: + nodes: List of node IDs to be deleted + """ + for node in nodes: + if self._graph.has_node(node): + self._graph.remove_node(node) + + def remove_edges(self, edges: list[tuple[str, str]]): + """Delete multiple edges + + Args: + edges: List of edges to be deleted, each edge is a (source, target) tuple + """ + for source, target in edges: + if self._graph.has_edge(source, target): + self._graph.remove_edge(source, target) + @dataclass class JsonDocStatusStorage(DocStatusStorage): @@ -378,3 +437,13 @@ class JsonDocStatusStorage(DocStatusStorage): self._data.update(data) await self.index_done_callback() return data + + async def get(self, doc_id: str) -> Union[DocProcessingStatus, None]: + """Get document status by ID""" + return self._data.get(doc_id) + + async def delete(self, doc_ids: list[str]): + """Delete document status by IDs""" + for doc_id in doc_ids: + self._data.pop(doc_id, None) + await self.index_done_callback() \ No newline at end of file From 71e9267f4bad36198ada0a5175d0f137c23af018 Mon Sep 17 00:00:00 2001 From: zrguo Date: Tue, 31 Dec 2024 17:25:57 +0800 Subject: [PATCH 2/4] Update README.md --- README.md | 12 ++++++++---- lightrag/__init__.py | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 26d41e22..fbd0fdf6 100644 --- a/README.md +++ b/README.md @@ -26,10 +26,11 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News +- [x] [2024.12.31]🎯📢LightRAG now supports [deletion by document ID](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete). - [x] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise. - [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author. - [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py). -- [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity). +- [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete). - [x] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge. - [x] [2024.11.04]🎯📢You can now [use Neo4J for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-neo4j-for-storage). - [x] [2024.10.29]🎯📢LightRAG now supports multiple file types, including PDF, DOC, PPT, and CSV via `textract`. @@ -412,10 +413,9 @@ custom_kg = { rag.insert_custom_kg(custom_kg) ``` -### Delete Entity - +### Delete ```python -# Delete Entity: Deleting entities by their names + rag = LightRAG( working_dir=WORKING_DIR, llm_model_func=llm_model_func, @@ -426,7 +426,11 @@ rag = LightRAG( ), ) +# Delete Entity: Deleting entities by their names rag.delete_by_entity("Project Gutenberg") + +# Delete Document: Deleting entities and relationships associated with the document by doc id +rag.delete_by_doc_id("doc_id") ``` ### Multi-file Type Support diff --git a/lightrag/__init__.py b/lightrag/__init__.py index 83ed72cc..cd2ccf04 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam -__version__ = "1.0.8" +__version__ = "1.0.9" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" From d489d9dec0a0da719ffa29eda13fed19a3ce0b82 Mon Sep 17 00:00:00 2001 From: zrguo Date: Tue, 31 Dec 2024 17:32:04 +0800 Subject: [PATCH 3/4] fix linting errors --- lightrag/lightrag.py | 145 +++++++++++++++++++++++++++---------------- lightrag/storage.py | 19 ++++-- 2 files changed, 105 insertions(+), 59 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 03922556..22ea7117 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -731,30 +731,34 @@ class LightRAG: if not doc_status: logger.warning(f"Document {doc_id} not found") return - + logger.debug(f"Starting deletion for document {doc_id}") - + # 2. Get all related chunks - chunks = await self.text_chunks.filter(lambda x: x.get("full_doc_id") == doc_id) + chunks = await self.text_chunks.filter( + lambda x: x.get("full_doc_id") == doc_id + ) chunk_ids = list(chunks.keys()) logger.debug(f"Found {len(chunk_ids)} chunks to delete") - + # 3. Before deleting, check the related entities and relationships for these chunks for chunk_id in chunk_ids: # Check entities entities = [ - dp for dp in self.entities_vdb.client_storage["data"] + dp + for dp in self.entities_vdb.client_storage["data"] if dp.get("source_id") == chunk_id ] logger.debug(f"Chunk {chunk_id} has {len(entities)} related entities") - + # Check relationships relations = [ - dp for dp in self.relationships_vdb.client_storage["data"] + dp + for dp in self.relationships_vdb.client_storage["data"] if dp.get("source_id") == chunk_id ] logger.debug(f"Chunk {chunk_id} has {len(relations)} related relations") - + # Continue with the original deletion process... # 4. Delete chunks from vector database @@ -775,31 +779,39 @@ class LightRAG: # Process entities for node, data in nodes: - if 'source_id' in data: + if "source_id" in data: # Split source_id using GRAPH_FIELD_SEP - sources = set(data['source_id'].split(GRAPH_FIELD_SEP)) + sources = set(data["source_id"].split(GRAPH_FIELD_SEP)) sources.difference_update(chunk_ids) if not sources: entities_to_delete.add(node) - logger.debug(f"Entity {node} marked for deletion - no remaining sources") + logger.debug( + f"Entity {node} marked for deletion - no remaining sources" + ) else: new_source_id = GRAPH_FIELD_SEP.join(sources) entities_to_update[node] = new_source_id - logger.debug(f"Entity {node} will be updated with new source_id: {new_source_id}") + logger.debug( + f"Entity {node} will be updated with new source_id: {new_source_id}" + ) # Process relationships for src, tgt, data in edges: - if 'source_id' in data: + if "source_id" in data: # Split source_id using GRAPH_FIELD_SEP - sources = set(data['source_id'].split(GRAPH_FIELD_SEP)) + sources = set(data["source_id"].split(GRAPH_FIELD_SEP)) sources.difference_update(chunk_ids) if not sources: relationships_to_delete.add((src, tgt)) - logger.debug(f"Relationship {src}-{tgt} marked for deletion - no remaining sources") + logger.debug( + f"Relationship {src}-{tgt} marked for deletion - no remaining sources" + ) else: new_source_id = GRAPH_FIELD_SEP.join(sources) relationships_to_update[(src, tgt)] = new_source_id - logger.debug(f"Relationship {src}-{tgt} will be updated with new source_id: {new_source_id}") + logger.debug( + f"Relationship {src}-{tgt} will be updated with new source_id: {new_source_id}" + ) # Delete entities if entities_to_delete: @@ -812,9 +824,11 @@ class LightRAG: # Update entities for entity, new_source_id in entities_to_update.items(): node_data = self.chunk_entity_relation_graph._graph.nodes[entity] - node_data['source_id'] = new_source_id + node_data["source_id"] = new_source_id await self.chunk_entity_relation_graph.upsert_node(entity, node_data) - logger.debug(f"Updated entity {entity} with new source_id: {new_source_id}") + logger.debug( + f"Updated entity {entity} with new source_id: {new_source_id}" + ) # Delete relationships if relationships_to_delete: @@ -823,15 +837,21 @@ class LightRAG: rel_id_1 = compute_mdhash_id(tgt + src, prefix="rel-") await self.relationships_vdb.delete([rel_id_0, rel_id_1]) logger.debug(f"Deleted relationship {src}-{tgt} from vector DB") - self.chunk_entity_relation_graph.remove_edges(list(relationships_to_delete)) - logger.debug(f"Deleted {len(relationships_to_delete)} relationships from graph") + self.chunk_entity_relation_graph.remove_edges( + list(relationships_to_delete) + ) + logger.debug( + f"Deleted {len(relationships_to_delete)} relationships from graph" + ) # Update relationships for (src, tgt), new_source_id in relationships_to_update.items(): edge_data = self.chunk_entity_relation_graph._graph.edges[src, tgt] - edge_data['source_id'] = new_source_id + edge_data["source_id"] = new_source_id await self.chunk_entity_relation_graph.upsert_edge(src, tgt, edge_data) - logger.debug(f"Updated relationship {src}-{tgt} with new source_id: {new_source_id}") + logger.debug( + f"Updated relationship {src}-{tgt} with new source_id: {new_source_id}" + ) # 6. Delete original document and status await self.full_docs.delete([doc_id]) @@ -851,31 +871,39 @@ class LightRAG: # Verify if the document has been deleted if await self.full_docs.get_by_id(doc_id): logger.error(f"Document {doc_id} still exists in full_docs") - + # Verify if chunks have been deleted remaining_chunks = await self.text_chunks.filter( lambda x: x.get("full_doc_id") == doc_id ) if remaining_chunks: logger.error(f"Found {len(remaining_chunks)} remaining chunks") - + # Verify entities and relationships for chunk_id in chunk_ids: # Check entities entities_with_chunk = [ - dp for dp in self.entities_vdb.client_storage["data"] - if chunk_id in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) + dp + for dp in self.entities_vdb.client_storage["data"] + if chunk_id + in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) ] if entities_with_chunk: - logger.error(f"Found {len(entities_with_chunk)} entities still referencing chunk {chunk_id}") - + logger.error( + f"Found {len(entities_with_chunk)} entities still referencing chunk {chunk_id}" + ) + # Check relationships relations_with_chunk = [ - dp for dp in self.relationships_vdb.client_storage["data"] - if chunk_id in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) + dp + for dp in self.relationships_vdb.client_storage["data"] + if chunk_id + in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) ] if relations_with_chunk: - logger.error(f"Found {len(relations_with_chunk)} relations still referencing chunk {chunk_id}") + logger.error( + f"Found {len(relations_with_chunk)} relations still referencing chunk {chunk_id}" + ) await verify_deletion() @@ -886,13 +914,15 @@ class LightRAG: """Synchronous version of adelete""" return asyncio.run(self.adelete_by_doc_id(doc_id)) - async def get_entity_info(self, entity_name: str, include_vector_data: bool = False): + async def get_entity_info( + self, entity_name: str, include_vector_data: bool = False + ): """Get detailed information of an entity - + Args: entity_name: Entity name (no need for quotes) include_vector_data: Whether to include data from the vector database - + Returns: dict: A dictionary containing entity information, including: - entity_name: Entity name @@ -901,47 +931,50 @@ class LightRAG: - vector_data: (optional) Data from the vector database """ entity_name = f'"{entity_name.upper()}"' - + # Get information from the graph node_data = await self.chunk_entity_relation_graph.get_node(entity_name) - source_id = node_data.get('source_id') if node_data else None - + source_id = node_data.get("source_id") if node_data else None + result = { "entity_name": entity_name, "source_id": source_id, "graph_data": node_data, } - + # Optional: Get vector database information if include_vector_data: entity_id = compute_mdhash_id(entity_name, prefix="ent-") vector_data = self.entities_vdb._client.get([entity_id]) result["vector_data"] = vector_data[0] if vector_data else None - + return result def get_entity_info_sync(self, entity_name: str, include_vector_data: bool = False): """Synchronous version of getting entity information - + Args: entity_name: Entity name (no need for quotes) include_vector_data: Whether to include data from the vector database """ try: import tracemalloc + tracemalloc.start() return asyncio.run(self.get_entity_info(entity_name, include_vector_data)) finally: tracemalloc.stop() - async def get_relation_info(self, src_entity: str, tgt_entity: str, include_vector_data: bool = False): + async def get_relation_info( + self, src_entity: str, tgt_entity: str, include_vector_data: bool = False + ): """Get detailed information of a relationship - + Args: src_entity: Source entity name (no need for quotes) tgt_entity: Target entity name (no need for quotes) include_vector_data: Whether to include data from the vector database - + Returns: dict: A dictionary containing relationship information, including: - src_entity: Source entity name @@ -952,29 +985,33 @@ class LightRAG: """ src_entity = f'"{src_entity.upper()}"' tgt_entity = f'"{tgt_entity.upper()}"' - + # Get information from the graph - edge_data = await self.chunk_entity_relation_graph.get_edge(src_entity, tgt_entity) - source_id = edge_data.get('source_id') if edge_data else None - + edge_data = await self.chunk_entity_relation_graph.get_edge( + src_entity, tgt_entity + ) + source_id = edge_data.get("source_id") if edge_data else None + result = { "src_entity": src_entity, "tgt_entity": tgt_entity, "source_id": source_id, "graph_data": edge_data, } - + # Optional: Get vector database information if include_vector_data: rel_id = compute_mdhash_id(src_entity + tgt_entity, prefix="rel-") vector_data = self.relationships_vdb._client.get([rel_id]) result["vector_data"] = vector_data[0] if vector_data else None - + return result - def get_relation_info_sync(self, src_entity: str, tgt_entity: str, include_vector_data: bool = False): + def get_relation_info_sync( + self, src_entity: str, tgt_entity: str, include_vector_data: bool = False + ): """Synchronous version of getting relationship information - + Args: src_entity: Source entity name (no need for quotes) tgt_entity: Target entity name (no need for quotes) @@ -982,8 +1019,10 @@ class LightRAG: """ try: import tracemalloc + tracemalloc.start() - return asyncio.run(self.get_relation_info(src_entity, tgt_entity, include_vector_data)) + return asyncio.run( + self.get_relation_info(src_entity, tgt_entity, include_vector_data) + ) finally: tracemalloc.stop() - diff --git a/lightrag/storage.py b/lightrag/storage.py index a3138568..dd265007 100644 --- a/lightrag/storage.py +++ b/lightrag/storage.py @@ -188,14 +188,18 @@ class NanoVectorDBStorage(BaseVectorStorage): """ try: self._client.delete(ids) - logger.info(f"Successfully deleted {len(ids)} vectors from {self.namespace}") + logger.info( + f"Successfully deleted {len(ids)} vectors from {self.namespace}" + ) except Exception as e: logger.error(f"Error while deleting vectors from {self.namespace}: {e}") async def delete_entity(self, entity_name: str): try: entity_id = compute_mdhash_id(entity_name, prefix="ent-") - logger.debug(f"Attempting to delete entity {entity_name} with ID {entity_id}") + logger.debug( + f"Attempting to delete entity {entity_name} with ID {entity_id}" + ) # Check if the entity exists if self._client.get([entity_id]): await self.delete([entity_id]) @@ -208,15 +212,18 @@ class NanoVectorDBStorage(BaseVectorStorage): async def delete_entity_relation(self, entity_name: str): try: relations = [ - dp for dp in self.client_storage["data"] + dp + for dp in self.client_storage["data"] if dp["src_id"] == entity_name or dp["tgt_id"] == entity_name ] logger.debug(f"Found {len(relations)} relations for entity {entity_name}") ids_to_delete = [relation["__id__"] for relation in relations] - + if ids_to_delete: await self.delete(ids_to_delete) - logger.debug(f"Deleted {len(ids_to_delete)} relations for {entity_name}") + logger.debug( + f"Deleted {len(ids_to_delete)} relations for {entity_name}" + ) else: logger.debug(f"No relations found for entity {entity_name}") except Exception as e: @@ -446,4 +453,4 @@ class JsonDocStatusStorage(DocStatusStorage): """Delete document status by IDs""" for doc_id in doc_ids: self._data.pop(doc_id, None) - await self.index_done_callback() \ No newline at end of file + await self.index_done_callback() From 7e7230a6e3cca8963b41cff2affa2621862e5a44 Mon Sep 17 00:00:00 2001 From: JasonnnW3000 Date: Wed, 1 Jan 2025 00:20:46 -0500 Subject: [PATCH 4/4] Update LICENSE, fix license year Signed-off-by: JasonnnW3000 --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 61fc28be..c65e8258 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 Gustavo Ye +Copyright (c) 2025 Gustavo Ye Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal