diff --git a/.gitignore b/.gitignore index 65aaaa02..942c2c25 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ neo4jWorkDir/ ignore_this.txt .venv/ *.ignore.* +.ruff_cache/ diff --git a/README.md b/README.md index f0276bcd..cb4e2b02 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ - +

@@ -22,7 +22,8 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News -- [x] [2024.11.11]🎯📢You can [use Oracle Database 23ai for all storage types (kv/vector/graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py) now. +- [x] [2024.11.12]🎯📢You can [use Oracle Database 23ai for all storage types (kv/vector/graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py) now. +- [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity). - [x] [2024.11.09]🎯📢Now comes [LightRAG Gui](https://lightrag-gui.streamlit.app) that lets you insert, query, visualize, and download LightRAG knowledge. - [x] [2024.11.04]🎯📢You can [use Neo4J for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-neo4j-for-storage) now. - [x] [2024.10.29]🎯📢LightRAG now supports multiple file types, including PDF, DOC, PPT, and CSV via `textract`. @@ -319,6 +320,23 @@ with open("./newText.txt") as f: rag.insert(f.read()) ``` +### Delete Entity + +```python +# Delete Entity: Deleting entities by their names +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=embedding_dimension, + max_token_size=8192, + func=embedding_func, + ), +) + +rag.delete_by_entity("Project Gutenberg") +``` + ### Multi-file Type Support The `textract` supports reading file types such as TXT, DOCX, PPTX, CSV, and PDF. 
diff --git a/lightrag/__init__.py b/lightrag/__init__.py index b73db1b9..6d9003ff 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam -__version__ = "0.0.9" +__version__ = "1.0.0" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" diff --git a/lightrag/base.py b/lightrag/base.py index b88acae2..379efeb3 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -118,7 +118,7 @@ class BaseGraphStorage(StorageNameSpace): ): raise NotImplementedError - async def clustering(self, algorithm: str): + async def delete_node(self, node_id: str): raise NotImplementedError async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index 1d8b5002..7340ad7d 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -592,7 +592,9 @@ TABLES = { workspace varchar(1024), doc_name varchar(1024), content CLOB, - meta JSON + meta JSON, + createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updatetime TIMESTAMP DEFAULT NULL )"""}, "LIGHTRAG_DOC_CHUNKS": @@ -603,7 +605,9 @@ TABLES = { chunk_order_index NUMBER, tokens NUMBER, content CLOB, - content_vector VECTOR + content_vector VECTOR, + createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updatetime TIMESTAMP DEFAULT NULL )"""}, "LIGHTRAG_GRAPH_NODES": @@ -615,7 +619,9 @@ TABLES = { description CLOB, source_chunk_id varchar(256), content CLOB, - content_vector VECTOR + content_vector VECTOR, + createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updatetime TIMESTAMP DEFAULT NULL )"""}, "LIGHTRAG_GRAPH_EDGES": {"ddl":"""CREATE TABLE LIGHTRAG_GRAPH_EDGES ( @@ -628,13 +634,18 @@ TABLES = { description CLOB, source_chunk_id varchar(256), content CLOB, - content_vector VECTOR + content_vector VECTOR, + createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updatetime TIMESTAMP DEFAULT NULL )"""}, "LIGHTRAG_LLM_CACHE": {"ddl":"""CREATE TABLE 
LIGHTRAG_LLM_CACHE ( id varchar(256) PRIMARY KEY, + send clob, return clob, - model varchar(1024) + model varchar(1024), + createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updatetime TIMESTAMP DEFAULT NULL )"""}, "LIGHTRAG_GRAPH": diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 40f4eca3..52786970 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -351,3 +351,34 @@ class LightRAG: continue tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback()) await asyncio.gather(*tasks) + + def delete_by_entity(self, entity_name: str): + loop = always_get_an_event_loop() + return loop.run_until_complete(self.adelete_by_entity(entity_name)) + + async def adelete_by_entity(self, entity_name: str): + entity_name = f'"{entity_name.upper()}"' + + try: + await self.entities_vdb.delete_entity(entity_name) + await self.relationships_vdb.delete_relation(entity_name) + await self.chunk_entity_relation_graph.delete_node(entity_name) + + logger.info( + f"Entity '{entity_name}' and its relationships have been deleted." 
+ ) + await self._delete_by_entity_done() + except Exception as e: + logger.error(f"Error while deleting entity '{entity_name}': {e}") + + async def _delete_by_entity_done(self): + tasks = [] + for storage_inst in [ + self.entities_vdb, + self.relationships_vdb, + self.chunk_entity_relation_graph, + ]: + if storage_inst is None: + continue + tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback()) + await asyncio.gather(*tasks) diff --git a/lightrag/storage.py b/lightrag/storage.py index 61bebf2d..9a4c3d4c 100644 --- a/lightrag/storage.py +++ b/lightrag/storage.py @@ -7,7 +7,13 @@ import networkx as nx import numpy as np from nano_vectordb import NanoVectorDB -from .utils import load_json, logger, write_json +from .utils import ( + logger, + load_json, + write_json, + compute_mdhash_id, +) + from .base import ( BaseGraphStorage, BaseKVStorage, @@ -111,6 +117,43 @@ class NanoVectorDBStorage(BaseVectorStorage): ] return results + @property + def client_storage(self): + return getattr(self._client, "_NanoVectorDB__storage") + + async def delete_entity(self, entity_name: str): + try: + entity_id = [compute_mdhash_id(entity_name, prefix="ent-")] + + if self._client.get(entity_id): + self._client.delete(entity_id) + logger.info(f"Entity {entity_name} has been deleted.") + else: + logger.info(f"No entity found with name {entity_name}.") + except Exception as e: + logger.error(f"Error while deleting entity {entity_name}: {e}") + + async def delete_relation(self, entity_name: str): + try: + relations = [ + dp + for dp in self.client_storage["data"] + if dp["src_id"] == entity_name or dp["tgt_id"] == entity_name + ] + ids_to_delete = [relation["__id__"] for relation in relations] + + if ids_to_delete: + self._client.delete(ids_to_delete) + logger.info( + f"All relations related to entity {entity_name} have been deleted." 
+ ) + else: + logger.info(f"No relations found for entity {entity_name}.") + except Exception as e: + logger.error( + f"Error while deleting relations for entity {entity_name}: {e}" + ) + async def index_done_callback(self): self._client.save() @@ -228,6 +271,18 @@ class NetworkXStorage(BaseGraphStorage): ): self._graph.add_edge(source_node_id, target_node_id, **edge_data) + async def delete_node(self, node_id: str): + """ + Delete a node from the graph based on the specified node_id. + + :param node_id: The node_id to delete + """ + if self._graph.has_node(node_id): + self._graph.remove_node(node_id) + logger.info(f"Node {node_id} deleted from the graph.") + else: + logger.warning(f"Node {node_id} not found in the graph for deletion.") + async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: if algorithm not in self._node_embed_algorithms: raise ValueError(f"Node embedding algorithm {algorithm} not supported")