From abc56d68a26d9241a2418d6f56515045e2e5c757 Mon Sep 17 00:00:00 2001 From: Larfii <834462287@qq.com> Date: Mon, 25 Nov 2024 18:06:19 +0800 Subject: [PATCH] Add custom KG insertion --- README.md | 44 ++++++++++++++ examples/insert_custom_kg.py | 108 +++++++++++++++++++++++++++++++++++ lightrag/__init__.py | 2 +- lightrag/lightrag.py | 102 +++++++++++++++++++++++++++++++++ 4 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 examples/insert_custom_kg.py diff --git a/README.md b/README.md index 6d5af135..155d20bb 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News +- [x] [2024.11.25]🎯📢LightRAG now supports [custom KG insertion](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg). - [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author! - [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py). - [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity). @@ -327,6 +328,49 @@ with open("./newText.txt") as f: rag.insert(f.read()) ``` +### Insert Custom KG + +```python +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=embedding_dimension, + max_token_size=8192, + func=embedding_func, + ), +) + +custom_kg = { + "entities": [ + { + "entity_name": "CompanyA", + "entity_type": "Organization", + "description": "A major technology company", + "source_id": "Source1" + }, + { + "entity_name": "ProductX", + "entity_type": "Product", + "description": "A popular product developed by CompanyA", + "source_id": "Source1" + } + ], + "relationships": [ + { + "src_id": "CompanyA", + "tgt_id": "ProductX", + "description": "CompanyA develops ProductX", + "keywords": "develop, produce", + "weight": 1.0, + "source_id": "Source1" + } + ] +} + +rag.insert_custom_kg(custom_kg) +``` + ### Delete Entity ```python diff --git a/examples/insert_custom_kg.py b/examples/insert_custom_kg.py new file mode 100644 index 00000000..bbabe6a9 --- /dev/null +++ b/examples/insert_custom_kg.py @@ -0,0 +1,108 @@ +import os +from lightrag import LightRAG, QueryParam +from lightrag.llm import gpt_4o_mini_complete +######### +# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert() +# import nest_asyncio +# nest_asyncio.apply() +######### + +WORKING_DIR = "./custom_kg" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=gpt_4o_mini_complete, # Use gpt_4o_mini_complete LLM model + # llm_model_func=gpt_4o_complete # Optionally, use a stronger model +) + +custom_kg = { + "entities": [ + { + "entity_name": "CompanyA", + "entity_type": "Organization", + "description": "A major technology company", + "source_id": "Source1" + }, + { + "entity_name": "ProductX", + "entity_type": "Product", + "description": "A popular product developed by CompanyA", + "source_id": "Source1" + }, + { + "entity_name": "PersonA", + "entity_type": "Person", + "description": "A renowned researcher in AI", + "source_id": "Source2" + }, + { + "entity_name": "UniversityB", + "entity_type": "Organization", + "description": "A leading university specializing in technology and sciences", + "source_id": "Source2" + }, + { + "entity_name": "CityC", + "entity_type": "Location", + "description": "A large metropolitan city known for its culture and economy", + "source_id": "Source3" + }, + { + "entity_name": "EventY", + "entity_type": "Event", + "description": "An annual technology conference held in CityC", + "source_id": "Source3" + }, + { + "entity_name": "CompanyD", + "entity_type": "Organization", + "description": "A financial services company specializing in insurance", + "source_id": "Source4" + }, + { + "entity_name": "ServiceZ", + "entity_type": "Service", + "description": "An insurance product offered by CompanyD", + "source_id": "Source4" + } + ], + "relationships": [ + { + "src_id": "CompanyA", + "tgt_id": "ProductX", + "description": "CompanyA develops ProductX", + "keywords": "develop, produce", + "weight": 1.0, + "source_id": "Source1" + }, + { + "src_id": "PersonA", + "tgt_id": "UniversityB", + "description": "PersonA works at UniversityB", + "keywords": "employment, affiliation", + "weight": 0.9, + "source_id": "Source2" + }, + { + "src_id": "CityC", + "tgt_id": "EventY", + "description": "EventY is hosted in CityC", + "keywords": "host, location", + "weight": 0.8, + "source_id": "Source3" + }, + { + "src_id": "CompanyD", + "tgt_id": "ServiceZ", + "description": "CompanyD provides ServiceZ", + "keywords": "provide, offer", + "weight": 1.0, + "source_id": "Source4" + } + ] +} + +rag.insert_custom_kg(custom_kg) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index c8b61765..a8b60e55 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam -__version__ = "1.0.1" +__version__ = "1.0.2" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 28e72102..d531f4f6 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -308,6 +308,108 @@ class LightRAG: tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback()) await asyncio.gather(*tasks) + def insert_custom_kg(self, custom_kg: dict): + loop = always_get_an_event_loop() + return loop.run_until_complete(self.ainsert_custom_kg(custom_kg)) + + async def ainsert_custom_kg(self, custom_kg: dict): + update_storage = False + try: + # Insert entities into knowledge graph + all_entities_data = [] + for entity_data in custom_kg.get("entities", []): + entity_name = f'"{entity_data["entity_name"].upper()}"' + entity_type = entity_data.get("entity_type", "UNKNOWN") + description = entity_data.get("description", "No description provided") + source_id = entity_data["source_id"] + + # Prepare node data + node_data = { + "entity_type": entity_type, + "description": description, + "source_id": source_id, + } + # Insert node data into the knowledge graph + await self.chunk_entity_relation_graph.upsert_node( + entity_name, node_data=node_data + ) + node_data["entity_name"] = entity_name + all_entities_data.append(node_data) + update_storage = True + + # Insert relationships into knowledge graph + all_relationships_data = [] + for relationship_data in custom_kg.get("relationships", []): + src_id = f'"{relationship_data["src_id"].upper()}"' + tgt_id = f'"{relationship_data["tgt_id"].upper()}"' + description = relationship_data["description"] + keywords = relationship_data["keywords"] + weight = relationship_data.get("weight", 1.0) + source_id = relationship_data["source_id"] + + # Check if nodes exist in the knowledge graph + for need_insert_id in [src_id, tgt_id]: + if not ( + await self.chunk_entity_relation_graph.has_node(need_insert_id) + ): + await self.chunk_entity_relation_graph.upsert_node( + need_insert_id, + node_data={ + "source_id": source_id, + "description": "UNKNOWN", + "entity_type": "UNKNOWN", + }, + ) + + # Insert edge into the knowledge graph + await self.chunk_entity_relation_graph.upsert_edge( + src_id, + tgt_id, + edge_data={ + "weight": weight, + "description": description, + "keywords": keywords, + "source_id": source_id, + }, + ) + edge_data = { + "src_id": src_id, + "tgt_id": tgt_id, + "description": description, + "keywords": keywords, + } + all_relationships_data.append(edge_data) + update_storage = True + + # Insert entities into vector storage if needed + if self.entities_vdb is not None: + data_for_vdb = { + compute_mdhash_id(dp["entity_name"], prefix="ent-"): { + "content": dp["entity_name"] + dp["description"], + "entity_name": dp["entity_name"], + } + for dp in all_entities_data + } + await self.entities_vdb.upsert(data_for_vdb) + + # Insert relationships into vector storage if needed + if self.relationships_vdb is not None: + data_for_vdb = { + compute_mdhash_id(dp["src_id"] + dp["tgt_id"], prefix="rel-"): { + "src_id": dp["src_id"], + "tgt_id": dp["tgt_id"], + "content": dp["keywords"] + + dp["src_id"] + + dp["tgt_id"] + + dp["description"], + } + for dp in all_relationships_data + } + await self.relationships_vdb.upsert(data_for_vdb) + finally: + if update_storage: + await self._insert_done() + def query(self, query: str, param: QueryParam = QueryParam()): loop = always_get_an_event_loop() return loop.run_until_complete(self.aquery(query, param))