From 8562ecdebca67f689b682275f63e42e6634fa47f Mon Sep 17 00:00:00 2001 From: Larfii <834462287@qq.com> Date: Mon, 25 Nov 2024 15:04:38 +0800 Subject: [PATCH 1/3] Add a progress bar --- lightrag/lightrag.py | 5 +++- lightrag/operate.py | 59 +++++++++++++++++++++++++++++++------------- lightrag/storage.py | 14 ++++++++--- 3 files changed, 57 insertions(+), 21 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 7fafadcf..28e72102 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1,5 +1,6 @@ import asyncio import os +from tqdm.asyncio import tqdm as tqdm_async from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial @@ -243,7 +244,9 @@ class LightRAG: logger.info(f"[New Docs] inserting {len(new_docs)} docs") inserting_chunks = {} - for doc_key, doc in new_docs.items(): + for doc_key, doc in tqdm_async( + new_docs.items(), desc="Chunking documents", unit="doc" + ): chunks = { compute_mdhash_id(dp["content"], prefix="chunk-"): { **dp, diff --git a/lightrag/operate.py b/lightrag/operate.py index cf236633..9e4b768a 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1,6 +1,7 @@ import asyncio import json import re +from tqdm.asyncio import tqdm as tqdm_async from typing import Union from collections import Counter, defaultdict import warnings @@ -329,11 +330,15 @@ async def extract_entities( ) return dict(maybe_nodes), dict(maybe_edges) - # use_llm_func is wrapped in ascynio.Semaphore, limiting max_async callings - results = await asyncio.gather( - *[_process_single_content(c) for c in ordered_chunks] - ) - print() # clear the progress bar + results = [] + for result in tqdm_async( + asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]), + total=len(ordered_chunks), + desc="Extracting entities from chunks", + unit="chunk", + ): + results.append(await result) + maybe_nodes = defaultdict(list) maybe_edges = defaultdict(list) for m_nodes, m_edges in results: @@ -341,18 +346,38 @@ async def extract_entities( maybe_nodes[k].extend(v) for k, v in m_edges.items(): maybe_edges[tuple(sorted(k))].extend(v) - all_entities_data = await asyncio.gather( - *[ - _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config) - for k, v in maybe_nodes.items() - ] - ) - all_relationships_data = await asyncio.gather( - *[ - _merge_edges_then_upsert(k[0], k[1], v, knowledge_graph_inst, global_config) - for k, v in maybe_edges.items() - ] - ) + logger.info("Inserting entities into storage...") + all_entities_data = [] + for result in tqdm_async( + asyncio.as_completed( + [ + _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config) + for k, v in maybe_nodes.items() + ] + ), + total=len(maybe_nodes), + desc="Inserting entities", + unit="entity", + ): + all_entities_data.append(await result) + + logger.info("Inserting relationships into storage...") + all_relationships_data = [] + for result in tqdm_async( + asyncio.as_completed( + [ + _merge_edges_then_upsert( + k[0], k[1], v, knowledge_graph_inst, global_config + ) + for k, v in maybe_edges.items() + ] + ), + total=len(maybe_edges), + desc="Inserting relationships", + unit="relationship", + ): + all_relationships_data.append(await result) + if not len(all_entities_data): logger.warning("Didn't extract any entities, maybe your LLM is not working") return None diff --git a/lightrag/storage.py b/lightrag/storage.py index 9a4c3d4c..007d6534 100644 --- a/lightrag/storage.py +++ b/lightrag/storage.py @@ -1,6 +1,7 @@ import asyncio import html import os +from tqdm.asyncio import tqdm as tqdm_async from dataclasses import dataclass from typing import Any, Union, cast import networkx as nx @@ -95,9 +96,16 @@ class NanoVectorDBStorage(BaseVectorStorage): contents[i : i + self._max_batch_size] for i in range(0, len(contents), self._max_batch_size) ] - embeddings_list = await asyncio.gather( - *[self.embedding_func(batch) for batch in batches] - ) + embedding_tasks = [self.embedding_func(batch) for batch in batches] + embeddings_list = [] + for f in tqdm_async( + asyncio.as_completed(embedding_tasks), + total=len(embedding_tasks), + desc="Generating embeddings", + unit="batch", + ): + embeddings = await f + embeddings_list.append(embeddings) embeddings = np.concatenate(embeddings_list) for i, d in enumerate(list_data): d["__vector__"] = embeddings[i] From cb492ccb04aca49aa26cab5885c2c4f881ebfc33 Mon Sep 17 00:00:00 2001 From: Larfii <834462287@qq.com> Date: Mon, 25 Nov 2024 18:06:19 +0800 Subject: [PATCH 2/3] Add custom KG insertion --- README.md | 44 ++++++++++++++ examples/insert_custom_kg.py | 108 +++++++++++++++++++++++++++++++++++ lightrag/__init__.py | 2 +- lightrag/lightrag.py | 102 +++++++++++++++++++++++++++++++++ 4 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 examples/insert_custom_kg.py diff --git a/README.md b/README.md index 6d5af135..155d20bb 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News +- [x] [2024.11.25]🎯📢LightRAG now supports [custom KG insertion](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg). - [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author! - [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py). - [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity). @@ -327,6 +328,49 @@ with open("./newText.txt") as f: rag.insert(f.read()) ``` +### Insert Custom KG + +```python +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=embedding_dimension, + max_token_size=8192, + func=embedding_func, + ), +) + +custom_kg = { + "entities": [ + { + "entity_name": "CompanyA", + "entity_type": "Organization", + "description": "A major technology company", + "source_id": "Source1" + }, + { + "entity_name": "ProductX", + "entity_type": "Product", + "description": "A popular product developed by CompanyA", + "source_id": "Source1" + } + ], + "relationships": [ + { + "src_id": "CompanyA", + "tgt_id": "ProductX", + "description": "CompanyA develops ProductX", + "keywords": "develop, produce", + "weight": 1.0, + "source_id": "Source1" + } + ] +} + +rag.insert_custom_kg(custom_kg) +``` + ### Delete Entity ```python diff --git a/examples/insert_custom_kg.py b/examples/insert_custom_kg.py new file mode 100644 index 00000000..bbabe6a9 --- /dev/null +++ b/examples/insert_custom_kg.py @@ -0,0 +1,108 @@ +import os +from lightrag import LightRAG, QueryParam +from lightrag.llm import gpt_4o_mini_complete +######### +# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert() +# import nest_asyncio +# nest_asyncio.apply() +######### + +WORKING_DIR = "./custom_kg" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=gpt_4o_mini_complete, # Use gpt_4o_mini_complete LLM model + # llm_model_func=gpt_4o_complete # Optionally, use a stronger model +) + +custom_kg = { + "entities": [ + { + "entity_name": "CompanyA", + "entity_type": "Organization", + "description": "A major technology company", + "source_id": "Source1" + }, + { + "entity_name": "ProductX", + "entity_type": "Product", + "description": "A popular product developed by CompanyA", + "source_id": "Source1" + }, + { + "entity_name": "PersonA", + "entity_type": "Person", + "description": "A renowned researcher in AI", + "source_id": "Source2" + }, + { + "entity_name": "UniversityB", + "entity_type": "Organization", + "description": "A leading university specializing in technology and sciences", + "source_id": "Source2" + }, + { + "entity_name": "CityC", + "entity_type": "Location", + "description": "A large metropolitan city known for its culture and economy", + "source_id": "Source3" + }, + { + "entity_name": "EventY", + "entity_type": "Event", + "description": "An annual technology conference held in CityC", + "source_id": "Source3" + }, + { + "entity_name": "CompanyD", + "entity_type": "Organization", + "description": "A financial services company specializing in insurance", + "source_id": "Source4" + }, + { + "entity_name": "ServiceZ", + "entity_type": "Service", + "description": "An insurance product offered by CompanyD", + "source_id": "Source4" + } + ], + "relationships": [ + { + "src_id": "CompanyA", + "tgt_id": "ProductX", + "description": "CompanyA develops ProductX", + "keywords": "develop, produce", + "weight": 1.0, + "source_id": "Source1" + }, + { + "src_id": "PersonA", + "tgt_id": "UniversityB", + "description": "PersonA works at UniversityB", + "keywords": "employment, affiliation", + "weight": 0.9, + "source_id": "Source2" + }, + { + "src_id": "CityC", + "tgt_id": "EventY", + "description": "EventY is hosted in CityC", + "keywords": "host, location", + "weight": 0.8, + "source_id": "Source3" + }, + { + "src_id": "CompanyD", + "tgt_id": "ServiceZ", + "description": "CompanyD provides ServiceZ", + "keywords": "provide, offer", + "weight": 1.0, + "source_id": "Source4" + } + ] +} + +rag.insert_custom_kg(custom_kg) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index c8b61765..a8b60e55 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam -__version__ = "1.0.1" +__version__ = "1.0.2" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 28e72102..d531f4f6 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -308,6 +308,108 @@ class LightRAG: tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback()) await asyncio.gather(*tasks) + def insert_custom_kg(self, custom_kg: dict): + loop = always_get_an_event_loop() + return loop.run_until_complete(self.ainsert_custom_kg(custom_kg)) + + async def ainsert_custom_kg(self, custom_kg: dict): + update_storage = False + try: + # Insert entities into knowledge graph + all_entities_data = [] + for entity_data in custom_kg.get("entities", []): + entity_name = f'"{entity_data["entity_name"].upper()}"' + entity_type = entity_data.get("entity_type", "UNKNOWN") + description = entity_data.get("description", "No description provided") + source_id = entity_data["source_id"] + + # Prepare node data + node_data = { + "entity_type": entity_type, + "description": description, + "source_id": source_id, + } + # Insert node data into the knowledge graph + await self.chunk_entity_relation_graph.upsert_node( + entity_name, node_data=node_data + ) + node_data["entity_name"] = entity_name + all_entities_data.append(node_data) + update_storage = True + + # Insert relationships into knowledge graph + all_relationships_data = [] + for relationship_data in custom_kg.get("relationships", []): + src_id = f'"{relationship_data["src_id"].upper()}"' + tgt_id = f'"{relationship_data["tgt_id"].upper()}"' + description = relationship_data["description"] + keywords = relationship_data["keywords"] + weight = relationship_data.get("weight", 1.0) + source_id = relationship_data["source_id"] + + # Check if nodes exist in the knowledge graph + for need_insert_id in [src_id, tgt_id]: + if not ( + await self.chunk_entity_relation_graph.has_node(need_insert_id) + ): + await self.chunk_entity_relation_graph.upsert_node( + need_insert_id, + node_data={ + "source_id": source_id, + "description": "UNKNOWN", + "entity_type": "UNKNOWN", + }, + ) + + # Insert edge into the knowledge graph + await self.chunk_entity_relation_graph.upsert_edge( + src_id, + tgt_id, + edge_data={ + "weight": weight, + "description": description, + "keywords": keywords, + "source_id": source_id, + }, + ) + edge_data = { + "src_id": src_id, + "tgt_id": tgt_id, + "description": description, + "keywords": keywords, + } + all_relationships_data.append(edge_data) + update_storage = True + + # Insert entities into vector storage if needed + if self.entities_vdb is not None: + data_for_vdb = { + compute_mdhash_id(dp["entity_name"], prefix="ent-"): { + "content": dp["entity_name"] + dp["description"], + "entity_name": dp["entity_name"], + } + for dp in all_entities_data + } + await self.entities_vdb.upsert(data_for_vdb) + + # Insert relationships into vector storage if needed + if self.relationships_vdb is not None: + data_for_vdb = { + compute_mdhash_id(dp["src_id"] + dp["tgt_id"], prefix="rel-"): { + "src_id": dp["src_id"], + "tgt_id": dp["tgt_id"], + "content": dp["keywords"] + + dp["src_id"] + + dp["tgt_id"] + + dp["description"], + } + for dp in all_relationships_data + } + await self.relationships_vdb.upsert(data_for_vdb) + finally: + if update_storage: + await self._insert_done() + def query(self, query: str, param: QueryParam = QueryParam()): loop = always_get_an_event_loop() return loop.run_until_complete(self.aquery(query, param)) From cbcc449683dc178f880b5b023b801282b1300129 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=9C=A8Data=20Intelligence=20Lab=40HKU=E2=9C=A8?= <118165258+HKUDS@users.noreply.github.com> Date: Mon, 25 Nov 2024 18:18:04 +0800 Subject: [PATCH 3/3] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 155d20bb..ce14e3bb 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,8 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News -- [x] [2024.11.25]🎯📢LightRAG now supports [custom KG insertion](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg). -- [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author! +- [x] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise. +- [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author. - [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py). - [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity). - [x] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge.