From 8562ecdebca67f689b682275f63e42e6634fa47f Mon Sep 17 00:00:00 2001
From: Larfii <834462287@qq.com>
Date: Mon, 25 Nov 2024 15:04:38 +0800
Subject: [PATCH 1/3] Add a progress bar

---
 lightrag/lightrag.py |  5 +++-
 lightrag/operate.py  | 59 +++++++++++++++++++++++++++++++-------------
 lightrag/storage.py  | 14 ++++++++---
 3 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 7fafadcf..28e72102 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -1,5 +1,6 @@
 import asyncio
 import os
+from tqdm.asyncio import tqdm as tqdm_async
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from functools import partial
@@ -243,7 +244,9 @@ class LightRAG:
             logger.info(f"[New Docs] inserting {len(new_docs)} docs")
 
             inserting_chunks = {}
-            for doc_key, doc in new_docs.items():
+            for doc_key, doc in tqdm_async(
+                new_docs.items(), desc="Chunking documents", unit="doc"
+            ):
                 chunks = {
                     compute_mdhash_id(dp["content"], prefix="chunk-"): {
                         **dp,
diff --git a/lightrag/operate.py b/lightrag/operate.py
index cf236633..9e4b768a 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -1,6 +1,7 @@
 import asyncio
 import json
 import re
+from tqdm.asyncio import tqdm as tqdm_async
 from typing import Union
 from collections import Counter, defaultdict
 import warnings
@@ -329,11 +330,15 @@ async def extract_entities(
         )
         return dict(maybe_nodes), dict(maybe_edges)
 
-    # use_llm_func is wrapped in ascynio.Semaphore, limiting max_async callings
-    results = await asyncio.gather(
-        *[_process_single_content(c) for c in ordered_chunks]
-    )
-    print()  # clear the progress bar
+    results = []
+    for result in tqdm_async(
+        asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]),
+        total=len(ordered_chunks),
+        desc="Extracting entities from chunks",
+        unit="chunk",
+    ):
+        results.append(await result)
+
     maybe_nodes = defaultdict(list)
     maybe_edges = defaultdict(list)
     for m_nodes, m_edges in results:
@@ -341,18 +346,38 @@ async def extract_entities(
             maybe_nodes[k].extend(v)
         for k, v in m_edges.items():
             maybe_edges[tuple(sorted(k))].extend(v)
-    all_entities_data = await asyncio.gather(
-        *[
-            _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
-            for k, v in maybe_nodes.items()
-        ]
-    )
-    all_relationships_data = await asyncio.gather(
-        *[
-            _merge_edges_then_upsert(k[0], k[1], v, knowledge_graph_inst, global_config)
-            for k, v in maybe_edges.items()
-        ]
-    )
+    logger.info("Inserting entities into storage...")
+    all_entities_data = []
+    for result in tqdm_async(
+        asyncio.as_completed(
+            [
+                _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
+                for k, v in maybe_nodes.items()
+            ]
+        ),
+        total=len(maybe_nodes),
+        desc="Inserting entities",
+        unit="entity",
+    ):
+        all_entities_data.append(await result)
+
+    logger.info("Inserting relationships into storage...")
+    all_relationships_data = []
+    for result in tqdm_async(
+        asyncio.as_completed(
+            [
+                _merge_edges_then_upsert(
+                    k[0], k[1], v, knowledge_graph_inst, global_config
+                )
+                for k, v in maybe_edges.items()
+            ]
+        ),
+        total=len(maybe_edges),
+        desc="Inserting relationships",
+        unit="relationship",
+    ):
+        all_relationships_data.append(await result)
+
     if not len(all_entities_data):
         logger.warning("Didn't extract any entities, maybe your LLM is not working")
         return None
diff --git a/lightrag/storage.py b/lightrag/storage.py
index 9a4c3d4c..007d6534 100644
--- a/lightrag/storage.py
+++ b/lightrag/storage.py
@@ -1,6 +1,7 @@
 import asyncio
 import html
 import os
+from tqdm.asyncio import tqdm as tqdm_async
 from dataclasses import dataclass
 from typing import Any, Union, cast
 import networkx as nx
@@ -95,9 +96,16 @@ class NanoVectorDBStorage(BaseVectorStorage):
             contents[i : i + self._max_batch_size]
             for i in range(0, len(contents), self._max_batch_size)
         ]
-        embeddings_list = await asyncio.gather(
-            *[self.embedding_func(batch) for batch in batches]
-        )
+        embedding_tasks = [self.embedding_func(batch) for batch in batches]
+        embeddings_list = []
+        for f in tqdm_async(
+            asyncio.as_completed(embedding_tasks),
+            total=len(embedding_tasks),
+            desc="Generating embeddings",
+            unit="batch",
+        ):
+            embeddings = await f
+            embeddings_list.append(embeddings)
         embeddings = np.concatenate(embeddings_list)
         for i, d in enumerate(list_data):
             d["__vector__"] = embeddings[i]

From cb492ccb04aca49aa26cab5885c2c4f881ebfc33 Mon Sep 17 00:00:00 2001
From: Larfii <834462287@qq.com>
Date: Mon, 25 Nov 2024 18:06:19 +0800
Subject: [PATCH 2/3] Add custom KG insertion

---
 README.md                    |  44 ++++++++++++++
 examples/insert_custom_kg.py | 108 +++++++++++++++++++++++++++++++++++
 lightrag/__init__.py         |   2 +-
 lightrag/lightrag.py         | 102 +++++++++++++++++++++++++++++++++
 4 files changed, 255 insertions(+), 1 deletion(-)
 create mode 100644 examples/insert_custom_kg.py

diff --git a/README.md b/README.md
index 6d5af135..155d20bb 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ This repository hosts the code of LightRAG. The structure of this code is based
 </div>
 
 ## 🎉 News
+- [x] [2024.11.25]🎯📢LightRAG now supports [custom KG insertion](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg).
 - [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author!
 - [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py).
 - [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity).
@@ -327,6 +328,49 @@ with open("./newText.txt") as f:
     rag.insert(f.read())
 ```
 
+### Insert Custom KG
+
+```python
+rag = LightRAG(
+     working_dir=WORKING_DIR,
+     llm_model_func=llm_model_func,
+     embedding_func=EmbeddingFunc(
+          embedding_dim=embedding_dimension,
+          max_token_size=8192,
+          func=embedding_func,
+     ),
+)
+
+custom_kg = {
+    "entities": [
+        {
+            "entity_name": "CompanyA",
+            "entity_type": "Organization",
+            "description": "A major technology company",
+            "source_id": "Source1"
+        },
+        {
+            "entity_name": "ProductX",
+            "entity_type": "Product",
+            "description": "A popular product developed by CompanyA",
+            "source_id": "Source1"
+        }
+    ],
+    "relationships": [
+        {
+            "src_id": "CompanyA",
+            "tgt_id": "ProductX",
+            "description": "CompanyA develops ProductX",
+            "keywords": "develop, produce",
+            "weight": 1.0,
+            "source_id": "Source1"
+        }
+    ]
+}
+
+rag.insert_custom_kg(custom_kg)
+```
+
 ### Delete Entity
 
 ```python
diff --git a/examples/insert_custom_kg.py b/examples/insert_custom_kg.py
new file mode 100644
index 00000000..bbabe6a9
--- /dev/null
+++ b/examples/insert_custom_kg.py
@@ -0,0 +1,108 @@
+import os
+from lightrag import LightRAG, QueryParam
+from lightrag.llm import gpt_4o_mini_complete
+#########
+# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert()
+# import nest_asyncio
+# nest_asyncio.apply()
+#########
+
+WORKING_DIR = "./custom_kg"
+
+if not os.path.exists(WORKING_DIR):
+    os.mkdir(WORKING_DIR)
+
+rag = LightRAG(
+    working_dir=WORKING_DIR,
+    llm_model_func=gpt_4o_mini_complete,  # Use gpt_4o_mini_complete LLM model
+    # llm_model_func=gpt_4o_complete  # Optionally, use a stronger model
+)
+
+custom_kg = {
+    "entities": [
+        {
+            "entity_name": "CompanyA",
+            "entity_type": "Organization",
+            "description": "A major technology company",
+            "source_id": "Source1"
+        },
+        {
+            "entity_name": "ProductX",
+            "entity_type": "Product",
+            "description": "A popular product developed by CompanyA",
+            "source_id": "Source1"
+        },
+        {
+            "entity_name": "PersonA",
+            "entity_type": "Person",
+            "description": "A renowned researcher in AI",
+            "source_id": "Source2"
+        },
+        {
+            "entity_name": "UniversityB",
+            "entity_type": "Organization",
+            "description": "A leading university specializing in technology and sciences",
+            "source_id": "Source2"
+        },
+        {
+            "entity_name": "CityC",
+            "entity_type": "Location",
+            "description": "A large metropolitan city known for its culture and economy",
+            "source_id": "Source3"
+        },
+        {
+            "entity_name": "EventY",
+            "entity_type": "Event",
+            "description": "An annual technology conference held in CityC",
+            "source_id": "Source3"
+        },
+        {
+            "entity_name": "CompanyD",
+            "entity_type": "Organization",
+            "description": "A financial services company specializing in insurance",
+            "source_id": "Source4"
+        },
+        {
+            "entity_name": "ServiceZ",
+            "entity_type": "Service",
+            "description": "An insurance product offered by CompanyD",
+            "source_id": "Source4"
+        }
+    ],
+    "relationships": [
+        {
+            "src_id": "CompanyA",
+            "tgt_id": "ProductX",
+            "description": "CompanyA develops ProductX",
+            "keywords": "develop, produce",
+            "weight": 1.0,
+            "source_id": "Source1"
+        },
+        {
+            "src_id": "PersonA",
+            "tgt_id": "UniversityB",
+            "description": "PersonA works at UniversityB",
+            "keywords": "employment, affiliation",
+            "weight": 0.9,
+            "source_id": "Source2"
+        },
+        {
+            "src_id": "CityC",
+            "tgt_id": "EventY",
+            "description": "EventY is hosted in CityC",
+            "keywords": "host, location",
+            "weight": 0.8,
+            "source_id": "Source3"
+        },
+        {
+            "src_id": "CompanyD",
+            "tgt_id": "ServiceZ",
+            "description": "CompanyD provides ServiceZ",
+            "keywords": "provide, offer",
+            "weight": 1.0,
+            "source_id": "Source4"
+        }
+    ]
+}
+
+rag.insert_custom_kg(custom_kg)
diff --git a/lightrag/__init__.py b/lightrag/__init__.py
index c8b61765..a8b60e55 100644
--- a/lightrag/__init__.py
+++ b/lightrag/__init__.py
@@ -1,5 +1,5 @@
 from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
 
-__version__ = "1.0.1"
+__version__ = "1.0.2"
 __author__ = "Zirui Guo"
 __url__ = "https://github.com/HKUDS/LightRAG"
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 28e72102..d531f4f6 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -308,6 +308,108 @@ class LightRAG:
             tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback())
         await asyncio.gather(*tasks)
 
+    def insert_custom_kg(self, custom_kg: dict):
+        loop = always_get_an_event_loop()
+        return loop.run_until_complete(self.ainsert_custom_kg(custom_kg))
+
+    async def ainsert_custom_kg(self, custom_kg: dict):
+        update_storage = False
+        try:
+            # Insert entities into knowledge graph
+            all_entities_data = []
+            for entity_data in custom_kg.get("entities", []):
+                entity_name = f'"{entity_data["entity_name"].upper()}"'
+                entity_type = entity_data.get("entity_type", "UNKNOWN")
+                description = entity_data.get("description", "No description provided")
+                source_id = entity_data["source_id"]
+
+                # Prepare node data
+                node_data = {
+                    "entity_type": entity_type,
+                    "description": description,
+                    "source_id": source_id,
+                }
+                # Insert node data into the knowledge graph
+                await self.chunk_entity_relation_graph.upsert_node(
+                    entity_name, node_data=node_data
+                )
+                node_data["entity_name"] = entity_name
+                all_entities_data.append(node_data)
+                update_storage = True
+
+            # Insert relationships into knowledge graph
+            all_relationships_data = []
+            for relationship_data in custom_kg.get("relationships", []):
+                src_id = f'"{relationship_data["src_id"].upper()}"'
+                tgt_id = f'"{relationship_data["tgt_id"].upper()}"'
+                description = relationship_data["description"]
+                keywords = relationship_data["keywords"]
+                weight = relationship_data.get("weight", 1.0)
+                source_id = relationship_data["source_id"]
+
+                # Check if nodes exist in the knowledge graph
+                for need_insert_id in [src_id, tgt_id]:
+                    if not (
+                        await self.chunk_entity_relation_graph.has_node(need_insert_id)
+                    ):
+                        await self.chunk_entity_relation_graph.upsert_node(
+                            need_insert_id,
+                            node_data={
+                                "source_id": source_id,
+                                "description": "UNKNOWN",
+                                "entity_type": "UNKNOWN",
+                            },
+                        )
+
+                # Insert edge into the knowledge graph
+                await self.chunk_entity_relation_graph.upsert_edge(
+                    src_id,
+                    tgt_id,
+                    edge_data={
+                        "weight": weight,
+                        "description": description,
+                        "keywords": keywords,
+                        "source_id": source_id,
+                    },
+                )
+                edge_data = {
+                    "src_id": src_id,
+                    "tgt_id": tgt_id,
+                    "description": description,
+                    "keywords": keywords,
+                }
+                all_relationships_data.append(edge_data)
+                update_storage = True
+
+            # Insert entities into vector storage if needed
+            if self.entities_vdb is not None:
+                data_for_vdb = {
+                    compute_mdhash_id(dp["entity_name"], prefix="ent-"): {
+                        "content": dp["entity_name"] + dp["description"],
+                        "entity_name": dp["entity_name"],
+                    }
+                    for dp in all_entities_data
+                }
+                await self.entities_vdb.upsert(data_for_vdb)
+
+            # Insert relationships into vector storage if needed
+            if self.relationships_vdb is not None:
+                data_for_vdb = {
+                    compute_mdhash_id(dp["src_id"] + dp["tgt_id"], prefix="rel-"): {
+                        "src_id": dp["src_id"],
+                        "tgt_id": dp["tgt_id"],
+                        "content": dp["keywords"]
+                        + dp["src_id"]
+                        + dp["tgt_id"]
+                        + dp["description"],
+                    }
+                    for dp in all_relationships_data
+                }
+                await self.relationships_vdb.upsert(data_for_vdb)
+        finally:
+            if update_storage:
+                await self._insert_done()
+
     def query(self, query: str, param: QueryParam = QueryParam()):
         loop = always_get_an_event_loop()
         return loop.run_until_complete(self.aquery(query, param))

From cbcc449683dc178f880b5b023b801282b1300129 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=9C=A8Data=20Intelligence=20Lab=40HKU=E2=9C=A8?=
 <118165258+HKUDS@users.noreply.github.com>
Date: Mon, 25 Nov 2024 18:18:04 +0800
Subject: [PATCH 3/3] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 155d20bb..ce14e3bb 100644
--- a/README.md
+++ b/README.md
@@ -26,8 +26,8 @@ This repository hosts the code of LightRAG. The structure of this code is based
 </div>
 
 ## 🎉 News
-- [x] [2024.11.25]🎯📢LightRAG now supports [custom KG insertion](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg).
-- [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author!
+- [x] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise.
+- [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author.
 - [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py).
 - [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity).
 - [x] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge.