Merge branch 'HKUDS:main' into main

README.md (46 changes)
@@ -26,7 +26,8 @@ This repository hosts the code of LightRAG. The structure of this code is based
 </div>

 ## 🎉 News
+- [x] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise.
 - [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author!
 - [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py).
 - [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity).
 - [x] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge.
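For reference, the delete-by-name API mentioned in the 2024.11.11 item above is a one-liner; a minimal sketch (the entity name is illustrative):

```python
# Remove an entity, by the name it was stored under, from the indexed knowledge graph.
rag.delete_by_entity("CompanyA")
```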
@@ -327,6 +328,49 @@ with open("./newText.txt") as f:
 rag.insert(f.read())
 ```

+### Insert Custom KG
+
+```python
+rag = LightRAG(
+    working_dir=WORKING_DIR,
+    llm_model_func=llm_model_func,
+    embedding_func=EmbeddingFunc(
+        embedding_dim=embedding_dimension,
+        max_token_size=8192,
+        func=embedding_func,
+    ),
+)
+
+custom_kg = {
+    "entities": [
+        {
+            "entity_name": "CompanyA",
+            "entity_type": "Organization",
+            "description": "A major technology company",
+            "source_id": "Source1"
+        },
+        {
+            "entity_name": "ProductX",
+            "entity_type": "Product",
+            "description": "A popular product developed by CompanyA",
+            "source_id": "Source1"
+        }
+    ],
+    "relationships": [
+        {
+            "src_id": "CompanyA",
+            "tgt_id": "ProductX",
+            "description": "CompanyA develops ProductX",
+            "keywords": "develop, produce",
+            "weight": 1.0,
+            "source_id": "Source1"
+        }
+    ]
+}
+
+rag.insert_custom_kg(custom_kg)
+```
+
 ### Delete Entity

 ```python
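Once a custom KG has been inserted, it is queryable like any other indexed content. A minimal sketch (the question string is illustrative; the `QueryParam` modes follow the existing README):

```python
from lightrag import QueryParam

# "hybrid" combines local (entity-level) and global (relationship-level) retrieval.
print(rag.query("What does CompanyA develop?", param=QueryParam(mode="hybrid")))
```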
examples/insert_custom_kg.py (new file, 108 lines)
@@ -0,0 +1,108 @@
+import os
+from lightrag import LightRAG, QueryParam
+from lightrag.llm import gpt_4o_mini_complete
+#########
+# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert()
+# import nest_asyncio
+# nest_asyncio.apply()
+#########
+
+WORKING_DIR = "./custom_kg"
+
+if not os.path.exists(WORKING_DIR):
+    os.mkdir(WORKING_DIR)
+
+rag = LightRAG(
+    working_dir=WORKING_DIR,
+    llm_model_func=gpt_4o_mini_complete,  # Use gpt_4o_mini_complete LLM model
+    # llm_model_func=gpt_4o_complete  # Optionally, use a stronger model
+)
+
+custom_kg = {
+    "entities": [
+        {
+            "entity_name": "CompanyA",
+            "entity_type": "Organization",
+            "description": "A major technology company",
+            "source_id": "Source1"
+        },
+        {
+            "entity_name": "ProductX",
+            "entity_type": "Product",
+            "description": "A popular product developed by CompanyA",
+            "source_id": "Source1"
+        },
+        {
+            "entity_name": "PersonA",
+            "entity_type": "Person",
+            "description": "A renowned researcher in AI",
+            "source_id": "Source2"
+        },
+        {
+            "entity_name": "UniversityB",
+            "entity_type": "Organization",
+            "description": "A leading university specializing in technology and sciences",
+            "source_id": "Source2"
+        },
+        {
+            "entity_name": "CityC",
+            "entity_type": "Location",
+            "description": "A large metropolitan city known for its culture and economy",
+            "source_id": "Source3"
+        },
+        {
+            "entity_name": "EventY",
+            "entity_type": "Event",
+            "description": "An annual technology conference held in CityC",
+            "source_id": "Source3"
+        },
+        {
+            "entity_name": "CompanyD",
+            "entity_type": "Organization",
+            "description": "A financial services company specializing in insurance",
+            "source_id": "Source4"
+        },
+        {
+            "entity_name": "ServiceZ",
+            "entity_type": "Service",
+            "description": "An insurance product offered by CompanyD",
+            "source_id": "Source4"
+        }
+    ],
+    "relationships": [
+        {
+            "src_id": "CompanyA",
+            "tgt_id": "ProductX",
+            "description": "CompanyA develops ProductX",
+            "keywords": "develop, produce",
+            "weight": 1.0,
+            "source_id": "Source1"
+        },
+        {
+            "src_id": "PersonA",
+            "tgt_id": "UniversityB",
+            "description": "PersonA works at UniversityB",
+            "keywords": "employment, affiliation",
+            "weight": 0.9,
+            "source_id": "Source2"
+        },
+        {
+            "src_id": "CityC",
+            "tgt_id": "EventY",
+            "description": "EventY is hosted in CityC",
+            "keywords": "host, location",
+            "weight": 0.8,
+            "source_id": "Source3"
+        },
+        {
+            "src_id": "CompanyD",
+            "tgt_id": "ServiceZ",
+            "description": "CompanyD provides ServiceZ",
+            "keywords": "provide, offer",
+            "weight": 1.0,
+            "source_id": "Source4"
+        }
+    ]
+}
+
+rag.insert_custom_kg(custom_kg)
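A note on running this example (an assumption, not stated in the diff): `gpt_4o_mini_complete` calls the OpenAI API, so a key must be available before the script executes, e.g.:

```python
import os

# Hypothetical setup; substitute your own key or export it in the shell instead.
os.environ["OPENAI_API_KEY"] = "sk-..."
```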
lightrag/__init__.py
@@ -1,5 +1,5 @@
 from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam

-__version__ = "1.0.1"
+__version__ = "1.0.2"
 __author__ = "Zirui Guo"
 __url__ = "https://github.com/HKUDS/LightRAG"
lightrag/lightrag.py
@@ -1,5 +1,6 @@
 import asyncio
 import os
+from tqdm.asyncio import tqdm as tqdm_async
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from functools import partial
@@ -242,7 +243,9 @@ class LightRAG:
         logger.info(f"[New Docs] inserting {len(new_docs)} docs")

         inserting_chunks = {}
-        for doc_key, doc in new_docs.items():
+        for doc_key, doc in tqdm_async(
+            new_docs.items(), desc="Chunking documents", unit="doc"
+        ):
             chunks = {
                 compute_mdhash_id(dp["content"], prefix="chunk-"): {
                     **dp,
@@ -304,6 +307,108 @@ class LightRAG:
             tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback())
         await asyncio.gather(*tasks)

+    def insert_custom_kg(self, custom_kg: dict):
+        loop = always_get_an_event_loop()
+        return loop.run_until_complete(self.ainsert_custom_kg(custom_kg))
+
+    async def ainsert_custom_kg(self, custom_kg: dict):
+        update_storage = False
+        try:
+            # Insert entities into knowledge graph
+            all_entities_data = []
+            for entity_data in custom_kg.get("entities", []):
+                entity_name = f'"{entity_data["entity_name"].upper()}"'
+                entity_type = entity_data.get("entity_type", "UNKNOWN")
+                description = entity_data.get("description", "No description provided")
+                source_id = entity_data["source_id"]
+
+                # Prepare node data
+                node_data = {
+                    "entity_type": entity_type,
+                    "description": description,
+                    "source_id": source_id,
+                }
+                # Insert node data into the knowledge graph
+                await self.chunk_entity_relation_graph.upsert_node(
+                    entity_name, node_data=node_data
+                )
+                node_data["entity_name"] = entity_name
+                all_entities_data.append(node_data)
+                update_storage = True
+
+            # Insert relationships into knowledge graph
+            all_relationships_data = []
+            for relationship_data in custom_kg.get("relationships", []):
+                src_id = f'"{relationship_data["src_id"].upper()}"'
+                tgt_id = f'"{relationship_data["tgt_id"].upper()}"'
+                description = relationship_data["description"]
+                keywords = relationship_data["keywords"]
+                weight = relationship_data.get("weight", 1.0)
+                source_id = relationship_data["source_id"]
+
+                # Check if nodes exist in the knowledge graph
+                for need_insert_id in [src_id, tgt_id]:
+                    if not (
+                        await self.chunk_entity_relation_graph.has_node(need_insert_id)
+                    ):
+                        await self.chunk_entity_relation_graph.upsert_node(
+                            need_insert_id,
+                            node_data={
+                                "source_id": source_id,
+                                "description": "UNKNOWN",
+                                "entity_type": "UNKNOWN",
+                            },
+                        )
+
+                # Insert edge into the knowledge graph
+                await self.chunk_entity_relation_graph.upsert_edge(
+                    src_id,
+                    tgt_id,
+                    edge_data={
+                        "weight": weight,
+                        "description": description,
+                        "keywords": keywords,
+                        "source_id": source_id,
+                    },
+                )
+                edge_data = {
+                    "src_id": src_id,
+                    "tgt_id": tgt_id,
+                    "description": description,
+                    "keywords": keywords,
+                }
+                all_relationships_data.append(edge_data)
+                update_storage = True
+
+            # Insert entities into vector storage if needed
+            if self.entities_vdb is not None:
+                data_for_vdb = {
+                    compute_mdhash_id(dp["entity_name"], prefix="ent-"): {
+                        "content": dp["entity_name"] + dp["description"],
+                        "entity_name": dp["entity_name"],
+                    }
+                    for dp in all_entities_data
+                }
+                await self.entities_vdb.upsert(data_for_vdb)
+
+            # Insert relationships into vector storage if needed
+            if self.relationships_vdb is not None:
+                data_for_vdb = {
+                    compute_mdhash_id(dp["src_id"] + dp["tgt_id"], prefix="rel-"): {
+                        "src_id": dp["src_id"],
+                        "tgt_id": dp["tgt_id"],
+                        "content": dp["keywords"]
+                        + dp["src_id"]
+                        + dp["tgt_id"]
+                        + dp["description"],
+                    }
+                    for dp in all_relationships_data
+                }
+                await self.relationships_vdb.upsert(data_for_vdb)
+        finally:
+            if update_storage:
+                await self._insert_done()
+
     def query(self, query: str, param: QueryParam = QueryParam()):
         loop = always_get_an_event_loop()
         return loop.run_until_complete(self.aquery(query, param))
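The new `insert_custom_kg` reuses the same sync-over-async wrapper pattern as `query` above: `always_get_an_event_loop` (defined in `lightrag.utils`) hands back a usable loop, and the async method does the real work. A minimal sketch of that helper's pattern (my reconstruction, not the verbatim source):

```python
import asyncio

def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
    # Reuse the thread's current event loop if one exists;
    # otherwise create one so sync callers can drive async methods.
    try:
        return asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        return loop
```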
lightrag/operate.py
@@ -1,6 +1,7 @@
 import asyncio
 import json
 import re
+from tqdm.asyncio import tqdm as tqdm_async
 from typing import Union
 from collections import Counter, defaultdict
 import warnings
@@ -342,11 +343,15 @@ async def extract_entities(
         )
         return dict(maybe_nodes), dict(maybe_edges)

-    # use_llm_func is wrapped in ascynio.Semaphore, limiting max_async callings
-    results = await asyncio.gather(
-        *[_process_single_content(c) for c in ordered_chunks]
-    )
-    print()  # clear the progress bar
+    results = []
+    for result in tqdm_async(
+        asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]),
+        total=len(ordered_chunks),
+        desc="Extracting entities from chunks",
+        unit="chunk",
+    ):
+        results.append(await result)
+
     maybe_nodes = defaultdict(list)
     maybe_edges = defaultdict(list)
     for m_nodes, m_edges in results:
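The recurring change in this commit swaps `await asyncio.gather(...)` for a loop over `asyncio.as_completed(...)` wrapped in `tqdm_async`, so long batches of LLM calls render a live progress bar. A self-contained sketch of the pattern (the task body is illustrative):

```python
import asyncio
from tqdm.asyncio import tqdm as tqdm_async

async def work(i: int) -> int:
    await asyncio.sleep(0.01)  # stand-in for an LLM or embedding call
    return i * i

async def main() -> list[int]:
    tasks = [work(i) for i in range(100)]
    results = []
    # as_completed yields awaitables as they finish, so the bar advances per
    # completion; note results arrive in completion order, not input order.
    for fut in tqdm_async(
        asyncio.as_completed(tasks), total=len(tasks), desc="working", unit="task"
    ):
        results.append(await fut)
    return results

asyncio.run(main())
```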
@@ -354,18 +359,38 @@ async def extract_entities(
             maybe_nodes[k].extend(v)
         for k, v in m_edges.items():
             maybe_edges[tuple(sorted(k))].extend(v)
-    all_entities_data = await asyncio.gather(
-        *[
-            _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
-            for k, v in maybe_nodes.items()
-        ]
-    )
-    all_relationships_data = await asyncio.gather(
-        *[
-            _merge_edges_then_upsert(k[0], k[1], v, knowledge_graph_inst, global_config)
-            for k, v in maybe_edges.items()
-        ]
-    )
+    logger.info("Inserting entities into storage...")
+    all_entities_data = []
+    for result in tqdm_async(
+        asyncio.as_completed(
+            [
+                _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
+                for k, v in maybe_nodes.items()
+            ]
+        ),
+        total=len(maybe_nodes),
+        desc="Inserting entities",
+        unit="entity",
+    ):
+        all_entities_data.append(await result)
+
+    logger.info("Inserting relationships into storage...")
+    all_relationships_data = []
+    for result in tqdm_async(
+        asyncio.as_completed(
+            [
+                _merge_edges_then_upsert(
+                    k[0], k[1], v, knowledge_graph_inst, global_config
+                )
+                for k, v in maybe_edges.items()
+            ]
+        ),
+        total=len(maybe_edges),
+        desc="Inserting relationships",
+        unit="relationship",
+    ):
+        all_relationships_data.append(await result)
+
     if not len(all_entities_data):
         logger.warning("Didn't extract any entities, maybe your LLM is not working")
         return None
lightrag/storage.py
@@ -1,6 +1,7 @@
 import asyncio
 import html
 import os
+from tqdm.asyncio import tqdm as tqdm_async
 from dataclasses import dataclass
 from typing import Any, Union, cast
 import networkx as nx
@@ -95,9 +96,16 @@ class NanoVectorDBStorage(BaseVectorStorage):
             contents[i : i + self._max_batch_size]
             for i in range(0, len(contents), self._max_batch_size)
         ]
-        embeddings_list = await asyncio.gather(
-            *[self.embedding_func(batch) for batch in batches]
-        )
+        embedding_tasks = [self.embedding_func(batch) for batch in batches]
+        embeddings_list = []
+        for f in tqdm_async(
+            asyncio.as_completed(embedding_tasks),
+            total=len(embedding_tasks),
+            desc="Generating embeddings",
+            unit="batch",
+        ):
+            embeddings = await f
+            embeddings_list.append(embeddings)
         embeddings = np.concatenate(embeddings_list)
         for i, d in enumerate(list_data):
             d["__vector__"] = embeddings[i]
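One caveat worth flagging (my observation, not part of the commit): `asyncio.gather` preserves input order, while `asyncio.as_completed` yields in completion order. Since `embeddings_list` is concatenated positionally against `list_data`, batches finishing out of order could misalign vectors with their records. An order-preserving variant that keeps the progress bar could use `tqdm.asyncio.tqdm.gather`, which reorders results to match its inputs:

```python
# Sketch of an order-preserving alternative for the block above.
embeddings_list = await tqdm_async.gather(
    *[self.embedding_func(batch) for batch in batches],
    desc="Generating embeddings",
    unit="batch",
)
embeddings = np.concatenate(embeddings_list)
```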