update insert custom kg

LarFii
2024-12-04 19:44:04 +08:00
parent be72c825d2
commit 44d441a951
8 changed files with 119 additions and 163 deletions

View File

@@ -1,56 +0,0 @@
FROM debian:bullseye-slim
ENV JAVA_HOME=/opt/java/openjdk
COPY --from=eclipse-temurin:17 $JAVA_HOME $JAVA_HOME
ENV PATH="${JAVA_HOME}/bin:${PATH}" \
NEO4J_SHA256=7ce97bd9a4348af14df442f00b3dc5085b5983d6f03da643744838c7a1bc8ba7 \
NEO4J_TARBALL=neo4j-enterprise-5.24.2-unix.tar.gz \
NEO4J_EDITION=enterprise \
NEO4J_HOME="/var/lib/neo4j" \
LANG=C.UTF-8
ARG NEO4J_URI=https://dist.neo4j.org/neo4j-enterprise-5.24.2-unix.tar.gz
RUN addgroup --gid 7474 --system neo4j && adduser --uid 7474 --system --no-create-home --home "${NEO4J_HOME}" --ingroup neo4j neo4j
COPY ./local-package/* /startup/
RUN apt update \
&& apt-get install -y curl gcc git jq make procps tini wget \
&& curl --fail --silent --show-error --location --remote-name ${NEO4J_URI} \
&& echo "${NEO4J_SHA256} ${NEO4J_TARBALL}" | sha256sum -c --strict --quiet \
&& tar --extract --file ${NEO4J_TARBALL} --directory /var/lib \
&& mv /var/lib/neo4j-* "${NEO4J_HOME}" \
&& rm ${NEO4J_TARBALL} \
&& sed -i 's/Package Type:.*/Package Type: docker bullseye/' $NEO4J_HOME/packaging_info \
&& mv /startup/neo4j-admin-report.sh "${NEO4J_HOME}"/bin/neo4j-admin-report \
&& mv "${NEO4J_HOME}"/data /data \
&& mv "${NEO4J_HOME}"/logs /logs \
&& chown -R neo4j:neo4j /data \
&& chmod -R 777 /data \
&& chown -R neo4j:neo4j /logs \
&& chmod -R 777 /logs \
&& chown -R neo4j:neo4j "${NEO4J_HOME}" \
&& chmod -R 777 "${NEO4J_HOME}" \
&& chmod -R 755 "${NEO4J_HOME}/bin" \
&& ln -s /data "${NEO4J_HOME}"/data \
&& ln -s /logs "${NEO4J_HOME}"/logs \
&& git clone https://github.com/ncopa/su-exec.git \
&& cd su-exec \
&& git checkout 4c3bb42b093f14da70d8ab924b487ccfbb1397af \
&& echo d6c40440609a23483f12eb6295b5191e94baf08298a856bab6e15b10c3b82891 su-exec.c | sha256sum -c \
&& echo 2a87af245eb125aca9305a0b1025525ac80825590800f047419dc57bba36b334 Makefile | sha256sum -c \
&& make \
&& mv /su-exec/su-exec /usr/bin/su-exec \
&& apt-get -y purge --auto-remove curl gcc git make \
&& rm -rf /var/lib/apt/lists/* /su-exec
ENV PATH "${NEO4J_HOME}"/bin:$PATH
WORKDIR "${NEO4J_HOME}"
VOLUME /data /logs
EXPOSE 7474 7473 7687
ENTRYPOINT ["tini", "-g", "--", "/startup/docker-entrypoint.sh"]
CMD ["neo4j"]

View File

@@ -42,9 +42,9 @@ This repository hosts the code of LightRAG. The structure of this code is based
## Algorithm Flowchart
![LightRAG Indexing Flowchart](https://learnopencv.com/wp-content/uploads/2024/11/LightRAG-VectorDB-Json-KV-Store-Indexing-Flowchart-scaled.jpg)
*Figure 1: LightRAG Indexing Flowchart*
*Figure 1: LightRAG Indexing Flowchart (image source: [LearnOpenCV](https://learnopencv.com/lightrag/))*
![LightRAG Retrieval and Querying Flowchart](https://learnopencv.com/wp-content/uploads/2024/11/LightRAG-Querying-Flowchart-Dual-Level-Retrieval-Generation-Knowledge-Graphs-scaled.jpg)
*Figure 2: LightRAG Retrieval and Querying Flowchart*
*Figure 2: LightRAG Retrieval and Querying Flowchart (image source: [LearnOpenCV](https://learnopencv.com/lightrag/))*
## Install
@@ -364,7 +364,21 @@ custom_kg = {
"weight": 1.0,
"source_id": "Source1"
}
]
],
"chunks": [
{
"content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.",
"source_id": "Source1",
},
{
"content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.",
"source_id": "Source2",
},
{
"content": "None",
"source_id": "UNKNOWN",
},
],
}
rag.insert_custom_kg(custom_kg)
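The README change above adds a `chunks` list to `custom_kg`: each chunk carries raw text plus a `source_id`, and entities and relationships reference those same `source_id` values. A short sketch of querying the graph after the insertion, assuming the `custom_kg` dict above and the `gpt_4o_mini_complete` model function from `lightrag.llm`:

```python
from lightrag import LightRAG, QueryParam
from lightrag.llm import gpt_4o_mini_complete

custom_kg = {}  # replace with the custom_kg dict shown above

rag = LightRAG(working_dir="./custom_kg_demo", llm_model_func=gpt_4o_mini_complete)
rag.insert_custom_kg(custom_kg)

# "local" retrieval leans on entities, "global" on relationships, "hybrid" on both
print(rag.query("Who develops ProductX?", param=QueryParam(mode="hybrid")))
```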
@@ -947,56 +961,6 @@ def extract_queries(file_path):
```
</details>
## Code Structure
```python
.
├── examples
├── batch_eval.py
├── generate_query.py
├── graph_visual_with_html.py
├── graph_visual_with_neo4j.py
├── lightrag_api_openai_compatible_demo.py
├── lightrag_azure_openai_demo.py
├── lightrag_bedrock_demo.py
├── lightrag_hf_demo.py
├── lightrag_lmdeploy_demo.py
├── lightrag_ollama_demo.py
├── lightrag_openai_compatible_demo.py
├── lightrag_openai_demo.py
├── lightrag_siliconcloud_demo.py
└── vram_management_demo.py
├── lightrag
├── kg
├── __init__.py
└── neo4j_impl.py
├── __init__.py
├── base.py
├── lightrag.py
├── llm.py
├── operate.py
├── prompt.py
├── storage.py
└── utils.py
├── reproduce
├── Step_0.py
├── Step_1_openai_compatible.py
├── Step_1.py
├── Step_2.py
├── Step_3_openai_compatible.py
└── Step_3.py
├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── get_all_edges_nx.py
├── LICENSE
├── README.md
├── requirements.txt
├── setup.py
├── test_neo4j.py
└── test.py
```
## Star History
<a href="https://star-history.com/#HKUDS/LightRAG&Date">

View File

@@ -56,18 +56,6 @@ custom_kg = {
"description": "An annual technology conference held in CityC",
"source_id": "Source3",
},
{
"entity_name": "CompanyD",
"entity_type": "Organization",
"description": "A financial services company specializing in insurance",
"source_id": "Source4",
},
{
"entity_name": "ServiceZ",
"entity_type": "Service",
"description": "An insurance product offered by CompanyD",
"source_id": "Source4",
},
],
"relationships": [
{
@@ -94,13 +82,23 @@ custom_kg = {
"weight": 0.8,
"source_id": "Source3",
},
{
"src_id": "CompanyD",
"tgt_id": "ServiceZ",
"description": "CompanyD provides ServiceZ",
"keywords": "provide, offer",
"weight": 1.0,
"source_id": "Source4",
},
],
"chunks": [
{
"content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.",
"source_id": "Source1",
},
{
"content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.",
"source_id": "Source2",
},
{
"content": "EventY, held in CityC, attracts technology enthusiasts and companies from around the globe.",
"source_id": "Source3",
},
{
"content": "None",
"source_id": "UNKNOWN",
},
],
}
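The synchronous `insert_custom_kg` call shown in the README wraps the coroutine `ainsert_custom_kg` (see the `lightrag.py` hunk below). Driving the async API directly might look like this sketch, reusing the `custom_kg` dict above:

```python
import asyncio
from lightrag import LightRAG
from lightrag.llm import gpt_4o_mini_complete

custom_kg = {}  # replace with the custom_kg dict defined above

async def build() -> None:
    rag = LightRAG(working_dir="./custom_kg_demo", llm_model_func=gpt_4o_mini_complete)
    await rag.ainsert_custom_kg(custom_kg)

asyncio.run(build())
```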

View File

@@ -1,11 +1,14 @@
import os
import asyncio
from lightrag import LightRAG, QueryParam
from lightrag.llm import openai_complete_if_cache, nvidia_openai_embedding, nvidia_openai_complete
from lightrag.llm import (
openai_complete_if_cache,
nvidia_openai_embedding,
)
from lightrag.utils import EmbeddingFunc
import numpy as np
#for custom llm_model_func
# for custom llm_model_func
from lightrag.utils import locate_json_string_body_from_string
WORKING_DIR = "./dickens"
@@ -13,14 +16,15 @@ WORKING_DIR = "./dickens"
if not os.path.exists(WORKING_DIR):
os.mkdir(WORKING_DIR)
#some method to use your API key (choose one)
# some way to provide your API key (choose one)
# NVIDIA_OPENAI_API_KEY = os.getenv("NVIDIA_OPENAI_API_KEY")
NVIDIA_OPENAI_API_KEY = "nvapi-xxxx" #your api key
NVIDIA_OPENAI_API_KEY = "nvapi-xxxx"  # your API key
# use the predefined function for the NVIDIA LLM API (OpenAI compatible)
# llm_model_func = nvidia_openai_complete
#If you trying to make custom llm_model_func to use llm model on NVIDIA API like other example:
# If you are trying to build a custom llm_model_func that uses an LLM from the NVIDIA API, as in the other examples:
async def llm_model_func(
prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:
@@ -37,33 +41,38 @@ async def llm_model_func(
return locate_json_string_body_from_string(result)
return result
#custom embedding
# custom embedding
nvidia_embed_model = "nvidia/nv-embedqa-e5-v5"
async def indexing_embedding_func(texts: list[str]) -> np.ndarray:
return await nvidia_openai_embedding(
texts,
model = nvidia_embed_model, #maximum 512 token
model=nvidia_embed_model,  # maximum 512 tokens
# model="nvidia/llama-3.2-nv-embedqa-1b-v1",
api_key=NVIDIA_OPENAI_API_KEY,
base_url="https://integrate.api.nvidia.com/v1",
input_type = "passage",
trunc = "END", #handling on server side if input token is longer than maximum token
encode = "float"
input_type="passage",
trunc="END", # handling on server side if input token is longer than maximum token
encode="float",
)
async def query_embedding_func(texts: list[str]) -> np.ndarray:
return await nvidia_openai_embedding(
texts,
model = nvidia_embed_model, #maximum 512 token
model=nvidia_embed_model,  # maximum 512 tokens
# model="nvidia/llama-3.2-nv-embedqa-1b-v1",
api_key=NVIDIA_OPENAI_API_KEY,
base_url="https://integrate.api.nvidia.com/v1",
input_type = "query",
trunc = "END", #handling on server side if input token is longer than maximum token
encode = "float"
input_type="query",
trunc="END", # handling on server side if input token is longer than maximum token
encode="float",
)
#dimension are same
# dimensions are the same for passage and query embeddings
async def get_embedding_dim():
test_text = ["This is a test sentence."]
embedding = await indexing_embedding_func(test_text)
@@ -88,25 +97,25 @@ async def main():
embedding_dimension = await get_embedding_dim()
print(f"Detected embedding dimension: {embedding_dimension}")
#lightRAG class during indexing
# lightRAG class during indexing
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=llm_model_func,
# llm_model_name="meta/llama3-70b-instruct",  # uncomment if needed
embedding_func=EmbeddingFunc(
embedding_dim=embedding_dimension,
max_token_size=512, #maximum token size, somehow it's still exceed maximum number of token
#so truncate (trunc) parameter on embedding_func will handle it and try to examine the tokenizer used in LightRAG
#so you can adjust to be able to fit the NVIDIA model (future work)
max_token_size=512,  # maximum token size; somehow inputs still exceed it,
# so the truncate (trunc) parameter on embedding_func handles that; examining the tokenizer
# used in LightRAG so you can adjust it to fit the NVIDIA model is future work
func=indexing_embedding_func,
),
)
#reading file
# reading file
with open("./book.txt", "r", encoding="utf-8") as f:
await rag.ainsert(f.read())
#redefine rag to change embedding into query type
# redefine rag to change embedding into query type
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=llm_model_func,

View File

@@ -1,5 +1,5 @@
from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
__version__ = "1.0.2"
__version__ = "1.0.3"
__author__ = "Zirui Guo"
__url__ = "https://github.com/HKUDS/LightRAG"

View File

@@ -329,13 +329,39 @@ class LightRAG:
async def ainsert_custom_kg(self, custom_kg: dict):
update_storage = False
try:
# Insert chunks into vector storage
all_chunks_data = {}
chunk_to_source_map = {}
for chunk_data in custom_kg.get("chunks", []):
chunk_content = chunk_data["content"]
source_id = chunk_data["source_id"]
chunk_id = compute_mdhash_id(chunk_content.strip(), prefix="chunk-")
chunk_entry = {"content": chunk_content.strip(), "source_id": source_id}
all_chunks_data[chunk_id] = chunk_entry
chunk_to_source_map[source_id] = chunk_id  # maps source_id -> chunk id
update_storage = True
if self.chunks_vdb is not None and all_chunks_data:
await self.chunks_vdb.upsert(all_chunks_data)
if self.text_chunks is not None and all_chunks_data:
await self.text_chunks.upsert(all_chunks_data)
# Insert entities into knowledge graph
all_entities_data = []
for entity_data in custom_kg.get("entities", []):
entity_name = f'"{entity_data["entity_name"].upper()}"'
entity_type = entity_data.get("entity_type", "UNKNOWN")
description = entity_data.get("description", "No description provided")
source_id = entity_data["source_id"]
# source_id = entity_data["source_id"]
source_chunk_id = entity_data.get("source_id", "UNKNOWN")
source_id = chunk_to_source_map.get(source_chunk_id, "UNKNOWN")
# Log if source_id is UNKNOWN
if source_id == "UNKNOWN":
logger.warning(
f"Entity '{entity_name}' has an UNKNOWN source_id. Please check the source mapping."
)
# Prepare node data
node_data = {
@@ -359,7 +385,15 @@ class LightRAG:
description = relationship_data["description"]
keywords = relationship_data["keywords"]
weight = relationship_data.get("weight", 1.0)
source_id = relationship_data["source_id"]
# source_id = relationship_data["source_id"]
source_chunk_id = relationship_data.get("source_id", "UNKNOWN")
source_id = chunk_to_source_map.get(source_chunk_id, "UNKNOWN")
# Log if source_id is UNKNOWN
if source_id == "UNKNOWN":
logger.warning(
f"Relationship from '{src_id}' to '{tgt_id}' has an UNKNOWN source_id. Please check the source mapping."
)
# Check if nodes exist in the knowledge graph
for need_insert_id in [src_id, tgt_id]:
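Net effect of the new mapping logic: a custom entity's or relationship's `source_id` must match one of the inserted chunks' `source_id` values, and it is rewritten to that chunk's content-hash id; anything unmatched falls back to `UNKNOWN` and logs a warning. A standalone sketch of the same lookup, using `compute_mdhash_id` as the diff does:

```python
from lightrag.utils import compute_mdhash_id

chunks = [
    {"content": "ProductX, developed by CompanyA...", "source_id": "Source1"},
]

chunk_to_source_map = {}
for chunk in chunks:
    chunk_id = compute_mdhash_id(chunk["content"].strip(), prefix="chunk-")
    chunk_to_source_map[chunk["source_id"]] = chunk_id  # source_id -> chunk id

# An entity declaring source_id="Source1" resolves to the chunk's hash id:
print(chunk_to_source_map.get("Source1", "UNKNOWN"))  # chunk-<md5 hash>
# An unmatched source_id falls back to UNKNOWN (and would be logged):
print(chunk_to_source_map.get("Source9", "UNKNOWN"))  # UNKNOWN
```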

View File

@@ -502,11 +502,12 @@ async def gpt_4o_mini_complete(
**kwargs,
)
async def nvidia_openai_complete(
prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:
result = await openai_complete_if_cache(
"nvidia/llama-3.1-nemotron-70b-instruct", #context length 128k
"nvidia/llama-3.1-nemotron-70b-instruct", # context length 128k
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
@@ -517,6 +518,7 @@ async def nvidia_openai_complete(
return locate_json_string_body_from_string(result)
return result
async def azure_openai_complete(
prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:
@@ -610,12 +612,12 @@ async def openai_embedding(
)
async def nvidia_openai_embedding(
texts: list[str],
model: str = "nvidia/llama-3.2-nv-embedqa-1b-v1", #refer to https://build.nvidia.com/nim?filters=usecase%3Ausecase_text_to_embedding
model: str = "nvidia/llama-3.2-nv-embedqa-1b-v1", # refer to https://build.nvidia.com/nim?filters=usecase%3Ausecase_text_to_embedding
base_url: str = "https://integrate.api.nvidia.com/v1",
api_key: str = None,
input_type: str = "passage", #query for retrieval, passage for embedding
trunc: str = "NONE", #NONE or START or END
encode: str = "float" #float or base64
input_type: str = "passage", # query for retrieval, passage for embedding
trunc: str = "NONE", # NONE or START or END
encode: str = "float", # float or base64
) -> np.ndarray:
if api_key:
os.environ["OPENAI_API_KEY"] = api_key
@@ -624,10 +626,14 @@ async def nvidia_openai_embedding(
AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url)
)
response = await openai_async_client.embeddings.create(
model=model, input=texts, encoding_format=encode, extra_body={"input_type": input_type, "truncate": trunc}
model=model,
input=texts,
encoding_format=encode,
extra_body={"input_type": input_type, "truncate": trunc},
)
return np.array([dp.embedding for dp in response.data])
@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8191)
@retry(
stop=stop_after_attempt(3),
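For reference, `nvidia_openai_embedding` can also be exercised on its own; a sketch with a placeholder API key, using the parameters from the signature above:

```python
import asyncio
from lightrag.llm import nvidia_openai_embedding

async def demo() -> None:
    # "passage" when indexing documents, "query" when embedding search queries
    vecs = await nvidia_openai_embedding(
        ["LightRAG builds a knowledge graph from raw text."],
        model="nvidia/nv-embedqa-e5-v5",
        api_key="nvapi-xxxx",  # placeholder key
        input_type="passage",
        trunc="END",  # let the server truncate over-long inputs
        encode="float",
    )
    print(vecs.shape)  # (1, embedding_dim)

asyncio.run(demo())
```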

View File

@@ -297,7 +297,9 @@ async def extract_entities(
chunk_dp = chunk_key_dp[1]
content = chunk_dp["content"]
# hint_prompt = entity_extract_prompt.format(**context_base, input_text=content)
hint_prompt = entity_extract_prompt.format(**context_base, input_text="{input_text}").format(**context_base, input_text=content)
hint_prompt = entity_extract_prompt.format(
**context_base, input_text="{input_text}"
).format(**context_base, input_text=content)
final_result = await use_llm_func(hint_prompt)
history = pack_user_ass_to_openai_messages(hint_prompt, final_result)
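The two-pass `.format` here first fills the prompt's context placeholders while re-inserting `{input_text}` literally, then formats again so that placeholders carried inside the substituted values (for example, delimiter tokens embedded in the few-shot examples) are resolved before the chunk content goes in. A minimal sketch of the idea with hypothetical template and values:

```python
template = "Delimiter: {tuple_delimiter}\nExamples: {examples}\nText: {input_text}"
context = {
    "tuple_delimiter": "<|>",
    # the substituted example itself still contains a placeholder
    "examples": "entity{tuple_delimiter}PERSON",
}

# Pass 1: fill context placeholders, keeping {input_text} literal
stage1 = template.format(**context, input_text="{input_text}")
# Pass 2: resolve placeholders introduced by the examples and fill input_text
final = stage1.format(**context, input_text="some chunk of text")
print(final)
```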
@@ -949,7 +951,6 @@ async def _find_related_text_unit_from_relationships(
split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP])
for dp in edge_datas
]
all_text_units_lookup = {}
for index, unit_list in enumerate(text_units):