From 79646fced8612187ec26e013eb34d19c210e4908 Mon Sep 17 00:00:00 2001
From: xYLiuuuuuu
Date: Mon, 6 Jan 2025 16:54:53 +0800
Subject: [PATCH 1/6] Fix: Optimize logic for automatically switching modes when
keywords do not exist
---
lightrag/operate.py | 117 ++++++++++++++++----------------------------
1 file changed, 42 insertions(+), 75 deletions(-)
diff --git a/lightrag/operate.py b/lightrag/operate.py
index f21e41ff..c8e4565c 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -522,15 +522,16 @@ async def kg_query(
logger.warning("low_level_keywords and high_level_keywords is empty")
return PROMPTS["fail_response"]
if ll_keywords == [] and query_param.mode in ["local", "hybrid"]:
- logger.warning("low_level_keywords is empty")
- return PROMPTS["fail_response"]
- else:
- ll_keywords = ", ".join(ll_keywords)
+ logger.warning("low_level_keywords is empty, switching from %s mode to global mode", query_param.mode)
+ query_param.mode = "global"
if hl_keywords == [] and query_param.mode in ["global", "hybrid"]:
- logger.warning("high_level_keywords is empty")
- return PROMPTS["fail_response"]
- else:
- hl_keywords = ", ".join(hl_keywords)
+ logger.warning("high_level_keywords is empty, switching from %s mode to local mode", query_param.mode)
+ query_param.mode = "local"
+
+ ll_keywords = ", ".join(ll_keywords) if ll_keywords else ""
+ hl_keywords = ", ".join(hl_keywords) if hl_keywords else ""
+
+ logger.info("Using %s mode for query processing", query_param.mode)
# Build context
keywords = [ll_keywords, hl_keywords]
@@ -596,78 +597,44 @@ async def _build_query_context(
# ll_entities_context, ll_relations_context, ll_text_units_context = "", "", ""
# hl_entities_context, hl_relations_context, hl_text_units_context = "", "", ""
- ll_kewwords, hl_keywrds = query[0], query[1]
- if query_param.mode in ["local", "hybrid"]:
- if ll_kewwords == "":
- ll_entities_context, ll_relations_context, ll_text_units_context = (
- "",
- "",
- "",
- )
- warnings.warn(
- "Low Level context is None. Return empty Low entity/relationship/source"
- )
- query_param.mode = "global"
- else:
- (
- ll_entities_context,
- ll_relations_context,
- ll_text_units_context,
- ) = await _get_node_data(
- ll_kewwords,
- knowledge_graph_inst,
- entities_vdb,
- text_chunks_db,
- query_param,
- )
- if query_param.mode in ["global", "hybrid"]:
- if hl_keywrds == "":
- hl_entities_context, hl_relations_context, hl_text_units_context = (
- "",
- "",
- "",
- )
- warnings.warn(
- "High Level context is None. Return empty High entity/relationship/source"
- )
- query_param.mode = "local"
- else:
- (
- hl_entities_context,
- hl_relations_context,
- hl_text_units_context,
- ) = await _get_edge_data(
- hl_keywrds,
- knowledge_graph_inst,
- relationships_vdb,
- text_chunks_db,
- query_param,
- )
- if (
- hl_entities_context == ""
- and hl_relations_context == ""
- and hl_text_units_context == ""
- ):
- logger.warn("No high level context found. Switching to local mode.")
- query_param.mode = "local"
- if query_param.mode == "hybrid":
+ ll_keywords, hl_keywords = query[0], query[1]
+
+ if query_param.mode == "local":
+ entities_context, relations_context, text_units_context = await _get_node_data(
+ ll_keywords,
+ knowledge_graph_inst,
+ entities_vdb,
+ text_chunks_db,
+ query_param,
+ )
+ elif query_param.mode == "global":
+ entities_context, relations_context, text_units_context = await _get_edge_data(
+ hl_keywords,
+ knowledge_graph_inst,
+ relationships_vdb,
+ text_chunks_db,
+ query_param,
+ )
+ else: # hybrid mode
+ ll_entities_context, ll_relations_context, ll_text_units_context = await _get_node_data(
+ ll_keywords,
+ knowledge_graph_inst,
+ entities_vdb,
+ text_chunks_db,
+ query_param,
+ )
+ hl_entities_context, hl_relations_context, hl_text_units_context = await _get_edge_data(
+ hl_keywords,
+ knowledge_graph_inst,
+ relationships_vdb,
+ text_chunks_db,
+ query_param,
+ )
entities_context, relations_context, text_units_context = combine_contexts(
[hl_entities_context, ll_entities_context],
[hl_relations_context, ll_relations_context],
[hl_text_units_context, ll_text_units_context],
)
- elif query_param.mode == "local":
- entities_context, relations_context, text_units_context = (
- ll_entities_context,
- ll_relations_context,
- ll_text_units_context,
- )
- elif query_param.mode == "global":
- entities_context, relations_context, text_units_context = (
- hl_entities_context,
- hl_relations_context,
- hl_text_units_context,
- )
return f"""
-----Entities-----
```csv
From 6c78c96854d9ab563a547546dd8652ed59190bd2 Mon Sep 17 00:00:00 2001
From: zrguo
Date: Tue, 7 Jan 2025 22:02:34 +0800
Subject: [PATCH 2/6] fix linting errors
---
lightrag/operate.py | 23 ++++++++++++++++++-----
1 file changed, 18 insertions(+), 5 deletions(-)
diff --git a/lightrag/operate.py b/lightrag/operate.py
index 59e9f648..ce7b0a8a 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -4,7 +4,6 @@ import re
from tqdm.asyncio import tqdm as tqdm_async
from typing import Union
from collections import Counter, defaultdict
-import warnings
from .utils import (
logger,
clean_str,
@@ -605,10 +604,16 @@ async def kg_query(
logger.warning("low_level_keywords and high_level_keywords is empty")
return PROMPTS["fail_response"]
if ll_keywords == [] and query_param.mode in ["local", "hybrid"]:
- logger.warning("low_level_keywords is empty, switching from %s mode to global mode", query_param.mode)
+ logger.warning(
+ "low_level_keywords is empty, switching from %s mode to global mode",
+ query_param.mode,
+ )
query_param.mode = "global"
if hl_keywords == [] and query_param.mode in ["global", "hybrid"]:
- logger.warning("high_level_keywords is empty, switching from %s mode to local mode", query_param.mode)
+ logger.warning(
+ "high_level_keywords is empty, switching from %s mode to local mode",
+ query_param.mode,
+ )
query_param.mode = "local"
ll_keywords = ", ".join(ll_keywords) if ll_keywords else ""
@@ -699,14 +704,22 @@ async def _build_query_context(
query_param,
)
else: # hybrid mode
- ll_entities_context, ll_relations_context, ll_text_units_context = await _get_node_data(
+ (
+ ll_entities_context,
+ ll_relations_context,
+ ll_text_units_context,
+ ) = await _get_node_data(
ll_keywords,
knowledge_graph_inst,
entities_vdb,
text_chunks_db,
query_param,
)
- hl_entities_context, hl_relations_context, hl_text_units_context = await _get_edge_data(
+ (
+ hl_entities_context,
+ hl_relations_context,
+ hl_text_units_context,
+ ) = await _get_edge_data(
hl_keywords,
knowledge_graph_inst,
relationships_vdb,
From a9402513909606c76a2e8d5e040f12ecb8aa4739 Mon Sep 17 00:00:00 2001
From: Gurjot Singh
Date: Tue, 7 Jan 2025 20:57:39 +0530
Subject: [PATCH 3/6] Implement custom chunking feature
---
lightrag/lightrag.py | 66 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 66 insertions(+)
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 7496d736..2225b2d1 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -458,6 +458,72 @@ class LightRAG:
# Ensure all indexes are updated after each document
await self._insert_done()
+ def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
+ loop = always_get_an_event_loop()
+ return loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks))
+
+ async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]):
+
+ update_storage = False
+ try:
+ doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-")
+ new_docs = {
+ doc_key: {"content": full_text.strip()}
+ }
+
+ _add_doc_keys = await self.full_docs.filter_keys([doc_key])
+ new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+ if not len(new_docs):
+ logger.warning("This document is already in the storage.")
+ return
+
+ update_storage = True
+ logger.info(f"[New Docs] inserting {len(new_docs)} docs")
+
+ inserting_chunks = {}
+ for chunk_text in text_chunks:
+ chunk_text_stripped = chunk_text.strip()
+ chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-")
+
+ inserting_chunks[chunk_key] = {
+ "content": chunk_text_stripped,
+ "full_doc_id": doc_key,
+ }
+
+ _add_chunk_keys = await self.text_chunks.filter_keys(list(inserting_chunks.keys()))
+ inserting_chunks = {
+ k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+ }
+ if not len(inserting_chunks):
+ logger.warning("All chunks are already in the storage.")
+ return
+
+ logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks")
+
+ await self.chunks_vdb.upsert(inserting_chunks)
+
+ logger.info("[Entity Extraction]...")
+ maybe_new_kg = await extract_entities(
+ inserting_chunks,
+ knowledge_graph_inst=self.chunk_entity_relation_graph,
+ entity_vdb=self.entities_vdb,
+ relationships_vdb=self.relationships_vdb,
+ global_config=asdict(self),
+ )
+
+ if maybe_new_kg is None:
+ logger.warning("No new entities and relationships found")
+ return
+ else:
+ self.chunk_entity_relation_graph = maybe_new_kg
+
+ await self.full_docs.upsert(new_docs)
+ await self.text_chunks.upsert(inserting_chunks)
+
+ finally:
+ if update_storage:
+ await self._insert_done()
+
async def _insert_done(self):
tasks = []
for storage_inst in [
From 9e7784ab8a642415432c742d8e891f6173886f66 Mon Sep 17 00:00:00 2001
From: zrguo <49157727+LarFii@users.noreply.github.com>
Date: Wed, 8 Jan 2025 18:17:32 +0800
Subject: [PATCH 4/6] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index f66fb3ce..6c981d92 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
-
+
From 9565a4663ad8878126f16d667455ca5a22f1d557 Mon Sep 17 00:00:00 2001
From: Gurjot Singh
Date: Thu, 9 Jan 2025 00:39:22 +0530
Subject: [PATCH 5/6] Fix trailing whitespace and formatting issues in
lightrag.py
---
lightrag/lightrag.py | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 2225b2d1..6af29aa2 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -460,16 +460,15 @@ class LightRAG:
def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
loop = always_get_an_event_loop()
- return loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks))
+ return loop.run_until_complete(
+ self.ainsert_custom_chunks(full_text, text_chunks)
+ )
async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]):
-
update_storage = False
try:
doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-")
- new_docs = {
- doc_key: {"content": full_text.strip()}
- }
+ new_docs = {doc_key: {"content": full_text.strip()}}
_add_doc_keys = await self.full_docs.filter_keys([doc_key])
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
@@ -484,13 +483,15 @@ class LightRAG:
for chunk_text in text_chunks:
chunk_text_stripped = chunk_text.strip()
chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-")
-
+
inserting_chunks[chunk_key] = {
"content": chunk_text_stripped,
"full_doc_id": doc_key,
}
- _add_chunk_keys = await self.text_chunks.filter_keys(list(inserting_chunks.keys()))
+ _add_chunk_keys = await self.text_chunks.filter_keys(
+ list(inserting_chunks.keys())
+ )
inserting_chunks = {
k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
}
From 65c1450c66a769e9134e900a87706f9bc4ab5a97 Mon Sep 17 00:00:00 2001
From: Saifeddine ALOUI
Date: Wed, 8 Jan 2025 20:50:22 +0100
Subject: [PATCH 6/6] Fix backward compatibility of ainsert by making
split_by_character get a None default value
---
lightrag/lightrag.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 7496d736..362b7275 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -320,7 +320,7 @@ class LightRAG:
self.ainsert(string_or_strings, split_by_character)
)
- async def ainsert(self, string_or_strings, split_by_character):
+ async def ainsert(self, string_or_strings, split_by_character=None):
"""Insert documents with checkpoint support
Args: