From 79646fced8612187ec26e013eb34d19c210e4908 Mon Sep 17 00:00:00 2001
From: xYLiuuuuuu <liuxiaoyang1115@126.com>
Date: Mon, 6 Jan 2025 16:54:53 +0800
Subject: [PATCH 1/6] Fix: optimize logic for automatically switching modes
 when keywords do not exist

---
 lightrag/operate.py | 117 ++++++++++++++++----------------------------
 1 file changed, 42 insertions(+), 75 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index f21e41ff..c8e4565c 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -522,15 +522,16 @@ async def kg_query(
         logger.warning("low_level_keywords and high_level_keywords is empty")
         return PROMPTS["fail_response"]
     if ll_keywords == [] and query_param.mode in ["local", "hybrid"]:
-        logger.warning("low_level_keywords is empty")
-        return PROMPTS["fail_response"]
-    else:
-        ll_keywords = ", ".join(ll_keywords)
+        logger.warning("low_level_keywords is empty, switching from %s mode to global mode", query_param.mode)
+        query_param.mode = "global"
     if hl_keywords == [] and query_param.mode in ["global", "hybrid"]:
-        logger.warning("high_level_keywords is empty")
-        return PROMPTS["fail_response"]
-    else:
-        hl_keywords = ", ".join(hl_keywords)
+        logger.warning("high_level_keywords is empty, switching from %s mode to local mode", query_param.mode)
+        query_param.mode = "local"
+
+    ll_keywords = ", ".join(ll_keywords) if ll_keywords else ""
+    hl_keywords = ", ".join(hl_keywords) if hl_keywords else ""
+
+    logger.info("Using %s mode for query processing", query_param.mode)
 
     # Build context
     keywords = [ll_keywords, hl_keywords]
@@ -596,78 +597,44 @@ async def _build_query_context(
     # ll_entities_context, ll_relations_context, ll_text_units_context = "", "", ""
     # hl_entities_context, hl_relations_context, hl_text_units_context = "", "", ""
 
-    ll_kewwords, hl_keywrds = query[0], query[1]
-    if query_param.mode in ["local", "hybrid"]:
-        if ll_kewwords == "":
-            ll_entities_context, ll_relations_context, ll_text_units_context = (
-                "",
-                "",
-                "",
-            )
-            warnings.warn(
-                "Low Level context is None. Return empty Low entity/relationship/source"
-            )
-            query_param.mode = "global"
-        else:
-            (
-                ll_entities_context,
-                ll_relations_context,
-                ll_text_units_context,
-            ) = await _get_node_data(
-                ll_kewwords,
-                knowledge_graph_inst,
-                entities_vdb,
-                text_chunks_db,
-                query_param,
-            )
-    if query_param.mode in ["global", "hybrid"]:
-        if hl_keywrds == "":
-            hl_entities_context, hl_relations_context, hl_text_units_context = (
-                "",
-                "",
-                "",
-            )
-            warnings.warn(
-                "High Level context is None. Return empty High entity/relationship/source"
-            )
-            query_param.mode = "local"
-        else:
-            (
-                hl_entities_context,
-                hl_relations_context,
-                hl_text_units_context,
-            ) = await _get_edge_data(
-                hl_keywrds,
-                knowledge_graph_inst,
-                relationships_vdb,
-                text_chunks_db,
-                query_param,
-            )
-            if (
-                hl_entities_context == ""
-                and hl_relations_context == ""
-                and hl_text_units_context == ""
-            ):
-                logger.warn("No high level context found. Switching to local mode.")
-                query_param.mode = "local"
-    if query_param.mode == "hybrid":
+    ll_keywords, hl_keywords = query[0], query[1]
+
+    if query_param.mode == "local":
+        entities_context, relations_context, text_units_context = await _get_node_data(
+            ll_keywords,
+            knowledge_graph_inst,
+            entities_vdb,
+            text_chunks_db,
+            query_param,
+        )
+    elif query_param.mode == "global":
+        entities_context, relations_context, text_units_context = await _get_edge_data(
+            hl_keywords,
+            knowledge_graph_inst,
+            relationships_vdb,
+            text_chunks_db,
+            query_param,
+        )
+    else:  # hybrid mode
+        ll_entities_context, ll_relations_context, ll_text_units_context = await _get_node_data(
+            ll_keywords,
+            knowledge_graph_inst,
+            entities_vdb,
+            text_chunks_db,
+            query_param,
+        )
+        hl_entities_context, hl_relations_context, hl_text_units_context = await _get_edge_data(
+            hl_keywords,
+            knowledge_graph_inst,
+            relationships_vdb,
+            text_chunks_db,
+            query_param,
+        )
         entities_context, relations_context, text_units_context = combine_contexts(
             [hl_entities_context, ll_entities_context],
             [hl_relations_context, ll_relations_context],
             [hl_text_units_context, ll_text_units_context],
         )
-    elif query_param.mode == "local":
-        entities_context, relations_context, text_units_context = (
-            ll_entities_context,
-            ll_relations_context,
-            ll_text_units_context,
-        )
-    elif query_param.mode == "global":
-        entities_context, relations_context, text_units_context = (
-            hl_entities_context,
-            hl_relations_context,
-            hl_text_units_context,
-        )
     return f"""
 -----Entities-----
 ```csv

From 6c78c96854d9ab563a547546dd8652ed59190bd2 Mon Sep 17 00:00:00 2001
From: zrguo <zrguo.bupt@qq.com>
Date: Tue, 7 Jan 2025 22:02:34 +0800
Subject: [PATCH 2/6] fix linting errors

---
 lightrag/operate.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index 59e9f648..ce7b0a8a 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -4,7 +4,6 @@ import re
 from tqdm.asyncio import tqdm as tqdm_async
 from typing import Union
 from collections import Counter, defaultdict
-import warnings
 from .utils import (
     logger,
     clean_str,
@@ -605,10 +604,16 @@ async def kg_query(
         logger.warning("low_level_keywords and high_level_keywords is empty")
         return PROMPTS["fail_response"]
     if ll_keywords == [] and query_param.mode in ["local", "hybrid"]:
-        logger.warning("low_level_keywords is empty, switching from %s mode to global mode", query_param.mode)
+        logger.warning(
+            "low_level_keywords is empty, switching from %s mode to global mode",
+            query_param.mode,
+        )
         query_param.mode = "global"
     if hl_keywords == [] and query_param.mode in ["global", "hybrid"]:
-        logger.warning("high_level_keywords is empty, switching from %s mode to local mode", query_param.mode)
+        logger.warning(
+            "high_level_keywords is empty, switching from %s mode to local mode",
+            query_param.mode,
+        )
         query_param.mode = "local"
 
     ll_keywords = ", ".join(ll_keywords) if ll_keywords else ""
@@ -699,14 +704,22 @@ async def _build_query_context(
             query_param,
         )
     else:  # hybrid mode
-        ll_entities_context, ll_relations_context, ll_text_units_context = await _get_node_data(
+        (
+            ll_entities_context,
+            ll_relations_context,
+            ll_text_units_context,
+        ) = await _get_node_data(
             ll_keywords,
             knowledge_graph_inst,
             entities_vdb,
             text_chunks_db,
             query_param,
         )
-        hl_entities_context, hl_relations_context, hl_text_units_context = await _get_edge_data(
+        (
+            hl_entities_context,
+            hl_relations_context,
+            hl_text_units_context,
+        ) = await _get_edge_data(
             hl_keywords,
             knowledge_graph_inst,
             relationships_vdb,

From a9402513909606c76a2e8d5e040f12ecb8aa4739 Mon Sep 17 00:00:00 2001
From: Gurjot Singh <gurjotsingh@shorthills.ai>
Date: Tue, 7 Jan 2025 20:57:39 +0530
Subject: [PATCH 3/6] Implement custom chunking feature

---
 lightrag/lightrag.py | 66 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 7496d736..2225b2d1 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -458,6 +458,72 @@ class LightRAG:
                     # Ensure all indexes are updated after each document
                     await self._insert_done()
 
+    def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
+        loop = always_get_an_event_loop()
+        return loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks))
+
+    async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]):
+        
+        update_storage = False
+        try:
+            doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-")
+            new_docs = {
+                doc_key: {"content": full_text.strip()}
+            }
+
+            _add_doc_keys = await self.full_docs.filter_keys([doc_key])
+            new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+            if not len(new_docs):
+                logger.warning("This document is already in the storage.")
+                return
+
+            update_storage = True
+            logger.info(f"[New Docs] inserting {len(new_docs)} docs")
+
+            inserting_chunks = {}
+            for chunk_text in text_chunks:
+                chunk_text_stripped = chunk_text.strip()
+                chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-")
+            
+                inserting_chunks[chunk_key] = {
+                    "content": chunk_text_stripped,
+                    "full_doc_id": doc_key,
+                }
+
+            _add_chunk_keys = await self.text_chunks.filter_keys(list(inserting_chunks.keys()))
+            inserting_chunks = {
+                k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+            }
+            if not len(inserting_chunks):
+                logger.warning("All chunks are already in the storage.")
+                return
+
+            logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks")
+
+            await self.chunks_vdb.upsert(inserting_chunks)
+
+            logger.info("[Entity Extraction]...")
+            maybe_new_kg = await extract_entities(
+                inserting_chunks,
+                knowledge_graph_inst=self.chunk_entity_relation_graph,
+                entity_vdb=self.entities_vdb,
+                relationships_vdb=self.relationships_vdb,
+                global_config=asdict(self),
+            )
+
+            if maybe_new_kg is None:
+                logger.warning("No new entities and relationships found")
+                return
+            else:
+                self.chunk_entity_relation_graph = maybe_new_kg
+
+            await self.full_docs.upsert(new_docs)
+            await self.text_chunks.upsert(inserting_chunks)
+
+        finally:
+            if update_storage:
+                await self._insert_done()
+
     async def _insert_done(self):
         tasks = []
         for storage_inst in [

From 9e7784ab8a642415432c742d8e891f6173886f66 Mon Sep 17 00:00:00 2001
From: zrguo <49157727+LarFii@users.noreply.github.com>
Date: Wed, 8 Jan 2025 18:17:32 +0800
Subject: [PATCH 4/6] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f66fb3ce..6c981d92 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
     </p>
      <p>
           <img src='https://img.shields.io/github/stars/hkuds/lightrag?color=green&style=social' />
-        <img src="https://img.shields.io/badge/python->=3.10-blue">
+        <img src="https://img.shields.io/badge/python-3.10-blue">
         <a href="https://pypi.org/project/lightrag-hku/"><img src="https://img.shields.io/pypi/v/lightrag-hku.svg"></a>
         <a href="https://pepy.tech/project/lightrag-hku"><img src="https://static.pepy.tech/badge/lightrag-hku/month"></a>
     </p>

From 9565a4663ad8878126f16d667455ca5a22f1d557 Mon Sep 17 00:00:00 2001
From: Gurjot Singh <gurjotsingh@shorthills.ai>
Date: Thu, 9 Jan 2025 00:39:22 +0530
Subject: [PATCH 5/6] Fix trailing whitespace and formatting issues in
 lightrag.py

---
 lightrag/lightrag.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 2225b2d1..6af29aa2 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -460,16 +460,15 @@ class LightRAG:
 
     def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
         loop = always_get_an_event_loop()
-        return loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks))
+        return loop.run_until_complete(
+            self.ainsert_custom_chunks(full_text, text_chunks)
+        )
 
     async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]):
-        
         update_storage = False
         try:
             doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-")
-            new_docs = {
-                doc_key: {"content": full_text.strip()}
-            }
+            new_docs = {doc_key: {"content": full_text.strip()}}
 
             _add_doc_keys = await self.full_docs.filter_keys([doc_key])
             new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
@@ -484,13 +483,15 @@ class LightRAG:
             for chunk_text in text_chunks:
                 chunk_text_stripped = chunk_text.strip()
                 chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-")
-            
+
                 inserting_chunks[chunk_key] = {
                     "content": chunk_text_stripped,
                     "full_doc_id": doc_key,
                 }
 
-            _add_chunk_keys = await self.text_chunks.filter_keys(list(inserting_chunks.keys()))
+            _add_chunk_keys = await self.text_chunks.filter_keys(
+                list(inserting_chunks.keys())
+            )
             inserting_chunks = {
                 k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
             }

From 65c1450c66a769e9134e900a87706f9bc4ab5a97 Mon Sep 17 00:00:00 2001
From: Saifeddine ALOUI <aloui.seifeddine@gmail.com>
Date: Wed, 8 Jan 2025 20:50:22 +0100
Subject: [PATCH 6/6] Fix backward compatibility with ainsert by giving
 split_by_character a default value of None

---
 lightrag/lightrag.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 7496d736..362b7275 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -320,7 +320,7 @@ class LightRAG:
             self.ainsert(string_or_strings, split_by_character)
         )
 
-    async def ainsert(self, string_or_strings, split_by_character):
+    async def ainsert(self, string_or_strings, split_by_character=None):
         """Insert documents with checkpoint support
 
         Args: