From 79646fced8612187ec26e013eb34d19c210e4908 Mon Sep 17 00:00:00 2001 From: xYLiuuuuuu Date: Mon, 6 Jan 2025 16:54:53 +0800 Subject: [PATCH 1/6] Fix:Optimized logic for automatic switching modes when keywords do not exist --- lightrag/operate.py | 117 ++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 75 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index f21e41ff..c8e4565c 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -522,15 +522,16 @@ async def kg_query( logger.warning("low_level_keywords and high_level_keywords is empty") return PROMPTS["fail_response"] if ll_keywords == [] and query_param.mode in ["local", "hybrid"]: - logger.warning("low_level_keywords is empty") - return PROMPTS["fail_response"] - else: - ll_keywords = ", ".join(ll_keywords) + logger.warning("low_level_keywords is empty, switching from %s mode to global mode", query_param.mode) + query_param.mode = "global" if hl_keywords == [] and query_param.mode in ["global", "hybrid"]: - logger.warning("high_level_keywords is empty") - return PROMPTS["fail_response"] - else: - hl_keywords = ", ".join(hl_keywords) + logger.warning("high_level_keywords is empty, switching from %s mode to local mode", query_param.mode) + query_param.mode = "local" + + ll_keywords = ", ".join(ll_keywords) if ll_keywords else "" + hl_keywords = ", ".join(hl_keywords) if hl_keywords else "" + + logger.info("Using %s mode for query processing", query_param.mode) # Build context keywords = [ll_keywords, hl_keywords] @@ -596,78 +597,44 @@ async def _build_query_context( # ll_entities_context, ll_relations_context, ll_text_units_context = "", "", "" # hl_entities_context, hl_relations_context, hl_text_units_context = "", "", "" - ll_kewwords, hl_keywrds = query[0], query[1] - if query_param.mode in ["local", "hybrid"]: - if ll_kewwords == "": - ll_entities_context, ll_relations_context, ll_text_units_context = ( - "", - "", - "", - ) - warnings.warn( - "Low Level context is None. Return empty Low entity/relationship/source" - ) - query_param.mode = "global" - else: - ( - ll_entities_context, - ll_relations_context, - ll_text_units_context, - ) = await _get_node_data( - ll_kewwords, - knowledge_graph_inst, - entities_vdb, - text_chunks_db, - query_param, - ) - if query_param.mode in ["global", "hybrid"]: - if hl_keywrds == "": - hl_entities_context, hl_relations_context, hl_text_units_context = ( - "", - "", - "", - ) - warnings.warn( - "High Level context is None. Return empty High entity/relationship/source" - ) - query_param.mode = "local" - else: - ( - hl_entities_context, - hl_relations_context, - hl_text_units_context, - ) = await _get_edge_data( - hl_keywrds, - knowledge_graph_inst, - relationships_vdb, - text_chunks_db, - query_param, - ) - if ( - hl_entities_context == "" - and hl_relations_context == "" - and hl_text_units_context == "" - ): - logger.warn("No high level context found. Switching to local mode.") - query_param.mode = "local" - if query_param.mode == "hybrid": + ll_keywords, hl_keywords = query[0], query[1] + + if query_param.mode == "local": + entities_context, relations_context, text_units_context = await _get_node_data( + ll_keywords, + knowledge_graph_inst, + entities_vdb, + text_chunks_db, + query_param, + ) + elif query_param.mode == "global": + entities_context, relations_context, text_units_context = await _get_edge_data( + hl_keywords, + knowledge_graph_inst, + relationships_vdb, + text_chunks_db, + query_param, + ) + else: # hybrid mode + ll_entities_context, ll_relations_context, ll_text_units_context = await _get_node_data( + ll_keywords, + knowledge_graph_inst, + entities_vdb, + text_chunks_db, + query_param, + ) + hl_entities_context, hl_relations_context, hl_text_units_context = await _get_edge_data( + hl_keywords, + knowledge_graph_inst, + relationships_vdb, + text_chunks_db, + query_param, + ) entities_context, relations_context, text_units_context = combine_contexts( [hl_entities_context, ll_entities_context], [hl_relations_context, ll_relations_context], [hl_text_units_context, ll_text_units_context], ) - elif query_param.mode == "local": - entities_context, relations_context, text_units_context = ( - ll_entities_context, - ll_relations_context, - ll_text_units_context, - ) - elif query_param.mode == "global": - entities_context, relations_context, text_units_context = ( - hl_entities_context, - hl_relations_context, - hl_text_units_context, - ) return f""" -----Entities----- ```csv From 6c78c96854d9ab563a547546dd8652ed59190bd2 Mon Sep 17 00:00:00 2001 From: zrguo Date: Tue, 7 Jan 2025 22:02:34 +0800 Subject: [PATCH 2/6] fix linting errors --- lightrag/operate.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 59e9f648..ce7b0a8a 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -4,7 +4,6 @@ import re from tqdm.asyncio import tqdm as tqdm_async from typing import Union from collections import Counter, defaultdict -import warnings from .utils import ( logger, clean_str, @@ -605,10 +604,16 @@ async def kg_query( logger.warning("low_level_keywords and high_level_keywords is empty") return PROMPTS["fail_response"] if ll_keywords == [] and query_param.mode in ["local", "hybrid"]: - logger.warning("low_level_keywords is empty, switching from %s mode to global mode", query_param.mode) + logger.warning( + "low_level_keywords is empty, switching from %s mode to global mode", + query_param.mode, + ) query_param.mode = "global" if hl_keywords == [] and query_param.mode in ["global", "hybrid"]: - logger.warning("high_level_keywords is empty, switching from %s mode to local mode", query_param.mode) + logger.warning( + "high_level_keywords is empty, switching from %s mode to local mode", + query_param.mode, + ) query_param.mode = "local" ll_keywords = ", ".join(ll_keywords) if ll_keywords else "" @@ -699,14 +704,22 @@ async def _build_query_context( query_param, ) else: # hybrid mode - ll_entities_context, ll_relations_context, ll_text_units_context = await _get_node_data( + ( + ll_entities_context, + ll_relations_context, + ll_text_units_context, + ) = await _get_node_data( ll_keywords, knowledge_graph_inst, entities_vdb, text_chunks_db, query_param, ) - hl_entities_context, hl_relations_context, hl_text_units_context = await _get_edge_data( + ( + hl_entities_context, + hl_relations_context, + hl_text_units_context, + ) = await _get_edge_data( hl_keywords, knowledge_graph_inst, relationships_vdb, From a9402513909606c76a2e8d5e040f12ecb8aa4739 Mon Sep 17 00:00:00 2001 From: Gurjot Singh Date: Tue, 7 Jan 2025 20:57:39 +0530 Subject: [PATCH 3/6] Implement custom chunking feature --- lightrag/lightrag.py | 66 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 7496d736..2225b2d1 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -458,6 +458,72 @@ class LightRAG: # Ensure all indexes are updated after each document await self._insert_done() + def insert_custom_chunks(self, full_text: str, text_chunks: list[str]): + loop = always_get_an_event_loop() + return loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks)) + + async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]): + + update_storage = False + try: + doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-") + new_docs = { + doc_key: {"content": full_text.strip()} + } + + _add_doc_keys = await self.full_docs.filter_keys([doc_key]) + new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} + if not len(new_docs): + logger.warning("This document is already in the storage.") + return + + update_storage = True + logger.info(f"[New Docs] inserting {len(new_docs)} docs") + + inserting_chunks = {} + for chunk_text in text_chunks: + chunk_text_stripped = chunk_text.strip() + chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-") + + inserting_chunks[chunk_key] = { + "content": chunk_text_stripped, + "full_doc_id": doc_key, + } + + _add_chunk_keys = await self.text_chunks.filter_keys(list(inserting_chunks.keys())) + inserting_chunks = { + k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys + } + if not len(inserting_chunks): + logger.warning("All chunks are already in the storage.") + return + + logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks") + + await self.chunks_vdb.upsert(inserting_chunks) + + logger.info("[Entity Extraction]...") + maybe_new_kg = await extract_entities( + inserting_chunks, + knowledge_graph_inst=self.chunk_entity_relation_graph, + entity_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + global_config=asdict(self), + ) + + if maybe_new_kg is None: + logger.warning("No new entities and relationships found") + return + else: + self.chunk_entity_relation_graph = maybe_new_kg + + await self.full_docs.upsert(new_docs) + await self.text_chunks.upsert(inserting_chunks) + + finally: + if update_storage: + await self._insert_done() + async def _insert_done(self): tasks = [] for storage_inst in [ From 9e7784ab8a642415432c742d8e891f6173886f66 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Wed, 8 Jan 2025 18:17:32 +0800 Subject: [PATCH 4/6] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f66fb3ce..6c981d92 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@

- +

From 9565a4663ad8878126f16d667455ca5a22f1d557 Mon Sep 17 00:00:00 2001 From: Gurjot Singh Date: Thu, 9 Jan 2025 00:39:22 +0530 Subject: [PATCH 5/6] Fix trailing whitespace and formatting issues in lightrag.py --- lightrag/lightrag.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 2225b2d1..6af29aa2 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -460,16 +460,15 @@ class LightRAG: def insert_custom_chunks(self, full_text: str, text_chunks: list[str]): loop = always_get_an_event_loop() - return loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks)) + return loop.run_until_complete( + self.ainsert_custom_chunks(full_text, text_chunks) + ) async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]): - update_storage = False try: doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-") - new_docs = { - doc_key: {"content": full_text.strip()} - } + new_docs = {doc_key: {"content": full_text.strip()}} _add_doc_keys = await self.full_docs.filter_keys([doc_key]) new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} @@ -484,13 +483,15 @@ class LightRAG: for chunk_text in text_chunks: chunk_text_stripped = chunk_text.strip() chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-") - + inserting_chunks[chunk_key] = { "content": chunk_text_stripped, "full_doc_id": doc_key, } - _add_chunk_keys = await self.text_chunks.filter_keys(list(inserting_chunks.keys())) + _add_chunk_keys = await self.text_chunks.filter_keys( + list(inserting_chunks.keys()) + ) inserting_chunks = { k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys } From 65c1450c66a769e9134e900a87706f9bc4ab5a97 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 8 Jan 2025 20:50:22 +0100 Subject: [PATCH 6/6] fixed retro compatibility with ainsert by making split_by_character get a None default value --- lightrag/lightrag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 7496d736..362b7275 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -320,7 +320,7 @@ class LightRAG: self.ainsert(string_or_strings, split_by_character) ) - async def ainsert(self, string_or_strings, split_by_character): + async def ainsert(self, string_or_strings, split_by_character=None): """Insert documents with checkpoint support Args: