From cc50ade14ed8e0852b5da75bde4809829269d285 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 30 Jan 2025 02:45:33 +0800 Subject: [PATCH 01/32] Fix concurrent problem on extract_entities function. - Abandon the approach of temporarily replacing the global llm_model_func configuration - Introduce custom_llm function with new_config for handle_cache while extracting entities - Update handle_cache to accept custom_llm --- lightrag/operate.py | 28 ++++++++++++++++++++++------ lightrag/utils.py | 4 ++-- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index aeb64a7e..481a31bb 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -352,7 +352,7 @@ async def extract_entities( input_text: str, history_messages: list[dict[str, str]] = None ) -> str: if enable_llm_cache_for_entity_extract and llm_response_cache: - need_to_restore = False + custom_llm = None if ( global_config["embedding_cache_config"] and global_config["embedding_cache_config"]["enabled"] @@ -360,8 +360,21 @@ async def extract_entities( new_config = global_config.copy() new_config["embedding_cache_config"] = None new_config["enable_llm_cache"] = True - llm_response_cache.global_config = new_config - need_to_restore = True + + # create a llm function with new_config for handle_cache + async def custom_llm( + prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs + ) -> str: + # 合并 new_config 和其他 kwargs,保证其他参数不被覆盖 + merged_config = {**kwargs, **new_config} + return await use_llm_func( + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + keyword_extraction=keyword_extraction, + **merged_config, + ) + if history_messages: history = json.dumps(history_messages, ensure_ascii=False) _prompt = history + "\n" + input_text @@ -370,10 +383,13 @@ async def extract_entities( arg_hash = compute_args_hash(_prompt) cached_return, _1, _2, _3 = await handle_cache( - llm_response_cache, arg_hash, _prompt, "default", cache_type="default" + llm_response_cache, + arg_hash, + _prompt, + "default", + cache_type="default", + llm=custom_llm ) - if need_to_restore: - llm_response_cache.global_config = global_config if cached_return: logger.debug(f"Found cache for {arg_hash}") statistic_data["llm_cache"] += 1 diff --git a/lightrag/utils.py b/lightrag/utils.py index 745594d2..e6dc6507 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -491,7 +491,7 @@ def dequantize_embedding( return (quantized * scale + min_val).astype(np.float32) -async def handle_cache(hashing_kv, args_hash, prompt, mode="default", cache_type=None): +async def handle_cache(hashing_kv, args_hash, prompt, mode="default", cache_type=None, llm=None): """Generic cache handling function""" if hashing_kv is None or not hashing_kv.global_config.get("enable_llm_cache"): return None, None, None, None @@ -528,7 +528,7 @@ async def handle_cache(hashing_kv, args_hash, prompt, mode="default", cache_type similarity_threshold=embedding_cache_config["similarity_threshold"], mode=mode, use_llm_check=use_llm_check, - llm_func=llm_model_func if use_llm_check else None, + llm_func=llm if (use_llm_check and llm is not None) else (llm_model_func if use_llm_check else None), original_prompt=prompt if use_llm_check else None, cache_type=cache_type, ) From 21481dba8f3b020797718de3d8a82aafa7f69590 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 31 Jan 2025 15:00:56 +0800 Subject: [PATCH 02/32] Refactor embedding functions and add async query limit - Separate insert/query embedding funcs - Add 
query-specific async limit - Update storage classes to use new funcs - Protect vector DB save with lock - Improve config handling for thresholds --- lightrag/kg/nano_vector_db_impl.py | 6 +++++- lightrag/lightrag.py | 20 ++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/lightrag/kg/nano_vector_db_impl.py b/lightrag/kg/nano_vector_db_impl.py index ed272fee..60a2bc00 100644 --- a/lightrag/kg/nano_vector_db_impl.py +++ b/lightrag/kg/nano_vector_db_impl.py @@ -76,6 +76,8 @@ class NanoVectorDBStorage(BaseVectorStorage): cosine_better_than_threshold: float = float(os.getenv("COSINE_THRESHOLD", "0.2")) def __post_init__(self): + # Initialize lock only for file operations + self._save_lock = asyncio.Lock() # Use global config value if specified, otherwise use default config = self.global_config.get("vector_db_storage_cls_kwargs", {}) self.cosine_better_than_threshold = config.get( @@ -210,4 +212,6 @@ class NanoVectorDBStorage(BaseVectorStorage): logger.error(f"Error deleting relations for {entity_name}: {e}") async def index_done_callback(self): - self._client.save() + # Protect file write operation + async with self._save_lock: + self._client.save() diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 92fc954f..ca82a3d7 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -154,6 +154,7 @@ class LightRAG: embedding_func: EmbeddingFunc = None # This must be set (we do want to separate llm from the corte, so no more default initialization) embedding_batch_num: int = 32 embedding_func_max_async: int = 16 + embedding_func_max_async_query: int = 4 # LLM llm_model_func: callable = None # This must be set (we do want to separate llm from the corte, so no more default initialization) @@ -195,8 +196,11 @@ class LightRAG: _print_config = ",\n ".join([f"{k} = {v}" for k, v in global_config.items()]) logger.debug(f"LightRAG init with param:\n {_print_config}\n") - # Init LLM - self.embedding_func = limit_async_func_call(self.embedding_func_max_async)( + # Init embedding functions with separate instances for insert and query + self.insert_embedding_func = limit_async_func_call(self.embedding_func_max_async)( + self.embedding_func + ) + self.query_embedding_func = limit_async_func_call(self.embedding_func_max_async_query)( self.embedding_func ) @@ -238,15 +242,15 @@ class LightRAG: #### self.full_docs = self.key_string_value_json_storage_cls( namespace="full_docs", - embedding_func=self.embedding_func, + embedding_func=self.insert_embedding_func, ) self.text_chunks = self.key_string_value_json_storage_cls( namespace="text_chunks", - embedding_func=self.embedding_func, + embedding_func=self.insert_embedding_func, ) self.chunk_entity_relation_graph = self.graph_storage_cls( namespace="chunk_entity_relation", - embedding_func=self.embedding_func, + embedding_func=self.insert_embedding_func, ) #### # add embedding func by walter over @@ -254,17 +258,17 @@ class LightRAG: self.entities_vdb = self.vector_db_storage_cls( namespace="entities", - embedding_func=self.embedding_func, + embedding_func=self.query_embedding_func, meta_fields={"entity_name"}, ) self.relationships_vdb = self.vector_db_storage_cls( namespace="relationships", - embedding_func=self.embedding_func, + embedding_func=self.query_embedding_func, meta_fields={"src_id", "tgt_id"}, ) self.chunks_vdb = self.vector_db_storage_cls( namespace="chunks", - embedding_func=self.embedding_func, + embedding_func=self.query_embedding_func, ) if self.llm_response_cache and hasattr( From 
389f4ee87272fae7501b445360977ff709e65cea Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 31 Jan 2025 15:33:41 +0800 Subject: [PATCH 03/32] Shorten log message for cosine similarity threshold. --- lightrag/kg/nano_vector_db_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/kg/nano_vector_db_impl.py b/lightrag/kg/nano_vector_db_impl.py index 60a2bc00..6e8873fc 100644 --- a/lightrag/kg/nano_vector_db_impl.py +++ b/lightrag/kg/nano_vector_db_impl.py @@ -140,7 +140,7 @@ class NanoVectorDBStorage(BaseVectorStorage): embedding = await self.embedding_func([query]) embedding = embedding[0] logger.info( - f"Query: {query}, top_k: {top_k}, cosine_better_than_threshold: {self.cosine_better_than_threshold}" + f"Query: {query}, top_k: {top_k}, cosine: {self.cosine_better_than_threshold}" ) results = self._client.query( query=embedding, From b0d87b2e296e30f5705db497fc9204c3f33cdab5 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 31 Jan 2025 15:33:50 +0800 Subject: [PATCH 04/32] Fix linting --- lightrag/lightrag.py | 12 ++++++------ lightrag/operate.py | 22 +++++++++++++--------- lightrag/utils.py | 8 ++++++-- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index ca82a3d7..f0fb92fd 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -197,12 +197,12 @@ class LightRAG: logger.debug(f"LightRAG init with param:\n {_print_config}\n") # Init embedding functions with separate instances for insert and query - self.insert_embedding_func = limit_async_func_call(self.embedding_func_max_async)( - self.embedding_func - ) - self.query_embedding_func = limit_async_func_call(self.embedding_func_max_async_query)( - self.embedding_func - ) + self.insert_embedding_func = limit_async_func_call( + self.embedding_func_max_async + )(self.embedding_func) + self.query_embedding_func = limit_async_func_call( + self.embedding_func_max_async_query + )(self.embedding_func) # Initialize all storages self.key_string_value_json_storage_cls: Type[BaseKVStorage] = ( diff --git a/lightrag/operate.py b/lightrag/operate.py index 481a31bb..d88dc7c2 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -352,7 +352,7 @@ async def extract_entities( input_text: str, history_messages: list[dict[str, str]] = None ) -> str: if enable_llm_cache_for_entity_extract and llm_response_cache: - custom_llm = None + custom_llm = None if ( global_config["embedding_cache_config"] and global_config["embedding_cache_config"]["enabled"] @@ -360,10 +360,14 @@ async def extract_entities( new_config = global_config.copy() new_config["embedding_cache_config"] = None new_config["enable_llm_cache"] = True - + # create a llm function with new_config for handle_cache async def custom_llm( - prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs + prompt, + system_prompt=None, + history_messages=[], + keyword_extraction=False, + **kwargs, ) -> str: # 合并 new_config 和其他 kwargs,保证其他参数不被覆盖 merged_config = {**kwargs, **new_config} @@ -374,7 +378,7 @@ async def extract_entities( keyword_extraction=keyword_extraction, **merged_config, ) - + if history_messages: history = json.dumps(history_messages, ensure_ascii=False) _prompt = history + "\n" + input_text @@ -383,12 +387,12 @@ async def extract_entities( arg_hash = compute_args_hash(_prompt) cached_return, _1, _2, _3 = await handle_cache( - llm_response_cache, - arg_hash, - _prompt, - "default", + llm_response_cache, + arg_hash, + _prompt, + "default", cache_type="default", - 
llm=custom_llm + llm=custom_llm, ) if cached_return: logger.debug(f"Found cache for {arg_hash}") diff --git a/lightrag/utils.py b/lightrag/utils.py index daab10b0..e5b3b8d8 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -491,7 +491,9 @@ def dequantize_embedding( return (quantized * scale + min_val).astype(np.float32) -async def handle_cache(hashing_kv, args_hash, prompt, mode="default", cache_type=None, llm=None): +async def handle_cache( + hashing_kv, args_hash, prompt, mode="default", cache_type=None, llm=None +): """Generic cache handling function""" if hashing_kv is None or not hashing_kv.global_config.get("enable_llm_cache"): return None, None, None, None @@ -528,7 +530,9 @@ async def handle_cache(hashing_kv, args_hash, prompt, mode="default", cache_type similarity_threshold=embedding_cache_config["similarity_threshold"], mode=mode, use_llm_check=use_llm_check, - llm_func=llm if (use_llm_check and llm is not None) else (llm_model_func if use_llm_check else None), + llm_func=llm + if (use_llm_check and llm is not None) + else (llm_model_func if use_llm_check else None), original_prompt=prompt if use_llm_check else None, cache_type=cache_type, ) From 2a010c985e1548d295b4972678a56509726a999e Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 30 Jan 2025 22:21:52 +0800 Subject: [PATCH 05/32] Add logging for chunk truncation in mix_kg_vector_query --- lightrag/operate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index d88dc7c2..7fbd23b6 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -911,7 +911,8 @@ async def mix_kg_vector_query( if c["created_at"]: chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\n{chunk_text}" formatted_chunks.append(chunk_text) - + + logger.info(f"Truncate {len(chunks)} to {len(formatted_chunks)} chunks") return "\n--New Chunk--\n".join(formatted_chunks) except Exception as e: logger.error(f"Error in get_vector_context: {e}") From b22a8b216c21737a599b863e3136e158cb0be109 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 30 Jan 2025 22:26:28 +0800 Subject: [PATCH 06/32] Fix linting --- lightrag/operate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 7fbd23b6..37ea24dc 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -911,7 +911,7 @@ async def mix_kg_vector_query( if c["created_at"]: chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\n{chunk_text}" formatted_chunks.append(chunk_text) - + logger.info(f"Truncate {len(chunks)} to {len(formatted_chunks)} chunks") return "\n--New Chunk--\n".join(formatted_chunks) except Exception as e: From 0692635ebdaf66844a1864f2eddee794c86ec56d Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 30 Jan 2025 22:38:32 +0800 Subject: [PATCH 07/32] Improve prompts to avoid make-up respond from LLM like qwen-plus when very long context is provided. --- lightrag/prompt.py | 74 ++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 2839e740..8e52b9c3 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -151,18 +151,18 @@ PROMPTS[ ] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. """ -PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question." 
+PROMPTS["fail_response"] = ( + "Sorry, I'm not able to provide an answer to that question.[no-context]" +) PROMPTS["rag_response"] = """---Role--- -You are a helpful assistant responding to questions about data in the tables provided. +You are a helpful assistant responding to user query about Knowledge Base provided below. ---Goal--- -Generate a response of the target length and format that responds to the user's question, considering both the conversation history and the current query. Summarize all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. -If you don't know the answer, just say so. Do not make anything up. -Do not include information where the supporting evidence for it is not provided. +Generate a concise response based on Knowledge Base and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Knowledge Base, and incorporating general knowledge relevant to the Knowledge Base. Do not include information not provided by Knowledge Base. When handling relationships with timestamps: 1. Each relationship has a "created_at" timestamp indicating when we acquired this knowledge @@ -173,15 +173,17 @@ When handling relationships with timestamps: ---Conversation History--- {history} ----Target response length and format--- - -{response_type} - ----Data tables--- - +---Knowledge Base--- {context_data} -Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. Ensure the response maintains continuity with the conversation history.""" +---Response Rules--- + +- Target format and length: {response_type} +- Use markdown formatting with appropriate section headings +- Please respond in the same language as the user's question. +- Ensure the response maintains continuity with the conversation history. +- If you don't know the answer, just say so. +- Do not make anything up. Do not include information not provided by the Knowledge Base.""" PROMPTS["keywords_extraction"] = """---Role--- @@ -253,13 +255,11 @@ Output: PROMPTS["naive_rag_response"] = """---Role--- -You are a helpful assistant responding to questions about documents provided. +You are a helpful assistant responding to user query about Document Chunks provided below. ---Goal--- -Generate a response of the target length and format that responds to the user's question, considering both the conversation history and the current query. Summarize all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. -If you don't know the answer, just say so. Do not make anything up. -Do not include information where the supporting evidence for it is not provided. +GGenerate a concise response based on Document Chunks and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Document Chunks, and incorporating general knowledge relevant to the Document Chunks. Do not include information not provided by Document Chunks. When handling content with timestamps: 1. 
Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge @@ -270,15 +270,18 @@ When handling content with timestamps: ---Conversation History--- {history} ----Target response length and format--- - -{response_type} - ----Documents--- - +---Document Chunks--- {content_data} -Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. Ensure the response maintains continuity with the conversation history.""" +---Response Rules--- + +- Target format and length: {response_type} +- Use markdown formatting with appropriate section headings +- Please respond in the same language as the user's question. +- Ensure the response maintains continuity with the conversation history. +- If you don't know the answer, just say so. +- Do not include information not provided by the Document Chunks.""" + PROMPTS[ "similarity_check" @@ -306,11 +309,12 @@ Return only a number between 0-1, without any additional content. PROMPTS["mix_rag_response"] = """---Role--- -You are a professional assistant responsible for answering questions based on knowledge graph and textual information. Please respond in the same language as the user's question. +You are a helpful assistant responding to user query about Data Sources provided below. + ---Goal--- -Generate a concise response that summarizes relevant points from the provided information, considering both the current query and conversation history. If you don't know the answer, just say so. Do not make anything up or include information where the supporting evidence is not provided. +Generate a concise response based on Data Sources and follow Response Rules, considering both the conversation history and the current query. Data sources contain two parts: Knowledge Graph(KG) and Document Chunks(DC). Summarize all information in the provided Data Sources, and incorporating general knowledge relevant to the Data Sources. Do not include information not provided by Data Sources. When handling information with timestamps: 1. Each piece of information (both relationships and content) has a "created_at" timestamp indicating when we acquired this knowledge @@ -323,22 +327,20 @@ When handling information with timestamps: ---Data Sources--- -1. Knowledge Graph Data: +1. From Knowledge Graph(KG): {kg_context} -2. Vector Data: +2. From Document Chunks(DC): {vector_context} ----Response Requirements--- +---Response Rules--- - Target format and length: {response_type} - Use markdown formatting with appropriate section headings -- Aim to keep content around 3 paragraphs for conciseness -- Each paragraph should be under a relevant section heading -- Each section should focus on one main point or aspect of the answer +- Please respond in the same language as the user's question. +- Ensure the response maintains continuity with the conversation history. +- Organize answer in sesctions focusing on one main point or aspect of the answer - Use clear and descriptive section titles that reflect the content -- Ensure the response maintains continuity with the conversation history -- List up to 5 most important reference sources at the end under "References", clearly indicating whether each source is from Knowledge Graph (KG) or Vector Data (VD) - Format: [KG/VD] Source content - -Add sections and commentary to the response as appropriate for the length and format. 
If the provided information is insufficient to answer the question, clearly state that you don't know or cannot provide an answer in the same language as the user's question.""" +- List up to 5 most important reference sources at the end under "References" sesction. Clearly indicating whether each source is from Knowledge Graph (KG) or Vector Data (DC), in the following format: [KG/DC] Source content +- If you don't know the answer, just say so. Do not make anything up. +- Do not include information not provided by the Data Sources.""" From 60b66b95e35c99d53e642b3a5c6941e95708b10b Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 31 Jan 2025 20:40:37 +0800 Subject: [PATCH 08/32] Fix typo in prompt --- lightrag/prompt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 8e52b9c3..913f8eef 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -259,7 +259,7 @@ You are a helpful assistant responding to user query about Document Chunks provi ---Goal--- -GGenerate a concise response based on Document Chunks and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Document Chunks, and incorporating general knowledge relevant to the Document Chunks. Do not include information not provided by Document Chunks. +Generate a concise response based on Document Chunks and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Document Chunks, and incorporating general knowledge relevant to the Document Chunks. Do not include information not provided by Document Chunks. When handling content with timestamps: 1. Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge From 1192727be7cde4853e887b64cb24637736c27b33 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 09:53:11 +0800 Subject: [PATCH 09/32] remove semaphore logic from EmbeddingFunc(cause num of instances is already control by limit_async_func_call) --- lightrag/utils.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index e5b3b8d8..b9308b66 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -60,15 +60,8 @@ class EmbeddingFunc: func: callable concurrent_limit: int = 16 - def __post_init__(self): - if self.concurrent_limit != 0: - self._semaphore = asyncio.Semaphore(self.concurrent_limit) - else: - self._semaphore = UnlimitedSemaphore() - async def __call__(self, *args, **kwargs) -> np.ndarray: - async with self._semaphore: - return await self.func(*args, **kwargs) + return await self.func(*args, **kwargs) def locate_json_string_body_from_string(content: str) -> Union[str, None]: From 2ba36f87e37c35104c48f9cebf12f17398d0ee1a Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 09:55:05 +0800 Subject: [PATCH 10/32] Add support for list input in quantize_embedding function - Convert list to numpy array if needed - Maintain existing functionality --- lightrag/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index b9308b66..29665c3e 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -463,8 +463,12 @@ def cosine_similarity(v1, v2): return dot_product / (norm1 * norm2) -def quantize_embedding(embedding: np.ndarray, bits=8) -> tuple: +def quantize_embedding(embedding: Union[np.ndarray, list], bits=8) -> tuple: """Quantize embedding to 
specified bits""" + # Convert list to numpy array if needed + if isinstance(embedding, list): + embedding = np.array(embedding) + # Calculate min/max values for reconstruction min_val = embedding.min() max_val = embedding.max() From 6a326e2783acfbccd5632aed33f6609df92f68b6 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 10:03:27 +0800 Subject: [PATCH 11/32] Revert "Refactor embedding functions and add async query limit" This reverts commit 21481dba8f3b020797718de3d8a82aafa7f69590. --- lightrag/kg/nano_vector_db_impl.py | 6 +----- lightrag/lightrag.py | 24 ++++++++++-------------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/lightrag/kg/nano_vector_db_impl.py b/lightrag/kg/nano_vector_db_impl.py index 6e8873fc..7bd52d20 100644 --- a/lightrag/kg/nano_vector_db_impl.py +++ b/lightrag/kg/nano_vector_db_impl.py @@ -76,8 +76,6 @@ class NanoVectorDBStorage(BaseVectorStorage): cosine_better_than_threshold: float = float(os.getenv("COSINE_THRESHOLD", "0.2")) def __post_init__(self): - # Initialize lock only for file operations - self._save_lock = asyncio.Lock() # Use global config value if specified, otherwise use default config = self.global_config.get("vector_db_storage_cls_kwargs", {}) self.cosine_better_than_threshold = config.get( @@ -212,6 +210,4 @@ class NanoVectorDBStorage(BaseVectorStorage): logger.error(f"Error deleting relations for {entity_name}: {e}") async def index_done_callback(self): - # Protect file write operation - async with self._save_lock: - self._client.save() + self._client.save() diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index f0fb92fd..92fc954f 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -154,7 +154,6 @@ class LightRAG: embedding_func: EmbeddingFunc = None # This must be set (we do want to separate llm from the corte, so no more default initialization) embedding_batch_num: int = 32 embedding_func_max_async: int = 16 - embedding_func_max_async_query: int = 4 # LLM llm_model_func: callable = None # This must be set (we do want to separate llm from the corte, so no more default initialization) @@ -196,13 +195,10 @@ class LightRAG: _print_config = ",\n ".join([f"{k} = {v}" for k, v in global_config.items()]) logger.debug(f"LightRAG init with param:\n {_print_config}\n") - # Init embedding functions with separate instances for insert and query - self.insert_embedding_func = limit_async_func_call( - self.embedding_func_max_async - )(self.embedding_func) - self.query_embedding_func = limit_async_func_call( - self.embedding_func_max_async_query - )(self.embedding_func) + # Init LLM + self.embedding_func = limit_async_func_call(self.embedding_func_max_async)( + self.embedding_func + ) # Initialize all storages self.key_string_value_json_storage_cls: Type[BaseKVStorage] = ( @@ -242,15 +238,15 @@ class LightRAG: #### self.full_docs = self.key_string_value_json_storage_cls( namespace="full_docs", - embedding_func=self.insert_embedding_func, + embedding_func=self.embedding_func, ) self.text_chunks = self.key_string_value_json_storage_cls( namespace="text_chunks", - embedding_func=self.insert_embedding_func, + embedding_func=self.embedding_func, ) self.chunk_entity_relation_graph = self.graph_storage_cls( namespace="chunk_entity_relation", - embedding_func=self.insert_embedding_func, + embedding_func=self.embedding_func, ) #### # add embedding func by walter over @@ -258,17 +254,17 @@ class LightRAG: self.entities_vdb = self.vector_db_storage_cls( namespace="entities", - embedding_func=self.query_embedding_func, + 
embedding_func=self.embedding_func, meta_fields={"entity_name"}, ) self.relationships_vdb = self.vector_db_storage_cls( namespace="relationships", - embedding_func=self.query_embedding_func, + embedding_func=self.embedding_func, meta_fields={"src_id", "tgt_id"}, ) self.chunks_vdb = self.vector_db_storage_cls( namespace="chunks", - embedding_func=self.query_embedding_func, + embedding_func=self.embedding_func, ) if self.llm_response_cache and hasattr( From 635d4fd9e4a2e2e0b3b6bc62f7fd8ca3a971c96f Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 10:14:59 +0800 Subject: [PATCH 12/32] Add lock to protect file write operations in NanoVectorDBStorage - Introduce asyncio.Lock for save operations - Ensure thread-safe file writes --- lightrag/kg/nano_vector_db_impl.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lightrag/kg/nano_vector_db_impl.py b/lightrag/kg/nano_vector_db_impl.py index 7bd52d20..6e8873fc 100644 --- a/lightrag/kg/nano_vector_db_impl.py +++ b/lightrag/kg/nano_vector_db_impl.py @@ -76,6 +76,8 @@ class NanoVectorDBStorage(BaseVectorStorage): cosine_better_than_threshold: float = float(os.getenv("COSINE_THRESHOLD", "0.2")) def __post_init__(self): + # Initialize lock only for file operations + self._save_lock = asyncio.Lock() # Use global config value if specified, otherwise use default config = self.global_config.get("vector_db_storage_cls_kwargs", {}) self.cosine_better_than_threshold = config.get( @@ -210,4 +212,6 @@ class NanoVectorDBStorage(BaseVectorStorage): logger.error(f"Error deleting relations for {entity_name}: {e}") async def index_done_callback(self): - self._client.save() + # Protect file write operation + async with self._save_lock: + self._client.save() From b109f57ddd4cb072449e3c0c4f293c7e3fe016d2 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 10:36:15 +0800 Subject: [PATCH 13/32] Refactor async call limiting to use asyncio.Semaphore for better performance. 
- Replace custom counter with asyncio.Semaphore - The existing implementation cannot follow the FIFO order --- lightrag/utils.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 29665c3e..8ab052b6 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -124,22 +124,17 @@ def compute_mdhash_id(content, prefix: str = ""): return prefix + md5(content.encode()).hexdigest() -def limit_async_func_call(max_size: int, waitting_time: float = 0.0001): - """Add restriction of maximum async calling times for a async func""" +def limit_async_func_call(max_size: int): + """Add restriction of maximum concurrent async calls using asyncio.Semaphore""" def final_decro(func): - """Not using async.Semaphore to aovid use nest-asyncio""" - __current_size = 0 + sem = asyncio.Semaphore(max_size) @wraps(func) async def wait_func(*args, **kwargs): - nonlocal __current_size - while __current_size >= max_size: - await asyncio.sleep(waitting_time) - __current_size += 1 - result = await func(*args, **kwargs) - __current_size -= 1 - return result + async with sem: + result = await func(*args, **kwargs) + return result return wait_func From 95edf8a51e28d0e0f62b980e50c356df76a5f4a4 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 15:22:40 +0800 Subject: [PATCH 14/32] Fix linting --- lightrag/api/lightrag_server.py | 27 ++++++++++++++------------- lightrag/utils.py | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index e1b24731..3bfa4943 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -13,18 +13,6 @@ from fastapi import ( from typing import Dict import threading -# Global progress tracker -scan_progress: Dict = { - "is_scanning": False, - "current_file": "", - "indexed_count": 0, - "total_files": 0, - "progress": 0, -} - -# Lock for thread-safe operations -progress_lock = threading.Lock() - import json import os @@ -34,7 +22,7 @@ import logging import argparse import time import re -from typing import List, Dict, Any, Optional, Union +from typing import List, Any, Optional, Union from lightrag import LightRAG, QueryParam from lightrag.api import __api_version__ @@ -57,8 +45,21 @@ import pipmaster as pm from dotenv import load_dotenv +# Load environment variables load_dotenv() +# Global progress tracker +scan_progress: Dict = { + "is_scanning": False, + "current_file": "", + "indexed_count": 0, + "total_files": 0, + "progress": 0, +} + +# Lock for thread-safe operations +progress_lock = threading.Lock() + def estimate_tokens(text: str) -> int: """Estimate the number of tokens in text diff --git a/lightrag/utils.py b/lightrag/utils.py index 8ab052b6..4f06d718 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -463,7 +463,7 @@ def quantize_embedding(embedding: Union[np.ndarray, list], bits=8) -> tuple: # Convert list to numpy array if needed if isinstance(embedding, list): embedding = np.array(embedding) - + # Calculate min/max values for reconstruction min_val = embedding.min() max_val = embedding.max() From c98a675b6c89d3d334aa8c7636431527040f9ff4 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 22:07:12 +0800 Subject: [PATCH 15/32] remove unused parm --- lightrag/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 4f06d718..963b3743 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -58,7 +58,7 @@ class EmbeddingFunc: 
embedding_dim: int max_token_size: int func: callable - concurrent_limit: int = 16 + # concurrent_limit: int = 16 async def __call__(self, *args, **kwargs) -> np.ndarray: return await self.func(*args, **kwargs) From c3942077a98d5a0971948497241dce42bf2b9b72 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 22:12:45 +0800 Subject: [PATCH 16/32] Use direct embedding_func from hashing_kv (do not by pass maxiumu async control) --- lightrag/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 963b3743..e868d32e 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -511,10 +511,10 @@ async def handle_cache( quantized = min_val = max_val = None if is_embedding_cache_enabled: # Use embedding cache - embedding_model_func = hashing_kv.global_config["embedding_func"]["func"] - llm_model_func = hashing_kv.global_config.get("llm_model_func") - - current_embedding = await embedding_model_func([prompt]) + current_embedding = await hashing_kv.embedding_func([prompt]) + llm_model_func = ( + hashing_kv.llm_model_func if hasattr(hashing_kv, "llm_model_func") else None + ) quantized, min_val, max_val = quantize_embedding(current_embedding[0]) best_cached_response = await get_best_cached_response( hashing_kv, From 3bc7c4d8f11c7951777298a15d5203b469b2d264 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 22:18:59 +0800 Subject: [PATCH 17/32] Save cache_type to llm_response_cache --- lightrag/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lightrag/utils.py b/lightrag/utils.py index e868d32e..9316f908 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -568,6 +568,7 @@ async def save_to_cache(hashing_kv, cache_data: CacheData): mode_cache[cache_data.args_hash] = { "return": cache_data.content, + "cache_type": cache_data.cache_data, "embedding": cache_data.quantized.tobytes().hex() if cache_data.quantized is not None else None, From b87703aea62b1b8cb227de8e628b78cfc6ee4744 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 22:19:16 +0800 Subject: [PATCH 18/32] Add embedding_func to llm_response_cache --- lightrag/lightrag.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 22db6994..108b4dd6 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -231,7 +231,7 @@ class LightRAG: self.llm_response_cache = self.key_string_value_json_storage_cls( namespace="llm_response_cache", - embedding_func=None, + embedding_func=self.embedding_func, ) #### @@ -275,7 +275,7 @@ class LightRAG: else: hashing_kv = self.key_string_value_json_storage_cls( namespace="llm_response_cache", - embedding_func=None, + embedding_func=self.embedding_func, ) self.llm_model_func = limit_async_func_call(self.llm_model_max_async)( From 3c3cdba49978e371fdc1838dd217b059cb7d3d5c Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 22:27:49 +0800 Subject: [PATCH 19/32] Fix typo error --- lightrag/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 9316f908..be6bdafe 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -568,7 +568,7 @@ async def save_to_cache(hashing_kv, cache_data: CacheData): mode_cache[cache_data.args_hash] = { "return": cache_data.content, - "cache_type": cache_data.cache_data, + "cache_type": cache_data.cache_type, "embedding": cache_data.quantized.tobytes().hex() if cache_data.quantized is not None else None, From 
2d387fa6de698a72cc6c7ea6ee3a474b2261844f Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 22:54:23 +0800 Subject: [PATCH 20/32] Save keywords to cache only when it's no empty --- lightrag/operate.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 37ea24dc..49f6bf39 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -760,7 +760,7 @@ async def extract_keywords_only( # 6. Parse out JSON from the LLM response match = re.search(r"\{.*\}", result, re.DOTALL) if not match: - logger.error("No JSON-like structure found in the result.") + logger.error("No JSON-like structure found in the LLM respond.") return [], [] try: keywords_data = json.loads(match.group(0)) @@ -772,20 +772,21 @@ async def extract_keywords_only( ll_keywords = keywords_data.get("low_level_keywords", []) # 7. Cache only the processed keywords with cache type - cache_data = {"high_level_keywords": hl_keywords, "low_level_keywords": ll_keywords} - await save_to_cache( - hashing_kv, - CacheData( - args_hash=args_hash, - content=json.dumps(cache_data), - prompt=text, - quantized=quantized, - min_val=min_val, - max_val=max_val, - mode=param.mode, - cache_type="keywords", - ), - ) + if hl_keywords or ll_keywords: + cache_data = {"high_level_keywords": hl_keywords, "low_level_keywords": ll_keywords} + await save_to_cache( + hashing_kv, + CacheData( + args_hash=args_hash, + content=json.dumps(cache_data), + prompt=text, + quantized=quantized, + min_val=min_val, + max_val=max_val, + mode=param.mode, + cache_type="keywords", + ), + ) return hl_keywords, ll_keywords From c9481c81b9ebaab16be423e48eec023e0f05ae6f Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 1 Feb 2025 23:05:02 +0800 Subject: [PATCH 21/32] Add cache type "extract" for entity extraction --- lightrag/operate.py | 4 ++-- lightrag/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 49f6bf39..bc011cb9 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -391,7 +391,7 @@ async def extract_entities( arg_hash, _prompt, "default", - cache_type="default", + cache_type="extract", llm=custom_llm, ) if cached_return: @@ -407,7 +407,7 @@ async def extract_entities( res: str = await use_llm_func(input_text) await save_to_cache( llm_response_cache, - CacheData(args_hash=arg_hash, content=res, prompt=_prompt), + CacheData(args_hash=arg_hash, content=res, prompt=_prompt, cache_type="extract"), ) return res diff --git a/lightrag/utils.py b/lightrag/utils.py index be6bdafe..d4d42b40 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -105,7 +105,7 @@ def compute_args_hash(*args, cache_type: str = None) -> str: """Compute a hash for the given arguments. Args: *args: Arguments to hash - cache_type: Type of cache (e.g., 'keywords', 'query') + cache_type: Type of cache (e.g., 'keywords', 'query', 'extract') Returns: str: Hash string """ From 6c7d7c25d388641bfa4518ededaf56f8301e74df Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 2 Feb 2025 00:10:21 +0800 Subject: [PATCH 22/32] Refactor cache handling logic for better readability, keep function unchanged. 
--- lightrag/utils.py | 88 ++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 47 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index d4d42b40..edf96dcc 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -490,56 +490,50 @@ async def handle_cache( if hashing_kv is None or not hashing_kv.global_config.get("enable_llm_cache"): return None, None, None, None - # For default mode, only use simple cache matching - if mode == "default": - if exists_func(hashing_kv, "get_by_mode_and_id"): - mode_cache = await hashing_kv.get_by_mode_and_id(mode, args_hash) or {} - else: - mode_cache = await hashing_kv.get_by_id(mode) or {} - if args_hash in mode_cache: - return mode_cache[args_hash]["return"], None, None, None - return None, None, None, None - - # Get embedding cache configuration - embedding_cache_config = hashing_kv.global_config.get( - "embedding_cache_config", - {"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}, - ) - is_embedding_cache_enabled = embedding_cache_config["enabled"] - use_llm_check = embedding_cache_config.get("use_llm_check", False) - - quantized = min_val = max_val = None - if is_embedding_cache_enabled: - # Use embedding cache - current_embedding = await hashing_kv.embedding_func([prompt]) - llm_model_func = ( - hashing_kv.llm_model_func if hasattr(hashing_kv, "llm_model_func") else None + if mode != "default": + # Get embedding cache configuration + embedding_cache_config = hashing_kv.global_config.get( + "embedding_cache_config", + {"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}, ) - quantized, min_val, max_val = quantize_embedding(current_embedding[0]) - best_cached_response = await get_best_cached_response( - hashing_kv, - current_embedding[0], - similarity_threshold=embedding_cache_config["similarity_threshold"], - mode=mode, - use_llm_check=use_llm_check, - llm_func=llm - if (use_llm_check and llm is not None) - else (llm_model_func if use_llm_check else None), - original_prompt=prompt if use_llm_check else None, - cache_type=cache_type, - ) - if best_cached_response is not None: - return best_cached_response, None, None, None + is_embedding_cache_enabled = embedding_cache_config["enabled"] + use_llm_check = embedding_cache_config.get("use_llm_check", False) + + quantized = min_val = max_val = None + if is_embedding_cache_enabled: + # Use embedding cache + current_embedding = await hashing_kv.embedding_func([prompt]) + llm_model_func = ( + hashing_kv.llm_model_func if hasattr(hashing_kv, "llm_model_func") else None + ) + quantized, min_val, max_val = quantize_embedding(current_embedding[0]) + best_cached_response = await get_best_cached_response( + hashing_kv, + current_embedding[0], + similarity_threshold=embedding_cache_config["similarity_threshold"], + mode=mode, + use_llm_check=use_llm_check, + llm_func=llm + if (use_llm_check and llm is not None) + else (llm_model_func if use_llm_check else None), + original_prompt=prompt if use_llm_check else None, + cache_type=cache_type, + ) + if best_cached_response is not None: + return best_cached_response, None, None, None + else: + return None, quantized, min_val, max_val + + # For default mode(extract_entities or naive query) or is_embedding_cache_enabled is False + # Use regular cache + if exists_func(hashing_kv, "get_by_mode_and_id"): + mode_cache = await hashing_kv.get_by_mode_and_id(mode, args_hash) or {} else: - # Use regular cache - if exists_func(hashing_kv, "get_by_mode_and_id"): - mode_cache = await 
hashing_kv.get_by_mode_and_id(mode, args_hash) or {} - else: - mode_cache = await hashing_kv.get_by_id(mode) or {} - if args_hash in mode_cache: - return mode_cache[args_hash]["return"], None, None, None + mode_cache = await hashing_kv.get_by_id(mode) or {} + if args_hash in mode_cache: + return mode_cache[args_hash]["return"], None, None, None - return None, quantized, min_val, max_val + return None, None, None, None @dataclass From b45ae1567c29ebeedb3014cbfae4afc6025f43f7 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 2 Feb 2025 01:28:46 +0800 Subject: [PATCH 23/32] Refactor LLM cache handling and entity extraction - Removed custom LLM function in entity extraction - Simplified cache handling logic - Added `force_llm_cache` parameter - Updated cache handling conditions --- lightrag/operate.py | 28 +--------------------------- lightrag/utils.py | 8 +++----- 2 files changed, 4 insertions(+), 32 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index bc011cb9..0e1eb3f3 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -352,32 +352,6 @@ async def extract_entities( input_text: str, history_messages: list[dict[str, str]] = None ) -> str: if enable_llm_cache_for_entity_extract and llm_response_cache: - custom_llm = None - if ( - global_config["embedding_cache_config"] - and global_config["embedding_cache_config"]["enabled"] - ): - new_config = global_config.copy() - new_config["embedding_cache_config"] = None - new_config["enable_llm_cache"] = True - - # create a llm function with new_config for handle_cache - async def custom_llm( - prompt, - system_prompt=None, - history_messages=[], - keyword_extraction=False, - **kwargs, - ) -> str: - # 合并 new_config 和其他 kwargs,保证其他参数不被覆盖 - merged_config = {**kwargs, **new_config} - return await use_llm_func( - prompt, - system_prompt=system_prompt, - history_messages=history_messages, - keyword_extraction=keyword_extraction, - **merged_config, - ) if history_messages: history = json.dumps(history_messages, ensure_ascii=False) @@ -392,7 +366,7 @@ async def extract_entities( _prompt, "default", cache_type="extract", - llm=custom_llm, + force_llm_cache=True, ) if cached_return: logger.debug(f"Found cache for {arg_hash}") diff --git a/lightrag/utils.py b/lightrag/utils.py index edf96dcc..1bd06e6d 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -484,10 +484,10 @@ def dequantize_embedding( async def handle_cache( - hashing_kv, args_hash, prompt, mode="default", cache_type=None, llm=None + hashing_kv, args_hash, prompt, mode="default", cache_type=None, force_llm_cache=False ): """Generic cache handling function""" - if hashing_kv is None or not hashing_kv.global_config.get("enable_llm_cache"): + if hashing_kv is None or not (force_llm_cache or hashing_kv.global_config.get("enable_llm_cache")): return None, None, None, None if mode != "default": @@ -513,9 +513,7 @@ async def handle_cache( similarity_threshold=embedding_cache_config["similarity_threshold"], mode=mode, use_llm_check=use_llm_check, - llm_func=llm - if (use_llm_check and llm is not None) - else (llm_model_func if use_llm_check else None), + llm_func=llm_model_func if use_llm_check else None, original_prompt=prompt if use_llm_check else None, cache_type=cache_type, ) From 5d14ab03eb32e7c4685e0bbeb0170b61cb30e786 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 2 Feb 2025 01:56:32 +0800 Subject: [PATCH 24/32] Fix linting --- lightrag/operate.py | 13 ++++++++++--- lightrag/utils.py | 15 ++++++++++++--- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git 
a/lightrag/operate.py b/lightrag/operate.py index 0e1eb3f3..c8c50f61 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -352,7 +352,6 @@ async def extract_entities( input_text: str, history_messages: list[dict[str, str]] = None ) -> str: if enable_llm_cache_for_entity_extract and llm_response_cache: - if history_messages: history = json.dumps(history_messages, ensure_ascii=False) _prompt = history + "\n" + input_text @@ -381,7 +380,12 @@ async def extract_entities( res: str = await use_llm_func(input_text) await save_to_cache( llm_response_cache, - CacheData(args_hash=arg_hash, content=res, prompt=_prompt, cache_type="extract"), + CacheData( + args_hash=arg_hash, + content=res, + prompt=_prompt, + cache_type="extract", + ), ) return res @@ -747,7 +751,10 @@ async def extract_keywords_only( # 7. Cache only the processed keywords with cache type if hl_keywords or ll_keywords: - cache_data = {"high_level_keywords": hl_keywords, "low_level_keywords": ll_keywords} + cache_data = { + "high_level_keywords": hl_keywords, + "low_level_keywords": ll_keywords, + } await save_to_cache( hashing_kv, CacheData( diff --git a/lightrag/utils.py b/lightrag/utils.py index 1bd06e6d..dfd68c72 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -484,10 +484,17 @@ def dequantize_embedding( async def handle_cache( - hashing_kv, args_hash, prompt, mode="default", cache_type=None, force_llm_cache=False + hashing_kv, + args_hash, + prompt, + mode="default", + cache_type=None, + force_llm_cache=False, ): """Generic cache handling function""" - if hashing_kv is None or not (force_llm_cache or hashing_kv.global_config.get("enable_llm_cache")): + if hashing_kv is None or not ( + force_llm_cache or hashing_kv.global_config.get("enable_llm_cache") + ): return None, None, None, None if mode != "default": @@ -504,7 +511,9 @@ async def handle_cache( # Use embedding cache current_embedding = await hashing_kv.embedding_func([prompt]) llm_model_func = ( - hashing_kv.llm_model_func if hasattr(hashing_kv, "llm_model_func") else None + hashing_kv.llm_model_func + if hasattr(hashing_kv, "llm_model_func") + else None ) quantized, min_val, max_val = quantize_embedding(current_embedding[0]) best_cached_response = await get_best_cached_response( From bed5a97ae29eb90d61adbab350f98ee0e490f721 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 2 Feb 2025 03:09:06 +0800 Subject: [PATCH 25/32] Fix prompt respond cache fail when is_embedding_cache_enabled is true --- lightrag/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index dfd68c72..6faa0d9e 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -523,7 +523,7 @@ async def handle_cache( mode=mode, use_llm_check=use_llm_check, llm_func=llm_model_func if use_llm_check else None, - original_prompt=prompt if use_llm_check else None, + original_prompt=prompt, cache_type=cache_type, ) if best_cached_response is not None: From fdc9017dedd1076df94a2414e2c4b21be53b4422 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 2 Feb 2025 03:14:07 +0800 Subject: [PATCH 26/32] Set embedding_func in all llm_response_cache --- lightrag/lightrag.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 108b4dd6..5ae0a47f 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -916,7 +916,7 @@ class LightRAG: else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), - embedding_func=None, + 
embedding_func=self.embedding_func, ), prompt=prompt, ) @@ -933,7 +933,7 @@ class LightRAG: else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), - embedding_func=None, + embedding_func=self.embedding_func, ), ) elif param.mode == "mix": @@ -952,7 +952,7 @@ class LightRAG: else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), - embedding_func=None, + embedding_func=self.embedding_func, ), ) else: @@ -993,7 +993,7 @@ class LightRAG: or self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), - embedding_func=None, + embedding_func=self.embedding_func, ), ) @@ -1024,7 +1024,7 @@ class LightRAG: else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), - embedding_func=None, + embedding_func=self.embedding_funcne, ), ) elif param.mode == "naive": @@ -1040,7 +1040,7 @@ class LightRAG: else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), - embedding_func=None, + embedding_func=self.embedding_func, ), ) elif param.mode == "mix": @@ -1059,7 +1059,7 @@ class LightRAG: else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), - embedding_func=None, + embedding_func=self.embedding_func, ), ) else: From 873b52d2e4f0f2671bef8ccbd27a6731ba5c8ff2 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 2 Feb 2025 03:15:43 +0800 Subject: [PATCH 27/32] Add debug logging for cache response retrieval --- lightrag/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lightrag/utils.py b/lightrag/utils.py index 6faa0d9e..f2c2c610 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -368,6 +368,7 @@ async def get_best_cached_response( original_prompt=None, cache_type=None, ) -> Union[str, None]: + logger.debug(f"get_best_cached_response: mode={mode} cache_type={cache_type}") mode_cache = await hashing_kv.get_by_id(mode) if not mode_cache: return None From 8484564f50dceb826b0e7f442bf3c81b15f74360 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 2 Feb 2025 03:54:41 +0800 Subject: [PATCH 28/32] Fix llm_model_func retrieval error. 
---
 lightrag/utils.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index f2c2c610..2ae067ba 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -368,7 +368,7 @@ async def get_best_cached_response(
     original_prompt=None,
     cache_type=None,
 ) -> Union[str, None]:
-    logger.debug(f"get_best_cached_response: mode={mode} cache_type={cache_type}")
+    logger.debug(f"get_best_cached_response: mode={mode} cache_type={cache_type} use_llm_check={use_llm_check}")
     mode_cache = await hashing_kv.get_by_id(mode)
     if not mode_cache:
         return None
@@ -511,11 +511,7 @@ async def handle_cache(
         if is_embedding_cache_enabled:
             # Use embedding cache
             current_embedding = await hashing_kv.embedding_func([prompt])
-            llm_model_func = (
-                hashing_kv.llm_model_func
-                if hasattr(hashing_kv, "llm_model_func")
-                else None
-            )
+            llm_model_func = hashing_kv.global_config.get('llm_model_func')
             quantized, min_val, max_val = quantize_embedding(current_embedding[0])
             best_cached_response = await get_best_cached_response(
                 hashing_kv,

From 6f5503ebd6d8e2e440105dd88336156b337aae5c Mon Sep 17 00:00:00 2001
From: yangdx
Date: Sun, 2 Feb 2025 04:22:43 +0800
Subject: [PATCH 29/32] Update similarity_check prompt to avoid generating two scores sometimes

---
 lightrag/prompt.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lightrag/prompt.py b/lightrag/prompt.py
index 913f8eef..160663d9 100644
--- a/lightrag/prompt.py
+++ b/lightrag/prompt.py
@@ -290,9 +290,8 @@ PROMPTS[
 Question 1: {original_prompt}
 Question 2: {cached_prompt}

-Please evaluate the following two points and provide a similarity score between 0 and 1 directly:
-1. Whether these two questions are semantically similar
-2. Whether the answer to Question 2 can be used to answer Question 1
+Please evaluate whether these two questions are semantically similar, and whether the answer to Question 2 can be used to answer Question 1, provide a similarity score between 0 and 1 directly.
+ Similarity score criteria: 0: Completely unrelated or answer cannot be reused, including but not limited to: - The questions have different topics From ecf48a5be5f7acc827853a1883a6910e72de5051 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 2 Feb 2025 04:27:21 +0800 Subject: [PATCH 30/32] Add embedding cache config and disable LLM cache for entity extraction for API Server --- lightrag/api/lightrag_server.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 3bfa4943..b1039335 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -919,6 +919,12 @@ def create_app(args): vector_db_storage_cls_kwargs={ "cosine_better_than_threshold": args.cosine_threshold }, + enable_llm_cache_for_entity_extract=False, + embedding_cache_config={ + "enabled": True, + "similarity_threshold": 0.95, + "use_llm_check": False, + }, ) else: rag = LightRAG( @@ -942,6 +948,12 @@ def create_app(args): vector_db_storage_cls_kwargs={ "cosine_better_than_threshold": args.cosine_threshold }, + enable_llm_cache_for_entity_extract=False, + embedding_cache_config={ + "enabled": True, + "similarity_threshold": 0.95, + "use_llm_check": False, + }, ) async def index_file(file_path: Union[str, Path]) -> None: From 0a693dbfda138674acd242a4b531f1307be2f030 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 2 Feb 2025 04:27:55 +0800 Subject: [PATCH 31/32] Fix linting --- lightrag/lightrag.py | 2 +- lightrag/utils.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 5ae0a47f..d77af71d 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -916,7 +916,7 @@ class LightRAG: else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), - embedding_func=self.embedding_func, + embedding_func=self.embedding_func, ), prompt=prompt, ) diff --git a/lightrag/utils.py b/lightrag/utils.py index 2ae067ba..3a69513b 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -368,7 +368,9 @@ async def get_best_cached_response( original_prompt=None, cache_type=None, ) -> Union[str, None]: - logger.debug(f"get_best_cached_response: mode={mode} cache_type={cache_type} use_llm_check={use_llm_check}") + logger.debug( + f"get_best_cached_response: mode={mode} cache_type={cache_type} use_llm_check={use_llm_check}" + ) mode_cache = await hashing_kv.get_by_id(mode) if not mode_cache: return None @@ -511,7 +513,7 @@ async def handle_cache( if is_embedding_cache_enabled: # Use embedding cache current_embedding = await hashing_kv.embedding_func([prompt]) - llm_model_func = hashing_kv.global_config.get('llm_model_func') + llm_model_func = hashing_kv.global_config.get("llm_model_func") quantized, min_val, max_val = quantize_embedding(current_embedding[0]) best_cached_response = await get_best_cached_response( hashing_kv, From 7ea1856699713de5962066014c5cc3e9078b330d Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 2 Feb 2025 07:29:01 +0800 Subject: [PATCH 32/32] Add comment to clarify LLM cache setting for entity extraction --- lightrag/api/lightrag_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index f15996ce..600bfdc4 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -919,7 +919,7 @@ def create_app(args): vector_db_storage_cls_kwargs={ "cosine_better_than_threshold": args.cosine_threshold }, 
-            enable_llm_cache_for_entity_extract=False,
+            enable_llm_cache_for_entity_extract=False,  # set to True for debugging to reduce llm fee
             embedding_cache_config={
                 "enabled": True,
                 "similarity_threshold": 0.95,
@@ -948,7 +948,7 @@ def create_app(args):
             vector_db_storage_cls_kwargs={
                 "cosine_better_than_threshold": args.cosine_threshold
             },
-            enable_llm_cache_for_entity_extract=False,
+            enable_llm_cache_for_entity_extract=False,  # set to True for debugging to reduce llm fee
             embedding_cache_config={
                 "enabled": True,
                 "similarity_threshold": 0.95,