From edb3d6ac11c7ce37503c2dc955dcf001389656dc Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 7 May 2025 10:51:44 +0800 Subject: [PATCH 01/10] Improve query context format for mix mode --- lightrag/operate.py | 162 ++++++++++++++++++++++---------------------- 1 file changed, 81 insertions(+), 81 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index b8c6a855..255b37cc 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1209,10 +1209,10 @@ async def mix_kg_vector_query( if query_param.only_need_context: context_str = f""" -\r\n\r\n=====Knowledge Graph Context=====\r\n\r\n +\r\n\r\n-----Knowledge Graph Context-----\r\n\r\n {kg_context if kg_context else "No relevant knowledge graph information found"} -\r\n\r\n=====Vector Context=====\r\n\r\n +\r\n\r\n-----Vector Context-----\r\n\r\n {vector_context if vector_context else "No relevant text information found"} """.strip() return context_str @@ -1275,6 +1275,85 @@ async def mix_kg_vector_query( return response +async def _get_vector_context( + query: str, + chunks_vdb: BaseVectorStorage, + query_param: QueryParam, + tokenizer: Tokenizer, +) -> str | None: + """ + Retrieve vector context from the vector database. + + This function performs vector search to find relevant text chunks for a query, + formats them with file path and creation time information, and truncates + the results to fit within token limits. + + Args: + query: The query string to search for + chunks_vdb: Vector database containing document chunks + query_param: Query parameters including top_k and ids + tokenizer: Tokenizer for counting tokens + + Returns: + Formatted string containing relevant text chunks, or None if no results found + """ + try: + # Reduce top_k for vector search in hybrid mode since we have structured information from KG + mix_topk = ( + min(10, query_param.top_k) + if hasattr(query_param, "mode") and query_param.mode == "mix" + else query_param.top_k + ) + results = await chunks_vdb.query(query, top_k=mix_topk, ids=query_param.ids) + if not results: + return None + + valid_chunks = [] + for result in results: + if "content" in result: + # Directly use content from chunks_vdb.query result + chunk_with_time = { + "content": result["content"], + "created_at": result.get("created_at", None), + "file_path": result.get("file_path", None), + } + valid_chunks.append(chunk_with_time) + + if not valid_chunks: + return None + + maybe_trun_chunks = truncate_list_by_token_size( + valid_chunks, + key=lambda x: x["content"], + max_token_size=query_param.max_token_for_text_unit, + tokenizer=tokenizer, + ) + + logger.debug( + f"Truncate chunks from {len(valid_chunks)} to {len(maybe_trun_chunks)} (max tokens:{query_param.max_token_for_text_unit})" + ) + logger.info(f"Vector query: {len(maybe_trun_chunks)} chunks, top_k: {mix_topk}") + + if not maybe_trun_chunks: + return None + + # Include time information in content + formatted_chunks = [] + for c in maybe_trun_chunks: + chunk_text = "File path: " + c["file_path"] + "\r\n\r\n" + c["content"] + if c["created_at"]: + chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\r\n\r\n{chunk_text}" + formatted_chunks.append(chunk_text) + + logger.debug( + f"Truncate chunks from {len(valid_chunks)} to {len(formatted_chunks)} (max tokens:{query_param.max_token_for_text_unit})" + ) + return "\r\n\r\n-------New Chunk-------\r\n\r\n".join(formatted_chunks) + except Exception as e: + logger.error(f"Error in _get_vector_context: {e}") + return None + + async def 
_build_query_context( ll_keywords: str, hl_keywords: str, @@ -2198,85 +2277,6 @@ async def kg_query_with_keywords( return response -async def _get_vector_context( - query: str, - chunks_vdb: BaseVectorStorage, - query_param: QueryParam, - tokenizer: Tokenizer, -) -> str | None: - """ - Retrieve vector context from the vector database. - - This function performs vector search to find relevant text chunks for a query, - formats them with file path and creation time information, and truncates - the results to fit within token limits. - - Args: - query: The query string to search for - chunks_vdb: Vector database containing document chunks - query_param: Query parameters including top_k and ids - tokenizer: Tokenizer for counting tokens - - Returns: - Formatted string containing relevant text chunks, or None if no results found - """ - try: - # Reduce top_k for vector search in hybrid mode since we have structured information from KG - mix_topk = ( - min(10, query_param.top_k) - if hasattr(query_param, "mode") and query_param.mode == "mix" - else query_param.top_k - ) - results = await chunks_vdb.query(query, top_k=mix_topk, ids=query_param.ids) - if not results: - return None - - valid_chunks = [] - for result in results: - if "content" in result: - # Directly use content from chunks_vdb.query result - chunk_with_time = { - "content": result["content"], - "created_at": result.get("created_at", None), - "file_path": result.get("file_path", None), - } - valid_chunks.append(chunk_with_time) - - if not valid_chunks: - return None - - maybe_trun_chunks = truncate_list_by_token_size( - valid_chunks, - key=lambda x: x["content"], - max_token_size=query_param.max_token_for_text_unit, - tokenizer=tokenizer, - ) - - logger.debug( - f"Truncate chunks from {len(valid_chunks)} to {len(maybe_trun_chunks)} (max tokens:{query_param.max_token_for_text_unit})" - ) - logger.info(f"Vector query: {len(maybe_trun_chunks)} chunks, top_k: {mix_topk}") - - if not maybe_trun_chunks: - return None - - # Include time information in content - formatted_chunks = [] - for c in maybe_trun_chunks: - chunk_text = "File path: " + c["file_path"] + "\r\n\r\n" + c["content"] - if c["created_at"]: - chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\r\n\r\n{chunk_text}" - formatted_chunks.append(chunk_text) - - logger.debug( - f"Truncate chunks from {len(valid_chunks)} to {len(formatted_chunks)} (max tokens:{query_param.max_token_for_text_unit})" - ) - return "\r\n\r\n--New Chunk--\r\n\r\n".join(formatted_chunks) - except Exception as e: - logger.error(f"Error in _get_vector_context: {e}") - return None - - async def query_with_keywords( query: str, prompt: str, From 3146309fde5393641e9f888da7df4546615d1cd9 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 7 May 2025 10:52:26 +0800 Subject: [PATCH 02/10] Change function name from list_of_list_to_json to list_of_list_to_dict --- lightrag/operate.py | 14 +++++++------- lightrag/utils.py | 20 +++++++++++++++++++- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 255b37cc..2f1a8a57 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -26,7 +26,7 @@ from .utils import ( CacheData, get_conversation_turns, use_llm_func_with_cache, - list_of_list_to_json, + list_of_list_to_dict, ) from .base import ( BaseGraphStorage, @@ -1549,7 +1549,7 @@ async def _get_node_data( file_path, ] ) - entities_context = list_of_list_to_json(entites_section_list) + entities_context = 
list_of_list_to_dict(entites_section_list) relations_section_list = [ [ @@ -1586,14 +1586,14 @@ async def _get_node_data( file_path, ] ) - relations_context = list_of_list_to_json(relations_section_list) + relations_context = list_of_list_to_dict(relations_section_list) text_units_section_list = [["id", "content", "file_path"]] for i, t in enumerate(use_text_units): text_units_section_list.append( [i, t["content"], t.get("file_path", "unknown_source")] ) - text_units_context = list_of_list_to_json(text_units_section_list) + text_units_context = list_of_list_to_dict(text_units_section_list) return entities_context, relations_context, text_units_context @@ -1871,7 +1871,7 @@ async def _get_edge_data( file_path, ] ) - relations_context = list_of_list_to_json(relations_section_list) + relations_context = list_of_list_to_dict(relations_section_list) entites_section_list = [ ["id", "entity", "type", "description", "rank", "created_at", "file_path"] @@ -1896,12 +1896,12 @@ async def _get_edge_data( file_path, ] ) - entities_context = list_of_list_to_json(entites_section_list) + entities_context = list_of_list_to_dict(entites_section_list) text_units_section_list = [["id", "content", "file_path"]] for i, t in enumerate(use_text_units): text_units_section_list.append([i, t["content"], t.get("file_path", "unknown")]) - text_units_context = list_of_list_to_json(text_units_section_list) + text_units_context = list_of_list_to_dict(text_units_section_list) return entities_context, relations_context, text_units_context diff --git a/lightrag/utils.py b/lightrag/utils.py index e4ef7699..2ed831b5 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -719,7 +719,25 @@ def truncate_list_by_token_size( return list_data -def list_of_list_to_json(data: list[list[str]]) -> list[dict[str, str]]: +def list_of_list_to_dict(data: list[list[str]]) -> list[dict[str, str]]: + """Convert a 2D string list (table-like data) into a list of dictionaries. + + The first row is treated as header containing field names. Subsequent rows become + dictionary entries where keys come from header and values from row data. + + Args: + data: 2D string array where first row contains headers and rest are data rows. + Minimum 2 columns required in data rows (rows with <2 elements are skipped). + + Returns: + List of dictionaries where each dict represents a data row with: + - Keys: Header values from first row + - Values: Corresponding row values (empty string if missing) + + Example: + Input: [["Name","Age"], ["Alice","23"], ["Bob"]] + Output: [{"Name":"Alice","Age":"23"}, {"Name":"Bob","Age":""}] + """ if not data or len(data) <= 1: return [] From 1e03888cef9225d0c554f375d04ed156e6da7252 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 7 May 2025 10:57:33 +0800 Subject: [PATCH 03/10] Change function name get_kg_context to _get_kg_context --- lightrag/operate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 2f1a8a57..ce1e76f6 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1156,7 +1156,7 @@ async def mix_kg_vector_query( ) # 2. 
Execute knowledge graph and vector searches in parallel - async def get_kg_context(): + async def _get_kg_context(): try: hl_keywords, ll_keywords = await get_keywords_from_query( query, query_param, global_config, hashing_kv @@ -1194,13 +1194,13 @@ async def mix_kg_vector_query( return context except Exception as e: - logger.error(f"Error in get_kg_context: {str(e)}") + logger.error(f"Error in _get_kg_context: {str(e)}") traceback.print_exc() return None # 3. Execute both retrievals in parallel kg_context, vector_context = await asyncio.gather( - get_kg_context(), _get_vector_context(query, chunks_vdb, query_param, tokenizer) + _get_kg_context(), _get_vector_context(query, chunks_vdb, query_param, tokenizer) ) # 4. Merge contexts From 59771b60df0363b3edd3756faff1d673db7d0150 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 7 May 2025 13:02:22 +0800 Subject: [PATCH 04/10] Optimize relationship title to entity1 and entity2 --- lightrag/operate.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index ce1e76f6..2a4137a0 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1208,8 +1208,7 @@ async def mix_kg_vector_query( return PROMPTS["fail_response"] if query_param.only_need_context: - context_str = f""" -\r\n\r\n-----Knowledge Graph Context-----\r\n\r\n + context_str = f"""\r\n\r\n-----Knowledge Graph Context-----\r\n\r\n {kg_context if kg_context else "No relevant knowledge graph information found"} \r\n\r\n-----Vector Context-----\r\n\r\n @@ -1554,8 +1553,8 @@ async def _get_node_data( relations_section_list = [ [ "id", - "source", - "target", + "entity1", + "entity2", "description", "keywords", "weight", @@ -1839,8 +1838,8 @@ async def _get_edge_data( relations_section_list = [ [ "id", - "source", - "target", + "entity1", + "entity2", "description", "keywords", "weight", From 156244e26087cdc86ae06281dc2b0ddc37d65411 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 7 May 2025 17:42:14 +0800 Subject: [PATCH 05/10] Refactor: Unify naive context to JSON format - Merges 'mix' mode query handling into 'hybrid' mode, simplifying query logic by removing the dedicated `mix_kg_vector_query` function - Standardizes vector search result by using JSON string format to build context - Fixes a bug in `query_with_keywords` ensuring `hl_keywords` and `ll_keywords` are correctly passed to `kg_query_with_keywords` --- lightrag/lightrag.py | 23 +-- lightrag/operate.py | 394 +++++++++++++------------------------------ lightrag/utils.py | 40 +++-- 3 files changed, 148 insertions(+), 309 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index e9cb0926..2145fcb1 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -53,7 +53,6 @@ from .operate import ( extract_entities, merge_nodes_and_edges, kg_query, - mix_kg_vector_query, naive_query, query_with_keywords, ) @@ -1437,8 +1436,10 @@ class LightRAG: """ # If a custom model is provided in param, temporarily update global config global_config = asdict(self) + # Save original query for vector search + param.original_query = query - if param.mode in ["local", "global", "hybrid"]: + if param.mode in ["local", "global", "hybrid", "mix"]: response = await kg_query( query.strip(), self.chunk_entity_relation_graph, @@ -1447,8 +1448,9 @@ class LightRAG: self.text_chunks, param, global_config, - hashing_kv=self.llm_response_cache, # Directly use llm_response_cache + hashing_kv=self.llm_response_cache, system_prompt=system_prompt, + chunks_vdb=self.chunks_vdb, ) 
elif param.mode == "naive": response = await naive_query( @@ -1457,20 +1459,7 @@ class LightRAG: self.text_chunks, param, global_config, - hashing_kv=self.llm_response_cache, # Directly use llm_response_cache - system_prompt=system_prompt, - ) - elif param.mode == "mix": - response = await mix_kg_vector_query( - query.strip(), - self.chunk_entity_relation_graph, - self.entities_vdb, - self.relationships_vdb, - self.chunks_vdb, - self.text_chunks, - param, - global_config, - hashing_kv=self.llm_response_cache, # Directly use llm_response_cache + hashing_kv=self.llm_response_cache, system_prompt=system_prompt, ) elif param.mode == "bypass": diff --git a/lightrag/operate.py b/lightrag/operate.py index 2a4137a0..0ff485a8 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -2,7 +2,6 @@ from __future__ import annotations from functools import partial import asyncio -import traceback import json import re import os @@ -859,6 +858,7 @@ async def kg_query( global_config: dict[str, str], hashing_kv: BaseKVStorage | None = None, system_prompt: str | None = None, + chunks_vdb: BaseVectorStorage = None, ) -> str | AsyncIterator[str]: if query_param.model_func: use_model_func = query_param.model_func @@ -911,6 +911,7 @@ async def kg_query( relationships_vdb, text_chunks_db, query_param, + chunks_vdb, ) if query_param.only_need_context: @@ -1110,182 +1111,17 @@ async def extract_keywords_only( return hl_keywords, ll_keywords -async def mix_kg_vector_query( - query: str, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - chunks_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage, - query_param: QueryParam, - global_config: dict[str, str], - hashing_kv: BaseKVStorage | None = None, - system_prompt: str | None = None, -) -> str | AsyncIterator[str]: - """ - Hybrid retrieval implementation combining knowledge graph and vector search. - - This function performs a hybrid search by: - 1. Extracting semantic information from knowledge graph - 2. Retrieving relevant text chunks through vector similarity - 3. Combining both results for comprehensive answer generation - """ - # get tokenizer - tokenizer: Tokenizer = global_config["tokenizer"] - - if query_param.model_func: - use_model_func = query_param.model_func - else: - use_model_func = global_config["llm_model_func"] - # Apply higher priority (5) to query relation LLM function - use_model_func = partial(use_model_func, _priority=5) - - # 1. Cache handling - args_hash = compute_args_hash("mix", query, cache_type="query") - cached_response, quantized, min_val, max_val = await handle_cache( - hashing_kv, args_hash, query, "mix", cache_type="query" - ) - if cached_response is not None: - return cached_response - - # Process conversation history - history_context = "" - if query_param.conversation_history: - history_context = get_conversation_turns( - query_param.conversation_history, query_param.history_turns - ) - - # 2. 
Execute knowledge graph and vector searches in parallel - async def _get_kg_context(): - try: - hl_keywords, ll_keywords = await get_keywords_from_query( - query, query_param, global_config, hashing_kv - ) - - if not hl_keywords and not ll_keywords: - logger.warning("Both high-level and low-level keywords are empty") - return None - - # Convert keyword lists to strings - ll_keywords_str = ", ".join(ll_keywords) if ll_keywords else "" - hl_keywords_str = ", ".join(hl_keywords) if hl_keywords else "" - - # Set query mode based on available keywords - if not ll_keywords_str and not hl_keywords_str: - return None - elif not ll_keywords_str: - query_param.mode = "global" - elif not hl_keywords_str: - query_param.mode = "local" - else: - query_param.mode = "hybrid" - - # Build knowledge graph context - context = await _build_query_context( - ll_keywords_str, - hl_keywords_str, - knowledge_graph_inst, - entities_vdb, - relationships_vdb, - text_chunks_db, - query_param, - ) - - return context - - except Exception as e: - logger.error(f"Error in _get_kg_context: {str(e)}") - traceback.print_exc() - return None - - # 3. Execute both retrievals in parallel - kg_context, vector_context = await asyncio.gather( - _get_kg_context(), _get_vector_context(query, chunks_vdb, query_param, tokenizer) - ) - - # 4. Merge contexts - if kg_context is None and vector_context is None: - return PROMPTS["fail_response"] - - if query_param.only_need_context: - context_str = f"""\r\n\r\n-----Knowledge Graph Context-----\r\n\r\n -{kg_context if kg_context else "No relevant knowledge graph information found"} - -\r\n\r\n-----Vector Context-----\r\n\r\n -{vector_context if vector_context else "No relevant text information found"} -""".strip() - return context_str - - # 5. Construct hybrid prompt - sys_prompt = ( - system_prompt if system_prompt else PROMPTS["mix_rag_response"] - ).format( - kg_context=kg_context - if kg_context - else "No relevant knowledge graph information found", - vector_context=vector_context - if vector_context - else "No relevant text information found", - response_type=query_param.response_type, - history=history_context, - ) - - if query_param.only_need_prompt: - return sys_prompt - - len_of_prompts = len(tokenizer.encode(query + sys_prompt)) - logger.debug(f"[mix_kg_vector_query]Prompt Tokens: {len_of_prompts}") - - # 6. Generate response - response = await use_model_func( - query, - system_prompt=sys_prompt, - stream=query_param.stream, - ) - - # Clean up response content - if isinstance(response, str) and len(response) > len(sys_prompt): - response = ( - response.replace(sys_prompt, "") - .replace("user", "") - .replace("model", "") - .replace(query, "") - .replace("", "") - .replace("", "") - .strip() - ) - - if hashing_kv.global_config.get("enable_llm_cache"): - # 7. Save cache - Only cache after collecting complete response - await save_to_cache( - hashing_kv, - CacheData( - args_hash=args_hash, - content=response, - prompt=query, - quantized=quantized, - min_val=min_val, - max_val=max_val, - mode="mix", - cache_type="query", - ), - ) - - return response - - async def _get_vector_context( query: str, chunks_vdb: BaseVectorStorage, query_param: QueryParam, tokenizer: Tokenizer, -) -> str | None: +) -> tuple[list, list, list] | None: """ Retrieve vector context from the vector database. This function performs vector search to find relevant text chunks for a query, - formats them with file path and creation time information, and truncates - the results to fit within token limits. 
+ formats them with file path and creation time information. Args: query: The query string to search for @@ -1294,18 +1130,15 @@ async def _get_vector_context( tokenizer: Tokenizer for counting tokens Returns: - Formatted string containing relevant text chunks, or None if no results found + Tuple (empty_entities, empty_relations, text_units) for combine_contexts, + compatible with _get_edge_data and _get_node_data format """ try: - # Reduce top_k for vector search in hybrid mode since we have structured information from KG - mix_topk = ( - min(10, query_param.top_k) - if hasattr(query_param, "mode") and query_param.mode == "mix" - else query_param.top_k + results = await chunks_vdb.query( + query, top_k=query_param.top_k, ids=query_param.ids ) - results = await chunks_vdb.query(query, top_k=mix_topk, ids=query_param.ids) if not results: - return None + return [], [], [] valid_chunks = [] for result in results: @@ -1314,12 +1147,12 @@ async def _get_vector_context( chunk_with_time = { "content": result["content"], "created_at": result.get("created_at", None), - "file_path": result.get("file_path", None), + "file_path": result.get("file_path", "unknown_source"), } valid_chunks.append(chunk_with_time) if not valid_chunks: - return None + return [], [], [] maybe_trun_chunks = truncate_list_by_token_size( valid_chunks, @@ -1331,26 +1164,37 @@ async def _get_vector_context( logger.debug( f"Truncate chunks from {len(valid_chunks)} to {len(maybe_trun_chunks)} (max tokens:{query_param.max_token_for_text_unit})" ) - logger.info(f"Vector query: {len(maybe_trun_chunks)} chunks, top_k: {mix_topk}") + logger.info( + f"Vector query: {len(maybe_trun_chunks)} chunks, top_k: {query_param.top_k}" + ) if not maybe_trun_chunks: - return None + return [], [], [] - # Include time information in content - formatted_chunks = [] - for c in maybe_trun_chunks: - chunk_text = "File path: " + c["file_path"] + "\r\n\r\n" + c["content"] - if c["created_at"]: - chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\r\n\r\n{chunk_text}" - formatted_chunks.append(chunk_text) + # Create empty entities and relations contexts + entities_context = [] + relations_context = [] - logger.debug( - f"Truncate chunks from {len(valid_chunks)} to {len(formatted_chunks)} (max tokens:{query_param.max_token_for_text_unit})" - ) - return "\r\n\r\n-------New Chunk-------\r\n\r\n".join(formatted_chunks) + # Create text_units_context in the same format as _get_edge_data and _get_node_data + text_units_section_list = [["id", "content", "file_path"]] + + for i, chunk in enumerate(maybe_trun_chunks): + # Add to text_units_section_list + text_units_section_list.append( + [ + i + 1, # id + chunk["content"], # content + chunk["file_path"], # file_path + ] + ) + + # Convert to dictionary format using list_of_list_to_dict + text_units_context = list_of_list_to_dict(text_units_section_list) + + return entities_context, relations_context, text_units_context except Exception as e: logger.error(f"Error in _get_vector_context: {e}") - return None + return [], [], [] async def _build_query_context( @@ -1361,8 +1205,11 @@ async def _build_query_context( relationships_vdb: BaseVectorStorage, text_chunks_db: BaseKVStorage, query_param: QueryParam, + chunks_vdb: BaseVectorStorage = None, # Add chunks_vdb parameter for mix mode ): - logger.info(f"Process {os.getpid()} buidling query context...") + logger.info(f"Process {os.getpid()} building query context...") + + # Handle local and global modes as before if 
query_param.mode == "local": entities_context, relations_context, text_units_context = await _get_node_data( ll_keywords, @@ -1379,7 +1226,7 @@ async def _build_query_context( text_chunks_db, query_param, ) - else: # hybrid mode + else: # hybrid or mix mode ll_data = await _get_node_data( ll_keywords, knowledge_graph_inst, @@ -1407,10 +1254,43 @@ async def _build_query_context( hl_text_units_context, ) = hl_data - entities_context, relations_context, text_units_context = combine_contexts( - [hl_entities_context, ll_entities_context], - [hl_relations_context, ll_relations_context], - [hl_text_units_context, ll_text_units_context], + # Initialize vector data with empty lists + vector_entities_context, vector_relations_context, vector_text_units_context = ( + [], + [], + [], + ) + + # Only get vector data if in mix mode + if query_param.mode == "mix" and hasattr(query_param, "original_query"): + # Get tokenizer from text_chunks_db + tokenizer = text_chunks_db.global_config.get("tokenizer") + + # Get vector context in triple format + vector_data = await _get_vector_context( + query_param.original_query, # We need to pass the original query + chunks_vdb, + query_param, + tokenizer, + ) + + # If vector_data is not None, unpack it + if vector_data is not None: + ( + vector_entities_context, + vector_relations_context, + vector_text_units_context, + ) = vector_data + + # Combine and deduplicate the entities, relationships, and sources + entities_context = process_combine_contexts( + hl_entities_context, ll_entities_context, vector_entities_context + ) + relations_context = process_combine_contexts( + hl_relations_context, ll_relations_context, vector_relations_context + ) + text_units_context = process_combine_contexts( + hl_text_units_context, ll_text_units_context, vector_text_units_context ) # not necessary to use LLM to generate a response if not entities_context and not relations_context: @@ -1539,7 +1419,7 @@ async def _get_node_data( entites_section_list.append( [ - i, + i + 1, n["entity_name"], n.get("entity_type", "UNKNOWN"), n.get("description", "UNKNOWN"), @@ -1574,7 +1454,7 @@ async def _get_node_data( relations_section_list.append( [ - i, + i + 1, e["src_tgt"][0], e["src_tgt"][1], e["description"], @@ -1590,7 +1470,7 @@ async def _get_node_data( text_units_section_list = [["id", "content", "file_path"]] for i, t in enumerate(use_text_units): text_units_section_list.append( - [i, t["content"], t.get("file_path", "unknown_source")] + [i + 1, t["content"], t.get("file_path", "unknown_source")] ) text_units_context = list_of_list_to_dict(text_units_section_list) return entities_context, relations_context, text_units_context @@ -1859,7 +1739,7 @@ async def _get_edge_data( relations_section_list.append( [ - i, + i + 1, e["src_id"], e["tgt_id"], e["description"], @@ -1886,7 +1766,7 @@ async def _get_edge_data( entites_section_list.append( [ - i, + i + 1, n["entity_name"], n.get("entity_type", "UNKNOWN"), n.get("description", "UNKNOWN"), @@ -1899,7 +1779,9 @@ async def _get_edge_data( text_units_section_list = [["id", "content", "file_path"]] for i, t in enumerate(use_text_units): - text_units_section_list.append([i, t["content"], t.get("file_path", "unknown")]) + text_units_section_list.append( + [i + 1, t["content"], t.get("file_path", "unknown")] + ) text_units_context = list_of_list_to_dict(text_units_section_list) return entities_context, relations_context, text_units_context @@ -2016,25 +1898,6 @@ async def _find_related_text_unit_from_relationships( return all_text_units -def 
combine_contexts(entities, relationships, sources): - # Function to extract entities, relationships, and sources from context strings - hl_entities, ll_entities = entities[0], entities[1] - hl_relationships, ll_relationships = relationships[0], relationships[1] - hl_sources, ll_sources = sources[0], sources[1] - # Combine and deduplicate the entities - combined_entities = process_combine_contexts(hl_entities, ll_entities) - - # Combine and deduplicate the relationships - combined_relationships = process_combine_contexts( - hl_relationships, ll_relationships - ) - - # Combine and deduplicate the sources - combined_sources = process_combine_contexts(hl_sources, ll_sources) - - return combined_entities, combined_relationships, combined_sources - - async def naive_query( query: str, chunks_vdb: BaseVectorStorage, @@ -2060,14 +1923,24 @@ async def naive_query( return cached_response tokenizer: Tokenizer = global_config["tokenizer"] - section = await _get_vector_context(query, chunks_vdb, query_param, tokenizer) - if section is None: + _, _, text_units_context = await _get_vector_context( + query, chunks_vdb, query_param, tokenizer + ) + + if text_units_context is None or len(text_units_context) == 0: return PROMPTS["fail_response"] + text_units_str = json.dumps(text_units_context, ensure_ascii=False) if query_param.only_need_context: - return section + return f""" +---Document Chunks--- +```json +{text_units_str} +``` + +""" # Process conversation history history_context = "" if query_param.conversation_history: @@ -2077,7 +1950,7 @@ async def naive_query( sys_prompt_temp = system_prompt if system_prompt else PROMPTS["naive_rag_response"] sys_prompt = sys_prompt_temp.format( - content_data=section, + content_data=text_units_str, response_type=query_param.response_type, history=history_context, ) @@ -2134,6 +2007,9 @@ async def kg_query_with_keywords( query_param: QueryParam, global_config: dict[str, str], hashing_kv: BaseKVStorage | None = None, + ll_keywords: list[str] = [], + hl_keywords: list[str] = [], + chunks_vdb: BaseVectorStorage | None = None, ) -> str | AsyncIterator[str]: """ Refactored kg_query that does NOT extract keywords by itself. @@ -2147,9 +2023,6 @@ async def kg_query_with_keywords( # Apply higher priority (5) to query relation LLM function use_model_func = partial(use_model_func, _priority=5) - # --------------------------- - # 1) Handle potential cache for query results - # --------------------------- args_hash = compute_args_hash(query_param.mode, query, cache_type="query") cached_response, quantized, min_val, max_val = await handle_cache( hashing_kv, args_hash, query, query_param.mode, cache_type="query" @@ -2157,14 +2030,6 @@ async def kg_query_with_keywords( if cached_response is not None: return cached_response - # --------------------------- - # 2) RETRIEVE KEYWORDS FROM query_param - # --------------------------- - - # If these fields don't exist, default to empty lists/strings. - hl_keywords = getattr(query_param, "hl_keywords", []) or [] - ll_keywords = getattr(query_param, "ll_keywords", []) or [] - # If neither has any keywords, you could handle that logic here. 
if not hl_keywords and not ll_keywords: logger.warning( @@ -2178,25 +2043,9 @@ async def kg_query_with_keywords( logger.warning("high_level_keywords is empty, switching to local mode.") query_param.mode = "local" - # Flatten low-level and high-level keywords if needed - ll_keywords_flat = ( - [item for sublist in ll_keywords for item in sublist] - if any(isinstance(i, list) for i in ll_keywords) - else ll_keywords - ) - hl_keywords_flat = ( - [item for sublist in hl_keywords for item in sublist] - if any(isinstance(i, list) for i in hl_keywords) - else hl_keywords - ) + ll_keywords_str = ", ".join(ll_keywords) if ll_keywords else "" + hl_keywords_str = ", ".join(hl_keywords) if hl_keywords else "" - # Join the flattened lists - ll_keywords_str = ", ".join(ll_keywords_flat) if ll_keywords_flat else "" - hl_keywords_str = ", ".join(hl_keywords_flat) if hl_keywords_flat else "" - - # --------------------------- - # 3) BUILD CONTEXT - # --------------------------- context = await _build_query_context( ll_keywords_str, hl_keywords_str, @@ -2205,18 +2054,14 @@ async def kg_query_with_keywords( relationships_vdb, text_chunks_db, query_param, + chunks_vdb=chunks_vdb, ) if not context: return PROMPTS["fail_response"] - # If only context is needed, return it if query_param.only_need_context: return context - # --------------------------- - # 4) BUILD THE SYSTEM PROMPT + CALL LLM - # --------------------------- - # Process conversation history history_context = "" if query_param.conversation_history: @@ -2258,7 +2103,6 @@ async def kg_query_with_keywords( ) if hashing_kv.global_config.get("enable_llm_cache"): - # 7. Save cache - 只有在收集完整响应后才缓存 await save_to_cache( hashing_kv, CacheData( @@ -2319,12 +2163,15 @@ async def query_with_keywords( ) # Create a new string with the prompt and the keywords - ll_keywords_str = ", ".join(ll_keywords) - hl_keywords_str = ", ".join(hl_keywords) - formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}" + keywords_str = ", ".join(ll_keywords + hl_keywords) + formatted_question = ( + f"{prompt}\n\n### Keywords\n\n{keywords_str}\n\n### Query\n\n{query}" + ) + + param.original_query = query # Use appropriate query method based on mode - if param.mode in ["local", "global", "hybrid"]: + if param.mode in ["local", "global", "hybrid", "mix"]: return await kg_query_with_keywords( formatted_question, knowledge_graph_inst, @@ -2334,6 +2181,9 @@ async def query_with_keywords( param, global_config, hashing_kv=hashing_kv, + hl_keywords=hl_keywords, + ll_keywords=ll_keywords, + chunks_vdb=chunks_vdb, ) elif param.mode == "naive": return await naive_query( @@ -2344,17 +2194,5 @@ async def query_with_keywords( global_config, hashing_kv=hashing_kv, ) - elif param.mode == "mix": - return await mix_kg_vector_query( - formatted_question, - knowledge_graph_inst, - entities_vdb, - relationships_vdb, - chunks_vdb, - text_chunks_db, - param, - global_config, - hashing_kv=hashing_kv, - ) else: raise ValueError(f"Unknown mode {param.mode}") diff --git a/lightrag/utils.py b/lightrag/utils.py index 2ed831b5..7b4920eb 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -721,19 +721,19 @@ def truncate_list_by_token_size( def list_of_list_to_dict(data: list[list[str]]) -> list[dict[str, str]]: """Convert a 2D string list (table-like data) into a list of dictionaries. - + The first row is treated as header containing field names. 
Subsequent rows become dictionary entries where keys come from header and values from row data. - + Args: data: 2D string array where first row contains headers and rest are data rows. Minimum 2 columns required in data rows (rows with <2 elements are skipped). - + Returns: List of dictionaries where each dict represents a data row with: - Keys: Header values from first row - Values: Corresponding row values (empty string if missing) - + Example: Input: [["Name","Age"], ["Alice","23"], ["Bob"]] Output: [{"Name":"Alice","Age":"23"}, {"Name":"Bob","Age":""}] @@ -822,21 +822,33 @@ def xml_to_json(xml_file): return None -def process_combine_contexts( - hl_context: list[dict[str, str]], ll_context: list[dict[str, str]] -): +def process_combine_contexts(*context_lists): + """ + Combine multiple context lists and remove duplicate content + + Args: + *context_lists: Any number of context lists + + Returns: + Combined context list with duplicates removed + """ seen_content = {} combined_data = [] - for item in hl_context + ll_context: - content_dict = {k: v for k, v in item.items() if k != "id"} - content_key = tuple(sorted(content_dict.items())) - if content_key not in seen_content: - seen_content[content_key] = item - combined_data.append(item) + # Iterate through all input context lists + for context_list in context_lists: + if not context_list: # Skip empty lists + continue + for item in context_list: + content_dict = {k: v for k, v in item.items() if k != "id"} + content_key = tuple(sorted(content_dict.items())) + if content_key not in seen_content: + seen_content[content_key] = item + combined_data.append(item) + # Reassign IDs for i, item in enumerate(combined_data): - item["id"] = str(i) + item["id"] = str(i + 1) return combined_data From 3eb3b170ab37dbcc9f8cf6c62f05c2a91f19cbef Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 7 May 2025 18:01:23 +0800 Subject: [PATCH 06/10] Remove list_of_list_to_dict function --- lightrag/operate.py | 180 +++++++++++++++++--------------------------- lightrag/utils.py | 38 ---------- 2 files changed, 70 insertions(+), 148 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 0ff485a8..08296bdb 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -25,7 +25,6 @@ from .utils import ( CacheData, get_conversation_turns, use_llm_func_with_cache, - list_of_list_to_dict, ) from .base import ( BaseGraphStorage, @@ -1175,22 +1174,17 @@ async def _get_vector_context( entities_context = [] relations_context = [] - # Create text_units_context in the same format as _get_edge_data and _get_node_data - text_units_section_list = [["id", "content", "file_path"]] - + # Create text_units_context directly as a list of dictionaries + text_units_context = [] for i, chunk in enumerate(maybe_trun_chunks): - # Add to text_units_section_list - text_units_section_list.append( - [ - i + 1, # id - chunk["content"], # content - chunk["file_path"], # file_path - ] + text_units_context.append( + { + "id": i + 1, + "content": chunk["content"], + "file_path": chunk["file_path"], + } ) - # Convert to dictionary format using list_of_list_to_dict - text_units_context = list_of_list_to_dict(text_units_section_list) - return entities_context, relations_context, text_units_context except Exception as e: logger.error(f"Error in _get_vector_context: {e}") @@ -1398,17 +1392,7 @@ async def _get_node_data( ) # build prompt - entites_section_list = [ - [ - "id", - "entity", - "type", - "description", - "rank", - "created_at", - "file_path", - ] - ] + entities_context = [] 
for i, n in enumerate(node_datas): created_at = n.get("created_at", "UNKNOWN") if isinstance(created_at, (int, float)): @@ -1417,32 +1401,19 @@ async def _get_node_data( # Get file path from node data file_path = n.get("file_path", "unknown_source") - entites_section_list.append( - [ - i + 1, - n["entity_name"], - n.get("entity_type", "UNKNOWN"), - n.get("description", "UNKNOWN"), - n["rank"], - created_at, - file_path, - ] + entities_context.append( + { + "id": i + 1, + "entity": n["entity_name"], + "type": n.get("entity_type", "UNKNOWN"), + "description": n.get("description", "UNKNOWN"), + "rank": n["rank"], + "created_at": created_at, + "file_path": file_path, + } ) - entities_context = list_of_list_to_dict(entites_section_list) - relations_section_list = [ - [ - "id", - "entity1", - "entity2", - "description", - "keywords", - "weight", - "rank", - "created_at", - "file_path", - ] - ] + relations_context = [] for i, e in enumerate(use_relations): created_at = e.get("created_at", "UNKNOWN") # Convert timestamp to readable format @@ -1452,27 +1423,29 @@ async def _get_node_data( # Get file path from edge data file_path = e.get("file_path", "unknown_source") - relations_section_list.append( - [ - i + 1, - e["src_tgt"][0], - e["src_tgt"][1], - e["description"], - e["keywords"], - e["weight"], - e["rank"], - created_at, - file_path, - ] + relations_context.append( + { + "id": i + 1, + "entity1": e["src_tgt"][0], + "entity2": e["src_tgt"][1], + "description": e["description"], + "keywords": e["keywords"], + "weight": e["weight"], + "rank": e["rank"], + "created_at": created_at, + "file_path": file_path, + } ) - relations_context = list_of_list_to_dict(relations_section_list) - text_units_section_list = [["id", "content", "file_path"]] + text_units_context = [] for i, t in enumerate(use_text_units): - text_units_section_list.append( - [i + 1, t["content"], t.get("file_path", "unknown_source")] + text_units_context.append( + { + "id": i + 1, + "content": t["content"], + "file_path": t.get("file_path", "unknown_source"), + } ) - text_units_context = list_of_list_to_dict(text_units_section_list) return entities_context, relations_context, text_units_context @@ -1715,19 +1688,7 @@ async def _get_edge_data( f"Global query uses {len(use_entities)} entites, {len(edge_datas)} relations, {len(use_text_units)} chunks" ) - relations_section_list = [ - [ - "id", - "entity1", - "entity2", - "description", - "keywords", - "weight", - "rank", - "created_at", - "file_path", - ] - ] + relations_context = [] for i, e in enumerate(edge_datas): created_at = e.get("created_at", "UNKNOWN") # Convert timestamp to readable format @@ -1737,24 +1698,21 @@ async def _get_edge_data( # Get file path from edge data file_path = e.get("file_path", "unknown_source") - relations_section_list.append( - [ - i + 1, - e["src_id"], - e["tgt_id"], - e["description"], - e["keywords"], - e["weight"], - e["rank"], - created_at, - file_path, - ] + relations_context.append( + { + "id": i + 1, + "entity1": e["src_id"], + "entity2": e["tgt_id"], + "description": e["description"], + "keywords": e["keywords"], + "weight": e["weight"], + "rank": e["rank"], + "created_at": created_at, + "file_path": file_path, + } ) - relations_context = list_of_list_to_dict(relations_section_list) - entites_section_list = [ - ["id", "entity", "type", "description", "rank", "created_at", "file_path"] - ] + entities_context = [] for i, n in enumerate(use_entities): created_at = n.get("created_at", "UNKNOWN") # Convert timestamp to readable format @@ -1764,25 
+1722,27 @@ async def _get_edge_data( # Get file path from node data file_path = n.get("file_path", "unknown_source") - entites_section_list.append( - [ - i + 1, - n["entity_name"], - n.get("entity_type", "UNKNOWN"), - n.get("description", "UNKNOWN"), - n["rank"], - created_at, - file_path, - ] + entities_context.append( + { + "id": i + 1, + "entity": n["entity_name"], + "type": n.get("entity_type", "UNKNOWN"), + "description": n.get("description", "UNKNOWN"), + "rank": n["rank"], + "created_at": created_at, + "file_path": file_path, + } ) - entities_context = list_of_list_to_dict(entites_section_list) - text_units_section_list = [["id", "content", "file_path"]] + text_units_context = [] for i, t in enumerate(use_text_units): - text_units_section_list.append( - [i + 1, t["content"], t.get("file_path", "unknown")] + text_units_context.append( + { + "id": i + 1, + "content": t["content"], + "file_path": t.get("file_path", "unknown"), + } ) - text_units_context = list_of_list_to_dict(text_units_section_list) return entities_context, relations_context, text_units_context diff --git a/lightrag/utils.py b/lightrag/utils.py index 7b4920eb..5e252de1 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -719,44 +719,6 @@ def truncate_list_by_token_size( return list_data -def list_of_list_to_dict(data: list[list[str]]) -> list[dict[str, str]]: - """Convert a 2D string list (table-like data) into a list of dictionaries. - - The first row is treated as header containing field names. Subsequent rows become - dictionary entries where keys come from header and values from row data. - - Args: - data: 2D string array where first row contains headers and rest are data rows. - Minimum 2 columns required in data rows (rows with <2 elements are skipped). - - Returns: - List of dictionaries where each dict represents a data row with: - - Keys: Header values from first row - - Values: Corresponding row values (empty string if missing) - - Example: - Input: [["Name","Age"], ["Alice","23"], ["Bob"]] - Output: [{"Name":"Alice","Age":"23"}, {"Name":"Bob","Age":""}] - """ - if not data or len(data) <= 1: - return [] - - header = data[0] - result = [] - - for row in data[1:]: - if len(row) >= 2: - item = {} - for i, field_name in enumerate(header): - if i < len(row): - item[field_name] = str(row[i]) - else: - item[field_name] = "" - result.append(item) - - return result - - def save_data_to_file(data, file_name): with open(file_name, "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=4) From 1c5bbe396a8a4d83dcdb4f6e8018de2dfdc18616 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 7 May 2025 18:11:12 +0800 Subject: [PATCH 07/10] Optimize prompt template for naive query --- lightrag/prompt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index f9b8cb1b..61317513 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -311,7 +311,7 @@ When handling content with timestamps: ---Conversation History--- {history} ----Document Chunks--- +---Document Chunks(DC)--- {content_data} ---Response Rules--- @@ -320,7 +320,7 @@ When handling content with timestamps: - Use markdown formatting with appropriate section headings - Please respond in the same language as the user's question. - Ensure the response maintains continuity with the conversation history. -- List up to 5 most important reference sources at the end under "References" section. 
Clearly indicating whether each source is from Knowledge Graph (KG) or Vector Data (DC), and include the file path if available, in the following format: [KG/DC] file_path +- List up to 5 most important reference sources at the end under "References" section. Clearly indicating each source from Document Chunks(DC), and include the file path if available, in the following format: [DC] file_path - If you don't know the answer, just say so. - Do not include information not provided by the Document Chunks.""" From 474b77c43eab8bc7f7c0df39957efc51d626ec4b Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 7 May 2025 18:11:35 +0800 Subject: [PATCH 08/10] Remove deprecated mix_rag_response prompt template --- lightrag/prompt.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 61317513..5616f66a 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -347,41 +347,3 @@ Similarity score criteria: 0.5: Partially related and answer needs modification to be used Return only a number between 0-1, without any additional content. """ - -PROMPTS["mix_rag_response"] = """---Role--- - -You are a helpful assistant responding to user query about Data Sources provided below. - - ----Goal--- - -Generate a concise response based on Data Sources and follow Response Rules, considering both the conversation history and the current query. Data sources contain two parts: Knowledge Graph(KG) and Document Chunks(DC). Summarize all information in the provided Data Sources, and incorporating general knowledge relevant to the Data Sources. Do not include information not provided by Data Sources. - -When handling information with timestamps: -1. Each piece of information (both relationships and content) has a "created_at" timestamp indicating when we acquired this knowledge -2. When encountering conflicting information, consider both the content/relationship and the timestamp -3. Don't automatically prefer the most recent information - use judgment based on the context -4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps - ----Conversation History--- -{history} - ----Data Sources--- - -1. From Knowledge Graph(KG): -{kg_context} - -2. From Document Chunks(DC): -{vector_context} - ----Response Rules--- - -- Target format and length: {response_type} -- Use markdown formatting with appropriate section headings -- Please respond in the same language as the user's question. -- Ensure the response maintains continuity with the conversation history. -- Organize answer in sections focusing on one main point or aspect of the answer -- Use clear and descriptive section titles that reflect the content -- List up to 5 most important reference sources at the end under "References" section. Clearly indicating whether each source is from Knowledge Graph (KG) or Vector Data (DC), and include the file path if available, in the following format: [KG/DC] file_path -- If you don't know the answer, just say so. Do not make anything up. 
-- Do not include information not provided by the Data Sources.""" From 147f73002d232aa3f84afe157567d4deb3ef10a6 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 7 May 2025 19:07:31 +0800 Subject: [PATCH 09/10] Remove ll-keywords and hl-keywords from WebUI --- lightrag/api/__init__.py | 2 +- lightrag/api/routers/query_routes.py | 24 ------- lightrag_webui/src/api/lightrag.ts | 4 -- .../components/retrieval/QuerySettings.tsx | 66 ------------------- 4 files changed, 1 insertion(+), 95 deletions(-) diff --git a/lightrag/api/__init__.py b/lightrag/api/__init__.py index ceb80096..bfc99727 100644 --- a/lightrag/api/__init__.py +++ b/lightrag/api/__init__.py @@ -1 +1 @@ -__api_version__ = "0168" +__api_version__ = "0169" diff --git a/lightrag/api/routers/query_routes.py b/lightrag/api/routers/query_routes.py index 81603487..600e680e 100644 --- a/lightrag/api/routers/query_routes.py +++ b/lightrag/api/routers/query_routes.py @@ -67,16 +67,6 @@ class QueryRequest(BaseModel): description="Maximum number of tokens allocated for entity descriptions in local retrieval.", ) - hl_keywords: Optional[List[str]] = Field( - default=None, - description="List of high-level keywords to prioritize in retrieval.", - ) - - ll_keywords: Optional[List[str]] = Field( - default=None, - description="List of low-level keywords to refine retrieval focus.", - ) - conversation_history: Optional[List[Dict[str, Any]]] = Field( default=None, description="Stores past conversation history to maintain context. Format: [{'role': 'user/assistant', 'content': 'message'}].", @@ -93,20 +83,6 @@ class QueryRequest(BaseModel): def query_strip_after(cls, query: str) -> str: return query.strip() - @field_validator("hl_keywords", mode="after") - @classmethod - def hl_keywords_strip_after(cls, hl_keywords: List[str] | None) -> List[str] | None: - if hl_keywords is None: - return None - return [keyword.strip() for keyword in hl_keywords] - - @field_validator("ll_keywords", mode="after") - @classmethod - def ll_keywords_strip_after(cls, ll_keywords: List[str] | None) -> List[str] | None: - if ll_keywords is None: - return None - return [keyword.strip() for keyword in ll_keywords] - @field_validator("conversation_history", mode="after") @classmethod def conversation_history_role_check( diff --git a/lightrag_webui/src/api/lightrag.ts b/lightrag_webui/src/api/lightrag.ts index da786529..5adeb8be 100644 --- a/lightrag_webui/src/api/lightrag.ts +++ b/lightrag_webui/src/api/lightrag.ts @@ -94,10 +94,6 @@ export type QueryRequest = { max_token_for_global_context?: number /** Maximum number of tokens allocated for entity descriptions in local retrieval. */ max_token_for_local_context?: number - /** List of high-level keywords to prioritize in retrieval. */ - hl_keywords?: string[] - /** List of low-level keywords to refine retrieval focus. */ - ll_keywords?: string[] /** * Stores past conversation history to maintain context. * Format: [{"role": "user/assistant", "content": "message"}]. 
diff --git a/lightrag_webui/src/components/retrieval/QuerySettings.tsx b/lightrag_webui/src/components/retrieval/QuerySettings.tsx index 723c1359..761d8e45 100644 --- a/lightrag_webui/src/components/retrieval/QuerySettings.tsx +++ b/lightrag_webui/src/components/retrieval/QuerySettings.tsx @@ -1,7 +1,6 @@ import { useCallback } from 'react' import { QueryMode, QueryRequest } from '@/api/lightrag' // Removed unused import for Text component -import Input from '@/components/ui/Input' import Checkbox from '@/components/ui/Checkbox' import NumberInput from '@/components/ui/NumberInput' import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/Card' @@ -242,71 +241,6 @@ export default function QuerySettings() { - {/* Keywords */} - <> - <> - - - - - - -

-          {t('retrievePanel.querySettings.hlKeywordsTooltip')}
-          {/* Removed sr-only label */}
-          <Input
-            onChange={(e) => {
-              const keywords = e.target.value
-                .split(',')
-                .map((k) => k.trim())
-                .filter((k) => k !== '')
-              handleChange('hl_keywords', keywords)
-            }}
-            placeholder={t('retrievePanel.querySettings.hlkeywordsPlaceHolder')}
-          />
-        <>
-          {t('retrievePanel.querySettings.llKeywordsTooltip')}
-          {/* Removed sr-only label */}
-          <Input
-            onChange={(e) => {
-              const keywords = e.target.value
-                .split(',')
-                .map((k) => k.trim())
-                .filter((k) => k !== '')
-              handleChange('ll_keywords', keywords)
-            }}
-            placeholder={t('retrievePanel.querySettings.hlkeywordsPlaceHolder')}
-          />
From 08e532eaf3dc246d03c853c7a4e4f28a621b2472 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 8 May 2025 03:26:14 +0800 Subject: [PATCH 10/10] Remove unused text_chunks_db param from naive_query --- lightrag/lightrag.py | 1 - lightrag/operate.py | 1 - 2 files changed, 2 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 2145fcb1..26a90539 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1456,7 +1456,6 @@ class LightRAG: response = await naive_query( query.strip(), self.chunks_vdb, - self.text_chunks, param, global_config, hashing_kv=self.llm_response_cache, diff --git a/lightrag/operate.py b/lightrag/operate.py index 08296bdb..0ca3747c 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1861,7 +1861,6 @@ async def _find_related_text_unit_from_relationships( async def naive_query( query: str, chunks_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage, query_param: QueryParam, global_config: dict[str, str], hashing_kv: BaseKVStorage | None = None,