Merge pull request #1537 from danielaskdd/redundant-chunk-fetch

Eliminate redundant chunk data fetching in naive and mix query modes
This commit is contained in:
Daniel.y
2025-05-07 02:05:05 +08:00
committed by GitHub
3 changed files with 12 additions and 19 deletions

View File

@@ -1 +1 @@
__api_version__ = "0166"
__api_version__ = "0167"

View File

@@ -390,6 +390,8 @@ class LightRAG:
),
embedding_func=self.embedding_func,
)
# TODO: deprecating, text_chunks is redundant with chunks_vdb
self.text_chunks: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore
namespace=make_namespace(
self.namespace_prefix, NameSpace.KV_STORE_TEXT_CHUNKS

View File

@@ -1213,15 +1213,12 @@ async def mix_kg_vector_query(
if not results:
return None
chunks_ids = [r["id"] for r in results]
chunks = await text_chunks_db.get_by_ids(chunks_ids)
valid_chunks = []
for chunk, result in zip(chunks, results):
if chunk is not None and "content" in chunk:
# Merge chunk content and time metadata
for result in results:
if "content" in result:
# Directly use content from chunks_vdb.query result
chunk_with_time = {
"content": chunk["content"],
"content": result["content"],
"created_at": result.get("created_at", None),
"file_path": result.get("file_path", None),
}
@@ -1256,9 +1253,9 @@ async def mix_kg_vector_query(
formatted_chunks.append(chunk_text)
logger.debug(
f"Truncate chunks from {len(chunks)} to {len(formatted_chunks)} (max tokens:{query_param.max_token_for_text_unit})"
f"Truncate chunks from {len(valid_chunks)} to {len(formatted_chunks)} (max tokens:{query_param.max_token_for_text_unit})"
)
return "\n--New Chunk--\n".join(formatted_chunks)
return "\n\n--New Chunk--\n".join(formatted_chunks)
except Exception as e:
logger.error(f"Error in get_vector_context: {e}")
return None
@@ -2052,13 +2049,7 @@ async def naive_query(
if not len(results):
return PROMPTS["fail_response"]
chunks_ids = [r["id"] for r in results]
chunks = await text_chunks_db.get_by_ids(chunks_ids)
# Filter out invalid chunks
valid_chunks = [
chunk for chunk in chunks if chunk is not None and "content" in chunk
]
valid_chunks = [result for result in results if "content" in result]
if not valid_chunks:
logger.warning("No valid chunks found after filtering")
@@ -2077,13 +2068,13 @@ async def naive_query(
return PROMPTS["fail_response"]
logger.debug(
f"Truncate chunks from {len(chunks)} to {len(maybe_trun_chunks)} (max tokens:{query_param.max_token_for_text_unit})"
f"Truncate chunks from {len(valid_chunks)} to {len(maybe_trun_chunks)} (max tokens:{query_param.max_token_for_text_unit})"
)
logger.info(
f"Naive query: {len(maybe_trun_chunks)} chunks, top_k: {query_param.top_k}"
)
section = "\n--New Chunk--\n".join(
section = "\n\n--New Chunk--\n".join(
[
"File path: " + c["file_path"] + "\n" + c["content"]
for c in maybe_trun_chunks