Support multi-turn dialogue

Magic_yuan
2025-01-24 18:59:24 +08:00
parent 3d93df4049
commit 5719aa8882
5 changed files with 479 additions and 364 deletions

View File

@@ -119,6 +119,34 @@ print(rag.query("What are the top themes in this story?", param=QueryParam(mode=
print(rag.query("What are the top themes in this story?", param=QueryParam(
mode="mix")))
### Conversation History Support
LightRAG now supports multi-turn dialogue through the conversation history feature. Here's how to use it:
```python
from lightrag import LightRAG, QueryParam
# Initialize LightRAG
rag = LightRAG(working_dir=WORKING_DIR)
# Create conversation history
conversation_history = [
{"role": "user", "content": "What is the main character's attitude towards Christmas?"},
{"role": "assistant", "content": "At the beginning of the story, Ebenezer Scrooge has a very negative attitude towards Christmas..."},
{"role": "user", "content": "How does his attitude change?"}
]
# Create query parameters with conversation history
query_param = QueryParam(
mode="mix", # or any other mode: "local", "global", "hybrid"
conversation_history=conversation_history, # Add the conversation history
history_turns=3 # Number of recent conversation turns to consider
)
# Make a query that takes into account the conversation history
response = rag.query(
"What causes this change in his character?",
param=query_param
)
```
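To carry the dialogue forward over multiple turns, each new exchange can be appended back into `conversation_history` before the next query. A minimal sketch (the follow-up question and variable names are illustrative, building on the example above):
```python
# Append the latest exchange so the next query sees it as history
conversation_history.append(
    {"role": "user", "content": "What causes this change in his character?"}
)
conversation_history.append({"role": "assistant", "content": response})

next_param = QueryParam(
    mode="mix",
    conversation_history=conversation_history,
    history_turns=3,
)
print(rag.query("How do the spirits contribute to this change?", param=next_param))
```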

View File

@@ -33,6 +33,13 @@ class QueryParam:
max_token_for_local_context: int = 4000
hl_keywords: list[str] = field(default_factory=list)
ll_keywords: list[str] = field(default_factory=list)
# Conversation history support
conversation_history: list[dict] = field(
default_factory=list
) # Format: [{"role": "user/assistant", "content": "message"}]
history_turns: int = (
3 # Number of complete conversation turns (user-assistant pairs) to consider
)
@dataclass

View File

@@ -21,6 +21,7 @@ from .utils import (
save_to_cache,
CacheData,
statistic_data,
get_conversation_turns,
)
from .base import (
BaseGraphStorage,
@@ -369,7 +370,7 @@ async def extract_entities(
arg_hash = compute_args_hash(_prompt)
cached_return, _1, _2, _3 = await handle_cache(
llm_response_cache, arg_hash, _prompt, "default"
llm_response_cache, arg_hash, _prompt, "default", cache_type="default"
)
if need_to_restore:
llm_response_cache.global_config = global_config
@@ -576,54 +577,19 @@ async def kg_query(
) -> str:
# Handle cache
use_model_func = global_config["llm_model_func"]
args_hash = compute_args_hash(query_param.mode, query)
args_hash = compute_args_hash(query_param.mode, query, cache_type="query")
cached_response, quantized, min_val, max_val = await handle_cache(
hashing_kv, args_hash, query, query_param.mode
hashing_kv, args_hash, query, query_param.mode, cache_type="query"
)
if cached_response is not None:
return cached_response
example_number = global_config["addon_params"].get("example_number", None)
if example_number and example_number < len(PROMPTS["keywords_extraction_examples"]):
examples = "\n".join(
PROMPTS["keywords_extraction_examples"][: int(example_number)]
)
else:
examples = "\n".join(PROMPTS["keywords_extraction_examples"])
language = global_config["addon_params"].get(
"language", PROMPTS["DEFAULT_LANGUAGE"]
# Extract keywords using extract_keywords_only function which already supports conversation history
hl_keywords, ll_keywords = await extract_keywords_only(
query, query_param, global_config, hashing_kv
)
# Set mode
if query_param.mode not in ["local", "global", "hybrid"]:
logger.error(f"Unknown mode {query_param.mode} in kg_query")
return PROMPTS["fail_response"]
# LLM generate keywords
kw_prompt_temp = PROMPTS["keywords_extraction"]
kw_prompt = kw_prompt_temp.format(query=query, examples=examples, language=language)
result = await use_model_func(kw_prompt, keyword_extraction=True)
logger.info("kw_prompt result:")
print(result)
try:
# json_text = locate_json_string_body_from_string(result) # handled in use_model_func
match = re.search(r"\{.*\}", result, re.DOTALL)
if match:
result = match.group(0)
keywords_data = json.loads(result)
hl_keywords = keywords_data.get("high_level_keywords", [])
ll_keywords = keywords_data.get("low_level_keywords", [])
else:
logger.error("No JSON-like structure found in the result.")
return PROMPTS["fail_response"]
# Handle parsing error
except json.JSONDecodeError as e:
print(f"JSON parsing error: {e} {result}")
return PROMPTS["fail_response"]
# Handdle keywords missing
# Handle empty keywords
if hl_keywords == [] and ll_keywords == []:
logger.warning("low_level_keywords and high_level_keywords is empty")
return PROMPTS["fail_response"]
@@ -660,12 +626,27 @@ async def kg_query(
return context
if context is None:
return PROMPTS["fail_response"]
# Process conversation history
history_context = ""
if query_param.conversation_history:
history_context = get_conversation_turns(
query_param.conversation_history, query_param.history_turns
)
sys_prompt_temp = PROMPTS["rag_response"]
sys_prompt = sys_prompt_temp.format(
context_data=context, response_type=query_param.response_type
context_data=context,
response_type=query_param.response_type,
history=history_context,
)
if query_param.only_need_prompt:
return sys_prompt
response = await use_model_func(
query,
system_prompt=sys_prompt,
@@ -693,140 +674,7 @@ async def kg_query(
min_val=min_val,
max_val=max_val,
mode=query_param.mode,
),
)
return response
async def kg_query_with_keywords(
query: str,
knowledge_graph_inst: BaseGraphStorage,
entities_vdb: BaseVectorStorage,
relationships_vdb: BaseVectorStorage,
text_chunks_db: BaseKVStorage[TextChunkSchema],
query_param: QueryParam,
global_config: dict,
hashing_kv: BaseKVStorage = None,
) -> str:
"""
Refactored kg_query that does NOT extract keywords by itself.
It expects hl_keywords and ll_keywords to be set in query_param, or defaults to empty.
Then it uses those to build context and produce a final LLM response.
"""
# ---------------------------
# 0) Handle potential cache
# ---------------------------
use_model_func = global_config["llm_model_func"]
args_hash = compute_args_hash(query_param.mode, query)
cached_response, quantized, min_val, max_val = await handle_cache(
hashing_kv, args_hash, query, query_param.mode
)
if cached_response is not None:
return cached_response
# ---------------------------
# 1) RETRIEVE KEYWORDS FROM query_param
# ---------------------------
# If these fields don't exist, default to empty lists/strings.
hl_keywords = getattr(query_param, "hl_keywords", []) or []
ll_keywords = getattr(query_param, "ll_keywords", []) or []
# If neither has any keywords, you could handle that logic here.
if not hl_keywords and not ll_keywords:
logger.warning(
"No keywords found in query_param. Could default to global mode or fail."
)
return PROMPTS["fail_response"]
if not ll_keywords and query_param.mode in ["local", "hybrid"]:
logger.warning("low_level_keywords is empty, switching to global mode.")
query_param.mode = "global"
if not hl_keywords and query_param.mode in ["global", "hybrid"]:
logger.warning("high_level_keywords is empty, switching to local mode.")
query_param.mode = "local"
# Flatten low-level and high-level keywords if needed
ll_keywords_flat = (
[item for sublist in ll_keywords for item in sublist]
if any(isinstance(i, list) for i in ll_keywords)
else ll_keywords
)
hl_keywords_flat = (
[item for sublist in hl_keywords for item in sublist]
if any(isinstance(i, list) for i in hl_keywords)
else hl_keywords
)
# Join the flattened lists
ll_keywords_str = ", ".join(ll_keywords_flat) if ll_keywords_flat else ""
hl_keywords_str = ", ".join(hl_keywords_flat) if hl_keywords_flat else ""
keywords = [ll_keywords_str, hl_keywords_str]
logger.info("Using %s mode for query processing", query_param.mode)
# ---------------------------
# 2) BUILD CONTEXT
# ---------------------------
context = await _build_query_context(
keywords,
knowledge_graph_inst,
entities_vdb,
relationships_vdb,
text_chunks_db,
query_param,
)
if not context:
return PROMPTS["fail_response"]
# If only context is needed, return it
if query_param.only_need_context:
return context
# ---------------------------
# 3) BUILD THE SYSTEM PROMPT + CALL LLM
# ---------------------------
sys_prompt_temp = PROMPTS["rag_response"]
sys_prompt = sys_prompt_temp.format(
context_data=context, response_type=query_param.response_type
)
if query_param.only_need_prompt:
return sys_prompt
# Now call the LLM with the final system prompt
response = await use_model_func(
query,
system_prompt=sys_prompt,
stream=query_param.stream,
)
# Clean up the response
if isinstance(response, str) and len(response) > len(sys_prompt):
response = (
response.replace(sys_prompt, "")
.replace("user", "")
.replace("model", "")
.replace(query, "")
.replace("<system>", "")
.replace("</system>", "")
.strip()
)
# ---------------------------
# 4) SAVE TO CACHE
# ---------------------------
await save_to_cache(
hashing_kv,
CacheData(
args_hash=args_hash,
content=response,
prompt=query,
quantized=quantized,
min_val=min_val,
max_val=max_val,
mode=query_param.mode,
cache_type="query",
),
)
return response
@@ -844,22 +692,21 @@ async def extract_keywords_only(
It ONLY extracts keywords (hl_keywords, ll_keywords).
"""
# 1. Handle cache if needed
args_hash = compute_args_hash(param.mode, text)
# 1. Handle cache if needed - add cache type for keywords
args_hash = compute_args_hash(param.mode, text, cache_type="keywords")
cached_response, quantized, min_val, max_val = await handle_cache(
hashing_kv, args_hash, text, param.mode
hashing_kv, args_hash, text, param.mode, cache_type="keywords"
)
if cached_response is not None:
# parse the cached_response if its JSON containing keywords
# or simply return (hl_keywords, ll_keywords) from cached
# Assuming cached_response is in the same JSON structure:
match = re.search(r"\{.*\}", cached_response, re.DOTALL)
if match:
keywords_data = json.loads(match.group(0))
hl_keywords = keywords_data.get("high_level_keywords", [])
ll_keywords = keywords_data.get("low_level_keywords", [])
return hl_keywords, ll_keywords
return [], []
try:
keywords_data = json.loads(cached_response)
return keywords_data["high_level_keywords"], keywords_data[
"low_level_keywords"
]
except (json.JSONDecodeError, KeyError):
logger.warning(
"Invalid cache format for keywords, proceeding with extraction"
)
# 2. Build the examples
example_number = global_config["addon_params"].get("example_number", None)
@@ -873,15 +720,23 @@ async def extract_keywords_only(
"language", PROMPTS["DEFAULT_LANGUAGE"]
)
# 3. Build the keyword-extraction prompt
kw_prompt_temp = PROMPTS["keywords_extraction"]
kw_prompt = kw_prompt_temp.format(query=text, examples=examples, language=language)
# 3. Process conversation history
history_context = ""
if param.conversation_history:
history_context = get_conversation_turns(
param.conversation_history, param.history_turns
)
# 4. Call the LLM for keyword extraction
# 4. Build the keyword-extraction prompt
kw_prompt = PROMPTS["keywords_extraction"].format(
query=text, examples=examples, language=language, history=history_context
)
# 5. Call the LLM for keyword extraction
use_model_func = global_config["llm_model_func"]
result = await use_model_func(kw_prompt, keyword_extraction=True)
# 5. Parse out JSON from the LLM response
# 6. Parse out JSON from the LLM response
match = re.search(r"\{.*\}", result, re.DOTALL)
if not match:
logger.error("No JSON-like structure found in the result.")
@@ -895,22 +750,225 @@ async def extract_keywords_only(
hl_keywords = keywords_data.get("high_level_keywords", [])
ll_keywords = keywords_data.get("low_level_keywords", [])
# 6. Cache the result if needed
# 7. Cache only the processed keywords with cache type
cache_data = {"high_level_keywords": hl_keywords, "low_level_keywords": ll_keywords}
await save_to_cache(
hashing_kv,
CacheData(
args_hash=args_hash,
content=result,
content=json.dumps(cache_data),
prompt=text,
quantized=quantized,
min_val=min_val,
max_val=max_val,
mode=param.mode,
cache_type="keywords",
),
)
return hl_keywords, ll_keywords
async def mix_kg_vector_query(
query: str,
knowledge_graph_inst: BaseGraphStorage,
entities_vdb: BaseVectorStorage,
relationships_vdb: BaseVectorStorage,
chunks_vdb: BaseVectorStorage,
text_chunks_db: BaseKVStorage[TextChunkSchema],
query_param: QueryParam,
global_config: dict,
hashing_kv: BaseKVStorage = None,
) -> str:
"""
Hybrid retrieval implementation combining knowledge graph and vector search.
This function performs a hybrid search by:
1. Extracting semantic information from knowledge graph
2. Retrieving relevant text chunks through vector similarity
3. Combining both results for comprehensive answer generation
"""
# 1. Cache handling
use_model_func = global_config["llm_model_func"]
args_hash = compute_args_hash("mix", query, cache_type="query")
cached_response, quantized, min_val, max_val = await handle_cache(
hashing_kv, args_hash, query, "mix", cache_type="query"
)
if cached_response is not None:
return cached_response
# Process conversation history
history_context = ""
if query_param.conversation_history:
history_context = get_conversation_turns(
query_param.conversation_history, query_param.history_turns
)
# 2. Execute knowledge graph and vector searches in parallel
async def get_kg_context():
try:
# Extract keywords using extract_keywords_only function which already supports conversation history
hl_keywords, ll_keywords = await extract_keywords_only(
query, query_param, global_config, hashing_kv
)
if not hl_keywords and not ll_keywords:
logger.warning("Both high-level and low-level keywords are empty")
return None
# Convert keyword lists to strings
ll_keywords_str = ", ".join(ll_keywords) if ll_keywords else ""
hl_keywords_str = ", ".join(hl_keywords) if hl_keywords else ""
# Set query mode based on available keywords
if not ll_keywords_str and not hl_keywords_str:
return None
elif not ll_keywords_str:
query_param.mode = "global"
elif not hl_keywords_str:
query_param.mode = "local"
else:
query_param.mode = "hybrid"
# Build knowledge graph context
context = await _build_query_context(
[ll_keywords_str, hl_keywords_str],
knowledge_graph_inst,
entities_vdb,
relationships_vdb,
text_chunks_db,
query_param,
)
return context
except Exception as e:
logger.error(f"Error in get_kg_context: {str(e)}")
return None
async def get_vector_context():
# Consider conversation history in vector search
augmented_query = query
if history_context:
augmented_query = f"{history_context}\n{query}"
try:
# Reduce top_k for vector search in hybrid mode since we have structured information from KG
mix_topk = min(10, query_param.top_k)
results = await chunks_vdb.query(augmented_query, top_k=mix_topk)
if not results:
return None
chunks_ids = [r["id"] for r in results]
chunks = await text_chunks_db.get_by_ids(chunks_ids)
valid_chunks = []
for chunk, result in zip(chunks, results):
if chunk is not None and "content" in chunk:
# Merge chunk content and time metadata
chunk_with_time = {
"content": chunk["content"],
"created_at": result.get("created_at", None),
}
valid_chunks.append(chunk_with_time)
if not valid_chunks:
return None
maybe_trun_chunks = truncate_list_by_token_size(
valid_chunks,
key=lambda x: x["content"],
max_token_size=query_param.max_token_for_text_unit,
)
if not maybe_trun_chunks:
return None
# Include time information in content
formatted_chunks = []
for c in maybe_trun_chunks:
chunk_text = c["content"]
if c["created_at"]:
chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\n{chunk_text}"
formatted_chunks.append(chunk_text)
return "\n--New Chunk--\n".join(formatted_chunks)
except Exception as e:
logger.error(f"Error in get_vector_context: {e}")
return None
# 3. Execute both retrievals in parallel
kg_context, vector_context = await asyncio.gather(
get_kg_context(), get_vector_context()
)
# 4. Merge contexts
if kg_context is None and vector_context is None:
return PROMPTS["fail_response"]
if query_param.only_need_context:
return {"kg_context": kg_context, "vector_context": vector_context}
# 5. Construct hybrid prompt
sys_prompt = PROMPTS["mix_rag_response"].format(
kg_context=kg_context
if kg_context
else "No relevant knowledge graph information found",
vector_context=vector_context
if vector_context
else "No relevant text information found",
response_type=query_param.response_type,
history=history_context,
)
if query_param.only_need_prompt:
return sys_prompt
# 6. Generate response
response = await use_model_func(
query,
system_prompt=sys_prompt,
stream=query_param.stream,
)
if query_param.stream:
# For streaming responses, collect the full response first
full_response = []
async for chunk in response:
full_response.append(chunk)
# Join the chunks into the full response for caching
response = "".join(full_response)
# Clean up the response content
if isinstance(response, str) and len(response) > len(sys_prompt):
response = (
response.replace(sys_prompt, "")
.replace("user", "")
.replace("model", "")
.replace(query, "")
.replace("<system>", "")
.replace("</system>", "")
.strip()
)
# 7. Save cache - only after the full response has been collected
await save_to_cache(
hashing_kv,
CacheData(
args_hash=args_hash,
content=response,
prompt=query,
quantized=quantized,
min_val=min_val,
max_val=max_val,
mode="mix",
cache_type="query",
),
)
return response
async def _build_query_context(
query: list,
knowledge_graph_inst: BaseGraphStorage,
@@ -1407,9 +1465,9 @@ async def naive_query(
):
# Handle cache
use_model_func = global_config["llm_model_func"]
args_hash = compute_args_hash(query_param.mode, query)
args_hash = compute_args_hash(query_param.mode, query, cache_type="query")
cached_response, quantized, min_val, max_val = await handle_cache(
hashing_kv, args_hash, query, query_param.mode
hashing_kv, args_hash, query, "default", cache_type="query"
)
if cached_response is not None:
return cached_response
@@ -1482,190 +1540,125 @@ async def naive_query(
min_val=min_val,
max_val=max_val,
mode=query_param.mode,
cache_type="query",
),
)
return response
async def mix_kg_vector_query(
query,
async def kg_query_with_keywords(
query: str,
knowledge_graph_inst: BaseGraphStorage,
entities_vdb: BaseVectorStorage,
relationships_vdb: BaseVectorStorage,
chunks_vdb: BaseVectorStorage,
text_chunks_db: BaseKVStorage[TextChunkSchema],
query_param: QueryParam,
global_config: dict,
hashing_kv: BaseKVStorage = None,
) -> str:
"""
Hybrid retrieval implementation combining knowledge graph and vector search.
This function performs a hybrid search by:
1. Extracting semantic information from knowledge graph
2. Retrieving relevant text chunks through vector similarity
3. Combining both results for comprehensive answer generation
Refactored kg_query that does NOT extract keywords by itself.
It expects hl_keywords and ll_keywords to be set in query_param, or defaults to empty.
Then it uses those to build context and produce a final LLM response.
"""
# 1. Cache handling
# ---------------------------
# 1) Handle potential cache for query results
# ---------------------------
use_model_func = global_config["llm_model_func"]
args_hash = compute_args_hash("mix", query)
args_hash = compute_args_hash(query_param.mode, query, cache_type="query")
cached_response, quantized, min_val, max_val = await handle_cache(
hashing_kv, args_hash, query, "mix"
hashing_kv, args_hash, query, query_param.mode, cache_type="query"
)
if cached_response is not None:
return cached_response
# 2. Execute knowledge graph and vector searches in parallel
async def get_kg_context():
try:
# Reuse keyword extraction logic from kg_query
example_number = global_config["addon_params"].get("example_number", None)
if example_number and example_number < len(
PROMPTS["keywords_extraction_examples"]
):
examples = "\n".join(
PROMPTS["keywords_extraction_examples"][: int(example_number)]
)
else:
examples = "\n".join(PROMPTS["keywords_extraction_examples"])
# ---------------------------
# 2) RETRIEVE KEYWORDS FROM query_param
# ---------------------------
language = global_config["addon_params"].get(
"language", PROMPTS["DEFAULT_LANGUAGE"]
)
# If these fields don't exist, default to empty lists/strings.
hl_keywords = getattr(query_param, "hl_keywords", []) or []
ll_keywords = getattr(query_param, "ll_keywords", []) or []
# Extract keywords using LLM
kw_prompt = PROMPTS["keywords_extraction"].format(
query=query, examples=examples, language=language
)
result = await use_model_func(kw_prompt, keyword_extraction=True)
# If neither has any keywords, you could handle that logic here.
if not hl_keywords and not ll_keywords:
logger.warning(
"No keywords found in query_param. Could default to global mode or fail."
)
return PROMPTS["fail_response"]
if not ll_keywords and query_param.mode in ["local", "hybrid"]:
logger.warning("low_level_keywords is empty, switching to global mode.")
query_param.mode = "global"
if not hl_keywords and query_param.mode in ["global", "hybrid"]:
logger.warning("high_level_keywords is empty, switching to local mode.")
query_param.mode = "local"
match = re.search(r"\{.*\}", result, re.DOTALL)
if not match:
logger.warning(
"No JSON-like structure found in keywords extraction result"
)
return None
result = match.group(0)
keywords_data = json.loads(result)
hl_keywords = keywords_data.get("high_level_keywords", [])
ll_keywords = keywords_data.get("low_level_keywords", [])
if not hl_keywords and not ll_keywords:
logger.warning("Both high-level and low-level keywords are empty")
return None
# Convert keyword lists to strings
ll_keywords_str = ", ".join(ll_keywords) if ll_keywords else ""
hl_keywords_str = ", ".join(hl_keywords) if hl_keywords else ""
# Set query mode based on available keywords
if not ll_keywords_str and not hl_keywords_str:
return None
elif not ll_keywords_str:
query_param.mode = "global"
elif not hl_keywords_str:
query_param.mode = "local"
else:
query_param.mode = "hybrid"
# Build knowledge graph context
context = await _build_query_context(
[ll_keywords_str, hl_keywords_str],
knowledge_graph_inst,
entities_vdb,
relationships_vdb,
text_chunks_db,
query_param,
)
return context
except Exception as e:
logger.error(f"Error in get_kg_context: {str(e)}")
return None
async def get_vector_context():
# Reuse vector search logic from naive_query
try:
# Reduce top_k for vector search in hybrid mode since we have structured information from KG
mix_topk = min(10, query_param.top_k)
results = await chunks_vdb.query(query, top_k=mix_topk)
if not results:
return None
chunks_ids = [r["id"] for r in results]
chunks = await text_chunks_db.get_by_ids(chunks_ids)
valid_chunks = []
for chunk, result in zip(chunks, results):
if chunk is not None and "content" in chunk:
# Merge chunk content and time metadata
chunk_with_time = {
"content": chunk["content"],
"created_at": result.get("created_at", None),
}
valid_chunks.append(chunk_with_time)
if not valid_chunks:
return None
maybe_trun_chunks = truncate_list_by_token_size(
valid_chunks,
key=lambda x: x["content"],
max_token_size=query_param.max_token_for_text_unit,
)
if not maybe_trun_chunks:
return None
# Include time information in content
formatted_chunks = []
for c in maybe_trun_chunks:
chunk_text = c["content"]
if c["created_at"]:
chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\n{chunk_text}"
formatted_chunks.append(chunk_text)
return "\n--New Chunk--\n".join(formatted_chunks)
except Exception as e:
logger.error(f"Error in get_vector_context: {e}")
return None
# 3. Execute both retrievals in parallel
kg_context, vector_context = await asyncio.gather(
get_kg_context(), get_vector_context()
# Flatten low-level and high-level keywords if needed
ll_keywords_flat = (
[item for sublist in ll_keywords for item in sublist]
if any(isinstance(i, list) for i in ll_keywords)
else ll_keywords
)
hl_keywords_flat = (
[item for sublist in hl_keywords for item in sublist]
if any(isinstance(i, list) for i in hl_keywords)
else hl_keywords
)
# 4. Merge contexts
if kg_context is None and vector_context is None:
# Join the flattened lists
ll_keywords_str = ", ".join(ll_keywords_flat) if ll_keywords_flat else ""
hl_keywords_str = ", ".join(hl_keywords_flat) if hl_keywords_flat else ""
keywords = [ll_keywords_str, hl_keywords_str]
logger.info("Using %s mode for query processing", query_param.mode)
# ---------------------------
# 3) BUILD CONTEXT
# ---------------------------
context = await _build_query_context(
keywords,
knowledge_graph_inst,
entities_vdb,
relationships_vdb,
text_chunks_db,
query_param,
)
if not context:
return PROMPTS["fail_response"]
# If only context is needed, return it
if query_param.only_need_context:
return {"kg_context": kg_context, "vector_context": vector_context}
return context
# 5. Construct hybrid prompt
sys_prompt = PROMPTS["mix_rag_response"].format(
kg_context=kg_context
if kg_context
else "No relevant knowledge graph information found",
vector_context=vector_context
if vector_context
else "No relevant text information found",
# ---------------------------
# 4) BUILD THE SYSTEM PROMPT + CALL LLM
# ---------------------------
# Process conversation history
history_context = ""
if query_param.conversation_history:
history_context = get_conversation_turns(
query_param.conversation_history, query_param.history_turns
)
sys_prompt_temp = PROMPTS["rag_response"]
sys_prompt = sys_prompt_temp.format(
context_data=context,
response_type=query_param.response_type,
history=history_context,
)
if query_param.only_need_prompt:
return sys_prompt
# 6. Generate response
response = await use_model_func(
query,
system_prompt=sys_prompt,
stream=query_param.stream,
)
if isinstance(response, str) and len(response) > len(sys_prompt):
response = (
response.replace(sys_prompt, "")
@@ -1677,7 +1670,7 @@ async def mix_kg_vector_query(
.strip()
)
# 7. Save cache
# Save to cache
await save_to_cache(
hashing_kv,
CacheData(
@@ -1687,8 +1680,8 @@ async def mix_kg_vector_query(
quantized=quantized,
min_val=min_val,
max_val=max_val,
mode="mix",
mode=query_param.mode,
cache_type="query",
),
)
return response

View File

@@ -58,7 +58,7 @@ Entity_types: [person, technology, mission, organization, location]
Text:
while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.
Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. "If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us."
The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce.
@@ -160,7 +160,7 @@ You are a helpful assistant responding to questions about data in the tables pro
---Goal---
Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
Generate a response of the target length and format that responds to the user's question, considering both the conversation history and the current query. Summarize all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
If you don't know the answer, just say so. Do not make anything up.
Do not include information where the supporting evidence for it is not provided.
@@ -170,6 +170,9 @@ When handling relationships with timestamps:
3. Don't automatically prefer the most recently created relationships - use judgment based on the context
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
---Conversation History---
{history}
---Target response length and format---
{response_type}
@@ -178,22 +181,23 @@ When handling relationships with timestamps:
{context_data}
Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown."""
Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. Ensure the response maintains continuity with the conversation history."""
PROMPTS["keywords_extraction"] = """---Role---
You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query.
You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query and conversation history.
---Goal---
Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms.
Given the query and conversation history, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms.
---Instructions---
- Output the keywords in JSON format.
- Consider both the current query and relevant conversation history when extracting keywords
- Output the keywords in JSON format
- The JSON should have two keys:
- "high_level_keywords" for overarching concepts or themes.
- "low_level_keywords" for specific entities or details.
- "high_level_keywords" for overarching concepts or themes
- "low_level_keywords" for specific entities or details
######################
-Examples-
@@ -203,7 +207,10 @@ Given the query, list both high-level and low-level keywords. High-level keyword
#############################
-Real Data-
######################
Query: {query}
Conversation History:
{history}
Current Query: {query}
######################
The `Output` should be human text, not unicode characters. Keep the same language as `Query`.
Output:
@@ -248,10 +255,9 @@ PROMPTS["naive_rag_response"] = """---Role---
You are a helpful assistant responding to questions about documents provided.
---Goal---
Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
Generate a response of the target length and format that responds to the user's question, considering both the conversation history and the current query. Summarize all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
If you don't know the answer, just say so. Do not make anything up.
Do not include information where the supporting evidence for it is not provided.
@@ -261,6 +267,9 @@ When handling content with timestamps:
3. Don't automatically prefer the most recent content - use judgment based on the context
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
---Conversation History---
{history}
---Target response length and format---
{response_type}
@@ -269,8 +278,7 @@ When handling content with timestamps:
{content_data}
Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
"""
Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. Ensure the response maintains continuity with the conversation history."""
PROMPTS[
"similarity_check"
@@ -302,7 +310,7 @@ You are a professional assistant responsible for answering questions based on kn
---Goal---
Generate a concise response that summarizes relevant points from the provided information. If you don't know the answer, just say so. Do not make anything up or include information where the supporting evidence is not provided.
Generate a concise response that summarizes relevant points from the provided information, considering both the current query and conversation history. If you don't know the answer, just say so. Do not make anything up or include information where the supporting evidence is not provided.
When handling information with timestamps:
1. Each piece of information (both relationships and content) has a "created_at" timestamp indicating when we acquired this knowledge
@@ -310,6 +318,9 @@ When handling information with timestamps:
3. Don't automatically prefer the most recent information - use judgment based on the context
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
---Conversation History---
{history}
---Data Sources---
1. Knowledge Graph Data:
@@ -326,6 +337,7 @@ When handling information with timestamps:
- Each paragraph should be under a relevant section heading
- Each section should focus on one main point or aspect of the answer
- Use clear and descriptive section titles that reflect the content
- Ensure the response maintains continuity with the conversation history
- List up to 5 most important reference sources at the end under "References", clearly indicating whether each source is from Knowledge Graph (KG) or Vector Data (VD)
Format: [KG/VD] Source content
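Because the `rag_response`, `naive_rag_response`, and `mix_rag_response` templates now contain a `{history}` placeholder, every `.format(...)` call on them must pass `history` (an empty string when no conversation history is available). A minimal sketch, assuming `PROMPTS` is importable from `lightrag.prompt` and using placeholder values:
```python
from lightrag.prompt import PROMPTS

history_context = ""  # or the string produced by get_conversation_turns(...)
sys_prompt = PROMPTS["rag_response"].format(
    context_data="<retrieved context>",   # illustrative placeholder
    response_type="Multiple Paragraphs",  # illustrative placeholder
    history=history_context,              # required now that {history} exists
)
print(sys_prompt[:200])
```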

View File

@@ -108,8 +108,23 @@ def convert_response_to_json(response: str) -> dict:
raise e from None
def compute_args_hash(*args):
return md5(str(args).encode()).hexdigest()
def compute_args_hash(*args, cache_type: str = None) -> str:
"""Compute a hash for the given arguments.
Args:
*args: Arguments to hash
cache_type: Type of cache (e.g., 'keywords', 'query')
Returns:
str: Hash string
"""
import hashlib
# Convert all arguments to strings and join them
args_str = "".join([str(arg) for arg in args])
if cache_type:
args_str = f"{cache_type}:{args_str}"
# Compute MD5 hash
return hashlib.md5(args_str.encode()).hexdigest()
def compute_mdhash_id(content, prefix: str = ""):
@@ -343,8 +358,8 @@ async def get_best_cached_response(
use_llm_check=False,
llm_func=None,
original_prompt=None,
cache_type=None,
) -> Union[str, None]:
# Get mode-specific cache
mode_cache = await hashing_kv.get_by_id(mode)
if not mode_cache:
return None
@@ -356,6 +371,10 @@ async def get_best_cached_response(
# Only iterate through cache entries for this mode
for cache_id, cache_data in mode_cache.items():
# Skip if cache_type doesn't match
if cache_type and cache_data.get("cache_type") != cache_type:
continue
if cache_data["embedding"] is None:
continue
@@ -452,13 +471,12 @@ def dequantize_embedding(
return (quantized * scale + min_val).astype(np.float32)
async def handle_cache(hashing_kv, args_hash, prompt, mode="default"):
async def handle_cache(hashing_kv, args_hash, prompt, mode="default", cache_type=None):
"""Generic cache handling function"""
if hashing_kv is None or not hashing_kv.global_config.get("enable_llm_cache"):
return None, None, None, None
# For naive mode, only use simple cache matching
# if mode == "naive":
# For default mode, only use simple cache matching
if mode == "default":
if exists_func(hashing_kv, "get_by_mode_and_id"):
mode_cache = await hashing_kv.get_by_mode_and_id(mode, args_hash) or {}
@@ -492,6 +510,7 @@ async def handle_cache(hashing_kv, args_hash, prompt, mode="default"):
use_llm_check=use_llm_check,
llm_func=llm_model_func if use_llm_check else None,
original_prompt=prompt if use_llm_check else None,
cache_type=cache_type,
)
if best_cached_response is not None:
return best_cached_response, None, None, None
@@ -573,3 +592,59 @@ def exists_func(obj, func_name: str) -> bool:
return True
else:
return False
def get_conversation_turns(conversation_history: list[dict], num_turns: int) -> str:
"""
Process conversation history to get the specified number of complete turns.
Args:
conversation_history: List of conversation messages in chronological order
num_turns: Number of complete turns to include
Returns:
Formatted string of the conversation history
"""
# Group messages into turns
turns = []
messages = []
# First, filter out keyword extraction messages
for msg in conversation_history:
if msg["role"] == "assistant" and (
msg["content"].startswith('{ "high_level_keywords"')
or msg["content"].startswith("{'high_level_keywords'")
):
continue
messages.append(msg)
# Then process messages in chronological order
i = 0
while i < len(messages) - 1:
msg1 = messages[i]
msg2 = messages[i + 1]
# Check if we have a user-assistant or assistant-user pair
if (msg1["role"] == "user" and msg2["role"] == "assistant") or (
msg1["role"] == "assistant" and msg2["role"] == "user"
):
# Always put user message first in the turn
if msg1["role"] == "assistant":
turn = [msg2, msg1] # user, assistant
else:
turn = [msg1, msg2] # user, assistant
turns.append(turn)
i += 1
# Keep only the most recent num_turns
if len(turns) > num_turns:
turns = turns[-num_turns:]
# Format the turns into a string
formatted_turns = []
for turn in turns:
formatted_turns.extend(
[f"user: {turn[0]['content']}", f"assistant: {turn[1]['content']}"]
)
return "\n".join(formatted_turns)