From ccf44dc334531383af060821c54a48f9075bb9da Mon Sep 17 00:00:00 2001
From: Magic_yuan <317617749@qq.com>
Date: Sun, 8 Dec 2024 17:35:52 +0800
Subject: [PATCH] feat(cache): Add LLM similarity check and optimize the cache
 mechanism - Add a use_llm_check parameter to the embedding cache config -
 Implement LLM similarity check logic as a secondary validation for cache
 hits - Optimize the cache handling flow for naive mode - Adjust the cache
 data structure and remove the unnecessary model field
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md            |   6 +-
 lightrag/lightrag.py |   9 +-
 lightrag/llm.py      | 265 ++-----------------------------------------
 lightrag/operate.py  |  59 ++++++++--
 lightrag/prompt.py   |  19 ++++
 lightrag/utils.py    |  55 ++++++++-
 6 files changed, 138 insertions(+), 275 deletions(-)

diff --git a/README.md b/README.md
index a2cbb217..a1454792 100644
--- a/README.md
+++ b/README.md
@@ -596,11 +596,7 @@ if __name__ == "__main__":
 | **enable\_llm\_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` |
 | **addon\_params** | `dict` | Additional parameters, e.g., `{"example_number": 1, "language": "Simplified Chinese"}`: sets example limit and output language | `example_number: all examples, language: English` |
 | **convert\_response\_to\_json\_func** | `callable` | Not used | `convert_response_to_json` |
-| **embedding\_cache\_config** | `dict` | Configuration for question-answer caching. Contains two parameters:
-- `enabled`: Boolean value to enable/disable caching functionality. When enabled, questions and answers will be cached.
-- `similarity_threshold`: Float value (0-1), similarity threshold. When a new question's similarity with a cached question exceeds this threshold, the cached answer will be returned directly without calling the LLM.
-
-Default: `{"enabled": False, "similarity_threshold": 0.95}` | `{"enabled": False, "similarity_threshold": 0.95}` |
+| **embedding\_cache\_config** | `dict` | Configuration for question-answer caching. Contains three parameters:
- `enabled`: Boolean value to enable/disable cache lookup functionality. When enabled, the system will check cached responses before generating new answers.
- `similarity_threshold`: Float value (0-1). When a new question's similarity with a cached question exceeds this threshold, the cached answer is returned directly without calling the LLM.
- `use_llm_check`: Boolean value to enable/disable LLM similarity verification. When enabled, LLM will be used as a secondary check to verify the similarity between questions before returning cached answers. | Default: `{"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}` | ## API Server Implementation diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 0a44187e..0eb1b27e 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -87,7 +87,11 @@ class LightRAG: ) # Default not to use embedding cache embedding_cache_config: dict = field( - default_factory=lambda: {"enabled": False, "similarity_threshold": 0.95} + default_factory=lambda: { + "enabled": False, + "similarity_threshold": 0.95, + "use_llm_check": False, + } ) kv_storage: str = field(default="JsonKVStorage") vector_storage: str = field(default="NanoVectorDBStorage") @@ -174,7 +178,6 @@ class LightRAG: if self.enable_llm_cache else None ) - self.embedding_func = limit_async_func_call(self.embedding_func_max_async)( self.embedding_func ) @@ -481,6 +484,7 @@ class LightRAG: self.text_chunks, param, asdict(self), + hashing_kv=self.llm_response_cache, ) elif param.mode == "naive": response = await naive_query( @@ -489,6 +493,7 @@ class LightRAG: self.text_chunks, param, asdict(self), + hashing_kv=self.llm_response_cache, ) else: raise ValueError(f"Unknown mode {param.mode}") diff --git a/lightrag/llm.py b/lightrag/llm.py index 63913c90..507753f4 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -4,8 +4,7 @@ import json import os import struct from functools import lru_cache -from typing import List, Dict, Callable, Any, Union, Optional -from dataclasses import dataclass +from typing import List, Dict, Callable, Any, Union import aioboto3 import aiohttp import numpy as np @@ -27,13 +26,9 @@ from tenacity import ( ) from transformers import AutoTokenizer, AutoModelForCausalLM -from .base import BaseKVStorage from .utils import ( - compute_args_hash, wrap_embedding_func_with_attrs, locate_json_string_body_from_string, - quantize_embedding, - get_best_cached_response, ) import sys @@ -66,23 +61,13 @@ async def openai_complete_if_cache( openai_async_client = ( AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url) ) - + kwargs.pop("hashing_kv", None) messages = [] if system_prompt: messages.append({"role": "system", "content": system_prompt}) messages.extend(history_messages) messages.append({"role": "user", "content": prompt}) - hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) - # Handle cache - mode = kwargs.pop("mode", "default") - args_hash = compute_args_hash(model, messages) - cached_response, quantized, min_val, max_val = await handle_cache( - hashing_kv, args_hash, prompt, mode - ) - if cached_response is not None: - return cached_response - if "response_format" in kwargs: response = await openai_async_client.beta.chat.completions.parse( model=model, messages=messages, **kwargs @@ -95,21 +80,6 @@ async def openai_complete_if_cache( if r"\u" in content: content = content.encode("utf-8").decode("unicode_escape") - # Save to cache - await save_to_cache( - hashing_kv, - CacheData( - args_hash=args_hash, - content=content, - model=model, - prompt=prompt, - quantized=quantized, - min_val=min_val, - max_val=max_val, - mode=mode, - ), - ) - return content @@ -140,10 +110,7 @@ async def azure_openai_complete_if_cache( api_key=os.getenv("AZURE_OPENAI_API_KEY"), api_version=os.getenv("AZURE_OPENAI_API_VERSION"), ) - - hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) - 
mode = kwargs.pop("mode", "default") - + kwargs.pop("hashing_kv", None) messages = [] if system_prompt: messages.append({"role": "system", "content": system_prompt}) @@ -151,34 +118,11 @@ async def azure_openai_complete_if_cache( if prompt is not None: messages.append({"role": "user", "content": prompt}) - # Handle cache - args_hash = compute_args_hash(model, messages) - cached_response, quantized, min_val, max_val = await handle_cache( - hashing_kv, args_hash, prompt, mode - ) - if cached_response is not None: - return cached_response - response = await openai_async_client.chat.completions.create( model=model, messages=messages, **kwargs ) content = response.choices[0].message.content - # Save to cache - await save_to_cache( - hashing_kv, - CacheData( - args_hash=args_hash, - content=content, - model=model, - prompt=prompt, - quantized=quantized, - min_val=min_val, - max_val=max_val, - mode=mode, - ), - ) - return content @@ -210,7 +154,7 @@ async def bedrock_complete_if_cache( os.environ["AWS_SESSION_TOKEN"] = os.environ.get( "AWS_SESSION_TOKEN", aws_session_token ) - + kwargs.pop("hashing_kv", None) # Fix message history format messages = [] for history_message in history_messages: @@ -220,15 +164,6 @@ async def bedrock_complete_if_cache( # Add user prompt messages.append({"role": "user", "content": [{"text": prompt}]}) - hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) - # Handle cache - mode = kwargs.pop("mode", "default") - args_hash = compute_args_hash(model, messages) - cached_response, quantized, min_val, max_val = await handle_cache( - hashing_kv, args_hash, prompt, mode - ) - if cached_response is not None: - return cached_response # Initialize Converse API arguments args = {"modelId": model, "messages": messages} @@ -251,15 +186,6 @@ async def bedrock_complete_if_cache( args["inferenceConfig"][inference_params_map.get(param, param)] = ( kwargs.pop(param) ) - hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) - # Handle cache - mode = kwargs.pop("mode", "default") - args_hash = compute_args_hash(model, messages) - cached_response, quantized, min_val, max_val = await handle_cache( - hashing_kv, args_hash, prompt, mode - ) - if cached_response is not None: - return cached_response # Call model via Converse API session = aioboto3.Session() @@ -269,21 +195,6 @@ async def bedrock_complete_if_cache( except Exception as e: raise BedrockError(e) - # Save to cache - await save_to_cache( - hashing_kv, - CacheData( - args_hash=args_hash, - content=response["output"]["message"]["content"][0]["text"], - model=model, - prompt=prompt, - quantized=quantized, - min_val=min_val, - max_val=max_val, - mode=mode, - ), - ) - return response["output"]["message"]["content"][0]["text"] @@ -315,22 +226,12 @@ async def hf_model_if_cache( ) -> str: model_name = model hf_model, hf_tokenizer = initialize_hf_model(model_name) - hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) messages = [] if system_prompt: messages.append({"role": "system", "content": system_prompt}) messages.extend(history_messages) messages.append({"role": "user", "content": prompt}) - - # Handle cache - mode = kwargs.pop("mode", "default") - args_hash = compute_args_hash(model, messages) - cached_response, quantized, min_val, max_val = await handle_cache( - hashing_kv, args_hash, prompt, mode - ) - if cached_response is not None: - return cached_response - + kwargs.pop("hashing_kv", None) input_prompt = "" try: input_prompt = hf_tokenizer.apply_chat_template( @@ -375,21 +276,6 @@ async def hf_model_if_cache( 
output[0][len(inputs["input_ids"][0]) :], skip_special_tokens=True ) - # Save to cache - await save_to_cache( - hashing_kv, - CacheData( - args_hash=args_hash, - content=response_text, - model=model, - prompt=prompt, - quantized=quantized, - min_val=min_val, - max_val=max_val, - mode=mode, - ), - ) - return response_text @@ -410,25 +296,14 @@ async def ollama_model_if_cache( # kwargs.pop("response_format", None) # allow json host = kwargs.pop("host", None) timeout = kwargs.pop("timeout", None) - + kwargs.pop("hashing_kv", None) ollama_client = ollama.AsyncClient(host=host, timeout=timeout) messages = [] if system_prompt: messages.append({"role": "system", "content": system_prompt}) - - hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) messages.extend(history_messages) messages.append({"role": "user", "content": prompt}) - # Handle cache - mode = kwargs.pop("mode", "default") - args_hash = compute_args_hash(model, messages) - cached_response, quantized, min_val, max_val = await handle_cache( - hashing_kv, args_hash, prompt, mode - ) - if cached_response is not None: - return cached_response - response = await ollama_client.chat(model=model, messages=messages, **kwargs) if stream: """ cannot cache stream response """ @@ -441,38 +316,7 @@ async def ollama_model_if_cache( else: result = response["message"]["content"] # Save to cache - await save_to_cache( - hashing_kv, - CacheData( - args_hash=args_hash, - content=result, - model=model, - prompt=prompt, - quantized=quantized, - min_val=min_val, - max_val=max_val, - mode=mode, - ), - ) return result - result = response["message"]["content"] - - # Save to cache - await save_to_cache( - hashing_kv, - CacheData( - args_hash=args_hash, - content=result, - model=model, - prompt=prompt, - quantized=quantized, - min_val=min_val, - max_val=max_val, - mode=mode, - ), - ) - - return result @lru_cache(maxsize=1) @@ -547,7 +391,7 @@ async def lmdeploy_model_if_cache( from lmdeploy import version_info, GenerationConfig except Exception: raise ImportError("Please install lmdeploy before intialize lmdeploy backend.") - + kwargs.pop("hashing_kv", None) kwargs.pop("response_format", None) max_new_tokens = kwargs.pop("max_tokens", 512) tp = kwargs.pop("tp", 1) @@ -579,19 +423,9 @@ async def lmdeploy_model_if_cache( if system_prompt: messages.append({"role": "system", "content": system_prompt}) - hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) messages.extend(history_messages) messages.append({"role": "user", "content": prompt}) - # Handle cache - mode = kwargs.pop("mode", "default") - args_hash = compute_args_hash(model, messages) - cached_response, quantized, min_val, max_val = await handle_cache( - hashing_kv, args_hash, prompt, mode - ) - if cached_response is not None: - return cached_response - gen_config = GenerationConfig( skip_special_tokens=skip_special_tokens, max_new_tokens=max_new_tokens, @@ -607,22 +441,6 @@ async def lmdeploy_model_if_cache( session_id=1, ): response += res.response - - # Save to cache - await save_to_cache( - hashing_kv, - CacheData( - args_hash=args_hash, - content=response, - model=model, - prompt=prompt, - quantized=quantized, - min_val=min_val, - max_val=max_val, - mode=mode, - ), - ) - return response @@ -1052,75 +870,6 @@ class MultiModel: return await next_model.gen_func(**args) -async def handle_cache(hashing_kv, args_hash, prompt, mode="default"): - """Generic cache handling function""" - if hashing_kv is None: - return None, None, None, None - - # Get embedding cache configuration - 
embedding_cache_config = hashing_kv.global_config.get( - "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95} - ) - is_embedding_cache_enabled = embedding_cache_config["enabled"] - - quantized = min_val = max_val = None - if is_embedding_cache_enabled: - # Use embedding cache - embedding_model_func = hashing_kv.global_config["embedding_func"]["func"] - current_embedding = await embedding_model_func([prompt]) - quantized, min_val, max_val = quantize_embedding(current_embedding[0]) - best_cached_response = await get_best_cached_response( - hashing_kv, - current_embedding[0], - similarity_threshold=embedding_cache_config["similarity_threshold"], - mode=mode, - ) - if best_cached_response is not None: - return best_cached_response, None, None, None - else: - # Use regular cache - mode_cache = await hashing_kv.get_by_id(mode) or {} - if args_hash in mode_cache: - return mode_cache[args_hash]["return"], None, None, None - - return None, quantized, min_val, max_val - - -@dataclass -class CacheData: - args_hash: str - content: str - model: str - prompt: str - quantized: Optional[np.ndarray] = None - min_val: Optional[float] = None - max_val: Optional[float] = None - mode: str = "default" - - -async def save_to_cache(hashing_kv, cache_data: CacheData): - if hashing_kv is None: - return - - mode_cache = await hashing_kv.get_by_id(cache_data.mode) or {} - - mode_cache[cache_data.args_hash] = { - "return": cache_data.content, - "model": cache_data.model, - "embedding": cache_data.quantized.tobytes().hex() - if cache_data.quantized is not None - else None, - "embedding_shape": cache_data.quantized.shape - if cache_data.quantized is not None - else None, - "embedding_min": cache_data.min_val, - "embedding_max": cache_data.max_val, - "original_prompt": cache_data.prompt, - } - - await hashing_kv.upsert({cache_data.mode: mode_cache}) - - if __name__ == "__main__": import asyncio diff --git a/lightrag/operate.py b/lightrag/operate.py index acbdf072..feaec27d 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -17,6 +17,10 @@ from .utils import ( split_string_by_multi_markers, truncate_list_by_token_size, process_combine_contexts, + compute_args_hash, + handle_cache, + save_to_cache, + CacheData, ) from .base import ( BaseGraphStorage, @@ -452,8 +456,17 @@ async def kg_query( text_chunks_db: BaseKVStorage[TextChunkSchema], query_param: QueryParam, global_config: dict, + hashing_kv: BaseKVStorage = None, ) -> str: - context = None + # Handle cache + use_model_func = global_config["llm_model_func"] + args_hash = compute_args_hash(query_param.mode, query) + cached_response, quantized, min_val, max_val = await handle_cache( + hashing_kv, args_hash, query, query_param.mode + ) + if cached_response is not None: + return cached_response + example_number = global_config["addon_params"].get("example_number", None) if example_number and example_number < len(PROMPTS["keywords_extraction_examples"]): examples = "\n".join( @@ -471,12 +484,9 @@ async def kg_query( return PROMPTS["fail_response"] # LLM generate keywords - use_model_func = global_config["llm_model_func"] kw_prompt_temp = PROMPTS["keywords_extraction"] kw_prompt = kw_prompt_temp.format(query=query, examples=examples, language=language) - result = await use_model_func( - kw_prompt, keyword_extraction=True, mode=query_param.mode - ) + result = await use_model_func(kw_prompt, keyword_extraction=True) logger.info("kw_prompt result:") print(result) try: @@ -537,7 +547,6 @@ async def kg_query( query, system_prompt=sys_prompt, 
stream=query_param.stream, - mode=query_param.mode, ) if isinstance(response, str) and len(response) > len(sys_prompt): response = ( @@ -550,6 +559,20 @@ async def kg_query( .strip() ) + # Save to cache + await save_to_cache( + hashing_kv, + CacheData( + args_hash=args_hash, + content=response, + prompt=query, + quantized=quantized, + min_val=min_val, + max_val=max_val, + mode=query_param.mode, + ), + ) + return response @@ -1013,8 +1036,17 @@ async def naive_query( text_chunks_db: BaseKVStorage[TextChunkSchema], query_param: QueryParam, global_config: dict, + hashing_kv: BaseKVStorage = None, ): + # Handle cache use_model_func = global_config["llm_model_func"] + args_hash = compute_args_hash(query_param.mode, query) + cached_response, quantized, min_val, max_val = await handle_cache( + hashing_kv, args_hash, query, query_param.mode + ) + if cached_response is not None: + return cached_response + results = await chunks_vdb.query(query, top_k=query_param.top_k) if not len(results): return PROMPTS["fail_response"] @@ -1039,7 +1071,6 @@ async def naive_query( response = await use_model_func( query, system_prompt=sys_prompt, - mode=query_param.mode, ) if len(response) > len(sys_prompt): @@ -1054,4 +1085,18 @@ async def naive_query( .strip() ) + # Save to cache + await save_to_cache( + hashing_kv, + CacheData( + args_hash=args_hash, + content=response, + prompt=query, + quantized=quantized, + min_val=min_val, + max_val=max_val, + mode=query_param.mode, + ), + ) + return response diff --git a/lightrag/prompt.py b/lightrag/prompt.py index d758397b..863d38dc 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -261,3 +261,22 @@ Do not include information where the supporting evidence for it is not provided. Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. """ + +PROMPTS[ + "similarity_check" +] = """Please analyze the similarity between these two questions: + +Question 1: {original_prompt} +Question 2: {cached_prompt} + +Please evaluate: +1. Whether these two questions are semantically similar +2. Whether the answer to Question 2 can be used to answer Question 1 + +Please provide a similarity score between 0 and 1, where: +0: Completely unrelated or answer cannot be reused +1: Identical and answer can be directly reused +0.5: Partially related and answer needs modification to be used + +Return only a number between 0-1, without any additional content. 
+""" diff --git a/lightrag/utils.py b/lightrag/utils.py index 0fcb437f..32d5c87f 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -15,6 +15,8 @@ import xml.etree.ElementTree as ET import numpy as np import tiktoken +from lightrag.prompt import PROMPTS + ENCODER = None logger = logging.getLogger("lightrag") @@ -314,6 +316,9 @@ async def get_best_cached_response( current_embedding, similarity_threshold=0.95, mode="default", + use_llm_check=False, + llm_func=None, + original_prompt=None, ) -> Union[str, None]: # Get mode-specific cache mode_cache = await hashing_kv.get_by_id(mode) @@ -348,6 +353,37 @@ async def get_best_cached_response( best_cache_id = cache_id if best_similarity > similarity_threshold: + # If LLM check is enabled and all required parameters are provided + if use_llm_check and llm_func and original_prompt and best_prompt: + compare_prompt = PROMPTS["similarity_check"].format( + original_prompt=original_prompt, cached_prompt=best_prompt + ) + + try: + llm_result = await llm_func(compare_prompt) + llm_result = llm_result.strip() + llm_similarity = float(llm_result) + + # Replace vector similarity with LLM similarity score + best_similarity = llm_similarity + if best_similarity < similarity_threshold: + log_data = { + "event": "llm_check_cache_rejected", + "original_question": original_prompt[:100] + "..." + if len(original_prompt) > 100 + else original_prompt, + "cached_question": best_prompt[:100] + "..." + if len(best_prompt) > 100 + else best_prompt, + "similarity_score": round(best_similarity, 4), + "threshold": similarity_threshold, + } + logger.info(json.dumps(log_data, ensure_ascii=False)) + return None + except Exception as e: # Catch all possible exceptions + logger.warning(f"LLM similarity check failed: {e}") + return None # Return None directly when LLM check fails + prompt_display = ( best_prompt[:50] + "..." 
if len(best_prompt) > 50 else best_prompt ) @@ -391,21 +427,33 @@ def dequantize_embedding( scale = (max_val - min_val) / (2**bits - 1) return (quantized * scale + min_val).astype(np.float32) + async def handle_cache(hashing_kv, args_hash, prompt, mode="default"): """Generic cache handling function""" if hashing_kv is None: return None, None, None, None + # For naive mode, only use simple cache matching + if mode == "naive": + mode_cache = await hashing_kv.get_by_id(mode) or {} + if args_hash in mode_cache: + return mode_cache[args_hash]["return"], None, None, None + return None, None, None, None + # Get embedding cache configuration embedding_cache_config = hashing_kv.global_config.get( - "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95} + "embedding_cache_config", + {"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}, ) is_embedding_cache_enabled = embedding_cache_config["enabled"] + use_llm_check = embedding_cache_config.get("use_llm_check", False) quantized = min_val = max_val = None if is_embedding_cache_enabled: # Use embedding cache embedding_model_func = hashing_kv.global_config["embedding_func"]["func"] + llm_model_func = hashing_kv.global_config.get("llm_model_func") + current_embedding = await embedding_model_func([prompt]) quantized, min_val, max_val = quantize_embedding(current_embedding[0]) best_cached_response = await get_best_cached_response( @@ -413,6 +461,9 @@ async def handle_cache(hashing_kv, args_hash, prompt, mode="default"): current_embedding[0], similarity_threshold=embedding_cache_config["similarity_threshold"], mode=mode, + use_llm_check=use_llm_check, + llm_func=llm_model_func if use_llm_check else None, + original_prompt=prompt if use_llm_check else None, ) if best_cached_response is not None: return best_cached_response, None, None, None @@ -429,7 +480,6 @@ async def handle_cache(hashing_kv, args_hash, prompt, mode="default"): class CacheData: args_hash: str content: str - model: str prompt: str quantized: Optional[np.ndarray] = None min_val: Optional[float] = None @@ -445,7 +495,6 @@ async def save_to_cache(hashing_kv, cache_data: CacheData): mode_cache[cache_data.args_hash] = { "return": cache_data.content, - "model": cache_data.model, "embedding": cache_data.quantized.tobytes().hex() if cache_data.quantized is not None else None,
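
For reference, a minimal usage sketch of the cache options introduced above, assuming the standard LightRAG constructor documented in the README; the working directory and query text are placeholders, and the default llm_model_func / embedding_func are assumed to be configured as usual.

# Minimal usage sketch of the new embedding_cache_config options.
from lightrag import LightRAG, QueryParam

rag = LightRAG(
    working_dir="./rag_storage",  # hypothetical storage directory
    embedding_cache_config={
        "enabled": True,  # look up cached answers by embedding similarity
        "similarity_threshold": 0.95,  # similarity cutoff for reusing a cached answer
        "use_llm_check": True,  # new: LLM double-checks that the matched question really fits
    },
)

# local/global/hybrid queries go through the embedding cache (with the optional
# LLM check); naive mode only uses exact hash matching of the query.
print(rag.query("What are the top themes in this corpus?", param=QueryParam(mode="hybrid")))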