修复 args_hash 仅在使用常规缓存时才计算，导致使用 embedding 缓存时未计算的 bug
This commit is contained in:
@@ -596,7 +596,11 @@ if __name__ == "__main__":
|
||||
| **enable\_llm\_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` |
|
||||
| **addon\_params** | `dict` | Additional parameters, e.g., `{"example_number": 1, "language": "Simplified Chinese"}`: sets example limit and output language | `example_number: all examples, language: English` |
|
||||
| **convert\_response\_to\_json\_func** | `callable` | Not used | `convert_response_to_json` |
|
||||
| **embedding\_cache\_config** | `dict` | Configuration for embedding cache. Includes `enabled` (bool) to toggle cache and `similarity_threshold` (float) for cache retrieval | `{"enabled": False, "similarity_threshold": 0.95}` |
|
||||
| **embedding\_cache\_config** | `dict` | Configuration for question-answer caching. Contains two parameters:
|
||||
- `enabled`: Boolean value to enable/disable caching functionality. When enabled, questions and answers will be cached.
|
||||
- `similarity_threshold`: Float value in the range 0-1. When a new question's similarity to a cached question exceeds this threshold, the cached answer is returned directly without calling the LLM.
|
||||
|
||||
Default: `{"enabled": False, "similarity_threshold": 0.95}` | `{"enabled": False, "similarity_threshold": 0.95}` |
|
||||
|
||||
## API Server Implementation
|
||||
|
||||
|
@@ -66,7 +66,11 @@ async def openai_complete_if_cache(
|
||||
messages.append({"role": "system", "content": system_prompt})
|
||||
messages.extend(history_messages)
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
if hashing_kv is not None:
|
||||
# Calculate args_hash only when using cache
|
||||
args_hash = compute_args_hash(model, messages)
|
||||
|
||||
# Get embedding cache configuration
|
||||
embedding_cache_config = hashing_kv.global_config.get(
|
||||
"embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
|
||||
@@ -86,7 +90,6 @@ async def openai_complete_if_cache(
|
||||
return best_cached_response
|
||||
else:
|
||||
# Use regular cache
|
||||
args_hash = compute_args_hash(model, messages)
|
||||
if_cache_return = await hashing_kv.get_by_id(args_hash)
|
||||
if if_cache_return is not None:
|
||||
return if_cache_return["return"]
|
||||
@@ -159,7 +162,12 @@ async def azure_openai_complete_if_cache(
|
||||
messages.extend(history_messages)
|
||||
if prompt is not None:
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
|
||||
if hashing_kv is not None:
|
||||
# Calculate args_hash only when using cache
|
||||
args_hash = compute_args_hash(model, messages)
|
||||
|
||||
# Get embedding cache configuration
|
||||
embedding_cache_config = hashing_kv.global_config.get(
|
||||
"embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
|
||||
@@ -178,7 +186,7 @@ async def azure_openai_complete_if_cache(
|
||||
if best_cached_response is not None:
|
||||
return best_cached_response
|
||||
else:
|
||||
args_hash = compute_args_hash(model, messages)
|
||||
# Use regular cache
|
||||
if_cache_return = await hashing_kv.get_by_id(args_hash)
|
||||
if if_cache_return is not None:
|
||||
return if_cache_return["return"]
|
||||
@@ -271,6 +279,9 @@ async def bedrock_complete_if_cache(
|
||||
|
||||
hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
|
||||
if hashing_kv is not None:
|
||||
# Calculate args_hash only when using cache
|
||||
args_hash = compute_args_hash(model, messages)
|
||||
|
||||
# Get embedding cache configuration
|
||||
embedding_cache_config = hashing_kv.global_config.get(
|
||||
"embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
|
||||
@@ -290,7 +301,6 @@ async def bedrock_complete_if_cache(
|
||||
return best_cached_response
|
||||
else:
|
||||
# Use regular cache
|
||||
args_hash = compute_args_hash(model, messages)
|
||||
if_cache_return = await hashing_kv.get_by_id(args_hash)
|
||||
if if_cache_return is not None:
|
||||
return if_cache_return["return"]
|
||||
@@ -343,6 +353,11 @@ def initialize_hf_model(model_name):
|
||||
return hf_model, hf_tokenizer
|
||||
|
||||
|
||||
@retry(
|
||||
stop=stop_after_attempt(3),
|
||||
wait=wait_exponential(multiplier=1, min=4, max=10),
|
||||
retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)),
|
||||
)
|
||||
async def hf_model_if_cache(
|
||||
model,
|
||||
prompt,
|
||||
@@ -359,7 +374,11 @@ async def hf_model_if_cache(
|
||||
messages.extend(history_messages)
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
|
||||
if hashing_kv is not None:
|
||||
# Calculate args_hash only when using cache
|
||||
args_hash = compute_args_hash(model, messages)
|
||||
|
||||
# Get embedding cache configuration
|
||||
embedding_cache_config = hashing_kv.global_config.get(
|
||||
"embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
|
||||
@@ -379,7 +398,6 @@ async def hf_model_if_cache(
|
||||
return best_cached_response
|
||||
else:
|
||||
# Use regular cache
|
||||
args_hash = compute_args_hash(model, messages)
|
||||
if_cache_return = await hashing_kv.get_by_id(args_hash)
|
||||
if if_cache_return is not None:
|
||||
return if_cache_return["return"]
|
||||
@@ -448,6 +466,11 @@ async def hf_model_if_cache(
|
||||
return response_text
|
||||
|
||||
|
||||
@retry(
|
||||
stop=stop_after_attempt(3),
|
||||
wait=wait_exponential(multiplier=1, min=4, max=10),
|
||||
retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)),
|
||||
)
|
||||
async def ollama_model_if_cache(
|
||||
model,
|
||||
prompt,
|
||||
@@ -468,7 +491,12 @@ async def ollama_model_if_cache(
|
||||
hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
|
||||
messages.extend(history_messages)
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
|
||||
if hashing_kv is not None:
|
||||
# Calculate args_hash only when using cache
|
||||
args_hash = compute_args_hash(model, messages)
|
||||
|
||||
# Get embedding cache configuration
|
||||
embedding_cache_config = hashing_kv.global_config.get(
|
||||
"embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
|
||||
@@ -488,7 +516,6 @@ async def ollama_model_if_cache(
|
||||
return best_cached_response
|
||||
else:
|
||||
# Use regular cache
|
||||
args_hash = compute_args_hash(model, messages)
|
||||
if_cache_return = await hashing_kv.get_by_id(args_hash)
|
||||
if if_cache_return is not None:
|
||||
return if_cache_return["return"]
|
||||
@@ -542,6 +569,11 @@ def initialize_lmdeploy_pipeline(
|
||||
return lmdeploy_pipe
|
||||
|
||||
|
||||
@retry(
|
||||
stop=stop_after_attempt(3),
|
||||
wait=wait_exponential(multiplier=1, min=4, max=10),
|
||||
retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)),
|
||||
)
|
||||
async def lmdeploy_model_if_cache(
|
||||
model,
|
||||
prompt,
|
||||
@@ -620,7 +652,12 @@ async def lmdeploy_model_if_cache(
|
||||
hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
|
||||
messages.extend(history_messages)
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
|
||||
if hashing_kv is not None:
|
||||
# Calculate args_hash only when using cache
|
||||
args_hash = compute_args_hash(model, messages)
|
||||
|
||||
# Get embedding cache configuration
|
||||
embedding_cache_config = hashing_kv.global_config.get(
|
||||
"embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
|
||||
@@ -640,7 +677,6 @@ async def lmdeploy_model_if_cache(
|
||||
return best_cached_response
|
||||
else:
|
||||
# Use regular cache
|
||||
args_hash = compute_args_hash(model, messages)
|
||||
if_cache_return = await hashing_kv.get_by_id(args_hash)
|
||||
if if_cache_return is not None:
|
||||
return if_cache_return["return"]
|
||||
|
Reference in New Issue
Block a user