Use direct embedding_func from hashing_kv (do not bypass maximum async control)
This commit is contained in:
@@ -511,10 +511,10 @@ async def handle_cache(
|
|||||||
quantized = min_val = max_val = None
|
quantized = min_val = max_val = None
|
||||||
if is_embedding_cache_enabled:
|
if is_embedding_cache_enabled:
|
||||||
# Use embedding cache
|
# Use embedding cache
|
||||||
embedding_model_func = hashing_kv.global_config["embedding_func"]["func"]
|
current_embedding = await hashing_kv.embedding_func([prompt])
|
||||||
llm_model_func = hashing_kv.global_config.get("llm_model_func")
|
llm_model_func = (
|
||||||
|
hashing_kv.llm_model_func if hasattr(hashing_kv, "llm_model_func") else None
|
||||||
current_embedding = await embedding_model_func([prompt])
|
)
|
||||||
quantized, min_val, max_val = quantize_embedding(current_embedding[0])
|
quantized, min_val, max_val = quantize_embedding(current_embedding[0])
|
||||||
best_cached_response = await get_best_cached_response(
|
best_cached_response = await get_best_cached_response(
|
||||||
hashing_kv,
|
hashing_kv,
|
||||||
|
Reference in New Issue
Block a user