Add support for Ollama streaming output and integrate Open-WebUI as the chat UI demo
@@ -19,6 +19,7 @@ class QueryParam:
     only_need_context: bool = False
     only_need_prompt: bool = False
     response_type: str = "Multiple Paragraphs"
+    stream: bool = False
     # Number of top-k items to retrieve; corresponds to entities in "local" mode and relationships in "global" mode.
     top_k: int = 60
     # Number of document chunks to retrieve.
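Usage note (not part of the diff): a minimal sketch of how a caller could opt into streaming through the new QueryParam.stream flag. It assumes an already-configured LightRAG instance (LLM/embedding setup omitted); the question strings and query mode are placeholders.

# Hedged sketch: exercising the new QueryParam.stream flag against an
# existing LightRAG instance assumed to be configured elsewhere with
# ollama_model_complete; only the query side is shown.
from lightrag import LightRAG, QueryParam


async def demo(rag: LightRAG) -> None:
    # stream=False (the default): aquery returns a single string.
    answer = await rag.aquery(
        "What is this corpus about?", param=QueryParam(mode="hybrid")
    )
    print(answer)

    # stream=True: the Ollama model function may hand back an async iterator.
    resp = await rag.aquery(
        "Summarize the corpus.", param=QueryParam(mode="hybrid", stream=True)
    )
    if isinstance(resp, str):
        print(resp)
    else:
        async for chunk in resp:
            print(chunk, end="", flush=True)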
@@ -27,7 +27,7 @@ from tenacity import (
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 from pydantic import BaseModel, Field
-from typing import List, Dict, Callable, Any
+from typing import List, Dict, Callable, Any, Union
 from .base import BaseKVStorage
 from .utils import (
     compute_args_hash,
@@ -37,6 +37,13 @@ from .utils import (
     get_best_cached_response,
 )
 
+import sys
+
+if sys.version_info < (3, 9):
+    from typing import AsyncIterator
+else:
+    from collections.abc import AsyncIterator
+
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
@@ -454,7 +461,8 @@ async def ollama_model_if_cache(
     system_prompt=None,
     history_messages=[],
     **kwargs,
-) -> str:
+) -> Union[str, AsyncIterator[str]]:
+    stream = True if kwargs.get("stream") else False
     kwargs.pop("max_tokens", None)
     # kwargs.pop("response_format", None) # allow json
     host = kwargs.pop("host", None)
@@ -494,28 +502,39 @@ async def ollama_model_if_cache(
             return if_cache_return["return"]
 
     response = await ollama_client.chat(model=model, messages=messages, **kwargs)
+    if stream:
+        """ cannot cache stream response """
 
-    result = response["message"]["content"]
-    if hashing_kv is not None:
-        await hashing_kv.upsert(
-            {
-                args_hash: {
-                    "return": result,
-                    "model": model,
-                    "embedding": quantized.tobytes().hex()
-                    if is_embedding_cache_enabled
-                    else None,
-                    "embedding_shape": quantized.shape
-                    if is_embedding_cache_enabled
-                    else None,
-                    "embedding_min": min_val if is_embedding_cache_enabled else None,
-                    "embedding_max": max_val if is_embedding_cache_enabled else None,
-                    "original_prompt": prompt,
+        async def inner():
+            async for chunk in response:
+                yield chunk["message"]["content"]
+
+        return inner()
+    else:
+        result = response["message"]["content"]
+        if hashing_kv is not None:
+            await hashing_kv.upsert(
+                {
+                    args_hash: {
+                        "return": result,
+                        "model": model,
+                        "embedding": quantized.tobytes().hex()
+                        if is_embedding_cache_enabled
+                        else None,
+                        "embedding_shape": quantized.shape
+                        if is_embedding_cache_enabled
+                        else None,
+                        "embedding_min": min_val
+                        if is_embedding_cache_enabled
+                        else None,
+                        "embedding_max": max_val
+                        if is_embedding_cache_enabled
+                        else None,
+                        "original_prompt": prompt,
+                    }
                 }
-            }
-        )
-    return result
+            )
+        return result
 
 
 @lru_cache(maxsize=1)
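Usage note (not part of the diff): the stream branch above wraps Ollama's streaming chat response in an async generator instead of caching it. A self-contained sketch of the same pattern outside LightRAG, assuming the ollama Python package and a locally running Ollama server; the wrapper name and model tag are placeholders.

# Hedged sketch of the stream-vs-cache split shown in the hunk above.
import asyncio

import ollama


async def ollama_chat(model: str, prompt: str, stream: bool = False):
    client = ollama.AsyncClient()
    messages = [{"role": "user", "content": prompt}]
    response = await client.chat(model=model, messages=messages, stream=stream)
    if stream:
        # A streaming response is an async iterator of chunks and cannot be
        # cached up front, mirroring the `if stream:` branch in the diff.
        async def inner():
            async for chunk in response:
                yield chunk["message"]["content"]

        return inner()
    # A non-streaming response arrives as a single message dict.
    return response["message"]["content"]


async def main() -> None:
    result = await ollama_chat("qwen2.5:7b-instruct", "Say hello.", stream=True)
    if isinstance(result, str):
        print(result)
    else:
        async for piece in result:
            print(piece, end="", flush=True)


asyncio.run(main())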
@@ -785,7 +804,7 @@ async def hf_model_complete(
 
 async def ollama_model_complete(
     prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
-) -> str:
+) -> Union[str, AsyncIterator[str]]:
     keyword_extraction = kwargs.pop("keyword_extraction", None)
     if keyword_extraction:
         kwargs["format"] = "json"
@@ -534,8 +534,9 @@ async def kg_query(
     response = await use_model_func(
         query,
         system_prompt=sys_prompt,
+        stream=query_param.stream,
     )
-    if len(response) > len(sys_prompt):
+    if isinstance(response, str) and len(response) > len(sys_prompt):
         response = (
             response.replace(sys_prompt, "")
             .replace("user", "")
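Usage note (not part of the diff): the isinstance guard added above is needed because a streaming call returns an AsyncIterator rather than a string, so the prompt-echo cleanup only applies to fully materialized answers. A hedged sketch of that narrowing as a standalone helper; the helper name is illustrative.

# Hedged sketch: only post-process string responses; pass streams through.
import sys
from typing import Union

if sys.version_info < (3, 9):
    from typing import AsyncIterator
else:
    from collections.abc import AsyncIterator


def strip_prompt_echo(
    response: Union[str, AsyncIterator[str]], sys_prompt: str
) -> Union[str, AsyncIterator[str]]:
    # Only a fully materialized string can contain the echoed system prompt;
    # an async iterator is returned unchanged for the caller to consume.
    if isinstance(response, str) and len(response) > len(sys_prompt):
        return response.replace(sys_prompt, "").replace("user", "").strip()
    return response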