feat: trimming the model’s reasoning
@@ -66,6 +66,7 @@ from lightrag.exceptions import (
     RateLimitError,
     APITimeoutError,
 )
+from lightrag.utils import extract_reasoning
 import numpy as np
 from typing import Union
 
@@ -85,6 +86,7 @@ async def ollama_model_if_cache(
     **kwargs,
 ) -> Union[str, AsyncIterator[str]]:
     stream = True if kwargs.get("stream") else False
+    reasoning_tag = kwargs.pop("reasoning_tag", None)
     kwargs.pop("max_tokens", None)
     # kwargs.pop("response_format", None) # allow json
     host = kwargs.pop("host", None)
@@ -105,7 +107,7 @@ async def ollama_model_if_cache(
 
     response = await ollama_client.chat(model=model, messages=messages, **kwargs)
     if stream:
-        """cannot cache stream response"""
+        """cannot cache stream response and process reasoning"""
 
         async def inner():
             async for chunk in response:
@@ -113,7 +115,19 @@ async def ollama_model_if_cache(
 
         return inner()
     else:
-        return response["message"]["content"]
+        model_response = response["message"]["content"]
+
+        """
+        If the model also wraps its thoughts in a specific tag,
+        this information is not needed for the final
+        response and can simply be trimmed.
+        """
+
+        return (
+            model_response
+            if reasoning_tag is None
+            else extract_reasoning(model_response, reasoning_tag).response_content
+        )
 
 
 async def ollama_model_complete(
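The extract_reasoning helper is imported from lightrag.utils, so its implementation is not part of this diff. As a minimal sketch of what such a helper might do, assuming the model wraps its reasoning in an XML-style tag such as <think>...</think> and that the result exposes the trimmed text as .response_content as used above (the ReasoningResult name and the reasoning_content field are illustrative assumptions, not the library's API):

import re
from dataclasses import dataclass


@dataclass
class ReasoningResult:
    # Hypothetical container; the diff above relies only on .response_content.
    reasoning_content: str
    response_content: str


def extract_reasoning_sketch(response: str, tag: str) -> ReasoningResult:
    # Split a response into the <tag>...</tag> reasoning block and the
    # user-facing remainder; if no tag is present, return the text unchanged.
    pattern = re.compile(rf"<{re.escape(tag)}>(.*?)</{re.escape(tag)}>", re.DOTALL)
    match = pattern.search(response)
    if match is None:
        return ReasoningResult(reasoning_content="", response_content=response)
    return ReasoningResult(
        reasoning_content=match.group(1).strip(),
        response_content=pattern.sub("", response, count=1).strip(),
    )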
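With this change a caller opts in to trimming by passing reasoning_tag through kwargs; when it is omitted the response is returned unchanged, and in the streaming branch chunks are still yielded as-is, per the updated docstring. An illustrative call, assuming the function's leading positional parameters are the model name and the prompt, which is not visible in these hunks:

# Hypothetical usage; only the reasoning_tag kwarg is introduced by this commit.
answer = await ollama_model_if_cache(
    "deepseek-r1:14b",       # model (assumed positional parameter)
    "Why is the sky blue?",  # prompt (assumed positional parameter)
    reasoning_tag="think",   # strips <think>...</think> from the reply
)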