feat: trimming the model’s reasoning
@@ -66,6 +66,7 @@ from lightrag.exceptions import (
     RateLimitError,
     APITimeoutError,
 )
+from lightrag.utils import extract_reasoning
 import numpy as np
 from typing import Union
 
@@ -85,6 +86,7 @@ async def ollama_model_if_cache(
     **kwargs,
 ) -> Union[str, AsyncIterator[str]]:
     stream = True if kwargs.get("stream") else False
+    reasoning_tag = kwargs.pop("reasoning_tag", None)
     kwargs.pop("max_tokens", None)
     # kwargs.pop("response_format", None) # allow json
     host = kwargs.pop("host", None)
@@ -105,7 +107,7 @@ async def ollama_model_if_cache(
 
     response = await ollama_client.chat(model=model, messages=messages, **kwargs)
     if stream:
-        """cannot cache stream response"""
+        """cannot cache stream response and process reasoning"""
 
         async def inner():
             async for chunk in response:
@@ -113,7 +115,19 @@ async def ollama_model_if_cache(
 
         return inner()
     else:
-        return response["message"]["content"]
+        model_response = response["message"]["content"]
+
+        """
+        If the model also wraps its thoughts in a specific tag,
+        this information is not needed for the final
+        response and can simply be trimmed.
+        """
+
+        return (
+            model_response
+            if reasoning_tag is None
+            else extract_reasoning(model_response, reasoning_tag).response_content
+        )
 
 
 async def ollama_model_complete(
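The extract_reasoning helper is imported from lightrag.utils, so its implementation is not part of this diff. As a minimal sketch of what such a helper might do, assuming the model wraps its reasoning in an XML-style tag such as <think>...</think> and that the result exposes the trimmed text as .response_content as used above (the ReasoningResult name and the reasoning_content field are illustrative assumptions, not the library's API):

import re
from dataclasses import dataclass


@dataclass
class ReasoningResult:
    # Hypothetical container; the diff above relies only on .response_content.
    reasoning_content: str
    response_content: str


def extract_reasoning_sketch(response: str, tag: str) -> ReasoningResult:
    # Split a response into the <tag>...</tag> reasoning block and the
    # user-facing remainder; if no tag is present, return the text unchanged.
    pattern = re.compile(rf"<{re.escape(tag)}>(.*?)</{re.escape(tag)}>", re.DOTALL)
    match = pattern.search(response)
    if match is None:
        return ReasoningResult(reasoning_content="", response_content=response)
    return ReasoningResult(
        reasoning_content=match.group(1).strip(),
        response_content=pattern.sub("", response, count=1).strip(),
    )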
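With this change a caller opts in to trimming by passing reasoning_tag through kwargs; when it is omitted the response is returned unchanged, and in the streaming branch chunks are still yielded as-is, per the updated docstring. An illustrative call, assuming the function's leading positional parameters are the model name and the prompt, which is not visible in these hunks:

# Hypothetical usage; only the reasoning_tag kwarg is introduced by this commit.
answer = await ollama_model_if_cache(
    "deepseek-r1:14b",       # model (assumed positional parameter)
    "Why is the sky blue?",  # prompt (assumed positional parameter)
    reasoning_tag="think",   # strips <think>...</think> from the reply
)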