Merge pull request #650 from danielaskdd/Add-history-support-for-ollama-api

Add history support for ollama api
zrguo
2025-01-27 06:34:10 +08:00
committed by GitHub
6 changed files with 122 additions and 183 deletions

View File

@@ -43,6 +43,9 @@ MAX_ASYNC=4
MAX_TOKENS=32768
EMBEDDING_DIM=1024
MAX_EMBED_TOKENS=8192
+#HISTORY_TURNS=3
+#CHUNK_SIZE=1200
+#CHUNK_OVERLAP_SIZE=100
# Security (empty for no key)
LIGHTRAG_API_KEY=your-secure-api-key-here

View File

@@ -1,140 +0,0 @@
from datetime import datetime, timezone
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import inspect
import json
from pydantic import BaseModel
from typing import Optional
import os
import logging
from lightrag import LightRAG, QueryParam
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
from lightrag.utils import EmbeddingFunc
import nest_asyncio

WORKING_DIR = "./dickens"

logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)

if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)

rag = LightRAG(
    working_dir=WORKING_DIR,
    llm_model_func=ollama_model_complete,
    llm_model_name="qwen2.5:latest",
    llm_model_max_async=4,
    llm_model_max_token_size=32768,
    llm_model_kwargs={"host": "http://localhost:11434", "options": {"num_ctx": 32768}},
    embedding_func=EmbeddingFunc(
        embedding_dim=1024,
        max_token_size=8192,
        func=lambda texts: ollama_embed(
            texts=texts, embed_model="bge-m3:latest", host="http://127.0.0.1:11434"
        ),
    ),
)

with open("./book.txt", "r", encoding="utf-8") as f:
    rag.insert(f.read())

# Apply nest_asyncio to solve event loop issues
nest_asyncio.apply()

app = FastAPI(title="LightRAG", description="LightRAG API open-webui")

# Data models
MODEL_NAME = "LightRAG:latest"


class Message(BaseModel):
    role: Optional[str] = None
    content: str


class OpenWebUIRequest(BaseModel):
    stream: Optional[bool] = None
    model: Optional[str] = None
    messages: list[Message]


# API routes
@app.get("/")
async def index():
    return "Set Ollama link to http://ip:port/ollama in Open-WebUI Settings"


@app.get("/ollama/api/version")
async def ollama_version():
    return {"version": "0.4.7"}


@app.get("/ollama/api/tags")
async def ollama_tags():
    return {
        "models": [
            {
                "name": MODEL_NAME,
                "model": MODEL_NAME,
                "modified_at": "2024-11-12T20:22:37.561463923+08:00",
                "size": 4683087332,
                "digest": "845dbda0ea48ed749caafd9e6037047aa19acfcfd82e704d7ca97d631a0b697e",
                "details": {
                    "parent_model": "",
                    "format": "gguf",
                    "family": "qwen2",
                    "families": ["qwen2"],
                    "parameter_size": "7.6B",
                    "quantization_level": "Q4_K_M",
                },
            }
        ]
    }


@app.post("/ollama/api/chat")
async def ollama_chat(request: OpenWebUIRequest):
    resp = rag.query(
        request.messages[-1].content, param=QueryParam(mode="hybrid", stream=True)
    )
    if inspect.isasyncgen(resp):

        async def ollama_resp(chunks):
            async for chunk in chunks:
                yield (
                    json.dumps(
                        {
                            "model": MODEL_NAME,
                            "created_at": datetime.now(timezone.utc).strftime(
                                "%Y-%m-%dT%H:%M:%S.%fZ"
                            ),
                            "message": {
                                "role": "assistant",
                                "content": chunk,
                            },
                            "done": False,
                        },
                        ensure_ascii=False,
                    ).encode("utf-8")
                    + b"\n"
                )  # the b"\n" is important

        return StreamingResponse(ollama_resp(resp), media_type="application/json")
    else:
        return resp


@app.get("/health")
async def health_check():
    return {"status": "healthy"}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8020)

View File

@@ -94,8 +94,6 @@ For example, chat message "/mix 唐僧有几个徒弟" will trigger a mix mode q
After starting the lightrag-server, you can add an Ollama-type connection in the Open WebUI admin panel. A model named lightrag:latest will then appear in Open WebUI's model management interface, and users can send queries to LightRAG through the chat interface.
-To prevent Open WebUI from using LightRAG when generating conversation titles, go to Admin Panel > Interface > Set Task Model and change both Local Models and External Models to any option except "Current Model".

## Configuration

LightRAG can be configured using either command-line arguments or environment variables. When both are provided, command-line arguments take precedence over environment variables.
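As an aside, the prefix-based mode selection described above can also be exercised directly against the server's Ollama-compatible /api/chat route (shown further down in this diff). The sketch below is illustrative only: it assumes the default host and port from the test configuration (localhost:9621), no API key configured, the requests library installed, and hypothetical message contents.

import requests  # assumption: not part of this PR, used here only for illustration

payload = {
    "model": "lightrag:latest",
    "messages": [
        {"role": "user", "content": "Who are the main characters of Journey to the West?"},
        {"role": "assistant", "content": "Tang Sanzang, Sun Wukong, Zhu Bajie and Sha Wujing."},
        # The "/mix" prefix selects the mix query mode, as described above
        {"role": "user", "content": "/mix Which of them appears first in the story?"},
    ],
    "stream": False,
}
response = requests.post("http://localhost:9621/api/chat", json=payload, timeout=120)
print(response.json())

With this PR, the two earlier messages are forwarded to LightRAG as conversation history rather than being discarded.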

View File

@@ -17,6 +17,7 @@ import shutil
import aiofiles
from ascii_colors import trace_exception, ASCIIColors
import os
+import sys
import configparser
from fastapi import Depends, Security
@@ -200,8 +201,14 @@ def display_splash_screen(args: argparse.Namespace) -> None:
ASCIIColors.yellow(f"{args.max_async}") ASCIIColors.yellow(f"{args.max_async}")
ASCIIColors.white(" ├─ Max Tokens: ", end="") ASCIIColors.white(" ├─ Max Tokens: ", end="")
ASCIIColors.yellow(f"{args.max_tokens}") ASCIIColors.yellow(f"{args.max_tokens}")
ASCIIColors.white(" ─ Max Embed Tokens: ", end="") ASCIIColors.white(" ─ Max Embed Tokens: ", end="")
ASCIIColors.yellow(f"{args.max_embed_tokens}") ASCIIColors.yellow(f"{args.max_embed_tokens}")
ASCIIColors.white(" ├─ Chunk Size: ", end="")
ASCIIColors.yellow(f"{args.chunk_size}")
ASCIIColors.white(" ├─ Chunk Overlap Size: ", end="")
ASCIIColors.yellow(f"{args.chunk_overlap_size}")
ASCIIColors.white(" └─ History Turns: ", end="")
ASCIIColors.yellow(f"{args.history_turns}")
# System Configuration # System Configuration
ASCIIColors.magenta("\n🛠️ System Configuration:") ASCIIColors.magenta("\n🛠️ System Configuration:")
@@ -281,6 +288,9 @@ def display_splash_screen(args: argparse.Namespace) -> None:
    ASCIIColors.green("Server is ready to accept connections! 🚀\n")

+    # Ensure splash output flush to system log
+    sys.stdout.flush()


def parse_args() -> argparse.Namespace:
    """
@@ -294,7 +304,7 @@ def parse_args() -> argparse.Namespace:
description="LightRAG FastAPI Server with separate working and input directories" description="LightRAG FastAPI Server with separate working and input directories"
) )
# Bindings (with env var support) # Bindings configuration
parser.add_argument( parser.add_argument(
"--llm-binding", "--llm-binding",
default=get_env_value("LLM_BINDING", "ollama"), default=get_env_value("LLM_BINDING", "ollama"),
@@ -306,9 +316,6 @@ def parse_args() -> argparse.Namespace:
help="Embedding binding to be used. Supported: lollms, ollama, openai (default: from env or ollama)", help="Embedding binding to be used. Supported: lollms, ollama, openai (default: from env or ollama)",
) )
# Parse temporary args for host defaults
temp_args, _ = parser.parse_known_args()
# Server configuration # Server configuration
parser.add_argument( parser.add_argument(
"--host", "--host",
@@ -335,13 +342,13 @@ def parse_args() -> argparse.Namespace:
    )

    # LLM Model configuration
-    default_llm_host = get_env_value(
-        "LLM_BINDING_HOST", get_default_host(temp_args.llm_binding)
-    )
    parser.add_argument(
        "--llm-binding-host",
-        default=default_llm_host,
-        help=f"llm server host URL (default: from env or {default_llm_host})",
+        default=get_env_value("LLM_BINDING_HOST", None),
+        help="LLM server host URL. If not provided, defaults based on llm-binding:\n"
+        + "- ollama: http://localhost:11434\n"
+        + "- lollms: http://localhost:9600\n"
+        + "- openai: https://api.openai.com/v1",
    )

    default_llm_api_key = get_env_value("LLM_BINDING_API_KEY", None)
@@ -359,13 +366,13 @@ def parse_args() -> argparse.Namespace:
    )

    # Embedding model configuration
-    default_embedding_host = get_env_value(
-        "EMBEDDING_BINDING_HOST", get_default_host(temp_args.embedding_binding)
-    )
    parser.add_argument(
        "--embedding-binding-host",
-        default=default_embedding_host,
-        help=f"embedding server host URL (default: from env or {default_embedding_host})",
+        default=get_env_value("EMBEDDING_BINDING_HOST", None),
+        help="Embedding server host URL. If not provided, defaults based on embedding-binding:\n"
+        + "- ollama: http://localhost:11434\n"
+        + "- lollms: http://localhost:9600\n"
+        + "- openai: https://api.openai.com/v1",
    )

    default_embedding_api_key = get_env_value("EMBEDDING_BINDING_API_KEY", "")
@@ -383,14 +390,14 @@ def parse_args() -> argparse.Namespace:
    parser.add_argument(
        "--chunk_size",
-        default=1200,
-        help="chunk token size default 1200",
+        default=get_env_value("CHUNK_SIZE", 1200),
+        help="chunk chunk size default 1200",
    )

    parser.add_argument(
        "--chunk_overlap_size",
-        default=100,
-        help="chunk token size default 1200",
+        default=get_env_value("CHUNK_OVERLAP_SIZE", 100),
+        help="chunk overlap size default 100",
    )

    def timeout_type(value):
@@ -470,6 +477,13 @@ def parse_args() -> argparse.Namespace:
help="Enable automatic scanning when the program starts", help="Enable automatic scanning when the program starts",
) )
parser.add_argument(
"--history-turns",
type=int,
default=get_env_value("HISTORY_TURNS", 3, int),
help="Number of conversation history turns to include (default: from env or 3)",
)
args = parser.parse_args() args = parser.parse_args()
return args return args
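The --history-turns default above relies on get_env_value accepting an optional type converter (it is called as get_env_value("HISTORY_TURNS", 3, int)). That helper is defined elsewhere in the API package and is not part of this diff; the sketch below is only an assumption about its behavior, consistent with the call sites seen here.

import os

def get_env_value(name, default, value_type=str):
    # Hypothetical re-implementation for illustration; the real helper may differ.
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return value_type(raw)
    except (TypeError, ValueError):
        return default

Under that assumption, HISTORY_TURNS=5 in the .env file shown in the first hunk yields the integer 5, while leaving the variable unset (or commented out) keeps the built-in default of 3.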
@@ -634,8 +648,7 @@ def get_api_key_dependency(api_key: Optional[str]):
def create_app(args):
-    # Verify that bindings arer correctly setup
+    # Verify that bindings are correctly setup
    if args.llm_binding not in [
        "lollms",
        "ollama",
@@ -648,6 +661,13 @@ def create_app(args):
    if args.embedding_binding not in ["lollms", "ollama", "openai", "azure_openai"]:
        raise Exception("embedding binding not supported")

+    # Set default hosts if not provided
+    if args.llm_binding_host is None:
+        args.llm_binding_host = get_default_host(args.llm_binding)
+
+    if args.embedding_binding_host is None:
+        args.embedding_binding_host = get_default_host(args.embedding_binding)

    # Add SSL validation
    if args.ssl:
        if not args.ssl_certfile or not args.ssl_keyfile:
@@ -1442,7 +1462,10 @@ def create_app(args):
@app.post("/api/generate") @app.post("/api/generate")
async def generate(raw_request: Request, request: OllamaGenerateRequest): async def generate(raw_request: Request, request: OllamaGenerateRequest):
"""Handle generate completion requests""" """Handle generate completion requests
For compatiblity purpuse, the request is not processed by LightRAG,
and will be handled by underlying LLM model.
"""
try: try:
query = request.prompt query = request.prompt
start_time = time.time_ns() start_time = time.time_ns()
@@ -1581,15 +1604,22 @@ def create_app(args):
@app.post("/api/chat") @app.post("/api/chat")
async def chat(raw_request: Request, request: OllamaChatRequest): async def chat(raw_request: Request, request: OllamaChatRequest):
"""Handle chat completion requests""" """Process chat completion requests.
Routes user queries through LightRAG by selecting query mode based on prefix indicators.
Detects and forwards OpenWebUI session-related requests (for meta data generation task) directly to LLM.
"""
try: try:
# Get all messages # Get all messages
messages = request.messages messages = request.messages
if not messages: if not messages:
raise HTTPException(status_code=400, detail="No messages provided") raise HTTPException(status_code=400, detail="No messages provided")
# Get the last message as query # Get the last message as query and previous messages as history
query = messages[-1].content query = messages[-1].content
# Convert OllamaMessage objects to dictionaries
conversation_history = [
{"role": msg.role, "content": msg.content} for msg in messages[:-1]
]
# Check for query prefix # Check for query prefix
cleaned_query, mode = parse_query_mode(query) cleaned_query, mode = parse_query_mode(query)
@@ -1597,9 +1627,17 @@ def create_app(args):
            start_time = time.time_ns()
            prompt_tokens = estimate_tokens(cleaned_query)

-            query_param = QueryParam(
-                mode=mode, stream=request.stream, only_need_context=False
-            )
+            param_dict = {
+                "mode": mode,
+                "stream": request.stream,
+                "only_need_context": False,
+                "conversation_history": conversation_history,
+            }
+
+            if args.history_turns is not None:
+                param_dict["history_turns"] = args.history_turns
+
+            query_param = QueryParam(**param_dict)

            if request.stream:
                from fastapi.responses import StreamingResponse
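To make the new data flow concrete, the following standalone sketch mirrors what the handler above does with an incoming message list. The Msg dataclass is a stand-in for the server's OllamaMessage model, and the mode value is what parse_query_mode would be expected to return for a "/local" prefix; both are assumptions for illustration.

from dataclasses import dataclass

@dataclass
class Msg:  # stand-in for OllamaMessage, for illustration only
    role: str
    content: str

messages = [
    Msg("user", "Hello"),
    Msg("assistant", "Hi, how can I help?"),
    Msg("user", "/local Tell me about the main characters."),
]

# Last message becomes the query; everything before it becomes history
query = messages[-1].content
conversation_history = [
    {"role": m.role, "content": m.content} for m in messages[:-1]
]

param_dict = {
    "mode": "local",                 # assumed output of parse_query_mode(query)
    "stream": False,
    "only_need_context": False,
    "conversation_history": conversation_history,
}
history_turns = 3                    # from --history-turns / HISTORY_TURNS
if history_turns is not None:
    param_dict["history_turns"] = history_turns

print(param_dict)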

View File

@@ -633,11 +633,8 @@ async def kg_query(
    # Process conversation history
    history_context = ""
    if query_param.conversation_history:
-        recent_history = query_param.conversation_history[
-            -query_param.history_window_size :
-        ]
-        history_context = "\n".join(
-            [f"{turn['role']}: {turn['content']}" for turn in recent_history]
-        )
+        history_context = get_conversation_turns(
+            query_param.conversation_history, query_param.history_turns
+        )

    sys_prompt_temp = PROMPTS["rag_response"]
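get_conversation_turns itself is not shown in this diff. A minimal sketch of the assumed behavior, grouping the flat message list into user/assistant turns, keeping only the most recent history_turns of them, and flattening the result into prompt text, could look like this (illustration only; the real implementation in lightrag may differ):

def get_conversation_turns(conversation_history, num_turns):
    # Group messages into turns, closing a turn at each assistant reply
    turns, current = [], []
    for msg in conversation_history:
        current.append(msg)
        if msg.get("role") == "assistant":
            turns.append(current)
            current = []
    if current:  # trailing user message without a reply yet
        turns.append(current)
    if num_turns is not None and num_turns > 0:
        turns = turns[-num_turns:]
    return "\n".join(
        f"{m['role']}: {m['content']}" for turn in turns for m in turn
    )

Compared with the removed code, this replaces a raw slice over individual messages (history_window_size) with turn-aware trimming driven by the new history_turns parameter.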

View File

@@ -104,7 +104,7 @@ DEFAULT_CONFIG = {
"host": "localhost", "host": "localhost",
"port": 9621, "port": 9621,
"model": "lightrag:latest", "model": "lightrag:latest",
"timeout": 30, "timeout": 120,
"max_retries": 3, "max_retries": 3,
"retry_delay": 1, "retry_delay": 1,
}, },
@@ -189,19 +189,32 @@ def get_base_url(endpoint: str = "chat") -> str:
def create_chat_request_data(
-    content: str, stream: bool = False, model: str = None
+    content: str,
+    stream: bool = False,
+    model: str = None,
+    conversation_history: List[Dict[str, str]] = None,
+    history_turns: int = None,
) -> Dict[str, Any]:
    """Create chat request data

    Args:
        content: User message content
        stream: Whether to use streaming response
        model: Model name
+        conversation_history: List of previous conversation messages
+        history_turns: Number of history turns to include

    Returns:
        Dictionary containing complete chat request data
    """
+    messages = conversation_history or []
+    if history_turns is not None and conversation_history:
+        messages = messages[
+            -2 * history_turns :
+        ]  # Each turn has 2 messages (user + assistant)
+    messages.append({"role": "user", "content": content})
+
    return {
        "model": model or CONFIG["server"]["model"],
-        "messages": [{"role": "user", "content": content}],
+        "messages": messages,
        "stream": stream,
    }
@@ -259,11 +272,25 @@ def run_test(func: Callable, name: str) -> None:
def test_non_stream_chat() -> None:
    """Test non-streaming call to /api/chat endpoint"""
    url = get_base_url()
-    data = create_chat_request_data(
-        CONFIG["test_cases"]["basic"]["query"], stream=False
-    )

-    # Send request
+    # Example conversation history
+    conversation_history = [
+        {"role": "user", "content": "你好"},
+        {"role": "assistant", "content": "你好！我是一个AI助手，很高兴为你服务。"},
+        {"role": "user", "content": "西游记里有几个主要人物？"},
+        {
+            "role": "assistant",
+            "content": "西游记的主要人物有唐僧、孙悟空、猪八戒、沙和尚这四位主角。",
+        },
+    ]
+
+    # Send request with conversation history and history turns
+    data = create_chat_request_data(
+        CONFIG["test_cases"]["basic"]["query"],
+        stream=False,
+        conversation_history=conversation_history,
+        history_turns=2,  # Only include last 2 turns
+    )
    response = make_request(url, data)

    # Print response
@@ -297,9 +324,25 @@ def test_stream_chat() -> None:
    The last message will contain performance statistics, with done set to true.
    """
    url = get_base_url()
-    data = create_chat_request_data(CONFIG["test_cases"]["basic"]["query"], stream=True)

-    # Send request and get streaming response
+    # Example conversation history
+    conversation_history = [
+        {"role": "user", "content": "你好"},
+        {"role": "assistant", "content": "你好！我是一个AI助手，很高兴为你服务。"},
+        {"role": "user", "content": "西游记里有几个主要人物？"},
+        {
+            "role": "assistant",
+            "content": "西游记的主要人物有唐僧、孙悟空、猪八戒、沙和尚这四位主角。",
+        },
+    ]
+
+    # Send request with conversation history and history turns
+    data = create_chat_request_data(
+        CONFIG["test_cases"]["basic"]["query"],
+        stream=True,
+        conversation_history=conversation_history,
+        history_turns=2,  # Only include last 2 turns
+    )
    response = make_request(url, data, stream=True)

    if OutputControl.is_verbose():