Merge pull request #156 from alazarchuk/main

Add ability to pass additional parameters to ollama library
zrguo
2024-10-29 10:11:21 +08:00
committed by GitHub
5 changed files with 63 additions and 14 deletions

.gitignore

@@ -5,3 +5,4 @@ book.txt
lightrag-dev/
.idea/
dist/
.venv/

README.md

@@ -163,7 +163,10 @@ rag = LightRAG(
<details>
<summary> Using Ollama Models </summary>
### Overview
If you want to use Ollama models, you first need to pull the model you plan to use and an embedding model, for example `nomic-embed-text`.
Then you only need to set LightRAG as follows:
```python
from lightrag.llm import ollama_model_complete, ollama_embedding
@@ -185,28 +188,59 @@ rag = LightRAG(
)
```
### Increasing context size
For LightRAG to work properly, the context should be at least 32k tokens. By default, Ollama models have a context size of 8k tokens. You can increase it in one of two ways:
#### Increasing the `num_ctx` parameter in the Modelfile
1. Pull the model:
```bash
ollama pull qwen2
```
2. Display the model file:
```bash
ollama show --modelfile qwen2 > Modelfile
```
3. Edit the Modelfile by adding the following line:
```bash
PARAMETER num_ctx 32768
```
4. Create the modified model:
```bash
ollama create -f Modelfile qwen2m
```
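Once the modified model is created, point LightRAG at it. A minimal sketch, assuming the same setup as the earlier snippet (only the model name changes; `qwen2m` is the name chosen in step 4):
```python
from lightrag import LightRAG
from lightrag.llm import ollama_model_complete

# Sketch: same configuration as the earlier example, but using the
# Modelfile-based model, so no num_ctx override is needed at call time.
rag = LightRAG(
    working_dir="./your_working_dir",
    llm_model_func=ollama_model_complete,
    llm_model_name="qwen2m",  # model created in step 4 above
    # embedding_func omitted for brevity; configure it as shown above
)
```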
#### Setting `num_ctx` via the Ollama API
You can use the `llm_model_kwargs` parameter to configure Ollama:
```python
rag = LightRAG(
    working_dir=WORKING_DIR,
    llm_model_func=ollama_model_complete,  # Use Ollama model for text generation
    llm_model_name='your_model_name',  # Your model name
    llm_model_kwargs={"options": {"num_ctx": 32768}},
    # Use Ollama embedding function
    embedding_func=EmbeddingFunc(
        embedding_dim=768,
        max_token_size=8192,
        func=lambda texts: ollama_embedding(
            texts,
            embed_model="nomic-embed-text"
        )
    ),
)
```
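The same `llm_model_kwargs` dict can also carry client connection options: `host` and `timeout` are popped in `lightrag/llm.py` and passed to the Ollama client. A hedged sketch (the `timeout` value is an illustrative assumption):
```python
# Sketch: llm_model_kwargs carrying connection options in addition to
# generation options. "host" and "timeout" are forwarded to the Ollama
# client; everything under "options" goes to the model itself.
llm_model_kwargs = {
    "host": "http://localhost:11434",  # non-default Ollama endpoint
    "timeout": 300,  # assumed request timeout, passed through to the client
    "options": {"num_ctx": 32768},
}
```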
#### Fully functional example
There is a fully functional example, `examples/lightrag_ollama_demo.py`, that uses the `gemma2:2b` model, runs only 4 requests in parallel, and sets the context size to 32k.
#### Low RAM GPUs
To run this experiment on a low-RAM GPU, you should select a small model and tune the context window (increasing the context increases memory consumption). For example, running this Ollama example on a repurposed mining GPU with 6 GB of RAM required setting the context size to 26k while using `gemma2:2b`. It was able to find 197 entities and 19 relations in `book.txt`.
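A hedged configuration sketch of that low-RAM setup (exact values depend on your GPU; 26000 approximates the 26k that worked above):
```python
from lightrag import LightRAG
from lightrag.utils import EmbeddingFunc
from lightrag.llm import ollama_model_complete, ollama_embedding

# Sketch of the low-RAM setup described above: a small model plus a reduced
# context window. Values other than the model name and num_ctx mirror
# examples/lightrag_ollama_demo.py.
rag = LightRAG(
    working_dir="./dickens",
    llm_model_func=ollama_model_complete,
    llm_model_name="gemma2:2b",
    llm_model_max_async=4,
    llm_model_max_token_size=26000,
    llm_model_kwargs={"options": {"num_ctx": 26000}},
    embedding_func=EmbeddingFunc(
        embedding_dim=768,
        max_token_size=8192,
        func=lambda texts: ollama_embedding(texts, embed_model="nomic-embed-text"),
    ),
)
```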
</details>
### Query Param

examples/lightrag_ollama_demo.py

@@ -1,26 +1,32 @@
import os
import logging
from lightrag import LightRAG, QueryParam
from lightrag.llm import ollama_model_complete, ollama_embedding
from lightrag.utils import EmbeddingFunc
WORKING_DIR = "./dickens"
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)

rag = LightRAG(
    working_dir=WORKING_DIR,
    llm_model_func=ollama_model_complete,
    llm_model_name="gemma2:2b",
    llm_model_max_async=4,
    llm_model_max_token_size=32768,
    llm_model_kwargs={"host": "http://localhost:11434", "options": {"num_ctx": 32768}},
    embedding_func=EmbeddingFunc(
        embedding_dim=768,
        max_token_size=8192,
        func=lambda texts: ollama_embedding(
            texts, embed_model="nomic-embed-text", host="http://localhost:11434"
        ),
    ),
)

with open("./book.txt", "r", encoding="utf-8") as f:
    rag.insert(f.read())
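After the insert, the index can be queried. A hedged sketch of a follow-up query (the question text is illustrative; `QueryParam` is already imported at the top of the demo):
```python
# Sketch: querying the index built by the insert above. "hybrid" is one of
# the modes described in the Query Param section of the README.
print(
    rag.query(
        "What are the top themes in this story?",
        param=QueryParam(mode="hybrid"),
    )
)
```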

lightrag/lightrag.py

@@ -88,6 +88,7 @@ class LightRAG:
    llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct"  # 'meta-llama/Llama-3.2-1B' # 'google/gemma-2-2b-it'
    llm_model_max_token_size: int = 32768
    llm_model_max_async: int = 16
    llm_model_kwargs: dict = field(default_factory=dict)

    # storage
    key_string_value_json_storage_cls: Type[BaseKVStorage] = JsonKVStorage
@@ -154,7 +155,11 @@ class LightRAG:
        )
        self.llm_model_func = limit_async_func_call(self.llm_model_max_async)(
            partial(
                self.llm_model_func,
                hashing_kv=self.llm_response_cache,
                **self.llm_model_kwargs,
            )
        )

    def insert(self, string_or_strings):
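The effect of this change is that `llm_model_kwargs` is pre-bound into every call of the wrapped model function. A minimal standalone sketch of the `functools.partial` pattern, independent of LightRAG:
```python
import asyncio
from functools import partial

async def model_func(prompt, hashing_kv=None, **kwargs):
    # Stand-in for an LLM call; in LightRAG the kwargs reach the Ollama client.
    return f"prompt={prompt!r}, kwargs={kwargs}"

# Same wiring as in the LightRAG initializer above: the configured kwargs
# are pre-bound, so every later call carries them automatically.
llm_model_kwargs = {"options": {"num_ctx": 32768}}
bound = partial(model_func, hashing_kv=None, **llm_model_kwargs)

print(asyncio.run(bound("hello")))
# prompt='hello', kwargs={'options': {'num_ctx': 32768}}
```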

lightrag/llm.py

@@ -299,8 +299,10 @@ async def ollama_model_if_cache(
) -> str:
    kwargs.pop("max_tokens", None)
    kwargs.pop("response_format", None)
    host = kwargs.pop("host", None)
    timeout = kwargs.pop("timeout", None)
    ollama_client = ollama.AsyncClient(host=host, timeout=timeout)
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
@@ -700,10 +702,11 @@ async def hf_embedding(texts: list[str], tokenizer, embed_model) -> np.ndarray:
    return embeddings.detach().numpy()


async def ollama_embedding(texts: list[str], embed_model, **kwargs) -> np.ndarray:
    embed_text = []
    ollama_client = ollama.Client(**kwargs)
    for text in texts:
        data = ollama_client.embeddings(model=embed_model, prompt=text)
        embed_text.append(data["embedding"])
    return embed_text
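With the new `**kwargs`, callers can target a specific Ollama server, since the kwargs flow straight into `ollama.Client`. A hedged usage sketch (the host value is illustrative):
```python
import asyncio
from lightrag.llm import ollama_embedding

# Sketch: passing a non-default host per call; any other ollama.Client
# option could be supplied the same way.
embeddings = asyncio.run(
    ollama_embedding(
        ["hello world"],
        embed_model="nomic-embed-text",
        host="http://localhost:11434",
    )
)
print(len(embeddings[0]))  # 768 dimensions for nomic-embed-text
```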