diff --git a/README.md b/README.md
index 61e7b20f..c9a35260 100644
--- a/README.md
+++ b/README.md
@@ -1061,7 +1061,7 @@ Valid modes are:
 | **llm\_model\_func** | `callable` | Function for LLM generation | `gpt_4o_mini_complete` |
 | **llm\_model\_name** | `str` | LLM model name for generation | `meta-llama/Llama-3.2-1B-Instruct` |
 | **llm\_model\_max\_token\_size** | `int` | Maximum token size for LLM generation (affects entity relation summaries) | `32768`(default value changed by env var MAX_TOKENS) |
-| **llm\_model\_max\_async** | `int` | Maximum number of concurrent asynchronous LLM processes | `16`(default value changed by env var MAX_ASYNC) |
+| **llm\_model\_max\_async** | `int` | Maximum number of concurrent asynchronous LLM processes | `4`(default value changed by env var MAX_ASYNC) |
 | **llm\_model\_kwargs** | `dict` | Additional parameters for LLM generation | |
 | **vector\_db\_storage\_cls\_kwargs** | `dict` | Additional parameters for vector database, like setting the threshold for nodes and relations retrieval. | cosine_better_than_threshold: 0.2(default value changed by env var COSINE_THRESHOLD) |
 | **enable\_llm\_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` |
diff --git a/env.example b/env.example
index 955741ef..66d209ad 100644
--- a/env.example
+++ b/env.example
@@ -50,7 +50,8 @@
 # MAX_TOKEN_SUMMARY=500 # Max tokens for entity or relations summary
 # SUMMARY_LANGUAGE=English
 # MAX_EMBED_TOKENS=8192
-# ENABLE_LLM_CACHE_FOR_EXTRACT=false # Enable LLM cache for entity extraction, defaults to false
+# ENABLE_LLM_CACHE_FOR_EXTRACT=true # Enable LLM cache for entity extraction
+# MAX_PARALLEL_INSERT=2 # Maximum number of parallel processing documents in pipeline

 ### LLM Configuration (Use valid host. For local services installed with docker, you can use host.docker.internal)
 LLM_BINDING=ollama
diff --git a/lightrag/api/README.md b/lightrag/api/README.md
index 7a07ddb8..8dcba7a2 100644
--- a/lightrag/api/README.md
+++ b/lightrag/api/README.md
@@ -224,7 +224,7 @@ LightRAG supports binding to various LLM/Embedding backends:
 Use environment variables `LLM_BINDING` or CLI argument `--llm-binding` to select LLM backend type. Use environment variables `EMBEDDING_BINDING` or CLI argument `--embedding-binding` to select LLM backend type.

 ### Entity Extraction Configuration
-* ENABLE_LLM_CACHE_FOR_EXTRACT: Enable LLM cache for entity extraction (default: false)
+* ENABLE_LLM_CACHE_FOR_EXTRACT: Enable LLM cache for entity extraction (default: true)

 It's very common to set `ENABLE_LLM_CACHE_FOR_EXTRACT` to true for test environment to reduce the cost of LLM calls.
diff --git a/lightrag/api/utils_api.py b/lightrag/api/utils_api.py
index 1f75db9c..88a0132c 100644
--- a/lightrag/api/utils_api.py
+++ b/lightrag/api/utils_api.py
@@ -364,7 +364,7 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace:

     # Inject LLM cache configuration
     args.enable_llm_cache_for_extract = get_env_value(
-        "ENABLE_LLM_CACHE_FOR_EXTRACT", False, bool
+        "ENABLE_LLM_CACHE_FOR_EXTRACT", True, bool
     )

     # Select Document loading tool (DOCLING, DEFAULT)
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 9fe13f97..5a5461e0 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -214,7 +214,7 @@ class LightRAG:
     llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32768)))
     """Maximum number of tokens allowed per LLM response."""

-    llm_model_max_async: int = field(default=int(os.getenv("MAX_ASYNC", 16)))
+    llm_model_max_async: int = field(default=int(os.getenv("MAX_ASYNC", 4)))
     """Maximum number of concurrent LLM calls."""

     llm_model_kwargs: dict[str, Any] = field(default_factory=dict)
@@ -238,7 +238,7 @@ class LightRAG:

     # Extensions
     # ---
-    max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 20)))
+    max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 2)))
     """Maximum number of parallel insert operations."""

     addon_params: dict[str, Any] = field(
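
This change only lowers the built-in defaults for `MAX_ASYNC` and `MAX_PARALLEL_INSERT` and flips `ENABLE_LLM_CACHE_FOR_EXTRACT` to `true`; deployments that want higher concurrency can still override them. Below is a minimal sketch (not part of the diff) of the two override paths, assuming the standard `LightRAG` constructor and that `gpt_4o_mini_complete` is importable from `lightrag.llm.openai` (the import path may vary by release). Note that the dataclass defaults read the environment variables when the `lightrag` module is imported, so env-var overrides must be set before the import; explicit constructor arguments always take precedence.

```python
import os

# Env-var overrides must be in place before `lightrag` is imported,
# because the dataclass defaults call os.getenv() at class-definition time.
os.environ["MAX_ASYNC"] = "8"            # allow 8 concurrent LLM calls instead of the new default of 4
os.environ["MAX_PARALLEL_INSERT"] = "4"  # process 4 documents in parallel instead of the new default of 2

from lightrag import LightRAG
from lightrag.llm.openai import gpt_4o_mini_complete  # assumed import path; may differ by release

rag = LightRAG(
    working_dir="./rag_storage",
    llm_model_func=gpt_4o_mini_complete,
    llm_model_max_async=8,              # explicit argument overrides the env-var default
    enable_llm_cache_for_extract=True,  # matches the new default; pass False to disable extraction caching
)
```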