Adjust concurrency limits more LLM friendly settings for new comers

- Lowered max async LLM processes to 4 - Enabled LLM cache for entity extraction - Reduced max parallel insert to 2
2025-03-16 23:56:34 +08:00
parent 9d971e5889
commit c2ba7f33ff
5 changed files with 7 additions and 6 deletions
--- a/lightrag/api/README.md
+++ b/lightrag/api/README.md
@@ -224,7 +224,7 @@ LightRAG supports binding to various LLM/Embedding backends:
 Use environment variables  `LLM_BINDING` or CLI argument `--llm-binding` to select LLM backend type. Use environment variables  `EMBEDDING_BINDING` or CLI argument `--embedding-binding` to select LLM backend type.

 ### Entity Extraction Configuration
-* ENABLE_LLM_CACHE_FOR_EXTRACT: Enable LLM cache for entity extraction (default: false)
+* ENABLE_LLM_CACHE_FOR_EXTRACT: Enable LLM cache for entity extraction (default: true)

 It's very common to set `ENABLE_LLM_CACHE_FOR_EXTRACT` to true for test environment to reduce the cost of LLM calls.

--- a/lightrag/api/utils_api.py
+++ b/lightrag/api/utils_api.py
@@ -364,7 +364,7 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace:

    # Inject LLM cache configuration
    args.enable_llm_cache_for_extract = get_env_value(
-        "ENABLE_LLM_CACHE_FOR_EXTRACT", False, bool
+        "ENABLE_LLM_CACHE_FOR_EXTRACT", True, bool
    )

    # Select Document loading tool (DOCLING, DEFAULT)
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -214,7 +214,7 @@ class LightRAG:
    llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32768)))
    """Maximum number of tokens allowed per LLM response."""

-    llm_model_max_async: int = field(default=int(os.getenv("MAX_ASYNC", 16)))
+    llm_model_max_async: int = field(default=int(os.getenv("MAX_ASYNC", 4)))
    """Maximum number of concurrent LLM calls."""

    llm_model_kwargs: dict[str, Any] = field(default_factory=dict)
@@ -238,7 +238,7 @@ class LightRAG:
    # Extensions
    # ---

-    max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 20)))
+    max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 2)))
    """Maximum number of parallel insert operations."""

    addon_params: dict[str, Any] = field(