From e1f4f9560da021fb4b731baf119889873728e71d Mon Sep 17 00:00:00 2001
From: Yannick Stephan
Date: Sun, 9 Feb 2025 00:13:26 +0100
Subject: [PATCH 1/4] updated documentation

---
 lightrag/base.py | 71 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 24 deletions(-)

diff --git a/lightrag/base.py b/lightrag/base.py
index 1a7f9c2e..ae5ce92e 100644
--- a/lightrag/base.py
+++ b/lightrag/base.py
@@ -27,31 +27,54 @@ T = TypeVar("T")
 
 @dataclass
 class QueryParam:
-    mode: Literal["local", "global", "hybrid", "naive", "mix"] = "global"
-    only_need_context: bool = False
-    only_need_prompt: bool = False
-    response_type: str = "Multiple Paragraphs"
-    stream: bool = False
-    # Number of top-k items to retrieve; corresponds to entities in "local" mode and relationships in "global" mode.
-    top_k: int = int(os.getenv("TOP_K", "60"))
-    # Number of document chunks to retrieve.
-    # top_n: int = 10
-    # Number of tokens for the original chunks.
-    max_token_for_text_unit: int = 4000
-    # Number of tokens for the relationship descriptions
-    max_token_for_global_context: int = 4000
-    # Number of tokens for the entity descriptions
-    max_token_for_local_context: int = 4000
-    hl_keywords: list[str] = field(default_factory=list)
-    ll_keywords: list[str] = field(default_factory=list)
-    # Conversation history support
-    conversation_history: list[dict[str, str]] = field(
-        default_factory=list
-    )  # Format: [{"role": "user/assistant", "content": "message"}]
-    history_turns: int = (
-        3  # Number of complete conversation turns (user-assistant pairs) to consider
-    )
+    """Configuration parameters for query execution in LightRAG."""
+    mode: Literal["local", "global", "hybrid", "naive", "mix"] = "global"
+    """Specifies the retrieval mode:
+    - "local": Focuses on context-dependent information.
+    - "global": Utilizes global knowledge.
+    - "hybrid": Combines local and global retrieval methods.
+    - "naive": Performs a basic search without advanced techniques.
+    - "mix": Integrates knowledge graph and vector retrieval.
+    """
+
+    only_need_context: bool = False
+    """If True, only returns the retrieved context without generating a response."""
+
+    only_need_prompt: bool = False
+    """If True, only returns the generated prompt without producing a response."""
+
+    response_type: str = "Multiple Paragraphs"
+    """Defines the response format. Examples: 'Multiple Paragraphs', 'Single Paragraph', 'Bullet Points'."""
+
+    stream: bool = False
+    """If True, enables streaming output for real-time responses."""
+
+    top_k: int = int(os.getenv("TOP_K", "60"))
+    """Number of top items to retrieve. Represents entities in 'local' mode and relationships in 'global' mode."""
+
+    max_token_for_text_unit: int = 4000
+    """Maximum number of tokens allowed for each retrieved text chunk."""
+
+    max_token_for_global_context: int = 4000
+    """Maximum number of tokens allocated for relationship descriptions in global retrieval."""
+
+    max_token_for_local_context: int = 4000
+    """Maximum number of tokens allocated for entity descriptions in local retrieval."""
+
+    hl_keywords: List[str] = field(default_factory=list)
+    """List of high-level keywords to prioritize in retrieval."""
+
+    ll_keywords: List[str] = field(default_factory=list)
+    """List of low-level keywords to refine retrieval focus."""
+
+    conversation_history: List[dict[str, Any]] = field(default_factory=list)
+    """Stores past conversation history to maintain context.
+    Format: [{"role": "user/assistant", "content": "message"}].
+ """ + + history_turns: int = 3 + """Number of complete conversation turns (user-assistant pairs) to consider in the response context.""" @dataclass class StorageNameSpace: From 4c2f13f79e27ce4790cce637a25825989a4893ef Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 00:23:55 +0100 Subject: [PATCH 2/4] improved docs --- lightrag/lightrag.py | 97 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 77 insertions(+), 20 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 347f0f4c..eff4614d 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -109,38 +109,65 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop: @dataclass class LightRAG: + """LightRAG: Simple and Fast Retrieval-Augmented Generation.""" + working_dir: str = field( default_factory=lambda: f'./lightrag_cache_{datetime.now().strftime("%Y-%m-%d-%H:%M:%S")}' ) - # Default not to use embedding cache - embedding_cache_config: dict = field( + """Directory where cache and temporary files are stored.""" + + embedding_cache_config: dict[str, Any] = field( default_factory=lambda: { "enabled": False, "similarity_threshold": 0.95, "use_llm_check": False, } ) + """Configuration for embedding cache. + - enabled: If True, enables caching to avoid redundant computations. + - similarity_threshold: Minimum similarity score to use cached embeddings. + - use_llm_check: If True, validates cached embeddings using an LLM. + """ + kv_storage: str = field(default="JsonKVStorage") + """Storage backend for key-value data.""" + vector_storage: str = field(default="NanoVectorDBStorage") + """Storage backend for vector embeddings.""" + graph_storage: str = field(default="NetworkXStorage") + """Storage backend for knowledge graphs.""" - # logging + # Logging current_log_level = logger.level - log_level: str = field(default=current_log_level) + log_level: int = field(default=current_log_level) + """Logging level for the system (e.g., 'DEBUG', 'INFO', 'WARNING').""" + log_dir: str = field(default=os.getcwd()) + """Directory where logs are stored. Defaults to the current working directory.""" - # text chunking + # Text chunking chunk_token_size: int = 1200 + """Maximum number of tokens per text chunk when splitting documents.""" + chunk_overlap_token_size: int = 100 + """Number of overlapping tokens between consecutive text chunks to preserve context.""" + tiktoken_model_name: str = "gpt-4o-mini" + """Model name used for tokenization when chunking text.""" - # entity extraction + # Entity extraction entity_extract_max_gleaning: int = 1 - entity_summary_to_max_tokens: int = 500 + """Maximum number of entity extraction attempts for ambiguous content.""" - # node embedding + entity_summary_to_max_tokens: int = 500 + """Maximum number of tokens used for summarizing extracted entities.""" + + # Node embedding node_embedding_algorithm: str = "node2vec" - node2vec_params: dict = field( + """Algorithm used for node embedding in knowledge graphs.""" + + node2vec_params: dict[str, int] = field( default_factory=lambda: { "dimensions": 1536, "num_walks": 10, @@ -150,26 +177,56 @@ class LightRAG: "random_seed": 3, } ) + """Configuration for the node2vec embedding algorithm: + - dimensions: Number of dimensions for embeddings. + - num_walks: Number of random walks per node. + - walk_length: Number of steps per random walk. + - window_size: Context window size for training. + - iterations: Number of iterations for training. + - random_seed: Seed value for reproducibility. 
+ """ + + embedding_func: EmbeddingFunc = None + """Function for computing text embeddings. Must be set before use.""" - # embedding_func: EmbeddingFunc = field(default_factory=lambda:hf_embedding) - embedding_func: EmbeddingFunc = None # This must be set (we do want to separate llm from the corte, so no more default initialization) embedding_batch_num: int = 32 + """Batch size for embedding computations.""" + embedding_func_max_async: int = 16 + """Maximum number of concurrent embedding function calls.""" + + # LLM Configuration + llm_model_func: callable = None + """Function for interacting with the large language model (LLM). Must be set before use.""" + + llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct" + """Name of the LLM model used for generating responses.""" - # LLM - llm_model_func: callable = None # This must be set (we do want to separate llm from the corte, so no more default initialization) - llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct" # 'meta-llama/Llama-3.2-1B'#'google/gemma-2-2b-it' llm_model_max_token_size: int = int(os.getenv("MAX_TOKENS", "32768")) - llm_model_max_async: int = int(os.getenv("MAX_ASYNC", "16")) - llm_model_kwargs: dict = field(default_factory=dict) + """Maximum number of tokens allowed per LLM response.""" + + llm_model_max_async: int = int(os.getenv("MAX_ASYNC", "16")) + """Maximum number of concurrent LLM calls.""" + + llm_model_kwargs: dict[str, Any] = field(default_factory=dict) + """Additional keyword arguments passed to the LLM model function.""" + + # Storage + vector_db_storage_cls_kwargs: dict[str, Any] = field(default_factory=dict) + """Additional parameters for vector database storage.""" - # storage - vector_db_storage_cls_kwargs: dict = field(default_factory=dict) namespace_prefix: str = field(default="") + """Prefix for namespacing stored data across different environments.""" enable_llm_cache: bool = True - # Sometimes there are some reason the LLM failed at Extracting Entities, and we want to continue without LLM cost, we can use this flag + """Enables caching for LLM responses to avoid redundant computations.""" + enable_llm_cache_for_entity_extract: bool = True + """If True, enables caching for entity extraction steps to reduce LLM costs.""" + + # Extensions + addon_params: dict[str, Any] = field(default_factory=dict) + """Dictionary for additional parameters and extensions.""" # extension addon_params: dict[str, Any] = field(default_factory=dict) @@ -177,8 +234,8 @@ class LightRAG: convert_response_to_json ) - # Add new field for document status storage type doc_status_storage: str = field(default="JsonDocStatusStorage") + """Storage type for tracking document processing statuses.""" # Custom Chunking Function chunking_func: Callable[ From 41f76ec4592eca9873e9f94ea9a5545c51f266e8 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 01:05:27 +0100 Subject: [PATCH 3/4] updated readme --- README.md | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 850cacd3..480b8d00 100644 --- a/README.md +++ b/README.md @@ -355,16 +355,26 @@ In order to run this experiment on low RAM GPU you should select small model and ```python class QueryParam: mode: Literal["local", "global", "hybrid", "naive", "mix"] = "global" + """Specifies the retrieval mode: + - "local": Focuses on context-dependent information. + - "global": Utilizes global knowledge. + - "hybrid": Combines local and global retrieval methods. 
+ - "naive": Performs a basic search without advanced techniques. + - "mix": Integrates knowledge graph and vector retrieval. + """ only_need_context: bool = False + """If True, only returns the retrieved context without generating a response.""" response_type: str = "Multiple Paragraphs" - # Number of top-k items to retrieve; corresponds to entities in "local" mode and relationships in "global" mode. + """Defines the response format. Examples: 'Multiple Paragraphs', 'Single Paragraph', 'Bullet Points'.""" top_k: int = 60 - # Number of tokens for the original chunks. + """Number of top items to retrieve. Represents entities in 'local' mode and relationships in 'global' mode.""" max_token_for_text_unit: int = 4000 - # Number of tokens for the relationship descriptions + """Maximum number of tokens allowed for each retrieved text chunk.""" max_token_for_global_context: int = 4000 - # Number of tokens for the entity descriptions + """Maximum number of tokens allocated for relationship descriptions in global retrieval.""" max_token_for_local_context: int = 4000 + """Maximum number of tokens allocated for entity descriptions in local retrieval.""" + ... ``` > default value of Top_k can be change by environment variables TOP_K. From 23283180c7a6e368489df53bb9fa4f209cb00154 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 18:03:34 +0100 Subject: [PATCH 4/4] fixed type --- README.md | 2 +- lightrag/base.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 480b8d00..cf1d86aa 100644 --- a/README.md +++ b/README.md @@ -361,7 +361,7 @@ class QueryParam: - "hybrid": Combines local and global retrieval methods. - "naive": Performs a basic search without advanced techniques. - "mix": Integrates knowledge graph and vector retrieval. - """ + """ only_need_context: bool = False """If True, only returns the retrieved context without generating a response.""" response_type: str = "Multiple Paragraphs" diff --git a/lightrag/base.py b/lightrag/base.py index ae5ce92e..0e3f1dc6 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -62,13 +62,13 @@ class QueryParam: max_token_for_local_context: int = 4000 """Maximum number of tokens allocated for entity descriptions in local retrieval.""" - hl_keywords: List[str] = field(default_factory=list) + hl_keywords: list[str] = field(default_factory=list) """List of high-level keywords to prioritize in retrieval.""" - ll_keywords: List[str] = field(default_factory=list) + ll_keywords: list[str] = field(default_factory=list) """List of low-level keywords to refine retrieval focus.""" - conversation_history: List[dict[str, Any]] = field(default_factory=list) + conversation_history: list[dict[str, Any]] = field(default_factory=list) """Stores past conversation history to maintain context. Format: [{"role": "user/assistant", "content": "message"}]. """ @@ -76,6 +76,7 @@ class QueryParam: history_turns: int = 3 """Number of complete conversation turns (user-assistant pairs) to consider in the response context.""" + @dataclass class StorageNameSpace: namespace: str