Merge pull request #735 from YanSte/documentatino

Updated documentation
This commit is contained in:
zrguo
2025-02-11 08:57:39 +08:00
committed by GitHub
3 changed files with 128 additions and 37 deletions

View File

@@ -355,16 +355,26 @@ In order to run this experiment on low RAM GPU you should select small model and
```python
class QueryParam:
    mode: Literal["local", "global", "hybrid", "naive", "mix"] = "global"
    """Specifies the retrieval mode:
    - "local": Focuses on context-dependent information.
    - "global": Utilizes global knowledge.
    - "hybrid": Combines local and global retrieval methods.
    - "naive": Performs a basic search without advanced techniques.
    - "mix": Integrates knowledge graph and vector retrieval.
    """
    only_need_context: bool = False
    """If True, only returns the retrieved context without generating a response."""
    response_type: str = "Multiple Paragraphs"
    """Defines the response format. Examples: 'Multiple Paragraphs', 'Single Paragraph', 'Bullet Points'."""
    top_k: int = 60
    """Number of top items to retrieve. Represents entities in 'local' mode and relationships in 'global' mode."""
    max_token_for_text_unit: int = 4000
    """Maximum number of tokens allowed for each retrieved text chunk."""
    max_token_for_global_context: int = 4000
    """Maximum number of tokens allocated for relationship descriptions in global retrieval."""
    max_token_for_local_context: int = 4000
    """Maximum number of tokens allocated for entity descriptions in local retrieval."""
    ...
```
> The default value of `top_k` can be changed via the environment variable `TOP_K`.
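
For reference, a minimal usage sketch of these parameters (assuming a `LightRAG` instance named `rag` has already been initialized as shown elsewhere in the README; the query text and values below are purely illustrative):

```python
from lightrag import QueryParam

# Illustrative settings; when top_k is omitted, the TOP_K environment variable
# (default 60) determines how many items are retrieved.
param = QueryParam(
    mode="hybrid",                    # combine local and global retrieval
    top_k=40,                         # entities in 'local' mode, relationships in 'global' mode
    response_type="Bullet Points",
)
print(rag.query("What are the main themes of the documents?", param=param))
```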

View File

@@ -27,30 +27,54 @@ T = TypeVar("T")
@dataclass
class QueryParam:
    """Configuration parameters for query execution in LightRAG."""
    mode: Literal["local", "global", "hybrid", "naive", "mix"] = "global"
    """Specifies the retrieval mode:
    - "local": Focuses on context-dependent information.
    - "global": Utilizes global knowledge.
    - "hybrid": Combines local and global retrieval methods.
    - "naive": Performs a basic search without advanced techniques.
    - "mix": Integrates knowledge graph and vector retrieval.
    """
    only_need_context: bool = False
    """If True, only returns the retrieved context without generating a response."""
    only_need_prompt: bool = False
    """If True, only returns the generated prompt without producing a response."""
    response_type: str = "Multiple Paragraphs"
    """Defines the response format. Examples: 'Multiple Paragraphs', 'Single Paragraph', 'Bullet Points'."""
    stream: bool = False
    """If True, enables streaming output for real-time responses."""
    top_k: int = int(os.getenv("TOP_K", "60"))
    """Number of top items to retrieve. Represents entities in 'local' mode and relationships in 'global' mode."""
    max_token_for_text_unit: int = 4000
    """Maximum number of tokens allowed for each retrieved text chunk."""
    max_token_for_global_context: int = 4000
    """Maximum number of tokens allocated for relationship descriptions in global retrieval."""
    max_token_for_local_context: int = 4000
    """Maximum number of tokens allocated for entity descriptions in local retrieval."""
    hl_keywords: list[str] = field(default_factory=list)
    """List of high-level keywords to prioritize in retrieval."""
    ll_keywords: list[str] = field(default_factory=list)
    """List of low-level keywords to refine retrieval focus."""
    conversation_history: list[dict[str, Any]] = field(default_factory=list)
    """Stores past conversation history to maintain context.
    Format: [{"role": "user/assistant", "content": "message"}].
    """
    history_turns: int = 3
    """Number of complete conversation turns (user-assistant pairs) to consider in the response context."""
@dataclass

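As a concrete illustration of the conversation-history fields documented above, a short hypothetical sketch (the history, question, and `rag` instance are placeholders, not part of this change):

```python
from lightrag import QueryParam

# Hypothetical prior exchange, in the documented format:
# [{"role": "user/assistant", "content": "message"}]
history = [
    {"role": "user", "content": "Who founded the company mentioned in the report?"},
    {"role": "assistant", "content": "The report credits Ada Example as the founder."},
]

param = QueryParam(
    mode="mix",                    # knowledge graph + vector retrieval
    conversation_history=history,
    history_turns=1,               # consider only the most recent user-assistant pair
)
response = rag.query("What happened to her after that?", param=param)
```
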
View File

@@ -109,38 +109,65 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
@dataclass
class LightRAG:
    """LightRAG: Simple and Fast Retrieval-Augmented Generation."""
    working_dir: str = field(
        default_factory=lambda: f'./lightrag_cache_{datetime.now().strftime("%Y-%m-%d-%H:%M:%S")}'
    )
    """Directory where cache and temporary files are stored."""
    embedding_cache_config: dict[str, Any] = field(
        default_factory=lambda: {
            "enabled": False,
            "similarity_threshold": 0.95,
            "use_llm_check": False,
        }
    )
    """Configuration for embedding cache.
    - enabled: If True, enables caching to avoid redundant computations.
    - similarity_threshold: Minimum similarity score to use cached embeddings.
    - use_llm_check: If True, validates cached embeddings using an LLM.
    """
    kv_storage: str = field(default="JsonKVStorage")
    """Storage backend for key-value data."""
    vector_storage: str = field(default="NanoVectorDBStorage")
    """Storage backend for vector embeddings."""
    graph_storage: str = field(default="NetworkXStorage")
    """Storage backend for knowledge graphs."""
    # Logging
    current_log_level = logger.level
    log_level: int = field(default=current_log_level)
    """Logging level for the system (e.g., 'DEBUG', 'INFO', 'WARNING')."""
    log_dir: str = field(default=os.getcwd())
    """Directory where logs are stored. Defaults to the current working directory."""
    # Text chunking
    chunk_token_size: int = 1200
    """Maximum number of tokens per text chunk when splitting documents."""
    chunk_overlap_token_size: int = 100
    """Number of overlapping tokens between consecutive text chunks to preserve context."""
    tiktoken_model_name: str = "gpt-4o-mini"
    """Model name used for tokenization when chunking text."""
    # Entity extraction
    entity_extract_max_gleaning: int = 1
    """Maximum number of entity extraction attempts for ambiguous content."""
    entity_summary_to_max_tokens: int = 500
    """Maximum number of tokens used for summarizing extracted entities."""
    # Node embedding
    node_embedding_algorithm: str = "node2vec"
    """Algorithm used for node embedding in knowledge graphs."""
    node2vec_params: dict[str, int] = field(
        default_factory=lambda: {
            "dimensions": 1536,
            "num_walks": 10,
@@ -150,26 +177,56 @@ class LightRAG:
            "random_seed": 3,
        }
    )
    """Configuration for the node2vec embedding algorithm:
    - dimensions: Number of dimensions for embeddings.
    - num_walks: Number of random walks per node.
    - walk_length: Number of steps per random walk.
    - window_size: Context window size for training.
    - iterations: Number of iterations for training.
    - random_seed: Seed value for reproducibility.
    """
    embedding_func: EmbeddingFunc = None
    """Function for computing text embeddings. Must be set before use."""
    embedding_batch_num: int = 32
    """Batch size for embedding computations."""
    embedding_func_max_async: int = 16
    """Maximum number of concurrent embedding function calls."""
    # LLM Configuration
    llm_model_func: callable = None
    """Function for interacting with the large language model (LLM). Must be set before use."""
    llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct"
    """Name of the LLM model used for generating responses."""
    llm_model_max_token_size: int = int(os.getenv("MAX_TOKENS", "32768"))
    """Maximum number of tokens allowed per LLM response."""
    llm_model_max_async: int = int(os.getenv("MAX_ASYNC", "16"))
    """Maximum number of concurrent LLM calls."""
    llm_model_kwargs: dict[str, Any] = field(default_factory=dict)
    """Additional keyword arguments passed to the LLM model function."""
    # Storage
    vector_db_storage_cls_kwargs: dict[str, Any] = field(default_factory=dict)
    """Additional parameters for vector database storage."""
    namespace_prefix: str = field(default="")
    """Prefix for namespacing stored data across different environments."""
    enable_llm_cache: bool = True
    """Enables caching for LLM responses to avoid redundant computations."""
    enable_llm_cache_for_entity_extract: bool = True
    """If True, enables caching for entity extraction steps to reduce LLM costs."""
    # Extensions
    addon_params: dict[str, Any] = field(default_factory=dict)
    """Dictionary for additional parameters and extensions."""
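
Because `embedding_func` and `llm_model_func` must be set before use, a minimal initialization sketch may help; `my_llm_complete` and `my_embedding_func` below are placeholders for the model bindings you supply (for example, the OpenAI or Hugging Face helpers shipped under `lightrag.llm`), and the other values are illustrative:

```python
from lightrag import LightRAG

rag = LightRAG(
    working_dir="./my_rag_cache",          # cache and temporary files go here
    llm_model_func=my_llm_complete,        # required: no default LLM binding is provided
    embedding_func=my_embedding_func,      # required: no default embedding function is provided
    chunk_token_size=1200,                 # tokens per chunk when splitting documents
    enable_llm_cache=True,                 # reuse cached LLM responses where possible
)
rag.insert("Text of a document to index.")
```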
@@ -177,8 +234,8 @@ class LightRAG:
        convert_response_to_json
    )
    doc_status_storage: str = field(default="JsonDocStatusStorage")
    """Storage type for tracking document processing statuses."""
    # Custom Chunking Function
    chunking_func: Callable[