cleaned type
@@ -4,7 +4,7 @@ from tqdm.asyncio import tqdm as tqdm_async
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from functools import partial
-from typing import Any, Type, Union, cast
+from typing import Any, Callable, Optional, Type, Union, cast
 import traceback
 from .operate import (
     chunking_by_token_size,
@@ -177,13 +177,24 @@ class LightRAG:

     # extension
     addon_params: dict[str, Any] = field(default_factory=dict)
-    convert_response_to_json_func: callable = convert_response_to_json
+    convert_response_to_json_func: Callable[[str], dict[str, Any]] = convert_response_to_json

     # Add new field for document status storage type
     doc_status_storage: str = field(default="JsonDocStatusStorage")

     # Custom Chunking Function
-    chunking_func: callable = chunking_by_token_size
+    chunking_func: Callable[
+        [
+            str,
+            Optional[str],
+            bool,
+            int,
+            int,
+            str,
+        ],
+        list[dict[str, Any]],
+    ] = chunking_by_token_size
     chunking_func_kwargs: dict = field(default_factory=dict)

     def __post_init__(self):
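A user-supplied chunking function now has to match this annotated signature. A minimal sketch, assuming the chunk dicts carry the same "tokens", "content", and "chunk_order_index" keys that the default chunking_by_token_size produces (the key names are not part of this diff); chunk_by_paragraph is a hypothetical helper name:

    from typing import Any, Optional

    def chunk_by_paragraph(
        content: str,
        split_by_character: Optional[str] = None,
        split_by_character_only: bool = False,
        overlap_token_size: int = 128,
        max_token_size: int = 1024,
        tiktoken_model: str = "gpt-4o",
    ) -> list[dict[str, Any]]:
        # Naive illustration: one chunk per blank-line-separated paragraph.
        paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
        return [
            {
                "tokens": len(p.split()),  # rough whitespace token count, illustrative only
                "content": p,
                "chunk_order_index": i,
            }
            for i, p in enumerate(paragraphs)
        ]

    # Because LightRAG is a dataclass, the field can be overridden at construction:
    # rag = LightRAG(chunking_func=chunk_by_paragraph)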
@@ -538,9 +549,7 @@ class LightRAG:
             return

         full_docs_ids = await self.full_docs.get_by_ids(to_process_doc_keys)
-        new_docs = {}
-        if full_docs_ids:
-            new_docs = {doc["id"]: doc for doc in full_docs_ids or []}
+        new_docs = {doc["id"]: doc for doc in full_docs_ids or []}

         if not new_docs:
             logger.info("All documents have been processed or are duplicates")
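The collapsed comprehension relies on "or []" to cover exactly the case the removed if-branch guarded: when get_by_ids returns None or an empty list, iterating over "full_docs_ids or []" yields nothing and new_docs stays empty. A quick illustration:

    docs = None
    {doc["id"]: doc for doc in docs or []}       # {} -- same result as the old `new_docs = {}` path
    docs = [{"id": "doc-1", "content": "..."}]
    {doc["id"]: doc for doc in docs or []}       # {"doc-1": {"id": "doc-1", "content": "..."}}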
@@ -36,12 +36,11 @@ import time

 def chunking_by_token_size(
     content: str,
-    split_by_character=None,
-    split_by_character_only=False,
-    overlap_token_size=128,
-    max_token_size=1024,
-    tiktoken_model="gpt-4o",
-    **kwargs,
+    split_by_character: Union[str, None] = None,
+    split_by_character_only: bool = False,
+    overlap_token_size: int = 128,
+    max_token_size: int = 1024,
+    tiktoken_model: str = "gpt-4o",
 ) -> list[dict[str, Any]]:
     tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
     results: list[dict[str, Any]] = []
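With **kwargs dropped from the signature, callers are limited to the declared parameters; an unexpected keyword now raises a TypeError instead of being silently swallowed. A usage sketch (the input text and argument values are illustrative only):

    chunks = chunking_by_token_size(
        "First paragraph...\n\nSecond paragraph...",
        split_by_character="\n\n",
        split_by_character_only=False,
        overlap_token_size=64,
        max_token_size=512,
        tiktoken_model="gpt-4o",
    )
    # chunks is a list[dict[str, Any]], one dict per chunk.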
@@ -98,7 +98,7 @@ def locate_json_string_body_from_string(content: str) -> Union[str, None]:
     return None


-def convert_response_to_json(response: str) -> dict:
+def convert_response_to_json(response: str) -> dict[str, Any]:
     json_str = locate_json_string_body_from_string(response)
     assert json_str is not None, f"Unable to parse JSON from response: {response}"
     try:
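The narrowed return annotation only affects static type checking; runtime behaviour is unchanged. A hedged usage sketch, assuming the locator finds the JSON braces in the response text:

    parsed = convert_response_to_json('Here is the result: {"entities": [], "relations": []}')
    # Type checkers now see parsed as dict[str, Any] rather than a bare dict,
    # so parsed["entities"] is known to be keyed by str.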