From 1f8fc4459122bfc10f64f374fe1d7177e1f395d1 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 11:46:01 +0100 Subject: [PATCH] cleaned type --- lightrag/lightrag.py | 21 +++++++++++++++------ lightrag/operate.py | 11 +++++------ lightrag/utils.py | 2 +- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index b190054d..c2f45fe7 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -4,7 +4,7 @@ from tqdm.asyncio import tqdm as tqdm_async from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Any, Type, Union, cast +from typing import Any, Callable, Optional, Type, Union, cast import traceback from .operate import ( chunking_by_token_size, @@ -177,13 +177,24 @@ class LightRAG: # extension addon_params: dict[str, Any] = field(default_factory=dict) - convert_response_to_json_func: callable = convert_response_to_json + convert_response_to_json_func: Callable[[str], dict[str, Any]] = convert_response_to_json # Add new field for document status storage type doc_status_storage: str = field(default="JsonDocStatusStorage") # Custom Chunking Function - chunking_func: callable = chunking_by_token_size + chunking_func: Callable[ + [ + str, + Optional[str], + bool, + int, + int, + str, + ], + list[dict[str, Any]], + ] = chunking_by_token_size + chunking_func_kwargs: dict = field(default_factory=dict) def __post_init__(self): @@ -538,9 +549,7 @@ class LightRAG: return full_docs_ids = await self.full_docs.get_by_ids(to_process_doc_keys) - new_docs = {} - if full_docs_ids: - new_docs = {doc["id"]: doc for doc in full_docs_ids or []} + new_docs = {doc["id"]: doc for doc in full_docs_ids or []} if not new_docs: logger.info("All documents have been processed or are duplicates") diff --git a/lightrag/operate.py b/lightrag/operate.py index ec896cc4..7c70d948 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -36,12 
+36,11 @@ import time def chunking_by_token_size( content: str, - split_by_character=None, - split_by_character_only=False, - overlap_token_size=128, - max_token_size=1024, - tiktoken_model="gpt-4o", - **kwargs, + split_by_character: Union[str, None] = None, + split_by_character_only: bool = False, + overlap_token_size: int = 128, + max_token_size: int = 1024, + tiktoken_model: str = "gpt-4o", ) -> list[dict[str, Any]]: tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) results: list[dict[str, Any]] = [] diff --git a/lightrag/utils.py b/lightrag/utils.py index ed0b6c06..28d9bfaa 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -98,7 +98,7 @@ def locate_json_string_body_from_string(content: str) -> Union[str, None]: return None -def convert_response_to_json(response: str) -> dict: +def convert_response_to_json(response: str) -> dict[str, Any]: json_str = locate_json_string_body_from_string(response) assert json_str is not None, f"Unable to parse JSON from response: {response}" try: