From 1f8fc4459122bfc10f64f374fe1d7177e1f395d1 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 11:46:01 +0100 Subject: [PATCH] cleaned type --- lightrag/lightrag.py | 21 +++++++++++++++------ lightrag/operate.py | 11 +++++------ lightrag/utils.py | 2 +- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index b190054d..c2f45fe7 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -4,7 +4,7 @@ from tqdm.asyncio import tqdm as tqdm_async from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Any, Type, Union, cast +from typing import Any, Callable, Optional, Type, Union, cast import traceback from .operate import ( chunking_by_token_size, @@ -177,13 +177,24 @@ class LightRAG: # extension addon_params: dict[str, Any] = field(default_factory=dict) - convert_response_to_json_func: callable = convert_response_to_json + convert_response_to_json_func: Callable[[str], dict[str, Any]] = convert_response_to_json # Add new field for document status storage type doc_status_storage: str = field(default="JsonDocStatusStorage") # Custom Chunking Function - chunking_func: callable = chunking_by_token_size + chunking_func: Callable[ + [ + str, + Optional[str], + bool, + int, + int, + str, + ], + list[dict[str, Any]], + ] = chunking_by_token_size + chunking_func_kwargs: dict = field(default_factory=dict) def __post_init__(self): @@ -538,9 +549,7 @@ class LightRAG: return full_docs_ids = await self.full_docs.get_by_ids(to_process_doc_keys) - new_docs = {} - if full_docs_ids: - new_docs = {doc["id"]: doc for doc in full_docs_ids or []} + new_docs = {doc["id"]: doc for doc in full_docs_ids or []} if not new_docs: logger.info("All documents have been processed or are duplicates") diff --git a/lightrag/operate.py b/lightrag/operate.py index ec896cc4..7c70d948 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -36,12 
+36,11 @@ import time def chunking_by_token_size( content: str, - split_by_character=None, - split_by_character_only=False, - overlap_token_size=128, - max_token_size=1024, - tiktoken_model="gpt-4o", - **kwargs, + split_by_character: Union[str, None] = None, + split_by_character_only: bool = False, + overlap_token_size: int = 128, + max_token_size: int = 1024, + tiktoken_model: str = "gpt-4o", ) -> list[dict[str, Any]]: tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) results: list[dict[str, Any]] = [] diff --git a/lightrag/utils.py b/lightrag/utils.py index ed0b6c06..28d9bfaa 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -98,7 +98,7 @@ def locate_json_string_body_from_string(content: str) -> Union[str, None]: return None -def convert_response_to_json(response: str) -> dict: +def convert_response_to_json(response: str) -> dict[str, Any]: json_str = locate_json_string_body_from_string(response) assert json_str is not None, f"Unable to parse JSON from response: {response}" try: