cleaned type

Yannick Stephan
2025-02-09 11:46:01 +01:00
parent 572a75b141
commit 1f8fc44591
3 changed files with 21 additions and 13 deletions

View File

@@ -4,7 +4,7 @@ from tqdm.asyncio import tqdm as tqdm_async
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from functools import partial
-from typing import Any, Type, Union, cast
+from typing import Any, Callable, Optional, Type, Union, cast
 import traceback
 from .operate import (
     chunking_by_token_size,
@@ -177,13 +177,24 @@ class LightRAG:
     # extension
     addon_params: dict[str, Any] = field(default_factory=dict)
-    convert_response_to_json_func: callable = convert_response_to_json
+    convert_response_to_json_func: Callable[[str], dict[str, Any]] = convert_response_to_json
 
     # Add new field for document status storage type
     doc_status_storage: str = field(default="JsonDocStatusStorage")
 
     # Custom Chunking Function
-    chunking_func: callable = chunking_by_token_size
+    chunking_func: Callable[
+        [
+            str,
+            Optional[str],
+            bool,
+            int,
+            int,
+            str,
+        ],
+        list[dict[str, Any]],
+    ] = chunking_by_token_size
     chunking_func_kwargs: dict = field(default_factory=dict)
 
     def __post_init__(self):
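Any function assigned to chunking_func must now match that six-parameter Callable. A minimal sketch of a conforming custom chunker, assuming nothing beyond this diff (the whitespace splitting and the result-dict keys are illustrative, not taken from the commit):

from typing import Any, Optional

def naive_chunking(
    content: str,
    split_by_character: Optional[str] = None,
    split_by_character_only: bool = False,
    overlap_token_size: int = 128,
    max_token_size: int = 1024,
    tiktoken_model: str = "gpt-4o",
) -> list[dict[str, Any]]:
    # Illustrative only: split on the requested character, else on whitespace;
    # a real chunker would honor the overlap/max token budgets via a tokenizer.
    pieces = content.split(split_by_character) if split_by_character else content.split()
    return [
        {"tokens": len(piece), "content": piece, "chunk_order_index": i}
        for i, piece in enumerate(pieces)
    ]

Since LightRAG is a dataclass, such an override can presumably be supplied at construction time, e.g. LightRAG(chunking_func=naive_chunking).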
@@ -538,8 +549,6 @@ class LightRAG:
             return
 
         full_docs_ids = await self.full_docs.get_by_ids(to_process_doc_keys)
-        new_docs = {}
-        if full_docs_ids:
-            new_docs = {doc["id"]: doc for doc in full_docs_ids or []}
+        new_docs = {doc["id"]: doc for doc in full_docs_ids or []}
 
         if not new_docs:
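The deleted guard was redundant: the "or []" fallback in the comprehension already turns a None or empty result into an empty dict, which the following "if not new_docs:" then catches. A quick self-contained check:

assert {d["id"]: d for d in (None or [])} == {}
assert {d["id"]: d for d in ([] or [])} == {}
assert {d["id"]: d for d in [{"id": "a"}]} == {"a": {"id": "a"}}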

View File

@@ -36,12 +36,11 @@ import time
 
 def chunking_by_token_size(
     content: str,
-    split_by_character=None,
-    split_by_character_only=False,
-    overlap_token_size=128,
-    max_token_size=1024,
-    tiktoken_model="gpt-4o",
-    **kwargs,
+    split_by_character: Union[str, None] = None,
+    split_by_character_only: bool = False,
+    overlap_token_size: int = 128,
+    max_token_size: int = 1024,
+    tiktoken_model: str = "gpt-4o",
 ) -> list[dict[str, Any]]:
     tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
     results: list[dict[str, Any]] = []
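With the annotations added and **kwargs dropped, the function's contract is stricter: an unexpected keyword argument now raises a TypeError instead of being silently swallowed. A hypothetical call (argument values invented for illustration):

chunks = chunking_by_token_size(
    "some long document text ...",
    split_by_character=None,
    split_by_character_only=False,
    overlap_token_size=64,
    max_token_size=512,
    tiktoken_model="gpt-4o",
)
# chunks is a list[dict[str, Any]], one dict per chunk, per the return annotation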

View File

@@ -98,7 +98,7 @@ def locate_json_string_body_from_string(content: str) -> Union[str, None]:
         return None
 
 
-def convert_response_to_json(response: str) -> dict:
+def convert_response_to_json(response: str) -> dict[str, Any]:
     json_str = locate_json_string_body_from_string(response)
     assert json_str is not None, f"Unable to parse JSON from response: {response}"
     try:
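A usage sketch of the tightened return type, assuming locate_json_string_body_from_string extracts the first JSON object embedded in the text, as its name suggests (the response string is invented):

payload = convert_response_to_json('Model output: {"entity": "LightRAG", "weight": 0.9}')
assert payload == {"entity": "LightRAG", "weight": 0.9}

The dict[str, Any] annotation also lines up with the Callable[[str], dict[str, Any]] type given to convert_response_to_json_func in the first file.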