cleaned type
@@ -4,7 +4,7 @@ from tqdm.asyncio import tqdm as tqdm_async
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from functools import partial
-from typing import Any, Type, Union, cast
+from typing import Any, Callable, Optional, Type, Union, cast
 import traceback
 from .operate import (
     chunking_by_token_size,
@@ -177,13 +177,24 @@ class LightRAG:

     # extension
     addon_params: dict[str, Any] = field(default_factory=dict)
-    convert_response_to_json_func: callable = convert_response_to_json
+    convert_response_to_json_func: Callable[[str], dict[str, Any]] = convert_response_to_json

     # Add new field for document status storage type
     doc_status_storage: str = field(default="JsonDocStatusStorage")

     # Custom Chunking Function
-    chunking_func: callable = chunking_by_token_size
+    chunking_func: Callable[
+        [
+            str,
+            Optional[str],
+            bool,
+            int,
+            int,
+            str,
+        ],
+        list[dict[str, Any]],
+    ] = chunking_by_token_size
     chunking_func_kwargs: dict = field(default_factory=dict)

     def __post_init__(self):
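A user-supplied chunking function now has to match this annotated signature. A minimal sketch, assuming the chunk dicts carry the same "tokens", "content", and "chunk_order_index" keys that the default chunking_by_token_size produces (the key names are not part of this diff); chunk_by_paragraph is a hypothetical helper name:

    from typing import Any, Optional

    def chunk_by_paragraph(
        content: str,
        split_by_character: Optional[str] = None,
        split_by_character_only: bool = False,
        overlap_token_size: int = 128,
        max_token_size: int = 1024,
        tiktoken_model: str = "gpt-4o",
    ) -> list[dict[str, Any]]:
        # Naive illustration: one chunk per blank-line-separated paragraph.
        paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
        return [
            {
                "tokens": len(p.split()),  # rough whitespace token count, illustrative only
                "content": p,
                "chunk_order_index": i,
            }
            for i, p in enumerate(paragraphs)
        ]

    # Because LightRAG is a dataclass, the field can be overridden at construction:
    # rag = LightRAG(chunking_func=chunk_by_paragraph)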
@@ -538,9 +549,7 @@ class LightRAG:
             return

         full_docs_ids = await self.full_docs.get_by_ids(to_process_doc_keys)
-        new_docs = {}
-        if full_docs_ids:
-            new_docs = {doc["id"]: doc for doc in full_docs_ids or []}
+        new_docs = {doc["id"]: doc for doc in full_docs_ids or []}

         if not new_docs:
             logger.info("All documents have been processed or are duplicates")
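The collapsed comprehension relies on "or []" to cover exactly the case the removed if-branch guarded: when get_by_ids returns None or an empty list, iterating over "full_docs_ids or []" yields nothing and new_docs stays empty. A quick illustration:

    docs = None
    {doc["id"]: doc for doc in docs or []}       # {} -- same result as the old `new_docs = {}` path
    docs = [{"id": "doc-1", "content": "..."}]
    {doc["id"]: doc for doc in docs or []}       # {"doc-1": {"id": "doc-1", "content": "..."}}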
@@ -36,12 +36,11 @@ import time

 def chunking_by_token_size(
     content: str,
-    split_by_character=None,
-    split_by_character_only=False,
-    overlap_token_size=128,
-    max_token_size=1024,
-    tiktoken_model="gpt-4o",
-    **kwargs,
+    split_by_character: Union[str, None] = None,
+    split_by_character_only: bool = False,
+    overlap_token_size: int = 128,
+    max_token_size: int = 1024,
+    tiktoken_model: str = "gpt-4o",
 ) -> list[dict[str, Any]]:
     tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
     results: list[dict[str, Any]] = []
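With **kwargs dropped from the signature, callers are limited to the declared parameters; an unexpected keyword now raises a TypeError instead of being silently swallowed. A usage sketch (the input text and argument values are illustrative only):

    chunks = chunking_by_token_size(
        "First paragraph...\n\nSecond paragraph...",
        split_by_character="\n\n",
        split_by_character_only=False,
        overlap_token_size=64,
        max_token_size=512,
        tiktoken_model="gpt-4o",
    )
    # chunks is a list[dict[str, Any]], one dict per chunk.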
@@ -98,7 +98,7 @@ def locate_json_string_body_from_string(content: str) -> Union[str, None]:
     return None


-def convert_response_to_json(response: str) -> dict:
+def convert_response_to_json(response: str) -> dict[str, Any]:
     json_str = locate_json_string_body_from_string(response)
     assert json_str is not None, f"Unable to parse JSON from response: {response}"
     try:
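The narrowed return annotation only affects static type checking; runtime behaviour is unchanged. A hedged usage sketch, assuming the locator finds the JSON braces in the response text:

    parsed = convert_response_to_json('Here is the result: {"entities": [], "relations": []}')
    # Type checkers now see parsed as dict[str, Any] rather than a bare dict,
    # so parsed["entities"] is known to be keyed by str.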