add: option to replace the default tiktoken Tokenizer with a custom one

drahnreb
2025-04-17 10:56:23 +02:00
parent 4fd40fd798
commit 20ba1eb9c2
6 changed files with 138 additions and 53 deletions
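The diff below covers only the LightRAG class module; the Tokenizer and TiktokenTokenizer classes it now imports come from lightrag/utils.py, one of the other changed files not shown in this excerpt. As a hedged sketch of the interface those imports imply (the class layout and constructor signatures below are assumptions, not code from this commit):

# Sketch only: a plausible shape for the Tokenizer / TiktokenTokenizer classes in
# lightrag/utils.py, which this excerpt does not show. Constructor signatures are assumed.
from typing import Any, List


class Tokenizer:
    """Thin wrapper around any object exposing encode()/decode() over token ids."""

    def __init__(self, model_name: str, tokenizer: Any):
        self.model_name = model_name
        self.tokenizer = tokenizer

    def encode(self, content: str) -> List[int]:
        return self.tokenizer.encode(content)

    def decode(self, tokens: List[int]) -> str:
        return self.tokenizer.decode(tokens)


class TiktokenTokenizer(Tokenizer):
    """Default tokenizer backed by tiktoken, keyed by an OpenAI model name."""

    def __init__(self, model_name: str = "gpt-4o-mini"):
        import tiktoken

        super().__init__(model_name=model_name, tokenizer=tiktoken.encoding_for_model(model_name))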


@@ -41,11 +41,12 @@ from .operate import (
)
from .prompt import GRAPH_FIELD_SEP, PROMPTS
from .utils import (
+Tokenizer,
+TiktokenTokenizer,
EmbeddingFunc,
always_get_an_event_loop,
compute_mdhash_id,
convert_response_to_json,
-encode_string_by_tiktoken,
lazy_external_import,
limit_async_func_call,
get_content_summary,
@@ -122,33 +123,38 @@ class LightRAG:
)
"""Number of overlapping tokens between consecutive text chunks to preserve context."""
-tiktoken_model_name: str = field(default="gpt-4o-mini")
-"""Model name used for tokenization when chunking text."""
+tokenizer: Optional[Tokenizer] = field(default=None)
+"""
+An optional Tokenizer instance to use for tokenization.
+If None and a `tiktoken_model_name` is provided, a TiktokenTokenizer is created for that model.
+If both are None, the default TiktokenTokenizer is used.
+"""
"""Maximum number of tokens used for summarizing extracted entities."""
+tiktoken_model_name: str = field(default="gpt-4o-mini")
+"""Model name used for tokenization when chunking text with tiktoken. Defaults to `gpt-4o-mini`."""
chunking_func: Callable[
[
+Tokenizer,
str,
-str | None,
+Optional[str],
bool,
int,
int,
-str,
],
-list[dict[str, Any]],
+List[Dict[str, Any]],
] = field(default_factory=lambda: chunking_by_token_size)
"""
Custom chunking function for splitting text into chunks before processing.
The function should take the following parameters:
+- `tokenizer`: A Tokenizer instance to use for tokenization.
- `content`: The text to be split into chunks.
- `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
- `split_by_character_only`: If True, the text is split only on the specified character.
- `chunk_token_size`: The maximum number of tokens per chunk.
- `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
-- `tiktoken_model_name`: The name of the tiktoken model to use for tokenization.
The function should return a list of dictionaries, where each dictionary contains the following keys:
- `tokens`: The number of tokens in the chunk.
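The new tokenizer field above is the user-facing hook: any encode()/decode() pair can drive chunking and token counting, and tiktoken_model_name is only consulted when no tokenizer is supplied. A minimal usage sketch, assuming the wrapper outlined near the top of this page; the Hugging Face model name and the omitted constructor arguments are illustrative, not part of this commit:

# Illustrative wiring of a non-tiktoken tokenizer into LightRAG via the new field.
from transformers import AutoTokenizer
from lightrag import LightRAG
from lightrag.utils import Tokenizer

hf_tok = AutoTokenizer.from_pretrained("bert-base-uncased")

rag = LightRAG(
    working_dir="./rag_storage",
    tokenizer=Tokenizer(model_name="bert-base-uncased", tokenizer=hf_tok),
    # llm_model_func=..., embedding_func=...  (omitted for brevity)
)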
@@ -310,7 +316,15 @@ class LightRAG:
_print_config = ",\n ".join([f"{k} = {v}" for k, v in global_config.items()])
logger.debug(f"LightRAG init with param:\n {_print_config}\n")
-# Init LLM
+# Init Tokenizer
+# Post-initialization hook to handle backward compatible tokenizer initialization based on provided parameters
+if self.tokenizer is None:
+    if self.tiktoken_model_name:
+        self.tokenizer = TiktokenTokenizer(self.tiktoken_model_name)
+    else:
+        self.tokenizer = TiktokenTokenizer()
+# Init Embedding
self.embedding_func = limit_async_func_call(self.embedding_func_max_async)( # type: ignore
self.embedding_func
)
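The fallback keeps existing configurations working: constructing LightRAG without a tokenizer behaves exactly as before, while passing one bypasses tiktoken entirely. Schematically (argument values illustrative, other required arguments omitted):

from lightrag import LightRAG
from lightrag.utils import TiktokenTokenizer

rag_default = LightRAG(working_dir="./rag_storage")                                 # falls back to TiktokenTokenizer("gpt-4o-mini")
rag_legacy = LightRAG(working_dir="./rag_storage", tiktoken_model_name="gpt-4o")    # falls back to TiktokenTokenizer("gpt-4o")
rag_custom = LightRAG(working_dir="./rag_storage", tokenizer=TiktokenTokenizer("gpt-4o"))  # explicit, skips the fallback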
@@ -900,12 +914,12 @@ class LightRAG:
"file_path": file_path, # Add file path to each chunk
}
for dp in self.chunking_func(
+self.tokenizer,
status_doc.content,
split_by_character,
split_by_character_only,
self.chunk_overlap_token_size,
self.chunk_token_size,
-self.tiktoken_model_name,
)
}
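A custom chunking_func must now accept the tokenizer as its first argument and no longer receives tiktoken_model_name; the argument order matches the call above. A deliberately naive sliding-window sketch of a compatible function (not LightRAG's chunking_by_token_size; the returned keys mirror the ones used elsewhere in this diff):

from typing import Any, Dict, List, Optional


def simple_chunking_func(
    tokenizer,                            # Tokenizer instance supplied by LightRAG
    content: str,
    split_by_character: Optional[str] = None,
    split_by_character_only: bool = False,
    chunk_overlap_token_size: int = 100,
    chunk_token_size: int = 1200,
) -> List[Dict[str, Any]]:
    # Illustrative only: ignores the split_by_character options and slides a
    # fixed token window with the requested overlap.
    tokens = tokenizer.encode(content)
    step = max(1, chunk_token_size - chunk_overlap_token_size)
    chunks: List[Dict[str, Any]] = []
    for index, start in enumerate(range(0, len(tokens), step)):
        window = tokens[start : start + chunk_token_size]
        chunks.append(
            {
                "tokens": len(window),
                "content": tokenizer.decode(window).strip(),
                "chunk_order_index": index,
            }
        )
    return chunks

Such a function can then be plugged in via LightRAG(chunking_func=simple_chunking_func, ...).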
@@ -1134,8 +1148,8 @@ class LightRAG:
chunk_content = clean_text(chunk_data["content"])
source_id = chunk_data["source_id"]
tokens = len(
-encode_string_by_tiktoken(
-chunk_content, model_name=self.tiktoken_model_name
+self.tokenizer.encode(
+chunk_content
)
)
chunk_order_index = (
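With this last hunk, token counting also goes through the instance's tokenizer instead of the module-level encode_string_by_tiktoken helper, so counts stay consistent with whatever tokenizer produced the chunks. Illustratively, on the rag instance from the sketches above:

token_count = len(rag.tokenizer.encode("text whose tokens we want to count"))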