add: option to replace the default tiktoken Tokenizer with a custom one

drahnreb
2025-04-17 10:56:23 +02:00
parent 4fd40fd798
commit 20ba1eb9c2
6 changed files with 138 additions and 53 deletions
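The diff below covers only the LightRAG class module; the Tokenizer and TiktokenTokenizer classes it now imports come from lightrag/utils.py, one of the other changed files not shown in this excerpt. As a hedged sketch of the interface those imports imply (the class layout and constructor signatures below are assumptions, not code from this commit):

# Sketch only: a plausible shape for the Tokenizer / TiktokenTokenizer classes in
# lightrag/utils.py, which this excerpt does not show. Constructor signatures are assumed.
from typing import Any, List


class Tokenizer:
    """Thin wrapper around any object exposing encode()/decode() over token ids."""

    def __init__(self, model_name: str, tokenizer: Any):
        self.model_name = model_name
        self.tokenizer = tokenizer

    def encode(self, content: str) -> List[int]:
        return self.tokenizer.encode(content)

    def decode(self, tokens: List[int]) -> str:
        return self.tokenizer.decode(tokens)


class TiktokenTokenizer(Tokenizer):
    """Default tokenizer backed by tiktoken, keyed by an OpenAI model name."""

    def __init__(self, model_name: str = "gpt-4o-mini"):
        import tiktoken

        super().__init__(model_name=model_name, tokenizer=tiktoken.encoding_for_model(model_name))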


@@ -41,11 +41,12 @@ from .operate import (
)
from .prompt import GRAPH_FIELD_SEP, PROMPTS
from .utils import (
+Tokenizer,
+TiktokenTokenizer,
EmbeddingFunc,
always_get_an_event_loop,
compute_mdhash_id,
convert_response_to_json,
-encode_string_by_tiktoken,
lazy_external_import,
limit_async_func_call,
get_content_summary,
@@ -122,33 +123,38 @@ class LightRAG:
)
"""Number of overlapping tokens between consecutive text chunks to preserve context."""
-tiktoken_model_name: str = field(default="gpt-4o-mini")
-"""Model name used for tokenization when chunking text."""
+tokenizer: Optional[Tokenizer] = field(default=None)
+"""
+An optional Tokenizer instance to use for tokenization.
+If None and a `tiktoken_model_name` is provided, a TiktokenTokenizer is created for that model.
+If both are None, the default TiktokenTokenizer is used.
+"""
"""Maximum number of tokens used for summarizing extracted entities."""
+tiktoken_model_name: str = field(default="gpt-4o-mini")
+"""Model name used for tokenization when chunking text with tiktoken. Defaults to `gpt-4o-mini`."""
chunking_func: Callable[
[
+Tokenizer,
str,
-str | None,
+Optional[str],
bool,
int,
int,
-str,
],
-list[dict[str, Any]],
+List[Dict[str, Any]],
] = field(default_factory=lambda: chunking_by_token_size)
"""
Custom chunking function for splitting text into chunks before processing.
The function should take the following parameters:
+- `tokenizer`: A Tokenizer instance to use for tokenization.
- `content`: The text to be split into chunks.
- `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
- `split_by_character_only`: If True, the text is split only on the specified character.
- `chunk_token_size`: The maximum number of tokens per chunk.
- `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
-- `tiktoken_model_name`: The name of the tiktoken model to use for tokenization.
The function should return a list of dictionaries, where each dictionary contains the following keys:
- `tokens`: The number of tokens in the chunk.
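The new tokenizer field above is the user-facing hook: any encode()/decode() pair can drive chunking and token counting, and tiktoken_model_name is only consulted when no tokenizer is supplied. A minimal usage sketch, assuming the wrapper outlined near the top of this page; the Hugging Face model name and the omitted constructor arguments are illustrative, not part of this commit:

# Illustrative wiring of a non-tiktoken tokenizer into LightRAG via the new field.
from transformers import AutoTokenizer
from lightrag import LightRAG
from lightrag.utils import Tokenizer

hf_tok = AutoTokenizer.from_pretrained("bert-base-uncased")

rag = LightRAG(
    working_dir="./rag_storage",
    tokenizer=Tokenizer(model_name="bert-base-uncased", tokenizer=hf_tok),
    # llm_model_func=..., embedding_func=...  (omitted for brevity)
)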
@@ -310,7 +316,15 @@ class LightRAG:
_print_config = ",\n ".join([f"{k} = {v}" for k, v in global_config.items()])
logger.debug(f"LightRAG init with param:\n {_print_config}\n")
-# Init LLM
+# Init Tokenizer
+# Post-initialization hook to handle backward compatible tokenizer initialization based on provided parameters
+if self.tokenizer is None:
+    if self.tiktoken_model_name:
+        self.tokenizer = TiktokenTokenizer(self.tiktoken_model_name)
+    else:
+        self.tokenizer = TiktokenTokenizer()
+# Init Embedding
self.embedding_func = limit_async_func_call(self.embedding_func_max_async)( # type: ignore
self.embedding_func
)
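The fallback keeps existing configurations working: constructing LightRAG without a tokenizer behaves exactly as before, while passing one bypasses tiktoken entirely. Schematically (argument values illustrative, other required arguments omitted):

from lightrag import LightRAG
from lightrag.utils import TiktokenTokenizer

rag_default = LightRAG(working_dir="./rag_storage")                                 # falls back to TiktokenTokenizer("gpt-4o-mini")
rag_legacy = LightRAG(working_dir="./rag_storage", tiktoken_model_name="gpt-4o")    # falls back to TiktokenTokenizer("gpt-4o")
rag_custom = LightRAG(working_dir="./rag_storage", tokenizer=TiktokenTokenizer("gpt-4o"))  # explicit, skips the fallback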
@@ -900,12 +914,12 @@ class LightRAG:
"file_path": file_path, # Add file path to each chunk
}
for dp in self.chunking_func(
+self.tokenizer,
status_doc.content,
split_by_character,
split_by_character_only,
self.chunk_overlap_token_size,
self.chunk_token_size,
-self.tiktoken_model_name,
)
}
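A custom chunking_func must now accept the tokenizer as its first argument and no longer receives tiktoken_model_name; the argument order matches the call above. A deliberately naive sliding-window sketch of a compatible function (not LightRAG's chunking_by_token_size; the returned keys mirror the ones used elsewhere in this diff):

from typing import Any, Dict, List, Optional


def simple_chunking_func(
    tokenizer,                            # Tokenizer instance supplied by LightRAG
    content: str,
    split_by_character: Optional[str] = None,
    split_by_character_only: bool = False,
    chunk_overlap_token_size: int = 100,
    chunk_token_size: int = 1200,
) -> List[Dict[str, Any]]:
    # Illustrative only: ignores the split_by_character options and slides a
    # fixed token window with the requested overlap.
    tokens = tokenizer.encode(content)
    step = max(1, chunk_token_size - chunk_overlap_token_size)
    chunks: List[Dict[str, Any]] = []
    for index, start in enumerate(range(0, len(tokens), step)):
        window = tokens[start : start + chunk_token_size]
        chunks.append(
            {
                "tokens": len(window),
                "content": tokenizer.decode(window).strip(),
                "chunk_order_index": index,
            }
        )
    return chunks

Such a function can then be plugged in via LightRAG(chunking_func=simple_chunking_func, ...).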
@@ -1134,8 +1148,8 @@ class LightRAG:
chunk_content = clean_text(chunk_data["content"])
source_id = chunk_data["source_id"]
tokens = len(
-encode_string_by_tiktoken(
-chunk_content, model_name=self.tiktoken_model_name
+self.tokenizer.encode(
+chunk_content
)
)
chunk_order_index = (
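With this last hunk, token counting also goes through the instance's tokenizer instead of the module-level encode_string_by_tiktoken helper, so counts stay consistent with whatever tokenizer produced the chunks. Illustratively, on the rag instance from the sketches above:

token_count = len(rag.tokenizer.encode("text whose tokens we want to count"))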