Improve token estimation accuracy by using tiktoken instead of a regex-based approach
• Replace the regex-based token estimation method
• Use tiktoken for more precise token counting
• Support all types of text input
• Simplify the code implementation
This commit is contained in:
@@ -11,6 +11,7 @@ from fastapi.responses import StreamingResponse
|
|||||||
import asyncio
|
import asyncio
|
||||||
from ascii_colors import trace_exception
|
from ascii_colors import trace_exception
|
||||||
from lightrag import LightRAG, QueryParam
|
from lightrag import LightRAG, QueryParam
|
||||||
|
from lightrag.utils import encode_string_by_tiktoken
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
|
||||||
@@ -111,18 +112,9 @@ class OllamaTagResponse(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
def estimate_tokens(text: str) -> int:
|
def estimate_tokens(text: str) -> int:
|
||||||
"""Estimate the number of tokens in text
|
"""Estimate the number of tokens in text using tiktoken"""
|
||||||
Chinese characters: approximately 1.5 tokens per character
|
tokens = encode_string_by_tiktoken(text)
|
||||||
English characters: approximately 0.25 tokens per character
|
return len(tokens)
|
||||||
"""
|
|
||||||
# Use regex to match Chinese and non-Chinese characters separately
|
|
||||||
chinese_chars = len(re.findall(r"[\u4e00-\u9fff]", text))
|
|
||||||
non_chinese_chars = len(re.findall(r"[^\u4e00-\u9fff]", text))
|
|
||||||
|
|
||||||
# Calculate estimated token count
|
|
||||||
tokens = chinese_chars * 1.5 + non_chinese_chars * 0.25
|
|
||||||
|
|
||||||
return int(tokens)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_query_mode(query: str) -> tuple[str, SearchMode]:
|
def parse_query_mode(query: str) -> tuple[str, SearchMode]:
|
||||||
|
Reference in New Issue
Block a user