Add a split-by-character-only parameter. When enabled, splitting uses only the character delimiter; when disabled, chunks that are still too large after character splitting are further split by token size. Also updates the test file.

童石渊
2025-01-09 11:55:49 +08:00
parent b8f7a0a1e7
commit dd213c95be
4 changed files with 1328 additions and 758 deletions
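
For illustration, a quick usage sketch of the new flag against the `insert` signature changed below (the constructor arguments and sample text are assumptions for the example, not part of this commit):

from lightrag import LightRAG

rag = LightRAG(working_dir="./rag_storage")  # constructor args assumed for the example

text = "First section.\n\nSecond section."

# Character split only: chunks that exceed the token limit are kept whole.
rag.insert(text, split_by_character="\n\n", split_by_character_only=True)

# Default: character split first, then oversized chunks are re-split by token size.
rag.insert(text, split_by_character="\n\n")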


@@ -314,18 +314,25 @@ class LightRAG:
"JsonDocStatusStorage": JsonDocStatusStorage,
}
def insert(self, string_or_strings, split_by_character=None):
def insert(
self, string_or_strings, split_by_character=None, split_by_character_only=False
):
loop = always_get_an_event_loop()
return loop.run_until_complete(
self.ainsert(string_or_strings, split_by_character)
self.ainsert(string_or_strings, split_by_character, split_by_character_only)
)
async def ainsert(self, string_or_strings, split_by_character):
async def ainsert(
self, string_or_strings, split_by_character, split_by_character_only
):
"""Insert documents with checkpoint support
Args:
string_or_strings: Single document string or list of document strings
split_by_character: if split_by_character is not None, split the string by character
split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
chunk_size, split the sub chunk by token size.
split_by_character_only: if split_by_character_only is True, split the string by character only, when
split_by_character is None, this parameter is ignored.
"""
if isinstance(string_or_strings, str):
string_or_strings = [string_or_strings]
@@ -384,6 +391,7 @@ class LightRAG:
                 for dp in chunking_by_token_size(
                     doc["content"],
                     split_by_character=split_by_character,
+                    split_by_character_only=split_by_character_only,
                     overlap_token_size=self.chunk_overlap_token_size,
                     max_token_size=self.chunk_token_size,
                     tiktoken_model=self.tiktoken_model_name,
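
The synchronous `insert` above is a thin facade that drives the async `ainsert` to completion on a shared event loop. A minimal sketch of that pattern; `always_get_an_event_loop` is reconstructed here from its name and is an assumption, not LightRAG's actual helper:

import asyncio

def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
    # Reuse the thread's current loop if one exists; otherwise create and register one.
    try:
        return asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        return loop

class SyncFacade:
    async def ainsert(self, docs, split_by_character=None, split_by_character_only=False):
        ...  # chunking and storage happen here asynchronously

    def insert(self, docs, split_by_character=None, split_by_character_only=False):
        # Block until the coroutine finishes, so callers need no async plumbing.
        loop = always_get_an_event_loop()
        return loop.run_until_complete(
            self.ainsert(docs, split_by_character, split_by_character_only)
        )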


@@ -36,6 +36,7 @@ import time
 def chunking_by_token_size(
     content: str,
     split_by_character=None,
+    split_by_character_only=False,
     overlap_token_size=128,
     max_token_size=1024,
     tiktoken_model="gpt-4o",
@@ -45,21 +46,26 @@ def chunking_by_token_size(
     if split_by_character:
         raw_chunks = content.split(split_by_character)
         new_chunks = []
-        for chunk in raw_chunks:
-            _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model)
-            if len(_tokens) > max_token_size:
-                for start in range(
-                    0, len(_tokens), max_token_size - overlap_token_size
-                ):
-                    chunk_content = decode_tokens_by_tiktoken(
-                        _tokens[start : start + max_token_size],
-                        model_name=tiktoken_model,
-                    )
-                    new_chunks.append(
-                        (min(max_token_size, len(_tokens) - start), chunk_content)
-                    )
-            else:
+        if split_by_character_only:
+            for chunk in raw_chunks:
+                _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model)
                 new_chunks.append((len(_tokens), chunk))
+        else:
+            for chunk in raw_chunks:
+                _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model)
+                if len(_tokens) > max_token_size:
+                    for start in range(
+                        0, len(_tokens), max_token_size - overlap_token_size
+                    ):
+                        chunk_content = decode_tokens_by_tiktoken(
+                            _tokens[start : start + max_token_size],
+                            model_name=tiktoken_model,
+                        )
+                        new_chunks.append(
+                            (min(max_token_size, len(_tokens) - start), chunk_content)
+                        )
+                else:
+                    new_chunks.append((len(_tokens), chunk))
         for index, (_len, chunk) in enumerate(new_chunks):
             results.append(
                 {
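
Putting the pieces together: a self-contained sketch of the new chunking behavior, using tiktoken directly instead of LightRAG's `encode_string_by_tiktoken`/`decode_tokens_by_tiktoken` wrappers (only the keyword parameters come from the diff; the function name and the rest are illustrative):

import tiktoken

def chunk_text(
    content: str,
    split_by_character: str | None = None,
    split_by_character_only: bool = False,
    overlap_token_size: int = 128,
    max_token_size: int = 1024,
    tiktoken_model: str = "gpt-4o",
) -> list[dict]:
    enc = tiktoken.encoding_for_model(tiktoken_model)
    new_chunks: list[tuple[int, str]] = []

    def split_by_tokens(text: str) -> None:
        # Overlapping token windows of at most max_token_size tokens.
        tokens = enc.encode(text)
        for start in range(0, len(tokens), max_token_size - overlap_token_size):
            window = tokens[start : start + max_token_size]
            new_chunks.append((len(window), enc.decode(window)))

    if split_by_character:
        for chunk in content.split(split_by_character):
            n_tokens = len(enc.encode(chunk))
            if split_by_character_only or n_tokens <= max_token_size:
                # Keep the character-delimited chunk whole, however large.
                new_chunks.append((n_tokens, chunk))
            else:
                split_by_tokens(chunk)  # oversized chunk falls back to token windows
    else:
        split_by_tokens(content)

    return [
        {"tokens": n, "content": c.strip(), "chunk_order_index": i}
        for i, (n, c) in enumerate(new_chunks)
    ]

With split_by_character_only=True, a single oversized paragraph comes back as one chunk; with the default False it comes back as several overlapping windows of at most max_token_size tokens.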