Merge pull request #560 from AdiKalra/main

Add custom chunking function.
This commit is contained in:
zrguo
2025-01-09 21:33:55 +08:00
committed by GitHub
2 changed files with 7 additions and 1 deletions

View File

@@ -187,6 +187,10 @@ class LightRAG:
# Add new field for document status storage type
doc_status_storage: str = field(default="JsonDocStatusStorage")
# Custom Chunking Function
chunking_func: callable = chunking_by_token_size
chunking_func_kwargs: dict = field(default_factory=dict)
def __post_init__(self):
    log_file = os.path.join("lightrag.log")
    set_logger(log_file)
@@ -388,13 +392,14 @@ class LightRAG:
**dp,
"full_doc_id": doc_id,
}
for dp in self.chunking_func(
doc["content"],
split_by_character=split_by_character,
split_by_character_only=split_by_character_only,
overlap_token_size=self.chunk_overlap_token_size,
max_token_size=self.chunk_token_size,
tiktoken_model=self.tiktoken_model_name,
**self.chunking_func_kwargs,
)
}

View File

@@ -39,6 +39,7 @@ def chunking_by_token_size(
overlap_token_size=128,
max_token_size=1024,
tiktoken_model="gpt-4o",
**kwargs,
):
tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
results = []