Merge pull request #560 from AdiKalra/main
Add custom chunking function.
This commit is contained in:
@@ -187,6 +187,10 @@ class LightRAG:
|
|||||||
# Add new field for document status storage type
|
# Add new field for document status storage type
|
||||||
doc_status_storage: str = field(default="JsonDocStatusStorage")
|
doc_status_storage: str = field(default="JsonDocStatusStorage")
|
||||||
|
|
||||||
|
# Custom Chunking Function
|
||||||
|
chunking_func: callable = chunking_by_token_size
|
||||||
|
chunking_func_kwargs: dict = field(default_factory=dict)
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
log_file = os.path.join("lightrag.log")
|
log_file = os.path.join("lightrag.log")
|
||||||
set_logger(log_file)
|
set_logger(log_file)
|
||||||
@@ -388,13 +392,14 @@ class LightRAG:
|
|||||||
**dp,
|
**dp,
|
||||||
"full_doc_id": doc_id,
|
"full_doc_id": doc_id,
|
||||||
}
|
}
|
||||||
for dp in chunking_by_token_size(
|
for dp in self.chunking_func(
|
||||||
doc["content"],
|
doc["content"],
|
||||||
split_by_character=split_by_character,
|
split_by_character=split_by_character,
|
||||||
split_by_character_only=split_by_character_only,
|
split_by_character_only=split_by_character_only,
|
||||||
overlap_token_size=self.chunk_overlap_token_size,
|
overlap_token_size=self.chunk_overlap_token_size,
|
||||||
max_token_size=self.chunk_token_size,
|
max_token_size=self.chunk_token_size,
|
||||||
tiktoken_model=self.tiktoken_model_name,
|
tiktoken_model=self.tiktoken_model_name,
|
||||||
|
**self.chunking_func_kwargs,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -39,6 +39,7 @@ def chunking_by_token_size(
|
|||||||
overlap_token_size=128,
|
overlap_token_size=128,
|
||||||
max_token_size=1024,
|
max_token_size=1024,
|
||||||
tiktoken_model="gpt-4o",
|
tiktoken_model="gpt-4o",
|
||||||
|
**kwargs,
|
||||||
):
|
):
|
||||||
tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
|
tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
|
||||||
results = []
|
results = []
|
||||||
|
Reference in New Issue
Block a user