diff --git a/README.md b/README.md
index bc0b6bde..97d6279c 100644
--- a/README.md
+++ b/README.md
@@ -461,14 +461,22 @@ custom_kg = {
         {
             "content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.",
             "source_id": "Source1",
+            "chunk_order_index": 0,
+        },
+        {
+            "content": "One outstanding feature of ProductX is its advanced AI capabilities.",
+            "source_id": "Source1",
+            "chunk_order_index": 1,
         },
         {
             "content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.",
             "source_id": "Source2",
+            "chunk_order_index": 0,
         },
         {
             "content": "None",
             "source_id": "UNKNOWN",
+            "chunk_order_index": 0,
         },
     ],
 }
diff --git a/examples/insert_custom_kg.py b/examples/insert_custom_kg.py
index 50ad925e..db489c96 100644
--- a/examples/insert_custom_kg.py
+++ b/examples/insert_custom_kg.py
@@ -87,18 +87,27 @@ custom_kg = {
         {
             "content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.",
             "source_id": "Source1",
+            "chunk_order_index": 0,
+        },
+        {
+            "content": "One outstanding feature of ProductX is its advanced AI capabilities.",
+            "source_id": "Source1",
+            "chunk_order_index": 1,
         },
         {
             "content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.",
             "source_id": "Source2",
+            "chunk_order_index": 0,
         },
         {
             "content": "EventY, held in CityC, attracts technology enthusiasts and companies from around the globe.",
             "source_id": "Source3",
+            "chunk_order_index": 0,
         },
         {
             "content": "None",
             "source_id": "UNKNOWN",
+            "chunk_order_index": 0,
         },
     ],
 }
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index f9ab2333..a565f017 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -37,6 +37,7 @@ from .utils import (
     limit_async_func_call,
     logger,
     set_logger,
+    encode_string_by_tiktoken,
 )
 
 from .types import KnowledgeGraph
@@ -926,11 +927,28 @@ class LightRAG:
         all_chunks_data: dict[str, dict[str, str]] = {}
         chunk_to_source_map: dict[str, str] = {}
         for chunk_data in custom_kg.get("chunks", {}):
-            chunk_content = chunk_data["content"]
+            chunk_content = chunk_data["content"].strip()
             source_id = chunk_data["source_id"]
-            chunk_id = compute_mdhash_id(chunk_content.strip(), prefix="chunk-")
+            tokens = len(
+                encode_string_by_tiktoken(
+                    chunk_content, model_name=self.tiktoken_model_name
+                )
+            )
+            chunk_order_index = (
+                0
+                if "chunk_order_index" not in chunk_data.keys()
+                else chunk_data["chunk_order_index"]
+            )
+            chunk_id = compute_mdhash_id(chunk_content, prefix="chunk-")
 
-            chunk_entry = {"content": chunk_content.strip(), "source_id": source_id}
+            chunk_entry = {
+                "content": chunk_content,
+                "source_id": source_id,
+                "tokens": tokens,
+                "chunk_order_index": chunk_order_index,
+                "full_doc_id": source_id,
+                "status": DocStatus.PROCESSED,
+            }
             all_chunks_data[chunk_id] = chunk_entry
             chunk_to_source_map[source_id] = chunk_id
             update_storage = True
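For reference, here is a minimal, self-contained sketch of the chunk-entry construction this patch introduces. The helpers compute_mdhash_id and encode_string_by_tiktoken are re-implemented here purely for illustration (the real ones live in lightrag/utils.py), "gpt-4o-mini" stands in for self.tiktoken_model_name, and the DocStatus.PROCESSED stamp is omitted so the sketch has no LightRAG dependency:

# Sketch only: stand-ins for the lightrag/utils.py helpers used in the patch.
# "gpt-4o-mini" mirrors LightRAG's default tiktoken_model_name but is an
# assumption here; any model name tiktoken recognizes will work.
from hashlib import md5

import tiktoken


def compute_mdhash_id(content: str, prefix: str = "") -> str:
    # Chunk ids are derived from an md5 hash of the (stripped) content.
    return prefix + md5(content.encode()).hexdigest()


def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o-mini") -> list[int]:
    return tiktoken.encoding_for_model(model_name).encode(content)


chunk_data = {
    "content": "One outstanding feature of ProductX is its advanced AI capabilities.",
    "source_id": "Source1",
    "chunk_order_index": 1,
}

chunk_content = chunk_data["content"].strip()
chunk_entry = {
    "content": chunk_content,
    "source_id": chunk_data["source_id"],
    # The token count is now stored alongside the text.
    "tokens": len(encode_string_by_tiktoken(chunk_content)),
    # A missing chunk_order_index falls back to 0, as in the patch.
    "chunk_order_index": chunk_data.get("chunk_order_index", 0),
    "full_doc_id": chunk_data["source_id"],
}

print(compute_mdhash_id(chunk_content, prefix="chunk-"))
print(chunk_entry)

Note that because the id is an md5 of the stripped content, two chunks with identical text collapse to the same id regardless of their chunk_order_index; the index only affects the stored metadata, not chunk identity.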