From 791330400788e360fd3e985fad5fac59e7fec21c Mon Sep 17 00:00:00 2001
From: Lukas Selch
Date: Mon, 17 Feb 2025 15:12:35 +0100
Subject: [PATCH 1/6] Fixed broken ainsert_custom_kg()

---
 lightrag/lightrag.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index bf1c02d2..7b3e8605 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -36,6 +36,7 @@ from .utils import (
     limit_async_func_call,
     logger,
     set_logger,
+    encode_string_by_tiktoken,
 )

 from .types import KnowledgeGraph
@@ -863,7 +864,14 @@ class LightRAG:
                 source_id = chunk_data["source_id"]
                 chunk_id = compute_mdhash_id(chunk_content.strip(), prefix="chunk-")

-                chunk_entry = {"content": chunk_content.strip(), "source_id": source_id}
+                chunk_entry = {
+                    "content": chunk_content.strip(),
+                    "source_id": source_id,
+                    "tokens": len(encode_string_by_tiktoken(chunk_entry["content"])),
+                    "chunk_order_id": 0,
+                    "full_doc_id": source_id,
+                    "status": DocStatus.PROCESSED
+                }
                 all_chunks_data[chunk_id] = chunk_entry
                 chunk_to_source_map[source_id] = chunk_id
                 update_storage = True

From 86f5a88db792c26094617f3c135a0078c9bfdcf1 Mon Sep 17 00:00:00 2001
From: Lukas Selch
Date: Mon, 17 Feb 2025 15:20:23 +0100
Subject: [PATCH 2/6] Fixed wrong variable name

---
 lightrag/lightrag.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 7b3e8605..2f7bb5e4 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -867,7 +867,7 @@ class LightRAG:
                 chunk_entry = {
                     "content": chunk_content.strip(),
                     "source_id": source_id,
-                    "tokens": len(encode_string_by_tiktoken(chunk_entry["content"])),
+                    "tokens": len(encode_string_by_tiktoken(chunk_content.strip())),
                     "chunk_order_id": 0,
                     "full_doc_id": source_id,
                     "status": DocStatus.PROCESSED

From 537e10303dafcc1e46fa43d65d28eae4b0f63111 Mon Sep 17 00:00:00 2001
From: Lukas Selch
Date: Mon, 17 Feb 2025 15:25:50 +0100
Subject: [PATCH 3/6] Fixed formatting

---
 lightrag/lightrag.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 2f7bb5e4..1c3bd089 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -860,17 +860,17 @@ class LightRAG:
             all_chunks_data: dict[str, dict[str, str]] = {}
             chunk_to_source_map: dict[str, str] = {}
             for chunk_data in custom_kg.get("chunks", {}):
-                chunk_content = chunk_data["content"]
+                chunk_content = chunk_data["content"].strip()
                 source_id = chunk_data["source_id"]
-                chunk_id = compute_mdhash_id(chunk_content.strip(), prefix="chunk-")
+                chunk_id = compute_mdhash_id(chunk_content, prefix="chunk-")

                 chunk_entry = {
-                    "content": chunk_content.strip(),
+                    "content": chunk_content,
                     "source_id": source_id,
-                    "tokens": len(encode_string_by_tiktoken(chunk_content.strip())),
+                    "tokens": len(encode_string_by_tiktoken(chunk_content)),
                     "chunk_order_id": 0,
                     "full_doc_id": source_id,
-                    "status": DocStatus.PROCESSED
+                    "status": DocStatus.PROCESSED,
                 }
                 all_chunks_data[chunk_id] = chunk_entry
                 chunk_to_source_map[source_id] = chunk_id

From bc630c862000893f09540219d6655b69c94ee3a3 Mon Sep 17 00:00:00 2001
From: Lukas Selch
Date: Wed, 19 Feb 2025 07:15:30 +0100
Subject: [PATCH 4/6] Renamed chunk_order_index and improve token calculation

---
 lightrag/lightrag.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 1c3bd089..8513ac19 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -862,13 +862,15 @@ class LightRAG:
             for chunk_data in custom_kg.get("chunks", {}):
                 chunk_content = chunk_data["content"].strip()
                 source_id = chunk_data["source_id"]
+                tokens = len(encode_string_by_tiktoken(chunk_content, model_name=self.tiktoken_model_name))
+                chunk_order_index = 0 if "chunk_order_index" not in chunk_data.keys() else chunk_data["chunk_order_index"]
                 chunk_id = compute_mdhash_id(chunk_content, prefix="chunk-")

                 chunk_entry = {
                     "content": chunk_content,
                     "source_id": source_id,
-                    "tokens": len(encode_string_by_tiktoken(chunk_content)),
-                    "chunk_order_id": 0,
+                    "tokens": tokens,
+                    "chunk_order_index": chunk_order_index,
                     "full_doc_id": source_id,
                     "status": DocStatus.PROCESSED,
                 }

From 701d8bb48e3e3224b357e677c5bde042963dc0de Mon Sep 17 00:00:00 2001
From: Lukas Selch
Date: Wed, 19 Feb 2025 10:28:25 +0100
Subject: [PATCH 5/6] Applied lint

---
 lightrag/lightrag.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 8513ac19..e73e4c1b 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -862,8 +862,16 @@ class LightRAG:
             for chunk_data in custom_kg.get("chunks", {}):
                 chunk_content = chunk_data["content"].strip()
                 source_id = chunk_data["source_id"]
-                tokens = len(encode_string_by_tiktoken(chunk_content, model_name=self.tiktoken_model_name))
-                chunk_order_index = 0 if "chunk_order_index" not in chunk_data.keys() else chunk_data["chunk_order_index"]
+                tokens = len(
+                    encode_string_by_tiktoken(
+                        chunk_content, model_name=self.tiktoken_model_name
+                    )
+                )
+                chunk_order_index = (
+                    0
+                    if "chunk_order_index" not in chunk_data.keys()
+                    else chunk_data["chunk_order_index"]
+                )
                 chunk_id = compute_mdhash_id(chunk_content, prefix="chunk-")

                 chunk_entry = {

From 7fab9accfe220f154ca4aede8e2d1f2dd3870602 Mon Sep 17 00:00:00 2001
From: Lukas Selch
Date: Wed, 19 Feb 2025 14:58:51 +0100
Subject: [PATCH 6/6] Updated documentation examples to include chunk_order_index case

---
 README.md                    | 8 ++++++++
 examples/insert_custom_kg.py | 9 +++++++++
 2 files changed, 17 insertions(+)

diff --git a/README.md b/README.md
index 92a32703..f43dd370 100644
--- a/README.md
+++ b/README.md
@@ -608,14 +608,22 @@ custom_kg = {
         {
             "content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.",
             "source_id": "Source1",
+            "chunk_order_index": 0,
+        },
+        {
+            "content": "One outstanding feature of ProductX is its advanced AI capabilities.",
+            "source_id": "Source1",
+            "chunk_order_index": 1,
         },
         {
             "content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.",
             "source_id": "Source2",
+            "chunk_order_index": 0,
         },
         {
             "content": "None",
             "source_id": "UNKNOWN",
+            "chunk_order_index": 0,
         },
     ],
 }
diff --git a/examples/insert_custom_kg.py b/examples/insert_custom_kg.py
index 50ad925e..db489c96 100644
--- a/examples/insert_custom_kg.py
+++ b/examples/insert_custom_kg.py
@@ -87,18 +87,27 @@ custom_kg = {
         {
             "content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.",
             "source_id": "Source1",
+            "chunk_order_index": 0,
+        },
+        {
+            "content": "One outstanding feature of ProductX is its advanced AI capabilities.",
+            "source_id": "Source1",
+            "chunk_order_index": 1,
         },
         {
             "content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.",
             "source_id": "Source2",
+            "chunk_order_index": 0,
         },
         {
             "content": "EventY, held in CityC, attracts technology enthusiasts and companies from around the globe.",
the globe.", "source_id": "Source3", + "source_chunk_index": 0, }, { "content": "None", "source_id": "UNKNOWN", + "source_chunk_index": 0, }, ], }