From a20d68d8659c0e3baea744a50f49710cf932eba5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=AD=9F=E8=B6=85?=
Date: Sat, 19 Apr 2025 15:18:33 +0800
Subject: [PATCH 1/5] Revise the context format of chunks from CSV to JSON to
 enhance compatibility with LLMs

---
 lightrag/operate.py |  6 ++---
 lightrag/utils.py   | 54 +++++++++++++++++++++++++++------------------
 2 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index 7040ae2e..e1a3718c 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -1316,15 +1316,15 @@ async def _build_query_context(
 
     result = f"""
 -----Entities-----
-```csv
+```json
 {entities_context}
 ```
 -----Relationships-----
-```csv
+```json
 {relations_context}
 ```
 -----Sources-----
-```csv
+```json
 {text_units_context}
 ```
 """.strip()
diff --git a/lightrag/utils.py b/lightrag/utils.py
index dc717fb7..44b21fc6 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -473,40 +473,50 @@ def xml_to_json(xml_file):
 
 
 def process_combine_contexts(hl: str, ll: str):
-    header = None
-    list_hl = csv_string_to_list(hl.strip())
-    list_ll = csv_string_to_list(ll.strip())
+    list_hl = csv_string_to_list(hl.strip()) if hl.strip() else []
+    list_ll = csv_string_to_list(ll.strip()) if ll.strip() else []
 
-    if list_hl:
+    if not list_hl and not list_ll:
+        return json.dumps([], ensure_ascii=False)
+
+    header = None
+    if list_hl and len(list_hl) > 0:
         header = list_hl[0]
         list_hl = list_hl[1:]
-    if list_ll:
-        header = list_ll[0]
+    if list_ll and len(list_ll) > 0:
+        if header is None:
+            header = list_ll[0]
         list_ll = list_ll[1:]
+
     if header is None:
-        return ""
+        return json.dumps([], ensure_ascii=False)
 
-    if list_hl:
-        list_hl = [",".join(item[1:]) for item in list_hl if item]
-    if list_ll:
-        list_ll = [",".join(item[1:]) for item in list_ll if item]
-
-    combined_sources = []
+    combined_data = []
     seen = set()
 
-    for item in list_hl + list_ll:
-        if item and item not in seen:
-            combined_sources.append(item)
-            seen.add(item)
+    def process_row(row):
+        if len(row) < 2:
+            return None
 
-    combined_sources_result = [",\t".join(header)]
+        item_data = {}
 
-    for i, item in enumerate(combined_sources, start=1):
-        combined_sources_result.append(f"{i},\t{item}")
+        for i, field_name in enumerate(header):
+            item_data[field_name] = row[i]
 
-    combined_sources_result = "\n".join(combined_sources_result)
+        return item_data
 
-    return combined_sources_result
+    for row in list_hl + list_ll:
+        # Build a content identifier for deduplication (skip the index in the first column)
+        if len(row) >= 2:
+            row_identifier = json.dumps(row[1:])
+
+            if row_identifier not in seen:
+                seen.add(row_identifier)
+                item = process_row(row)
+                if item:
+                    combined_data.append(item)
+
+    return json.dumps(combined_data, ensure_ascii=False)
 
 
 async def get_best_cached_response(

From 6d486f5813f6e87b1b3d14e3b3a235af6bed5343 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=AD=9F=E8=B6=85?=
Date: Sat, 19 Apr 2025 15:28:07 +0800
Subject: [PATCH 2/5] json.dumps without ASCII escaping

---
 lightrag/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 44b21fc6..372e4d07 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -506,9 +506,8 @@ def process_combine_contexts(hl: str, ll: str):
         return item_data
 
     for row in list_hl + list_ll:
-        # Build a content identifier for deduplication (skip the index in the first column)
         if len(row) >= 2:
-            row_identifier = json.dumps(row[1:])
+            row_identifier = json.dumps(row[1:], ensure_ascii=False)
 
             if row_identifier not in seen:
                 seen.add(row_identifier)

From f2f3a2721d27178e24e909f2c29a2ad5cc0f07d8 Mon Sep 17 00:00:00 2001
From: 
mengchao
Date: Sun, 20 Apr 2025 19:24:05 +0800
Subject: [PATCH 3/5] Refactor context handling to convert data from CSV to
 JSON format for improved compatibility with LLMs, replacing the
 list_of_list_to_csv function with list_of_list_to_json

---
 lightrag/operate.py | 27 +++++++------
 lightrag/utils.py   | 94 +++++++++++++--------------------------------
 2 files changed, 42 insertions(+), 79 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index e1a3718c..5060c9bb 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -15,7 +15,6 @@ from .utils import (
     decode_tokens_by_tiktoken,
     encode_string_by_tiktoken,
     is_float_regex,
-    list_of_list_to_csv,
     normalize_extracted_info,
     pack_user_ass_to_openai_messages,
     split_string_by_multi_markers,
@@ -27,6 +26,7 @@ from .utils import (
     CacheData,
     get_conversation_turns,
     use_llm_func_with_cache,
+    list_of_list_to_json,
 )
 from .base import (
     BaseGraphStorage,
@@ -1311,21 +1311,26 @@ async def _build_query_context(
         [hl_text_units_context, ll_text_units_context],
     )
     # not necessary to use LLM to generate a response
-    if not entities_context.strip() and not relations_context.strip():
+    if not entities_context and not relations_context:
         return None
 
+    # Convert to JSON strings
+    entities_str = json.dumps(entities_context, ensure_ascii=False)
+    relations_str = json.dumps(relations_context, ensure_ascii=False)
+    text_units_str = json.dumps(text_units_context, ensure_ascii=False)
+
     result = f"""
 -----Entities-----
 ```json
-{entities_context}
+{entities_str}
 ```
 -----Relationships-----
 ```json
-{relations_context}
+{relations_str}
 ```
 -----Sources-----
 ```json
-{text_units_context}
+{text_units_str}
 ```
 """.strip()
     return result
@@ -1424,7 +1429,7 @@ async def _get_node_data(
                 file_path,
             ]
         )
-    entities_context = list_of_list_to_csv(entites_section_list)
+    entities_context = list_of_list_to_json(entites_section_list)
 
     relations_section_list = [
         [
@@ -1461,14 +1466,14 @@ async def _get_node_data(
                 file_path,
             ]
         )
-    relations_context = list_of_list_to_csv(relations_section_list)
+    relations_context = list_of_list_to_json(relations_section_list)
 
     text_units_section_list = [["id", "content", "file_path"]]
     for i, t in enumerate(use_text_units):
         text_units_section_list.append(
             [i, t["content"], t.get("file_path", "unknown_source")]
         )
-    text_units_context = list_of_list_to_csv(text_units_section_list)
+    text_units_context = list_of_list_to_json(text_units_section_list)
     return entities_context, relations_context, text_units_context
 
 
@@ -1736,7 +1741,7 @@ async def _get_edge_data(
                 file_path,
             ]
         )
-    relations_context = list_of_list_to_csv(relations_section_list)
+    relations_context = list_of_list_to_json(relations_section_list)
 
     entites_section_list = [
         ["id", "entity", "type", "description", "rank", "created_at", "file_path"]
     ]
@@ -1761,12 +1766,12 @@ async def _get_edge_data(
                 file_path,
             ]
         )
-    entities_context = list_of_list_to_csv(entites_section_list)
+    entities_context = list_of_list_to_json(entites_section_list)
 
     text_units_section_list = [["id", "content", "file_path"]]
     for i, t in enumerate(use_text_units):
         text_units_section_list.append([i, t["content"], t.get("file_path", "unknown")])
-    text_units_context = list_of_list_to_csv(text_units_section_list)
+    text_units_context = list_of_list_to_json(text_units_section_list)
     return entities_context, relations_context, text_units_context
 
 
diff --git a/lightrag/utils.py b/lightrag/utils.py
index 372e4d07..45ee7bb9 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -374,37 +374,24 @@ def 
truncate_list_by_token_size( return list_data -def list_of_list_to_csv(data: list[list[str]]) -> str: - output = io.StringIO() - writer = csv.writer( - output, - quoting=csv.QUOTE_ALL, # Quote all fields - escapechar="\\", # Use backslash as escape character - quotechar='"', # Use double quotes - lineterminator="\n", # Explicit line terminator - ) - writer.writerows(data) - return output.getvalue() +def list_of_list_to_json(data: list[list[str]]) -> list[dict[str, str]]: + if not data or len(data) <= 1: + return [] + header = data[0] + result = [] -def csv_string_to_list(csv_string: str) -> list[list[str]]: - # Clean the string by removing NUL characters - cleaned_string = csv_string.replace("\0", "") + for row in data[1:]: + if len(row) >= 2: + item = {} + for i, field_name in enumerate(header): + if i < len(row): + item[field_name] = row[i] + else: + item[field_name] = "" + result.append(item) - output = io.StringIO(cleaned_string) - reader = csv.reader( - output, - quoting=csv.QUOTE_ALL, # Match the writer configuration - escapechar="\\", # Use backslash as escape character - quotechar='"', # Use double quotes - ) - - try: - return [row for row in reader] - except csv.Error as e: - raise ValueError(f"Failed to parse CSV string: {str(e)}") - finally: - output.close() + return result def save_data_to_file(data, file_name): @@ -472,50 +459,21 @@ def xml_to_json(xml_file): return None -def process_combine_contexts(hl: str, ll: str): - list_hl = csv_string_to_list(hl.strip()) if hl.strip() else [] - list_ll = csv_string_to_list(ll.strip()) if ll.strip() else [] - - if not list_hl and not list_ll: - return json.dumps([], ensure_ascii=False) - - header = None - if list_hl and len(list_hl) > 0: - header = list_hl[0] - list_hl = list_hl[1:] - if list_ll and len(list_ll) > 0: - if header is None: - header = list_ll[0] - list_ll = list_ll[1:] - - if header is None: - return json.dumps([], ensure_ascii=False) - +def process_combine_contexts(hl_context: dict, ll_context: dict): + seen_content = {} combined_data = [] - seen = set() - def process_row(row): - if len(row) < 2: - return None + for item in hl_context + ll_context: + content_key = {k: v for k, v in item.items() if k != 'id'} + content_key_str = str(content_key) + if content_key_str not in seen_content: + seen_content[content_key_str] = item + combined_data.append(item) - item_data = {} + for i, item in enumerate(combined_data): + item['id'] = i - for i, field_name in enumerate(header): - item_data[field_name] = row[i] - - return item_data - - for row in list_hl + list_ll: - if len(row) >= 2: - row_identifier = json.dumps(row[1:], ensure_ascii=False) - - if row_identifier not in seen: - seen.add(row_identifier) - item = process_row(row) - if item: - combined_data.append(item) - - return json.dumps(combined_data, ensure_ascii=False) + return combined_data async def get_best_cached_response( From 510a6adfbc514a1ee583bd28c2c533fe333ba823 Mon Sep 17 00:00:00 2001 From: mengchao Date: Sun, 20 Apr 2025 19:28:25 +0800 Subject: [PATCH 4/5] Fix linting --- lightrag/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 45ee7bb9..bdf9eb72 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2,7 +2,6 @@ from __future__ import annotations import asyncio import html -import io import csv import json import logging @@ -464,14 +463,14 @@ def process_combine_contexts(hl_context: dict, ll_context: dict): combined_data = [] for item in hl_context + ll_context: - content_key = {k: v 
for k, v in item.items() if k != 'id'} + content_key = {k: v for k, v in item.items() if k != "id"} content_key_str = str(content_key) if content_key_str not in seen_content: seen_content[content_key_str] = item combined_data.append(item) for i, item in enumerate(combined_data): - item['id'] = i + item["id"] = i return combined_data From 8064a2339fb4cb1c6d963f33f003b87f25d36616 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E8=B6=85?= Date: Mon, 21 Apr 2025 12:08:12 +0800 Subject: [PATCH 5/5] change process_combine_contexts params type to list[dict[str, str]] --- lightrag/utils.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index bdf9eb72..ef63de01 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -385,7 +385,7 @@ def list_of_list_to_json(data: list[list[str]]) -> list[dict[str, str]]: item = {} for i, field_name in enumerate(header): if i < len(row): - item[field_name] = row[i] + item[field_name] = str(row[i]) else: item[field_name] = "" result.append(item) @@ -458,19 +458,21 @@ def xml_to_json(xml_file): return None -def process_combine_contexts(hl_context: dict, ll_context: dict): +def process_combine_contexts( + hl_context: list[dict[str, str]], ll_context: list[dict[str, str]] +): seen_content = {} combined_data = [] for item in hl_context + ll_context: - content_key = {k: v for k, v in item.items() if k != "id"} - content_key_str = str(content_key) - if content_key_str not in seen_content: - seen_content[content_key_str] = item + content_dict = {k: v for k, v in item.items() if k != "id"} + content_key = tuple(sorted(content_dict.items())) + if content_key not in seen_content: + seen_content[content_key] = item combined_data.append(item) for i, item in enumerate(combined_data): - item["id"] = i + item["id"] = str(i) return combined_data
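For reference, here is a minimal usage sketch of the two reworked helpers as they stand after PATCH 5/5. The sample rows below are invented for illustration; only the import path and the two function names come from this series.

```python
# Sketch only: hypothetical sample data exercising list_of_list_to_json and
# process_combine_contexts as defined at the end of this series.
from lightrag.utils import list_of_list_to_json, process_combine_contexts

# Header row first, data rows after -- the same shape operate.py builds for
# its entity / relationship / text-unit section lists.
hl_rows = [
    ["id", "entity", "type"],
    [0, "Alice", "person"],
    [1, "Acme", "organization"],
]
ll_rows = [
    ["id", "entity", "type"],
    [0, "Acme", "organization"],  # same content as an hl row, different id
    [1, "Bob", "person"],
]

hl_context = list_of_list_to_json(hl_rows)  # -> list[dict[str, str]]
ll_context = list_of_list_to_json(ll_rows)

# Rows are deduplicated on every field except "id", then ids are reassigned
# as sequential strings.
combined = process_combine_contexts(hl_context, ll_context)
# [{'id': '0', 'entity': 'Alice', 'type': 'person'},
#  {'id': '1', 'entity': 'Acme', 'type': 'organization'},
#  {'id': '2', 'entity': 'Bob', 'type': 'person'}]
```

Keeping the helpers list-of-dict based and serializing only once in _build_query_context (with ensure_ascii=False) means non-ASCII entity names reach the LLM unescaped, which is the compatibility gain this series is after.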