Revise the context format of chunks from CSV to JSON to enhance compatibility with LLMs

2025-04-19 15:18:33 +08:00
parent 4fd40fd798
commit a20d68d865
2 changed files with 35 additions and 25 deletions
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -1316,15 +1316,15 @@ async def _build_query_context(
    result = f"""
    -----Entities-----
-    ```csv
+    ```json
    {entities_context}
    ```
    -----Relationships-----
-    ```csv
+    ```json
    {relations_context}
    ```
    -----Sources-----
-    ```csv
+    ```json
    {text_units_context}
    ```
    """.strip()
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -473,40 +473,50 @@ def xml_to_json(xml_file):
 def process_combine_contexts(hl: str, ll: str):
-    header = None
+    list_hl = csv_string_to_list(hl.strip()) if hl.strip() else []
-    list_hl = csv_string_to_list(hl.strip())
+    list_ll = csv_string_to_list(ll.strip()) if ll.strip() else []
    list_ll = csv_string_to_list(ll.strip())
-    if list_hl:
+    if not list_hl and not list_ll:
        return json.dumps([], ensure_ascii=False)
    header = None
    if list_hl and len(list_hl) > 0:
        header = list_hl[0]
        list_hl = list_hl[1:]
-    if list_ll:
+    if list_ll and len(list_ll) > 0:
-        header = list_ll[0]
+        if header is None:
            header = list_ll[0]
        list_ll = list_ll[1:]
    if header is None:
-        return ""
+        return json.dumps([], ensure_ascii=False)
-    if list_hl:
+    combined_data = []
        list_hl = [",".join(item[1:]) for item in list_hl if item]
    if list_ll:
        list_ll = [",".join(item[1:]) for item in list_ll if item]
    combined_sources = []
    seen = set()
-    for item in list_hl + list_ll:
+    def process_row(row):
-        if item and item not in seen:
+        if len(row) < 2:
-            combined_sources.append(item)
+            return None
            seen.add(item)
-    combined_sources_result = [",\t".join(header)]
+        item_data = {}
-    for i, item in enumerate(combined_sources, start=1):
+        for i, field_name in enumerate(header):
-        combined_sources_result.append(f"{i},\t{item}")
+            item_data[field_name] = row[i]
-    combined_sources_result = "\n".join(combined_sources_result)
+        return item_data
-    return combined_sources_result
+    for row in list_hl + list_ll:
        # 创建内容的标识符用于去重（跳过第一列的索引）
        if len(row) >= 2:
            row_identifier = json.dumps(row[1:])
            if row_identifier not in seen:
                seen.add(row_identifier)
                item = process_row(row)
                if item:
                    combined_data.append(item)
    return json.dumps(combined_data, ensure_ascii=False)
 async def get_best_cached_response(