From a20d68d8659c0e3baea744a50f49710cf932eba5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=AD=9F=E8=B6=85?=
Date: Sat, 19 Apr 2025 15:18:33 +0800
Subject: [PATCH 1/5] Revise the context format of chunks from CSV to JSON to
 enhance compatibility with LLMs

---
 lightrag/operate.py |  6 ++---
 lightrag/utils.py   | 54 +++++++++++++++++++++++++++------------------
 2 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index 7040ae2e..e1a3718c 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -1316,15 +1316,15 @@ async def _build_query_context(
 
     result = f"""
 -----Entities-----
-```csv
+```json
 {entities_context}
 ```
 -----Relationships-----
-```csv
+```json
 {relations_context}
 ```
 -----Sources-----
-```csv
+```json
 {text_units_context}
 ```
 """.strip()
diff --git a/lightrag/utils.py b/lightrag/utils.py
index dc717fb7..44b21fc6 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -473,40 +473,50 @@ def xml_to_json(xml_file):
 
 
 def process_combine_contexts(hl: str, ll: str):
-    header = None
-    list_hl = csv_string_to_list(hl.strip())
-    list_ll = csv_string_to_list(ll.strip())
+    list_hl = csv_string_to_list(hl.strip()) if hl.strip() else []
+    list_ll = csv_string_to_list(ll.strip()) if ll.strip() else []
 
-    if list_hl:
+    if not list_hl and not list_ll:
+        return json.dumps([], ensure_ascii=False)
+
+    header = None
+    if list_hl and len(list_hl) > 0:
         header = list_hl[0]
         list_hl = list_hl[1:]
-    if list_ll:
-        header = list_ll[0]
+    if list_ll and len(list_ll) > 0:
+        if header is None:
+            header = list_ll[0]
         list_ll = list_ll[1:]
+
     if header is None:
-        return ""
+        return json.dumps([], ensure_ascii=False)
 
-    if list_hl:
-        list_hl = [",".join(item[1:]) for item in list_hl if item]
-    if list_ll:
-        list_ll = [",".join(item[1:]) for item in list_ll if item]
-
-    combined_sources = []
+    combined_data = []
     seen = set()
 
-    for item in list_hl + list_ll:
-        if item and item not in seen:
-            combined_sources.append(item)
-            seen.add(item)
+    def process_row(row):
+        if len(row) < 2:
+            return None
 
-    combined_sources_result = [",\t".join(header)]
+        item_data = {}
 
-    for i, item in enumerate(combined_sources, start=1):
-        combined_sources_result.append(f"{i},\t{item}")
+        for i, field_name in enumerate(header):
+            item_data[field_name] = row[i]
 
-    combined_sources_result = "\n".join(combined_sources_result)
+        return item_data
 
-    return combined_sources_result
+    for row in list_hl + list_ll:
+        # Build a content identifier for deduplication (skip the index in the first column)
+        if len(row) >= 2:
+            row_identifier = json.dumps(row[1:])
+
+            if row_identifier not in seen:
+                seen.add(row_identifier)
+                item = process_row(row)
+                if item:
+                    combined_data.append(item)
+
+    return json.dumps(combined_data, ensure_ascii=False)
 
 
 async def get_best_cached_response(

From 6d486f5813f6e87b1b3d14e3b3a235af6bed5343 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=AD=9F=E8=B6=85?=
Date: Sat, 19 Apr 2025 15:28:07 +0800
Subject: [PATCH 2/5] json.dumps without ASCII escaping

---
 lightrag/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 44b21fc6..372e4d07 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -506,9 +506,8 @@ def process_combine_contexts(hl: str, ll: str):
         return item_data
 
     for row in list_hl + list_ll:
-        # Build a content identifier for deduplication (skip the index in the first column)
         if len(row) >= 2:
-            row_identifier = json.dumps(row[1:])
+            row_identifier = json.dumps(row[1:], ensure_ascii=False)
 
             if row_identifier not in seen:
                 seen.add(row_identifier)

From f2f3a2721d27178e24e909f2c29a2ad5cc0f07d8 Mon Sep 17 00:00:00 2001
From: 
mengchao
Date: Sun, 20 Apr 2025 19:24:05 +0800
Subject: [PATCH 3/5] Refactor context handling to convert data from CSV to
 JSON format for improved compatibility with LLMs, replacing the
 list_of_list_to_csv function with list_of_list_to_json

---
 lightrag/operate.py | 27 +++++++------
 lightrag/utils.py   | 94 +++++++++++++--------------------------------
 2 files changed, 42 insertions(+), 79 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index e1a3718c..5060c9bb 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -15,7 +15,6 @@ from .utils import (
     decode_tokens_by_tiktoken,
     encode_string_by_tiktoken,
     is_float_regex,
-    list_of_list_to_csv,
     normalize_extracted_info,
     pack_user_ass_to_openai_messages,
     split_string_by_multi_markers,
@@ -27,6 +26,7 @@ from .utils import (
     CacheData,
     get_conversation_turns,
     use_llm_func_with_cache,
+    list_of_list_to_json,
 )
 from .base import (
     BaseGraphStorage,
@@ -1311,21 +1311,26 @@ async def _build_query_context(
         [hl_text_units_context, ll_text_units_context],
     )
     # not necessary to use LLM to generate a response
-    if not entities_context.strip() and not relations_context.strip():
+    if not entities_context and not relations_context:
         return None
 
+    # Convert to JSON strings
+    entities_str = json.dumps(entities_context, ensure_ascii=False)
+    relations_str = json.dumps(relations_context, ensure_ascii=False)
+    text_units_str = json.dumps(text_units_context, ensure_ascii=False)
+
     result = f"""
 -----Entities-----
 ```json
-{entities_context}
+{entities_str}
 ```
 -----Relationships-----
 ```json
-{relations_context}
+{relations_str}
 ```
 -----Sources-----
 ```json
-{text_units_context}
+{text_units_str}
 ```
 """.strip()
     return result
@@ -1424,7 +1429,7 @@ async def _get_node_data(
                 file_path,
             ]
         )
-    entities_context = list_of_list_to_csv(entites_section_list)
+    entities_context = list_of_list_to_json(entites_section_list)
 
     relations_section_list = [
         [
@@ -1461,14 +1466,14 @@ async def _get_node_data(
                 file_path,
             ]
         )
-    relations_context = list_of_list_to_csv(relations_section_list)
+    relations_context = list_of_list_to_json(relations_section_list)
 
     text_units_section_list = [["id", "content", "file_path"]]
     for i, t in enumerate(use_text_units):
         text_units_section_list.append(
             [i, t["content"], t.get("file_path", "unknown_source")]
         )
-    text_units_context = list_of_list_to_csv(text_units_section_list)
+    text_units_context = list_of_list_to_json(text_units_section_list)
     return entities_context, relations_context, text_units_context
 
 
@@ -1736,7 +1741,7 @@ async def _get_edge_data(
                 file_path,
             ]
         )
-    relations_context = list_of_list_to_csv(relations_section_list)
+    relations_context = list_of_list_to_json(relations_section_list)
 
     entites_section_list = [
         ["id", "entity", "type", "description", "rank", "created_at", "file_path"]
     ]
@@ -1761,12 +1766,12 @@ async def _get_edge_data(
                 file_path,
             ]
         )
-    entities_context = list_of_list_to_csv(entites_section_list)
+    entities_context = list_of_list_to_json(entites_section_list)
 
     text_units_section_list = [["id", "content", "file_path"]]
     for i, t in enumerate(use_text_units):
         text_units_section_list.append([i, t["content"], t.get("file_path", "unknown")])
-    text_units_context = list_of_list_to_csv(text_units_section_list)
+    text_units_context = list_of_list_to_json(text_units_section_list)
     return entities_context, relations_context, text_units_context
 
 
diff --git a/lightrag/utils.py b/lightrag/utils.py
index 372e4d07..45ee7bb9 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -374,37 +374,24 @@ def 
truncate_list_by_token_size( return list_data -def list_of_list_to_csv(data: list[list[str]]) -> str: - output = io.StringIO() - writer = csv.writer( - output, - quoting=csv.QUOTE_ALL, # Quote all fields - escapechar="\\", # Use backslash as escape character - quotechar='"', # Use double quotes - lineterminator="\n", # Explicit line terminator - ) - writer.writerows(data) - return output.getvalue() +def list_of_list_to_json(data: list[list[str]]) -> list[dict[str, str]]: + if not data or len(data) <= 1: + return [] + header = data[0] + result = [] -def csv_string_to_list(csv_string: str) -> list[list[str]]: - # Clean the string by removing NUL characters - cleaned_string = csv_string.replace("\0", "") + for row in data[1:]: + if len(row) >= 2: + item = {} + for i, field_name in enumerate(header): + if i < len(row): + item[field_name] = row[i] + else: + item[field_name] = "" + result.append(item) - output = io.StringIO(cleaned_string) - reader = csv.reader( - output, - quoting=csv.QUOTE_ALL, # Match the writer configuration - escapechar="\\", # Use backslash as escape character - quotechar='"', # Use double quotes - ) - - try: - return [row for row in reader] - except csv.Error as e: - raise ValueError(f"Failed to parse CSV string: {str(e)}") - finally: - output.close() + return result def save_data_to_file(data, file_name): @@ -472,50 +459,21 @@ def xml_to_json(xml_file): return None -def process_combine_contexts(hl: str, ll: str): - list_hl = csv_string_to_list(hl.strip()) if hl.strip() else [] - list_ll = csv_string_to_list(ll.strip()) if ll.strip() else [] - - if not list_hl and not list_ll: - return json.dumps([], ensure_ascii=False) - - header = None - if list_hl and len(list_hl) > 0: - header = list_hl[0] - list_hl = list_hl[1:] - if list_ll and len(list_ll) > 0: - if header is None: - header = list_ll[0] - list_ll = list_ll[1:] - - if header is None: - return json.dumps([], ensure_ascii=False) - +def process_combine_contexts(hl_context: dict, ll_context: dict): + seen_content = {} combined_data = [] - seen = set() - def process_row(row): - if len(row) < 2: - return None + for item in hl_context + ll_context: + content_key = {k: v for k, v in item.items() if k != 'id'} + content_key_str = str(content_key) + if content_key_str not in seen_content: + seen_content[content_key_str] = item + combined_data.append(item) - item_data = {} + for i, item in enumerate(combined_data): + item['id'] = i - for i, field_name in enumerate(header): - item_data[field_name] = row[i] - - return item_data - - for row in list_hl + list_ll: - if len(row) >= 2: - row_identifier = json.dumps(row[1:], ensure_ascii=False) - - if row_identifier not in seen: - seen.add(row_identifier) - item = process_row(row) - if item: - combined_data.append(item) - - return json.dumps(combined_data, ensure_ascii=False) + return combined_data async def get_best_cached_response( From 510a6adfbc514a1ee583bd28c2c533fe333ba823 Mon Sep 17 00:00:00 2001 From: mengchao Date: Sun, 20 Apr 2025 19:28:25 +0800 Subject: [PATCH 4/5] Fix linting --- lightrag/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 45ee7bb9..bdf9eb72 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2,7 +2,6 @@ from __future__ import annotations import asyncio import html -import io import csv import json import logging @@ -464,14 +463,14 @@ def process_combine_contexts(hl_context: dict, ll_context: dict): combined_data = [] for item in hl_context + ll_context: - content_key = {k: v 
for k, v in item.items() if k != 'id'} + content_key = {k: v for k, v in item.items() if k != "id"} content_key_str = str(content_key) if content_key_str not in seen_content: seen_content[content_key_str] = item combined_data.append(item) for i, item in enumerate(combined_data): - item['id'] = i + item["id"] = i return combined_data From 8064a2339fb4cb1c6d963f33f003b87f25d36616 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=9F=E8=B6=85?= Date: Mon, 21 Apr 2025 12:08:12 +0800 Subject: [PATCH 5/5] change process_combine_contexts params type to list[dict[str, str]] --- lightrag/utils.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index bdf9eb72..ef63de01 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -385,7 +385,7 @@ def list_of_list_to_json(data: list[list[str]]) -> list[dict[str, str]]: item = {} for i, field_name in enumerate(header): if i < len(row): - item[field_name] = row[i] + item[field_name] = str(row[i]) else: item[field_name] = "" result.append(item) @@ -458,19 +458,21 @@ def xml_to_json(xml_file): return None -def process_combine_contexts(hl_context: dict, ll_context: dict): +def process_combine_contexts( + hl_context: list[dict[str, str]], ll_context: list[dict[str, str]] +): seen_content = {} combined_data = [] for item in hl_context + ll_context: - content_key = {k: v for k, v in item.items() if k != "id"} - content_key_str = str(content_key) - if content_key_str not in seen_content: - seen_content[content_key_str] = item + content_dict = {k: v for k, v in item.items() if k != "id"} + content_key = tuple(sorted(content_dict.items())) + if content_key not in seen_content: + seen_content[content_key] = item combined_data.append(item) for i, item in enumerate(combined_data): - item["id"] = i + item["id"] = str(i) return combined_data
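For reference, here is a minimal usage sketch of the two reworked helpers as they stand after PATCH 5/5. The sample rows below are invented for illustration; only the import path and the two function names come from this series.

```python
# Sketch only: hypothetical sample data exercising list_of_list_to_json and
# process_combine_contexts as defined at the end of this series.
from lightrag.utils import list_of_list_to_json, process_combine_contexts

# Header row first, data rows after -- the same shape operate.py builds for
# its entity / relationship / text-unit section lists.
hl_rows = [
    ["id", "entity", "type"],
    [0, "Alice", "person"],
    [1, "Acme", "organization"],
]
ll_rows = [
    ["id", "entity", "type"],
    [0, "Acme", "organization"],  # same content as an hl row, different id
    [1, "Bob", "person"],
]

hl_context = list_of_list_to_json(hl_rows)  # -> list[dict[str, str]]
ll_context = list_of_list_to_json(ll_rows)

# Rows are deduplicated on every field except "id", then ids are reassigned
# as sequential strings.
combined = process_combine_contexts(hl_context, ll_context)
# [{'id': '0', 'entity': 'Alice', 'type': 'person'},
#  {'id': '1', 'entity': 'Acme', 'type': 'organization'},
#  {'id': '2', 'entity': 'Bob', 'type': 'person'}]
```

Keeping the helpers list-of-dict based and serializing only once in _build_query_context (with ensure_ascii=False) means non-ASCII entity names reach the LLM unescaped, which is the compatibility gain this series is after.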