From de40f1b5b3905c9518fb4f6ab7f67f651c76a413 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 8 May 2025 15:52:18 +0800 Subject: [PATCH 1/3] Deduplicate merged relation keywords --- lightrag/operate.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 5f57a90c..086de3d5 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -388,14 +388,22 @@ async def _merge_edges_then_upsert( ) ) ) - keywords = GRAPH_FIELD_SEP.join( - sorted( - set( - [dp["keywords"] for dp in edges_data if dp.get("keywords")] - + already_keywords + + # Split all existing and new keywords into individual terms, then combine and deduplicate + all_keywords = set() + # Process already_keywords (which are comma-separated) + for keyword_str in already_keywords: + if keyword_str: # Skip empty strings + all_keywords.update(k.strip() for k in keyword_str.split(",") if k.strip()) + # Process new keywords from edges_data + for edge in edges_data: + if edge.get("keywords"): + all_keywords.update( + k.strip() for k in edge["keywords"].split(",") if k.strip() ) - ) - ) + # Join all unique keywords with commas + keywords = ",".join(sorted(all_keywords)) + source_id = GRAPH_FIELD_SEP.join( set( [dp["source_id"] for dp in edges_data if dp.get("source_id")] From 2fefb57356e6aee653aaffba4c49f59944f76f69 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 8 May 2025 16:05:08 +0800 Subject: [PATCH 2/3] Normalize keyword extraction result --- lightrag/operate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 086de3d5..b110048f 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -218,7 +218,9 @@ async def _handle_single_relationship_extraction( edge_description = clean_str(record_attributes[3]) edge_description = normalize_extracted_info(edge_description) - edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'") + edge_keywords = 
normalize_extracted_info(clean_str(record_attributes[4]), is_entity=True) + edge_keywords = edge_keywords.replace(",", ",") + edge_source_id = chunk_key weight = ( float(record_attributes[-1].strip('"').strip("'")) From d2d755db7be46d510002ee113d2420d29a79be0b Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 8 May 2025 16:05:52 +0800 Subject: [PATCH 3/3] Normalize keyword extraction result --- lightrag/operate.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 086de3d5..0d5c7866 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -218,7 +218,11 @@ async def _handle_single_relationship_extraction( edge_description = clean_str(record_attributes[3]) edge_description = normalize_extracted_info(edge_description) - edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'") + edge_keywords = normalize_extracted_info( + clean_str(record_attributes[4]), is_entity=True + ) + edge_keywords = edge_keywords.replace(",", ",") + edge_source_id = chunk_key weight = ( float(record_attributes[-1].strip('"').strip("'"))