Merge pull request #1550 from danielaskdd/keyword-join-with-comma
Deduplicate merged relation keywords
This commit is contained in:
@@ -218,7 +218,11 @@ async def _handle_single_relationship_extraction(
|
|||||||
edge_description = clean_str(record_attributes[3])
|
edge_description = clean_str(record_attributes[3])
|
||||||
edge_description = normalize_extracted_info(edge_description)
|
edge_description = normalize_extracted_info(edge_description)
|
||||||
|
|
||||||
edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'")
|
edge_keywords = normalize_extracted_info(
|
||||||
|
clean_str(record_attributes[4]), is_entity=True
|
||||||
|
)
|
||||||
|
edge_keywords = edge_keywords.replace(",", ",")
|
||||||
|
|
||||||
edge_source_id = chunk_key
|
edge_source_id = chunk_key
|
||||||
weight = (
|
weight = (
|
||||||
float(record_attributes[-1].strip('"').strip("'"))
|
float(record_attributes[-1].strip('"').strip("'"))
|
||||||
@@ -388,14 +392,22 @@ async def _merge_edges_then_upsert(
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
keywords = GRAPH_FIELD_SEP.join(
|
|
||||||
sorted(
|
# Split all existing and new keywords into individual terms, then combine and deduplicate
|
||||||
set(
|
all_keywords = set()
|
||||||
[dp["keywords"] for dp in edges_data if dp.get("keywords")]
|
# Process already_keywords (which are comma-separated)
|
||||||
+ already_keywords
|
for keyword_str in already_keywords:
|
||||||
|
if keyword_str: # Skip empty strings
|
||||||
|
all_keywords.update(k.strip() for k in keyword_str.split(",") if k.strip())
|
||||||
|
# Process new keywords from edges_data
|
||||||
|
for edge in edges_data:
|
||||||
|
if edge.get("keywords"):
|
||||||
|
all_keywords.update(
|
||||||
|
k.strip() for k in edge["keywords"].split(",") if k.strip()
|
||||||
)
|
)
|
||||||
)
|
# Join all unique keywords with commas
|
||||||
)
|
keywords = ",".join(sorted(all_keywords))
|
||||||
|
|
||||||
source_id = GRAPH_FIELD_SEP.join(
|
source_id = GRAPH_FIELD_SEP.join(
|
||||||
set(
|
set(
|
||||||
[dp["source_id"] for dp in edges_data if dp.get("source_id")]
|
[dp["source_id"] for dp in edges_data if dp.get("source_id")]
|
||||||
|
Reference in New Issue
Block a user