Merge pull request #1360 from danielaskdd/normalize-entity-name

Feat: Implement entity/relation name and description normalization
This commit is contained in:
Daniel.y
2025-04-12 20:56:31 +08:00
committed by GitHub
2 changed files with 63 additions and 6 deletions

View File

@@ -16,6 +16,7 @@ from .utils import (
encode_string_by_tiktoken, encode_string_by_tiktoken,
is_float_regex, is_float_regex,
list_of_list_to_csv, list_of_list_to_csv,
normalize_extracted_info,
pack_user_ass_to_openai_messages, pack_user_ass_to_openai_messages,
split_string_by_multi_markers, split_string_by_multi_markers,
truncate_list_by_token_size, truncate_list_by_token_size,
@@ -163,6 +164,9 @@ async def _handle_single_entity_extraction(
) )
return None return None
# Normalize entity name
entity_name = normalize_extracted_info(entity_name, is_entity=True)
# Clean and validate entity type # Clean and validate entity type
entity_type = clean_str(record_attributes[2]).strip('"') entity_type = clean_str(record_attributes[2]).strip('"')
if not entity_type.strip() or entity_type.startswith('("'): if not entity_type.strip() or entity_type.startswith('("'):
@@ -172,7 +176,9 @@ async def _handle_single_entity_extraction(
return None return None
# Clean and validate description # Clean and validate description
entity_description = clean_str(record_attributes[3]).strip('"') entity_description = clean_str(record_attributes[3])
entity_description = normalize_extracted_info(entity_description)
if not entity_description.strip(): if not entity_description.strip():
logger.warning( logger.warning(
f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'" f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
@@ -196,13 +202,20 @@ async def _handle_single_relationship_extraction(
if len(record_attributes) < 5 or record_attributes[0] != '"relationship"': if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
return None return None
# add this record as edge # add this record as edge
source = clean_str(record_attributes[1]).strip('"') source = clean_str(record_attributes[1])
target = clean_str(record_attributes[2]).strip('"') target = clean_str(record_attributes[2])
edge_description = clean_str(record_attributes[3]).strip('"')
edge_keywords = clean_str(record_attributes[4]).strip('"') # Normalize source and target entity names
source = normalize_extracted_info(source, is_entity=True)
target = normalize_extracted_info(target, is_entity=True)
edge_description = clean_str(record_attributes[3])
edge_description = normalize_extracted_info(edge_description)
edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'")
edge_source_id = chunk_key edge_source_id = chunk_key
weight = ( weight = (
float(record_attributes[-1].strip('"')) float(record_attributes[-1].strip('"').strip("'"))
if is_float_regex(record_attributes[-1]) if is_float_regex(record_attributes[-1])
else 1.0 else 1.0
) )

View File

@@ -1006,6 +1006,50 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
return content[:max_length] + "..." return content[:max_length] + "..."
def normalize_extracted_info(name: str, is_entity: bool = False) -> str:
    """Normalize an extracted entity/relation name or description.

    Rules applied:
        1. Remove spaces between Chinese characters
        2. Remove spaces between Chinese characters and English letters/numbers
        3. Preserve spaces within English text and numbers
        4. Replace Chinese (full-width) parentheses with English parentheses
        5. Replace Chinese dashes with the English hyphen
        6. Strip surrounding English quotes; for entities, also drop Chinese
           quotes and English quotes adjacent to Chinese characters

    Args:
        name: Extracted text to normalize (entity name or description).
        is_entity: When True, additionally remove Chinese curly quotes and
            English quotes in/around Chinese text (entity names should carry
            no quoting at all).

    Returns:
        The normalized string.
    """
    # Replace Chinese (full-width) parentheses with English parentheses
    name = name.replace("（", "(").replace("）", ")")

    # Replace Chinese dashes (em dash and full-width hyphen) with English dash
    name = name.replace("—", "-").replace("－", "-")

    # Remove whitespace sandwiched between two Chinese characters.
    # (?<=[\u4e00-\u9fa5]) lookbehind / (?=[\u4e00-\u9fa5]) lookahead match
    # the CJK Unified Ideographs range without consuming the characters.
    name = re.sub(r"(?<=[\u4e00-\u9fa5])\s+(?=[\u4e00-\u9fa5])", "", name)

    # Remove whitespace between Chinese and English letters/digits (both orders);
    # spaces inside purely-English runs are intentionally left untouched.
    name = re.sub(r"(?<=[\u4e00-\u9fa5])\s+(?=[a-zA-Z0-9])", "", name)
    name = re.sub(r"(?<=[a-zA-Z0-9])\s+(?=[\u4e00-\u9fa5])", "", name)

    # Strip English quotation marks wrapping the whole string
    name = name.strip('"').strip("'")

    if is_entity:
        # Remove Chinese curly quotes entirely — entity names must be bare
        name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
        # Remove English quotes immediately before/after Chinese characters
        name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
        name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)

    return name
def clean_text(text: str) -> str: def clean_text(text: str) -> str:
"""Clean text by removing null bytes (0x00) and whitespace """Clean text by removing null bytes (0x00) and whitespace