Merge pull request #1360 from danielaskdd/normalize-entity-name
Feat: Implement entity/relation name and description normalization
This commit is contained in:
@@ -16,6 +16,7 @@ from .utils import (
|
|||||||
encode_string_by_tiktoken,
|
encode_string_by_tiktoken,
|
||||||
is_float_regex,
|
is_float_regex,
|
||||||
list_of_list_to_csv,
|
list_of_list_to_csv,
|
||||||
|
normalize_extracted_info,
|
||||||
pack_user_ass_to_openai_messages,
|
pack_user_ass_to_openai_messages,
|
||||||
split_string_by_multi_markers,
|
split_string_by_multi_markers,
|
||||||
truncate_list_by_token_size,
|
truncate_list_by_token_size,
|
||||||
@@ -163,6 +164,9 @@ async def _handle_single_entity_extraction(
|
|||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Normalize entity name
|
||||||
|
entity_name = normalize_extracted_info(entity_name, is_entity=True)
|
||||||
|
|
||||||
# Clean and validate entity type
|
# Clean and validate entity type
|
||||||
entity_type = clean_str(record_attributes[2]).strip('"')
|
entity_type = clean_str(record_attributes[2]).strip('"')
|
||||||
if not entity_type.strip() or entity_type.startswith('("'):
|
if not entity_type.strip() or entity_type.startswith('("'):
|
||||||
@@ -172,7 +176,9 @@ async def _handle_single_entity_extraction(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
# Clean and validate description
|
# Clean and validate description
|
||||||
entity_description = clean_str(record_attributes[3]).strip('"')
|
entity_description = clean_str(record_attributes[3])
|
||||||
|
entity_description = normalize_extracted_info(entity_description)
|
||||||
|
|
||||||
if not entity_description.strip():
|
if not entity_description.strip():
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
|
f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
|
||||||
@@ -196,13 +202,20 @@ async def _handle_single_relationship_extraction(
|
|||||||
if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
|
if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
|
||||||
return None
|
return None
|
||||||
# add this record as edge
|
# add this record as edge
|
||||||
source = clean_str(record_attributes[1]).strip('"')
|
source = clean_str(record_attributes[1])
|
||||||
target = clean_str(record_attributes[2]).strip('"')
|
target = clean_str(record_attributes[2])
|
||||||
edge_description = clean_str(record_attributes[3]).strip('"')
|
|
||||||
edge_keywords = clean_str(record_attributes[4]).strip('"')
|
# Normalize source and target entity names
|
||||||
|
source = normalize_extracted_info(source, is_entity=True)
|
||||||
|
target = normalize_extracted_info(target, is_entity=True)
|
||||||
|
|
||||||
|
edge_description = clean_str(record_attributes[3])
|
||||||
|
edge_description = normalize_extracted_info(edge_description)
|
||||||
|
|
||||||
|
edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'")
|
||||||
edge_source_id = chunk_key
|
edge_source_id = chunk_key
|
||||||
weight = (
|
weight = (
|
||||||
float(record_attributes[-1].strip('"'))
|
float(record_attributes[-1].strip('"').strip("'"))
|
||||||
if is_float_regex(record_attributes[-1])
|
if is_float_regex(record_attributes[-1])
|
||||||
else 1.0
|
else 1.0
|
||||||
)
|
)
|
||||||
|
@@ -1006,6 +1006,50 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
|
|||||||
return content[:max_length] + "..."
|
return content[:max_length] + "..."
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_extracted_info(name: str, is_entity=False) -> str:
    """Normalize an extracted entity/relation name or description string.

    Normalization rules, applied in order:
      1. Chinese (fullwidth) parentheses are replaced with English ones.
      2. Chinese dashes are replaced with the ASCII hyphen.
      3. Whitespace between two Chinese characters, or between a Chinese
         character and an English letter/digit, is removed; spaces inside
         pure English/numeric text are preserved.
      4. English quotation marks are stripped from both ends.
      5. For entity names only (``is_entity=True``): Chinese quote marks
         are deleted entirely, and English quotes adjacent to Chinese
         characters are deleted as well.

    Args:
        name: Raw extracted text to normalize.
        is_entity: When True, apply the extra quote-removal rules used
            for entity names.

    Returns:
        The normalized string.
    """
    # Fullwidth punctuation -> ASCII equivalents (parentheses, dashes).
    for cjk_char, ascii_char in (("(", "("), (")", ")"), ("—", "-"), ("-", "-")):
        name = name.replace(cjk_char, ascii_char)

    # Drop whitespace that sits between Chinese characters, or between
    # Chinese characters and English letters/digits. The lookbehind /
    # lookahead pairs match only the whitespace, leaving the flanking
    # characters untouched.
    for spacing_pattern in (
        r"(?<=[\u4e00-\u9fa5])\s+(?=[\u4e00-\u9fa5])",
        r"(?<=[\u4e00-\u9fa5])\s+(?=[a-zA-Z0-9])",
        r"(?<=[a-zA-Z0-9])\s+(?=[\u4e00-\u9fa5])",
    ):
        name = re.sub(spacing_pattern, "", name)

    # Strip English quotation marks from the ends of the string.
    name = name.strip('"').strip("'")

    if is_entity:
        # Entity names must not carry quote marks at all: delete Chinese
        # quotes everywhere ...
        for chinese_quote in ("“", "”", "‘", "’"):
            name = name.replace(chinese_quote, "")
        # ... and delete English quotes that sit directly before or after
        # a Chinese character.
        for quote_pattern in (
            r"['\"]+(?=[\u4e00-\u9fa5])",
            r"(?<=[\u4e00-\u9fa5])['\"]+",
        ):
            name = re.sub(quote_pattern, "", name)

    return name
|
||||||
|
|
||||||
|
|
||||||
def clean_text(text: str) -> str:
|
def clean_text(text: str) -> str:
|
||||||
"""Clean text by removing null bytes (0x00) and whitespace
|
"""Clean text by removing null bytes (0x00) and whitespace
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user