Merge pull request #1360 from danielaskdd/normalize-entity-name
Feat: Implement entity/relation name and description normalization
This commit is contained in:
@@ -16,6 +16,7 @@ from .utils import (
|
|||||||
encode_string_by_tiktoken,
|
encode_string_by_tiktoken,
|
||||||
is_float_regex,
|
is_float_regex,
|
||||||
list_of_list_to_csv,
|
list_of_list_to_csv,
|
||||||
|
normalize_extracted_info,
|
||||||
pack_user_ass_to_openai_messages,
|
pack_user_ass_to_openai_messages,
|
||||||
split_string_by_multi_markers,
|
split_string_by_multi_markers,
|
||||||
truncate_list_by_token_size,
|
truncate_list_by_token_size,
|
||||||
@@ -163,6 +164,9 @@ async def _handle_single_entity_extraction(
|
|||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Normalize entity name
|
||||||
|
entity_name = normalize_extracted_info(entity_name, is_entity=True)
|
||||||
|
|
||||||
# Clean and validate entity type
|
# Clean and validate entity type
|
||||||
entity_type = clean_str(record_attributes[2]).strip('"')
|
entity_type = clean_str(record_attributes[2]).strip('"')
|
||||||
if not entity_type.strip() or entity_type.startswith('("'):
|
if not entity_type.strip() or entity_type.startswith('("'):
|
||||||
@@ -172,7 +176,9 @@ async def _handle_single_entity_extraction(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
# Clean and validate description
|
# Clean and validate description
|
||||||
entity_description = clean_str(record_attributes[3]).strip('"')
|
entity_description = clean_str(record_attributes[3])
|
||||||
|
entity_description = normalize_extracted_info(entity_description)
|
||||||
|
|
||||||
if not entity_description.strip():
|
if not entity_description.strip():
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
|
f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
|
||||||
@@ -196,13 +202,20 @@ async def _handle_single_relationship_extraction(
|
|||||||
if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
|
if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
|
||||||
return None
|
return None
|
||||||
# add this record as edge
|
# add this record as edge
|
||||||
source = clean_str(record_attributes[1]).strip('"')
|
source = clean_str(record_attributes[1])
|
||||||
target = clean_str(record_attributes[2]).strip('"')
|
target = clean_str(record_attributes[2])
|
||||||
edge_description = clean_str(record_attributes[3]).strip('"')
|
|
||||||
edge_keywords = clean_str(record_attributes[4]).strip('"')
|
# Normalize source and target entity names
|
||||||
|
source = normalize_extracted_info(source, is_entity=True)
|
||||||
|
target = normalize_extracted_info(target, is_entity=True)
|
||||||
|
|
||||||
|
edge_description = clean_str(record_attributes[3])
|
||||||
|
edge_description = normalize_extracted_info(edge_description)
|
||||||
|
|
||||||
|
edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'")
|
||||||
edge_source_id = chunk_key
|
edge_source_id = chunk_key
|
||||||
weight = (
|
weight = (
|
||||||
float(record_attributes[-1].strip('"'))
|
float(record_attributes[-1].strip('"').strip("'"))
|
||||||
if is_float_regex(record_attributes[-1])
|
if is_float_regex(record_attributes[-1])
|
||||||
else 1.0
|
else 1.0
|
||||||
)
|
)
|
||||||
|
@@ -1006,6 +1006,50 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
|
|||||||
return content[:max_length] + "..."
|
return content[:max_length] + "..."
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_extracted_info(name: str, is_entity=False) -> str:
    """Normalize an extracted entity/relation name or description string.

    Normalization rules, applied in order:
      1. Chinese (fullwidth) parentheses are replaced with English ones.
      2. Chinese dashes are replaced with the ASCII hyphen.
      3. Whitespace between two Chinese characters, or between a Chinese
         character and an English letter/digit, is removed; spaces inside
         pure English/numeric text are preserved.
      4. English quotation marks are stripped from both ends.
      5. For entity names only (``is_entity=True``): Chinese quote marks
         are deleted entirely, and English quotes adjacent to Chinese
         characters are deleted as well.

    Args:
        name: Raw extracted text to normalize.
        is_entity: When True, apply the extra quote-removal rules used
            for entity names.

    Returns:
        The normalized string.
    """
    # Fullwidth punctuation -> ASCII equivalents (parentheses, dashes).
    for cjk_char, ascii_char in (("(", "("), (")", ")"), ("—", "-"), ("-", "-")):
        name = name.replace(cjk_char, ascii_char)

    # Drop whitespace that sits between Chinese characters, or between
    # Chinese characters and English letters/digits. The lookbehind /
    # lookahead pairs match only the whitespace, leaving the flanking
    # characters untouched.
    for spacing_pattern in (
        r"(?<=[\u4e00-\u9fa5])\s+(?=[\u4e00-\u9fa5])",
        r"(?<=[\u4e00-\u9fa5])\s+(?=[a-zA-Z0-9])",
        r"(?<=[a-zA-Z0-9])\s+(?=[\u4e00-\u9fa5])",
    ):
        name = re.sub(spacing_pattern, "", name)

    # Strip English quotation marks from the ends of the string.
    name = name.strip('"').strip("'")

    if is_entity:
        # Entity names must not carry quote marks at all: delete Chinese
        # quotes everywhere ...
        for chinese_quote in ("“", "”", "‘", "’"):
            name = name.replace(chinese_quote, "")
        # ... and delete English quotes that sit directly before or after
        # a Chinese character.
        for quote_pattern in (
            r"['\"]+(?=[\u4e00-\u9fa5])",
            r"(?<=[\u4e00-\u9fa5])['\"]+",
        ):
            name = re.sub(quote_pattern, "", name)

    return name
|
||||||
|
|
||||||
|
|
||||||
def clean_text(text: str) -> str:
|
def clean_text(text: str) -> str:
|
||||||
"""Clean text by removing null bytes (0x00) and whitespace
|
"""Clean text by removing null bytes (0x00) and whitespace
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user