From 0eed5eb7185bf326d91c571d1487aeb9bac74a8b Mon Sep 17 00:00:00 2001
From: yangdx <yangdx@znipower.com>
Date: Sat, 12 Apr 2025 19:26:02 +0800
Subject: [PATCH 1/3] feat: implement entity/relation name and description
 normalization

- Remove spaces between Chinese characters
- Remove spaces between Chinese and English/numbers
- Preserve spaces within English text and numbers
- Replace Chinese parentheses with English parentheses
- Replace Chinese dash with English dash
---
 lightrag/operate.py | 13 +++++++++++++
 lightrag/utils.py   | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index 02d9c85e..ceec7b61 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -16,6 +16,7 @@ from .utils import (
     encode_string_by_tiktoken,
     is_float_regex,
     list_of_list_to_csv,
+    normalize_extracted_info,
     pack_user_ass_to_openai_messages,
     split_string_by_multi_markers,
     truncate_list_by_token_size,
@@ -163,6 +164,9 @@ async def _handle_single_entity_extraction(
         )
         return None
 
+    # Normalize entity name
+    entity_name = normalize_extracted_info(entity_name)
+
     # Clean and validate entity type
     entity_type = clean_str(record_attributes[2]).strip('"')
     if not entity_type.strip() or entity_type.startswith('("'):
@@ -173,6 +177,8 @@ async def _handle_single_entity_extraction(
 
     # Clean and validate description
     entity_description = clean_str(record_attributes[3]).strip('"')
+    entity_description = normalize_extracted_info(entity_description)
+
     if not entity_description.strip():
         logger.warning(
             f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
@@ -198,7 +204,14 @@ async def _handle_single_relationship_extraction(
     # add this record as edge
     source = clean_str(record_attributes[1]).strip('"')
     target = clean_str(record_attributes[2]).strip('"')
+
+    # Normalize source and target entity names
+    source = normalize_extracted_info(source)
+    target = normalize_extracted_info(target)
+
     edge_description = clean_str(record_attributes[3]).strip('"')
+    edge_description = normalize_extracted_info(edge_description)
+
     edge_keywords = clean_str(record_attributes[4]).strip('"')
     edge_source_id = chunk_key
     weight = (
diff --git a/lightrag/utils.py b/lightrag/utils.py
index fd188498..6b9b07fa 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -1006,6 +1006,43 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
     return content[:max_length] + "..."
 
 
+def normalize_extracted_info(name: str) -> str:
+    """Normalize entity/relation names and description with the following rules:
+    1. Remove spaces between Chinese characters
+    2. Remove spaces between Chinese characters and English letters/numbers
+    3. Preserve spaces within English text and numbers
+    4. Replace Chinese parentheses with English parentheses
+    5. Replace Chinese dash with English dash
+
+    Args:
+        name: Entity/relation name or description text to normalize
+
+    Returns:
+        The normalized name or description string
+    """
+    # Replace Chinese parentheses with English parentheses
+    name = name.replace("（", "(").replace("）", ")")
+
+    # Replace Chinese dash with English dash
+    name = name.replace("—", "-").replace("－", "-")
+
+    # Use regex to remove spaces between Chinese characters
+    # Regex explanation:
+    # (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character
+    # \s+: One or more whitespace characters
+    # (?=[\u4e00-\u9fa5]): Positive lookahead for Chinese character
+    name = re.sub(r"(?<=[\u4e00-\u9fa5])\s+(?=[\u4e00-\u9fa5])", "", name)
+
+    # Remove spaces between Chinese and English/numbers
+    name = re.sub(r"(?<=[\u4e00-\u9fa5])\s+(?=[a-zA-Z0-9])", "", name)
+    name = re.sub(r"(?<=[a-zA-Z0-9])\s+(?=[\u4e00-\u9fa5])", "", name)
+
+    # Remove English quotation marks from the beginning and end
+    name = name.strip('"').strip("'")
+
+    return name
+
+
 def clean_text(text: str) -> str:
     """Clean text by removing null bytes (0x00) and whitespace
 

From 2ac66c3531f95c380816a32f845a23e864fb6c43 Mon Sep 17 00:00:00 2001
From: yangdx <yangdx@znipower.com>
Date: Sat, 12 Apr 2025 20:45:41 +0800
Subject: [PATCH 2/3] Remove Chinese quotes in entity name

---
 lightrag/operate.py | 18 +++++++++---------
 lightrag/utils.py   |  9 ++++++++-
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index ceec7b61..8c6688aa 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -165,7 +165,7 @@ async def _handle_single_entity_extraction(
         return None
 
     # Normalize entity name
-    entity_name = normalize_extracted_info(entity_name)
+    entity_name = normalize_extracted_info(entity_name, is_entity=True)
 
     # Clean and validate entity type
     entity_type = clean_str(record_attributes[2]).strip('"')
@@ -176,7 +176,7 @@ async def _handle_single_entity_extraction(
         return None
 
     # Clean and validate description
-    entity_description = clean_str(record_attributes[3]).strip('"')
+    entity_description = clean_str(record_attributes[3])
     entity_description = normalize_extracted_info(entity_description)
 
     if not entity_description.strip():
@@ -202,20 +202,20 @@ async def _handle_single_relationship_extraction(
     if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
         return None
     # add this record as edge
-    source = clean_str(record_attributes[1]).strip('"')
-    target = clean_str(record_attributes[2]).strip('"')
+    source = clean_str(record_attributes[1])
+    target = clean_str(record_attributes[2])
 
     # Normalize source and target entity names
-    source = normalize_extracted_info(source)
-    target = normalize_extracted_info(target)
+    source = normalize_extracted_info(source, is_entity=True)
+    target = normalize_extracted_info(target, is_entity=True)
 
-    edge_description = clean_str(record_attributes[3]).strip('"')
+    edge_description = clean_str(record_attributes[3])
     edge_description = normalize_extracted_info(edge_description)
 
-    edge_keywords = clean_str(record_attributes[4]).strip('"')
+    edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'")
     edge_source_id = chunk_key
     weight = (
-        float(record_attributes[-1].strip('"'))
+        float(record_attributes[-1].strip('"').strip("'"))
         if is_float_regex(record_attributes[-1])
         else 1.0
     )
diff --git a/lightrag/utils.py b/lightrag/utils.py
index 6b9b07fa..8473ea81 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -1006,7 +1006,7 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
     return content[:max_length] + "..."
 
 
-def normalize_extracted_info(name: str) -> str:
+def normalize_extracted_info(name: str, is_entity = False) -> str:
     """Normalize entity/relation names and description with the following rules:
     1. Remove spaces between Chinese characters
     2. Remove spaces between Chinese characters and English letters/numbers
@@ -1040,6 +1040,13 @@ def normalize_extracted_info(name: str) -> str:
     # Remove English quotation marks from the beginning and end
     name = name.strip('"').strip("'")
 
+    if is_entity:
+        # remove Chinese quotes
+        name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
+        # remove English quotes in and around Chinese
+        name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
+        name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
+
     return name
 
 

From 6174554c5835e1ff0e31246aa419d33dab87fd1c Mon Sep 17 00:00:00 2001
From: yangdx <yangdx@znipower.com>
Date: Sat, 12 Apr 2025 20:50:21 +0800
Subject: [PATCH 3/3] Fix linting

---
 lightrag/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 8473ea81..43d82196 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -1006,7 +1006,7 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
     return content[:max_length] + "..."
 
 
-def normalize_extracted_info(name: str, is_entity = False) -> str:
+def normalize_extracted_info(name: str, is_entity=False) -> str:
     """Normalize entity/relation names and description with the following rules:
     1. Remove spaces between Chinese characters
     2. Remove spaces between Chinese characters and English letters/numbers