From 0eed5eb7185bf326d91c571d1487aeb9bac74a8b Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 12 Apr 2025 19:26:02 +0800 Subject: [PATCH 1/3] feat: implement entity/relation name and description normalization - Remove spaces between Chinese characters - Remove spaces between Chinese and English/numbers - Preserve spaces within English text and numbers - Replace Chinese parentheses with English parentheses - Replace Chinese dash with English dash --- lightrag/operate.py | 13 +++++++++++++ lightrag/utils.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/lightrag/operate.py b/lightrag/operate.py index 02d9c85e..ceec7b61 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -16,6 +16,7 @@ from .utils import ( encode_string_by_tiktoken, is_float_regex, list_of_list_to_csv, + normalize_extracted_info, pack_user_ass_to_openai_messages, split_string_by_multi_markers, truncate_list_by_token_size, @@ -163,6 +164,9 @@ async def _handle_single_entity_extraction( ) return None + # Normalize entity name + entity_name = normalize_extracted_info(entity_name) + # Clean and validate entity type entity_type = clean_str(record_attributes[2]).strip('"') if not entity_type.strip() or entity_type.startswith('("'): @@ -173,6 +177,8 @@ async def _handle_single_entity_extraction( # Clean and validate description entity_description = clean_str(record_attributes[3]).strip('"') + entity_description = normalize_extracted_info(entity_description) + if not entity_description.strip(): logger.warning( f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'" @@ -198,7 +204,14 @@ async def _handle_single_relationship_extraction( # add this record as edge source = clean_str(record_attributes[1]).strip('"') target = clean_str(record_attributes[2]).strip('"') + + # Normalize source and target entity names + source = normalize_extracted_info(source) + target = normalize_extracted_info(target) + edge_description = 
clean_str(record_attributes[3]).strip('"') + edge_description = normalize_extracted_info(edge_description) + edge_keywords = clean_str(record_attributes[4]).strip('"') edge_source_id = chunk_key weight = ( diff --git a/lightrag/utils.py b/lightrag/utils.py index fd188498..6b9b07fa 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -1006,6 +1006,43 @@ def get_content_summary(content: str, max_length: int = 250) -> str: return content[:max_length] + "..." +def normalize_extracted_info(name: str) -> str: + """Normalize entity/relation names and description with the following rules: + 1. Remove spaces between Chinese characters + 2. Remove spaces between Chinese characters and English letters/numbers + 3. Preserve spaces within English text and numbers + 4. Replace Chinese parentheses with English parentheses + 5. Replace Chinese dash with English dash + + Args: + name: Entity name to normalize + + Returns: + Normalized entity name + """ + # Replace Chinese parentheses with English parentheses + name = name.replace("（", "(").replace("）", ")") + + # Replace Chinese dash with English dash + name = name.replace("—", "-").replace("－", "-") + + # Use regex to remove spaces between Chinese characters + # Regex explanation: + # (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character + # \s+: One or more whitespace characters + # (?=[\u4e00-\u9fa5]): Positive lookahead for Chinese character + name = re.sub(r"(?<=[\u4e00-\u9fa5])\s+(?=[\u4e00-\u9fa5])", "", name) + + # Remove spaces between Chinese and English/numbers + name = re.sub(r"(?<=[\u4e00-\u9fa5])\s+(?=[a-zA-Z0-9])", "", name) + name = re.sub(r"(?<=[a-zA-Z0-9])\s+(?=[\u4e00-\u9fa5])", "", name) + + # Remove English quotation marks from the beginning and end + name = name.strip('"').strip("'") + + return name + + def clean_text(text: str) -> str: """Clean text by removing null bytes (0x00) and whitespace From 2ac66c3531f95c380816a32f845a23e864fb6c43 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 12 Apr 
2025 20:45:41 +0800 Subject: [PATCH 2/3] Remove chinese quotes in entity name --- lightrag/operate.py | 18 +++++++++--------- lightrag/utils.py | 9 ++++++++- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index ceec7b61..8c6688aa 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -165,7 +165,7 @@ async def _handle_single_entity_extraction( return None # Normalize entity name - entity_name = normalize_extracted_info(entity_name) + entity_name = normalize_extracted_info(entity_name, is_entity=True) # Clean and validate entity type entity_type = clean_str(record_attributes[2]).strip('"') @@ -176,7 +176,7 @@ async def _handle_single_entity_extraction( return None # Clean and validate description - entity_description = clean_str(record_attributes[3]).strip('"') + entity_description = clean_str(record_attributes[3]) entity_description = normalize_extracted_info(entity_description) if not entity_description.strip(): @@ -202,20 +202,20 @@ async def _handle_single_relationship_extraction( if len(record_attributes) < 5 or record_attributes[0] != '"relationship"': return None # add this record as edge - source = clean_str(record_attributes[1]).strip('"') - target = clean_str(record_attributes[2]).strip('"') + source = clean_str(record_attributes[1]) + target = clean_str(record_attributes[2]) # Normalize source and target entity names - source = normalize_extracted_info(source) - target = normalize_extracted_info(target) + source = normalize_extracted_info(source, is_entity=True) + target = normalize_extracted_info(target, is_entity=True) - edge_description = clean_str(record_attributes[3]).strip('"') + edge_description = clean_str(record_attributes[3]) edge_description = normalize_extracted_info(edge_description) - edge_keywords = clean_str(record_attributes[4]).strip('"') + edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'") edge_source_id = chunk_key weight = ( - 
float(record_attributes[-1].strip('"')) + float(record_attributes[-1].strip('"').strip("'")) if is_float_regex(record_attributes[-1]) else 1.0 ) diff --git a/lightrag/utils.py b/lightrag/utils.py index 6b9b07fa..8473ea81 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -1006,7 +1006,7 @@ def get_content_summary(content: str, max_length: int = 250) -> str: return content[:max_length] + "..." -def normalize_extracted_info(name: str) -> str: +def normalize_extracted_info(name: str, is_entity = False) -> str: """Normalize entity/relation names and description with the following rules: 1. Remove spaces between Chinese characters 2. Remove spaces between Chinese characters and English letters/numbers @@ -1040,6 +1040,13 @@ def normalize_extracted_info(name: str) -> str: # Remove English quotation marks from the beginning and end name = name.strip('"').strip("'") + if is_entity: + # remove Chinese quotes + name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "") + # remove English quotes in and around Chinese + name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name) + name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name) + return name From 6174554c5835e1ff0e31246aa419d33dab87fd1c Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 12 Apr 2025 20:50:21 +0800 Subject: [PATCH 3/3] Fix linting --- lightrag/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 8473ea81..43d82196 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -1006,7 +1006,7 @@ def get_content_summary(content: str, max_length: int = 250) -> str: return content[:max_length] + "..." -def normalize_extracted_info(name: str, is_entity = False) -> str: +def normalize_extracted_info(name: str, is_entity=False) -> str: """Normalize entity/relation names and description with the following rules: 1. Remove spaces between Chinese characters 2. Remove spaces between Chinese characters and English letters/numbers