From f3c57b606ede1c87deb26ef836bb8b8042a7223e Mon Sep 17 00:00:00 2001 From: tackhwa <55059307+tackhwa@users.noreply.github.com> Date: Mon, 21 Apr 2025 16:52:13 +0800 Subject: [PATCH 1/3] friendly implementation of entity extraction and relationship weight extract for Low-Capability LLMs --- lightrag/operate.py | 5 ++++- lightrag/utils.py | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 73d559e6..7cc3105b 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -18,6 +18,7 @@ from .utils import ( normalize_extracted_info, pack_user_ass_to_openai_messages, split_string_by_multi_markers, + extract_fixed_parenthesized_content, truncate_list_by_token_size, process_combine_contexts, compute_args_hash, @@ -215,7 +216,7 @@ async def _handle_single_relationship_extraction( edge_source_id = chunk_key weight = ( float(record_attributes[-1].strip('"').strip("'")) - if is_float_regex(record_attributes[-1]) + if is_float_regex(record_attributes[-1].strip('"').strip("'")) else 1.0 ) return dict( @@ -549,6 +550,8 @@ async def extract_entities( [context_base["record_delimiter"], context_base["completion_delimiter"]], ) + records = extract_fixed_parenthesized_content(records) + for record in records: record = re.search(r"\((.*)\)", record) if record is None: diff --git a/lightrag/utils.py b/lightrag/utils.py index c6991629..165d7106 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -408,6 +408,33 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str] return [r.strip() for r in results if r.strip()] +def extract_fixed_parenthesized_content(records: list[str]) -> list[str]: + """ + Extract content that should be in parentheses from each record. + Ensures each extracted item has both opening and closing parentheses. + """ + result = [] + + for record in records: + # First, extract properly matched pairs + balanced_matches = re.findall(r'\((.*?)\)', record) + for match in balanced_matches: + result.append(f"({match})") + + # Process string to handle unbalanced parentheses + # For opening without closing + open_matches = re.findall(r'\(([^()]*?)$', record) + for match in open_matches: + result.append(f"({match})") + + # For closing without opening + close_matches = re.findall(r'^([^()]*?)\)', record) + for match in close_matches: + result.append(f"({match})") + + return result + + # Refer the utils functions of the official GraphRAG implementation: # https://github.com/microsoft/graphrag def clean_str(input: Any) -> str: From 9e1be12e4acb41e87d43b3f03f18d22d1f5cc510 Mon Sep 17 00:00:00 2001 From: tackhwa <55059307+tackhwa@users.noreply.github.com> Date: Mon, 21 Apr 2025 17:07:34 +0800 Subject: [PATCH 2/3] fix lint --- lightrag/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 165d7106..78ee6472 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -417,21 +417,21 @@ def extract_fixed_parenthesized_content(records: list[str]) -> list[str]: for record in records: # First, extract properly matched pairs - balanced_matches = re.findall(r'\((.*?)\)', record) + balanced_matches = re.findall(r"\((.*?)\)", record) for match in balanced_matches: result.append(f"({match})") - + # Process string to handle unbalanced parentheses # For opening without closing - open_matches = re.findall(r'\(([^()]*?)$', record) + open_matches = re.findall(r"\(([^()]*?)$", record) for match in open_matches: result.append(f"({match})") - + # For closing without opening - close_matches = re.findall(r'^([^()]*?)\)', record) + close_matches = re.findall(r"^([^()]*?)\)", record) for match in close_matches: result.append(f"({match})") - + return result From 2e186ba4885355e81adc42b9f0a17163b4dc11c7 Mon Sep 17 00:00:00 2001 From: tackhwa <55059307+tackhwa@users.noreply.github.com> Date: Tue, 22 Apr 2025 15:22:37 +0800 Subject: [PATCH 3/3] remove regex --- lightrag/operate.py | 7 ++----- lightrag/utils.py | 27 --------------------------- 2 files changed, 2 insertions(+), 32 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 7cc3105b..b746dfbb 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -18,7 +18,6 @@ from .utils import ( normalize_extracted_info, pack_user_ass_to_openai_messages, split_string_by_multi_markers, - extract_fixed_parenthesized_content, truncate_list_by_token_size, process_combine_contexts, compute_args_hash, @@ -153,7 +152,7 @@ async def _handle_single_entity_extraction( chunk_key: str, file_path: str = "unknown_source", ): - if len(record_attributes) < 4 or record_attributes[0] != '"entity"': + if len(record_attributes) < 4 or '"entity"' not in record_attributes[0]: return None # Clean and validate entity name @@ -199,7 +198,7 @@ async def _handle_single_relationship_extraction( chunk_key: str, file_path: str = "unknown_source", ): - if len(record_attributes) < 5 or record_attributes[0] != '"relationship"': + if len(record_attributes) < 5 or '"relationship"' not in record_attributes[0]: return None # add this record as edge source = clean_str(record_attributes[1]) @@ -550,8 +549,6 @@ async def extract_entities( [context_base["record_delimiter"], context_base["completion_delimiter"]], ) - records = extract_fixed_parenthesized_content(records) - for record in records: record = re.search(r"\((.*)\)", record) if record is None: diff --git a/lightrag/utils.py b/lightrag/utils.py index 78ee6472..c6991629 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -408,33 +408,6 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str] return [r.strip() for r in results if r.strip()] -def extract_fixed_parenthesized_content(records: list[str]) -> list[str]: - """ - Extract content that should be in parentheses from each record. - Ensures each extracted item has both opening and closing parentheses. - """ - result = [] - - for record in records: - # First, extract properly matched pairs - balanced_matches = re.findall(r"\((.*?)\)", record) - for match in balanced_matches: - result.append(f"({match})") - - # Process string to handle unbalanced parentheses - # For opening without closing - open_matches = re.findall(r"\(([^()]*?)$", record) - for match in open_matches: - result.append(f"({match})") - - # For closing without opening - close_matches = re.findall(r"^([^()]*?)\)", record) - for match in close_matches: - result.append(f"({match})") - - return result - - # Refer the utils functions of the official GraphRAG implementation: # https://github.com/microsoft/graphrag def clean_str(input: Any) -> str: