From f3c57b606ede1c87deb26ef836bb8b8042a7223e Mon Sep 17 00:00:00 2001
From: tackhwa <55059307+tackhwa@users.noreply.github.com>
Date: Mon, 21 Apr 2025 16:52:13 +0800
Subject: [PATCH 1/3] Make entity extraction and relationship weight
 extraction more tolerant of low-capability LLM output

---
 lightrag/operate.py |  5 ++++-
 lightrag/utils.py   | 27 +++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index 73d559e6..7cc3105b 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -18,6 +18,7 @@ from .utils import (
     normalize_extracted_info,
     pack_user_ass_to_openai_messages,
     split_string_by_multi_markers,
+    extract_fixed_parenthesized_content,
     truncate_list_by_token_size,
     process_combine_contexts,
     compute_args_hash,
@@ -215,7 +216,7 @@ async def _handle_single_relationship_extraction(
     edge_source_id = chunk_key
     weight = (
         float(record_attributes[-1].strip('"').strip("'"))
-        if is_float_regex(record_attributes[-1])
+        if is_float_regex(record_attributes[-1].strip('"').strip("'"))
         else 1.0
     )
     return dict(
@@ -549,6 +550,8 @@ async def extract_entities(
             [context_base["record_delimiter"], context_base["completion_delimiter"]],
         )
 
+        records = extract_fixed_parenthesized_content(records)
+
         for record in records:
             record = re.search(r"\((.*)\)", record)
             if record is None:
diff --git a/lightrag/utils.py b/lightrag/utils.py
index c6991629..165d7106 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -408,6 +408,33 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]
     return [r.strip() for r in results if r.strip()]
 
 
+def extract_fixed_parenthesized_content(records: list[str]) -> list[str]:
+    """
+    Extract content that should be in parentheses from each record.
+    Ensures each extracted item has both opening and closing parentheses.
+    """
+    result = []
+
+    for record in records:
+        # First, extract properly matched pairs
+        balanced_matches = re.findall(r'\((.*?)\)', record)
+        for match in balanced_matches:
+            result.append(f"({match})")
+        
+        # Process string to handle unbalanced parentheses
+        # For opening without closing
+        open_matches = re.findall(r'\(([^()]*?)$', record)
+        for match in open_matches:
+            result.append(f"({match})")
+        
+        # For closing without opening
+        close_matches = re.findall(r'^([^()]*?)\)', record)
+        for match in close_matches:
+            result.append(f"({match})")
+    
+    return result
+
+
 # Refer the utils functions of the official GraphRAG implementation:
 # https://github.com/microsoft/graphrag
 def clean_str(input: Any) -> str:

From 9e1be12e4acb41e87d43b3f03f18d22d1f5cc510 Mon Sep 17 00:00:00 2001
From: tackhwa <55059307+tackhwa@users.noreply.github.com>
Date: Mon, 21 Apr 2025 17:07:34 +0800
Subject: [PATCH 2/3] fix lint

---
 lightrag/utils.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 165d7106..78ee6472 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -417,21 +417,21 @@ def extract_fixed_parenthesized_content(records: list[str]) -> list[str]:
 
     for record in records:
         # First, extract properly matched pairs
-        balanced_matches = re.findall(r'\((.*?)\)', record)
+        balanced_matches = re.findall(r"\((.*?)\)", record)
         for match in balanced_matches:
             result.append(f"({match})")
-        
+
         # Process string to handle unbalanced parentheses
         # For opening without closing
-        open_matches = re.findall(r'\(([^()]*?)$', record)
+        open_matches = re.findall(r"\(([^()]*?)$", record)
         for match in open_matches:
             result.append(f"({match})")
-        
+
         # For closing without opening
-        close_matches = re.findall(r'^([^()]*?)\)', record)
+        close_matches = re.findall(r"^([^()]*?)\)", record)
         for match in close_matches:
             result.append(f"({match})")
-    
+
     return result
 
 

From 2e186ba4885355e81adc42b9f0a17163b4dc11c7 Mon Sep 17 00:00:00 2001
From: tackhwa <55059307+tackhwa@users.noreply.github.com>
Date: Tue, 22 Apr 2025 15:22:37 +0800
Subject: [PATCH 3/3] remove regex

---
 lightrag/operate.py |  7 ++-----
 lightrag/utils.py   | 27 ---------------------------
 2 files changed, 2 insertions(+), 32 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index 7cc3105b..b746dfbb 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -18,7 +18,6 @@ from .utils import (
     normalize_extracted_info,
     pack_user_ass_to_openai_messages,
     split_string_by_multi_markers,
-    extract_fixed_parenthesized_content,
     truncate_list_by_token_size,
     process_combine_contexts,
     compute_args_hash,
@@ -153,7 +152,7 @@ async def _handle_single_entity_extraction(
     chunk_key: str,
     file_path: str = "unknown_source",
 ):
-    if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
+    if len(record_attributes) < 4 or '"entity"' not in record_attributes[0]:
         return None
 
     # Clean and validate entity name
@@ -199,7 +198,7 @@ async def _handle_single_relationship_extraction(
     chunk_key: str,
     file_path: str = "unknown_source",
 ):
-    if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
+    if len(record_attributes) < 5 or '"relationship"' not in record_attributes[0]:
         return None
     # add this record as edge
     source = clean_str(record_attributes[1])
@@ -550,8 +549,6 @@ async def extract_entities(
             [context_base["record_delimiter"], context_base["completion_delimiter"]],
         )
 
-        records = extract_fixed_parenthesized_content(records)
-
         for record in records:
             record = re.search(r"\((.*)\)", record)
             if record is None:
diff --git a/lightrag/utils.py b/lightrag/utils.py
index 78ee6472..c6991629 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -408,33 +408,6 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]
     return [r.strip() for r in results if r.strip()]
 
 
-def extract_fixed_parenthesized_content(records: list[str]) -> list[str]:
-    """
-    Extract content that should be in parentheses from each record.
-    Ensures each extracted item has both opening and closing parentheses.
-    """
-    result = []
-
-    for record in records:
-        # First, extract properly matched pairs
-        balanced_matches = re.findall(r"\((.*?)\)", record)
-        for match in balanced_matches:
-            result.append(f"({match})")
-
-        # Process string to handle unbalanced parentheses
-        # For opening without closing
-        open_matches = re.findall(r"\(([^()]*?)$", record)
-        for match in open_matches:
-            result.append(f"({match})")
-
-        # For closing without opening
-        close_matches = re.findall(r"^([^()]*?)\)", record)
-        for match in close_matches:
-            result.append(f"({match})")
-
-    return result
-
-
 # Refer the utils functions of the official GraphRAG implementation:
 # https://github.com/microsoft/graphrag
 def clean_str(input: Any) -> str: