remove regex

This commit is contained in:
tackhwa
2025-04-22 15:22:37 +08:00
parent 9e1be12e4a
commit 2e186ba488
2 changed files with 2 additions and 32 deletions

View File

@@ -18,7 +18,6 @@ from .utils import (
normalize_extracted_info,
pack_user_ass_to_openai_messages,
split_string_by_multi_markers,
extract_fixed_parenthesized_content,
truncate_list_by_token_size,
process_combine_contexts,
compute_args_hash,
@@ -153,7 +152,7 @@ async def _handle_single_entity_extraction(
chunk_key: str,
file_path: str = "unknown_source",
):
if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
if len(record_attributes) < 4 or '"entity"' not in record_attributes[0]:
return None
# Clean and validate entity name
@@ -199,7 +198,7 @@ async def _handle_single_relationship_extraction(
chunk_key: str,
file_path: str = "unknown_source",
):
if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
if len(record_attributes) < 5 or '"relationship"' not in record_attributes[0]:
return None
# add this record as edge
source = clean_str(record_attributes[1])
@@ -550,8 +549,6 @@ async def extract_entities(
[context_base["record_delimiter"], context_base["completion_delimiter"]],
)
records = extract_fixed_parenthesized_content(records)
for record in records:
record = re.search(r"\((.*)\)", record)
if record is None:

View File

@@ -408,33 +408,6 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]
return [r.strip() for r in results if r.strip()]
def extract_fixed_parenthesized_content(records: list[str]) -> list[str]:
    """
    Extract parenthesized fragments from each record, repairing fragments
    that are missing one of their parentheses.

    For every record, three kinds of fragments are collected, in this order:

    1. Balanced pairs ``( ... )``. Matching is non-greedy, so a pair ends at
       the first ``)`` after its ``(``; ``.`` does not match a newline, so a
       pair never spans lines.
    2. A trailing fragment that has ``(`` but no ``)`` before the end of the
       record.
    3. A leading fragment that has ``)`` but no ``(`` before it.

    Every fragment is emitted re-wrapped as ``"(<content>)"``, so each
    returned item has both an opening and a closing parenthesis.

    Args:
        records: Raw record strings to scan.

    Returns:
        A flat list of ``"(<content>)"`` strings. Records are processed in
        input order; within a record the order is balanced, then
        open-without-close, then close-without-open. Records containing no
        parenthesis contribute nothing.
    """
    # The tuple order defines the per-record output order.
    patterns = (
        r"\((.*?)\)",  # properly matched pair
        r"\(([^()]*?)$",  # opening parenthesis without a closing one
        r"^([^()]*?)\)",  # closing parenthesis without an opening one
    )

    result: list[str] = []
    for record in records:
        for pattern in patterns:
            # Each pattern has a single capture group, so findall yields the
            # inner content only; re-wrap it in a balanced pair.
            result.extend(f"({inner})" for inner in re.findall(pattern, record))
    return result
# Refer to the utility functions of the official GraphRAG implementation:
# https://github.com/microsoft/graphrag
def clean_str(input: Any) -> str: