remove regex
This commit is contained in:
@@ -18,7 +18,6 @@ from .utils import (
|
||||
normalize_extracted_info,
|
||||
pack_user_ass_to_openai_messages,
|
||||
split_string_by_multi_markers,
|
||||
extract_fixed_parenthesized_content,
|
||||
truncate_list_by_token_size,
|
||||
process_combine_contexts,
|
||||
compute_args_hash,
|
||||
@@ -153,7 +152,7 @@ async def _handle_single_entity_extraction(
|
||||
chunk_key: str,
|
||||
file_path: str = "unknown_source",
|
||||
):
|
||||
if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
|
||||
if len(record_attributes) < 4 or '"entity"' not in record_attributes[0]:
|
||||
return None
|
||||
|
||||
# Clean and validate entity name
|
||||
@@ -199,7 +198,7 @@ async def _handle_single_relationship_extraction(
|
||||
chunk_key: str,
|
||||
file_path: str = "unknown_source",
|
||||
):
|
||||
if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
|
||||
if len(record_attributes) < 5 or '"relationship"' not in record_attributes[0]:
|
||||
return None
|
||||
# add this record as edge
|
||||
source = clean_str(record_attributes[1])
|
||||
@@ -550,8 +549,6 @@ async def extract_entities(
|
||||
[context_base["record_delimiter"], context_base["completion_delimiter"]],
|
||||
)
|
||||
|
||||
records = extract_fixed_parenthesized_content(records)
|
||||
|
||||
for record in records:
|
||||
record = re.search(r"\((.*)\)", record)
|
||||
if record is None:
|
||||
|
@@ -408,33 +408,6 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]
|
||||
return [r.strip() for r in results if r.strip()]
|
||||
|
||||
|
||||
def extract_fixed_parenthesized_content(records: list[str]) -> list[str]:
    """Extract parenthesized groups from each record, repairing unbalanced ones.

    Every record is scanned once while tracking parenthesis depth, so a group
    is closed only by the ``)`` that matches its opening ``(``. Compared with
    a non-greedy regex such as ``\\((.*?)\\)`` this keeps groups intact when
    their content itself contains parentheses (for example
    ``("entity"<|>"WHO (World Health Organization)"<|>...)``) or spans
    several lines.

    For each record the extracted items are emitted in this order:

    1. every balanced top-level group, in textual order;
    2. a group that is opened but still unclosed at the end of the record,
       with a ``)`` appended unless the text already ends with one;
    3. text that precedes a ``)`` when that ``)`` is the first parenthesis in
       the record (closing without opening), wrapped in ``(`` and ``)``.

    Records without nested parentheses or embedded newlines produce the same
    items, in the same order, as the previous regex-based implementation.

    Args:
        records: Raw record strings to scan.

    Returns:
        A flat list of extracted items; each starts with ``(`` and ends
        with ``)``. Records containing no parentheses contribute nothing.
    """
    result: list[str] = []

    for record in records:
        balanced: list[str] = []
        unclosed: list[str] = []
        unopened: list[str] = []
        depth = 0
        start = 0
        seen_paren = False

        for index, char in enumerate(record):
            if char == "(":
                if depth == 0:
                    # Remember where the current top-level group begins.
                    start = index
                depth += 1
                seen_paren = True
            elif char == ")":
                if depth > 0:
                    depth -= 1
                    if depth == 0:
                        balanced.append(record[start : index + 1])
                elif not seen_paren:
                    # Closing without opening: only repaired when no other
                    # parenthesis precedes it; later stray ")" are ignored.
                    unopened.append(f"({record[:index]})")
                seen_paren = True

        if depth > 0:
            # Opening without closing: the group runs to the end of the
            # record. If it already ends with ")" (e.g. a lone "(" inside
            # the content kept the depth above zero) adding another one
            # would only inject a stray character into the last field.
            tail = record[start:]
            unclosed.append(tail if tail.endswith(")") else f"{tail})")

        result.extend(balanced)
        result.extend(unclosed)
        result.extend(unopened)

    return result
|
||||
|
||||
|
||||
# Refer the utils functions of the official GraphRAG implementation:
|
||||
# https://github.com/microsoft/graphrag
|
||||
def clean_str(input: Any) -> str:
|
||||
|
Reference in New Issue
Block a user