remove regex
This commit is contained in:
@@ -18,7 +18,6 @@ from .utils import (
|
|||||||
normalize_extracted_info,
|
normalize_extracted_info,
|
||||||
pack_user_ass_to_openai_messages,
|
pack_user_ass_to_openai_messages,
|
||||||
split_string_by_multi_markers,
|
split_string_by_multi_markers,
|
||||||
extract_fixed_parenthesized_content,
|
|
||||||
truncate_list_by_token_size,
|
truncate_list_by_token_size,
|
||||||
process_combine_contexts,
|
process_combine_contexts,
|
||||||
compute_args_hash,
|
compute_args_hash,
|
||||||
@@ -153,7 +152,7 @@ async def _handle_single_entity_extraction(
|
|||||||
chunk_key: str,
|
chunk_key: str,
|
||||||
file_path: str = "unknown_source",
|
file_path: str = "unknown_source",
|
||||||
):
|
):
|
||||||
if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
|
if len(record_attributes) < 4 or '"entity"' not in record_attributes[0]:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Clean and validate entity name
|
# Clean and validate entity name
|
||||||
@@ -199,7 +198,7 @@ async def _handle_single_relationship_extraction(
|
|||||||
chunk_key: str,
|
chunk_key: str,
|
||||||
file_path: str = "unknown_source",
|
file_path: str = "unknown_source",
|
||||||
):
|
):
|
||||||
if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
|
if len(record_attributes) < 5 or '"relationship"' not in record_attributes[0]:
|
||||||
return None
|
return None
|
||||||
# add this record as edge
|
# add this record as edge
|
||||||
source = clean_str(record_attributes[1])
|
source = clean_str(record_attributes[1])
|
||||||
@@ -550,8 +549,6 @@ async def extract_entities(
|
|||||||
[context_base["record_delimiter"], context_base["completion_delimiter"]],
|
[context_base["record_delimiter"], context_base["completion_delimiter"]],
|
||||||
)
|
)
|
||||||
|
|
||||||
records = extract_fixed_parenthesized_content(records)
|
|
||||||
|
|
||||||
for record in records:
|
for record in records:
|
||||||
record = re.search(r"\((.*)\)", record)
|
record = re.search(r"\((.*)\)", record)
|
||||||
if record is None:
|
if record is None:
|
||||||
|
@@ -408,33 +408,6 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]
|
|||||||
return [r.strip() for r in results if r.strip()]
|
return [r.strip() for r in results if r.strip()]
|
||||||
|
|
||||||
|
|
||||||
def extract_fixed_parenthesized_content(records: list[str]) -> list[str]:
|
|
||||||
"""
|
|
||||||
Extract content that should be in parentheses from each record.
|
|
||||||
Ensures each extracted item has both opening and closing parentheses.
|
|
||||||
"""
|
|
||||||
result = []
|
|
||||||
|
|
||||||
for record in records:
|
|
||||||
# First, extract properly matched pairs
|
|
||||||
balanced_matches = re.findall(r"\((.*?)\)", record)
|
|
||||||
for match in balanced_matches:
|
|
||||||
result.append(f"({match})")
|
|
||||||
|
|
||||||
# Process string to handle unbalanced parentheses
|
|
||||||
# For opening without closing
|
|
||||||
open_matches = re.findall(r"\(([^()]*?)$", record)
|
|
||||||
for match in open_matches:
|
|
||||||
result.append(f"({match})")
|
|
||||||
|
|
||||||
# For closing without opening
|
|
||||||
close_matches = re.findall(r"^([^()]*?)\)", record)
|
|
||||||
for match in close_matches:
|
|
||||||
result.append(f"({match})")
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
# Refer the utils functions of the official GraphRAG implementation:
|
# Refer the utils functions of the official GraphRAG implementation:
|
||||||
# https://github.com/microsoft/graphrag
|
# https://github.com/microsoft/graphrag
|
||||||
def clean_str(input: Any) -> str:
|
def clean_str(input: Any) -> str:
|
||||||
|
Reference in New Issue
Block a user