friendly implementation of entity extraction and relationship weight extract for Low-Capability LLMs
This commit is contained in:
@@ -18,6 +18,7 @@ from .utils import (
|
||||
normalize_extracted_info,
|
||||
pack_user_ass_to_openai_messages,
|
||||
split_string_by_multi_markers,
|
||||
extract_fixed_parenthesized_content,
|
||||
truncate_list_by_token_size,
|
||||
process_combine_contexts,
|
||||
compute_args_hash,
|
||||
@@ -215,7 +216,7 @@ async def _handle_single_relationship_extraction(
|
||||
edge_source_id = chunk_key
|
||||
weight = (
|
||||
float(record_attributes[-1].strip('"').strip("'"))
|
||||
if is_float_regex(record_attributes[-1])
|
||||
if is_float_regex(record_attributes[-1].strip('"').strip("'"))
|
||||
else 1.0
|
||||
)
|
||||
return dict(
|
||||
@@ -549,6 +550,8 @@ async def extract_entities(
|
||||
[context_base["record_delimiter"], context_base["completion_delimiter"]],
|
||||
)
|
||||
|
||||
records = extract_fixed_parenthesized_content(records)
|
||||
|
||||
for record in records:
|
||||
record = re.search(r"\((.*)\)", record)
|
||||
if record is None:
|
||||
|
@@ -408,6 +408,33 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]
|
||||
return [r.strip() for r in results if r.strip()]
|
||||
|
||||
|
||||
def extract_fixed_parenthesized_content(records: list[str]) -> list[str]:
|
||||
"""
|
||||
Extract content that should be in parentheses from each record.
|
||||
Ensures each extracted item has both opening and closing parentheses.
|
||||
"""
|
||||
result = []
|
||||
|
||||
for record in records:
|
||||
# First, extract properly matched pairs
|
||||
balanced_matches = re.findall(r'\((.*?)\)', record)
|
||||
for match in balanced_matches:
|
||||
result.append(f"({match})")
|
||||
|
||||
# Process string to handle unbalanced parentheses
|
||||
# For opening without closing
|
||||
open_matches = re.findall(r'\(([^()]*?)$', record)
|
||||
for match in open_matches:
|
||||
result.append(f"({match})")
|
||||
|
||||
# For closing without opening
|
||||
close_matches = re.findall(r'^([^()]*?)\)', record)
|
||||
for match in close_matches:
|
||||
result.append(f"({match})")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# Refer the utils functions of the official GraphRAG implementation:
|
||||
# https://github.com/microsoft/graphrag
|
||||
def clean_str(input: Any) -> str:
|
||||
|
Reference in New Issue
Block a user