feat: 增强知识图谱关系的时序性支持
- 为关系和向量数据增加时间戳支持,记录知识获取的时间
- 优化混合查询策略,同时考虑语义相关性和时间顺序
- 增强提示词模板,指导LLM在处理冲突信息时考虑时间因素
This commit is contained in:
@@ -30,6 +30,7 @@ from .base import (
|
|||||||
QueryParam,
|
QueryParam,
|
||||||
)
|
)
|
||||||
from .prompt import GRAPH_FIELD_SEP, PROMPTS
|
from .prompt import GRAPH_FIELD_SEP, PROMPTS
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
def chunking_by_token_size(
|
def chunking_by_token_size(
|
||||||
@@ -128,6 +129,9 @@ async def _handle_single_relationship_extraction(
|
|||||||
description=edge_description,
|
description=edge_description,
|
||||||
keywords=edge_keywords,
|
keywords=edge_keywords,
|
||||||
source_id=edge_source_id,
|
source_id=edge_source_id,
|
||||||
|
metadata={
|
||||||
|
"created_at": time.time()
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -445,6 +449,9 @@ async def extract_entities(
|
|||||||
+ dp["src_id"]
|
+ dp["src_id"]
|
||||||
+ dp["tgt_id"]
|
+ dp["tgt_id"]
|
||||||
+ dp["description"],
|
+ dp["description"],
|
||||||
|
"metadata": {
|
||||||
|
"created_at": dp.get("metadata", {}).get("created_at", time.time())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for dp in all_relationships_data
|
for dp in all_relationships_data
|
||||||
}
|
}
|
||||||
@@ -733,9 +740,13 @@ async def _get_node_data(
|
|||||||
entities_context = list_of_list_to_csv(entites_section_list)
|
entities_context = list_of_list_to_csv(entites_section_list)
|
||||||
|
|
||||||
relations_section_list = [
|
relations_section_list = [
|
||||||
["id", "source", "target", "description", "keywords", "weight", "rank"]
|
["id", "source", "target", "description", "keywords", "weight", "rank", "created_at"]
|
||||||
]
|
]
|
||||||
for i, e in enumerate(use_relations):
|
for i, e in enumerate(use_relations):
|
||||||
|
created_at = e.get("created_at", "未知")
|
||||||
|
# 转换时间戳为可读格式
|
||||||
|
if isinstance(created_at, (int, float)):
|
||||||
|
created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
|
||||||
relations_section_list.append(
|
relations_section_list.append(
|
||||||
[
|
[
|
||||||
i,
|
i,
|
||||||
@@ -745,6 +756,7 @@ async def _get_node_data(
|
|||||||
e["keywords"],
|
e["keywords"],
|
||||||
e["weight"],
|
e["weight"],
|
||||||
e["rank"],
|
e["rank"],
|
||||||
|
created_at
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
relations_context = list_of_list_to_csv(relations_section_list)
|
relations_context = list_of_list_to_csv(relations_section_list)
|
||||||
@@ -882,6 +894,8 @@ async def _get_edge_data(
|
|||||||
if not len(results):
|
if not len(results):
|
||||||
return "", "", ""
|
return "", "", ""
|
||||||
|
|
||||||
|
# 从 KV 存储中获取完整的关系信息
|
||||||
|
edge_ids = [r["id"] for r in results]
|
||||||
edge_datas = await asyncio.gather(
|
edge_datas = await asyncio.gather(
|
||||||
*[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results]
|
*[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results]
|
||||||
)
|
)
|
||||||
@@ -892,7 +906,13 @@ async def _get_edge_data(
|
|||||||
*[knowledge_graph_inst.edge_degree(r["src_id"], r["tgt_id"]) for r in results]
|
*[knowledge_graph_inst.edge_degree(r["src_id"], r["tgt_id"]) for r in results]
|
||||||
)
|
)
|
||||||
edge_datas = [
|
edge_datas = [
|
||||||
{"src_id": k["src_id"], "tgt_id": k["tgt_id"], "rank": d, **v}
|
{
|
||||||
|
"src_id": k["src_id"],
|
||||||
|
"tgt_id": k["tgt_id"],
|
||||||
|
"rank": d,
|
||||||
|
"created_at": k.get("__created_at__", None), # 从 KV 存储中获取时间元数据
|
||||||
|
**v
|
||||||
|
}
|
||||||
for k, v, d in zip(results, edge_datas, edge_degree)
|
for k, v, d in zip(results, edge_datas, edge_degree)
|
||||||
if v is not None
|
if v is not None
|
||||||
]
|
]
|
||||||
@@ -916,9 +936,13 @@ async def _get_edge_data(
|
|||||||
)
|
)
|
||||||
|
|
||||||
relations_section_list = [
|
relations_section_list = [
|
||||||
["id", "source", "target", "description", "keywords", "weight", "rank"]
|
["id", "source", "target", "description", "keywords", "weight", "rank", "created_at"]
|
||||||
]
|
]
|
||||||
for i, e in enumerate(edge_datas):
|
for i, e in enumerate(edge_datas):
|
||||||
|
created_at = e.get("created_at", "未知")
|
||||||
|
# 转换时间戳为可读格式
|
||||||
|
if isinstance(created_at, (int, float)):
|
||||||
|
created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
|
||||||
relations_section_list.append(
|
relations_section_list.append(
|
||||||
[
|
[
|
||||||
i,
|
i,
|
||||||
@@ -928,6 +952,7 @@ async def _get_edge_data(
|
|||||||
e["keywords"],
|
e["keywords"],
|
||||||
e["weight"],
|
e["weight"],
|
||||||
e["rank"],
|
e["rank"],
|
||||||
|
created_at
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
relations_context = list_of_list_to_csv(relations_section_list)
|
relations_context = list_of_list_to_csv(relations_section_list)
|
||||||
@@ -1259,9 +1284,15 @@ async def mix_kg_vector_query(
|
|||||||
chunks_ids = [r["id"] for r in results]
|
chunks_ids = [r["id"] for r in results]
|
||||||
chunks = await text_chunks_db.get_by_ids(chunks_ids)
|
chunks = await text_chunks_db.get_by_ids(chunks_ids)
|
||||||
|
|
||||||
valid_chunks = [
|
valid_chunks = []
|
||||||
chunk for chunk in chunks if chunk is not None and "content" in chunk
|
for chunk, result in zip(chunks, results):
|
||||||
]
|
if chunk is not None and "content" in chunk:
|
||||||
|
# 合并 chunk 内容和时间元数据
|
||||||
|
chunk_with_time = {
|
||||||
|
"content": chunk["content"],
|
||||||
|
"created_at": result.get("created_at", None)
|
||||||
|
}
|
||||||
|
valid_chunks.append(chunk_with_time)
|
||||||
|
|
||||||
if not valid_chunks:
|
if not valid_chunks:
|
||||||
return None
|
return None
|
||||||
@@ -1275,7 +1306,15 @@ async def mix_kg_vector_query(
|
|||||||
if not maybe_trun_chunks:
|
if not maybe_trun_chunks:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return "\n--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks])
|
# 在内容中包含时间信息
|
||||||
|
formatted_chunks = []
|
||||||
|
for c in maybe_trun_chunks:
|
||||||
|
chunk_text = c["content"]
|
||||||
|
if c["created_at"]:
|
||||||
|
chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\n{chunk_text}"
|
||||||
|
formatted_chunks.append(chunk_text)
|
||||||
|
|
||||||
|
return "\n--New Chunk--\n".join(formatted_chunks)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in get_vector_context: {e}")
|
logger.error(f"Error in get_vector_context: {e}")
|
||||||
return None
|
return None
|
||||||
|
@@ -164,6 +164,12 @@ Generate a response of the target length and format that responds to the user's
|
|||||||
If you don't know the answer, just say so. Do not make anything up.
|
If you don't know the answer, just say so. Do not make anything up.
|
||||||
Do not include information where the supporting evidence for it is not provided.
|
Do not include information where the supporting evidence for it is not provided.
|
||||||
|
|
||||||
|
When handling relationships with timestamps:
|
||||||
|
1. Each relationship has a "created_at" timestamp indicating when we acquired this knowledge
|
||||||
|
2. When encountering conflicting relationships, consider both the semantic content and the timestamp
|
||||||
|
3. Don't automatically prefer the most recently created relationships - use judgment based on the context
|
||||||
|
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
|
||||||
|
|
||||||
---Target response length and format---
|
---Target response length and format---
|
||||||
|
|
||||||
{response_type}
|
{response_type}
|
||||||
@@ -172,8 +178,7 @@ Do not include information where the supporting evidence for it is not provided.
|
|||||||
|
|
||||||
{context_data}
|
{context_data}
|
||||||
|
|
||||||
Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
|
Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown."""
|
||||||
"""
|
|
||||||
|
|
||||||
PROMPTS["keywords_extraction"] = """---Role---
|
PROMPTS["keywords_extraction"] = """---Role---
|
||||||
|
|
||||||
@@ -250,6 +255,12 @@ Generate a response of the target length and format that responds to the user's
|
|||||||
If you don't know the answer, just say so. Do not make anything up.
|
If you don't know the answer, just say so. Do not make anything up.
|
||||||
Do not include information where the supporting evidence for it is not provided.
|
Do not include information where the supporting evidence for it is not provided.
|
||||||
|
|
||||||
|
When handling content with timestamps:
|
||||||
|
1. Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge
|
||||||
|
2. When encountering conflicting information, consider both the content and the timestamp
|
||||||
|
3. Don't automatically prefer the most recent content - use judgment based on the context
|
||||||
|
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
|
||||||
|
|
||||||
---Target response length and format---
|
---Target response length and format---
|
||||||
|
|
||||||
{response_type}
|
{response_type}
|
||||||
@@ -293,6 +304,12 @@ You are a professional assistant responsible for answering questions based on kn
|
|||||||
|
|
||||||
Generate a concise response that summarizes relevant points from the provided information. If you don't know the answer, just say so. Do not make anything up or include information where the supporting evidence is not provided.
|
Generate a concise response that summarizes relevant points from the provided information. If you don't know the answer, just say so. Do not make anything up or include information where the supporting evidence is not provided.
|
||||||
|
|
||||||
|
When handling information with timestamps:
|
||||||
|
1. Each piece of information (both relationships and content) has a "created_at" timestamp indicating when we acquired this knowledge
|
||||||
|
2. When encountering conflicting information, consider both the content/relationship and the timestamp
|
||||||
|
3. Don't automatically prefer the most recent information - use judgment based on the context
|
||||||
|
4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
|
||||||
|
|
||||||
---Data Sources---
|
---Data Sources---
|
||||||
|
|
||||||
1. Knowledge Graph Data:
|
1. Knowledge Graph Data:
|
||||||
|
@@ -7,6 +7,7 @@ from typing import Any, Union, cast, Dict
|
|||||||
import networkx as nx
|
import networkx as nx
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from nano_vectordb import NanoVectorDB
|
from nano_vectordb import NanoVectorDB
|
||||||
|
import time
|
||||||
|
|
||||||
from .utils import (
|
from .utils import (
|
||||||
logger,
|
logger,
|
||||||
@@ -87,9 +88,12 @@ class NanoVectorDBStorage(BaseVectorStorage):
|
|||||||
if not len(data):
|
if not len(data):
|
||||||
logger.warning("You insert an empty data to vector DB")
|
logger.warning("You insert an empty data to vector DB")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
current_time = time.time()
|
||||||
list_data = [
|
list_data = [
|
||||||
{
|
{
|
||||||
"__id__": k,
|
"__id__": k,
|
||||||
|
"__created_at__": current_time,
|
||||||
**{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
|
**{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
|
||||||
}
|
}
|
||||||
for k, v in data.items()
|
for k, v in data.items()
|
||||||
@@ -132,7 +136,13 @@ class NanoVectorDBStorage(BaseVectorStorage):
|
|||||||
better_than_threshold=self.cosine_better_than_threshold,
|
better_than_threshold=self.cosine_better_than_threshold,
|
||||||
)
|
)
|
||||||
results = [
|
results = [
|
||||||
{**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results
|
{
|
||||||
|
**dp,
|
||||||
|
"id": dp["__id__"],
|
||||||
|
"distance": dp["__metrics__"],
|
||||||
|
"created_at": dp.get("__created_at__")
|
||||||
|
}
|
||||||
|
for dp in results
|
||||||
]
|
]
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user