Merge branch 'main' into loginPage

This commit is contained in:
choizhang
2025-03-11 23:57:03 +08:00
39 changed files with 1702 additions and 531 deletions

View File

@@ -81,6 +81,9 @@ class QueryParam:
history_turns: int = 3
"""Number of complete conversation turns (user-assistant pairs) to consider in the response context."""
ids: list[str] | None = None
"""List of ids to filter the results."""
@dataclass
class StorageNameSpace(ABC):
@@ -107,7 +110,9 @@ class BaseVectorStorage(StorageNameSpace, ABC):
meta_fields: set[str] = field(default_factory=set)
@abstractmethod
async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
async def query(
self, query: str, top_k: int, ids: list[str] | None = None
) -> list[dict[str, Any]]:
"""Query the vector storage and retrieve top_k results."""
@abstractmethod
@@ -122,6 +127,30 @@ class BaseVectorStorage(StorageNameSpace, ABC):
async def delete_entity_relation(self, entity_name: str) -> None:
"""Delete relations for a given entity."""
@abstractmethod
async def get_by_id(self, id: str) -> dict[str, Any] | None:
"""Get vector data by its ID
Args:
id: The unique identifier of the vector
Returns:
The vector data if found, or None if not found
"""
pass
@abstractmethod
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
"""Get multiple vector data by their IDs
Args:
ids: List of unique identifiers
Returns:
List of vector data objects that were found
"""
pass
@dataclass
class BaseKVStorage(StorageNameSpace, ABC):

View File

@@ -269,3 +269,67 @@ class ChromaVectorDBStorage(BaseVectorStorage):
except Exception as e:
logger.error(f"Error during prefix search in ChromaDB: {str(e)}")
raise
async def get_by_id(self, id: str) -> dict[str, Any] | None:
"""Get vector data by its ID
Args:
id: The unique identifier of the vector
Returns:
The vector data if found, or None if not found
"""
try:
# Query the collection for a single vector by ID
result = self._collection.get(
ids=[id], include=["metadatas", "embeddings", "documents"]
)
if not result or not result["ids"] or len(result["ids"]) == 0:
return None
# Format the result to match the expected structure
return {
"id": result["ids"][0],
"vector": result["embeddings"][0],
"content": result["documents"][0],
**result["metadatas"][0],
}
except Exception as e:
logger.error(f"Error retrieving vector data for ID {id}: {e}")
return None
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
"""Get multiple vector data by their IDs
Args:
ids: List of unique identifiers
Returns:
List of vector data objects that were found
"""
if not ids:
return []
try:
# Query the collection for multiple vectors by IDs
result = self._collection.get(
ids=ids, include=["metadatas", "embeddings", "documents"]
)
if not result or not result["ids"] or len(result["ids"]) == 0:
return []
# Format the results to match the expected structure
return [
{
"id": result["ids"][i],
"vector": result["embeddings"][i],
"content": result["documents"][i],
**result["metadatas"][i],
}
for i in range(len(result["ids"]))
]
except Exception as e:
logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
return []

View File

@@ -392,3 +392,46 @@ class FaissVectorDBStorage(BaseVectorStorage):
logger.debug(f"Found {len(matching_records)} records with prefix '{prefix}'")
return matching_records
async def get_by_id(self, id: str) -> dict[str, Any] | None:
"""Get vector data by its ID
Args:
id: The unique identifier of the vector
Returns:
The vector data if found, or None if not found
"""
# Find the Faiss internal ID for the custom ID
fid = self._find_faiss_id_by_custom_id(id)
if fid is None:
return None
# Get the metadata for the found ID
metadata = self._id_to_meta.get(fid, {})
if not metadata:
return None
return {**metadata, "id": metadata.get("__id__")}
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
"""Get multiple vector data by their IDs
Args:
ids: List of unique identifiers
Returns:
List of vector data objects that were found
"""
if not ids:
return []
results = []
for id in ids:
fid = self._find_faiss_id_by_custom_id(id)
if fid is not None:
metadata = self._id_to_meta.get(fid, {})
if metadata:
results.append({**metadata, "id": metadata.get("__id__")})
return results

View File

@@ -231,3 +231,57 @@ class MilvusVectorDBStorage(BaseVectorStorage):
except Exception as e:
logger.error(f"Error searching for records with prefix '{prefix}': {e}")
return []
async def get_by_id(self, id: str) -> dict[str, Any] | None:
"""Get vector data by its ID
Args:
id: The unique identifier of the vector
Returns:
The vector data if found, or None if not found
"""
try:
# Query Milvus for a specific ID
result = self._client.query(
collection_name=self.namespace,
filter=f'id == "{id}"',
output_fields=list(self.meta_fields) + ["id"],
)
if not result or len(result) == 0:
return None
return result[0]
except Exception as e:
logger.error(f"Error retrieving vector data for ID {id}: {e}")
return None
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
"""Get multiple vector data by their IDs
Args:
ids: List of unique identifiers
Returns:
List of vector data objects that were found
"""
if not ids:
return []
try:
# Prepare the ID filter expression
id_list = '", "'.join(ids)
filter_expr = f'id in ["{id_list}"]'
# Query Milvus with the filter
result = self._client.query(
collection_name=self.namespace,
filter=filter_expr,
output_fields=list(self.meta_fields) + ["id"],
)
return result or []
except Exception as e:
logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
return []

View File

@@ -1071,6 +1071,59 @@ class MongoVectorDBStorage(BaseVectorStorage):
logger.error(f"Error searching by prefix in {self.namespace}: {str(e)}")
return []
async def get_by_id(self, id: str) -> dict[str, Any] | None:
"""Get vector data by its ID
Args:
id: The unique identifier of the vector
Returns:
The vector data if found, or None if not found
"""
try:
# Search for the specific ID in MongoDB
result = await self._data.find_one({"_id": id})
if result:
# Format the result to include id field expected by API
result_dict = dict(result)
if "_id" in result_dict and "id" not in result_dict:
result_dict["id"] = result_dict["_id"]
return result_dict
return None
except Exception as e:
logger.error(f"Error retrieving vector data for ID {id}: {e}")
return None
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
"""Get multiple vector data by their IDs
Args:
ids: List of unique identifiers
Returns:
List of vector data objects that were found
"""
if not ids:
return []
try:
# Query MongoDB for multiple IDs
cursor = self._data.find({"_id": {"$in": ids}})
results = await cursor.to_list(length=None)
# Format results to include id field expected by API
formatted_results = []
for result in results:
result_dict = dict(result)
if "_id" in result_dict and "id" not in result_dict:
result_dict["id"] = result_dict["_id"]
formatted_results.append(result_dict)
return formatted_results
except Exception as e:
logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
return []
async def get_or_create_collection(db: AsyncIOMotorDatabase, collection_name: str):
collection_names = await db.list_collection_names()

View File

@@ -256,3 +256,33 @@ class NanoVectorDBStorage(BaseVectorStorage):
logger.debug(f"Found {len(matching_records)} records with prefix '{prefix}'")
return matching_records
async def get_by_id(self, id: str) -> dict[str, Any] | None:
"""Get vector data by its ID
Args:
id: The unique identifier of the vector
Returns:
The vector data if found, or None if not found
"""
client = await self._get_client()
result = client.get([id])
if result:
return result[0]
return None
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
"""Get multiple vector data by their IDs
Args:
ids: List of unique identifiers
Returns:
List of vector data objects that were found
"""
if not ids:
return []
client = await self._get_client()
return client.get(ids)

View File

@@ -176,23 +176,6 @@ class Neo4JStorage(BaseGraphStorage):
# Noe4J handles persistence automatically
pass
def _ensure_label(self, label: str) -> str:
"""Ensure a label is valid
Args:
label: The label to validate
Returns:
str: The cleaned label
Raises:
ValueError: If label is empty after cleaning
"""
clean_label = label.strip('"')
if not clean_label:
raise ValueError("Neo4j: Label cannot be empty")
return clean_label
async def has_node(self, node_id: str) -> bool:
"""
Check if a node with the given label exists in the database
@@ -207,20 +190,17 @@ class Neo4JStorage(BaseGraphStorage):
ValueError: If node_id is invalid
Exception: If there is an error executing the query
"""
entity_name_label = self._ensure_label(node_id)
async with self._driver.session(
database=self._DATABASE, default_access_mode="READ"
) as session:
try:
query = f"MATCH (n:`{entity_name_label}`) RETURN count(n) > 0 AS node_exists"
result = await session.run(query)
query = "MATCH (n:base {entity_id: $entity_id}) RETURN count(n) > 0 AS node_exists"
result = await session.run(query, entity_id=node_id)
single_result = await result.single()
await result.consume() # Ensure result is fully consumed
return single_result["node_exists"]
except Exception as e:
logger.error(
f"Error checking node existence for {entity_name_label}: {str(e)}"
)
logger.error(f"Error checking node existence for {node_id}: {str(e)}")
await result.consume() # Ensure results are consumed even on error
raise
@@ -239,24 +219,25 @@ class Neo4JStorage(BaseGraphStorage):
ValueError: If either node_id is invalid
Exception: If there is an error executing the query
"""
entity_name_label_source = self._ensure_label(source_node_id)
entity_name_label_target = self._ensure_label(target_node_id)
async with self._driver.session(
database=self._DATABASE, default_access_mode="READ"
) as session:
try:
query = (
f"MATCH (a:`{entity_name_label_source}`)-[r]-(b:`{entity_name_label_target}`) "
"MATCH (a:base {entity_id: $source_entity_id})-[r]-(b:base {entity_id: $target_entity_id}) "
"RETURN COUNT(r) > 0 AS edgeExists"
)
result = await session.run(query)
result = await session.run(
query,
source_entity_id=source_node_id,
target_entity_id=target_node_id,
)
single_result = await result.single()
await result.consume() # Ensure result is fully consumed
return single_result["edgeExists"]
except Exception as e:
logger.error(
f"Error checking edge existence between {entity_name_label_source} and {entity_name_label_target}: {str(e)}"
f"Error checking edge existence between {source_node_id} and {target_node_id}: {str(e)}"
)
await result.consume() # Ensure results are consumed even on error
raise
@@ -275,13 +256,12 @@ class Neo4JStorage(BaseGraphStorage):
ValueError: If node_id is invalid
Exception: If there is an error executing the query
"""
entity_name_label = self._ensure_label(node_id)
async with self._driver.session(
database=self._DATABASE, default_access_mode="READ"
) as session:
try:
query = f"MATCH (n:`{entity_name_label}` {{entity_id: $entity_id}}) RETURN n"
result = await session.run(query, entity_id=entity_name_label)
query = "MATCH (n:base {entity_id: $entity_id}) RETURN n"
result = await session.run(query, entity_id=node_id)
try:
records = await result.fetch(
2
@@ -289,20 +269,25 @@ class Neo4JStorage(BaseGraphStorage):
if len(records) > 1:
logger.warning(
f"Multiple nodes found with label '{entity_name_label}'. Using first node."
f"Multiple nodes found with label '{node_id}'. Using first node."
)
if records:
node = records[0]["n"]
node_dict = dict(node)
logger.debug(
f"{inspect.currentframe().f_code.co_name}: query: {query}, result: {node_dict}"
)
# Remove base label from labels list if it exists
if "labels" in node_dict:
node_dict["labels"] = [
label
for label in node_dict["labels"]
if label != "base"
]
logger.debug(f"Neo4j query node {query} return: {node_dict}")
return node_dict
return None
finally:
await result.consume() # Ensure result is fully consumed
except Exception as e:
logger.error(f"Error getting node for {entity_name_label}: {str(e)}")
logger.error(f"Error getting node for {node_id}: {str(e)}")
raise
async def node_degree(self, node_id: str) -> int:
@@ -320,43 +305,32 @@ class Neo4JStorage(BaseGraphStorage):
ValueError: If node_id is invalid
Exception: If there is an error executing the query
"""
entity_name_label = self._ensure_label(node_id)
async with self._driver.session(
database=self._DATABASE, default_access_mode="READ"
) as session:
try:
query = f"""
MATCH (n:`{entity_name_label}`)
query = """
MATCH (n:base {entity_id: $entity_id})
OPTIONAL MATCH (n)-[r]-()
RETURN n, COUNT(r) AS degree
RETURN COUNT(r) AS degree
"""
result = await session.run(query)
result = await session.run(query, entity_id=node_id)
try:
records = await result.fetch(100)
record = await result.single()
if not records:
logger.warning(
f"No node found with label '{entity_name_label}'"
)
if not record:
logger.warning(f"No node found with label '{node_id}'")
return 0
if len(records) > 1:
logger.warning(
f"Multiple nodes ({len(records)}) found with label '{entity_name_label}', using first node's degree"
)
degree = records[0]["degree"]
degree = record["degree"]
logger.debug(
f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{degree}"
"Neo4j query node degree for {node_id} return: {degree}"
)
return degree
finally:
await result.consume() # Ensure result is fully consumed
except Exception as e:
logger.error(
f"Error getting node degree for {entity_name_label}: {str(e)}"
)
logger.error(f"Error getting node degree for {node_id}: {str(e)}")
raise
async def edge_degree(self, src_id: str, tgt_id: str) -> int:
@@ -369,11 +343,8 @@ class Neo4JStorage(BaseGraphStorage):
Returns:
int: Sum of the degrees of both nodes
"""
entity_name_label_source = self._ensure_label(src_id)
entity_name_label_target = self._ensure_label(tgt_id)
src_degree = await self.node_degree(entity_name_label_source)
trg_degree = await self.node_degree(entity_name_label_target)
src_degree = await self.node_degree(src_id)
trg_degree = await self.node_degree(tgt_id)
# Convert None to 0 for addition
src_degree = 0 if src_degree is None else src_degree
@@ -399,24 +370,24 @@ class Neo4JStorage(BaseGraphStorage):
Exception: If there is an error executing the query
"""
try:
entity_name_label_source = self._ensure_label(source_node_id)
entity_name_label_target = self._ensure_label(target_node_id)
async with self._driver.session(
database=self._DATABASE, default_access_mode="READ"
) as session:
query = f"""
MATCH (start:`{entity_name_label_source}`)-[r]-(end:`{entity_name_label_target}`)
query = """
MATCH (start:base {entity_id: $source_entity_id})-[r]-(end:base {entity_id: $target_entity_id})
RETURN properties(r) as edge_properties
"""
result = await session.run(query)
result = await session.run(
query,
source_entity_id=source_node_id,
target_entity_id=target_node_id,
)
try:
records = await result.fetch(2)
if len(records) > 1:
logger.warning(
f"Multiple edges found between '{entity_name_label_source}' and '{entity_name_label_target}'. Using first edge."
f"Multiple edges found between '{source_node_id}' and '{target_node_id}'. Using first edge."
)
if records:
try:
@@ -433,7 +404,7 @@ class Neo4JStorage(BaseGraphStorage):
if key not in edge_result:
edge_result[key] = default_value
logger.warning(
f"Edge between {entity_name_label_source} and {entity_name_label_target} "
f"Edge between {source_node_id} and {target_node_id} "
f"missing {key}, using default: {default_value}"
)
@@ -443,8 +414,8 @@ class Neo4JStorage(BaseGraphStorage):
return edge_result
except (KeyError, TypeError, ValueError) as e:
logger.error(
f"Error processing edge properties between {entity_name_label_source} "
f"and {entity_name_label_target}: {str(e)}"
f"Error processing edge properties between {source_node_id} "
f"and {target_node_id}: {str(e)}"
)
# Return default edge properties on error
return {
@@ -455,7 +426,7 @@ class Neo4JStorage(BaseGraphStorage):
}
logger.debug(
f"{inspect.currentframe().f_code.co_name}: No edge found between {entity_name_label_source} and {entity_name_label_target}"
f"{inspect.currentframe().f_code.co_name}: No edge found between {source_node_id} and {target_node_id}"
)
# Return default edge properties when no edge found
return {
@@ -488,29 +459,33 @@ class Neo4JStorage(BaseGraphStorage):
Exception: If there is an error executing the query
"""
try:
node_label = self._ensure_label(source_node_id)
query = f"""MATCH (n:`{node_label}`)
OPTIONAL MATCH (n)-[r]-(connected)
RETURN n, r, connected"""
async with self._driver.session(
database=self._DATABASE, default_access_mode="READ"
) as session:
try:
results = await session.run(query)
edges = []
query = """MATCH (n:base {entity_id: $entity_id})
OPTIONAL MATCH (n)-[r]-(connected:base)
WHERE connected.entity_id IS NOT NULL
RETURN n, r, connected"""
results = await session.run(query, entity_id=source_node_id)
edges = []
async for record in results:
source_node = record["n"]
connected_node = record["connected"]
# Skip if either node is None
if not source_node or not connected_node:
continue
source_label = (
list(source_node.labels)[0] if source_node.labels else None
source_node.get("entity_id")
if source_node.get("entity_id")
else None
)
target_label = (
list(connected_node.labels)[0]
if connected_node and connected_node.labels
connected_node.get("entity_id")
if connected_node.get("entity_id")
else None
)
@@ -520,7 +495,9 @@ class Neo4JStorage(BaseGraphStorage):
await results.consume() # Ensure results are consumed
return edges
except Exception as e:
logger.error(f"Error getting edges for node {node_label}: {str(e)}")
logger.error(
f"Error getting edges for node {source_node_id}: {str(e)}"
)
await results.consume() # Ensure results are consumed even on error
raise
except Exception as e:
@@ -547,8 +524,9 @@ class Neo4JStorage(BaseGraphStorage):
node_id: The unique identifier for the node (used as label)
node_data: Dictionary of node properties
"""
label = self._ensure_label(node_id)
properties = node_data
entity_type = properties["entity_type"]
entity_id = properties["entity_id"]
if "entity_id" not in properties:
raise ValueError("Neo4j: node properties must contain an 'entity_id' field")
@@ -556,13 +534,17 @@ class Neo4JStorage(BaseGraphStorage):
async with self._driver.session(database=self._DATABASE) as session:
async def execute_upsert(tx: AsyncManagedTransaction):
query = f"""
MERGE (n:`{label}` {{entity_id: $properties.entity_id}})
query = (
"""
MERGE (n:base {entity_id: $properties.entity_id})
SET n += $properties
SET n:`%s`
"""
% entity_type
)
result = await tx.run(query, properties=properties)
logger.debug(
f"Upserted node with label '{label}' and properties: {properties}"
f"Upserted node with entity_id '{entity_id}' and properties: {properties}"
)
await result.consume() # Ensure result is fully consumed
@@ -583,52 +565,6 @@ class Neo4JStorage(BaseGraphStorage):
)
),
)
async def _get_unique_node_entity_id(self, node_label: str) -> str:
"""
Get the entity_id of a node with the given label, ensuring the node is unique.
Args:
node_label (str): Label of the node to check
Returns:
str: The entity_id of the unique node
Raises:
ValueError: If no node with the given label exists or if multiple nodes have the same label
"""
async with self._driver.session(
database=self._DATABASE, default_access_mode="READ"
) as session:
query = f"""
MATCH (n:`{node_label}`)
RETURN n, count(n) as node_count
"""
result = await session.run(query)
try:
records = await result.fetch(
2
) # We only need to know if there are 0, 1, or >1 nodes
if not records or records[0]["node_count"] == 0:
raise ValueError(
f"Neo4j: node with label '{node_label}' does not exist"
)
if records[0]["node_count"] > 1:
raise ValueError(
f"Neo4j: multiple nodes found with label '{node_label}', cannot determine unique node"
)
node = records[0]["n"]
if "entity_id" not in node:
raise ValueError(
f"Neo4j: node with label '{node_label}' does not have an entity_id property"
)
return node["entity_id"]
finally:
await result.consume() # Ensure result is fully consumed
@retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=4, max=10),
@@ -657,38 +593,30 @@ class Neo4JStorage(BaseGraphStorage):
Raises:
ValueError: If either source or target node does not exist or is not unique
"""
source_label = self._ensure_label(source_node_id)
target_label = self._ensure_label(target_node_id)
edge_properties = edge_data
# Get entity_ids for source and target nodes, ensuring they are unique
source_entity_id = await self._get_unique_node_entity_id(source_label)
target_entity_id = await self._get_unique_node_entity_id(target_label)
try:
edge_properties = edge_data
async with self._driver.session(database=self._DATABASE) as session:
async def execute_upsert(tx: AsyncManagedTransaction):
query = f"""
MATCH (source:`{source_label}` {{entity_id: $source_entity_id}})
query = """
MATCH (source:base {entity_id: $source_entity_id})
WITH source
MATCH (target:`{target_label}` {{entity_id: $target_entity_id}})
MATCH (target:base {entity_id: $target_entity_id})
MERGE (source)-[r:DIRECTED]-(target)
SET r += $properties
RETURN r, source, target
"""
result = await tx.run(
query,
source_entity_id=source_entity_id,
target_entity_id=target_entity_id,
source_entity_id=source_node_id,
target_entity_id=target_node_id,
properties=edge_properties,
)
try:
records = await result.fetch(100)
records = await result.fetch(2)
if records:
logger.debug(
f"Upserted edge from '{source_label}' (entity_id: {source_entity_id}) "
f"to '{target_label}' (entity_id: {target_entity_id}) "
f"Upserted edge from '{source_node_id}' to '{target_node_id}'"
f"with properties: {edge_properties}"
)
finally:
@@ -726,7 +654,6 @@ class Neo4JStorage(BaseGraphStorage):
Returns:
KnowledgeGraph: Complete connected subgraph for specified node
"""
label = node_label.strip('"')
result = KnowledgeGraph()
seen_nodes = set()
seen_edges = set()
@@ -735,7 +662,7 @@ class Neo4JStorage(BaseGraphStorage):
database=self._DATABASE, default_access_mode="READ"
) as session:
try:
if label == "*":
if node_label == "*":
main_query = """
MATCH (n)
OPTIONAL MATCH (n)-[r]-()
@@ -760,12 +687,11 @@ class Neo4JStorage(BaseGraphStorage):
# Main query uses partial matching
main_query = """
MATCH (start)
WHERE any(label IN labels(start) WHERE
WHERE
CASE
WHEN $inclusive THEN label CONTAINS $label
ELSE label = $label
WHEN $inclusive THEN start.entity_id CONTAINS $entity_id
ELSE start.entity_id = $entity_id
END
)
WITH start
CALL apoc.path.subgraphAll(start, {
relationshipFilter: '',
@@ -799,7 +725,7 @@ class Neo4JStorage(BaseGraphStorage):
main_query,
{
"max_nodes": MAX_GRAPH_NODES,
"label": label,
"entity_id": node_label,
"inclusive": inclusive,
"max_depth": max_depth,
"min_degree": min_degree,
@@ -818,7 +744,11 @@ class Neo4JStorage(BaseGraphStorage):
result.nodes.append(
KnowledgeGraphNode(
id=f"{node_id}",
labels=list(node.labels),
labels=[
label
for label in node.labels
if label != "base"
],
properties=dict(node),
)
)
@@ -849,7 +779,7 @@ class Neo4JStorage(BaseGraphStorage):
except neo4jExceptions.ClientError as e:
logger.warning(f"APOC plugin error: {str(e)}")
if label != "*":
if node_label != "*":
logger.warning(
"Neo4j: falling back to basic Cypher recursive search..."
)
@@ -857,12 +787,14 @@ class Neo4JStorage(BaseGraphStorage):
logger.warning(
"Neo4j: inclusive search mode is not supported in recursive query, using exact matching"
)
return await self._robust_fallback(label, max_depth, min_degree)
return await self._robust_fallback(
node_label, max_depth, min_degree
)
return result
async def _robust_fallback(
self, label: str, max_depth: int, min_degree: int = 0
self, node_label: str, max_depth: int, min_degree: int = 0
) -> KnowledgeGraph:
"""
Fallback implementation when APOC plugin is not available or incompatible.
@@ -895,12 +827,11 @@ class Neo4JStorage(BaseGraphStorage):
database=self._DATABASE, default_access_mode="READ"
) as session:
query = """
MATCH (a)-[r]-(b)
WHERE id(a) = toInteger($node_id)
MATCH (a:base {entity_id: $entity_id})-[r]-(b)
WITH r, b, id(r) as edge_id, id(b) as target_id
RETURN r, b, edge_id, target_id
"""
results = await session.run(query, {"node_id": node.id})
results = await session.run(query, entity_id=node.id)
# Get all records and release database connection
records = await results.fetch(
@@ -928,14 +859,16 @@ class Neo4JStorage(BaseGraphStorage):
edge_id = str(record["edge_id"])
if edge_id not in visited_edges:
b_node = record["b"]
target_id = str(record["target_id"])
target_id = b_node.get("entity_id")
if b_node.labels: # Only process if target node has labels
if target_id: # Only process if target node has entity_id
# Create KnowledgeGraphNode for target
target_node = KnowledgeGraphNode(
id=f"{target_id}",
labels=list(b_node.labels),
properties=dict(b_node),
labels=[
label for label in b_node.labels if label != "base"
],
properties=dict(b_node.properties),
)
# Create KnowledgeGraphEdge
@@ -961,11 +894,11 @@ class Neo4JStorage(BaseGraphStorage):
async with self._driver.session(
database=self._DATABASE, default_access_mode="READ"
) as session:
query = f"""
MATCH (n:`{label}`)
query = """
MATCH (n:base {entity_id: $entity_id})
RETURN id(n) as node_id, n
"""
node_result = await session.run(query)
node_result = await session.run(query, entity_id=node_label)
try:
node_record = await node_result.single()
if not node_record:
@@ -973,9 +906,11 @@ class Neo4JStorage(BaseGraphStorage):
# Create initial KnowledgeGraphNode
start_node = KnowledgeGraphNode(
id=f"{node_record['node_id']}",
labels=list(node_record["n"].labels),
properties=dict(node_record["n"]),
id=f"{node_record['n'].get('entity_id')}",
labels=[
label for label in node_record["n"].labels if label != "base"
],
properties=dict(node_record["n"].properties),
)
finally:
await node_result.consume() # Ensure results are consumed
@@ -999,11 +934,10 @@ class Neo4JStorage(BaseGraphStorage):
# Method 2: Query compatible with older versions
query = """
MATCH (n)
WITH DISTINCT labels(n) AS node_labels
UNWIND node_labels AS label
RETURN DISTINCT label
ORDER BY label
MATCH (n)
WHERE n.entity_id IS NOT NULL
RETURN DISTINCT n.entity_id AS label
ORDER BY label
"""
result = await session.run(query)
labels = []
@@ -1034,15 +968,14 @@ class Neo4JStorage(BaseGraphStorage):
Args:
node_id: The label of the node to delete
"""
label = self._ensure_label(node_id)
async def _do_delete(tx: AsyncManagedTransaction):
query = f"""
MATCH (n:`{label}`)
query = """
MATCH (n:base {entity_id: $entity_id})
DETACH DELETE n
"""
result = await tx.run(query)
logger.debug(f"Deleted node with label '{label}'")
result = await tx.run(query, entity_id=node_id)
logger.debug(f"Deleted node with label '{node_id}'")
await result.consume() # Ensure result is fully consumed
try:
@@ -1092,16 +1025,16 @@ class Neo4JStorage(BaseGraphStorage):
edges: List of edges to be deleted, each edge is a (source, target) tuple
"""
for source, target in edges:
source_label = self._ensure_label(source)
target_label = self._ensure_label(target)
async def _do_delete_edge(tx: AsyncManagedTransaction):
query = f"""
MATCH (source:`{source_label}`)-[r]-(target:`{target_label}`)
query = """
MATCH (source:base {entity_id: $source_entity_id})-[r]-(target:base {entity_id: $target_entity_id})
DELETE r
"""
result = await tx.run(query)
logger.debug(f"Deleted edge from '{source_label}' to '{target_label}'")
result = await tx.run(
query, source_entity_id=source, target_entity_id=target
)
logger.debug(f"Deleted edge from '{source}' to '{target}'")
await result.consume() # Ensure result is fully consumed
try:

View File

@@ -529,6 +529,80 @@ class OracleVectorDBStorage(BaseVectorStorage):
logger.error(f"Error searching records with prefix '{prefix}': {e}")
return []
async def get_by_id(self, id: str) -> dict[str, Any] | None:
"""Get vector data by its ID
Args:
id: The unique identifier of the vector
Returns:
The vector data if found, or None if not found
"""
try:
# Determine the table name based on namespace
table_name = namespace_to_table_name(self.namespace)
if not table_name:
logger.error(f"Unknown namespace for ID lookup: {self.namespace}")
return None
# Create the appropriate ID field name based on namespace
id_field = "entity_id" if "NODES" in table_name else "relation_id"
if "CHUNKS" in table_name:
id_field = "chunk_id"
# Prepare and execute the query
query = f"""
SELECT * FROM {table_name}
WHERE {id_field} = :id AND workspace = :workspace
"""
params = {"id": id, "workspace": self.db.workspace}
result = await self.db.query(query, params)
return result
except Exception as e:
logger.error(f"Error retrieving vector data for ID {id}: {e}")
return None
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
"""Get multiple vector data by their IDs
Args:
ids: List of unique identifiers
Returns:
List of vector data objects that were found
"""
if not ids:
return []
try:
# Determine the table name based on namespace
table_name = namespace_to_table_name(self.namespace)
if not table_name:
logger.error(f"Unknown namespace for IDs lookup: {self.namespace}")
return []
# Create the appropriate ID field name based on namespace
id_field = "entity_id" if "NODES" in table_name else "relation_id"
if "CHUNKS" in table_name:
id_field = "chunk_id"
# Format the list of IDs for SQL IN clause
ids_list = ", ".join([f"'{id}'" for id in ids])
# Prepare and execute the query
query = f"""
SELECT * FROM {table_name}
WHERE {id_field} IN ({ids_list}) AND workspace = :workspace
"""
params = {"workspace": self.db.workspace}
results = await self.db.query(query, params, multirows=True)
return results or []
except Exception as e:
logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
return []
@final
@dataclass

View File

@@ -438,6 +438,8 @@ class PGVectorStorage(BaseVectorStorage):
"entity_name": item["entity_name"],
"content": item["content"],
"content_vector": json.dumps(item["__vector__"].tolist()),
"chunk_id": item["source_id"],
# TODO: add document_id
}
return upsert_sql, data
@@ -450,6 +452,8 @@ class PGVectorStorage(BaseVectorStorage):
"target_id": item["tgt_id"],
"content": item["content"],
"content_vector": json.dumps(item["__vector__"].tolist()),
"chunk_id": item["source_id"],
# TODO: add document_id
}
return upsert_sql, data
@@ -492,13 +496,20 @@ class PGVectorStorage(BaseVectorStorage):
await self.db.execute(upsert_sql, data)
#################### query method ###############
async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
async def query(
self, query: str, top_k: int, ids: list[str] | None = None
) -> list[dict[str, Any]]:
embeddings = await self.embedding_func([query])
embedding = embeddings[0]
embedding_string = ",".join(map(str, embedding))
if ids:
formatted_ids = ",".join(f"'{id}'" for id in ids)
else:
formatted_ids = "NULL"
sql = SQL_TEMPLATES[self.base_namespace].format(
embedding_string=embedding_string
embedding_string=embedding_string, doc_ids=formatted_ids
)
params = {
"workspace": self.db.workspace,
@@ -610,6 +621,60 @@ class PGVectorStorage(BaseVectorStorage):
logger.error(f"Error during prefix search for '{prefix}': {e}")
return []
async def get_by_id(self, id: str) -> dict[str, Any] | None:
"""Get vector data by its ID
Args:
id: The unique identifier of the vector
Returns:
The vector data if found, or None if not found
"""
table_name = namespace_to_table_name(self.namespace)
if not table_name:
logger.error(f"Unknown namespace for ID lookup: {self.namespace}")
return None
query = f"SELECT * FROM {table_name} WHERE workspace=$1 AND id=$2"
params = {"workspace": self.db.workspace, "id": id}
try:
result = await self.db.query(query, params)
if result:
return dict(result)
return None
except Exception as e:
logger.error(f"Error retrieving vector data for ID {id}: {e}")
return None
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
"""Get multiple vector data by their IDs
Args:
ids: List of unique identifiers
Returns:
List of vector data objects that were found
"""
if not ids:
return []
table_name = namespace_to_table_name(self.namespace)
if not table_name:
logger.error(f"Unknown namespace for IDs lookup: {self.namespace}")
return []
ids_str = ",".join([f"'{id}'" for id in ids])
query = f"SELECT * FROM {table_name} WHERE workspace=$1 AND id IN ({ids_str})"
params = {"workspace": self.db.workspace}
try:
results = await self.db.query(query, params, multirows=True)
return [dict(record) for record in results]
except Exception as e:
logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
return []
@final
@dataclass
@@ -1491,6 +1556,7 @@ TABLES = {
content_vector VECTOR,
create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
update_time TIMESTAMP,
chunk_id VARCHAR(255) NULL,
CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id)
)"""
},
@@ -1504,6 +1570,7 @@ TABLES = {
content_vector VECTOR,
create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
update_time TIMESTAMP,
chunk_id VARCHAR(255) NULL,
CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id)
)"""
},
@@ -1586,8 +1653,9 @@ SQL_TEMPLATES = {
content_vector=EXCLUDED.content_vector,
update_time = CURRENT_TIMESTAMP
""",
"upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, content_vector)
VALUES ($1, $2, $3, $4, $5)
"upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
content_vector, chunk_id)
VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (workspace,id) DO UPDATE
SET entity_name=EXCLUDED.entity_name,
content=EXCLUDED.content,
@@ -1595,8 +1663,8 @@ SQL_TEMPLATES = {
update_time=CURRENT_TIMESTAMP
""",
"upsert_relationship": """INSERT INTO LIGHTRAG_VDB_RELATION (workspace, id, source_id,
target_id, content, content_vector)
VALUES ($1, $2, $3, $4, $5, $6)
target_id, content, content_vector, chunk_id)
VALUES ($1, $2, $3, $4, $5, $6, $7)
ON CONFLICT (workspace,id) DO UPDATE
SET source_id=EXCLUDED.source_id,
target_id=EXCLUDED.target_id,
@@ -1604,21 +1672,21 @@ SQL_TEMPLATES = {
content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP
""",
# SQL for VectorStorage
"entities": """SELECT entity_name FROM
(SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_VDB_ENTITY where workspace=$1)
WHERE distance>$2 ORDER BY distance DESC LIMIT $3
""",
"relationships": """SELECT source_id as src_id, target_id as tgt_id FROM
(SELECT id, source_id,target_id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_VDB_RELATION where workspace=$1)
WHERE distance>$2 ORDER BY distance DESC LIMIT $3
""",
"chunks": """SELECT id FROM
(SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_DOC_CHUNKS where workspace=$1)
WHERE distance>$2 ORDER BY distance DESC LIMIT $3
""",
# "entities": """SELECT entity_name FROM
# (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
# FROM LIGHTRAG_VDB_ENTITY where workspace=$1)
# WHERE distance>$2 ORDER BY distance DESC LIMIT $3
# """,
# "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM
# (SELECT id, source_id,target_id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
# FROM LIGHTRAG_VDB_RELATION where workspace=$1)
# WHERE distance>$2 ORDER BY distance DESC LIMIT $3
# """,
# "chunks": """SELECT id FROM
# (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
# FROM LIGHTRAG_DOC_CHUNKS where workspace=$1)
# WHERE distance>$2 ORDER BY distance DESC LIMIT $3
# """,
# DROP tables
"drop_all": """
DROP TABLE IF EXISTS LIGHTRAG_DOC_FULL CASCADE;
@@ -1642,4 +1710,55 @@ SQL_TEMPLATES = {
"drop_vdb_relation": """
DROP TABLE IF EXISTS LIGHTRAG_VDB_RELATION CASCADE;
""",
"relationships": """
WITH relevant_chunks AS (
SELECT id as chunk_id
FROM LIGHTRAG_DOC_CHUNKS
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
)
SELECT source_id as src_id, target_id as tgt_id
FROM (
SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_VDB_RELATION r
WHERE r.workspace=$1
AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks)
) filtered
WHERE distance>$2
ORDER BY distance DESC
LIMIT $3
""",
"entities": """
WITH relevant_chunks AS (
SELECT id as chunk_id
FROM LIGHTRAG_DOC_CHUNKS
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
)
SELECT entity_name FROM
(
SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_VDB_ENTITY
where workspace=$1
AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
)
WHERE distance>$2
ORDER BY distance DESC
LIMIT $3
""",
"chunks": """
WITH relevant_chunks AS (
SELECT id as chunk_id
FROM LIGHTRAG_DOC_CHUNKS
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
)
SELECT id FROM
(
SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_DOC_CHUNKS
where workspace=$1
AND id IN (SELECT chunk_id FROM relevant_chunks)
)
WHERE distance>$2
ORDER BY distance DESC
LIMIT $3
""",
}

View File

@@ -463,6 +463,100 @@ class TiDBVectorDBStorage(BaseVectorStorage):
logger.error(f"Error searching records with prefix '{prefix}': {e}")
return []
async def get_by_id(self, id: str) -> dict[str, Any] | None:
"""Get vector data by its ID
Args:
id: The unique identifier of the vector
Returns:
The vector data if found, or None if not found
"""
try:
# Determine which table to query based on namespace
if self.namespace == NameSpace.VECTOR_STORE_ENTITIES:
sql_template = """
SELECT entity_id as id, name as entity_name, entity_type, description, content
FROM LIGHTRAG_GRAPH_NODES
WHERE entity_id = :entity_id AND workspace = :workspace
"""
params = {"entity_id": id, "workspace": self.db.workspace}
elif self.namespace == NameSpace.VECTOR_STORE_RELATIONSHIPS:
sql_template = """
SELECT relation_id as id, source_name as src_id, target_name as tgt_id,
keywords, description, content
FROM LIGHTRAG_GRAPH_EDGES
WHERE relation_id = :relation_id AND workspace = :workspace
"""
params = {"relation_id": id, "workspace": self.db.workspace}
elif self.namespace == NameSpace.VECTOR_STORE_CHUNKS:
sql_template = """
SELECT chunk_id as id, content, tokens, chunk_order_index, full_doc_id
FROM LIGHTRAG_DOC_CHUNKS
WHERE chunk_id = :chunk_id AND workspace = :workspace
"""
params = {"chunk_id": id, "workspace": self.db.workspace}
else:
logger.warning(
f"Namespace {self.namespace} not supported for get_by_id"
)
return None
result = await self.db.query(sql_template, params=params)
return result
except Exception as e:
logger.error(f"Error retrieving vector data for ID {id}: {e}")
return None
async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
"""Get multiple vector data by their IDs
Args:
ids: List of unique identifiers
Returns:
List of vector data objects that were found
"""
if not ids:
return []
try:
# Format IDs for SQL IN clause
ids_str = ", ".join([f"'{id}'" for id in ids])
# Determine which table to query based on namespace
if self.namespace == NameSpace.VECTOR_STORE_ENTITIES:
sql_template = f"""
SELECT entity_id as id, name as entity_name, entity_type, description, content
FROM LIGHTRAG_GRAPH_NODES
WHERE entity_id IN ({ids_str}) AND workspace = :workspace
"""
elif self.namespace == NameSpace.VECTOR_STORE_RELATIONSHIPS:
sql_template = f"""
SELECT relation_id as id, source_name as src_id, target_name as tgt_id,
keywords, description, content
FROM LIGHTRAG_GRAPH_EDGES
WHERE relation_id IN ({ids_str}) AND workspace = :workspace
"""
elif self.namespace == NameSpace.VECTOR_STORE_CHUNKS:
sql_template = f"""
SELECT chunk_id as id, content, tokens, chunk_order_index, full_doc_id
FROM LIGHTRAG_DOC_CHUNKS
WHERE chunk_id IN ({ids_str}) AND workspace = :workspace
"""
else:
logger.warning(
f"Namespace {self.namespace} not supported for get_by_ids"
)
return []
params = {"workspace": self.db.workspace}
results = await self.db.query(sql_template, params=params, multirows=True)
return results if results else []
except Exception as e:
logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
return []
@final
@dataclass

View File

@@ -30,11 +30,10 @@ from .namespace import NameSpace, make_namespace
from .operate import (
chunking_by_token_size,
extract_entities,
extract_keywords_only,
kg_query,
kg_query_with_keywords,
mix_kg_vector_query,
naive_query,
query_with_keywords,
)
from .prompt import GRAPH_FIELD_SEP, PROMPTS
from .utils import (
@@ -45,6 +44,9 @@ from .utils import (
encode_string_by_tiktoken,
lazy_external_import,
limit_async_func_call,
get_content_summary,
clean_text,
check_storage_env_vars,
logger,
)
from .types import KnowledgeGraph
@@ -309,7 +311,7 @@ class LightRAG:
# Verify storage implementation compatibility
verify_storage_implementation(storage_type, storage_name)
# Check environment variables
# self.check_storage_env_vars(storage_name)
check_storage_env_vars(storage_name)
# Ensure vector_db_storage_cls_kwargs has required fields
self.vector_db_storage_cls_kwargs = {
@@ -536,11 +538,6 @@ class LightRAG:
storage_class = lazy_external_import(import_path, storage_name)
return storage_class
@staticmethod
def clean_text(text: str) -> str:
"""Clean text by removing null bytes (0x00) and whitespace"""
return text.strip().replace("\x00", "")
def insert(
self,
input: str | list[str],
@@ -602,8 +599,8 @@ class LightRAG:
update_storage = False
try:
# Clean input texts
full_text = self.clean_text(full_text)
text_chunks = [self.clean_text(chunk) for chunk in text_chunks]
full_text = clean_text(full_text)
text_chunks = [clean_text(chunk) for chunk in text_chunks]
# Process cleaned texts
if doc_id is None:
@@ -682,7 +679,7 @@ class LightRAG:
contents = {id_: doc for id_, doc in zip(ids, input)}
else:
# Clean input text and remove duplicates
input = list(set(self.clean_text(doc) for doc in input))
input = list(set(clean_text(doc) for doc in input))
# Generate contents dict of MD5 hash IDs and documents
contents = {compute_mdhash_id(doc, prefix="doc-"): doc for doc in input}
@@ -698,7 +695,7 @@ class LightRAG:
new_docs: dict[str, Any] = {
id_: {
"content": content,
"content_summary": self._get_content_summary(content),
"content_summary": get_content_summary(content),
"content_length": len(content),
"status": DocStatus.PENDING,
"created_at": datetime.now().isoformat(),
@@ -1063,7 +1060,7 @@ class LightRAG:
all_chunks_data: dict[str, dict[str, str]] = {}
chunk_to_source_map: dict[str, str] = {}
for chunk_data in custom_kg.get("chunks", []):
chunk_content = self.clean_text(chunk_data["content"])
chunk_content = clean_text(chunk_data["content"])
source_id = chunk_data["source_id"]
tokens = len(
encode_string_by_tiktoken(
@@ -1296,8 +1293,17 @@ class LightRAG:
self, query: str, prompt: str, param: QueryParam = QueryParam()
):
"""
1. Extract keywords from the 'query' using new function in operate.py.
2. Then run the standard aquery() flow with the final prompt (formatted_question).
Query with separate keyword extraction step.
This method extracts keywords from the query first, then uses them for the query.
Args:
query: User query
prompt: Additional prompt for the query
param: Query parameters
Returns:
Query response
"""
loop = always_get_an_event_loop()
return loop.run_until_complete(
@@ -1308,66 +1314,29 @@ class LightRAG:
self, query: str, prompt: str, param: QueryParam = QueryParam()
) -> str | AsyncIterator[str]:
"""
1. Calls extract_keywords_only to get HL/LL keywords from 'query'.
2. Then calls kg_query(...) or naive_query(...), etc. as the main query, while also injecting the newly extracted keywords if needed.
Async version of query_with_separate_keyword_extraction.
Args:
query: User query
prompt: Additional prompt for the query
param: Query parameters
Returns:
Query response or async iterator
"""
# ---------------------
# STEP 1: Keyword Extraction
# ---------------------
hl_keywords, ll_keywords = await extract_keywords_only(
text=query,
response = await query_with_keywords(
query=query,
prompt=prompt,
param=param,
knowledge_graph_inst=self.chunk_entity_relation_graph,
entities_vdb=self.entities_vdb,
relationships_vdb=self.relationships_vdb,
chunks_vdb=self.chunks_vdb,
text_chunks_db=self.text_chunks,
global_config=asdict(self),
hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
hashing_kv=self.llm_response_cache,
)
param.hl_keywords = hl_keywords
param.ll_keywords = ll_keywords
# ---------------------
# STEP 2: Final Query Logic
# ---------------------
# Create a new string with the prompt and the keywords
ll_keywords_str = ", ".join(ll_keywords)
hl_keywords_str = ", ".join(hl_keywords)
formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}"
if param.mode in ["local", "global", "hybrid"]:
response = await kg_query_with_keywords(
formatted_question,
self.chunk_entity_relation_graph,
self.entities_vdb,
self.relationships_vdb,
self.text_chunks,
param,
asdict(self),
hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
)
elif param.mode == "naive":
response = await naive_query(
formatted_question,
self.chunks_vdb,
self.text_chunks,
param,
asdict(self),
hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
)
elif param.mode == "mix":
response = await mix_kg_vector_query(
formatted_question,
self.chunk_entity_relation_graph,
self.entities_vdb,
self.relationships_vdb,
self.chunks_vdb,
self.text_chunks,
param,
asdict(self),
hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
)
else:
raise ValueError(f"Unknown mode {param.mode}")
await self._query_done()
return response
@@ -1465,21 +1434,6 @@ class LightRAG:
]
)
def _get_content_summary(self, content: str, max_length: int = 100) -> str:
"""Get summary of document content
Args:
content: Original document content
max_length: Maximum length of summary
Returns:
Truncated content with ellipsis if needed
"""
content = content.strip()
if len(content) <= max_length:
return content
return content[:max_length] + "..."
async def get_processing_status(self) -> dict[str, int]:
"""Get current document processing status counts
@@ -1756,19 +1710,7 @@ class LightRAG:
async def get_entity_info(
self, entity_name: str, include_vector_data: bool = False
) -> dict[str, str | None | dict[str, str]]:
"""Get detailed information of an entity
Args:
entity_name: Entity name (no need for quotes)
include_vector_data: Whether to include data from the vector database
Returns:
dict: A dictionary containing entity information, including:
- entity_name: Entity name
- source_id: Source document ID
- graph_data: Complete node data from the graph database
- vector_data: (optional) Data from the vector database
"""
"""Get detailed information of an entity"""
# Get information from the graph
node_data = await self.chunk_entity_relation_graph.get_node(entity_name)
@@ -1783,29 +1725,15 @@ class LightRAG:
# Optional: Get vector database information
if include_vector_data:
entity_id = compute_mdhash_id(entity_name, prefix="ent-")
vector_data = self.entities_vdb._client.get([entity_id])
result["vector_data"] = vector_data[0] if vector_data else None
vector_data = await self.entities_vdb.get_by_id(entity_id)
result["vector_data"] = vector_data
return result
async def get_relation_info(
self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
) -> dict[str, str | None | dict[str, str]]:
"""Get detailed information of a relationship
Args:
src_entity: Source entity name (no need for quotes)
tgt_entity: Target entity name (no need for quotes)
include_vector_data: Whether to include data from the vector database
Returns:
dict: A dictionary containing relationship information, including:
- src_entity: Source entity name
- tgt_entity: Target entity name
- source_id: Source document ID
- graph_data: Complete edge data from the graph database
- vector_data: (optional) Data from the vector database
"""
"""Get detailed information of a relationship"""
# Get information from the graph
edge_data = await self.chunk_entity_relation_graph.get_edge(
@@ -1823,8 +1751,8 @@ class LightRAG:
# Optional: Get vector database information
if include_vector_data:
rel_id = compute_mdhash_id(src_entity + tgt_entity, prefix="rel-")
vector_data = self.relationships_vdb._client.get([rel_id])
result["vector_data"] = vector_data[0] if vector_data else None
vector_data = await self.relationships_vdb.get_by_id(rel_id)
result["vector_data"] = vector_data
return result
@@ -2622,6 +2550,12 @@ class LightRAG:
# 9. Delete source entities
for entity_name in source_entities:
if entity_name == target_entity:
logger.info(
f"Skipping deletion of '{entity_name}' as it's also the target entity"
)
continue
# Delete entity node from knowledge graph
await self.chunk_entity_relation_graph.delete_node(entity_name)

View File

@@ -55,6 +55,7 @@ async def azure_openai_complete_if_cache(
openai_async_client = AsyncAzureOpenAI(
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
azure_deployment=model,
api_key=os.getenv("AZURE_OPENAI_API_KEY"),
api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)
@@ -136,6 +137,7 @@ async def azure_openai_embed(
openai_async_client = AsyncAzureOpenAI(
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
azure_deployment=model,
api_key=os.getenv("AZURE_OPENAI_API_KEY"),
api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

View File

@@ -141,18 +141,36 @@ async def _handle_single_entity_extraction(
):
if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
return None
# add this record as a node in the G
# Clean and validate entity name
entity_name = clean_str(record_attributes[1]).strip('"')
if not entity_name.strip():
logger.warning(
f"Entity extraction error: empty entity name in: {record_attributes}"
)
return None
# Clean and validate entity type
entity_type = clean_str(record_attributes[2]).strip('"')
if not entity_type.strip() or entity_type.startswith('("'):
logger.warning(
f"Entity extraction error: invalid entity type in: {record_attributes}"
)
return None
# Clean and validate description
entity_description = clean_str(record_attributes[3]).strip('"')
entity_source_id = chunk_key
if not entity_description.strip():
logger.warning(
f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
)
return None
return dict(
entity_name=entity_name,
entity_type=entity_type,
description=entity_description,
source_id=entity_source_id,
source_id=chunk_key,
metadata={"created_at": time.time()},
)
@@ -438,47 +456,22 @@ async def extract_entities(
else:
return await use_llm_func(input_text)
async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
""" "Prpocess a single chunk
async def _process_extraction_result(result: str, chunk_key: str):
"""Process a single extraction result (either initial or gleaning)
Args:
chunk_key_dp (tuple[str, TextChunkSchema]):
("chunck-xxxxxx", {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int})
result (str): The extraction result to process
chunk_key (str): The chunk key for source tracking
Returns:
tuple: (nodes_dict, edges_dict) containing the extracted entities and relationships
"""
nonlocal processed_chunks
chunk_key = chunk_key_dp[0]
chunk_dp = chunk_key_dp[1]
content = chunk_dp["content"]
# hint_prompt = entity_extract_prompt.format(**context_base, input_text=content)
hint_prompt = entity_extract_prompt.format(
**context_base, input_text="{input_text}"
).format(**context_base, input_text=content)
final_result = await _user_llm_func_with_cache(hint_prompt)
history = pack_user_ass_to_openai_messages(hint_prompt, final_result)
for now_glean_index in range(entity_extract_max_gleaning):
glean_result = await _user_llm_func_with_cache(
continue_prompt, history_messages=history
)
history += pack_user_ass_to_openai_messages(continue_prompt, glean_result)
final_result += glean_result
if now_glean_index == entity_extract_max_gleaning - 1:
break
if_loop_result: str = await _user_llm_func_with_cache(
if_loop_prompt, history_messages=history
)
if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
if if_loop_result != "yes":
break
maybe_nodes = defaultdict(list)
maybe_edges = defaultdict(list)
records = split_string_by_multi_markers(
final_result,
result,
[context_base["record_delimiter"], context_base["completion_delimiter"]],
)
maybe_nodes = defaultdict(list)
maybe_edges = defaultdict(list)
for record in records:
record = re.search(r"\((.*)\)", record)
if record is None:
@@ -487,6 +480,7 @@ async def extract_entities(
record_attributes = split_string_by_multi_markers(
record, [context_base["tuple_delimiter"]]
)
if_entities = await _handle_single_entity_extraction(
record_attributes, chunk_key
)
@@ -501,6 +495,62 @@ async def extract_entities(
maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append(
if_relation
)
return maybe_nodes, maybe_edges
async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
"""Process a single chunk
Args:
chunk_key_dp (tuple[str, TextChunkSchema]):
("chunk-xxxxxx", {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int})
"""
nonlocal processed_chunks
chunk_key = chunk_key_dp[0]
chunk_dp = chunk_key_dp[1]
content = chunk_dp["content"]
# Get initial extraction
hint_prompt = entity_extract_prompt.format(
**context_base, input_text="{input_text}"
).format(**context_base, input_text=content)
final_result = await _user_llm_func_with_cache(hint_prompt)
history = pack_user_ass_to_openai_messages(hint_prompt, final_result)
# Process initial extraction
maybe_nodes, maybe_edges = await _process_extraction_result(
final_result, chunk_key
)
# Process additional gleaning results
for now_glean_index in range(entity_extract_max_gleaning):
glean_result = await _user_llm_func_with_cache(
continue_prompt, history_messages=history
)
history += pack_user_ass_to_openai_messages(continue_prompt, glean_result)
# Process gleaning result separately
glean_nodes, glean_edges = await _process_extraction_result(
glean_result, chunk_key
)
# Merge results
for entity_name, entities in glean_nodes.items():
maybe_nodes[entity_name].extend(entities)
for edge_key, edges in glean_edges.items():
maybe_edges[edge_key].extend(edges)
if now_glean_index == entity_extract_max_gleaning - 1:
break
if_loop_result: str = await _user_llm_func_with_cache(
if_loop_prompt, history_messages=history
)
if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
if if_loop_result != "yes":
break
processed_chunks += 1
entities_count = len(maybe_nodes)
relations_count = len(maybe_edges)
@@ -912,7 +962,10 @@ async def mix_kg_vector_query(
try:
# Reduce top_k for vector search in hybrid mode since we have structured information from KG
mix_topk = min(10, query_param.top_k)
results = await chunks_vdb.query(augmented_query, top_k=mix_topk)
# TODO: add ids to the query
results = await chunks_vdb.query(
augmented_query, top_k=mix_topk, ids=query_param.ids
)
if not results:
return None
@@ -1121,7 +1174,11 @@ async def _get_node_data(
logger.info(
f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}"
)
results = await entities_vdb.query(query, top_k=query_param.top_k)
results = await entities_vdb.query(
query, top_k=query_param.top_k, ids=query_param.ids
)
if not len(results):
return "", "", ""
# get entity information
@@ -1374,7 +1431,10 @@ async def _get_edge_data(
logger.info(
f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
)
results = await relationships_vdb.query(keywords, top_k=query_param.top_k)
results = await relationships_vdb.query(
keywords, top_k=query_param.top_k, ids=query_param.ids
)
if not len(results):
return "", "", ""
@@ -1623,7 +1683,9 @@ async def naive_query(
if cached_response is not None:
return cached_response
results = await chunks_vdb.query(query, top_k=query_param.top_k)
results = await chunks_vdb.query(
query, top_k=query_param.top_k, ids=query_param.ids
)
if not len(results):
return PROMPTS["fail_response"]
@@ -1854,3 +1916,90 @@ async def kg_query_with_keywords(
)
return response
async def query_with_keywords(
query: str,
prompt: str,
param: QueryParam,
knowledge_graph_inst: BaseGraphStorage,
entities_vdb: BaseVectorStorage,
relationships_vdb: BaseVectorStorage,
chunks_vdb: BaseVectorStorage,
text_chunks_db: BaseKVStorage,
global_config: dict[str, str],
hashing_kv: BaseKVStorage | None = None,
) -> str | AsyncIterator[str]:
"""
Extract keywords from the query and then use them for retrieving information.
1. Extracts high-level and low-level keywords from the query
2. Formats the query with the extracted keywords and prompt
3. Uses the appropriate query method based on param.mode
Args:
query: The user's query
prompt: Additional prompt to prepend to the query
param: Query parameters
knowledge_graph_inst: Knowledge graph storage
entities_vdb: Entities vector database
relationships_vdb: Relationships vector database
chunks_vdb: Document chunks vector database
text_chunks_db: Text chunks storage
global_config: Global configuration
hashing_kv: Cache storage
Returns:
Query response or async iterator
"""
# Extract keywords
hl_keywords, ll_keywords = await extract_keywords_only(
text=query,
param=param,
global_config=global_config,
hashing_kv=hashing_kv,
)
param.hl_keywords = hl_keywords
param.ll_keywords = ll_keywords
# Create a new string with the prompt and the keywords
ll_keywords_str = ", ".join(ll_keywords)
hl_keywords_str = ", ".join(hl_keywords)
formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}"
# Use appropriate query method based on mode
if param.mode in ["local", "global", "hybrid"]:
return await kg_query_with_keywords(
formatted_question,
knowledge_graph_inst,
entities_vdb,
relationships_vdb,
text_chunks_db,
param,
global_config,
hashing_kv=hashing_kv,
)
elif param.mode == "naive":
return await naive_query(
formatted_question,
chunks_vdb,
text_chunks_db,
param,
global_config,
hashing_kv=hashing_kv,
)
elif param.mode == "mix":
return await mix_kg_vector_query(
formatted_question,
knowledge_graph_inst,
entities_vdb,
relationships_vdb,
chunks_vdb,
text_chunks_db,
param,
global_config,
hashing_kv=hashing_kv,
)
else:
raise ValueError(f"Unknown mode {param.mode}")

View File

@@ -236,7 +236,7 @@ Given the query and conversation history, list both high-level and low-level key
---Instructions---
- Consider both the current query and relevant conversation history when extracting keywords
- Output the keywords in JSON format
- Output the keywords in JSON format, it will be parsed by a JSON parser, do not add any extra content in output
- The JSON should have two keys:
- "high_level_keywords" for overarching concepts or themes
- "low_level_keywords" for specific entities or details

View File

@@ -890,3 +890,52 @@ def lazy_external_import(module_name: str, class_name: str) -> Callable[..., Any
return cls(*args, **kwargs)
return import_class
def get_content_summary(content: str, max_length: int = 100) -> str:
"""Get summary of document content
Args:
content: Original document content
max_length: Maximum length of summary
Returns:
Truncated content with ellipsis if needed
"""
content = content.strip()
if len(content) <= max_length:
return content
return content[:max_length] + "..."
def clean_text(text: str) -> str:
"""Clean text by removing null bytes (0x00) and whitespace
Args:
text: Input text to clean
Returns:
Cleaned text
"""
return text.strip().replace("\x00", "")
def check_storage_env_vars(storage_name: str) -> None:
"""Check if all required environment variables for storage implementation exist
Args:
storage_name: Storage implementation name
Raises:
ValueError: If required environment variables are missing
"""
from lightrag.kg import STORAGE_ENV_REQUIREMENTS
required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, [])
missing_vars = [var for var in required_vars if var not in os.environ]
if missing_vars:
raise ValueError(
f"Storage implementation '{storage_name}' requires the following "
f"environment variables: {', '.join(missing_vars)}"
)