Merge branch 'main' into edit-node
This commit is contained in:
@@ -102,6 +102,10 @@ lightrag-gunicorn --workers 4
|
||||
- `--log-level`:日志级别(默认:INFO)
|
||||
- --input-dir:指定要扫描文档的目录(默认:./input)
|
||||
|
||||
> ** 要求将.env文件置于启动目录中是经过特意设计的**。 这样做的目的是支持用户同时启动多个LightRAG实例,并为不同实例配置不同的.env文件。
|
||||
|
||||
> **修改.env文件后,您需要重新打开终端以使新设置生效**。 这是因为每次启动时,LightRAG Server会将.env文件中的环境变量加载至系统环境变量,且系统环境变量的设置具有更高优先级。
|
||||
|
||||
### 启动时自动扫描
|
||||
|
||||
当使用 `--auto-scan-at-startup` 参数启动任何服务器时,系统将自动:
|
||||
|
@@ -106,6 +106,8 @@ Here are some commonly used startup parameters:
|
||||
|
||||
> The requirement for the .env file to be in the startup directory is intentionally designed this way. The purpose is to support users in launching multiple LightRAG instances simultaneously, allowing different .env files for different instances.
|
||||
|
||||
> **After changing the .env file, you need to open a new terminal to make the new settings take effect.** This because the LightRAG Server will load the environment variables from .env into the system environment variables each time it starts, and LightRAG Server will prioritize the settings in the system environment variables.
|
||||
|
||||
### Auto scan on startup
|
||||
|
||||
When starting any of the servers with the `--auto-scan-at-startup` parameter, the system will automatically:
|
||||
|
@@ -1 +1 @@
|
||||
__api_version__ = "0146"
|
||||
__api_version__ = "0148"
|
||||
|
@@ -499,7 +499,10 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
||||
content = result.document.export_to_markdown()
|
||||
else:
|
||||
if not pm.is_installed("python-docx"): # type: ignore
|
||||
pm.install("docx")
|
||||
try:
|
||||
pm.install("python-docx")
|
||||
except Exception:
|
||||
pm.install("docx")
|
||||
from docx import Document # type: ignore
|
||||
from io import BytesIO
|
||||
|
||||
|
@@ -308,7 +308,7 @@ class OllamaAPI:
|
||||
"Cache-Control": "no-cache",
|
||||
"Connection": "keep-alive",
|
||||
"Content-Type": "application/x-ndjson",
|
||||
"X-Accel-Buffering": "no", # 确保在Nginx代理时正确处理流式响应
|
||||
"X-Accel-Buffering": "no", # Ensure proper handling of streaming responses in Nginx proxy
|
||||
},
|
||||
)
|
||||
else:
|
||||
|
@@ -22,7 +22,7 @@ class QueryRequest(BaseModel):
|
||||
description="The query text",
|
||||
)
|
||||
|
||||
mode: Literal["local", "global", "hybrid", "naive", "mix"] = Field(
|
||||
mode: Literal["local", "global", "hybrid", "naive", "mix", "bypass"] = Field(
|
||||
default="hybrid",
|
||||
description="Query mode",
|
||||
)
|
||||
|
File diff suppressed because one or more lines are too long
2
lightrag/api/webui/index.html
generated
2
lightrag/api/webui/index.html
generated
@@ -8,7 +8,7 @@
|
||||
<link rel="icon" type="image/svg+xml" href="logo.png" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Lightrag</title>
|
||||
<script type="module" crossorigin src="/webui/assets/index-DSVCuARS.js"></script>
|
||||
<script type="module" crossorigin src="/webui/assets/index-CkwV8nfm.js"></script>
|
||||
<link rel="stylesheet" crossorigin href="/webui/assets/index-CTB4Vp_z.css">
|
||||
</head>
|
||||
<body>
|
||||
|
134
lightrag/base.py
134
lightrag/base.py
@@ -12,7 +12,6 @@ from typing import (
|
||||
TypeVar,
|
||||
Callable,
|
||||
)
|
||||
import numpy as np
|
||||
from .utils import EmbeddingFunc
|
||||
from .types import KnowledgeGraph
|
||||
|
||||
@@ -36,7 +35,7 @@ T = TypeVar("T")
|
||||
class QueryParam:
|
||||
"""Configuration parameters for query execution in LightRAG."""
|
||||
|
||||
mode: Literal["local", "global", "hybrid", "naive", "mix"] = "global"
|
||||
mode: Literal["local", "global", "hybrid", "naive", "mix", "bypass"] = "global"
|
||||
"""Specifies the retrieval mode:
|
||||
- "local": Focuses on context-dependent information.
|
||||
- "global": Utilizes global knowledge.
|
||||
@@ -281,63 +280,164 @@ class BaseGraphStorage(StorageNameSpace, ABC):
|
||||
|
||||
@abstractmethod
|
||||
async def has_node(self, node_id: str) -> bool:
|
||||
"""Check if an edge exists in the graph."""
|
||||
"""Check if a node exists in the graph.
|
||||
|
||||
Args:
|
||||
node_id: The ID of the node to check
|
||||
|
||||
Returns:
|
||||
True if the node exists, False otherwise
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
|
||||
"""Get the degree of a node."""
|
||||
"""Check if an edge exists between two nodes.
|
||||
|
||||
Args:
|
||||
source_node_id: The ID of the source node
|
||||
target_node_id: The ID of the target node
|
||||
|
||||
Returns:
|
||||
True if the edge exists, False otherwise
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def node_degree(self, node_id: str) -> int:
|
||||
"""Get the degree of an edge."""
|
||||
"""Get the degree (number of connected edges) of a node.
|
||||
|
||||
Args:
|
||||
node_id: The ID of the node
|
||||
|
||||
Returns:
|
||||
The number of edges connected to the node
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def edge_degree(self, src_id: str, tgt_id: str) -> int:
|
||||
"""Get a node by its id."""
|
||||
"""Get the total degree of an edge (sum of degrees of its source and target nodes).
|
||||
|
||||
Args:
|
||||
src_id: The ID of the source node
|
||||
tgt_id: The ID of the target node
|
||||
|
||||
Returns:
|
||||
The sum of the degrees of the source and target nodes
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def get_node(self, node_id: str) -> dict[str, str] | None:
|
||||
"""Get node by its label identifier, return only node properties"""
|
||||
"""Get node by its ID, returning only node properties.
|
||||
|
||||
Args:
|
||||
node_id: The ID of the node to retrieve
|
||||
|
||||
Returns:
|
||||
A dictionary of node properties if found, None otherwise
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def get_edge(
|
||||
self, source_node_id: str, target_node_id: str
|
||||
) -> dict[str, str] | None:
|
||||
"""Get edge properties between two nodes"""
|
||||
"""Get edge properties between two nodes.
|
||||
|
||||
Args:
|
||||
source_node_id: The ID of the source node
|
||||
target_node_id: The ID of the target node
|
||||
|
||||
Returns:
|
||||
A dictionary of edge properties if found, None otherwise
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def get_node_edges(self, source_node_id: str) -> list[tuple[str, str]] | None:
|
||||
"""Upsert a node into the graph."""
|
||||
"""Get all edges connected to a node.
|
||||
|
||||
Args:
|
||||
source_node_id: The ID of the node to get edges for
|
||||
|
||||
Returns:
|
||||
A list of (source_id, target_id) tuples representing edges,
|
||||
or None if the node doesn't exist
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def upsert_node(self, node_id: str, node_data: dict[str, str]) -> None:
|
||||
"""Upsert an edge into the graph."""
|
||||
"""Insert a new node or update an existing node in the graph.
|
||||
|
||||
Importance notes for in-memory storage:
|
||||
1. Changes will be persisted to disk during the next index_done_callback
|
||||
2. Only one process should updating the storage at a time before index_done_callback,
|
||||
KG-storage-log should be used to avoid data corruption
|
||||
|
||||
Args:
|
||||
node_id: The ID of the node to insert or update
|
||||
node_data: A dictionary of node properties
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def upsert_edge(
|
||||
self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
|
||||
) -> None:
|
||||
"""Insert a new edge or update an existing edge in the graph.
|
||||
|
||||
Importance notes for in-memory storage:
|
||||
1. Changes will be persisted to disk during the next index_done_callback
|
||||
2. Only one process should updating the storage at a time before index_done_callback,
|
||||
KG-storage-log should be used to avoid data corruption
|
||||
|
||||
Args:
|
||||
source_node_id: The ID of the source node
|
||||
target_node_id: The ID of the target node
|
||||
edge_data: A dictionary of edge properties
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def delete_node(self, node_id: str) -> None:
|
||||
"""Delete a node from the graph.
|
||||
|
||||
Importance notes for in-memory storage:
|
||||
1. Changes will be persisted to disk during the next index_done_callback
|
||||
2. Only one process should updating the storage at a time before index_done_callback,
|
||||
KG-storage-log should be used to avoid data corruption
|
||||
|
||||
Args:
|
||||
node_id: The ID of the node to delete
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def delete_node(self, node_id: str) -> None:
|
||||
"""Embed nodes using an algorithm."""
|
||||
async def remove_nodes(self, nodes: list[str]):
|
||||
"""Delete multiple nodes
|
||||
|
||||
Importance notes:
|
||||
1. Changes will be persisted to disk during the next index_done_callback
|
||||
2. Only one process should updating the storage at a time before index_done_callback,
|
||||
KG-storage-log should be used to avoid data corruption
|
||||
|
||||
Args:
|
||||
nodes: List of node IDs to be deleted
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def embed_nodes(
|
||||
self, algorithm: str
|
||||
) -> tuple[np.ndarray[Any, Any], list[str]]:
|
||||
"""Get all labels in the graph."""
|
||||
async def remove_edges(self, edges: list[tuple[str, str]]):
|
||||
"""Delete multiple edges
|
||||
|
||||
Importance notes:
|
||||
1. Changes will be persisted to disk during the next index_done_callback
|
||||
2. Only one process should updating the storage at a time before index_done_callback,
|
||||
KG-storage-log should be used to avoid data corruption
|
||||
|
||||
Args:
|
||||
edges: List of edges to be deleted, each edge is a (source, target) tuple
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def get_all_labels(self) -> list[str]:
|
||||
"""Get a knowledge graph of a node."""
|
||||
"""Get all labels in the graph.
|
||||
|
||||
Returns:
|
||||
A list of all node labels in the graph, sorted alphabetically
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
async def get_knowledge_graph(
|
||||
|
@@ -6,7 +6,6 @@ import sys
|
||||
from contextlib import asynccontextmanager
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, NamedTuple, Optional, Union, final
|
||||
import numpy as np
|
||||
import pipmaster as pm
|
||||
from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
|
||||
|
||||
@@ -89,11 +88,6 @@ class AGEStorage(BaseGraphStorage):
|
||||
|
||||
return None
|
||||
|
||||
def __post_init__(self):
|
||||
self._node_embed_algorithms = {
|
||||
"node2vec": self._node2vec_embed,
|
||||
}
|
||||
|
||||
async def close(self):
|
||||
if self._driver:
|
||||
await self._driver.close()
|
||||
@@ -593,9 +587,6 @@ class AGEStorage(BaseGraphStorage):
|
||||
logger.error("Error during edge upsert: {%s}", e)
|
||||
raise
|
||||
|
||||
async def _node2vec_embed(self):
|
||||
print("Implemented but never called.")
|
||||
|
||||
@asynccontextmanager
|
||||
async def _get_pool_connection(self, timeout: Optional[float] = None):
|
||||
"""Workaround for a psycopg_pool bug"""
|
||||
@@ -668,21 +659,6 @@ class AGEStorage(BaseGraphStorage):
|
||||
logger.error(f"Error during edge deletion: {str(e)}")
|
||||
raise
|
||||
|
||||
async def embed_nodes(
|
||||
self, algorithm: str
|
||||
) -> tuple[np.ndarray[Any, Any], list[str]]:
|
||||
"""Embed nodes using the specified algorithm
|
||||
|
||||
Args:
|
||||
algorithm: Name of the embedding algorithm
|
||||
|
||||
Returns:
|
||||
tuple: (embedding matrix, list of node identifiers)
|
||||
"""
|
||||
if algorithm not in self._node_embed_algorithms:
|
||||
raise ValueError(f"Node embedding algorithm {algorithm} not supported")
|
||||
return await self._node_embed_algorithms[algorithm]()
|
||||
|
||||
async def get_all_labels(self) -> list[str]:
|
||||
"""Get all node labels in the database
|
||||
|
||||
|
@@ -6,9 +6,6 @@ import pipmaster as pm
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, final
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
from tenacity import (
|
||||
retry,
|
||||
retry_if_exception_type,
|
||||
@@ -72,11 +69,6 @@ class GremlinStorage(BaseGraphStorage):
|
||||
transport_factory=lambda: AiohttpTransport(call_from_event_loop=True),
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
self._node_embed_algorithms = {
|
||||
"node2vec": self._node2vec_embed,
|
||||
}
|
||||
|
||||
async def close(self):
|
||||
if self._driver:
|
||||
self._driver.close()
|
||||
@@ -392,9 +384,6 @@ class GremlinStorage(BaseGraphStorage):
|
||||
logger.error("Error during edge upsert: {%s}", e)
|
||||
raise
|
||||
|
||||
async def _node2vec_embed(self):
|
||||
print("Implemented but never called.")
|
||||
|
||||
async def delete_node(self, node_id: str) -> None:
|
||||
"""Delete a node with the specified entity_name
|
||||
|
||||
@@ -419,27 +408,6 @@ class GremlinStorage(BaseGraphStorage):
|
||||
logger.error(f"Error during node deletion: {str(e)}")
|
||||
raise
|
||||
|
||||
async def embed_nodes(
|
||||
self, algorithm: str
|
||||
) -> tuple[np.ndarray[Any, Any], list[str]]:
|
||||
"""
|
||||
Embed nodes using the specified algorithm.
|
||||
Currently, only node2vec is supported but never called.
|
||||
|
||||
Args:
|
||||
algorithm: The name of the embedding algorithm to use
|
||||
|
||||
Returns:
|
||||
A tuple of (embeddings, node_ids)
|
||||
|
||||
Raises:
|
||||
NotImplementedError: If the specified algorithm is not supported
|
||||
ValueError: If the algorithm is not supported
|
||||
"""
|
||||
if algorithm not in self._node_embed_algorithms:
|
||||
raise ValueError(f"Node embedding algorithm {algorithm} not supported")
|
||||
return await self._node_embed_algorithms[algorithm]()
|
||||
|
||||
async def get_all_labels(self) -> list[str]:
|
||||
"""
|
||||
Get all node entity_names in the graph
|
||||
|
@@ -663,20 +663,6 @@ class MongoGraphStorage(BaseGraphStorage):
|
||||
# Remove the node doc
|
||||
await self.collection.delete_one({"_id": node_id})
|
||||
|
||||
#
|
||||
# -------------------------------------------------------------------------
|
||||
# EMBEDDINGS (NOT IMPLEMENTED)
|
||||
# -------------------------------------------------------------------------
|
||||
#
|
||||
|
||||
async def embed_nodes(
|
||||
self, algorithm: str
|
||||
) -> tuple[np.ndarray[Any, Any], list[str]]:
|
||||
"""
|
||||
Placeholder for demonstration, raises NotImplementedError.
|
||||
"""
|
||||
raise NotImplementedError("Node embedding is not used in lightrag.")
|
||||
|
||||
#
|
||||
# -------------------------------------------------------------------------
|
||||
# QUERY
|
||||
|
@@ -2,8 +2,7 @@ import inspect
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, final
|
||||
import numpy as np
|
||||
from typing import final
|
||||
import configparser
|
||||
|
||||
|
||||
@@ -51,11 +50,6 @@ class Neo4JStorage(BaseGraphStorage):
|
||||
)
|
||||
self._driver = None
|
||||
|
||||
def __post_init__(self):
|
||||
self._node_embed_algorithms = {
|
||||
"node2vec": self._node2vec_embed,
|
||||
}
|
||||
|
||||
async def initialize(self):
|
||||
URI = os.environ.get("NEO4J_URI", config.get("neo4j", "uri", fallback=None))
|
||||
USERNAME = os.environ.get(
|
||||
@@ -635,9 +629,6 @@ class Neo4JStorage(BaseGraphStorage):
|
||||
logger.error(f"Error during edge upsert: {str(e)}")
|
||||
raise
|
||||
|
||||
async def _node2vec_embed(self):
|
||||
print("Implemented but never called.")
|
||||
|
||||
async def get_knowledge_graph(
|
||||
self,
|
||||
node_label: str,
|
||||
@@ -1126,11 +1117,6 @@ class Neo4JStorage(BaseGraphStorage):
|
||||
logger.error(f"Error during edge deletion: {str(e)}")
|
||||
raise
|
||||
|
||||
async def embed_nodes(
|
||||
self, algorithm: str
|
||||
) -> tuple[np.ndarray[Any, Any], list[str]]:
|
||||
raise NotImplementedError
|
||||
|
||||
async def drop(self) -> dict[str, str]:
|
||||
"""Drop all data from storage and clean up resources
|
||||
|
||||
|
@@ -1,7 +1,6 @@
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, final
|
||||
import numpy as np
|
||||
from typing import final
|
||||
|
||||
from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
|
||||
from lightrag.utils import logger
|
||||
@@ -16,7 +15,6 @@ if not pm.is_installed("graspologic"):
|
||||
pm.install("graspologic")
|
||||
|
||||
import networkx as nx
|
||||
from graspologic import embed
|
||||
from .shared_storage import (
|
||||
get_storage_lock,
|
||||
get_update_flag,
|
||||
@@ -42,40 +40,6 @@ class NetworkXStorage(BaseGraphStorage):
|
||||
)
|
||||
nx.write_graphml(graph, file_name)
|
||||
|
||||
# TODO:deprecated, remove later
|
||||
@staticmethod
|
||||
def _stabilize_graph(graph: nx.Graph) -> nx.Graph:
|
||||
"""Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py
|
||||
Ensure an undirected graph with the same relationships will always be read the same way.
|
||||
"""
|
||||
fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph()
|
||||
|
||||
sorted_nodes = graph.nodes(data=True)
|
||||
sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0])
|
||||
|
||||
fixed_graph.add_nodes_from(sorted_nodes)
|
||||
edges = list(graph.edges(data=True))
|
||||
|
||||
if not graph.is_directed():
|
||||
|
||||
def _sort_source_target(edge):
|
||||
source, target, edge_data = edge
|
||||
if source > target:
|
||||
temp = source
|
||||
source = target
|
||||
target = temp
|
||||
return source, target, edge_data
|
||||
|
||||
edges = [_sort_source_target(edge) for edge in edges]
|
||||
|
||||
def _get_edge_key(source: Any, target: Any) -> str:
|
||||
return f"{source} -> {target}"
|
||||
|
||||
edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1]))
|
||||
|
||||
fixed_graph.add_edges_from(edges)
|
||||
return fixed_graph
|
||||
|
||||
def __post_init__(self):
|
||||
self._graphml_xml_file = os.path.join(
|
||||
self.global_config["working_dir"], f"graph_{self.namespace}.graphml"
|
||||
@@ -94,10 +58,6 @@ class NetworkXStorage(BaseGraphStorage):
|
||||
logger.info("Created new empty graph")
|
||||
self._graph = preloaded_graph or nx.Graph()
|
||||
|
||||
self._node_embed_algorithms = {
|
||||
"node2vec": self._node2vec_embed,
|
||||
}
|
||||
|
||||
async def initialize(self):
|
||||
"""Initialize storage data"""
|
||||
# Get the update flag for cross-process update notification
|
||||
@@ -191,24 +151,6 @@ class NetworkXStorage(BaseGraphStorage):
|
||||
else:
|
||||
logger.warning(f"Node {node_id} not found in the graph for deletion.")
|
||||
|
||||
# TODO: NOT USED
|
||||
async def embed_nodes(
|
||||
self, algorithm: str
|
||||
) -> tuple[np.ndarray[Any, Any], list[str]]:
|
||||
if algorithm not in self._node_embed_algorithms:
|
||||
raise ValueError(f"Node embedding algorithm {algorithm} not supported")
|
||||
return await self._node_embed_algorithms[algorithm]()
|
||||
|
||||
# TODO: NOT USED
|
||||
async def _node2vec_embed(self):
|
||||
graph = await self._get_graph()
|
||||
embeddings, nodes = embed.node2vec_embed(
|
||||
graph,
|
||||
**self.global_config["node2vec_params"],
|
||||
)
|
||||
nodes_ids = [graph.nodes[node_id]["id"] for node_id in nodes]
|
||||
return embeddings, nodes_ids
|
||||
|
||||
async def remove_nodes(self, nodes: list[str]):
|
||||
"""Delete multiple nodes
|
||||
|
||||
|
@@ -1021,9 +1021,6 @@ class PGGraphQueryException(Exception):
|
||||
class PGGraphStorage(BaseGraphStorage):
|
||||
def __post_init__(self):
|
||||
self.graph_name = self.namespace or os.environ.get("AGE_GRAPH_NAME", "lightrag")
|
||||
self._node_embed_algorithms = {
|
||||
"node2vec": self._node2vec_embed,
|
||||
}
|
||||
self.db: PostgreSQLDB | None = None
|
||||
|
||||
async def initialize(self):
|
||||
@@ -1396,9 +1393,6 @@ class PGGraphStorage(BaseGraphStorage):
|
||||
)
|
||||
raise
|
||||
|
||||
async def _node2vec_embed(self):
|
||||
print("Implemented but never called.")
|
||||
|
||||
async def delete_node(self, node_id: str) -> None:
|
||||
"""
|
||||
Delete a node from the graph.
|
||||
@@ -1485,24 +1479,6 @@ class PGGraphStorage(BaseGraphStorage):
|
||||
labels = [result["label"] for result in results]
|
||||
return labels
|
||||
|
||||
async def embed_nodes(
|
||||
self, algorithm: str
|
||||
) -> tuple[np.ndarray[Any, Any], list[str]]:
|
||||
"""
|
||||
Generate node embeddings using the specified algorithm.
|
||||
|
||||
Args:
|
||||
algorithm (str): The name of the embedding algorithm to use.
|
||||
|
||||
Returns:
|
||||
tuple[np.ndarray[Any, Any], list[str]]: A tuple containing the embeddings and the corresponding node IDs.
|
||||
"""
|
||||
if algorithm not in self._node_embed_algorithms:
|
||||
raise ValueError(f"Unsupported embedding algorithm: {algorithm}")
|
||||
|
||||
embed_func = self._node_embed_algorithms[algorithm]
|
||||
return await embed_func()
|
||||
|
||||
async def get_knowledge_graph(
|
||||
self,
|
||||
node_label: str,
|
||||
|
@@ -800,13 +800,6 @@ class TiDBGraphStorage(BaseGraphStorage):
|
||||
}
|
||||
await self.db.execute(merge_sql, data)
|
||||
|
||||
async def embed_nodes(
|
||||
self, algorithm: str
|
||||
) -> tuple[np.ndarray[Any, Any], list[str]]:
|
||||
if algorithm not in self._node_embed_algorithms:
|
||||
raise ValueError(f"Node embedding algorithm {algorithm} not supported")
|
||||
return await self._node_embed_algorithms[algorithm]()
|
||||
|
||||
# Query
|
||||
|
||||
async def has_node(self, node_id: str) -> bool:
|
||||
|
@@ -155,31 +155,6 @@ class LightRAG:
|
||||
Defaults to `chunking_by_token_size` if not specified.
|
||||
"""
|
||||
|
||||
# Node embedding
|
||||
# ---
|
||||
|
||||
node_embedding_algorithm: str = field(default="node2vec")
|
||||
"""Algorithm used for node embedding in knowledge graphs."""
|
||||
|
||||
node2vec_params: dict[str, int] = field(
|
||||
default_factory=lambda: {
|
||||
"dimensions": 1536,
|
||||
"num_walks": 10,
|
||||
"walk_length": 40,
|
||||
"window_size": 2,
|
||||
"iterations": 3,
|
||||
"random_seed": 3,
|
||||
}
|
||||
)
|
||||
"""Configuration for the node2vec embedding algorithm:
|
||||
- dimensions: Number of dimensions for embeddings.
|
||||
- num_walks: Number of random walks per node.
|
||||
- walk_length: Number of steps per random walk.
|
||||
- window_size: Context window size for training.
|
||||
- iterations: Number of iterations for training.
|
||||
- random_seed: Seed value for reproducibility.
|
||||
"""
|
||||
|
||||
# Embedding
|
||||
# ---
|
||||
|
||||
@@ -904,8 +879,10 @@ class LightRAG:
|
||||
|
||||
async with pipeline_status_lock:
|
||||
log_message = f"Processing file: {file_path}"
|
||||
logger.info(log_message)
|
||||
pipeline_status["history_messages"].append(log_message)
|
||||
log_message = f"Processing d-id: {doc_id}"
|
||||
logger.info(log_message)
|
||||
pipeline_status["latest_message"] = log_message
|
||||
pipeline_status["history_messages"].append(log_message)
|
||||
|
||||
@@ -1381,6 +1358,16 @@ class LightRAG:
|
||||
hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
|
||||
system_prompt=system_prompt,
|
||||
)
|
||||
elif param.mode == "bypass":
|
||||
# Bypass mode: directly use LLM without knowledge retrieval
|
||||
use_llm_func = param.model_func or global_config["llm_model_func"]
|
||||
param.stream = True if param.stream is None else param.stream
|
||||
response = await use_llm_func(
|
||||
query.strip(),
|
||||
system_prompt=system_prompt,
|
||||
history_messages=param.conversation_history,
|
||||
stream=param.stream,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unknown mode {param.mode}")
|
||||
await self._query_done()
|
||||
|
@@ -16,6 +16,7 @@ from .utils import (
|
||||
encode_string_by_tiktoken,
|
||||
is_float_regex,
|
||||
list_of_list_to_csv,
|
||||
normalize_extracted_info,
|
||||
pack_user_ass_to_openai_messages,
|
||||
split_string_by_multi_markers,
|
||||
truncate_list_by_token_size,
|
||||
@@ -163,6 +164,9 @@ async def _handle_single_entity_extraction(
|
||||
)
|
||||
return None
|
||||
|
||||
# Normalize entity name
|
||||
entity_name = normalize_extracted_info(entity_name, is_entity=True)
|
||||
|
||||
# Clean and validate entity type
|
||||
entity_type = clean_str(record_attributes[2]).strip('"')
|
||||
if not entity_type.strip() or entity_type.startswith('("'):
|
||||
@@ -172,7 +176,9 @@ async def _handle_single_entity_extraction(
|
||||
return None
|
||||
|
||||
# Clean and validate description
|
||||
entity_description = clean_str(record_attributes[3]).strip('"')
|
||||
entity_description = clean_str(record_attributes[3])
|
||||
entity_description = normalize_extracted_info(entity_description)
|
||||
|
||||
if not entity_description.strip():
|
||||
logger.warning(
|
||||
f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
|
||||
@@ -196,13 +202,20 @@ async def _handle_single_relationship_extraction(
|
||||
if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
|
||||
return None
|
||||
# add this record as edge
|
||||
source = clean_str(record_attributes[1]).strip('"')
|
||||
target = clean_str(record_attributes[2]).strip('"')
|
||||
edge_description = clean_str(record_attributes[3]).strip('"')
|
||||
edge_keywords = clean_str(record_attributes[4]).strip('"')
|
||||
source = clean_str(record_attributes[1])
|
||||
target = clean_str(record_attributes[2])
|
||||
|
||||
# Normalize source and target entity names
|
||||
source = normalize_extracted_info(source, is_entity=True)
|
||||
target = normalize_extracted_info(target, is_entity=True)
|
||||
|
||||
edge_description = clean_str(record_attributes[3])
|
||||
edge_description = normalize_extracted_info(edge_description)
|
||||
|
||||
edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'")
|
||||
edge_source_id = chunk_key
|
||||
weight = (
|
||||
float(record_attributes[-1].strip('"'))
|
||||
float(record_attributes[-1].strip('"').strip("'"))
|
||||
if is_float_regex(record_attributes[-1])
|
||||
else 1.0
|
||||
)
|
||||
@@ -642,7 +655,7 @@ async def extract_entities(
|
||||
processed_chunks += 1
|
||||
entities_count = len(maybe_nodes)
|
||||
relations_count = len(maybe_edges)
|
||||
log_message = f"Chk {processed_chunks}/{total_chunks}: extracted {entities_count} Ent + {relations_count} Rel (deduplicated)"
|
||||
log_message = f"Chk {processed_chunks}/{total_chunks}: extracted {entities_count} Ent + {relations_count} Rel"
|
||||
logger.info(log_message)
|
||||
if pipeline_status is not None:
|
||||
async with pipeline_status_lock:
|
||||
|
@@ -1006,6 +1006,50 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
|
||||
return content[:max_length] + "..."
|
||||
|
||||
|
||||
def normalize_extracted_info(name: str, is_entity=False) -> str:
|
||||
"""Normalize entity/relation names and description with the following rules:
|
||||
1. Remove spaces between Chinese characters
|
||||
2. Remove spaces between Chinese characters and English letters/numbers
|
||||
3. Preserve spaces within English text and numbers
|
||||
4. Replace Chinese parentheses with English parentheses
|
||||
5. Replace Chinese dash with English dash
|
||||
|
||||
Args:
|
||||
name: Entity name to normalize
|
||||
|
||||
Returns:
|
||||
Normalized entity name
|
||||
"""
|
||||
# Replace Chinese parentheses with English parentheses
|
||||
name = name.replace("(", "(").replace(")", ")")
|
||||
|
||||
# Replace Chinese dash with English dash
|
||||
name = name.replace("—", "-").replace("-", "-")
|
||||
|
||||
# Use regex to remove spaces between Chinese characters
|
||||
# Regex explanation:
|
||||
# (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character
|
||||
# \s+: One or more whitespace characters
|
||||
# (?=[\u4e00-\u9fa5]): Positive lookahead for Chinese character
|
||||
name = re.sub(r"(?<=[\u4e00-\u9fa5])\s+(?=[\u4e00-\u9fa5])", "", name)
|
||||
|
||||
# Remove spaces between Chinese and English/numbers
|
||||
name = re.sub(r"(?<=[\u4e00-\u9fa5])\s+(?=[a-zA-Z0-9])", "", name)
|
||||
name = re.sub(r"(?<=[a-zA-Z0-9])\s+(?=[\u4e00-\u9fa5])", "", name)
|
||||
|
||||
# Remove English quotation marks from the beginning and end
|
||||
name = name.strip('"').strip("'")
|
||||
|
||||
if is_entity:
|
||||
# remove Chinese quotes
|
||||
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
|
||||
# remove English queotes in and around chinese
|
||||
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
|
||||
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
|
||||
|
||||
return name
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
|
||||
"""Clean text by removing null bytes (0x00) and whitespace
|
||||
|
||||
|
@@ -65,8 +65,9 @@ export type LightragDocumentsScanProgress = {
|
||||
* - "global": Utilizes global knowledge.
|
||||
* - "hybrid": Combines local and global retrieval methods.
|
||||
* - "mix": Integrates knowledge graph and vector retrieval.
|
||||
* - "bypass": Bypasses knowledge retrieval and directly uses the LLM.
|
||||
*/
|
||||
export type QueryMode = 'naive' | 'local' | 'global' | 'hybrid' | 'mix'
|
||||
export type QueryMode = 'naive' | 'local' | 'global' | 'hybrid' | 'mix' | 'bypass'
|
||||
|
||||
export type Message = {
|
||||
role: 'user' | 'assistant' | 'system'
|
||||
|
@@ -55,6 +55,7 @@ export default function QuerySettings() {
|
||||
<SelectItem value="global">{t('retrievePanel.querySettings.queryModeOptions.global')}</SelectItem>
|
||||
<SelectItem value="hybrid">{t('retrievePanel.querySettings.queryModeOptions.hybrid')}</SelectItem>
|
||||
<SelectItem value="mix">{t('retrievePanel.querySettings.queryModeOptions.mix')}</SelectItem>
|
||||
<SelectItem value="bypass">{t('retrievePanel.querySettings.queryModeOptions.bypass')}</SelectItem>
|
||||
</SelectGroup>
|
||||
</SelectContent>
|
||||
</Select>
|
||||
|
@@ -306,13 +306,14 @@
|
||||
"parametersTitle": "المعلمات",
|
||||
"parametersDescription": "تكوين معلمات الاستعلام الخاص بك",
|
||||
"queryMode": "وضع الاستعلام",
|
||||
"queryModeTooltip": "حدد استراتيجية الاسترجاع:\n• ساذج: بحث أساسي بدون تقنيات متقدمة\n• محلي: استرجاع معلومات يعتمد على السياق\n• عالمي: يستخدم قاعدة المعرفة العالمية\n• مختلط: يجمع بين الاسترجاع المحلي والعالمي\n• مزيج: يدمج شبكة المعرفة مع الاسترجاع المتجهي",
|
||||
"queryModeTooltip": "حدد استراتيجية الاسترجاع:\n• ساذج: بحث أساسي بدون تقنيات متقدمة\n• محلي: استرجاع معلومات يعتمد على السياق\n• عالمي: يستخدم قاعدة المعرفة العالمية\n• مختلط: يجمع بين الاسترجاع المحلي والعالمي\n• مزيج: يدمج شبكة المعرفة مع الاسترجاع المتجهي\n• تجاوز: يمرر الاستعلام مباشرة إلى LLM بدون استرجاع",
|
||||
"queryModeOptions": {
|
||||
"naive": "ساذج",
|
||||
"local": "محلي",
|
||||
"global": "عالمي",
|
||||
"hybrid": "مختلط",
|
||||
"mix": "مزيج"
|
||||
"mix": "مزيج",
|
||||
"bypass": "تجاوز"
|
||||
},
|
||||
"responseFormat": "تنسيق الرد",
|
||||
"responseFormatTooltip": "يحدد تنسيق الرد. أمثلة:\n• فقرات متعددة\n• فقرة واحدة\n• نقاط نقطية",
|
||||
|
@@ -305,13 +305,14 @@
|
||||
"parametersTitle": "Parameters",
|
||||
"parametersDescription": "Configure your query parameters",
|
||||
"queryMode": "Query Mode",
|
||||
"queryModeTooltip": "Select the retrieval strategy:\n• Naive: Basic search without advanced techniques\n• Local: Context-dependent information retrieval\n• Global: Utilizes global knowledge base\n• Hybrid: Combines local and global retrieval\n• Mix: Integrates knowledge graph with vector retrieval",
|
||||
"queryModeTooltip": "Select the retrieval strategy:\n• Naive: Basic search without advanced techniques\n• Local: Context-dependent information retrieval\n• Global: Utilizes global knowledge base\n• Hybrid: Combines local and global retrieval\n• Mix: Integrates knowledge graph with vector retrieval\n• Bypass: Passes query directly to LLM without retrieval",
|
||||
"queryModeOptions": {
|
||||
"naive": "Naive",
|
||||
"local": "Local",
|
||||
"global": "Global",
|
||||
"hybrid": "Hybrid",
|
||||
"mix": "Mix"
|
||||
"mix": "Mix",
|
||||
"bypass": "Bypass"
|
||||
},
|
||||
"responseFormat": "Response Format",
|
||||
"responseFormatTooltip": "Defines the response format. Examples:\n• Multiple Paragraphs\n• Single Paragraph\n• Bullet Points",
|
||||
|
@@ -306,13 +306,14 @@
|
||||
"parametersTitle": "Paramètres",
|
||||
"parametersDescription": "Configurez vos paramètres de requête",
|
||||
"queryMode": "Mode de requête",
|
||||
"queryModeTooltip": "Sélectionnez la stratégie de récupération :\n• Naïf : Recherche de base sans techniques avancées\n• Local : Récupération d'informations dépendante du contexte\n• Global : Utilise une base de connaissances globale\n• Hybride : Combine récupération locale et globale\n• Mixte : Intègre le graphe de connaissances avec la récupération vectorielle",
|
||||
"queryModeTooltip": "Sélectionnez la stratégie de récupération :\n• Naïf : Recherche de base sans techniques avancées\n• Local : Récupération d'informations dépendante du contexte\n• Global : Utilise une base de connaissances globale\n• Hybride : Combine récupération locale et globale\n• Mixte : Intègre le graphe de connaissances avec la récupération vectorielle\n• Bypass : Transmet directement la requête au LLM sans récupération",
|
||||
"queryModeOptions": {
|
||||
"naive": "Naïf",
|
||||
"local": "Local",
|
||||
"global": "Global",
|
||||
"hybrid": "Hybride",
|
||||
"mix": "Mixte"
|
||||
"mix": "Mixte",
|
||||
"bypass": "Bypass"
|
||||
},
|
||||
"responseFormat": "Format de réponse",
|
||||
"responseFormatTooltip": "Définit le format de la réponse. Exemples :\n• Plusieurs paragraphes\n• Paragraphe unique\n• Points à puces",
|
||||
|
@@ -306,13 +306,14 @@
|
||||
"parametersTitle": "参数",
|
||||
"parametersDescription": "配置查询参数",
|
||||
"queryMode": "查询模式",
|
||||
"queryModeTooltip": "选择检索策略:\n• Naive:基础搜索,无高级技术\n• Local:上下文相关信息检索\n• Global:利用全局知识库\n• Hybrid:结合本地和全局检索\n• Mix:整合知识图谱和向量检索",
|
||||
"queryModeTooltip": "选择检索策略:\n• Naive:基础搜索,无高级技术\n• Local:上下文相关信息检索\n• Global:利用全局知识库\n• Hybrid:结合本地和全局检索\n• Mix:整合知识图谱和向量检索\n• Bypass:直接传递查询到LLM,不进行检索",
|
||||
"queryModeOptions": {
|
||||
"naive": "Naive",
|
||||
"local": "Local",
|
||||
"global": "Global",
|
||||
"hybrid": "Hybrid",
|
||||
"mix": "Mix"
|
||||
"mix": "Mix",
|
||||
"bypass": "Bypass"
|
||||
},
|
||||
"responseFormat": "响应格式",
|
||||
"responseFormatTooltip": "定义响应格式。例如:\n• 多段落\n• 单段落\n• 要点",
|
||||
|
Reference in New Issue
Block a user