Merge branch 'main' into edit-node

This commit is contained in:
choizhang
2025-04-13 11:51:55 +08:00
25 changed files with 241 additions and 251 deletions

View File

@@ -102,6 +102,10 @@ lightrag-gunicorn --workers 4
- `--log-level`日志级别默认INFO - `--log-level`日志级别默认INFO
- --input-dir指定要扫描文档的目录默认./input - --input-dir指定要扫描文档的目录默认./input
> ** 要求将.env文件置于启动目录中是经过特意设计的**。 这样做的目的是支持用户同时启动多个LightRAG实例并为不同实例配置不同的.env文件。
> **修改.env文件后您需要重新打开终端以使新设置生效**。 这是因为每次启动时LightRAG Server会将.env文件中的环境变量加载至系统环境变量且系统环境变量的设置具有更高优先级。
### 启动时自动扫描 ### 启动时自动扫描
当使用 `--auto-scan-at-startup` 参数启动任何服务器时,系统将自动: 当使用 `--auto-scan-at-startup` 参数启动任何服务器时,系统将自动:

View File

@@ -106,6 +106,8 @@ Here are some commonly used startup parameters:
> The requirement for the .env file to be in the startup directory is intentionally designed this way. The purpose is to support users in launching multiple LightRAG instances simultaneously, allowing different .env files for different instances. > The requirement for the .env file to be in the startup directory is intentionally designed this way. The purpose is to support users in launching multiple LightRAG instances simultaneously, allowing different .env files for different instances.
> **After changing the .env file, you need to open a new terminal to make the new settings take effect.** This because the LightRAG Server will load the environment variables from .env into the system environment variables each time it starts, and LightRAG Server will prioritize the settings in the system environment variables.
### Auto scan on startup ### Auto scan on startup
When starting any of the servers with the `--auto-scan-at-startup` parameter, the system will automatically: When starting any of the servers with the `--auto-scan-at-startup` parameter, the system will automatically:

View File

@@ -1 +1 @@
__api_version__ = "0146" __api_version__ = "0148"

View File

@@ -499,6 +499,9 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
content = result.document.export_to_markdown() content = result.document.export_to_markdown()
else: else:
if not pm.is_installed("python-docx"): # type: ignore if not pm.is_installed("python-docx"): # type: ignore
try:
pm.install("python-docx")
except Exception:
pm.install("docx") pm.install("docx")
from docx import Document # type: ignore from docx import Document # type: ignore
from io import BytesIO from io import BytesIO

View File

@@ -308,7 +308,7 @@ class OllamaAPI:
"Cache-Control": "no-cache", "Cache-Control": "no-cache",
"Connection": "keep-alive", "Connection": "keep-alive",
"Content-Type": "application/x-ndjson", "Content-Type": "application/x-ndjson",
"X-Accel-Buffering": "no", # 确保在Nginx代理时正确处理流式响应 "X-Accel-Buffering": "no", # Ensure proper handling of streaming responses in Nginx proxy
}, },
) )
else: else:

View File

@@ -22,7 +22,7 @@ class QueryRequest(BaseModel):
description="The query text", description="The query text",
) )
mode: Literal["local", "global", "hybrid", "naive", "mix"] = Field( mode: Literal["local", "global", "hybrid", "naive", "mix", "bypass"] = Field(
default="hybrid", default="hybrid",
description="Query mode", description="Query mode",
) )

File diff suppressed because one or more lines are too long

View File

@@ -8,7 +8,7 @@
<link rel="icon" type="image/svg+xml" href="logo.png" /> <link rel="icon" type="image/svg+xml" href="logo.png" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Lightrag</title> <title>Lightrag</title>
<script type="module" crossorigin src="/webui/assets/index-DSVCuARS.js"></script> <script type="module" crossorigin src="/webui/assets/index-CkwV8nfm.js"></script>
<link rel="stylesheet" crossorigin href="/webui/assets/index-CTB4Vp_z.css"> <link rel="stylesheet" crossorigin href="/webui/assets/index-CTB4Vp_z.css">
</head> </head>
<body> <body>

View File

@@ -12,7 +12,6 @@ from typing import (
TypeVar, TypeVar,
Callable, Callable,
) )
import numpy as np
from .utils import EmbeddingFunc from .utils import EmbeddingFunc
from .types import KnowledgeGraph from .types import KnowledgeGraph
@@ -36,7 +35,7 @@ T = TypeVar("T")
class QueryParam: class QueryParam:
"""Configuration parameters for query execution in LightRAG.""" """Configuration parameters for query execution in LightRAG."""
mode: Literal["local", "global", "hybrid", "naive", "mix"] = "global" mode: Literal["local", "global", "hybrid", "naive", "mix", "bypass"] = "global"
"""Specifies the retrieval mode: """Specifies the retrieval mode:
- "local": Focuses on context-dependent information. - "local": Focuses on context-dependent information.
- "global": Utilizes global knowledge. - "global": Utilizes global knowledge.
@@ -281,63 +280,164 @@ class BaseGraphStorage(StorageNameSpace, ABC):
@abstractmethod @abstractmethod
async def has_node(self, node_id: str) -> bool: async def has_node(self, node_id: str) -> bool:
"""Check if an edge exists in the graph.""" """Check if a node exists in the graph.
Args:
node_id: The ID of the node to check
Returns:
True if the node exists, False otherwise
"""
@abstractmethod @abstractmethod
async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
"""Get the degree of a node.""" """Check if an edge exists between two nodes.
Args:
source_node_id: The ID of the source node
target_node_id: The ID of the target node
Returns:
True if the edge exists, False otherwise
"""
@abstractmethod @abstractmethod
async def node_degree(self, node_id: str) -> int: async def node_degree(self, node_id: str) -> int:
"""Get the degree of an edge.""" """Get the degree (number of connected edges) of a node.
Args:
node_id: The ID of the node
Returns:
The number of edges connected to the node
"""
@abstractmethod @abstractmethod
async def edge_degree(self, src_id: str, tgt_id: str) -> int: async def edge_degree(self, src_id: str, tgt_id: str) -> int:
"""Get a node by its id.""" """Get the total degree of an edge (sum of degrees of its source and target nodes).
Args:
src_id: The ID of the source node
tgt_id: The ID of the target node
Returns:
The sum of the degrees of the source and target nodes
"""
@abstractmethod @abstractmethod
async def get_node(self, node_id: str) -> dict[str, str] | None: async def get_node(self, node_id: str) -> dict[str, str] | None:
"""Get node by its label identifier, return only node properties""" """Get node by its ID, returning only node properties.
Args:
node_id: The ID of the node to retrieve
Returns:
A dictionary of node properties if found, None otherwise
"""
@abstractmethod @abstractmethod
async def get_edge( async def get_edge(
self, source_node_id: str, target_node_id: str self, source_node_id: str, target_node_id: str
) -> dict[str, str] | None: ) -> dict[str, str] | None:
"""Get edge properties between two nodes""" """Get edge properties between two nodes.
Args:
source_node_id: The ID of the source node
target_node_id: The ID of the target node
Returns:
A dictionary of edge properties if found, None otherwise
"""
@abstractmethod @abstractmethod
async def get_node_edges(self, source_node_id: str) -> list[tuple[str, str]] | None: async def get_node_edges(self, source_node_id: str) -> list[tuple[str, str]] | None:
"""Upsert a node into the graph.""" """Get all edges connected to a node.
Args:
source_node_id: The ID of the node to get edges for
Returns:
A list of (source_id, target_id) tuples representing edges,
or None if the node doesn't exist
"""
@abstractmethod @abstractmethod
async def upsert_node(self, node_id: str, node_data: dict[str, str]) -> None: async def upsert_node(self, node_id: str, node_data: dict[str, str]) -> None:
"""Upsert an edge into the graph.""" """Insert a new node or update an existing node in the graph.
Importance notes for in-memory storage:
1. Changes will be persisted to disk during the next index_done_callback
2. Only one process should updating the storage at a time before index_done_callback,
KG-storage-log should be used to avoid data corruption
Args:
node_id: The ID of the node to insert or update
node_data: A dictionary of node properties
"""
@abstractmethod @abstractmethod
async def upsert_edge( async def upsert_edge(
self, source_node_id: str, target_node_id: str, edge_data: dict[str, str] self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
) -> None: ) -> None:
"""Insert a new edge or update an existing edge in the graph.
Importance notes for in-memory storage:
1. Changes will be persisted to disk during the next index_done_callback
2. Only one process should updating the storage at a time before index_done_callback,
KG-storage-log should be used to avoid data corruption
Args:
source_node_id: The ID of the source node
target_node_id: The ID of the target node
edge_data: A dictionary of edge properties
"""
@abstractmethod
async def delete_node(self, node_id: str) -> None:
"""Delete a node from the graph. """Delete a node from the graph.
Importance notes for in-memory storage: Importance notes for in-memory storage:
1. Changes will be persisted to disk during the next index_done_callback 1. Changes will be persisted to disk during the next index_done_callback
2. Only one process should updating the storage at a time before index_done_callback, 2. Only one process should updating the storage at a time before index_done_callback,
KG-storage-log should be used to avoid data corruption KG-storage-log should be used to avoid data corruption
Args:
node_id: The ID of the node to delete
""" """
@abstractmethod @abstractmethod
async def delete_node(self, node_id: str) -> None: async def remove_nodes(self, nodes: list[str]):
"""Embed nodes using an algorithm.""" """Delete multiple nodes
Importance notes:
1. Changes will be persisted to disk during the next index_done_callback
2. Only one process should updating the storage at a time before index_done_callback,
KG-storage-log should be used to avoid data corruption
Args:
nodes: List of node IDs to be deleted
"""
@abstractmethod @abstractmethod
async def embed_nodes( async def remove_edges(self, edges: list[tuple[str, str]]):
self, algorithm: str """Delete multiple edges
) -> tuple[np.ndarray[Any, Any], list[str]]:
"""Get all labels in the graph.""" Importance notes:
1. Changes will be persisted to disk during the next index_done_callback
2. Only one process should updating the storage at a time before index_done_callback,
KG-storage-log should be used to avoid data corruption
Args:
edges: List of edges to be deleted, each edge is a (source, target) tuple
"""
@abstractmethod @abstractmethod
async def get_all_labels(self) -> list[str]: async def get_all_labels(self) -> list[str]:
"""Get a knowledge graph of a node.""" """Get all labels in the graph.
Returns:
A list of all node labels in the graph, sorted alphabetically
"""
@abstractmethod @abstractmethod
async def get_knowledge_graph( async def get_knowledge_graph(

View File

@@ -6,7 +6,6 @@ import sys
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Dict, List, NamedTuple, Optional, Union, final from typing import Any, Dict, List, NamedTuple, Optional, Union, final
import numpy as np
import pipmaster as pm import pipmaster as pm
from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
@@ -89,11 +88,6 @@ class AGEStorage(BaseGraphStorage):
return None return None
def __post_init__(self):
self._node_embed_algorithms = {
"node2vec": self._node2vec_embed,
}
async def close(self): async def close(self):
if self._driver: if self._driver:
await self._driver.close() await self._driver.close()
@@ -593,9 +587,6 @@ class AGEStorage(BaseGraphStorage):
logger.error("Error during edge upsert: {%s}", e) logger.error("Error during edge upsert: {%s}", e)
raise raise
async def _node2vec_embed(self):
print("Implemented but never called.")
@asynccontextmanager @asynccontextmanager
async def _get_pool_connection(self, timeout: Optional[float] = None): async def _get_pool_connection(self, timeout: Optional[float] = None):
"""Workaround for a psycopg_pool bug""" """Workaround for a psycopg_pool bug"""
@@ -668,21 +659,6 @@ class AGEStorage(BaseGraphStorage):
logger.error(f"Error during edge deletion: {str(e)}") logger.error(f"Error during edge deletion: {str(e)}")
raise raise
async def embed_nodes(
self, algorithm: str
) -> tuple[np.ndarray[Any, Any], list[str]]:
"""Embed nodes using the specified algorithm
Args:
algorithm: Name of the embedding algorithm
Returns:
tuple: (embedding matrix, list of node identifiers)
"""
if algorithm not in self._node_embed_algorithms:
raise ValueError(f"Node embedding algorithm {algorithm} not supported")
return await self._node_embed_algorithms[algorithm]()
async def get_all_labels(self) -> list[str]: async def get_all_labels(self) -> list[str]:
"""Get all node labels in the database """Get all node labels in the database

View File

@@ -6,9 +6,6 @@ import pipmaster as pm
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Dict, List, final from typing import Any, Dict, List, final
import numpy as np
from tenacity import ( from tenacity import (
retry, retry,
retry_if_exception_type, retry_if_exception_type,
@@ -72,11 +69,6 @@ class GremlinStorage(BaseGraphStorage):
transport_factory=lambda: AiohttpTransport(call_from_event_loop=True), transport_factory=lambda: AiohttpTransport(call_from_event_loop=True),
) )
def __post_init__(self):
self._node_embed_algorithms = {
"node2vec": self._node2vec_embed,
}
async def close(self): async def close(self):
if self._driver: if self._driver:
self._driver.close() self._driver.close()
@@ -392,9 +384,6 @@ class GremlinStorage(BaseGraphStorage):
logger.error("Error during edge upsert: {%s}", e) logger.error("Error during edge upsert: {%s}", e)
raise raise
async def _node2vec_embed(self):
print("Implemented but never called.")
async def delete_node(self, node_id: str) -> None: async def delete_node(self, node_id: str) -> None:
"""Delete a node with the specified entity_name """Delete a node with the specified entity_name
@@ -419,27 +408,6 @@ class GremlinStorage(BaseGraphStorage):
logger.error(f"Error during node deletion: {str(e)}") logger.error(f"Error during node deletion: {str(e)}")
raise raise
async def embed_nodes(
self, algorithm: str
) -> tuple[np.ndarray[Any, Any], list[str]]:
"""
Embed nodes using the specified algorithm.
Currently, only node2vec is supported but never called.
Args:
algorithm: The name of the embedding algorithm to use
Returns:
A tuple of (embeddings, node_ids)
Raises:
NotImplementedError: If the specified algorithm is not supported
ValueError: If the algorithm is not supported
"""
if algorithm not in self._node_embed_algorithms:
raise ValueError(f"Node embedding algorithm {algorithm} not supported")
return await self._node_embed_algorithms[algorithm]()
async def get_all_labels(self) -> list[str]: async def get_all_labels(self) -> list[str]:
""" """
Get all node entity_names in the graph Get all node entity_names in the graph

View File

@@ -663,20 +663,6 @@ class MongoGraphStorage(BaseGraphStorage):
# Remove the node doc # Remove the node doc
await self.collection.delete_one({"_id": node_id}) await self.collection.delete_one({"_id": node_id})
#
# -------------------------------------------------------------------------
# EMBEDDINGS (NOT IMPLEMENTED)
# -------------------------------------------------------------------------
#
async def embed_nodes(
self, algorithm: str
) -> tuple[np.ndarray[Any, Any], list[str]]:
"""
Placeholder for demonstration, raises NotImplementedError.
"""
raise NotImplementedError("Node embedding is not used in lightrag.")
# #
# ------------------------------------------------------------------------- # -------------------------------------------------------------------------
# QUERY # QUERY

View File

@@ -2,8 +2,7 @@ import inspect
import os import os
import re import re
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, final from typing import final
import numpy as np
import configparser import configparser
@@ -51,11 +50,6 @@ class Neo4JStorage(BaseGraphStorage):
) )
self._driver = None self._driver = None
def __post_init__(self):
self._node_embed_algorithms = {
"node2vec": self._node2vec_embed,
}
async def initialize(self): async def initialize(self):
URI = os.environ.get("NEO4J_URI", config.get("neo4j", "uri", fallback=None)) URI = os.environ.get("NEO4J_URI", config.get("neo4j", "uri", fallback=None))
USERNAME = os.environ.get( USERNAME = os.environ.get(
@@ -635,9 +629,6 @@ class Neo4JStorage(BaseGraphStorage):
logger.error(f"Error during edge upsert: {str(e)}") logger.error(f"Error during edge upsert: {str(e)}")
raise raise
async def _node2vec_embed(self):
print("Implemented but never called.")
async def get_knowledge_graph( async def get_knowledge_graph(
self, self,
node_label: str, node_label: str,
@@ -1126,11 +1117,6 @@ class Neo4JStorage(BaseGraphStorage):
logger.error(f"Error during edge deletion: {str(e)}") logger.error(f"Error during edge deletion: {str(e)}")
raise raise
async def embed_nodes(
self, algorithm: str
) -> tuple[np.ndarray[Any, Any], list[str]]:
raise NotImplementedError
async def drop(self) -> dict[str, str]: async def drop(self) -> dict[str, str]:
"""Drop all data from storage and clean up resources """Drop all data from storage and clean up resources

View File

@@ -1,7 +1,6 @@
import os import os
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, final from typing import final
import numpy as np
from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
from lightrag.utils import logger from lightrag.utils import logger
@@ -16,7 +15,6 @@ if not pm.is_installed("graspologic"):
pm.install("graspologic") pm.install("graspologic")
import networkx as nx import networkx as nx
from graspologic import embed
from .shared_storage import ( from .shared_storage import (
get_storage_lock, get_storage_lock,
get_update_flag, get_update_flag,
@@ -42,40 +40,6 @@ class NetworkXStorage(BaseGraphStorage):
) )
nx.write_graphml(graph, file_name) nx.write_graphml(graph, file_name)
# TODOdeprecated, remove later
@staticmethod
def _stabilize_graph(graph: nx.Graph) -> nx.Graph:
"""Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py
Ensure an undirected graph with the same relationships will always be read the same way.
"""
fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph()
sorted_nodes = graph.nodes(data=True)
sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0])
fixed_graph.add_nodes_from(sorted_nodes)
edges = list(graph.edges(data=True))
if not graph.is_directed():
def _sort_source_target(edge):
source, target, edge_data = edge
if source > target:
temp = source
source = target
target = temp
return source, target, edge_data
edges = [_sort_source_target(edge) for edge in edges]
def _get_edge_key(source: Any, target: Any) -> str:
return f"{source} -> {target}"
edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1]))
fixed_graph.add_edges_from(edges)
return fixed_graph
def __post_init__(self): def __post_init__(self):
self._graphml_xml_file = os.path.join( self._graphml_xml_file = os.path.join(
self.global_config["working_dir"], f"graph_{self.namespace}.graphml" self.global_config["working_dir"], f"graph_{self.namespace}.graphml"
@@ -94,10 +58,6 @@ class NetworkXStorage(BaseGraphStorage):
logger.info("Created new empty graph") logger.info("Created new empty graph")
self._graph = preloaded_graph or nx.Graph() self._graph = preloaded_graph or nx.Graph()
self._node_embed_algorithms = {
"node2vec": self._node2vec_embed,
}
async def initialize(self): async def initialize(self):
"""Initialize storage data""" """Initialize storage data"""
# Get the update flag for cross-process update notification # Get the update flag for cross-process update notification
@@ -191,24 +151,6 @@ class NetworkXStorage(BaseGraphStorage):
else: else:
logger.warning(f"Node {node_id} not found in the graph for deletion.") logger.warning(f"Node {node_id} not found in the graph for deletion.")
# TODO: NOT USED
async def embed_nodes(
self, algorithm: str
) -> tuple[np.ndarray[Any, Any], list[str]]:
if algorithm not in self._node_embed_algorithms:
raise ValueError(f"Node embedding algorithm {algorithm} not supported")
return await self._node_embed_algorithms[algorithm]()
# TODO: NOT USED
async def _node2vec_embed(self):
graph = await self._get_graph()
embeddings, nodes = embed.node2vec_embed(
graph,
**self.global_config["node2vec_params"],
)
nodes_ids = [graph.nodes[node_id]["id"] for node_id in nodes]
return embeddings, nodes_ids
async def remove_nodes(self, nodes: list[str]): async def remove_nodes(self, nodes: list[str]):
"""Delete multiple nodes """Delete multiple nodes

View File

@@ -1021,9 +1021,6 @@ class PGGraphQueryException(Exception):
class PGGraphStorage(BaseGraphStorage): class PGGraphStorage(BaseGraphStorage):
def __post_init__(self): def __post_init__(self):
self.graph_name = self.namespace or os.environ.get("AGE_GRAPH_NAME", "lightrag") self.graph_name = self.namespace or os.environ.get("AGE_GRAPH_NAME", "lightrag")
self._node_embed_algorithms = {
"node2vec": self._node2vec_embed,
}
self.db: PostgreSQLDB | None = None self.db: PostgreSQLDB | None = None
async def initialize(self): async def initialize(self):
@@ -1396,9 +1393,6 @@ class PGGraphStorage(BaseGraphStorage):
) )
raise raise
async def _node2vec_embed(self):
print("Implemented but never called.")
async def delete_node(self, node_id: str) -> None: async def delete_node(self, node_id: str) -> None:
""" """
Delete a node from the graph. Delete a node from the graph.
@@ -1485,24 +1479,6 @@ class PGGraphStorage(BaseGraphStorage):
labels = [result["label"] for result in results] labels = [result["label"] for result in results]
return labels return labels
async def embed_nodes(
self, algorithm: str
) -> tuple[np.ndarray[Any, Any], list[str]]:
"""
Generate node embeddings using the specified algorithm.
Args:
algorithm (str): The name of the embedding algorithm to use.
Returns:
tuple[np.ndarray[Any, Any], list[str]]: A tuple containing the embeddings and the corresponding node IDs.
"""
if algorithm not in self._node_embed_algorithms:
raise ValueError(f"Unsupported embedding algorithm: {algorithm}")
embed_func = self._node_embed_algorithms[algorithm]
return await embed_func()
async def get_knowledge_graph( async def get_knowledge_graph(
self, self,
node_label: str, node_label: str,

View File

@@ -800,13 +800,6 @@ class TiDBGraphStorage(BaseGraphStorage):
} }
await self.db.execute(merge_sql, data) await self.db.execute(merge_sql, data)
async def embed_nodes(
self, algorithm: str
) -> tuple[np.ndarray[Any, Any], list[str]]:
if algorithm not in self._node_embed_algorithms:
raise ValueError(f"Node embedding algorithm {algorithm} not supported")
return await self._node_embed_algorithms[algorithm]()
# Query # Query
async def has_node(self, node_id: str) -> bool: async def has_node(self, node_id: str) -> bool:

View File

@@ -155,31 +155,6 @@ class LightRAG:
Defaults to `chunking_by_token_size` if not specified. Defaults to `chunking_by_token_size` if not specified.
""" """
# Node embedding
# ---
node_embedding_algorithm: str = field(default="node2vec")
"""Algorithm used for node embedding in knowledge graphs."""
node2vec_params: dict[str, int] = field(
default_factory=lambda: {
"dimensions": 1536,
"num_walks": 10,
"walk_length": 40,
"window_size": 2,
"iterations": 3,
"random_seed": 3,
}
)
"""Configuration for the node2vec embedding algorithm:
- dimensions: Number of dimensions for embeddings.
- num_walks: Number of random walks per node.
- walk_length: Number of steps per random walk.
- window_size: Context window size for training.
- iterations: Number of iterations for training.
- random_seed: Seed value for reproducibility.
"""
# Embedding # Embedding
# --- # ---
@@ -904,8 +879,10 @@ class LightRAG:
async with pipeline_status_lock: async with pipeline_status_lock:
log_message = f"Processing file: {file_path}" log_message = f"Processing file: {file_path}"
logger.info(log_message)
pipeline_status["history_messages"].append(log_message) pipeline_status["history_messages"].append(log_message)
log_message = f"Processing d-id: {doc_id}" log_message = f"Processing d-id: {doc_id}"
logger.info(log_message)
pipeline_status["latest_message"] = log_message pipeline_status["latest_message"] = log_message
pipeline_status["history_messages"].append(log_message) pipeline_status["history_messages"].append(log_message)
@@ -1381,6 +1358,16 @@ class LightRAG:
hashing_kv=self.llm_response_cache, # Directly use llm_response_cache hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
system_prompt=system_prompt, system_prompt=system_prompt,
) )
elif param.mode == "bypass":
# Bypass mode: directly use LLM without knowledge retrieval
use_llm_func = param.model_func or global_config["llm_model_func"]
param.stream = True if param.stream is None else param.stream
response = await use_llm_func(
query.strip(),
system_prompt=system_prompt,
history_messages=param.conversation_history,
stream=param.stream,
)
else: else:
raise ValueError(f"Unknown mode {param.mode}") raise ValueError(f"Unknown mode {param.mode}")
await self._query_done() await self._query_done()

View File

@@ -16,6 +16,7 @@ from .utils import (
encode_string_by_tiktoken, encode_string_by_tiktoken,
is_float_regex, is_float_regex,
list_of_list_to_csv, list_of_list_to_csv,
normalize_extracted_info,
pack_user_ass_to_openai_messages, pack_user_ass_to_openai_messages,
split_string_by_multi_markers, split_string_by_multi_markers,
truncate_list_by_token_size, truncate_list_by_token_size,
@@ -163,6 +164,9 @@ async def _handle_single_entity_extraction(
) )
return None return None
# Normalize entity name
entity_name = normalize_extracted_info(entity_name, is_entity=True)
# Clean and validate entity type # Clean and validate entity type
entity_type = clean_str(record_attributes[2]).strip('"') entity_type = clean_str(record_attributes[2]).strip('"')
if not entity_type.strip() or entity_type.startswith('("'): if not entity_type.strip() or entity_type.startswith('("'):
@@ -172,7 +176,9 @@ async def _handle_single_entity_extraction(
return None return None
# Clean and validate description # Clean and validate description
entity_description = clean_str(record_attributes[3]).strip('"') entity_description = clean_str(record_attributes[3])
entity_description = normalize_extracted_info(entity_description)
if not entity_description.strip(): if not entity_description.strip():
logger.warning( logger.warning(
f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'" f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
@@ -196,13 +202,20 @@ async def _handle_single_relationship_extraction(
if len(record_attributes) < 5 or record_attributes[0] != '"relationship"': if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
return None return None
# add this record as edge # add this record as edge
source = clean_str(record_attributes[1]).strip('"') source = clean_str(record_attributes[1])
target = clean_str(record_attributes[2]).strip('"') target = clean_str(record_attributes[2])
edge_description = clean_str(record_attributes[3]).strip('"')
edge_keywords = clean_str(record_attributes[4]).strip('"') # Normalize source and target entity names
source = normalize_extracted_info(source, is_entity=True)
target = normalize_extracted_info(target, is_entity=True)
edge_description = clean_str(record_attributes[3])
edge_description = normalize_extracted_info(edge_description)
edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'")
edge_source_id = chunk_key edge_source_id = chunk_key
weight = ( weight = (
float(record_attributes[-1].strip('"')) float(record_attributes[-1].strip('"').strip("'"))
if is_float_regex(record_attributes[-1]) if is_float_regex(record_attributes[-1])
else 1.0 else 1.0
) )
@@ -642,7 +655,7 @@ async def extract_entities(
processed_chunks += 1 processed_chunks += 1
entities_count = len(maybe_nodes) entities_count = len(maybe_nodes)
relations_count = len(maybe_edges) relations_count = len(maybe_edges)
log_message = f"Chk {processed_chunks}/{total_chunks}: extracted {entities_count} Ent + {relations_count} Rel (deduplicated)" log_message = f"Chk {processed_chunks}/{total_chunks}: extracted {entities_count} Ent + {relations_count} Rel"
logger.info(log_message) logger.info(log_message)
if pipeline_status is not None: if pipeline_status is not None:
async with pipeline_status_lock: async with pipeline_status_lock:

View File

@@ -1006,6 +1006,50 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
return content[:max_length] + "..." return content[:max_length] + "..."
def normalize_extracted_info(name: str, is_entity=False) -> str:
"""Normalize entity/relation names and description with the following rules:
1. Remove spaces between Chinese characters
2. Remove spaces between Chinese characters and English letters/numbers
3. Preserve spaces within English text and numbers
4. Replace Chinese parentheses with English parentheses
5. Replace Chinese dash with English dash
Args:
name: Entity name to normalize
Returns:
Normalized entity name
"""
# Replace Chinese parentheses with English parentheses
name = name.replace("", "(").replace("", ")")
# Replace Chinese dash with English dash
name = name.replace("", "-").replace("", "-")
# Use regex to remove spaces between Chinese characters
# Regex explanation:
# (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character
# \s+: One or more whitespace characters
# (?=[\u4e00-\u9fa5]): Positive lookahead for Chinese character
name = re.sub(r"(?<=[\u4e00-\u9fa5])\s+(?=[\u4e00-\u9fa5])", "", name)
# Remove spaces between Chinese and English/numbers
name = re.sub(r"(?<=[\u4e00-\u9fa5])\s+(?=[a-zA-Z0-9])", "", name)
name = re.sub(r"(?<=[a-zA-Z0-9])\s+(?=[\u4e00-\u9fa5])", "", name)
# Remove English quotation marks from the beginning and end
name = name.strip('"').strip("'")
if is_entity:
# remove Chinese quotes
name = name.replace("", "").replace("", "").replace("", "").replace("", "")
# remove English queotes in and around chinese
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
return name
def clean_text(text: str) -> str: def clean_text(text: str) -> str:
"""Clean text by removing null bytes (0x00) and whitespace """Clean text by removing null bytes (0x00) and whitespace

View File

@@ -65,8 +65,9 @@ export type LightragDocumentsScanProgress = {
* - "global": Utilizes global knowledge. * - "global": Utilizes global knowledge.
* - "hybrid": Combines local and global retrieval methods. * - "hybrid": Combines local and global retrieval methods.
* - "mix": Integrates knowledge graph and vector retrieval. * - "mix": Integrates knowledge graph and vector retrieval.
* - "bypass": Bypasses knowledge retrieval and directly uses the LLM.
*/ */
export type QueryMode = 'naive' | 'local' | 'global' | 'hybrid' | 'mix' export type QueryMode = 'naive' | 'local' | 'global' | 'hybrid' | 'mix' | 'bypass'
export type Message = { export type Message = {
role: 'user' | 'assistant' | 'system' role: 'user' | 'assistant' | 'system'

View File

@@ -55,6 +55,7 @@ export default function QuerySettings() {
<SelectItem value="global">{t('retrievePanel.querySettings.queryModeOptions.global')}</SelectItem> <SelectItem value="global">{t('retrievePanel.querySettings.queryModeOptions.global')}</SelectItem>
<SelectItem value="hybrid">{t('retrievePanel.querySettings.queryModeOptions.hybrid')}</SelectItem> <SelectItem value="hybrid">{t('retrievePanel.querySettings.queryModeOptions.hybrid')}</SelectItem>
<SelectItem value="mix">{t('retrievePanel.querySettings.queryModeOptions.mix')}</SelectItem> <SelectItem value="mix">{t('retrievePanel.querySettings.queryModeOptions.mix')}</SelectItem>
<SelectItem value="bypass">{t('retrievePanel.querySettings.queryModeOptions.bypass')}</SelectItem>
</SelectGroup> </SelectGroup>
</SelectContent> </SelectContent>
</Select> </Select>

View File

@@ -306,13 +306,14 @@
"parametersTitle": "المعلمات", "parametersTitle": "المعلمات",
"parametersDescription": "تكوين معلمات الاستعلام الخاص بك", "parametersDescription": "تكوين معلمات الاستعلام الخاص بك",
"queryMode": "وضع الاستعلام", "queryMode": "وضع الاستعلام",
"queryModeTooltip": "حدد استراتيجية الاسترجاع:\n• ساذج: بحث أساسي بدون تقنيات متقدمة\n• محلي: استرجاع معلومات يعتمد على السياق\n• عالمي: يستخدم قاعدة المعرفة العالمية\n• مختلط: يجمع بين الاسترجاع المحلي والعالمي\n• مزيج: يدمج شبكة المعرفة مع الاسترجاع المتجهي", "queryModeTooltip": "حدد استراتيجية الاسترجاع:\n• ساذج: بحث أساسي بدون تقنيات متقدمة\n• محلي: استرجاع معلومات يعتمد على السياق\n• عالمي: يستخدم قاعدة المعرفة العالمية\n• مختلط: يجمع بين الاسترجاع المحلي والعالمي\n• مزيج: يدمج شبكة المعرفة مع الاسترجاع المتجهي\n• تجاوز: يمرر الاستعلام مباشرة إلى LLM بدون استرجاع",
"queryModeOptions": { "queryModeOptions": {
"naive": "ساذج", "naive": "ساذج",
"local": "محلي", "local": "محلي",
"global": "عالمي", "global": "عالمي",
"hybrid": "مختلط", "hybrid": "مختلط",
"mix": "مزيج" "mix": "مزيج",
"bypass": "تجاوز"
}, },
"responseFormat": "تنسيق الرد", "responseFormat": "تنسيق الرد",
"responseFormatTooltip": "يحدد تنسيق الرد. أمثلة:\n• فقرات متعددة\n• فقرة واحدة\n• نقاط نقطية", "responseFormatTooltip": "يحدد تنسيق الرد. أمثلة:\n• فقرات متعددة\n• فقرة واحدة\n• نقاط نقطية",

View File

@@ -305,13 +305,14 @@
"parametersTitle": "Parameters", "parametersTitle": "Parameters",
"parametersDescription": "Configure your query parameters", "parametersDescription": "Configure your query parameters",
"queryMode": "Query Mode", "queryMode": "Query Mode",
"queryModeTooltip": "Select the retrieval strategy:\n• Naive: Basic search without advanced techniques\n• Local: Context-dependent information retrieval\n• Global: Utilizes global knowledge base\n• Hybrid: Combines local and global retrieval\n• Mix: Integrates knowledge graph with vector retrieval", "queryModeTooltip": "Select the retrieval strategy:\n• Naive: Basic search without advanced techniques\n• Local: Context-dependent information retrieval\n• Global: Utilizes global knowledge base\n• Hybrid: Combines local and global retrieval\n• Mix: Integrates knowledge graph with vector retrieval\n• Bypass: Passes query directly to LLM without retrieval",
"queryModeOptions": { "queryModeOptions": {
"naive": "Naive", "naive": "Naive",
"local": "Local", "local": "Local",
"global": "Global", "global": "Global",
"hybrid": "Hybrid", "hybrid": "Hybrid",
"mix": "Mix" "mix": "Mix",
"bypass": "Bypass"
}, },
"responseFormat": "Response Format", "responseFormat": "Response Format",
"responseFormatTooltip": "Defines the response format. Examples:\n• Multiple Paragraphs\n• Single Paragraph\n• Bullet Points", "responseFormatTooltip": "Defines the response format. Examples:\n• Multiple Paragraphs\n• Single Paragraph\n• Bullet Points",

View File

@@ -306,13 +306,14 @@
"parametersTitle": "Paramètres", "parametersTitle": "Paramètres",
"parametersDescription": "Configurez vos paramètres de requête", "parametersDescription": "Configurez vos paramètres de requête",
"queryMode": "Mode de requête", "queryMode": "Mode de requête",
"queryModeTooltip": "Sélectionnez la stratégie de récupération :\n• Naïf : Recherche de base sans techniques avancées\n• Local : Récupération d'informations dépendante du contexte\n• Global : Utilise une base de connaissances globale\n• Hybride : Combine récupération locale et globale\n• Mixte : Intègre le graphe de connaissances avec la récupération vectorielle", "queryModeTooltip": "Sélectionnez la stratégie de récupération :\n• Naïf : Recherche de base sans techniques avancées\n• Local : Récupération d'informations dépendante du contexte\n• Global : Utilise une base de connaissances globale\n• Hybride : Combine récupération locale et globale\n• Mixte : Intègre le graphe de connaissances avec la récupération vectorielle\n• Bypass : Transmet directement la requête au LLM sans récupération",
"queryModeOptions": { "queryModeOptions": {
"naive": "Naïf", "naive": "Naïf",
"local": "Local", "local": "Local",
"global": "Global", "global": "Global",
"hybrid": "Hybride", "hybrid": "Hybride",
"mix": "Mixte" "mix": "Mixte",
"bypass": "Bypass"
}, },
"responseFormat": "Format de réponse", "responseFormat": "Format de réponse",
"responseFormatTooltip": "Définit le format de la réponse. Exemples :\n• Plusieurs paragraphes\n• Paragraphe unique\n• Points à puces", "responseFormatTooltip": "Définit le format de la réponse. Exemples :\n• Plusieurs paragraphes\n• Paragraphe unique\n• Points à puces",

View File

@@ -306,13 +306,14 @@
"parametersTitle": "参数", "parametersTitle": "参数",
"parametersDescription": "配置查询参数", "parametersDescription": "配置查询参数",
"queryMode": "查询模式", "queryMode": "查询模式",
"queryModeTooltip": "选择检索策略:\n• Naive基础搜索无高级技术\n• Local上下文相关信息检索\n• Global利用全局知识库\n• Hybrid结合本地和全局检索\n• Mix整合知识图谱和向量检索", "queryModeTooltip": "选择检索策略:\n• Naive基础搜索无高级技术\n• Local上下文相关信息检索\n• Global利用全局知识库\n• Hybrid结合本地和全局检索\n• Mix整合知识图谱和向量检索\n• Bypass直接传递查询到LLM不进行检索",
"queryModeOptions": { "queryModeOptions": {
"naive": "Naive", "naive": "Naive",
"local": "Local", "local": "Local",
"global": "Global", "global": "Global",
"hybrid": "Hybrid", "hybrid": "Hybrid",
"mix": "Mix" "mix": "Mix",
"bypass": "Bypass"
}, },
"responseFormat": "响应格式", "responseFormat": "响应格式",
"responseFormatTooltip": "定义响应格式。例如:\n• 多段落\n• 单段落\n• 要点", "responseFormatTooltip": "定义响应格式。例如:\n• 多段落\n• 单段落\n• 要点",