main_merge

Roy
2025-03-08 20:34:29 +00:00
39 changed files with 1475 additions and 244 deletions


@@ -849,6 +849,76 @@ All operations are available in both synchronous and asynchronous versions. The
These operations maintain data consistency across both the graph database and vector database components, ensuring your knowledge graph remains coherent.
## Entity Merging
<details>
<summary> <b>Merge Entities and Their Relationships</b> </summary>
LightRAG now supports merging multiple entities into a single entity, automatically handling all relationships:
```python
# Basic entity merging
rag.merge_entities(
source_entities=["Artificial Intelligence", "AI", "Machine Intelligence"],
target_entity="AI Technology"
)
```
With custom merge strategy:
```python
# Define custom merge strategy for different fields
rag.merge_entities(
source_entities=["John Smith", "Dr. Smith", "J. Smith"],
target_entity="John Smith",
merge_strategy={
"description": "concatenate", # Combine all descriptions
"entity_type": "keep_first", # Keep the entity type from the first entity
"source_id": "join_unique" # Combine all unique source IDs
}
)
```
With custom target entity data:
```python
# Specify exact values for the merged entity
rag.merge_entities(
source_entities=["New York", "NYC", "Big Apple"],
target_entity="New York City",
target_entity_data={
"entity_type": "LOCATION",
"description": "New York City is the most populous city in the United States.",
}
)
```
Advanced usage combining both approaches:
```python
# Merge company entities with both strategy and custom data
rag.merge_entities(
source_entities=["Microsoft Corp", "Microsoft Corporation", "MSFT"],
target_entity="Microsoft",
merge_strategy={
"description": "concatenate", # Combine all descriptions
"source_id": "join_unique" # Combine source IDs
},
target_entity_data={
"entity_type": "ORGANIZATION",
}
)
```
When merging entities:
* All relationships from source entities are redirected to the target entity
* Duplicate relationships are intelligently merged
* Self-relationships (loops) are prevented
* Source entities are removed after merging
* Relationship weights and attributes are preserved
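`merge_entities` returns the merged entity's information (internally the async version ends by calling `get_entity_info` with vector data included). A minimal sketch of inspecting that return value follows; the exact dictionary keys are an assumption, not something this commit guarantees:
```python
# Sketch: inspect the merged entity after the merge completes.
merged = rag.merge_entities(
    source_entities=["Artificial Intelligence", "AI"],
    target_entity="AI Technology",
)

# Keys below are assumptions based on get_entity_info's usual output shape.
print(merged.get("entity_name"))  # expected: "AI Technology"
print(merged.get("graph_data"))   # merged node attributes (description, entity_type, ...)
print(merged.get("vector_data"))  # vector DB record for the merged entity
```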
</details>
## Cache
<details>


@@ -48,8 +48,9 @@
# CHUNK_OVERLAP_SIZE=100
# MAX_TOKENS=32768 # Max tokens send to LLM for summarization
# MAX_TOKEN_SUMMARY=500 # Max tokens for entity or relations summary
-# LANGUAGE=English
+# SUMMARY_LANGUAGE=English
# MAX_EMBED_TOKENS=8192
# ENABLE_LLM_CACHE_FOR_EXTRACT=false # Enable LLM cache for entity extraction, defaults to false
### LLM Configuration (Use valid host. For local services installed with docker, you can use host.docker.internal)
LLM_BINDING=ollama
@@ -148,3 +149,10 @@ QDRANT_URL=http://localhost:16333
### Redis
REDIS_URI=redis://localhost:6379
# For jwt auth
AUTH_USERNAME=admin # login name
AUTH_PASSWORD=admin123 # password
TOKEN_SECRET=your-key # JWT key
TOKEN_EXPIRE_HOURS=4 # expire duration
WHITELIST_PATHS=/login,/health # white list


@@ -37,8 +37,8 @@ async def llm_model_func(
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
-api_key="sk-91d0b59f25554251aa813ed756d79a6d",
+api_key="",
-base_url="https://api.deepseek.com",
+base_url="",
**kwargs,
)

examples/test_postgres.py (new file)

@@ -0,0 +1,51 @@
import os
import asyncio
from lightrag.kg.postgres_impl import PGGraphStorage
from lightrag.llm.ollama import ollama_embedding
from lightrag.utils import EmbeddingFunc
#########
# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert()
# import nest_asyncio
# nest_asyncio.apply()
#########
WORKING_DIR = "./local_neo4jWorkDir"
if not os.path.exists(WORKING_DIR):
os.mkdir(WORKING_DIR)
# AGE
os.environ["AGE_GRAPH_NAME"] = "dickens"
os.environ["POSTGRES_HOST"] = "localhost"
os.environ["POSTGRES_PORT"] = "15432"
os.environ["POSTGRES_USER"] = "rag"
os.environ["POSTGRES_PASSWORD"] = "rag"
os.environ["POSTGRES_DATABASE"] = "rag"
async def main():
graph_db = PGGraphStorage(
namespace="dickens",
embedding_func=EmbeddingFunc(
embedding_dim=1024,
max_token_size=8192,
func=lambda texts: ollama_embedding(
texts, embed_model="bge-m3", host="http://localhost:11434"
),
),
global_config={},
)
await graph_db.initialize()
labels = await graph_db.get_all_labels()
print("all labels", labels)
res = await graph_db.get_knowledge_graph("FEZZIWIG")
print("knowledge graphs", res)
await graph_db.finalize()
if __name__ == "__main__":
asyncio.run(main())


@@ -223,6 +223,11 @@ LightRAG supports binding to various LLM/Embedding backends:
Use environment variables `LLM_BINDING` or CLI argument `--llm-binding` to select LLM backend type. Use environment variables `EMBEDDING_BINDING` or CLI argument `--embedding-binding` to select embedding backend type.
### Entity Extraction Configuration
* ENABLE_LLM_CACHE_FOR_EXTRACT: Enable LLM cache for entity extraction (default: false)
It's very common to set `ENABLE_LLM_CACHE_FOR_EXTRACT` to true in a test environment to reduce the cost of LLM calls.
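When using the library directly rather than the API server, the same switch maps to the `enable_llm_cache_for_entity_extract` constructor argument that this commit wires up in `lightrag_server.py`. A minimal sketch, with the LLM and embedding functions left to the caller:
```python
import os

from lightrag import LightRAG
from lightrag.utils import EmbeddingFunc

# Mirror the API server's environment switch in direct library use.
enable_extract_cache = os.getenv("ENABLE_LLM_CACHE_FOR_EXTRACT", "false").lower() == "true"


def build_rag(llm_model_func, embedding_func: EmbeddingFunc) -> LightRAG:
    # llm_model_func and embedding_func must be supplied as usual; they are omitted here.
    return LightRAG(
        working_dir="./rag_storage",
        llm_model_func=llm_model_func,
        embedding_func=embedding_func,
        enable_llm_cache_for_entity_extract=enable_extract_cache,
    )
```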
### Storage Types Supported
LightRAG uses 4 types of storage for different purposes:
@@ -387,6 +392,19 @@ Note: If you don't need the API functionality, you can install the base package
pip install lightrag-hku
```
## Authentication Endpoints
### JWT Authentication Mechanism
LightRAG API Server implements JWT-based authentication using the HS256 algorithm. To enable secure access control, the following environment variables are required:
```bash
# For jwt auth
AUTH_USERNAME=admin # login name
AUTH_PASSWORD=admin123 # password
TOKEN_SECRET=your-key # JWT key
TOKEN_EXPIRE_HOURS=4 # expire duration
WHITELIST_PATHS=/api1,/api2 # white list. /login,/health,/docs,/redoc,/openapi.json are whitelisted by default.
```
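As a rough client-side sketch (not part of this commit): `/login` accepts standard OAuth2 password-grant form fields and returns a bearer token that protected routes such as `/query` expect in the `Authorization` header. The server address, port, and request body fields below are assumptions:
```python
import requests

BASE = "http://localhost:9621"  # assumed address of the running LightRAG API server

# /login takes OAuth2 form fields (username/password) and returns a JWT.
resp = requests.post(f"{BASE}/login", data={"username": "admin", "password": "admin123"})
resp.raise_for_status()
token = resp.json()["access_token"]

# Present the bearer token on protected routes.
headers = {"Authorization": f"Bearer {token}"}
answer = requests.post(
    f"{BASE}/query",
    json={"query": "What is LightRAG?", "mode": "hybrid"},
    headers=headers,
)
print(answer.json())
```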
## API Endpoints
All servers (LoLLMs, Ollama, OpenAI and Azure OpenAI) provide the same REST API endpoints for RAG functionality. When API Server is running, visit:

lightrag/api/auth.py (new file)

@@ -0,0 +1,41 @@
import os
from datetime import datetime, timedelta
import jwt
from fastapi import HTTPException, status
from pydantic import BaseModel
class TokenPayload(BaseModel):
sub: str
exp: datetime
class AuthHandler:
def __init__(self):
self.secret = os.getenv("TOKEN_SECRET", "4f85ds4f56dsf46")
self.algorithm = "HS256"
self.expire_hours = int(os.getenv("TOKEN_EXPIRE_HOURS", 4))
def create_token(self, username: str) -> str:
expire = datetime.utcnow() + timedelta(hours=self.expire_hours)
payload = TokenPayload(sub=username, exp=expire)
return jwt.encode(payload.dict(), self.secret, algorithm=self.algorithm)
def validate_token(self, token: str) -> str:
try:
payload = jwt.decode(token, self.secret, algorithms=[self.algorithm])
expire_timestamp = payload["exp"]
expire_time = datetime.utcfromtimestamp(expire_timestamp)
if datetime.utcnow() > expire_time:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED, detail="Token expired"
)
return payload["sub"]
except jwt.PyJWTError:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid token"
)
auth_handler = AuthHandler()
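A small sketch of exercising `AuthHandler` directly, assuming `TOKEN_SECRET` and `TOKEN_EXPIRE_HOURS` are either set in the environment or left at the defaults above:
```python
from fastapi import HTTPException

from lightrag.api.auth import auth_handler

# Create a token for a user and validate it again.
token = auth_handler.create_token("admin")
print(auth_handler.validate_token(token))  # -> "admin"

# A tampered or expired token raises HTTPException with status 401.
try:
    auth_handler.validate_token(token + "broken")
except HTTPException as exc:
    print(exc.status_code, exc.detail)
```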


@@ -2,10 +2,7 @@
LightRAG FastAPI Server
"""
-from fastapi import (
-FastAPI,
-Depends,
-)
+from fastapi import FastAPI, Depends, HTTPException, status
import asyncio
import os
import logging
@@ -45,12 +42,17 @@ from lightrag.kg.shared_storage import (
initialize_pipeline_status,
get_all_update_flags_status,
)
from fastapi.security import OAuth2PasswordRequestForm
from .auth import auth_handler
# Load environment variables
# Updated to use the .env that is inside the current folder
# This update allows the user to put a different .env file for each lightrag folder
load_dotenv(".env", override=True)
# Read entity extraction cache config
enable_llm_cache = os.getenv("ENABLE_LLM_CACHE_FOR_EXTRACT", "false").lower() == "true"
# Initialize config parser
config = configparser.ConfigParser()
config.read("config.ini")
@@ -324,16 +326,13 @@ def create_app(args):
vector_db_storage_cls_kwargs={
"cosine_better_than_threshold": args.cosine_threshold
},
-enable_llm_cache_for_entity_extract=False, # set to True for debuging to reduce llm fee
+enable_llm_cache_for_entity_extract=enable_llm_cache, # Read from environment variable
embedding_cache_config={
"enabled": True,
"similarity_threshold": 0.95,
"use_llm_check": False,
},
namespace_prefix=args.namespace_prefix,
-addon_params={
-"language": args.language,
-},
auto_manage_storages_states=False,
)
else: # azure_openai
@@ -356,7 +355,7 @@ def create_app(args):
vector_db_storage_cls_kwargs={
"cosine_better_than_threshold": args.cosine_threshold
},
-enable_llm_cache_for_entity_extract=False, # set to True for debuging to reduce llm fee
+enable_llm_cache_for_entity_extract=enable_llm_cache, # Read from environment variable
embedding_cache_config={
"enabled": True,
"similarity_threshold": 0.95,
@@ -375,6 +374,27 @@ def create_app(args):
ollama_api = OllamaAPI(rag, top_k=args.top_k)
app.include_router(ollama_api.router, prefix="/api")
@app.post("/login")
async def login(form_data: OAuth2PasswordRequestForm = Depends()):
username = os.getenv("AUTH_USERNAME")
password = os.getenv("AUTH_PASSWORD")
if not (username and password):
raise HTTPException(
status_code=status.HTTP_501_NOT_IMPLEMENTED,
detail="Authentication not configured",
)
if form_data.username != username or form_data.password != password:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED, detail="Incorrect credentials"
)
return {
"access_token": auth_handler.create_token(username),
"token_type": "bearer",
}
@app.get("/health", dependencies=[Depends(optional_api_key)]) @app.get("/health", dependencies=[Depends(optional_api_key)])
async def get_status(): async def get_status():
"""Get current system status""" """Get current system status"""


@@ -1,10 +1,20 @@
aiofiles
ascii_colors
asyncpg
distro
fastapi
httpcore
httpx
jiter
numpy
openai
passlib[bcrypt]
pipmaster
PyJWT
python-dotenv
python-jose[cryptography]
python-multipart
pytz
tenacity
tiktoken
uvicorn


@@ -16,10 +16,13 @@ from pydantic import BaseModel, Field, field_validator
from lightrag import LightRAG
from lightrag.base import DocProcessingStatus, DocStatus
-from ..utils_api import get_api_key_dependency
+from ..utils_api import get_api_key_dependency, get_auth_dependency
-router = APIRouter(prefix="/documents", tags=["documents"])
+router = APIRouter(
+prefix="/documents",
+tags=["documents"],
+dependencies=[Depends(get_auth_dependency())],
+)
# Temporary file prefix
temp_prefix = "__tmp__"


@@ -3,12 +3,11 @@ This module contains all graph-related routes for the LightRAG API.
""" """
from typing import Optional from typing import Optional
from fastapi import APIRouter, Depends from fastapi import APIRouter, Depends
from ..utils_api import get_api_key_dependency from ..utils_api import get_api_key_dependency, get_auth_dependency
router = APIRouter(tags=["graph"]) router = APIRouter(tags=["graph"], dependencies=[Depends(get_auth_dependency())])
def create_graph_routes(rag, api_key: Optional[str] = None): def create_graph_routes(rag, api_key: Optional[str] = None):
@@ -25,23 +24,33 @@ def create_graph_routes(rag, api_key: Optional[str] = None):
return await rag.get_graph_labels()
@router.get("/graphs", dependencies=[Depends(optional_api_key)])
-async def get_knowledge_graph(label: str, max_depth: int = 3):
+async def get_knowledge_graph(
+label: str, max_depth: int = 3, min_degree: int = 0, inclusive: bool = False
+):
"""
Retrieve a connected subgraph of nodes where the label includes the specified label.
Maximum number of nodes is constrained by the environment variable `MAX_GRAPH_NODES` (default: 1000).
When reducing the number of nodes, the prioritization criteria are as follows:
-1. Label matching nodes take precedence
-2. Followed by nodes directly connected to the matching nodes
-3. Finally, the degree of the nodes
+1. min_degree does not affect nodes directly connected to the matching nodes
+2. Label matching nodes take precedence
+3. Followed by nodes directly connected to the matching nodes
+4. Finally, the degree of the nodes
Maximum number of nodes is limited to env MAX_GRAPH_NODES(default: 1000)
Args:
label (str): Label to get knowledge graph for
max_depth (int, optional): Maximum depth of graph. Defaults to 3.
+inclusive_search (bool, optional): If True, search for nodes that include the label. Defaults to False.
+min_degree (int, optional): Minimum degree of nodes. Defaults to 0.
Returns:
Dict[str, List[str]]: Knowledge graph for label
"""
-return await rag.get_knowledge_graph(node_label=label, max_depth=max_depth)
+return await rag.get_knowledge_graph(
+node_label=label,
+max_depth=max_depth,
+inclusive=inclusive,
+min_degree=min_degree,
+)
return router
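A sketch of calling the extended endpoint with the new query parameters; the server address is an assumption, and a bearer token from `/login` would be needed if authentication is configured:
```python
import requests

# min_degree filters out weakly connected nodes (matching nodes and their direct
# neighbours are kept regardless); inclusive switches from exact to substring label matching.
resp = requests.get(
    "http://localhost:9621/graphs",  # assumed server address
    params={"label": "Scrooge", "max_depth": 3, "min_degree": 2, "inclusive": True},
)
print(resp.json())
```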


@@ -8,12 +8,12 @@ from typing import Any, Dict, List, Literal, Optional
from fastapi import APIRouter, Depends, HTTPException
from lightrag.base import QueryParam
-from ..utils_api import get_api_key_dependency
+from ..utils_api import get_api_key_dependency, get_auth_dependency
from pydantic import BaseModel, Field, field_validator
from ascii_colors import trace_exception
-router = APIRouter(tags=["query"])
+router = APIRouter(tags=["query"], dependencies=[Depends(get_auth_dependency())])
class QueryRequest(BaseModel):


@@ -9,10 +9,11 @@ import sys
import logging
from ascii_colors import ASCIIColors
from lightrag.api import __api_version__
-from fastapi import HTTPException, Security
+from fastapi import HTTPException, Security, Depends, Request
from dotenv import load_dotenv
-from fastapi.security import APIKeyHeader
+from fastapi.security import APIKeyHeader, OAuth2PasswordBearer
from starlette.status import HTTP_403_FORBIDDEN
from .auth import auth_handler
# Load environment variables
load_dotenv(override=True)
@@ -31,6 +32,24 @@ class OllamaServerInfos:
ollama_server_infos = OllamaServerInfos()
def get_auth_dependency():
whitelist = os.getenv("WHITELIST_PATHS", "").split(",")
async def dependency(
request: Request,
token: str = Depends(OAuth2PasswordBearer(tokenUrl="login", auto_error=False)),
):
if request.url.path in whitelist:
return
if not (os.getenv("AUTH_USERNAME") and os.getenv("AUTH_PASSWORD")):
return
auth_handler.validate_token(token)
return dependency
def get_api_key_dependency(api_key: Optional[str]):
"""
Create an API key dependency for route protection.
@@ -340,7 +359,6 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace:
# Inject chunk configuration
args.chunk_size = get_env_value("CHUNK_SIZE", 1200, int)
args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int)
-args.language = get_env_value("LANGUAGE", "English")
ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -5,8 +5,8 @@
<link rel="icon" type="image/svg+xml" href="./logo.png" /> <link rel="icon" type="image/svg+xml" href="./logo.png" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Lightrag</title> <title>Lightrag</title>
<script type="module" crossorigin src="./assets/index-DbuMPJAD.js"></script> <script type="module" crossorigin src="./assets/index-CJz72b6Q.js"></script>
<link rel="stylesheet" crossorigin href="./assets/index-rP-YlyR1.css"> <link rel="stylesheet" crossorigin href="./assets/index-CH-3l4_Z.css">
</head> </head>
<body> <body>
<div id="root"></div> <div id="root"></div>


@@ -206,7 +206,7 @@ class BaseGraphStorage(StorageNameSpace, ABC):
@abstractmethod
async def get_knowledge_graph(
-self, node_label: str, max_depth: int = 5
+self, node_label: str, max_depth: int = 3
) -> KnowledgeGraph:
"""Retrieve a subgraph of the knowledge graph starting from a given node."""


@@ -229,3 +229,43 @@ class ChromaVectorDBStorage(BaseVectorStorage):
except Exception as e:
logger.error(f"Error while deleting vectors from {self.namespace}: {e}")
raise
async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
"""Search for records with IDs starting with a specific prefix.
Args:
prefix: The prefix to search for in record IDs
Returns:
List of records with matching ID prefixes
"""
try:
# Get all records from the collection
# Since ChromaDB doesn't directly support prefix search on IDs,
# we'll get all records and filter in Python
results = self._collection.get(
include=["metadatas", "documents", "embeddings"]
)
matching_records = []
# Filter records where ID starts with the prefix
for i, record_id in enumerate(results["ids"]):
if record_id.startswith(prefix):
matching_records.append(
{
"id": record_id,
"content": results["documents"][i],
"vector": results["embeddings"][i],
**results["metadatas"][i],
}
)
logger.debug(
f"Found {len(matching_records)} records with prefix '{prefix}'"
)
return matching_records
except Exception as e:
logger.error(f"Error during prefix search in ChromaDB: {str(e)}")
raise
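Every vector storage backend touched by this commit gains the same `search_by_prefix` coroutine, so callers such as `amerge_entities` can look up records by ID prefix regardless of backend. A usage sketch with an illustrative prefix:
```python
import asyncio


async def show_relation_records(rag) -> None:
    # "rel-" is the prefix compute_mdhash_id uses for relationship records in this commit.
    records = await rag.relationships_vdb.search_by_prefix("rel-")
    for rec in records:
        print(rec["id"], rec.get("src_id"), rec.get("tgt_id"))


# asyncio.run(show_relation_records(rag))  # given an initialized LightRAG instance `rag`
```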


@@ -371,3 +371,24 @@ class FaissVectorDBStorage(BaseVectorStorage):
return False # Return error
return True # Return success
async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
"""Search for records with IDs starting with a specific prefix.
Args:
prefix: The prefix to search for in record IDs
Returns:
List of records with matching ID prefixes
"""
matching_records = []
# Search for records with IDs starting with the prefix
for faiss_id, meta in self._id_to_meta.items():
if "__id__" in meta and meta["__id__"].startswith(prefix):
# Create a copy of all metadata and add "id" field
record = {**meta, "id": meta["__id__"]}
matching_records.append(record)
logger.debug(f"Found {len(matching_records)} records with prefix '{prefix}'")
return matching_records


@@ -206,3 +206,28 @@ class MilvusVectorDBStorage(BaseVectorStorage):
except Exception as e:
logger.error(f"Error while deleting vectors from {self.namespace}: {e}")
async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
"""Search for records with IDs starting with a specific prefix.
Args:
prefix: The prefix to search for in record IDs
Returns:
List of records with matching ID prefixes
"""
try:
# Use Milvus query with expression to find IDs with the given prefix
expression = f'id like "{prefix}%"'
results = self._client.query(
collection_name=self.namespace,
filter=expression,
output_fields=list(self.meta_fields) + ["id"],
)
logger.debug(f"Found {len(results)} records with prefix '{prefix}'")
return results
except Exception as e:
logger.error(f"Error searching for records with prefix '{prefix}': {e}")
return []


@@ -1045,6 +1045,32 @@ class MongoVectorDBStorage(BaseVectorStorage):
except PyMongoError as e:
logger.error(f"Error deleting relations for {entity_name}: {str(e)}")
async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
"""Search for records with IDs starting with a specific prefix.
Args:
prefix: The prefix to search for in record IDs
Returns:
List of records with matching ID prefixes
"""
try:
# Use MongoDB regex to find documents where _id starts with the prefix
cursor = self._data.find({"_id": {"$regex": f"^{prefix}"}})
matching_records = await cursor.to_list(length=None)
# Format results
results = [{**doc, "id": doc["_id"]} for doc in matching_records]
logger.debug(
f"Found {len(results)} records with prefix '{prefix}' in {self.namespace}"
)
return results
except PyMongoError as e:
logger.error(f"Error searching by prefix in {self.namespace}: {str(e)}")
return []
async def get_or_create_collection(db: AsyncIOMotorDatabase, collection_name: str):
collection_names = await db.list_collection_names()


@@ -236,3 +236,23 @@ class NanoVectorDBStorage(BaseVectorStorage):
return False # Return error
return True # Return success
async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
"""Search for records with IDs starting with a specific prefix.
Args:
prefix: The prefix to search for in record IDs
Returns:
List of records with matching ID prefixes
"""
storage = await self.client_storage
matching_records = []
# Search for records with IDs starting with the prefix
for record in storage["data"]:
if "__id__" in record and record["__id__"].startswith(prefix):
matching_records.append({**record, "id": record["__id__"]})
logger.debug(f"Found {len(matching_records)} records with prefix '{prefix}'")
return matching_records


@@ -232,19 +232,26 @@ class NetworkXStorage(BaseGraphStorage):
return sorted(list(labels))
async def get_knowledge_graph(
-self, node_label: str, max_depth: int = 5
+self,
+node_label: str,
+max_depth: int = 3,
+min_degree: int = 0,
+inclusive: bool = False,
) -> KnowledgeGraph:
"""
Retrieve a connected subgraph of nodes where the label includes the specified `node_label`.
Maximum number of nodes is constrained by the environment variable `MAX_GRAPH_NODES` (default: 1000).
When reducing the number of nodes, the prioritization criteria are as follows:
-1. Label matching nodes take precedence
-2. Followed by nodes directly connected to the matching nodes
-3. Finally, the degree of the nodes
+1. min_degree does not affect nodes directly connected to the matching nodes
+2. Label matching nodes take precedence
+3. Followed by nodes directly connected to the matching nodes
+4. Finally, the degree of the nodes
Args:
node_label: Label of the starting node
max_depth: Maximum depth of the subgraph
+min_degree: Minimum degree of nodes to include. Defaults to 0
+inclusive: Do an inclusive search if true
Returns:
KnowledgeGraph object containing nodes and edges
@@ -255,6 +262,10 @@ class NetworkXStorage(BaseGraphStorage):
graph = await self._get_graph()
# Initialize sets for start nodes and direct connected nodes
start_nodes = set()
direct_connected_nodes = set()
# Handle special case for "*" label
if node_label == "*":
# For "*", return the entire graph including all nodes and edges
@@ -262,11 +273,16 @@ class NetworkXStorage(BaseGraphStorage):
graph.copy()
) # Create a copy to avoid modifying the original graph
else:
-# Find nodes with matching node id (partial match)
+# Find nodes with matching node id based on search_mode
nodes_to_explore = []
for n, attr in graph.nodes(data=True):
-if node_label in str(n): # Use partial matching
-nodes_to_explore.append(n)
+node_str = str(n)
+if not inclusive:
+if node_label == node_str: # Use exact matching
+nodes_to_explore.append(n)
+else: # inclusive mode
+if node_label in node_str: # Use partial matching
+nodes_to_explore.append(n)
if not nodes_to_explore:
logger.warning(f"No nodes found with label {node_label}")
@@ -277,26 +293,37 @@ class NetworkXStorage(BaseGraphStorage):
for start_node in nodes_to_explore:
node_subgraph = nx.ego_graph(graph, start_node, radius=max_depth)
combined_subgraph = nx.compose(combined_subgraph, node_subgraph)
# Get start nodes and direct connected nodes
if nodes_to_explore:
start_nodes = set(nodes_to_explore)
# Get nodes directly connected to all start nodes
for start_node in start_nodes:
direct_connected_nodes.update(
combined_subgraph.neighbors(start_node)
)
# Remove start nodes from directly connected nodes (avoid duplicates)
direct_connected_nodes -= start_nodes
subgraph = combined_subgraph
# Filter nodes based on min_degree, but keep start nodes and direct connected nodes
if min_degree > 0:
nodes_to_keep = [
node
for node, degree in subgraph.degree()
if node in start_nodes
or node in direct_connected_nodes
or degree >= min_degree
]
subgraph = subgraph.subgraph(nodes_to_keep)
# Check if number of nodes exceeds max_graph_nodes
if len(subgraph.nodes()) > MAX_GRAPH_NODES:
origin_nodes = len(subgraph.nodes())
node_degrees = dict(subgraph.degree())
-start_nodes = set()
-direct_connected_nodes = set()
-if node_label != "*" and nodes_to_explore:
-start_nodes = set(nodes_to_explore)
-# Get nodes directly connected to all start nodes
-for start_node in start_nodes:
-direct_connected_nodes.update(subgraph.neighbors(start_node))
-# Remove start nodes from directly connected nodes (avoid duplicates)
-direct_connected_nodes -= start_nodes
def priority_key(node_item):
node, degree = node_item
# Priority order: start(2) > directly connected(1) > other nodes(0)
@@ -356,7 +383,7 @@ class NetworkXStorage(BaseGraphStorage):
result.edges.append(
KnowledgeGraphEdge(
id=edge_id,
-type="RELATED",
+type="DIRECTED",
source=str(source),
target=str(target),
properties=edge_data,


@@ -494,6 +494,41 @@ class OracleVectorDBStorage(BaseVectorStorage):
logger.error(f"Error deleting relations for entity {entity_name}: {e}") logger.error(f"Error deleting relations for entity {entity_name}: {e}")
raise raise
async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
"""Search for records with IDs starting with a specific prefix.
Args:
prefix: The prefix to search for in record IDs
Returns:
List of records with matching ID prefixes
"""
try:
# Determine the appropriate table based on namespace
table_name = namespace_to_table_name(self.namespace)
# Create SQL query to find records with IDs starting with prefix
search_sql = f"""
SELECT * FROM {table_name}
WHERE workspace = :workspace
AND id LIKE :prefix_pattern
ORDER BY id
"""
params = {"workspace": self.db.workspace, "prefix_pattern": f"{prefix}%"}
# Execute query and get results
results = await self.db.query(search_sql, params, multirows=True)
logger.debug(
f"Found {len(results) if results else 0} records with prefix '{prefix}'"
)
return results or []
except Exception as e:
logger.error(f"Error searching records with prefix '{prefix}': {e}")
return []
@final
@dataclass


@@ -585,6 +585,41 @@ class PGVectorStorage(BaseVectorStorage):
except Exception as e:
logger.error(f"Error deleting relations for entity {entity_name}: {e}")
async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
"""Search for records with IDs starting with a specific prefix.
Args:
prefix: The prefix to search for in record IDs
Returns:
List of records with matching ID prefixes
"""
table_name = namespace_to_table_name(self.namespace)
if not table_name:
logger.error(f"Unknown namespace for prefix search: {self.namespace}")
return []
search_sql = f"SELECT * FROM {table_name} WHERE workspace=$1 AND id LIKE $2"
params = {"workspace": self.db.workspace, "prefix": f"{prefix}%"}
try:
results = await self.db.query(search_sql, params, multirows=True)
logger.debug(f"Found {len(results)} records with prefix '{prefix}'")
# Format results to match the expected return format
formatted_results = []
for record in results:
formatted_record = dict(record)
# Ensure id field is available (for consistency with NanoVectorDB implementation)
if "id" not in formatted_record:
formatted_record["id"] = record["id"]
formatted_results.append(formatted_record)
return formatted_results
except Exception as e:
logger.error(f"Error during prefix search for '{prefix}': {e}")
return []
@final
@dataclass
@@ -785,42 +820,85 @@ class PGGraphStorage(BaseGraphStorage):
v = record[k]
# agtype comes back '{key: value}::type' which must be parsed
if isinstance(v, str) and "::" in v:
-dtype = v.split("::")[-1]
-v = v.split("::")[0]
-if dtype == "vertex":
-vertex = json.loads(v)
-vertices[vertex["id"]] = vertex.get("properties")
+if v.startswith("[") and v.endswith("]"):
+if "::vertex" not in v:
+continue
+v = v.replace("::vertex", "")
+vertexes = json.loads(v)
+for vertex in vertexes:
+vertices[vertex["id"]] = vertex.get("properties")
+else:
+dtype = v.split("::")[-1]
+v = v.split("::")[0]
+if dtype == "vertex":
+vertex = json.loads(v)
+vertices[vertex["id"]] = vertex.get("properties")
# iterate returned fields and parse appropriately
for k in record.keys():
v = record[k]
if isinstance(v, str) and "::" in v:
-dtype = v.split("::")[-1]
-v = v.split("::")[0]
-else:
-dtype = ""
-if dtype == "vertex":
-vertex = json.loads(v)
-field = vertex.get("properties")
-if not field:
-field = {}
-field["label"] = PGGraphStorage._decode_graph_label(field["node_id"])
-d[k] = field
-# convert edge from id-label->id by replacing id with node information
-# we only do this if the vertex was also returned in the query
-# this is an attempt to be consistent with neo4j implementation
-elif dtype == "edge":
-edge = json.loads(v)
-d[k] = (
-vertices.get(edge["start_id"], {}),
-edge[
-"label"
-], # we don't use decode_graph_label(), since edge label is always "DIRECTED"
-vertices.get(edge["end_id"], {}),
-)
-else:
-d[k] = json.loads(v) if isinstance(v, str) else v
+if v.startswith("[") and v.endswith("]"):
+if "::vertex" in v:
+v = v.replace("::vertex", "")
+vertexes = json.loads(v)
+dl = []
+for vertex in vertexes:
+prop = vertex.get("properties")
+if not prop:
+prop = {}
+prop["label"] = PGGraphStorage._decode_graph_label(
+prop["node_id"]
+)
+dl.append(prop)
+d[k] = dl
+elif "::edge" in v:
+v = v.replace("::edge", "")
+edges = json.loads(v)
+dl = []
+for edge in edges:
+dl.append(
+(
+vertices[edge["start_id"]],
+edge["label"],
+vertices[edge["end_id"]],
+)
+)
+d[k] = dl
+else:
+print("WARNING: unsupported type")
+continue
+else:
+dtype = v.split("::")[-1]
+v = v.split("::")[0]
+if dtype == "vertex":
+vertex = json.loads(v)
+field = vertex.get("properties")
+if not field:
+field = {}
+field["label"] = PGGraphStorage._decode_graph_label(
+field["node_id"]
+)
+d[k] = field
+# convert edge from id-label->id by replacing id with node information
+# we only do this if the vertex was also returned in the query
+# this is an attempt to be consistent with neo4j implementation
+elif dtype == "edge":
+edge = json.loads(v)
+d[k] = (
+vertices.get(edge["start_id"], {}),
+edge[
+"label"
+], # we don't use decode_graph_label(), since edge label is always "DIRECTED"
+vertices.get(edge["end_id"], {}),
+)
+else:
+if v is None or (v.count("{") < 1 and v.count("[") < 1):
+d[k] = v
+else:
+d[k] = json.loads(v) if isinstance(v, str) else v
return d
@@ -1294,7 +1372,7 @@ class PGGraphStorage(BaseGraphStorage):
OPTIONAL MATCH p = (n)-[*..%d]-(m)
RETURN nodes(p) AS nodes, relationships(p) AS relationships
LIMIT %d
-$$) AS (nodes agtype[], relationships agtype[])""" % (
+$$) AS (nodes agtype, relationships agtype)""" % (
self.graph_name,
encoded_node_label,
max_depth,
@@ -1303,17 +1381,23 @@ class PGGraphStorage(BaseGraphStorage):
results = await self._query(query)
-nodes = set()
+nodes = {}
edges = []
+unique_edge_ids = set()
for result in results:
if node_label == "*":
if result["n"]:
node = result["n"]
-nodes.add(self._decode_graph_label(node["node_id"]))
+node_id = self._decode_graph_label(node["node_id"])
+if node_id not in nodes:
+nodes[node_id] = node
if result["m"]:
node = result["m"]
-nodes.add(self._decode_graph_label(node["node_id"]))
+node_id = self._decode_graph_label(node["node_id"])
+if node_id not in nodes:
+nodes[node_id] = node
if result["r"]:
edge = result["r"]
src_id = self._decode_graph_label(edge["start_id"])
@@ -1322,16 +1406,36 @@ class PGGraphStorage(BaseGraphStorage):
else:
if result["nodes"]:
for node in result["nodes"]:
-nodes.add(self._decode_graph_label(node["node_id"]))
+node_id = self._decode_graph_label(node["node_id"])
+if node_id not in nodes:
+nodes[node_id] = node
if result["relationships"]:
-for edge in result["relationships"]:
-src_id = self._decode_graph_label(edge["start_id"])
-tgt_id = self._decode_graph_label(edge["end_id"])
-edges.append((src_id, tgt_id))
+for edge in result["relationships"]: # src --DIRECTED--> target
+src_id = self._decode_graph_label(edge[0]["node_id"])
+tgt_id = self._decode_graph_label(edge[2]["node_id"])
+id = src_id + "," + tgt_id
+if id in unique_edge_ids:
+continue
+else:
+unique_edge_ids.add(id)
+edges.append(
+(id, src_id, tgt_id, {"source": edge[0], "target": edge[2]})
+)
kg = KnowledgeGraph(
-nodes=[KnowledgeGraphNode(id=node_id) for node_id in nodes],
-edges=[KnowledgeGraphEdge(source=src, target=tgt) for src, tgt in edges],
+nodes=[
+KnowledgeGraphNode(
+id=node_id, labels=[node_id], properties=nodes[node_id]
+)
+for node_id in nodes
+],
+edges=[
+KnowledgeGraphEdge(
+id=id, type="DIRECTED", source=src, target=tgt, properties=props
+)
+for id, src, tgt, props in edges
+],
)
return kg


@@ -135,7 +135,7 @@ class QdrantVectorDBStorage(BaseVectorStorage):
logger.debug(f"query result: {results}") logger.debug(f"query result: {results}")
return [{**dp.payload, "id": dp.id, "distance": dp.score} for dp in results] return [{**dp.payload, "distance": dp.score} for dp in results]
async def index_done_callback(self) -> None: async def index_done_callback(self) -> None:
# Qdrant handles persistence automatically # Qdrant handles persistence automatically
@@ -233,3 +233,43 @@ class QdrantVectorDBStorage(BaseVectorStorage):
logger.debug(f"No relations found for entity {entity_name}") logger.debug(f"No relations found for entity {entity_name}")
except Exception as e: except Exception as e:
logger.error(f"Error deleting relations for {entity_name}: {e}") logger.error(f"Error deleting relations for {entity_name}: {e}")
async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
"""Search for records with IDs starting with a specific prefix.
Args:
prefix: The prefix to search for in record IDs
Returns:
List of records with matching ID prefixes
"""
try:
# Use scroll method to find records with IDs starting with the prefix
results = self._client.scroll(
collection_name=self.namespace,
scroll_filter=models.Filter(
must=[
models.FieldCondition(
key="id", match=models.MatchText(text=prefix, prefix=True)
)
]
),
with_payload=True,
with_vectors=False,
limit=1000, # Adjust as needed for your use case
)
# Extract matching points
matching_records = results[0]
# Format the results to match expected return format
formatted_results = [{**point.payload} for point in matching_records]
logger.debug(
f"Found {len(formatted_results)} records with prefix '{prefix}'"
)
return formatted_results
except Exception as e:
logger.error(f"Error searching for prefix '{prefix}': {e}")
return []


@@ -414,6 +414,55 @@ class TiDBVectorDBStorage(BaseVectorStorage):
# Ti handles persistence automatically
pass
async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
"""Search for records with IDs starting with a specific prefix.
Args:
prefix: The prefix to search for in record IDs
Returns:
List of records with matching ID prefixes
"""
# Determine which table to query based on namespace
if self.namespace == NameSpace.VECTOR_STORE_ENTITIES:
sql_template = """
SELECT entity_id as id, name as entity_name, entity_type, description, content
FROM LIGHTRAG_GRAPH_NODES
WHERE entity_id LIKE :prefix_pattern AND workspace = :workspace
"""
elif self.namespace == NameSpace.VECTOR_STORE_RELATIONSHIPS:
sql_template = """
SELECT relation_id as id, source_name as src_id, target_name as tgt_id,
keywords, description, content
FROM LIGHTRAG_GRAPH_EDGES
WHERE relation_id LIKE :prefix_pattern AND workspace = :workspace
"""
elif self.namespace == NameSpace.VECTOR_STORE_CHUNKS:
sql_template = """
SELECT chunk_id as id, content, tokens, chunk_order_index, full_doc_id
FROM LIGHTRAG_DOC_CHUNKS
WHERE chunk_id LIKE :prefix_pattern AND workspace = :workspace
"""
else:
logger.warning(
f"Namespace {self.namespace} not supported for prefix search"
)
return []
# Add prefix pattern parameter with % for SQL LIKE
prefix_pattern = f"{prefix}%"
params = {"prefix_pattern": prefix_pattern, "workspace": self.db.workspace}
try:
results = await self.db.query(sql_template, params=params, multirows=True)
logger.debug(
f"Found {len(results) if results else 0} records with prefix '{prefix}'"
)
return results if results else []
except Exception as e:
logger.error(f"Error searching records with prefix '{prefix}': {e}")
return []
@final
@dataclass
@@ -968,4 +1017,20 @@ SQL_TEMPLATES = {
WHERE (source_name = :source AND target_name = :target)
AND workspace = :workspace
""",
# Search by prefix SQL templates
"search_entity_by_prefix": """
SELECT entity_id as id, name as entity_name, entity_type, description, content
FROM LIGHTRAG_GRAPH_NODES
WHERE entity_id LIKE :prefix_pattern AND workspace = :workspace
""",
"search_relationship_by_prefix": """
SELECT relation_id as id, source_name as src_id, target_name as tgt_id, keywords, description, content
FROM LIGHTRAG_GRAPH_EDGES
WHERE relation_id LIKE :prefix_pattern AND workspace = :workspace
""",
"search_chunk_by_prefix": """
SELECT chunk_id as id, content, tokens, chunk_order_index, full_doc_id
FROM LIGHTRAG_DOC_CHUNKS
WHERE chunk_id LIKE :prefix_pattern AND workspace = :workspace
""",
}


@@ -504,11 +504,39 @@ class LightRAG:
return text
async def get_knowledge_graph(
-self, node_label: str, max_depth: int
+self,
+node_label: str,
+max_depth: int = 3,
+min_degree: int = 0,
+inclusive: bool = False,
) -> KnowledgeGraph:
-return await self.chunk_entity_relation_graph.get_knowledge_graph(
-node_label=node_label, max_depth=max_depth
-)
+"""Get knowledge graph for a given label
+Args:
+node_label (str): Label to get knowledge graph for
+max_depth (int): Maximum depth of graph
+min_degree (int, optional): Minimum degree of nodes to include. Defaults to 0.
+inclusive (bool, optional): Whether to use inclusive search mode. Defaults to False.
+Returns:
+KnowledgeGraph: Knowledge graph containing nodes and edges
+"""
+# get params supported by get_knowledge_graph of specified storage
+import inspect
+storage_params = inspect.signature(
+self.chunk_entity_relation_graph.get_knowledge_graph
+).parameters
+kwargs = {"node_label": node_label, "max_depth": max_depth}
+if "min_degree" in storage_params and min_degree > 0:
+kwargs["min_degree"] = min_degree
+if "inclusive" in storage_params:
+kwargs["inclusive"] = inclusive
+return await self.chunk_entity_relation_graph.get_knowledge_graph(**kwargs)
def _get_storage_class(self, storage_name: str) -> Callable[..., Any]:
import_path = STORAGES[storage_name]
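A sketch of calling the updated method directly; the parameter names come from the signature above, while the label value and the surrounding setup are illustrative:
```python
import asyncio


async def show_subgraph(rag) -> None:
    # inclusive=True matches any node whose ID contains the label;
    # min_degree drops sparsely connected nodes (where the storage supports it).
    kg = await rag.get_knowledge_graph(
        node_label="FEZZIWIG",
        max_depth=2,
        min_degree=1,
        inclusive=True,
    )
    print(len(kg.nodes), "nodes,", len(kg.edges), "edges")


# asyncio.run(show_subgraph(rag))  # given an initialized LightRAG instance `rag`
```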
@@ -1981,6 +2009,9 @@ class LightRAG:
new_entity_name, new_node_data
)
# Store relationships that need to be updated
relations_to_update = []
# Get all edges related to the original entity
edges = await self.chunk_entity_relation_graph.get_node_edges(
entity_name
@@ -1996,10 +2027,16 @@ class LightRAG:
await self.chunk_entity_relation_graph.upsert_edge(
new_entity_name, target, edge_data
)
relations_to_update.append(
(new_entity_name, target, edge_data)
)
else: # target == entity_name
await self.chunk_entity_relation_graph.upsert_edge(
source, new_entity_name, edge_data
)
relations_to_update.append(
(source, new_entity_name, edge_data)
)
# Delete old entity
await self.chunk_entity_relation_graph.delete_node(entity_name)
@@ -2007,6 +2044,38 @@ class LightRAG:
# Delete old entity record from vector database
old_entity_id = compute_mdhash_id(entity_name, prefix="ent-")
await self.entities_vdb.delete([old_entity_id])
logger.info(
f"Deleted old entity '{entity_name}' and its vector embedding from database"
)
# Update relationship vector representations
for src, tgt, edge_data in relations_to_update:
description = edge_data.get("description", "")
keywords = edge_data.get("keywords", "")
source_id = edge_data.get("source_id", "")
weight = float(edge_data.get("weight", 1.0))
# Create new content for embedding
content = f"{src}\t{tgt}\n{keywords}\n{description}"
# Calculate relationship ID
relation_id = compute_mdhash_id(src + tgt, prefix="rel-")
# Prepare data for vector database update
relation_data = {
relation_id: {
"content": content,
"src_id": src,
"tgt_id": tgt,
"source_id": source_id,
"description": description,
"keywords": keywords,
"weight": weight,
}
}
# Update vector database
await self.relationships_vdb.upsert(relation_data)
# Update working entity name to new name
entity_name = new_entity_name
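For context, a sketch of the rename flow this hunk extends. The method name `aedit_entity` and the shape of `updated_data` are assumptions based on the surrounding code and are not shown in this diff:
```python
import asyncio


async def rename_entity(rag) -> None:
    # Renaming re-points all edges to the new name and, with this change,
    # also refreshes the corresponding relationship records in the vector DB.
    await rag.aedit_entity(
        "Artificial Intelligence",
        {"entity_name": "AI", "description": "Updated description"},
    )


# asyncio.run(rename_entity(rag))  # given an initialized LightRAG instance `rag`
```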
@@ -2105,6 +2174,15 @@ class LightRAG:
f"Relation from '{source_entity}' to '{target_entity}' does not exist" f"Relation from '{source_entity}' to '{target_entity}' does not exist"
) )
# Important: First delete the old relation record from the vector database
old_relation_id = compute_mdhash_id(
source_entity + target_entity, prefix="rel-"
)
await self.relationships_vdb.delete([old_relation_id])
logger.info(
f"Deleted old relation record from vector database for relation {source_entity} -> {target_entity}"
)
# 2. Update relation information in the graph
new_edge_data = {**edge_data, **updated_data}
await self.chunk_entity_relation_graph.upsert_edge(
@@ -2118,7 +2196,7 @@ class LightRAG:
weight = float(new_edge_data.get("weight", 1.0))
# Create content for embedding
-content = f"{keywords}\t{source_entity}\n{target_entity}\n{description}"
+content = f"{source_entity}\t{target_entity}\n{keywords}\n{description}"
# Calculate relation ID
relation_id = compute_mdhash_id(
@@ -2382,3 +2460,409 @@ class LightRAG:
return loop.run_until_complete(
self.acreate_relation(source_entity, target_entity, relation_data)
)
async def amerge_entities(
self,
source_entities: list[str],
target_entity: str,
merge_strategy: dict[str, str] = None,
target_entity_data: dict[str, Any] = None,
) -> dict[str, Any]:
"""Asynchronously merge multiple entities into one entity.
Merges multiple source entities into a target entity, handling all relationships,
and updating both the knowledge graph and vector database.
Args:
source_entities: List of source entity names to merge
target_entity: Name of the target entity after merging
merge_strategy: Merge strategy configuration, e.g. {"description": "concatenate", "entity_type": "keep_first"}
Supported strategies:
- "concatenate": Concatenate all values (for text fields)
- "keep_first": Keep the first non-empty value
- "keep_last": Keep the last non-empty value
- "join_unique": Join all unique values (for fields separated by delimiter)
target_entity_data: Dictionary of specific values to set for the target entity,
overriding any merged values, e.g. {"description": "custom description", "entity_type": "PERSON"}
Returns:
Dictionary containing the merged entity information
"""
try:
# Default merge strategy
default_strategy = {
"description": "concatenate",
"entity_type": "keep_first",
"source_id": "join_unique",
}
merge_strategy = (
default_strategy
if merge_strategy is None
else {**default_strategy, **merge_strategy}
)
target_entity_data = (
{} if target_entity_data is None else target_entity_data
)
# 1. Check if all source entities exist
source_entities_data = {}
for entity_name in source_entities:
node_data = await self.chunk_entity_relation_graph.get_node(entity_name)
if not node_data:
raise ValueError(f"Source entity '{entity_name}' does not exist")
source_entities_data[entity_name] = node_data
# 2. Check if target entity exists and get its data if it does
target_exists = await self.chunk_entity_relation_graph.has_node(
target_entity
)
target_entity_data = {}
if target_exists:
target_entity_data = await self.chunk_entity_relation_graph.get_node(
target_entity
)
logger.info(
f"Target entity '{target_entity}' already exists, will merge data"
)
# 3. Merge entity data
merged_entity_data = self._merge_entity_attributes(
list(source_entities_data.values())
+ ([target_entity_data] if target_exists else []),
merge_strategy,
)
# Apply any explicitly provided target entity data (overrides merged data)
for key, value in target_entity_data.items():
merged_entity_data[key] = value
# 4. Get all relationships of the source entities
all_relations = []
for entity_name in source_entities:
# Get all relationships where this entity is the source
outgoing_edges = await self.chunk_entity_relation_graph.get_node_edges(
entity_name
)
if outgoing_edges:
for src, tgt in outgoing_edges:
# Ensure src is the current entity
if src == entity_name:
edge_data = await self.chunk_entity_relation_graph.get_edge(
src, tgt
)
all_relations.append(("outgoing", src, tgt, edge_data))
# Get all relationships where this entity is the target
incoming_edges = []
all_labels = await self.chunk_entity_relation_graph.get_all_labels()
for label in all_labels:
if label == entity_name:
continue
node_edges = await self.chunk_entity_relation_graph.get_node_edges(
label
)
for src, tgt in node_edges or []:
if tgt == entity_name:
incoming_edges.append((src, tgt))
for src, tgt in incoming_edges:
edge_data = await self.chunk_entity_relation_graph.get_edge(
src, tgt
)
all_relations.append(("incoming", src, tgt, edge_data))
# 5. Create or update the target entity
if not target_exists:
await self.chunk_entity_relation_graph.upsert_node(
target_entity, merged_entity_data
)
logger.info(f"Created new target entity '{target_entity}'")
else:
await self.chunk_entity_relation_graph.upsert_node(
target_entity, merged_entity_data
)
logger.info(f"Updated existing target entity '{target_entity}'")
# 6. Recreate all relationships, pointing to the target entity
relation_updates = {} # Track relationships that need to be merged
for rel_type, src, tgt, edge_data in all_relations:
new_src = target_entity if src in source_entities else src
new_tgt = target_entity if tgt in source_entities else tgt
# Skip relationships between source entities to avoid self-loops
if new_src == new_tgt:
logger.info(
f"Skipping relationship between source entities: {src} -> {tgt} to avoid self-loop"
)
continue
# Check if the same relationship already exists
relation_key = f"{new_src}|{new_tgt}"
if relation_key in relation_updates:
# Merge relationship data
existing_data = relation_updates[relation_key]["data"]
merged_relation = self._merge_relation_attributes(
[existing_data, edge_data],
{
"description": "concatenate",
"keywords": "join_unique",
"source_id": "join_unique",
"weight": "max",
},
)
relation_updates[relation_key]["data"] = merged_relation
logger.info(
f"Merged duplicate relationship: {new_src} -> {new_tgt}"
)
else:
relation_updates[relation_key] = {
"src": new_src,
"tgt": new_tgt,
"data": edge_data.copy(),
}
# Apply relationship updates
for rel_data in relation_updates.values():
await self.chunk_entity_relation_graph.upsert_edge(
rel_data["src"], rel_data["tgt"], rel_data["data"]
)
logger.info(
f"Created or updated relationship: {rel_data['src']} -> {rel_data['tgt']}"
)
# 7. Update entity vector representation
description = merged_entity_data.get("description", "")
source_id = merged_entity_data.get("source_id", "")
entity_type = merged_entity_data.get("entity_type", "")
content = target_entity + "\n" + description
entity_id = compute_mdhash_id(target_entity, prefix="ent-")
entity_data_for_vdb = {
entity_id: {
"content": content,
"entity_name": target_entity,
"source_id": source_id,
"description": description,
"entity_type": entity_type,
}
}
await self.entities_vdb.upsert(entity_data_for_vdb)
# 8. Update relationship vector representations
for rel_data in relation_updates.values():
src = rel_data["src"]
tgt = rel_data["tgt"]
edge_data = rel_data["data"]
description = edge_data.get("description", "")
keywords = edge_data.get("keywords", "")
source_id = edge_data.get("source_id", "")
weight = float(edge_data.get("weight", 1.0))
content = f"{keywords}\t{src}\n{tgt}\n{description}"
relation_id = compute_mdhash_id(src + tgt, prefix="rel-")
relation_data_for_vdb = {
relation_id: {
"content": content,
"src_id": src,
"tgt_id": tgt,
"source_id": source_id,
"description": description,
"keywords": keywords,
"weight": weight,
}
}
await self.relationships_vdb.upsert(relation_data_for_vdb)
# 9. Delete source entities
for entity_name in source_entities:
# Delete entity node from knowledge graph
await self.chunk_entity_relation_graph.delete_node(entity_name)
# Delete entity record from vector database
entity_id = compute_mdhash_id(entity_name, prefix="ent-")
await self.entities_vdb.delete([entity_id])
# Also ensure any relationships specific to this entity are deleted from vector DB
# This is a safety check, as these should have been transformed to the target entity already
entity_relation_prefix = compute_mdhash_id(entity_name, prefix="rel-")
relations_with_entity = await self.relationships_vdb.search_by_prefix(
entity_relation_prefix
)
if relations_with_entity:
relation_ids = [r["id"] for r in relations_with_entity]
await self.relationships_vdb.delete(relation_ids)
logger.info(
f"Deleted {len(relation_ids)} relation records for entity '{entity_name}' from vector database"
)
logger.info(
f"Deleted source entity '{entity_name}' and its vector embedding from database"
)
# 10. Save changes
await self._merge_entities_done()
logger.info(
f"Successfully merged {len(source_entities)} entities into '{target_entity}'"
)
return await self.get_entity_info(target_entity, include_vector_data=True)
except Exception as e:
logger.error(f"Error merging entities: {e}")
raise
def merge_entities(
self,
source_entities: list[str],
target_entity: str,
merge_strategy: dict[str, str] = None,
target_entity_data: dict[str, Any] = None,
) -> dict[str, Any]:
"""Synchronously merge multiple entities into one entity.
Merges multiple source entities into a target entity, handling all relationships,
and updating both the knowledge graph and vector database.
Args:
source_entities: List of source entity names to merge
target_entity: Name of the target entity after merging
merge_strategy: Merge strategy configuration, e.g. {"description": "concatenate", "entity_type": "keep_first"}
target_entity_data: Dictionary of specific values to set for the target entity,
overriding any merged values, e.g. {"description": "custom description", "entity_type": "PERSON"}
Returns:
Dictionary containing the merged entity information
"""
loop = always_get_an_event_loop()
return loop.run_until_complete(
self.amerge_entities(
source_entities, target_entity, merge_strategy, target_entity_data
)
)
def _merge_entity_attributes(
self, entity_data_list: list[dict[str, Any]], merge_strategy: dict[str, str]
) -> dict[str, Any]:
"""Merge attributes from multiple entities.
Args:
entity_data_list: List of dictionaries containing entity data
merge_strategy: Merge strategy for each field
Returns:
Dictionary containing merged entity data
"""
merged_data = {}
# Collect all possible keys
all_keys = set()
for data in entity_data_list:
all_keys.update(data.keys())
# Merge values for each key
for key in all_keys:
# Get all values for this key
values = [data.get(key) for data in entity_data_list if data.get(key)]
if not values:
continue
# Merge values according to strategy
strategy = merge_strategy.get(key, "keep_first")
if strategy == "concatenate":
merged_data[key] = "\n\n".join(values)
elif strategy == "keep_first":
merged_data[key] = values[0]
elif strategy == "keep_last":
merged_data[key] = values[-1]
elif strategy == "join_unique":
# Handle fields separated by GRAPH_FIELD_SEP
unique_items = set()
for value in values:
items = value.split(GRAPH_FIELD_SEP)
unique_items.update(items)
merged_data[key] = GRAPH_FIELD_SEP.join(unique_items)
else:
# Default strategy
merged_data[key] = values[0]
return merged_data
def _merge_relation_attributes(
self, relation_data_list: list[dict[str, Any]], merge_strategy: dict[str, str]
) -> dict[str, Any]:
"""Merge attributes from multiple relationships.
Args:
relation_data_list: List of dictionaries containing relationship data
merge_strategy: Merge strategy for each field
Returns:
Dictionary containing merged relationship data
"""
merged_data = {}
# Collect all possible keys
all_keys = set()
for data in relation_data_list:
all_keys.update(data.keys())
# Merge values for each key
for key in all_keys:
# Get all values for this key
values = [
data.get(key)
for data in relation_data_list
if data.get(key) is not None
]
if not values:
continue
# Merge values according to strategy
strategy = merge_strategy.get(key, "keep_first")
if strategy == "concatenate":
merged_data[key] = "\n\n".join(str(v) for v in values)
elif strategy == "keep_first":
merged_data[key] = values[0]
elif strategy == "keep_last":
merged_data[key] = values[-1]
elif strategy == "join_unique":
# Handle fields separated by GRAPH_FIELD_SEP
unique_items = set()
for value in values:
items = str(value).split(GRAPH_FIELD_SEP)
unique_items.update(items)
merged_data[key] = GRAPH_FIELD_SEP.join(unique_items)
elif strategy == "max":
# For numeric fields like weight
try:
merged_data[key] = max(float(v) for v in values)
except (ValueError, TypeError):
merged_data[key] = values[0]
else:
# Default strategy
merged_data[key] = values[0]
return merged_data
async def _merge_entities_done(self) -> None:
"""Callback after entity merging is complete, ensures updates are persisted"""
await asyncio.gather(
*[
cast(StorageNameSpace, storage_inst).index_done_callback()
for storage_inst in [ # type: ignore
self.entities_vdb,
self.relationships_vdb,
self.chunk_entity_relation_graph,
]
]
)
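
The per-field strategies above ("concatenate", "keep_first", "keep_last", "join_unique", "max") are easiest to see on a toy input. The snippet below is a minimal, self-contained sketch that mirrors the `_merge_entity_attributes` logic for illustration only; the `GRAPH_FIELD_SEP` value and the sample entity dictionaries are assumptions, not part of the library.

```python
GRAPH_FIELD_SEP = "<SEP>"  # assumption: stand-in for the separator imported from lightrag.prompt

def merge_fields(entity_data_list, merge_strategy):
    """Standalone mirror of the _merge_entity_attributes logic above, for illustration."""
    merged = {}
    all_keys = {key for data in entity_data_list for key in data}
    for key in all_keys:
        values = [data.get(key) for data in entity_data_list if data.get(key)]
        if not values:
            continue
        strategy = merge_strategy.get(key, "keep_first")
        if strategy == "concatenate":
            merged[key] = "\n\n".join(values)            # keep every description, in order
        elif strategy == "keep_last":
            merged[key] = values[-1]
        elif strategy == "join_unique":
            items = {item for v in values for item in v.split(GRAPH_FIELD_SEP)}
            merged[key] = GRAPH_FIELD_SEP.join(items)    # de-duplicated, order not guaranteed
        else:                                            # "keep_first" or any unknown strategy
            merged[key] = values[0]
    return merged

# Hypothetical sample entities, for demonstration only
ai = {"entity_type": "category", "description": "Field of computer science.", "source_id": "chunk-1"}
ml = {"entity_type": "technology", "description": "Subset of AI.", "source_id": "chunk-1<SEP>chunk-2"}

print(merge_fields([ai, ml], {
    "description": "concatenate",   # both descriptions survive
    "entity_type": "keep_first",    # "category" wins
    "source_id": "join_unique",     # "chunk-1" appears only once
}))
```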

View File

@@ -384,8 +384,8 @@ async def extract_entities(
         language=language,
     )

-    continue_prompt = PROMPTS["entiti_continue_extraction"]
-    if_loop_prompt = PROMPTS["entiti_if_loop_extraction"]
+    continue_prompt = PROMPTS["entity_continue_extraction"]
+    if_loop_prompt = PROMPTS["entity_if_loop_extraction"]

    processed_chunks = 0
    total_chunks = len(ordered_chunks)
@@ -1156,7 +1156,8 @@ async def _get_node_data(
            "entity",
            "type",
            "description",
-            "rank" "created_at",
+            "rank",
+            "created_at",
        ]
    ]
    for i, n in enumerate(node_datas):
View File

@@ -58,14 +58,16 @@ PROMPTS["entity_extraction_examples"] = [
 Entity_types: [person, technology, mission, organization, location]
 Text:
+```
 while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
-Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. "If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us."
+Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.
 The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce.
 It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. They had all been brought here by different paths
-################
+```
 Output:
 ("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter}
 ("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter}
@@ -81,48 +83,52 @@ Output:
 #############################""",
 """Example 2:
-Entity_types: [person, technology, mission, organization, location]
+Entity_types: [company, index, commodity, market_trend, economic_policy, biological]
 Text:
-They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve.
-Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril.
-Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly
-#############
+```
+Stock markets faced a sharp downturn today as tech giants saw significant declines, with the Global Tech Index dropping by 3.4% in midday trading. Analysts attribute the selloff to investor concerns over rising interest rates and regulatory uncertainty.
+Among the hardest hit, Nexon Technologies saw its stock plummet by 7.8% after reporting lower-than-expected quarterly earnings. In contrast, Omega Energy posted a modest 2.1% gain, driven by rising oil prices.
+Meanwhile, commodity markets reflected a mixed sentiment. Gold futures rose by 1.5%, reaching $2,080 per ounce, as investors sought safe-haven assets. Crude oil prices continued their rally, climbing to $87.60 per barrel, supported by supply constraints and strong demand.
+Financial experts are closely watching the Federal Reserve's next move, as speculation grows over potential rate hikes. The upcoming policy announcement is expected to influence investor confidence and overall market stability.
+```
 Output:
-("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter}
-("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter}
-("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter}
-("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter}
-("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){record_delimiter}
-("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter}
+("entity"{tuple_delimiter}"Global Tech Index"{tuple_delimiter}"index"{tuple_delimiter}"The Global Tech Index tracks the performance of major technology stocks and experienced a 3.4% decline today."){record_delimiter}
+("entity"{tuple_delimiter}"Nexon Technologies"{tuple_delimiter}"company"{tuple_delimiter}"Nexon Technologies is a tech company that saw its stock decline by 7.8% after disappointing earnings."){record_delimiter}
+("entity"{tuple_delimiter}"Omega Energy"{tuple_delimiter}"company"{tuple_delimiter}"Omega Energy is an energy company that gained 2.1% in stock value due to rising oil prices."){record_delimiter}
+("entity"{tuple_delimiter}"Gold Futures"{tuple_delimiter}"commodity"{tuple_delimiter}"Gold futures rose by 1.5%, indicating increased investor interest in safe-haven assets."){record_delimiter}
+("entity"{tuple_delimiter}"Crude Oil"{tuple_delimiter}"commodity"{tuple_delimiter}"Crude oil prices rose to $87.60 per barrel due to supply constraints and strong demand."){record_delimiter}
+("entity"{tuple_delimiter}"Market Selloff"{tuple_delimiter}"market_trend"{tuple_delimiter}"Market selloff refers to the significant decline in stock values due to investor concerns over interest rates and regulations."){record_delimiter}
+("entity"{tuple_delimiter}"Federal Reserve Policy Announcement"{tuple_delimiter}"economic_policy"{tuple_delimiter}"The Federal Reserve's upcoming policy announcement is expected to impact investor confidence and market stability."){record_delimiter}
+("relationship"{tuple_delimiter}"Global Tech Index"{tuple_delimiter}"Market Selloff"{tuple_delimiter}"The decline in the Global Tech Index is part of the broader market selloff driven by investor concerns."{tuple_delimiter}"market performance, investor sentiment"{tuple_delimiter}9){record_delimiter}
+("relationship"{tuple_delimiter}"Nexon Technologies"{tuple_delimiter}"Global Tech Index"{tuple_delimiter}"Nexon Technologies' stock decline contributed to the overall drop in the Global Tech Index."{tuple_delimiter}"company impact, index movement"{tuple_delimiter}8){record_delimiter}
+("relationship"{tuple_delimiter}"Gold Futures"{tuple_delimiter}"Market Selloff"{tuple_delimiter}"Gold prices rose as investors sought safe-haven assets during the market selloff."{tuple_delimiter}"market reaction, safe-haven investment"{tuple_delimiter}10){record_delimiter}
+("relationship"{tuple_delimiter}"Federal Reserve Policy Announcement"{tuple_delimiter}"Market Selloff"{tuple_delimiter}"Speculation over Federal Reserve policy changes contributed to market volatility and investor selloff."{tuple_delimiter}"interest rate impact, financial regulation"{tuple_delimiter}7){record_delimiter}
+("content_keywords"{tuple_delimiter}"market downturn, investor sentiment, commodities, Federal Reserve, stock performance"){completion_delimiter}
 #############################""",
 """Example 3:
-Entity_types: [person, role, technology, organization, event, location, concept]
+Entity_types: [economic_policy, athlete, event, location, record, organization, equipment]
 Text:
-their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data.
-"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning."
-Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back."
-Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history.
-The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation
-#############
+```
+At the World Athletics Championship in Tokyo, Noah Carter broke the 100m sprint record using cutting-edge carbon-fiber spikes.
+```
 Output:
-("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter}
-("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter}
-("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter}
-("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter}
-("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter}
-("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter}
-("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter}
-("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter}
-("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter}
-("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter}
-("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter}
+("entity"{tuple_delimiter}"World Athletics Championship"{tuple_delimiter}"event"{tuple_delimiter}"The World Athletics Championship is a global sports competition featuring top athletes in track and field."){record_delimiter}
+("entity"{tuple_delimiter}"Tokyo"{tuple_delimiter}"location"{tuple_delimiter}"Tokyo is the host city of the World Athletics Championship."){record_delimiter}
+("entity"{tuple_delimiter}"Noah Carter"{tuple_delimiter}"athlete"{tuple_delimiter}"Noah Carter is a sprinter who set a new record in the 100m sprint at the World Athletics Championship."){record_delimiter}
+("entity"{tuple_delimiter}"100m Sprint Record"{tuple_delimiter}"record"{tuple_delimiter}"The 100m sprint record is a benchmark in athletics, recently broken by Noah Carter."){record_delimiter}
+("entity"{tuple_delimiter}"Carbon-Fiber Spikes"{tuple_delimiter}"equipment"{tuple_delimiter}"Carbon-fiber spikes are advanced sprinting shoes that provide enhanced speed and traction."){record_delimiter}
+("entity"{tuple_delimiter}"World Athletics Federation"{tuple_delimiter}"organization"{tuple_delimiter}"The World Athletics Federation is the governing body overseeing the World Athletics Championship and record validations."){record_delimiter}
+("relationship"{tuple_delimiter}"World Athletics Championship"{tuple_delimiter}"Tokyo"{tuple_delimiter}"The World Athletics Championship is being hosted in Tokyo."{tuple_delimiter}"event location, international competition"{tuple_delimiter}8){record_delimiter}
+("relationship"{tuple_delimiter}"Noah Carter"{tuple_delimiter}"100m Sprint Record"{tuple_delimiter}"Noah Carter set a new 100m sprint record at the championship."{tuple_delimiter}"athlete achievement, record-breaking"{tuple_delimiter}10){record_delimiter}
+("relationship"{tuple_delimiter}"Noah Carter"{tuple_delimiter}"Carbon-Fiber Spikes"{tuple_delimiter}"Noah Carter used carbon-fiber spikes to enhance performance during the race."{tuple_delimiter}"athletic equipment, performance boost"{tuple_delimiter}7){record_delimiter}
+("relationship"{tuple_delimiter}"World Athletics Federation"{tuple_delimiter}"100m Sprint Record"{tuple_delimiter}"The World Athletics Federation is responsible for validating and recognizing new sprint records."{tuple_delimiter}"sports regulation, record certification"{tuple_delimiter}9){record_delimiter}
+("content_keywords"{tuple_delimiter}"athletics, sprinting, record-breaking, sports technology, competition"){completion_delimiter}
 #############################""",
 ]
@@ -143,15 +149,47 @@ Description List: {description_list}
 Output:
 """

-PROMPTS[
-    "entiti_continue_extraction"
-] = """MANY entities were missed in the last extraction. Add them below using the same format:
-"""
-
-PROMPTS[
-    "entiti_if_loop_extraction"
-] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added.
-"""
+PROMPTS["entity_continue_extraction"] = """
+MANY entities and relationships were missed in the last extraction.
+
+---Remember Steps---
+
+1. Identify all entities. For each identified entity, extract the following information:
+- entity_name: Name of the entity, use same language as input text. If English, capitalized the name.
+- entity_type: One of the following types: [{entity_types}]
+- entity_description: Comprehensive description of the entity's attributes and activities
+Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>
+
+2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
+For each pair of related entities, extract the following information:
+- source_entity: name of the source entity, as identified in step 1
+- target_entity: name of the target entity, as identified in step 1
+- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
+- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
+Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)
+
+3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
+Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
+
+4. Return output in {language} as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+
+5. When finished, output {completion_delimiter}
+
+---Output---
+
+Add them below using the same format:\n
+""".strip()
+
+PROMPTS["entity_if_loop_extraction"] = """
+---Goal---'
+
+It appears some entities may have still been missed.
+
+---Output---
+
+Answer ONLY by `YES` OR `NO` if there are still entities that need to be added.
+""".strip()

 PROMPTS["fail_response"] = (
     "Sorry, I'm not able to provide an answer to that question.[no-context]"

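The renamed keys above (`entity_continue_extraction`, `entity_if_loop_extraction`) are what `extract_entities` now loads for the gleaning round. As a rough sketch of how such a prompt could be rendered, assuming the delimiter defaults exported by `lightrag.prompt` (`DEFAULT_TUPLE_DELIMITER` and friends) and a call pattern similar to the main extraction prompt; the exact call site in `operate.py` may differ:

```python
# Hedged sketch: rendering the renamed gleaning prompts. The key names come from
# lightrag.prompt as shown in the diff above; the DEFAULT_* delimiter constants and
# this exact call pattern are assumptions, not a transcript of operate.py.
from lightrag.prompt import PROMPTS

context = {
    "tuple_delimiter": PROMPTS["DEFAULT_TUPLE_DELIMITER"],
    "record_delimiter": PROMPTS["DEFAULT_RECORD_DELIMITER"],
    "completion_delimiter": PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
    "entity_types": ", ".join(PROMPTS["DEFAULT_ENTITY_TYPES"]),
    "language": "English",
}

# The continue prompt re-states the extraction steps, so it needs the same placeholders.
continue_prompt = PROMPTS["entity_continue_extraction"].format(**context)

# The if-loop prompt has no placeholders; it only asks for a YES/NO gleaning decision.
if_loop_prompt = PROMPTS["entity_if_loop_extraction"]

print(continue_prompt[:200])
print(if_loop_prompt)
```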
View File

@@ -161,8 +161,12 @@ axiosInstance.interceptors.response.use(
 )

 // API methods
-export const queryGraphs = async (label: string, maxDepth: number): Promise<LightragGraphType> => {
-  const response = await axiosInstance.get(`/graphs?label=${label}&max_depth=${maxDepth}`)
+export const queryGraphs = async (
+  label: string,
+  maxDepth: number,
+  minDegree: number
+): Promise<LightragGraphType> => {
+  const response = await axiosInstance.get(`/graphs?label=${encodeURIComponent(label)}&max_depth=${maxDepth}&min_degree=${minDegree}`)
   return response.data
 }
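
For testing the same endpoint outside the web UI, here is a hedged sketch in Python; it assumes a LightRAG API server on `localhost:9621` with no auth and a JSON body carrying `nodes`/`edges` lists, so adjust host, headers, and field names to your deployment:

```python
# Hedged sketch: hitting the /graphs endpoint directly with the new min_degree
# filter, mirroring the URL the web UI builds above. Host, auth, and the exact
# response shape are assumptions about a local deployment.
import requests

resp = requests.get(
    "http://localhost:9621/graphs",
    params={"label": "*", "max_depth": 3, "min_degree": 2},
    timeout=30,
)
resp.raise_for_status()
graph = resp.json()
print(len(graph.get("nodes", [])), "nodes,", len(graph.get("edges", [])), "edges")
```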

View File

@@ -40,18 +40,21 @@ const GraphControl = ({ disableHoverEffect }: { disableHoverEffect?: boolean })
   const focusedEdge = useGraphStore.use.focusedEdge()

   /**
-   * When component mount
-   * => load the graph
+   * When component mount or maxIterations changes
+   * => load the graph and apply layout
    */
   useEffect(() => {
     // Create & load the graph
     const graph = lightrageGraph()
     loadGraph(graph)
-    if (!(graph as any).__force_applied) {
-      assignLayout()
-      Object.assign(graph, { __force_applied: true })
-    }
+    assignLayout()
+  }, [assignLayout, loadGraph, lightrageGraph, maxIterations])

+  /**
+   * When component mount
+   * => register events
+   */
+  useEffect(() => {
     const { setFocusedNode, setSelectedNode, setFocusedEdge, setSelectedEdge, clearSelection } =
       useGraphStore.getState()
@@ -87,7 +90,7 @@ const GraphControl = ({ disableHoverEffect }: { disableHoverEffect?: boolean })
       },
       clickStage: () => clearSelection()
     })
-  }, [assignLayout, loadGraph, registerEvents, lightrageGraph])
+  }, [registerEvents])

   /**
    * When component mount or hovered node change

View File

@@ -90,9 +90,12 @@ const LabeledNumberInput = ({
         {label}
       </label>
       <Input
-        value={currentValue || ''}
+        type="number"
+        value={currentValue === null ? '' : currentValue}
         onChange={onValueChange}
-        className="h-6 w-full min-w-0"
+        className="h-6 w-full min-w-0 pr-1"
+        min={min}
+        max={max}
        onBlur={onBlur}
        onKeyDown={(e) => {
          if (e.key === 'Enter') {
@@ -119,6 +122,7 @@ export default function Settings() {
   const enableHideUnselectedEdges = useSettingsStore.use.enableHideUnselectedEdges()
   const showEdgeLabel = useSettingsStore.use.showEdgeLabel()
   const graphQueryMaxDepth = useSettingsStore.use.graphQueryMaxDepth()
+  const graphMinDegree = useSettingsStore.use.graphMinDegree()
   const graphLayoutMaxIterations = useSettingsStore.use.graphLayoutMaxIterations()
   const enableHealthCheck = useSettingsStore.use.enableHealthCheck()
@@ -177,6 +181,11 @@ export default function Settings() {
     useSettingsStore.setState({ graphQueryMaxDepth: depth })
   }, [])

+  const setGraphMinDegree = useCallback((degree: number) => {
+    if (degree < 0) return
+    useSettingsStore.setState({ graphMinDegree: degree })
+  }, [])
+
   const setGraphLayoutMaxIterations = useCallback((iterations: number) => {
     if (iterations < 1) return
     useSettingsStore.setState({ graphLayoutMaxIterations: iterations })
@@ -266,6 +275,12 @@ export default function Settings() {
           value={graphQueryMaxDepth}
           onEditFinished={setGraphQueryMaxDepth}
         />
+        <LabeledNumberInput
+          label="Minimum Degree"
+          min={0}
+          value={graphMinDegree}
+          onEditFinished={setGraphMinDegree}
+        />
         <LabeledNumberInput
           label="Max Layout Iterations"
           min={1}

View File

@@ -7,7 +7,7 @@ const Input = React.forwardRef<HTMLInputElement, React.ComponentProps<'input'>>(
       <input
         type={type}
         className={cn(
-          'border-input file:text-foreground placeholder:text-muted-foreground focus-visible:ring-ring flex h-9 rounded-md border bg-transparent px-3 py-1 text-base shadow-sm transition-colors file:border-0 file:bg-transparent file:text-sm file:font-medium focus-visible:ring-1 focus-visible:outline-none disabled:cursor-not-allowed disabled:opacity-50 md:text-sm',
+          'border-input file:text-foreground placeholder:text-muted-foreground focus-visible:ring-ring flex h-9 rounded-md border bg-transparent px-3 py-1 text-base shadow-sm transition-colors file:border-0 file:bg-transparent file:text-sm file:font-medium focus-visible:ring-1 focus-visible:outline-none disabled:cursor-not-allowed disabled:opacity-50 md:text-sm [&::-webkit-inner-spin-button]:opacity-100 [&::-webkit-outer-spin-button]:opacity-100',
           className
         )}
        ref={ref}

View File

@@ -50,11 +50,11 @@ export type NodeType = {
 }
 export type EdgeType = { label: string }

-const fetchGraph = async (label: string, maxDepth: number) => {
+const fetchGraph = async (label: string, maxDepth: number, minDegree: number) => {
   let rawData: any = null

   try {
-    rawData = await queryGraphs(label, maxDepth)
+    rawData = await queryGraphs(label, maxDepth, minDegree)
   } catch (e) {
     useBackendState.getState().setErrorMessage(errorMessage(e), 'Query Graphs Error!')
     return null
@@ -161,13 +161,14 @@ const createSigmaGraph = (rawGraph: RawGraph | null) => {
   return graph
 }

-const lastQueryLabel = { label: '', maxQueryDepth: 0 }
+const lastQueryLabel = { label: '', maxQueryDepth: 0, minDegree: 0 }

 const useLightrangeGraph = () => {
   const queryLabel = useSettingsStore.use.queryLabel()
   const rawGraph = useGraphStore.use.rawGraph()
   const sigmaGraph = useGraphStore.use.sigmaGraph()
   const maxQueryDepth = useSettingsStore.use.graphQueryMaxDepth()
+  const minDegree = useSettingsStore.use.graphMinDegree()

   const getNode = useCallback(
     (nodeId: string) => {
@@ -185,13 +186,16 @@ const useLightrangeGraph = () => {
   useEffect(() => {
     if (queryLabel) {
-      if (lastQueryLabel.label !== queryLabel || lastQueryLabel.maxQueryDepth !== maxQueryDepth) {
+      if (lastQueryLabel.label !== queryLabel ||
+          lastQueryLabel.maxQueryDepth !== maxQueryDepth ||
+          lastQueryLabel.minDegree !== minDegree) {
         lastQueryLabel.label = queryLabel
         lastQueryLabel.maxQueryDepth = maxQueryDepth
+        lastQueryLabel.minDegree = minDegree

         const state = useGraphStore.getState()
         state.reset()
-        fetchGraph(queryLabel, maxQueryDepth).then((data) => {
+        fetchGraph(queryLabel, maxQueryDepth, minDegree).then((data) => {
           // console.debug('Query label: ' + queryLabel)
           state.setSigmaGraph(createSigmaGraph(data))
           data?.buildDynamicMap()
@@ -203,7 +207,7 @@ const useLightrangeGraph = () => {
       state.reset()
       state.setSigmaGraph(new DirectedGraph())
     }
-  }, [queryLabel, maxQueryDepth])
+  }, [queryLabel, maxQueryDepth, minDegree])

   const lightrageGraph = useCallback(() => {
     if (sigmaGraph) {
View File

@@ -22,6 +22,9 @@ interface SettingsState {
   graphQueryMaxDepth: number
   setGraphQueryMaxDepth: (depth: number) => void

+  graphMinDegree: number
+  setGraphMinDegree: (degree: number) => void
+
   graphLayoutMaxIterations: number
   setGraphLayoutMaxIterations: (iterations: number) => void
@@ -66,6 +69,7 @@ const useSettingsStoreBase = create<SettingsState>()(
       enableEdgeEvents: false,
       graphQueryMaxDepth: 3,
+      graphMinDegree: 0,
       graphLayoutMaxIterations: 10,
       queryLabel: defaultQueryLabel,
@@ -107,6 +111,8 @@ const useSettingsStoreBase = create<SettingsState>()(
      setGraphQueryMaxDepth: (depth: number) => set({ graphQueryMaxDepth: depth }),

+      setGraphMinDegree: (degree: number) => set({ graphMinDegree: degree }),
+
       setEnableHealthCheck: (enable: boolean) => set({ enableHealthCheck: enable }),
       setApiKey: (apiKey: string | null) => set({ apiKey }),

View File

@@ -1 +1,11 @@
 /// <reference types="vite/client" />
+
+interface ImportMetaEnv {
+  readonly VITE_API_PROXY: string
+  readonly VITE_API_ENDPOINTS: string
+  readonly VITE_BACKEND_URL: string
+}
+
+interface ImportMeta {
+  readonly env: ImportMetaEnv
+}

View File

@@ -26,5 +26,5 @@
       "@/*": ["./src/*"]
     }
   },
-  "include": ["src"]
+  "include": ["src", "vite.config.ts"]
 }

View File

@@ -14,6 +14,21 @@ export default defineConfig({
   },
   base: './',
   build: {
-    outDir: path.resolve(__dirname, '../lightrag/api/webui')
+    outDir: path.resolve(__dirname, '../lightrag/api/webui'),
+    emptyOutDir: true
+  },
+  server: {
+    proxy: import.meta.env.VITE_API_PROXY === 'true' && import.meta.env.VITE_API_ENDPOINTS ?
+      Object.fromEntries(
+        import.meta.env.VITE_API_ENDPOINTS.split(',').map(endpoint => [
+          endpoint,
+          {
+            target: import.meta.env.VITE_BACKEND_URL || 'http://localhost:9621',
+            changeOrigin: true,
+            rewrite: endpoint === '/api' ?
+              (path) => path.replace(/^\/api/, '') : undefined
+          }
+        ])
+      ) : {}
   }
 })