Merge pull request #1510 from danielaskdd/fix-time-zone
Fix time zone and created_at problem
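At a glance, this PR does two things: it normalizes datetimes to timezone-aware ISO-8601 strings before they leave the API (naive values are treated as UTC), and it stamps vector-storage records with a `created_at` Unix timestamp so query results can report when an item was indexed. The standalone sketch below mirrors the `format_datetime` helper added to the document-routes module later in this diff; the function name here is illustrative, not part of the library API.

```python
from datetime import datetime, timezone
from typing import Any, Optional


def to_iso_with_tz(dt: Any) -> Optional[str]:
    """Serialize a datetime as ISO-8601 with timezone info; naive values are treated as UTC."""
    if dt is None:
        return None
    if isinstance(dt, str):
        return dt  # already serialized
    if isinstance(dt, datetime) and dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.isoformat()


# A naive datetime now serializes with an explicit UTC offset:
print(to_iso_with_tz(datetime(2024, 1, 1, 12, 0)))  # -> 2024-01-01T12:00:00+00:00
```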
@@ -1,7 +0,0 @@
AZURE_OPENAI_API_VERSION=2024-08-01-preview
AZURE_OPENAI_DEPLOYMENT=gpt-4o
AZURE_OPENAI_API_KEY=myapikey
AZURE_OPENAI_ENDPOINT=https://myendpoint.openai.azure.com

AZURE_EMBEDDING_DEPLOYMENT=text-embedding-3-large
AZURE_EMBEDDING_API_VERSION=2023-05-15
@@ -1,108 +0,0 @@
import re
import json
import jsonlines

from openai import OpenAI


def batch_eval(query_file, result1_file, result2_file, output_file_path):
    client = OpenAI()

    with open(query_file, "r") as f:
        data = f.read()

    queries = re.findall(r"- Question \d+: (.+)", data)

    with open(result1_file, "r") as f:
        answers1 = json.load(f)
    answers1 = [i["result"] for i in answers1]

    with open(result2_file, "r") as f:
        answers2 = json.load(f)
    answers2 = [i["result"] for i in answers2]

    requests = []
    for i, (query, answer1, answer2) in enumerate(zip(queries, answers1, answers2)):
        sys_prompt = """
---Role---
You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
"""

        prompt = f"""
You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.

- **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question?
- **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question?
- **Empowerment**: How well does the answer help the reader understand and make informed judgments about the topic?

For each criterion, choose the better answer (either Answer 1 or Answer 2) and explain why. Then, select an overall winner based on these three categories.

Here is the question:
{query}

Here are the two answers:

**Answer 1:**
{answer1}

**Answer 2:**
{answer2}

Evaluate both answers using the three criteria listed above and provide detailed explanations for each criterion.

Output your evaluation in the following JSON format:

{{
    "Comprehensiveness": {{
        "Winner": "[Answer 1 or Answer 2]",
        "Explanation": "[Provide explanation here]"
    }},
    "Empowerment": {{
        "Winner": "[Answer 1 or Answer 2]",
        "Explanation": "[Provide explanation here]"
    }},
    "Overall Winner": {{
        "Winner": "[Answer 1 or Answer 2]",
        "Explanation": "[Summarize why this answer is the overall winner based on the three criteria]"
    }}
}}
"""

        request_data = {
            "custom_id": f"request-{i+1}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": prompt},
                ],
            },
        }

        requests.append(request_data)

    with jsonlines.open(output_file_path, mode="w") as writer:
        for request in requests:
            writer.write(request)

    print(f"Batch API requests written to {output_file_path}")

    batch_input_file = client.files.create(
        file=open(output_file_path, "rb"), purpose="batch"
    )
    batch_input_file_id = batch_input_file.id

    batch = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": "nightly eval job"},
    )

    print(f"Batch {batch.id} has been created.")


if __name__ == "__main__":
    batch_eval()
@@ -1,55 +0,0 @@
from openai import OpenAI

# os.environ["OPENAI_API_KEY"] = ""


def openai_complete_if_cache(
    model="gpt-4o-mini", prompt=None, system_prompt=None, history_messages=[], **kwargs
) -> str:
    openai_client = OpenAI()

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})

    response = openai_client.chat.completions.create(
        model=model, messages=messages, **kwargs
    )
    return response.choices[0].message.content


if __name__ == "__main__":
    description = ""
    prompt = f"""
Given the following description of a dataset:

{description}

Please identify 5 potential users who would engage with this dataset. For each user, list 5 tasks they would perform with this dataset. Then, for each (user, task) combination, generate 5 questions that require a high-level understanding of the entire dataset.

Output the results in the following structure:
- User 1: [user description]
    - Task 1: [task description]
        - Question 1:
        - Question 2:
        - Question 3:
        - Question 4:
        - Question 5:
    - Task 2: [task description]
        ...
    - Task 5: [task description]
- User 2: [user description]
    ...
- User 5: [user description]
    ...
"""

    result = openai_complete_if_cache(model="gpt-4o-mini", prompt=prompt)

    file_path = "./queries.txt"
    with open(file_path, "w") as file:
        file.write(result)

    print(f"Queries written to {file_path}")
@@ -1,40 +0,0 @@
import networkx as nx

G = nx.read_graphml("./dickensTestEmbedcall/graph_chunk_entity_relation.graphml")


def get_all_edges_and_nodes(G):
    # Get all edges and their properties
    edges_with_properties = []
    for u, v, data in G.edges(data=True):
        edges_with_properties.append(
            {
                "start": u,
                "end": v,
                "label": data.get(
                    "label", ""
                ),  # Assuming 'label' is used for edge type
                "properties": data,
                "start_node_properties": G.nodes[u],
                "end_node_properties": G.nodes[v],
            }
        )

    return edges_with_properties


# Example usage
if __name__ == "__main__":
    # Assume G is your NetworkX graph loaded from Neo4j

    all_edges = get_all_edges_and_nodes(G)

    # Print all edges and node properties
    for edge in all_edges:
        print(f"Edge Label: {edge['label']}")
        print(f"Edge Properties: {edge['properties']}")
        print(f"Start Node: {edge['start']}")
        print(f"Start Node Properties: {edge['start_node_properties']}")
        print(f"End Node: {edge['end']}")
        print(f"End Node Properties: {edge['end_node_properties']}")
        print("---")
@@ -1,114 +0,0 @@
## API Server Implementation

LightRAG also provides a FastAPI-based server implementation for RESTful API access to RAG operations. This allows you to run LightRAG as a service and interact with it through HTTP requests.

### Setting up the API Server
<details>
<summary>Click to expand setup instructions</summary>

1. First, ensure you have the required dependencies:
```bash
pip install fastapi uvicorn pydantic
```

2. Set up your environment variables:
```bash
export RAG_DIR="your_index_directory"  # Optional: Defaults to "index_default"
export OPENAI_BASE_URL="Your OpenAI API base URL"  # Optional: Defaults to "https://api.openai.com/v1"
export OPENAI_API_KEY="Your OpenAI API key"  # Required
export LLM_MODEL="Your LLM model"  # Optional: Defaults to "gpt-4o-mini"
export EMBEDDING_MODEL="Your embedding model"  # Optional: Defaults to "text-embedding-3-large"
```

3. Run the API server:
```bash
python examples/lightrag_api_openai_compatible_demo.py
```

The server will start on `http://0.0.0.0:8020`.
</details>

### API Endpoints

The API server provides the following endpoints:

#### 1. Query Endpoint
<details>
<summary>Click to view Query endpoint details</summary>

- **URL:** `/query`
- **Method:** POST
- **Body:**
```json
{
    "query": "Your question here",
    "mode": "hybrid",  // Can be "naive", "local", "global", or "hybrid"
    "only_need_context": true  // Optional, defaults to false; if true, only the retrieved context is returned instead of the LLM answer
}
```
- **Example:**
```bash
curl -X POST "http://127.0.0.1:8020/query" \
     -H "Content-Type: application/json" \
     -d '{"query": "What are the main themes?", "mode": "hybrid"}'
```
</details>

#### 2. Insert Text Endpoint
<details>
<summary>Click to view Insert Text endpoint details</summary>

- **URL:** `/insert`
- **Method:** POST
- **Body:**
```json
{
    "text": "Your text content here"
}
```
- **Example:**
```bash
curl -X POST "http://127.0.0.1:8020/insert" \
     -H "Content-Type: application/json" \
     -d '{"text": "Content to be inserted into RAG"}'
```
</details>

#### 3. Insert File Endpoint
<details>
<summary>Click to view Insert File endpoint details</summary>

- **URL:** `/insert_file`
- **Method:** POST
- **Body:**
```json
{
    "file_path": "path/to/your/file.txt"
}
```
- **Example:**
```bash
curl -X POST "http://127.0.0.1:8020/insert_file" \
     -H "Content-Type: application/json" \
     -d '{"file_path": "./book.txt"}'
```
</details>

#### 4. Health Check Endpoint
<details>
<summary>Click to view Health Check endpoint details</summary>

- **URL:** `/health`
- **Method:** GET
- **Example:**
```bash
curl -X GET "http://127.0.0.1:8020/health"
```
</details>

### Configuration

The API server can be configured using environment variables:
- `RAG_DIR`: Directory for storing the RAG index (default: "index_default")
- API keys and base URLs should be configured in the code for your specific LLM and embedding model providers
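The curl examples above document the endpoints of the API server README section being removed in this diff. For readers who prefer Python, an equivalent call to the `/query` endpoint could look like the sketch below; it assumes the server from the setup steps is running locally and uses the third-party `requests` package.

```python
import requests

# Query the LightRAG API server described above (illustrative values)
response = requests.post(
    "http://127.0.0.1:8020/query",
    json={"query": "What are the main themes?", "mode": "hybrid"},
    timeout=60,
)
response.raise_for_status()
print(response.json())
```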
@@ -1,115 +0,0 @@
## API Server Implementation

LightRAG also provides a FastAPI-based server implementation for RESTful API access to RAG operations. This allows you to run LightRAG as a service and interact with it through HTTP requests.

### Setting up the API Server
<details>
<summary>Click to expand setup instructions</summary>

1. First, ensure you have the required dependencies:
```bash
pip install fastapi uvicorn pydantic
```

2. Set up your environment variables:
```bash
export RAG_DIR="your_index_directory"  # Optional: Defaults to "index_default"
export OPENAI_BASE_URL="Your OpenAI API base URL"  # Optional: Defaults to "https://api.openai.com/v1"
export OPENAI_API_KEY="Your OpenAI API key"  # Required
export LLM_MODEL="Your LLM model"  # Optional: Defaults to "gpt-4o-mini"
export EMBEDDING_MODEL="Your embedding model"  # Optional: Defaults to "text-embedding-3-large"
```

3. Run the API server:
```bash
python examples/lightrag_api_openai_compatible_demo.py
```

The server will start on `http://0.0.0.0:8020`.
</details>

### API Endpoints

The API server provides the following endpoints:

#### 1. Query Endpoint
<details>
<summary>Click to view Query endpoint details</summary>

- **URL:** `/query`
- **Method:** POST
- **Body:**
```json
{
    "query": "Your question here",
    "mode": "hybrid",  // Can be "naive", "local", "global", or "hybrid"
    "only_need_context": true  // Optional, defaults to false; if true, only the retrieved context is returned instead of the LLM answer
}
```
- **Example:**
```bash
curl -X POST "http://127.0.0.1:8020/query" \
     -H "Content-Type: application/json" \
     -d '{"query": "What are the main themes?", "mode": "hybrid"}'
```
</details>

#### 2. Insert Text Endpoint
<details>
<summary>Click to view Insert Text endpoint details</summary>

- **URL:** `/insert`
- **Method:** POST
- **Body:**
```json
{
    "text": "Your text content here"
}
```
- **Example:**
```bash
curl -X POST "http://127.0.0.1:8020/insert" \
     -H "Content-Type: application/json" \
     -d '{"text": "Content to be inserted into RAG"}'
```
</details>

#### 3. Insert File Endpoint
<details>
<summary>Click to view Insert File endpoint details</summary>

- **URL:** `/insert_file`
- **Method:** POST
- **Body:**
```json
{
    "file_path": "path/to/your/file.txt"
}
```
- **Example:**
```bash
curl -X POST "http://127.0.0.1:8020/insert_file" \
     -H "Content-Type: application/json" \
     -d '{"file_path": "./book.txt"}'
```
</details>

#### 4. Health Check Endpoint
<details>
<summary>Click to view Health Check endpoint details</summary>

- **URL:** `/health`
- **Method:** GET
- **Example:**
```bash
curl -X GET "http://127.0.0.1:8020/health"
```
</details>

### Configuration

The API server can be configured using environment variables:
- `RAG_DIR`: Directory for storing the RAG index (default: "index_default")
- API keys and base URLs should be configured in the code for your specific LLM and embedding model providers
@@ -1,68 +0,0 @@
import os
import asyncio
from lightrag import LightRAG, QueryParam
from lightrag.llm.openai import gpt_4o_mini_complete
from lightrag.kg.shared_storage import initialize_pipeline_status

#########
# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert()
# import nest_asyncio
# nest_asyncio.apply()
#########

WORKING_DIR = "./dickens"

if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)


async def initialize_rag():
    rag = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=gpt_4o_mini_complete,  # Use gpt_4o_mini_complete LLM model
        # llm_model_func=gpt_4o_complete  # Optionally, use a stronger model
    )

    await rag.initialize_storages()
    await initialize_pipeline_status()

    return rag


def main():
    # Initialize RAG instance
    rag = asyncio.run(initialize_rag())

    with open("./book.txt", "r", encoding="utf-8") as f:
        rag.insert(f.read())

    # Perform naive search
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="naive")
        )
    )

    # Perform local search
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="local")
        )
    )

    # Perform global search
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="global")
        )
    )

    # Perform hybrid search
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="hybrid")
        )
    )


if __name__ == "__main__":
    main()
@@ -1,158 +0,0 @@
import os
import asyncio
from lightrag import LightRAG, QueryParam
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.utils import EmbeddingFunc
import numpy as np
from lightrag.kg.shared_storage import initialize_pipeline_status

#########
# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert()
# import nest_asyncio
# nest_asyncio.apply()
#########
WORKING_DIR = "./chromadb_test_dir"
if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)

# ChromaDB Configuration
CHROMADB_USE_LOCAL_PERSISTENT = False
# Local PersistentClient Configuration
CHROMADB_LOCAL_PATH = os.environ.get(
    "CHROMADB_LOCAL_PATH", os.path.join(WORKING_DIR, "chroma_data")
)
# Remote HttpClient Configuration
CHROMADB_HOST = os.environ.get("CHROMADB_HOST", "localhost")
CHROMADB_PORT = int(os.environ.get("CHROMADB_PORT", 8000))
CHROMADB_AUTH_TOKEN = os.environ.get("CHROMADB_AUTH_TOKEN", "secret-token")
CHROMADB_AUTH_PROVIDER = os.environ.get(
    "CHROMADB_AUTH_PROVIDER", "chromadb.auth.token_authn.TokenAuthClientProvider"
)
CHROMADB_AUTH_HEADER = os.environ.get("CHROMADB_AUTH_HEADER", "X-Chroma-Token")

# Embedding Configuration and Functions
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-large")
EMBEDDING_MAX_TOKEN_SIZE = int(os.environ.get("EMBEDDING_MAX_TOKEN_SIZE", 8192))

# ChromaDB requires knowing the dimension of embeddings upfront when
# creating a collection. The embedding dimension is model-specific
# (e.g. text-embedding-3-large uses 3072 dimensions).
# We dynamically determine it by running a test embedding
# and then pass it to the ChromaDBStorage class.


async def embedding_func(texts: list[str]) -> np.ndarray:
    return await openai_embed(
        texts,
        model=EMBEDDING_MODEL,
    )


async def get_embedding_dimension():
    test_text = ["This is a test sentence."]
    embedding = await embedding_func(test_text)
    return embedding.shape[1]


async def create_embedding_function_instance():
    # Get embedding dimension
    embedding_dimension = await get_embedding_dimension()
    # Create embedding function instance
    return EmbeddingFunc(
        embedding_dim=embedding_dimension,
        max_token_size=EMBEDDING_MAX_TOKEN_SIZE,
        func=embedding_func,
    )


async def initialize_rag():
    embedding_func_instance = await create_embedding_function_instance()
    if CHROMADB_USE_LOCAL_PERSISTENT:
        rag = LightRAG(
            working_dir=WORKING_DIR,
            llm_model_func=gpt_4o_mini_complete,
            embedding_func=embedding_func_instance,
            vector_storage="ChromaVectorDBStorage",
            log_level="DEBUG",
            embedding_batch_num=32,
            vector_db_storage_cls_kwargs={
                "local_path": CHROMADB_LOCAL_PATH,
                "collection_settings": {
                    "hnsw:space": "cosine",
                    "hnsw:construction_ef": 128,
                    "hnsw:search_ef": 128,
                    "hnsw:M": 16,
                    "hnsw:batch_size": 100,
                    "hnsw:sync_threshold": 1000,
                },
            },
        )
    else:
        rag = LightRAG(
            working_dir=WORKING_DIR,
            llm_model_func=gpt_4o_mini_complete,
            embedding_func=embedding_func_instance,
            vector_storage="ChromaVectorDBStorage",
            log_level="DEBUG",
            embedding_batch_num=32,
            vector_db_storage_cls_kwargs={
                "host": CHROMADB_HOST,
                "port": CHROMADB_PORT,
                "auth_token": CHROMADB_AUTH_TOKEN,
                "auth_provider": CHROMADB_AUTH_PROVIDER,
                "auth_header_name": CHROMADB_AUTH_HEADER,
                "collection_settings": {
                    "hnsw:space": "cosine",
                    "hnsw:construction_ef": 128,
                    "hnsw:search_ef": 128,
                    "hnsw:M": 16,
                    "hnsw:batch_size": 100,
                    "hnsw:sync_threshold": 1000,
                },
            },
        )

    await rag.initialize_storages()
    await initialize_pipeline_status()

    return rag


def main():
    # Initialize RAG instance
    rag = asyncio.run(initialize_rag())

    with open("./book.txt", "r", encoding="utf-8") as f:
        rag.insert(f.read())

    # Perform naive search
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="naive")
        )
    )

    # Perform local search
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="local")
        )
    )

    # Perform global search
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="global")
        )
    )

    # Perform hybrid search
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="hybrid")
        )
    )


if __name__ == "__main__":
    main()
@@ -1,108 +0,0 @@
import os
import logging
import asyncio
import numpy as np

from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

from openai import AzureOpenAI
from lightrag import LightRAG, QueryParam
from lightrag.utils import EmbeddingFunc
from lightrag.kg.shared_storage import initialize_pipeline_status

WORKING_DIR = "./dickens"
# Configure Logging
logging.basicConfig(level=logging.INFO)

# Load environment variables from .env file
load_dotenv()
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")


async def llm_model_func(
    prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:
    # Create a client for AzureOpenAI
    client = AzureOpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        api_version=AZURE_OPENAI_API_VERSION,
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
    )

    # Build the messages list for the conversation
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    if history_messages:
        messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})

    # Call the LLM
    chat_completion = client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=messages,
        temperature=kwargs.get("temperature", 0),
        top_p=kwargs.get("top_p", 1),
        n=kwargs.get("n", 1),
    )

    return chat_completion.choices[0].message.content


async def embedding_func(texts: list[str]) -> np.ndarray:
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(texts, convert_to_numpy=True)
    return embeddings


async def initialize_rag():
    rag = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=llm_model_func,
        embedding_func=EmbeddingFunc(
            embedding_dim=384,
            max_token_size=8192,
            func=embedding_func,
        ),
        vector_storage="FaissVectorDBStorage",
        vector_db_storage_cls_kwargs={
            "cosine_better_than_threshold": 0.2  # Your desired threshold
        },
    )

    await rag.initialize_storages()
    await initialize_pipeline_status()

    return rag


def main():
    # Initialize RAG instance
    rag = asyncio.run(initialize_rag())
    # Insert the custom chunks into LightRAG
    book1 = open("./book_1.txt", encoding="utf-8")
    book2 = open("./book_2.txt", encoding="utf-8")

    rag.insert([book1.read(), book2.read()])

    query_text = "What are the main themes?"

    print("Result (Naive):")
    print(rag.query(query_text, param=QueryParam(mode="naive")))

    print("\nResult (Local):")
    print(rag.query(query_text, param=QueryParam(mode="local")))

    print("\nResult (Global):")
    print(rag.query(query_text, param=QueryParam(mode="global")))

    print("\nResult (Hybrid):")
    print(rag.query(query_text, param=QueryParam(mode="hybrid")))


if __name__ == "__main__":
    main()
@@ -1,71 +0,0 @@
import os
import asyncio
from lightrag import LightRAG, QueryParam
from lightrag.llm.openai import gpt_4o_mini_complete
from lightrag.kg.shared_storage import initialize_pipeline_status

#########
# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert()
# import nest_asyncio
# nest_asyncio.apply()
#########

WORKING_DIR = "./local_neo4jWorkDir"

if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)


async def initialize_rag():
    rag = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=gpt_4o_mini_complete,  # Use gpt_4o_mini_complete LLM model
        graph_storage="Neo4JStorage",
        log_level="INFO",
        # llm_model_func=gpt_4o_complete  # Optionally, use a stronger model
    )

    await rag.initialize_storages()
    await initialize_pipeline_status()

    return rag


def main():
    # Initialize RAG instance
    rag = asyncio.run(initialize_rag())

    with open("./book.txt", "r", encoding="utf-8") as f:
        rag.insert(f.read())

    # Perform naive search
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="naive")
        )
    )

    # Perform local search
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="local")
        )
    )

    # Perform global search
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="global")
        )
    )

    # Perform hybrid search
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="hybrid")
        )
    )


if __name__ == "__main__":
    main()
@@ -1,51 +0,0 @@
import os
import asyncio
from lightrag.kg.postgres_impl import PGGraphStorage
from lightrag.llm.ollama import ollama_embedding
from lightrag.utils import EmbeddingFunc

#########
# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert()
# import nest_asyncio
# nest_asyncio.apply()
#########

WORKING_DIR = "./local_neo4jWorkDir"

if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)

# AGE
os.environ["AGE_GRAPH_NAME"] = "dickens"

os.environ["POSTGRES_HOST"] = "localhost"
os.environ["POSTGRES_PORT"] = "15432"
os.environ["POSTGRES_USER"] = "rag"
os.environ["POSTGRES_PASSWORD"] = "rag"
os.environ["POSTGRES_DATABASE"] = "rag"


async def main():
    graph_db = PGGraphStorage(
        namespace="dickens",
        embedding_func=EmbeddingFunc(
            embedding_dim=1024,
            max_token_size=8192,
            func=lambda texts: ollama_embedding(
                texts, embed_model="bge-m3", host="http://localhost:11434"
            ),
        ),
        global_config={},
    )
    await graph_db.initialize()
    labels = await graph_db.get_all_labels()
    print("all labels", labels)

    res = await graph_db.get_knowledge_graph("FEZZIWIG")
    print("knowledge graphs", res)

    await graph_db.finalize()


if __name__ == "__main__":
    asyncio.run(main())
@@ -1,121 +0,0 @@
import os
import time
import asyncio
from lightrag import LightRAG, QueryParam
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
from lightrag.utils import EmbeddingFunc
from lightrag.kg.shared_storage import initialize_pipeline_status

# Working directory and the directory path for text files
WORKING_DIR = "./dickens"
TEXT_FILES_DIR = "/llm/mt"

# Create the working directory if it doesn't exist
if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)


async def initialize_rag():
    # Initialize LightRAG
    rag = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=ollama_model_complete,
        llm_model_name="qwen2.5:3b-instruct-max-context",
        embedding_func=EmbeddingFunc(
            embedding_dim=768,
            max_token_size=8192,
            func=lambda texts: ollama_embed(texts, embed_model="nomic-embed-text"),
        ),
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()

    return rag


# Read all .txt files from the TEXT_FILES_DIR directory
texts = []
for filename in os.listdir(TEXT_FILES_DIR):
    if filename.endswith(".txt"):
        file_path = os.path.join(TEXT_FILES_DIR, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            texts.append(file.read())


# Batch insert texts into LightRAG with a retry mechanism
def insert_texts_with_retry(rag, texts, retries=3, delay=5):
    for _ in range(retries):
        try:
            rag.insert(texts)
            return
        except Exception as e:
            print(
                f"Error occurred during insertion: {e}. Retrying in {delay} seconds..."
            )
            time.sleep(delay)
    raise RuntimeError("Failed to insert texts after multiple retries.")


def main():
    # Initialize RAG instance
    rag = asyncio.run(initialize_rag())

    insert_texts_with_retry(rag, texts)

    # Perform different types of queries and handle potential errors
    try:
        print(
            rag.query(
                "What are the top themes in this story?", param=QueryParam(mode="naive")
            )
        )
    except Exception as e:
        print(f"Error performing naive search: {e}")

    try:
        print(
            rag.query(
                "What are the top themes in this story?", param=QueryParam(mode="local")
            )
        )
    except Exception as e:
        print(f"Error performing local search: {e}")

    try:
        print(
            rag.query(
                "What are the top themes in this story?",
                param=QueryParam(mode="global"),
            )
        )
    except Exception as e:
        print(f"Error performing global search: {e}")

    try:
        print(
            rag.query(
                "What are the top themes in this story?",
                param=QueryParam(mode="hybrid"),
            )
        )
    except Exception as e:
        print(f"Error performing hybrid search: {e}")

    # Function to clear VRAM resources
    def clear_vram():
        os.system("sudo nvidia-smi --gpu-reset")

    # Regularly clear VRAM to prevent overflow
    clear_vram_interval = 3600  # Clear once every hour
    start_time = time.time()

    while True:
        current_time = time.time()
        if current_time - start_time > clear_vram_interval:
            clear_vram()
            start_time = current_time
        time.sleep(60)  # Check the time every minute


if __name__ == "__main__":
    main()
@@ -9,7 +9,7 @@ import aiofiles
import shutil
import traceback
import pipmaster as pm
-from datetime import datetime
+from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any, Literal
from fastapi import APIRouter, BackgroundTasks, Depends, File, HTTPException, UploadFile
@@ -20,6 +20,32 @@ from lightrag.base import DocProcessingStatus, DocStatus
from lightrag.api.utils_api import get_combined_auth_dependency
from ..config import global_args


+# Function to format datetime to ISO format string with timezone information
+def format_datetime(dt: Any) -> Optional[str]:
+    """Format datetime to ISO format string with timezone information
+
+    Args:
+        dt: Datetime object, string, or None
+
+    Returns:
+        ISO format string with timezone information, or None if input is None
+    """
+    if dt is None:
+        return None
+    if isinstance(dt, str):
+        return dt
+
+    # Check if datetime object has timezone information
+    if isinstance(dt, datetime):
+        # If datetime object has no timezone info (naive datetime), add UTC timezone
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+
+    # Return ISO format string with timezone information
+    return dt.isoformat()
+
+
router = APIRouter(
    prefix="/documents",
    tags=["documents"],
@@ -207,14 +233,6 @@ Attributes:


class DocStatusResponse(BaseModel):
-    @staticmethod
-    def format_datetime(dt: Any) -> Optional[str]:
-        if dt is None:
-            return None
-        if isinstance(dt, str):
-            return dt
-        return dt.isoformat()
-
    id: str = Field(description="Document identifier")
    content_summary: str = Field(description="Summary of document content")
    content_length: int = Field(description="Length of document content in characters")
@@ -300,7 +318,7 @@ class PipelineStatusResponse(BaseModel):
        autoscanned: Whether auto-scan has started
        busy: Whether the pipeline is currently busy
        job_name: Current job name (e.g., indexing files/indexing texts)
-        job_start: Job start time as ISO format string (optional)
+        job_start: Job start time as ISO format string with timezone (optional)
        docs: Total number of documents to be indexed
        batchs: Number of batches for processing documents
        cur_batch: Current processing batch
@@ -322,6 +340,12 @@ class PipelineStatusResponse(BaseModel):
    history_messages: Optional[List[str]] = None
    update_status: Optional[dict] = None

+    @field_validator("job_start", mode="before")
+    @classmethod
+    def parse_job_start(cls, value):
+        """Process datetime and return as ISO format string with timezone"""
+        return format_datetime(value)
+
    class Config:
        extra = "allow"  # Allow additional fields from the pipeline status

@@ -1188,9 +1212,10 @@ def create_document_routes(
        if "history_messages" in status_dict:
            status_dict["history_messages"] = list(status_dict["history_messages"])

-        # Format the job_start time if it exists
-        if status_dict.get("job_start"):
-            status_dict["job_start"] = str(status_dict["job_start"])
+        # Ensure job_start is properly formatted as a string with timezone information
+        if "job_start" in status_dict and status_dict["job_start"]:
+            # Use format_datetime to ensure consistent formatting
+            status_dict["job_start"] = format_datetime(status_dict["job_start"])

        return PipelineStatusResponse(**status_dict)
    except Exception as e:
@@ -1240,12 +1265,8 @@ def create_document_routes(
                content_summary=doc_status.content_summary,
                content_length=doc_status.content_length,
                status=doc_status.status,
-                created_at=DocStatusResponse.format_datetime(
-                    doc_status.created_at
-                ),
-                updated_at=DocStatusResponse.format_datetime(
-                    doc_status.updated_at
-                ),
+                created_at=format_datetime(doc_status.created_at),
+                updated_at=format_datetime(doc_status.updated_at),
                chunks_count=doc_status.chunks_count,
                error=doc_status.error,
                metadata=doc_status.metadata,
@@ -114,11 +114,18 @@ class ChromaVectorDBStorage(BaseVectorStorage):
            return

        try:
+            import time
+
+            current_time = int(time.time())
+
            ids = list(data.keys())
            documents = [v["content"] for v in data.values()]
            metadatas = [
-                {k: v for k, v in item.items() if k in self.meta_fields}
-                or {"_default": "true"}
+                {
+                    **{k: v for k, v in item.items() if k in self.meta_fields},
+                    "created_at": current_time,
+                }
+                or {"_default": "true", "created_at": current_time}
                for item in data.values()
            ]

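The same pattern, stamping each record with `int(time.time())` at upsert and surfacing that value on reads, is repeated for the Faiss, Milvus, MongoDB, and NanoVectorDB backends in the hunks that follow. As a rough sketch (names are illustrative), a consumer that wants the same timezone-aware string the API returns can convert the stored Unix timestamp like this:

```python
import time
from datetime import datetime, timezone

created_at = int(time.time())  # what the storage backends now record at upsert time

# Render the Unix timestamp as an ISO-8601 string with an explicit UTC offset
created_at_iso = datetime.fromtimestamp(created_at, tz=timezone.utc).isoformat()
print(created_at_iso)  # e.g. 2025-01-01T12:00:00+00:00
```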
@@ -183,6 +190,7 @@ class ChromaVectorDBStorage(BaseVectorStorage):
                    "id": results["ids"][0][i],
                    "distance": 1 - results["distances"][0][i],
                    "content": results["documents"][0][i],
+                    "created_at": results["metadatas"][0][i].get("created_at"),
                    **results["metadatas"][0][i],
                }
                for i in range(len(results["ids"][0]))
@@ -235,42 +243,6 @@ class ChromaVectorDBStorage(BaseVectorStorage):
            logger.error(f"Error while deleting vectors from {self.namespace}: {e}")
            raise

-    async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
-        """Search for records with IDs starting with a specific prefix.
-
-        Args:
-            prefix: The prefix to search for in record IDs
-
-        Returns:
-            List of records with matching ID prefixes
-        """
-        try:
-            # Get all records from the collection
-            # Since ChromaDB doesn't directly support prefix search on IDs,
-            # we'll get all records and filter in Python
-            results = self._collection.get(
-                include=["metadatas", "documents", "embeddings"]
-            )
-
-            matching_records = []
-
-            # Filter records where ID starts with the prefix
-            for i, record_id in enumerate(results["ids"]):
-                if record_id.startswith(prefix):
-                    matching_records.append(
-                        {
-                            "id": record_id,
-                            "content": results["documents"][i],
-                            "vector": results["embeddings"][i],
-                            **results["metadatas"][i],
-                        }
-                    )
-
-            logger.debug(
-                f"Found {len(matching_records)} records with prefix '{prefix}'"
-            )
-            return matching_records
-
        except Exception as e:
            logger.error(f"Error during prefix search in ChromaDB: {str(e)}")
            raise
@@ -298,6 +270,7 @@ class ChromaVectorDBStorage(BaseVectorStorage):
                "id": result["ids"][0],
                "vector": result["embeddings"][0],
                "content": result["documents"][0],
+                "created_at": result["metadatas"][0].get("created_at"),
                **result["metadatas"][0],
            }
        except Exception as e:
@@ -331,6 +304,7 @@ class ChromaVectorDBStorage(BaseVectorStorage):
                    "id": result["ids"][i],
                    "vector": result["embeddings"][i],
                    "content": result["documents"][i],
+                    "created_at": result["metadatas"][i].get("created_at"),
                    **result["metadatas"][i],
                }
                for i in range(len(result["ids"]))
@@ -107,7 +107,7 @@ class FaissVectorDBStorage(BaseVectorStorage):
        if not data:
            return

-        current_time = time.time()
+        current_time = int(time.time())

        # Prepare data for embedding
        list_data = []
@@ -385,27 +385,6 @@ class FaissVectorDBStorage(BaseVectorStorage):

        return True  # Return success

-    async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
-        """Search for records with IDs starting with a specific prefix.
-
-        Args:
-            prefix: The prefix to search for in record IDs
-
-        Returns:
-            List of records with matching ID prefixes
-        """
-        matching_records = []
-
-        # Search for records with IDs starting with the prefix
-        for faiss_id, meta in self._id_to_meta.items():
-            if "__id__" in meta and meta["__id__"].startswith(prefix):
-                # Create a copy of all metadata and add "id" field
-                record = {**meta, "id": meta["__id__"]}
-                matching_records.append(record)
-
-        logger.debug(f"Found {len(matching_records)} records with prefix '{prefix}'")
-        return matching_records
-
    async def get_by_id(self, id: str) -> dict[str, Any] | None:
        """Get vector data by its ID

@@ -425,7 +404,11 @@ class FaissVectorDBStorage(BaseVectorStorage):
        if not metadata:
            return None

-        return {**metadata, "id": metadata.get("__id__")}
+        return {
+            **metadata,
+            "id": metadata.get("__id__"),
+            "created_at": metadata.get("__created_at__"),
+        }

    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
        """Get multiple vector data by their IDs
@@ -445,7 +428,13 @@ class FaissVectorDBStorage(BaseVectorStorage):
            if fid is not None:
                metadata = self._id_to_meta.get(fid, {})
                if metadata:
-                    results.append({**metadata, "id": metadata.get("__id__")})
+                    results.append(
+                        {
+                            **metadata,
+                            "id": metadata.get("__id__"),
+                            "created_at": metadata.get("__created_at__"),
+                        }
+                    )

        return results

@@ -79,9 +79,14 @@ class MilvusVectorDBStorage(BaseVectorStorage):
        if not data:
            return

+        import time
+
+        current_time = int(time.time())
+
        list_data: list[dict[str, Any]] = [
            {
                "id": k,
+                "created_at": current_time,
                **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
            }
            for k, v in data.items()
@@ -111,7 +116,7 @@ class MilvusVectorDBStorage(BaseVectorStorage):
            collection_name=self.namespace,
            data=embedding,
            limit=top_k,
-            output_fields=list(self.meta_fields),
+            output_fields=list(self.meta_fields) + ["created_at"],
            search_params={
                "metric_type": "COSINE",
                "params": {"radius": self.cosine_better_than_threshold},
@@ -119,7 +124,13 @@ class MilvusVectorDBStorage(BaseVectorStorage):
        )
        print(results)
        return [
-            {**dp["entity"], "id": dp["id"], "distance": dp["distance"]}
+            {
+                **dp["entity"],
+                "id": dp["id"],
+                "distance": dp["distance"],
+                # created_at is requested in output_fields, so it should be a top-level key in the result dict (dp)
+                "created_at": dp.get("created_at"),
+            }
            for dp in results[0]
        ]

@@ -211,31 +222,6 @@ class MilvusVectorDBStorage(BaseVectorStorage):
        except Exception as e:
            logger.error(f"Error while deleting vectors from {self.namespace}: {e}")

-    async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
-        """Search for records with IDs starting with a specific prefix.
-
-        Args:
-            prefix: The prefix to search for in record IDs
-
-        Returns:
-            List of records with matching ID prefixes
-        """
-        try:
-            # Use Milvus query with expression to find IDs with the given prefix
-            expression = f'id like "{prefix}%"'
-            results = self._client.query(
-                collection_name=self.namespace,
-                filter=expression,
-                output_fields=list(self.meta_fields) + ["id"],
-            )
-
-            logger.debug(f"Found {len(results)} records with prefix '{prefix}'")
-            return results
-
-        except Exception as e:
-            logger.error(f"Error searching for records with prefix '{prefix}': {e}")
-            return []
-
    async def get_by_id(self, id: str) -> dict[str, Any] | None:
        """Get vector data by its ID

@@ -250,12 +236,16 @@ class MilvusVectorDBStorage(BaseVectorStorage):
            result = self._client.query(
                collection_name=self.namespace,
                filter=f'id == "{id}"',
-                output_fields=list(self.meta_fields) + ["id"],
+                output_fields=list(self.meta_fields) + ["id", "created_at"],
            )

            if not result or len(result) == 0:
                return None

+            # Ensure the result contains created_at field
+            if "created_at" not in result[0]:
+                result[0]["created_at"] = None
+
            return result[0]
        except Exception as e:
            logger.error(f"Error retrieving vector data for ID {id}: {e}")
@@ -282,9 +272,14 @@ class MilvusVectorDBStorage(BaseVectorStorage):
            result = self._client.query(
                collection_name=self.namespace,
                filter=filter_expr,
-                output_fields=list(self.meta_fields) + ["id"],
+                output_fields=list(self.meta_fields) + ["id", "created_at"],
            )

+            # Ensure each result contains created_at field
+            for item in result:
+                if "created_at" not in item:
+                    item["created_at"] = None
+
            return result or []
        except Exception as e:
            logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
@@ -999,9 +999,15 @@ class MongoVectorDBStorage(BaseVectorStorage):
        if not data:
            return

+        # Add current time as Unix timestamp
+        import time
+
+        current_time = int(time.time())
+
        list_data = [
            {
                "_id": k,
+                "created_at": current_time,  # Add created_at field as Unix timestamp
                **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
            }
            for k, v in data.items()
@@ -1059,9 +1065,14 @@ class MongoVectorDBStorage(BaseVectorStorage):
            cursor = self._data.aggregate(pipeline)
            results = await cursor.to_list()

-            # Format and return the results
+            # Format and return the results with created_at field
            return [
-                {**doc, "id": doc["_id"], "distance": doc.get("score", None)}
+                {
+                    **doc,
+                    "id": doc["_id"],
+                    "distance": doc.get("score", None),
+                    "created_at": doc.get("created_at"),  # Include created_at field
+                }
                for doc in results
            ]

@@ -1138,28 +1149,6 @@ class MongoVectorDBStorage(BaseVectorStorage):
|
|||||||
except PyMongoError as e:
|
except PyMongoError as e:
|
||||||
logger.error(f"Error deleting relations for {entity_name}: {str(e)}")
|
logger.error(f"Error deleting relations for {entity_name}: {str(e)}")
|
||||||
|
|
||||||
async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
|
|
||||||
"""Search for records with IDs starting with a specific prefix.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
prefix: The prefix to search for in record IDs
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of records with matching ID prefixes
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
# Use MongoDB regex to find documents where _id starts with the prefix
|
|
||||||
cursor = self._data.find({"_id": {"$regex": f"^{prefix}"}})
|
|
||||||
matching_records = await cursor.to_list(length=None)
|
|
||||||
|
|
||||||
# Format results
|
|
||||||
results = [{**doc, "id": doc["_id"]} for doc in matching_records]
|
|
||||||
|
|
||||||
logger.debug(
|
|
||||||
f"Found {len(results)} records with prefix '{prefix}' in {self.namespace}"
|
|
||||||
)
|
|
||||||
return results
|
|
||||||
|
|
||||||
except PyMongoError as e:
|
except PyMongoError as e:
|
||||||
logger.error(f"Error searching by prefix in {self.namespace}: {str(e)}")
|
logger.error(f"Error searching by prefix in {self.namespace}: {str(e)}")
|
||||||
return []
|
return []
|
||||||
|
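Note: the Mongo upsert now stamps each stored document with `int(time.time())` while still filtering the payload down to the declared meta fields. A small standalone sketch of that dict construction (names and values are made up for illustration; this is not the storage class itself):

```python
import time

meta_fields = {"entity_name", "content"}
v = {"entity_name": "Alice", "content": "...", "__vector__": [0.1, 0.2]}

doc = {
    "_id": "ent-alice",
    "created_at": int(time.time()),  # seconds since epoch
    **{k: val for k, val in v.items() if k in meta_fields},
}
# __vector__ is filtered out; created_at is always present on new writes.
print(sorted(doc))  # ['_id', 'content', 'created_at', 'entity_name']
```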
@@ -89,7 +89,7 @@ class NanoVectorDBStorage(BaseVectorStorage):
         if not data:
             return
 
-        current_time = time.time()
+        current_time = int(time.time())
         list_data = [
             {
                 "__id__": k,
@@ -259,26 +259,6 @@ class NanoVectorDBStorage(BaseVectorStorage):
 
             return True  # Return success
 
-    async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
-        """Search for records with IDs starting with a specific prefix.
-
-        Args:
-            prefix: The prefix to search for in record IDs
-
-        Returns:
-            List of records with matching ID prefixes
-        """
-        storage = await self.client_storage
-        matching_records = []
-
-        # Search for records with IDs starting with the prefix
-        for record in storage["data"]:
-            if "__id__" in record and record["__id__"].startswith(prefix):
-                matching_records.append({**record, "id": record["__id__"]})
-
-        logger.debug(f"Found {len(matching_records)} records with prefix '{prefix}'")
-        return matching_records
-
     async def get_by_id(self, id: str) -> dict[str, Any] | None:
         """Get vector data by its ID
 
@@ -291,7 +271,12 @@ class NanoVectorDBStorage(BaseVectorStorage):
         client = await self._get_client()
         result = client.get([id])
         if result:
-            return result[0]
+            dp = result[0]
+            return {
+                **dp,
+                "id": dp.get("__id__"),
+                "created_at": dp.get("__created_at__"),
+            }
         return None
 
     async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
@@ -307,7 +292,15 @@ class NanoVectorDBStorage(BaseVectorStorage):
             return []
 
         client = await self._get_client()
-        return client.get(ids)
+        results = client.get(ids)
+        return [
+            {
+                **dp,
+                "id": dp.get("__id__"),
+                "created_at": dp.get("__created_at__"),
+            }
+            for dp in results
+        ]
 
     async def drop(self) -> dict[str, str]:
         """Drop all vector data from storage and clean up resources
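Note: NanoVectorDB stores its keys internally as `__id__` and `__created_at__`; the change above normalizes them to public `id` and `created_at` keys on read. Rough sketch of the resulting shape (illustrative values only):

```python
# What get_by_id() now hands back, modelled on the change above.
record = {
    "__id__": "chunk-123",
    "__created_at__": 1714000000,
    "content": "...",
}
normalized = {
    **record,
    "id": record.get("__id__"),
    "created_at": record.get("__created_at__"),
}
assert normalized["id"] == "chunk-123"
assert normalized["created_at"] == 1714000000
```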
@@ -1,7 +1,8 @@
 import asyncio
 import json
 import os
-import time
+import datetime
+from datetime import timezone
 from dataclasses import dataclass, field
 from typing import Any, Union, final
 import numpy as np
@@ -105,7 +106,61 @@ class PostgreSQLDB:
         ):
             pass
 
+    async def _migrate_timestamp_columns(self):
+        """Migrate timestamp columns in tables to timezone-aware types, assuming original data is in UTC time"""
+        # Tables and columns that need migration
+        tables_to_migrate = {
+            "LIGHTRAG_VDB_ENTITY": ["create_time", "update_time"],
+            "LIGHTRAG_VDB_RELATION": ["create_time", "update_time"],
+            "LIGHTRAG_DOC_CHUNKS": ["create_time", "update_time"],
+        }
+
+        for table_name, columns in tables_to_migrate.items():
+            for column_name in columns:
+                try:
+                    # Check if column exists
+                    check_column_sql = f"""
+                    SELECT column_name, data_type
+                    FROM information_schema.columns
+                    WHERE table_name = '{table_name.lower()}'
+                    AND column_name = '{column_name}'
+                    """
+
+                    column_info = await self.query(check_column_sql)
+                    if not column_info:
+                        logger.warning(
+                            f"Column {table_name}.{column_name} does not exist, skipping migration"
+                        )
+                        continue
+
+                    # Check column type
+                    data_type = column_info.get("data_type")
+                    if data_type == "timestamp with time zone":
+                        logger.info(
+                            f"Column {table_name}.{column_name} is already timezone-aware, no migration needed"
+                        )
+                        continue
+
+                    # Execute migration, explicitly specifying UTC timezone for interpreting original data
+                    logger.info(
+                        f"Migrating {table_name}.{column_name} to timezone-aware type"
+                    )
+                    migration_sql = f"""
+                    ALTER TABLE {table_name}
+                    ALTER COLUMN {column_name} TYPE TIMESTAMP(0) WITH TIME ZONE
+                    USING {column_name} AT TIME ZONE 'UTC'
+                    """
+
+                    await self.execute(migration_sql)
+                    logger.info(
+                        f"Successfully migrated {table_name}.{column_name} to timezone-aware type"
+                    )
+                except Exception as e:
+                    # Log error but don't interrupt the process
+                    logger.warning(f"Failed to migrate {table_name}.{column_name}: {e}")
+
     async def check_tables(self):
+        # First create all tables
         for k, v in TABLES.items():
             try:
                 await self.query(f"SELECT 1 FROM {k} LIMIT 1")
@@ -141,6 +196,13 @@ class PostgreSQLDB:
                     f"PostgreSQL, Failed to create index on table {k}, Got: {e}"
                 )
 
+        # After all tables are created, attempt to migrate timestamp fields
+        try:
+            await self._migrate_timestamp_columns()
+        except Exception as e:
+            logger.error(f"PostgreSQL, Failed to migrate timestamp columns: {e}")
+            # Don't throw an exception, allow the initialization process to continue
+
     async def query(
         self,
         sql: str,
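Note: the migration rewrites each naive timestamp column in place and tells PostgreSQL to interpret the existing values as UTC instead of shifting them by the server timezone. A standalone sketch of the statement issued for one column, assuming an asyncpg connection (the DSN and the `migrate_one` helper are hypothetical, not part of the codebase):

```python
import asyncio
import asyncpg  # assumption: asyncpg is the async PostgreSQL driver in use

async def migrate_one(dsn: str) -> None:
    conn = await asyncpg.connect(dsn)
    try:
        # Declare existing naive values to be UTC via AT TIME ZONE 'UTC'
        await conn.execute(
            """
            ALTER TABLE LIGHTRAG_DOC_CHUNKS
            ALTER COLUMN create_time TYPE TIMESTAMP(0) WITH TIME ZONE
            USING create_time AT TIME ZONE 'UTC'
            """
        )
    finally:
        await conn.close()

# asyncio.run(migrate_one("postgresql://user:pass@localhost/lightrag"))
```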
@@ -544,7 +606,9 @@ class PGVectorStorage(BaseVectorStorage):
             await ClientManager.release_client(self.db)
             self.db = None
 
-    def _upsert_chunks(self, item: dict[str, Any]) -> tuple[str, dict[str, Any]]:
+    def _upsert_chunks(
+        self, item: dict[str, Any], current_time: datetime.datetime
+    ) -> tuple[str, dict[str, Any]]:
         try:
             upsert_sql = SQL_TEMPLATES["upsert_chunk"]
             data: dict[str, Any] = {
@@ -556,6 +620,8 @@ class PGVectorStorage(BaseVectorStorage):
                 "content": item["content"],
                 "content_vector": json.dumps(item["__vector__"].tolist()),
                 "file_path": item["file_path"],
+                "create_time": current_time,
+                "update_time": current_time,
             }
         except Exception as e:
             logger.error(f"Error to prepare upsert,\nsql: {e}\nitem: {item}")
@@ -563,7 +629,9 @@ class PGVectorStorage(BaseVectorStorage):
 
         return upsert_sql, data
 
-    def _upsert_entities(self, item: dict[str, Any]) -> tuple[str, dict[str, Any]]:
+    def _upsert_entities(
+        self, item: dict[str, Any], current_time: datetime.datetime
+    ) -> tuple[str, dict[str, Any]]:
         upsert_sql = SQL_TEMPLATES["upsert_entity"]
         source_id = item["source_id"]
         if isinstance(source_id, str) and "<SEP>" in source_id:
@@ -579,10 +647,14 @@ class PGVectorStorage(BaseVectorStorage):
             "content_vector": json.dumps(item["__vector__"].tolist()),
             "chunk_ids": chunk_ids,
             "file_path": item.get("file_path", None),
+            "create_time": current_time,
+            "update_time": current_time,
         }
         return upsert_sql, data
 
-    def _upsert_relationships(self, item: dict[str, Any]) -> tuple[str, dict[str, Any]]:
+    def _upsert_relationships(
+        self, item: dict[str, Any], current_time: datetime.datetime
+    ) -> tuple[str, dict[str, Any]]:
         upsert_sql = SQL_TEMPLATES["upsert_relationship"]
         source_id = item["source_id"]
         if isinstance(source_id, str) and "<SEP>" in source_id:
@@ -599,6 +671,8 @@ class PGVectorStorage(BaseVectorStorage):
             "content_vector": json.dumps(item["__vector__"].tolist()),
             "chunk_ids": chunk_ids,
             "file_path": item.get("file_path", None),
+            "create_time": current_time,
+            "update_time": current_time,
         }
         return upsert_sql, data
 
@@ -607,11 +681,11 @@ class PGVectorStorage(BaseVectorStorage):
         if not data:
             return
 
-        current_time = time.time()
+        # Get current time with UTC timezone
+        current_time = datetime.datetime.now(timezone.utc)
         list_data = [
             {
                 "__id__": k,
-                "__created_at__": current_time,
                 **{k1: v1 for k1, v1 in v.items()},
             }
             for k, v in data.items()
@@ -630,11 +704,11 @@ class PGVectorStorage(BaseVectorStorage):
             d["__vector__"] = embeddings[i]
         for item in list_data:
             if is_namespace(self.namespace, NameSpace.VECTOR_STORE_CHUNKS):
-                upsert_sql, data = self._upsert_chunks(item)
+                upsert_sql, data = self._upsert_chunks(item, current_time)
             elif is_namespace(self.namespace, NameSpace.VECTOR_STORE_ENTITIES):
-                upsert_sql, data = self._upsert_entities(item)
+                upsert_sql, data = self._upsert_entities(item, current_time)
             elif is_namespace(self.namespace, NameSpace.VECTOR_STORE_RELATIONSHIPS):
-                upsert_sql, data = self._upsert_relationships(item)
+                upsert_sql, data = self._upsert_relationships(item, current_time)
             else:
                 raise ValueError(f"{self.namespace} is not supported")
 
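Note: a single timezone-aware timestamp is captured once per upsert batch and threaded into all three `_upsert_*` helpers, so `create_time` and `update_time` agree across every row written in that call. A minimal sketch of the pattern (the `stamp_rows` helper is illustrative only):

```python
import datetime
from datetime import timezone

def stamp_rows(rows: list[dict]) -> list[dict]:
    current_time = datetime.datetime.now(timezone.utc)  # aware, not naive
    return [
        {**row, "create_time": current_time, "update_time": current_time}
        for row in rows
    ]

stamped = stamp_rows([{"id": "a"}, {"id": "b"}])
assert stamped[0]["create_time"] == stamped[1]["create_time"]
assert stamped[0]["create_time"].tzinfo is timezone.utc
```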
@@ -726,41 +800,6 @@ class PGVectorStorage(BaseVectorStorage):
         except Exception as e:
             logger.error(f"Error deleting relations for entity {entity_name}: {e}")
 
-    async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
-        """Search for records with IDs starting with a specific prefix.
-
-        Args:
-            prefix: The prefix to search for in record IDs
-
-        Returns:
-            List of records with matching ID prefixes
-        """
-        table_name = namespace_to_table_name(self.namespace)
-        if not table_name:
-            logger.error(f"Unknown namespace for prefix search: {self.namespace}")
-            return []
-
-        search_sql = f"SELECT * FROM {table_name} WHERE workspace=$1 AND id LIKE $2"
-        params = {"workspace": self.db.workspace, "prefix": f"{prefix}%"}
-
-        try:
-            results = await self.db.query(search_sql, params, multirows=True)
-            logger.debug(f"Found {len(results)} records with prefix '{prefix}'")
-
-            # Format results to match the expected return format
-            formatted_results = []
-            for record in results:
-                formatted_record = dict(record)
-                # Ensure id field is available (for consistency with NanoVectorDB implementation)
-                if "id" not in formatted_record:
-                    formatted_record["id"] = record["id"]
-                formatted_results.append(formatted_record)
-
-            return formatted_results
-        except Exception as e:
-            logger.error(f"Error during prefix search for '{prefix}': {e}")
-            return []
-
     async def get_by_id(self, id: str) -> dict[str, Any] | None:
         """Get vector data by its ID
 
@@ -775,7 +814,7 @@ class PGVectorStorage(BaseVectorStorage):
             logger.error(f"Unknown namespace for ID lookup: {self.namespace}")
             return None
 
-        query = f"SELECT * FROM {table_name} WHERE workspace=$1 AND id=$2"
+        query = f"SELECT *, EXTRACT(EPOCH FROM create_time)::BIGINT as created_at FROM {table_name} WHERE workspace=$1 AND id=$2"
         params = {"workspace": self.db.workspace, "id": id}
 
         try:
@@ -805,7 +844,7 @@ class PGVectorStorage(BaseVectorStorage):
             return []
 
         ids_str = ",".join([f"'{id}'" for id in ids])
-        query = f"SELECT * FROM {table_name} WHERE workspace=$1 AND id IN ({ids_str})"
+        query = f"SELECT *, EXTRACT(EPOCH FROM create_time)::BIGINT as created_at FROM {table_name} WHERE workspace=$1 AND id IN ({ids_str})"
         params = {"workspace": self.db.workspace}
 
         try:
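Note: `EXTRACT(EPOCH FROM create_time)::BIGINT` returns the timezone-aware column as integer Unix seconds, so Postgres callers see the same `created_at` shape as the other backends. A quick cross-check of that conversion in Python (the example timestamp is arbitrary):

```python
from datetime import datetime, timezone

create_time = datetime(2024, 4, 24, 23, 6, 40, tzinfo=timezone.utc)
created_at = int(create_time.timestamp())  # what EXTRACT(EPOCH ...)::BIGINT yields
assert created_at == 1714000000
```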
@@ -992,8 +1031,28 @@ class PGDocStatusStorage(DocStatusStorage):
         if not data:
             return
 
-        sql = """insert into LIGHTRAG_DOC_STATUS(workspace,id,content,content_summary,content_length,chunks_count,status,file_path)
-                 values($1,$2,$3,$4,$5,$6,$7,$8)
+        def parse_datetime(dt_str):
+            if dt_str is None:
+                return None
+            if isinstance(dt_str, (datetime.date, datetime.datetime)):
+                # If it's a datetime object without timezone info, remove timezone info
+                if isinstance(dt_str, datetime.datetime):
+                    # Remove timezone info, return naive datetime object
+                    return dt_str.replace(tzinfo=None)
+                return dt_str
+            try:
+                # Process ISO format string with timezone
+                dt = datetime.datetime.fromisoformat(dt_str)
+                # Remove timezone info, return naive datetime object
+                return dt.replace(tzinfo=None)
+            except (ValueError, TypeError):
+                logger.warning(f"Unable to parse datetime string: {dt_str}")
+                return None
+
+        # Modified SQL to include created_at and updated_at in both INSERT and UPDATE operations
+        # Both fields are updated from the input data in both INSERT and UPDATE cases
+        sql = """insert into LIGHTRAG_DOC_STATUS(workspace,id,content,content_summary,content_length,chunks_count,status,file_path,created_at,updated_at)
+                 values($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)
                  on conflict(id,workspace) do update set
                  content = EXCLUDED.content,
                  content_summary = EXCLUDED.content_summary,
@@ -1001,8 +1060,13 @@ class PGDocStatusStorage(DocStatusStorage):
                  chunks_count = EXCLUDED.chunks_count,
                  status = EXCLUDED.status,
                  file_path = EXCLUDED.file_path,
-                 updated_at = CURRENT_TIMESTAMP"""
+                 created_at = EXCLUDED.created_at,
+                 updated_at = EXCLUDED.updated_at"""
         for k, v in data.items():
+            # Remove timezone information, store utc time in db
+            created_at = parse_datetime(v.get("created_at"))
+            updated_at = parse_datetime(v.get("updated_at"))
+
             # chunks_count is optional
             await self.db.execute(
                 sql,
@@ -1015,6 +1079,8 @@ class PGDocStatusStorage(DocStatusStorage):
                     "chunks_count": v["chunks_count"] if "chunks_count" in v else -1,
                     "status": v["status"],
                     "file_path": v["file_path"],
+                    "created_at": created_at,  # Use the converted datetime object
+                    "updated_at": updated_at,  # Use the converted datetime object
                 },
            )
 
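Note: `parse_datetime` accepts None, date/datetime objects, or ISO-8601 strings and always hands the driver a naive datetime whose wall-clock values are UTC (the offset is stripped before storage). A standalone restatement of the nested helper from the hunk above, for illustration only:

```python
import datetime

def parse_datetime(dt_str):
    if dt_str is None:
        return None
    if isinstance(dt_str, (datetime.date, datetime.datetime)):
        if isinstance(dt_str, datetime.datetime):
            return dt_str.replace(tzinfo=None)
        return dt_str
    try:
        dt = datetime.datetime.fromisoformat(dt_str)
        return dt.replace(tzinfo=None)
    except (ValueError, TypeError):
        return None

# An aware ISO string (as produced by datetime.now(timezone.utc).isoformat())
# comes back as a naive datetime holding the same UTC wall-clock values.
assert parse_datetime("2024-04-24T23:06:40+00:00") == datetime.datetime(2024, 4, 24, 23, 6, 40)
assert parse_datetime(None) is None
```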
@@ -2194,8 +2260,8 @@ TABLES = {
                     doc_name VARCHAR(1024),
                     content TEXT,
                     meta JSONB,
-                    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                    update_time TIMESTAMP,
+                    create_time TIMESTAMP(0)
+                    update_time TIMESTAMP(0)
                     CONSTRAINT LIGHTRAG_DOC_FULL_PK PRIMARY KEY (workspace, id)
                     )"""
     },
@@ -2209,8 +2275,8 @@ TABLES = {
                     content TEXT,
                     content_vector VECTOR,
                     file_path VARCHAR(256),
-                    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                    update_time TIMESTAMP,
+                    create_time TIMESTAMP(0) WITH TIME ZONE,
+                    update_time TIMESTAMP(0) WITH TIME ZONE,
                     CONSTRAINT LIGHTRAG_DOC_CHUNKS_PK PRIMARY KEY (workspace, id)
                     )"""
     },
@@ -2221,8 +2287,8 @@ TABLES = {
                     entity_name VARCHAR(255),
                     content TEXT,
                     content_vector VECTOR,
-                    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                    update_time TIMESTAMP,
+                    create_time TIMESTAMP(0) WITH TIME ZONE,
+                    update_time TIMESTAMP(0) WITH TIME ZONE,
                     chunk_ids VARCHAR(255)[] NULL,
                     file_path TEXT NULL,
                     CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id)
@@ -2236,8 +2302,8 @@ TABLES = {
                     target_id VARCHAR(256),
                     content TEXT,
                     content_vector VECTOR,
-                    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                    update_time TIMESTAMP,
+                    create_time TIMESTAMP(0) WITH TIME ZONE,
+                    update_time TIMESTAMP(0) WITH TIME ZONE,
                     chunk_ids VARCHAR(255)[] NULL,
                     file_path TEXT NULL,
                     CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id)
@@ -2265,8 +2331,8 @@ TABLES = {
                     chunks_count int4 NULL,
                     status varchar(64) NULL,
                     file_path TEXT NULL,
-                    created_at timestamp DEFAULT CURRENT_TIMESTAMP NULL,
-                    updated_at timestamp DEFAULT CURRENT_TIMESTAMP NULL,
+                    created_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NULL,
+                    updated_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NULL,
                     CONSTRAINT LIGHTRAG_DOC_STATUS_PK PRIMARY KEY (workspace, id)
                     )"""
     },
@@ -2313,8 +2379,9 @@ SQL_TEMPLATES = {
                       update_time = CURRENT_TIMESTAMP
                      """,
     "upsert_chunk": """INSERT INTO LIGHTRAG_DOC_CHUNKS (workspace, id, tokens,
-                      chunk_order_index, full_doc_id, content, content_vector, file_path)
-                      VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
+                      chunk_order_index, full_doc_id, content, content_vector, file_path,
+                      create_time, update_time)
+                      VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
                       ON CONFLICT (workspace,id) DO UPDATE
                       SET tokens=EXCLUDED.tokens,
                       chunk_order_index=EXCLUDED.chunk_order_index,
@@ -2322,23 +2389,23 @@ SQL_TEMPLATES = {
                       content = EXCLUDED.content,
                       content_vector=EXCLUDED.content_vector,
                       file_path=EXCLUDED.file_path,
-                      update_time = CURRENT_TIMESTAMP
+                      update_time = EXCLUDED.update_time
                      """,
     # SQL for VectorStorage
     "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
-                      content_vector, chunk_ids, file_path)
-                      VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7)
+                      content_vector, chunk_ids, file_path, create_time, update_time)
+                      VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7, $8, $9)
                       ON CONFLICT (workspace,id) DO UPDATE
                       SET entity_name=EXCLUDED.entity_name,
                       content=EXCLUDED.content,
                       content_vector=EXCLUDED.content_vector,
                       chunk_ids=EXCLUDED.chunk_ids,
                       file_path=EXCLUDED.file_path,
-                      update_time=CURRENT_TIMESTAMP
+                      update_time=EXCLUDED.update_time
                      """,
     "upsert_relationship": """INSERT INTO LIGHTRAG_VDB_RELATION (workspace, id, source_id,
-                      target_id, content, content_vector, chunk_ids, file_path)
-                      VALUES ($1, $2, $3, $4, $5, $6, $7::varchar[], $8)
+                      target_id, content, content_vector, chunk_ids, file_path, create_time, update_time)
+                      VALUES ($1, $2, $3, $4, $5, $6, $7::varchar[], $8, $9, $10)
                       ON CONFLICT (workspace,id) DO UPDATE
                       SET source_id=EXCLUDED.source_id,
                       target_id=EXCLUDED.target_id,
@@ -2346,7 +2413,7 @@ SQL_TEMPLATES = {
                       content_vector=EXCLUDED.content_vector,
                       chunk_ids=EXCLUDED.chunk_ids,
                       file_path=EXCLUDED.file_path,
-                      update_time = CURRENT_TIMESTAMP
+                      update_time = EXCLUDED.update_time
                      """,
     "relationships": """
     WITH relevant_chunks AS (
@@ -2354,9 +2421,9 @@ SQL_TEMPLATES = {
         FROM LIGHTRAG_DOC_CHUNKS
         WHERE $2::varchar[] IS NULL OR full_doc_id = ANY($2::varchar[])
     )
-    SELECT source_id as src_id, target_id as tgt_id
+    SELECT source_id as src_id, target_id as tgt_id, EXTRACT(EPOCH FROM create_time)::BIGINT as created_at
     FROM (
-        SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
+        SELECT r.id, r.source_id, r.target_id, r.create_time, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
         FROM LIGHTRAG_VDB_RELATION r
         JOIN relevant_chunks c ON c.chunk_id = ANY(r.chunk_ids)
         WHERE r.workspace=$1
@@ -2371,9 +2438,9 @@ SQL_TEMPLATES = {
         FROM LIGHTRAG_DOC_CHUNKS
         WHERE $2::varchar[] IS NULL OR full_doc_id = ANY($2::varchar[])
     )
-    SELECT entity_name FROM
+    SELECT entity_name, EXTRACT(EPOCH FROM create_time)::BIGINT as created_at FROM
         (
-            SELECT e.id, e.entity_name, 1 - (e.content_vector <=> '[{embedding_string}]'::vector) as distance
+            SELECT e.id, e.entity_name, e.create_time, 1 - (e.content_vector <=> '[{embedding_string}]'::vector) as distance
             FROM LIGHTRAG_VDB_ENTITY e
             JOIN relevant_chunks c ON c.chunk_id = ANY(e.chunk_ids)
             WHERE e.workspace=$1
@@ -2388,9 +2455,9 @@ SQL_TEMPLATES = {
         FROM LIGHTRAG_DOC_CHUNKS
         WHERE $2::varchar[] IS NULL OR full_doc_id = ANY($2::varchar[])
     )
-    SELECT id, content, file_path FROM
+    SELECT id, content, file_path, EXTRACT(EPOCH FROM create_time)::BIGINT as created_at FROM
         (
-            SELECT id, content, file_path, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
+            SELECT id, content, file_path, create_time, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
             FROM LIGHTRAG_DOC_CHUNKS
             WHERE workspace=$1
             AND id IN (SELECT chunk_id FROM relevant_chunks)
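Note: every retrieval template above now surfaces `created_at` as integer epoch seconds alongside each hit. A hypothetical result row from the updated "chunks" template and one way a caller might use it (values are illustrative):

```python
from datetime import datetime, timezone

row = {"id": "chunk-42", "content": "...", "file_path": "doc.txt", "created_at": 1714000000}

age_seconds = int(datetime.now(timezone.utc).timestamp()) - row["created_at"]
print(f"chunk {row['id']} was indexed {age_seconds} seconds ago")
```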
@@ -88,9 +88,15 @@ class QdrantVectorDBStorage(BaseVectorStorage):
         logger.info(f"Inserting {len(data)} to {self.namespace}")
         if not data:
             return
+
+        import time
+
+        current_time = int(time.time())
+
         list_data = [
             {
                 "id": k,
+                "created_at": current_time,
                 **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
             }
             for k, v in data.items()
@@ -137,7 +143,14 @@ class QdrantVectorDBStorage(BaseVectorStorage):
 
         logger.debug(f"query result: {results}")
 
-        return [{**dp.payload, "distance": dp.score} for dp in results]
+        return [
+            {
+                **dp.payload,
+                "distance": dp.score,
+                "created_at": dp.payload.get("created_at"),
+            }
+            for dp in results
+        ]
 
     async def index_done_callback(self) -> None:
         # Qdrant handles persistence automatically
@@ -236,46 +249,6 @@ class QdrantVectorDBStorage(BaseVectorStorage):
         except Exception as e:
             logger.error(f"Error deleting relations for {entity_name}: {e}")
 
-    async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
-        """Search for records with IDs starting with a specific prefix.
-
-        Args:
-            prefix: The prefix to search for in record IDs
-
-        Returns:
-            List of records with matching ID prefixes
-        """
-        try:
-            # Use scroll method to find records with IDs starting with the prefix
-            results = self._client.scroll(
-                collection_name=self.namespace,
-                scroll_filter=models.Filter(
-                    must=[
-                        models.FieldCondition(
-                            key="id", match=models.MatchText(text=prefix, prefix=True)
-                        )
-                    ]
-                ),
-                with_payload=True,
-                with_vectors=False,
-                limit=1000,  # Adjust as needed for your use case
-            )
-
-            # Extract matching points
-            matching_records = results[0]
-
-            # Format the results to match expected return format
-            formatted_results = [{**point.payload} for point in matching_records]
-
-            logger.debug(
-                f"Found {len(formatted_results)} records with prefix '{prefix}'"
-            )
-            return formatted_results
-
-        except Exception as e:
-            logger.error(f"Error searching for prefix '{prefix}': {e}")
-            return []
-
     async def get_by_id(self, id: str) -> dict[str, Any] | None:
         """Get vector data by its ID
 
@@ -299,7 +272,12 @@ class QdrantVectorDBStorage(BaseVectorStorage):
             if not result:
                 return None
 
-            return result[0].payload
+            # Ensure the result contains created_at field
+            payload = result[0].payload
+            if "created_at" not in payload:
+                payload["created_at"] = None
+
+            return payload
         except Exception as e:
             logger.error(f"Error retrieving vector data for ID {id}: {e}")
             return None
@@ -327,7 +305,15 @@ class QdrantVectorDBStorage(BaseVectorStorage):
                 with_payload=True,
             )
 
-            return [point.payload for point in results]
+            # Ensure each result contains created_at field
+            payloads = []
+            for point in results:
+                payload = point.payload
+                if "created_at" not in payload:
+                    payload["created_at"] = None
+                payloads.append(payload)
+
+            return payloads
         except Exception as e:
             logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
             return []
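Note: Qdrant stores `created_at` inside each point's payload at insert time, and the read paths above default it to None for points written before this change instead of raising. A toy stand-in for that normalization (plain dicts, not the qdrant_client API):

```python
def normalize_payload(payload: dict) -> dict:
    if "created_at" not in payload:
        payload["created_at"] = None
    return payload

old_point = {"content": "indexed before the upgrade"}
new_point = {"content": "indexed after", "created_at": 1714000000}

assert normalize_payload(old_point)["created_at"] is None
assert normalize_payload(new_point)["created_at"] == 1714000000
```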
@@ -2,7 +2,7 @@ import asyncio
 import os
 from dataclasses import dataclass, field
 from typing import Any, Union, final
+import time
 import numpy as np
 
 from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
@@ -44,7 +44,13 @@ class TiDB:
             logger.error(f"TiDB database error: {e}")
             raise
 
+    async def _migrate_timestamp_columns(self):
+        """Migrate timestamp columns in tables to timezone-aware types, assuming original data is in UTC"""
+        # Not implemented yet
+        pass
+
     async def check_tables(self):
+        # First create all tables
         for k, v in TABLES.items():
             try:
                 await self.query(f"SELECT 1 FROM {k}".format(k=k))
@@ -58,6 +64,13 @@ class TiDB:
                 logger.error(f"Failed to create table {k} in TiDB database")
                 logger.error(f"TiDB database error: {e}")
 
+        # After all tables are created, try to migrate timestamp fields
+        try:
+            await self._migrate_timestamp_columns()
+        except Exception as e:
+            logger.error(f"TiDB, Failed to migrate timestamp columns: {e}")
+            # Don't raise exceptions, allow initialization process to continue
+
     async def query(
         self, sql: str, params: dict = None, multirows: bool = False
     ) -> Union[dict, None]:
@@ -244,6 +257,9 @@ class TiDBKVStorage(BaseKVStorage):
         for i, d in enumerate(list_data):
             d["__vector__"] = embeddings[i]
 
+        # Get current time as UNIX timestamp
+        current_time = int(time.time())
+
         merge_sql = SQL_TEMPLATES["upsert_chunk"]
         data = []
         for item in list_data:
@@ -256,6 +272,7 @@ class TiDBKVStorage(BaseKVStorage):
                     "full_doc_id": item["full_doc_id"],
                     "content_vector": f"{item['__vector__'].tolist()}",
                     "workspace": self.db.workspace,
+                    "timestamp": current_time,
                 }
             )
         await self.db.execute(merge_sql, data)
@@ -325,7 +342,7 @@ class TiDBKVStorage(BaseKVStorage):
         if table_name != "LIGHTRAG_LLM_CACHE":
             return False
 
-        # 构建MySQL风格的IN查询
+        # Build MySQL style IN query
         modes_list = ", ".join([f"'{mode}'" for mode in modes])
         sql = f"""
         DELETE FROM {table_name}
@@ -406,7 +423,6 @@ class TiDBVectorDBStorage(BaseVectorStorage):
         results = await self.db.query(
             SQL_TEMPLATES[self.namespace], params=params, multirows=True
         )
-        print("vector search result:", results)
         if not results:
             return []
         return results
@@ -416,14 +432,18 @@ class TiDBVectorDBStorage(BaseVectorStorage):
         logger.info(f"Inserting {len(data)} to {self.namespace}")
         if not data:
             return
-        if is_namespace(self.namespace, NameSpace.VECTOR_STORE_CHUNKS):
-            return
 
         logger.info(f"Inserting {len(data)} vectors to {self.namespace}")
 
+        # Get current time as UNIX timestamp
+        import time
+
+        current_time = int(time.time())
+
         list_data = [
             {
                 "id": k,
+                "timestamp": current_time,
                 **{k1: v1 for k1, v1 in v.items()},
             }
             for k, v in data.items()
@@ -440,8 +460,20 @@ class TiDBVectorDBStorage(BaseVectorStorage):
         for i, d in enumerate(list_data):
             d["content_vector"] = embeddings[i]
 
-        if is_namespace(self.namespace, NameSpace.VECTOR_STORE_ENTITIES):
-            data = []
+        if is_namespace(self.namespace, NameSpace.VECTOR_STORE_CHUNKS):
+            for item in list_data:
+                param = {
+                    "id": item["id"],
+                    "content": item["content"],
+                    "tokens": item.get("tokens", 0),
+                    "chunk_order_index": item.get("chunk_order_index", 0),
+                    "full_doc_id": item.get("full_doc_id", ""),
+                    "content_vector": f"{item['content_vector'].tolist()}",
+                    "workspace": self.db.workspace,
+                    "timestamp": item["timestamp"],
+                }
+                await self.db.execute(SQL_TEMPLATES["upsert_chunk"], param)
+        elif is_namespace(self.namespace, NameSpace.VECTOR_STORE_ENTITIES):
             for item in list_data:
                 param = {
                     "id": item["id"],
@@ -449,20 +481,10 @@ class TiDBVectorDBStorage(BaseVectorStorage):
                     "content": item["content"],
                    "content_vector": f"{item['content_vector'].tolist()}",
                     "workspace": self.db.workspace,
+                    "timestamp": item["timestamp"],
                 }
-                # update entity_id if node inserted by graph_storage_instance before
-                has = await self.db.query(SQL_TEMPLATES["has_entity"], param)
-                if has["cnt"] != 0:
-                    await self.db.execute(SQL_TEMPLATES["update_entity"], param)
-                    continue
-
-                data.append(param)
-            if data:
-                merge_sql = SQL_TEMPLATES["insert_entity"]
-                await self.db.execute(merge_sql, data)
-
+                await self.db.execute(SQL_TEMPLATES["upsert_entity"], param)
         elif is_namespace(self.namespace, NameSpace.VECTOR_STORE_RELATIONSHIPS):
-            data = []
             for item in list_data:
                 param = {
                     "id": item["id"],
@@ -471,17 +493,9 @@ class TiDBVectorDBStorage(BaseVectorStorage):
                     "content": item["content"],
                     "content_vector": f"{item['content_vector'].tolist()}",
                     "workspace": self.db.workspace,
+                    "timestamp": item["timestamp"],
                 }
-                # update relation_id if node inserted by graph_storage_instance before
-                has = await self.db.query(SQL_TEMPLATES["has_relationship"], param)
-                if has["cnt"] != 0:
-                    await self.db.execute(SQL_TEMPLATES["update_relationship"], param)
-                    continue
-
-                data.append(param)
-            if data:
-                merge_sql = SQL_TEMPLATES["insert_relationship"]
-                await self.db.execute(merge_sql, data)
+                await self.db.execute(SQL_TEMPLATES["upsert_relationship"], param)
 
     async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]:
         SQL = SQL_TEMPLATES["get_by_status_" + self.namespace]
@@ -573,55 +587,6 @@ class TiDBVectorDBStorage(BaseVectorStorage):
         except Exception as e:
             return {"status": "error", "message": str(e)}
 
-    async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
-        """Search for records with IDs starting with a specific prefix.
-
-        Args:
-            prefix: The prefix to search for in record IDs
-
-        Returns:
-            List of records with matching ID prefixes
-        """
-        # Determine which table to query based on namespace
-        if self.namespace == NameSpace.VECTOR_STORE_ENTITIES:
-            sql_template = """
-                SELECT entity_id as id, name as entity_name, entity_type, description, content
-                FROM LIGHTRAG_GRAPH_NODES
-                WHERE entity_id LIKE :prefix_pattern AND workspace = :workspace
-            """
-        elif self.namespace == NameSpace.VECTOR_STORE_RELATIONSHIPS:
-            sql_template = """
-                SELECT relation_id as id, source_name as src_id, target_name as tgt_id,
-                       keywords, description, content
-                FROM LIGHTRAG_GRAPH_EDGES
-                WHERE relation_id LIKE :prefix_pattern AND workspace = :workspace
-            """
-        elif self.namespace == NameSpace.VECTOR_STORE_CHUNKS:
-            sql_template = """
-                SELECT chunk_id as id, content, tokens, chunk_order_index, full_doc_id
-                FROM LIGHTRAG_DOC_CHUNKS
-                WHERE chunk_id LIKE :prefix_pattern AND workspace = :workspace
-            """
-        else:
-            logger.warning(
-                f"Namespace {self.namespace} not supported for prefix search"
-            )
-            return []
-
-        # Add prefix pattern parameter with % for SQL LIKE
-        prefix_pattern = f"{prefix}%"
-        params = {"prefix_pattern": prefix_pattern, "workspace": self.db.workspace}
-
-        try:
-            results = await self.db.query(sql_template, params=params, multirows=True)
-            logger.debug(
-                f"Found {len(results) if results else 0} records with prefix '{prefix}'"
-            )
-            return results if results else []
-        except Exception as e:
-            logger.error(f"Error searching records with prefix '{prefix}': {e}")
-            return []
-
     async def get_by_id(self, id: str) -> dict[str, Any] | None:
         """Get vector data by its ID
 
@@ -635,7 +600,8 @@ class TiDBVectorDBStorage(BaseVectorStorage):
         # Determine which table to query based on namespace
         if self.namespace == NameSpace.VECTOR_STORE_ENTITIES:
             sql_template = """
-                SELECT entity_id as id, name as entity_name, entity_type, description, content
+                SELECT entity_id as id, name as entity_name, entity_type, description, content,
+                       UNIX_TIMESTAMP(createtime) as created_at
                 FROM LIGHTRAG_GRAPH_NODES
                 WHERE entity_id = :entity_id AND workspace = :workspace
             """
@@ -643,14 +609,15 @@ class TiDBVectorDBStorage(BaseVectorStorage):
         elif self.namespace == NameSpace.VECTOR_STORE_RELATIONSHIPS:
             sql_template = """
                 SELECT relation_id as id, source_name as src_id, target_name as tgt_id,
-                       keywords, description, content
+                       keywords, description, content, UNIX_TIMESTAMP(createtime) as created_at
                 FROM LIGHTRAG_GRAPH_EDGES
                 WHERE relation_id = :relation_id AND workspace = :workspace
             """
             params = {"relation_id": id, "workspace": self.db.workspace}
         elif self.namespace == NameSpace.VECTOR_STORE_CHUNKS:
             sql_template = """
-                SELECT chunk_id as id, content, tokens, chunk_order_index, full_doc_id
+                SELECT chunk_id as id, content, tokens, chunk_order_index, full_doc_id,
+                       UNIX_TIMESTAMP(createtime) as created_at
                 FROM LIGHTRAG_DOC_CHUNKS
                 WHERE chunk_id = :chunk_id AND workspace = :workspace
             """
@@ -686,20 +653,22 @@ class TiDBVectorDBStorage(BaseVectorStorage):
         # Determine which table to query based on namespace
         if self.namespace == NameSpace.VECTOR_STORE_ENTITIES:
             sql_template = f"""
-                SELECT entity_id as id, name as entity_name, entity_type, description, content
+                SELECT entity_id as id, name as entity_name, entity_type, description, content,
+                       UNIX_TIMESTAMP(createtime) as created_at
                 FROM LIGHTRAG_GRAPH_NODES
                 WHERE entity_id IN ({ids_str}) AND workspace = :workspace
             """
         elif self.namespace == NameSpace.VECTOR_STORE_RELATIONSHIPS:
             sql_template = f"""
                 SELECT relation_id as id, source_name as src_id, target_name as tgt_id,
-                       keywords, description, content
+                       keywords, description, content, UNIX_TIMESTAMP(createtime) as created_at
                 FROM LIGHTRAG_GRAPH_EDGES
                 WHERE relation_id IN ({ids_str}) AND workspace = :workspace
             """
         elif self.namespace == NameSpace.VECTOR_STORE_CHUNKS:
             sql_template = f"""
-                SELECT chunk_id as id, content, tokens, chunk_order_index, full_doc_id
+                SELECT chunk_id as id, content, tokens, chunk_order_index, full_doc_id,
+                       UNIX_TIMESTAMP(createtime) as created_at
                 FROM LIGHTRAG_DOC_CHUNKS
                 WHERE chunk_id IN ({ids_str}) AND workspace = :workspace
             """
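Note: on the TiDB side, `UNIX_TIMESTAMP(createtime)` converts the stored timestamp back to epoch seconds on read, mirroring the Postgres `EXTRACT(EPOCH ...)` path, while the write side binds `FROM_UNIXTIME(:timestamp)`. A pure-Python model of that round trip (TiDB evaluates these functions server-side; this is only a sanity check of the idea):

```python
import time
from datetime import datetime, timezone

timestamp = int(time.time())                                      # bound as :timestamp
createtime = datetime.fromtimestamp(timestamp, tz=timezone.utc)   # ~ FROM_UNIXTIME
created_at = int(createtime.timestamp())                          # ~ UNIX_TIMESTAMP(createtime)

assert created_at == timestamp
```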
@@ -1086,8 +1055,8 @@ TABLES = {
            `tokens` INT,
            `content` LONGTEXT,
            `content_vector` VECTOR,
-           `createtime` DATETIME DEFAULT CURRENT_TIMESTAMP,
-           `updatetime` DATETIME DEFAULT NULL,
+           `createtime` TIMESTAMP,
+           `updatetime` TIMESTAMP,
            UNIQUE KEY (`chunk_id`)
        );
        """
@@ -1104,8 +1073,8 @@ TABLES = {
            `source_chunk_id` VARCHAR(256),
            `content` LONGTEXT,
            `content_vector` VECTOR,
-           `createtime` DATETIME DEFAULT CURRENT_TIMESTAMP,
-           `updatetime` DATETIME DEFAULT NULL,
+           `createtime` TIMESTAMP,
+           `updatetime` TIMESTAMP,
            KEY (`entity_id`)
        );
        """
@@ -1124,8 +1093,8 @@ TABLES = {
            `source_chunk_id` varchar(256),
            `content` LONGTEXT,
            `content_vector` VECTOR,
-           `createtime` DATETIME DEFAULT CURRENT_TIMESTAMP,
-           `updatetime` DATETIME DEFAULT NULL,
+           `createtime` TIMESTAMP,
+           `updatetime` TIMESTAMP,
            KEY (`relation_id`)
        );
        """
@@ -1159,25 +1128,25 @@ SQL_TEMPLATES = {
        ON DUPLICATE KEY UPDATE content = VALUES(content), workspace = VALUES(workspace), updatetime = CURRENT_TIMESTAMP
        """,
    "upsert_chunk": """
-       INSERT INTO LIGHTRAG_DOC_CHUNKS(chunk_id, content, tokens, chunk_order_index, full_doc_id, content_vector, workspace)
-       VALUES (:id, :content, :tokens, :chunk_order_index, :full_doc_id, :content_vector, :workspace)
+       INSERT INTO LIGHTRAG_DOC_CHUNKS(chunk_id, content, tokens, chunk_order_index, full_doc_id, content_vector, workspace, createtime, updatetime)
+       VALUES (:id, :content, :tokens, :chunk_order_index, :full_doc_id, :content_vector, :workspace, FROM_UNIXTIME(:timestamp), FROM_UNIXTIME(:timestamp))
        ON DUPLICATE KEY UPDATE
        content = VALUES(content), tokens = VALUES(tokens), chunk_order_index = VALUES(chunk_order_index),
-       full_doc_id = VALUES(full_doc_id), content_vector = VALUES(content_vector), workspace = VALUES(workspace), updatetime = CURRENT_TIMESTAMP
+       full_doc_id = VALUES(full_doc_id), content_vector = VALUES(content_vector), workspace = VALUES(workspace), updatetime = FROM_UNIXTIME(:timestamp)
        """,
    # SQL for VectorStorage
-   "entities": """SELECT n.name as entity_name FROM
-       (SELECT entity_id as id, name, VEC_COSINE_DISTANCE(content_vector,:embedding_string) as distance
+   "entities": """SELECT n.name as entity_name, UNIX_TIMESTAMP(n.createtime) as created_at FROM
+       (SELECT entity_id as id, name, createtime, VEC_COSINE_DISTANCE(content_vector,:embedding_string) as distance
        FROM LIGHTRAG_GRAPH_NODES WHERE workspace = :workspace) n
        WHERE n.distance>:better_than_threshold ORDER BY n.distance DESC LIMIT :top_k
        """,
-   "relationships": """SELECT e.source_name as src_id, e.target_name as tgt_id FROM
-       (SELECT source_name, target_name, VEC_COSINE_DISTANCE(content_vector, :embedding_string) as distance
+   "relationships": """SELECT e.source_name as src_id, e.target_name as tgt_id, UNIX_TIMESTAMP(e.createtime) as created_at FROM
+       (SELECT source_name, target_name, createtime, VEC_COSINE_DISTANCE(content_vector, :embedding_string) as distance
        FROM LIGHTRAG_GRAPH_EDGES WHERE workspace = :workspace) e
        WHERE e.distance>:better_than_threshold ORDER BY e.distance DESC LIMIT :top_k
        """,
-   "chunks": """SELECT c.id FROM
-       (SELECT chunk_id as id,VEC_COSINE_DISTANCE(content_vector, :embedding_string) as distance
+   "chunks": """SELECT c.id, UNIX_TIMESTAMP(c.createtime) as created_at FROM
+       (SELECT chunk_id as id, createtime, VEC_COSINE_DISTANCE(content_vector, :embedding_string) as distance
        FROM LIGHTRAG_DOC_CHUNKS WHERE workspace = :workspace) c
        WHERE c.distance>:better_than_threshold ORDER BY c.distance DESC LIMIT :top_k
        """,
@@ -1187,23 +1156,21 @@ SQL_TEMPLATES = {
    "has_relationship": """
        SELECT COUNT(id) AS cnt FROM LIGHTRAG_GRAPH_EDGES WHERE source_name = :source_name AND target_name = :target_name AND workspace = :workspace
        """,
-   "update_entity": """
-       UPDATE LIGHTRAG_GRAPH_NODES SET
-       entity_id = :id, content = :content, content_vector = :content_vector, updatetime = CURRENT_TIMESTAMP
-       WHERE workspace = :workspace AND name = :name
+   "upsert_entity": """
+       INSERT INTO LIGHTRAG_GRAPH_NODES(entity_id, name, content, content_vector, workspace, createtime, updatetime)
+       VALUES(:id, :name, :content, :content_vector, :workspace, FROM_UNIXTIME(:timestamp), FROM_UNIXTIME(:timestamp))
+       ON DUPLICATE KEY UPDATE
+       content = VALUES(content),
+       content_vector = VALUES(content_vector),
+       updatetime = FROM_UNIXTIME(:timestamp)
        """,
-   "update_relationship": """
-       UPDATE LIGHTRAG_GRAPH_EDGES SET
-       relation_id = :id, content = :content, content_vector = :content_vector, updatetime = CURRENT_TIMESTAMP
-       WHERE workspace = :workspace AND source_name = :source_name AND target_name = :target_name
-       """,
-   "insert_entity": """
-       INSERT INTO LIGHTRAG_GRAPH_NODES(entity_id, name, content, content_vector, workspace)
-       VALUES(:id, :name, :content, :content_vector, :workspace)
-       """,
-   "insert_relationship": """
-       INSERT INTO LIGHTRAG_GRAPH_EDGES(relation_id, source_name, target_name, content, content_vector, workspace)
-       VALUES(:id, :source_name, :target_name, :content, :content_vector, :workspace)
+   "upsert_relationship": """
+       INSERT INTO LIGHTRAG_GRAPH_EDGES(relation_id, source_name, target_name, content, content_vector, workspace, createtime, updatetime)
+       VALUES(:id, :source_name, :target_name, :content, :content_vector, :workspace, FROM_UNIXTIME(:timestamp), FROM_UNIXTIME(:timestamp))
+       ON DUPLICATE KEY UPDATE
+       content = VALUES(content),
+       content_vector = VALUES(content_vector),
+       updatetime = FROM_UNIXTIME(:timestamp)
        """,
    # SQL for GraphStorage
    "get_node": """
@@ -1275,22 +1242,6 @@ SQL_TEMPLATES = {
        WHERE (source_name = :source AND target_name = :target)
        AND workspace = :workspace
        """,
-   # Search by prefix SQL templates
-   "search_entity_by_prefix": """
-       SELECT entity_id as id, name as entity_name, entity_type, description, content
-       FROM LIGHTRAG_GRAPH_NODES
-       WHERE entity_id LIKE :prefix_pattern AND workspace = :workspace
-       """,
-   "search_relationship_by_prefix": """
-       SELECT relation_id as id, source_name as src_id, target_name as tgt_id, keywords, description, content
-       FROM LIGHTRAG_GRAPH_EDGES
-       WHERE relation_id LIKE :prefix_pattern AND workspace = :workspace
-       """,
-   "search_chunk_by_prefix": """
-       SELECT chunk_id as id, content, tokens, chunk_order_index, full_doc_id
-       FROM LIGHTRAG_DOC_CHUNKS
-       WHERE chunk_id LIKE :prefix_pattern AND workspace = :workspace
-       """,
    # Drop tables
    "drop_specifiy_table_workspace": "DELETE FROM {table_name} WHERE workspace = :workspace",
}
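Note: replacing the `has_entity` / `insert_entity` / `update_entity` sequence with a single `INSERT ... ON DUPLICATE KEY UPDATE` statement turns each write into one round trip, and the new templates consume the same `:timestamp` parameter twice through `FROM_UNIXTIME`. A sketch of the parameter dict the new "upsert_entity" template expects (values are illustrative):

```python
import time

param = {
    "id": "ent-alice",
    "name": "Alice",
    "content": "Alice is ...",
    "content_vector": "[0.1, 0.2, 0.3]",
    "workspace": "default",
    "timestamp": int(time.time()),  # used for both createtime and updatetime
}
assert set(param) == {"id", "name", "content", "content_vector", "workspace", "timestamp"}
```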
@@ -6,7 +6,7 @@ import configparser
|
|||||||
import os
|
import os
|
||||||
import warnings
|
import warnings
|
||||||
from dataclasses import asdict, dataclass, field
|
from dataclasses import asdict, dataclass, field
|
||||||
from datetime import datetime
|
from datetime import datetime, timezone
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from typing import (
|
from typing import (
|
||||||
Any,
|
Any,
|
||||||
@@ -756,8 +756,8 @@ class LightRAG:
                     "content": content_data["content"],
                     "content_summary": get_content_summary(content_data["content"]),
                     "content_length": len(content_data["content"]),
-                    "created_at": datetime.now().isoformat(),
-                    "updated_at": datetime.now().isoformat(),
+                    "created_at": datetime.now(timezone.utc).isoformat(),
+                    "updated_at": datetime.now(timezone.utc).isoformat(),
                     "file_path": content_data[
                         "file_path"
                     ],  # Store file path in document status
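Note: switching from the naive datetime.now() to datetime.now(timezone.utc) makes the stored created_at and updated_at strings carry an explicit UTC offset, so they parse back to an unambiguous instant. A quick illustration (not part of the PR) under the assumption of a UTC+8 server clock:

# Sketch only: naive vs timezone-aware ISO strings.
from datetime import datetime, timezone

naive = datetime.now().isoformat()              # e.g. '2024-05-01T10:15:30.123456'  (no offset, ambiguous)
aware = datetime.now(timezone.utc).isoformat()  # e.g. '2024-05-01T02:15:30.123456+00:00'

parsed = datetime.fromisoformat(aware)
assert parsed.tzinfo is not None                # safe to convert to any local zone
print(parsed.astimezone())                      # rendered in the reader's local time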
@@ -840,7 +840,7 @@ class LightRAG:
                 {
                     "busy": True,
                     "job_name": "Default Job",
-                    "job_start": datetime.now().isoformat(),
+                    "job_start": datetime.now(timezone.utc).isoformat(),
                     "docs": 0,
                     "batchs": 0,  # Total number of files to be processed
                     "cur_batch": 0,  # Number of files already processed
@@ -958,7 +958,9 @@ class LightRAG:
                     "content_summary": status_doc.content_summary,
                     "content_length": status_doc.content_length,
                     "created_at": status_doc.created_at,
-                    "updated_at": datetime.now().isoformat(),
+                    "updated_at": datetime.now(
+                        timezone.utc
+                    ).isoformat(),
                     "file_path": file_path,
                 }
             }
@@ -1018,7 +1020,9 @@ class LightRAG:
                     "content_summary": status_doc.content_summary,
                     "content_length": status_doc.content_length,
                     "created_at": status_doc.created_at,
-                    "updated_at": datetime.now().isoformat(),
+                    "updated_at": datetime.now(
+                        timezone.utc
+                    ).isoformat(),
                     "file_path": file_path,
                 }
             }
@@ -1053,7 +1057,9 @@ class LightRAG:
                     "content_summary": status_doc.content_summary,
                     "content_length": status_doc.content_length,
                     "created_at": status_doc.created_at,
-                    "updated_at": datetime.now().isoformat(),
+                    "updated_at": datetime.now(
+                        timezone.utc
+                    ).isoformat(),
                     "file_path": file_path,
                 }
             }
@@ -311,6 +311,7 @@ async def _merge_nodes_then_upsert(
         description=description,
         source_id=source_id,
         file_path=file_path,
+        created_at=int(time.time()),
     )
     await knowledge_graph_inst.upsert_node(
         entity_name,
@@ -422,6 +423,7 @@ async def _merge_edges_then_upsert(
                     "description": description,
                     "entity_type": "UNKNOWN",
                     "file_path": file_path,
+                    "created_at": int(time.time()),
                 },
             )

@@ -465,6 +467,7 @@ async def _merge_edges_then_upsert(
             keywords=keywords,
             source_id=source_id,
             file_path=file_path,
+            created_at=int(time.time()),
         ),
     )

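Note: merged graph nodes and edges now record created_at as integer Unix seconds via int(time.time()), which is timezone-independent by construction. A small sketch with a hypothetical payload (field values are illustrative, not from the PR):

# Sketch only: storing created_at as epoch seconds and reading it back as UTC.
import time
from datetime import datetime, timezone

node_data = {
    "description": "example entity",   # hypothetical values
    "source_id": "chunk-1",
    "file_path": "docs/example.txt",
    "created_at": int(time.time()),    # e.g. 1714525530
}

# The same instant can later be rendered in UTC or any local zone without ambiguity.
utc_str = datetime.fromtimestamp(node_data["created_at"], tz=timezone.utc).isoformat()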
@@ -1455,7 +1458,12 @@ async def _get_node_data(
         logger.warning("Some nodes are missing, maybe the storage is damaged")

     node_datas = [
-        {**n, "entity_name": k["entity_name"], "rank": d}
+        {
+            **n,
+            "entity_name": k["entity_name"],
+            "rank": d,
+            "created_at": k.get("created_at"),
+        }
         for k, n, d in zip(results, node_datas, node_degrees)
         if n is not None
     ]  # what is this text_chunks_db doing. dont remember it in airvx. check the diagram.
@@ -1774,7 +1782,7 @@ async def _get_edge_data(
             "src_id": k["src_id"],
             "tgt_id": k["tgt_id"],
             "rank": edge_degrees_dict.get(pair, k.get("rank", 0)),
-            "created_at": k.get("__created_at__", None),
+            "created_at": k.get("created_at", None),
             **edge_props,
         }
         edge_datas.append(combined)
@@ -1820,7 +1828,7 @@ async def _get_edge_data(
         ]
     ]
     for i, e in enumerate(edge_datas):
-        created_at = e.get("created_at", "Unknown")
+        created_at = e.get("created_at", "UNKNOWN")
         # Convert timestamp to readable format
         if isinstance(created_at, (int, float)):
             created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
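Note: when created_at is numeric it is treated as epoch seconds and rendered with time.strftime over time.localtime, i.e. in the server's local zone; anything missing falls back to the string "UNKNOWN". A standalone sketch of that rendering step (assumed helper, not the PR's own function):

# Sketch only: formatting epoch-seconds created_at values for prompt context.
import time

def render_created_at(record: dict) -> str:
    created_at = record.get("created_at", "UNKNOWN")
    if isinstance(created_at, (int, float)):
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
    return str(created_at)

render_created_at({"created_at": 1714525530})  # -> e.g. '2024-05-01 10:25:30' (server local time)
render_created_at({})                          # -> 'UNKNOWN'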
@@ -1847,7 +1855,7 @@ async def _get_edge_data(
         ["id", "entity", "type", "description", "rank", "created_at", "file_path"]
     ]
     for i, n in enumerate(use_entities):
-        created_at = n.get("created_at", "Unknown")
+        created_at = n.get("created_at", "UNKNOWN")
         # Convert timestamp to readable format
         if isinstance(created_at, (int, float)):
             created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
@@ -158,7 +158,16 @@ export default function PipelineStatusDialog({
           <div className="rounded-md border p-3 space-y-2">
             <div>{t('documentPanel.pipelineStatus.jobName')}: {status?.job_name || '-'}</div>
             <div className="flex justify-between">
-              <span>{t('documentPanel.pipelineStatus.startTime')}: {status?.job_start ? new Date(status.job_start).toLocaleString() : '-'}</span>
+              <span>{t('documentPanel.pipelineStatus.startTime')}: {status?.job_start
+                ? new Date(status.job_start).toLocaleString(undefined, {
+                    year: 'numeric',
+                    month: 'numeric',
+                    day: 'numeric',
+                    hour: 'numeric',
+                    minute: 'numeric',
+                    second: 'numeric'
+                  })
+                : '-'}</span>
               <span>{t('documentPanel.pipelineStatus.progress')}: {status ? `${status.cur_batch}/${status.batchs} ${t('documentPanel.pipelineStatus.unit')}` : '-'}</span>
             </div>
           </div>