From 81655829bc2c89f9dbebac62b0452bff36195127 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 30 Jan 2025 22:21:52 +0800 Subject: [PATCH 01/15] Add logging for chunk truncation in mix_kg_vector_query --- lightrag/operate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 10c76bcc..6756a40c 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -891,7 +891,8 @@ async def mix_kg_vector_query( if c["created_at"]: chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\n{chunk_text}" formatted_chunks.append(chunk_text) - + + logger.info(f"Truncate {len(chunks)} to {len(formatted_chunks)} chunks") return "\n--New Chunk--\n".join(formatted_chunks) except Exception as e: logger.error(f"Error in get_vector_context: {e}") From 601f10b132ad4f5dc082c49c4af59d5be33f71bf Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 30 Jan 2025 22:26:28 +0800 Subject: [PATCH 02/15] Fix linting --- lightrag/operate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index fbcb6c7c..6a1763c7 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -891,7 +891,7 @@ async def mix_kg_vector_query( if c["created_at"]: chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\n{chunk_text}" formatted_chunks.append(chunk_text) - + logger.info(f"Truncate {len(chunks)} to {len(formatted_chunks)} chunks") return "\n--New Chunk--\n".join(formatted_chunks) except Exception as e: From fb63e5ed39626754befed88b9029fc93a13ff547 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 30 Jan 2025 22:38:32 +0800 Subject: [PATCH 03/15] Improve prompts to avoid make-up respond from LLM like qwen-plus when very long context is provided. --- lightrag/prompt.py | 74 ++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 2839e740..8e52b9c3 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -151,18 +151,18 @@ PROMPTS[ ] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. """ -PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question." +PROMPTS["fail_response"] = ( + "Sorry, I'm not able to provide an answer to that question.[no-context]" +) PROMPTS["rag_response"] = """---Role--- -You are a helpful assistant responding to questions about data in the tables provided. +You are a helpful assistant responding to user query about Knowledge Base provided below. ---Goal--- -Generate a response of the target length and format that responds to the user's question, considering both the conversation history and the current query. Summarize all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. -If you don't know the answer, just say so. Do not make anything up. -Do not include information where the supporting evidence for it is not provided. +Generate a concise response based on Knowledge Base and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Knowledge Base, and incorporating general knowledge relevant to the Knowledge Base. Do not include information not provided by Knowledge Base. When handling relationships with timestamps: 1. 
Each relationship has a "created_at" timestamp indicating when we acquired this knowledge @@ -173,15 +173,17 @@ When handling relationships with timestamps: ---Conversation History--- {history} ----Target response length and format--- - -{response_type} - ----Data tables--- - +---Knowledge Base--- {context_data} -Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. Ensure the response maintains continuity with the conversation history.""" +---Response Rules--- + +- Target format and length: {response_type} +- Use markdown formatting with appropriate section headings +- Please respond in the same language as the user's question. +- Ensure the response maintains continuity with the conversation history. +- If you don't know the answer, just say so. +- Do not make anything up. Do not include information not provided by the Knowledge Base.""" PROMPTS["keywords_extraction"] = """---Role--- @@ -253,13 +255,11 @@ Output: PROMPTS["naive_rag_response"] = """---Role--- -You are a helpful assistant responding to questions about documents provided. +You are a helpful assistant responding to user query about Document Chunks provided below. ---Goal--- -Generate a response of the target length and format that responds to the user's question, considering both the conversation history and the current query. Summarize all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. -If you don't know the answer, just say so. Do not make anything up. -Do not include information where the supporting evidence for it is not provided. +GGenerate a concise response based on Document Chunks and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Document Chunks, and incorporating general knowledge relevant to the Document Chunks. Do not include information not provided by Document Chunks. When handling content with timestamps: 1. Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge @@ -270,15 +270,18 @@ When handling content with timestamps: ---Conversation History--- {history} ----Target response length and format--- - -{response_type} - ----Documents--- - +---Document Chunks--- {content_data} -Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. Ensure the response maintains continuity with the conversation history.""" +---Response Rules--- + +- Target format and length: {response_type} +- Use markdown formatting with appropriate section headings +- Please respond in the same language as the user's question. +- Ensure the response maintains continuity with the conversation history. +- If you don't know the answer, just say so. +- Do not include information not provided by the Document Chunks.""" + PROMPTS[ "similarity_check" @@ -306,11 +309,12 @@ Return only a number between 0-1, without any additional content. PROMPTS["mix_rag_response"] = """---Role--- -You are a professional assistant responsible for answering questions based on knowledge graph and textual information. Please respond in the same language as the user's question. +You are a helpful assistant responding to user query about Data Sources provided below. + ---Goal--- -Generate a concise response that summarizes relevant points from the provided information, considering both the current query and conversation history. 
If you don't know the answer, just say so. Do not make anything up or include information where the supporting evidence is not provided. +Generate a concise response based on Data Sources and follow Response Rules, considering both the conversation history and the current query. Data sources contain two parts: Knowledge Graph(KG) and Document Chunks(DC). Summarize all information in the provided Data Sources, and incorporating general knowledge relevant to the Data Sources. Do not include information not provided by Data Sources. When handling information with timestamps: 1. Each piece of information (both relationships and content) has a "created_at" timestamp indicating when we acquired this knowledge @@ -323,22 +327,20 @@ When handling information with timestamps: ---Data Sources--- -1. Knowledge Graph Data: +1. From Knowledge Graph(KG): {kg_context} -2. Vector Data: +2. From Document Chunks(DC): {vector_context} ----Response Requirements--- +---Response Rules--- - Target format and length: {response_type} - Use markdown formatting with appropriate section headings -- Aim to keep content around 3 paragraphs for conciseness -- Each paragraph should be under a relevant section heading -- Each section should focus on one main point or aspect of the answer +- Please respond in the same language as the user's question. +- Ensure the response maintains continuity with the conversation history. +- Organize answer in sesctions focusing on one main point or aspect of the answer - Use clear and descriptive section titles that reflect the content -- Ensure the response maintains continuity with the conversation history -- List up to 5 most important reference sources at the end under "References", clearly indicating whether each source is from Knowledge Graph (KG) or Vector Data (VD) - Format: [KG/VD] Source content - -Add sections and commentary to the response as appropriate for the length and format. If the provided information is insufficient to answer the question, clearly state that you don't know or cannot provide an answer in the same language as the user's question.""" +- List up to 5 most important reference sources at the end under "References" sesction. Clearly indicating whether each source is from Knowledge Graph (KG) or Vector Data (DC), in the following format: [KG/DC] Source content +- If you don't know the answer, just say so. Do not make anything up. 
+- Do not include information not provided by the Data Sources.""" From 219cbab1e36493cc25c5ff2cb5443072463e9a7c Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Thu, 30 Jan 2025 23:27:43 +0100 Subject: [PATCH 04/15] Added progress when scanning files and fixed some bugs in the API --- lightrag/api/lightrag_server.py | 89 +- lightrag/api/static/index.html | 2 +- .../api/static/js/{lightrag_api.js => api.js} | 778 +++++++++--------- 3 files changed, 470 insertions(+), 399 deletions(-) rename lightrag/api/static/js/{lightrag_api.js => api.js} (91%) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index e162f5ec..e4da6a4e 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -1,4 +1,24 @@ from fastapi import FastAPI, HTTPException, File, UploadFile, Form, Request +# Backend (Python) +# Add this to store progress globally +from typing import Dict +import threading + +# Global progress tracker +scan_progress: Dict = { + "is_scanning": False, + "current_file": "", + "indexed_count": 0, + "total_files": 0, + "progress": 0 +} + +# Lock for thread-safe operations +progress_lock = threading.Lock() + +import json +import os + from fastapi.staticfiles import StaticFiles from pydantic import BaseModel import logging @@ -538,7 +558,7 @@ class DocumentManager: # Create input directory if it doesn't exist self.input_dir.mkdir(parents=True, exist_ok=True) - def scan_directory(self) -> List[Path]: + def scan_directory_for_new_files(self) -> List[Path]: """Scan input directory for new files""" new_files = [] for ext in self.supported_extensions: @@ -547,6 +567,14 @@ class DocumentManager: new_files.append(file_path) return new_files + def scan_directory(self) -> List[Path]: + """Scan input directory for new files""" + new_files = [] + for ext in self.supported_extensions: + for file_path in self.input_dir.rglob(f"*{ext}"): + new_files.append(file_path) + return new_files + def mark_as_indexed(self, file_path: Path): """Mark a file as indexed""" self.indexed_files.add(file_path) @@ -730,7 +758,7 @@ def create_app(args): # Startup logic if args.auto_scan_at_startup: try: - new_files = doc_manager.scan_directory() + new_files = doc_manager.scan_directory_for_new_files() for file_path in new_files: try: await index_file(file_path) @@ -982,43 +1010,56 @@ def create_app(args): else: logging.warning(f"No content extracted from file: {file_path}") + @app.post("/documents/scan", dependencies=[Depends(optional_api_key)]) async def scan_for_new_documents(): - """ - Manually trigger scanning for new documents in the directory managed by `doc_manager`. - - This endpoint facilitates manual initiation of a document scan to identify and index new files. - It processes all newly detected files, attempts indexing each file, logs any errors that occur, - and returns a summary of the operation. - - Returns: - dict: A dictionary containing: - - "status" (str): Indicates success or failure of the scanning process. - - "indexed_count" (int): The number of successfully indexed documents. - - "total_documents" (int): Total number of documents that have been indexed so far. - - Raises: - HTTPException: If an error occurs during the document scanning process, a 500 status - code is returned with details about the exception. 
- """ + """Trigger the scanning process""" + global scan_progress + try: - new_files = doc_manager.scan_directory() - indexed_count = 0 + with progress_lock: + if scan_progress["is_scanning"]: + return {"status": "already_scanning"} + + scan_progress["is_scanning"] = True + scan_progress["indexed_count"] = 0 + scan_progress["progress"] = 0 + + new_files = doc_manager.scan_directory_for_new_files() + scan_progress["total_files"] = len(new_files) for file_path in new_files: try: + with progress_lock: + scan_progress["current_file"] = os.path.basename(file_path) + await index_file(file_path) - indexed_count += 1 + + with progress_lock: + scan_progress["indexed_count"] += 1 + scan_progress["progress"] = (scan_progress["indexed_count"] / scan_progress["total_files"]) * 100 + except Exception as e: logging.error(f"Error indexing file {file_path}: {str(e)}") return { "status": "success", - "indexed_count": indexed_count, + "indexed_count": scan_progress["indexed_count"], "total_documents": len(doc_manager.indexed_files), } except Exception as e: raise HTTPException(status_code=500, detail=str(e)) + finally: + with progress_lock: + scan_progress["is_scanning"] = False + + @app.get("/documents/scan-progress") + async def get_scan_progress(): + """Get the current scanning progress""" + with progress_lock: + return scan_progress + + @app.post("/documents/upload", dependencies=[Depends(optional_api_key)]) async def upload_to_input_dir(file: UploadFile = File(...)): @@ -1849,7 +1890,7 @@ def create_app(args): "status": "healthy", "working_directory": str(args.working_dir), "input_directory": str(args.input_dir), - "indexed_files": files, + "indexed_files": [str(f) for f in files], "indexed_files_count": len(files), "configuration": { # LLM configuration binding/host address (if applicable)/model (if applicable) diff --git a/lightrag/api/static/index.html b/lightrag/api/static/index.html index 60900c03..c9659d5e 100644 --- a/lightrag/api/static/index.html +++ b/lightrag/api/static/index.html @@ -98,7 +98,7 @@ - + diff --git a/lightrag/api/static/js/lightrag_api.js b/lightrag/api/static/js/api.js similarity index 91% rename from lightrag/api/static/js/lightrag_api.js rename to lightrag/api/static/js/api.js index 3c2ff69c..65aa53be 100644 --- a/lightrag/api/static/js/lightrag_api.js +++ b/lightrag/api/static/js/api.js @@ -1,375 +1,405 @@ -// State management -const state = { - apiKey: localStorage.getItem('apiKey') || '', - files: [], - indexedFiles: [], - currentPage: 'file-manager' -}; - -// Utility functions -const showToast = (message, duration = 3000) => { - const toast = document.getElementById('toast'); - toast.querySelector('div').textContent = message; - toast.classList.remove('hidden'); - setTimeout(() => toast.classList.add('hidden'), duration); -}; - -const fetchWithAuth = async (url, options = {}) => { - const headers = { - ...(options.headers || {}), - ...(state.apiKey ? { 'Authorization': `Bearer ${state.apiKey}` } : {}) - }; - return fetch(url, { ...options, headers }); -}; - -// Page renderers -const pages = { - 'file-manager': () => ` -
-

[File Manager page markup lost in extraction — header "File Manager", drag-and-drop file input, upload/rescan controls, "Selected Files" list with upload progress indicator, and "Indexed Files" list]
- `, - - 'query': () => ` -
-

[Query Database page markup lost in extraction — query input, query mode selector, send button, and result panel]
- `, - - 'knowledge-graph': () => ` -
-
- - - -

[Knowledge Graph page markup lost in extraction — placeholder notice: "Under Construction", "Knowledge graph visualization will be available in a future update."]
- `, - - 'status': () => ` -
-

[System Status page markup lost in extraction — "System Health" and "Configuration" panels populated by the status handler]
- `, - - 'settings': () => ` -
-

[Settings page markup lost in extraction — API key input and save button]
- ` -}; - -// Page handlers -const handlers = { - 'file-manager': () => { - const fileInput = document.getElementById('fileInput'); - const dropZone = fileInput.parentElement.parentElement; - const fileList = document.querySelector('#fileList div'); - const indexedFiles = document.querySelector('#indexedFiles div'); - const uploadBtn = document.getElementById('uploadBtn'); - - const updateFileList = () => { - fileList.innerHTML = state.files.map(file => ` -
- ${file.name} - -
- `).join(''); - }; - - const updateIndexedFiles = async () => { - const response = await fetchWithAuth('/health'); - const data = await response.json(); - indexedFiles.innerHTML = data.indexed_files.map(file => ` -
- ${file} -
- `).join(''); - }; - - dropZone.addEventListener('dragover', (e) => { - e.preventDefault(); - dropZone.classList.add('border-blue-500'); - }); - - dropZone.addEventListener('dragleave', () => { - dropZone.classList.remove('border-blue-500'); - }); - - dropZone.addEventListener('drop', (e) => { - e.preventDefault(); - dropZone.classList.remove('border-blue-500'); - const files = Array.from(e.dataTransfer.files); - state.files.push(...files); - updateFileList(); - }); - - fileInput.addEventListener('change', () => { - state.files.push(...Array.from(fileInput.files)); - updateFileList(); - }); - - uploadBtn.addEventListener('click', async () => { - if (state.files.length === 0) { - showToast('Please select files to upload'); - return; - } - let apiKey = localStorage.getItem('apiKey') || ''; - const progress = document.getElementById('uploadProgress'); - const progressBar = progress.querySelector('div'); - const statusText = document.getElementById('uploadStatus'); - progress.classList.remove('hidden'); - - for (let i = 0; i < state.files.length; i++) { - const formData = new FormData(); - formData.append('file', state.files[i]); - - try { - await fetch('/documents/upload', { - method: 'POST', - headers: apiKey ? { 'Authorization': `Bearer ${apiKey}` } : {}, - body: formData - }); - - const percentage = ((i + 1) / state.files.length) * 100; - progressBar.style.width = `${percentage}%`; - statusText.textContent = `${i + 1}/${state.files.length}`; - } catch (error) { - console.error('Upload error:', error); - } - } - progress.classList.add('hidden'); - }); - rescanBtn.addEventListener('click', async () => { - let apiKey = localStorage.getItem('apiKey') || ''; - const progress = document.getElementById('uploadProgress'); - const progressBar = progress.querySelector('div'); - const statusText = document.getElementById('uploadStatus'); - progress.classList.remove('hidden'); - try { - const scan_output = await fetch('/documents/scan', { - method: 'GET', - }); - statusText.textContent = scan_output.data; - } catch (error) { - console.error('Upload error:', error); - } - progress.classList.add('hidden'); - }); - updateIndexedFiles(); - }, - - 'query': () => { - const queryBtn = document.getElementById('queryBtn'); - const queryInput = document.getElementById('queryInput'); - const queryMode = document.getElementById('queryMode'); - const queryResult = document.getElementById('queryResult'); - - let apiKey = localStorage.getItem('apiKey') || ''; - - queryBtn.addEventListener('click', async () => { - const query = queryInput.value.trim(); - if (!query) { - showToast('Please enter a query'); - return; - } - - queryBtn.disabled = true; - queryBtn.innerHTML = ` - - - - - Processing... - `; - - try { - const response = await fetchWithAuth('/query', { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - query, - mode: queryMode.value, - stream: false, - only_need_context: false - }) - }); - - const data = await response.json(); - queryResult.innerHTML = marked.parse(data.response); - } catch (error) { - showToast('Error processing query'); - } finally { - queryBtn.disabled = false; - queryBtn.textContent = 'Send Query'; - } - }); - }, - - 'status': async () => { - const healthStatus = document.getElementById('healthStatus'); - const configStatus = document.getElementById('configStatus'); - - try { - const response = await fetchWithAuth('/health'); - const data = await response.json(); - - healthStatus.innerHTML = ` -
-
-
- ${data.status} -
-
-

Working Directory: ${data.working_directory}

-

Input Directory: ${data.input_directory}

-

Indexed Files: ${data.indexed_files_count}

-
-
- `; - - configStatus.innerHTML = Object.entries(data.configuration) - .map(([key, value]) => ` -
- ${key}: - ${value} -
- `).join(''); - } catch (error) { - showToast('Error fetching status'); - } - }, - - 'settings': () => { - const saveBtn = document.getElementById('saveSettings'); - const apiKeyInput = document.getElementById('apiKeyInput'); - - saveBtn.addEventListener('click', () => { - state.apiKey = apiKeyInput.value; - localStorage.setItem('apiKey', state.apiKey); - showToast('Settings saved successfully'); - }); - } -}; - -// Navigation handling -document.querySelectorAll('.nav-item').forEach(item => { - item.addEventListener('click', (e) => { - e.preventDefault(); - const page = item.dataset.page; - document.getElementById('content').innerHTML = pages[page](); - if (handlers[page]) handlers[page](); - state.currentPage = page; - }); -}); - -// Initialize with file manager -document.getElementById('content').innerHTML = pages['file-manager'](); -handlers['file-manager'](); - -// Global functions -window.removeFile = (fileName) => { - state.files = state.files.filter(file => file.name !== fileName); - document.querySelector('#fileList div').innerHTML = state.files.map(file => ` -
- ${file.name} - -
- `).join(''); +// State management +const state = { + apiKey: localStorage.getItem('apiKey') || '', + files: [], + indexedFiles: [], + currentPage: 'file-manager' +}; + +// Utility functions +const showToast = (message, duration = 3000) => { + const toast = document.getElementById('toast'); + toast.querySelector('div').textContent = message; + toast.classList.remove('hidden'); + setTimeout(() => toast.classList.add('hidden'), duration); +}; + +const fetchWithAuth = async (url, options = {}) => { + const headers = { + ...(options.headers || {}), + ...(state.apiKey ? { 'Authorization': `Bearer ${state.apiKey}` } : {}) + }; + return fetch(url, { ...options, headers }); +}; + +// Page renderers +const pages = { + 'file-manager': () => ` +
+

[File Manager page markup lost in extraction — header "File Manager", drag-and-drop file input, upload/rescan controls, "Selected Files" list with upload progress indicator, and "Indexed Files" list]
+ `, + + 'query': () => ` +
+

[Query Database page markup lost in extraction — query input, query mode selector, send button, and result panel]
+ `, + + 'knowledge-graph': () => ` +
+
+ + + +

[Knowledge Graph page markup lost in extraction — placeholder notice: "Under Construction", "Knowledge graph visualization will be available in a future update."]
+ `, + + 'status': () => ` +
+

[System Status page markup lost in extraction — "System Health" and "Configuration" panels populated by the status handler]
+ `, + + 'settings': () => ` +
+

[Settings page markup lost in extraction — API key input and save button]
+ ` +}; + +// Page handlers +const handlers = { + 'file-manager': () => { + const fileInput = document.getElementById('fileInput'); + const dropZone = fileInput.parentElement.parentElement; + const fileList = document.querySelector('#fileList div'); + const indexedFiles = document.querySelector('#indexedFiles div'); + const uploadBtn = document.getElementById('uploadBtn'); + + const updateFileList = () => { + fileList.innerHTML = state.files.map(file => ` +
+ ${file.name} + +
+ `).join(''); + }; + + const updateIndexedFiles = async () => { + const response = await fetchWithAuth('/health'); + const data = await response.json(); + indexedFiles.innerHTML = data.indexed_files.map(file => ` +
+ ${file} +
+ `).join(''); + }; + + dropZone.addEventListener('dragover', (e) => { + e.preventDefault(); + dropZone.classList.add('border-blue-500'); + }); + + dropZone.addEventListener('dragleave', () => { + dropZone.classList.remove('border-blue-500'); + }); + + dropZone.addEventListener('drop', (e) => { + e.preventDefault(); + dropZone.classList.remove('border-blue-500'); + const files = Array.from(e.dataTransfer.files); + state.files.push(...files); + updateFileList(); + }); + + fileInput.addEventListener('change', () => { + state.files.push(...Array.from(fileInput.files)); + updateFileList(); + }); + + uploadBtn.addEventListener('click', async () => { + if (state.files.length === 0) { + showToast('Please select files to upload'); + return; + } + let apiKey = localStorage.getItem('apiKey') || ''; + const progress = document.getElementById('uploadProgress'); + const progressBar = progress.querySelector('div'); + const statusText = document.getElementById('uploadStatus'); + progress.classList.remove('hidden'); + + for (let i = 0; i < state.files.length; i++) { + const formData = new FormData(); + formData.append('file', state.files[i]); + + try { + await fetch('/documents/upload', { + method: 'POST', + headers: apiKey ? { 'Authorization': `Bearer ${apiKey}` } : {}, + body: formData + }); + + const percentage = ((i + 1) / state.files.length) * 100; + progressBar.style.width = `${percentage}%`; + statusText.textContent = `${i + 1}/${state.files.length}`; + } catch (error) { + console.error('Upload error:', error); + } + } + progress.classList.add('hidden'); + }); + + rescanBtn.addEventListener('click', async () => { + const progress = document.getElementById('uploadProgress'); + const progressBar = progress.querySelector('div'); + const statusText = document.getElementById('uploadStatus'); + progress.classList.remove('hidden'); + + try { + // Start the scanning process + const scanResponse = await fetch('/documents/scan', { + method: 'POST', + }); + + if (!scanResponse.ok) { + throw new Error('Scan failed to start'); + } + + // Start polling for progress + const pollInterval = setInterval(async () => { + const progressResponse = await fetch('/documents/scan-progress'); + const progressData = await progressResponse.json(); + + // Update progress bar + progressBar.style.width = `${progressData.progress}%`; + + // Update status text + if (progressData.total_files > 0) { + statusText.textContent = `Processing ${progressData.current_file} (${progressData.indexed_count}/${progressData.total_files})`; + } + + // Check if scanning is complete + if (!progressData.is_scanning) { + clearInterval(pollInterval); + progress.classList.add('hidden'); + statusText.textContent = 'Scan complete!'; + } + }, 1000); // Poll every second + + } catch (error) { + console.error('Upload error:', error); + progress.classList.add('hidden'); + statusText.textContent = 'Error during scanning process'; + } + }); + + + updateIndexedFiles(); + }, + + 'query': () => { + const queryBtn = document.getElementById('queryBtn'); + const queryInput = document.getElementById('queryInput'); + const queryMode = document.getElementById('queryMode'); + const queryResult = document.getElementById('queryResult'); + + let apiKey = localStorage.getItem('apiKey') || ''; + + queryBtn.addEventListener('click', async () => { + const query = queryInput.value.trim(); + if (!query) { + showToast('Please enter a query'); + return; + } + + queryBtn.disabled = true; + queryBtn.innerHTML = ` + + + + + Processing... 
+ `; + + try { + const response = await fetchWithAuth('/query', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + query, + mode: queryMode.value, + stream: false, + only_need_context: false + }) + }); + + const data = await response.json(); + queryResult.innerHTML = marked.parse(data.response); + } catch (error) { + showToast('Error processing query'); + } finally { + queryBtn.disabled = false; + queryBtn.textContent = 'Send Query'; + } + }); + }, + + 'status': async () => { + const healthStatus = document.getElementById('healthStatus'); + const configStatus = document.getElementById('configStatus'); + + try { + const response = await fetchWithAuth('/health'); + const data = await response.json(); + + healthStatus.innerHTML = ` +
+
+
+ ${data.status} +
+
+

Working Directory: ${data.working_directory}

+

Input Directory: ${data.input_directory}

+

Indexed Files: ${data.indexed_files_count}

+
+
+ `; + + configStatus.innerHTML = Object.entries(data.configuration) + .map(([key, value]) => ` +
+ ${key}: + ${value} +
+ `).join(''); + } catch (error) { + showToast('Error fetching status'); + } + }, + + 'settings': () => { + const saveBtn = document.getElementById('saveSettings'); + const apiKeyInput = document.getElementById('apiKeyInput'); + + saveBtn.addEventListener('click', () => { + state.apiKey = apiKeyInput.value; + localStorage.setItem('apiKey', state.apiKey); + showToast('Settings saved successfully'); + }); + } +}; + +// Navigation handling +document.querySelectorAll('.nav-item').forEach(item => { + item.addEventListener('click', (e) => { + e.preventDefault(); + const page = item.dataset.page; + document.getElementById('content').innerHTML = pages[page](); + if (handlers[page]) handlers[page](); + state.currentPage = page; + }); +}); + +// Initialize with file manager +document.getElementById('content').innerHTML = pages['file-manager'](); +handlers['file-manager'](); + +// Global functions +window.removeFile = (fileName) => { + state.files = state.files.filter(file => file.name !== fileName); + document.querySelector('#fileList div').innerHTML = state.files.map(file => ` +
+ ${file.name} + +
+ `).join(''); }; From 381f7deec6a2c28e8907fba8d0c98ef530a0de30 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Thu, 30 Jan 2025 23:29:21 +0100 Subject: [PATCH 05/15] linting --- lightrag/api/lightrag_server.py | 25 ++++++++++++------------- lightrag/api/static/js/api.js | 2 +- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index e4da6a4e..c71d5075 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -1,4 +1,5 @@ from fastapi import FastAPI, HTTPException, File, UploadFile, Form, Request + # Backend (Python) # Add this to store progress globally from typing import Dict @@ -10,7 +11,7 @@ scan_progress: Dict = { "current_file": "", "indexed_count": 0, "total_files": 0, - "progress": 0 + "progress": 0, } # Lock for thread-safe operations @@ -23,7 +24,6 @@ from fastapi.staticfiles import StaticFiles from pydantic import BaseModel import logging import argparse -import json import time import re from typing import List, Dict, Any, Optional, Union @@ -36,7 +36,6 @@ from pathlib import Path import shutil import aiofiles from ascii_colors import trace_exception, ASCIIColors -import os import sys import configparser @@ -1010,21 +1009,20 @@ def create_app(args): else: logging.warning(f"No content extracted from file: {file_path}") - @app.post("/documents/scan", dependencies=[Depends(optional_api_key)]) async def scan_for_new_documents(): """Trigger the scanning process""" global scan_progress - + try: with progress_lock: if scan_progress["is_scanning"]: return {"status": "already_scanning"} - + scan_progress["is_scanning"] = True scan_progress["indexed_count"] = 0 scan_progress["progress"] = 0 - + new_files = doc_manager.scan_directory_for_new_files() scan_progress["total_files"] = len(new_files) @@ -1032,13 +1030,16 @@ def create_app(args): try: with progress_lock: scan_progress["current_file"] = os.path.basename(file_path) - + await index_file(file_path) - + with progress_lock: scan_progress["indexed_count"] += 1 - scan_progress["progress"] = (scan_progress["indexed_count"] / scan_progress["total_files"]) * 100 - + scan_progress["progress"] = ( + scan_progress["indexed_count"] + / scan_progress["total_files"] + ) * 100 + except Exception as e: logging.error(f"Error indexing file {file_path}: {str(e)}") @@ -1059,8 +1060,6 @@ def create_app(args): with progress_lock: return scan_progress - - @app.post("/documents/upload", dependencies=[Depends(optional_api_key)]) async def upload_to_input_dir(file: UploadFile = File(...)): """ diff --git a/lightrag/api/static/js/api.js b/lightrag/api/static/js/api.js index 65aa53be..5654e253 100644 --- a/lightrag/api/static/js/api.js +++ b/lightrag/api/static/js/api.js @@ -256,7 +256,7 @@ const handlers = { // Update progress bar progressBar.style.width = `${progressData.progress}%`; - + // Update status text if (progressData.total_files > 0) { statusText.textContent = `Processing ${progressData.current_file} (${progressData.indexed_count}/${progressData.total_files})`; From 6889606a489be69736b15262e9e276077e471611 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Fri, 31 Jan 2025 11:19:12 +0100 Subject: [PATCH 06/15] Update lightrag_server.py --- lightrag/api/lightrag_server.py | 49 +++++++++++++++++---------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index c71d5075..436437f3 100644 --- a/lightrag/api/lightrag_server.py +++ 
b/lightrag/api/lightrag_server.py @@ -1,5 +1,4 @@ -from fastapi import FastAPI, HTTPException, File, UploadFile, Form, Request - +from fastapi import FastAPI, HTTPException, File, UploadFile, Form, Request, BackgroundTasks # Backend (Python) # Add this to store progress globally from typing import Dict @@ -1010,46 +1009,50 @@ def create_app(args): logging.warning(f"No content extracted from file: {file_path}") @app.post("/documents/scan", dependencies=[Depends(optional_api_key)]) - async def scan_for_new_documents(): + async def scan_for_new_documents(background_tasks: BackgroundTasks): """Trigger the scanning process""" global scan_progress - + + with progress_lock: + if scan_progress["is_scanning"]: + return {"status": "already_scanning"} + + scan_progress["is_scanning"] = True + scan_progress["indexed_count"] = 0 + scan_progress["progress"] = 0 + + # Start the scanning process in the background + background_tasks.add_task(run_scanning_process) + + return {"status": "scanning_started"} + + async def run_scanning_process(): + """Background task to scan and index documents""" + global scan_progress + try: - with progress_lock: - if scan_progress["is_scanning"]: - return {"status": "already_scanning"} - - scan_progress["is_scanning"] = True - scan_progress["indexed_count"] = 0 - scan_progress["progress"] = 0 - new_files = doc_manager.scan_directory_for_new_files() scan_progress["total_files"] = len(new_files) - + for file_path in new_files: try: with progress_lock: scan_progress["current_file"] = os.path.basename(file_path) - + await index_file(file_path) - + with progress_lock: scan_progress["indexed_count"] += 1 scan_progress["progress"] = ( scan_progress["indexed_count"] / scan_progress["total_files"] ) * 100 - + except Exception as e: logging.error(f"Error indexing file {file_path}: {str(e)}") - - return { - "status": "success", - "indexed_count": scan_progress["indexed_count"], - "total_documents": len(doc_manager.indexed_files), - } + except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) + logging.error(f"Error during scanning process: {str(e)}") finally: with progress_lock: scan_progress["is_scanning"] = False From 2444975bf1ce5bc24abc63d09e33f681aa6974e2 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Fri, 31 Jan 2025 13:22:19 +0100 Subject: [PATCH 07/15] Update api.js --- lightrag/api/static/js/api.js | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/lightrag/api/static/js/api.js b/lightrag/api/static/js/api.js index 5654e253..effd73c2 100644 --- a/lightrag/api/static/js/api.js +++ b/lightrag/api/static/js/api.js @@ -49,16 +49,18 @@ const pages = {

[upload-progress markup hunk lost in extraction — only the "0 files processed" status text and the "Indexed Files" heading survive]

From 2d4991dfbd32b856da3b5ccc690b8f62aa604b81 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 31 Jan 2025 20:40:37 +0800 Subject: [PATCH 08/15] Fix typo in prompt --- lightrag/prompt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 8e52b9c3..913f8eef 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -259,7 +259,7 @@ You are a helpful assistant responding to user query about Document Chunks provi ---Goal--- -GGenerate a concise response based on Document Chunks and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Document Chunks, and incorporating general knowledge relevant to the Document Chunks. Do not include information not provided by Document Chunks. +Generate a concise response based on Document Chunks and follow Response Rules, considering both the conversation history and the current query. Summarize all information in the provided Document Chunks, and incorporating general knowledge relevant to the Document Chunks. Do not include information not provided by Document Chunks. When handling content with timestamps: 1. Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge From 8a624e198a7db66bcde454ff648096e90ce2ed22 Mon Sep 17 00:00:00 2001 From: Gurjot Singh Date: Fri, 31 Jan 2025 19:00:36 +0530 Subject: [PATCH 09/15] Add faiss integration for storage --- README.md | 29 ++++ examples/test_faiss.py | 104 +++++++++++++ lightrag/kg/faiss_impl.py | 318 ++++++++++++++++++++++++++++++++++++++ lightrag/lightrag.py | 1 + 4 files changed, 452 insertions(+) create mode 100644 examples/test_faiss.py create mode 100644 lightrag/kg/faiss_impl.py diff --git a/README.md b/README.md index ad405e90..dd215b04 100644 --- a/README.md +++ b/README.md @@ -465,7 +465,36 @@ For production level scenarios you will most likely want to leverage an enterpri > > You can Compile the AGE from source code and fix it. +### Using Faiss for Storage +- Install the required dependencies: +``` +pip install faiss-cpu +``` +You can also install `faiss-gpu` if you have GPU support. +- Here we are using `sentence-transformers` but you can also use `OpenAIEmbedding` model with `3072` dimensions. 
+ +``` +async def embedding_func(texts: list[str]) -> np.ndarray: + model = SentenceTransformer('all-MiniLM-L6-v2') + embeddings = model.encode(texts, convert_to_numpy=True) + return embeddings + +# Initialize LightRAG with the LLM model function and embedding function + rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=384, + max_token_size=8192, + func=embedding_func, + ), + vector_storage="FaissVectorDBStorage", + vector_db_storage_cls_kwargs={ + "cosine_better_than_threshold": 0.3 # Your desired threshold + } + ) +``` ### Insert Custom KG diff --git a/examples/test_faiss.py b/examples/test_faiss.py new file mode 100644 index 00000000..e73c0bfc --- /dev/null +++ b/examples/test_faiss.py @@ -0,0 +1,104 @@ +import os +import logging +import numpy as np + +from dotenv import load_dotenv +from sentence_transformers import SentenceTransformer + +from openai import AzureOpenAI +from lightrag import LightRAG, QueryParam +from lightrag.utils import EmbeddingFunc +from lightrag.kg.faiss_impl import FaissVectorDBStorage + +# Configure Logging +logging.basicConfig(level=logging.INFO) + +# Load environment variables from .env file +load_dotenv() +AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION") +AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT") +AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY") +AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT") + +async def llm_model_func( + prompt, + system_prompt=None, + history_messages=[], + keyword_extraction=False, + **kwargs +) -> str: + + # Create a client for AzureOpenAI + client = AzureOpenAI( + api_key=AZURE_OPENAI_API_KEY, + api_version=AZURE_OPENAI_API_VERSION, + azure_endpoint=AZURE_OPENAI_ENDPOINT, + ) + + # Build the messages list for the conversation + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + if history_messages: + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + + # Call the LLM + chat_completion = client.chat.completions.create( + model=AZURE_OPENAI_DEPLOYMENT, + messages=messages, + temperature=kwargs.get("temperature", 0), + top_p=kwargs.get("top_p", 1), + n=kwargs.get("n", 1), + ) + + return chat_completion.choices[0].message.content + + +async def embedding_func(texts: list[str]) -> np.ndarray: + model = SentenceTransformer('all-MiniLM-L6-v2') + embeddings = model.encode(texts, convert_to_numpy=True) + return embeddings + +def main(): + + WORKING_DIR = "./dickens" + + # Initialize LightRAG with the LLM model function and embedding function + rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=384, + max_token_size=8192, + func=embedding_func, + ), + vector_storage="FaissVectorDBStorage", + vector_db_storage_cls_kwargs={ + "cosine_better_than_threshold": 0.3 # Your desired threshold + } + ) + + # Insert the custom chunks into LightRAG + book1 = open("./book_1.txt", encoding="utf-8") + book2 = open("./book_2.txt", encoding="utf-8") + + rag.insert([book1.read(), book2.read()]) + + query_text = "What are the main themes?" 
+ + print("Result (Naive):") + print(rag.query(query_text, param=QueryParam(mode="naive"))) + + print("\nResult (Local):") + print(rag.query(query_text, param=QueryParam(mode="local"))) + + print("\nResult (Global):") + print(rag.query(query_text, param=QueryParam(mode="global"))) + + print("\nResult (Hybrid):") + print(rag.query(query_text, param=QueryParam(mode="hybrid"))) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/lightrag/kg/faiss_impl.py b/lightrag/kg/faiss_impl.py new file mode 100644 index 00000000..1688e507 --- /dev/null +++ b/lightrag/kg/faiss_impl.py @@ -0,0 +1,318 @@ +import os +import time +import asyncio +import faiss +import json +import numpy as np +from tqdm.asyncio import tqdm as tqdm_async +from dataclasses import dataclass + +from lightrag.utils import ( + logger, + compute_mdhash_id, +) +from lightrag.base import ( + BaseVectorStorage, +) + + +@dataclass +class FaissVectorDBStorage(BaseVectorStorage): + """ + A Faiss-based Vector DB Storage for LightRAG. + Uses cosine similarity by storing normalized vectors in a Faiss index with inner product search. + """ + cosine_better_than_threshold: float = float(os.getenv("COSINE_THRESHOLD", "0.2")) + + def __post_init__(self): + # Grab config values if available + config = self.global_config.get("vector_db_storage_cls_kwargs", {}) + self.cosine_better_than_threshold = config.get( + "cosine_better_than_threshold", self.cosine_better_than_threshold + ) + + # Where to save index file if you want persistent storage + self._faiss_index_file = os.path.join( + self.global_config["working_dir"], f"faiss_index_{self.namespace}.index" + ) + self._meta_file = self._faiss_index_file + ".meta.json" + + self._max_batch_size = self.global_config["embedding_batch_num"] + # Embedding dimension (e.g. 768) must match your embedding function + self._dim = self.embedding_func.embedding_dim + + # Create an empty Faiss index for inner product (useful for normalized vectors = cosine similarity). + # If you have a large number of vectors, you might want IVF or other indexes. + # For demonstration, we use a simple IndexFlatIP. + self._index = faiss.IndexFlatIP(self._dim) + + # Keep a local store for metadata, IDs, etc. + # Maps → metadata (including your original ID). + self._id_to_meta = {} + + # Attempt to load an existing index + metadata from disk + self._load_faiss_index() + + async def upsert(self, data: dict[str, dict]): + """ + Insert or update vectors in the Faiss index. + + data: { + "custom_id_1": { + "content": , + ...metadata... + }, + "custom_id_2": { + "content": , + ...metadata... + }, + ... 
+ } + """ + logger.info(f"Inserting {len(data)} vectors to {self.namespace}") + if not data: + logger.warning("You are inserting empty data to the vector DB") + return [] + + current_time = time.time() + + # Prepare data for embedding + list_data = [] + contents = [] + for k, v in data.items(): + # Store only known meta fields if needed + meta = {mf: v[mf] for mf in self.meta_fields if mf in v} + meta["__id__"] = k + meta["__created_at__"] = current_time + list_data.append(meta) + contents.append(v["content"]) + + # Split into batches for embedding if needed + batches = [ + contents[i : i + self._max_batch_size] + for i in range(0, len(contents), self._max_batch_size) + ] + + pbar = tqdm_async(total=len(batches), desc="Generating embeddings", unit="batch") + + async def wrapped_task(batch): + result = await self.embedding_func(batch) + pbar.update(1) + return result + + embedding_tasks = [wrapped_task(batch) for batch in batches] + embeddings_list = await asyncio.gather(*embedding_tasks) + + # Flatten the list of arrays + embeddings = np.concatenate(embeddings_list, axis=0) + if len(embeddings) != len(list_data): + logger.error( + f"Embedding size mismatch. Embeddings: {len(embeddings)}, Data: {len(list_data)}" + ) + return [] + + # Normalize embeddings for cosine similarity (in-place) + faiss.normalize_L2(embeddings) + + # Upsert logic: + # 1. Identify which vectors to remove if they exist + # 2. Remove them + # 3. Add the new vectors + existing_ids_to_remove = [] + for meta, emb in zip(list_data, embeddings): + faiss_internal_id = self._find_faiss_id_by_custom_id(meta["__id__"]) + if faiss_internal_id is not None: + existing_ids_to_remove.append(faiss_internal_id) + + if existing_ids_to_remove: + self._remove_faiss_ids(existing_ids_to_remove) + + # Step 2: Add new vectors + start_idx = self._index.ntotal + self._index.add(embeddings) + + # Step 3: Store metadata + vector for each new ID + for i, meta in enumerate(list_data): + fid = start_idx + i + # Store the raw vector so we can rebuild if something is removed + meta["__vector__"] = embeddings[i].tolist() + self._id_to_meta[fid] = meta + + logger.info(f"Upserted {len(list_data)} vectors into Faiss index.") + return [m["__id__"] for m in list_data] + + async def query(self, query: str, top_k=5): + """ + Search by a textual query; returns top_k results with their metadata + similarity distance. + """ + embedding = await self.embedding_func([query]) + # embedding is shape (1, dim) + embedding = np.array(embedding, dtype=np.float32) + faiss.normalize_L2(embedding) # we do in-place normalization + + logger.info( + f"Query: {query}, top_k: {top_k}, threshold: {self.cosine_better_than_threshold}" + ) + + # Perform the similarity search + distances, indices = self._index.search(embedding, top_k) + + distances = distances[0] + indices = indices[0] + + results = [] + for dist, idx in zip(distances, indices): + if idx == -1: + # Faiss returns -1 if no neighbor + continue + + # Cosine similarity threshold + if dist < self.cosine_better_than_threshold: + continue + + meta = self._id_to_meta.get(idx, {}) + results.append( + { + **meta, + "id": meta.get("__id__"), + "distance": float(dist), + "created_at": meta.get("__created_at__"), + } + ) + + return results + + @property + def client_storage(self): + # Return whatever structure LightRAG might need for debugging + return {"data": list(self._id_to_meta.values())} + + async def delete(self, ids: list[str]): + """ + Delete vectors for the provided custom IDs. 
+ """ + logger.info(f"Deleting {len(ids)} vectors from {self.namespace}") + to_remove = [] + for cid in ids: + fid = self._find_faiss_id_by_custom_id(cid) + if fid is not None: + to_remove.append(fid) + + if to_remove: + self._remove_faiss_ids(to_remove) + logger.info(f"Successfully deleted {len(to_remove)} vectors from {self.namespace}") + + async def delete_entity(self, entity_name: str): + """ + Delete a single entity by computing its hashed ID + the same way your code does it with `compute_mdhash_id`. + """ + entity_id = compute_mdhash_id(entity_name, prefix="ent-") + logger.debug(f"Attempting to delete entity {entity_name} with ID {entity_id}") + await self.delete([entity_id]) + + async def delete_entity_relation(self, entity_name: str): + """ + Delete relations for a given entity by scanning metadata. + """ + logger.debug(f"Searching relations for entity {entity_name}") + relations = [] + for fid, meta in self._id_to_meta.items(): + if meta.get("src_id") == entity_name or meta.get("tgt_id") == entity_name: + relations.append(fid) + + logger.debug(f"Found {len(relations)} relations for {entity_name}") + if relations: + self._remove_faiss_ids(relations) + logger.debug(f"Deleted {len(relations)} relations for {entity_name}") + + async def index_done_callback(self): + """ + Called after indexing is done (save Faiss index + metadata). + """ + self._save_faiss_index() + logger.info("Faiss index saved successfully.") + + # -------------------------------------------------------------------------------- + # Internal helper methods + # -------------------------------------------------------------------------------- + + def _find_faiss_id_by_custom_id(self, custom_id: str): + """ + Return the Faiss internal ID for a given custom ID, or None if not found. + """ + for fid, meta in self._id_to_meta.items(): + if meta.get("__id__") == custom_id: + return fid + return None + + def _remove_faiss_ids(self, fid_list): + """ + Remove a list of internal Faiss IDs from the index. + Because IndexFlatIP doesn't support 'removals', + we rebuild the index excluding those vectors. + """ + keep_fids = [fid for fid in self._id_to_meta if fid not in fid_list] + + # Rebuild the index + vectors_to_keep = [] + new_id_to_meta = {} + for new_fid, old_fid in enumerate(keep_fids): + vec_meta = self._id_to_meta[old_fid] + vectors_to_keep.append(vec_meta["__vector__"]) # stored as list + new_id_to_meta[new_fid] = vec_meta + + # Re-init index + self._index = faiss.IndexFlatIP(self._dim) + if vectors_to_keep: + arr = np.array(vectors_to_keep, dtype=np.float32) + self._index.add(arr) + + self._id_to_meta = new_id_to_meta + + def _save_faiss_index(self): + """ + Save the current Faiss index + metadata to disk so it can persist across runs. + """ + faiss.write_index(self._index, self._faiss_index_file) + + # Save metadata dict to JSON. Convert all keys to strings for JSON storage. + # _id_to_meta is { int: { '__id__': doc_id, '__vector__': [float,...], ... } } + # We'll keep the int -> dict, but JSON requires string keys. + serializable_dict = {} + for fid, meta in self._id_to_meta.items(): + serializable_dict[str(fid)] = meta + + with open(self._meta_file, "w", encoding="utf-8") as f: + json.dump(serializable_dict, f) + + def _load_faiss_index(self): + """ + Load the Faiss index + metadata from disk if it exists, + and rebuild in-memory structures so we can query. + """ + if not os.path.exists(self._faiss_index_file): + logger.warning("No existing Faiss index file found. 
Starting fresh.") + return + + try: + # Load the Faiss index + self._index = faiss.read_index(self._faiss_index_file) + # Load metadata + with open(self._meta_file, "r", encoding="utf-8") as f: + stored_dict = json.load(f) + + # Convert string keys back to int + self._id_to_meta = {} + for fid_str, meta in stored_dict.items(): + fid = int(fid_str) + self._id_to_meta[fid] = meta + + logger.info( + f"Faiss index loaded with {self._index.ntotal} vectors from {self._faiss_index_file}" + ) + except Exception as e: + logger.error(f"Failed to load Faiss index or metadata: {e}") + logger.warning("Starting with an empty Faiss index.") + self._index = faiss.IndexFlatIP(self._dim) + self._id_to_meta = {} diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 92fc954f..22db6994 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -60,6 +60,7 @@ STORAGES = { "PGGraphStorage": ".kg.postgres_impl", "GremlinStorage": ".kg.gremlin_impl", "PGDocStatusStorage": ".kg.postgres_impl", + "FaissVectorDBStorage": ".kg.faiss_impl", } From 2894e8faf2e6a6c257ea0ccd5b24c79dfc111ae8 Mon Sep 17 00:00:00 2001 From: Gurjot Singh Date: Fri, 31 Jan 2025 19:05:47 +0530 Subject: [PATCH 10/15] Fix linting errors --- examples/test_faiss.py | 17 ++++++----------- lightrag/kg/faiss_impl.py | 13 +++++++++---- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/examples/test_faiss.py b/examples/test_faiss.py index e73c0bfc..ab0ef9f7 100644 --- a/examples/test_faiss.py +++ b/examples/test_faiss.py @@ -8,7 +8,6 @@ from sentence_transformers import SentenceTransformer from openai import AzureOpenAI from lightrag import LightRAG, QueryParam from lightrag.utils import EmbeddingFunc -from lightrag.kg.faiss_impl import FaissVectorDBStorage # Configure Logging logging.basicConfig(level=logging.INFO) @@ -20,14 +19,10 @@ AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT") AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY") AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT") + async def llm_model_func( - prompt, - system_prompt=None, - history_messages=[], - keyword_extraction=False, - **kwargs + prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs ) -> str: - # Create a client for AzureOpenAI client = AzureOpenAI( api_key=AZURE_OPENAI_API_KEY, @@ -56,12 +51,12 @@ async def llm_model_func( async def embedding_func(texts: list[str]) -> np.ndarray: - model = SentenceTransformer('all-MiniLM-L6-v2') + model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = model.encode(texts, convert_to_numpy=True) return embeddings + def main(): - WORKING_DIR = "./dickens" # Initialize LightRAG with the LLM model function and embedding function @@ -76,7 +71,7 @@ def main(): vector_storage="FaissVectorDBStorage", vector_db_storage_cls_kwargs={ "cosine_better_than_threshold": 0.3 # Your desired threshold - } + }, ) # Insert the custom chunks into LightRAG @@ -101,4 +96,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/lightrag/kg/faiss_impl.py b/lightrag/kg/faiss_impl.py index 1688e507..fc6aa779 100644 --- a/lightrag/kg/faiss_impl.py +++ b/lightrag/kg/faiss_impl.py @@ -22,6 +22,7 @@ class FaissVectorDBStorage(BaseVectorStorage): A Faiss-based Vector DB Storage for LightRAG. Uses cosine similarity by storing normalized vectors in a Faiss index with inner product search. 
""" + cosine_better_than_threshold: float = float(os.getenv("COSINE_THRESHOLD", "0.2")) def __post_init__(self): @@ -46,7 +47,7 @@ class FaissVectorDBStorage(BaseVectorStorage): # For demonstration, we use a simple IndexFlatIP. self._index = faiss.IndexFlatIP(self._dim) - # Keep a local store for metadata, IDs, etc. + # Keep a local store for metadata, IDs, etc. # Maps → metadata (including your original ID). self._id_to_meta = {} @@ -93,7 +94,9 @@ class FaissVectorDBStorage(BaseVectorStorage): for i in range(0, len(contents), self._max_batch_size) ] - pbar = tqdm_async(total=len(batches), desc="Generating embeddings", unit="batch") + pbar = tqdm_async( + total=len(batches), desc="Generating embeddings", unit="batch" + ) async def wrapped_task(batch): result = await self.embedding_func(batch) @@ -200,7 +203,9 @@ class FaissVectorDBStorage(BaseVectorStorage): if to_remove: self._remove_faiss_ids(to_remove) - logger.info(f"Successfully deleted {len(to_remove)} vectors from {self.namespace}") + logger.info( + f"Successfully deleted {len(to_remove)} vectors from {self.namespace}" + ) async def delete_entity(self, entity_name: str): """ @@ -288,7 +293,7 @@ class FaissVectorDBStorage(BaseVectorStorage): def _load_faiss_index(self): """ - Load the Faiss index + metadata from disk if it exists, + Load the Faiss index + metadata from disk if it exists, and rebuild in-memory structures so we can query. """ if not os.path.exists(self._faiss_index_file): From e9591548b4de480f07f439a86f76f4e1b1d22387 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Fri, 31 Jan 2025 16:03:31 +0100 Subject: [PATCH 11/15] Update api.js --- lightrag/api/static/js/api.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lightrag/api/static/js/api.js b/lightrag/api/static/js/api.js index effd73c2..86d8785d 100644 --- a/lightrag/api/static/js/api.js +++ b/lightrag/api/static/js/api.js @@ -17,11 +17,12 @@ const showToast = (message, duration = 3000) => { const fetchWithAuth = async (url, options = {}) => { const headers = { ...(options.headers || {}), - ...(state.apiKey ? { 'Authorization': `Bearer ${state.apiKey}` } : {}) + ...(state.apiKey ? { 'X-API-Key': state.apiKey } : {}) // Use X-API-Key instead of Bearer }; return fetch(url, { ...options, headers }); }; + // Page renderers const pages = { 'file-manager': () => ` From d1210851aa23967255c6d626da4644451feeb3e1 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Fri, 31 Jan 2025 16:07:27 +0100 Subject: [PATCH 12/15] Update api.js --- lightrag/api/static/js/api.js | 1 + 1 file changed, 1 insertion(+) diff --git a/lightrag/api/static/js/api.js b/lightrag/api/static/js/api.js index 86d8785d..551d29ac 100644 --- a/lightrag/api/static/js/api.js +++ b/lightrag/api/static/js/api.js @@ -15,6 +15,7 @@ const showToast = (message, duration = 3000) => { }; const fetchWithAuth = async (url, options = {}) => { + console.log(`Calling server with api key : ${}`) const headers = { ...(options.headers || {}), ...(state.apiKey ? 
{ 'X-API-Key': state.apiKey } : {}) // Use X-API-Key instead of Bearer From d2a550fd310b99d24b25853ab9d0883626034502 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Fri, 31 Jan 2025 16:08:23 +0100 Subject: [PATCH 13/15] Update api.js --- lightrag/api/static/js/api.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/api/static/js/api.js b/lightrag/api/static/js/api.js index 551d29ac..5a331d98 100644 --- a/lightrag/api/static/js/api.js +++ b/lightrag/api/static/js/api.js @@ -15,7 +15,7 @@ const showToast = (message, duration = 3000) => { }; const fetchWithAuth = async (url, options = {}) => { - console.log(`Calling server with api key : ${}`) + console.log(`Calling server with api key : ${state.apiKey}`) const headers = { ...(options.headers || {}), ...(state.apiKey ? { 'X-API-Key': state.apiKey } : {}) // Use X-API-Key instead of Bearer From 78b858c03bcee48cc62832cf141187b5d4034895 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Fri, 31 Jan 2025 16:19:46 +0100 Subject: [PATCH 14/15] Finished testing api key --- lightrag/api/static/js/api.js | 1 - 1 file changed, 1 deletion(-) diff --git a/lightrag/api/static/js/api.js b/lightrag/api/static/js/api.js index 5a331d98..86d8785d 100644 --- a/lightrag/api/static/js/api.js +++ b/lightrag/api/static/js/api.js @@ -15,7 +15,6 @@ const showToast = (message, duration = 3000) => { }; const fetchWithAuth = async (url, options = {}) => { - console.log(`Calling server with api key : ${state.apiKey}`) const headers = { ...(options.headers || {}), ...(state.apiKey ? { 'X-API-Key': state.apiKey } : {}) // Use X-API-Key instead of Bearer From e59cb7493cd434449f8755a40832836d1a0459c3 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Fri, 31 Jan 2025 23:35:42 +0800 Subject: [PATCH 15/15] fixed linting --- lightrag/api/lightrag_server.py | 33 +++++++++++++++++++++------------ lightrag/api/static/js/api.js | 2 +- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 436437f3..e1b24731 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -1,4 +1,13 @@ -from fastapi import FastAPI, HTTPException, File, UploadFile, Form, Request, BackgroundTasks +from fastapi import ( + FastAPI, + HTTPException, + File, + UploadFile, + Form, + Request, + BackgroundTasks, +) + # Backend (Python) # Add this to store progress globally from typing import Dict @@ -1012,45 +1021,45 @@ def create_app(args): async def scan_for_new_documents(background_tasks: BackgroundTasks): """Trigger the scanning process""" global scan_progress - + with progress_lock: if scan_progress["is_scanning"]: return {"status": "already_scanning"} - + scan_progress["is_scanning"] = True scan_progress["indexed_count"] = 0 scan_progress["progress"] = 0 - + # Start the scanning process in the background background_tasks.add_task(run_scanning_process) - + return {"status": "scanning_started"} - + async def run_scanning_process(): """Background task to scan and index documents""" global scan_progress - + try: new_files = doc_manager.scan_directory_for_new_files() scan_progress["total_files"] = len(new_files) - + for file_path in new_files: try: with progress_lock: scan_progress["current_file"] = os.path.basename(file_path) - + await index_file(file_path) - + with progress_lock: scan_progress["indexed_count"] += 1 scan_progress["progress"] = ( scan_progress["indexed_count"] / scan_progress["total_files"] ) * 100 - + except Exception as 
e: logging.error(f"Error indexing file {file_path}: {str(e)}") - + except Exception as e: logging.error(f"Error during scanning process: {str(e)}") finally: diff --git a/lightrag/api/static/js/api.js b/lightrag/api/static/js/api.js index 86d8785d..b610eb10 100644 --- a/lightrag/api/static/js/api.js +++ b/lightrag/api/static/js/api.js @@ -57,7 +57,7 @@ const pages = { Rescan Files - +
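
To exercise the scan API these patches introduce, the sketch below drives the two new endpoints: `POST /documents/scan` starts the background scan and `GET /documents/scan-progress` reports the shared `scan_progress` state. It is a minimal sketch: the host, port, and API key are placeholder assumptions rather than values taken from these patches, and the `X-API-Key` header is only needed if the server was started with a key (it is the header the web UI now sends).

```python
# Minimal client sketch for the new scan endpoints.
# Assumptions (not from the patches): server address and API key below are placeholders.
import time
import requests

BASE_URL = "http://localhost:9621"        # placeholder; use your server's address
HEADERS = {"X-API-Key": "your-api-key"}   # omit if the server has no API key configured

# Trigger the background scan (returns "scanning_started" or "already_scanning")
print(requests.post(f"{BASE_URL}/documents/scan", headers=HEADERS).json())

# Poll the progress endpoint until the background task clears is_scanning
while True:
    p = requests.get(f"{BASE_URL}/documents/scan-progress", headers=HEADERS).json()
    print(
        f"{p['indexed_count']}/{p['total_files']} files "
        f"({p['progress']:.0f}%) - current: {p['current_file']}"
    )
    if not p["is_scanning"]:
        break
    time.sleep(1)
```

The one-second poll interval mirrors the `setInterval(..., 1000)` loop that the web UI's rescan handler now uses against the same endpoint.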