Added progress when scanning files and fixed some bugs in the API

This commit is contained in:
Saifeddine ALOUI
2025-01-30 23:27:43 +01:00
parent 59617da83e
commit 219cbab1e3
3 changed files with 470 additions and 399 deletions

View File

@@ -1,4 +1,24 @@
from fastapi import FastAPI, HTTPException, File, UploadFile, Form, Request from fastapi import FastAPI, HTTPException, File, UploadFile, Form, Request
# Backend (Python)
# Add this to store progress globally
from typing import Dict
import threading
# Global progress tracker: written by the POST /documents/scan handler and
# read by the GET /documents/scan-progress handler. Guarded by progress_lock.
scan_progress: Dict = {
"is_scanning": False,  # True while a scan request is running
"current_file": "",  # base name of the file currently being indexed
"indexed_count": 0,  # files indexed successfully during the current scan
"total_files": 0,  # number of new files found by the current scan
"progress": 0  # indexed_count / total_files, expressed as a percentage
}
# Lock for thread-safe operations on scan_progress
progress_lock = threading.Lock()
import json
import os
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel from pydantic import BaseModel
import logging import logging
@@ -538,7 +558,7 @@ class DocumentManager:
# Create input directory if it doesn't exist # Create input directory if it doesn't exist
self.input_dir.mkdir(parents=True, exist_ok=True) self.input_dir.mkdir(parents=True, exist_ok=True)
def scan_directory(self) -> List[Path]: def scan_directory_for_new_files(self) -> List[Path]:
"""Scan input directory for new files""" """Scan input directory for new files"""
new_files = [] new_files = []
for ext in self.supported_extensions: for ext in self.supported_extensions:
@@ -547,6 +567,14 @@ class DocumentManager:
new_files.append(file_path) new_files.append(file_path)
return new_files return new_files
def scan_directory(self) -> List[Path]:
    """Return every file under the input directory with a supported extension.

    The search is recursive (``rglob``) and is repeated once per entry in
    ``self.supported_extensions``. No filtering is applied: files are
    returned whether or not they have already been indexed.

    Returns:
        List[Path]: Matching paths, grouped by extension in the order the
        extensions are listed. Empty when nothing matches.
    """
    matches: List[Path] = []
    for ext in self.supported_extensions:
        # rglob descends into sub-directories; the pattern matches on suffix.
        matches.extend(self.input_dir.rglob(f"*{ext}"))
    return matches
def mark_as_indexed(self, file_path: Path): def mark_as_indexed(self, file_path: Path):
"""Mark a file as indexed""" """Mark a file as indexed"""
self.indexed_files.add(file_path) self.indexed_files.add(file_path)
@@ -730,7 +758,7 @@ def create_app(args):
# Startup logic # Startup logic
if args.auto_scan_at_startup: if args.auto_scan_at_startup:
try: try:
new_files = doc_manager.scan_directory() new_files = doc_manager.scan_directory_for_new_files()
for file_path in new_files: for file_path in new_files:
try: try:
await index_file(file_path) await index_file(file_path)
@@ -982,43 +1010,56 @@ def create_app(args):
else: else:
logging.warning(f"No content extracted from file: {file_path}") logging.warning(f"No content extracted from file: {file_path}")
@app.post("/documents/scan", dependencies=[Depends(optional_api_key)])
async def scan_for_new_documents():
    """
    Scan the input directory for new documents and index them.

    Only one scan may run at a time. While a scan is running, its state is
    published through the module-level ``scan_progress`` dict (guarded by
    ``progress_lock``) so that it can be polled from another request.
    Files that fail to index are logged and skipped; they do not abort the
    scan and are not counted in ``indexed_count``.

    Returns:
        dict: ``{"status": "already_scanning"}`` when another scan is in
        progress. Otherwise a dictionary containing:
            - "status" (str): ``"success"``.
            - "indexed_count" (int): Files indexed successfully by this scan.
            - "total_documents" (int): Size of ``doc_manager.indexed_files``.

    Raises:
        HTTPException: 500 with the exception text if the scan itself fails
            (for example while listing the directory).
    """
    # Claim the scan slot atomically. This is done outside the try/finally
    # below on purpose: a request rejected with "already_scanning" must not
    # reach the finally clause and clear the flag owned by the running scan.
    with progress_lock:
        if scan_progress["is_scanning"]:
            return {"status": "already_scanning"}
        scan_progress["is_scanning"] = True
        # Reset everything so pollers never see values from a previous scan.
        scan_progress["current_file"] = ""
        scan_progress["indexed_count"] = 0
        scan_progress["total_files"] = 0
        scan_progress["progress"] = 0

    try:
        new_files = doc_manager.scan_directory_for_new_files()
        with progress_lock:
            scan_progress["total_files"] = len(new_files)

        for file_path in new_files:
            try:
                with progress_lock:
                    scan_progress["current_file"] = os.path.basename(file_path)

                # The lock is never held across an await.
                await index_file(file_path)

                with progress_lock:
                    scan_progress["indexed_count"] += 1
                    # total_files is > 0 here because the loop body only
                    # runs when new_files is non-empty.
                    scan_progress["progress"] = (
                        scan_progress["indexed_count"]
                        / scan_progress["total_files"]
                    ) * 100
            except Exception as e:
                logging.error(f"Error indexing file {file_path}: {str(e)}")

        with progress_lock:
            indexed_count = scan_progress["indexed_count"]

        return {
            "status": "success",
            "indexed_count": indexed_count,
            "total_documents": len(doc_manager.indexed_files),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Only reached by the request that set is_scanning to True.
        with progress_lock:
            scan_progress["is_scanning"] = False
@app.get("/documents/scan-progress")
async def get_scan_progress():
    """
    Get the current scanning progress.

    Returns:
        dict: A snapshot of ``scan_progress`` with the keys "is_scanning",
        "current_file", "indexed_count", "total_files" and "progress".
    """
    with progress_lock:
        # Return a copy taken under the lock. Returning the shared dict
        # itself would let the response be built after the lock is released,
        # while a running scan may still be updating it.
        return dict(scan_progress)
@app.post("/documents/upload", dependencies=[Depends(optional_api_key)]) @app.post("/documents/upload", dependencies=[Depends(optional_api_key)])
async def upload_to_input_dir(file: UploadFile = File(...)): async def upload_to_input_dir(file: UploadFile = File(...)):
@@ -1849,7 +1890,7 @@ def create_app(args):
"status": "healthy", "status": "healthy",
"working_directory": str(args.working_dir), "working_directory": str(args.working_dir),
"input_directory": str(args.input_dir), "input_directory": str(args.input_dir),
"indexed_files": files, "indexed_files": [str(f) for f in files],
"indexed_files_count": len(files), "indexed_files_count": len(files),
"configuration": { "configuration": {
# LLM configuration binding/host address (if applicable)/model (if applicable) # LLM configuration binding/host address (if applicable)/model (if applicable)

View File

@@ -98,7 +98,7 @@
</div> </div>
</div> </div>
<script src="/js/lightrag_api.js"></script> <script src="/js/api.js"></script>
</body> </body>
</html> </html>

View File

@@ -49,6 +49,12 @@ const pages = {
</div> </div>
<p class="text-sm text-gray-600 mt-2"><span id="uploadStatus">0</span> files processed</p> <p class="text-sm text-gray-600 mt-2"><span id="uploadStatus">0</span> files processed</p>
</div> </div>
<button id="rescanBtn" class="flex items-center bg-blue-600 text-white px-4 py-2 rounded-lg hover:bg-blue-700 transition-colors">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20" height="20" fill="currentColor" class="mr-2">
<path d="M12 4a8 8 0 1 1-8 8H2.5a9.5 9.5 0 1 0 2.8-6.7L2 3v6h6L5.7 6.7A7.96 7.96 0 0 1 12 4z"/>
</svg>
Rescan Files
</button>
<button id="uploadBtn" class="bg-blue-600 text-white px-4 py-2 rounded-lg hover:bg-blue-700 transition-colors"> <button id="uploadBtn" class="bg-blue-600 text-white px-4 py-2 rounded-lg hover:bg-blue-700 transition-colors">
Upload & Index Files Upload & Index Files
@@ -58,12 +64,6 @@ const pages = {
<h3 class="text-lg font-semibold text-gray-700">Indexed Files</h3> <h3 class="text-lg font-semibold text-gray-700">Indexed Files</h3>
<div class="space-y-2"></div> <div class="space-y-2"></div>
</div> </div>
<button id="rescanBtn" class="flex items-center bg-blue-600 text-white px-4 py-2 rounded-lg hover:bg-blue-700 transition-colors">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20" height="20" fill="currentColor" class="mr-2">
<path d="M12 4a8 8 0 1 1-8 8H2.5a9.5 9.5 0 1 0 2.8-6.7L2 3v6h6L5.7 6.7A7.96 7.96 0 0 1 12 4z"/>
</svg>
Rescan Files
</button>
</div> </div>
@@ -232,22 +232,52 @@ const handlers = {
} }
progress.classList.add('hidden'); progress.classList.add('hidden');
}); });
// Rescan button: asks the server to index new files and mirrors the server's
// progress in the shared upload progress bar while the request is running.
rescanBtn.addEventListener('click', async () => {
    const progress = document.getElementById('uploadProgress');
    const progressBar = progress.querySelector('div');
    const statusText = document.getElementById('uploadStatus');

    let pollInterval = null;
    let scanRequestSettled = false;

    // Stop polling, hide the bar and leave a final message. Safe to call twice.
    const finish = (message) => {
        if (pollInterval !== null) {
            clearInterval(pollInterval);
            pollInterval = null;
        }
        progress.classList.add('hidden');
        statusText.textContent = message;
    };

    // Fetch the scan state once and reflect it in the UI.
    const pollOnce = async () => {
        const progressResponse = await fetch('/documents/scan-progress');
        if (!progressResponse.ok) {
            throw new Error(`Progress request failed (${progressResponse.status})`);
        }
        const progressData = await progressResponse.json();

        // finish() may have run while this request was in flight; do not
        // overwrite the final message with stale progress.
        if (pollInterval === null) {
            return;
        }

        progressBar.style.width = `${progressData.progress}%`;
        if (progressData.total_files > 0) {
            statusText.textContent = `Processing ${progressData.current_file} (${progressData.indexed_count}/${progressData.total_files})`;
        }

        // Only relevant when this click was answered with "already_scanning":
        // keep following the other scan until the server reports it is done.
        if (scanRequestSettled && !progressData.is_scanning) {
            finish('Scan complete!');
        }
    };

    progress.classList.remove('hidden');
    progressBar.style.width = '0%';

    try {
        // Start polling BEFORE awaiting the scan request: the server answers
        // POST /documents/scan only after every file has been indexed, so
        // polling afterwards would never show intermediate progress.
        pollInterval = setInterval(() => {
            // A failed poll is logged and retried on the next tick.
            pollOnce().catch((error) => console.error('Scan progress error:', error));
        }, 1000); // Poll every second

        // NOTE(review): the server route is declared with an API-key
        // dependency, but no key is sent here - confirm whether a header is
        // required when an API key is configured.
        const scanResponse = await fetch('/documents/scan', {
            method: 'POST',
        });

        if (!scanResponse.ok) {
            throw new Error('Scan failed to start');
        }

        const scanResult = await scanResponse.json();
        scanRequestSettled = true;

        if (scanResult.status !== 'already_scanning') {
            // Our own scan ran to completion inside the request.
            finish('Scan complete!');
        }
        // Otherwise another scan is running; pollOnce() calls finish() once
        // the server reports is_scanning === false.
    } catch (error) {
        console.error('Scan error:', error);
        finish('Error during scanning process');
    }
});
updateIndexedFiles(); updateIndexedFiles();
}, },