Merge pull request #766 from danielaskdd/background-scan-at-startup

Move document scanning trigger by command line to background task
This commit is contained in:
zrguo
2025-02-14 09:41:20 +08:00
committed by GitHub

View File

@@ -6,7 +6,7 @@ from fastapi import (
Form, Form,
BackgroundTasks, BackgroundTasks,
) )
import asyncio
import threading import threading
import os import os
import json import json
@@ -730,6 +730,8 @@ def create_app(args):
postgres_db = None postgres_db = None
oracle_db = None oracle_db = None
tidb_db = None tidb_db = None
# Store background tasks
app.state.background_tasks = set()
try: try:
# Check if PostgreSQL is needed # Check if PostgreSQL is needed
@@ -794,20 +796,23 @@ def create_app(args):
# Auto scan documents if enabled # Auto scan documents if enabled
if args.auto_scan_at_startup: if args.auto_scan_at_startup:
try: # Start scanning in background
new_files = doc_manager.scan_directory_for_new_files() with progress_lock:
for file_path in new_files: if not scan_progress["is_scanning"]:
try: scan_progress["is_scanning"] = True
await index_file(file_path) scan_progress["indexed_count"] = 0
except Exception as e: scan_progress["progress"] = 0
trace_exception(e) # Create background task
logging.error(f"Error indexing file {file_path}: {str(e)}") task = asyncio.create_task(run_scanning_process())
app.state.background_tasks.add(task)
ASCIIColors.info( task.add_done_callback(app.state.background_tasks.discard)
f"Indexed {len(new_files)} documents from {args.input_dir}" ASCIIColors.info(
) f"Started background scanning of documents from {args.input_dir}"
except Exception as e: )
logging.error(f"Error during startup indexing: {str(e)}") else:
ASCIIColors.info(
"Skip document scanning(anohter scanning is active)"
)
yield yield
@@ -1145,9 +1150,15 @@ def create_app(args):
pm.install("docling") pm.install("docling")
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
converter = DocumentConverter() async def convert_doc():
result = converter.convert(file_path) def sync_convert():
content = result.document.export_to_markdown() converter = DocumentConverter()
result = converter.convert(file_path)
return result.document.export_to_markdown()
return await asyncio.to_thread(sync_convert)
content = await convert_doc()
case _: case _:
raise ValueError(f"Unsupported file format: {ext}") raise ValueError(f"Unsupported file format: {ext}")
@@ -1439,9 +1450,16 @@ def create_app(args):
f.write(await file.read()) f.write(await file.read())
try: try:
converter = DocumentConverter()
result = converter.convert(str(temp_path)) async def convert_doc():
content = result.document.export_to_markdown() def sync_convert():
converter = DocumentConverter()
result = converter.convert(str(temp_path))
return result.document.export_to_markdown()
return await asyncio.to_thread(sync_convert)
content = await convert_doc()
finally: finally:
# Clean up the temporary file # Clean up the temporary file
temp_path.unlink() temp_path.unlink()