Refactor storage initialization to avoid redundant initial data loads across processes; show init logs only on first load

yangdx
2025-02-26 12:28:49 +08:00
parent 15a6a9cf7c
commit 2c019dbc7b
4 changed files with 15 additions and 14 deletions
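
Every file below applies the same load-once pattern: check a shared container under the storage lock, and populate it (and log) only when this process is the first one in. A minimal standalone sketch of the idea, with illustrative names rather than the repo's API:

    from multiprocessing import Manager

    def init_namespace_once(shared, lock, load_fn) -> None:
        # The check-and-fill must happen under the lock, otherwise two
        # processes can both observe the dict as empty and both load.
        with lock:
            if not shared:  # truthy only after the first loader fills it
                shared.update(load_fn())
                print(f"Loaded namespace with {len(shared)} records")  # first loader logs

    if __name__ == "__main__":
        manager = Manager()
        shared, lock = manager.dict(), manager.Lock()
        init_namespace_once(shared, lock, lambda: {"doc-1": {"status": "pending"}})
        init_namespace_once(shared, lock, lambda: {"doc-1": {"status": "pending"}})  # second call: no-op, no log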


@@ -57,16 +57,16 @@ class FaissVectorDBStorage(BaseVectorStorage):
                     # If you have a large number of vectors, you might want IVF or other indexes.
                     # For demonstration, we use a simple IndexFlatIP.
                     self._index.value = faiss.IndexFlatIP(self._dim)
-            else:
-                if self._index is None:
-                    self._index = faiss.IndexFlatIP(self._dim)
-            # Keep a local store for metadata, IDs, etc.
-            # Maps <int faiss_id> → metadata (including your original ID).
-            self._id_to_meta.update({})
-            # Attempt to load an existing index + metadata from disk
-            self._load_faiss_index()
+                    # Keep a local store for metadata, IDs, etc.
+                    # Maps <int faiss_id> → metadata (including your original ID).
+                    self._id_to_meta.update({})
+                    # Attempt to load an existing index + metadata from disk
+                    self._load_faiss_index()
+            else:
+                if self._index is None:
+                    self._index = faiss.IndexFlatIP(self._dim)
+                    self._id_to_meta.update({})
+                    self._load_faiss_index()
 
     async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
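
Net effect in the Faiss backend: index construction, the metadata reset, and the disk load all sit behind the "index is None" guard, so only the first initializer pays for them. A toy single-process version of the same lazy pattern (assumes faiss-cpu and numpy are installed; _load is a stand-in for _load_faiss_index, not the repo's method):

    import faiss
    import numpy as np

    class LazyFaiss:
        def __init__(self, dim: int):
            self._dim = dim
            self._index = None  # created on first use, mirroring the commit's guard

        def index(self) -> faiss.IndexFlatIP:
            if self._index is None:
                self._index = faiss.IndexFlatIP(self._dim)
                self._load()  # stand-in: the real code restores index + metadata from disk
            return self._index

        def _load(self) -> None:
            pass  # hypothetical restore step

    store = LazyFaiss(dim=4)
    store.index().add(np.ones((1, 4), dtype="float32"))  # first call builds the index
    assert store.index().ntotal == 1  # second call reuses it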


@@ -26,6 +26,7 @@ class JsonDocStatusStorage(DocStatusStorage):
         self._storage_lock = get_storage_lock()
         self._data = get_namespace_data(self.namespace)
         with self._storage_lock:
-            self._data.update(load_json(self._file_name) or {})
-            logger.info(f"Loaded document status storage with {len(self._data)} records")
+            if not self._data:
+                self._data.update(load_json(self._file_name) or {})
+                logger.info(f"Loaded document status storage with {len(self._data)} records")


@@ -18,11 +18,11 @@ from .shared_storage import get_namespace_data, get_storage_lock
 class JsonKVStorage(BaseKVStorage):
     def __post_init__(self):
         working_dir = self.global_config["working_dir"]
+        self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
         self._storage_lock = get_storage_lock()
         self._data = get_namespace_data(self.namespace)
         with self._storage_lock:
             if not self._data:
-                self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
                 self._data: dict[str, Any] = load_json(self._file_name) or {}
                 logger.info(f"Load KV {self.namespace} with {len(self._data)} data")


@@ -48,12 +48,12 @@ class NanoVectorDBStorage(BaseVectorStorage):
                     self._client.value = NanoVectorDB(
                         self.embedding_func.embedding_dim, storage_file=self._client_file_name
                     )
+                    logger.info(f"Initialized vector DB client for namespace {self.namespace}")
             else:
                 if self._client is None:
                     self._client = NanoVectorDB(
                         self.embedding_func.embedding_dim, storage_file=self._client_file_name
                     )
-            logger.info(f"Initialized vector DB client for namespace {self.namespace}")
+                    logger.info(f"Initialized vector DB client for namespace {self.namespace}")
 
     def _get_client(self):
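
Same idea applied to the log line: moving it inside the creation branches means "Initialized vector DB client" is emitted once per namespace instead of once per process. A minimal log-once sketch with illustrative names:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("storage")

    _client = None

    def get_or_create_client():
        global _client
        if _client is None:
            _client = object()  # stand-in for NanoVectorDB(embedding_dim, storage_file=...)
            logger.info("Initialized vector DB client")  # logged by the creator only
        return _client

    get_or_create_client()  # logs
    get_or_create_client()  # silent reuse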