revert vector and graph use local data(single process)

2025-02-28 01:14:25 +08:00
parent db2a902fcb
commit 291e0c1b14
4 changed files with 287 additions and 443 deletions
--- a/lightrag/kg/faiss_impl.py
+++ b/lightrag/kg/faiss_impl.py
@@ -10,19 +10,12 @@ import pipmaster as pm

 from lightrag.utils import logger, compute_mdhash_id
 from lightrag.base import BaseVectorStorage
-from .shared_storage import (
-    get_namespace_data,
-    get_storage_lock,
-    get_namespace_object,
-    is_multiprocess,
-    try_initialize_namespace,
-)

 if not pm.is_installed("faiss"):
    pm.install("faiss")

 import faiss  # type: ignore
-
+from threading import Lock as ThreadLock

@final
@dataclass
@@ -51,35 +44,29 @@ class FaissVectorDBStorage(BaseVectorStorage):
        self._max_batch_size = self.global_config["embedding_batch_num"]
        # Embedding dimension (e.g. 768) must match your embedding function
        self._dim = self.embedding_func.embedding_dim
-        self._storage_lock = get_storage_lock()
+        self._storage_lock = ThreadLock()

-        # check need_init must before get_namespace_object/get_namespace_data
-        need_init = try_initialize_namespace("faiss_indices")
-        self._index = get_namespace_object("faiss_indices")
-        self._id_to_meta = get_namespace_data("faiss_meta")
+        # Create an empty Faiss index for inner product (useful for normalized vectors = cosine similarity).
+        # If you have a large number of vectors, you might want IVF or other indexes.
+        # For demonstration, we use a simple IndexFlatIP.
+        self._index = faiss.IndexFlatIP(self._dim)
+
+        # Keep a local store for metadata, IDs, etc.
+        # Maps <int faiss_id> → metadata (including your original ID).
+        self._id_to_meta = {}
+
+        # Attempt to load an existing index + metadata from disk
+        with self._storage_lock:
+            self._load_faiss_index()

-        if need_init:
-            if is_multiprocess:
-                # Create an empty Faiss index for inner product (useful for normalized vectors = cosine similarity).
-                # If you have a large number of vectors, you might want IVF or other indexes.
-                # For demonstration, we use a simple IndexFlatIP.
-                self._index.value = faiss.IndexFlatIP(self._dim)
-                # Keep a local store for metadata, IDs, etc.
-                # Maps <int faiss_id> → metadata (including your original ID).
-                self._id_to_meta.update({})
-                # Attempt to load an existing index + metadata from disk
-                self._load_faiss_index()
-            else:
-                self._index = faiss.IndexFlatIP(self._dim)
-                self._id_to_meta.update({})
-                self._load_faiss_index()

    def _get_index(self):
-        """
-        Helper method to get the correct index object based on multiprocess mode.
-        Returns the actual index object that can be used for operations.
-        """
-        return self._index.value if is_multiprocess else self._index
+        """Check if the shtorage should be reloaded"""
+        return self._index
+
+    async def index_done_callback(self) -> None:
+        with self._storage_lock:
+            self._save_faiss_index()

    async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
        """
@@ -134,34 +121,33 @@ class FaissVectorDBStorage(BaseVectorStorage):
        # Normalize embeddings for cosine similarity (in-place)
        faiss.normalize_L2(embeddings)

-        with self._storage_lock:
-            # Upsert logic:
-            # 1. Identify which vectors to remove if they exist
-            # 2. Remove them
-            # 3. Add the new vectors
-            existing_ids_to_remove = []
-            for meta, emb in zip(list_data, embeddings):
-                faiss_internal_id = self._find_faiss_id_by_custom_id(meta["__id__"])
-                if faiss_internal_id is not None:
-                    existing_ids_to_remove.append(faiss_internal_id)
+        # Upsert logic:
+        # 1. Identify which vectors to remove if they exist
+        # 2. Remove them
+        # 3. Add the new vectors
+        existing_ids_to_remove = []
+        for meta, emb in zip(list_data, embeddings):
+            faiss_internal_id = self._find_faiss_id_by_custom_id(meta["__id__"])
+            if faiss_internal_id is not None:
+                existing_ids_to_remove.append(faiss_internal_id)

-            if existing_ids_to_remove:
-                self._remove_faiss_ids(existing_ids_to_remove)
+        if existing_ids_to_remove:
+            self._remove_faiss_ids(existing_ids_to_remove)

-            # Step 2: Add new vectors
-            index = self._get_index()
-            start_idx = index.ntotal
-            index.add(embeddings)
+        # Step 2: Add new vectors
+        index = self._get_index()
+        start_idx = index.ntotal
+        index.add(embeddings)

-            # Step 3: Store metadata + vector for each new ID
-            for i, meta in enumerate(list_data):
-                fid = start_idx + i
-                # Store the raw vector so we can rebuild if something is removed
-                meta["__vector__"] = embeddings[i].tolist()
-                self._id_to_meta.update({fid: meta})
+        # Step 3: Store metadata + vector for each new ID
+        for i, meta in enumerate(list_data):
+            fid = start_idx + i
+            # Store the raw vector so we can rebuild if something is removed
+            meta["__vector__"] = embeddings[i].tolist()
+            self._id_to_meta.update({fid: meta})

-            logger.info(f"Upserted {len(list_data)} vectors into Faiss index.")
-            return [m["__id__"] for m in list_data]
+        logger.info(f"Upserted {len(list_data)} vectors into Faiss index.")
+        return [m["__id__"] for m in list_data]

    async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
        """
@@ -177,57 +163,54 @@ class FaissVectorDBStorage(BaseVectorStorage):
        )

        # Perform the similarity search
-        with self._storage_lock:
-            distances, indices = self._get_index().search(embedding, top_k)
+        distances, indices = self._get_index().search(embedding, top_k)

-            distances = distances[0]
-            indices = indices[0]
+        distances = distances[0]
+        indices = indices[0]

-            results = []
-            for dist, idx in zip(distances, indices):
-                if idx == -1:
-                    # Faiss returns -1 if no neighbor
-                    continue
+        results = []
+        for dist, idx in zip(distances, indices):
+            if idx == -1:
+                # Faiss returns -1 if no neighbor
+                continue

-                # Cosine similarity threshold
-                if dist < self.cosine_better_than_threshold:
-                    continue
+            # Cosine similarity threshold
+            if dist < self.cosine_better_than_threshold:
+                continue

-                meta = self._id_to_meta.get(idx, {})
-                results.append(
-                    {
-                        **meta,
-                        "id": meta.get("__id__"),
-                        "distance": float(dist),
-                        "created_at": meta.get("__created_at__"),
-                    }
-                )
+            meta = self._id_to_meta.get(idx, {})
+            results.append(
+                {
+                    **meta,
+                    "id": meta.get("__id__"),
+                    "distance": float(dist),
+                    "created_at": meta.get("__created_at__"),
+                }
+            )

-            return results
+        return results

    @property
    def client_storage(self):
        # Return whatever structure LightRAG might need for debugging
-        with self._storage_lock:
-            return {"data": list(self._id_to_meta.values())}
+        return {"data": list(self._id_to_meta.values())}

    async def delete(self, ids: list[str]):
        """
        Delete vectors for the provided custom IDs.
        """
        logger.info(f"Deleting {len(ids)} vectors from {self.namespace}")
-        with self._storage_lock:
-            to_remove = []
-            for cid in ids:
-                fid = self._find_faiss_id_by_custom_id(cid)
-                if fid is not None:
-                    to_remove.append(fid)
+        to_remove = []
+        for cid in ids:
+            fid = self._find_faiss_id_by_custom_id(cid)
+            if fid is not None:
+                to_remove.append(fid)

-            if to_remove:
-                self._remove_faiss_ids(to_remove)
-            logger.debug(
-                f"Successfully deleted {len(to_remove)} vectors from {self.namespace}"
-            )
+        if to_remove:
+            self._remove_faiss_ids(to_remove)
+        logger.debug(
+            f"Successfully deleted {len(to_remove)} vectors from {self.namespace}"
+        )

    async def delete_entity(self, entity_name: str) -> None:
        entity_id = compute_mdhash_id(entity_name, prefix="ent-")
@@ -239,23 +222,18 @@ class FaissVectorDBStorage(BaseVectorStorage):
        Delete relations for a given entity by scanning metadata.
        """
        logger.debug(f"Searching relations for entity {entity_name}")
-        with self._storage_lock:
-            relations = []
-            for fid, meta in self._id_to_meta.items():
-                if (
-                    meta.get("src_id") == entity_name
-                    or meta.get("tgt_id") == entity_name
-                ):
-                    relations.append(fid)
+        relations = []
+        for fid, meta in self._id_to_meta.items():
+            if (
+                meta.get("src_id") == entity_name
+                or meta.get("tgt_id") == entity_name
+            ):
+                relations.append(fid)

-            logger.debug(f"Found {len(relations)} relations for {entity_name}")
-            if relations:
-                self._remove_faiss_ids(relations)
-                logger.debug(f"Deleted {len(relations)} relations for {entity_name}")
-
-    async def index_done_callback(self) -> None:
-        with self._storage_lock:
-            self._save_faiss_index()
+        logger.debug(f"Found {len(relations)} relations for {entity_name}")
+        if relations:
+            self._remove_faiss_ids(relations)
+            logger.debug(f"Deleted {len(relations)} relations for {entity_name}")

    # --------------------------------------------------------------------------------
    # Internal helper methods
@@ -265,11 +243,10 @@ class FaissVectorDBStorage(BaseVectorStorage):
        """
        Return the Faiss internal ID for a given custom ID, or None if not found.
        """
-        with self._storage_lock:
-            for fid, meta in self._id_to_meta.items():
-                if meta.get("__id__") == custom_id:
-                    return fid
-            return None
+        for fid, meta in self._id_to_meta.items():
+            if meta.get("__id__") == custom_id:
+                return fid
+        return None

    def _remove_faiss_ids(self, fid_list):
        """
@@ -277,48 +254,42 @@ class FaissVectorDBStorage(BaseVectorStorage):
        Because IndexFlatIP doesn't support 'removals',
        we rebuild the index excluding those vectors.
        """
+        keep_fids = [fid for fid in self._id_to_meta if fid not in fid_list]
+
+        # Rebuild the index
+        vectors_to_keep = []
+        new_id_to_meta = {}
+        for new_fid, old_fid in enumerate(keep_fids):
+            vec_meta = self._id_to_meta[old_fid]
+            vectors_to_keep.append(vec_meta["__vector__"])  # stored as list
+            new_id_to_meta[new_fid] = vec_meta
+
        with self._storage_lock:
-            keep_fids = [fid for fid in self._id_to_meta if fid not in fid_list]
-
-            # Rebuild the index
-            vectors_to_keep = []
-            new_id_to_meta = {}
-            for new_fid, old_fid in enumerate(keep_fids):
-                vec_meta = self._id_to_meta[old_fid]
-                vectors_to_keep.append(vec_meta["__vector__"])  # stored as list
-                new_id_to_meta[new_fid] = vec_meta
-
-            # Re-init index
-            new_index = faiss.IndexFlatIP(self._dim)
+        # Re-init index
+            self._index = faiss.IndexFlatIP(self._dim)
            if vectors_to_keep:
                arr = np.array(vectors_to_keep, dtype=np.float32)
-                new_index.add(arr)
-            if is_multiprocess:
-                self._index.value = new_index
-            else:
-                self._index = new_index
+                self._index.add(arr)
+
+            self._id_to_meta = new_id_to_meta

-            self._id_to_meta.update(new_id_to_meta)

    def _save_faiss_index(self):
        """
        Save the current Faiss index + metadata to disk so it can persist across runs.
        """
-        with self._storage_lock:
-            faiss.write_index(
-                self._get_index(),
-                self._faiss_index_file,
-            )
+        faiss.write_index(self._index, self._faiss_index_file)

-            # Save metadata dict to JSON. Convert all keys to strings for JSON storage.
-            # _id_to_meta is { int: { '__id__': doc_id, '__vector__': [float,...], ... } }
-            # We'll keep the int -> dict, but JSON requires string keys.
-            serializable_dict = {}
-            for fid, meta in self._id_to_meta.items():
-                serializable_dict[str(fid)] = meta
+        # Save metadata dict to JSON. Convert all keys to strings for JSON storage.
+        # _id_to_meta is { int: { '__id__': doc_id, '__vector__': [float,...], ... } }
+        # We'll keep the int -> dict, but JSON requires string keys.
+        serializable_dict = {}
+        for fid, meta in self._id_to_meta.items():
+            serializable_dict[str(fid)] = meta
+
+        with open(self._meta_file, "w", encoding="utf-8") as f:
+            json.dump(serializable_dict, f)

-            with open(self._meta_file, "w", encoding="utf-8") as f:
-                json.dump(serializable_dict, f)

    def _load_faiss_index(self):
        """
@@ -331,31 +302,22 @@ class FaissVectorDBStorage(BaseVectorStorage):

        try:
            # Load the Faiss index
-            loaded_index = faiss.read_index(self._faiss_index_file)
-            if is_multiprocess:
-                self._index.value = loaded_index
-            else:
-                self._index = loaded_index
-
+            self._index = faiss.read_index(self._faiss_index_file)
            # Load metadata
            with open(self._meta_file, "r", encoding="utf-8") as f:
                stored_dict = json.load(f)

            # Convert string keys back to int
-            self._id_to_meta.update({})
+            self._id_to_meta = {}
            for fid_str, meta in stored_dict.items():
                fid = int(fid_str)
                self._id_to_meta[fid] = meta

            logger.info(
-                f"Faiss index loaded with {loaded_index.ntotal} vectors from {self._faiss_index_file}"
+                f"Faiss index loaded with {self._index.ntotal} vectors from {self._faiss_index_file}"
            )
        except Exception as e:
            logger.error(f"Failed to load Faiss index or metadata: {e}")
            logger.warning("Starting with an empty Faiss index.")
-            new_index = faiss.IndexFlatIP(self._dim)
-            if is_multiprocess:
-                self._index.value = new_index
-            else:
-                self._index = new_index
-            self._id_to_meta.update({})
+            self._index = faiss.IndexFlatIP(self._dim)
+            self._id_to_meta = {}