revert vector and graph use local data(single process)
This commit is contained in:
@@ -10,19 +10,12 @@ import pipmaster as pm
|
||||
|
||||
from lightrag.utils import logger, compute_mdhash_id
|
||||
from lightrag.base import BaseVectorStorage
|
||||
from .shared_storage import (
|
||||
get_namespace_data,
|
||||
get_storage_lock,
|
||||
get_namespace_object,
|
||||
is_multiprocess,
|
||||
try_initialize_namespace,
|
||||
)
|
||||
|
||||
if not pm.is_installed("faiss"):
|
||||
pm.install("faiss")
|
||||
|
||||
import faiss # type: ignore
|
||||
|
||||
from threading import Lock as ThreadLock
|
||||
|
||||
@final
|
||||
@dataclass
|
||||
@@ -51,35 +44,29 @@ class FaissVectorDBStorage(BaseVectorStorage):
|
||||
self._max_batch_size = self.global_config["embedding_batch_num"]
|
||||
# Embedding dimension (e.g. 768) must match your embedding function
|
||||
self._dim = self.embedding_func.embedding_dim
|
||||
self._storage_lock = get_storage_lock()
|
||||
self._storage_lock = ThreadLock()
|
||||
|
||||
# check need_init must before get_namespace_object/get_namespace_data
|
||||
need_init = try_initialize_namespace("faiss_indices")
|
||||
self._index = get_namespace_object("faiss_indices")
|
||||
self._id_to_meta = get_namespace_data("faiss_meta")
|
||||
# Create an empty Faiss index for inner product (useful for normalized vectors = cosine similarity).
|
||||
# If you have a large number of vectors, you might want IVF or other indexes.
|
||||
# For demonstration, we use a simple IndexFlatIP.
|
||||
self._index = faiss.IndexFlatIP(self._dim)
|
||||
|
||||
# Keep a local store for metadata, IDs, etc.
|
||||
# Maps <int faiss_id> → metadata (including your original ID).
|
||||
self._id_to_meta = {}
|
||||
|
||||
# Attempt to load an existing index + metadata from disk
|
||||
with self._storage_lock:
|
||||
self._load_faiss_index()
|
||||
|
||||
if need_init:
|
||||
if is_multiprocess:
|
||||
# Create an empty Faiss index for inner product (useful for normalized vectors = cosine similarity).
|
||||
# If you have a large number of vectors, you might want IVF or other indexes.
|
||||
# For demonstration, we use a simple IndexFlatIP.
|
||||
self._index.value = faiss.IndexFlatIP(self._dim)
|
||||
# Keep a local store for metadata, IDs, etc.
|
||||
# Maps <int faiss_id> → metadata (including your original ID).
|
||||
self._id_to_meta.update({})
|
||||
# Attempt to load an existing index + metadata from disk
|
||||
self._load_faiss_index()
|
||||
else:
|
||||
self._index = faiss.IndexFlatIP(self._dim)
|
||||
self._id_to_meta.update({})
|
||||
self._load_faiss_index()
|
||||
|
||||
def _get_index(self):
|
||||
"""
|
||||
Helper method to get the correct index object based on multiprocess mode.
|
||||
Returns the actual index object that can be used for operations.
|
||||
"""
|
||||
return self._index.value if is_multiprocess else self._index
|
||||
"""Check if the shtorage should be reloaded"""
|
||||
return self._index
|
||||
|
||||
async def index_done_callback(self) -> None:
|
||||
with self._storage_lock:
|
||||
self._save_faiss_index()
|
||||
|
||||
async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
|
||||
"""
|
||||
@@ -134,34 +121,33 @@ class FaissVectorDBStorage(BaseVectorStorage):
|
||||
# Normalize embeddings for cosine similarity (in-place)
|
||||
faiss.normalize_L2(embeddings)
|
||||
|
||||
with self._storage_lock:
|
||||
# Upsert logic:
|
||||
# 1. Identify which vectors to remove if they exist
|
||||
# 2. Remove them
|
||||
# 3. Add the new vectors
|
||||
existing_ids_to_remove = []
|
||||
for meta, emb in zip(list_data, embeddings):
|
||||
faiss_internal_id = self._find_faiss_id_by_custom_id(meta["__id__"])
|
||||
if faiss_internal_id is not None:
|
||||
existing_ids_to_remove.append(faiss_internal_id)
|
||||
# Upsert logic:
|
||||
# 1. Identify which vectors to remove if they exist
|
||||
# 2. Remove them
|
||||
# 3. Add the new vectors
|
||||
existing_ids_to_remove = []
|
||||
for meta, emb in zip(list_data, embeddings):
|
||||
faiss_internal_id = self._find_faiss_id_by_custom_id(meta["__id__"])
|
||||
if faiss_internal_id is not None:
|
||||
existing_ids_to_remove.append(faiss_internal_id)
|
||||
|
||||
if existing_ids_to_remove:
|
||||
self._remove_faiss_ids(existing_ids_to_remove)
|
||||
if existing_ids_to_remove:
|
||||
self._remove_faiss_ids(existing_ids_to_remove)
|
||||
|
||||
# Step 2: Add new vectors
|
||||
index = self._get_index()
|
||||
start_idx = index.ntotal
|
||||
index.add(embeddings)
|
||||
# Step 2: Add new vectors
|
||||
index = self._get_index()
|
||||
start_idx = index.ntotal
|
||||
index.add(embeddings)
|
||||
|
||||
# Step 3: Store metadata + vector for each new ID
|
||||
for i, meta in enumerate(list_data):
|
||||
fid = start_idx + i
|
||||
# Store the raw vector so we can rebuild if something is removed
|
||||
meta["__vector__"] = embeddings[i].tolist()
|
||||
self._id_to_meta.update({fid: meta})
|
||||
# Step 3: Store metadata + vector for each new ID
|
||||
for i, meta in enumerate(list_data):
|
||||
fid = start_idx + i
|
||||
# Store the raw vector so we can rebuild if something is removed
|
||||
meta["__vector__"] = embeddings[i].tolist()
|
||||
self._id_to_meta.update({fid: meta})
|
||||
|
||||
logger.info(f"Upserted {len(list_data)} vectors into Faiss index.")
|
||||
return [m["__id__"] for m in list_data]
|
||||
logger.info(f"Upserted {len(list_data)} vectors into Faiss index.")
|
||||
return [m["__id__"] for m in list_data]
|
||||
|
||||
async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
|
||||
"""
|
||||
@@ -177,57 +163,54 @@ class FaissVectorDBStorage(BaseVectorStorage):
|
||||
)
|
||||
|
||||
# Perform the similarity search
|
||||
with self._storage_lock:
|
||||
distances, indices = self._get_index().search(embedding, top_k)
|
||||
distances, indices = self._get_index().search(embedding, top_k)
|
||||
|
||||
distances = distances[0]
|
||||
indices = indices[0]
|
||||
distances = distances[0]
|
||||
indices = indices[0]
|
||||
|
||||
results = []
|
||||
for dist, idx in zip(distances, indices):
|
||||
if idx == -1:
|
||||
# Faiss returns -1 if no neighbor
|
||||
continue
|
||||
results = []
|
||||
for dist, idx in zip(distances, indices):
|
||||
if idx == -1:
|
||||
# Faiss returns -1 if no neighbor
|
||||
continue
|
||||
|
||||
# Cosine similarity threshold
|
||||
if dist < self.cosine_better_than_threshold:
|
||||
continue
|
||||
# Cosine similarity threshold
|
||||
if dist < self.cosine_better_than_threshold:
|
||||
continue
|
||||
|
||||
meta = self._id_to_meta.get(idx, {})
|
||||
results.append(
|
||||
{
|
||||
**meta,
|
||||
"id": meta.get("__id__"),
|
||||
"distance": float(dist),
|
||||
"created_at": meta.get("__created_at__"),
|
||||
}
|
||||
)
|
||||
meta = self._id_to_meta.get(idx, {})
|
||||
results.append(
|
||||
{
|
||||
**meta,
|
||||
"id": meta.get("__id__"),
|
||||
"distance": float(dist),
|
||||
"created_at": meta.get("__created_at__"),
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
return results
|
||||
|
||||
@property
|
||||
def client_storage(self):
|
||||
# Return whatever structure LightRAG might need for debugging
|
||||
with self._storage_lock:
|
||||
return {"data": list(self._id_to_meta.values())}
|
||||
return {"data": list(self._id_to_meta.values())}
|
||||
|
||||
async def delete(self, ids: list[str]):
|
||||
"""
|
||||
Delete vectors for the provided custom IDs.
|
||||
"""
|
||||
logger.info(f"Deleting {len(ids)} vectors from {self.namespace}")
|
||||
with self._storage_lock:
|
||||
to_remove = []
|
||||
for cid in ids:
|
||||
fid = self._find_faiss_id_by_custom_id(cid)
|
||||
if fid is not None:
|
||||
to_remove.append(fid)
|
||||
to_remove = []
|
||||
for cid in ids:
|
||||
fid = self._find_faiss_id_by_custom_id(cid)
|
||||
if fid is not None:
|
||||
to_remove.append(fid)
|
||||
|
||||
if to_remove:
|
||||
self._remove_faiss_ids(to_remove)
|
||||
logger.debug(
|
||||
f"Successfully deleted {len(to_remove)} vectors from {self.namespace}"
|
||||
)
|
||||
if to_remove:
|
||||
self._remove_faiss_ids(to_remove)
|
||||
logger.debug(
|
||||
f"Successfully deleted {len(to_remove)} vectors from {self.namespace}"
|
||||
)
|
||||
|
||||
async def delete_entity(self, entity_name: str) -> None:
|
||||
entity_id = compute_mdhash_id(entity_name, prefix="ent-")
|
||||
@@ -239,23 +222,18 @@ class FaissVectorDBStorage(BaseVectorStorage):
|
||||
Delete relations for a given entity by scanning metadata.
|
||||
"""
|
||||
logger.debug(f"Searching relations for entity {entity_name}")
|
||||
with self._storage_lock:
|
||||
relations = []
|
||||
for fid, meta in self._id_to_meta.items():
|
||||
if (
|
||||
meta.get("src_id") == entity_name
|
||||
or meta.get("tgt_id") == entity_name
|
||||
):
|
||||
relations.append(fid)
|
||||
relations = []
|
||||
for fid, meta in self._id_to_meta.items():
|
||||
if (
|
||||
meta.get("src_id") == entity_name
|
||||
or meta.get("tgt_id") == entity_name
|
||||
):
|
||||
relations.append(fid)
|
||||
|
||||
logger.debug(f"Found {len(relations)} relations for {entity_name}")
|
||||
if relations:
|
||||
self._remove_faiss_ids(relations)
|
||||
logger.debug(f"Deleted {len(relations)} relations for {entity_name}")
|
||||
|
||||
async def index_done_callback(self) -> None:
|
||||
with self._storage_lock:
|
||||
self._save_faiss_index()
|
||||
logger.debug(f"Found {len(relations)} relations for {entity_name}")
|
||||
if relations:
|
||||
self._remove_faiss_ids(relations)
|
||||
logger.debug(f"Deleted {len(relations)} relations for {entity_name}")
|
||||
|
||||
# --------------------------------------------------------------------------------
|
||||
# Internal helper methods
|
||||
@@ -265,11 +243,10 @@ class FaissVectorDBStorage(BaseVectorStorage):
|
||||
"""
|
||||
Return the Faiss internal ID for a given custom ID, or None if not found.
|
||||
"""
|
||||
with self._storage_lock:
|
||||
for fid, meta in self._id_to_meta.items():
|
||||
if meta.get("__id__") == custom_id:
|
||||
return fid
|
||||
return None
|
||||
for fid, meta in self._id_to_meta.items():
|
||||
if meta.get("__id__") == custom_id:
|
||||
return fid
|
||||
return None
|
||||
|
||||
def _remove_faiss_ids(self, fid_list):
|
||||
"""
|
||||
@@ -277,48 +254,42 @@ class FaissVectorDBStorage(BaseVectorStorage):
|
||||
Because IndexFlatIP doesn't support 'removals',
|
||||
we rebuild the index excluding those vectors.
|
||||
"""
|
||||
keep_fids = [fid for fid in self._id_to_meta if fid not in fid_list]
|
||||
|
||||
# Rebuild the index
|
||||
vectors_to_keep = []
|
||||
new_id_to_meta = {}
|
||||
for new_fid, old_fid in enumerate(keep_fids):
|
||||
vec_meta = self._id_to_meta[old_fid]
|
||||
vectors_to_keep.append(vec_meta["__vector__"]) # stored as list
|
||||
new_id_to_meta[new_fid] = vec_meta
|
||||
|
||||
with self._storage_lock:
|
||||
keep_fids = [fid for fid in self._id_to_meta if fid not in fid_list]
|
||||
|
||||
# Rebuild the index
|
||||
vectors_to_keep = []
|
||||
new_id_to_meta = {}
|
||||
for new_fid, old_fid in enumerate(keep_fids):
|
||||
vec_meta = self._id_to_meta[old_fid]
|
||||
vectors_to_keep.append(vec_meta["__vector__"]) # stored as list
|
||||
new_id_to_meta[new_fid] = vec_meta
|
||||
|
||||
# Re-init index
|
||||
new_index = faiss.IndexFlatIP(self._dim)
|
||||
# Re-init index
|
||||
self._index = faiss.IndexFlatIP(self._dim)
|
||||
if vectors_to_keep:
|
||||
arr = np.array(vectors_to_keep, dtype=np.float32)
|
||||
new_index.add(arr)
|
||||
if is_multiprocess:
|
||||
self._index.value = new_index
|
||||
else:
|
||||
self._index = new_index
|
||||
self._index.add(arr)
|
||||
|
||||
self._id_to_meta = new_id_to_meta
|
||||
|
||||
self._id_to_meta.update(new_id_to_meta)
|
||||
|
||||
def _save_faiss_index(self):
|
||||
"""
|
||||
Save the current Faiss index + metadata to disk so it can persist across runs.
|
||||
"""
|
||||
with self._storage_lock:
|
||||
faiss.write_index(
|
||||
self._get_index(),
|
||||
self._faiss_index_file,
|
||||
)
|
||||
faiss.write_index(self._index, self._faiss_index_file)
|
||||
|
||||
# Save metadata dict to JSON. Convert all keys to strings for JSON storage.
|
||||
# _id_to_meta is { int: { '__id__': doc_id, '__vector__': [float,...], ... } }
|
||||
# We'll keep the int -> dict, but JSON requires string keys.
|
||||
serializable_dict = {}
|
||||
for fid, meta in self._id_to_meta.items():
|
||||
serializable_dict[str(fid)] = meta
|
||||
# Save metadata dict to JSON. Convert all keys to strings for JSON storage.
|
||||
# _id_to_meta is { int: { '__id__': doc_id, '__vector__': [float,...], ... } }
|
||||
# We'll keep the int -> dict, but JSON requires string keys.
|
||||
serializable_dict = {}
|
||||
for fid, meta in self._id_to_meta.items():
|
||||
serializable_dict[str(fid)] = meta
|
||||
|
||||
with open(self._meta_file, "w", encoding="utf-8") as f:
|
||||
json.dump(serializable_dict, f)
|
||||
|
||||
with open(self._meta_file, "w", encoding="utf-8") as f:
|
||||
json.dump(serializable_dict, f)
|
||||
|
||||
def _load_faiss_index(self):
|
||||
"""
|
||||
@@ -331,31 +302,22 @@ class FaissVectorDBStorage(BaseVectorStorage):
|
||||
|
||||
try:
|
||||
# Load the Faiss index
|
||||
loaded_index = faiss.read_index(self._faiss_index_file)
|
||||
if is_multiprocess:
|
||||
self._index.value = loaded_index
|
||||
else:
|
||||
self._index = loaded_index
|
||||
|
||||
self._index = faiss.read_index(self._faiss_index_file)
|
||||
# Load metadata
|
||||
with open(self._meta_file, "r", encoding="utf-8") as f:
|
||||
stored_dict = json.load(f)
|
||||
|
||||
# Convert string keys back to int
|
||||
self._id_to_meta.update({})
|
||||
self._id_to_meta = {}
|
||||
for fid_str, meta in stored_dict.items():
|
||||
fid = int(fid_str)
|
||||
self._id_to_meta[fid] = meta
|
||||
|
||||
logger.info(
|
||||
f"Faiss index loaded with {loaded_index.ntotal} vectors from {self._faiss_index_file}"
|
||||
f"Faiss index loaded with {self._index.ntotal} vectors from {self._faiss_index_file}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load Faiss index or metadata: {e}")
|
||||
logger.warning("Starting with an empty Faiss index.")
|
||||
new_index = faiss.IndexFlatIP(self._dim)
|
||||
if is_multiprocess:
|
||||
self._index.value = new_index
|
||||
else:
|
||||
self._index = new_index
|
||||
self._id_to_meta.update({})
|
||||
self._index = faiss.IndexFlatIP(self._dim)
|
||||
self._id_to_meta = {}
|
||||
|
Reference in New Issue
Block a user