Merge branch 'main' into loginPage

2025-03-11 23:57:03 +08:00
parent 6b22e8065b 2ffd7f9111
commit e5214f1a70
39 changed files with 1702 additions and 531 deletions
--- a/lightrag/base.py
+++ b/lightrag/base.py
@@ -81,6 +81,9 @@ class QueryParam:
    history_turns: int = 3
    """Number of complete conversation turns (user-assistant pairs) to consider in the response context."""

+    ids: list[str] | None = None
+    """List of ids to filter the results."""
+

@dataclass
 class StorageNameSpace(ABC):
@@ -107,7 +110,9 @@ class BaseVectorStorage(StorageNameSpace, ABC):
    meta_fields: set[str] = field(default_factory=set)

    @abstractmethod
-    async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
+    async def query(
+        self, query: str, top_k: int, ids: list[str] | None = None
+    ) -> list[dict[str, Any]]:
        """Query the vector storage and retrieve top_k results."""

    @abstractmethod
@@ -122,6 +127,30 @@ class BaseVectorStorage(StorageNameSpace, ABC):
    async def delete_entity_relation(self, entity_name: str) -> None:
        """Delete relations for a given entity."""

+    @abstractmethod
+    async def get_by_id(self, id: str) -> dict[str, Any] | None:
+        """Get vector data by its ID
+
+        Args:
+            id: The unique identifier of the vector
+
+        Returns:
+            The vector data if found, or None if not found
+        """
+        pass
+
+    @abstractmethod
+    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
+        """Get multiple vector data by their IDs
+
+        Args:
+            ids: List of unique identifiers
+
+        Returns:
+            List of vector data objects that were found
+        """
+        pass
+

@dataclass
 class BaseKVStorage(StorageNameSpace, ABC):
--- a/lightrag/kg/chroma_impl.py
+++ b/lightrag/kg/chroma_impl.py
@@ -269,3 +269,67 @@ class ChromaVectorDBStorage(BaseVectorStorage):
        except Exception as e:
            logger.error(f"Error during prefix search in ChromaDB: {str(e)}")
            raise
+
+    async def get_by_id(self, id: str) -> dict[str, Any] | None:
+        """Get vector data by its ID
+
+        Args:
+            id: The unique identifier of the vector
+
+        Returns:
+            The vector data if found, or None if not found
+        """
+        try:
+            # Query the collection for a single vector by ID
+            result = self._collection.get(
+                ids=[id], include=["metadatas", "embeddings", "documents"]
+            )
+
+            if not result or not result["ids"] or len(result["ids"]) == 0:
+                return None
+
+            # Format the result to match the expected structure
+            return {
+                "id": result["ids"][0],
+                "vector": result["embeddings"][0],
+                "content": result["documents"][0],
+                **result["metadatas"][0],
+            }
+        except Exception as e:
+            logger.error(f"Error retrieving vector data for ID {id}: {e}")
+            return None
+
+    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
+        """Get multiple vector data by their IDs
+
+        Args:
+            ids: List of unique identifiers
+
+        Returns:
+            List of vector data objects that were found
+        """
+        if not ids:
+            return []
+
+        try:
+            # Query the collection for multiple vectors by IDs
+            result = self._collection.get(
+                ids=ids, include=["metadatas", "embeddings", "documents"]
+            )
+
+            if not result or not result["ids"] or len(result["ids"]) == 0:
+                return []
+
+            # Format the results to match the expected structure
+            return [
+                {
+                    "id": result["ids"][i],
+                    "vector": result["embeddings"][i],
+                    "content": result["documents"][i],
+                    **result["metadatas"][i],
+                }
+                for i in range(len(result["ids"]))
+            ]
+        except Exception as e:
+            logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
+            return []
--- a/lightrag/kg/faiss_impl.py
+++ b/lightrag/kg/faiss_impl.py
@@ -392,3 +392,46 @@ class FaissVectorDBStorage(BaseVectorStorage):

        logger.debug(f"Found {len(matching_records)} records with prefix '{prefix}'")
        return matching_records
+
+    async def get_by_id(self, id: str) -> dict[str, Any] | None:
+        """Get vector data by its ID
+
+        Args:
+            id: The unique identifier of the vector
+
+        Returns:
+            The vector data if found, or None if not found
+        """
+        # Find the Faiss internal ID for the custom ID
+        fid = self._find_faiss_id_by_custom_id(id)
+        if fid is None:
+            return None
+
+        # Get the metadata for the found ID
+        metadata = self._id_to_meta.get(fid, {})
+        if not metadata:
+            return None
+
+        return {**metadata, "id": metadata.get("__id__")}
+
+    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
+        """Get multiple vector data by their IDs
+
+        Args:
+            ids: List of unique identifiers
+
+        Returns:
+            List of vector data objects that were found
+        """
+        if not ids:
+            return []
+
+        results = []
+        for id in ids:
+            fid = self._find_faiss_id_by_custom_id(id)
+            if fid is not None:
+                metadata = self._id_to_meta.get(fid, {})
+                if metadata:
+                    results.append({**metadata, "id": metadata.get("__id__")})
+
+        return results
--- a/lightrag/kg/milvus_impl.py
+++ b/lightrag/kg/milvus_impl.py
@@ -231,3 +231,57 @@ class MilvusVectorDBStorage(BaseVectorStorage):
        except Exception as e:
            logger.error(f"Error searching for records with prefix '{prefix}': {e}")
            return []
+
+    async def get_by_id(self, id: str) -> dict[str, Any] | None:
+        """Get vector data by its ID
+
+        Args:
+            id: The unique identifier of the vector
+
+        Returns:
+            The vector data if found, or None if not found
+        """
+        try:
+            # Query Milvus for a specific ID
+            result = self._client.query(
+                collection_name=self.namespace,
+                filter=f'id == "{id}"',
+                output_fields=list(self.meta_fields) + ["id"],
+            )
+
+            if not result or len(result) == 0:
+                return None
+
+            return result[0]
+        except Exception as e:
+            logger.error(f"Error retrieving vector data for ID {id}: {e}")
+            return None
+
+    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
+        """Get multiple vector data by their IDs
+
+        Args:
+            ids: List of unique identifiers
+
+        Returns:
+            List of vector data objects that were found
+        """
+        if not ids:
+            return []
+
+        try:
+            # Prepare the ID filter expression
+            id_list = '", "'.join(ids)
+            filter_expr = f'id in ["{id_list}"]'
+
+            # Query Milvus with the filter
+            result = self._client.query(
+                collection_name=self.namespace,
+                filter=filter_expr,
+                output_fields=list(self.meta_fields) + ["id"],
+            )
+
+            return result or []
+        except Exception as e:
+            logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
+            return []
--- a/lightrag/kg/mongo_impl.py
+++ b/lightrag/kg/mongo_impl.py
@@ -1071,6 +1071,59 @@ class MongoVectorDBStorage(BaseVectorStorage):
            logger.error(f"Error searching by prefix in {self.namespace}: {str(e)}")
            return []

+    async def get_by_id(self, id: str) -> dict[str, Any] | None:
+        """Get vector data by its ID
+
+        Args:
+            id: The unique identifier of the vector
+
+        Returns:
+            The vector data if found, or None if not found
+        """
+        try:
+            # Search for the specific ID in MongoDB
+            result = await self._data.find_one({"_id": id})
+            if result:
+                # Format the result to include id field expected by API
+                result_dict = dict(result)
+                if "_id" in result_dict and "id" not in result_dict:
+                    result_dict["id"] = result_dict["_id"]
+                return result_dict
+            return None
+        except Exception as e:
+            logger.error(f"Error retrieving vector data for ID {id}: {e}")
+            return None
+
+    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
+        """Get multiple vector data by their IDs
+
+        Args:
+            ids: List of unique identifiers
+
+        Returns:
+            List of vector data objects that were found
+        """
+        if not ids:
+            return []
+
+        try:
+            # Query MongoDB for multiple IDs
+            cursor = self._data.find({"_id": {"$in": ids}})
+            results = await cursor.to_list(length=None)
+
+            # Format results to include id field expected by API
+            formatted_results = []
+            for result in results:
+                result_dict = dict(result)
+                if "_id" in result_dict and "id" not in result_dict:
+                    result_dict["id"] = result_dict["_id"]
+                formatted_results.append(result_dict)
+
+            return formatted_results
+        except Exception as e:
+            logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
+            return []
+

 async def get_or_create_collection(db: AsyncIOMotorDatabase, collection_name: str):
    collection_names = await db.list_collection_names()
--- a/lightrag/kg/nano_vector_db_impl.py
+++ b/lightrag/kg/nano_vector_db_impl.py
@@ -256,3 +256,33 @@ class NanoVectorDBStorage(BaseVectorStorage):

        logger.debug(f"Found {len(matching_records)} records with prefix '{prefix}'")
        return matching_records
+
+    async def get_by_id(self, id: str) -> dict[str, Any] | None:
+        """Get vector data by its ID
+
+        Args:
+            id: The unique identifier of the vector
+
+        Returns:
+            The vector data if found, or None if not found
+        """
+        client = await self._get_client()
+        result = client.get([id])
+        if result:
+            return result[0]
+        return None
+
+    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
+        """Get multiple vector data by their IDs
+
+        Args:
+            ids: List of unique identifiers
+
+        Returns:
+            List of vector data objects that were found
+        """
+        if not ids:
+            return []
+
+        client = await self._get_client()
+        return client.get(ids)
--- a/lightrag/kg/neo4j_impl.py
+++ b/lightrag/kg/neo4j_impl.py
@@ -176,23 +176,6 @@ class Neo4JStorage(BaseGraphStorage):
        # Noe4J handles persistence automatically
        pass

-    def _ensure_label(self, label: str) -> str:
-        """Ensure a label is valid
-
-        Args:
-            label: The label to validate
-
-        Returns:
-            str: The cleaned label
-
-        Raises:
-            ValueError: If label is empty after cleaning
-        """
-        clean_label = label.strip('"')
-        if not clean_label:
-            raise ValueError("Neo4j: Label cannot be empty")
-        return clean_label
-
    async def has_node(self, node_id: str) -> bool:
        """
        Check if a node with the given label exists in the database
@@ -207,20 +190,17 @@ class Neo4JStorage(BaseGraphStorage):
            ValueError: If node_id is invalid
            Exception: If there is an error executing the query
        """
-        entity_name_label = self._ensure_label(node_id)
        async with self._driver.session(
            database=self._DATABASE, default_access_mode="READ"
        ) as session:
            try:
-                query = f"MATCH (n:`{entity_name_label}`) RETURN count(n) > 0 AS node_exists"
-                result = await session.run(query)
+                query = "MATCH (n:base {entity_id: $entity_id}) RETURN count(n) > 0 AS node_exists"
+                result = await session.run(query, entity_id=node_id)
                single_result = await result.single()
                await result.consume()  # Ensure result is fully consumed
                return single_result["node_exists"]
            except Exception as e:
-                logger.error(
-                    f"Error checking node existence for {entity_name_label}: {str(e)}"
-                )
+                logger.error(f"Error checking node existence for {node_id}: {str(e)}")
                await result.consume()  # Ensure results are consumed even on error
                raise

@@ -239,24 +219,25 @@ class Neo4JStorage(BaseGraphStorage):
            ValueError: If either node_id is invalid
            Exception: If there is an error executing the query
        """
-        entity_name_label_source = self._ensure_label(source_node_id)
-        entity_name_label_target = self._ensure_label(target_node_id)
-
        async with self._driver.session(
            database=self._DATABASE, default_access_mode="READ"
        ) as session:
            try:
                query = (
-                    f"MATCH (a:`{entity_name_label_source}`)-[r]-(b:`{entity_name_label_target}`) "
+                    "MATCH (a:base {entity_id: $source_entity_id})-[r]-(b:base {entity_id: $target_entity_id}) "
                    "RETURN COUNT(r) > 0 AS edgeExists"
                )
-                result = await session.run(query)
+                result = await session.run(
+                    query,
+                    source_entity_id=source_node_id,
+                    target_entity_id=target_node_id,
+                )
                single_result = await result.single()
                await result.consume()  # Ensure result is fully consumed
                return single_result["edgeExists"]
            except Exception as e:
                logger.error(
-                    f"Error checking edge existence between {entity_name_label_source} and {entity_name_label_target}: {str(e)}"
+                    f"Error checking edge existence between {source_node_id} and {target_node_id}: {str(e)}"
                )
                await result.consume()  # Ensure results are consumed even on error
                raise
@@ -275,13 +256,12 @@ class Neo4JStorage(BaseGraphStorage):
            ValueError: If node_id is invalid
            Exception: If there is an error executing the query
        """
-        entity_name_label = self._ensure_label(node_id)
        async with self._driver.session(
            database=self._DATABASE, default_access_mode="READ"
        ) as session:
            try:
-                query = f"MATCH (n:`{entity_name_label}` {{entity_id: $entity_id}}) RETURN n"
-                result = await session.run(query, entity_id=entity_name_label)
+                query = "MATCH (n:base {entity_id: $entity_id}) RETURN n"
+                result = await session.run(query, entity_id=node_id)
                try:
                    records = await result.fetch(
                        2
@@ -289,20 +269,25 @@ class Neo4JStorage(BaseGraphStorage):

                    if len(records) > 1:
                        logger.warning(
-                            f"Multiple nodes found with label '{entity_name_label}'. Using first node."
+                            f"Multiple nodes found with label '{node_id}'. Using first node."
                        )
                    if records:
                        node = records[0]["n"]
                        node_dict = dict(node)
-                        logger.debug(
-                            f"{inspect.currentframe().f_code.co_name}: query: {query}, result: {node_dict}"
-                        )
+                        # Remove base label from labels list if it exists
+                        if "labels" in node_dict:
+                            node_dict["labels"] = [
+                                label
+                                for label in node_dict["labels"]
+                                if label != "base"
+                            ]
+                        logger.debug(f"Neo4j query node {query} return: {node_dict}")
                        return node_dict
                    return None
                finally:
                    await result.consume()  # Ensure result is fully consumed
            except Exception as e:
-                logger.error(f"Error getting node for {entity_name_label}: {str(e)}")
+                logger.error(f"Error getting node for {node_id}: {str(e)}")
                raise

    async def node_degree(self, node_id: str) -> int:
@@ -320,43 +305,32 @@ class Neo4JStorage(BaseGraphStorage):
            ValueError: If node_id is invalid
            Exception: If there is an error executing the query
        """
-        entity_name_label = self._ensure_label(node_id)
-
        async with self._driver.session(
            database=self._DATABASE, default_access_mode="READ"
        ) as session:
            try:
-                query = f"""
-                    MATCH (n:`{entity_name_label}`)
+                query = """
+                    MATCH (n:base {entity_id: $entity_id})
                    OPTIONAL MATCH (n)-[r]-()
-                    RETURN n, COUNT(r) AS degree
+                    RETURN COUNT(r) AS degree
                """
-                result = await session.run(query)
+                result = await session.run(query, entity_id=node_id)
                try:
-                    records = await result.fetch(100)
+                    record = await result.single()

-                    if not records:
-                        logger.warning(
-                            f"No node found with label '{entity_name_label}'"
-                        )
+                    if not record:
+                        logger.warning(f"No node found with label '{node_id}'")
                        return 0

-                    if len(records) > 1:
-                        logger.warning(
-                            f"Multiple nodes ({len(records)}) found with label '{entity_name_label}', using first node's degree"
-                        )
-
-                    degree = records[0]["degree"]
+                    degree = record["degree"]
                    logger.debug(
-                        f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{degree}"
+                        "Neo4j query node degree for {node_id} return: {degree}"
                    )
                    return degree
                finally:
                    await result.consume()  # Ensure result is fully consumed
            except Exception as e:
-                logger.error(
-                    f"Error getting node degree for {entity_name_label}: {str(e)}"
-                )
+                logger.error(f"Error getting node degree for {node_id}: {str(e)}")
                raise

    async def edge_degree(self, src_id: str, tgt_id: str) -> int:
@@ -369,11 +343,8 @@ class Neo4JStorage(BaseGraphStorage):
        Returns:
            int: Sum of the degrees of both nodes
        """
-        entity_name_label_source = self._ensure_label(src_id)
-        entity_name_label_target = self._ensure_label(tgt_id)
-
-        src_degree = await self.node_degree(entity_name_label_source)
-        trg_degree = await self.node_degree(entity_name_label_target)
+        src_degree = await self.node_degree(src_id)
+        trg_degree = await self.node_degree(tgt_id)

        # Convert None to 0 for addition
        src_degree = 0 if src_degree is None else src_degree
@@ -399,24 +370,24 @@ class Neo4JStorage(BaseGraphStorage):
            Exception: If there is an error executing the query
        """
        try:
-            entity_name_label_source = self._ensure_label(source_node_id)
-            entity_name_label_target = self._ensure_label(target_node_id)
-
            async with self._driver.session(
                database=self._DATABASE, default_access_mode="READ"
            ) as session:
-                query = f"""
-                MATCH (start:`{entity_name_label_source}`)-[r]-(end:`{entity_name_label_target}`)
+                query = """
+                MATCH (start:base {entity_id: $source_entity_id})-[r]-(end:base {entity_id: $target_entity_id})
                RETURN properties(r) as edge_properties
                """
-
-                result = await session.run(query)
+                result = await session.run(
+                    query,
+                    source_entity_id=source_node_id,
+                    target_entity_id=target_node_id,
+                )
                try:
                    records = await result.fetch(2)

                    if len(records) > 1:
                        logger.warning(
-                            f"Multiple edges found between '{entity_name_label_source}' and '{entity_name_label_target}'. Using first edge."
+                            f"Multiple edges found between '{source_node_id}' and '{target_node_id}'. Using first edge."
                        )
                    if records:
                        try:
@@ -433,7 +404,7 @@ class Neo4JStorage(BaseGraphStorage):
                                if key not in edge_result:
                                    edge_result[key] = default_value
                                    logger.warning(
-                                        f"Edge between {entity_name_label_source} and {entity_name_label_target} "
+                                        f"Edge between {source_node_id} and {target_node_id} "
                                        f"missing {key}, using default: {default_value}"
                                    )

@@ -443,8 +414,8 @@ class Neo4JStorage(BaseGraphStorage):
                            return edge_result
                        except (KeyError, TypeError, ValueError) as e:
                            logger.error(
-                                f"Error processing edge properties between {entity_name_label_source} "
-                                f"and {entity_name_label_target}: {str(e)}"
+                                f"Error processing edge properties between {source_node_id} "
+                                f"and {target_node_id}: {str(e)}"
                            )
                            # Return default edge properties on error
                            return {
@@ -455,7 +426,7 @@ class Neo4JStorage(BaseGraphStorage):
                            }

                    logger.debug(
-                        f"{inspect.currentframe().f_code.co_name}: No edge found between {entity_name_label_source} and {entity_name_label_target}"
+                        f"{inspect.currentframe().f_code.co_name}: No edge found between {source_node_id} and {target_node_id}"
                    )
                    # Return default edge properties when no edge found
                    return {
@@ -488,29 +459,33 @@ class Neo4JStorage(BaseGraphStorage):
            Exception: If there is an error executing the query
        """
        try:
-            node_label = self._ensure_label(source_node_id)
-
-            query = f"""MATCH (n:`{node_label}`)
-                    OPTIONAL MATCH (n)-[r]-(connected)
-                    RETURN n, r, connected"""
-
            async with self._driver.session(
                database=self._DATABASE, default_access_mode="READ"
            ) as session:
                try:
-                    results = await session.run(query)
-                    edges = []
+                    query = """MATCH (n:base {entity_id: $entity_id})
+                            OPTIONAL MATCH (n)-[r]-(connected:base)
+                            WHERE connected.entity_id IS NOT NULL
+                            RETURN n, r, connected"""
+                    results = await session.run(query, entity_id=source_node_id)

+                    edges = []
                    async for record in results:
                        source_node = record["n"]
                        connected_node = record["connected"]

+                        # Skip if either node is None
+                        if not source_node or not connected_node:
+                            continue
+
                        source_label = (
-                            list(source_node.labels)[0] if source_node.labels else None
+                            source_node.get("entity_id")
+                            if source_node.get("entity_id")
+                            else None
                        )
                        target_label = (
-                            list(connected_node.labels)[0]
-                            if connected_node and connected_node.labels
+                            connected_node.get("entity_id")
+                            if connected_node.get("entity_id")
                            else None
                        )

@@ -520,7 +495,9 @@ class Neo4JStorage(BaseGraphStorage):
                    await results.consume()  # Ensure results are consumed
                    return edges
                except Exception as e:
-                    logger.error(f"Error getting edges for node {node_label}: {str(e)}")
+                    logger.error(
+                        f"Error getting edges for node {source_node_id}: {str(e)}"
+                    )
                    await results.consume()  # Ensure results are consumed even on error
                    raise
        except Exception as e:
@@ -547,8 +524,9 @@ class Neo4JStorage(BaseGraphStorage):
            node_id: The unique identifier for the node (used as label)
            node_data: Dictionary of node properties
        """
-        label = self._ensure_label(node_id)
        properties = node_data
+        entity_type = properties["entity_type"]
+        entity_id = properties["entity_id"]
        if "entity_id" not in properties:
            raise ValueError("Neo4j: node properties must contain an 'entity_id' field")

@@ -556,13 +534,17 @@ class Neo4JStorage(BaseGraphStorage):
            async with self._driver.session(database=self._DATABASE) as session:

                async def execute_upsert(tx: AsyncManagedTransaction):
-                    query = f"""
-                    MERGE (n:`{label}` {{entity_id: $properties.entity_id}})
+                    query = (
+                        """
+                    MERGE (n:base {entity_id: $properties.entity_id})
                    SET n += $properties
+                    SET n:`%s`
                    """
+                        % entity_type
+                    )
                    result = await tx.run(query, properties=properties)
                    logger.debug(
-                        f"Upserted node with label '{label}' and properties: {properties}"
+                        f"Upserted node with entity_id '{entity_id}' and properties: {properties}"
                    )
                    await result.consume()  # Ensure result is fully consumed

@@ -583,52 +565,6 @@ class Neo4JStorage(BaseGraphStorage):
            )
        ),
    )
-    async def _get_unique_node_entity_id(self, node_label: str) -> str:
-        """
-        Get the entity_id of a node with the given label, ensuring the node is unique.
-
-        Args:
-            node_label (str): Label of the node to check
-
-        Returns:
-            str: The entity_id of the unique node
-
-        Raises:
-            ValueError: If no node with the given label exists or if multiple nodes have the same label
-        """
-        async with self._driver.session(
-            database=self._DATABASE, default_access_mode="READ"
-        ) as session:
-            query = f"""
-            MATCH (n:`{node_label}`)
-            RETURN n, count(n) as node_count
-            """
-            result = await session.run(query)
-            try:
-                records = await result.fetch(
-                    2
-                )  # We only need to know if there are 0, 1, or >1 nodes
-
-                if not records or records[0]["node_count"] == 0:
-                    raise ValueError(
-                        f"Neo4j: node with label '{node_label}' does not exist"
-                    )
-
-                if records[0]["node_count"] > 1:
-                    raise ValueError(
-                        f"Neo4j: multiple nodes found with label '{node_label}', cannot determine unique node"
-                    )
-
-                node = records[0]["n"]
-                if "entity_id" not in node:
-                    raise ValueError(
-                        f"Neo4j: node with label '{node_label}' does not have an entity_id property"
-                    )
-
-                return node["entity_id"]
-            finally:
-                await result.consume()  # Ensure result is fully consumed
-
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10),
@@ -657,38 +593,30 @@ class Neo4JStorage(BaseGraphStorage):
        Raises:
            ValueError: If either source or target node does not exist or is not unique
        """
-        source_label = self._ensure_label(source_node_id)
-        target_label = self._ensure_label(target_node_id)
-        edge_properties = edge_data
-
-        # Get entity_ids for source and target nodes, ensuring they are unique
-        source_entity_id = await self._get_unique_node_entity_id(source_label)
-        target_entity_id = await self._get_unique_node_entity_id(target_label)
-
        try:
+            edge_properties = edge_data
            async with self._driver.session(database=self._DATABASE) as session:

                async def execute_upsert(tx: AsyncManagedTransaction):
-                    query = f"""
-                    MATCH (source:`{source_label}` {{entity_id: $source_entity_id}})
+                    query = """
+                    MATCH (source:base {entity_id: $source_entity_id})
                    WITH source
-                    MATCH (target:`{target_label}` {{entity_id: $target_entity_id}})
+                    MATCH (target:base {entity_id: $target_entity_id})
                    MERGE (source)-[r:DIRECTED]-(target)
                    SET r += $properties
                    RETURN r, source, target
                    """
                    result = await tx.run(
                        query,
-                        source_entity_id=source_entity_id,
-                        target_entity_id=target_entity_id,
+                        source_entity_id=source_node_id,
+                        target_entity_id=target_node_id,
                        properties=edge_properties,
                    )
                    try:
-                        records = await result.fetch(100)
+                        records = await result.fetch(2)
                        if records:
                            logger.debug(
-                                f"Upserted edge from '{source_label}' (entity_id: {source_entity_id}) "
-                                f"to '{target_label}' (entity_id: {target_entity_id}) "
+                                f"Upserted edge from '{source_node_id}' to '{target_node_id}'"
                                f"with properties: {edge_properties}"
                            )
                    finally:
@@ -726,7 +654,6 @@ class Neo4JStorage(BaseGraphStorage):
        Returns:
            KnowledgeGraph: Complete connected subgraph for specified node
        """
-        label = node_label.strip('"')
        result = KnowledgeGraph()
        seen_nodes = set()
        seen_edges = set()
@@ -735,7 +662,7 @@ class Neo4JStorage(BaseGraphStorage):
            database=self._DATABASE, default_access_mode="READ"
        ) as session:
            try:
-                if label == "*":
+                if node_label == "*":
                    main_query = """
                    MATCH (n)
                    OPTIONAL MATCH (n)-[r]-()
@@ -760,12 +687,11 @@ class Neo4JStorage(BaseGraphStorage):
                    # Main query uses partial matching
                    main_query = """
                    MATCH (start)
-                    WHERE any(label IN labels(start) WHERE
+                    WHERE
                        CASE
-                            WHEN $inclusive THEN label CONTAINS $label
-                            ELSE label = $label
+                            WHEN $inclusive THEN start.entity_id CONTAINS $entity_id
+                            ELSE start.entity_id = $entity_id
                        END
-                    )
                    WITH start
                    CALL apoc.path.subgraphAll(start, {
                        relationshipFilter: '',
@@ -799,7 +725,7 @@ class Neo4JStorage(BaseGraphStorage):
                        main_query,
                        {
                            "max_nodes": MAX_GRAPH_NODES,
-                            "label": label,
+                            "entity_id": node_label,
                            "inclusive": inclusive,
                            "max_depth": max_depth,
                            "min_degree": min_degree,
@@ -818,7 +744,11 @@ class Neo4JStorage(BaseGraphStorage):
                                result.nodes.append(
                                    KnowledgeGraphNode(
                                        id=f"{node_id}",
-                                        labels=list(node.labels),
+                                        labels=[
+                                            label
+                                            for label in node.labels
+                                            if label != "base"
+                                        ],
                                        properties=dict(node),
                                    )
                                )
@@ -849,7 +779,7 @@ class Neo4JStorage(BaseGraphStorage):

            except neo4jExceptions.ClientError as e:
                logger.warning(f"APOC plugin error: {str(e)}")
-                if label != "*":
+                if node_label != "*":
                    logger.warning(
                        "Neo4j: falling back to basic Cypher recursive search..."
                    )
@@ -857,12 +787,14 @@ class Neo4JStorage(BaseGraphStorage):
                        logger.warning(
                            "Neo4j: inclusive search mode is not supported in recursive query, using exact matching"
                        )
-                    return await self._robust_fallback(label, max_depth, min_degree)
+                    return await self._robust_fallback(
+                        node_label, max_depth, min_degree
+                    )

        return result

    async def _robust_fallback(
-        self, label: str, max_depth: int, min_degree: int = 0
+        self, node_label: str, max_depth: int, min_degree: int = 0
    ) -> KnowledgeGraph:
        """
        Fallback implementation when APOC plugin is not available or incompatible.
@@ -895,12 +827,11 @@ class Neo4JStorage(BaseGraphStorage):
                database=self._DATABASE, default_access_mode="READ"
            ) as session:
                query = """
-                MATCH (a)-[r]-(b)
-                WHERE id(a) = toInteger($node_id)
+                MATCH (a:base {entity_id: $entity_id})-[r]-(b)
                WITH r, b, id(r) as edge_id, id(b) as target_id
                RETURN r, b, edge_id, target_id
                """
-                results = await session.run(query, {"node_id": node.id})
+                results = await session.run(query, entity_id=node.id)

                # Get all records and release database connection
                records = await results.fetch(
@@ -928,14 +859,16 @@ class Neo4JStorage(BaseGraphStorage):
                    edge_id = str(record["edge_id"])
                    if edge_id not in visited_edges:
                        b_node = record["b"]
-                        target_id = str(record["target_id"])
+                        target_id = b_node.get("entity_id")

-                        if b_node.labels:  # Only process if target node has labels
+                        if target_id:  # Only process if target node has entity_id
                            # Create KnowledgeGraphNode for target
                            target_node = KnowledgeGraphNode(
                                id=f"{target_id}",
-                                labels=list(b_node.labels),
-                                properties=dict(b_node),
+                                labels=[
+                                    label for label in b_node.labels if label != "base"
+                                ],
+                                properties=dict(b_node.properties),
                            )

                            # Create KnowledgeGraphEdge
@@ -961,11 +894,11 @@ class Neo4JStorage(BaseGraphStorage):
        async with self._driver.session(
            database=self._DATABASE, default_access_mode="READ"
        ) as session:
-            query = f"""
-            MATCH (n:`{label}`)
+            query = """
+            MATCH (n:base {entity_id: $entity_id})
            RETURN id(n) as node_id, n
            """
-            node_result = await session.run(query)
+            node_result = await session.run(query, entity_id=node_label)
            try:
                node_record = await node_result.single()
                if not node_record:
@@ -973,9 +906,11 @@ class Neo4JStorage(BaseGraphStorage):

                # Create initial KnowledgeGraphNode
                start_node = KnowledgeGraphNode(
-                    id=f"{node_record['node_id']}",
-                    labels=list(node_record["n"].labels),
-                    properties=dict(node_record["n"]),
+                    id=f"{node_record['n'].get('entity_id')}",
+                    labels=[
+                        label for label in node_record["n"].labels if label != "base"
+                    ],
+                    properties=dict(node_record["n"].properties),
                )
            finally:
                await node_result.consume()  # Ensure results are consumed
@@ -999,11 +934,10 @@ class Neo4JStorage(BaseGraphStorage):

            # Method 2: Query compatible with older versions
            query = """
-                MATCH (n)
-                WITH DISTINCT labels(n) AS node_labels
-                UNWIND node_labels AS label
-                RETURN DISTINCT label
-                ORDER BY label
+            MATCH (n)
+            WHERE n.entity_id IS NOT NULL
+            RETURN DISTINCT n.entity_id AS label
+            ORDER BY label
            """
            result = await session.run(query)
            labels = []
@@ -1034,15 +968,14 @@ class Neo4JStorage(BaseGraphStorage):
        Args:
            node_id: The label of the node to delete
        """
-        label = self._ensure_label(node_id)

        async def _do_delete(tx: AsyncManagedTransaction):
-            query = f"""
-            MATCH (n:`{label}`)
+            query = """
+            MATCH (n:base {entity_id: $entity_id})
            DETACH DELETE n
            """
-            result = await tx.run(query)
-            logger.debug(f"Deleted node with label '{label}'")
+            result = await tx.run(query, entity_id=node_id)
+            logger.debug(f"Deleted node with label '{node_id}'")
            await result.consume()  # Ensure result is fully consumed

        try:
@@ -1092,16 +1025,16 @@ class Neo4JStorage(BaseGraphStorage):
            edges: List of edges to be deleted, each edge is a (source, target) tuple
        """
        for source, target in edges:
-            source_label = self._ensure_label(source)
-            target_label = self._ensure_label(target)

            async def _do_delete_edge(tx: AsyncManagedTransaction):
-                query = f"""
-                MATCH (source:`{source_label}`)-[r]-(target:`{target_label}`)
+                query = """
+                MATCH (source:base {entity_id: $source_entity_id})-[r]-(target:base {entity_id: $target_entity_id})
                DELETE r
                """
-                result = await tx.run(query)
-                logger.debug(f"Deleted edge from '{source_label}' to '{target_label}'")
+                result = await tx.run(
+                    query, source_entity_id=source, target_entity_id=target
+                )
+                logger.debug(f"Deleted edge from '{source}' to '{target}'")
                await result.consume()  # Ensure result is fully consumed

            try:
--- a/lightrag/kg/oracle_impl.py
+++ b/lightrag/kg/oracle_impl.py
@@ -529,6 +529,80 @@ class OracleVectorDBStorage(BaseVectorStorage):
            logger.error(f"Error searching records with prefix '{prefix}': {e}")
            return []

+    async def get_by_id(self, id: str) -> dict[str, Any] | None:
+        """Get vector data by its ID
+
+        Args:
+            id: The unique identifier of the vector
+
+        Returns:
+            The vector data if found, or None if not found
+        """
+        try:
+            # Determine the table name based on namespace
+            table_name = namespace_to_table_name(self.namespace)
+            if not table_name:
+                logger.error(f"Unknown namespace for ID lookup: {self.namespace}")
+                return None
+
+            # Create the appropriate ID field name based on namespace
+            id_field = "entity_id" if "NODES" in table_name else "relation_id"
+            if "CHUNKS" in table_name:
+                id_field = "chunk_id"
+
+            # Prepare and execute the query
+            query = f"""
+                SELECT * FROM {table_name}
+                WHERE {id_field} = :id AND workspace = :workspace
+            """
+            params = {"id": id, "workspace": self.db.workspace}
+
+            result = await self.db.query(query, params)
+            return result
+        except Exception as e:
+            logger.error(f"Error retrieving vector data for ID {id}: {e}")
+            return None
+
+    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
+        """Get multiple vector data by their IDs
+
+        Args:
+            ids: List of unique identifiers
+
+        Returns:
+            List of vector data objects that were found
+        """
+        if not ids:
+            return []
+
+        try:
+            # Determine the table name based on namespace
+            table_name = namespace_to_table_name(self.namespace)
+            if not table_name:
+                logger.error(f"Unknown namespace for IDs lookup: {self.namespace}")
+                return []
+
+            # Create the appropriate ID field name based on namespace
+            id_field = "entity_id" if "NODES" in table_name else "relation_id"
+            if "CHUNKS" in table_name:
+                id_field = "chunk_id"
+
+            # Format the list of IDs for SQL IN clause
+            ids_list = ", ".join([f"'{id}'" for id in ids])
+
+            # Prepare and execute the query
+            query = f"""
+                SELECT * FROM {table_name}
+                WHERE {id_field} IN ({ids_list}) AND workspace = :workspace
+            """
+            params = {"workspace": self.db.workspace}
+
+            results = await self.db.query(query, params, multirows=True)
+            return results or []
+        except Exception as e:
+            logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
+            return []
+

@final
@dataclass
--- a/lightrag/kg/postgres_impl.py
+++ b/lightrag/kg/postgres_impl.py
@@ -438,6 +438,8 @@ class PGVectorStorage(BaseVectorStorage):
            "entity_name": item["entity_name"],
            "content": item["content"],
            "content_vector": json.dumps(item["__vector__"].tolist()),
+            "chunk_id": item["source_id"],
+            # TODO: add document_id
        }
        return upsert_sql, data

@@ -450,6 +452,8 @@ class PGVectorStorage(BaseVectorStorage):
            "target_id": item["tgt_id"],
            "content": item["content"],
            "content_vector": json.dumps(item["__vector__"].tolist()),
+            "chunk_id": item["source_id"],
+            # TODO: add document_id
        }
        return upsert_sql, data

@@ -492,13 +496,20 @@ class PGVectorStorage(BaseVectorStorage):
            await self.db.execute(upsert_sql, data)

    #################### query method ###############
-    async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
+    async def query(
+        self, query: str, top_k: int, ids: list[str] | None = None
+    ) -> list[dict[str, Any]]:
        embeddings = await self.embedding_func([query])
        embedding = embeddings[0]
        embedding_string = ",".join(map(str, embedding))

+        if ids:
+            formatted_ids = ",".join(f"'{id}'" for id in ids)
+        else:
+            formatted_ids = "NULL"
+
        sql = SQL_TEMPLATES[self.base_namespace].format(
-            embedding_string=embedding_string
+            embedding_string=embedding_string, doc_ids=formatted_ids
        )
        params = {
            "workspace": self.db.workspace,
@@ -610,6 +621,60 @@ class PGVectorStorage(BaseVectorStorage):
            logger.error(f"Error during prefix search for '{prefix}': {e}")
            return []

+    async def get_by_id(self, id: str) -> dict[str, Any] | None:
+        """Get vector data by its ID
+
+        Args:
+            id: The unique identifier of the vector
+
+        Returns:
+            The vector data if found, or None if not found
+        """
+        table_name = namespace_to_table_name(self.namespace)
+        if not table_name:
+            logger.error(f"Unknown namespace for ID lookup: {self.namespace}")
+            return None
+
+        query = f"SELECT * FROM {table_name} WHERE workspace=$1 AND id=$2"
+        params = {"workspace": self.db.workspace, "id": id}
+
+        try:
+            result = await self.db.query(query, params)
+            if result:
+                return dict(result)
+            return None
+        except Exception as e:
+            logger.error(f"Error retrieving vector data for ID {id}: {e}")
+            return None
+
+    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
+        """Get multiple vector data by their IDs
+
+        Args:
+            ids: List of unique identifiers
+
+        Returns:
+            List of vector data objects that were found
+        """
+        if not ids:
+            return []
+
+        table_name = namespace_to_table_name(self.namespace)
+        if not table_name:
+            logger.error(f"Unknown namespace for IDs lookup: {self.namespace}")
+            return []
+
+        ids_str = ",".join([f"'{id}'" for id in ids])
+        query = f"SELECT * FROM {table_name} WHERE workspace=$1 AND id IN ({ids_str})"
+        params = {"workspace": self.db.workspace}
+
+        try:
+            results = await self.db.query(query, params, multirows=True)
+            return [dict(record) for record in results]
+        except Exception as e:
+            logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
+            return []
+

@final
@dataclass
@@ -1491,6 +1556,7 @@ TABLES = {
                    content_vector VECTOR,
                    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    update_time TIMESTAMP,
+                    chunk_id VARCHAR(255) NULL,
 	                CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id)
                    )"""
    },
@@ -1504,6 +1570,7 @@ TABLES = {
                    content_vector VECTOR,
                    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    update_time TIMESTAMP,
+                    chunk_id VARCHAR(255) NULL,
 	                CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id)
                    )"""
    },
@@ -1586,8 +1653,9 @@ SQL_TEMPLATES = {
                      content_vector=EXCLUDED.content_vector,
                      update_time = CURRENT_TIMESTAMP
                     """,
-    "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, content_vector)
-                      VALUES ($1, $2, $3, $4, $5)
+    "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
+                      content_vector, chunk_id)
+                      VALUES ($1, $2, $3, $4, $5, $6)
                      ON CONFLICT (workspace,id) DO UPDATE
                      SET entity_name=EXCLUDED.entity_name,
                      content=EXCLUDED.content,
@@ -1595,8 +1663,8 @@ SQL_TEMPLATES = {
                      update_time=CURRENT_TIMESTAMP
                     """,
    "upsert_relationship": """INSERT INTO LIGHTRAG_VDB_RELATION (workspace, id, source_id,
-                      target_id, content, content_vector)
-                      VALUES ($1, $2, $3, $4, $5, $6)
+                      target_id, content, content_vector, chunk_id)
+                      VALUES ($1, $2, $3, $4, $5, $6, $7)
                      ON CONFLICT (workspace,id) DO UPDATE
                      SET source_id=EXCLUDED.source_id,
                      target_id=EXCLUDED.target_id,
@@ -1604,21 +1672,21 @@ SQL_TEMPLATES = {
                      content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP
                     """,
    # SQL for VectorStorage
-    "entities": """SELECT entity_name FROM
-        (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
-        FROM LIGHTRAG_VDB_ENTITY where workspace=$1)
-        WHERE distance>$2 ORDER BY distance DESC  LIMIT $3
-       """,
-    "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM
-        (SELECT id, source_id,target_id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
-        FROM LIGHTRAG_VDB_RELATION where workspace=$1)
-        WHERE distance>$2 ORDER BY distance DESC  LIMIT $3
-       """,
-    "chunks": """SELECT id FROM
-        (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
-        FROM LIGHTRAG_DOC_CHUNKS where workspace=$1)
-        WHERE distance>$2 ORDER BY distance DESC  LIMIT $3
-       """,
+    # "entities": """SELECT entity_name FROM
+    #     (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
+    #     FROM LIGHTRAG_VDB_ENTITY where workspace=$1)
+    #     WHERE distance>$2 ORDER BY distance DESC  LIMIT $3
+    #    """,
+    # "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM
+    #     (SELECT id, source_id,target_id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
+    #     FROM LIGHTRAG_VDB_RELATION where workspace=$1)
+    #     WHERE distance>$2 ORDER BY distance DESC  LIMIT $3
+    #    """,
+    # "chunks": """SELECT id FROM
+    #     (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
+    #     FROM LIGHTRAG_DOC_CHUNKS where workspace=$1)
+    #     WHERE distance>$2 ORDER BY distance DESC  LIMIT $3
+    #    """,
    # DROP tables
    "drop_all": """
 	    DROP TABLE IF EXISTS LIGHTRAG_DOC_FULL CASCADE;
@@ -1642,4 +1710,55 @@ SQL_TEMPLATES = {
    "drop_vdb_relation": """
 	    DROP TABLE IF EXISTS LIGHTRAG_VDB_RELATION CASCADE;
       """,
+    "relationships": """
+    WITH relevant_chunks AS (
+        SELECT id as chunk_id
+        FROM LIGHTRAG_DOC_CHUNKS
+        WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
+    )
+    SELECT source_id as src_id, target_id as tgt_id
+    FROM (
+        SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
+        FROM LIGHTRAG_VDB_RELATION r
+        WHERE r.workspace=$1
+        AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks)
+    ) filtered
+    WHERE distance>$2
+    ORDER BY distance DESC
+    LIMIT $3
+    """,
+    "entities": """
+        WITH relevant_chunks AS (
+            SELECT id as chunk_id
+            FROM LIGHTRAG_DOC_CHUNKS
+            WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
+        )
+        SELECT entity_name FROM
+            (
+                SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
+                FROM LIGHTRAG_VDB_ENTITY
+                where workspace=$1
+                AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
+            )
+        WHERE distance>$2
+        ORDER BY distance DESC
+        LIMIT $3
+    """,
+    "chunks": """
+        WITH relevant_chunks AS (
+            SELECT id as chunk_id
+            FROM LIGHTRAG_DOC_CHUNKS
+            WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
+        )
+        SELECT id FROM
+            (
+                SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
+                FROM LIGHTRAG_DOC_CHUNKS
+                where workspace=$1
+                AND id IN (SELECT chunk_id FROM relevant_chunks)
+            )
+            WHERE distance>$2
+            ORDER BY distance DESC
+            LIMIT $3
+    """,
 }
--- a/lightrag/kg/tidb_impl.py
+++ b/lightrag/kg/tidb_impl.py
@@ -463,6 +463,100 @@ class TiDBVectorDBStorage(BaseVectorStorage):
            logger.error(f"Error searching records with prefix '{prefix}': {e}")
            return []

+    async def get_by_id(self, id: str) -> dict[str, Any] | None:
+        """Get vector data by its ID
+
+        Args:
+            id: The unique identifier of the vector
+
+        Returns:
+            The vector data if found, or None if not found
+        """
+        try:
+            # Determine which table to query based on namespace
+            if self.namespace == NameSpace.VECTOR_STORE_ENTITIES:
+                sql_template = """
+                    SELECT entity_id as id, name as entity_name, entity_type, description, content
+                    FROM LIGHTRAG_GRAPH_NODES
+                    WHERE entity_id = :entity_id AND workspace = :workspace
+                """
+                params = {"entity_id": id, "workspace": self.db.workspace}
+            elif self.namespace == NameSpace.VECTOR_STORE_RELATIONSHIPS:
+                sql_template = """
+                    SELECT relation_id as id, source_name as src_id, target_name as tgt_id,
+                           keywords, description, content
+                    FROM LIGHTRAG_GRAPH_EDGES
+                    WHERE relation_id = :relation_id AND workspace = :workspace
+                """
+                params = {"relation_id": id, "workspace": self.db.workspace}
+            elif self.namespace == NameSpace.VECTOR_STORE_CHUNKS:
+                sql_template = """
+                    SELECT chunk_id as id, content, tokens, chunk_order_index, full_doc_id
+                    FROM LIGHTRAG_DOC_CHUNKS
+                    WHERE chunk_id = :chunk_id AND workspace = :workspace
+                """
+                params = {"chunk_id": id, "workspace": self.db.workspace}
+            else:
+                logger.warning(
+                    f"Namespace {self.namespace} not supported for get_by_id"
+                )
+                return None
+
+            result = await self.db.query(sql_template, params=params)
+            return result
+        except Exception as e:
+            logger.error(f"Error retrieving vector data for ID {id}: {e}")
+            return None
+
+    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
+        """Get multiple vector data by their IDs
+
+        Args:
+            ids: List of unique identifiers
+
+        Returns:
+            List of vector data objects that were found
+        """
+        if not ids:
+            return []
+
+        try:
+            # Format IDs for SQL IN clause
+            ids_str = ", ".join([f"'{id}'" for id in ids])
+
+            # Determine which table to query based on namespace
+            if self.namespace == NameSpace.VECTOR_STORE_ENTITIES:
+                sql_template = f"""
+                    SELECT entity_id as id, name as entity_name, entity_type, description, content
+                    FROM LIGHTRAG_GRAPH_NODES
+                    WHERE entity_id IN ({ids_str}) AND workspace = :workspace
+                """
+            elif self.namespace == NameSpace.VECTOR_STORE_RELATIONSHIPS:
+                sql_template = f"""
+                    SELECT relation_id as id, source_name as src_id, target_name as tgt_id,
+                           keywords, description, content
+                    FROM LIGHTRAG_GRAPH_EDGES
+                    WHERE relation_id IN ({ids_str}) AND workspace = :workspace
+                """
+            elif self.namespace == NameSpace.VECTOR_STORE_CHUNKS:
+                sql_template = f"""
+                    SELECT chunk_id as id, content, tokens, chunk_order_index, full_doc_id
+                    FROM LIGHTRAG_DOC_CHUNKS
+                    WHERE chunk_id IN ({ids_str}) AND workspace = :workspace
+                """
+            else:
+                logger.warning(
+                    f"Namespace {self.namespace} not supported for get_by_ids"
+                )
+                return []
+
+            params = {"workspace": self.db.workspace}
+            results = await self.db.query(sql_template, params=params, multirows=True)
+            return results if results else []
+        except Exception as e:
+            logger.error(f"Error retrieving vector data for IDs {ids}: {e}")
+            return []
+

@final
@dataclass
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -30,11 +30,10 @@ from .namespace import NameSpace, make_namespace
 from .operate import (
    chunking_by_token_size,
    extract_entities,
-    extract_keywords_only,
    kg_query,
-    kg_query_with_keywords,
    mix_kg_vector_query,
    naive_query,
+    query_with_keywords,
 )
 from .prompt import GRAPH_FIELD_SEP, PROMPTS
 from .utils import (
@@ -45,6 +44,9 @@ from .utils import (
    encode_string_by_tiktoken,
    lazy_external_import,
    limit_async_func_call,
+    get_content_summary,
+    clean_text,
+    check_storage_env_vars,
    logger,
 )
 from .types import KnowledgeGraph
@@ -309,7 +311,7 @@ class LightRAG:
            # Verify storage implementation compatibility
            verify_storage_implementation(storage_type, storage_name)
            # Check environment variables
-            # self.check_storage_env_vars(storage_name)
+            check_storage_env_vars(storage_name)

        # Ensure vector_db_storage_cls_kwargs has required fields
        self.vector_db_storage_cls_kwargs = {
@@ -536,11 +538,6 @@ class LightRAG:
        storage_class = lazy_external_import(import_path, storage_name)
        return storage_class

-    @staticmethod
-    def clean_text(text: str) -> str:
-        """Clean text by removing null bytes (0x00) and whitespace"""
-        return text.strip().replace("\x00", "")
-
    def insert(
        self,
        input: str | list[str],
@@ -602,8 +599,8 @@ class LightRAG:
        update_storage = False
        try:
            # Clean input texts
-            full_text = self.clean_text(full_text)
-            text_chunks = [self.clean_text(chunk) for chunk in text_chunks]
+            full_text = clean_text(full_text)
+            text_chunks = [clean_text(chunk) for chunk in text_chunks]

            # Process cleaned texts
            if doc_id is None:
@@ -682,7 +679,7 @@ class LightRAG:
            contents = {id_: doc for id_, doc in zip(ids, input)}
        else:
            # Clean input text and remove duplicates
-            input = list(set(self.clean_text(doc) for doc in input))
+            input = list(set(clean_text(doc) for doc in input))
            # Generate contents dict of MD5 hash IDs and documents
            contents = {compute_mdhash_id(doc, prefix="doc-"): doc for doc in input}

@@ -698,7 +695,7 @@ class LightRAG:
        new_docs: dict[str, Any] = {
            id_: {
                "content": content,
-                "content_summary": self._get_content_summary(content),
+                "content_summary": get_content_summary(content),
                "content_length": len(content),
                "status": DocStatus.PENDING,
                "created_at": datetime.now().isoformat(),
@@ -1063,7 +1060,7 @@ class LightRAG:
            all_chunks_data: dict[str, dict[str, str]] = {}
            chunk_to_source_map: dict[str, str] = {}
            for chunk_data in custom_kg.get("chunks", []):
-                chunk_content = self.clean_text(chunk_data["content"])
+                chunk_content = clean_text(chunk_data["content"])
                source_id = chunk_data["source_id"]
                tokens = len(
                    encode_string_by_tiktoken(
@@ -1296,8 +1293,17 @@ class LightRAG:
        self, query: str, prompt: str, param: QueryParam = QueryParam()
    ):
        """
-        1. Extract keywords from the 'query' using new function in operate.py.
-        2. Then run the standard aquery() flow with the final prompt (formatted_question).
+        Query with separate keyword extraction step.
+
+        This method extracts keywords from the query first, then uses them for the query.
+
+        Args:
+            query: User query
+            prompt: Additional prompt for the query
+            param: Query parameters
+
+        Returns:
+            Query response
        """
        loop = always_get_an_event_loop()
        return loop.run_until_complete(
@@ -1308,66 +1314,29 @@ class LightRAG:
        self, query: str, prompt: str, param: QueryParam = QueryParam()
    ) -> str | AsyncIterator[str]:
        """
-        1. Calls extract_keywords_only to get HL/LL keywords from 'query'.
-        2. Then calls kg_query(...) or naive_query(...), etc. as the main query, while also injecting the newly extracted keywords if needed.
+        Async version of query_with_separate_keyword_extraction.
+
+        Args:
+            query: User query
+            prompt: Additional prompt for the query
+            param: Query parameters
+
+        Returns:
+            Query response or async iterator
        """
-        # ---------------------
-        # STEP 1: Keyword Extraction
-        # ---------------------
-        hl_keywords, ll_keywords = await extract_keywords_only(
-            text=query,
+        response = await query_with_keywords(
+            query=query,
+            prompt=prompt,
            param=param,
+            knowledge_graph_inst=self.chunk_entity_relation_graph,
+            entities_vdb=self.entities_vdb,
+            relationships_vdb=self.relationships_vdb,
+            chunks_vdb=self.chunks_vdb,
+            text_chunks_db=self.text_chunks,
            global_config=asdict(self),
-            hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
+            hashing_kv=self.llm_response_cache,
        )

-        param.hl_keywords = hl_keywords
-        param.ll_keywords = ll_keywords
-
-        # ---------------------
-        # STEP 2: Final Query Logic
-        # ---------------------
-
-        # Create a new string with the prompt and the keywords
-        ll_keywords_str = ", ".join(ll_keywords)
-        hl_keywords_str = ", ".join(hl_keywords)
-        formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}"
-
-        if param.mode in ["local", "global", "hybrid"]:
-            response = await kg_query_with_keywords(
-                formatted_question,
-                self.chunk_entity_relation_graph,
-                self.entities_vdb,
-                self.relationships_vdb,
-                self.text_chunks,
-                param,
-                asdict(self),
-                hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
-            )
-        elif param.mode == "naive":
-            response = await naive_query(
-                formatted_question,
-                self.chunks_vdb,
-                self.text_chunks,
-                param,
-                asdict(self),
-                hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
-            )
-        elif param.mode == "mix":
-            response = await mix_kg_vector_query(
-                formatted_question,
-                self.chunk_entity_relation_graph,
-                self.entities_vdb,
-                self.relationships_vdb,
-                self.chunks_vdb,
-                self.text_chunks,
-                param,
-                asdict(self),
-                hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
-            )
-        else:
-            raise ValueError(f"Unknown mode {param.mode}")
-
        await self._query_done()
        return response

@@ -1465,21 +1434,6 @@ class LightRAG:
            ]
        )

-    def _get_content_summary(self, content: str, max_length: int = 100) -> str:
-        """Get summary of document content
-
-        Args:
-            content: Original document content
-            max_length: Maximum length of summary
-
-        Returns:
-            Truncated content with ellipsis if needed
-        """
-        content = content.strip()
-        if len(content) <= max_length:
-            return content
-        return content[:max_length] + "..."
-
    async def get_processing_status(self) -> dict[str, int]:
        """Get current document processing status counts

@@ -1756,19 +1710,7 @@ class LightRAG:
    async def get_entity_info(
        self, entity_name: str, include_vector_data: bool = False
    ) -> dict[str, str | None | dict[str, str]]:
-        """Get detailed information of an entity
-
-        Args:
-            entity_name: Entity name (no need for quotes)
-            include_vector_data: Whether to include data from the vector database
-
-        Returns:
-            dict: A dictionary containing entity information, including:
-                - entity_name: Entity name
-                - source_id: Source document ID
-                - graph_data: Complete node data from the graph database
-                - vector_data: (optional) Data from the vector database
-        """
+        """Get detailed information of an entity"""

        # Get information from the graph
        node_data = await self.chunk_entity_relation_graph.get_node(entity_name)
@@ -1783,29 +1725,15 @@ class LightRAG:
        # Optional: Get vector database information
        if include_vector_data:
            entity_id = compute_mdhash_id(entity_name, prefix="ent-")
-            vector_data = self.entities_vdb._client.get([entity_id])
-            result["vector_data"] = vector_data[0] if vector_data else None
+            vector_data = await self.entities_vdb.get_by_id(entity_id)
+            result["vector_data"] = vector_data

        return result

    async def get_relation_info(
        self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
    ) -> dict[str, str | None | dict[str, str]]:
-        """Get detailed information of a relationship
-
-        Args:
-            src_entity: Source entity name (no need for quotes)
-            tgt_entity: Target entity name (no need for quotes)
-            include_vector_data: Whether to include data from the vector database
-
-        Returns:
-            dict: A dictionary containing relationship information, including:
-                - src_entity: Source entity name
-                - tgt_entity: Target entity name
-                - source_id: Source document ID
-                - graph_data: Complete edge data from the graph database
-                - vector_data: (optional) Data from the vector database
-        """
+        """Get detailed information of a relationship"""

        # Get information from the graph
        edge_data = await self.chunk_entity_relation_graph.get_edge(
@@ -1823,8 +1751,8 @@ class LightRAG:
        # Optional: Get vector database information
        if include_vector_data:
            rel_id = compute_mdhash_id(src_entity + tgt_entity, prefix="rel-")
-            vector_data = self.relationships_vdb._client.get([rel_id])
-            result["vector_data"] = vector_data[0] if vector_data else None
+            vector_data = await self.relationships_vdb.get_by_id(rel_id)
+            result["vector_data"] = vector_data

        return result

@@ -2622,6 +2550,12 @@ class LightRAG:

            # 9. Delete source entities
            for entity_name in source_entities:
+                if entity_name == target_entity:
+                    logger.info(
+                        f"Skipping deletion of '{entity_name}' as it's also the target entity"
+                    )
+                    continue
+
                # Delete entity node from knowledge graph
                await self.chunk_entity_relation_graph.delete_node(entity_name)

--- a/lightrag/llm/azure_openai.py
+++ b/lightrag/llm/azure_openai.py
@@ -55,6 +55,7 @@ async def azure_openai_complete_if_cache(

    openai_async_client = AsyncAzureOpenAI(
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+        azure_deployment=model,
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    )
@@ -136,6 +137,7 @@ async def azure_openai_embed(

    openai_async_client = AsyncAzureOpenAI(
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+        azure_deployment=model,
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    )
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -141,18 +141,36 @@ async def _handle_single_entity_extraction(
 ):
    if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
        return None
-    # add this record as a node in the G
+
+    # Clean and validate entity name
    entity_name = clean_str(record_attributes[1]).strip('"')
    if not entity_name.strip():
+        logger.warning(
+            f"Entity extraction error: empty entity name in: {record_attributes}"
+        )
        return None
+
+    # Clean and validate entity type
    entity_type = clean_str(record_attributes[2]).strip('"')
+    if not entity_type.strip() or entity_type.startswith('("'):
+        logger.warning(
+            f"Entity extraction error: invalid entity type in: {record_attributes}"
+        )
+        return None
+
+    # Clean and validate description
    entity_description = clean_str(record_attributes[3]).strip('"')
-    entity_source_id = chunk_key
+    if not entity_description.strip():
+        logger.warning(
+            f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
+        )
+        return None
+
    return dict(
        entity_name=entity_name,
        entity_type=entity_type,
        description=entity_description,
-        source_id=entity_source_id,
+        source_id=chunk_key,
        metadata={"created_at": time.time()},
    )

@@ -438,47 +456,22 @@ async def extract_entities(
        else:
            return await use_llm_func(input_text)

-    async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
-        """ "Prpocess a single chunk
+    async def _process_extraction_result(result: str, chunk_key: str):
+        """Process a single extraction result (either initial or gleaning)
        Args:
-            chunk_key_dp (tuple[str, TextChunkSchema]):
-                ("chunck-xxxxxx", {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int})
+            result (str): The extraction result to process
+            chunk_key (str): The chunk key for source tracking
+        Returns:
+            tuple: (nodes_dict, edges_dict) containing the extracted entities and relationships
        """
-        nonlocal processed_chunks
-        chunk_key = chunk_key_dp[0]
-        chunk_dp = chunk_key_dp[1]
-        content = chunk_dp["content"]
-        # hint_prompt = entity_extract_prompt.format(**context_base, input_text=content)
-        hint_prompt = entity_extract_prompt.format(
-            **context_base, input_text="{input_text}"
-        ).format(**context_base, input_text=content)
-
-        final_result = await _user_llm_func_with_cache(hint_prompt)
-        history = pack_user_ass_to_openai_messages(hint_prompt, final_result)
-        for now_glean_index in range(entity_extract_max_gleaning):
-            glean_result = await _user_llm_func_with_cache(
-                continue_prompt, history_messages=history
-            )
-
-            history += pack_user_ass_to_openai_messages(continue_prompt, glean_result)
-            final_result += glean_result
-            if now_glean_index == entity_extract_max_gleaning - 1:
-                break
-
-            if_loop_result: str = await _user_llm_func_with_cache(
-                if_loop_prompt, history_messages=history
-            )
-            if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
-            if if_loop_result != "yes":
-                break
+        maybe_nodes = defaultdict(list)
+        maybe_edges = defaultdict(list)

        records = split_string_by_multi_markers(
-            final_result,
+            result,
            [context_base["record_delimiter"], context_base["completion_delimiter"]],
        )

-        maybe_nodes = defaultdict(list)
-        maybe_edges = defaultdict(list)
        for record in records:
            record = re.search(r"\((.*)\)", record)
            if record is None:
@@ -487,6 +480,7 @@ async def extract_entities(
            record_attributes = split_string_by_multi_markers(
                record, [context_base["tuple_delimiter"]]
            )
+
            if_entities = await _handle_single_entity_extraction(
                record_attributes, chunk_key
            )
@@ -501,6 +495,62 @@ async def extract_entities(
                maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append(
                    if_relation
                )
+
+        return maybe_nodes, maybe_edges
+
+    async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
+        """Process a single chunk
+        Args:
+            chunk_key_dp (tuple[str, TextChunkSchema]):
+                ("chunk-xxxxxx", {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int})
+        """
+        nonlocal processed_chunks
+        chunk_key = chunk_key_dp[0]
+        chunk_dp = chunk_key_dp[1]
+        content = chunk_dp["content"]
+
+        # Get initial extraction
+        hint_prompt = entity_extract_prompt.format(
+            **context_base, input_text="{input_text}"
+        ).format(**context_base, input_text=content)
+
+        final_result = await _user_llm_func_with_cache(hint_prompt)
+        history = pack_user_ass_to_openai_messages(hint_prompt, final_result)
+
+        # Process initial extraction
+        maybe_nodes, maybe_edges = await _process_extraction_result(
+            final_result, chunk_key
+        )
+
+        # Process additional gleaning results
+        for now_glean_index in range(entity_extract_max_gleaning):
+            glean_result = await _user_llm_func_with_cache(
+                continue_prompt, history_messages=history
+            )
+
+            history += pack_user_ass_to_openai_messages(continue_prompt, glean_result)
+
+            # Process gleaning result separately
+            glean_nodes, glean_edges = await _process_extraction_result(
+                glean_result, chunk_key
+            )
+
+            # Merge results
+            for entity_name, entities in glean_nodes.items():
+                maybe_nodes[entity_name].extend(entities)
+            for edge_key, edges in glean_edges.items():
+                maybe_edges[edge_key].extend(edges)
+
+            if now_glean_index == entity_extract_max_gleaning - 1:
+                break
+
+            if_loop_result: str = await _user_llm_func_with_cache(
+                if_loop_prompt, history_messages=history
+            )
+            if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
+            if if_loop_result != "yes":
+                break
+
        processed_chunks += 1
        entities_count = len(maybe_nodes)
        relations_count = len(maybe_edges)
@@ -912,7 +962,10 @@ async def mix_kg_vector_query(
        try:
            # Reduce top_k for vector search in hybrid mode since we have structured information from KG
            mix_topk = min(10, query_param.top_k)
-            results = await chunks_vdb.query(augmented_query, top_k=mix_topk)
+            # TODO: add ids to the query
+            results = await chunks_vdb.query(
+                augmented_query, top_k=mix_topk, ids=query_param.ids
+            )
            if not results:
                return None

@@ -1121,7 +1174,11 @@ async def _get_node_data(
    logger.info(
        f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}"
    )
-    results = await entities_vdb.query(query, top_k=query_param.top_k)
+
+    results = await entities_vdb.query(
+        query, top_k=query_param.top_k, ids=query_param.ids
+    )
+
    if not len(results):
        return "", "", ""
    # get entity information
@@ -1374,7 +1431,10 @@ async def _get_edge_data(
    logger.info(
        f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
    )
-    results = await relationships_vdb.query(keywords, top_k=query_param.top_k)
+
+    results = await relationships_vdb.query(
+        keywords, top_k=query_param.top_k, ids=query_param.ids
+    )

    if not len(results):
        return "", "", ""
@@ -1623,7 +1683,9 @@ async def naive_query(
    if cached_response is not None:
        return cached_response

-    results = await chunks_vdb.query(query, top_k=query_param.top_k)
+    results = await chunks_vdb.query(
+        query, top_k=query_param.top_k, ids=query_param.ids
+    )
    if not len(results):
        return PROMPTS["fail_response"]

@@ -1854,3 +1916,90 @@ async def kg_query_with_keywords(
        )

    return response
+
+
+async def query_with_keywords(
+    query: str,
+    prompt: str,
+    param: QueryParam,
+    knowledge_graph_inst: BaseGraphStorage,
+    entities_vdb: BaseVectorStorage,
+    relationships_vdb: BaseVectorStorage,
+    chunks_vdb: BaseVectorStorage,
+    text_chunks_db: BaseKVStorage,
+    global_config: dict[str, str],
+    hashing_kv: BaseKVStorage | None = None,
+) -> str | AsyncIterator[str]:
+    """
+    Extract keywords from the query and then use them for retrieving information.
+
+    1. Extracts high-level and low-level keywords from the query
+    2. Formats the query with the extracted keywords and prompt
+    3. Uses the appropriate query method based on param.mode
+
+    Args:
+        query: The user's query
+        prompt: Additional prompt to prepend to the query
+        param: Query parameters
+        knowledge_graph_inst: Knowledge graph storage
+        entities_vdb: Entities vector database
+        relationships_vdb: Relationships vector database
+        chunks_vdb: Document chunks vector database
+        text_chunks_db: Text chunks storage
+        global_config: Global configuration
+        hashing_kv: Cache storage
+
+    Returns:
+        Query response or async iterator
+    """
+    # Extract keywords
+    hl_keywords, ll_keywords = await extract_keywords_only(
+        text=query,
+        param=param,
+        global_config=global_config,
+        hashing_kv=hashing_kv,
+    )
+
+    param.hl_keywords = hl_keywords
+    param.ll_keywords = ll_keywords
+
+    # Create a new string with the prompt and the keywords
+    ll_keywords_str = ", ".join(ll_keywords)
+    hl_keywords_str = ", ".join(hl_keywords)
+    formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}"
+
+    # Use appropriate query method based on mode
+    if param.mode in ["local", "global", "hybrid"]:
+        return await kg_query_with_keywords(
+            formatted_question,
+            knowledge_graph_inst,
+            entities_vdb,
+            relationships_vdb,
+            text_chunks_db,
+            param,
+            global_config,
+            hashing_kv=hashing_kv,
+        )
+    elif param.mode == "naive":
+        return await naive_query(
+            formatted_question,
+            chunks_vdb,
+            text_chunks_db,
+            param,
+            global_config,
+            hashing_kv=hashing_kv,
+        )
+    elif param.mode == "mix":
+        return await mix_kg_vector_query(
+            formatted_question,
+            knowledge_graph_inst,
+            entities_vdb,
+            relationships_vdb,
+            chunks_vdb,
+            text_chunks_db,
+            param,
+            global_config,
+            hashing_kv=hashing_kv,
+        )
+    else:
+        raise ValueError(f"Unknown mode {param.mode}")
--- a/lightrag/prompt.py
+++ b/lightrag/prompt.py
@@ -236,7 +236,7 @@ Given the query and conversation history, list both high-level and low-level key
 ---Instructions---

 - Consider both the current query and relevant conversation history when extracting keywords
- Output the keywords in JSON format
+- Output the keywords in JSON format, it will be parsed by a JSON parser, do not add any extra content in output
 - The JSON should have two keys:
  - "high_level_keywords" for overarching concepts or themes
  - "low_level_keywords" for specific entities or details
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -890,3 +890,52 @@ def lazy_external_import(module_name: str, class_name: str) -> Callable[..., Any
        return cls(*args, **kwargs)

    return import_class
+
+
+def get_content_summary(content: str, max_length: int = 100) -> str:
+    """Get summary of document content
+
+    Args:
+        content: Original document content
+        max_length: Maximum length of summary
+
+    Returns:
+        Truncated content with ellipsis if needed
+    """
+    content = content.strip()
+    if len(content) <= max_length:
+        return content
+    return content[:max_length] + "..."
+
+
+def clean_text(text: str) -> str:
+    """Clean text by removing null bytes (0x00) and whitespace
+
+    Args:
+        text: Input text to clean
+
+    Returns:
+        Cleaned text
+    """
+    return text.strip().replace("\x00", "")
+
+
+def check_storage_env_vars(storage_name: str) -> None:
+    """Check if all required environment variables for storage implementation exist
+
+    Args:
+        storage_name: Storage implementation name
+
+    Raises:
+        ValueError: If required environment variables are missing
+    """
+    from lightrag.kg import STORAGE_ENV_REQUIREMENTS
+
+    required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, [])
+    missing_vars = [var for var in required_vars if var not in os.environ]
+
+    if missing_vars:
+        raise ValueError(
+            f"Storage implementation '{storage_name}' requires the following "
+            f"environment variables: {', '.join(missing_vars)}"
+        )