Merge pull request #1419 from earayu/feat/support_aget_docs_by_ids

feat: Add `aget_docs_by_ids` to check document processing status
2025-04-22 11:09:54 +08:00
parent cc67c6eb8d 7597a5bdfb
commit d9747f0fc2
1 changed files with 64 additions and 0 deletions
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -1517,6 +1517,70 @@ class LightRAG:
        """
        return await self.doc_status.get_docs_by_status(status)
    async def aget_docs_by_ids(
        self, ids: str | list[str]
    ) -> dict[str, DocProcessingStatus]:
        """Retrieves the processing status for one or more documents by their IDs.
        Args:
            ids: A single document ID (string) or a list of document IDs (list of strings).
        Returns:
            A dictionary where keys are the document IDs for which a status was found,
            and values are the corresponding DocProcessingStatus objects. IDs that
            are not found in the storage will be omitted from the result dictionary.
        """
        if isinstance(ids, str):
            # Ensure input is always a list of IDs for uniform processing
            id_list = [ids]
        elif (
            ids is None
        ):  # Handle potential None input gracefully, although type hint suggests str/list
            logger.warning(
                "aget_docs_by_ids called with None input, returning empty dict."
            )
            return {}
        else:
            # Assume input is already a list if not a string
            id_list = ids
        # Return early if the final list of IDs is empty
        if not id_list:
            logger.debug("aget_docs_by_ids called with an empty list of IDs.")
            return {}
        # Create tasks to fetch document statuses concurrently using the doc_status storage
        tasks = [self.doc_status.get_by_id(doc_id) for doc_id in id_list]
        # Execute tasks concurrently and gather the results. Results maintain order.
        # Type hint indicates results can be DocProcessingStatus or None if not found.
        results_list: list[Optional[DocProcessingStatus]] = await asyncio.gather(*tasks)
        # Build the result dictionary, mapping found IDs to their statuses
        found_statuses: dict[str, DocProcessingStatus] = {}
        # Keep track of IDs for which no status was found (for logging purposes)
        not_found_ids: list[str] = []
        # Iterate through the results, correlating them back to the original IDs
        for i, status_obj in enumerate(results_list):
            doc_id = id_list[
                i
            ]  # Get the original ID corresponding to this result index
            if status_obj:
                # If a status object was returned (not None), add it to the result dict
                found_statuses[doc_id] = status_obj
            else:
                # If status_obj is None, the document ID was not found in storage
                not_found_ids.append(doc_id)
        # Log a warning if any of the requested document IDs were not found
        if not_found_ids:
            logger.warning(
                f"Document statuses not found for the following IDs: {not_found_ids}"
            )
        # Return the dictionary containing statuses only for the found document IDs
        return found_statuses
    # TODO: Deprecated (Deleting documents can cause hallucinations in RAG.)
    # Document delete is not working properly for most of the storage implementations.
    async def adelete_by_doc_id(self, doc_id: str) -> None: