Merge branch 'main' into add-multi-worker-support

yangdx
2025-03-01 15:55:37 +08:00
31 changed files with 1755 additions and 1371 deletions

View File

@@ -1,5 +1,5 @@
from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
__version__ = "1.2.1"
__version__ = "1.2.2"
__author__ = "Zirui Guo"
__url__ = "https://github.com/HKUDS/LightRAG"

View File

@@ -20,8 +20,8 @@ def create_graph_routes(rag, api_key: Optional[str] = None):
return await rag.get_graph_labels()
@router.get("/graphs", dependencies=[Depends(optional_api_key)])
async def get_knowledge_graph(label: str):
async def get_knowledge_graph(label: str, max_depth: int = 3):
"""Get knowledge graph for a specific label"""
return await rag.get_knowledge_graph(node_label=label, max_depth=3)
return await rag.get_knowledge_graph(node_label=label, max_depth=max_depth)
return router
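
The /graphs route now forwards an optional max_depth query parameter (default 3) instead of hard-coding the expansion depth. A minimal client-side sketch, assuming the API server is reachable at http://localhost:9621 and that the label below exists; add an API-key header if your deployment requires one:

import requests

BASE_URL = "http://localhost:9621"  # assumed server address; adjust to your deployment

# Request the neighborhood of a label, now with a caller-chosen depth.
resp = requests.get(
    f"{BASE_URL}/graphs",
    params={"label": "Artificial Intelligence", "max_depth": 2},  # hypothetical label
    timeout=30,
)
resp.raise_for_status()
graph = resp.json()
print(graph)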

File diff suppressed because one or more lines are too long (4 files)

View File

@@ -2,11 +2,11 @@
<html lang="en">
<head>
<meta charset="UTF-8" />
<link rel="icon" type="image/svg+xml" href="./vite.svg" />
<link rel="icon" type="image/svg+xml" href="./logo.png" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Lightrag</title>
<script type="module" crossorigin src="./assets/index-BDX8o1Ld.js"></script>
<link rel="stylesheet" crossorigin href="./assets/index-CLsJV-0i.css">
<script type="module" crossorigin src="./assets/index-DbuMPJAD.js"></script>
<link rel="stylesheet" crossorigin href="./assets/index-rP-YlyR1.css">
</head>
<body>
<div id="root"></div>

lightrag/api/webui/logo.png (new binary file, 155 KiB)

View File

@@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="iconify iconify--logos" width="31.88" height="32" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 257"><defs><linearGradient id="IconifyId1813088fe1fbc01fb466" x1="-.828%" x2="57.636%" y1="7.652%" y2="78.411%"><stop offset="0%" stop-color="#41D1FF"></stop><stop offset="100%" stop-color="#BD34FE"></stop></linearGradient><linearGradient id="IconifyId1813088fe1fbc01fb467" x1="43.376%" x2="50.316%" y1="2.242%" y2="89.03%"><stop offset="0%" stop-color="#FFEA83"></stop><stop offset="8.333%" stop-color="#FFDD35"></stop><stop offset="100%" stop-color="#FFA800"></stop></linearGradient></defs><path fill="url(#IconifyId1813088fe1fbc01fb466)" d="M255.153 37.938L134.897 252.976c-2.483 4.44-8.862 4.466-11.382.048L.875 37.958c-2.746-4.814 1.371-10.646 6.827-9.67l120.385 21.517a6.537 6.537 0 0 0 2.322-.004l117.867-21.483c5.438-.991 9.574 4.796 6.877 9.62Z"></path><path fill="url(#IconifyId1813088fe1fbc01fb467)" d="M185.432.063L96.44 17.501a3.268 3.268 0 0 0-2.634 3.014l-5.474 92.456a3.268 3.268 0 0 0 3.997 3.378l24.777-5.718c2.318-.535 4.413 1.507 3.936 3.838l-7.361 36.047c-.495 2.426 1.782 4.5 4.151 3.78l15.304-4.649c2.372-.72 4.652 1.36 4.15 3.788l-11.698 56.621c-.732 3.542 3.979 5.473 5.943 2.437l1.313-2.028l72.516-144.72c1.215-2.423-.88-5.186-3.54-4.672l-25.505 4.922c-2.396.462-4.435-1.77-3.759-4.114l16.646-57.705c.677-2.35-1.37-4.583-3.769-4.113Z"></path></svg>

Deleted SVG icon (1.5 KiB)

View File

@@ -363,14 +363,14 @@ class LightRAG:
self.namespace_prefix, NameSpace.VECTOR_STORE_ENTITIES
),
embedding_func=self.embedding_func,
meta_fields={"entity_name"},
meta_fields={"entity_name", "source_id", "content"},
)
self.relationships_vdb: BaseVectorStorage = self.vector_db_storage_cls( # type: ignore
namespace=make_namespace(
self.namespace_prefix, NameSpace.VECTOR_STORE_RELATIONSHIPS
),
embedding_func=self.embedding_func,
meta_fields={"src_id", "tgt_id"},
meta_fields={"src_id", "tgt_id", "source_id", "content"},
)
self.chunks_vdb: BaseVectorStorage = self.vector_db_storage_cls( # type: ignore
namespace=make_namespace(
@@ -408,16 +408,31 @@ class LightRAG:
self._storages_status = StoragesStatus.CREATED
# Initialize storages
if self.auto_manage_storages_states:
loop = always_get_an_event_loop()
loop.run_until_complete(self.initialize_storages())
self._run_async_safely(self.initialize_storages, "Storage Initialization")
def __del__(self):
# Finalize storages
if self.auto_manage_storages_states:
self._run_async_safely(self.finalize_storages, "Storage Finalization")
def _run_async_safely(self, async_func, action_name=""):
"""Safely execute an async function, avoiding event loop conflicts."""
try:
loop = always_get_an_event_loop()
loop.run_until_complete(self.finalize_storages())
if loop.is_running():
task = loop.create_task(async_func())
task.add_done_callback(
lambda t: logger.info(f"{action_name} completed!")
)
else:
loop.run_until_complete(async_func())
except RuntimeError:
logger.warning(
f"No running event loop, creating a new loop for {action_name}."
)
loop = asyncio.new_event_loop()
loop.run_until_complete(async_func())
loop.close()
async def initialize_storages(self):
"""Asynchronously initialize the storages"""
@@ -491,7 +506,7 @@ class LightRAG:
input: str | list[str],
split_by_character: str | None = None,
split_by_character_only: bool = False,
ids: list[str] | None = None,
ids: str | list[str] | None = None,
) -> None:
"""Sync Insert documents with checkpoint support
@@ -500,7 +515,7 @@ class LightRAG:
split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
split_by_character_only: if split_by_character_only is True, split the string by character only, when
split_by_character is None, this parameter is ignored.
ids: list of unique document IDs, if not provided, MD5 hash IDs will be generated
ids: single string of the document ID or list of unique document IDs, if not provided, MD5 hash IDs will be generated
"""
loop = always_get_an_event_loop()
loop.run_until_complete(
@@ -512,7 +527,7 @@ class LightRAG:
input: str | list[str],
split_by_character: str | None = None,
split_by_character_only: bool = False,
ids: list[str] | None = None,
ids: str | list[str] | None = None,
) -> None:
"""Async Insert documents with checkpoint support
@@ -528,12 +543,19 @@ class LightRAG:
split_by_character, split_by_character_only
)
def insert_custom_chunks(self, full_text: str, text_chunks: list[str]) -> None:
def insert_custom_chunks(
self,
full_text: str,
text_chunks: list[str],
doc_id: str | list[str] | None = None,
) -> None:
loop = always_get_an_event_loop()
loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks))
loop.run_until_complete(
self.ainsert_custom_chunks(full_text, text_chunks, doc_id)
)
async def ainsert_custom_chunks(
self, full_text: str, text_chunks: list[str]
self, full_text: str, text_chunks: list[str], doc_id: str | None = None
) -> None:
update_storage = False
try:
@@ -542,7 +564,10 @@ class LightRAG:
text_chunks = [self.clean_text(chunk) for chunk in text_chunks]
# Process cleaned texts
doc_key = compute_mdhash_id(full_text, prefix="doc-")
if doc_id is None:
doc_key = compute_mdhash_id(full_text, prefix="doc-")
else:
doc_key = doc_id
new_docs = {doc_key: {"content": full_text}}
_add_doc_keys = await self.full_docs.filter_keys({doc_key})
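
insert_custom_chunks can now pin the document to a caller-supplied ID instead of the MD5-derived "doc-" key. A minimal usage sketch, assuming an already constructed and initialized LightRAG instance named rag; the content and ID are hypothetical:

full_text = "LightRAG combines graph structures with vector retrieval."
chunks = ["LightRAG combines graph structures with vector retrieval."]

# Without doc_id the key is compute_mdhash_id(full_text, prefix="doc-");
# with doc_id the caller controls the key, making later lookup and deletion predictable.
rag.insert_custom_chunks(full_text, chunks, doc_id="doc-my-custom-id")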
@@ -598,6 +623,8 @@ class LightRAG:
"""
if isinstance(input, str):
input = [input]
if isinstance(ids, str):
ids = [ids]
# 1. Validate ids if provided or generate MD5 hash IDs
if ids is not None:
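
insert/ainsert now normalize a single string ID into a one-element list, so both calls below are valid. A usage sketch assuming an initialized LightRAG instance rag and hypothetical IDs:

# Single document with a single string ID (new in this change).
rag.insert("Doc about graph RAG.", ids="doc-graph-rag")

# Equivalent list form; when provided, one ID per document is expected.
rag.insert(
    ["Doc about graph RAG.", "Doc about vector search."],
    ids=["doc-graph-rag", "doc-vector-search"],
)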
@@ -1366,12 +1393,14 @@ class LightRAG:
logger.debug(f"Starting deletion for document {doc_id}")
doc_to_chunk_id = doc_id.replace("doc", "chunk")
# 2. Get all related chunks
chunks = await self.text_chunks.get_by_id(doc_id)
chunks = await self.text_chunks.get_by_id(doc_to_chunk_id)
if not chunks:
return
chunk_ids = list(chunks.keys())
chunk_ids = {chunks["full_doc_id"].replace("doc", "chunk")}
logger.debug(f"Found {len(chunk_ids)} chunks to delete")
# 3. Before deleting, check the related entities and relationships for these chunks
@@ -1380,7 +1409,7 @@ class LightRAG:
entities = [
dp
for dp in self.entities_vdb.client_storage["data"]
if dp.get("source_id") == chunk_id
if chunk_id in dp.get("source_id")
]
logger.debug(f"Chunk {chunk_id} has {len(entities)} related entities")
@@ -1388,7 +1417,7 @@ class LightRAG:
relations = [
dp
for dp in self.relationships_vdb.client_storage["data"]
if dp.get("source_id") == chunk_id
if chunk_id in dp.get("source_id")
]
logger.debug(f"Chunk {chunk_id} has {len(relations)} related relations")
@@ -1499,42 +1528,71 @@ class LightRAG:
f"Updated {len(entities_to_update)} entities and {len(relationships_to_update)} relationships."
)
async def process_data(data_type, vdb, chunk_id):
# Check data (entities or relationships)
data_with_chunk = [
dp
for dp in vdb.client_storage["data"]
if chunk_id in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
]
data_for_vdb = {}
if data_with_chunk:
logger.warning(
f"found {len(data_with_chunk)} {data_type} still referencing chunk {chunk_id}"
)
for item in data_with_chunk:
old_sources = item["source_id"].split(GRAPH_FIELD_SEP)
new_sources = [src for src in old_sources if src != chunk_id]
if not new_sources:
logger.info(
f"{data_type} {item.get('entity_name', 'N/A')} is deleted because source_id is not exists"
)
await vdb.delete_entity(item)
else:
item["source_id"] = GRAPH_FIELD_SEP.join(new_sources)
item_id = item["__id__"]
data_for_vdb[item_id] = item.copy()
if data_type == "entities":
data_for_vdb[item_id]["content"] = data_for_vdb[
item_id
].get("content") or (
item.get("entity_name", "")
+ (item.get("description") or "")
)
else: # relationships
data_for_vdb[item_id]["content"] = data_for_vdb[
item_id
].get("content") or (
(item.get("keywords") or "")
+ (item.get("src_id") or "")
+ (item.get("tgt_id") or "")
+ (item.get("description") or "")
)
if data_for_vdb:
await vdb.upsert(data_for_vdb)
logger.info(f"Successfully updated {data_type} in vector DB")
# Add verification step
async def verify_deletion():
# Verify if the document has been deleted
if await self.full_docs.get_by_id(doc_id):
logger.error(f"Document {doc_id} still exists in full_docs")
logger.warning(f"Document {doc_id} still exists in full_docs")
# Verify if chunks have been deleted
remaining_chunks = await self.text_chunks.get_by_id(doc_id)
remaining_chunks = await self.text_chunks.get_by_id(doc_to_chunk_id)
if remaining_chunks:
logger.error(f"Found {len(remaining_chunks)} remaining chunks")
logger.warning(f"Found {len(remaining_chunks)} remaining chunks")
# Verify entities and relationships
for chunk_id in chunk_ids:
# Check entities
entities_with_chunk = [
dp
for dp in self.entities_vdb.client_storage["data"]
if chunk_id
in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
]
if entities_with_chunk:
logger.error(
f"Found {len(entities_with_chunk)} entities still referencing chunk {chunk_id}"
)
# Check relationships
relations_with_chunk = [
dp
for dp in self.relationships_vdb.client_storage["data"]
if chunk_id
in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
]
if relations_with_chunk:
logger.error(
f"Found {len(relations_with_chunk)} relations still referencing chunk {chunk_id}"
)
await process_data("entities", self.entities_vdb, chunk_id)
await process_data(
"relationships", self.relationships_vdb, chunk_id
)
await verify_deletion()
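
process_data prunes the deleted chunk from each entity's or relationship's GRAPH_FIELD_SEP-joined source_id, and only deletes the record when no sources remain. A self-contained sketch of that pruning step; GRAPH_FIELD_SEP is assumed to be the "<SEP>" separator used by LightRAG's prompts, and the sample records are made up:

GRAPH_FIELD_SEP = "<SEP>"  # assumed value of the separator constant

def prune_chunk_from_source_id(record: dict, chunk_id: str) -> dict | None:
    """Return the updated record, or None when the last source was removed."""
    old_sources = (record.get("source_id") or "").split(GRAPH_FIELD_SEP)
    new_sources = [src for src in old_sources if src != chunk_id]
    if not new_sources:
        return None  # caller should delete the record from the vector DB
    updated = record.copy()
    updated["source_id"] = GRAPH_FIELD_SEP.join(new_sources)
    return updated

entity = {"entity_name": "LightRAG", "source_id": "chunk-aaa<SEP>chunk-bbb"}
print(prune_chunk_from_source_id(entity, "chunk-aaa"))  # source_id becomes "chunk-bbb"

only = {"entity_name": "LightRAG", "source_id": "chunk-aaa"}
print(prune_chunk_from_source_id(only, "chunk-aaa"))    # None -> record should be deleted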

View File

@@ -323,6 +323,7 @@ async def _merge_edges_then_upsert(
tgt_id=tgt_id,
description=description,
keywords=keywords,
source_id=source_id,
)
return edge_data
@@ -365,7 +366,7 @@ async def extract_entities(
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
entity_types=",".join(entity_types),
entity_types=", ".join(entity_types),
language=language,
)
# add example's format
@@ -562,6 +563,7 @@ async def extract_entities(
compute_mdhash_id(dp["entity_name"], prefix="ent-"): {
"content": dp["entity_name"] + dp["description"],
"entity_name": dp["entity_name"],
"source_id": dp["source_id"],
}
for dp in all_entities_data
}
@@ -572,6 +574,7 @@ async def extract_entities(
compute_mdhash_id(dp["src_id"] + dp["tgt_id"], prefix="rel-"): {
"src_id": dp["src_id"],
"tgt_id": dp["tgt_id"],
"source_id": dp["source_id"],
"content": dp["keywords"]
+ dp["src_id"]
+ dp["tgt_id"]
@@ -595,7 +598,7 @@ async def kg_query(
global_config: dict[str, str],
hashing_kv: BaseKVStorage | None = None,
system_prompt: str | None = None,
) -> str:
) -> str | AsyncIterator[str]:
# Handle cache
use_model_func = global_config["llm_model_func"]
args_hash = compute_args_hash(query_param.mode, query, cache_type="query")
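
With the widened return type, kg_query (and the query paths built on it) may hand back either a complete string or an async iterator of chunks, so callers should branch on the type. A hedged consumer sketch, assuming an initialized LightRAG instance rag and that QueryParam exposes a stream flag:

import asyncio
from lightrag import QueryParam

async def ask(rag, question: str):
    result = await rag.aquery(question, param=QueryParam(mode="hybrid", stream=True))
    if isinstance(result, str):
        # Non-streaming path: the full answer is already assembled.
        print(result)
    else:
        # Streaming path: result is an AsyncIterator[str] of answer chunks.
        async for chunk in result:
            print(chunk, end="", flush=True)

# asyncio.run(ask(rag, "What does LightRAG do?"))  # rag assumed to exist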
@@ -1127,7 +1130,7 @@ async def _get_node_data(
len_node_datas = len(node_datas)
node_datas = truncate_list_by_token_size(
node_datas,
key=lambda x: x["description"],
key=lambda x: x["description"] if x["description"] is not None else "",
max_token_size=query_param.max_token_for_local_context,
)
logger.debug(
@@ -1310,7 +1313,7 @@ async def _find_most_related_edges_from_entities(
)
all_edges_data = truncate_list_by_token_size(
all_edges_data,
key=lambda x: x["description"],
key=lambda x: x["description"] if x["description"] is not None else "",
max_token_size=query_param.max_token_for_global_context,
)
@@ -1364,7 +1367,7 @@ async def _get_edge_data(
)
edge_datas = truncate_list_by_token_size(
edge_datas,
key=lambda x: x["description"],
key=lambda x: x["description"] if x["description"] is not None else "",
max_token_size=query_param.max_token_for_global_context,
)
use_entities, use_text_units = await asyncio.gather(
@@ -1468,7 +1471,7 @@ async def _find_most_related_entities_from_relationships(
len_node_datas = len(node_datas)
node_datas = truncate_list_by_token_size(
node_datas,
key=lambda x: x["description"],
key=lambda x: x["description"] if x["description"] is not None else "",
max_token_size=query_param.max_token_for_local_context,
)
logger.debug(
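
These truncation calls now guard against a missing description, since passing None into the token-size computation would raise a TypeError. A tiny illustration of the guarded key function; the sample data is made up:

node_datas = [
    {"entity_name": "LightRAG", "description": "Graph-based RAG framework."},
    {"entity_name": "Unknown", "description": None},  # would break a bare x["description"]
]

def safe_key(x):
    # Mirrors the new key= argument: fall back to "" when description is None.
    return x["description"] if x["description"] is not None else ""

print([len(safe_key(dp)) for dp in node_datas])  # no TypeError on the None entry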

View File

@@ -47,8 +47,9 @@ Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_
#############################
---Real Data---
######################
Entity_types: {entity_types}
Text: {input_text}
Entity_types: [{entity_types}]
Text:
{input_text}
######################
Output:"""
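
Together with the entity_types join change above (", ".join(entity_types)), the Real Data block now renders the types as a bracketed, comma-spaced list and puts the text on its own line. A reduced formatting sketch with hypothetical values, not the full prompt:

entity_types = ["organization", "person", "geo", "event"]
template = "Entity_types: [{entity_types}]\nText:\n{input_text}"

print(template.format(
    entity_types=", ".join(entity_types),  # ", " join matches the operate.py change
    input_text="Tesla opened a new factory in Berlin in 2022.",
))
# Entity_types: [organization, person, geo, event]
# Text:
# Tesla opened a new factory in Berlin in 2022.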