From aaa8194423e18db4503b4b04fa5543bd63980b41 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 5 Mar 2025 15:32:39 +0100 Subject: [PATCH 01/54] Update document_routes.py --- lightrag/api/routers/document_routes.py | 114 +++++++++++++++--------- 1 file changed, 73 insertions(+), 41 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index d9dfe913..9d161f6c 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -16,7 +16,7 @@ from pydantic import BaseModel, Field, field_validator from lightrag import LightRAG from lightrag.base import DocProcessingStatus, DocStatus -from ..utils_api import get_api_key_dependency +from lightrag.api.utils_api import get_api_key_dependency, global_args router = APIRouter(prefix="/documents", tags=["documents"]) @@ -237,54 +237,86 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: ) return False case ".pdf": - if not pm.is_installed("pypdf2"): # type: ignore - pm.install("pypdf2") - from PyPDF2 import PdfReader # type: ignore - from io import BytesIO + if global_args["main_args"].document_loading_tool=="DOCLING": + if not pm.is_installed("docling"): # type: ignore + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + else: + if not pm.is_installed("pypdf2"): # type: ignore + pm.install("pypdf2") + from PyPDF2 import PdfReader # type: ignore + from io import BytesIO - pdf_file = BytesIO(file) - reader = PdfReader(pdf_file) - for page in reader.pages: - content += page.extract_text() + "\n" + pdf_file = BytesIO(file) + reader = PdfReader(pdf_file) + for page in reader.pages: + content += page.extract_text() + "\n" case ".docx": - if not pm.is_installed("python-docx"): # type: ignore - pm.install("docx") - from docx import Document # type: ignore - from io import BytesIO + if global_args["main_args"].document_loading_tool=="DOCLING": + if not pm.is_installed("docling"): # type: ignore + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + else: + if not pm.is_installed("python-docx"): # type: ignore + pm.install("docx") + from docx import Document # type: ignore + from io import BytesIO - docx_file = BytesIO(file) - doc = Document(docx_file) - content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) + docx_file = BytesIO(file) + doc = Document(docx_file) + content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) case ".pptx": - if not pm.is_installed("python-pptx"): # type: ignore - pm.install("pptx") - from pptx import Presentation # type: ignore - from io import BytesIO + if global_args["main_args"].document_loading_tool=="DOCLING": + if not pm.is_installed("docling"): # type: ignore + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + else: + if not pm.is_installed("python-pptx"): # type: ignore + pm.install("pptx") + from pptx import Presentation # type: ignore + from io import BytesIO - pptx_file = BytesIO(file) - prs = Presentation(pptx_file) - for slide in prs.slides: - for shape in slide.shapes: - if hasattr(shape, "text"): - content += 
shape.text + "\n" + pptx_file = BytesIO(file) + prs = Presentation(pptx_file) + for slide in prs.slides: + for shape in slide.shapes: + if hasattr(shape, "text"): + content += shape.text + "\n" case ".xlsx": - if not pm.is_installed("openpyxl"): # type: ignore - pm.install("openpyxl") - from openpyxl import load_workbook # type: ignore - from io import BytesIO + if global_args["main_args"].document_loading_tool=="DOCLING": + if not pm.is_installed("docling"): # type: ignore + pm.install("docling") + from docling.document_converter import DocumentConverter + converter = DocumentConverter() + result = converter.convert(file_path) + content = result.document.export_to_markdown() + else: + if not pm.is_installed("openpyxl"): # type: ignore + pm.install("openpyxl") + from openpyxl import load_workbook # type: ignore + from io import BytesIO - xlsx_file = BytesIO(file) - wb = load_workbook(xlsx_file) - for sheet in wb: - content += f"Sheet: {sheet.title}\n" - for row in sheet.iter_rows(values_only=True): - content += ( - "\t".join( - str(cell) if cell is not None else "" for cell in row + xlsx_file = BytesIO(file) + wb = load_workbook(xlsx_file) + for sheet in wb: + content += f"Sheet: {sheet.title}\n" + for row in sheet.iter_rows(values_only=True): + content += ( + "\t".join( + str(cell) if cell is not None else "" for cell in row + ) + + "\n" ) - + "\n" - ) - content += "\n" + content += "\n" case _: logger.error( f"Unsupported file type: {file_path.name} (extension {ext})" From 95a6a274ca7d0588e72e76f8eb445870a128f868 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 5 Mar 2025 15:33:06 +0100 Subject: [PATCH 02/54] Update ollama_api.py --- lightrag/api/routers/ollama_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/api/routers/ollama_api.py b/lightrag/api/routers/ollama_api.py index 9688d073..37d7354e 100644 --- a/lightrag/api/routers/ollama_api.py +++ b/lightrag/api/routers/ollama_api.py @@ -11,7 +11,7 @@ import asyncio from ascii_colors import trace_exception from lightrag import LightRAG, QueryParam from lightrag.utils import encode_string_by_tiktoken -from ..utils_api import ollama_server_infos +from lightrag.api.utils_api import ollama_server_infos # query mode according to query prefix (bypass is not LightRAG quer mode) From c62422eadee4ac9f666c55fa04706ce52812fd32 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 5 Mar 2025 15:33:54 +0100 Subject: [PATCH 03/54] Update utils_api.py --- lightrag/api/utils_api.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lightrag/api/utils_api.py b/lightrag/api/utils_api.py index ed1250d4..39b2950f 100644 --- a/lightrag/api/utils_api.py +++ b/lightrag/api/utils_api.py @@ -17,6 +17,10 @@ from starlette.status import HTTP_403_FORBIDDEN # Load environment variables load_dotenv(override=True) +global_args={ + "main_args":None +} + class OllamaServerInfos: # Constants for emulated Ollama model information @@ -340,9 +344,13 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace: # Inject chunk configuration args.chunk_size = get_env_value("CHUNK_SIZE", 1200, int) args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int) + + # Select Document loading tool + args.document_loading_tool = get_env_value("DOCUMENT_LOADING_TOOL", "DOCLING") ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name + global_args["main_args"]= args return args From 39c24f4a597c9e82e45975e322ee28156a6fb202 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 5 Mar 2025 15:36:17 
+0100 Subject: [PATCH 04/54] Update utils_api.py --- lightrag/api/utils_api.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lightrag/api/utils_api.py b/lightrag/api/utils_api.py index 39b2950f..8ba4565f 100644 --- a/lightrag/api/utils_api.py +++ b/lightrag/api/utils_api.py @@ -17,9 +17,7 @@ from starlette.status import HTTP_403_FORBIDDEN # Load environment variables load_dotenv(override=True) -global_args={ - "main_args":None -} +global_args = {"main_args": None} class OllamaServerInfos: @@ -344,13 +342,13 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace: # Inject chunk configuration args.chunk_size = get_env_value("CHUNK_SIZE", 1200, int) args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int) - + # Select Document loading tool args.document_loading_tool = get_env_value("DOCUMENT_LOADING_TOOL", "DOCLING") ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name - global_args["main_args"]= args + global_args["main_args"] = args return args From 6e4daea056940b17f6773c59e492bd8a5eb5d308 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 5 Mar 2025 15:36:47 +0100 Subject: [PATCH 05/54] Linting --- lightrag/api/routers/document_routes.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 9d161f6c..a6830389 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -237,10 +237,11 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: ) return False case ".pdf": - if global_args["main_args"].document_loading_tool=="DOCLING": + if global_args["main_args"].document_loading_tool == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() result = converter.convert(file_path) content = result.document.export_to_markdown() @@ -255,10 +256,11 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: for page in reader.pages: content += page.extract_text() + "\n" case ".docx": - if global_args["main_args"].document_loading_tool=="DOCLING": + if global_args["main_args"].document_loading_tool == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() result = converter.convert(file_path) content = result.document.export_to_markdown() @@ -270,12 +272,15 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: docx_file = BytesIO(file) doc = Document(docx_file) - content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) + content = "\n".join( + [paragraph.text for paragraph in doc.paragraphs] + ) case ".pptx": - if global_args["main_args"].document_loading_tool=="DOCLING": + if global_args["main_args"].document_loading_tool == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() result = converter.convert(file_path) content = result.document.export_to_markdown() @@ -292,10 +297,11 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: if hasattr(shape, "text"): content += shape.text + "\n" case ".xlsx": - if global_args["main_args"].document_loading_tool=="DOCLING": + if global_args["main_args"].document_loading_tool == 
"DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter + converter = DocumentConverter() result = converter.convert(file_path) content = result.document.export_to_markdown() @@ -312,7 +318,8 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: for row in sheet.iter_rows(values_only=True): content += ( "\t".join( - str(cell) if cell is not None else "" for cell in row + str(cell) if cell is not None else "" + for cell in row ) + "\n" ) From 00f3c6c6ddce60687d25c4c7022efb5bba1e4b5d Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Thu, 6 Mar 2025 01:11:48 +0100 Subject: [PATCH 06/54] Upgraded document loading engine --- lightrag/api/routers/document_routes.py | 8 ++++---- lightrag/api/utils_api.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index a6830389..dcb8f961 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -237,7 +237,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: ) return False case ".pdf": - if global_args["main_args"].document_loading_tool == "DOCLING": + if global_args["main_args"].document_loading_engine == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter @@ -256,7 +256,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: for page in reader.pages: content += page.extract_text() + "\n" case ".docx": - if global_args["main_args"].document_loading_tool == "DOCLING": + if global_args["main_args"].document_loading_engine == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter @@ -276,7 +276,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: [paragraph.text for paragraph in doc.paragraphs] ) case ".pptx": - if global_args["main_args"].document_loading_tool == "DOCLING": + if global_args["main_args"].document_loading_engine == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter @@ -297,7 +297,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: if hasattr(shape, "text"): content += shape.text + "\n" case ".xlsx": - if global_args["main_args"].document_loading_tool == "DOCLING": + if global_args["main_args"].document_loading_engine == "DOCLING": if not pm.is_installed("docling"): # type: ignore pm.install("docling") from docling.document_converter import DocumentConverter diff --git a/lightrag/api/utils_api.py b/lightrag/api/utils_api.py index 8ba4565f..ae674968 100644 --- a/lightrag/api/utils_api.py +++ b/lightrag/api/utils_api.py @@ -344,7 +344,7 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace: args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int) # Select Document loading tool - args.document_loading_tool = get_env_value("DOCUMENT_LOADING_TOOL", "DOCLING") + args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DOCLING") ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name From 6e3b23069c0a76a5bfa7e27189faac57ff7d0691 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 7 Mar 2025 16:43:18 +0800 Subject: [PATCH 07/54] - Remove useless `_label_exists` method --- lightrag/kg/neo4j_impl.py 
| 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py
index fec39138..2498341d 100644
--- a/lightrag/kg/neo4j_impl.py
+++ b/lightrag/kg/neo4j_impl.py
@@ -164,23 +164,13 @@ class Neo4JStorage(BaseGraphStorage):
         # Neo4j handles persistence automatically
         pass
 
-    async def _label_exists(self, label: str) -> bool:
-        """Check if a label exists in the Neo4j database."""
-        query = "CALL db.labels() YIELD label RETURN label"
-        try:
-            async with self._driver.session(database=self._DATABASE) as session:
-                result = await session.run(query)
-                labels = [record["label"] for record in await result.data()]
-                return label in labels
-        except Exception as e:
-            logger.error(f"Error checking label existence: {e}")
-            return False
-
     async def _ensure_label(self, label: str) -> str:
-        """Ensure a label exists by validating it."""
+        """Ensure a label is valid
+
+        Args:
+            label: The label to validate
+        """
         clean_label = label.strip('"')
-        if not await self._label_exists(clean_label):
-            logger.warning(f"Label '{clean_label}' does not exist in Neo4j")
         return clean_label
 
     async def has_node(self, node_id: str) -> bool:
@@ -290,7 +280,7 @@ class Neo4JStorage(BaseGraphStorage):
         if record:
             try:
                 result = dict(record["edge_properties"])
-                logger.info(f"Result: {result}")
+                logger.debug(f"Result: {result}")
                 # Ensure required keys exist with defaults
                 required_keys = {
                     "weight": 0.0,

From 0ee2e7fd4800050ef2d1819c157a196ed66cf4fa Mon Sep 17 00:00:00 2001
From: yangdx
Date: Fri, 7 Mar 2025 16:56:48 +0800
Subject: [PATCH 08/54] Suppress Neo4j warning logs by setting logger level.

---
 lightrag/kg/neo4j_impl.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py
index 2498341d..265c0347 100644
--- a/lightrag/kg/neo4j_impl.py
+++ b/lightrag/kg/neo4j_impl.py
@@ -15,6 +15,7 @@ from tenacity import (
     retry_if_exception_type,
 )
 
+import logging
 from ..utils import logger
 from ..base import BaseGraphStorage
 from ..types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
@@ -37,6 +38,8 @@ config.read("config.ini", "utf-8")
 # Get maximum number of graph nodes from environment variable, default is 1000
 MAX_GRAPH_NODES = int(os.getenv("MAX_GRAPH_NODES", 1000))
 
+# Set neo4j logger level to ERROR to suppress warning logs
+logging.getLogger("neo4j").setLevel(logging.ERROR)
 
 @final
 @dataclass

From af803f4e7ad3267fcd184fd6c3914b4c6b2c6bef Mon Sep 17 00:00:00 2001
From: yangdx
Date: Sat, 8 Mar 2025 01:20:36 +0800
Subject: [PATCH 09/54] Refactor Neo4J graph query with min_degree and
 inclusive match support

---
 lightrag/kg/neo4j_impl.py | 434 ++++++++++++++++++++++++--------------
 1 file changed, 275 insertions(+), 159 deletions(-)

diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py
index 265c0347..f6567249 100644
--- a/lightrag/kg/neo4j_impl.py
+++ b/lightrag/kg/neo4j_impl.py
@@ -41,6 +41,7 @@ MAX_GRAPH_NODES = int(os.getenv("MAX_GRAPH_NODES", 1000))
 # Set neo4j logger level to ERROR to suppress warning logs
 logging.getLogger("neo4j").setLevel(logging.ERROR)
 
+
 @final
 @dataclass
 class Neo4JStorage(BaseGraphStorage):
@@ -63,19 +64,25 @@ class Neo4JStorage(BaseGraphStorage):
         MAX_CONNECTION_POOL_SIZE = int(
             os.environ.get(
                 "NEO4J_MAX_CONNECTION_POOL_SIZE",
-                config.get("neo4j", "connection_pool_size", fallback=800),
+                config.get("neo4j", "connection_pool_size", fallback=50),  # Reduced from 800
             )
         )
         CONNECTION_TIMEOUT = float(
             os.environ.get(
                 "NEO4J_CONNECTION_TIMEOUT",
-                config.get("neo4j", 
"connection_timeout", fallback=60.0), + config.get("neo4j", "connection_timeout", fallback=30.0), # Reduced from 60.0 ), ) CONNECTION_ACQUISITION_TIMEOUT = float( os.environ.get( "NEO4J_CONNECTION_ACQUISITION_TIMEOUT", - config.get("neo4j", "connection_acquisition_timeout", fallback=60.0), + config.get("neo4j", "connection_acquisition_timeout", fallback=30.0), # Reduced from 60.0 + ), + ) + MAX_TRANSACTION_RETRY_TIME = float( + os.environ.get( + "NEO4J_MAX_TRANSACTION_RETRY_TIME", + config.get("neo4j", "max_transaction_retry_time", fallback=30.0), ), ) DATABASE = os.environ.get( @@ -88,6 +95,7 @@ class Neo4JStorage(BaseGraphStorage): max_connection_pool_size=MAX_CONNECTION_POOL_SIZE, connection_timeout=CONNECTION_TIMEOUT, connection_acquisition_timeout=CONNECTION_ACQUISITION_TIMEOUT, + max_transaction_retry_time=MAX_TRANSACTION_RETRY_TIME, ) # Try to connect to the database @@ -169,21 +177,24 @@ class Neo4JStorage(BaseGraphStorage): async def _ensure_label(self, label: str) -> str: """Ensure a label is valid - + Args: label: The label to validate """ clean_label = label.strip('"') + if not clean_label: + raise ValueError("Neo4j: Label cannot be empty") return clean_label async def has_node(self, node_id: str) -> bool: entity_name_label = await self._ensure_label(node_id) - async with self._driver.session(database=self._DATABASE) as session: + async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: query = ( f"MATCH (n:`{entity_name_label}`) RETURN count(n) > 0 AS node_exists" ) result = await session.run(query) single_result = await result.single() + await result.consume() # Ensure result is fully consumed logger.debug( f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{single_result['node_exists']}" ) @@ -193,13 +204,14 @@ class Neo4JStorage(BaseGraphStorage): entity_name_label_source = source_node_id.strip('"') entity_name_label_target = target_node_id.strip('"') - async with self._driver.session(database=self._DATABASE) as session: + async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: query = ( f"MATCH (a:`{entity_name_label_source}`)-[r]-(b:`{entity_name_label_target}`) " "RETURN COUNT(r) > 0 AS edgeExists" ) result = await session.run(query) single_result = await result.single() + await result.consume() # Ensure result is fully consumed logger.debug( f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{single_result['edgeExists']}" ) @@ -215,13 +227,16 @@ class Neo4JStorage(BaseGraphStorage): dict: Node properties if found None: If node not found """ - async with self._driver.session(database=self._DATABASE) as session: + async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: entity_name_label = await self._ensure_label(node_id) query = f"MATCH (n:`{entity_name_label}`) RETURN n" result = await session.run(query) - record = await result.single() - if record: - node = record["n"] + records = await result.fetch(2) # Get up to 2 records to check for duplicates + await result.consume() # Ensure result is fully consumed + if len(records) > 1: + logger.warning(f"Multiple nodes found with label '{entity_name_label}'. 
Using first node.") + if records: + node = records[0]["n"] node_dict = dict(node) logger.debug( f"{inspect.currentframe().f_code.co_name}: query: {query}, result: {node_dict}" @@ -230,23 +245,40 @@ class Neo4JStorage(BaseGraphStorage): return None async def node_degree(self, node_id: str) -> int: + """Get the degree (number of relationships) of a node with the given label. + If multiple nodes have the same label, returns the degree of the first node. + If no node is found, returns 0. + + Args: + node_id: The label of the node + + Returns: + int: The number of relationships the node has, or 0 if no node found + """ entity_name_label = node_id.strip('"') - async with self._driver.session(database=self._DATABASE) as session: + async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: query = f""" MATCH (n:`{entity_name_label}`) - RETURN COUNT{{ (n)--() }} AS totalEdgeCount + OPTIONAL MATCH (n)-[r]-() + RETURN n, COUNT(r) AS degree """ result = await session.run(query) - record = await result.single() - if record: - edge_count = record["totalEdgeCount"] - logger.debug( - f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{edge_count}" - ) - return edge_count - else: - return None + records = await result.fetch(100) + await result.consume() # Ensure result is fully consumed + + if not records: + logger.warning(f"No node found with label '{entity_name_label}'") + return 0 + + if len(records) > 1: + logger.warning(f"Multiple nodes ({len(records)}) found with label '{entity_name_label}', using first node's degree") + + degree = records[0]["degree"] + logger.debug( + f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{degree}" + ) + return degree async def edge_degree(self, src_id: str, tgt_id: str) -> int: entity_name_label_source = src_id.strip('"') @@ -264,6 +296,31 @@ class Neo4JStorage(BaseGraphStorage): ) return degrees + async def check_duplicate_nodes(self) -> list[tuple[str, int]]: + """Find all labels that have multiple nodes + + Returns: + list[tuple[str, int]]: List of tuples containing (label, node_count) for labels with multiple nodes + """ + async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: + query = """ + MATCH (n) + WITH labels(n) as nodeLabels + UNWIND nodeLabels as label + WITH label, count(*) as node_count + WHERE node_count > 1 + RETURN label, node_count + ORDER BY node_count DESC + """ + result = await session.run(query) + duplicates = [] + async for record in result: + label = record["label"] + count = record["node_count"] + logger.info(f"Found {count} nodes with label: {label}") + duplicates.append((label, count)) + return duplicates + async def get_edge( self, source_node_id: str, target_node_id: str ) -> dict[str, str] | None: @@ -271,18 +328,21 @@ class Neo4JStorage(BaseGraphStorage): entity_name_label_source = source_node_id.strip('"') entity_name_label_target = target_node_id.strip('"') - async with self._driver.session(database=self._DATABASE) as session: + async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: query = f""" - MATCH (start:`{entity_name_label_source}`)-[r]->(end:`{entity_name_label_target}`) + MATCH (start:`{entity_name_label_source}`)-[r]-(end:`{entity_name_label_target}`) RETURN properties(r) as edge_properties - LIMIT 1 """ result = await session.run(query) - record = await result.single() - if record: + records = await result.fetch(2) # Get up to 2 records to check for duplicates + if len(records) > 
1: + logger.warning( + f"Multiple edges found between '{entity_name_label_source}' and '{entity_name_label_target}'. Using first edge." + ) + if records: try: - result = dict(record["edge_properties"]) + result = dict(records[0]["edge_properties"]) logger.debug(f"Result: {result}") # Ensure required keys exist with defaults required_keys = { @@ -349,24 +409,27 @@ class Neo4JStorage(BaseGraphStorage): query = f"""MATCH (n:`{node_label}`) OPTIONAL MATCH (n)-[r]-(connected) RETURN n, r, connected""" - async with self._driver.session(database=self._DATABASE) as session: + async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: results = await session.run(query) edges = [] - async for record in results: - source_node = record["n"] - connected_node = record["connected"] + try: + async for record in results: + source_node = record["n"] + connected_node = record["connected"] - source_label = ( - list(source_node.labels)[0] if source_node.labels else None - ) - target_label = ( - list(connected_node.labels)[0] - if connected_node and connected_node.labels - else None - ) + source_label = ( + list(source_node.labels)[0] if source_node.labels else None + ) + target_label = ( + list(connected_node.labels)[0] + if connected_node and connected_node.labels + else None + ) - if source_label and target_label: - edges.append((source_label, target_label)) + if source_label and target_label: + edges.append((source_label, target_label)) + finally: + await results.consume() # Ensure results are consumed even if processing fails return edges @@ -427,30 +490,46 @@ class Neo4JStorage(BaseGraphStorage): ) -> None: """ Upsert an edge and its properties between two nodes identified by their labels. + Checks if both source and target nodes exist before creating the edge. 
Args: source_node_id (str): Label of the source node (used as identifier) target_node_id (str): Label of the target node (used as identifier) edge_data (dict): Dictionary of properties to set on the edge + + Raises: + ValueError: If either source or target node does not exist """ source_label = await self._ensure_label(source_node_id) target_label = await self._ensure_label(target_node_id) edge_properties = edge_data + # Check if both nodes exist + source_exists = await self.has_node(source_label) + target_exists = await self.has_node(target_label) + + if not source_exists: + raise ValueError(f"Neo4j: source node with label '{source_label}' does not exist") + if not target_exists: + raise ValueError(f"Neo4j: target node with label '{target_label}' does not exist") + async def _do_upsert_edge(tx: AsyncManagedTransaction): query = f""" MATCH (source:`{source_label}`) WITH source MATCH (target:`{target_label}`) - MERGE (source)-[r:DIRECTED]->(target) + MERGE (source)-[r:DIRECTED]-(target) SET r += $properties RETURN r """ result = await tx.run(query, properties=edge_properties) - record = await result.single() - logger.debug( - f"Upserted edge from '{source_label}' to '{target_label}' with properties: {edge_properties}, result: {record['r'] if record else None}" - ) + try: + record = await result.single() + logger.debug( + f"Upserted edge from '{source_label}' to '{target_label}' with properties: {edge_properties}, result: {record['r'] if record else None}" + ) + finally: + await result.consume() # Ensure result is consumed try: async with self._driver.session(database=self._DATABASE) as session: @@ -463,145 +542,179 @@ class Neo4JStorage(BaseGraphStorage): print("Implemented but never called.") async def get_knowledge_graph( - self, node_label: str, max_depth: int = 5 + self, + node_label: str, + max_depth: int = 3, + min_degree: int = 0, + inclusive: bool = False, ) -> KnowledgeGraph: """ Retrieve a connected subgraph of nodes where the label includes the specified `node_label`. Maximum number of nodes is constrained by the environment variable `MAX_GRAPH_NODES` (default: 1000). When reducing the number of nodes, the prioritization criteria are as follows: - 1. Label matching nodes take precedence (nodes containing the specified label string) - 2. Followed by nodes directly connected to the matching nodes - 3. Finally, the degree of the nodes + 1. min_degree does not affect nodes directly connected to the matching nodes + 2. Label matching nodes take precedence + 3. Followed by nodes directly connected to the matching nodes + 4. Finally, the degree of the nodes Args: - node_label (str): String to match in node labels (will match any node containing this string in its label) - max_depth (int, optional): Maximum depth of the graph. Defaults to 5. + node_label: Label of the starting node + max_depth: Maximum depth of the subgraph + min_degree: Minimum degree of nodes to include. 
Defaults to 0 + inclusive: Do an inclusive search if true Returns: KnowledgeGraph: Complete connected subgraph for specified node """ label = node_label.strip('"') - # Escape single quotes to prevent injection attacks - escaped_label = label.replace("'", "\\'") result = KnowledgeGraph() seen_nodes = set() seen_edges = set() - async with self._driver.session(database=self._DATABASE) as session: + async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: try: if label == "*": main_query = """ MATCH (n) OPTIONAL MATCH (n)-[r]-() WITH n, count(r) AS degree + WHERE degree >= $min_degree ORDER BY degree DESC LIMIT $max_nodes - WITH collect(n) AS nodes - MATCH (a)-[r]->(b) - WHERE a IN nodes AND b IN nodes - RETURN nodes, collect(DISTINCT r) AS relationships + WITH collect({node: n}) AS filtered_nodes + UNWIND filtered_nodes AS node_info + WITH collect(node_info.node) AS kept_nodes, filtered_nodes + MATCH (a)-[r]-(b) + WHERE a IN kept_nodes AND b IN kept_nodes + RETURN filtered_nodes AS node_info, + collect(DISTINCT r) AS relationships """ result_set = await session.run( - main_query, {"max_nodes": MAX_GRAPH_NODES} + main_query, + {"max_nodes": MAX_GRAPH_NODES, "min_degree": min_degree}, ) else: - validate_query = f""" - MATCH (n) - WHERE any(label IN labels(n) WHERE label CONTAINS '{escaped_label}') - RETURN n LIMIT 1 - """ - validate_result = await session.run(validate_query) - if not await validate_result.single(): - logger.warning( - f"No nodes containing '{label}' in their labels found!" - ) - return result - # Main query uses partial matching - main_query = f""" + main_query = """ MATCH (start) - WHERE any(label IN labels(start) WHERE label CONTAINS '{escaped_label}') + WHERE any(label IN labels(start) WHERE + CASE + WHEN $inclusive THEN label CONTAINS $label + ELSE label = $label + END + ) WITH start - CALL apoc.path.subgraphAll(start, {{ - relationshipFilter: '>', + CALL apoc.path.subgraphAll(start, { + relationshipFilter: '', minLevel: 0, - maxLevel: {max_depth}, + maxLevel: $max_depth, bfs: true - }}) + }) YIELD nodes, relationships WITH start, nodes, relationships UNWIND nodes AS node OPTIONAL MATCH (node)-[r]-() - WITH node, count(r) AS degree, start, nodes, relationships, - CASE - WHEN id(node) = id(start) THEN 2 - WHEN EXISTS((start)-->(node)) OR EXISTS((node)-->(start)) THEN 1 - ELSE 0 - END AS priority - ORDER BY priority DESC, degree DESC + WITH node, count(r) AS degree, start, nodes, relationships + WHERE node = start OR EXISTS((start)--(node)) OR degree >= $min_degree + ORDER BY + CASE + WHEN node = start THEN 3 + WHEN EXISTS((start)--(node)) THEN 2 + ELSE 1 + END DESC, + degree DESC LIMIT $max_nodes - WITH collect(node) AS filtered_nodes, nodes, relationships - RETURN filtered_nodes AS nodes, - [rel IN relationships WHERE startNode(rel) IN filtered_nodes AND endNode(rel) IN filtered_nodes] AS relationships + WITH collect({node: node}) AS filtered_nodes + UNWIND filtered_nodes AS node_info + WITH collect(node_info.node) AS kept_nodes, filtered_nodes + MATCH (a)-[r]-(b) + WHERE a IN kept_nodes AND b IN kept_nodes + RETURN filtered_nodes AS node_info, + collect(DISTINCT r) AS relationships """ result_set = await session.run( - main_query, {"max_nodes": MAX_GRAPH_NODES} + main_query, + { + "max_nodes": MAX_GRAPH_NODES, + "label": label, + "inclusive": inclusive, + "max_depth": max_depth, + "min_degree": min_degree, + }, ) - record = await result_set.single() + try: + record = await result_set.single() - if record: - # Handle nodes (compatible 
with multi-label cases) - for node in record["nodes"]: - # Use node ID + label combination as unique identifier - node_id = node.id - if node_id not in seen_nodes: - result.nodes.append( - KnowledgeGraphNode( - id=f"{node_id}", - labels=list(node.labels), - properties=dict(node), + if record: + # Handle nodes (compatible with multi-label cases) + for node_info in record["node_info"]: + node = node_info["node"] + node_id = node.id + if node_id not in seen_nodes: + result.nodes.append( + KnowledgeGraphNode( + id=f"{node_id}", + labels=list(node.labels), + properties=dict(node), + ) ) - ) - seen_nodes.add(node_id) + seen_nodes.add(node_id) - # Handle relationships (including direction information) - for rel in record["relationships"]: - edge_id = rel.id - if edge_id not in seen_edges: - start = rel.start_node - end = rel.end_node - result.edges.append( - KnowledgeGraphEdge( - id=f"{edge_id}", - type=rel.type, - source=f"{start.id}", - target=f"{end.id}", - properties=dict(rel), + # Handle relationships (including direction information) + for rel in record["relationships"]: + edge_id = rel.id + if edge_id not in seen_edges: + start = rel.start_node + end = rel.end_node + result.edges.append( + KnowledgeGraphEdge( + id=f"{edge_id}", + type=rel.type, + source=f"{start.id}", + target=f"{end.id}", + properties=dict(rel), + ) ) - ) - seen_edges.add(edge_id) + seen_edges.add(edge_id) - logger.info( - f"Subgraph query successful | Node count: {len(result.nodes)} | Edge count: {len(result.edges)}" - ) + logger.info( + f"Subgraph query successful | Node count: {len(result.nodes)} | Edge count: {len(result.edges)}" + ) + finally: + await result_set.consume() # Ensure result set is consumed except neo4jExceptions.ClientError as e: - logger.error(f"APOC query failed: {str(e)}") - return await self._robust_fallback(label, max_depth) + logger.warning( + f"APOC plugin error: {str(e)}, falling back to basic Cypher implementation" + ) + if inclusive: + logger.warning( + "Inclusive search mode is not supported in recursive query, using exact matching" + ) + return await self._robust_fallback(label, max_depth, min_degree) return result async def _robust_fallback( - self, label: str, max_depth: int + self, label: str, max_depth: int, min_degree: int = 0 ) -> Dict[str, List[Dict]]: - """Enhanced fallback query solution""" + """ + Fallback implementation when APOC plugin is not available or incompatible. + This method implements the same functionality as get_knowledge_graph but uses + only basic Cypher queries and recursive traversal instead of APOC procedures. 
+ """ result = {"nodes": [], "edges": []} visited_nodes = set() visited_edges = set() async def traverse(current_label: str, current_depth: int): + # Check traversal limits if current_depth > max_depth: + logger.debug(f"Reached max depth: {max_depth}") + return + if len(visited_nodes) >= MAX_GRAPH_NODES: + logger.debug(f"Reached max nodes limit: {MAX_GRAPH_NODES}") return # Get current node details @@ -614,46 +727,46 @@ class Neo4JStorage(BaseGraphStorage): return visited_nodes.add(node_id) - # Add node data (with complete labels) - node_data = {k: v for k, v in node.items()} - node_data["labels"] = [ - current_label - ] # Assume get_node method returns label information - result["nodes"].append(node_data) + # Add node data with label as ID + result["nodes"].append({ + "id": current_label, + "labels": current_label, + "properties": node + }) - # Get all outgoing and incoming edges + # Get connected nodes that meet the degree requirement + # Note: We don't need to check a's degree since it's the current node + # and was already validated in the previous iteration query = f""" - MATCH (a)-[r]-(b) - WHERE a:`{current_label}` OR b:`{current_label}` - RETURN a, r, b, - CASE WHEN startNode(r) = a THEN 'OUTGOING' ELSE 'INCOMING' END AS direction + MATCH (a:`{current_label}`)-[r]-(b) + WITH r, b, + COUNT((b)--()) AS b_degree + WHERE b_degree >= $min_degree OR EXISTS((a)--(b)) + RETURN r, b """ - async with self._driver.session(database=self._DATABASE) as session: - results = await session.run(query) + async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: + results = await session.run(query, {"min_degree": min_degree}) async for record in results: # Handle edges rel = record["r"] edge_id = f"{rel.id}_{rel.type}" if edge_id not in visited_edges: - edge_data = dict(rel) - edge_data.update( - { - "source": list(record["a"].labels)[0], - "target": list(record["b"].labels)[0], + b_node = record["b"] + if b_node.labels: # Only process if target node has labels + target_label = list(b_node.labels)[0] + result["edges"].append({ + "id": f"{current_label}_{target_label}", "type": rel.type, - "direction": record["direction"], - } - ) - result["edges"].append(edge_data) - visited_edges.add(edge_id) + "source": current_label, + "target": target_label, + "properties": dict(rel) + }) + visited_edges.add(edge_id) - # Recursively traverse adjacent nodes - next_label = ( - list(record["b"].labels)[0] - if record["direction"] == "OUTGOING" - else list(record["a"].labels)[0] - ) - await traverse(next_label, current_depth + 1) + # Continue traversal + await traverse(target_label, current_depth + 1) + else: + logger.warning(f"Skipping edge {edge_id} due to missing labels on target node") await traverse(label, 0) return result @@ -664,7 +777,7 @@ class Neo4JStorage(BaseGraphStorage): Returns: ["Person", "Company", ...] 
# Alphabetically sorted label list """ - async with self._driver.session(database=self._DATABASE) as session: + async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: # Method 1: Direct metadata query (Available for Neo4j 4.3+) # query = "CALL db.labels() YIELD label RETURN label" @@ -679,8 +792,11 @@ class Neo4JStorage(BaseGraphStorage): result = await session.run(query) labels = [] - async for record in result: - labels.append(record["label"]) + try: + async for record in result: + labels.append(record["label"]) + finally: + await result.consume() # Ensure results are consumed even if processing fails return labels @retry( @@ -763,7 +879,7 @@ class Neo4JStorage(BaseGraphStorage): async def _do_delete_edge(tx: AsyncManagedTransaction): query = f""" - MATCH (source:`{source_label}`)-[r]->(target:`{target_label}`) + MATCH (source:`{source_label}`)-[r]-(target:`{target_label}`) DELETE r """ await tx.run(query) From c07b592e1bfe73cde40c46f46e06f1dc9c3ae292 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 8 Mar 2025 02:39:51 +0800 Subject: [PATCH 10/54] Add missing await consume --- lightrag/kg/neo4j_impl.py | 250 ++++++++++++++++++++------------------ 1 file changed, 130 insertions(+), 120 deletions(-) diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index f6567249..ea316d0f 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -64,19 +64,19 @@ class Neo4JStorage(BaseGraphStorage): MAX_CONNECTION_POOL_SIZE = int( os.environ.get( "NEO4J_MAX_CONNECTION_POOL_SIZE", - config.get("neo4j", "connection_pool_size", fallback=50), # Reduced from 800 + config.get("neo4j", "connection_pool_size", fallback=50), ) ) CONNECTION_TIMEOUT = float( os.environ.get( "NEO4J_CONNECTION_TIMEOUT", - config.get("neo4j", "connection_timeout", fallback=30.0), # Reduced from 60.0 + config.get("neo4j", "connection_timeout", fallback=30.0), ), ) CONNECTION_ACQUISITION_TIMEOUT = float( os.environ.get( "NEO4J_CONNECTION_ACQUISITION_TIMEOUT", - config.get("neo4j", "connection_acquisition_timeout", fallback=30.0), # Reduced from 60.0 + config.get("neo4j", "connection_acquisition_timeout", fallback=30.0), ), ) MAX_TRANSACTION_RETRY_TIME = float( @@ -188,23 +188,24 @@ class Neo4JStorage(BaseGraphStorage): async def has_node(self, node_id: str) -> bool: entity_name_label = await self._ensure_label(node_id) - async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: + async with self._driver.session( + database=self._DATABASE, default_access_mode="READ" + ) as session: query = ( f"MATCH (n:`{entity_name_label}`) RETURN count(n) > 0 AS node_exists" ) result = await session.run(query) single_result = await result.single() await result.consume() # Ensure result is fully consumed - logger.debug( - f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{single_result['node_exists']}" - ) return single_result["node_exists"] async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: entity_name_label_source = source_node_id.strip('"') entity_name_label_target = target_node_id.strip('"') - async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: + async with self._driver.session( + database=self._DATABASE, default_access_mode="READ" + ) as session: query = ( f"MATCH (a:`{entity_name_label_source}`)-[r]-(b:`{entity_name_label_target}`) " "RETURN COUNT(r) > 0 AS edgeExists" @@ -212,9 +213,6 @@ class Neo4JStorage(BaseGraphStorage): result = await 
session.run(query) single_result = await result.single() await result.consume() # Ensure result is fully consumed - logger.debug( - f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{single_result['edgeExists']}" - ) return single_result["edgeExists"] async def get_node(self, node_id: str) -> dict[str, str] | None: @@ -227,14 +225,20 @@ class Neo4JStorage(BaseGraphStorage): dict: Node properties if found None: If node not found """ - async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: + async with self._driver.session( + database=self._DATABASE, default_access_mode="READ" + ) as session: entity_name_label = await self._ensure_label(node_id) query = f"MATCH (n:`{entity_name_label}`) RETURN n" result = await session.run(query) - records = await result.fetch(2) # Get up to 2 records to check for duplicates + records = await result.fetch( + 2 + ) # Get up to 2 records to check for duplicates await result.consume() # Ensure result is fully consumed if len(records) > 1: - logger.warning(f"Multiple nodes found with label '{entity_name_label}'. Using first node.") + logger.warning( + f"Multiple nodes found with label '{entity_name_label}'. Using first node." + ) if records: node = records[0]["n"] node_dict = dict(node) @@ -248,16 +252,18 @@ class Neo4JStorage(BaseGraphStorage): """Get the degree (number of relationships) of a node with the given label. If multiple nodes have the same label, returns the degree of the first node. If no node is found, returns 0. - + Args: node_id: The label of the node - + Returns: int: The number of relationships the node has, or 0 if no node found """ entity_name_label = node_id.strip('"') - async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: + async with self._driver.session( + database=self._DATABASE, default_access_mode="READ" + ) as session: query = f""" MATCH (n:`{entity_name_label}`) OPTIONAL MATCH (n)-[r]-() @@ -266,14 +272,16 @@ class Neo4JStorage(BaseGraphStorage): result = await session.run(query) records = await result.fetch(100) await result.consume() # Ensure result is fully consumed - + if not records: logger.warning(f"No node found with label '{entity_name_label}'") return 0 - + if len(records) > 1: - logger.warning(f"Multiple nodes ({len(records)}) found with label '{entity_name_label}', using first node's degree") - + logger.warning( + f"Multiple nodes ({len(records)}) found with label '{entity_name_label}', using first node's degree" + ) + degree = records[0]["degree"] logger.debug( f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{degree}" @@ -296,30 +304,6 @@ class Neo4JStorage(BaseGraphStorage): ) return degrees - async def check_duplicate_nodes(self) -> list[tuple[str, int]]: - """Find all labels that have multiple nodes - - Returns: - list[tuple[str, int]]: List of tuples containing (label, node_count) for labels with multiple nodes - """ - async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: - query = """ - MATCH (n) - WITH labels(n) as nodeLabels - UNWIND nodeLabels as label - WITH label, count(*) as node_count - WHERE node_count > 1 - RETURN label, node_count - ORDER BY node_count DESC - """ - result = await session.run(query) - duplicates = [] - async for record in result: - label = record["label"] - count = record["node_count"] - logger.info(f"Found {count} nodes with label: {label}") - duplicates.append((label, count)) - return duplicates async def get_edge( self, 
source_node_id: str, target_node_id: str @@ -328,64 +312,69 @@ class Neo4JStorage(BaseGraphStorage): entity_name_label_source = source_node_id.strip('"') entity_name_label_target = target_node_id.strip('"') - async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: + async with self._driver.session( + database=self._DATABASE, default_access_mode="READ" + ) as session: query = f""" MATCH (start:`{entity_name_label_source}`)-[r]-(end:`{entity_name_label_target}`) RETURN properties(r) as edge_properties """ result = await session.run(query) - records = await result.fetch(2) # Get up to 2 records to check for duplicates - if len(records) > 1: - logger.warning( - f"Multiple edges found between '{entity_name_label_source}' and '{entity_name_label_target}'. Using first edge." + try: + records = await result.fetch(2) # Get up to 2 records to check for duplicates + if len(records) > 1: + logger.warning( + f"Multiple edges found between '{entity_name_label_source}' and '{entity_name_label_target}'. Using first edge." + ) + if records: + try: + result = dict(records[0]["edge_properties"]) + logger.debug(f"Result: {result}") + # Ensure required keys exist with defaults + required_keys = { + "weight": 0.0, + "source_id": None, + "description": None, + "keywords": None, + } + for key, default_value in required_keys.items(): + if key not in result: + result[key] = default_value + logger.warning( + f"Edge between {entity_name_label_source} and {entity_name_label_target} " + f"missing {key}, using default: {default_value}" + ) + + logger.debug( + f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{result}" + ) + return result + except (KeyError, TypeError, ValueError) as e: + logger.error( + f"Error processing edge properties between {entity_name_label_source} " + f"and {entity_name_label_target}: {str(e)}" + ) + # Return default edge properties on error + return { + "weight": 0.0, + "description": None, + "keywords": None, + "source_id": None, + } + + logger.debug( + f"{inspect.currentframe().f_code.co_name}: No edge found between {entity_name_label_source} and {entity_name_label_target}" ) - if records: - try: - result = dict(records[0]["edge_properties"]) - logger.debug(f"Result: {result}") - # Ensure required keys exist with defaults - required_keys = { - "weight": 0.0, - "source_id": None, - "description": None, - "keywords": None, - } - for key, default_value in required_keys.items(): - if key not in result: - result[key] = default_value - logger.warning( - f"Edge between {entity_name_label_source} and {entity_name_label_target} " - f"missing {key}, using default: {default_value}" - ) - - logger.debug( - f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{result}" - ) - return result - except (KeyError, TypeError, ValueError) as e: - logger.error( - f"Error processing edge properties between {entity_name_label_source} " - f"and {entity_name_label_target}: {str(e)}" - ) - # Return default edge properties on error - return { - "weight": 0.0, - "description": None, - "keywords": None, - "source_id": None, - } - - logger.debug( - f"{inspect.currentframe().f_code.co_name}: No edge found between {entity_name_label_source} and {entity_name_label_target}" - ) - # Return default edge properties when no edge found - return { - "weight": 0.0, - "description": None, - "keywords": None, - "source_id": None, - } + # Return default edge properties when no edge found + return { + "weight": 0.0, + "description": None, + "keywords": None, + "source_id": 
None, + } + finally: + await result.consume() # Ensure result is fully consumed except Exception as e: logger.error( @@ -409,7 +398,9 @@ class Neo4JStorage(BaseGraphStorage): query = f"""MATCH (n:`{node_label}`) OPTIONAL MATCH (n)-[r]-(connected) RETURN n, r, connected""" - async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: + async with self._driver.session( + database=self._DATABASE, default_access_mode="READ" + ) as session: results = await session.run(query) edges = [] try: @@ -429,7 +420,9 @@ class Neo4JStorage(BaseGraphStorage): if source_label and target_label: edges.append((source_label, target_label)) finally: - await results.consume() # Ensure results are consumed even if processing fails + await ( + results.consume() + ) # Ensure results are consumed even if processing fails return edges @@ -461,10 +454,11 @@ class Neo4JStorage(BaseGraphStorage): MERGE (n:`{label}`) SET n += $properties """ - await tx.run(query, properties=properties) + result = await tx.run(query, properties=properties) logger.debug( f"Upserted node with label '{label}' and properties: {properties}" ) + await result.consume() # Ensure result is fully consumed try: async with self._driver.session(database=self._DATABASE) as session: @@ -509,9 +503,13 @@ class Neo4JStorage(BaseGraphStorage): target_exists = await self.has_node(target_label) if not source_exists: - raise ValueError(f"Neo4j: source node with label '{source_label}' does not exist") + raise ValueError( + f"Neo4j: source node with label '{source_label}' does not exist" + ) if not target_exists: - raise ValueError(f"Neo4j: target node with label '{target_label}' does not exist") + raise ValueError( + f"Neo4j: target node with label '{target_label}' does not exist" + ) async def _do_upsert_edge(tx: AsyncManagedTransaction): query = f""" @@ -570,7 +568,9 @@ class Neo4JStorage(BaseGraphStorage): seen_nodes = set() seen_edges = set() - async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: + async with self._driver.session( + database=self._DATABASE, default_access_mode="READ" + ) as session: try: if label == "*": main_query = """ @@ -728,11 +728,9 @@ class Neo4JStorage(BaseGraphStorage): visited_nodes.add(node_id) # Add node data with label as ID - result["nodes"].append({ - "id": current_label, - "labels": current_label, - "properties": node - }) + result["nodes"].append( + {"id": current_label, "labels": current_label, "properties": node} + ) # Get connected nodes that meet the degree requirement # Note: We don't need to check a's degree since it's the current node @@ -744,7 +742,9 @@ class Neo4JStorage(BaseGraphStorage): WHERE b_degree >= $min_degree OR EXISTS((a)--(b)) RETURN r, b """ - async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session: + async with self._driver.session( + database=self._DATABASE, default_access_mode="READ" + ) as session: results = await session.run(query, {"min_degree": min_degree}) async for record in results: # Handle edges @@ -754,19 +754,23 @@ class Neo4JStorage(BaseGraphStorage): b_node = record["b"] if b_node.labels: # Only process if target node has labels target_label = list(b_node.labels)[0] - result["edges"].append({ - "id": f"{current_label}_{target_label}", - "type": rel.type, - "source": current_label, - "target": target_label, - "properties": dict(rel) - }) + result["edges"].append( + { + "id": f"{current_label}_{target_label}", + "type": rel.type, + "source": current_label, + "target": 
target_label,
+                                "properties": dict(rel),
+                            }
+                        )
                         visited_edges.add(edge_id)
 
                         # Continue traversal
                         await traverse(target_label, current_depth + 1)
                     else:
-                        logger.warning(f"Skipping edge {edge_id} due to missing labels on target node")
+                        logger.warning(
+                            f"Skipping edge {edge_id} due to missing labels on target node"
+                        )
 
         await traverse(label, 0)
         return result
@@ -777,7 +781,9 @@
         Returns:
             ["Person", "Company", ...]  # Alphabetically sorted label list
         """
-        async with self._driver.session(database=self._DATABASE, default_access_mode="READ") as session:
+        async with self._driver.session(
+            database=self._DATABASE, default_access_mode="READ"
+        ) as session:
             # Method 1: Direct metadata query (Available for Neo4j 4.3+)
             # query = "CALL db.labels() YIELD label RETURN label"
@@ -796,7 +802,9 @@
             result = await session.run(query)
             labels = []
             try:
                 async for record in result:
                     labels.append(record["label"])
             finally:
-                await result.consume()  # Ensure results are consumed even if processing fails
+                await (
+                    result.consume()
+                )  # Ensure results are consumed even if processing fails
         return labels
 
     @retry(
@@ -824,8 +832,9 @@
             MATCH (n:`{label}`)
             DETACH DELETE n
             """
-            await tx.run(query)
+            result = await tx.run(query)
             logger.debug(f"Deleted node with label '{label}'")
+            await result.consume()  # Ensure result is fully consumed
 
         try:
             async with self._driver.session(database=self._DATABASE) as session:
@@ -882,8 +891,9 @@
             MATCH (source:`{source_label}`)-[r]-(target:`{target_label}`)
             DELETE r
             """
-            await tx.run(query)
+            result = await tx.run(query)
             logger.debug(f"Deleted edge from '{source_label}' to '{target_label}'")
+            await result.consume()  # Ensure result is fully consumed
 
         try:
             async with self._driver.session(database=self._DATABASE) as session:

From fcb04e47e5f1beda21c9304ba3c07d90e2e07fc1 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Sat, 8 Mar 2025 04:28:54 +0800
Subject: [PATCH 11/54] Refactor Neo4j APOC fallback retrieval implementation

---
 lightrag/kg/neo4j_impl.py | 255 ++++++++++++++++++++------------------
 1 file changed, 149 insertions(+), 106 deletions(-)

diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py
index ea316d0f..60e8982e 100644
--- a/lightrag/kg/neo4j_impl.py
+++ b/lightrag/kg/neo4j_impl.py
@@ -3,7 +3,7 @@ import inspect
 import os
 import re
 from dataclasses import dataclass
-from typing import Any, List, Dict, final
+from typing import Any, final, Optional
 import numpy as np
 import configparser
@@ -304,7 +304,6 @@ class Neo4JStorage(BaseGraphStorage):
             )
             return degrees
 
-
     async def get_edge(
         self, source_node_id: str, target_node_id: str
     ) -> dict[str, str] | None:
             """
             result = await session.run(query)
 
-            try:
-                records = await result.fetch(2)  # Get up to 2 records to check for duplicates
-                if len(records) > 1:
-                    logger.warning(
-                        f"Multiple edges found between '{entity_name_label_source}' and '{entity_name_label_target}'. Using first edge."
- ) - if records: - try: - result = dict(records[0]["edge_properties"]) - logger.debug(f"Result: {result}") - # Ensure required keys exist with defaults - required_keys = { - "weight": 0.0, - "source_id": None, - "description": None, - "keywords": None, - } - for key, default_value in required_keys.items(): - if key not in result: - result[key] = default_value - logger.warning( - f"Edge between {entity_name_label_source} and {entity_name_label_target} " - f"missing {key}, using default: {default_value}" - ) - - logger.debug( - f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{result}" - ) - return result - except (KeyError, TypeError, ValueError) as e: - logger.error( - f"Error processing edge properties between {entity_name_label_source} " - f"and {entity_name_label_target}: {str(e)}" - ) - # Return default edge properties on error - return { - "weight": 0.0, - "description": None, - "keywords": None, - "source_id": None, - } - - logger.debug( - f"{inspect.currentframe().f_code.co_name}: No edge found between {entity_name_label_source} and {entity_name_label_target}" + records = await result.fetch(2) # Get up to 2 records to check for duplicates + await result.consume() # Ensure result is fully consumed before processing records + + if len(records) > 1: + logger.warning( + f"Multiple edges found between '{entity_name_label_source}' and '{entity_name_label_target}'. Using first edge." ) - # Return default edge properties when no edge found - return { - "weight": 0.0, - "description": None, - "keywords": None, - "source_id": None, - } - finally: - await result.consume() # Ensure result is fully consumed + if records: + try: + edge_result = dict(records[0]["edge_properties"]) + logger.debug(f"Result: {edge_result}") + # Ensure required keys exist with defaults + required_keys = { + "weight": 0.0, + "source_id": None, + "description": None, + "keywords": None, + } + for key, default_value in required_keys.items(): + if key not in edge_result: + edge_result[key] = default_value + logger.warning( + f"Edge between {entity_name_label_source} and {entity_name_label_target} " + f"missing {key}, using default: {default_value}" + ) + + logger.debug( + f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{edge_result}" + ) + return edge_result + except (KeyError, TypeError, ValueError) as e: + logger.error( + f"Error processing edge properties between {entity_name_label_source} " + f"and {entity_name_label_target}: {str(e)}" + ) + # Return default edge properties on error + return { + "weight": 0.0, + "description": None, + "keywords": None, + "source_id": None, + } + + logger.debug( + f"{inspect.currentframe().f_code.co_name}: No edge found between {entity_name_label_source} and {entity_name_label_target}" + ) + # Return default edge properties when no edge found + return { + "weight": 0.0, + "description": None, + "keywords": None, + "source_id": None, + } except Exception as e: logger.error( @@ -685,30 +683,36 @@ class Neo4JStorage(BaseGraphStorage): await result_set.consume() # Ensure result set is consumed except neo4jExceptions.ClientError as e: - logger.warning( - f"APOC plugin error: {str(e)}, falling back to basic Cypher implementation" - ) - if inclusive: + logger.warning(f"APOC plugin error: {str(e)}") + if label != "*": logger.warning( - "Inclusive search mode is not supported in recursive query, using exact matching" + "Neo4j: falling back to basic Cypher recursive search..." 
) - return await self._robust_fallback(label, max_depth, min_degree) + if inclusive: + logger.warning( + "Neo4j: inclusive search mode is not supported in recursive query, using exact matching" + ) + return await self._robust_fallback(label, max_depth, min_degree) return result async def _robust_fallback( self, label: str, max_depth: int, min_degree: int = 0 - ) -> Dict[str, List[Dict]]: + ) -> KnowledgeGraph: """ Fallback implementation when APOC plugin is not available or incompatible. This method implements the same functionality as get_knowledge_graph but uses only basic Cypher queries and recursive traversal instead of APOC procedures. """ - result = {"nodes": [], "edges": []} + result = KnowledgeGraph() visited_nodes = set() visited_edges = set() - async def traverse(current_label: str, current_depth: int): + async def traverse( + node: KnowledgeGraphNode, + edge: Optional[KnowledgeGraphEdge], + current_depth: int, + ): # Check traversal limits if current_depth > max_depth: logger.debug(f"Reached max depth: {max_depth}") @@ -717,62 +721,101 @@ class Neo4JStorage(BaseGraphStorage): logger.debug(f"Reached max nodes limit: {MAX_GRAPH_NODES}") return - # Get current node details - node = await self.get_node(current_label) - if not node: + # Check if node already visited + if node.id in visited_nodes: return - node_id = f"{current_label}" - if node_id in visited_nodes: - return - visited_nodes.add(node_id) - - # Add node data with label as ID - result["nodes"].append( - {"id": current_label, "labels": current_label, "properties": node} - ) - - # Get connected nodes that meet the degree requirement - # Note: We don't need to check a's degree since it's the current node - # and was already validated in the previous iteration - query = f""" - MATCH (a:`{current_label}`)-[r]-(b) - WITH r, b, - COUNT((b)--()) AS b_degree - WHERE b_degree >= $min_degree OR EXISTS((a)--(b)) - RETURN r, b - """ + # Get all edges and target nodes async with self._driver.session( database=self._DATABASE, default_access_mode="READ" ) as session: - results = await session.run(query, {"min_degree": min_degree}) - async for record in results: - # Handle edges + query = """ + MATCH (a)-[r]-(b) + WHERE id(a) = toInteger($node_id) + WITH r, b, id(r) as edge_id, id(b) as target_id + RETURN r, b, edge_id, target_id + """ + results = await session.run(query, {"node_id": node.id}) + + # Get all records and release database connection + records = await results.fetch() + await results.consume() # Ensure results are consumed + + # Nodes not connected to start node need to check degree + if current_depth > 1 and len(records) < min_degree: + return + + # Add current node to result + result.nodes.append(node) + visited_nodes.add(node.id) + + # Add edge to result if it exists and not already added + if edge and edge.id not in visited_edges: + result.edges.append(edge) + visited_edges.add(edge.id) + + # Prepare nodes and edges for recursive processing + nodes_to_process = [] + for record in records: rel = record["r"] - edge_id = f"{rel.id}_{rel.type}" + edge_id = str(record["edge_id"]) if edge_id not in visited_edges: b_node = record["b"] - if b_node.labels: # Only process if target node has labels - target_label = list(b_node.labels)[0] - result["edges"].append( - { - "id": f"{current_label}_{target_label}", - "type": rel.type, - "source": current_label, - "target": target_label, - "properties": dict(rel), - } - ) - visited_edges.add(edge_id) + target_id = str(record["target_id"]) - # Continue traversal - await 
traverse(target_label, current_depth + 1) + if b_node.labels: # Only process if target node has labels + # Create KnowledgeGraphNode for target + target_node = KnowledgeGraphNode( + id=target_id, + labels=list(b_node.labels), + properties=dict(b_node), + ) + + # Create KnowledgeGraphEdge + target_edge = KnowledgeGraphEdge( + id=edge_id, + type=rel.type, + source=node.id, + target=target_id, + properties=dict(rel), + ) + + nodes_to_process.append((target_node, target_edge)) else: logger.warning( f"Skipping edge {edge_id} due to missing labels on target node" ) - await traverse(label, 0) + # Process nodes after releasing database connection + for target_node, target_edge in nodes_to_process: + await traverse(target_node, target_edge, current_depth + 1) + + # Get the starting node's data + async with self._driver.session( + database=self._DATABASE, default_access_mode="READ" + ) as session: + query = f""" + MATCH (n:`{label}`) + RETURN id(n) as node_id, n + """ + node_result = await session.run(query) + try: + node_record = await node_result.single() + if not node_record: + return result + + # Create initial KnowledgeGraphNode + start_node = KnowledgeGraphNode( + id=str(node_record["node_id"]), + labels=list(node_record["n"].labels), + properties=dict(node_record["n"]), + ) + finally: + await node_result.consume() # Ensure results are consumed + + # Start traversal with the initial node + await traverse(start_node, None, 0) + return result async def get_all_labels(self) -> list[str]: From 84222b8b76bb077b144463af8acfde8df188d505 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 8 Mar 2025 10:19:20 +0800 Subject: [PATCH 12/54] Refactor Neo4JStorage methods for robustness and clarity. - Add error handling and resource cleanup - Improve method documentation - Optimize result consumption --- lightrag/kg/neo4j_impl.py | 412 +++++++++++++++++++++++--------------- 1 file changed, 255 insertions(+), 157 deletions(-) diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 60e8982e..082b4bf2 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -163,13 +163,14 @@ class Neo4JStorage(BaseGraphStorage): } async def close(self): + """Close the Neo4j driver and release all resources""" if self._driver: await self._driver.close() self._driver = None async def __aexit__(self, exc_type, exc, tb): - if self._driver: - await self._driver.close() + """Ensure driver is closed when context manager exits""" + await self.close() async def index_done_callback(self) -> None: # Noe4J handles persistence automatically @@ -187,33 +188,72 @@ class Neo4JStorage(BaseGraphStorage): return clean_label async def has_node(self, node_id: str) -> bool: + """ + Check if a node with the given label exists in the database + + Args: + node_id: Label of the node to check + + Returns: + bool: True if node exists, False otherwise + + Raises: + ValueError: If node_id is invalid + Exception: If there is an error executing the query + """ entity_name_label = await self._ensure_label(node_id) async with self._driver.session( database=self._DATABASE, default_access_mode="READ" ) as session: - query = ( - f"MATCH (n:`{entity_name_label}`) RETURN count(n) > 0 AS node_exists" - ) - result = await session.run(query) - single_result = await result.single() - await result.consume() # Ensure result is fully consumed - return single_result["node_exists"] + try: + query = f"MATCH (n:`{entity_name_label}`) RETURN count(n) > 0 AS node_exists" + result = await session.run(query) + single_result = await result.single() 
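# Every method touched by this refactor lands on the same discipline shown at
# this point in the hunk: read what you need from the AsyncResult, then always
# consume it so the connection returns to the pool even when processing fails.
# The bare shape of that idiom (the helper name is illustrative):
async def run_and_consume(session, query: str):
    result = await session.run(query)
    try:
        return await result.single()
    finally:
        await result.consume()  # releases the result even on error paths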
+ await result.consume() # Ensure result is fully consumed + return single_result["node_exists"] + except Exception as e: + logger.error( + f"Error checking node existence for {entity_name_label}: {str(e)}" + ) + await result.consume() # Ensure results are consumed even on error + raise async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: - entity_name_label_source = source_node_id.strip('"') - entity_name_label_target = target_node_id.strip('"') + """ + Check if an edge exists between two nodes + + Args: + source_node_id: Label of the source node + target_node_id: Label of the target node + + Returns: + bool: True if edge exists, False otherwise + + Raises: + ValueError: If either node_id is invalid + Exception: If there is an error executing the query + """ + entity_name_label_source = await self._ensure_label(source_node_id) + entity_name_label_target = await self._ensure_label(target_node_id) async with self._driver.session( database=self._DATABASE, default_access_mode="READ" ) as session: - query = ( - f"MATCH (a:`{entity_name_label_source}`)-[r]-(b:`{entity_name_label_target}`) " - "RETURN COUNT(r) > 0 AS edgeExists" - ) - result = await session.run(query) - single_result = await result.single() - await result.consume() # Ensure result is fully consumed - return single_result["edgeExists"] + try: + query = ( + f"MATCH (a:`{entity_name_label_source}`)-[r]-(b:`{entity_name_label_target}`) " + "RETURN COUNT(r) > 0 AS edgeExists" + ) + result = await session.run(query) + single_result = await result.single() + await result.consume() # Ensure result is fully consumed + return single_result["edgeExists"] + except Exception as e: + logger.error( + f"Error checking edge existence between {entity_name_label_source} and {entity_name_label_target}: {str(e)}" + ) + await result.consume() # Ensure results are consumed even on error + raise async def get_node(self, node_id: str) -> dict[str, str] | None: """Get node by its label identifier. @@ -224,29 +264,40 @@ class Neo4JStorage(BaseGraphStorage): Returns: dict: Node properties if found None: If node not found + + Raises: + ValueError: If node_id is invalid + Exception: If there is an error executing the query """ + entity_name_label = await self._ensure_label(node_id) async with self._driver.session( database=self._DATABASE, default_access_mode="READ" ) as session: - entity_name_label = await self._ensure_label(node_id) - query = f"MATCH (n:`{entity_name_label}`) RETURN n" - result = await session.run(query) - records = await result.fetch( - 2 - ) # Get up to 2 records to check for duplicates - await result.consume() # Ensure result is fully consumed - if len(records) > 1: - logger.warning( - f"Multiple nodes found with label '{entity_name_label}'. Using first node." - ) - if records: - node = records[0]["n"] - node_dict = dict(node) - logger.debug( - f"{inspect.currentframe().f_code.co_name}: query: {query}, result: {node_dict}" - ) - return node_dict - return None + try: + query = f"MATCH (n:`{entity_name_label}`) RETURN n" + result = await session.run(query) + try: + records = await result.fetch( + 2 + ) # Get up to 2 records to check for duplicates + + if len(records) > 1: + logger.warning( + f"Multiple nodes found with label '{entity_name_label}'. Using first node." 
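# fetch(2) recurs through these hunks as a cheap uniqueness probe: when a
# label is supposed to match a single node, a second record is enough to
# prove duplication without streaming the whole result. A sketch under that
# assumption (illustrative helper, not the repo's API):
async def label_is_unique(session, label: str) -> bool:
    result = await session.run(f"MATCH (n:`{label}`) RETURN n")
    records = await result.fetch(2)  # at most two records are ever needed
    await result.consume()
    return len(records) <= 1  # False means the label matches multiple nodes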
+ ) + if records: + node = records[0]["n"] + node_dict = dict(node) + logger.debug( + f"{inspect.currentframe().f_code.co_name}: query: {query}, result: {node_dict}" + ) + return node_dict + return None + finally: + await result.consume() # Ensure result is fully consumed + except Exception as e: + logger.error(f"Error getting node for {entity_name_label}: {str(e)}") + raise async def node_degree(self, node_id: str) -> int: """Get the degree (number of relationships) of a node with the given label. @@ -258,39 +309,63 @@ class Neo4JStorage(BaseGraphStorage): Returns: int: The number of relationships the node has, or 0 if no node found + + Raises: + ValueError: If node_id is invalid + Exception: If there is an error executing the query """ - entity_name_label = node_id.strip('"') + entity_name_label = await self._ensure_label(node_id) async with self._driver.session( database=self._DATABASE, default_access_mode="READ" ) as session: - query = f""" - MATCH (n:`{entity_name_label}`) - OPTIONAL MATCH (n)-[r]-() - RETURN n, COUNT(r) AS degree - """ - result = await session.run(query) - records = await result.fetch(100) - await result.consume() # Ensure result is fully consumed + try: + query = f""" + MATCH (n:`{entity_name_label}`) + OPTIONAL MATCH (n)-[r]-() + RETURN n, COUNT(r) AS degree + """ + result = await session.run(query) + try: + records = await result.fetch(100) - if not records: - logger.warning(f"No node found with label '{entity_name_label}'") - return 0 + if not records: + logger.warning( + f"No node found with label '{entity_name_label}'" + ) + return 0 - if len(records) > 1: - logger.warning( - f"Multiple nodes ({len(records)}) found with label '{entity_name_label}', using first node's degree" + if len(records) > 1: + logger.warning( + f"Multiple nodes ({len(records)}) found with label '{entity_name_label}', using first node's degree" + ) + + degree = records[0]["degree"] + logger.debug( + f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{degree}" + ) + return degree + finally: + await result.consume() # Ensure result is fully consumed + except Exception as e: + logger.error( + f"Error getting node degree for {entity_name_label}: {str(e)}" ) - - degree = records[0]["degree"] - logger.debug( - f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{degree}" - ) - return degree + raise async def edge_degree(self, src_id: str, tgt_id: str) -> int: - entity_name_label_source = src_id.strip('"') - entity_name_label_target = tgt_id.strip('"') + """Get the total degree (sum of relationships) of two nodes. + + Args: + src_id: Label of the source node + tgt_id: Label of the target node + + Returns: + int: Sum of the degrees of both nodes + """ + entity_name_label_source = await self._ensure_label(src_id) + entity_name_label_target = await self._ensure_label(tgt_id) + src_degree = await self.node_degree(entity_name_label_source) trg_degree = await self.node_degree(entity_name_label_target) @@ -299,17 +374,27 @@ class Neo4JStorage(BaseGraphStorage): trg_degree = 0 if trg_degree is None else trg_degree degrees = int(src_degree) + int(trg_degree) - logger.debug( - f"{inspect.currentframe().f_code.co_name}:query:src_Degree+trg_degree:result:{degrees}" - ) return degrees async def get_edge( self, source_node_id: str, target_node_id: str ) -> dict[str, str] | None: + """Get edge properties between two nodes. 
+ + Args: + source_node_id: Label of the source node + target_node_id: Label of the target node + + Returns: + dict: Edge properties if found, default properties if not found or on error + + Raises: + ValueError: If either node_id is invalid + Exception: If there is an error executing the query + """ try: - entity_name_label_source = source_node_id.strip('"') - entity_name_label_target = target_node_id.strip('"') + entity_name_label_source = await self._ensure_label(source_node_id) + entity_name_label_target = await self._ensure_label(target_node_id) async with self._driver.session( database=self._DATABASE, default_access_mode="READ" @@ -320,109 +405,123 @@ class Neo4JStorage(BaseGraphStorage): """ result = await session.run(query) - records = await result.fetch(2) # Get up to 2 records to check for duplicates - await result.consume() # Ensure result is fully consumed before processing records - - if len(records) > 1: - logger.warning( - f"Multiple edges found between '{entity_name_label_source}' and '{entity_name_label_target}'. Using first edge." + try: + records = await result.fetch( + 2 + ) # Get up to 2 records to check for duplicates + + if len(records) > 1: + logger.warning( + f"Multiple edges found between '{entity_name_label_source}' and '{entity_name_label_target}'. Using first edge." + ) + if records: + try: + edge_result = dict(records[0]["edge_properties"]) + logger.debug(f"Result: {edge_result}") + # Ensure required keys exist with defaults + required_keys = { + "weight": 0.0, + "source_id": None, + "description": None, + "keywords": None, + } + for key, default_value in required_keys.items(): + if key not in edge_result: + edge_result[key] = default_value + logger.warning( + f"Edge between {entity_name_label_source} and {entity_name_label_target} " + f"missing {key}, using default: {default_value}" + ) + + logger.debug( + f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{edge_result}" + ) + return edge_result + except (KeyError, TypeError, ValueError) as e: + logger.error( + f"Error processing edge properties between {entity_name_label_source} " + f"and {entity_name_label_target}: {str(e)}" + ) + # Return default edge properties on error + return { + "weight": 0.0, + "source_id": None, + "description": None, + "keywords": None, + } + + logger.debug( + f"{inspect.currentframe().f_code.co_name}: No edge found between {entity_name_label_source} and {entity_name_label_target}" ) - if records: - try: - edge_result = dict(records[0]["edge_properties"]) - logger.debug(f"Result: {edge_result}") - # Ensure required keys exist with defaults - required_keys = { - "weight": 0.0, - "source_id": None, - "description": None, - "keywords": None, - } - for key, default_value in required_keys.items(): - if key not in edge_result: - edge_result[key] = default_value - logger.warning( - f"Edge between {entity_name_label_source} and {entity_name_label_target} " - f"missing {key}, using default: {default_value}" - ) - - logger.debug( - f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{edge_result}" - ) - return edge_result - except (KeyError, TypeError, ValueError) as e: - logger.error( - f"Error processing edge properties between {entity_name_label_source} " - f"and {entity_name_label_target}: {str(e)}" - ) - # Return default edge properties on error - return { - "weight": 0.0, - "description": None, - "keywords": None, - "source_id": None, - } - - logger.debug( - f"{inspect.currentframe().f_code.co_name}: No edge found between {entity_name_label_source} and 
{entity_name_label_target}" - ) - # Return default edge properties when no edge found - return { - "weight": 0.0, - "description": None, - "keywords": None, - "source_id": None, - } + # Return default edge properties when no edge found + return { + "weight": 0.0, + "source_id": None, + "description": None, + "keywords": None, + } + finally: + await result.consume() # Ensure result is fully consumed except Exception as e: logger.error( f"Error in get_edge between {source_node_id} and {target_node_id}: {str(e)}" ) - # Return default edge properties on error - return { - "weight": 0.0, - "description": None, - "keywords": None, - "source_id": None, - } + raise async def get_node_edges(self, source_node_id: str) -> list[tuple[str, str]] | None: - node_label = source_node_id.strip('"') + """Retrieves all edges (relationships) for a particular node identified by its label. + Args: + source_node_id: Label of the node to get edges for + + Returns: + list[tuple[str, str]]: List of (source_label, target_label) tuples representing edges + None: If no edges found + + Raises: + ValueError: If source_node_id is invalid + Exception: If there is an error executing the query """ - Retrieves all edges (relationships) for a particular node identified by its label. - :return: List of dictionaries containing edge information - """ - query = f"""MATCH (n:`{node_label}`) - OPTIONAL MATCH (n)-[r]-(connected) - RETURN n, r, connected""" - async with self._driver.session( - database=self._DATABASE, default_access_mode="READ" - ) as session: - results = await session.run(query) - edges = [] - try: - async for record in results: - source_node = record["n"] - connected_node = record["connected"] + try: + node_label = await self._ensure_label(source_node_id) - source_label = ( - list(source_node.labels)[0] if source_node.labels else None - ) - target_label = ( - list(connected_node.labels)[0] - if connected_node and connected_node.labels - else None - ) + query = f"""MATCH (n:`{node_label}`) + OPTIONAL MATCH (n)-[r]-(connected) + RETURN n, r, connected""" - if source_label and target_label: - edges.append((source_label, target_label)) - finally: - await ( - results.consume() - ) # Ensure results are consumed even if processing fails + async with self._driver.session( + database=self._DATABASE, default_access_mode="READ" + ) as session: + try: + results = await session.run(query) + edges = [] - return edges + async for record in results: + source_node = record["n"] + connected_node = record["connected"] + + source_label = ( + list(source_node.labels)[0] if source_node.labels else None + ) + target_label = ( + list(connected_node.labels)[0] + if connected_node and connected_node.labels + else None + ) + + if source_label and target_label: + edges.append((source_label, target_label)) + + await results.consume() # Ensure results are consumed + return edges if edges else None + except Exception as e: + logger.error(f"Error getting edges for node {node_label}: {str(e)}") + await results.consume() # Ensure results are consumed even on error + raise + except Exception as e: + logger.error(f"Error in get_node_edges for {source_node_id}: {str(e)}") + raise @retry( stop=stop_after_attempt(3), @@ -838,7 +937,6 @@ class Neo4JStorage(BaseGraphStorage): RETURN DISTINCT label ORDER BY label """ - result = await session.run(query) labels = [] try: From 78f8d7a1ce1186ce3398afb946f3da79bad50df7 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 8 Mar 2025 10:20:10 +0800 Subject: [PATCH 13/54] Convert node and edge IDs to f-strings for 
consistency. - Use f-strings for node IDs - Use f-strings for edge IDs - Ensure consistent ID formatting --- lightrag/kg/neo4j_impl.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 082b4bf2..05deb0a9 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -865,17 +865,17 @@ class Neo4JStorage(BaseGraphStorage): if b_node.labels: # Only process if target node has labels # Create KnowledgeGraphNode for target target_node = KnowledgeGraphNode( - id=target_id, + id=f"{target_id}", labels=list(b_node.labels), properties=dict(b_node), ) # Create KnowledgeGraphEdge target_edge = KnowledgeGraphEdge( - id=edge_id, + id=f"{edge_id}", type=rel.type, - source=node.id, - target=target_id, + source=f"{node.id}", + target=f"{target_id}", properties=dict(rel), ) @@ -905,7 +905,7 @@ class Neo4JStorage(BaseGraphStorage): # Create initial KnowledgeGraphNode start_node = KnowledgeGraphNode( - id=str(node_record["node_id"]), + id=f"{node_record['node_id']}", labels=list(node_record["n"].labels), properties=dict(node_record["n"]), ) From af26d656985e0d9dd722c1cc8ea0d65f6348dc79 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 8 Mar 2025 10:23:27 +0800 Subject: [PATCH 14/54] Convert _ensure_label method from async to sync --- lightrag/kg/neo4j_impl.py | 40 ++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 05deb0a9..cf3c024f 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -176,11 +176,17 @@ class Neo4JStorage(BaseGraphStorage): # Noe4J handles persistence automatically pass - async def _ensure_label(self, label: str) -> str: + def _ensure_label(self, label: str) -> str: """Ensure a label is valid Args: label: The label to validate + + Returns: + str: The cleaned label + + Raises: + ValueError: If label is empty after cleaning """ clean_label = label.strip('"') if not clean_label: @@ -201,7 +207,7 @@ class Neo4JStorage(BaseGraphStorage): ValueError: If node_id is invalid Exception: If there is an error executing the query """ - entity_name_label = await self._ensure_label(node_id) + entity_name_label = self._ensure_label(node_id) async with self._driver.session( database=self._DATABASE, default_access_mode="READ" ) as session: @@ -233,8 +239,8 @@ class Neo4JStorage(BaseGraphStorage): ValueError: If either node_id is invalid Exception: If there is an error executing the query """ - entity_name_label_source = await self._ensure_label(source_node_id) - entity_name_label_target = await self._ensure_label(target_node_id) + entity_name_label_source = self._ensure_label(source_node_id) + entity_name_label_target = self._ensure_label(target_node_id) async with self._driver.session( database=self._DATABASE, default_access_mode="READ" @@ -269,7 +275,7 @@ class Neo4JStorage(BaseGraphStorage): ValueError: If node_id is invalid Exception: If there is an error executing the query """ - entity_name_label = await self._ensure_label(node_id) + entity_name_label = self._ensure_label(node_id) async with self._driver.session( database=self._DATABASE, default_access_mode="READ" ) as session: @@ -314,7 +320,7 @@ class Neo4JStorage(BaseGraphStorage): ValueError: If node_id is invalid Exception: If there is an error executing the query """ - entity_name_label = await self._ensure_label(node_id) + entity_name_label = self._ensure_label(node_id) async with self._driver.session( 
database=self._DATABASE, default_access_mode="READ" @@ -363,8 +369,8 @@ class Neo4JStorage(BaseGraphStorage): Returns: int: Sum of the degrees of both nodes """ - entity_name_label_source = await self._ensure_label(src_id) - entity_name_label_target = await self._ensure_label(tgt_id) + entity_name_label_source = self._ensure_label(src_id) + entity_name_label_target = self._ensure_label(tgt_id) src_degree = await self.node_degree(entity_name_label_source) trg_degree = await self.node_degree(entity_name_label_target) @@ -393,8 +399,8 @@ class Neo4JStorage(BaseGraphStorage): Exception: If there is an error executing the query """ try: - entity_name_label_source = await self._ensure_label(source_node_id) - entity_name_label_target = await self._ensure_label(target_node_id) + entity_name_label_source = self._ensure_label(source_node_id) + entity_name_label_target = self._ensure_label(target_node_id) async with self._driver.session( database=self._DATABASE, default_access_mode="READ" @@ -484,7 +490,7 @@ class Neo4JStorage(BaseGraphStorage): Exception: If there is an error executing the query """ try: - node_label = await self._ensure_label(source_node_id) + node_label = self._ensure_label(source_node_id) query = f"""MATCH (n:`{node_label}`) OPTIONAL MATCH (n)-[r]-(connected) RETURN n, r, connected""" @@ -543,7 +549,7 @@ class Neo4JStorage(BaseGraphStorage): node_id: The unique identifier for the node (used as label) node_data: Dictionary of node properties """ - label = await self._ensure_label(node_id) + label = self._ensure_label(node_id) properties = node_data async def _do_upsert(tx: AsyncManagedTransaction): @@ -591,8 +597,8 @@ class Neo4JStorage(BaseGraphStorage): Raises: ValueError: If either source or target node does not exist """ - source_label = await self._ensure_label(source_node_id) - target_label = await self._ensure_label(target_node_id) + source_label = self._ensure_label(source_node_id) + target_label = self._ensure_label(target_node_id) edge_properties = edge_data # Check if both nodes exist @@ -966,7 +972,7 @@ class Neo4JStorage(BaseGraphStorage): Args: node_id: The label of the node to delete """ - label = await self._ensure_label(node_id) + label = self._ensure_label(node_id) async def _do_delete(tx: AsyncManagedTransaction): query = f""" @@ -1024,8 +1030,8 @@ class Neo4JStorage(BaseGraphStorage): edges: List of edges to be deleted, each edge is a (source, target) tuple """ for source, target in edges: - source_label = await self._ensure_label(source) - target_label = await self._ensure_label(target) + source_label = self._ensure_label(source) + target_label = self._ensure_label(target) async def _do_delete_edge(tx: AsyncManagedTransaction): query = f""" From 887f6ed81a2cb6036163105433b160e1343daf98 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 8 Mar 2025 11:20:22 +0800 Subject: [PATCH 15/54] Fix: return empty list when no edges are found --- lightrag/kg/neo4j_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index cf3c024f..34226df7 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -520,7 +520,7 @@ class Neo4JStorage(BaseGraphStorage): edges.append((source_label, target_label)) await results.consume() # Ensure results are consumed - return edges if edges else None + return edges except Exception as e: logger.error(f"Error getting edges for node {node_label}: {str(e)}") await results.consume() # Ensure results are consumed even on error raise From 22a93fb717b7a66dda345fbacdb2e6d5df874707 Mon Sep 17
00:00:00 2001 From: yangdx Date: Sat, 8 Mar 2025 11:29:08 +0800 Subject: [PATCH 16/54] Limit neighbor nodes fetch to 1000 in Neo4JStorage. --- lightrag/kg/neo4j_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 34226df7..7e1007b9 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -843,7 +843,7 @@ class Neo4JStorage(BaseGraphStorage): results = await session.run(query, {"node_id": node.id}) # Get all records and release database connection - records = await results.fetch() + records = await results.fetch(1000) # Max neighbour nodes we can handled await results.consume() # Ensure results are consumed # Nodes not connected to start node need to check degree From fb4a4c736edca76f8ab5968c0b4d8869bec94bf2 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sat, 8 Mar 2025 11:36:24 +0800 Subject: [PATCH 17/54] Add duplicate edge upsert checking and logging --- lightrag/kg/neo4j_impl.py | 78 ++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 34 deletions(-) diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 7e1007b9..1e46798a 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -412,9 +412,7 @@ class Neo4JStorage(BaseGraphStorage): result = await session.run(query) try: - records = await result.fetch( - 2 - ) # Get up to 2 records to check for duplicates + records = await result.fetch(2) if len(records) > 1: logger.warning( @@ -552,20 +550,20 @@ class Neo4JStorage(BaseGraphStorage): label = self._ensure_label(node_id) properties = node_data - async def _do_upsert(tx: AsyncManagedTransaction): - query = f""" - MERGE (n:`{label}`) - SET n += $properties - """ - result = await tx.run(query, properties=properties) - logger.debug( - f"Upserted node with label '{label}' and properties: {properties}" - ) - await result.consume() # Ensure result is fully consumed - try: async with self._driver.session(database=self._DATABASE) as session: - await session.execute_write(_do_upsert) + async def execute_upsert(tx: AsyncManagedTransaction): + query = f""" + MERGE (n:`{label}`) + SET n += $properties + """ + result = await tx.run(query, properties=properties) + logger.debug( + f"Upserted node with label '{label}' and properties: {properties}" + ) + await result.consume() # Ensure result is fully consumed + + await session.execute_write(execute_upsert) except Exception as e: logger.error(f"Error during upsert: {str(e)}") raise @@ -614,27 +612,39 @@ class Neo4JStorage(BaseGraphStorage): f"Neo4j: target node with label '{target_label}' does not exist" ) - async def _do_upsert_edge(tx: AsyncManagedTransaction): - query = f""" - MATCH (source:`{source_label}`) - WITH source - MATCH (target:`{target_label}`) - MERGE (source)-[r:DIRECTED]-(target) - SET r += $properties - RETURN r - """ - result = await tx.run(query, properties=edge_properties) - try: - record = await result.single() - logger.debug( - f"Upserted edge from '{source_label}' to '{target_label}' with properties: {edge_properties}, result: {record['r'] if record else None}" - ) - finally: - await result.consume() # Ensure result is consumed - try: async with self._driver.session(database=self._DATABASE) as session: - await session.execute_write(_do_upsert_edge) + async def execute_upsert(tx: AsyncManagedTransaction): + query = f""" + MATCH (source:`{source_label}`) + WITH source + MATCH (target:`{target_label}`) + MERGE (source)-[r:DIRECTED]-(target) + SET r += $properties + RETURN r, source, 
target + """ + result = await tx.run(query, properties=edge_properties) + try: + records = await result.fetch(100) + if len(records) > 1: + source_nodes = [dict(r['source']) for r in records] + target_nodes = [dict(r['target']) for r in records] + logger.warning( + f"Multiple edges created: found {len(records)} results for edge between " + f"source label '{source_label}' and target label '{target_label}'. " + f"Source nodes: {source_nodes}, " + f"Target nodes: {target_nodes}. " + "Using first edge only." + ) + if records: + logger.debug( + f"Upserted edge from '{source_label}' to '{target_label}' " + f"with properties: {edge_properties}" + ) + finally: + await result.consume() # Ensure result is consumed + + await session.execute_write(execute_upsert) except Exception as e: logger.error(f"Error during edge upsert: {str(e)}") raise From 59a2202e7c354333a02ff7d907a53306dea3c41e Mon Sep 17 00:00:00 2001 From: baoheping <1340473515@qq.com> Date: Sat, 8 Mar 2025 09:26:21 +0000 Subject: [PATCH 18/54] Added Minimum Degree --- lightrag_webui/bun.lock | 10 + lightrag_webui/package.json | 2 + lightrag_webui/src/components/ThemeToggle.tsx | 6 +- .../documents/ClearDocumentsDialog.tsx | 18 +- .../documents/UploadDocumentsDialog.tsx | 22 +- .../components/graph/FullScreenControl.tsx | 6 +- .../src/components/graph/GraphLabels.tsx | 10 +- .../src/components/graph/GraphSearch.tsx | 6 +- .../src/components/graph/LayoutsControl.tsx | 9 +- .../src/components/graph/PropertiesView.tsx | 27 +- .../src/components/graph/Settings.tsx | 37 +-- .../src/components/graph/StatusCard.tsx | 38 +-- .../src/components/graph/StatusIndicator.tsx | 4 +- .../src/components/graph/ZoomControl.tsx | 8 +- .../src/components/retrieval/ChatMessage.tsx | 7 +- .../components/retrieval/QuerySettings.tsx | 84 ++++--- .../src/features/DocumentManager.tsx | 46 ++-- .../src/features/RetrievalTesting.tsx | 14 +- lightrag_webui/src/features/SiteHeader.tsx | 13 +- lightrag_webui/src/i18n.js | 21 ++ lightrag_webui/src/locales/en.json | 234 +++++++++++++++++ lightrag_webui/src/locales/zh.json | 236 ++++++++++++++++++ lightrag_webui/src/main.tsx | 2 + 23 files changed, 705 insertions(+), 155 deletions(-) create mode 100644 lightrag_webui/src/i18n.js create mode 100644 lightrag_webui/src/locales/en.json create mode 100644 lightrag_webui/src/locales/zh.json diff --git a/lightrag_webui/bun.lock b/lightrag_webui/bun.lock index 6157e38c..3ca0d887 100644 --- a/lightrag_webui/bun.lock +++ b/lightrag_webui/bun.lock @@ -34,11 +34,13 @@ "cmdk": "^1.0.4", "graphology": "^0.26.0", "graphology-generators": "^0.11.2", + "i18next": "^24.2.2", "lucide-react": "^0.475.0", "minisearch": "^7.1.2", "react": "^19.0.0", "react-dom": "^19.0.0", "react-dropzone": "^14.3.6", + "react-i18next": "^15.4.1", "react-markdown": "^9.1.0", "react-number-format": "^5.4.3", "react-syntax-highlighter": "^15.6.1", @@ -765,8 +767,12 @@ "hoist-non-react-statics": ["hoist-non-react-statics@3.3.2", "", { "dependencies": { "react-is": "^16.7.0" } }, "sha512-/gGivxi8JPKWNm/W0jSmzcMPpfpPLc3dY/6GxhX2hQ9iGj3aDfklV4ET7NjKpSinLpJ5vafa9iiGIEZg10SfBw=="], + "html-parse-stringify": ["html-parse-stringify@3.0.1", "", { "dependencies": { "void-elements": "3.1.0" } }, "sha512-KknJ50kTInJ7qIScF3jeaFRpMpE8/lfiTdzf/twXyPBLAGrLRTmkz3AdTnKeh40X8k9L2fdYwEp/42WGXIRGcg=="], + "html-url-attributes": ["html-url-attributes@3.0.1", "", {}, "sha512-ol6UPyBWqsrO6EJySPz2O7ZSr856WDrEzM5zMqp+FJJLGMW35cLYmmZnl0vztAZxRUoNZJFTCohfjuIJ8I4QBQ=="], + "i18next": ["i18next@24.2.2", "", { "dependencies": { 
"@babel/runtime": "^7.23.2" }, "peerDependencies": { "typescript": "^5" }, "optionalPeers": ["typescript"] }, "sha512-NE6i86lBCKRYZa5TaUDkU5S4HFgLIEJRLr3Whf2psgaxBleQ2LC1YW1Vc+SCgkAW7VEzndT6al6+CzegSUHcTQ=="], + "ignore": ["ignore@5.3.2", "", {}, "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g=="], "import-fresh": ["import-fresh@3.3.1", "", { "dependencies": { "parent-module": "^1.0.0", "resolve-from": "^4.0.0" } }, "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ=="], @@ -1093,6 +1099,8 @@ "react-dropzone": ["react-dropzone@14.3.6", "", { "dependencies": { "attr-accept": "^2.2.4", "file-selector": "^2.1.0", "prop-types": "^15.8.1" }, "peerDependencies": { "react": ">= 16.8 || 18.0.0" } }, "sha512-U792j+x0rcwH/U/Slv/OBNU/LGFYbDLHKKiJoPhNaOianayZevCt4Y5S0CraPssH/6/wT6xhKDfzdXUgCBS0HQ=="], + "react-i18next": ["react-i18next@15.4.1", "", { "dependencies": { "@babel/runtime": "^7.25.0", "html-parse-stringify": "^3.0.1" }, "peerDependencies": { "i18next": ">= 23.2.3", "react": ">= 16.8.0" } }, "sha512-ahGab+IaSgZmNPYXdV1n+OYky95TGpFwnKRflX/16dY04DsYYKHtVLjeny7sBSCREEcoMbAgSkFiGLF5g5Oofw=="], + "react-is": ["react-is@16.13.1", "", {}, "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ=="], "react-markdown": ["react-markdown@9.1.0", "", { "dependencies": { "@types/hast": "^3.0.0", "@types/mdast": "^4.0.0", "devlop": "^1.0.0", "hast-util-to-jsx-runtime": "^2.0.0", "html-url-attributes": "^3.0.0", "mdast-util-to-hast": "^13.0.0", "remark-parse": "^11.0.0", "remark-rehype": "^11.0.0", "unified": "^11.0.0", "unist-util-visit": "^5.0.0", "vfile": "^6.0.0" }, "peerDependencies": { "@types/react": ">=18", "react": ">=18" } }, "sha512-xaijuJB0kzGiUdG7nc2MOMDUDBWPyGAjZtUrow9XxUeua8IqeP+VlIfAZ3bphpcLTnSZXz6z9jcVC/TCwbfgdw=="], @@ -1271,6 +1279,8 @@ "vite": ["vite@6.1.1", "", { "dependencies": { "esbuild": "^0.24.2", "postcss": "^8.5.2", "rollup": "^4.30.1" }, "optionalDependencies": { "fsevents": "~2.3.3" }, "peerDependencies": { "@types/node": "^18.0.0 || ^20.0.0 || >=22.0.0", "jiti": ">=1.21.0", "less": "*", "lightningcss": "^1.21.0", "sass": "*", "sass-embedded": "*", "stylus": "*", "sugarss": "*", "terser": "^5.16.0", "tsx": "^4.8.1", "yaml": "^2.4.2" }, "optionalPeers": ["@types/node", "jiti", "less", "lightningcss", "sass", "sass-embedded", "stylus", "sugarss", "terser", "tsx", "yaml"], "bin": { "vite": "bin/vite.js" } }, "sha512-4GgM54XrwRfrOp297aIYspIti66k56v16ZnqHvrIM7mG+HjDlAwS7p+Srr7J6fGvEdOJ5JcQ/D9T7HhtdXDTzA=="], + "void-elements": ["void-elements@3.1.0", "", {}, "sha512-Dhxzh5HZuiHQhbvTW9AMetFfBHDMYpo23Uo9btPXgdYP+3T5S+p+jgNy7spra+veYhBP2dCSgxR/i2Y02h5/6w=="], + "which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="], "which-boxed-primitive": ["which-boxed-primitive@1.1.1", "", { "dependencies": { "is-bigint": "^1.1.0", "is-boolean-object": "^1.2.1", "is-number-object": "^1.1.1", "is-string": "^1.1.1", "is-symbol": "^1.1.1" } }, "sha512-TbX3mj8n0odCBFVlY8AxkqcHASw3L60jIuF8jFP78az3C2YhmGvqbHBpAjTRH2/xqYunrJ9g1jSyjCjpoWzIAA=="], diff --git a/lightrag_webui/package.json b/lightrag_webui/package.json index 578ee36f..97fba74d 100644 --- a/lightrag_webui/package.json +++ b/lightrag_webui/package.json @@ -43,11 +43,13 @@ "cmdk": "^1.0.4", "graphology": "^0.26.0", "graphology-generators": "^0.11.2", + 
"i18next": "^24.2.2", "lucide-react": "^0.475.0", "minisearch": "^7.1.2", "react": "^19.0.0", "react-dom": "^19.0.0", "react-dropzone": "^14.3.6", + "react-i18next": "^15.4.1", "react-markdown": "^9.1.0", "react-number-format": "^5.4.3", "react-syntax-highlighter": "^15.6.1", diff --git a/lightrag_webui/src/components/ThemeToggle.tsx b/lightrag_webui/src/components/ThemeToggle.tsx index 8e92d862..ff333ff0 100644 --- a/lightrag_webui/src/components/ThemeToggle.tsx +++ b/lightrag_webui/src/components/ThemeToggle.tsx @@ -3,6 +3,7 @@ import useTheme from '@/hooks/useTheme' import { MoonIcon, SunIcon } from 'lucide-react' import { useCallback } from 'react' import { controlButtonVariant } from '@/lib/constants' +import { useTranslation } from 'react-i18next' /** * Component that toggles the theme between light and dark. @@ -11,13 +12,14 @@ export default function ThemeToggle() { const { theme, setTheme } = useTheme() const setLight = useCallback(() => setTheme('light'), [setTheme]) const setDark = useCallback(() => setTheme('dark'), [setTheme]) + const { t } = useTranslation() if (theme === 'dark') { return ( e.preventDefault()}> - Clear documents - Do you really want to clear all documents? + {t('documentPanel.clearDocuments.title')} + {t('documentPanel.clearDocuments.confirm')} diff --git a/lightrag_webui/src/components/documents/UploadDocumentsDialog.tsx b/lightrag_webui/src/components/documents/UploadDocumentsDialog.tsx index 7149eb28..7f17393c 100644 --- a/lightrag_webui/src/components/documents/UploadDocumentsDialog.tsx +++ b/lightrag_webui/src/components/documents/UploadDocumentsDialog.tsx @@ -14,8 +14,10 @@ import { errorMessage } from '@/lib/utils' import { uploadDocument } from '@/api/lightrag' import { UploadIcon } from 'lucide-react' +import { useTranslation } from 'react-i18next' export default function UploadDocumentsDialog() { + const { t } = useTranslation() const [open, setOpen] = useState(false) const [isUploading, setIsUploading] = useState(false) const [progresses, setProgresses] = useState>({}) @@ -29,24 +31,24 @@ export default function UploadDocumentsDialog() { filesToUpload.map(async (file) => { try { const result = await uploadDocument(file, (percentCompleted: number) => { - console.debug(`Uploading ${file.name}: ${percentCompleted}%`) + console.debug(t('documentPanel.uploadDocuments.uploading', { name: file.name, percent: percentCompleted })) setProgresses((pre) => ({ ...pre, [file.name]: percentCompleted })) }) if (result.status === 'success') { - toast.success(`Upload Success:\n${file.name} uploaded successfully`) + toast.success(t('documentPanel.uploadDocuments.success', { name: file.name })) } else { - toast.error(`Upload Failed:\n${file.name}\n${result.message}`) + toast.error(t('documentPanel.uploadDocuments.failed', { name: file.name, message: result.message })) } } catch (err) { - toast.error(`Upload Failed:\n${file.name}\n${errorMessage(err)}`) + toast.error(t('documentPanel.uploadDocuments.error', { name: file.name, error: errorMessage(err) })) } }) ) } catch (err) { - toast.error('Upload Failed\n' + errorMessage(err)) + toast.error(t('documentPanel.uploadDocuments.generalError', { error: errorMessage(err) })) } finally { setIsUploading(false) // setOpen(false) @@ -66,21 +68,21 @@ export default function UploadDocumentsDialog() { }} > - e.preventDefault()}> - Upload documents + {t('documentPanel.uploadDocuments.title')} - Drag and drop your documents here or click to browse. 
+ {t('documentPanel.uploadDocuments.description')} { const { isFullScreen, toggle } = useFullScreen() + const { t } = useTranslation() return ( <> {isFullScreen ? ( - ) : ( - )} diff --git a/lightrag_webui/src/components/graph/GraphLabels.tsx b/lightrag_webui/src/components/graph/GraphLabels.tsx index a3849e1f..7bc26c88 100644 --- a/lightrag_webui/src/components/graph/GraphLabels.tsx +++ b/lightrag_webui/src/components/graph/GraphLabels.tsx @@ -5,6 +5,7 @@ import { useSettingsStore } from '@/stores/settings' import { useGraphStore } from '@/stores/graph' import { labelListLimit } from '@/lib/constants' import MiniSearch from 'minisearch' +import { useTranslation } from 'react-i18next' const lastGraph: any = { graph: null, @@ -13,6 +14,7 @@ const lastGraph: any = { } const GraphLabels = () => { + const { t } = useTranslation() const label = useSettingsStore.use.queryLabel() const graph = useGraphStore.use.sigmaGraph() @@ -69,7 +71,7 @@ const GraphLabels = () => { return result.length <= labelListLimit ? result - : [...result.slice(0, labelListLimit), `And ${result.length - labelListLimit} others`] + : [...result.slice(0, labelListLimit), t('graphLabels.andOthers', { count: result.length - labelListLimit })] }, [getSearchEngine] ) @@ -84,14 +86,14 @@ const GraphLabels = () => { className="ml-2" triggerClassName="max-h-8" searchInputClassName="max-h-8" - triggerTooltip="Select query label" + triggerTooltip={t('graphPanel.graphLabels.selectTooltip')} fetcher={fetchData} renderOption={(item) =>
{item}
} getOptionValue={(item) => item} getDisplayValue={(item) =>
{item}
} notFound={
No labels found
} - label="Label" - placeholder="Search labels..." + label={t('graphPanel.graphLabels.label')} + placeholder={t('graphPanel.graphLabels.placeholder')} value={label !== null ? label : ''} onChange={setQueryLabel} /> diff --git a/lightrag_webui/src/components/graph/GraphSearch.tsx b/lightrag_webui/src/components/graph/GraphSearch.tsx index 3edc3ede..bbb8cb5b 100644 --- a/lightrag_webui/src/components/graph/GraphSearch.tsx +++ b/lightrag_webui/src/components/graph/GraphSearch.tsx @@ -9,6 +9,7 @@ import { AsyncSearch } from '@/components/ui/AsyncSearch' import { searchResultLimit } from '@/lib/constants' import { useGraphStore } from '@/stores/graph' import MiniSearch from 'minisearch' +import { useTranslation } from 'react-i18next' interface OptionItem { id: string @@ -44,6 +45,7 @@ export const GraphSearchInput = ({ onFocus?: GraphSearchInputProps['onFocus'] value?: GraphSearchInputProps['value'] }) => { + const { t } = useTranslation() const graph = useGraphStore.use.sigmaGraph() const searchEngine = useMemo(() => { @@ -97,7 +99,7 @@ export const GraphSearchInput = ({ { type: 'message', id: messageId, - message: `And ${result.length - searchResultLimit} others` + message: t('graphPanel.search.message', { count: result.length - searchResultLimit }) } ] }, @@ -118,7 +120,7 @@ export const GraphSearchInput = ({ if (id !== messageId && onFocus) onFocus(id ? { id, type: 'nodes' } : null) }} label={'item'} - placeholder="Search nodes..." + placeholder={t('graphPanel.search.placeholder')} /> ) } diff --git a/lightrag_webui/src/components/graph/LayoutsControl.tsx b/lightrag_webui/src/components/graph/LayoutsControl.tsx index c57b371a..0ed97f2f 100644 --- a/lightrag_webui/src/components/graph/LayoutsControl.tsx +++ b/lightrag_webui/src/components/graph/LayoutsControl.tsx @@ -16,6 +16,7 @@ import { controlButtonVariant } from '@/lib/constants' import { useSettingsStore } from '@/stores/settings' import { GripIcon, PlayIcon, PauseIcon } from 'lucide-react' +import { useTranslation } from 'react-i18next' type LayoutName = | 'Circular' @@ -28,6 +29,7 @@ type LayoutName = const WorkerLayoutControl = ({ layout, autoRunFor }: WorkerLayoutControlProps) => { const sigma = useSigma() const { stop, start, isRunning } = layout + const { t } = useTranslation() /** * Init component when Sigma or component settings change. @@ -61,7 +63,7 @@ const WorkerLayoutControl = ({ layout, autoRunFor }: WorkerLayoutControlProps) = @@ -166,7 +169,7 @@ const LayoutsControl = () => { key={name} className="cursor-pointer text-xs" > - {name} + {t(`graphPanel.sideBar.layoutsControl.layouts.${name}`)} ))} diff --git a/lightrag_webui/src/components/graph/PropertiesView.tsx b/lightrag_webui/src/components/graph/PropertiesView.tsx index dec80460..4571b02b 100644 --- a/lightrag_webui/src/components/graph/PropertiesView.tsx +++ b/lightrag_webui/src/components/graph/PropertiesView.tsx @@ -2,6 +2,7 @@ import { useEffect, useState } from 'react' import { useGraphStore, RawNodeType, RawEdgeType } from '@/stores/graph' import Text from '@/components/ui/Text' import useLightragGraph from '@/hooks/useLightragGraph' +import { useTranslation } from 'react-i18next' /** * Component that view properties of elements in graph. @@ -147,21 +148,22 @@ const PropertyRow = ({ } const NodePropertiesView = ({ node }: { node: NodeType }) => { + const { t } = useTranslation() return (
- +
- + { useGraphStore.getState().setSelectedNode(node.id, true) }} /> - +
- +
{Object.keys(node.properties) .sort() @@ -172,7 +174,7 @@ const NodePropertiesView = ({ node }: { node: NodeType }) => { {node.relationships.length > 0 && ( <>
{node.relationships.map(({ type, id, label }) => { @@ -195,28 +197,29 @@ const NodePropertiesView = ({ node }: { node: NodeType }) => { } const EdgePropertiesView = ({ edge }: { edge: EdgeType }) => { + const { t } = useTranslation() return (
- +
- - {edge.type && } + + {edge.type && } { useGraphStore.getState().setSelectedNode(edge.source, true) }} /> { useGraphStore.getState().setSelectedNode(edge.target, true) }} />
- +
{Object.keys(edge.properties) .sort() diff --git a/lightrag_webui/src/components/graph/Settings.tsx b/lightrag_webui/src/components/graph/Settings.tsx index 4d2b998d..ddf05d40 100644 --- a/lightrag_webui/src/components/graph/Settings.tsx +++ b/lightrag_webui/src/components/graph/Settings.tsx @@ -10,6 +10,7 @@ import { useSettingsStore } from '@/stores/settings' import { useBackendState } from '@/stores/state' import { SettingsIcon } from 'lucide-react' +import { useTranslation } from "react-i18next"; /** * Component that displays a checkbox with a label. @@ -195,10 +196,12 @@ export default function Settings() { [setTempApiKey] ) + const { t } = useTranslation(); + return ( - @@ -212,7 +215,7 @@ export default function Settings() { @@ -220,12 +223,12 @@ export default function Settings() { @@ -233,12 +236,12 @@ export default function Settings() { @@ -246,28 +249,34 @@ export default function Settings() { +
- +
e.preventDefault()}>
@@ -295,7 +304,7 @@ export default function Settings() { size="sm" className="max-h-full shrink-0" > - Save + {t("graphPanel.sideBar.settings.save")}
diff --git a/lightrag_webui/src/components/graph/StatusCard.tsx b/lightrag_webui/src/components/graph/StatusCard.tsx index 3084d103..e67cbd30 100644 --- a/lightrag_webui/src/components/graph/StatusCard.tsx +++ b/lightrag_webui/src/components/graph/StatusCard.tsx @@ -1,58 +1,60 @@ import { LightragStatus } from '@/api/lightrag' +import { useTranslation } from 'react-i18next' const StatusCard = ({ status }: { status: LightragStatus | null }) => { + const { t } = useTranslation() if (!status) { - return
Status information unavailable
+ return
{t('graphPanel.statusCard.unavailable')}
} return (
-

Storage Info

+

{t('graphPanel.statusCard.storageInfo')}

- Working Directory: + {t('graphPanel.statusCard.workingDirectory')}: {status.working_directory} - Input Directory: + {t('graphPanel.statusCard.inputDirectory')}: {status.input_directory}
-

LLM Configuration

+

{t('graphPanel.statusCard.llmConfig')}

- LLM Binding: + {t('graphPanel.statusCard.llmBinding')}: {status.configuration.llm_binding} - LLM Binding Host: + {t('graphPanel.statusCard.llmBindingHost')}: {status.configuration.llm_binding_host} - LLM Model: + {t('graphPanel.statusCard.llmModel')}: {status.configuration.llm_model} - Max Tokens: + {t('graphPanel.statusCard.maxTokens')}: {status.configuration.max_tokens}
-

Embedding Configuration

+

{t('graphPanel.statusCard.embeddingConfig')}

- Embedding Binding: + {t('graphPanel.statusCard.embeddingBinding')}: {status.configuration.embedding_binding} - Embedding Binding Host: + {t('graphPanel.statusCard.embeddingBindingHost')}: {status.configuration.embedding_binding_host} - Embedding Model: + {t('graphPanel.statusCard.embeddingModel')}: {status.configuration.embedding_model}
-

Storage Configuration

+

{t('graphPanel.statusCard.storageConfig')}

- KV Storage: + {t('graphPanel.statusCard.kvStorage')}: {status.configuration.kv_storage} - Doc Status Storage: + {t('graphPanel.statusCard.docStatusStorage')}: {status.configuration.doc_status_storage} - Graph Storage: + {t('graphPanel.statusCard.graphStorage')}: {status.configuration.graph_storage} - Vector Storage: + {t('graphPanel.statusCard.vectorStorage')}: {status.configuration.vector_storage}
diff --git a/lightrag_webui/src/components/graph/StatusIndicator.tsx b/lightrag_webui/src/components/graph/StatusIndicator.tsx index 3272d9fa..d7a1831f 100644 --- a/lightrag_webui/src/components/graph/StatusIndicator.tsx +++ b/lightrag_webui/src/components/graph/StatusIndicator.tsx @@ -3,8 +3,10 @@ import { useBackendState } from '@/stores/state' import { useEffect, useState } from 'react' import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/Popover' import StatusCard from '@/components/graph/StatusCard' +import { useTranslation } from 'react-i18next' const StatusIndicator = () => { + const { t } = useTranslation() const health = useBackendState.use.health() const lastCheckTime = useBackendState.use.lastCheckTime() const status = useBackendState.use.status() @@ -33,7 +35,7 @@ const StatusIndicator = () => { )} /> - {health ? 'Connected' : 'Disconnected'} + {health ? t('graphPanel.statusIndicator.connected') : t('graphPanel.statusIndicator.disconnected')}
diff --git a/lightrag_webui/src/components/graph/ZoomControl.tsx b/lightrag_webui/src/components/graph/ZoomControl.tsx index 790b4423..0aa55416 100644 --- a/lightrag_webui/src/components/graph/ZoomControl.tsx +++ b/lightrag_webui/src/components/graph/ZoomControl.tsx @@ -3,12 +3,14 @@ import { useCallback } from 'react' import Button from '@/components/ui/Button' import { ZoomInIcon, ZoomOutIcon, FullscreenIcon } from 'lucide-react' import { controlButtonVariant } from '@/lib/constants' +import { useTranslation } from "react-i18next"; /** * Component that provides zoom controls for the graph viewer. */ const ZoomControl = () => { const { zoomIn, zoomOut, reset } = useCamera({ duration: 200, factor: 1.5 }) + const { t } = useTranslation(); const handleZoomIn = useCallback(() => zoomIn(), [zoomIn]) const handleZoomOut = useCallback(() => zoomOut(), [zoomOut]) @@ -16,16 +18,16 @@ const ZoomControl = () => { return ( <> - -
@@ -98,29 +100,29 @@ export default function DocumentManager() { - Uploaded documents - view the uploaded documents here + {t('documentPanel.documentManager.uploadedTitle')} + {t('documentPanel.documentManager.uploadedDescription')} {!docs && ( )} {docs && ( - ID - Summary - Status - Length - Chunks - Created - Updated - Metadata + {t('documentPanel.documentManager.columns.id')} + {t('documentPanel.documentManager.columns.summary')} + {t('documentPanel.documentManager.columns.status')} + {t('documentPanel.documentManager.columns.length')} + {t('documentPanel.documentManager.columns.chunks')} + {t('documentPanel.documentManager.columns.created')} + {t('documentPanel.documentManager.columns.updated')} + {t('documentPanel.documentManager.columns.metadata')} @@ -137,13 +139,13 @@ export default function DocumentManager() { {status === 'processed' && ( - Completed + {t('documentPanel.documentManager.status.completed')} )} {status === 'processing' && ( - Processing + {t('documentPanel.documentManager.status.processing')} )} - {status === 'pending' && Pending} - {status === 'failed' && Failed} + {status === 'pending' && {t('documentPanel.documentManager.status.pending')}} + {status === 'failed' && {t('documentPanel.documentManager.status.failed')}} {doc.error && ( ⚠️ diff --git a/lightrag_webui/src/features/RetrievalTesting.tsx b/lightrag_webui/src/features/RetrievalTesting.tsx index 340255a2..84955aa1 100644 --- a/lightrag_webui/src/features/RetrievalTesting.tsx +++ b/lightrag_webui/src/features/RetrievalTesting.tsx @@ -8,8 +8,10 @@ import { useDebounce } from '@/hooks/useDebounce' import QuerySettings from '@/components/retrieval/QuerySettings' import { ChatMessage, MessageWithError } from '@/components/retrieval/ChatMessage' import { EraserIcon, SendIcon } from 'lucide-react' +import { useTranslation } from 'react-i18next' export default function RetrievalTesting() { + const { t } = useTranslation() const [messages, setMessages] = useState( () => useSettingsStore.getState().retrievalHistory || [] ) @@ -89,7 +91,7 @@ export default function RetrievalTesting() { } } catch (err) { // Handle error - updateAssistantMessage(`Error: Failed to get response\n${errorMessage(err)}`, true) + updateAssistantMessage(`${t('retrievePanel.retrieval.error')}\n${errorMessage(err)}`, true) } finally { // Clear loading and add messages to state setIsLoading(false) @@ -98,7 +100,7 @@ export default function RetrievalTesting() { .setRetrievalHistory([...prevMessages, userMessage, assistantMessage]) } }, - [inputValue, isLoading, messages, setMessages] + [inputValue, isLoading, messages, setMessages, t] ) const debouncedMessages = useDebounce(messages, 100) @@ -117,7 +119,7 @@ export default function RetrievalTesting() {
{messages.length === 0 ? (
- Start a retrieval by typing your query below + {t('retrievePanel.retrieval.startPrompt')}
) : ( messages.map((message, idx) => ( @@ -143,18 +145,18 @@ export default function RetrievalTesting() { size="sm" > - Clear + {t('retrievePanel.retrieval.clear')} setInputValue(e.target.value)} - placeholder="Type your query..." + placeholder={t('retrievePanel.retrieval.placeholder')} disabled={isLoading} />
diff --git a/lightrag_webui/src/features/SiteHeader.tsx b/lightrag_webui/src/features/SiteHeader.tsx index c09ce089..ac3bdd70 100644 --- a/lightrag_webui/src/features/SiteHeader.tsx +++ b/lightrag_webui/src/features/SiteHeader.tsx @@ -4,6 +4,7 @@ import ThemeToggle from '@/components/ThemeToggle' import { TabsList, TabsTrigger } from '@/components/ui/Tabs' import { useSettingsStore } from '@/stores/settings' import { cn } from '@/lib/utils' +import { useTranslation } from 'react-i18next' import { ZapIcon, GithubIcon } from 'lucide-react' @@ -29,21 +30,22 @@ function NavigationTab({ value, currentTab, children }: NavigationTabProps) { function TabsNavigation() { const currentTab = useSettingsStore.use.currentTab() + const { t } = useTranslation() return (
- Documents + {t('header.documents')} - Knowledge Graph + {t('header.knowledgeGraph')} - Retrieval + {t('header.retrieval')} - API + {t('header.api')}
@@ -51,6 +53,7 @@ function TabsNavigation() { } export default function SiteHeader() { + const { t } = useTranslation() return (
@@ -64,7 +67,7 @@ export default function SiteHeader() {