From 0ec61d640769a32b141afcf4ef58c5d5a7e9fe11 Mon Sep 17 00:00:00 2001 From: Roy Date: Fri, 7 Mar 2025 18:45:28 +0000 Subject: [PATCH 01/18] Update project dependencies and example test files - Updated requirements.txt with latest package versions - Added support for filtering query results by IDs in base and operate modules - Modified PostgreSQL vector storage to include document and chunk ID fields --- .gitignore | 21 +++++++++++ lightrag/base.py | 5 ++- lightrag/kg/postgres_impl.py | 6 +++- lightrag/lightrag.py | 1 + lightrag/operate.py | 11 +++++- requirements.txt | 67 +++++++++++++++++++++++++++--------- 6 files changed, 91 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index a4afe4ea..4f28427f 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,24 @@ gui/ # unit-test files test_* +Miniconda3-latest-Linux-x86_64.sh +requirements_basic.txt +requirements.txt +examples/test_chromadb.py +examples/test_faiss.py +examples/test_neo4j.py +.gitignore +requirements.txt +examples/test_chromadb.py +examples/test_faiss.py +examples/* +tests/test_lightrag_ollama_chat.py +requirements.txt +requirements.txt +examples/test_chromadb.py +examples/test_faiss.py +examples/test_neo4j.py +tests/test_lightrag_ollama_chat.py +examples/test_chromadb.py +examples/test_faiss.py +examples/test_neo4j.py diff --git a/lightrag/base.py b/lightrag/base.py index 5f6a1bf1..e7ab3127 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -81,6 +81,9 @@ class QueryParam: history_turns: int = 3 """Number of complete conversation turns (user-assistant pairs) to consider in the response context.""" + ids: list[str] | None = None + """List of ids to filter the results.""" + @dataclass class StorageNameSpace(ABC): @@ -107,7 +110,7 @@ class BaseVectorStorage(StorageNameSpace, ABC): meta_fields: set[str] = field(default_factory=set) @abstractmethod - async def query(self, query: str, top_k: int) -> list[dict[str, Any]]: + async def query(self, query: str, top_k: int, ids: list[str] = None) -> list[dict[str, Any]]: """Query the vector storage and retrieve top_k results.""" @abstractmethod diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 54a59f5d..a069cec0 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -492,7 +492,7 @@ class PGVectorStorage(BaseVectorStorage): await self.db.execute(upsert_sql, data) #################### query method ############### - async def query(self, query: str, top_k: int) -> list[dict[str, Any]]: + async def query(self, query: str, top_k: int, ids: list[str] = None) -> list[dict[str, Any]]: embeddings = await self.embedding_func([query]) embedding = embeddings[0] embedding_string = ",".join(map(str, embedding)) @@ -1387,6 +1387,8 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, + document_id VARCHAR(255) NULL, + chunk_id VARCHAR(255) NULL, CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id) )""" }, @@ -1400,6 +1402,8 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, + document_id VARCHAR(255) NULL, + chunk_id VARCHAR(255) NULL, CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id) )""" }, diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 0554ab76..ae6fd9dc 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1243,6 +1243,7 @@ class LightRAG: embedding_func=self.embedding_func, ), system_prompt=system_prompt, + ids = param.ids ) elif param.mode == "naive": 
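# The `ids = param.ids` keyword added above presumes the matching parameter
# that the operate.py hunk below adds to kg_query; the call pattern, sketched
# with the other arguments elided:
#
#     async def kg_query(query, ..., ids: list[str] | None = None): ...
#     response = await kg_query(query, ..., ids=param.ids)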
response = await naive_query( diff --git a/lightrag/operate.py b/lightrag/operate.py index 30983145..6c0e1e4c 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -602,6 +602,7 @@ async def kg_query( global_config: dict[str, str], hashing_kv: BaseKVStorage | None = None, system_prompt: str | None = None, + ids: list[str] | None = None, ) -> str | AsyncIterator[str]: # Handle cache use_model_func = global_config["llm_model_func"] @@ -649,6 +650,7 @@ async def kg_query( relationships_vdb, text_chunks_db, query_param, + ids ) if query_param.only_need_context: @@ -1016,6 +1018,7 @@ async def _build_query_context( relationships_vdb: BaseVectorStorage, text_chunks_db: BaseKVStorage, query_param: QueryParam, + ids: list[str] = None, ): if query_param.mode == "local": entities_context, relations_context, text_units_context = await _get_node_data( @@ -1032,6 +1035,7 @@ async def _build_query_context( relationships_vdb, text_chunks_db, query_param, + ids = ids ) else: # hybrid mode ll_data, hl_data = await asyncio.gather( @@ -1348,11 +1352,16 @@ async def _get_edge_data( relationships_vdb: BaseVectorStorage, text_chunks_db: BaseKVStorage, query_param: QueryParam, + ids: list[str] | None = None, ): logger.info( f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}" ) - results = await relationships_vdb.query(keywords, top_k=query_param.top_k) + if ids: + #TODO: add ids to the query + results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = ids) + else: + results = await relationships_vdb.query(keywords, top_k=query_param.top_k) if not len(results): return "", "", "" diff --git a/requirements.txt b/requirements.txt index d9a5c68e..088d8843 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,50 @@ -aiohttp -configparser -future - -# Basic modules -gensim -pipmaster -pydantic -python-dotenv - -setuptools -tenacity - -# LLM packages -tiktoken - -# Extra libraries are installed when needed using pipmaster +aioboto3==14.1.0 +aiofiles==24.1.0 +aiohttp==3.11.13 +ascii_colors==0.5.2 +asyncpg==0.30.0 +chromadb==0.6.3 +community==1.0.0b1 +docx==0.2.4 +# faiss +fastapi==0.115.11 +glm==0.4.4 +graspologic==3.4.1 +gunicorn==23.0.0 +httpx==0.28.1 +imgui_bundle==1.6.2 +jsonlines==4.0.0 +llama_index==0.12.22 +moderngl==5.12.0 +motor==3.7.0 +nano_vectordb==0.0.4.3 +neo4j==5.28.1 +nest_asyncio==1.6.0 +networkx==3.4.2 +numpy +openpyxl==3.1.5 +oracledb==3.0.0 +Pillow==11.1.0 +pipmaster==0.4.0 +protobuf +psutil==7.0.0 +psycopg==3.2.5 +psycopg_pool==3.2.6 +pydantic==2.10.6 +pymilvus==2.5.4 +pymongo==4.11.2 +PyPDF2==3.0.1 +python-dotenv==1.0.1 +pyvis==0.3.2 +qdrant_client==1.13.3 +redis==5.2.1 +Requests==2.32.3 +sentence_transformers==3.4.1 +setuptools==75.8.0 +SQLAlchemy==2.0.38 +starlette==0.46.0 +tenacity==9.0.0 +tiktoken==0.9.0 +torch==2.6.0 +transformers==4.49.0 +uvicorn==0.34.0 From bbe139cfebad995efc24a1a3e7f375530c6202cd Mon Sep 17 00:00:00 2001 From: Roy Date: Fri, 7 Mar 2025 20:18:01 +0000 Subject: [PATCH 02/18] Enhance PostgreSQL vector storage with chunk_id support - Updated SQL templates for entity and relationship upsert to include chunk_id - Modified PGVectorStorage methods to add chunk_id when inserting or updating records - Expanded database schema to track chunk-level metadata --- lightrag/kg/postgres_impl.py | 11 +++++++---- tests/test_lightrag_ollama_chat.py | 14 +++++++------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py 
index a069cec0..3fc05f59 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -438,6 +438,7 @@ class PGVectorStorage(BaseVectorStorage): "entity_name": item["entity_name"], "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), + "chunk_id": item["source_id"], } return upsert_sql, data @@ -450,6 +451,7 @@ class PGVectorStorage(BaseVectorStorage): "target_id": item["tgt_id"], "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), + "chunk_id": item["source_id"] } return upsert_sql, data @@ -1486,8 +1488,9 @@ SQL_TEMPLATES = { content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP """, - "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, content_vector) - VALUES ($1, $2, $3, $4, $5) + "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, + content_vector, chunk_id) + VALUES ($1, $2, $3, $4, $5, $6) ON CONFLICT (workspace,id) DO UPDATE SET entity_name=EXCLUDED.entity_name, content=EXCLUDED.content, @@ -1495,8 +1498,8 @@ SQL_TEMPLATES = { update_time=CURRENT_TIMESTAMP """, "upsert_relationship": """INSERT INTO LIGHTRAG_VDB_RELATION (workspace, id, source_id, - target_id, content, content_vector) - VALUES ($1, $2, $3, $4, $5, $6) + target_id, content, content_vector, chunk_id) + VALUES ($1, $2, $3, $4, $5, $6, $7) ON CONFLICT (workspace,id) DO UPDATE SET source_id=EXCLUDED.source_id, target_id=EXCLUDED.target_id, diff --git a/tests/test_lightrag_ollama_chat.py b/tests/test_lightrag_ollama_chat.py index 80038928..f19e2974 100644 --- a/tests/test_lightrag_ollama_chat.py +++ b/tests/test_lightrag_ollama_chat.py @@ -38,16 +38,16 @@ class McpError(Exception): DEFAULT_CONFIG = { "server": { - "host": "localhost", - "port": 9621, - "model": "lightrag:latest", + "host": "host.docker.internal", + "port": 11434, + "model": "llama3.2:latest", "timeout": 300, "max_retries": 1, "retry_delay": 1, }, "test_cases": { - "basic": {"query": "唐僧有几个徒弟"}, - "generate": {"query": "电视剧西游记导演是谁"}, + "basic": {"query": "How many disciples did Tang Seng have?"}, + "generate": {"query": "Who directed the TV series Journey to the West?"}, }, } @@ -763,8 +763,8 @@ def parse_args() -> argparse.Namespace: Configuration file (config.json): { "server": { - "host": "localhost", # Server address - "port": 9621, # Server port + "host": "host.docker.internal", # Server address + "port": 11434, # Server port "model": "lightrag:latest" # Default model name }, "test_cases": { From 528fb11364b231981028c99f723804085f722e0a Mon Sep 17 00:00:00 2001 From: Roy Date: Sat, 8 Mar 2025 15:43:17 +0000 Subject: [PATCH 03/18] Refactor vector query methods to support optional ID filtering - Updated BaseVectorStorage query method signature to accept optional IDs - Modified operate.py to pass query parameter IDs to vector storage queries - Updated PostgreSQL vector storage SQL templates to filter results by document IDs - Removed unused parameters and simplified query logic across multiple files --- lightrag/base.py | 3 +- lightrag/kg/postgres_impl.py | 98 ++++++++++++++++++++++++++++-------- lightrag/lightrag.py | 1 - lightrag/operate.py | 17 +++---- 4 files changed, 85 insertions(+), 34 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index e7ab3127..20fe2a5b 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -108,9 +108,8 @@ class BaseVectorStorage(StorageNameSpace, ABC): embedding_func: EmbeddingFunc cosine_better_than_threshold: float = 
field(default=0.2) meta_fields: set[str] = field(default_factory=set) - @abstractmethod - async def query(self, query: str, top_k: int, ids: list[str] = None) -> list[dict[str, Any]]: + async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: """Query the vector storage and retrieve top_k results.""" @abstractmethod diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 3fc05f59..c1ca7aa9 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -439,6 +439,7 @@ class PGVectorStorage(BaseVectorStorage): "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), "chunk_id": item["source_id"], + #TODO: add document_id } return upsert_sql, data @@ -452,6 +453,7 @@ class PGVectorStorage(BaseVectorStorage): "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), "chunk_id": item["source_id"] + #TODO: add document_id } return upsert_sql, data @@ -494,13 +496,19 @@ class PGVectorStorage(BaseVectorStorage): await self.db.execute(upsert_sql, data) #################### query method ############### - async def query(self, query: str, top_k: int, ids: list[str] = None) -> list[dict[str, Any]]: + async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: embeddings = await self.embedding_func([query]) embedding = embeddings[0] embedding_string = ",".join(map(str, embedding)) + if ids: + formatted_ids = ",".join(f"'{id}'" for id in ids) + else: + formatted_ids = "NULL" + sql = SQL_TEMPLATES[self.base_namespace].format( - embedding_string=embedding_string + embedding_string=embedding_string, + doc_ids=formatted_ids ) params = { "workspace": self.db.workspace, @@ -1389,7 +1397,6 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, - document_id VARCHAR(255) NULL, chunk_id VARCHAR(255) NULL, CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id) )""" @@ -1404,7 +1411,6 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, - document_id VARCHAR(255) NULL, chunk_id VARCHAR(255) NULL, CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id) )""" @@ -1507,21 +1513,21 @@ SQL_TEMPLATES = { content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP """, # SQL for VectorStorage - "entities": """SELECT entity_name FROM - (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_VDB_ENTITY where workspace=$1) - WHERE distance>$2 ORDER BY distance DESC LIMIT $3 - """, - "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM - (SELECT id, source_id,target_id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_VDB_RELATION where workspace=$1) - WHERE distance>$2 ORDER BY distance DESC LIMIT $3 - """, - "chunks": """SELECT id FROM - (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_DOC_CHUNKS where workspace=$1) - WHERE distance>$2 ORDER BY distance DESC LIMIT $3 - """, + # "entities": """SELECT entity_name FROM + # (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance + # FROM LIGHTRAG_VDB_ENTITY where workspace=$1) + # WHERE distance>$2 ORDER BY distance DESC LIMIT $3 + # """, + # "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM + # (SELECT id, source_id,target_id, 1 - (content_vector <=> 
'[{embedding_string}]'::vector) as distance + # FROM LIGHTRAG_VDB_RELATION where workspace=$1) + # WHERE distance>$2 ORDER BY distance DESC LIMIT $3 + # """, + # "chunks": """SELECT id FROM + # (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance + # FROM LIGHTRAG_DOC_CHUNKS where workspace=$1) + # WHERE distance>$2 ORDER BY distance DESC LIMIT $3 + # """, # DROP tables "drop_all": """ DROP TABLE IF EXISTS LIGHTRAG_DOC_FULL CASCADE; @@ -1545,4 +1551,56 @@ SQL_TEMPLATES = { "drop_vdb_relation": """ DROP TABLE IF EXISTS LIGHTRAG_VDB_RELATION CASCADE; """, -} + "relationships": """ + WITH relevant_chunks AS ( + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS + WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) + ) + SELECT source_id as src_id, target_id as tgt_id + FROM ( + SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance + FROM LIGHTRAG_VDB_RELATION r + WHERE r.workspace=$1 + AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks) + ) filtered + WHERE distance>$2 + ORDER BY distance DESC + LIMIT $3 + """, + "entities": + ''' + WITH relevant_chunks AS ( + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS + WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) + ) + SELECT entity_name FROM + ( + SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance + FROM LIGHTRAG_VDB_ENTITY + where workspace=$1 + AND chunk_id IN (SELECT chunk_id FROM relevant_chunks) + ) + WHERE distance>$2 + ORDER BY distance DESC + LIMIT $3 + ''', + 'chunks': """ + WITH relevant_chunks AS ( + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS + WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) + ) + SELECT id FROM + ( + SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance + FROM LIGHTRAG_DOC_CHUNKS + where workspace=$1 + AND chunk_id IN (SELECT chunk_id FROM relevant_chunks) + ) + WHERE distance>$2 + ORDER BY distance DESC + LIMIT $3 + """ +} \ No newline at end of file diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index ae6fd9dc..0554ab76 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1243,7 +1243,6 @@ class LightRAG: embedding_func=self.embedding_func, ), system_prompt=system_prompt, - ids = param.ids ) elif param.mode == "naive": response = await naive_query( diff --git a/lightrag/operate.py b/lightrag/operate.py index 6c0e1e4c..7910917a 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -602,7 +602,6 @@ async def kg_query( global_config: dict[str, str], hashing_kv: BaseKVStorage | None = None, system_prompt: str | None = None, - ids: list[str] | None = None, ) -> str | AsyncIterator[str]: # Handle cache use_model_func = global_config["llm_model_func"] @@ -650,7 +649,6 @@ async def kg_query( relationships_vdb, text_chunks_db, query_param, - ids ) if query_param.only_need_context: @@ -1035,7 +1033,6 @@ async def _build_query_context( relationships_vdb, text_chunks_db, query_param, - ids = ids ) else: # hybrid mode ll_data, hl_data = await asyncio.gather( @@ -1104,7 +1101,9 @@ async def _get_node_data( logger.info( f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}" ) - results = await entities_vdb.query(query, top_k=query_param.top_k) + + results = await entities_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids) + if not len(results): return "", "", "" # get entity information @@ -1352,16 +1351,12 @@ async def _get_edge_data( 
relationships_vdb: BaseVectorStorage,
     text_chunks_db: BaseKVStorage,
     query_param: QueryParam,
-    ids: list[str] | None = None,
 ):
     logger.info(
         f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
     )
-    if ids:
-        #TODO: add ids to the query
-        results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = ids)
-    else:
-        results = await relationships_vdb.query(keywords, top_k=query_param.top_k)
+
+    results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = query_param.ids)
 
     if not len(results):
         return "", "", ""
@@ -1610,7 +1605,7 @@ async def naive_query(
     if cached_response is not None:
         return cached_response
 
-    results = await chunks_vdb.query(query, top_k=query_param.top_k)
+    results = await chunks_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids)
 
     if not len(results):
         return PROMPTS["fail_response"]

From e31c0c8f6c7e4be70245b59556cadef76fad0706 Mon Sep 17 00:00:00 2001
From: Roy
Date: Sat, 8 Mar 2025 20:25:20 +0000
Subject: [PATCH 04/18] Update vector query methods to support ID filtering in PostgreSQL

- Modified `mix_kg_vector_query` in operate.py to pass optional IDs to vector search
- Updated PostgreSQL SQL template to filter results using document IDs instead of chunk_id
- Improved query flexibility by allowing precise document selection during vector search
---
 lightrag/kg/postgres_impl.py | 2 +-
 lightrag/operate.py          | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py
index c1ca7aa9..b0284e91 100644
--- a/lightrag/kg/postgres_impl.py
+++ b/lightrag/kg/postgres_impl.py
@@ -1597,7 +1597,7 @@ SQL_TEMPLATES = {
             SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
             FROM LIGHTRAG_DOC_CHUNKS
             where workspace=$1
-            AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
+            AND id IN (SELECT chunk_id FROM relevant_chunks)
         )
         WHERE distance>$2
         ORDER BY distance DESC
diff --git a/lightrag/operate.py b/lightrag/operate.py
index 7910917a..4d440ca5 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -892,7 +892,8 @@ async def mix_kg_vector_query(
     try:
         # Reduce top_k for vector search in hybrid mode since we have structured information from KG
         mix_topk = min(10, query_param.top_k)
-        results = await chunks_vdb.query(augmented_query, top_k=mix_topk)
+        # TODO: add ids to the query
+        results = await chunks_vdb.query(augmented_query, top_k=mix_topk, ids = query_param.ids)
         if not results:
             return None

From 16536c870c22434a7357beaa356b46d37c681f6f Mon Sep 17 00:00:00 2001
From: Roy
Date: Sat, 8 Mar 2025 21:06:19 +0000
Subject: [PATCH 05/18] Revert unnecessary file updates

---
 .gitignore                         | 21 ---------------------
 tests/test_lightrag_ollama_chat.py | 14 +++++++-------
 2 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4f28427f..a4afe4ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,24 +64,3 @@ gui/
 
 # unit-test files
 test_*
-Miniconda3-latest-Linux-x86_64.sh
-requirements_basic.txt
-requirements.txt
-examples/test_chromadb.py
-examples/test_faiss.py
-examples/test_neo4j.py
-.gitignore
-requirements.txt
-examples/test_chromadb.py
-examples/test_faiss.py
-examples/*
-tests/test_lightrag_ollama_chat.py
-requirements.txt
-requirements.txt
-examples/test_chromadb.py
-examples/test_faiss.py
-examples/test_neo4j.py
-tests/test_lightrag_ollama_chat.py
-examples/test_chromadb.py
-examples/test_faiss.py
-examples/test_neo4j.py
diff --git
a/tests/test_lightrag_ollama_chat.py b/tests/test_lightrag_ollama_chat.py index f19e2974..80038928 100644 --- a/tests/test_lightrag_ollama_chat.py +++ b/tests/test_lightrag_ollama_chat.py @@ -38,16 +38,16 @@ class McpError(Exception): DEFAULT_CONFIG = { "server": { - "host": "host.docker.internal", - "port": 11434, - "model": "llama3.2:latest", + "host": "localhost", + "port": 9621, + "model": "lightrag:latest", "timeout": 300, "max_retries": 1, "retry_delay": 1, }, "test_cases": { - "basic": {"query": "How many disciples did Tang Seng have?"}, - "generate": {"query": "Who directed the TV series Journey to the West?"}, + "basic": {"query": "唐僧有几个徒弟"}, + "generate": {"query": "电视剧西游记导演是谁"}, }, } @@ -763,8 +763,8 @@ def parse_args() -> argparse.Namespace: Configuration file (config.json): { "server": { - "host": "host.docker.internal", # Server address - "port": 11434, # Server port + "host": "localhost", # Server address + "port": 9621, # Server port "model": "lightrag:latest" # Default model name }, "test_cases": { From d6a426d3e656784c3bfa4c18ed7cce593f8e135f Mon Sep 17 00:00:00 2001 From: Roy Date: Sat, 8 Mar 2025 21:09:45 +0000 Subject: [PATCH 06/18] Included all requirements in the package --- requirements.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/requirements.txt b/requirements.txt index 088d8843..3f6f3668 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,3 +48,18 @@ tiktoken==0.9.0 torch==2.6.0 transformers==4.49.0 uvicorn==0.34.0 +aiohttp +configparser +future + +# Basic modules +gensim +pipmaster +pydantic +python-dotenv + +setuptools +tenacity + +# LLM packages +tiktoken From a1708a2638ab0ed02bc303d994823b373db9bdbe Mon Sep 17 00:00:00 2001 From: Roy Date: Sat, 8 Mar 2025 21:13:29 +0000 Subject: [PATCH 07/18] Add optional ids filter to QueryParam for RAG document filtering --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 00da54fb..6eb25e65 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,8 @@ class QueryParam: """Maximum number of tokens allocated for relationship descriptions in global retrieval.""" max_token_for_local_context: int = 4000 """Maximum number of tokens allocated for entity descriptions in local retrieval.""" + ids: list[str] | None = None + """List of ids to filter the RAG.""" ... ``` From 4ce28b31bd34d01065097a2caf4665fe8ce12b59 Mon Sep 17 00:00:00 2001 From: Roy Date: Sat, 8 Mar 2025 21:31:41 +0000 Subject: [PATCH 08/18] minor readme fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6eb25e65..018a94e6 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ class QueryParam: """Maximum number of tokens allocated for relationship descriptions in global retrieval.""" max_token_for_local_context: int = 4000 """Maximum number of tokens allocated for entity descriptions in local retrieval.""" - ids: list[str] | None = None + ids: list[str] | None = None # ONLY SUPPORTED FOR PG VECTOR DBs """List of ids to filter the RAG.""" ... 
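# A minimal usage sketch (hypothetical document IDs; per the note above,
# honoured only by the PostgreSQL vector backend):
#
#     rag.query("What did the audit conclude?",
#               param=QueryParam(mode="hybrid", ids=["doc-abc123"]))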
``` From 7807379bee5a8d060826c732cb9f5c75d6c4c7aa Mon Sep 17 00:00:00 2001 From: Roy Date: Mon, 10 Mar 2025 09:18:22 +0000 Subject: [PATCH 09/18] Remove unused ids parameter from _build_query_context function --- lightrag/operate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index f6fa4bfe..3c5ed329 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1017,7 +1017,6 @@ async def _build_query_context( relationships_vdb: BaseVectorStorage, text_chunks_db: BaseKVStorage, query_param: QueryParam, - ids: list[str] = None, ): if query_param.mode == "local": entities_context, relations_context, text_units_context = await _get_node_data( From 8317ec9757d32a2572b1afd3afe3b2bfb8d35e1f Mon Sep 17 00:00:00 2001 From: zrguo Date: Mon, 10 Mar 2025 23:00:06 +0800 Subject: [PATCH 10/18] Update __init__.py --- lightrag/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index e4cb3e63..382060f7 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam -__version__ = "1.2.4" +__version__ = "1.2.5" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" From 92ae895713a2aebed3dabf0ef656a44872d61de4 Mon Sep 17 00:00:00 2001 From: Roy Date: Mon, 10 Mar 2025 15:39:18 +0000 Subject: [PATCH 11/18] Refactor requirements and code formatting - Simplified requirements.txt by removing specific version constraints - Added comment about extra library installation using pipmaster - Improved code formatting in base.py, operate.py, and postgres_impl.py - Cleaned up SQL templates and query method signatures with consistent formatting --- lightrag/base.py | 5 ++- lightrag/kg/postgres_impl.py | 60 ++++++++++++++++++------------------ lightrag/operate.py | 16 +++++++--- requirements.txt | 52 ++----------------------------- 4 files changed, 48 insertions(+), 85 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index 61787efc..c84c7c62 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -108,8 +108,11 @@ class BaseVectorStorage(StorageNameSpace, ABC): embedding_func: EmbeddingFunc cosine_better_than_threshold: float = field(default=0.2) meta_fields: set[str] = field(default_factory=set) + @abstractmethod - async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: + async def query( + self, query: str, top_k: int, ids: list[str] | None = None + ) -> list[dict[str, Any]]: """Query the vector storage and retrieve top_k results.""" @abstractmethod diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index ad794e3b..1d525bdb 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -439,7 +439,7 @@ class PGVectorStorage(BaseVectorStorage): "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), "chunk_id": item["source_id"], - #TODO: add document_id + # TODO: add document_id } return upsert_sql, data @@ -452,8 +452,8 @@ class PGVectorStorage(BaseVectorStorage): "target_id": item["tgt_id"], "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), - "chunk_id": item["source_id"] - #TODO: add document_id + "chunk_id": item["source_id"], + # TODO: add document_id } return upsert_sql, data @@ -496,7 +496,9 @@ class PGVectorStorage(BaseVectorStorage): await self.db.execute(upsert_sql, data) #################### query method ############### - async def 
query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: + async def query( + self, query: str, top_k: int, ids: list[str] | None = None + ) -> list[dict[str, Any]]: embeddings = await self.embedding_func([query]) embedding = embeddings[0] embedding_string = ",".join(map(str, embedding)) @@ -505,10 +507,9 @@ class PGVectorStorage(BaseVectorStorage): formatted_ids = ",".join(f"'{id}'" for id in ids) else: formatted_ids = "NULL" - + sql = SQL_TEMPLATES[self.base_namespace].format( - embedding_string=embedding_string, - doc_ids=formatted_ids + embedding_string=embedding_string, doc_ids=formatted_ids ) params = { "workspace": self.db.workspace, @@ -1598,7 +1599,7 @@ SQL_TEMPLATES = { content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP """, - "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, + "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, content_vector, chunk_id) VALUES ($1, $2, $3, $4, $5, $6) ON CONFLICT (workspace,id) DO UPDATE @@ -1657,54 +1658,53 @@ SQL_TEMPLATES = { """, "relationships": """ WITH relevant_chunks AS ( - SELECT id as chunk_id - FROM LIGHTRAG_DOC_CHUNKS + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) ) - SELECT source_id as src_id, target_id as tgt_id + SELECT source_id as src_id, target_id as tgt_id FROM ( SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance FROM LIGHTRAG_VDB_RELATION r - WHERE r.workspace=$1 + WHERE r.workspace=$1 AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks) ) filtered - WHERE distance>$2 - ORDER BY distance DESC + WHERE distance>$2 + ORDER BY distance DESC LIMIT $3 """, - "entities": - ''' + "entities": """ WITH relevant_chunks AS ( - SELECT id as chunk_id - FROM LIGHTRAG_DOC_CHUNKS + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) ) SELECT entity_name FROM ( SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_VDB_ENTITY + FROM LIGHTRAG_VDB_ENTITY where workspace=$1 AND chunk_id IN (SELECT chunk_id FROM relevant_chunks) ) - WHERE distance>$2 - ORDER BY distance DESC + WHERE distance>$2 + ORDER BY distance DESC LIMIT $3 - ''', - 'chunks': """ + """, + "chunks": """ WITH relevant_chunks AS ( - SELECT id as chunk_id - FROM LIGHTRAG_DOC_CHUNKS + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) ) SELECT id FROM ( SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_DOC_CHUNKS + FROM LIGHTRAG_DOC_CHUNKS where workspace=$1 AND id IN (SELECT chunk_id FROM relevant_chunks) ) - WHERE distance>$2 - ORDER BY distance DESC + WHERE distance>$2 + ORDER BY distance DESC LIMIT $3 - """ -} \ No newline at end of file + """, +} diff --git a/lightrag/operate.py b/lightrag/operate.py index 3c5ed329..5e90a77b 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -893,7 +893,9 @@ async def mix_kg_vector_query( # Reduce top_k for vector search in hybrid mode since we have structured information from KG mix_topk = min(10, query_param.top_k) # TODO: add ids to the query - results = await chunks_vdb.query(augmented_query, top_k=mix_topk, ids = query_param.ids) + results = await chunks_vdb.query( + augmented_query, top_k=mix_topk, ids=query_param.ids + ) if not results: return None @@ 
-1102,7 +1104,9 @@ async def _get_node_data( f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}" ) - results = await entities_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids) + results = await entities_vdb.query( + query, top_k=query_param.top_k, ids=query_param.ids + ) if not len(results): return "", "", "" @@ -1357,7 +1361,9 @@ async def _get_edge_data( f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}" ) - results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = query_param.ids) + results = await relationships_vdb.query( + keywords, top_k=query_param.top_k, ids=query_param.ids + ) if not len(results): return "", "", "" @@ -1606,7 +1612,9 @@ async def naive_query( if cached_response is not None: return cached_response - results = await chunks_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids) + results = await chunks_vdb.query( + query, top_k=query_param.top_k, ids=query_param.ids + ) if not len(results): return PROMPTS["fail_response"] diff --git a/requirements.txt b/requirements.txt index 3f6f3668..d9a5c68e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,53 +1,3 @@ -aioboto3==14.1.0 -aiofiles==24.1.0 -aiohttp==3.11.13 -ascii_colors==0.5.2 -asyncpg==0.30.0 -chromadb==0.6.3 -community==1.0.0b1 -docx==0.2.4 -# faiss -fastapi==0.115.11 -glm==0.4.4 -graspologic==3.4.1 -gunicorn==23.0.0 -httpx==0.28.1 -imgui_bundle==1.6.2 -jsonlines==4.0.0 -llama_index==0.12.22 -moderngl==5.12.0 -motor==3.7.0 -nano_vectordb==0.0.4.3 -neo4j==5.28.1 -nest_asyncio==1.6.0 -networkx==3.4.2 -numpy -openpyxl==3.1.5 -oracledb==3.0.0 -Pillow==11.1.0 -pipmaster==0.4.0 -protobuf -psutil==7.0.0 -psycopg==3.2.5 -psycopg_pool==3.2.6 -pydantic==2.10.6 -pymilvus==2.5.4 -pymongo==4.11.2 -PyPDF2==3.0.1 -python-dotenv==1.0.1 -pyvis==0.3.2 -qdrant_client==1.13.3 -redis==5.2.1 -Requests==2.32.3 -sentence_transformers==3.4.1 -setuptools==75.8.0 -SQLAlchemy==2.0.38 -starlette==0.46.0 -tenacity==9.0.0 -tiktoken==0.9.0 -torch==2.6.0 -transformers==4.49.0 -uvicorn==0.34.0 aiohttp configparser future @@ -63,3 +13,5 @@ tenacity # LLM packages tiktoken + +# Extra libraries are installed when needed using pipmaster From 37754f14b51d5ea274ae184db772cc0e50ed3d72 Mon Sep 17 00:00:00 2001 From: Zhenya Zhu Date: Tue, 11 Mar 2025 11:54:30 +0800 Subject: [PATCH 12/18] force keywords_extraction output as JSON --- lightrag/prompt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 1486ccf8..f81cd441 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -236,7 +236,7 @@ Given the query and conversation history, list both high-level and low-level key ---Instructions--- - Consider both the current query and relevant conversation history when extracting keywords -- Output the keywords in JSON format +- Output the keywords in JSON format, it will be parsed by a JSON parser, do not add any extra content in output - The JSON should have two keys: - "high_level_keywords" for overarching concepts or themes - "low_level_keywords" for specific entities or details From d77401961dc84f802a49b3c060df6f7b136cff04 Mon Sep 17 00:00:00 2001 From: Zhichun Wu Date: Tue, 11 Mar 2025 11:57:41 +0800 Subject: [PATCH 13/18] Resolve the issue with making API calls to Azure OpenAI service --- lightrag/llm/azure_openai.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lightrag/llm/azure_openai.py 
b/lightrag/llm/azure_openai.py index 84e45cfb..3405d29e 100644 --- a/lightrag/llm/azure_openai.py +++ b/lightrag/llm/azure_openai.py @@ -55,6 +55,7 @@ async def azure_openai_complete_if_cache( openai_async_client = AsyncAzureOpenAI( azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), + azure_deployment=model, api_key=os.getenv("AZURE_OPENAI_API_KEY"), api_version=os.getenv("AZURE_OPENAI_API_VERSION"), ) @@ -136,6 +137,7 @@ async def azure_openai_embed( openai_async_client = AsyncAzureOpenAI( azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), + azure_deployment=model, api_key=os.getenv("AZURE_OPENAI_API_KEY"), api_version=os.getenv("AZURE_OPENAI_API_VERSION"), ) From 061350b2bf13557b9bb40c592e775d31235f1d73 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 11 Mar 2025 12:08:10 +0800 Subject: [PATCH 14/18] Improve Entity Extraction Robustness for Truncated LLM Responses --- lightrag/operate.py | 128 ++++++++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 41 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index e352ff79..f808f3c2 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -141,18 +141,36 @@ async def _handle_single_entity_extraction( ): if len(record_attributes) < 4 or record_attributes[0] != '"entity"': return None - # add this record as a node in the G + + # Clean and validate entity name entity_name = clean_str(record_attributes[1]).strip('"') if not entity_name.strip(): + logger.warning( + f"Entity extraction error: empty entity name in: {record_attributes}" + ) return None + + # Clean and validate entity type entity_type = clean_str(record_attributes[2]).strip('"') + if not entity_type.strip() or entity_type.startswith('("'): + logger.warning( + f"Entity extraction error: invalid entity type in: {record_attributes}" + ) + return None + + # Clean and validate description entity_description = clean_str(record_attributes[3]).strip('"') - entity_source_id = chunk_key + if not entity_description.strip(): + logger.warning( + f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'" + ) + return None + return dict( entity_name=entity_name, entity_type=entity_type, description=entity_description, - source_id=entity_source_id, + source_id=chunk_key, metadata={"created_at": time.time()}, ) @@ -438,47 +456,22 @@ async def extract_entities( else: return await use_llm_func(input_text) - async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): - """ "Prpocess a single chunk + async def _process_extraction_result(result: str, chunk_key: str): + """Process a single extraction result (either initial or gleaning) Args: - chunk_key_dp (tuple[str, TextChunkSchema]): - ("chunck-xxxxxx", {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int}) + result (str): The extraction result to process + chunk_key (str): The chunk key for source tracking + Returns: + tuple: (nodes_dict, edges_dict) containing the extracted entities and relationships """ - nonlocal processed_chunks - chunk_key = chunk_key_dp[0] - chunk_dp = chunk_key_dp[1] - content = chunk_dp["content"] - # hint_prompt = entity_extract_prompt.format(**context_base, input_text=content) - hint_prompt = entity_extract_prompt.format( - **context_base, input_text="{input_text}" - ).format(**context_base, input_text=content) - - final_result = await _user_llm_func_with_cache(hint_prompt) - history = pack_user_ass_to_openai_messages(hint_prompt, final_result) - for now_glean_index in range(entity_extract_max_gleaning): - 
glean_result = await _user_llm_func_with_cache( - continue_prompt, history_messages=history - ) - - history += pack_user_ass_to_openai_messages(continue_prompt, glean_result) - final_result += glean_result - if now_glean_index == entity_extract_max_gleaning - 1: - break - - if_loop_result: str = await _user_llm_func_with_cache( - if_loop_prompt, history_messages=history - ) - if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() - if if_loop_result != "yes": - break - - records = split_string_by_multi_markers( - final_result, - [context_base["record_delimiter"], context_base["completion_delimiter"]], - ) - maybe_nodes = defaultdict(list) maybe_edges = defaultdict(list) + + records = split_string_by_multi_markers( + result, + [context_base["record_delimiter"], context_base["completion_delimiter"]], + ) + for record in records: record = re.search(r"\((.*)\)", record) if record is None: @@ -487,13 +480,14 @@ async def extract_entities( record_attributes = split_string_by_multi_markers( record, [context_base["tuple_delimiter"]] ) + if_entities = await _handle_single_entity_extraction( record_attributes, chunk_key ) if if_entities is not None: maybe_nodes[if_entities["entity_name"]].append(if_entities) continue - + if_relation = await _handle_single_relationship_extraction( record_attributes, chunk_key ) @@ -501,6 +495,58 @@ async def extract_entities( maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append( if_relation ) + + return maybe_nodes, maybe_edges + + async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): + """Process a single chunk + Args: + chunk_key_dp (tuple[str, TextChunkSchema]): + ("chunk-xxxxxx", {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int}) + """ + nonlocal processed_chunks + chunk_key = chunk_key_dp[0] + chunk_dp = chunk_key_dp[1] + content = chunk_dp["content"] + + # Get initial extraction + hint_prompt = entity_extract_prompt.format( + **context_base, input_text="{input_text}" + ).format(**context_base, input_text=content) + + final_result = await _user_llm_func_with_cache(hint_prompt) + history = pack_user_ass_to_openai_messages(hint_prompt, final_result) + + # Process initial extraction + maybe_nodes, maybe_edges = await _process_extraction_result(final_result, chunk_key) + + # Process additional gleaning results + for now_glean_index in range(entity_extract_max_gleaning): + glean_result = await _user_llm_func_with_cache( + continue_prompt, history_messages=history + ) + + history += pack_user_ass_to_openai_messages(continue_prompt, glean_result) + + # Process gleaning result separately + glean_nodes, glean_edges = await _process_extraction_result(glean_result, chunk_key) + + # Merge results + for entity_name, entities in glean_nodes.items(): + maybe_nodes[entity_name].extend(entities) + for edge_key, edges in glean_edges.items(): + maybe_edges[edge_key].extend(edges) + + if now_glean_index == entity_extract_max_gleaning - 1: + break + + if_loop_result: str = await _user_llm_func_with_cache( + if_loop_prompt, history_messages=history + ) + if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() + if if_loop_result != "yes": + break + processed_chunks += 1 entities_count = len(maybe_nodes) relations_count = len(maybe_edges) From 9d1dc2c9c3786aea25fba48cf946348e2a374b6b Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 11 Mar 2025 12:23:51 +0800 Subject: [PATCH 15/18] Fix linting --- lightrag/operate.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 
insertions(+), 17 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index f808f3c2..09e51fcf 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -466,12 +466,12 @@ async def extract_entities( """ maybe_nodes = defaultdict(list) maybe_edges = defaultdict(list) - + records = split_string_by_multi_markers( result, [context_base["record_delimiter"], context_base["completion_delimiter"]], ) - + for record in records: record = re.search(r"\((.*)\)", record) if record is None: @@ -480,14 +480,14 @@ async def extract_entities( record_attributes = split_string_by_multi_markers( record, [context_base["tuple_delimiter"]] ) - + if_entities = await _handle_single_entity_extraction( record_attributes, chunk_key ) if if_entities is not None: maybe_nodes[if_entities["entity_name"]].append(if_entities) continue - + if_relation = await _handle_single_relationship_extraction( record_attributes, chunk_key ) @@ -495,7 +495,7 @@ async def extract_entities( maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append( if_relation ) - + return maybe_nodes, maybe_edges async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): @@ -508,45 +508,49 @@ async def extract_entities( chunk_key = chunk_key_dp[0] chunk_dp = chunk_key_dp[1] content = chunk_dp["content"] - + # Get initial extraction hint_prompt = entity_extract_prompt.format( **context_base, input_text="{input_text}" ).format(**context_base, input_text=content) - + final_result = await _user_llm_func_with_cache(hint_prompt) history = pack_user_ass_to_openai_messages(hint_prompt, final_result) - + # Process initial extraction - maybe_nodes, maybe_edges = await _process_extraction_result(final_result, chunk_key) - + maybe_nodes, maybe_edges = await _process_extraction_result( + final_result, chunk_key + ) + # Process additional gleaning results for now_glean_index in range(entity_extract_max_gleaning): glean_result = await _user_llm_func_with_cache( continue_prompt, history_messages=history ) - + history += pack_user_ass_to_openai_messages(continue_prompt, glean_result) - + # Process gleaning result separately - glean_nodes, glean_edges = await _process_extraction_result(glean_result, chunk_key) - + glean_nodes, glean_edges = await _process_extraction_result( + glean_result, chunk_key + ) + # Merge results for entity_name, entities in glean_nodes.items(): maybe_nodes[entity_name].extend(entities) for edge_key, edges in glean_edges.items(): maybe_edges[edge_key].extend(edges) - + if now_glean_index == entity_extract_max_gleaning - 1: break - + if_loop_result: str = await _user_llm_func_with_cache( if_loop_prompt, history_messages=history ) if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() if if_loop_result != "yes": break - + processed_chunks += 1 entities_count = len(maybe_nodes) relations_count = len(maybe_edges) From 62b304600bbc90160a64de305d713560ec3b007b Mon Sep 17 00:00:00 2001 From: zrguo Date: Tue, 11 Mar 2025 15:43:04 +0800 Subject: [PATCH 16/18] clean lightrag.py --- lightrag/lightrag.py | 126 ++++++++++++++----------------------------- lightrag/operate.py | 86 +++++++++++++++++++++++++++++ lightrag/utils.py | 46 ++++++++++++++++ 3 files changed, 172 insertions(+), 86 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 3a7d340a..a5cb3b22 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -30,11 +30,10 @@ from .namespace import NameSpace, make_namespace from .operate import ( chunking_by_token_size, extract_entities, - extract_keywords_only, 
kg_query, - kg_query_with_keywords, mix_kg_vector_query, naive_query, + query_with_keywords, ) from .prompt import GRAPH_FIELD_SEP, PROMPTS from .utils import ( @@ -45,6 +44,9 @@ from .utils import ( encode_string_by_tiktoken, lazy_external_import, limit_async_func_call, + get_content_summary, + clean_text, + check_storage_env_vars, logger, ) from .types import KnowledgeGraph @@ -309,7 +311,7 @@ class LightRAG: # Verify storage implementation compatibility verify_storage_implementation(storage_type, storage_name) # Check environment variables - # self.check_storage_env_vars(storage_name) + check_storage_env_vars(storage_name) # Ensure vector_db_storage_cls_kwargs has required fields self.vector_db_storage_cls_kwargs = { @@ -536,11 +538,6 @@ class LightRAG: storage_class = lazy_external_import(import_path, storage_name) return storage_class - @staticmethod - def clean_text(text: str) -> str: - """Clean text by removing null bytes (0x00) and whitespace""" - return text.strip().replace("\x00", "") - def insert( self, input: str | list[str], @@ -602,8 +599,8 @@ class LightRAG: update_storage = False try: # Clean input texts - full_text = self.clean_text(full_text) - text_chunks = [self.clean_text(chunk) for chunk in text_chunks] + full_text = clean_text(full_text) + text_chunks = [clean_text(chunk) for chunk in text_chunks] # Process cleaned texts if doc_id is None: @@ -682,7 +679,7 @@ class LightRAG: contents = {id_: doc for id_, doc in zip(ids, input)} else: # Clean input text and remove duplicates - input = list(set(self.clean_text(doc) for doc in input)) + input = list(set(clean_text(doc) for doc in input)) # Generate contents dict of MD5 hash IDs and documents contents = {compute_mdhash_id(doc, prefix="doc-"): doc for doc in input} @@ -698,7 +695,7 @@ class LightRAG: new_docs: dict[str, Any] = { id_: { "content": content, - "content_summary": self._get_content_summary(content), + "content_summary": get_content_summary(content), "content_length": len(content), "status": DocStatus.PENDING, "created_at": datetime.now().isoformat(), @@ -1063,7 +1060,7 @@ class LightRAG: all_chunks_data: dict[str, dict[str, str]] = {} chunk_to_source_map: dict[str, str] = {} for chunk_data in custom_kg.get("chunks", []): - chunk_content = self.clean_text(chunk_data["content"]) + chunk_content = clean_text(chunk_data["content"]) source_id = chunk_data["source_id"] tokens = len( encode_string_by_tiktoken( @@ -1296,8 +1293,17 @@ class LightRAG: self, query: str, prompt: str, param: QueryParam = QueryParam() ): """ - 1. Extract keywords from the 'query' using new function in operate.py. - 2. Then run the standard aquery() flow with the final prompt (formatted_question). + Query with separate keyword extraction step. + + This method extracts keywords from the query first, then uses them for the query. + + Args: + query: User query + prompt: Additional prompt for the query + param: Query parameters + + Returns: + Query response """ loop = always_get_an_event_loop() return loop.run_until_complete( @@ -1308,66 +1314,29 @@ class LightRAG: self, query: str, prompt: str, param: QueryParam = QueryParam() ) -> str | AsyncIterator[str]: """ - 1. Calls extract_keywords_only to get HL/LL keywords from 'query'. - 2. Then calls kg_query(...) or naive_query(...), etc. as the main query, while also injecting the newly extracted keywords if needed. + Async version of query_with_separate_keyword_extraction. 
+ + Args: + query: User query + prompt: Additional prompt for the query + param: Query parameters + + Returns: + Query response or async iterator """ - # --------------------- - # STEP 1: Keyword Extraction - # --------------------- - hl_keywords, ll_keywords = await extract_keywords_only( - text=query, + response = await query_with_keywords( + query=query, + prompt=prompt, param=param, + knowledge_graph_inst=self.chunk_entity_relation_graph, + entities_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + chunks_vdb=self.chunks_vdb, + text_chunks_db=self.text_chunks, global_config=asdict(self), - hashing_kv=self.llm_response_cache, # Directly use llm_response_cache + hashing_kv=self.llm_response_cache, ) - - param.hl_keywords = hl_keywords - param.ll_keywords = ll_keywords - - # --------------------- - # STEP 2: Final Query Logic - # --------------------- - - # Create a new string with the prompt and the keywords - ll_keywords_str = ", ".join(ll_keywords) - hl_keywords_str = ", ".join(hl_keywords) - formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}" - - if param.mode in ["local", "global", "hybrid"]: - response = await kg_query_with_keywords( - formatted_question, - self.chunk_entity_relation_graph, - self.entities_vdb, - self.relationships_vdb, - self.text_chunks, - param, - asdict(self), - hashing_kv=self.llm_response_cache, # Directly use llm_response_cache - ) - elif param.mode == "naive": - response = await naive_query( - formatted_question, - self.chunks_vdb, - self.text_chunks, - param, - asdict(self), - hashing_kv=self.llm_response_cache, # Directly use llm_response_cache - ) - elif param.mode == "mix": - response = await mix_kg_vector_query( - formatted_question, - self.chunk_entity_relation_graph, - self.entities_vdb, - self.relationships_vdb, - self.chunks_vdb, - self.text_chunks, - param, - asdict(self), - hashing_kv=self.llm_response_cache, # Directly use llm_response_cache - ) - else: - raise ValueError(f"Unknown mode {param.mode}") - + await self._query_done() return response @@ -1465,21 +1434,6 @@ class LightRAG: ] ) - def _get_content_summary(self, content: str, max_length: int = 100) -> str: - """Get summary of document content - - Args: - content: Original document content - max_length: Maximum length of summary - - Returns: - Truncated content with ellipsis if needed - """ - content = content.strip() - if len(content) <= max_length: - return content - return content[:max_length] + "..." - async def get_processing_status(self) -> dict[str, int]: """Get current document processing status counts diff --git a/lightrag/operate.py b/lightrag/operate.py index 5baec1eb..95a5c72e 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1916,3 +1916,89 @@ async def kg_query_with_keywords( ) return response + +async def query_with_keywords( + query: str, + prompt: str, + param: QueryParam, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + chunks_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage, + global_config: dict[str, str], + hashing_kv: BaseKVStorage | None = None, +) -> str | AsyncIterator[str]: + """ + Extract keywords from the query and then use them for retrieving information. + + 1. Extracts high-level and low-level keywords from the query + 2. Formats the query with the extracted keywords and prompt + 3. 
Uses the appropriate query method based on param.mode + + Args: + query: The user's query + prompt: Additional prompt to prepend to the query + param: Query parameters + knowledge_graph_inst: Knowledge graph storage + entities_vdb: Entities vector database + relationships_vdb: Relationships vector database + chunks_vdb: Document chunks vector database + text_chunks_db: Text chunks storage + global_config: Global configuration + hashing_kv: Cache storage + + Returns: + Query response or async iterator + """ + # Extract keywords + hl_keywords, ll_keywords = await extract_keywords_only( + text=query, + param=param, + global_config=global_config, + hashing_kv=hashing_kv, + ) + + param.hl_keywords = hl_keywords + param.ll_keywords = ll_keywords + + # Create a new string with the prompt and the keywords + ll_keywords_str = ", ".join(ll_keywords) + hl_keywords_str = ", ".join(hl_keywords) + formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}" + + # Use appropriate query method based on mode + if param.mode in ["local", "global", "hybrid"]: + return await kg_query_with_keywords( + formatted_question, + knowledge_graph_inst, + entities_vdb, + relationships_vdb, + text_chunks_db, + param, + global_config, + hashing_kv=hashing_kv, + ) + elif param.mode == "naive": + return await naive_query( + formatted_question, + chunks_vdb, + text_chunks_db, + param, + global_config, + hashing_kv=hashing_kv, + ) + elif param.mode == "mix": + return await mix_kg_vector_query( + formatted_question, + knowledge_graph_inst, + entities_vdb, + relationships_vdb, + chunks_vdb, + text_chunks_db, + param, + global_config, + hashing_kv=hashing_kv, + ) + else: + raise ValueError(f"Unknown mode {param.mode}") diff --git a/lightrag/utils.py b/lightrag/utils.py index e8f79610..1143b326 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -890,3 +890,49 @@ def lazy_external_import(module_name: str, class_name: str) -> Callable[..., Any return cls(*args, **kwargs) return import_class + +def get_content_summary(content: str, max_length: int = 100) -> str: + """Get summary of document content + + Args: + content: Original document content + max_length: Maximum length of summary + + Returns: + Truncated content with ellipsis if needed + """ + content = content.strip() + if len(content) <= max_length: + return content + return content[:max_length] + "..." 
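# An illustrative check of get_content_summary above (doctest-style, with
# made-up inputs):
#
#     >>> get_content_summary("short text")
#     'short text'
#     >>> get_content_summary("x" * 150) == "x" * 100 + "..."
#     True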
+ +def clean_text(text: str) -> str: + """Clean text by removing null bytes (0x00) and whitespace + + Args: + text: Input text to clean + + Returns: + Cleaned text + """ + return text.strip().replace("\x00", "") + +def check_storage_env_vars(storage_name: str) -> None: + """Check if all required environment variables for storage implementation exist + + Args: + storage_name: Storage implementation name + + Raises: + ValueError: If required environment variables are missing + """ + from lightrag.kg import STORAGE_ENV_REQUIREMENTS + + required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, []) + missing_vars = [var for var in required_vars if var not in os.environ] + + if missing_vars: + raise ValueError( + f"Storage implementation '{storage_name}' requires the following " + f"environment variables: {', '.join(missing_vars)}" + ) \ No newline at end of file From 418aea3895f26e1680d0e0b4909d8e6c52b67240 Mon Sep 17 00:00:00 2001 From: zrguo Date: Tue, 11 Mar 2025 15:44:01 +0800 Subject: [PATCH 17/18] fix linting --- lightrag/lightrag.py | 12 ++++++------ lightrag/operate.py | 7 ++++--- lightrag/utils.py | 7 +++++-- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index a5cb3b22..48b464a8 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1294,14 +1294,14 @@ class LightRAG: ): """ Query with separate keyword extraction step. - + This method extracts keywords from the query first, then uses them for the query. - + Args: query: User query prompt: Additional prompt for the query param: Query parameters - + Returns: Query response """ @@ -1315,12 +1315,12 @@ class LightRAG: ) -> str | AsyncIterator[str]: """ Async version of query_with_separate_keyword_extraction. - + Args: query: User query prompt: Additional prompt for the query param: Query parameters - + Returns: Query response or async iterator """ @@ -1336,7 +1336,7 @@ class LightRAG: global_config=asdict(self), hashing_kv=self.llm_response_cache, ) - + await self._query_done() return response diff --git a/lightrag/operate.py b/lightrag/operate.py index 95a5c72e..1815f308 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1917,6 +1917,7 @@ async def kg_query_with_keywords( return response + async def query_with_keywords( query: str, prompt: str, @@ -1931,11 +1932,11 @@ async def query_with_keywords( ) -> str | AsyncIterator[str]: """ Extract keywords from the query and then use them for retrieving information. - + 1. Extracts high-level and low-level keywords from the query 2. Formats the query with the extracted keywords and prompt 3. Uses the appropriate query method based on param.mode - + Args: query: The user's query prompt: Additional prompt to prepend to the query @@ -1947,7 +1948,7 @@ async def query_with_keywords( text_chunks_db: Text chunks storage global_config: Global configuration hashing_kv: Cache storage - + Returns: Query response or async iterator """ diff --git a/lightrag/utils.py b/lightrag/utils.py index 1143b326..b8f00c5d 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -891,6 +891,7 @@ def lazy_external_import(module_name: str, class_name: str) -> Callable[..., Any return import_class + def get_content_summary(content: str, max_length: int = 100) -> str: """Get summary of document content @@ -906,6 +907,7 @@ def get_content_summary(content: str, max_length: int = 100) -> str: return content return content[:max_length] + "..." 
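# Expected behaviour of clean_text, defined just below (illustrative input):
#
#     >>> clean_text("  hello\x00world  ")
#     'helloworld'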
+ def clean_text(text: str) -> str: """Clean text by removing null bytes (0x00) and whitespace @@ -917,6 +919,7 @@ def clean_text(text: str) -> str: """ return text.strip().replace("\x00", "") + def check_storage_env_vars(storage_name: str) -> None: """Check if all required environment variables for storage implementation exist @@ -927,7 +930,7 @@ def check_storage_env_vars(storage_name: str) -> None: ValueError: If required environment variables are missing """ from lightrag.kg import STORAGE_ENV_REQUIREMENTS - + required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, []) missing_vars = [var for var in required_vars if var not in os.environ] @@ -935,4 +938,4 @@ def check_storage_env_vars(storage_name: str) -> None: raise ValueError( f"Storage implementation '{storage_name}' requires the following " f"environment variables: {', '.join(missing_vars)}" - ) \ No newline at end of file + ) From ea05b8e49ffd3fb495c63fdd76c3168dbbd97cf1 Mon Sep 17 00:00:00 2001 From: zrguo Date: Tue, 11 Mar 2025 16:19:44 +0800 Subject: [PATCH 18/18] Fix the merge bug with Neo4j --- lightrag/lightrag.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 3a7d340a..2116cf58 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -2622,6 +2622,12 @@ class LightRAG: # 9. Delete source entities for entity_name in source_entities: + if entity_name == target_entity: + logger.info( + f"Skipping deletion of '{entity_name}' as it's also the target entity" + ) + continue + # Delete entity node from knowledge graph await self.chunk_entity_relation_graph.delete_node(entity_name)
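Taken together, the series threads `QueryParam.ids` from the public API through `kg_query`, `naive_query`, and `mix_kg_vector_query` into the PostgreSQL vector storage, whose `relevant_chunks` CTEs drop any match not traceable to the selected documents. A minimal end-to-end sketch of the feature follows; the constructor arguments and document IDs are illustrative assumptions (a working setup also needs embedding and LLM functions, and the IDs must match the `doc-<md5>` values LightRAG assigns at insert time), not a prescribed configuration.

```python
import asyncio

from lightrag import LightRAG, QueryParam


async def main() -> None:
    # Illustrative setup: a real instance also needs llm_model_func,
    # embedding_func, and PG connection settings (see the project README).
    # Per patch 08, `ids` filtering is only supported for PG vector storage.
    rag = LightRAG(working_dir="./rag_storage", vector_storage="PGVectorStorage")

    # Hypothetical IDs; they must equal the full_doc_id values produced at
    # insert time ("doc-" plus an MD5 hash of the document content).
    scoped = QueryParam(mode="hybrid", ids=["doc-abc123", "doc-def456"])

    # Retrieval now only considers chunks, entities, and relations whose
    # chunk_id traces back to the two selected documents (patch 03's CTEs).
    print(await rag.aquery("Summarize the findings.", param=scoped))


asyncio.run(main())
```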