From 0ec61d640769a32b141afcf4ef58c5d5a7e9fe11 Mon Sep 17 00:00:00 2001 From: Roy Date: Fri, 7 Mar 2025 18:45:28 +0000 Subject: [PATCH 01/18] Update project dependencies and example test files - Updated requirements.txt with latest package versions - Added support for filtering query results by IDs in base and operate modules - Modified PostgreSQL vector storage to include document and chunk ID fields --- .gitignore | 21 +++++++++++ lightrag/base.py | 5 ++- lightrag/kg/postgres_impl.py | 6 +++- lightrag/lightrag.py | 1 + lightrag/operate.py | 11 +++++- requirements.txt | 67 +++++++++++++++++++++++++++--------- 6 files changed, 91 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index a4afe4ea..4f28427f 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,24 @@ gui/ # unit-test files test_* +Miniconda3-latest-Linux-x86_64.sh +requirements_basic.txt +requirements.txt +examples/test_chromadb.py +examples/test_faiss.py +examples/test_neo4j.py +.gitignore +requirements.txt +examples/test_chromadb.py +examples/test_faiss.py +examples/* +tests/test_lightrag_ollama_chat.py +requirements.txt +requirements.txt +examples/test_chromadb.py +examples/test_faiss.py +examples/test_neo4j.py +tests/test_lightrag_ollama_chat.py +examples/test_chromadb.py +examples/test_faiss.py +examples/test_neo4j.py diff --git a/lightrag/base.py b/lightrag/base.py index 5f6a1bf1..e7ab3127 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -81,6 +81,9 @@ class QueryParam: history_turns: int = 3 """Number of complete conversation turns (user-assistant pairs) to consider in the response context.""" + ids: list[str] | None = None + """List of ids to filter the results.""" + @dataclass class StorageNameSpace(ABC): @@ -107,7 +110,7 @@ class BaseVectorStorage(StorageNameSpace, ABC): meta_fields: set[str] = field(default_factory=set) @abstractmethod - async def query(self, query: str, top_k: int) -> list[dict[str, Any]]: + async def query(self, query: str, top_k: int, ids: list[str] = None) -> list[dict[str, Any]]: """Query the vector storage and retrieve top_k results.""" @abstractmethod diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 54a59f5d..a069cec0 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -492,7 +492,7 @@ class PGVectorStorage(BaseVectorStorage): await self.db.execute(upsert_sql, data) #################### query method ############### - async def query(self, query: str, top_k: int) -> list[dict[str, Any]]: + async def query(self, query: str, top_k: int, ids: list[str] = None) -> list[dict[str, Any]]: embeddings = await self.embedding_func([query]) embedding = embeddings[0] embedding_string = ",".join(map(str, embedding)) @@ -1387,6 +1387,8 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, + document_id VARCHAR(255) NULL, + chunk_id VARCHAR(255) NULL, CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id) )""" }, @@ -1400,6 +1402,8 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, + document_id VARCHAR(255) NULL, + chunk_id VARCHAR(255) NULL, CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id) )""" }, diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 0554ab76..ae6fd9dc 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1243,6 +1243,7 @@ class LightRAG: embedding_func=self.embedding_func, ), system_prompt=system_prompt, + ids = param.ids ) elif param.mode == "naive": 
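# The `ids = param.ids` keyword added above presumes the matching parameter
# that the operate.py hunk below adds to kg_query; the call pattern, sketched
# with the other arguments elided:
#
#     async def kg_query(query, ..., ids: list[str] | None = None): ...
#     response = await kg_query(query, ..., ids=param.ids)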
response = await naive_query( diff --git a/lightrag/operate.py b/lightrag/operate.py index 30983145..6c0e1e4c 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -602,6 +602,7 @@ async def kg_query( global_config: dict[str, str], hashing_kv: BaseKVStorage | None = None, system_prompt: str | None = None, + ids: list[str] | None = None, ) -> str | AsyncIterator[str]: # Handle cache use_model_func = global_config["llm_model_func"] @@ -649,6 +650,7 @@ async def kg_query( relationships_vdb, text_chunks_db, query_param, + ids ) if query_param.only_need_context: @@ -1016,6 +1018,7 @@ async def _build_query_context( relationships_vdb: BaseVectorStorage, text_chunks_db: BaseKVStorage, query_param: QueryParam, + ids: list[str] = None, ): if query_param.mode == "local": entities_context, relations_context, text_units_context = await _get_node_data( @@ -1032,6 +1035,7 @@ async def _build_query_context( relationships_vdb, text_chunks_db, query_param, + ids = ids ) else: # hybrid mode ll_data, hl_data = await asyncio.gather( @@ -1348,11 +1352,16 @@ async def _get_edge_data( relationships_vdb: BaseVectorStorage, text_chunks_db: BaseKVStorage, query_param: QueryParam, + ids: list[str] | None = None, ): logger.info( f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}" ) - results = await relationships_vdb.query(keywords, top_k=query_param.top_k) + if ids: + #TODO: add ids to the query + results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = ids) + else: + results = await relationships_vdb.query(keywords, top_k=query_param.top_k) if not len(results): return "", "", "" diff --git a/requirements.txt b/requirements.txt index d9a5c68e..088d8843 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,50 @@ -aiohttp -configparser -future - -# Basic modules -gensim -pipmaster -pydantic -python-dotenv - -setuptools -tenacity - -# LLM packages -tiktoken - -# Extra libraries are installed when needed using pipmaster +aioboto3==14.1.0 +aiofiles==24.1.0 +aiohttp==3.11.13 +ascii_colors==0.5.2 +asyncpg==0.30.0 +chromadb==0.6.3 +community==1.0.0b1 +docx==0.2.4 +# faiss +fastapi==0.115.11 +glm==0.4.4 +graspologic==3.4.1 +gunicorn==23.0.0 +httpx==0.28.1 +imgui_bundle==1.6.2 +jsonlines==4.0.0 +llama_index==0.12.22 +moderngl==5.12.0 +motor==3.7.0 +nano_vectordb==0.0.4.3 +neo4j==5.28.1 +nest_asyncio==1.6.0 +networkx==3.4.2 +numpy +openpyxl==3.1.5 +oracledb==3.0.0 +Pillow==11.1.0 +pipmaster==0.4.0 +protobuf +psutil==7.0.0 +psycopg==3.2.5 +psycopg_pool==3.2.6 +pydantic==2.10.6 +pymilvus==2.5.4 +pymongo==4.11.2 +PyPDF2==3.0.1 +python-dotenv==1.0.1 +pyvis==0.3.2 +qdrant_client==1.13.3 +redis==5.2.1 +Requests==2.32.3 +sentence_transformers==3.4.1 +setuptools==75.8.0 +SQLAlchemy==2.0.38 +starlette==0.46.0 +tenacity==9.0.0 +tiktoken==0.9.0 +torch==2.6.0 +transformers==4.49.0 +uvicorn==0.34.0 From bbe139cfebad995efc24a1a3e7f375530c6202cd Mon Sep 17 00:00:00 2001 From: Roy Date: Fri, 7 Mar 2025 20:18:01 +0000 Subject: [PATCH 02/18] Enhance PostgreSQL vector storage with chunk_id support - Updated SQL templates for entity and relationship upsert to include chunk_id - Modified PGVectorStorage methods to add chunk_id when inserting or updating records - Expanded database schema to track chunk-level metadata --- lightrag/kg/postgres_impl.py | 11 +++++++---- tests/test_lightrag_ollama_chat.py | 14 +++++++------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py 
index a069cec0..3fc05f59 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -438,6 +438,7 @@ class PGVectorStorage(BaseVectorStorage): "entity_name": item["entity_name"], "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), + "chunk_id": item["source_id"], } return upsert_sql, data @@ -450,6 +451,7 @@ class PGVectorStorage(BaseVectorStorage): "target_id": item["tgt_id"], "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), + "chunk_id": item["source_id"] } return upsert_sql, data @@ -1486,8 +1488,9 @@ SQL_TEMPLATES = { content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP """, - "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, content_vector) - VALUES ($1, $2, $3, $4, $5) + "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, + content_vector, chunk_id) + VALUES ($1, $2, $3, $4, $5, $6) ON CONFLICT (workspace,id) DO UPDATE SET entity_name=EXCLUDED.entity_name, content=EXCLUDED.content, @@ -1495,8 +1498,8 @@ SQL_TEMPLATES = { update_time=CURRENT_TIMESTAMP """, "upsert_relationship": """INSERT INTO LIGHTRAG_VDB_RELATION (workspace, id, source_id, - target_id, content, content_vector) - VALUES ($1, $2, $3, $4, $5, $6) + target_id, content, content_vector, chunk_id) + VALUES ($1, $2, $3, $4, $5, $6, $7) ON CONFLICT (workspace,id) DO UPDATE SET source_id=EXCLUDED.source_id, target_id=EXCLUDED.target_id, diff --git a/tests/test_lightrag_ollama_chat.py b/tests/test_lightrag_ollama_chat.py index 80038928..f19e2974 100644 --- a/tests/test_lightrag_ollama_chat.py +++ b/tests/test_lightrag_ollama_chat.py @@ -38,16 +38,16 @@ class McpError(Exception): DEFAULT_CONFIG = { "server": { - "host": "localhost", - "port": 9621, - "model": "lightrag:latest", + "host": "host.docker.internal", + "port": 11434, + "model": "llama3.2:latest", "timeout": 300, "max_retries": 1, "retry_delay": 1, }, "test_cases": { - "basic": {"query": "唐僧有几个徒弟"}, - "generate": {"query": "电视剧西游记导演是谁"}, + "basic": {"query": "How many disciples did Tang Seng have?"}, + "generate": {"query": "Who directed the TV series Journey to the West?"}, }, } @@ -763,8 +763,8 @@ def parse_args() -> argparse.Namespace: Configuration file (config.json): { "server": { - "host": "localhost", # Server address - "port": 9621, # Server port + "host": "host.docker.internal", # Server address + "port": 11434, # Server port "model": "lightrag:latest" # Default model name }, "test_cases": { From 528fb11364b231981028c99f723804085f722e0a Mon Sep 17 00:00:00 2001 From: Roy Date: Sat, 8 Mar 2025 15:43:17 +0000 Subject: [PATCH 03/18] Refactor vector query methods to support optional ID filtering - Updated BaseVectorStorage query method signature to accept optional IDs - Modified operate.py to pass query parameter IDs to vector storage queries - Updated PostgreSQL vector storage SQL templates to filter results by document IDs - Removed unused parameters and simplified query logic across multiple files --- lightrag/base.py | 3 +- lightrag/kg/postgres_impl.py | 98 ++++++++++++++++++++++++++++-------- lightrag/lightrag.py | 1 - lightrag/operate.py | 17 +++---- 4 files changed, 85 insertions(+), 34 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index e7ab3127..20fe2a5b 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -108,9 +108,8 @@ class BaseVectorStorage(StorageNameSpace, ABC): embedding_func: EmbeddingFunc cosine_better_than_threshold: float = 
field(default=0.2) meta_fields: set[str] = field(default_factory=set) - @abstractmethod - async def query(self, query: str, top_k: int, ids: list[str] = None) -> list[dict[str, Any]]: + async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: """Query the vector storage and retrieve top_k results.""" @abstractmethod diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 3fc05f59..c1ca7aa9 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -439,6 +439,7 @@ class PGVectorStorage(BaseVectorStorage): "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), "chunk_id": item["source_id"], + #TODO: add document_id } return upsert_sql, data @@ -452,6 +453,7 @@ class PGVectorStorage(BaseVectorStorage): "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), "chunk_id": item["source_id"] + #TODO: add document_id } return upsert_sql, data @@ -494,13 +496,19 @@ class PGVectorStorage(BaseVectorStorage): await self.db.execute(upsert_sql, data) #################### query method ############### - async def query(self, query: str, top_k: int, ids: list[str] = None) -> list[dict[str, Any]]: + async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: embeddings = await self.embedding_func([query]) embedding = embeddings[0] embedding_string = ",".join(map(str, embedding)) + if ids: + formatted_ids = ",".join(f"'{id}'" for id in ids) + else: + formatted_ids = "NULL" + sql = SQL_TEMPLATES[self.base_namespace].format( - embedding_string=embedding_string + embedding_string=embedding_string, + doc_ids=formatted_ids ) params = { "workspace": self.db.workspace, @@ -1389,7 +1397,6 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, - document_id VARCHAR(255) NULL, chunk_id VARCHAR(255) NULL, CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id) )""" @@ -1404,7 +1411,6 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, - document_id VARCHAR(255) NULL, chunk_id VARCHAR(255) NULL, CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id) )""" @@ -1507,21 +1513,21 @@ SQL_TEMPLATES = { content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP """, # SQL for VectorStorage - "entities": """SELECT entity_name FROM - (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_VDB_ENTITY where workspace=$1) - WHERE distance>$2 ORDER BY distance DESC LIMIT $3 - """, - "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM - (SELECT id, source_id,target_id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_VDB_RELATION where workspace=$1) - WHERE distance>$2 ORDER BY distance DESC LIMIT $3 - """, - "chunks": """SELECT id FROM - (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_DOC_CHUNKS where workspace=$1) - WHERE distance>$2 ORDER BY distance DESC LIMIT $3 - """, + # "entities": """SELECT entity_name FROM + # (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance + # FROM LIGHTRAG_VDB_ENTITY where workspace=$1) + # WHERE distance>$2 ORDER BY distance DESC LIMIT $3 + # """, + # "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM + # (SELECT id, source_id,target_id, 1 - (content_vector <=> 
'[{embedding_string}]'::vector) as distance + # FROM LIGHTRAG_VDB_RELATION where workspace=$1) + # WHERE distance>$2 ORDER BY distance DESC LIMIT $3 + # """, + # "chunks": """SELECT id FROM + # (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance + # FROM LIGHTRAG_DOC_CHUNKS where workspace=$1) + # WHERE distance>$2 ORDER BY distance DESC LIMIT $3 + # """, # DROP tables "drop_all": """ DROP TABLE IF EXISTS LIGHTRAG_DOC_FULL CASCADE; @@ -1545,4 +1551,56 @@ SQL_TEMPLATES = { "drop_vdb_relation": """ DROP TABLE IF EXISTS LIGHTRAG_VDB_RELATION CASCADE; """, -} + "relationships": """ + WITH relevant_chunks AS ( + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS + WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) + ) + SELECT source_id as src_id, target_id as tgt_id + FROM ( + SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance + FROM LIGHTRAG_VDB_RELATION r + WHERE r.workspace=$1 + AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks) + ) filtered + WHERE distance>$2 + ORDER BY distance DESC + LIMIT $3 + """, + "entities": + ''' + WITH relevant_chunks AS ( + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS + WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) + ) + SELECT entity_name FROM + ( + SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance + FROM LIGHTRAG_VDB_ENTITY + where workspace=$1 + AND chunk_id IN (SELECT chunk_id FROM relevant_chunks) + ) + WHERE distance>$2 + ORDER BY distance DESC + LIMIT $3 + ''', + 'chunks': """ + WITH relevant_chunks AS ( + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS + WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) + ) + SELECT id FROM + ( + SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance + FROM LIGHTRAG_DOC_CHUNKS + where workspace=$1 + AND chunk_id IN (SELECT chunk_id FROM relevant_chunks) + ) + WHERE distance>$2 + ORDER BY distance DESC + LIMIT $3 + """ +} \ No newline at end of file diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index ae6fd9dc..0554ab76 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1243,7 +1243,6 @@ class LightRAG: embedding_func=self.embedding_func, ), system_prompt=system_prompt, - ids = param.ids ) elif param.mode == "naive": response = await naive_query( diff --git a/lightrag/operate.py b/lightrag/operate.py index 6c0e1e4c..7910917a 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -602,7 +602,6 @@ async def kg_query( global_config: dict[str, str], hashing_kv: BaseKVStorage | None = None, system_prompt: str | None = None, - ids: list[str] | None = None, ) -> str | AsyncIterator[str]: # Handle cache use_model_func = global_config["llm_model_func"] @@ -650,7 +649,6 @@ async def kg_query( relationships_vdb, text_chunks_db, query_param, - ids ) if query_param.only_need_context: @@ -1035,7 +1033,6 @@ async def _build_query_context( relationships_vdb, text_chunks_db, query_param, - ids = ids ) else: # hybrid mode ll_data, hl_data = await asyncio.gather( @@ -1104,7 +1101,9 @@ async def _get_node_data( logger.info( f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}" ) - results = await entities_vdb.query(query, top_k=query_param.top_k) + + results = await entities_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids) + if not len(results): return "", "", "" # get entity information @@ -1352,16 +1351,12 @@ async def _get_edge_data( 
relationships_vdb: BaseVectorStorage,
     text_chunks_db: BaseKVStorage,
     query_param: QueryParam,
-    ids: list[str] | None = None,
 ):
     logger.info(
         f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
     )
-    if ids:
-        #TODO: add ids to the query
-        results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = ids)
-    else:
-        results = await relationships_vdb.query(keywords, top_k=query_param.top_k)
+
+    results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = query_param.ids)
 
     if not len(results):
         return "", "", ""
@@ -1610,7 +1605,7 @@ async def naive_query(
     if cached_response is not None:
         return cached_response
 
-    results = await chunks_vdb.query(query, top_k=query_param.top_k)
+    results = await chunks_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids)
 
     if not len(results):
         return PROMPTS["fail_response"]

From e31c0c8f6c7e4be70245b59556cadef76fad0706 Mon Sep 17 00:00:00 2001
From: Roy
Date: Sat, 8 Mar 2025 20:25:20 +0000
Subject: [PATCH 04/18] Update vector query methods to support ID filtering in PostgreSQL

- Modified `mix_kg_vector_query` in operate.py to pass optional IDs to vector search
- Updated PostgreSQL SQL template to filter results using document IDs instead of chunk_id
- Improved query flexibility by allowing precise document selection during vector search
---
 lightrag/kg/postgres_impl.py | 2 +-
 lightrag/operate.py          | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py
index c1ca7aa9..b0284e91 100644
--- a/lightrag/kg/postgres_impl.py
+++ b/lightrag/kg/postgres_impl.py
@@ -1597,7 +1597,7 @@ SQL_TEMPLATES = {
             SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
             FROM LIGHTRAG_DOC_CHUNKS
             where workspace=$1
-            AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
+            AND id IN (SELECT chunk_id FROM relevant_chunks)
         )
         WHERE distance>$2
         ORDER BY distance DESC
diff --git a/lightrag/operate.py b/lightrag/operate.py
index 7910917a..4d440ca5 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -892,7 +892,8 @@ async def mix_kg_vector_query(
     try:
         # Reduce top_k for vector search in hybrid mode since we have structured information from KG
         mix_topk = min(10, query_param.top_k)
-        results = await chunks_vdb.query(augmented_query, top_k=mix_topk)
+        # TODO: add ids to the query
+        results = await chunks_vdb.query(augmented_query, top_k=mix_topk, ids = query_param.ids)
         if not results:
             return None

From 16536c870c22434a7357beaa356b46d37c681f6f Mon Sep 17 00:00:00 2001
From: Roy
Date: Sat, 8 Mar 2025 21:06:19 +0000
Subject: [PATCH 05/18] Revert unnecessary file updates

---
 .gitignore                         | 21 ---------------------
 tests/test_lightrag_ollama_chat.py | 14 +++++++-------
 2 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4f28427f..a4afe4ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,24 +64,3 @@ gui/
 
 # unit-test files
 test_*
-Miniconda3-latest-Linux-x86_64.sh
-requirements_basic.txt
-requirements.txt
-examples/test_chromadb.py
-examples/test_faiss.py
-examples/test_neo4j.py
-.gitignore
-requirements.txt
-examples/test_chromadb.py
-examples/test_faiss.py
-examples/*
-tests/test_lightrag_ollama_chat.py
-requirements.txt
-requirements.txt
-examples/test_chromadb.py
-examples/test_faiss.py
-examples/test_neo4j.py
-tests/test_lightrag_ollama_chat.py
-examples/test_chromadb.py
-examples/test_faiss.py
-examples/test_neo4j.py
diff --git
a/tests/test_lightrag_ollama_chat.py b/tests/test_lightrag_ollama_chat.py index f19e2974..80038928 100644 --- a/tests/test_lightrag_ollama_chat.py +++ b/tests/test_lightrag_ollama_chat.py @@ -38,16 +38,16 @@ class McpError(Exception): DEFAULT_CONFIG = { "server": { - "host": "host.docker.internal", - "port": 11434, - "model": "llama3.2:latest", + "host": "localhost", + "port": 9621, + "model": "lightrag:latest", "timeout": 300, "max_retries": 1, "retry_delay": 1, }, "test_cases": { - "basic": {"query": "How many disciples did Tang Seng have?"}, - "generate": {"query": "Who directed the TV series Journey to the West?"}, + "basic": {"query": "唐僧有几个徒弟"}, + "generate": {"query": "电视剧西游记导演是谁"}, }, } @@ -763,8 +763,8 @@ def parse_args() -> argparse.Namespace: Configuration file (config.json): { "server": { - "host": "host.docker.internal", # Server address - "port": 11434, # Server port + "host": "localhost", # Server address + "port": 9621, # Server port "model": "lightrag:latest" # Default model name }, "test_cases": { From d6a426d3e656784c3bfa4c18ed7cce593f8e135f Mon Sep 17 00:00:00 2001 From: Roy Date: Sat, 8 Mar 2025 21:09:45 +0000 Subject: [PATCH 06/18] Included all requirements in the package --- requirements.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/requirements.txt b/requirements.txt index 088d8843..3f6f3668 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,3 +48,18 @@ tiktoken==0.9.0 torch==2.6.0 transformers==4.49.0 uvicorn==0.34.0 +aiohttp +configparser +future + +# Basic modules +gensim +pipmaster +pydantic +python-dotenv + +setuptools +tenacity + +# LLM packages +tiktoken From a1708a2638ab0ed02bc303d994823b373db9bdbe Mon Sep 17 00:00:00 2001 From: Roy Date: Sat, 8 Mar 2025 21:13:29 +0000 Subject: [PATCH 07/18] Add optional ids filter to QueryParam for RAG document filtering --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 00da54fb..6eb25e65 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,8 @@ class QueryParam: """Maximum number of tokens allocated for relationship descriptions in global retrieval.""" max_token_for_local_context: int = 4000 """Maximum number of tokens allocated for entity descriptions in local retrieval.""" + ids: list[str] | None = None + """List of ids to filter the RAG.""" ... ``` From 4ce28b31bd34d01065097a2caf4665fe8ce12b59 Mon Sep 17 00:00:00 2001 From: Roy Date: Sat, 8 Mar 2025 21:31:41 +0000 Subject: [PATCH 08/18] minor readme fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6eb25e65..018a94e6 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ class QueryParam: """Maximum number of tokens allocated for relationship descriptions in global retrieval.""" max_token_for_local_context: int = 4000 """Maximum number of tokens allocated for entity descriptions in local retrieval.""" - ids: list[str] | None = None + ids: list[str] | None = None # ONLY SUPPORTED FOR PG VECTOR DBs """List of ids to filter the RAG.""" ... 
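# A minimal usage sketch (hypothetical document IDs; per the note above,
# honoured only by the PostgreSQL vector backend):
#
#     rag.query("What did the audit conclude?",
#               param=QueryParam(mode="hybrid", ids=["doc-abc123"]))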
``` From 7807379bee5a8d060826c732cb9f5c75d6c4c7aa Mon Sep 17 00:00:00 2001 From: Roy Date: Mon, 10 Mar 2025 09:18:22 +0000 Subject: [PATCH 09/18] Remove unused ids parameter from _build_query_context function --- lightrag/operate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index f6fa4bfe..3c5ed329 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1017,7 +1017,6 @@ async def _build_query_context( relationships_vdb: BaseVectorStorage, text_chunks_db: BaseKVStorage, query_param: QueryParam, - ids: list[str] = None, ): if query_param.mode == "local": entities_context, relations_context, text_units_context = await _get_node_data( From 8317ec9757d32a2572b1afd3afe3b2bfb8d35e1f Mon Sep 17 00:00:00 2001 From: zrguo Date: Mon, 10 Mar 2025 23:00:06 +0800 Subject: [PATCH 10/18] Update __init__.py --- lightrag/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index e4cb3e63..382060f7 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam -__version__ = "1.2.4" +__version__ = "1.2.5" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" From 92ae895713a2aebed3dabf0ef656a44872d61de4 Mon Sep 17 00:00:00 2001 From: Roy Date: Mon, 10 Mar 2025 15:39:18 +0000 Subject: [PATCH 11/18] Refactor requirements and code formatting - Simplified requirements.txt by removing specific version constraints - Added comment about extra library installation using pipmaster - Improved code formatting in base.py, operate.py, and postgres_impl.py - Cleaned up SQL templates and query method signatures with consistent formatting --- lightrag/base.py | 5 ++- lightrag/kg/postgres_impl.py | 60 ++++++++++++++++++------------------ lightrag/operate.py | 16 +++++++--- requirements.txt | 52 ++----------------------------- 4 files changed, 48 insertions(+), 85 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index 61787efc..c84c7c62 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -108,8 +108,11 @@ class BaseVectorStorage(StorageNameSpace, ABC): embedding_func: EmbeddingFunc cosine_better_than_threshold: float = field(default=0.2) meta_fields: set[str] = field(default_factory=set) + @abstractmethod - async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: + async def query( + self, query: str, top_k: int, ids: list[str] | None = None + ) -> list[dict[str, Any]]: """Query the vector storage and retrieve top_k results.""" @abstractmethod diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index ad794e3b..1d525bdb 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -439,7 +439,7 @@ class PGVectorStorage(BaseVectorStorage): "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), "chunk_id": item["source_id"], - #TODO: add document_id + # TODO: add document_id } return upsert_sql, data @@ -452,8 +452,8 @@ class PGVectorStorage(BaseVectorStorage): "target_id": item["tgt_id"], "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), - "chunk_id": item["source_id"] - #TODO: add document_id + "chunk_id": item["source_id"], + # TODO: add document_id } return upsert_sql, data @@ -496,7 +496,9 @@ class PGVectorStorage(BaseVectorStorage): await self.db.execute(upsert_sql, data) #################### query method ############### - async def 
query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: + async def query( + self, query: str, top_k: int, ids: list[str] | None = None + ) -> list[dict[str, Any]]: embeddings = await self.embedding_func([query]) embedding = embeddings[0] embedding_string = ",".join(map(str, embedding)) @@ -505,10 +507,9 @@ class PGVectorStorage(BaseVectorStorage): formatted_ids = ",".join(f"'{id}'" for id in ids) else: formatted_ids = "NULL" - + sql = SQL_TEMPLATES[self.base_namespace].format( - embedding_string=embedding_string, - doc_ids=formatted_ids + embedding_string=embedding_string, doc_ids=formatted_ids ) params = { "workspace": self.db.workspace, @@ -1598,7 +1599,7 @@ SQL_TEMPLATES = { content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP """, - "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, + "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, content_vector, chunk_id) VALUES ($1, $2, $3, $4, $5, $6) ON CONFLICT (workspace,id) DO UPDATE @@ -1657,54 +1658,53 @@ SQL_TEMPLATES = { """, "relationships": """ WITH relevant_chunks AS ( - SELECT id as chunk_id - FROM LIGHTRAG_DOC_CHUNKS + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) ) - SELECT source_id as src_id, target_id as tgt_id + SELECT source_id as src_id, target_id as tgt_id FROM ( SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance FROM LIGHTRAG_VDB_RELATION r - WHERE r.workspace=$1 + WHERE r.workspace=$1 AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks) ) filtered - WHERE distance>$2 - ORDER BY distance DESC + WHERE distance>$2 + ORDER BY distance DESC LIMIT $3 """, - "entities": - ''' + "entities": """ WITH relevant_chunks AS ( - SELECT id as chunk_id - FROM LIGHTRAG_DOC_CHUNKS + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) ) SELECT entity_name FROM ( SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_VDB_ENTITY + FROM LIGHTRAG_VDB_ENTITY where workspace=$1 AND chunk_id IN (SELECT chunk_id FROM relevant_chunks) ) - WHERE distance>$2 - ORDER BY distance DESC + WHERE distance>$2 + ORDER BY distance DESC LIMIT $3 - ''', - 'chunks': """ + """, + "chunks": """ WITH relevant_chunks AS ( - SELECT id as chunk_id - FROM LIGHTRAG_DOC_CHUNKS + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) ) SELECT id FROM ( SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_DOC_CHUNKS + FROM LIGHTRAG_DOC_CHUNKS where workspace=$1 AND id IN (SELECT chunk_id FROM relevant_chunks) ) - WHERE distance>$2 - ORDER BY distance DESC + WHERE distance>$2 + ORDER BY distance DESC LIMIT $3 - """ -} \ No newline at end of file + """, +} diff --git a/lightrag/operate.py b/lightrag/operate.py index 3c5ed329..5e90a77b 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -893,7 +893,9 @@ async def mix_kg_vector_query( # Reduce top_k for vector search in hybrid mode since we have structured information from KG mix_topk = min(10, query_param.top_k) # TODO: add ids to the query - results = await chunks_vdb.query(augmented_query, top_k=mix_topk, ids = query_param.ids) + results = await chunks_vdb.query( + augmented_query, top_k=mix_topk, ids=query_param.ids + ) if not results: return None @@ 
-1102,7 +1104,9 @@ async def _get_node_data( f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}" ) - results = await entities_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids) + results = await entities_vdb.query( + query, top_k=query_param.top_k, ids=query_param.ids + ) if not len(results): return "", "", "" @@ -1357,7 +1361,9 @@ async def _get_edge_data( f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}" ) - results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = query_param.ids) + results = await relationships_vdb.query( + keywords, top_k=query_param.top_k, ids=query_param.ids + ) if not len(results): return "", "", "" @@ -1606,7 +1612,9 @@ async def naive_query( if cached_response is not None: return cached_response - results = await chunks_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids) + results = await chunks_vdb.query( + query, top_k=query_param.top_k, ids=query_param.ids + ) if not len(results): return PROMPTS["fail_response"] diff --git a/requirements.txt b/requirements.txt index 3f6f3668..d9a5c68e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,53 +1,3 @@ -aioboto3==14.1.0 -aiofiles==24.1.0 -aiohttp==3.11.13 -ascii_colors==0.5.2 -asyncpg==0.30.0 -chromadb==0.6.3 -community==1.0.0b1 -docx==0.2.4 -# faiss -fastapi==0.115.11 -glm==0.4.4 -graspologic==3.4.1 -gunicorn==23.0.0 -httpx==0.28.1 -imgui_bundle==1.6.2 -jsonlines==4.0.0 -llama_index==0.12.22 -moderngl==5.12.0 -motor==3.7.0 -nano_vectordb==0.0.4.3 -neo4j==5.28.1 -nest_asyncio==1.6.0 -networkx==3.4.2 -numpy -openpyxl==3.1.5 -oracledb==3.0.0 -Pillow==11.1.0 -pipmaster==0.4.0 -protobuf -psutil==7.0.0 -psycopg==3.2.5 -psycopg_pool==3.2.6 -pydantic==2.10.6 -pymilvus==2.5.4 -pymongo==4.11.2 -PyPDF2==3.0.1 -python-dotenv==1.0.1 -pyvis==0.3.2 -qdrant_client==1.13.3 -redis==5.2.1 -Requests==2.32.3 -sentence_transformers==3.4.1 -setuptools==75.8.0 -SQLAlchemy==2.0.38 -starlette==0.46.0 -tenacity==9.0.0 -tiktoken==0.9.0 -torch==2.6.0 -transformers==4.49.0 -uvicorn==0.34.0 aiohttp configparser future @@ -63,3 +13,5 @@ tenacity # LLM packages tiktoken + +# Extra libraries are installed when needed using pipmaster From 37754f14b51d5ea274ae184db772cc0e50ed3d72 Mon Sep 17 00:00:00 2001 From: Zhenya Zhu Date: Tue, 11 Mar 2025 11:54:30 +0800 Subject: [PATCH 12/18] force keywords_extraction output as JSON --- lightrag/prompt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 1486ccf8..f81cd441 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -236,7 +236,7 @@ Given the query and conversation history, list both high-level and low-level key ---Instructions--- - Consider both the current query and relevant conversation history when extracting keywords -- Output the keywords in JSON format +- Output the keywords in JSON format, it will be parsed by a JSON parser, do not add any extra content in output - The JSON should have two keys: - "high_level_keywords" for overarching concepts or themes - "low_level_keywords" for specific entities or details From d77401961dc84f802a49b3c060df6f7b136cff04 Mon Sep 17 00:00:00 2001 From: Zhichun Wu Date: Tue, 11 Mar 2025 11:57:41 +0800 Subject: [PATCH 13/18] Resolve the issue with making API calls to Azure OpenAI service --- lightrag/llm/azure_openai.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lightrag/llm/azure_openai.py 
b/lightrag/llm/azure_openai.py index 84e45cfb..3405d29e 100644 --- a/lightrag/llm/azure_openai.py +++ b/lightrag/llm/azure_openai.py @@ -55,6 +55,7 @@ async def azure_openai_complete_if_cache( openai_async_client = AsyncAzureOpenAI( azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), + azure_deployment=model, api_key=os.getenv("AZURE_OPENAI_API_KEY"), api_version=os.getenv("AZURE_OPENAI_API_VERSION"), ) @@ -136,6 +137,7 @@ async def azure_openai_embed( openai_async_client = AsyncAzureOpenAI( azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), + azure_deployment=model, api_key=os.getenv("AZURE_OPENAI_API_KEY"), api_version=os.getenv("AZURE_OPENAI_API_VERSION"), ) From 061350b2bf13557b9bb40c592e775d31235f1d73 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 11 Mar 2025 12:08:10 +0800 Subject: [PATCH 14/18] Improve Entity Extraction Robustness for Truncated LLM Responses --- lightrag/operate.py | 128 ++++++++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 41 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index e352ff79..f808f3c2 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -141,18 +141,36 @@ async def _handle_single_entity_extraction( ): if len(record_attributes) < 4 or record_attributes[0] != '"entity"': return None - # add this record as a node in the G + + # Clean and validate entity name entity_name = clean_str(record_attributes[1]).strip('"') if not entity_name.strip(): + logger.warning( + f"Entity extraction error: empty entity name in: {record_attributes}" + ) return None + + # Clean and validate entity type entity_type = clean_str(record_attributes[2]).strip('"') + if not entity_type.strip() or entity_type.startswith('("'): + logger.warning( + f"Entity extraction error: invalid entity type in: {record_attributes}" + ) + return None + + # Clean and validate description entity_description = clean_str(record_attributes[3]).strip('"') - entity_source_id = chunk_key + if not entity_description.strip(): + logger.warning( + f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'" + ) + return None + return dict( entity_name=entity_name, entity_type=entity_type, description=entity_description, - source_id=entity_source_id, + source_id=chunk_key, metadata={"created_at": time.time()}, ) @@ -438,47 +456,22 @@ async def extract_entities( else: return await use_llm_func(input_text) - async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): - """ "Prpocess a single chunk + async def _process_extraction_result(result: str, chunk_key: str): + """Process a single extraction result (either initial or gleaning) Args: - chunk_key_dp (tuple[str, TextChunkSchema]): - ("chunck-xxxxxx", {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int}) + result (str): The extraction result to process + chunk_key (str): The chunk key for source tracking + Returns: + tuple: (nodes_dict, edges_dict) containing the extracted entities and relationships """ - nonlocal processed_chunks - chunk_key = chunk_key_dp[0] - chunk_dp = chunk_key_dp[1] - content = chunk_dp["content"] - # hint_prompt = entity_extract_prompt.format(**context_base, input_text=content) - hint_prompt = entity_extract_prompt.format( - **context_base, input_text="{input_text}" - ).format(**context_base, input_text=content) - - final_result = await _user_llm_func_with_cache(hint_prompt) - history = pack_user_ass_to_openai_messages(hint_prompt, final_result) - for now_glean_index in range(entity_extract_max_gleaning): - 
glean_result = await _user_llm_func_with_cache( - continue_prompt, history_messages=history - ) - - history += pack_user_ass_to_openai_messages(continue_prompt, glean_result) - final_result += glean_result - if now_glean_index == entity_extract_max_gleaning - 1: - break - - if_loop_result: str = await _user_llm_func_with_cache( - if_loop_prompt, history_messages=history - ) - if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() - if if_loop_result != "yes": - break - - records = split_string_by_multi_markers( - final_result, - [context_base["record_delimiter"], context_base["completion_delimiter"]], - ) - maybe_nodes = defaultdict(list) maybe_edges = defaultdict(list) + + records = split_string_by_multi_markers( + result, + [context_base["record_delimiter"], context_base["completion_delimiter"]], + ) + for record in records: record = re.search(r"\((.*)\)", record) if record is None: @@ -487,13 +480,14 @@ async def extract_entities( record_attributes = split_string_by_multi_markers( record, [context_base["tuple_delimiter"]] ) + if_entities = await _handle_single_entity_extraction( record_attributes, chunk_key ) if if_entities is not None: maybe_nodes[if_entities["entity_name"]].append(if_entities) continue - + if_relation = await _handle_single_relationship_extraction( record_attributes, chunk_key ) @@ -501,6 +495,58 @@ async def extract_entities( maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append( if_relation ) + + return maybe_nodes, maybe_edges + + async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): + """Process a single chunk + Args: + chunk_key_dp (tuple[str, TextChunkSchema]): + ("chunk-xxxxxx", {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int}) + """ + nonlocal processed_chunks + chunk_key = chunk_key_dp[0] + chunk_dp = chunk_key_dp[1] + content = chunk_dp["content"] + + # Get initial extraction + hint_prompt = entity_extract_prompt.format( + **context_base, input_text="{input_text}" + ).format(**context_base, input_text=content) + + final_result = await _user_llm_func_with_cache(hint_prompt) + history = pack_user_ass_to_openai_messages(hint_prompt, final_result) + + # Process initial extraction + maybe_nodes, maybe_edges = await _process_extraction_result(final_result, chunk_key) + + # Process additional gleaning results + for now_glean_index in range(entity_extract_max_gleaning): + glean_result = await _user_llm_func_with_cache( + continue_prompt, history_messages=history + ) + + history += pack_user_ass_to_openai_messages(continue_prompt, glean_result) + + # Process gleaning result separately + glean_nodes, glean_edges = await _process_extraction_result(glean_result, chunk_key) + + # Merge results + for entity_name, entities in glean_nodes.items(): + maybe_nodes[entity_name].extend(entities) + for edge_key, edges in glean_edges.items(): + maybe_edges[edge_key].extend(edges) + + if now_glean_index == entity_extract_max_gleaning - 1: + break + + if_loop_result: str = await _user_llm_func_with_cache( + if_loop_prompt, history_messages=history + ) + if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() + if if_loop_result != "yes": + break + processed_chunks += 1 entities_count = len(maybe_nodes) relations_count = len(maybe_edges) From 9d1dc2c9c3786aea25fba48cf946348e2a374b6b Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 11 Mar 2025 12:23:51 +0800 Subject: [PATCH 15/18] Fix linting --- lightrag/operate.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 
insertions(+), 17 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index f808f3c2..09e51fcf 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -466,12 +466,12 @@ async def extract_entities( """ maybe_nodes = defaultdict(list) maybe_edges = defaultdict(list) - + records = split_string_by_multi_markers( result, [context_base["record_delimiter"], context_base["completion_delimiter"]], ) - + for record in records: record = re.search(r"\((.*)\)", record) if record is None: @@ -480,14 +480,14 @@ async def extract_entities( record_attributes = split_string_by_multi_markers( record, [context_base["tuple_delimiter"]] ) - + if_entities = await _handle_single_entity_extraction( record_attributes, chunk_key ) if if_entities is not None: maybe_nodes[if_entities["entity_name"]].append(if_entities) continue - + if_relation = await _handle_single_relationship_extraction( record_attributes, chunk_key ) @@ -495,7 +495,7 @@ async def extract_entities( maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append( if_relation ) - + return maybe_nodes, maybe_edges async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): @@ -508,45 +508,49 @@ async def extract_entities( chunk_key = chunk_key_dp[0] chunk_dp = chunk_key_dp[1] content = chunk_dp["content"] - + # Get initial extraction hint_prompt = entity_extract_prompt.format( **context_base, input_text="{input_text}" ).format(**context_base, input_text=content) - + final_result = await _user_llm_func_with_cache(hint_prompt) history = pack_user_ass_to_openai_messages(hint_prompt, final_result) - + # Process initial extraction - maybe_nodes, maybe_edges = await _process_extraction_result(final_result, chunk_key) - + maybe_nodes, maybe_edges = await _process_extraction_result( + final_result, chunk_key + ) + # Process additional gleaning results for now_glean_index in range(entity_extract_max_gleaning): glean_result = await _user_llm_func_with_cache( continue_prompt, history_messages=history ) - + history += pack_user_ass_to_openai_messages(continue_prompt, glean_result) - + # Process gleaning result separately - glean_nodes, glean_edges = await _process_extraction_result(glean_result, chunk_key) - + glean_nodes, glean_edges = await _process_extraction_result( + glean_result, chunk_key + ) + # Merge results for entity_name, entities in glean_nodes.items(): maybe_nodes[entity_name].extend(entities) for edge_key, edges in glean_edges.items(): maybe_edges[edge_key].extend(edges) - + if now_glean_index == entity_extract_max_gleaning - 1: break - + if_loop_result: str = await _user_llm_func_with_cache( if_loop_prompt, history_messages=history ) if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() if if_loop_result != "yes": break - + processed_chunks += 1 entities_count = len(maybe_nodes) relations_count = len(maybe_edges) From 62b304600bbc90160a64de305d713560ec3b007b Mon Sep 17 00:00:00 2001 From: zrguo Date: Tue, 11 Mar 2025 15:43:04 +0800 Subject: [PATCH 16/18] clean lightrag.py --- lightrag/lightrag.py | 126 ++++++++++++++----------------------------- lightrag/operate.py | 86 +++++++++++++++++++++++++++++ lightrag/utils.py | 46 ++++++++++++++++ 3 files changed, 172 insertions(+), 86 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 3a7d340a..a5cb3b22 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -30,11 +30,10 @@ from .namespace import NameSpace, make_namespace from .operate import ( chunking_by_token_size, extract_entities, - extract_keywords_only, 
kg_query, - kg_query_with_keywords, mix_kg_vector_query, naive_query, + query_with_keywords, ) from .prompt import GRAPH_FIELD_SEP, PROMPTS from .utils import ( @@ -45,6 +44,9 @@ from .utils import ( encode_string_by_tiktoken, lazy_external_import, limit_async_func_call, + get_content_summary, + clean_text, + check_storage_env_vars, logger, ) from .types import KnowledgeGraph @@ -309,7 +311,7 @@ class LightRAG: # Verify storage implementation compatibility verify_storage_implementation(storage_type, storage_name) # Check environment variables - # self.check_storage_env_vars(storage_name) + check_storage_env_vars(storage_name) # Ensure vector_db_storage_cls_kwargs has required fields self.vector_db_storage_cls_kwargs = { @@ -536,11 +538,6 @@ class LightRAG: storage_class = lazy_external_import(import_path, storage_name) return storage_class - @staticmethod - def clean_text(text: str) -> str: - """Clean text by removing null bytes (0x00) and whitespace""" - return text.strip().replace("\x00", "") - def insert( self, input: str | list[str], @@ -602,8 +599,8 @@ class LightRAG: update_storage = False try: # Clean input texts - full_text = self.clean_text(full_text) - text_chunks = [self.clean_text(chunk) for chunk in text_chunks] + full_text = clean_text(full_text) + text_chunks = [clean_text(chunk) for chunk in text_chunks] # Process cleaned texts if doc_id is None: @@ -682,7 +679,7 @@ class LightRAG: contents = {id_: doc for id_, doc in zip(ids, input)} else: # Clean input text and remove duplicates - input = list(set(self.clean_text(doc) for doc in input)) + input = list(set(clean_text(doc) for doc in input)) # Generate contents dict of MD5 hash IDs and documents contents = {compute_mdhash_id(doc, prefix="doc-"): doc for doc in input} @@ -698,7 +695,7 @@ class LightRAG: new_docs: dict[str, Any] = { id_: { "content": content, - "content_summary": self._get_content_summary(content), + "content_summary": get_content_summary(content), "content_length": len(content), "status": DocStatus.PENDING, "created_at": datetime.now().isoformat(), @@ -1063,7 +1060,7 @@ class LightRAG: all_chunks_data: dict[str, dict[str, str]] = {} chunk_to_source_map: dict[str, str] = {} for chunk_data in custom_kg.get("chunks", []): - chunk_content = self.clean_text(chunk_data["content"]) + chunk_content = clean_text(chunk_data["content"]) source_id = chunk_data["source_id"] tokens = len( encode_string_by_tiktoken( @@ -1296,8 +1293,17 @@ class LightRAG: self, query: str, prompt: str, param: QueryParam = QueryParam() ): """ - 1. Extract keywords from the 'query' using new function in operate.py. - 2. Then run the standard aquery() flow with the final prompt (formatted_question). + Query with separate keyword extraction step. + + This method extracts keywords from the query first, then uses them for the query. + + Args: + query: User query + prompt: Additional prompt for the query + param: Query parameters + + Returns: + Query response """ loop = always_get_an_event_loop() return loop.run_until_complete( @@ -1308,66 +1314,29 @@ class LightRAG: self, query: str, prompt: str, param: QueryParam = QueryParam() ) -> str | AsyncIterator[str]: """ - 1. Calls extract_keywords_only to get HL/LL keywords from 'query'. - 2. Then calls kg_query(...) or naive_query(...), etc. as the main query, while also injecting the newly extracted keywords if needed. + Async version of query_with_separate_keyword_extraction. 
+ + Args: + query: User query + prompt: Additional prompt for the query + param: Query parameters + + Returns: + Query response or async iterator """ - # --------------------- - # STEP 1: Keyword Extraction - # --------------------- - hl_keywords, ll_keywords = await extract_keywords_only( - text=query, + response = await query_with_keywords( + query=query, + prompt=prompt, param=param, + knowledge_graph_inst=self.chunk_entity_relation_graph, + entities_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + chunks_vdb=self.chunks_vdb, + text_chunks_db=self.text_chunks, global_config=asdict(self), - hashing_kv=self.llm_response_cache, # Directly use llm_response_cache + hashing_kv=self.llm_response_cache, ) - - param.hl_keywords = hl_keywords - param.ll_keywords = ll_keywords - - # --------------------- - # STEP 2: Final Query Logic - # --------------------- - - # Create a new string with the prompt and the keywords - ll_keywords_str = ", ".join(ll_keywords) - hl_keywords_str = ", ".join(hl_keywords) - formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}" - - if param.mode in ["local", "global", "hybrid"]: - response = await kg_query_with_keywords( - formatted_question, - self.chunk_entity_relation_graph, - self.entities_vdb, - self.relationships_vdb, - self.text_chunks, - param, - asdict(self), - hashing_kv=self.llm_response_cache, # Directly use llm_response_cache - ) - elif param.mode == "naive": - response = await naive_query( - formatted_question, - self.chunks_vdb, - self.text_chunks, - param, - asdict(self), - hashing_kv=self.llm_response_cache, # Directly use llm_response_cache - ) - elif param.mode == "mix": - response = await mix_kg_vector_query( - formatted_question, - self.chunk_entity_relation_graph, - self.entities_vdb, - self.relationships_vdb, - self.chunks_vdb, - self.text_chunks, - param, - asdict(self), - hashing_kv=self.llm_response_cache, # Directly use llm_response_cache - ) - else: - raise ValueError(f"Unknown mode {param.mode}") - + await self._query_done() return response @@ -1465,21 +1434,6 @@ class LightRAG: ] ) - def _get_content_summary(self, content: str, max_length: int = 100) -> str: - """Get summary of document content - - Args: - content: Original document content - max_length: Maximum length of summary - - Returns: - Truncated content with ellipsis if needed - """ - content = content.strip() - if len(content) <= max_length: - return content - return content[:max_length] + "..." - async def get_processing_status(self) -> dict[str, int]: """Get current document processing status counts diff --git a/lightrag/operate.py b/lightrag/operate.py index 5baec1eb..95a5c72e 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1916,3 +1916,89 @@ async def kg_query_with_keywords( ) return response + +async def query_with_keywords( + query: str, + prompt: str, + param: QueryParam, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + chunks_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage, + global_config: dict[str, str], + hashing_kv: BaseKVStorage | None = None, +) -> str | AsyncIterator[str]: + """ + Extract keywords from the query and then use them for retrieving information. + + 1. Extracts high-level and low-level keywords from the query + 2. Formats the query with the extracted keywords and prompt + 3. 
Uses the appropriate query method based on param.mode + + Args: + query: The user's query + prompt: Additional prompt to prepend to the query + param: Query parameters + knowledge_graph_inst: Knowledge graph storage + entities_vdb: Entities vector database + relationships_vdb: Relationships vector database + chunks_vdb: Document chunks vector database + text_chunks_db: Text chunks storage + global_config: Global configuration + hashing_kv: Cache storage + + Returns: + Query response or async iterator + """ + # Extract keywords + hl_keywords, ll_keywords = await extract_keywords_only( + text=query, + param=param, + global_config=global_config, + hashing_kv=hashing_kv, + ) + + param.hl_keywords = hl_keywords + param.ll_keywords = ll_keywords + + # Create a new string with the prompt and the keywords + ll_keywords_str = ", ".join(ll_keywords) + hl_keywords_str = ", ".join(hl_keywords) + formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}" + + # Use appropriate query method based on mode + if param.mode in ["local", "global", "hybrid"]: + return await kg_query_with_keywords( + formatted_question, + knowledge_graph_inst, + entities_vdb, + relationships_vdb, + text_chunks_db, + param, + global_config, + hashing_kv=hashing_kv, + ) + elif param.mode == "naive": + return await naive_query( + formatted_question, + chunks_vdb, + text_chunks_db, + param, + global_config, + hashing_kv=hashing_kv, + ) + elif param.mode == "mix": + return await mix_kg_vector_query( + formatted_question, + knowledge_graph_inst, + entities_vdb, + relationships_vdb, + chunks_vdb, + text_chunks_db, + param, + global_config, + hashing_kv=hashing_kv, + ) + else: + raise ValueError(f"Unknown mode {param.mode}") diff --git a/lightrag/utils.py b/lightrag/utils.py index e8f79610..1143b326 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -890,3 +890,49 @@ def lazy_external_import(module_name: str, class_name: str) -> Callable[..., Any return cls(*args, **kwargs) return import_class + +def get_content_summary(content: str, max_length: int = 100) -> str: + """Get summary of document content + + Args: + content: Original document content + max_length: Maximum length of summary + + Returns: + Truncated content with ellipsis if needed + """ + content = content.strip() + if len(content) <= max_length: + return content + return content[:max_length] + "..." 
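# An illustrative check of get_content_summary above (doctest-style, with
# made-up inputs):
#
#     >>> get_content_summary("short text")
#     'short text'
#     >>> get_content_summary("x" * 150) == "x" * 100 + "..."
#     True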
+ +def clean_text(text: str) -> str: + """Clean text by removing null bytes (0x00) and whitespace + + Args: + text: Input text to clean + + Returns: + Cleaned text + """ + return text.strip().replace("\x00", "") + +def check_storage_env_vars(storage_name: str) -> None: + """Check if all required environment variables for storage implementation exist + + Args: + storage_name: Storage implementation name + + Raises: + ValueError: If required environment variables are missing + """ + from lightrag.kg import STORAGE_ENV_REQUIREMENTS + + required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, []) + missing_vars = [var for var in required_vars if var not in os.environ] + + if missing_vars: + raise ValueError( + f"Storage implementation '{storage_name}' requires the following " + f"environment variables: {', '.join(missing_vars)}" + ) \ No newline at end of file From 418aea3895f26e1680d0e0b4909d8e6c52b67240 Mon Sep 17 00:00:00 2001 From: zrguo Date: Tue, 11 Mar 2025 15:44:01 +0800 Subject: [PATCH 17/18] fix linting --- lightrag/lightrag.py | 12 ++++++------ lightrag/operate.py | 7 ++++--- lightrag/utils.py | 7 +++++-- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index a5cb3b22..48b464a8 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1294,14 +1294,14 @@ class LightRAG: ): """ Query with separate keyword extraction step. - + This method extracts keywords from the query first, then uses them for the query. - + Args: query: User query prompt: Additional prompt for the query param: Query parameters - + Returns: Query response """ @@ -1315,12 +1315,12 @@ class LightRAG: ) -> str | AsyncIterator[str]: """ Async version of query_with_separate_keyword_extraction. - + Args: query: User query prompt: Additional prompt for the query param: Query parameters - + Returns: Query response or async iterator """ @@ -1336,7 +1336,7 @@ class LightRAG: global_config=asdict(self), hashing_kv=self.llm_response_cache, ) - + await self._query_done() return response diff --git a/lightrag/operate.py b/lightrag/operate.py index 95a5c72e..1815f308 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1917,6 +1917,7 @@ async def kg_query_with_keywords( return response + async def query_with_keywords( query: str, prompt: str, @@ -1931,11 +1932,11 @@ async def query_with_keywords( ) -> str | AsyncIterator[str]: """ Extract keywords from the query and then use them for retrieving information. - + 1. Extracts high-level and low-level keywords from the query 2. Formats the query with the extracted keywords and prompt 3. Uses the appropriate query method based on param.mode - + Args: query: The user's query prompt: Additional prompt to prepend to the query @@ -1947,7 +1948,7 @@ async def query_with_keywords( text_chunks_db: Text chunks storage global_config: Global configuration hashing_kv: Cache storage - + Returns: Query response or async iterator """ diff --git a/lightrag/utils.py b/lightrag/utils.py index 1143b326..b8f00c5d 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -891,6 +891,7 @@ def lazy_external_import(module_name: str, class_name: str) -> Callable[..., Any return import_class + def get_content_summary(content: str, max_length: int = 100) -> str: """Get summary of document content @@ -906,6 +907,7 @@ def get_content_summary(content: str, max_length: int = 100) -> str: return content return content[:max_length] + "..." 
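# Expected behaviour of clean_text, defined just below (illustrative input):
#
#     >>> clean_text("  hello\x00world  ")
#     'helloworld'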
+ def clean_text(text: str) -> str: """Clean text by removing null bytes (0x00) and whitespace @@ -917,6 +919,7 @@ def clean_text(text: str) -> str: """ return text.strip().replace("\x00", "") + def check_storage_env_vars(storage_name: str) -> None: """Check if all required environment variables for storage implementation exist @@ -927,7 +930,7 @@ def check_storage_env_vars(storage_name: str) -> None: ValueError: If required environment variables are missing """ from lightrag.kg import STORAGE_ENV_REQUIREMENTS - + required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, []) missing_vars = [var for var in required_vars if var not in os.environ] @@ -935,4 +938,4 @@ def check_storage_env_vars(storage_name: str) -> None: raise ValueError( f"Storage implementation '{storage_name}' requires the following " f"environment variables: {', '.join(missing_vars)}" - ) \ No newline at end of file + ) From ea05b8e49ffd3fb495c63fdd76c3168dbbd97cf1 Mon Sep 17 00:00:00 2001 From: zrguo Date: Tue, 11 Mar 2025 16:19:44 +0800 Subject: [PATCH 18/18] Fix the merge bug with Neo4j --- lightrag/lightrag.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 3a7d340a..2116cf58 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -2622,6 +2622,12 @@ class LightRAG: # 9. Delete source entities for entity_name in source_entities: + if entity_name == target_entity: + logger.info( + f"Skipping deletion of '{entity_name}' as it's also the target entity" + ) + continue + # Delete entity node from knowledge graph await self.chunk_entity_relation_graph.delete_node(entity_name)
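Taken together, the series threads `QueryParam.ids` from the public API through `kg_query`, `naive_query`, and `mix_kg_vector_query` into the PostgreSQL vector storage, whose `relevant_chunks` CTEs drop any match not traceable to the selected documents. A minimal end-to-end sketch of the feature follows; the constructor arguments and document IDs are illustrative assumptions (a working setup also needs embedding and LLM functions, and the IDs must match the `doc-<md5>` values LightRAG assigns at insert time), not a prescribed configuration.

```python
import asyncio

from lightrag import LightRAG, QueryParam


async def main() -> None:
    # Illustrative setup: a real instance also needs llm_model_func,
    # embedding_func, and PG connection settings (see the project README).
    # Per patch 08, `ids` filtering is only supported for PG vector storage.
    rag = LightRAG(working_dir="./rag_storage", vector_storage="PGVectorStorage")

    # Hypothetical IDs; they must equal the full_doc_id values produced at
    # insert time ("doc-" plus an MD5 hash of the document content).
    scoped = QueryParam(mode="hybrid", ids=["doc-abc123", "doc-def456"])

    # Retrieval now only considers chunks, entities, and relations whose
    # chunk_id traces back to the two selected documents (patch 03's CTEs).
    print(await rag.aquery("Summarize the findings.", param=scoped))


asyncio.run(main())
```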