From 0ec61d640769a32b141afcf4ef58c5d5a7e9fe11 Mon Sep 17 00:00:00 2001 From: Roy Date: Fri, 7 Mar 2025 18:45:28 +0000 Subject: [PATCH 1/4] Update project dependencies and example test files - Updated requirements.txt with latest package versions - Added support for filtering query results by IDs in base and operate modules - Modified PostgreSQL vector storage to include document and chunk ID fields --- .gitignore | 21 +++++++++++ lightrag/base.py | 5 ++- lightrag/kg/postgres_impl.py | 6 +++- lightrag/lightrag.py | 1 + lightrag/operate.py | 11 +++++- requirements.txt | 67 +++++++++++++++++++++++++++--------- 6 files changed, 91 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index a4afe4ea..4f28427f 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,24 @@ gui/ # unit-test files test_* +Miniconda3-latest-Linux-x86_64.sh +requirements_basic.txt +requirements.txt +examples/test_chromadb.py +examples/test_faiss.py +examples/test_neo4j.py +.gitignore +requirements.txt +examples/test_chromadb.py +examples/test_faiss.py +examples/* +tests/test_lightrag_ollama_chat.py +requirements.txt +requirements.txt +examples/test_chromadb.py +examples/test_faiss.py +examples/test_neo4j.py +tests/test_lightrag_ollama_chat.py +examples/test_chromadb.py +examples/test_faiss.py +examples/test_neo4j.py diff --git a/lightrag/base.py b/lightrag/base.py index 5f6a1bf1..e7ab3127 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -81,6 +81,9 @@ class QueryParam: history_turns: int = 3 """Number of complete conversation turns (user-assistant pairs) to consider in the response context.""" + ids: list[str] | None = None + """List of ids to filter the results.""" + @dataclass class StorageNameSpace(ABC): @@ -107,7 +110,7 @@ class BaseVectorStorage(StorageNameSpace, ABC): meta_fields: set[str] = field(default_factory=set) @abstractmethod - async def query(self, query: str, top_k: int) -> list[dict[str, Any]]: + async def query(self, query: str, top_k: int, ids: list[str] = None) -> list[dict[str, Any]]: """Query the vector storage and retrieve top_k results.""" @abstractmethod diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 54a59f5d..a069cec0 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -492,7 +492,7 @@ class PGVectorStorage(BaseVectorStorage): await self.db.execute(upsert_sql, data) #################### query method ############### - async def query(self, query: str, top_k: int) -> list[dict[str, Any]]: + async def query(self, query: str, top_k: int, ids: list[str] = None) -> list[dict[str, Any]]: embeddings = await self.embedding_func([query]) embedding = embeddings[0] embedding_string = ",".join(map(str, embedding)) @@ -1387,6 +1387,8 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, + document_id VARCHAR(255) NULL, + chunk_id VARCHAR(255) NULL, CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id) )""" }, @@ -1400,6 +1402,8 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, + document_id VARCHAR(255) NULL, + chunk_id VARCHAR(255) NULL, CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id) )""" }, diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 0554ab76..ae6fd9dc 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1243,6 +1243,7 @@ class LightRAG: embedding_func=self.embedding_func, ), system_prompt=system_prompt, + ids = param.ids ) elif param.mode == "naive": response 
= await naive_query( diff --git a/lightrag/operate.py b/lightrag/operate.py index 30983145..6c0e1e4c 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -602,6 +602,7 @@ async def kg_query( global_config: dict[str, str], hashing_kv: BaseKVStorage | None = None, system_prompt: str | None = None, + ids: list[str] | None = None, ) -> str | AsyncIterator[str]: # Handle cache use_model_func = global_config["llm_model_func"] @@ -649,6 +650,7 @@ async def kg_query( relationships_vdb, text_chunks_db, query_param, + ids ) if query_param.only_need_context: @@ -1016,6 +1018,7 @@ async def _build_query_context( relationships_vdb: BaseVectorStorage, text_chunks_db: BaseKVStorage, query_param: QueryParam, + ids: list[str] = None, ): if query_param.mode == "local": entities_context, relations_context, text_units_context = await _get_node_data( @@ -1032,6 +1035,7 @@ async def _build_query_context( relationships_vdb, text_chunks_db, query_param, + ids = ids ) else: # hybrid mode ll_data, hl_data = await asyncio.gather( @@ -1348,11 +1352,16 @@ async def _get_edge_data( relationships_vdb: BaseVectorStorage, text_chunks_db: BaseKVStorage, query_param: QueryParam, + ids: list[str] | None = None, ): logger.info( f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}" ) - results = await relationships_vdb.query(keywords, top_k=query_param.top_k) + if ids: + #TODO: add ids to the query + results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = ids) + else: + results = await relationships_vdb.query(keywords, top_k=query_param.top_k) if not len(results): return "", "", "" diff --git a/requirements.txt b/requirements.txt index d9a5c68e..088d8843 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,50 @@ -aiohttp -configparser -future - -# Basic modules -gensim -pipmaster -pydantic -python-dotenv - -setuptools -tenacity - -# LLM packages -tiktoken - -# Extra libraries are installed when needed using pipmaster +aioboto3==14.1.0 +aiofiles==24.1.0 +aiohttp==3.11.13 +ascii_colors==0.5.2 +asyncpg==0.30.0 +chromadb==0.6.3 +community==1.0.0b1 +docx==0.2.4 +# faiss +fastapi==0.115.11 +glm==0.4.4 +graspologic==3.4.1 +gunicorn==23.0.0 +httpx==0.28.1 +imgui_bundle==1.6.2 +jsonlines==4.0.0 +llama_index==0.12.22 +moderngl==5.12.0 +motor==3.7.0 +nano_vectordb==0.0.4.3 +neo4j==5.28.1 +nest_asyncio==1.6.0 +networkx==3.4.2 +numpy +openpyxl==3.1.5 +oracledb==3.0.0 +Pillow==11.1.0 +pipmaster==0.4.0 +protobuf +psutil==7.0.0 +psycopg==3.2.5 +psycopg_pool==3.2.6 +pydantic==2.10.6 +pymilvus==2.5.4 +pymongo==4.11.2 +PyPDF2==3.0.1 +python-dotenv==1.0.1 +pyvis==0.3.2 +qdrant_client==1.13.3 +redis==5.2.1 +Requests==2.32.3 +sentence_transformers==3.4.1 +setuptools==75.8.0 +SQLAlchemy==2.0.38 +starlette==0.46.0 +tenacity==9.0.0 +tiktoken==0.9.0 +torch==2.6.0 +transformers==4.49.0 +uvicorn==0.34.0 From bbe139cfebad995efc24a1a3e7f375530c6202cd Mon Sep 17 00:00:00 2001 From: Roy Date: Fri, 7 Mar 2025 20:18:01 +0000 Subject: [PATCH 2/4] Enhance PostgreSQL vector storage with chunk_id support - Updated SQL templates for entity and relationship upsert to include chunk_id - Modified PGVectorStorage methods to add chunk_id when inserting or updating records - Expanded database schema to track chunk-level metadata --- lightrag/kg/postgres_impl.py | 11 +++++++---- tests/test_lightrag_ollama_chat.py | 14 +++++++------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 
a069cec0..3fc05f59 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -438,6 +438,7 @@ class PGVectorStorage(BaseVectorStorage): "entity_name": item["entity_name"], "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), + "chunk_id": item["source_id"], } return upsert_sql, data @@ -450,6 +451,7 @@ class PGVectorStorage(BaseVectorStorage): "target_id": item["tgt_id"], "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), + "chunk_id": item["source_id"] } return upsert_sql, data @@ -1486,8 +1488,9 @@ SQL_TEMPLATES = { content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP """, - "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, content_vector) - VALUES ($1, $2, $3, $4, $5) + "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, + content_vector, chunk_id) + VALUES ($1, $2, $3, $4, $5, $6) ON CONFLICT (workspace,id) DO UPDATE SET entity_name=EXCLUDED.entity_name, content=EXCLUDED.content, @@ -1495,8 +1498,8 @@ SQL_TEMPLATES = { update_time=CURRENT_TIMESTAMP """, "upsert_relationship": """INSERT INTO LIGHTRAG_VDB_RELATION (workspace, id, source_id, - target_id, content, content_vector) - VALUES ($1, $2, $3, $4, $5, $6) + target_id, content, content_vector, chunk_id) + VALUES ($1, $2, $3, $4, $5, $6, $7) ON CONFLICT (workspace,id) DO UPDATE SET source_id=EXCLUDED.source_id, target_id=EXCLUDED.target_id, diff --git a/tests/test_lightrag_ollama_chat.py b/tests/test_lightrag_ollama_chat.py index 80038928..f19e2974 100644 --- a/tests/test_lightrag_ollama_chat.py +++ b/tests/test_lightrag_ollama_chat.py @@ -38,16 +38,16 @@ class McpError(Exception): DEFAULT_CONFIG = { "server": { - "host": "localhost", - "port": 9621, - "model": "lightrag:latest", + "host": "host.docker.internal", + "port": 11434, + "model": "llama3.2:latest", "timeout": 300, "max_retries": 1, "retry_delay": 1, }, "test_cases": { - "basic": {"query": "唐僧有几个徒弟"}, - "generate": {"query": "电视剧西游记导演是谁"}, + "basic": {"query": "How many disciples did Tang Seng have?"}, + "generate": {"query": "Who directed the TV series Journey to the West?"}, }, } @@ -763,8 +763,8 @@ def parse_args() -> argparse.Namespace: Configuration file (config.json): { "server": { - "host": "localhost", # Server address - "port": 9621, # Server port + "host": "host.docker.internal", # Server address + "port": 11434, # Server port "model": "lightrag:latest" # Default model name }, "test_cases": { From 528fb11364b231981028c99f723804085f722e0a Mon Sep 17 00:00:00 2001 From: Roy Date: Sat, 8 Mar 2025 15:43:17 +0000 Subject: [PATCH 3/4] Refactor vector query methods to support optional ID filtering - Updated BaseVectorStorage query method signature to accept optional IDs - Modified operate.py to pass query parameter IDs to vector storage queries - Updated PostgreSQL vector storage SQL templates to filter results by document IDs - Removed unused parameters and simplified query logic across multiple files --- lightrag/base.py | 3 +- lightrag/kg/postgres_impl.py | 98 ++++++++++++++++++++++++++++-------- lightrag/lightrag.py | 1 - lightrag/operate.py | 17 +++---- 4 files changed, 85 insertions(+), 34 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index e7ab3127..20fe2a5b 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -108,9 +108,8 @@ class BaseVectorStorage(StorageNameSpace, ABC): embedding_func: EmbeddingFunc cosine_better_than_threshold: float = field(default=0.2) 
meta_fields: set[str] = field(default_factory=set) - @abstractmethod - async def query(self, query: str, top_k: int, ids: list[str] = None) -> list[dict[str, Any]]: + async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: """Query the vector storage and retrieve top_k results.""" @abstractmethod diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 3fc05f59..c1ca7aa9 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -439,6 +439,7 @@ class PGVectorStorage(BaseVectorStorage): "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), "chunk_id": item["source_id"], + #TODO: add document_id } return upsert_sql, data @@ -452,6 +453,7 @@ class PGVectorStorage(BaseVectorStorage): "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), "chunk_id": item["source_id"] + #TODO: add document_id } return upsert_sql, data @@ -494,13 +496,19 @@ class PGVectorStorage(BaseVectorStorage): await self.db.execute(upsert_sql, data) #################### query method ############### - async def query(self, query: str, top_k: int, ids: list[str] = None) -> list[dict[str, Any]]: + async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: embeddings = await self.embedding_func([query]) embedding = embeddings[0] embedding_string = ",".join(map(str, embedding)) + if ids: + formatted_ids = ",".join(f"'{id}'" for id in ids) + else: + formatted_ids = "NULL" + sql = SQL_TEMPLATES[self.base_namespace].format( - embedding_string=embedding_string + embedding_string=embedding_string, + doc_ids=formatted_ids ) params = { "workspace": self.db.workspace, @@ -1389,7 +1397,6 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, - document_id VARCHAR(255) NULL, chunk_id VARCHAR(255) NULL, CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id) )""" @@ -1404,7 +1411,6 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, - document_id VARCHAR(255) NULL, chunk_id VARCHAR(255) NULL, CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id) )""" @@ -1507,21 +1513,21 @@ SQL_TEMPLATES = { content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP """, # SQL for VectorStorage - "entities": """SELECT entity_name FROM - (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_VDB_ENTITY where workspace=$1) - WHERE distance>$2 ORDER BY distance DESC LIMIT $3 - """, - "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM - (SELECT id, source_id,target_id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_VDB_RELATION where workspace=$1) - WHERE distance>$2 ORDER BY distance DESC LIMIT $3 - """, - "chunks": """SELECT id FROM - (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_DOC_CHUNKS where workspace=$1) - WHERE distance>$2 ORDER BY distance DESC LIMIT $3 - """, + # "entities": """SELECT entity_name FROM + # (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance + # FROM LIGHTRAG_VDB_ENTITY where workspace=$1) + # WHERE distance>$2 ORDER BY distance DESC LIMIT $3 + # """, + # "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM + # (SELECT id, source_id,target_id, 1 - (content_vector <=> 
'[{embedding_string}]'::vector) as distance + # FROM LIGHTRAG_VDB_RELATION where workspace=$1) + # WHERE distance>$2 ORDER BY distance DESC LIMIT $3 + # """, + # "chunks": """SELECT id FROM + # (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance + # FROM LIGHTRAG_DOC_CHUNKS where workspace=$1) + # WHERE distance>$2 ORDER BY distance DESC LIMIT $3 + # """, # DROP tables "drop_all": """ DROP TABLE IF EXISTS LIGHTRAG_DOC_FULL CASCADE; @@ -1545,4 +1551,56 @@ SQL_TEMPLATES = { "drop_vdb_relation": """ DROP TABLE IF EXISTS LIGHTRAG_VDB_RELATION CASCADE; """, -} + "relationships": """ + WITH relevant_chunks AS ( + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS + WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) + ) + SELECT source_id as src_id, target_id as tgt_id + FROM ( + SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance + FROM LIGHTRAG_VDB_RELATION r + WHERE r.workspace=$1 + AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks) + ) filtered + WHERE distance>$2 + ORDER BY distance DESC + LIMIT $3 + """, + "entities": + ''' + WITH relevant_chunks AS ( + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS + WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) + ) + SELECT entity_name FROM + ( + SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance + FROM LIGHTRAG_VDB_ENTITY + where workspace=$1 + AND chunk_id IN (SELECT chunk_id FROM relevant_chunks) + ) + WHERE distance>$2 + ORDER BY distance DESC + LIMIT $3 + ''', + 'chunks': """ + WITH relevant_chunks AS ( + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS + WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) + ) + SELECT id FROM + ( + SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance + FROM LIGHTRAG_DOC_CHUNKS + where workspace=$1 + AND chunk_id IN (SELECT chunk_id FROM relevant_chunks) + ) + WHERE distance>$2 + ORDER BY distance DESC + LIMIT $3 + """ +} \ No newline at end of file diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index ae6fd9dc..0554ab76 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1243,7 +1243,6 @@ class LightRAG: embedding_func=self.embedding_func, ), system_prompt=system_prompt, - ids = param.ids ) elif param.mode == "naive": response = await naive_query( diff --git a/lightrag/operate.py b/lightrag/operate.py index 6c0e1e4c..7910917a 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -602,7 +602,6 @@ async def kg_query( global_config: dict[str, str], hashing_kv: BaseKVStorage | None = None, system_prompt: str | None = None, - ids: list[str] | None = None, ) -> str | AsyncIterator[str]: # Handle cache use_model_func = global_config["llm_model_func"] @@ -650,7 +649,6 @@ async def kg_query( relationships_vdb, text_chunks_db, query_param, - ids ) if query_param.only_need_context: @@ -1035,7 +1033,6 @@ async def _build_query_context( relationships_vdb, text_chunks_db, query_param, - ids = ids ) else: # hybrid mode ll_data, hl_data = await asyncio.gather( @@ -1104,7 +1101,9 @@ async def _get_node_data( logger.info( f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}" ) - results = await entities_vdb.query(query, top_k=query_param.top_k) + + results = await entities_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids) + if not len(results): return "", "", "" # get entity information @@ -1352,16 +1351,12 @@ async def _get_edge_data( 
relationships_vdb: BaseVectorStorage, text_chunks_db: BaseKVStorage, query_param: QueryParam, - ids: list[str] | None = None, ): logger.info( f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}" ) - if ids: - #TODO: add ids to the query - results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = ids) - else: - results = await relationships_vdb.query(keywords, top_k=query_param.top_k) + + results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = query_param.ids) if not len(results): return "", "", "" @@ -1610,7 +1605,7 @@ async def naive_query( if cached_response is not None: return cached_response - results = await chunks_vdb.query(query, top_k=query_param.top_k) + results = await chunks_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids) if not len(results): return PROMPTS["fail_response"] From e31c0c8f6c7e4be70245b59556cadef76fad0706 Mon Sep 17 00:00:00 2001 From: Roy Date: Sat, 8 Mar 2025 20:25:20 +0000 Subject: [PATCH 4/4] Update vector query methods to support ID filtering in PostgreSQL - Modified `mix_kg_vector_query` in operate.py to pass optional IDs to vector search - Updated PostgreSQL SQL template to filter results using document IDs instead of chunk_id - Improved query flexibility by allowing precise document selection during vector search --- lightrag/kg/postgres_impl.py | 2 +- lightrag/operate.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index c1ca7aa9..b0284e91 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -1597,7 +1597,7 @@ SQL_TEMPLATES = { SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance FROM LIGHTRAG_DOC_CHUNKS where workspace=$1 - AND chunk_id IN (SELECT chunk_id FROM relevant_chunks) + AND id IN (SELECT chunk_id FROM relevant_chunks) ) WHERE distance>$2 ORDER BY distance DESC diff --git a/lightrag/operate.py b/lightrag/operate.py index 7910917a..4d440ca5 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -892,7 +892,8 @@ async def mix_kg_vector_query( try: # Reduce top_k for vector search in hybrid mode since we have structured information from KG mix_topk = min(10, query_param.top_k) - results = await chunks_vdb.query(augmented_query, top_k=mix_topk) + # TODO: add ids to the query + results = await chunks_vdb.query(augmented_query, top_k=mix_topk, ids = query_param.ids) if not results: return None
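
Usage note (a minimal sketch, not part of the patches above): after this series, callers restrict retrieval to specific documents by setting the new `ids` field on QueryParam; the PostgreSQL vector storage then limits entity, relationship, and chunk matches to chunks whose full_doc_id is in that list. Everything below that is not shown in the diffs is an assumption — the working directory, the stub model functions, the document IDs, the choice of `vector_storage="PGVectorStorage"` (the only backend this series updates, configured via environment variables), and any storage initialization calls, which may differ by LightRAG version.

    import asyncio

    import numpy as np

    from lightrag import LightRAG, QueryParam
    from lightrag.utils import EmbeddingFunc


    # Stub model functions so the sketch is self-contained; a real deployment
    # plugs in an actual LLM and embedding model here.
    async def stub_llm(prompt: str, **kwargs) -> str:
        return "stub answer"


    async def stub_embed(texts: list[str]) -> np.ndarray:
        return np.random.rand(len(texts), 384)


    async def main() -> None:
        rag = LightRAG(
            working_dir="./rag_storage",          # hypothetical path
            llm_model_func=stub_llm,
            embedding_func=EmbeddingFunc(
                embedding_dim=384, max_token_size=8192, func=stub_embed
            ),
            # Only the PostgreSQL backend gains ID filtering in this series;
            # connection settings are expected in the environment.
            vector_storage="PGVectorStorage",
        )
        # Depending on the LightRAG version, storage initialization calls may
        # be required here before querying.

        # Restrict retrieval to two previously inserted documents; the IDs are
        # hypothetical placeholders for real full_doc_id values.
        param = QueryParam(mode="local", ids=["doc-001", "doc-002"])
        print(await rag.aquery("What changed in the renewal clause?", param=param))


    asyncio.run(main())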
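
For reference on how the filter reaches SQL (PGVectorStorage.query in the third patch): when `ids` is provided, each value is quoted and joined; otherwise the literal NULL is substituted so the `{doc_ids} IS NULL` branch keeps every chunk, and the result is formatted into the `relevant_chunks` CTE of the entities/relationships/chunks templates. A self-contained sketch of just that composition, with a hypothetical document ID:

    # Standalone illustration of the ID formatting used by PGVectorStorage.query
    # in this series; it only builds the filtering CTE, nothing is executed.
    FILTER_TEMPLATE = (
        "WITH relevant_chunks AS (\n"
        "    SELECT id AS chunk_id\n"
        "    FROM LIGHTRAG_DOC_CHUNKS\n"
        "    WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])\n"
        ")"
    )


    def render_filter(ids: list[str] | None) -> str:
        # Same formatting as the patch: quote each ID, or fall back to NULL so
        # the IS NULL branch disables filtering altogether.
        formatted_ids = ",".join(f"'{doc_id}'" for doc_id in ids) if ids else "NULL"
        return FILTER_TEMPLATE.format(doc_ids=formatted_ids)


    print(render_filter(["doc-001"]))  # restricts matches to one document
    print(render_filter(None))         # NULL IS NULL is true, so no restriction

Because the IDs are interpolated as string literals rather than bound query parameters, they are best limited to trusted, internally generated document IDs rather than raw user input.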