From 92ae895713a2aebed3dabf0ef656a44872d61de4 Mon Sep 17 00:00:00 2001 From: Roy Date: Mon, 10 Mar 2025 15:39:18 +0000 Subject: [PATCH] Refactor requirements and code formatting - Simplified requirements.txt by removing specific version constraints - Added comment about extra library installation using pipmaster - Improved code formatting in base.py, operate.py, and postgres_impl.py - Cleaned up SQL templates and query method signatures with consistent formatting --- lightrag/base.py | 5 ++- lightrag/kg/postgres_impl.py | 60 ++++++++++++++++++------------------ lightrag/operate.py | 16 +++++++--- requirements.txt | 52 ++----------------------------- 4 files changed, 48 insertions(+), 85 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index 61787efc..c84c7c62 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -108,8 +108,11 @@ class BaseVectorStorage(StorageNameSpace, ABC): embedding_func: EmbeddingFunc cosine_better_than_threshold: float = field(default=0.2) meta_fields: set[str] = field(default_factory=set) + @abstractmethod - async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: + async def query( + self, query: str, top_k: int, ids: list[str] | None = None + ) -> list[dict[str, Any]]: """Query the vector storage and retrieve top_k results.""" @abstractmethod diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index ad794e3b..1d525bdb 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -439,7 +439,7 @@ class PGVectorStorage(BaseVectorStorage): "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), "chunk_id": item["source_id"], - #TODO: add document_id + # TODO: add document_id } return upsert_sql, data @@ -452,8 +452,8 @@ class PGVectorStorage(BaseVectorStorage): "target_id": item["tgt_id"], "content": item["content"], "content_vector": json.dumps(item["__vector__"].tolist()), - "chunk_id": item["source_id"] - #TODO: add document_id + "chunk_id": item["source_id"], + # TODO: add document_id } return upsert_sql, data @@ -496,7 +496,9 @@ class PGVectorStorage(BaseVectorStorage): await self.db.execute(upsert_sql, data) #################### query method ############### - async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: + async def query( + self, query: str, top_k: int, ids: list[str] | None = None + ) -> list[dict[str, Any]]: embeddings = await self.embedding_func([query]) embedding = embeddings[0] embedding_string = ",".join(map(str, embedding)) @@ -505,10 +507,9 @@ class PGVectorStorage(BaseVectorStorage): formatted_ids = ",".join(f"'{id}'" for id in ids) else: formatted_ids = "NULL" - + sql = SQL_TEMPLATES[self.base_namespace].format( - embedding_string=embedding_string, - doc_ids=formatted_ids + embedding_string=embedding_string, doc_ids=formatted_ids ) params = { "workspace": self.db.workspace, @@ -1598,7 +1599,7 @@ SQL_TEMPLATES = { content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP """, - "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, + "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, content_vector, chunk_id) VALUES ($1, $2, $3, $4, $5, $6) ON CONFLICT (workspace,id) DO UPDATE @@ -1657,54 +1658,53 @@ SQL_TEMPLATES = { """, "relationships": """ WITH relevant_chunks AS ( - SELECT id as chunk_id - FROM LIGHTRAG_DOC_CHUNKS + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS WHERE {doc_ids} 
IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) ) - SELECT source_id as src_id, target_id as tgt_id + SELECT source_id as src_id, target_id as tgt_id FROM ( SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance FROM LIGHTRAG_VDB_RELATION r - WHERE r.workspace=$1 + WHERE r.workspace=$1 AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks) ) filtered - WHERE distance>$2 - ORDER BY distance DESC + WHERE distance>$2 + ORDER BY distance DESC LIMIT $3 """, - "entities": - ''' + "entities": """ WITH relevant_chunks AS ( - SELECT id as chunk_id - FROM LIGHTRAG_DOC_CHUNKS + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) ) SELECT entity_name FROM ( SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_VDB_ENTITY + FROM LIGHTRAG_VDB_ENTITY where workspace=$1 AND chunk_id IN (SELECT chunk_id FROM relevant_chunks) ) - WHERE distance>$2 - ORDER BY distance DESC + WHERE distance>$2 + ORDER BY distance DESC LIMIT $3 - ''', - 'chunks': """ + """, + "chunks": """ WITH relevant_chunks AS ( - SELECT id as chunk_id - FROM LIGHTRAG_DOC_CHUNKS + SELECT id as chunk_id + FROM LIGHTRAG_DOC_CHUNKS WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) ) SELECT id FROM ( SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance - FROM LIGHTRAG_DOC_CHUNKS + FROM LIGHTRAG_DOC_CHUNKS where workspace=$1 AND id IN (SELECT chunk_id FROM relevant_chunks) ) - WHERE distance>$2 - ORDER BY distance DESC + WHERE distance>$2 + ORDER BY distance DESC LIMIT $3 - """ -} \ No newline at end of file + """, +} diff --git a/lightrag/operate.py b/lightrag/operate.py index 3c5ed329..5e90a77b 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -893,7 +893,9 @@ async def mix_kg_vector_query( # Reduce top_k for vector search in hybrid mode since we have structured information from KG mix_topk = min(10, query_param.top_k) # TODO: add ids to the query - results = await chunks_vdb.query(augmented_query, top_k=mix_topk, ids = query_param.ids) + results = await chunks_vdb.query( + augmented_query, top_k=mix_topk, ids=query_param.ids + ) if not results: return None @@ -1102,7 +1104,9 @@ async def _get_node_data( f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}" ) - results = await entities_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids) + results = await entities_vdb.query( + query, top_k=query_param.top_k, ids=query_param.ids + ) if not len(results): return "", "", "" @@ -1357,7 +1361,9 @@ async def _get_edge_data( f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}" ) - results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = query_param.ids) + results = await relationships_vdb.query( + keywords, top_k=query_param.top_k, ids=query_param.ids + ) if not len(results): return "", "", "" @@ -1606,7 +1612,9 @@ async def naive_query( if cached_response is not None: return cached_response - results = await chunks_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids) + results = await chunks_vdb.query( + query, top_k=query_param.top_k, ids=query_param.ids + ) if not len(results): return PROMPTS["fail_response"] diff --git a/requirements.txt b/requirements.txt index 3f6f3668..d9a5c68e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,53 +1,3 @@ -aioboto3==14.1.0 
-aiofiles==24.1.0 -aiohttp==3.11.13 -ascii_colors==0.5.2 -asyncpg==0.30.0 -chromadb==0.6.3 -community==1.0.0b1 -docx==0.2.4 -# faiss -fastapi==0.115.11 -glm==0.4.4 -graspologic==3.4.1 -gunicorn==23.0.0 -httpx==0.28.1 -imgui_bundle==1.6.2 -jsonlines==4.0.0 -llama_index==0.12.22 -moderngl==5.12.0 -motor==3.7.0 -nano_vectordb==0.0.4.3 -neo4j==5.28.1 -nest_asyncio==1.6.0 -networkx==3.4.2 -numpy -openpyxl==3.1.5 -oracledb==3.0.0 -Pillow==11.1.0 -pipmaster==0.4.0 -protobuf -psutil==7.0.0 -psycopg==3.2.5 -psycopg_pool==3.2.6 -pydantic==2.10.6 -pymilvus==2.5.4 -pymongo==4.11.2 -PyPDF2==3.0.1 -python-dotenv==1.0.1 -pyvis==0.3.2 -qdrant_client==1.13.3 -redis==5.2.1 -Requests==2.32.3 -sentence_transformers==3.4.1 -setuptools==75.8.0 -SQLAlchemy==2.0.38 -starlette==0.46.0 -tenacity==9.0.0 -tiktoken==0.9.0 -torch==2.6.0 -transformers==4.49.0 -uvicorn==0.34.0 aiohttp configparser future @@ -63,3 +13,5 @@ tenacity # LLM packages tiktoken + +# Extra libraries are installed when needed using pipmaster
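
Usage note on the widened query signature (a sketch, not part of the patch): BaseVectorStorage.query() now accepts an optional ids argument, and the Postgres templates use it to restrict retrieval to chunks whose full_doc_id is in the given list; when ids is None the templates substitute NULL, so the filter passes for every chunk. A minimal caller under those assumptions follows; the names search_within_docs, vector_storage, and the commented-out chunks_vdb call are illustrative only.

from typing import Any

from lightrag.base import BaseVectorStorage


async def search_within_docs(
    vector_storage: BaseVectorStorage,
    question: str,
    doc_ids: list[str] | None,
) -> list[dict[str, Any]]:
    # ids=None keeps the previous behaviour (search across all documents);
    # a list of document ids limits results to chunks from those documents.
    return await vector_storage.query(question, top_k=5, ids=doc_ids)


# e.g. asyncio.run(search_within_docs(chunks_vdb, "What changed?", ["doc-1"]))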
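
On the requirements.txt change: the pinned, backend-specific packages are removed and the trailing comment states that extra libraries are installed when needed using pipmaster. A rough sketch of that on-demand pattern, assuming pipmaster exposes is_installed() and install() helpers (an assumption about that library's API); asyncpg is used only as an example of a backend-specific dependency that no longer appears in requirements.txt.

import pipmaster as pm

# Install the optional dependency only when the backend that needs it is used.
if not pm.is_installed("asyncpg"):
    pm.install("asyncpg")

import asyncpg  # noqa: E402  (import deferred until the guard above has run)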