Refactor requirements and code formatting

- Simplified requirements.txt by removing specific version constraints
- Added a comment noting that extra libraries are installed on demand via pipmaster
- Improved code formatting in base.py, operate.py, and postgres_impl.py
- Cleaned up SQL templates and query method signatures with consistent formatting
This commit is contained in:
Roy
2025-03-10 15:39:18 +00:00
parent 3fa6d8757a
commit 92ae895713
4 changed files with 48 additions and 85 deletions

View File

@@ -108,8 +108,11 @@ class BaseVectorStorage(StorageNameSpace, ABC):
embedding_func: EmbeddingFunc embedding_func: EmbeddingFunc
cosine_better_than_threshold: float = field(default=0.2) cosine_better_than_threshold: float = field(default=0.2)
meta_fields: set[str] = field(default_factory=set) meta_fields: set[str] = field(default_factory=set)
@abstractmethod @abstractmethod
async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: async def query(
self, query: str, top_k: int, ids: list[str] | None = None
) -> list[dict[str, Any]]:
"""Query the vector storage and retrieve top_k results.""" """Query the vector storage and retrieve top_k results."""
@abstractmethod @abstractmethod

View File

@@ -439,7 +439,7 @@ class PGVectorStorage(BaseVectorStorage):
"content": item["content"], "content": item["content"],
"content_vector": json.dumps(item["__vector__"].tolist()), "content_vector": json.dumps(item["__vector__"].tolist()),
"chunk_id": item["source_id"], "chunk_id": item["source_id"],
#TODO: add document_id # TODO: add document_id
} }
return upsert_sql, data return upsert_sql, data
@@ -452,8 +452,8 @@ class PGVectorStorage(BaseVectorStorage):
"target_id": item["tgt_id"], "target_id": item["tgt_id"],
"content": item["content"], "content": item["content"],
"content_vector": json.dumps(item["__vector__"].tolist()), "content_vector": json.dumps(item["__vector__"].tolist()),
"chunk_id": item["source_id"] "chunk_id": item["source_id"],
#TODO: add document_id # TODO: add document_id
} }
return upsert_sql, data return upsert_sql, data
@@ -496,7 +496,9 @@ class PGVectorStorage(BaseVectorStorage):
await self.db.execute(upsert_sql, data) await self.db.execute(upsert_sql, data)
#################### query method ############### #################### query method ###############
async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]: async def query(
self, query: str, top_k: int, ids: list[str] | None = None
) -> list[dict[str, Any]]:
embeddings = await self.embedding_func([query]) embeddings = await self.embedding_func([query])
embedding = embeddings[0] embedding = embeddings[0]
embedding_string = ",".join(map(str, embedding)) embedding_string = ",".join(map(str, embedding))
@@ -505,10 +507,9 @@ class PGVectorStorage(BaseVectorStorage):
formatted_ids = ",".join(f"'{id}'" for id in ids) formatted_ids = ",".join(f"'{id}'" for id in ids)
else: else:
formatted_ids = "NULL" formatted_ids = "NULL"
sql = SQL_TEMPLATES[self.base_namespace].format( sql = SQL_TEMPLATES[self.base_namespace].format(
embedding_string=embedding_string, embedding_string=embedding_string, doc_ids=formatted_ids
doc_ids=formatted_ids
) )
params = { params = {
"workspace": self.db.workspace, "workspace": self.db.workspace,
@@ -1598,7 +1599,7 @@ SQL_TEMPLATES = {
content_vector=EXCLUDED.content_vector, content_vector=EXCLUDED.content_vector,
update_time = CURRENT_TIMESTAMP update_time = CURRENT_TIMESTAMP
""", """,
"upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
content_vector, chunk_id) content_vector, chunk_id)
VALUES ($1, $2, $3, $4, $5, $6) VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (workspace,id) DO UPDATE ON CONFLICT (workspace,id) DO UPDATE
@@ -1657,54 +1658,53 @@ SQL_TEMPLATES = {
""", """,
"relationships": """ "relationships": """
WITH relevant_chunks AS ( WITH relevant_chunks AS (
SELECT id as chunk_id SELECT id as chunk_id
FROM LIGHTRAG_DOC_CHUNKS FROM LIGHTRAG_DOC_CHUNKS
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
) )
SELECT source_id as src_id, target_id as tgt_id SELECT source_id as src_id, target_id as tgt_id
FROM ( FROM (
SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_VDB_RELATION r FROM LIGHTRAG_VDB_RELATION r
WHERE r.workspace=$1 WHERE r.workspace=$1
AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks) AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks)
) filtered ) filtered
WHERE distance>$2 WHERE distance>$2
ORDER BY distance DESC ORDER BY distance DESC
LIMIT $3 LIMIT $3
""", """,
"entities": "entities": """
'''
WITH relevant_chunks AS ( WITH relevant_chunks AS (
SELECT id as chunk_id SELECT id as chunk_id
FROM LIGHTRAG_DOC_CHUNKS FROM LIGHTRAG_DOC_CHUNKS
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
) )
SELECT entity_name FROM SELECT entity_name FROM
( (
SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_VDB_ENTITY FROM LIGHTRAG_VDB_ENTITY
where workspace=$1 where workspace=$1
AND chunk_id IN (SELECT chunk_id FROM relevant_chunks) AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
) )
WHERE distance>$2 WHERE distance>$2
ORDER BY distance DESC ORDER BY distance DESC
LIMIT $3 LIMIT $3
''', """,
'chunks': """ "chunks": """
WITH relevant_chunks AS ( WITH relevant_chunks AS (
SELECT id as chunk_id SELECT id as chunk_id
FROM LIGHTRAG_DOC_CHUNKS FROM LIGHTRAG_DOC_CHUNKS
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}]) WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
) )
SELECT id FROM SELECT id FROM
( (
SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_DOC_CHUNKS FROM LIGHTRAG_DOC_CHUNKS
where workspace=$1 where workspace=$1
AND id IN (SELECT chunk_id FROM relevant_chunks) AND id IN (SELECT chunk_id FROM relevant_chunks)
) )
WHERE distance>$2 WHERE distance>$2
ORDER BY distance DESC ORDER BY distance DESC
LIMIT $3 LIMIT $3
""" """,
} }

View File

@@ -893,7 +893,9 @@ async def mix_kg_vector_query(
# Reduce top_k for vector search in hybrid mode since we have structured information from KG # Reduce top_k for vector search in hybrid mode since we have structured information from KG
mix_topk = min(10, query_param.top_k) mix_topk = min(10, query_param.top_k)
# TODO: add ids to the query # TODO: add ids to the query
results = await chunks_vdb.query(augmented_query, top_k=mix_topk, ids = query_param.ids) results = await chunks_vdb.query(
augmented_query, top_k=mix_topk, ids=query_param.ids
)
if not results: if not results:
return None return None
@@ -1102,7 +1104,9 @@ async def _get_node_data(
f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}" f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}"
) )
results = await entities_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids) results = await entities_vdb.query(
query, top_k=query_param.top_k, ids=query_param.ids
)
if not len(results): if not len(results):
return "", "", "" return "", "", ""
@@ -1357,7 +1361,9 @@ async def _get_edge_data(
f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}" f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
) )
results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = query_param.ids) results = await relationships_vdb.query(
keywords, top_k=query_param.top_k, ids=query_param.ids
)
if not len(results): if not len(results):
return "", "", "" return "", "", ""
@@ -1606,7 +1612,9 @@ async def naive_query(
if cached_response is not None: if cached_response is not None:
return cached_response return cached_response
results = await chunks_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids) results = await chunks_vdb.query(
query, top_k=query_param.top_k, ids=query_param.ids
)
if not len(results): if not len(results):
return PROMPTS["fail_response"] return PROMPTS["fail_response"]

View File

@@ -1,53 +1,3 @@
aioboto3==14.1.0
aiofiles==24.1.0
aiohttp==3.11.13
ascii_colors==0.5.2
asyncpg==0.30.0
chromadb==0.6.3
community==1.0.0b1
docx==0.2.4
# faiss
fastapi==0.115.11
glm==0.4.4
graspologic==3.4.1
gunicorn==23.0.0
httpx==0.28.1
imgui_bundle==1.6.2
jsonlines==4.0.0
llama_index==0.12.22
moderngl==5.12.0
motor==3.7.0
nano_vectordb==0.0.4.3
neo4j==5.28.1
nest_asyncio==1.6.0
networkx==3.4.2
numpy
openpyxl==3.1.5
oracledb==3.0.0
Pillow==11.1.0
pipmaster==0.4.0
protobuf
psutil==7.0.0
psycopg==3.2.5
psycopg_pool==3.2.6
pydantic==2.10.6
pymilvus==2.5.4
pymongo==4.11.2
PyPDF2==3.0.1
python-dotenv==1.0.1
pyvis==0.3.2
qdrant_client==1.13.3
redis==5.2.1
Requests==2.32.3
sentence_transformers==3.4.1
setuptools==75.8.0
SQLAlchemy==2.0.38
starlette==0.46.0
tenacity==9.0.0
tiktoken==0.9.0
torch==2.6.0
transformers==4.49.0
uvicorn==0.34.0
aiohttp aiohttp
configparser configparser
future future
@@ -63,3 +13,5 @@ tenacity
# LLM packages # LLM packages
tiktoken tiktoken
# Extra libraries are installed when needed using pipmaster