From 02ac96ff8e4e98797b5a6175a68e86cfe35fab15 Mon Sep 17 00:00:00 2001 From: Samuel Chan Date: Sun, 2 Feb 2025 18:20:32 +0800 Subject: [PATCH] - Fix the bug from main stream that using doc['status'] - Improve the performance of Apache AGE. - Revise the README.md for Apache AGE indexing. --- README.md | 33 +++++++++++++++++++++++++++++++-- lightrag/kg/postgres_impl.py | 23 ++++++++++++++++++++--- lightrag/lightrag.py | 2 +- 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index dd215b04..950c5c5a 100644 --- a/README.md +++ b/README.md @@ -455,9 +455,38 @@ For production level scenarios you will most likely want to leverage an enterpri * If you prefer docker, please start with this image if you are a beginner to avoid hiccups (DO read the overview): https://hub.docker.com/r/shangor/postgres-for-rag * How to start? Ref to: [examples/lightrag_zhipu_postgres_demo.py](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_zhipu_postgres_demo.py) * Create index for AGE example: (Change below `dickens` to your graph name if necessary) - ``` + ```sql + load 'age'; SET search_path = ag_catalog, "$user", public; - CREATE INDEX idx_entity ON dickens."Entity" USING gin (agtype_access_operator(properties, '"node_id"')); + CREATE INDEX CONCURRENTLY entity_p_idx ON dickens."Entity" (id); + CREATE INDEX CONCURRENTLY vertex_p_idx ON dickens."_ag_label_vertex" (id); + CREATE INDEX CONCURRENTLY directed_p_idx ON dickens."DIRECTED" (id); + CREATE INDEX CONCURRENTLY directed_eid_idx ON dickens."DIRECTED" (end_id); + CREATE INDEX CONCURRENTLY directed_sid_idx ON dickens."DIRECTED" (start_id); + CREATE INDEX CONCURRENTLY directed_seid_idx ON dickens."DIRECTED" (start_id,end_id); + CREATE INDEX CONCURRENTLY edge_p_idx ON dickens."_ag_label_edge" (id); + CREATE INDEX CONCURRENTLY edge_sid_idx ON dickens."_ag_label_edge" (start_id); + CREATE INDEX CONCURRENTLY edge_eid_idx ON dickens."_ag_label_edge" (end_id); + CREATE INDEX CONCURRENTLY 
edge_seid_idx ON dickens."_ag_label_edge" (start_id,end_id); + create INDEX CONCURRENTLY vertex_idx_node_id ON dickens."_ag_label_vertex" (ag_catalog.agtype_access_operator(properties, '"node_id"'::agtype)); + create INDEX CONCURRENTLY entity_idx_node_id ON dickens."Entity" (ag_catalog.agtype_access_operator(properties, '"node_id"'::agtype)); + CREATE INDEX CONCURRENTLY entity_node_id_gin_idx ON dickens."Entity" using gin(properties); + ALTER TABLE dickens."DIRECTED" CLUSTER ON directed_sid_idx; + + -- drop if necessary + drop INDEX dickens.entity_p_idx; + drop INDEX dickens.vertex_p_idx; + drop INDEX dickens.directed_p_idx; + drop INDEX dickens.directed_eid_idx; + drop INDEX dickens.directed_sid_idx; + drop INDEX dickens.directed_seid_idx; + drop INDEX dickens.edge_p_idx; + drop INDEX dickens.edge_sid_idx; + drop INDEX dickens.edge_eid_idx; + drop INDEX dickens.edge_seid_idx; + drop INDEX dickens.vertex_idx_node_id; + drop INDEX dickens.entity_idx_node_id; + drop INDEX dickens.entity_node_id_gin_idx; ``` * Known issue of the Apache AGE: The released versions got below issue: > You might find that the properties of the nodes/edges are empty. 
diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py
index b315abca..af62c522 100644
--- a/lightrag/kg/postgres_impl.py
+++ b/lightrag/kg/postgres_impl.py
@@ -30,6 +30,7 @@ from ..base import (
     DocStatus,
     DocProcessingStatus,
     BaseGraphStorage,
+    T,
 )
 
 if sys.platform.startswith("win"):
@@ -442,6 +443,22 @@ class PGDocStatusStorage(DocStatusStorage):
         existed = set([element["id"] for element in result])
         return set(data) - existed
 
+    async def get_by_id(self, id: str) -> Union[T, None]:
+        sql = "select * from LIGHTRAG_DOC_STATUS where workspace=$1 and id=$2"
+        params = {"workspace": self.db.workspace, "id": id}
+        result = await self.db.query(sql, params, True)
+        if not result:  # None and an empty row list both mean "not found"
+            return None
+        else:
+            return DocProcessingStatus(
+                content_length=result[0]["content_length"],
+                content_summary=result[0]["content_summary"],
+                status=result[0]["status"],
+                chunks_count=result[0]["chunks_count"],
+                created_at=result[0]["created_at"],
+                updated_at=result[0]["updated_at"],
+            )
+
     async def get_status_counts(self) -> Dict[str, int]:
         """Get counts of documents in each status"""
         sql = """SELECT status as "status", COUNT(1) as "count"
@@ -884,9 +901,9 @@ class PGGraphStorage(BaseGraphStorage):
 
         query = """SELECT * FROM cypher('%s', $$
                       MATCH (n:Entity {node_id: "%s"})
-                      OPTIONAL MATCH (n)-[r]-(connected)
-                      RETURN n, r, connected
-                    $$) AS (n agtype, r agtype, connected agtype)""" % (
+                      OPTIONAL MATCH (n)-[]-(connected)
+                      RETURN n, connected
+                    $$) AS (n agtype, connected agtype)""" % (
             self.graph_name,
             label,
         )
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 22db6994..6c022d95 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -373,7 +373,7 @@ class LightRAG:
             doc_id
             for doc_id in new_docs.keys()
             if (current_doc := await self.doc_status.get_by_id(doc_id)) is None
-            or current_doc["status"] == DocStatus.FAILED
+            or current_doc.status == DocStatus.FAILED
         }
         new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
 