fix postgres support

Persist file_path end-to-end in the PostgreSQL backend: include it in the
PGVectorStorage upserts for chunks, entities, and relationships and in the
PGDocStatusStorage upsert; add a file_path column to the LIGHTRAG_DOC_CHUNKS,
LIGHTRAG_VDB_ENTITY, LIGHTRAG_VDB_RELATION, and LIGHTRAG_DOC_STATUS table
definitions and SQL templates; and read file_path and created_at from the top
level of entity/relationship records in extract_entities instead of a nested
metadata dict.

Author: zrguo
Date: 2025-03-17 23:59:47 +08:00
Commit: dfd19b8d27 (parent 6115f60072)
2 changed files with 28 additions and 18 deletions

Changed file 1 of 2: PostgreSQL storage implementation (PGVectorStorage, PGDocStatusStorage, TABLES, SQL_TEMPLATES)

@@ -423,6 +423,7 @@ class PGVectorStorage(BaseVectorStorage):
 "full_doc_id": item["full_doc_id"],
 "content": item["content"],
 "content_vector": json.dumps(item["__vector__"].tolist()),
+"file_path": item["file_path"],
 }
 except Exception as e:
 logger.error(f"Error to prepare upsert,\nsql: {e}\nitem: {item}")
@@ -445,6 +446,7 @@ class PGVectorStorage(BaseVectorStorage):
 "content": item["content"],
 "content_vector": json.dumps(item["__vector__"].tolist()),
 "chunk_ids": chunk_ids,
+"file_path": item["file_path"],
 # TODO: add document_id
 }
 return upsert_sql, data
@@ -465,6 +467,7 @@ class PGVectorStorage(BaseVectorStorage):
 "content": item["content"],
 "content_vector": json.dumps(item["__vector__"].tolist()),
 "chunk_ids": chunk_ids,
+"file_path": item["file_path"],
 # TODO: add document_id
 }
 return upsert_sql, data
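Each of the three prepare helpers above now reads item["file_path"] directly, so every record handed to PGVectorStorage.upsert must carry that key. A minimal sketch of the chunk record these hunks assume (keys taken from this diff and from the upsert_chunk template further down; "__id__" as the record-id key and all concrete values are assumptions):

import numpy as np

chunk_item = {
    "__id__": "chunk-0001",            # assumed record-id key
    "tokens": 120,
    "chunk_order_index": 0,
    "full_doc_id": "doc-0001",
    "content": "chunk text ...",
    "__vector__": np.zeros(1024, dtype=np.float32),  # serialized via json.dumps(...tolist())
    "file_path": "docs/example.pdf",   # now required; a missing key raises KeyError
}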
@@ -740,6 +743,7 @@ class PGDocStatusStorage(DocStatusStorage):
 chunks_count=result[0]["chunks_count"],
 created_at=result[0]["created_at"],
 updated_at=result[0]["updated_at"],
+file_path=result[0]["file_path"],
 )
 async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
@@ -774,6 +778,7 @@ class PGDocStatusStorage(DocStatusStorage):
 created_at=element["created_at"],
 updated_at=element["updated_at"],
 chunks_count=element["chunks_count"],
+file_path=element["file_path"],
 )
 for element in result
 }
@@ -793,14 +798,15 @@ class PGDocStatusStorage(DocStatusStorage):
 if not data:
 return
-sql = """insert into LIGHTRAG_DOC_STATUS(workspace,id,content,content_summary,content_length,chunks_count,status)
-values($1,$2,$3,$4,$5,$6,$7)
+sql = """insert into LIGHTRAG_DOC_STATUS(workspace,id,content,content_summary,content_length,chunks_count,status,file_path)
+values($1,$2,$3,$4,$5,$6,$7,$8)
 on conflict(id,workspace) do update set
 content = EXCLUDED.content,
 content_summary = EXCLUDED.content_summary,
 content_length = EXCLUDED.content_length,
 chunks_count = EXCLUDED.chunks_count,
 status = EXCLUDED.status,
+file_path = EXCLUDED.file_path,
 updated_at = CURRENT_TIMESTAMP"""
 for k, v in data.items():
 # chunks_count is optional
@@ -814,6 +820,7 @@ class PGDocStatusStorage(DocStatusStorage):
 "content_length": v["content_length"],
 "chunks_count": v["chunks_count"] if "chunks_count" in v else -1,
 "status": v["status"],
+"file_path": v["file_path"],
 },
 )
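Unlike chunks_count, which falls back to -1 when absent, the status upsert reads v["file_path"] with no default, so status records created before this change (or callers that never set the key) would raise a KeyError. If that situation can occur, a fallback in the same style is a small drop-in variant for the line above; the "unknown_source" default mirrors the one used in extract_entities in the second file:

    "file_path": v.get("file_path", "unknown_source"),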
@@ -1549,6 +1556,7 @@ TABLES = {
 tokens INTEGER,
 content TEXT,
 content_vector VECTOR,
+file_path VARCHAR(256),
 create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
 update_time TIMESTAMP,
 CONSTRAINT LIGHTRAG_DOC_CHUNKS_PK PRIMARY KEY (workspace, id)
@@ -1564,6 +1572,7 @@ TABLES = {
 create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
 update_time TIMESTAMP,
 chunk_id TEXT NULL,
+file_path TEXT NULL,
 CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id)
 )"""
 },
@@ -1578,6 +1587,7 @@ TABLES = {
 create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
 update_time TIMESTAMP,
 chunk_id TEXT NULL,
+file_path TEXT NULL,
 CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id)
 )"""
 },
@@ -1602,6 +1612,7 @@ TABLES = {
 content_length int4 NULL,
 chunks_count int4 NULL,
 status varchar(64) NULL,
+file_path TEXT NULL,
 created_at timestamp DEFAULT CURRENT_TIMESTAMP NULL,
 updated_at timestamp DEFAULT CURRENT_TIMESTAMP NULL,
 CONSTRAINT LIGHTRAG_DOC_STATUS_PK PRIMARY KEY (workspace, id)
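These CREATE TABLE definitions only take effect on a fresh database; an installation created before this commit keeps its old schema, and the new INSERT statements below would fail with "column file_path does not exist". A minimal migration sketch, assuming an asyncpg connection, the default table names from this diff, and PostgreSQL 9.6+ for ADD COLUMN IF NOT EXISTS:

import asyncio
import asyncpg

MIGRATION = [
    "ALTER TABLE LIGHTRAG_DOC_CHUNKS   ADD COLUMN IF NOT EXISTS file_path VARCHAR(256)",
    "ALTER TABLE LIGHTRAG_VDB_ENTITY   ADD COLUMN IF NOT EXISTS file_path TEXT NULL",
    "ALTER TABLE LIGHTRAG_VDB_RELATION ADD COLUMN IF NOT EXISTS file_path TEXT NULL",
    "ALTER TABLE LIGHTRAG_DOC_STATUS   ADD COLUMN IF NOT EXISTS file_path TEXT NULL",
]

async def migrate(dsn: str) -> None:
    # Add the new file_path columns to an existing deployment in one transaction.
    conn = await asyncpg.connect(dsn)
    try:
        async with conn.transaction():
            for stmt in MIGRATION:
                await conn.execute(stmt)
    finally:
        await conn.close()

asyncio.run(migrate("postgresql://user:pass@localhost:5432/lightrag"))  # placeholder DSN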
@@ -1650,35 +1661,38 @@ SQL_TEMPLATES = {
 update_time = CURRENT_TIMESTAMP
 """,
 "upsert_chunk": """INSERT INTO LIGHTRAG_DOC_CHUNKS (workspace, id, tokens,
-chunk_order_index, full_doc_id, content, content_vector)
-VALUES ($1, $2, $3, $4, $5, $6, $7)
+chunk_order_index, full_doc_id, content, content_vector, file_path)
+VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
 ON CONFLICT (workspace,id) DO UPDATE
 SET tokens=EXCLUDED.tokens,
 chunk_order_index=EXCLUDED.chunk_order_index,
 full_doc_id=EXCLUDED.full_doc_id,
 content = EXCLUDED.content,
 content_vector=EXCLUDED.content_vector,
+file_path=EXCLUDED.file_path,
 update_time = CURRENT_TIMESTAMP
 """,
 "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
-content_vector, chunk_ids)
-VALUES ($1, $2, $3, $4, $5, $6::varchar[])
+content_vector, chunk_ids, file_path)
+VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7::varchar[])
 ON CONFLICT (workspace,id) DO UPDATE
 SET entity_name=EXCLUDED.entity_name,
 content=EXCLUDED.content,
 content_vector=EXCLUDED.content_vector,
 chunk_ids=EXCLUDED.chunk_ids,
+file_path=EXCLUDED.file_path,
 update_time=CURRENT_TIMESTAMP
 """,
 "upsert_relationship": """INSERT INTO LIGHTRAG_VDB_RELATION (workspace, id, source_id,
-target_id, content, content_vector, chunk_ids)
-VALUES ($1, $2, $3, $4, $5, $6, $7::varchar[])
+target_id, content, content_vector, chunk_ids, file_path)
+VALUES ($1, $2, $3, $4, $5, $6, $7::varchar[], $8::varchar[])
 ON CONFLICT (workspace,id) DO UPDATE
 SET source_id=EXCLUDED.source_id,
 target_id=EXCLUDED.target_id,
 content=EXCLUDED.content,
 content_vector=EXCLUDED.content_vector,
 chunk_ids=EXCLUDED.chunk_ids,
+file_path=EXCLUDED.file_path,
 update_time = CURRENT_TIMESTAMP
 """,
 # SQL for VectorStorage
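These templates bind values positionally ($n placeholders, asyncpg style), so file_path must arrive in the matching position: $8 for upsert_chunk and upsert_relationship, $7 for upsert_entity. One apparent inconsistency worth flagging: upsert_entity and upsert_relationship cast the new parameter to ::varchar[] while the DDL above declares file_path TEXT NULL, so binding either a plain string or a list there would likely be rejected (array/text type mismatch); the cast looks carried over from the chunk_ids parameter. A sketch of the binding order for upsert_chunk with raw asyncpg (conn and the data dict d are illustrative, not the storage class's actual wrapper):

async def upsert_one_chunk(conn, d):
    # Positional arguments must follow the column order in the template.
    await conn.execute(
        SQL_TEMPLATES["upsert_chunk"],
        d["workspace"], d["id"], d["tokens"], d["chunk_order_index"],        # $1-$4
        d["full_doc_id"], d["content"], d["content_vector"], d["file_path"],  # $5-$8
    )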

Changed file 2 of 2: entity extraction (extract_entities)

@@ -677,12 +677,10 @@ async def extract_entities(
 "entity_type": dp["entity_type"],
 "content": f"{dp['entity_name']}\n{dp['description']}",
 "source_id": dp["source_id"],
-"file_path": dp.get("metadata", {}).get("file_path", "unknown_source"),
+"file_path": dp.get("file_path", "unknown_source"),
 "metadata": {
-"created_at": dp.get("metadata", {}).get("created_at", time.time()),
-"file_path": dp.get("metadata", {}).get(
-"file_path", "unknown_source"
-),
+"created_at": dp.get("created_at", time.time()),
+"file_path": dp.get("file_path", "unknown_source"),
 },
 }
 for dp in all_entities_data
@@ -697,12 +695,10 @@ async def extract_entities(
 "keywords": dp["keywords"],
 "content": f"{dp['src_id']}\t{dp['tgt_id']}\n{dp['keywords']}\n{dp['description']}",
 "source_id": dp["source_id"],
-"file_path": dp.get("metadata", {}).get("file_path", "unknown_source"),
+"file_path": dp.get("file_path", "unknown_source"),
 "metadata": {
-"created_at": dp.get("metadata", {}).get("created_at", time.time()),
-"file_path": dp.get("metadata", {}).get(
-"file_path", "unknown_source"
-),
+"created_at": dp.get("created_at", time.time()),
+"file_path": dp.get("file_path", "unknown_source"),
 },
 }
 for dp in all_relationships_data
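Both hunks make the same change: created_at and file_path are now read from the top level of each entity/relationship record rather than from a nested metadata dict, matching the flat item["file_path"] lookups in the storage file. A sketch of the shape change this assumes for each dp in all_entities_data / all_relationships_data (values illustrative; the claim that the upstream merge step now emits the flat form is an assumption):

# before: provenance nested under "metadata"
dp_old = {"entity_name": "A", "description": "...", "source_id": "chunk-1",
          "metadata": {"created_at": 1742227187.0, "file_path": "docs/example.pdf"}}

# after: flat record, as this hunk reads it
dp_new = {"entity_name": "A", "description": "...", "source_id": "chunk-1",
          "created_at": 1742227187.0, "file_path": "docs/example.pdf"}

assert dp_new.get("file_path", "unknown_source") == "docs/example.pdf"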