From 791330400788e360fd3e985fad5fac59e7fec21c Mon Sep 17 00:00:00 2001
From: Lukas Selch
Date: Mon, 17 Feb 2025 15:12:35 +0100
Subject: [PATCH 01/65] Fixed broken ainsert_custom_kg()

---
 lightrag/lightrag.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index bf1c02d2..7b3e8605 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -36,6 +36,7 @@ from .utils import (
     limit_async_func_call,
     logger,
     set_logger,
+    encode_string_by_tiktoken,
 )
 from .types import KnowledgeGraph

@@ -863,7 +864,14 @@ class LightRAG:
             source_id = chunk_data["source_id"]
             chunk_id = compute_mdhash_id(chunk_content.strip(), prefix="chunk-")

-            chunk_entry = {"content": chunk_content.strip(), "source_id": source_id}
+            chunk_entry = {
+                "content": chunk_content.strip(),
+                "source_id": source_id,
+                "tokens": len(encode_string_by_tiktoken(chunk_entry["content"])),
+                "chunk_order_id": 0,
+                "full_doc_id": source_id,
+                "status": DocStatus.PROCESSED
+            }
             all_chunks_data[chunk_id] = chunk_entry
             chunk_to_source_map[source_id] = chunk_id
             update_storage = True

From 86f5a88db792c26094617f3c135a0078c9bfdcf1 Mon Sep 17 00:00:00 2001
From: Lukas Selch
Date: Mon, 17 Feb 2025 15:20:23 +0100
Subject: [PATCH 02/65] Fixed wrong variable name

---
 lightrag/lightrag.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 7b3e8605..2f7bb5e4 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -867,7 +867,7 @@ class LightRAG:
             chunk_entry = {
                 "content": chunk_content.strip(),
                 "source_id": source_id,
-                "tokens": len(encode_string_by_tiktoken(chunk_entry["content"])),
+                "tokens": len(encode_string_by_tiktoken(chunk_content.strip())),
                 "chunk_order_id": 0,
                 "full_doc_id": source_id,
                 "status": DocStatus.PROCESSED
             }

From 537e10303dafcc1e46fa43d65d28eae4b0f63111 Mon Sep 17 00:00:00 2001
From: Lukas Selch
Date: Mon, 17 Feb 2025 15:25:50 +0100
Subject: [PATCH 03/65] Fixed formatting

---
 lightrag/lightrag.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 2f7bb5e4..1c3bd089 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -860,17 +860,17 @@ class LightRAG:
         all_chunks_data: dict[str, dict[str, str]] = {}
         chunk_to_source_map: dict[str, str] = {}
         for chunk_data in custom_kg.get("chunks", {}):
-            chunk_content = chunk_data["content"]
+            chunk_content = chunk_data["content"].strip()
             source_id = chunk_data["source_id"]
-            chunk_id = compute_mdhash_id(chunk_content.strip(), prefix="chunk-")
+            chunk_id = compute_mdhash_id(chunk_content, prefix="chunk-")

             chunk_entry = {
-                "content": chunk_content.strip(),
+                "content": chunk_content,
                 "source_id": source_id,
-                "tokens": len(encode_string_by_tiktoken(chunk_content.strip())),
+                "tokens": len(encode_string_by_tiktoken(chunk_content)),
                 "chunk_order_id": 0,
                 "full_doc_id": source_id,
-                "status": DocStatus.PROCESSED
+                "status": DocStatus.PROCESSED,
             }
             all_chunks_data[chunk_id] = chunk_entry
             chunk_to_source_map[source_id] = chunk_id

From bc630c862000893f09540219d6655b69c94ee3a3 Mon Sep 17 00:00:00 2001
From: Lukas Selch
Date: Wed, 19 Feb 2025 07:15:30 +0100
Subject: [PATCH 04/65] Renamed chunk_order_id to chunk_order_index and improved token calculation

---
 lightrag/lightrag.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 1c3bd089..8513ac19 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -862,13 +862,15 @@ class LightRAG:
         for chunk_data in custom_kg.get("chunks", {}):
             chunk_content = chunk_data["content"].strip()
             source_id = chunk_data["source_id"]
+            tokens = len(encode_string_by_tiktoken(chunk_content, model_name=self.tiktoken_model_name))
+            chunk_order_index = 0 if "chunk_order_index" not in chunk_data.keys() else chunk_data["chunk_order_index"]
             chunk_id = compute_mdhash_id(chunk_content, prefix="chunk-")

             chunk_entry = {
                 "content": chunk_content,
                 "source_id": source_id,
-                "tokens": len(encode_string_by_tiktoken(chunk_content)),
-                "chunk_order_id": 0,
+                "tokens": tokens,
+                "chunk_order_index": chunk_order_index,
                 "full_doc_id": source_id,
                 "status": DocStatus.PROCESSED,
             }

From 701d8bb48e3e3224b357e677c5bde042963dc0de Mon Sep 17 00:00:00 2001
From: Lukas Selch
Date: Wed, 19 Feb 2025 10:28:25 +0100
Subject: [PATCH 05/65] Applied lint

---
 lightrag/lightrag.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 8513ac19..e73e4c1b 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -862,8 +862,16 @@ class LightRAG:
         for chunk_data in custom_kg.get("chunks", {}):
             chunk_content = chunk_data["content"].strip()
             source_id = chunk_data["source_id"]
-            tokens = len(encode_string_by_tiktoken(chunk_content, model_name=self.tiktoken_model_name))
-            chunk_order_index = 0 if "chunk_order_index" not in chunk_data.keys() else chunk_data["chunk_order_index"]
+            tokens = len(
+                encode_string_by_tiktoken(
+                    chunk_content, model_name=self.tiktoken_model_name
+                )
+            )
+            chunk_order_index = (
+                0
+                if "chunk_order_index" not in chunk_data.keys()
+                else chunk_data["chunk_order_index"]
+            )
             chunk_id = compute_mdhash_id(chunk_content, prefix="chunk-")

             chunk_entry = {

From 7fab9accfe220f154ca4aede8e2d1f2dd3870602 Mon Sep 17 00:00:00 2001
From: Lukas Selch
Date: Wed, 19 Feb 2025 14:58:51 +0100
Subject: [PATCH 06/65] Updated documentation examples to include
 chunk_order_index case

---
 README.md                    | 8 ++++++++
 examples/insert_custom_kg.py | 9 +++++++++
 2 files changed, 17 insertions(+)

diff --git a/README.md b/README.md
index 92a32703..f43dd370 100644
--- a/README.md
+++ b/README.md
@@ -608,14 +608,22 @@ custom_kg = {
         {
             "content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.",
             "source_id": "Source1",
+            "chunk_order_index": 0,
+        },
+        {
+            "content": "One outstanding feature of ProductX is its advanced AI capabilities.",
+            "source_id": "Source1",
+            "chunk_order_index": 1,
         },
         {
             "content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.",
             "source_id": "Source2",
+            "chunk_order_index": 0,
         },
         {
             "content": "None",
             "source_id": "UNKNOWN",
+            "chunk_order_index": 0,
         },
     ],
 }

diff --git a/examples/insert_custom_kg.py b/examples/insert_custom_kg.py
index 50ad925e..db489c96 100644
--- a/examples/insert_custom_kg.py
+++ b/examples/insert_custom_kg.py
@@ -87,18 +87,27 @@ custom_kg = {
         {
             "content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.",
             "source_id": "Source1",
+            "chunk_order_index": 0,
+        },
+        {
+            "content": "One outstanding feature of ProductX is its advanced AI capabilities.",
+            "source_id": "Source1",
+            "chunk_order_index": 1,
         },
         {
             "content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.",
             "source_id": "Source2",
+            "chunk_order_index": 0,
         },
         {
             "content": "EventY, held in CityC, attracts technology enthusiasts and companies from around the globe.",
             "source_id": "Source3",
+            "chunk_order_index": 0,
         },
         {
             "content": "None",
             "source_id": "UNKNOWN",
+            "chunk_order_index": 0,
         },
     ],
 }

From d3c9af909ba45cec3fc0cc5f43cf5a7f3079d322 Mon Sep 17 00:00:00 2001
From: Saifeddine ALOUI
Date: Wed, 19 Feb 2025 18:40:13 +0100
Subject: [PATCH 07/65] Update networkx_impl.py

---
 lightrag/kg/networkx_impl.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/lightrag/kg/networkx_impl.py b/lightrag/kg/networkx_impl.py
index 313d9f8d..a6d0edbb 100644
--- a/lightrag/kg/networkx_impl.py
+++ b/lightrag/kg/networkx_impl.py
@@ -20,13 +20,8 @@ if not pm.is_installed("networkx"):
 if not pm.is_installed("graspologic"):
     pm.install("graspologic")

-try:
-    from graspologic import embed
-    import networkx as nx
-except ImportError as e:
-    raise ImportError(
-        "`networkx` library is not installed. Please install it via pip: `pip install networkx`."
-    ) from e
+from graspologic import embed
+import networkx as nx


 @final

From 52abf9cc50c33316b5ce16b3c5897471e497d702 Mon Sep 17 00:00:00 2001
From: Saifeddine ALOUI
Date: Wed, 19 Feb 2025 19:32:23 +0100
Subject: [PATCH 08/65] Removed useless try/except

---
 lightrag/kg/neo4j_impl.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py
index 82631cf8..0ddc611d 100644
--- a/lightrag/kg/neo4j_impl.py
+++ b/lightrag/kg/neo4j_impl.py
@@ -23,18 +23,13 @@ import pipmaster as pm
 if not pm.is_installed("neo4j"):
     pm.install("neo4j")

-try:
-    from neo4j import (
-        AsyncGraphDatabase,
-        exceptions as neo4jExceptions,
-        AsyncDriver,
-        AsyncManagedTransaction,
-        GraphDatabase,
-    )
-except ImportError as e:
-    raise ImportError(
-        "`neo4j` library is not installed. Please install it via pip: `pip install neo4j`."
-    ) from e
+from neo4j import (
+    AsyncGraphDatabase,
+    exceptions as neo4jExceptions,
+    AsyncDriver,
+    AsyncManagedTransaction,
+    GraphDatabase,
+)

 config = configparser.ConfigParser()
 config.read("config.ini", "utf-8")

From 5d19a888b0fc37e07fc430f06b0475ac80991d32 Mon Sep 17 00:00:00 2001
From: Saifeddine ALOUI
Date: Wed, 19 Feb 2025 19:36:14 +0100
Subject: [PATCH 09/65] Update age_impl.py

---
 lightrag/kg/age_impl.py | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/lightrag/kg/age_impl.py b/lightrag/kg/age_impl.py
index 243a110b..cb28ab6f 100644
--- a/lightrag/kg/age_impl.py
+++ b/lightrag/kg/age_impl.py
@@ -23,7 +23,6 @@ from ..base import BaseGraphStorage

 if sys.platform.startswith("win"):
     import asyncio.windows_events
-
     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())


@@ -34,15 +33,9 @@ if not pm.is_installed("psycopg-pool"):
 if not pm.is_installed("asyncpg"):
     pm.install("asyncpg")

-try:
-    import psycopg
-    from psycopg.rows import namedtuple_row
-    from psycopg_pool import AsyncConnectionPool, PoolTimeout
-except ImportError:
-    raise ImportError(
-        "`psycopg-pool, psycopg[binary,pool], asyncpg` library is not installed. Please install it via pip: `pip install psycopg-pool psycopg[binary,pool] asyncpg`."
- ) - +import psycopg +from psycopg.rows import namedtuple_row +from psycopg_pool import AsyncConnectionPool, PoolTimeout class AGEQueryException(Exception): """Exception for the AGE queries.""" From 2d085cda435c5d4aedb46c19a7da0aa3273cab63 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 19 Feb 2025 19:44:10 +0100 Subject: [PATCH 10/65] Update chroma_impl.py --- lightrag/kg/chroma_impl.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/lightrag/kg/chroma_impl.py b/lightrag/kg/chroma_impl.py index 62a9b601..8196cb2a 100644 --- a/lightrag/kg/chroma_impl.py +++ b/lightrag/kg/chroma_impl.py @@ -10,13 +10,8 @@ import pipmaster as pm if not pm.is_installed("chromadb"): pm.install("chromadb") -try: - from chromadb import HttpClient, PersistentClient - from chromadb.config import Settings -except ImportError as e: - raise ImportError( - "`chromadb` library is not installed. Please install it via pip: `pip install chromadb`." - ) from e +from chromadb import HttpClient, PersistentClient +from chromadb.config import Settings @final From 8a0dbc97eb19ccff0ef0d1c755555a5195887a60 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 19 Feb 2025 19:44:27 +0100 Subject: [PATCH 11/65] Update faiss_impl.py --- lightrag/kg/faiss_impl.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lightrag/kg/faiss_impl.py b/lightrag/kg/faiss_impl.py index 75abf389..7c67e212 100644 --- a/lightrag/kg/faiss_impl.py +++ b/lightrag/kg/faiss_impl.py @@ -20,12 +20,7 @@ from lightrag.base import ( if not pm.is_installed("faiss"): pm.install("faiss") -try: - import faiss -except ImportError as e: - raise ImportError( - "`faiss` library is not installed. Please install it via pip: `pip install faiss`." - ) from e +import faiss @final From d25a5231751523694c5a4b820e3ce83a08659247 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 19 Feb 2025 19:46:43 +0100 Subject: [PATCH 12/65] Update gremlin_impl.py --- lightrag/kg/gremlin_impl.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/lightrag/kg/gremlin_impl.py b/lightrag/kg/gremlin_impl.py index 40a9f007..5d1ea47b 100644 --- a/lightrag/kg/gremlin_impl.py +++ b/lightrag/kg/gremlin_impl.py @@ -2,6 +2,7 @@ import asyncio import inspect import json import os +import pipmaster as pm from dataclasses import dataclass from typing import Any, Dict, List, final @@ -20,15 +21,12 @@ from lightrag.utils import logger from ..base import BaseGraphStorage -try: - from gremlin_python.driver import client, serializer - from gremlin_python.driver.aiohttp.transport import AiohttpTransport - from gremlin_python.driver.protocol import GremlinServerError -except ImportError as e: - raise ImportError( - "`gremlin` library is not installed. Please install it via pip: `pip install gremlin`." 
- ) from e +if not pm.is_installed("gremlinpython"): + pm.install("gremlinpython") +from gremlin_python.driver import client, serializer +from gremlin_python.driver.aiohttp.transport import AiohttpTransport +from gremlin_python.driver.protocol import GremlinServerError @final @dataclass From ea5d8207a2b1bd1b69729dc2c75aab7ab5e32882 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 19 Feb 2025 19:47:20 +0100 Subject: [PATCH 13/65] Update milvus_impl.py --- lightrag/kg/milvus_impl.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/lightrag/kg/milvus_impl.py b/lightrag/kg/milvus_impl.py index b1746514..833460a8 100644 --- a/lightrag/kg/milvus_impl.py +++ b/lightrag/kg/milvus_impl.py @@ -14,13 +14,8 @@ if not pm.is_installed("configparser"): if not pm.is_installed("pymilvus"): pm.install("pymilvus") -try: - import configparser - from pymilvus import MilvusClient -except ImportError as e: - raise ImportError( - "`pymilvus` library is not installed. Please install it via pip: `pip install pymilvus`." - ) from e +import configparser +from pymilvus import MilvusClient config = configparser.ConfigParser() config.read("config.ini", "utf-8") From b9d4ea5f0d199f2df5fee0f4d3812251cd5fe68f Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 19 Feb 2025 19:49:23 +0100 Subject: [PATCH 14/65] Update mongo_impl.py --- lightrag/kg/mongo_impl.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index a6e6edfd..bc427773 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -25,18 +25,13 @@ if not pm.is_installed("pymongo"): if not pm.is_installed("motor"): pm.install("motor") -try: - from motor.motor_asyncio import ( - AsyncIOMotorClient, - AsyncIOMotorDatabase, - AsyncIOMotorCollection, - ) - from pymongo.operations import SearchIndexModel - from pymongo.errors import PyMongoError -except ImportError as e: - raise ImportError( - "`motor, pymongo` library is not installed. Please install it via pip: `pip install motor pymongo`." - ) from e +from motor.motor_asyncio import ( + AsyncIOMotorClient, + AsyncIOMotorDatabase, + AsyncIOMotorCollection, +) +from pymongo.operations import SearchIndexModel +from pymongo.errors import PyMongoError config = configparser.ConfigParser() config.read("config.ini", "utf-8") From d3c443529cbd015a3e4f576774c52c1157b20077 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 19 Feb 2025 19:49:41 +0100 Subject: [PATCH 15/65] Update nano_vector_db_impl.py --- lightrag/kg/nano_vector_db_impl.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/lightrag/kg/nano_vector_db_impl.py b/lightrag/kg/nano_vector_db_impl.py index b246790b..2c29632b 100644 --- a/lightrag/kg/nano_vector_db_impl.py +++ b/lightrag/kg/nano_vector_db_impl.py @@ -18,13 +18,7 @@ from lightrag.base import ( if not pm.is_installed("nano-vectordb"): pm.install("nano-vectordb") -try: - from nano_vectordb import NanoVectorDB -except ImportError as e: - raise ImportError( - "`nano-vectordb` library is not installed. Please install it via pip: `pip install nano-vectordb`." 
- ) from e - +from nano_vectordb import NanoVectorDB @final @dataclass From 9ec46b8c5c00a2173cac8b64312ffcc600fb6252 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 19 Feb 2025 19:50:24 +0100 Subject: [PATCH 16/65] Update oracle_impl.py --- lightrag/kg/oracle_impl.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index 0916f6b0..57db7e5b 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -26,14 +26,8 @@ if not pm.is_installed("graspologic"): if not pm.is_installed("oracledb"): pm.install("oracledb") -try: - from graspologic import embed - import oracledb - -except ImportError as e: - raise ImportError( - "`oracledb` library is not installed. Please install it via pip: `pip install oracledb`." - ) from e +from graspologic import embed +import oracledb class OracleDB: From 521dbf2be15f9510a0547c042b22bcab737d45d2 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 19 Feb 2025 19:50:46 +0100 Subject: [PATCH 17/65] Update postgres_impl.py --- lightrag/kg/postgres_impl.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index a9c4b3b7..874c6f22 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -38,15 +38,8 @@ import pipmaster as pm if not pm.is_installed("asyncpg"): pm.install("asyncpg") -try: - import asyncpg - from asyncpg import Pool - -except ImportError as e: - raise ImportError( - "`asyncpg` library is not installed. Please install it via pip: `pip install asyncpg`." - ) from e - +import asyncpg +from asyncpg import Pool class PostgreSQLDB: def __init__(self, config: dict[str, Any], **kwargs: Any): From 473e52a095dea15b909e1415c136b90f90d9117d Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 19 Feb 2025 19:51:39 +0100 Subject: [PATCH 18/65] Update qdrant_impl.py --- lightrag/kg/qdrant_impl.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/lightrag/kg/qdrant_impl.py b/lightrag/kg/qdrant_impl.py index f9edc510..d350e7fa 100644 --- a/lightrag/kg/qdrant_impl.py +++ b/lightrag/kg/qdrant_impl.py @@ -15,16 +15,10 @@ config.read("config.ini", "utf-8") import pipmaster as pm -if not pm.is_installed("qdrant_client"): - pm.install("qdrant_client") +if not pm.is_installed("qdrant-client"): + pm.install("qdrant-client") -try: - from qdrant_client import QdrantClient, models - -except ImportError: - raise ImportError( - "`qdrant_client` library is not installed. Please install it via pip: `pip install qdrant-client`." - ) +from qdrant_client import QdrantClient, models def compute_mdhash_id_for_qdrant( From b44d582ebf96cb46772df250f2a0ba618f4f99bb Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 19 Feb 2025 19:52:10 +0100 Subject: [PATCH 19/65] Update tidb_impl.py --- lightrag/kg/tidb_impl.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index ed9c8d4b..7afff265 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -20,13 +20,8 @@ if not pm.is_installed("pymysql"): if not pm.is_installed("sqlalchemy"): pm.install("sqlalchemy") -try: - from sqlalchemy import create_engine, text +from sqlalchemy import create_engine, text -except ImportError as e: - raise ImportError( - "`pymysql, sqlalchemy` library is not installed. Please install it via pip: `pip install pymysql sqlalchemy`." 
- ) from e class TiDB: From 098f811ad27fe0396536c8a9ee891e1d81d7570c Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Wed, 19 Feb 2025 20:07:34 +0100 Subject: [PATCH 20/65] add template github --- .github/ISSUE_TEMPLATE/bug_report.yml | 64 ++++++++++++++++++++++ .github/ISSUE_TEMPLATE/config.yml | 1 + .github/ISSUE_TEMPLATE/feature_request.yml | 33 +++++++++++ .github/ISSUE_TEMPLATE/general_issue.yml | 58 ++++++++++++++++++++ .github/pull_request_template.md | 32 +++++++++++ 5 files changed, 188 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yml create mode 100644 .github/ISSUE_TEMPLATE/general_issue.yml create mode 100644 .github/pull_request_template.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 00000000..f2e3ad04 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,64 @@ +name: Bug Report +description: File a bug report +title: "[Bug]: " +labels: ["bug", "triage"] + +body: + - type: checkboxes + id: existingcheck + attributes: + label: Do you need to file an issue? + description: Please help us manage our time by avoiding duplicates and common questions with the steps below. + options: + - label: I have searched the existing issues and this bug is not already filed. + - label: My model is hosted on OpenAI or Azure. If not, please look at the "model providers" issue and don't file a new one here. + - label: I believe this is a legitimate bug, not just a question. If this is a question, please use the Discussions area. + - type: textarea + id: description + attributes: + label: Describe the bug + description: A clear and concise description of what the bug is. + placeholder: What went wrong? + - type: textarea + id: reproduce + attributes: + label: Steps to reproduce + description: Steps to reproduce the behavior. + placeholder: How can we replicate the issue? + - type: textarea + id: expected_behavior + attributes: + label: Expected Behavior + description: A clear and concise description of what you expected to happen. + placeholder: What should have happened? + - type: textarea + id: configused + attributes: + label: LightRAG Config Used + description: The LightRAG configuration used for the run. + placeholder: The settings.yaml content or LightRAG configuration + value: | + ```yaml + # Paste your config here + ``` + - type: textarea + id: screenshotslogs + attributes: + label: Logs and screenshots + description: If applicable, add screenshots and logs to help explain your problem. + placeholder: Add logs and screenshots here + - type: textarea + id: additional_information + attributes: + label: Additional Information + description: | + - LightRAG Version: e.g., v0.1.1 + - Operating System: e.g., Windows 10, Ubuntu 20.04 + - Python Version: e.g., 3.8 + - Related Issues: e.g., #1 + - Any other relevant information. 
+ value: | + - LightRAG Version: + - Operating System: + - Python Version: + - Related Issues: diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..0086358d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: true diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 00000000..aa2a6da7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,33 @@ +name: Feature Request +description: File a feature request +labels: ["enhancement"] +title: "[Feature Request]: <title>" + +body: + - type: checkboxes + id: existingcheck + attributes: + label: Do you need to file an issue? + description: Please help us manage our time by avoiding duplicates and common questions with the steps below. + options: + - label: I have searched the existing issues and this feature is not already filed. + - label: My model is hosted on OpenAI or Azure. If not, please look at the "model providers" issue and don't file a new one here. + - label: I believe this is a legitimate feature request, not just a question. If this is a question, please use the Discussions area. + - type: textarea + id: problem_description + attributes: + label: Problem Description + description: A clear and concise description of the problem you're trying to solve. + placeholder: What problem are you trying to solve? + - type: textarea + id: solution_description + attributes: + label: Proposed Solution + description: A clear and concise description of what you want to happen. + placeholder: How do you envision the solution? + - type: textarea + id: additional_context + attributes: + label: Additional Context + description: Add any other context or screenshots about the feature request here. + placeholder: Any additional information diff --git a/.github/ISSUE_TEMPLATE/general_issue.yml b/.github/ISSUE_TEMPLATE/general_issue.yml new file mode 100644 index 00000000..c023c2d9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/general_issue.yml @@ -0,0 +1,58 @@ +name: General Issue +description: File a general issue +title: "[Issue]: <title>" +labels: ["triage"] + +body: + - type: checkboxes + id: existingcheck + attributes: + label: Do you need to file an issue? + description: Please help us manage our time by avoiding duplicates and common questions with the steps below. + options: + - label: I have searched the existing issues and this issue is not already filed. + - label: My model is hosted on OpenAI or Azure. If not, please look at the "model providers" issue and don't file a new one here. + - label: I believe this is a legitimate issue, not just a question. If this is a question, please use the Discussions area. + - type: textarea + id: description + attributes: + label: Describe the issue + description: A clear and concise description of what the issue is. + placeholder: What went wrong? + - type: textarea + id: reproduce + attributes: + label: Steps to reproduce + description: Steps to reproduce the behavior. + placeholder: How can we replicate the issue? + - type: textarea + id: configused + attributes: + label: LightRAG Config Used + description: The LightRAG configuration used for the run. 
+ placeholder: The settings.yaml content or LightRAG configuration + value: | + ```yaml + # Paste your config here + ``` + - type: textarea + id: screenshotslogs + attributes: + label: Logs and screenshots + description: If applicable, add screenshots and logs to help explain your problem. + placeholder: Add logs and screenshots here + - type: textarea + id: additional_information + attributes: + label: Additional Information + description: | + - LightRAG Version: e.g., v0.1.1 + - Operating System: e.g., Windows 10, Ubuntu 20.04 + - Python Version: e.g., 3.8 + - Related Issues: e.g., #1 + - Any other relevant information. + value: | + - LightRAG Version: + - Operating System: + - Python Version: + - Related Issues: diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..6eb2f2ac --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,32 @@ +<!-- +Thanks for contributing to LightRAG! + +Please ensure your pull request is ready for review before submitting. + +About this template + +This template helps contributors provide a clear and concise description of their changes. Feel free to adjust it as needed. +--> + +## Description + +[Briefly describe the changes made in this pull request.] + +## Related Issues + +[Reference any related issues or tasks addressed by this pull request.] + +## Changes Made + +[List the specific changes made in this pull request.] + +## Checklist + +- [ ] Changes tested locally +- [ ] Code reviewed +- [ ] Documentation updated (if necessary) +- [ ] Unit tests added (if applicable) + +## Additional Notes + +[Add any additional notes or context for the reviewer(s).] From fb5938aac40768913d66552c8e21498a86a42d72 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Wed, 19 Feb 2025 20:17:57 +0100 Subject: [PATCH 21/65] updated template --- .github/ISSUE_TEMPLATE/bug_report.yml | 3 -- .github/ISSUE_TEMPLATE/config.yml | 2 +- .github/ISSUE_TEMPLATE/feature_request.yml | 1 - .github/ISSUE_TEMPLATE/general_issue.yml | 58 ---------------------- .github/ISSUE_TEMPLATE/question.yml | 26 ++++++++++ 5 files changed, 27 insertions(+), 63 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/general_issue.yml create mode 100644 .github/ISSUE_TEMPLATE/question.yml diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index f2e3ad04..35d55b99 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -11,7 +11,6 @@ body: description: Please help us manage our time by avoiding duplicates and common questions with the steps below. options: - label: I have searched the existing issues and this bug is not already filed. - - label: My model is hosted on OpenAI or Azure. If not, please look at the "model providers" issue and don't file a new one here. - label: I believe this is a legitimate bug, not just a question. If this is a question, please use the Discussions area. - type: textarea id: description @@ -38,9 +37,7 @@ body: description: The LightRAG configuration used for the run. 
placeholder: The settings.yaml content or LightRAG configuration value: | - ```yaml # Paste your config here - ``` - type: textarea id: screenshotslogs attributes: diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 0086358d..3ba13e0c 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1 +1 @@ -blank_issues_enabled: true +blank_issues_enabled: false diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index aa2a6da7..790fdb1e 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -11,7 +11,6 @@ body: description: Please help us manage our time by avoiding duplicates and common questions with the steps below. options: - label: I have searched the existing issues and this feature is not already filed. - - label: My model is hosted on OpenAI or Azure. If not, please look at the "model providers" issue and don't file a new one here. - label: I believe this is a legitimate feature request, not just a question. If this is a question, please use the Discussions area. - type: textarea id: problem_description diff --git a/.github/ISSUE_TEMPLATE/general_issue.yml b/.github/ISSUE_TEMPLATE/general_issue.yml deleted file mode 100644 index c023c2d9..00000000 --- a/.github/ISSUE_TEMPLATE/general_issue.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: General Issue -description: File a general issue -title: "[Issue]: <title>" -labels: ["triage"] - -body: - - type: checkboxes - id: existingcheck - attributes: - label: Do you need to file an issue? - description: Please help us manage our time by avoiding duplicates and common questions with the steps below. - options: - - label: I have searched the existing issues and this issue is not already filed. - - label: My model is hosted on OpenAI or Azure. If not, please look at the "model providers" issue and don't file a new one here. - - label: I believe this is a legitimate issue, not just a question. If this is a question, please use the Discussions area. - - type: textarea - id: description - attributes: - label: Describe the issue - description: A clear and concise description of what the issue is. - placeholder: What went wrong? - - type: textarea - id: reproduce - attributes: - label: Steps to reproduce - description: Steps to reproduce the behavior. - placeholder: How can we replicate the issue? - - type: textarea - id: configused - attributes: - label: LightRAG Config Used - description: The LightRAG configuration used for the run. - placeholder: The settings.yaml content or LightRAG configuration - value: | - ```yaml - # Paste your config here - ``` - - type: textarea - id: screenshotslogs - attributes: - label: Logs and screenshots - description: If applicable, add screenshots and logs to help explain your problem. - placeholder: Add logs and screenshots here - - type: textarea - id: additional_information - attributes: - label: Additional Information - description: | - - LightRAG Version: e.g., v0.1.1 - - Operating System: e.g., Windows 10, Ubuntu 20.04 - - Python Version: e.g., 3.8 - - Related Issues: e.g., #1 - - Any other relevant information. 
- value: | - - LightRAG Version: - - Operating System: - - Python Version: - - Related Issues: diff --git a/.github/ISSUE_TEMPLATE/question.yml b/.github/ISSUE_TEMPLATE/question.yml new file mode 100644 index 00000000..b8ed439d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.yml @@ -0,0 +1,26 @@ +name: Question +description: Ask a general question +labels: ["question"] +title: "[Question]: <title>" + +body: + - type: checkboxes + id: existingcheck + attributes: + label: Do you need to ask a question? + description: Please help us manage our time by avoiding duplicates and common questions with the steps below. + options: + - label: I have searched the existing issues and discussions and this question is not already answered. + - label: I believe this is a legitimate question, not just a duplicate or common issue. + - type: textarea + id: question + attributes: + label: Your Question + description: A clear and concise description of your question. + placeholder: What is your question? + - type: textarea + id: context + attributes: + label: Additional Context + description: Provide any additional context or details that might help us understand your question better. + placeholder: Add any relevant information here From 05914213e2a1df6d9d765b589f18459aaae91009 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Wed, 19 Feb 2025 20:27:56 +0100 Subject: [PATCH 22/65] updated templates --- .github/ISSUE_TEMPLATE/bug_report.yml | 6 +++--- .github/ISSUE_TEMPLATE/feature_request.yml | 22 ++++++++-------------- .github/ISSUE_TEMPLATE/question.yml | 4 ++-- 3 files changed, 13 insertions(+), 19 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 35d55b99..e5845de1 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -8,10 +8,10 @@ body: id: existingcheck attributes: label: Do you need to file an issue? - description: Please help us manage our time by avoiding duplicates and common questions with the steps below. + description: Please help us manage our time by avoiding duplicates and common bugs with the steps below. options: - label: I have searched the existing issues and this bug is not already filed. - - label: I believe this is a legitimate bug, not just a question. If this is a question, please use the Discussions area. + - label: I believe this is a legitimate bug, not just a question or feature request. - type: textarea id: description attributes: @@ -35,7 +35,7 @@ body: attributes: label: LightRAG Config Used description: The LightRAG configuration used for the run. - placeholder: The settings.yaml content or LightRAG configuration + placeholder: The settings content or LightRAG configuration value: | # Paste your config here - type: textarea diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index 790fdb1e..5e5b5dd4 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -7,23 +7,17 @@ body: - type: checkboxes id: existingcheck attributes: - label: Do you need to file an issue? - description: Please help us manage our time by avoiding duplicates and common questions with the steps below. + label: Do you need to file a feature request? + description: Please help us manage our time by avoiding duplicates and common feature request with the steps below. options: - - label: I have searched the existing issues and this feature is not already filed. 
+      - label: I believe this is a legitimate feature request, not just a question or bug.
   - type: textarea
-    id: problem_description
+    id: feature_request_description
     attributes:
-      label: Problem Description
-      description: A clear and concise description of the problem you're trying to solve.
-      placeholder: What problem are you trying to solve?
-  - type: textarea
-    id: solution_description
-    attributes:
-      label: Proposed Solution
-      description: A clear and concise description of what you want to happen.
-      placeholder: How do you envision the solution?
+      label: Feature Request Description
+      description: A clear and concise description of the feature you would like.
+      placeholder: What would this feature request add or improve?
   - type: textarea
     id: additional_context
     attributes:
       label: Additional Context
       description: Add any other context or screenshots about the feature request here.
       placeholder: Any additional information

diff --git a/.github/ISSUE_TEMPLATE/question.yml b/.github/ISSUE_TEMPLATE/question.yml
index b8ed439d..156a5497 100644
--- a/.github/ISSUE_TEMPLATE/question.yml
+++ b/.github/ISSUE_TEMPLATE/question.yml
@@ -10,8 +10,8 @@ body:
     label: Do you need to ask a question?
     description: Please help us manage our time by avoiding duplicates and common questions with the steps below.
     options:
-      - label: I have searched the existing issues and discussions and this question is not already answered.
-      - label: I believe this is a legitimate question, not just a duplicate or common issue.
+      - label: I have searched the existing questions and discussions and this question is not already answered.
+      - label: I believe this is a legitimate question, not just a bug or feature request.
- type: textarea id: question attributes: From 45ee4dd08cba90523e32863b19ca446c02875615 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI <aloui.seifeddine@gmail.com> Date: Wed, 19 Feb 2025 20:50:39 +0100 Subject: [PATCH 23/65] fixed linting --- lightrag/kg/age_impl.py | 2 ++ lightrag/kg/gremlin_impl.py | 1 + lightrag/kg/nano_vector_db_impl.py | 1 + lightrag/kg/postgres_impl.py | 1 + lightrag/kg/tidb_impl.py | 1 - 5 files changed, 5 insertions(+), 1 deletion(-) diff --git a/lightrag/kg/age_impl.py b/lightrag/kg/age_impl.py index cb28ab6f..97b3825d 100644 --- a/lightrag/kg/age_impl.py +++ b/lightrag/kg/age_impl.py @@ -23,6 +23,7 @@ from ..base import BaseGraphStorage if sys.platform.startswith("win"): import asyncio.windows_events + asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) @@ -37,6 +38,7 @@ import psycopg from psycopg.rows import namedtuple_row from psycopg_pool import AsyncConnectionPool, PoolTimeout + class AGEQueryException(Exception): """Exception for the AGE queries.""" diff --git a/lightrag/kg/gremlin_impl.py b/lightrag/kg/gremlin_impl.py index 5d1ea47b..3a26401d 100644 --- a/lightrag/kg/gremlin_impl.py +++ b/lightrag/kg/gremlin_impl.py @@ -28,6 +28,7 @@ from gremlin_python.driver import client, serializer from gremlin_python.driver.aiohttp.transport import AiohttpTransport from gremlin_python.driver.protocol import GremlinServerError + @final @dataclass class GremlinStorage(BaseGraphStorage): diff --git a/lightrag/kg/nano_vector_db_impl.py b/lightrag/kg/nano_vector_db_impl.py index 2c29632b..315b5a8f 100644 --- a/lightrag/kg/nano_vector_db_impl.py +++ b/lightrag/kg/nano_vector_db_impl.py @@ -20,6 +20,7 @@ if not pm.is_installed("nano-vectordb"): from nano_vectordb import NanoVectorDB + @final @dataclass class NanoVectorDBStorage(BaseVectorStorage): diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 874c6f22..4ffa2fb2 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -41,6 +41,7 @@ if not pm.is_installed("asyncpg"): import asyncpg from asyncpg import Pool + class PostgreSQLDB: def __init__(self, config: dict[str, Any], **kwargs: Any): self.host = config.get("host", "localhost") diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index 7afff265..8b8fa2b6 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -23,7 +23,6 @@ if not pm.is_installed("sqlalchemy"): from sqlalchemy import create_engine, text - class TiDB: def __init__(self, config, **kwargs): self.host = config.get("host", None) From 8958046b74d30d186fc7abfde99c806b42b4798f Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Wed, 19 Feb 2025 22:07:25 +0100 Subject: [PATCH 24/65] cleaned code --- lightrag/lightrag.py | 3 ++- lightrag/operate.py | 6 ++---- lightrag/prompt.py | 18 ++++++++---------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index f9ab2333..efcded4c 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -687,7 +687,7 @@ class LightRAG: return update_storage = True - logger.info(f"[New Docs] inserting {len(new_docs)} docs") + logger.info(f"Inserting {len(new_docs)} docs") inserting_chunks: dict[str, Any] = {} for chunk_text in text_chunks: @@ -914,6 +914,7 @@ class LightRAG: if storage_inst is not None ] await asyncio.gather(*tasks) + logger.info("All Insert done") def insert_custom_kg(self, custom_kg: dict[str, Any]) -> None: loop = always_get_an_event_loop() diff --git a/lightrag/operate.py 
b/lightrag/operate.py index 3ada34ab..9552f2ed 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -491,11 +491,9 @@ async def extract_entities( already_processed += 1 already_entities += len(maybe_nodes) already_relations += len(maybe_edges) - now_ticks = PROMPTS["process_tickers"][ - already_processed % len(PROMPTS["process_tickers"]) - ] + logger.debug( - f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r", + f"Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r", ) return dict(maybe_nodes), dict(maybe_edges) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index a25ab672..c6cbf7db 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -9,15 +9,14 @@ PROMPTS["DEFAULT_LANGUAGE"] = "English" PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>" PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##" PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>" -PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"] PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event", "category"] -PROMPTS["entity_extraction"] = """-Goal- +PROMPTS["entity_extraction"] = """---Goal--- Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities. Use {language} as output language. --Steps- +---Steps--- 1. Identify all entities. For each identified entity, extract the following information: - entity_name: Name of the entity, use same language as input text. If English, capitalized the name. - entity_type: One of the following types: [{entity_types}] @@ -41,18 +40,17 @@ Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_ 5. When finished, output {completion_delimiter} ###################### --Examples- +---Examples--- ###################### {examples} ############################# --Real Data- +---Real Data--- ###################### Entity_types: {entity_types} Text: {input_text} ###################### -Output: -""" +Output:""" PROMPTS["entity_extraction_examples"] = [ """Example 1: @@ -137,7 +135,7 @@ Make sure it is written in third person, and include the entity names so we the Use {language} as output language. 
####### --Data- +---Data--- Entities: {entity_name} Description List: {description_list} ####### @@ -205,12 +203,12 @@ Given the query and conversation history, list both high-level and low-level key - "low_level_keywords" for specific entities or details ###################### --Examples- +---Examples--- ###################### {examples} ############################# --Real Data- +---Real Data--- ###################### Conversation History: {history} From 9277fe8c29187cdca009d88267632e974faf6b62 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Wed, 19 Feb 2025 22:22:41 +0100 Subject: [PATCH 25/65] fixed return --- lightrag/kg/chroma_impl.py | 4 ++-- lightrag/kg/faiss_impl.py | 5 ++--- lightrag/kg/json_doc_status_impl.py | 4 ++++ lightrag/kg/json_kv_impl.py | 3 +++ lightrag/kg/milvus_impl.py | 10 +++++----- lightrag/kg/mongo_impl.py | 16 +++++++++++----- lightrag/kg/nano_vector_db_impl.py | 7 +++---- lightrag/kg/oracle_impl.py | 4 ++++ lightrag/kg/postgres_impl.py | 16 ++++++++++++---- lightrag/kg/qdrant_impl.py | 6 +++--- lightrag/kg/redis_impl.py | 3 +++ lightrag/kg/tidb_impl.py | 13 ++++++++----- 12 files changed, 60 insertions(+), 31 deletions(-) diff --git a/lightrag/kg/chroma_impl.py b/lightrag/kg/chroma_impl.py index 62a9b601..5befd8d0 100644 --- a/lightrag/kg/chroma_impl.py +++ b/lightrag/kg/chroma_impl.py @@ -113,9 +113,9 @@ class ChromaVectorDBStorage(BaseVectorStorage): raise async def upsert(self, data: dict[str, dict[str, Any]]) -> None: + logger.info(f"Inserting {len(data)} to {self.namespace}") if not data: - logger.warning("Empty data provided to vector DB") - return [] + return try: ids = list(data.keys()) diff --git a/lightrag/kg/faiss_impl.py b/lightrag/kg/faiss_impl.py index 75abf389..e8f8206d 100644 --- a/lightrag/kg/faiss_impl.py +++ b/lightrag/kg/faiss_impl.py @@ -84,10 +84,9 @@ class FaissVectorDBStorage(BaseVectorStorage): ... 
} """ - logger.info(f"Inserting {len(data)} vectors to {self.namespace}") + logger.info(f"Inserting {len(data)} to {self.namespace}") if not data: - logger.warning("You are inserting empty data to the vector DB") - return [] + return current_time = time.time() diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py index 1a05abc2..b6133a4c 100644 --- a/lightrag/kg/json_doc_status_impl.py +++ b/lightrag/kg/json_doc_status_impl.py @@ -58,6 +58,10 @@ class JsonDocStatusStorage(DocStatusStorage): write_json(self._data, self._file_name) async def upsert(self, data: dict[str, dict[str, Any]]) -> None: + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return + self._data.update(data) await self.index_done_callback() diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 7e13dea7..e1ea507a 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -43,6 +43,9 @@ class JsonKVStorage(BaseKVStorage): return set(keys) - set(self._data.keys()) async def upsert(self, data: dict[str, dict[str, Any]]) -> None: + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return left_data = {k: v for k, v in data.items() if k not in self._data} self._data.update(left_data) diff --git a/lightrag/kg/milvus_impl.py b/lightrag/kg/milvus_impl.py index b1746514..342fd0a8 100644 --- a/lightrag/kg/milvus_impl.py +++ b/lightrag/kg/milvus_impl.py @@ -80,11 +80,11 @@ class MilvusVectorDBStorage(BaseVectorStorage): ) async def upsert(self, data: dict[str, dict[str, Any]]) -> None: - logger.info(f"Inserting {len(data)} vectors to {self.namespace}") - if not len(data): - logger.warning("You insert an empty data to vector DB") - return [] - list_data = [ + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return + + list_data: list[dict[str, Any]] = [ { "id": k, **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields}, diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index a6e6edfd..f6a25ba6 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -113,8 +113,12 @@ class MongoKVStorage(BaseKVStorage): return keys - existing_ids async def upsert(self, data: dict[str, dict[str, Any]]) -> None: + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return + if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE): - update_tasks = [] + update_tasks: list[Any] = [] for mode, items in data.items(): for k, v in items.items(): key = f"{mode}_{k}" @@ -186,7 +190,10 @@ class MongoDocStatusStorage(DocStatusStorage): return data - existing_ids async def upsert(self, data: dict[str, dict[str, Any]]) -> None: - update_tasks = [] + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return + update_tasks: list[Any] = [] for k, v in data.items(): data[k]["_id"] = k update_tasks.append( @@ -860,10 +867,9 @@ class MongoVectorDBStorage(BaseVectorStorage): logger.debug("vector index already exist") async def upsert(self, data: dict[str, dict[str, Any]]) -> None: - logger.debug(f"Inserting {len(data)} vectors to {self.namespace}") + logger.info(f"Inserting {len(data)} to {self.namespace}") if not data: - logger.warning("You are inserting an empty data set to vector DB") - return [] + return list_data = [ { diff --git a/lightrag/kg/nano_vector_db_impl.py b/lightrag/kg/nano_vector_db_impl.py index b246790b..7462bd7c 100644 --- a/lightrag/kg/nano_vector_db_impl.py +++ b/lightrag/kg/nano_vector_db_impl.py @@ 
-50,10 +50,9 @@ class NanoVectorDBStorage(BaseVectorStorage): ) async def upsert(self, data: dict[str, dict[str, Any]]) -> None: - logger.info(f"Inserting {len(data)} vectors to {self.namespace}") - if not len(data): - logger.warning("You insert an empty data to vector DB") - return [] + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return current_time = time.time() list_data = [ diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index 0916f6b0..014726fb 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -332,6 +332,10 @@ class OracleKVStorage(BaseKVStorage): ################ INSERT METHODS ################ async def upsert(self, data: dict[str, dict[str, Any]]) -> None: + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return + if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS): list_data = [ { diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index a9c4b3b7..ad7c4b5e 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -353,6 +353,10 @@ class PGKVStorage(BaseKVStorage): ################ INSERT METHODS ################ async def upsert(self, data: dict[str, dict[str, Any]]) -> None: + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return + if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS): pass elif is_namespace(self.namespace, NameSpace.KV_STORE_FULL_DOCS): @@ -454,10 +458,10 @@ class PGVectorStorage(BaseVectorStorage): return upsert_sql, data async def upsert(self, data: dict[str, dict[str, Any]]) -> None: - logger.info(f"Inserting {len(data)} vectors to {self.namespace}") - if not len(data): - logger.warning("You insert an empty data to vector DB") - return [] + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return + current_time = time.time() list_data = [ { @@ -618,6 +622,10 @@ class PGDocStatusStorage(DocStatusStorage): Args: data: dictionary of document IDs and their status data """ + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return + sql = """insert into LIGHTRAG_DOC_STATUS(workspace,id,content,content_summary,content_length,chunks_count,status) values($1,$2,$3,$4,$5,$6,$7) on conflict(id,workspace) do update set diff --git a/lightrag/kg/qdrant_impl.py b/lightrag/kg/qdrant_impl.py index f9edc510..d54b2408 100644 --- a/lightrag/kg/qdrant_impl.py +++ b/lightrag/kg/qdrant_impl.py @@ -93,9 +93,9 @@ class QdrantVectorDBStorage(BaseVectorStorage): ) async def upsert(self, data: dict[str, dict[str, Any]]) -> None: - if not len(data): - logger.warning("You insert an empty data to vector DB") - return [] + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return list_data = [ { "id": k, diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index 4bff6c62..7e177346 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -49,6 +49,9 @@ class RedisKVStorage(BaseKVStorage): return set(keys) - existing_ids async def upsert(self, data: dict[str, dict[str, Any]]) -> None: + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return pipe = self._redis.pipeline() for k, v in data.items(): diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index ed9c8d4b..4266d07c 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -217,6 +217,9 @@ class TiDBKVStorage(BaseKVStorage): ################ INSERT full_doc AND chunks ################ 
async def upsert(self, data: dict[str, dict[str, Any]]) -> None: + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return left_data = {k: v for k, v in data.items() if k not in self._data} self._data.update(left_data) if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS): @@ -324,12 +327,12 @@ class TiDBVectorDBStorage(BaseVectorStorage): ###### INSERT entities And relationships ###### async def upsert(self, data: dict[str, dict[str, Any]]) -> None: - # ignore, upsert in TiDBKVStorage already - if not len(data): - logger.warning("You insert an empty data to vector DB") - return [] + logger.info(f"Inserting {len(data)} to {self.namespace}") + if not data: + return if is_namespace(self.namespace, NameSpace.VECTOR_STORE_CHUNKS): - return [] + return + logger.info(f"Inserting {len(data)} vectors to {self.namespace}") list_data = [ From eb3306f34c2886772f6045b1ab4bbaa85f541e3f Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Wed, 19 Feb 2025 22:52:49 +0100 Subject: [PATCH 26/65] cleanup --- lightrag/api/lightrag_server.py | 5 ---- lightrag/base.py | 4 --- lightrag/kg/age_impl.py | 7 ------ lightrag/kg/gremlin_impl.py | 3 --- lightrag/kg/mongo_impl.py | 18 ------------- lightrag/kg/neo4j_impl.py | 25 ------------------- lightrag/kg/networkx_impl.py | 3 --- lightrag/kg/oracle_impl.py | 3 --- lightrag/kg/postgres_impl.py | 11 +++----- lightrag/kg/tidb_impl.py | 3 --- lightrag/lightrag.py | 4 --- .../lightrag_visualizer/graph_visualizer.py | 5 +++- 12 files changed, 7 insertions(+), 84 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 0cf1d01e..96315b82 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -1682,11 +1682,6 @@ def create_app(args): trace_exception(e) raise HTTPException(status_code=500, detail=str(e)) - # query all graph labels - @app.get("/graph/label/list") - async def get_graph_labels(): - return await rag.get_graph_labels() - # query all graph @app.get("/graphs") async def get_knowledge_graph(label: str): diff --git a/lightrag/base.py b/lightrag/base.py index 5f6a1bf1..af060435 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -198,10 +198,6 @@ class BaseGraphStorage(StorageNameSpace, ABC): ) -> tuple[np.ndarray[Any, Any], list[str]]: """Get all labels in the graph.""" - @abstractmethod - async def get_all_labels(self) -> list[str]: - """Get a knowledge graph of a node.""" - @abstractmethod async def get_knowledge_graph( self, node_label: str, max_depth: int = 5 diff --git a/lightrag/kg/age_impl.py b/lightrag/kg/age_impl.py index 243a110b..225b350b 100644 --- a/lightrag/kg/age_impl.py +++ b/lightrag/kg/age_impl.py @@ -65,10 +65,6 @@ class AGEQueryException(Exception): @final @dataclass class AGEStorage(BaseGraphStorage): - @staticmethod - def load_nx_graph(file_name): - print("no preloading of graph with AGE in production") - def __init__(self, namespace, global_config, embedding_func): super().__init__( namespace=namespace, @@ -625,9 +621,6 @@ class AGEStorage(BaseGraphStorage): ) -> tuple[np.ndarray[Any, Any], list[str]]: raise NotImplementedError - async def get_all_labels(self) -> list[str]: - raise NotImplementedError - async def get_knowledge_graph( self, node_label: str, max_depth: int = 5 ) -> KnowledgeGraph: diff --git a/lightrag/kg/gremlin_impl.py b/lightrag/kg/gremlin_impl.py index 40a9f007..de12bbd9 100644 --- a/lightrag/kg/gremlin_impl.py +++ b/lightrag/kg/gremlin_impl.py @@ -404,9 +404,6 @@ class 
GremlinStorage(BaseGraphStorage): ) -> tuple[np.ndarray[Any, Any], list[str]]: raise NotImplementedError - async def get_all_labels(self) -> list[str]: - raise NotImplementedError - async def get_knowledge_graph( self, node_label: str, max_depth: int = 5 ) -> KnowledgeGraph: diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index f6a25ba6..b3479b2f 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -606,24 +606,6 @@ class MongoGraphStorage(BaseGraphStorage): # ------------------------------------------------------------------------- # - async def get_all_labels(self) -> list[str]: - """ - Get all existing node _id in the database - Returns: - [id1, id2, ...] # Alphabetically sorted id list - """ - # Use MongoDB's distinct and aggregation to get all unique labels - pipeline = [ - {"$group": {"_id": "$_id"}}, # Group by _id - {"$sort": {"_id": 1}}, # Sort alphabetically - ] - - cursor = self.collection.aggregate(pipeline) - labels = [] - async for doc in cursor: - labels.append(doc["_id"]) - return labels - async def get_knowledge_graph( self, node_label: str, max_depth: int = 5 ) -> KnowledgeGraph: diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 82631cf8..296f4295 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -633,31 +633,6 @@ class Neo4JStorage(BaseGraphStorage): await traverse(label, 0) return result - async def get_all_labels(self) -> list[str]: - """ - Get all existing node labels in the database - Returns: - ["Person", "Company", ...] # Alphabetically sorted label list - """ - async with self._driver.session(database=self._DATABASE) as session: - # Method 1: Direct metadata query (Available for Neo4j 4.3+) - # query = "CALL db.labels() YIELD label RETURN label" - - # Method 2: Query compatible with older versions - query = """ - MATCH (n) - WITH DISTINCT labels(n) AS node_labels - UNWIND node_labels AS label - RETURN DISTINCT label - ORDER BY label - """ - - result = await session.run(query) - labels = [] - async for record in result: - labels.append(record["label"]) - return labels - async def delete_node(self, node_id: str) -> None: raise NotImplementedError diff --git a/lightrag/kg/networkx_impl.py b/lightrag/kg/networkx_impl.py index 313d9f8d..9b0d7474 100644 --- a/lightrag/kg/networkx_impl.py +++ b/lightrag/kg/networkx_impl.py @@ -172,9 +172,6 @@ class NetworkXStorage(BaseGraphStorage): if self._graph.has_edge(source, target): self._graph.remove_edge(source, target) - async def get_all_labels(self) -> list[str]: - raise NotImplementedError - async def get_knowledge_graph( self, node_label: str, max_depth: int = 5 ) -> KnowledgeGraph: diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index 014726fb..958425fc 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -676,9 +676,6 @@ class OracleGraphStorage(BaseGraphStorage): async def delete_node(self, node_id: str) -> None: raise NotImplementedError - async def get_all_labels(self) -> list[str]: - raise NotImplementedError - async def get_knowledge_graph( self, node_label: str, max_depth: int = 5 ) -> KnowledgeGraph: diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index ad7c4b5e..5f0a51a0 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -186,12 +186,10 @@ class PostgreSQLDB: asyncpg.exceptions.UniqueViolationError, asyncpg.exceptions.DuplicateTableError, ) as e: - if upsert: - print("Key value duplicate, but upsert succeeded.") - else: - 
logger.error(f"Upsert error: {e}") + if not upsert: + logger.error(f"PostgreSQL, upsert error: {e}") except Exception as e: - logger.error(f"PostgreSQL database,\nsql:{sql},\ndata:{data},\nerror:{e}") + logger.error(f"PostgreSQL database, sql:{sql}, data:{data}, error:{e}") raise @@ -1095,9 +1093,6 @@ class PGGraphStorage(BaseGraphStorage): ) -> tuple[np.ndarray[Any, Any], list[str]]: raise NotImplementedError - async def get_all_labels(self) -> list[str]: - raise NotImplementedError - async def get_knowledge_graph( self, node_label: str, max_depth: int = 5 ) -> KnowledgeGraph: diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index 4266d07c..730fe4d2 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -566,9 +566,6 @@ class TiDBGraphStorage(BaseGraphStorage): async def delete_node(self, node_id: str) -> None: raise NotImplementedError - async def get_all_labels(self) -> list[str]: - raise NotImplementedError - async def get_knowledge_graph( self, node_label: str, max_depth: int = 5 ) -> KnowledgeGraph: diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index efcded4c..2f3e1800 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -614,10 +614,6 @@ class LightRAG: self.storages_status = StoragesStatus.FINALIZED logger.debug("Finalized Storages") - async def get_graph_labels(self): - text = await self.chunk_entity_relation_graph.get_all_labels() - return text - async def get_knowledge_graph( self, nodel_label: str, max_depth: int ) -> KnowledgeGraph: diff --git a/lightrag/tools/lightrag_visualizer/graph_visualizer.py b/lightrag/tools/lightrag_visualizer/graph_visualizer.py index 8a6f0976..9950041f 100644 --- a/lightrag/tools/lightrag_visualizer/graph_visualizer.py +++ b/lightrag/tools/lightrag_visualizer/graph_visualizer.py @@ -1,6 +1,6 @@ from typing import Optional, Tuple, Dict, List import numpy as np -import networkx as nx + import pipmaster as pm # Added automatic libraries install using pipmaster @@ -12,7 +12,10 @@ if not pm.is_installed("pyglm"): pm.install("pyglm") if not pm.is_installed("python-louvain"): pm.install("python-louvain") +if not pm.is_installed("networkx"): + pm.install("networkx") +import networkx as nx import moderngl from imgui_bundle import imgui, immapp, hello_imgui import community From 89c35c82c77c1846d6ef6bc32b958b4910935c5c Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Wed, 19 Feb 2025 22:57:46 +0100 Subject: [PATCH 27/65] removed get_knowledge_graph --- lightrag/api/lightrag_server.py | 4 - lightrag/base.py | 7 -- lightrag/kg/age_impl.py | 6 -- lightrag/kg/gremlin_impl.py | 6 -- lightrag/kg/mongo_impl.py | 174 -------------------------------- lightrag/kg/neo4j_impl.py | 94 ----------------- lightrag/kg/networkx_impl.py | 6 -- lightrag/kg/oracle_impl.py | 6 -- lightrag/kg/postgres_impl.py | 6 -- lightrag/kg/tidb_impl.py | 7 -- lightrag/lightrag.py | 8 -- 11 files changed, 324 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 96315b82..58931eec 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -1683,10 +1683,6 @@ def create_app(args): raise HTTPException(status_code=500, detail=str(e)) # query all graph - @app.get("/graphs") - async def get_knowledge_graph(label: str): - return await rag.get_knowledge_graph(nodel_label=label, max_depth=100) - # Add Ollama API routes ollama_api = OllamaAPI(rag, top_k=args.top_k) app.include_router(ollama_api.router, prefix="/api") diff --git a/lightrag/base.py 
b/lightrag/base.py index af060435..5f6f8850 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -13,7 +13,6 @@ from typing import ( ) import numpy as np from .utils import EmbeddingFunc -from .types import KnowledgeGraph load_dotenv() @@ -198,12 +197,6 @@ class BaseGraphStorage(StorageNameSpace, ABC): ) -> tuple[np.ndarray[Any, Any], list[str]]: """Get all labels in the graph.""" - @abstractmethod - async def get_knowledge_graph( - self, node_label: str, max_depth: int = 5 - ) -> KnowledgeGraph: - """Retrieve a subgraph of the knowledge graph starting from a given node.""" - class DocStatus(str, Enum): """Document processing status""" diff --git a/lightrag/kg/age_impl.py b/lightrag/kg/age_impl.py index 225b350b..1e6b3545 100644 --- a/lightrag/kg/age_impl.py +++ b/lightrag/kg/age_impl.py @@ -8,7 +8,6 @@ from dataclasses import dataclass from typing import Any, Dict, List, NamedTuple, Optional, Union, final import numpy as np import pipmaster as pm -from lightrag.types import KnowledgeGraph from tenacity import ( retry, @@ -621,11 +620,6 @@ class AGEStorage(BaseGraphStorage): ) -> tuple[np.ndarray[Any, Any], list[str]]: raise NotImplementedError - async def get_knowledge_graph( - self, node_label: str, max_depth: int = 5 - ) -> KnowledgeGraph: - raise NotImplementedError - async def index_done_callback(self) -> None: # AGES handles persistence automatically pass diff --git a/lightrag/kg/gremlin_impl.py b/lightrag/kg/gremlin_impl.py index de12bbd9..9ebe1eb4 100644 --- a/lightrag/kg/gremlin_impl.py +++ b/lightrag/kg/gremlin_impl.py @@ -15,7 +15,6 @@ from tenacity import ( wait_exponential, ) -from lightrag.types import KnowledgeGraph from lightrag.utils import logger from ..base import BaseGraphStorage @@ -403,8 +402,3 @@ class GremlinStorage(BaseGraphStorage): self, algorithm: str ) -> tuple[np.ndarray[Any, Any], list[str]]: raise NotImplementedError - - async def get_knowledge_graph( - self, node_label: str, max_depth: int = 5 - ) -> KnowledgeGraph: - raise NotImplementedError diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index b3479b2f..1ae1fc42 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -16,7 +16,6 @@ from ..base import ( ) from ..namespace import NameSpace, is_namespace from ..utils import logger -from ..types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge import pipmaster as pm if not pm.is_installed("pymongo"): @@ -604,179 +603,6 @@ class MongoGraphStorage(BaseGraphStorage): # ------------------------------------------------------------------------- # QUERY # ------------------------------------------------------------------------- - # - - async def get_knowledge_graph( - self, node_label: str, max_depth: int = 5 - ) -> KnowledgeGraph: - """ - Get complete connected subgraph for specified node (including the starting node itself) - - Args: - node_label: Label of the nodes to start from - max_depth: Maximum depth of traversal (default: 5) - - Returns: - KnowledgeGraph object containing nodes and edges of the subgraph - """ - label = node_label - result = KnowledgeGraph() - seen_nodes = set() - seen_edges = set() - - try: - if label == "*": - # Get all nodes and edges - async for node_doc in self.collection.find({}): - node_id = str(node_doc["_id"]) - if node_id not in seen_nodes: - result.nodes.append( - KnowledgeGraphNode( - id=node_id, - labels=[node_doc.get("_id")], - properties={ - k: v - for k, v in node_doc.items() - if k not in ["_id", "edges"] - }, - ) - ) - seen_nodes.add(node_id) - - # Process edges - for 
edge in node_doc.get("edges", []): - edge_id = f"{node_id}-{edge['target']}" - if edge_id not in seen_edges: - result.edges.append( - KnowledgeGraphEdge( - id=edge_id, - type=edge.get("relation", ""), - source=node_id, - target=edge["target"], - properties={ - k: v - for k, v in edge.items() - if k not in ["target", "relation"] - }, - ) - ) - seen_edges.add(edge_id) - else: - # Verify if starting node exists - start_nodes = self.collection.find({"_id": label}) - start_nodes_exist = await start_nodes.to_list(length=1) - if not start_nodes_exist: - logger.warning(f"Starting node with label {label} does not exist!") - return result - - # Use $graphLookup for traversal - pipeline = [ - { - "$match": {"_id": label} - }, # Start with nodes having the specified label - { - "$graphLookup": { - "from": self._collection_name, - "startWith": "$edges.target", - "connectFromField": "edges.target", - "connectToField": "_id", - "maxDepth": max_depth, - "depthField": "depth", - "as": "connected_nodes", - } - }, - ] - - async for doc in self.collection.aggregate(pipeline): - # Add the start node - node_id = str(doc["_id"]) - if node_id not in seen_nodes: - result.nodes.append( - KnowledgeGraphNode( - id=node_id, - labels=[ - doc.get( - "_id", - ) - ], - properties={ - k: v - for k, v in doc.items() - if k - not in [ - "_id", - "edges", - "connected_nodes", - "depth", - ] - }, - ) - ) - seen_nodes.add(node_id) - - # Add edges from start node - for edge in doc.get("edges", []): - edge_id = f"{node_id}-{edge['target']}" - if edge_id not in seen_edges: - result.edges.append( - KnowledgeGraphEdge( - id=edge_id, - type=edge.get("relation", ""), - source=node_id, - target=edge["target"], - properties={ - k: v - for k, v in edge.items() - if k not in ["target", "relation"] - }, - ) - ) - seen_edges.add(edge_id) - - # Add connected nodes and their edges - for connected in doc.get("connected_nodes", []): - node_id = str(connected["_id"]) - if node_id not in seen_nodes: - result.nodes.append( - KnowledgeGraphNode( - id=node_id, - labels=[connected.get("_id")], - properties={ - k: v - for k, v in connected.items() - if k not in ["_id", "edges", "depth"] - }, - ) - ) - seen_nodes.add(node_id) - - # Add edges from connected nodes - for edge in connected.get("edges", []): - edge_id = f"{node_id}-{edge['target']}" - if edge_id not in seen_edges: - result.edges.append( - KnowledgeGraphEdge( - id=edge_id, - type=edge.get("relation", ""), - source=node_id, - target=edge["target"], - properties={ - k: v - for k, v in edge.items() - if k not in ["target", "relation"] - }, - ) - ) - seen_edges.add(edge_id) - - logger.info( - f"Subgraph query successful | Node count: {len(result.nodes)} | Edge count: {len(result.edges)}" - ) - - except PyMongoError as e: - logger.error(f"MongoDB query failed: {str(e)}") - - return result async def index_done_callback(self) -> None: # Mongo handles persistence automatically diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 296f4295..cba38d22 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -17,7 +17,6 @@ from tenacity import ( from ..utils import logger from ..base import BaseGraphStorage -from ..types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge import pipmaster as pm if not pm.is_installed("neo4j"): @@ -474,99 +473,6 @@ class Neo4JStorage(BaseGraphStorage): async def _node2vec_embed(self): print("Implemented but never called.") - async def get_knowledge_graph( - self, node_label: str, max_depth: int = 5 - ) -> KnowledgeGraph: - 
""" - Get complete connected subgraph for specified node (including the starting node itself) - - Key fixes: - 1. Include the starting node itself - 2. Handle multi-label nodes - 3. Clarify relationship directions - 4. Add depth control - """ - label = node_label.strip('"') - result = KnowledgeGraph() - seen_nodes = set() - seen_edges = set() - - async with self._driver.session(database=self._DATABASE) as session: - try: - main_query = "" - if label == "*": - main_query = """ - MATCH (n) - WITH collect(DISTINCT n) AS nodes - MATCH ()-[r]-() - RETURN nodes, collect(DISTINCT r) AS relationships; - """ - else: - # Critical debug step: first verify if starting node exists - validate_query = f"MATCH (n:`{label}`) RETURN n LIMIT 1" - validate_result = await session.run(validate_query) - if not await validate_result.single(): - logger.warning(f"Starting node {label} does not exist!") - return result - - # Optimized query (including direction handling and self-loops) - main_query = f""" - MATCH (start:`{label}`) - WITH start - CALL apoc.path.subgraphAll(start, {{ - relationshipFilter: '>', - minLevel: 0, - maxLevel: {max_depth}, - bfs: true - }}) - YIELD nodes, relationships - RETURN nodes, relationships - """ - result_set = await session.run(main_query) - record = await result_set.single() - - if record: - # Handle nodes (compatible with multi-label cases) - for node in record["nodes"]: - # Use node ID + label combination as unique identifier - node_id = node.id - if node_id not in seen_nodes: - result.nodes.append( - KnowledgeGraphNode( - id=f"{node_id}", - labels=list(node.labels), - properties=dict(node), - ) - ) - seen_nodes.add(node_id) - - # Handle relationships (including direction information) - for rel in record["relationships"]: - edge_id = rel.id - if edge_id not in seen_edges: - start = rel.start_node - end = rel.end_node - result.edges.append( - KnowledgeGraphEdge( - id=f"{edge_id}", - type=rel.type, - source=f"{start.id}", - target=f"{end.id}", - properties=dict(rel), - ) - ) - seen_edges.add(edge_id) - - logger.info( - f"Subgraph query successful | Node count: {len(result.nodes)} | Edge count: {len(result.edges)}" - ) - - except neo4jExceptions.ClientError as e: - logger.error(f"APOC query failed: {str(e)}") - return await self._robust_fallback(label, max_depth) - - return result - async def _robust_fallback( self, label: str, max_depth: int ) -> Dict[str, List[Dict]]: diff --git a/lightrag/kg/networkx_impl.py b/lightrag/kg/networkx_impl.py index 9b0d7474..545c5464 100644 --- a/lightrag/kg/networkx_impl.py +++ b/lightrag/kg/networkx_impl.py @@ -5,7 +5,6 @@ from typing import Any, final import numpy as np -from lightrag.types import KnowledgeGraph from lightrag.utils import ( logger, ) @@ -171,8 +170,3 @@ class NetworkXStorage(BaseGraphStorage): for source, target in edges: if self._graph.has_edge(source, target): self._graph.remove_edge(source, target) - - async def get_knowledge_graph( - self, node_label: str, max_depth: int = 5 - ) -> KnowledgeGraph: - raise NotImplementedError diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index 958425fc..d9df3801 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -8,7 +8,6 @@ from typing import Any, Union, final import numpy as np import configparser -from lightrag.types import KnowledgeGraph from ..base import ( BaseGraphStorage, @@ -676,11 +675,6 @@ class OracleGraphStorage(BaseGraphStorage): async def delete_node(self, node_id: str) -> None: raise NotImplementedError - async def 
get_knowledge_graph( - self, node_label: str, max_depth: int = 5 - ) -> KnowledgeGraph: - raise NotImplementedError - N_T = { NameSpace.KV_STORE_FULL_DOCS: "LIGHTRAG_DOC_FULL", diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 5f0a51a0..8b3e2238 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -7,7 +7,6 @@ from typing import Any, Union, final import numpy as np import configparser -from lightrag.types import KnowledgeGraph import sys from tenacity import ( @@ -1093,11 +1092,6 @@ class PGGraphStorage(BaseGraphStorage): ) -> tuple[np.ndarray[Any, Any], list[str]]: raise NotImplementedError - async def get_knowledge_graph( - self, node_label: str, max_depth: int = 5 - ) -> KnowledgeGraph: - raise NotImplementedError - async def drop(self) -> None: """Drop the storage""" drop_sql = SQL_TEMPLATES["drop_vdb_entity"] diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index 730fe4d2..0e7ac91d 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -5,8 +5,6 @@ from typing import Any, Union, final import numpy as np -from lightrag.types import KnowledgeGraph - from ..base import BaseGraphStorage, BaseKVStorage, BaseVectorStorage from ..namespace import NameSpace, is_namespace @@ -566,11 +564,6 @@ class TiDBGraphStorage(BaseGraphStorage): async def delete_node(self, node_id: str) -> None: raise NotImplementedError - async def get_knowledge_graph( - self, node_label: str, max_depth: int = 5 - ) -> KnowledgeGraph: - raise NotImplementedError - N_T = { NameSpace.KV_STORE_FULL_DOCS: "LIGHTRAG_DOC_FULL", diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 2f3e1800..e46d548c 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -38,7 +38,6 @@ from .utils import ( logger, set_logger, ) -from .types import KnowledgeGraph config = configparser.ConfigParser() config.read("config.ini", "utf-8") @@ -614,13 +613,6 @@ class LightRAG: self.storages_status = StoragesStatus.FINALIZED logger.debug("Finalized Storages") - async def get_knowledge_graph( - self, nodel_label: str, max_depth: int - ) -> KnowledgeGraph: - return await self.chunk_entity_relation_graph.get_knowledge_graph( - node_label=nodel_label, max_depth=max_depth - ) - def _get_storage_class(self, storage_name: str) -> Callable[..., Any]: import_path = STORAGES[storage_name] storage_class = lazy_external_import(import_path, storage_name) From 98d005dc1c5bb8a34897c9fd77c2d3f1b2089322 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Wed, 19 Feb 2025 23:26:21 +0100 Subject: [PATCH 28/65] updated parallel processing --- lightrag/lightrag.py | 123 +++++++++++++++++++++++++++++-------------- lightrag/operate.py | 10 +--- 2 files changed, 85 insertions(+), 48 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index e46d548c..6343b291 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -790,6 +790,7 @@ class LightRAG: logger.info(f"Number of batches to process: {len(docs_batches)}.") + tasks: list[tuple[str, DocProcessingStatus, dict[str, Any], Any]] = [] # 3. iterate over batches for batch_idx, docs_batch in enumerate(docs_batches): # 4. 
iterate over batch @@ -825,47 +826,91 @@ class LightRAG: ) } - # Process document (text chunks and full docs) in parallel - tasks = [ - self.chunks_vdb.upsert(chunks), - self._process_entity_relation_graph(chunks), - self.full_docs.upsert({doc_id: {"content": status_doc.content}}), - self.text_chunks.upsert(chunks), - ] - try: - await asyncio.gather(*tasks) - await self.doc_status.upsert( - { - doc_status_id: { - "status": DocStatus.PROCESSED, - "chunks_count": len(chunks), - "content": status_doc.content, - "content_summary": status_doc.content_summary, - "content_length": status_doc.content_length, - "created_at": status_doc.created_at, - "updated_at": datetime.now().isoformat(), - } - } - ) - await self._insert_done() + # Prepare async tasks with full context + tasks.extend( + [ + ( + doc_status_id, + status_doc, + chunks, + self.chunks_vdb.upsert(chunks), + ), + ( + doc_status_id, + status_doc, + chunks, + self._process_entity_relation_graph(chunks), + ), + ( + doc_status_id, + status_doc, + chunks, + self.full_docs.upsert( + {doc_id: {"content": status_doc.content}} + ), + ), + ( + doc_status_id, + status_doc, + chunks, + self.text_chunks.upsert(chunks), + ), + ] + ) - except Exception as e: - logger.error(f"Failed to process document {doc_id}: {str(e)}") - await self.doc_status.upsert( - { - doc_status_id: { - "status": DocStatus.FAILED, - "error": str(e), - "content": status_doc.content, - "content_summary": status_doc.content_summary, - "content_length": status_doc.content_length, - "created_at": status_doc.created_at, - "updated_at": datetime.now().isoformat(), - } + # Execute tasks as they complete + for future in asyncio.as_completed([task for _, _, _, task in tasks]): + try: + # Wait for the completed task + await future + + # Retrieve the full context of the completed task + completed_doc_status_id, status_doc, chunks, _ = next( + (doc_id, s_doc, ch, task) + for doc_id, s_doc, ch, task in tasks + if task == future + ) + + # Update status to processed + await self.doc_status.upsert( + { + completed_doc_status_id: { + "status": DocStatus.PROCESSED, + "chunks_count": len(chunks), + "content": status_doc.content, + "content_summary": status_doc.content_summary, + "content_length": status_doc.content_length, + "created_at": status_doc.created_at, + "updated_at": datetime.now().isoformat(), } - ) - continue - logger.info(f"Completed batch {batch_idx + 1} of {len(docs_batches)}.") + } + ) + logger.info(f"Completed doc_id: {completed_doc_status_id}") + except Exception as e: + # Retrieve the context of the failed task + failed_doc_status_id, status_doc, chunks, _ = next( + (doc_id, s_doc, ch, task) + for doc_id, s_doc, ch, task in tasks + if task == future + ) + logger.error( + f"Failed to process document {failed_doc_status_id}: {str(e)}" + ) + + await self.doc_status.upsert( + { + failed_doc_status_id: { + "status": DocStatus.FAILED, + "error": str(e), + "content": status_doc.content, + "content_summary": status_doc.content_summary, + "content_length": status_doc.content_length, + "created_at": status_doc.created_at, + "updated_at": datetime.now().isoformat(), + } + } + ) + await self._insert_done() async def _process_entity_relation_graph(self, chunk: dict[str, Any]) -> None: try: diff --git a/lightrag/operate.py b/lightrag/operate.py index 9552f2ed..27950b7d 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1326,15 +1326,12 @@ async def _get_edge_data( ), ) - if not all([n is not None for n in edge_datas]): - logger.warning("Some edges are missing, maybe the storage 
is damaged") - edge_datas = [ { "src_id": k["src_id"], "tgt_id": k["tgt_id"], "rank": d, - "created_at": k.get("__created_at__", None), # 从 KV 存储中获取时间元数据 + "created_at": k.get("__created_at__", None), **v, } for k, v, d in zip(results, edge_datas, edge_degree) @@ -1343,16 +1340,11 @@ async def _get_edge_data( edge_datas = sorted( edge_datas, key=lambda x: (x["rank"], x["weight"]), reverse=True ) - len_edge_datas = len(edge_datas) edge_datas = truncate_list_by_token_size( edge_datas, key=lambda x: x["description"], max_token_size=query_param.max_token_for_global_context, ) - logger.debug( - f"Truncate relations from {len_edge_datas} to {len(edge_datas)} (max tokens:{query_param.max_token_for_global_context})" - ) - use_entities, use_text_units = await asyncio.gather( _find_most_related_entities_from_relationships( edge_datas, query_param, knowledge_graph_inst From 4875283d9fc6ec82705b24765dc00edc59a4d27a Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Wed, 19 Feb 2025 23:45:51 +0100 Subject: [PATCH 29/65] make it smalled --- lightrag/lightrag.py | 138 +++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 89 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 6343b291..91aacddf 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -768,14 +768,17 @@ class LightRAG: 4. Update the document status """ # 1. Get all pending, failed, and abnormally terminated processing documents. - to_process_docs: dict[str, DocProcessingStatus] = {} + # Run the asynchronous status retrievals in parallel using asyncio.gather + processing_docs, failed_docs, pending_docs = await asyncio.gather( + self.doc_status.get_docs_by_status(DocStatus.PROCESSING), + self.doc_status.get_docs_by_status(DocStatus.FAILED), + self.doc_status.get_docs_by_status(DocStatus.PENDING), + ) - processing_docs = await self.doc_status.get_docs_by_status(DocStatus.PROCESSING) + to_process_docs: dict[str, DocProcessingStatus] = {} to_process_docs.update(processing_docs) - failed_docs = await self.doc_status.get_docs_by_status(DocStatus.FAILED) to_process_docs.update(failed_docs) - pendings_docs = await self.doc_status.get_docs_by_status(DocStatus.PENDING) - to_process_docs.update(pendings_docs) + to_process_docs.update(pending_docs) if not to_process_docs: logger.info("All documents have been processed or are duplicates") @@ -789,10 +792,11 @@ class LightRAG: ] logger.info(f"Number of batches to process: {len(docs_batches)}.") - - tasks: list[tuple[str, DocProcessingStatus, dict[str, Any], Any]] = [] # 3. iterate over batches for batch_idx, docs_batch in enumerate(docs_batches): + logger.info( + f"Start processing batch {batch_idx + 1} of {len(docs_batches)}." + ) # 4. 
iterate over batch for doc_id_processing_status in docs_batch: doc_id, status_doc = doc_id_processing_status @@ -826,91 +830,47 @@ class LightRAG: ) } - # Prepare async tasks with full context - tasks.extend( - [ - ( - doc_status_id, - status_doc, - chunks, - self.chunks_vdb.upsert(chunks), - ), - ( - doc_status_id, - status_doc, - chunks, - self._process_entity_relation_graph(chunks), - ), - ( - doc_status_id, - status_doc, - chunks, - self.full_docs.upsert( - {doc_id: {"content": status_doc.content}} - ), - ), - ( - doc_status_id, - status_doc, - chunks, - self.text_chunks.upsert(chunks), - ), - ] - ) - - # Execute tasks as they complete - for future in asyncio.as_completed([task for _, _, _, task in tasks]): - try: - # Wait for the completed task - await future - - # Retrieve the full context of the completed task - completed_doc_status_id, status_doc, chunks, _ = next( - (doc_id, s_doc, ch, task) - for doc_id, s_doc, ch, task in tasks - if task == future - ) - - # Update status to processed - await self.doc_status.upsert( - { - completed_doc_status_id: { - "status": DocStatus.PROCESSED, - "chunks_count": len(chunks), - "content": status_doc.content, - "content_summary": status_doc.content_summary, - "content_length": status_doc.content_length, - "created_at": status_doc.created_at, - "updated_at": datetime.now().isoformat(), + # Process document (text chunks and full docs) in parallel + tasks = [ + self.chunks_vdb.upsert(chunks), + self._process_entity_relation_graph(chunks), + self.full_docs.upsert({doc_id: {"content": status_doc.content}}), + self.text_chunks.upsert(chunks), + self.doc_status.upsert( + { + doc_status_id: { + "status": DocStatus.PROCESSED, + "chunks_count": len(chunks), + "content": status_doc.content, + "content_summary": status_doc.content_summary, + "content_length": status_doc.content_length, + "created_at": status_doc.created_at, + "updated_at": datetime.now().isoformat(), + } } - } - ) - logger.info(f"Completed doc_id: {completed_doc_status_id}") - except Exception as e: - # Retrieve the context of the failed task - failed_doc_status_id, status_doc, chunks, _ = next( - (doc_id, s_doc, ch, task) - for doc_id, s_doc, ch, task in tasks - if task == future - ) - logger.error( - f"Failed to process document {failed_doc_status_id}: {str(e)}" - ) + ), + ] + try: + await asyncio.gather(*tasks) + await self._insert_done() - await self.doc_status.upsert( - { - failed_doc_status_id: { - "status": DocStatus.FAILED, - "error": str(e), - "content": status_doc.content, - "content_summary": status_doc.content_summary, - "content_length": status_doc.content_length, - "created_at": status_doc.created_at, - "updated_at": datetime.now().isoformat(), + except Exception as e: + logger.error(f"Failed to process document {doc_id}: {str(e)}") + await self.doc_status.upsert( + { + doc_status_id: { + "status": DocStatus.FAILED, + "error": str(e), + "content": status_doc.content, + "content_summary": status_doc.content_summary, + "content_length": status_doc.content_length, + "created_at": status_doc.created_at, + "updated_at": datetime.now().isoformat(), + } } - } - ) - await self._insert_done() + ) + continue + logger.info(f"Completed batch {batch_idx + 1} of {len(docs_batches)}.") async def _process_entity_relation_graph(self, chunk: dict[str, Any]) -> None: try: From efdc8a2d26e029cc24902ee7a0df6c9ce1cef7c1 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Wed, 19 Feb 2025 23:53:25 +0100 Subject: [PATCH 30/65] multi batches --- lightrag/lightrag.py | 144 
+++++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 66 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 91aacddf..fff53623 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -792,85 +792,97 @@ class LightRAG: ] logger.info(f"Number of batches to process: {len(docs_batches)}.") + + batches: list[Any] = [] # 3. iterate over batches for batch_idx, docs_batch in enumerate(docs_batches): - logger.info( - f"Start processing batch {batch_idx + 1} of {len(docs_batches)}." - ) - # 4. iterate over batch - for doc_id_processing_status in docs_batch: - doc_id, status_doc = doc_id_processing_status - # Update status in processing - doc_status_id = compute_mdhash_id(status_doc.content, prefix="doc-") - await self.doc_status.upsert( - { - doc_status_id: { - "status": DocStatus.PROCESSING, - "updated_at": datetime.now().isoformat(), - "content": status_doc.content, - "content_summary": status_doc.content_summary, - "content_length": status_doc.content_length, - "created_at": status_doc.created_at, - } - } - ) - # Generate chunks from document - chunks: dict[str, Any] = { - compute_mdhash_id(dp["content"], prefix="chunk-"): { - **dp, - "full_doc_id": doc_id, - } - for dp in self.chunking_func( - status_doc.content, - split_by_character, - split_by_character_only, - self.chunk_overlap_token_size, - self.chunk_token_size, - self.tiktoken_model_name, - ) - } - # Process document (text chunks and full docs) in parallel - tasks = [ - self.chunks_vdb.upsert(chunks), - self._process_entity_relation_graph(chunks), - self.full_docs.upsert({doc_id: {"content": status_doc.content}}), - self.text_chunks.upsert(chunks), - self.doc_status.upsert( - { - doc_status_id: { - "status": DocStatus.PROCESSED, - "chunks_count": len(chunks), - "content": status_doc.content, - "content_summary": status_doc.content_summary, - "content_length": status_doc.content_length, - "created_at": status_doc.created_at, - "updated_at": datetime.now().isoformat(), - } - } - ), - ] - try: - await asyncio.gather(*tasks) - await self._insert_done() - - except Exception as e: - logger.error(f"Failed to process document {doc_id}: {str(e)}") + async def batch( + batch_idx: int, + docs_batch: list[tuple[str, DocProcessingStatus]], + size_batch: int, + ) -> None: + logger.info(f"Start processing batch {batch_idx + 1} of {size_batch}.") + # 4. 
iterate over batch + for doc_id_processing_status in docs_batch: + doc_id, status_doc = doc_id_processing_status + # Update status in processing + doc_status_id = compute_mdhash_id(status_doc.content, prefix="doc-") await self.doc_status.upsert( { doc_status_id: { - "status": DocStatus.FAILED, - "error": str(e), + "status": DocStatus.PROCESSING, + "updated_at": datetime.now().isoformat(), "content": status_doc.content, "content_summary": status_doc.content_summary, "content_length": status_doc.content_length, "created_at": status_doc.created_at, - "updated_at": datetime.now().isoformat(), } } ) - continue - logger.info(f"Completed batch {batch_idx + 1} of {len(docs_batches)}.") + # Generate chunks from document + chunks: dict[str, Any] = { + compute_mdhash_id(dp["content"], prefix="chunk-"): { + **dp, + "full_doc_id": doc_id, + } + for dp in self.chunking_func( + status_doc.content, + split_by_character, + split_by_character_only, + self.chunk_overlap_token_size, + self.chunk_token_size, + self.tiktoken_model_name, + ) + } + + # Process document (text chunks and full docs) in parallel + tasks = [ + self.chunks_vdb.upsert(chunks), + self._process_entity_relation_graph(chunks), + self.full_docs.upsert( + {doc_id: {"content": status_doc.content}} + ), + self.text_chunks.upsert(chunks), + self.doc_status.upsert( + { + doc_status_id: { + "status": DocStatus.PROCESSED, + "chunks_count": len(chunks), + "content": status_doc.content, + "content_summary": status_doc.content_summary, + "content_length": status_doc.content_length, + "created_at": status_doc.created_at, + "updated_at": datetime.now().isoformat(), + } + } + ), + ] + try: + await asyncio.gather(*tasks) + + except Exception as e: + logger.error(f"Failed to process document {doc_id}: {str(e)}") + await self.doc_status.upsert( + { + doc_status_id: { + "status": DocStatus.FAILED, + "error": str(e), + "content": status_doc.content, + "content_summary": status_doc.content_summary, + "content_length": status_doc.content_length, + "created_at": status_doc.created_at, + "updated_at": datetime.now().isoformat(), + } + } + ) + continue + logger.info(f"Completed batch {batch_idx + 1} of {len(docs_batches)}.") + + batches.append(batch(batch_idx, docs_batch, len(docs_batches))) + + await asyncio.gather(*batches) + await self._insert_done() async def _process_entity_relation_graph(self, chunk: dict[str, Any]) -> None: try: From 80a61d7e7a1384b16b363a1a47a82613ede454d9 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 00:09:46 +0100 Subject: [PATCH 31/65] fixed the behaviour --- lightrag/lightrag.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index fff53623..156bed49 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -808,18 +808,6 @@ class LightRAG: doc_id, status_doc = doc_id_processing_status # Update status in processing doc_status_id = compute_mdhash_id(status_doc.content, prefix="doc-") - await self.doc_status.upsert( - { - doc_status_id: { - "status": DocStatus.PROCESSING, - "updated_at": datetime.now().isoformat(), - "content": status_doc.content, - "content_summary": status_doc.content_summary, - "content_length": status_doc.content_length, - "created_at": status_doc.created_at, - } - } - ) # Generate chunks from document chunks: dict[str, Any] = { compute_mdhash_id(dp["content"], prefix="chunk-"): { @@ -838,13 +826,28 @@ class LightRAG: # Process document (text chunks and full docs) 
in parallel tasks = [ + self.doc_status.upsert( + { + doc_status_id: { + "status": DocStatus.PROCESSING, + "updated_at": datetime.now().isoformat(), + "content": status_doc.content, + "content_summary": status_doc.content_summary, + "content_length": status_doc.content_length, + "created_at": status_doc.created_at, + } + } + ), self.chunks_vdb.upsert(chunks), self._process_entity_relation_graph(chunks), self.full_docs.upsert( {doc_id: {"content": status_doc.content}} ), self.text_chunks.upsert(chunks), - self.doc_status.upsert( + ] + try: + await asyncio.gather(*tasks) + await self.doc_status.upsert( { doc_status_id: { "status": DocStatus.PROCESSED, @@ -856,11 +859,7 @@ class LightRAG: "updated_at": datetime.now().isoformat(), } } - ), - ] - try: - await asyncio.gather(*tasks) - + ) except Exception as e: logger.error(f"Failed to process document {doc_id}: {str(e)}") await self.doc_status.upsert( From fa99d72269dea53324472d72aff3c3ace1c0e805 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 00:19:47 +0100 Subject: [PATCH 32/65] added lock --- lightrag/lightrag.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 156bed49..22686ab2 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -7,7 +7,7 @@ from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial from typing import Any, AsyncIterator, Callable, Iterator, cast - +from asyncio import Lock from .base import ( BaseGraphStorage, BaseKVStorage, @@ -357,6 +357,9 @@ class LightRAG: convert_response_to_json ) + # Lock for entity extraction + _entity_lock = Lock() + # Custom Chunking Function chunking_func: Callable[ [ @@ -823,7 +826,6 @@ class LightRAG: self.tiktoken_model_name, ) } - # Process document (text chunks and full docs) in parallel tasks = [ self.doc_status.upsert( @@ -896,8 +898,9 @@ class LightRAG: if new_kg is None: logger.info("No new entities or relationships extracted.") else: - logger.info("New entities or relationships extracted.") - self.chunk_entity_relation_graph = new_kg + async with self._entity_lock: + logger.info("New entities or relationships extracted.") + self.chunk_entity_relation_graph = new_kg except Exception as e: logger.error("Failed to extract entities and relationships") From c8eaa45024c97609f2359c1db007eac86359b645 Mon Sep 17 00:00:00 2001 From: zrguo <zrguo.bupt@qq.com> Date: Thu, 20 Feb 2025 14:42:13 +0800 Subject: [PATCH 33/65] add Chinese template --- .github/ISSUE_TEMPLATE/bug_report_zh.yml | 61 +++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request_zh.yml | 26 ++++++++ .github/ISSUE_TEMPLATE/question_zh.yml | 26 ++++++++ 3 files changed, 113 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report_zh.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request_zh.yml create mode 100644 .github/ISSUE_TEMPLATE/question_zh.yml diff --git a/.github/ISSUE_TEMPLATE/bug_report_zh.yml b/.github/ISSUE_TEMPLATE/bug_report_zh.yml new file mode 100644 index 00000000..191f96e2 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report_zh.yml @@ -0,0 +1,61 @@ +name: Bug Report +description: 提交 bug +title: "[Bug]: <title>" +labels: ["bug", "triage"] + +body: + - type: checkboxes + id: existingcheck + attributes: + label: 是否需要提交问题? 
+ description: 请通过以下步骤避免重复和常见的 bug。 + options: + - label: 我已经搜索过现有的问题,这个 bug 还没有被报告。 + - label: 我认为这是一个合法的 bug,而不是一个问题或功能请求。 + - type: textarea + id: description + attributes: + label: 描述 bug + description: 清晰简洁地描述 bug 的问题。 + placeholder: 出现了什么问题? + - type: textarea + id: reproduce + attributes: + label: 重现步骤 + description: 重现此问题的步骤。 + placeholder: 我们如何复现此问题? + - type: textarea + id: expected_behavior + attributes: + label: 预期行为 + description: 清晰简洁地描述你预期的结果。 + placeholder: 应该发生什么? + - type: textarea + id: configused + attributes: + label: 使用的 LightRAG 配置 + description: 运行时使用的 LightRAG 配置。 + placeholder: 配置内容或 LightRAG 配置 + value: | + # 请粘贴你的配置内容 + - type: textarea + id: screenshotslogs + attributes: + label: 日志和截图 + description: 如有需要,请添加截图和日志以帮助解释你的问题。 + placeholder: 在此添加日志和截图 + - type: textarea + id: additional_information + attributes: + label: 其他信息 + description: | + - LightRAG 版本:例如 v0.1.1 + - 操作系统:例如 Windows 10,Ubuntu 20.04 + - Python 版本:例如 3.8 + - 相关问题:例如 #1 + - 其他相关信息。 + value: | + - LightRAG 版本: + - 操作系统: + - Python 版本: + - 相关问题: diff --git a/.github/ISSUE_TEMPLATE/feature_request_zh.yml b/.github/ISSUE_TEMPLATE/feature_request_zh.yml new file mode 100644 index 00000000..8d3bf558 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request_zh.yml @@ -0,0 +1,26 @@ +name: Feature Request +description: 提交功能请求 +labels: ["enhancement"] +title: "[Feature Request]: <title>" + +body: + - type: checkboxes + id: existingcheck + attributes: + label: 是否需要提交功能请求? + description: 请通过以下步骤避免重复和常见的功能请求。 + options: + - label: 我已经搜索过现有的功能请求,这个功能请求还没有被报告。 + - label: 我认为这是一个合法的功能请求,而不是一个问题或 bug。 + - type: textarea + id: feature_request_description + attributes: + label: 功能请求描述 + description: 清晰简洁地描述你希望的功能请求。 + placeholder: 这个功能请求会增加或改进什么? + - type: textarea + id: additional_context + attributes: + label: 额外上下文 + description: 在此添加任何关于功能请求的其他上下文或截图。 + placeholder: 任何其他信息 diff --git a/.github/ISSUE_TEMPLATE/question_zh.yml b/.github/ISSUE_TEMPLATE/question_zh.yml new file mode 100644 index 00000000..f2d5d108 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question_zh.yml @@ -0,0 +1,26 @@ +name: Question +description: 提交问题 +labels: ["question"] +title: "[Question]: <title>" + +body: + - type: checkboxes + id: existingcheck + attributes: + label: 是否需要提问? + description: 请通过以下步骤避免重复和常见的问题。 + options: + - label: 我已经搜索过现有的问题和讨论,这个问题还没有被回答。 + - label: 我认为这是一个合法的问题,而不是一个 bug 或功能请求。 + - type: textarea + id: question + attributes: + label: 你的问题 + description: 清晰简洁地描述你的问题。 + placeholder: 你的问题是什么? 
+ - type: textarea + id: context + attributes: + label: 额外上下文 + description: 提供任何可能帮助我们更好理解你的问题的额外上下文或细节。 + placeholder: 在此添加任何相关信息 From fbe7e9994d7a667031e8cb38553a9e1c7b0b13db Mon Sep 17 00:00:00 2001 From: zrguo <zrguo.bupt@qq.com> Date: Thu, 20 Feb 2025 14:44:06 +0800 Subject: [PATCH 34/65] revert --- .github/ISSUE_TEMPLATE/bug_report_zh.yml | 61 ------------------- .github/ISSUE_TEMPLATE/feature_request_zh.yml | 26 -------- .github/ISSUE_TEMPLATE/question_zh.yml | 26 -------- 3 files changed, 113 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/bug_report_zh.yml delete mode 100644 .github/ISSUE_TEMPLATE/feature_request_zh.yml delete mode 100644 .github/ISSUE_TEMPLATE/question_zh.yml diff --git a/.github/ISSUE_TEMPLATE/bug_report_zh.yml b/.github/ISSUE_TEMPLATE/bug_report_zh.yml deleted file mode 100644 index 191f96e2..00000000 --- a/.github/ISSUE_TEMPLATE/bug_report_zh.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: Bug Report -description: 提交 bug -title: "[Bug]: <title>" -labels: ["bug", "triage"] - -body: - - type: checkboxes - id: existingcheck - attributes: - label: 是否需要提交问题? - description: 请通过以下步骤避免重复和常见的 bug。 - options: - - label: 我已经搜索过现有的问题,这个 bug 还没有被报告。 - - label: 我认为这是一个合法的 bug,而不是一个问题或功能请求。 - - type: textarea - id: description - attributes: - label: 描述 bug - description: 清晰简洁地描述 bug 的问题。 - placeholder: 出现了什么问题? - - type: textarea - id: reproduce - attributes: - label: 重现步骤 - description: 重现此问题的步骤。 - placeholder: 我们如何复现此问题? - - type: textarea - id: expected_behavior - attributes: - label: 预期行为 - description: 清晰简洁地描述你预期的结果。 - placeholder: 应该发生什么? - - type: textarea - id: configused - attributes: - label: 使用的 LightRAG 配置 - description: 运行时使用的 LightRAG 配置。 - placeholder: 配置内容或 LightRAG 配置 - value: | - # 请粘贴你的配置内容 - - type: textarea - id: screenshotslogs - attributes: - label: 日志和截图 - description: 如有需要,请添加截图和日志以帮助解释你的问题。 - placeholder: 在此添加日志和截图 - - type: textarea - id: additional_information - attributes: - label: 其他信息 - description: | - - LightRAG 版本:例如 v0.1.1 - - 操作系统:例如 Windows 10,Ubuntu 20.04 - - Python 版本:例如 3.8 - - 相关问题:例如 #1 - - 其他相关信息。 - value: | - - LightRAG 版本: - - 操作系统: - - Python 版本: - - 相关问题: diff --git a/.github/ISSUE_TEMPLATE/feature_request_zh.yml b/.github/ISSUE_TEMPLATE/feature_request_zh.yml deleted file mode 100644 index 8d3bf558..00000000 --- a/.github/ISSUE_TEMPLATE/feature_request_zh.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: Feature Request -description: 提交功能请求 -labels: ["enhancement"] -title: "[Feature Request]: <title>" - -body: - - type: checkboxes - id: existingcheck - attributes: - label: 是否需要提交功能请求? - description: 请通过以下步骤避免重复和常见的功能请求。 - options: - - label: 我已经搜索过现有的功能请求,这个功能请求还没有被报告。 - - label: 我认为这是一个合法的功能请求,而不是一个问题或 bug。 - - type: textarea - id: feature_request_description - attributes: - label: 功能请求描述 - description: 清晰简洁地描述你希望的功能请求。 - placeholder: 这个功能请求会增加或改进什么? - - type: textarea - id: additional_context - attributes: - label: 额外上下文 - description: 在此添加任何关于功能请求的其他上下文或截图。 - placeholder: 任何其他信息 diff --git a/.github/ISSUE_TEMPLATE/question_zh.yml b/.github/ISSUE_TEMPLATE/question_zh.yml deleted file mode 100644 index f2d5d108..00000000 --- a/.github/ISSUE_TEMPLATE/question_zh.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: Question -description: 提交问题 -labels: ["question"] -title: "[Question]: <title>" - -body: - - type: checkboxes - id: existingcheck - attributes: - label: 是否需要提问? 
- description: 请通过以下步骤避免重复和常见的问题。 - options: - - label: 我已经搜索过现有的问题和讨论,这个问题还没有被回答。 - - label: 我认为这是一个合法的问题,而不是一个 bug 或功能请求。 - - type: textarea - id: question - attributes: - label: 你的问题 - description: 清晰简洁地描述你的问题。 - placeholder: 你的问题是什么? - - type: textarea - id: context - attributes: - label: 额外上下文 - description: 提供任何可能帮助我们更好理解你的问题的额外上下文或细节。 - placeholder: 在此添加任何相关信息 From 0b941178482691394a9d2385fdb8b530f9818f84 Mon Sep 17 00:00:00 2001 From: Pankaj Kaushal <pankaj@getcalmo.com> Date: Wed, 19 Feb 2025 11:38:50 +0100 Subject: [PATCH 35/65] Add LlamaIndex LLM implementation module - Implemented LlamaIndex interface for language model interactions - Added async chat completion support - Included embedding generation functionality - Implemented retry mechanisms for API calls - Added configuration and message formatting utilities - Supports OpenAI-style message handling and external settings --- lightrag/llm/llama_index_impl.py | 249 +++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 lightrag/llm/llama_index_impl.py diff --git a/lightrag/llm/llama_index_impl.py b/lightrag/llm/llama_index_impl.py new file mode 100644 index 00000000..f6667c00 --- /dev/null +++ b/lightrag/llm/llama_index_impl.py @@ -0,0 +1,249 @@ +""" +LlamaIndex LLM Interface Module +========================== + +This module provides interfaces for interacting with LlamaIndex's language models, +including text generation and embedding capabilities. + +Author: Lightrag team +Created: 2024-03-19 +License: MIT License + +Version: 1.0.0 + +Change Log: +- 1.0.0 (2024-03-19): Initial release + * Added async chat completion support + * Added embedding generation + * Added stream response capability + * Added support for external settings configuration + * Added OpenAI-style message handling + +Dependencies: + - llama_index + - numpy + - pipmaster + - Python >= 3.10 + +Usage: + from lightrag.llm.llama_index_impl import llama_index_complete, llama_index_embed +""" + +__version__ = "1.0.0" +__author__ = "lightrag Team" +__status__ = "Production" + +import pipmaster as pm +from core.logging_config import setup_logger +from llama_index.core.llms import ( + ChatMessage, + MessageRole, + ChatResponse, +) +from typing import List, Optional + +# Install required dependencies +if not pm.is_installed("llama-index"): + pm.install("llama-index") + +from llama_index.core.embeddings import BaseEmbedding +from llama_index.core.settings import Settings as LlamaIndexSettings +from tenacity import ( + retry, + stop_after_attempt, + wait_exponential, + retry_if_exception_type, +) +from lightrag.utils import ( + wrap_embedding_func_with_attrs, + locate_json_string_body_from_string, +) +from lightrag.exceptions import ( + APIConnectionError, + RateLimitError, + APITimeoutError, +) +import numpy as np + +logger = setup_logger("lightrag.llm.llama_index_impl") + + +def configure_llama_index(settings: LlamaIndexSettings = None, **kwargs): + """ + Configure LlamaIndex settings. + + Args: + settings: LlamaIndex Settings instance. If None, uses default settings. 
+ **kwargs: Additional settings to override/configure + """ + if settings is None: + settings = LlamaIndexSettings() + + # Update settings with any provided kwargs + for key, value in kwargs.items(): + if hasattr(settings, key): + setattr(settings, key, value) + else: + logger.warning(f"Unknown LlamaIndex setting: {key}") + + # Set as global settings + LlamaIndexSettings.set_global(settings) + return settings + + +def format_chat_messages(messages): + """Format chat messages into LlamaIndex format.""" + formatted_messages = [] + + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + + if role == "system": + formatted_messages.append( + ChatMessage(role=MessageRole.SYSTEM, content=content) + ) + elif role == "assistant": + formatted_messages.append( + ChatMessage(role=MessageRole.ASSISTANT, content=content) + ) + elif role == "user": + formatted_messages.append( + ChatMessage(role=MessageRole.USER, content=content) + ) + else: + logger.warning(f"Unknown role {role}, treating as user message") + formatted_messages.append( + ChatMessage(role=MessageRole.USER, content=content) + ) + + return formatted_messages + + +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=60), + retry=retry_if_exception_type( + (RateLimitError, APIConnectionError, APITimeoutError) + ), +) +async def llama_index_complete_if_cache( + model: str, + prompt: str, + system_prompt: Optional[str] = None, + history_messages: List[dict] = [], + **kwargs, +) -> str: + """Complete the prompt using LlamaIndex.""" + try: + # Format messages for chat + formatted_messages = [] + + # Add system message if provided + if system_prompt: + formatted_messages.append( + ChatMessage(role=MessageRole.SYSTEM, content=system_prompt) + ) + + # Add history messages + for msg in history_messages: + formatted_messages.append( + ChatMessage( + role=MessageRole.USER + if msg["role"] == "user" + else MessageRole.ASSISTANT, + content=msg["content"], + ) + ) + + # Add current prompt + formatted_messages.append(ChatMessage(role=MessageRole.USER, content=prompt)) + + # Get LLM instance from kwargs + if "llm_instance" not in kwargs: + raise ValueError("llm_instance must be provided in kwargs") + llm = kwargs["llm_instance"] + + # Get response + response: ChatResponse = await llm.achat(messages=formatted_messages) + + # In newer versions, the response is in message.content + content = response.message.content + return content + + except Exception as e: + logger.error(f"Error in llama_index_complete_if_cache: {str(e)}") + raise + + +async def llama_index_complete( + prompt, + system_prompt=None, + history_messages=None, + keyword_extraction=False, + settings: LlamaIndexSettings = None, + **kwargs, +) -> str: + """ + Main completion function for LlamaIndex + + Args: + prompt: Input prompt + system_prompt: Optional system prompt + history_messages: Optional chat history + keyword_extraction: Whether to extract keywords from response + settings: Optional LlamaIndex settings + **kwargs: Additional arguments + """ + if history_messages is None: + history_messages = [] + + keyword_extraction = kwargs.pop("keyword_extraction", None) + result = await llama_index_complete_if_cache( + kwargs.get("llm_instance"), + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + if keyword_extraction: + return locate_json_string_body_from_string(result) + return result + + +@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) +@retry( + 
stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=60), + retry=retry_if_exception_type( + (RateLimitError, APIConnectionError, APITimeoutError) + ), +) +async def llama_index_embed( + texts: list[str], + embed_model: BaseEmbedding = None, + settings: LlamaIndexSettings = None, + **kwargs, +) -> np.ndarray: + """ + Generate embeddings using LlamaIndex + + Args: + texts: List of texts to embed + embed_model: LlamaIndex embedding model + settings: Optional LlamaIndex settings + **kwargs: Additional arguments + """ + if settings: + configure_llama_index(settings) + + if embed_model is None: + raise ValueError("embed_model must be provided") + + # LlamaIndex's embed_query returns a list of floats + embeddings = [] + for text in texts: + embedding = await embed_model.aembed_query(text) + embeddings.append(embedding) + + return np.array(embeddings) From 3b25e32e8d4dc2de95805a50e29092b64fd92254 Mon Sep 17 00:00:00 2001 From: Pankaj Kaushal <pankaj@getcalmo.com> Date: Wed, 19 Feb 2025 12:33:01 +0100 Subject: [PATCH 36/65] Removed verbose module-level documentation --- lightrag/llm/llama_index_impl.py | 35 -------------------------------- 1 file changed, 35 deletions(-) diff --git a/lightrag/llm/llama_index_impl.py b/lightrag/llm/llama_index_impl.py index f6667c00..7eea441a 100644 --- a/lightrag/llm/llama_index_impl.py +++ b/lightrag/llm/llama_index_impl.py @@ -1,38 +1,3 @@ -""" -LlamaIndex LLM Interface Module -========================== - -This module provides interfaces for interacting with LlamaIndex's language models, -including text generation and embedding capabilities. - -Author: Lightrag team -Created: 2024-03-19 -License: MIT License - -Version: 1.0.0 - -Change Log: -- 1.0.0 (2024-03-19): Initial release - * Added async chat completion support - * Added embedding generation - * Added stream response capability - * Added support for external settings configuration - * Added OpenAI-style message handling - -Dependencies: - - llama_index - - numpy - - pipmaster - - Python >= 3.10 - -Usage: - from lightrag.llm.llama_index_impl import llama_index_complete, llama_index_embed -""" - -__version__ = "1.0.0" -__author__ = "lightrag Team" -__status__ = "Production" - import pipmaster as pm from core.logging_config import setup_logger from llama_index.core.llms import ( From 8a06be9395f7209337b2b575d30be220ac3b823f Mon Sep 17 00:00:00 2001 From: Pankaj Kaushal <pankaj@getcalmo.com> Date: Wed, 19 Feb 2025 14:54:12 +0100 Subject: [PATCH 37/65] Add LlamaIndex Wrapper and Example Implementations - Updated README.md with new Wrappers section detailing LlamaIndex integration - Added LlamaIndex wrapper implementation in `lightrag/wrapper/llama_index_impl.py` - Created two example scripts demonstrating LlamaIndex usage: - Direct OpenAI integration - LiteLLM proxy integration - Added wrapper documentation in `lightrag/wrapper/Readme.md` - Included comprehensive usage examples and configuration details --- README.md | 38 ++++ ...g_api_llamaindex_direct_demo_simplified.py | 98 +++++++++ ..._api_llamaindex_litellm_demo_simplified.py | 102 +++++++++ lightrag/wrapper/Readme.md | 177 +++++++++++++++ lightrag/wrapper/__init__.py | 0 lightrag/wrapper/llama_index_impl.py | 207 ++++++++++++++++++ 6 files changed, 622 insertions(+) create mode 100644 examples/lightrag_api_llamaindex_direct_demo_simplified.py create mode 100644 examples/lightrag_api_llamaindex_litellm_demo_simplified.py create mode 100644 lightrag/wrapper/Readme.md create mode 100644 lightrag/wrapper/__init__.py create mode 
100644 lightrag/wrapper/llama_index_impl.py diff --git a/README.md b/README.md index 97d6279c..432261f7 100644 --- a/README.md +++ b/README.md @@ -312,7 +312,45 @@ rag = LightRAG( In order to run this experiment on low RAM GPU you should select small model and tune context window (increasing context increase memory consumption). For example, running this ollama example on repurposed mining GPU with 6Gb of RAM required to set context size to 26k while using `gemma2:2b`. It was able to find 197 entities and 19 relations on `book.txt`. </details> +<details> +<summary> <b>Wrappers</b> </summary> +LightRAG supports integration with various frameworks and model providers through wrappers. These wrappers provide a consistent interface while abstracting away the specifics of each framework. + +### Current Wrappers + +1. **LlamaIndex** (`wrapper/llama_index_impl.py`): + - Integrates with OpenAI and other providers through LlamaIndex + - Supports both direct API access and proxy services like LiteLLM + - Provides consistent interfaces for embeddings and completions + - See [LlamaIndex Wrapper Documentation](lightrag/wrapper/Readme.md) for detailed setup and examples + +### Example Usage + +```python +# Using LlamaIndex with direct OpenAI access +from lightrag import LightRAG +from lightrag.wrapper.llama_index_impl import llama_index_complete_if_cache, llama_index_embed +from llama_index.embeddings.openai import OpenAIEmbedding +from llama_index.llms.openai import OpenAI + +rag = LightRAG( + working_dir="your/path", + llm_model_func=llm_model_func, # LlamaIndex-compatible completion function + embedding_func=EmbeddingFunc( # LlamaIndex-compatible embedding function + embedding_dim=1536, + max_token_size=8192, + func=lambda texts: llama_index_embed(texts, embed_model=embed_model) + ), +) +``` + +#### For detailed documentation and examples, see: +- [LlamaIndex Wrapper Documentation](lightrag/wrapper/Readme.md) +- [Direct OpenAI Example](examples/lightrag_api_llamaindex_direct_demo_simplified.py) +- [LiteLLM Proxy Example](examples/lightrag_api_llamaindex_litellm_demo_simplified.py) + +</details> <details> <summary> <b>Conversation History Support</b> </summary> diff --git a/examples/lightrag_api_llamaindex_direct_demo_simplified.py b/examples/lightrag_api_llamaindex_direct_demo_simplified.py new file mode 100644 index 00000000..50dfec96 --- /dev/null +++ b/examples/lightrag_api_llamaindex_direct_demo_simplified.py @@ -0,0 +1,98 @@ +import os +from lightrag import LightRAG, QueryParam +from lightrag.wrapper.llama_index_impl import llama_index_complete_if_cache, llama_index_embed +from lightrag.utils import EmbeddingFunc +from llama_index.llms.openai import OpenAI +from llama_index.embeddings.openai import OpenAIEmbedding +import asyncio + +# Configure working directory +DEFAULT_RAG_DIR = "index_default" +WORKING_DIR = os.environ.get("RAG_DIR", f"{DEFAULT_RAG_DIR}") +print(f"WORKING_DIR: {WORKING_DIR}") + +# Model configuration +LLM_MODEL = os.environ.get("LLM_MODEL", "gpt-4") +print(f"LLM_MODEL: {LLM_MODEL}") +EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-small") +print(f"EMBEDDING_MODEL: {EMBEDDING_MODEL}") +EMBEDDING_MAX_TOKEN_SIZE = int(os.environ.get("EMBEDDING_MAX_TOKEN_SIZE", 8192)) +print(f"EMBEDDING_MAX_TOKEN_SIZE: {EMBEDDING_MAX_TOKEN_SIZE}") + +# OpenAI configuration +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your-api-key-here") + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +# Initialize LLM function +async def llm_model_func(prompt, 
system_prompt=None, history_messages=[], **kwargs): + try: + # Initialize OpenAI if not in kwargs + if 'llm_instance' not in kwargs: + llm_instance = OpenAI( + model=LLM_MODEL, + api_key=OPENAI_API_KEY, + temperature=0.7, + ) + kwargs['llm_instance'] = llm_instance + + response = await llama_index_complete_if_cache( + kwargs['llm_instance'], + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + return response + except Exception as e: + print(f"LLM request failed: {str(e)}") + raise + +# Initialize embedding function +async def embedding_func(texts): + try: + embed_model = OpenAIEmbedding( + model=EMBEDDING_MODEL, + api_key=OPENAI_API_KEY, + ) + return await llama_index_embed(texts, embed_model=embed_model) + except Exception as e: + print(f"Embedding failed: {str(e)}") + raise + +# Get embedding dimension +async def get_embedding_dim(): + test_text = ["This is a test sentence."] + embedding = await embedding_func(test_text) + embedding_dim = embedding.shape[1] + print(f"embedding_dim={embedding_dim}") + return embedding_dim + +# Initialize RAG instance +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=asyncio.run(get_embedding_dim()), + max_token_size=EMBEDDING_MAX_TOKEN_SIZE, + func=embedding_func, + ), +) + +# Insert example text +with open("./book.txt", "r", encoding="utf-8") as f: + rag.insert(f.read()) + +# Test different query modes +print("\nNaive Search:") +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) + +print("\nLocal Search:") +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) + +print("\nGlobal Search:") +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) + +print("\nHybrid Search:") +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) \ No newline at end of file diff --git a/examples/lightrag_api_llamaindex_litellm_demo_simplified.py b/examples/lightrag_api_llamaindex_litellm_demo_simplified.py new file mode 100644 index 00000000..11bdeba8 --- /dev/null +++ b/examples/lightrag_api_llamaindex_litellm_demo_simplified.py @@ -0,0 +1,102 @@ +import os +from lightrag import LightRAG, QueryParam +from lightrag.wrapper.llama_index_impl import llama_index_complete_if_cache, llama_index_embed +from lightrag.utils import EmbeddingFunc +from llama_index.llms.litellm import LiteLLM +from llama_index.embeddings.litellm import LiteLLMEmbedding +import asyncio + +# Configure working directory +DEFAULT_RAG_DIR = "index_default" +WORKING_DIR = os.environ.get("RAG_DIR", f"{DEFAULT_RAG_DIR}") +print(f"WORKING_DIR: {WORKING_DIR}") + +# Model configuration +LLM_MODEL = os.environ.get("LLM_MODEL", "gpt-4o") +print(f"LLM_MODEL: {LLM_MODEL}") +EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "embedding-model") +print(f"EMBEDDING_MODEL: {EMBEDDING_MODEL}") +EMBEDDING_MAX_TOKEN_SIZE = int(os.environ.get("EMBEDDING_MAX_TOKEN_SIZE", 8192)) +print(f"EMBEDDING_MAX_TOKEN_SIZE: {EMBEDDING_MAX_TOKEN_SIZE}") + +# LiteLLM configuration +LITELLM_URL = os.environ.get("LITELLM_URL", "http://localhost:4000") +print(f"LITELLM_URL: {LITELLM_URL}") +LITELLM_KEY = os.environ.get("LITELLM_KEY", "sk-1234") + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +# Initialize LLM function +async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs): + try: + # Initialize LiteLLM if not in kwargs + if 
'llm_instance' not in kwargs: + llm_instance = LiteLLM( + model=f"openai/{LLM_MODEL}", # Format: "provider/model_name" + api_base=LITELLM_URL, + api_key=LITELLM_KEY, + temperature=0.7, + ) + kwargs['llm_instance'] = llm_instance + + response = await llama_index_complete_if_cache( + kwargs['llm_instance'], + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + return response + except Exception as e: + print(f"LLM request failed: {str(e)}") + raise + +# Initialize embedding function +async def embedding_func(texts): + try: + embed_model = LiteLLMEmbedding( + model_name=f"openai/{EMBEDDING_MODEL}", + api_base=LITELLM_URL, + api_key=LITELLM_KEY, + ) + return await llama_index_embed(texts, embed_model=embed_model) + except Exception as e: + print(f"Embedding failed: {str(e)}") + raise + +# Get embedding dimension +async def get_embedding_dim(): + test_text = ["This is a test sentence."] + embedding = await embedding_func(test_text) + embedding_dim = embedding.shape[1] + print(f"embedding_dim={embedding_dim}") + return embedding_dim + +# Initialize RAG instance +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=asyncio.run(get_embedding_dim()), + max_token_size=EMBEDDING_MAX_TOKEN_SIZE, + func=embedding_func, + ), +) + +# Insert example text +with open("./book.txt", "r", encoding="utf-8") as f: + rag.insert(f.read()) + +# Test different query modes +print("\nNaive Search:") +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) + +print("\nLocal Search:") +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) + +print("\nGlobal Search:") +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) + +print("\nHybrid Search:") +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) \ No newline at end of file diff --git a/lightrag/wrapper/Readme.md b/lightrag/wrapper/Readme.md new file mode 100644 index 00000000..ece56458 --- /dev/null +++ b/lightrag/wrapper/Readme.md @@ -0,0 +1,177 @@ +## Wrapper Directory + +The `wrapper` directory contains integrations with different frameworks. These wrappers provide a consistent interface to LightRAG while abstracting away the specifics of each framework. + +## Wrapper Directory Structure + +``` +lightrag/ +├── wrapper/ # Wrappers for different model providers and frameworks +│ ├── llama_index_impl.py # LlamaIndex integration for embeddings and completions +│ └── ... # Other framework wrappers +├── kg/ # Knowledge graph implementations +├── utils/ # Utility functions and helpers +└── ... +``` +Current wrappers: + +1. **LlamaIndex** (`wrapper/llama_index.py`): + - Provides integration with OpenAI and other providers through LlamaIndex + - Supports both direct API access and proxy services like LiteLLM + - Handles embeddings and completions with consistent interfaces + - See example implementations: + - [Direct OpenAI Usage](../examples/lightrag_api_llamaindex_direct_demo_simplified.py) + - [LiteLLM Proxy Usage](../examples/lightrag_api_llamaindex_litellm_demo_simplified.py) + +<details> +<summary> <b>Using LlamaIndex</b> </summary> + +LightRAG supports LlamaIndex for embeddings and completions in two ways: direct OpenAI usage or through LiteLLM proxy. 
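+ +Both integration paths are shown below. One caveat up front: the direct-OpenAI snippets additionally rely on LlamaIndex's OpenAI integrations, so a setup along these lines is assumed alongside the LiteLLM packages (package names follow the standard LlamaIndex integration naming; adjust to your environment): + +```bash +# Assumed prerequisite for the direct OpenAI path +pip install llama-index-llms-openai llama-index-embeddings-openai +``` + +The LiteLLM snippets below also reference a `settings` object; treat `settings.LLM_MODEL`, `settings.LITELLM_URL`, and `settings.LITELLM_KEY` as placeholders for your own configuration.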
+ +### Setup + +First, install the required dependencies: +```bash +pip install llama-index-llms-litellm llama-index-embeddings-litellm +``` + +### Standard OpenAI Usage + +```python +from lightrag import LightRAG +from lightrag.wrapper.llama_index_impl import llama_index_complete_if_cache, llama_index_embed +from llama_index.embeddings.openai import OpenAIEmbedding +from llama_index.llms.openai import OpenAI +from lightrag.utils import EmbeddingFunc + +# Initialize with direct OpenAI access +async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs): + try: + # Initialize OpenAI if not in kwargs + if 'llm_instance' not in kwargs: + llm_instance = OpenAI( + model="gpt-4", + api_key="your-openai-key", + temperature=0.7, + ) + kwargs['llm_instance'] = llm_instance + + response = await llama_index_complete_if_cache( + kwargs['llm_instance'], + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + return response + except Exception as e: + logger.error(f"LLM request failed: {str(e)}") + raise + +# Initialize LightRAG with OpenAI +rag = LightRAG( + working_dir="your/path", + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=1536, + max_token_size=8192, + func=lambda texts: llama_index_embed( + texts, + embed_model=OpenAIEmbedding( + model="text-embedding-3-large", + api_key="your-openai-key" + ) + ), + ), +) +``` + +### Using LiteLLM Proxy + +1. Use any LLM provider through LiteLLM +2. Leverage LlamaIndex's embedding and completion capabilities +3. Maintain consistent configuration across services + +```python +from lightrag import LightRAG +from lightrag.wrapper.llama_index_impl import llama_index_complete_if_cache, llama_index_embed +from llama_index.llms.litellm import LiteLLM +from llama_index.embeddings.litellm import LiteLLMEmbedding +from lightrag.utils import EmbeddingFunc + +# Initialize with LiteLLM proxy +async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs): + try: + # Initialize LiteLLM if not in kwargs + if 'llm_instance' not in kwargs: + llm_instance = LiteLLM( + model=f"openai/{settings.LLM_MODEL}", # Format: "provider/model_name" + api_base=settings.LITELLM_URL, + api_key=settings.LITELLM_KEY, + temperature=0.7, + ) + kwargs['llm_instance'] = llm_instance + + response = await llama_index_complete_if_cache( + kwargs['llm_instance'], + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + return response + except Exception as e: + logger.error(f"LLM request failed: {str(e)}") + raise + +# Initialize LightRAG with LiteLLM +rag = LightRAG( + working_dir="your/path", + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=1536, + max_token_size=8192, + func=lambda texts: llama_index_embed( + texts, + embed_model=LiteLLMEmbedding( + model_name=f"openai/{settings.EMBEDDING_MODEL}", + api_base=settings.LITELLM_URL, + api_key=settings.LITELLM_KEY, + ) + ), + ), +) +``` + +### Environment Variables + +For OpenAI direct usage: +```bash +OPENAI_API_KEY=your-openai-key +``` + +For LiteLLM proxy: +```bash +# LiteLLM Configuration +LITELLM_URL=http://litellm:4000 +LITELLM_KEY=your-litellm-key + +# Model Configuration +LLM_MODEL=gpt-4 +EMBEDDING_MODEL=text-embedding-3-large +EMBEDDING_MAX_TOKEN_SIZE=8192 +``` + +### Key Differences +1. **Direct OpenAI**: + - Simpler setup + - Direct API access + - Requires OpenAI API key + +2. 
**LiteLLM Proxy**: + - Model provider agnostic + - Centralized API key management + - Support for multiple providers + - Better cost control and monitoring + +</details> diff --git a/lightrag/wrapper/__init__.py b/lightrag/wrapper/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/lightrag/wrapper/llama_index_impl.py b/lightrag/wrapper/llama_index_impl.py new file mode 100644 index 00000000..f79dade5 --- /dev/null +++ b/lightrag/wrapper/llama_index_impl.py @@ -0,0 +1,207 @@ +import pipmaster as pm +from llama_index.core.llms import ( + ChatMessage, + MessageRole, + ChatResponse, +) +from typing import List, Optional + +# Install required dependencies +if not pm.is_installed("llama-index"): + pm.install("llama-index") + +from llama_index.core.embeddings import BaseEmbedding +from llama_index.core.settings import Settings as LlamaIndexSettings +from tenacity import ( + retry, + stop_after_attempt, + wait_exponential, + retry_if_exception_type, +) +from lightrag.utils import ( + wrap_embedding_func_with_attrs, + locate_json_string_body_from_string, +) +from lightrag.exceptions import ( + APIConnectionError, + RateLimitError, + APITimeoutError, +) +import numpy as np + + +def configure_llama_index(settings: LlamaIndexSettings = None, **kwargs): + """ + Configure LlamaIndex settings. + + Args: + settings: LlamaIndex Settings instance. If None, uses default settings. + **kwargs: Additional settings to override/configure + """ + if settings is None: + settings = LlamaIndexSettings() + + # Update settings with any provided kwargs + for key, value in kwargs.items(): + if hasattr(settings, key): + setattr(settings, key, value) + else: + logger.warning(f"Unknown LlamaIndex setting: {key}") + + # Set as global settings + LlamaIndexSettings.set_global(settings) + return settings + + +def format_chat_messages(messages): + """Format chat messages into LlamaIndex format.""" + formatted_messages = [] + + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + + if role == "system": + formatted_messages.append( + ChatMessage(role=MessageRole.SYSTEM, content=content) + ) + elif role == "assistant": + formatted_messages.append( + ChatMessage(role=MessageRole.ASSISTANT, content=content) + ) + elif role == "user": + formatted_messages.append( + ChatMessage(role=MessageRole.USER, content=content) + ) + else: + logger.warning(f"Unknown role {role}, treating as user message") + formatted_messages.append( + ChatMessage(role=MessageRole.USER, content=content) + ) + + return formatted_messages + + +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=60), + retry=retry_if_exception_type( + (RateLimitError, APIConnectionError, APITimeoutError) + ), +) +async def llama_index_complete_if_cache( + model: str, + prompt: str, + system_prompt: Optional[str] = None, + history_messages: List[dict] = [], + **kwargs, +) -> str: + """Complete the prompt using LlamaIndex.""" + try: + # Format messages for chat + formatted_messages = [] + + # Add system message if provided + if system_prompt: + formatted_messages.append( + ChatMessage(role=MessageRole.SYSTEM, content=system_prompt) + ) + + # Add history messages + for msg in history_messages: + formatted_messages.append( + ChatMessage( + role=MessageRole.USER + if msg["role"] == "user" + else MessageRole.ASSISTANT, + content=msg["content"], + ) + ) + + # Add current prompt + formatted_messages.append(ChatMessage(role=MessageRole.USER, content=prompt)) + + # Get LLM instance from kwargs + if 
"llm_instance" not in kwargs: + raise ValueError("llm_instance must be provided in kwargs") + llm = kwargs["llm_instance"] + + # Get response + response: ChatResponse = await llm.achat(messages=formatted_messages) + + # In newer versions, the response is in message.content + content = response.message.content + return content + + except Exception as e: + logger.error(f"Error in llama_index_complete_if_cache: {str(e)}") + raise + + +async def llama_index_complete( + prompt, + system_prompt=None, + history_messages=None, + keyword_extraction=False, + settings: LlamaIndexSettings = None, + **kwargs, +) -> str: + """ + Main completion function for LlamaIndex + + Args: + prompt: Input prompt + system_prompt: Optional system prompt + history_messages: Optional chat history + keyword_extraction: Whether to extract keywords from response + settings: Optional LlamaIndex settings + **kwargs: Additional arguments + """ + if history_messages is None: + history_messages = [] + + keyword_extraction = kwargs.pop("keyword_extraction", None) + result = await llama_index_complete_if_cache( + kwargs.get("llm_instance"), + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + if keyword_extraction: + return locate_json_string_body_from_string(result) + return result + + +@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=4, max=60), + retry=retry_if_exception_type( + (RateLimitError, APIConnectionError, APITimeoutError) + ), +) +async def llama_index_embed( + texts: list[str], + embed_model: BaseEmbedding = None, + settings: LlamaIndexSettings = None, + **kwargs, +) -> np.ndarray: + """ + Generate embeddings using LlamaIndex + + Args: + texts: List of texts to embed + embed_model: LlamaIndex embedding model + settings: Optional LlamaIndex settings + **kwargs: Additional arguments + """ + if settings: + configure_llama_index(settings) + + if embed_model is None: + raise ValueError("embed_model must be provided") + + # Use _get_text_embeddings for batch processing + embeddings = embed_model._get_text_embeddings(texts) + return np.array(embeddings) From 203fdf2565768da025aadf35eb3b850ed7663f90 Mon Sep 17 00:00:00 2001 From: Pankaj Kaushal <pankaj@getcalmo.com> Date: Wed, 19 Feb 2025 14:55:44 +0100 Subject: [PATCH 38/65] Remove LlamaIndex implementation from llm directory as per @MdNazishArmanShorthillsAI - Deleted `lightrag/llm/llama_index_impl.py` - Reorganization of the LlamaIndex wrapper location --- lightrag/llm/llama_index_impl.py | 214 ------------------------------- 1 file changed, 214 deletions(-) delete mode 100644 lightrag/llm/llama_index_impl.py diff --git a/lightrag/llm/llama_index_impl.py b/lightrag/llm/llama_index_impl.py deleted file mode 100644 index 7eea441a..00000000 --- a/lightrag/llm/llama_index_impl.py +++ /dev/null @@ -1,214 +0,0 @@ -import pipmaster as pm -from core.logging_config import setup_logger -from llama_index.core.llms import ( - ChatMessage, - MessageRole, - ChatResponse, -) -from typing import List, Optional - -# Install required dependencies -if not pm.is_installed("llama-index"): - pm.install("llama-index") - -from llama_index.core.embeddings import BaseEmbedding -from llama_index.core.settings import Settings as LlamaIndexSettings -from tenacity import ( - retry, - stop_after_attempt, - wait_exponential, - retry_if_exception_type, -) -from lightrag.utils import ( - wrap_embedding_func_with_attrs, - locate_json_string_body_from_string, 
-) -from lightrag.exceptions import ( - APIConnectionError, - RateLimitError, - APITimeoutError, -) -import numpy as np - -logger = setup_logger("lightrag.llm.llama_index_impl") - - -def configure_llama_index(settings: LlamaIndexSettings = None, **kwargs): - """ - Configure LlamaIndex settings. - - Args: - settings: LlamaIndex Settings instance. If None, uses default settings. - **kwargs: Additional settings to override/configure - """ - if settings is None: - settings = LlamaIndexSettings() - - # Update settings with any provided kwargs - for key, value in kwargs.items(): - if hasattr(settings, key): - setattr(settings, key, value) - else: - logger.warning(f"Unknown LlamaIndex setting: {key}") - - # Set as global settings - LlamaIndexSettings.set_global(settings) - return settings - - -def format_chat_messages(messages): - """Format chat messages into LlamaIndex format.""" - formatted_messages = [] - - for msg in messages: - role = msg.get("role", "user") - content = msg.get("content", "") - - if role == "system": - formatted_messages.append( - ChatMessage(role=MessageRole.SYSTEM, content=content) - ) - elif role == "assistant": - formatted_messages.append( - ChatMessage(role=MessageRole.ASSISTANT, content=content) - ) - elif role == "user": - formatted_messages.append( - ChatMessage(role=MessageRole.USER, content=content) - ) - else: - logger.warning(f"Unknown role {role}, treating as user message") - formatted_messages.append( - ChatMessage(role=MessageRole.USER, content=content) - ) - - return formatted_messages - - -@retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=4, max=60), - retry=retry_if_exception_type( - (RateLimitError, APIConnectionError, APITimeoutError) - ), -) -async def llama_index_complete_if_cache( - model: str, - prompt: str, - system_prompt: Optional[str] = None, - history_messages: List[dict] = [], - **kwargs, -) -> str: - """Complete the prompt using LlamaIndex.""" - try: - # Format messages for chat - formatted_messages = [] - - # Add system message if provided - if system_prompt: - formatted_messages.append( - ChatMessage(role=MessageRole.SYSTEM, content=system_prompt) - ) - - # Add history messages - for msg in history_messages: - formatted_messages.append( - ChatMessage( - role=MessageRole.USER - if msg["role"] == "user" - else MessageRole.ASSISTANT, - content=msg["content"], - ) - ) - - # Add current prompt - formatted_messages.append(ChatMessage(role=MessageRole.USER, content=prompt)) - - # Get LLM instance from kwargs - if "llm_instance" not in kwargs: - raise ValueError("llm_instance must be provided in kwargs") - llm = kwargs["llm_instance"] - - # Get response - response: ChatResponse = await llm.achat(messages=formatted_messages) - - # In newer versions, the response is in message.content - content = response.message.content - return content - - except Exception as e: - logger.error(f"Error in llama_index_complete_if_cache: {str(e)}") - raise - - -async def llama_index_complete( - prompt, - system_prompt=None, - history_messages=None, - keyword_extraction=False, - settings: LlamaIndexSettings = None, - **kwargs, -) -> str: - """ - Main completion function for LlamaIndex - - Args: - prompt: Input prompt - system_prompt: Optional system prompt - history_messages: Optional chat history - keyword_extraction: Whether to extract keywords from response - settings: Optional LlamaIndex settings - **kwargs: Additional arguments - """ - if history_messages is None: - history_messages = [] - - keyword_extraction = 
kwargs.pop("keyword_extraction", None) - result = await llama_index_complete_if_cache( - kwargs.get("llm_instance"), - prompt, - system_prompt=system_prompt, - history_messages=history_messages, - **kwargs, - ) - if keyword_extraction: - return locate_json_string_body_from_string(result) - return result - - -@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) -@retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=4, max=60), - retry=retry_if_exception_type( - (RateLimitError, APIConnectionError, APITimeoutError) - ), -) -async def llama_index_embed( - texts: list[str], - embed_model: BaseEmbedding = None, - settings: LlamaIndexSettings = None, - **kwargs, -) -> np.ndarray: - """ - Generate embeddings using LlamaIndex - - Args: - texts: List of texts to embed - embed_model: LlamaIndex embedding model - settings: Optional LlamaIndex settings - **kwargs: Additional arguments - """ - if settings: - configure_llama_index(settings) - - if embed_model is None: - raise ValueError("embed_model must be provided") - - # LlamaIndex's embed_query returns a list of floats - embeddings = [] - for text in texts: - embedding = await embed_model.aembed_query(text) - embeddings.append(embedding) - - return np.array(embeddings) From 04604841c9c905ab6f18a6b426897e9d9665cd3e Mon Sep 17 00:00:00 2001 From: Pankaj Kaushal <pankaj@getcalmo.com> Date: Wed, 19 Feb 2025 14:59:49 +0100 Subject: [PATCH 39/65] Add logger import --- lightrag/wrapper/llama_index_impl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lightrag/wrapper/llama_index_impl.py b/lightrag/wrapper/llama_index_impl.py index f79dade5..4e1618b1 100644 --- a/lightrag/wrapper/llama_index_impl.py +++ b/lightrag/wrapper/llama_index_impl.py @@ -5,6 +5,7 @@ from llama_index.core.llms import ( ChatResponse, ) from typing import List, Optional +from lightrag.utils import logger # Install required dependencies if not pm.is_installed("llama-index"): From 277070e03bc7202f015cc8ec18cd37bd0b4e20a8 Mon Sep 17 00:00:00 2001 From: Pankaj Kaushal <pankaj@getcalmo.com> Date: Wed, 19 Feb 2025 15:01:51 +0100 Subject: [PATCH 40/65] Linting and formatting --- ...g_api_llamaindex_direct_demo_simplified.py | 31 ++++++++++++++----- ..._api_llamaindex_litellm_demo_simplified.py | 31 ++++++++++++++----- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/examples/lightrag_api_llamaindex_direct_demo_simplified.py b/examples/lightrag_api_llamaindex_direct_demo_simplified.py index 50dfec96..a1781842 100644 --- a/examples/lightrag_api_llamaindex_direct_demo_simplified.py +++ b/examples/lightrag_api_llamaindex_direct_demo_simplified.py @@ -1,6 +1,9 @@ import os from lightrag import LightRAG, QueryParam -from lightrag.wrapper.llama_index_impl import llama_index_complete_if_cache, llama_index_embed +from lightrag.wrapper.llama_index_impl import ( + llama_index_complete_if_cache, + llama_index_embed, +) from lightrag.utils import EmbeddingFunc from llama_index.llms.openai import OpenAI from llama_index.embeddings.openai import OpenAIEmbedding @@ -25,20 +28,21 @@ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your-api-key-here") if not os.path.exists(WORKING_DIR): os.mkdir(WORKING_DIR) + # Initialize LLM function async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs): try: # Initialize OpenAI if not in kwargs - if 'llm_instance' not in kwargs: + if "llm_instance" not in kwargs: llm_instance = OpenAI( model=LLM_MODEL, api_key=OPENAI_API_KEY, temperature=0.7, ) - kwargs['llm_instance'] = 
llm_instance + kwargs["llm_instance"] = llm_instance response = await llama_index_complete_if_cache( - kwargs['llm_instance'], + kwargs["llm_instance"], prompt, system_prompt=system_prompt, history_messages=history_messages, @@ -49,6 +53,7 @@ async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwar print(f"LLM request failed: {str(e)}") raise + # Initialize embedding function async def embedding_func(texts): try: @@ -61,6 +66,7 @@ async def embedding_func(texts): print(f"Embedding failed: {str(e)}") raise + # Get embedding dimension async def get_embedding_dim(): test_text = ["This is a test sentence."] @@ -69,6 +75,7 @@ async def get_embedding_dim(): print(f"embedding_dim={embedding_dim}") return embedding_dim + # Initialize RAG instance rag = LightRAG( working_dir=WORKING_DIR, @@ -86,13 +93,21 @@ with open("./book.txt", "r", encoding="utf-8") as f: # Test different query modes print("\nNaive Search:") -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="naive")) +) print("\nLocal Search:") -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="local")) +) print("\nGlobal Search:") -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="global")) +) print("\nHybrid Search:") -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) \ No newline at end of file +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid")) +) diff --git a/examples/lightrag_api_llamaindex_litellm_demo_simplified.py b/examples/lightrag_api_llamaindex_litellm_demo_simplified.py index 11bdeba8..a1ab90db 100644 --- a/examples/lightrag_api_llamaindex_litellm_demo_simplified.py +++ b/examples/lightrag_api_llamaindex_litellm_demo_simplified.py @@ -1,6 +1,9 @@ import os from lightrag import LightRAG, QueryParam -from lightrag.wrapper.llama_index_impl import llama_index_complete_if_cache, llama_index_embed +from lightrag.wrapper.llama_index_impl import ( + llama_index_complete_if_cache, + llama_index_embed, +) from lightrag.utils import EmbeddingFunc from llama_index.llms.litellm import LiteLLM from llama_index.embeddings.litellm import LiteLLMEmbedding @@ -27,21 +30,22 @@ LITELLM_KEY = os.environ.get("LITELLM_KEY", "sk-1234") if not os.path.exists(WORKING_DIR): os.mkdir(WORKING_DIR) + # Initialize LLM function async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs): try: # Initialize LiteLLM if not in kwargs - if 'llm_instance' not in kwargs: + if "llm_instance" not in kwargs: llm_instance = LiteLLM( model=f"openai/{LLM_MODEL}", # Format: "provider/model_name" api_base=LITELLM_URL, api_key=LITELLM_KEY, temperature=0.7, ) - kwargs['llm_instance'] = llm_instance + kwargs["llm_instance"] = llm_instance response = await llama_index_complete_if_cache( - kwargs['llm_instance'], + kwargs["llm_instance"], prompt, system_prompt=system_prompt, history_messages=history_messages, @@ -52,6 +56,7 @@ async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwar print(f"LLM request failed: {str(e)}") raise + # Initialize embedding function async def embedding_func(texts): try: @@ -65,6 +70,7 @@ async def embedding_func(texts): 
print(f"Embedding failed: {str(e)}") raise + # Get embedding dimension async def get_embedding_dim(): test_text = ["This is a test sentence."] @@ -73,6 +79,7 @@ async def get_embedding_dim(): print(f"embedding_dim={embedding_dim}") return embedding_dim + # Initialize RAG instance rag = LightRAG( working_dir=WORKING_DIR, @@ -90,13 +97,21 @@ with open("./book.txt", "r", encoding="utf-8") as f: # Test different query modes print("\nNaive Search:") -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="naive")) +) print("\nLocal Search:") -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="local")) +) print("\nGlobal Search:") -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="global")) +) print("\nHybrid Search:") -print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) \ No newline at end of file +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid")) +) From 173a806b9a8f7a75b50bf3f9321fb1d71d6863ae Mon Sep 17 00:00:00 2001 From: Pankaj Kaushal <pankaj@getcalmo.com> Date: Thu, 20 Feb 2025 10:22:26 +0100 Subject: [PATCH 41/65] Moved back to llm dir as per https://github.com/HKUDS/LightRAG/pull/864#issuecomment-2669705946 - Created two new example scripts demonstrating LightRAG integration with LlamaIndex: - `lightrag_llamaindex_direct_demo.py`: Direct OpenAI integration - `lightrag_llamaindex_litellm_demo.py`: LiteLLM proxy integration - Both examples showcase different search modes (naive, local, global, hybrid) - Includes configuration for working directory, models, and API settings - Demonstrates text insertion and querying using LightRAG with LlamaIndex - removed wrapper directory and references to it --- ..._simplified.py => lightrag_llamaindex_direct_demo.py} | 8 ++++---- ...simplified.py => lightrag_llamaindex_litellm_demo.py} | 9 ++++----- lightrag/{wrapper => llm}/Readme.md | 0 lightrag/{wrapper => llm}/llama_index_impl.py | 0 lightrag/wrapper/__init__.py | 0 5 files changed, 8 insertions(+), 9 deletions(-) rename examples/{lightrag_api_llamaindex_direct_demo_simplified.py => lightrag_llamaindex_direct_demo.py} (95%) rename examples/{lightrag_api_llamaindex_litellm_demo_simplified.py => lightrag_llamaindex_litellm_demo.py} (92%) rename lightrag/{wrapper => llm}/Readme.md (100%) rename lightrag/{wrapper => llm}/llama_index_impl.py (100%) delete mode 100644 lightrag/wrapper/__init__.py diff --git a/examples/lightrag_api_llamaindex_direct_demo_simplified.py b/examples/lightrag_llamaindex_direct_demo.py similarity index 95% rename from examples/lightrag_api_llamaindex_direct_demo_simplified.py rename to examples/lightrag_llamaindex_direct_demo.py index a1781842..5db158ce 100644 --- a/examples/lightrag_api_llamaindex_direct_demo_simplified.py +++ b/examples/lightrag_llamaindex_direct_demo.py @@ -1,6 +1,6 @@ import os from lightrag import LightRAG, QueryParam -from lightrag.wrapper.llama_index_impl import ( +from lightrag.llm.llama_index_impl import ( llama_index_complete_if_cache, llama_index_embed, ) @@ -10,14 +10,13 @@ from llama_index.embeddings.openai import OpenAIEmbedding import asyncio # Configure working directory -DEFAULT_RAG_DIR = 
"index_default" -WORKING_DIR = os.environ.get("RAG_DIR", f"{DEFAULT_RAG_DIR}") +WORKING_DIR = "./index_default" print(f"WORKING_DIR: {WORKING_DIR}") # Model configuration LLM_MODEL = os.environ.get("LLM_MODEL", "gpt-4") print(f"LLM_MODEL: {LLM_MODEL}") -EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-small") +EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-large") print(f"EMBEDDING_MODEL: {EMBEDDING_MODEL}") EMBEDDING_MAX_TOKEN_SIZE = int(os.environ.get("EMBEDDING_MAX_TOKEN_SIZE", 8192)) print(f"EMBEDDING_MAX_TOKEN_SIZE: {EMBEDDING_MAX_TOKEN_SIZE}") @@ -26,6 +25,7 @@ print(f"EMBEDDING_MAX_TOKEN_SIZE: {EMBEDDING_MAX_TOKEN_SIZE}") OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your-api-key-here") if not os.path.exists(WORKING_DIR): + print(f"Creating working directory: {WORKING_DIR}") os.mkdir(WORKING_DIR) diff --git a/examples/lightrag_api_llamaindex_litellm_demo_simplified.py b/examples/lightrag_llamaindex_litellm_demo.py similarity index 92% rename from examples/lightrag_api_llamaindex_litellm_demo_simplified.py rename to examples/lightrag_llamaindex_litellm_demo.py index a1ab90db..3511ecf3 100644 --- a/examples/lightrag_api_llamaindex_litellm_demo_simplified.py +++ b/examples/lightrag_llamaindex_litellm_demo.py @@ -1,6 +1,6 @@ import os from lightrag import LightRAG, QueryParam -from lightrag.wrapper.llama_index_impl import ( +from lightrag.llm.llama_index_impl import ( llama_index_complete_if_cache, llama_index_embed, ) @@ -10,14 +10,13 @@ from llama_index.embeddings.litellm import LiteLLMEmbedding import asyncio # Configure working directory -DEFAULT_RAG_DIR = "index_default" -WORKING_DIR = os.environ.get("RAG_DIR", f"{DEFAULT_RAG_DIR}") +WORKING_DIR = "./index_default" print(f"WORKING_DIR: {WORKING_DIR}") # Model configuration -LLM_MODEL = os.environ.get("LLM_MODEL", "gpt-4o") +LLM_MODEL = os.environ.get("LLM_MODEL", "gpt-4") print(f"LLM_MODEL: {LLM_MODEL}") -EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "embedding-model") +EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-large") print(f"EMBEDDING_MODEL: {EMBEDDING_MODEL}") EMBEDDING_MAX_TOKEN_SIZE = int(os.environ.get("EMBEDDING_MAX_TOKEN_SIZE", 8192)) print(f"EMBEDDING_MAX_TOKEN_SIZE: {EMBEDDING_MAX_TOKEN_SIZE}") diff --git a/lightrag/wrapper/Readme.md b/lightrag/llm/Readme.md similarity index 100% rename from lightrag/wrapper/Readme.md rename to lightrag/llm/Readme.md diff --git a/lightrag/wrapper/llama_index_impl.py b/lightrag/llm/llama_index_impl.py similarity index 100% rename from lightrag/wrapper/llama_index_impl.py rename to lightrag/llm/llama_index_impl.py diff --git a/lightrag/wrapper/__init__.py b/lightrag/wrapper/__init__.py deleted file mode 100644 index e69de29b..00000000 From 9934241a1e44a3e071139f793e9ee4e689d032a3 Mon Sep 17 00:00:00 2001 From: Pankaj Kaushal <pankaj@getcalmo.com> Date: Thu, 20 Feb 2025 10:31:19 +0100 Subject: [PATCH 42/65] Update README.md: Refactor LlamaIndex section and example code - Simplified LlamaIndex documentation in README - Removed wrapper directory references - Updated example code to reflect new directory structure - Cleaned up custom knowledge graph example - Adjusted file paths and import statements --- README.md | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 432261f7..9a518d8d 100644 --- a/README.md +++ b/README.md @@ -313,30 +313,26 @@ In order to run this experiment on low RAM GPU you should select small model and </details> 
<details> -<summary> <b>Wrappers</b> </summary> +<summary> <b>LlamaIndex</b> </summary> -LightRAG supports integration with various frameworks and model providers through wrappers. These wrappers provide a consistent interface while abstracting away the specifics of each framework. +LightRAG supports integration with LlamaIndex. -### Current Wrappers - -1. **LlamaIndex** (`wrapper/llama_index_impl.py`): +1. **LlamaIndex** (`llm/llama_index_impl.py`): - Integrates with OpenAI and other providers through LlamaIndex - - Supports both direct API access and proxy services like LiteLLM - - Provides consistent interfaces for embeddings and completions - - See [LlamaIndex Wrapper Documentation](lightrag/wrapper/Readme.md) for detailed setup and examples + - See [LlamaIndex Documentation](lightrag/llm/Readme.md) for detailed setup and examples ### Example Usage ```python # Using LlamaIndex with direct OpenAI access from lightrag import LightRAG -from lightrag.wrapper.llama_index_impl import llama_index_complete_if_cache, llama_index_embed +from lightrag.llm.llama_index_impl import llama_index_complete_if_cache, llama_index_embed from llama_index.embeddings.openai import OpenAIEmbedding from llama_index.llms.openai import OpenAI rag = LightRAG( working_dir="your/path", - llm_model_func=llm_model_func, # LlamaIndex-compatible completion function + llm_model_func=llama_index_complete_if_cache, # LlamaIndex-compatible completion function embedding_func=EmbeddingFunc( # LlamaIndex-compatible embedding function embedding_dim=1536, max_token_size=8192, @@ -346,9 +342,9 @@ rag = LightRAG( ``` #### For detailed documentation and examples, see: -- [LlamaIndex Wrapper Documentation](lightrag/wrapper/Readme.md) -- [Direct OpenAI Example](examples/lightrag_api_llamaindex_direct_demo_simplified.py) -- [LiteLLM Proxy Example](examples/lightrag_api_llamaindex_litellm_demo_simplified.py) +- [LlamaIndex Documentation](lightrag/llm/Readme.md) +- [Direct OpenAI Example](examples/lightrag_llamaindex_direct_demo.py) +- [LiteLLM Proxy Example](examples/lightrag_llamaindex_litellm_demo.py) </details> <details> @@ -499,22 +495,14 @@ custom_kg = { { "content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.", "source_id": "Source1", - "chunk_order_index": 0, - }, - { - "content": "One outstanding feature of ProductX is its advanced AI capabilities.", - "source_id": "Source1", - "chunk_order_index": 1, }, { "content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.", "source_id": "Source2", - "chunk_order_index": 0, }, { "content": "None", "source_id": "UNKNOWN", - "chunk_order_index": 0, }, ], } From 6f09bfc970c784ae88b9d8b8dba275213150cfb7 Mon Sep 17 00:00:00 2001 From: Pankaj Kaushal <pankaj@getcalmo.com> Date: Thu, 20 Feb 2025 10:33:15 +0100 Subject: [PATCH 43/65] Update LlamaIndex README: improve documentation and example paths - Updated file paths for LlamaIndex examples - Simplified README structure - Corrected import statements to reflect new directory layout - Removed outdated wrapper directory references --- lightrag/llm/Readme.md | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/lightrag/llm/Readme.md b/lightrag/llm/Readme.md index ece56458..969d70e3 100644 --- a/lightrag/llm/Readme.md +++ b/lightrag/llm/Readme.md @@ -1,27 +1,11 @@ -## Wrapper Directory -The `wrapper` directory contains integrations with different frameworks. 
These wrappers provide a consistent interface to LightRAG while abstracting away the specifics of each framework. - -## Wrapper Directory Structure - -``` -lightrag/ -├── wrapper/ # Wrappers for different model providers and frameworks -│ ├── llama_index_impl.py # LlamaIndex integration for embeddings and completions -│ └── ... # Other framework wrappers -├── kg/ # Knowledge graph implementations -├── utils/ # Utility functions and helpers -└── ... -``` -Current wrappers: - -1. **LlamaIndex** (`wrapper/llama_index.py`): +1. **LlamaIndex** (`llm/llama_index_impl.py`): - Provides integration with OpenAI and other providers through LlamaIndex - Supports both direct API access and proxy services like LiteLLM - Handles embeddings and completions with consistent interfaces - See example implementations: - - [Direct OpenAI Usage](../examples/lightrag_api_llamaindex_direct_demo_simplified.py) - - [LiteLLM Proxy Usage](../examples/lightrag_api_llamaindex_litellm_demo_simplified.py) + - [Direct OpenAI Usage](../../examples/lightrag_llamaindex_direct_demo.py) + - [LiteLLM Proxy Usage](../../examples/lightrag_llamaindex_litellm_demo.py) <details> <summary> <b>Using LlamaIndex</b> </summary> @@ -39,7 +23,7 @@ pip install llama-index-llms-litellm llama-index-embeddings-litellm ```python from lightrag import LightRAG -from lightrag.wrapper.llama_index_impl import llama_index_complete_if_cache, llama_index_embed +from lightrag.llm.llama_index_impl import llama_index_complete_if_cache, llama_index_embed from llama_index.embeddings.openai import OpenAIEmbedding from llama_index.llms.openai import OpenAI from lightrag.utils import EmbeddingFunc @@ -94,7 +78,7 @@ rag = LightRAG( ```python from lightrag import LightRAG -from lightrag.wrapper.llama_index_impl import llama_index_complete_if_cache, llama_index_embed +from lightrag.llm.llama_index_impl import llama_index_complete_if_cache, llama_index_embed from llama_index.llms.litellm import LiteLLM from llama_index.embeddings.litellm import LiteLLMEmbedding from lightrag.utils import EmbeddingFunc From 9f2c659d9cba73214e178c7b8910b3e510ea760b Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 12:54:14 +0100 Subject: [PATCH 44/65] remove unused log --- lightrag/kg/oracle_impl.py | 2 +- lightrag/kg/postgres_impl.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index de61a2ca..35983ad3 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -44,7 +44,7 @@ class OracleDB: self.increment = 1 logger.info(f"Using the label {self.workspace} for Oracle Graph as identifier") if self.user is None or self.password is None: - raise ValueError("Missing database user or password in addon_params") + raise ValueError("Missing database user or password") try: oracledb.defaults.fetch_lobs = False diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index ababc05f..52370821 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -55,7 +55,7 @@ class PostgreSQLDB: if self.user is None or self.password is None or self.database is None: raise ValueError( - "Missing database user, password, or database in addon_params" + "Missing database user, password, or database" ) async def initdb(self): From de56aeb7c5fcb60ea2c391a54f7e1d4ed0559178 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 12:54:52 +0100 Subject: [PATCH 45/65] removed lock --- lightrag/lightrag.py 
| 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index a4daeced..a34ae20d 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -7,7 +7,7 @@ from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial from typing import Any, AsyncIterator, Callable, Iterator, cast -from asyncio import Lock + from .base import ( BaseGraphStorage, BaseKVStorage, @@ -358,9 +358,6 @@ class LightRAG: convert_response_to_json ) - # Lock for entity extraction - _entity_lock = Lock() - # Custom Chunking Function chunking_func: Callable[ [ @@ -1203,7 +1200,6 @@ class LightRAG: # --------------------- # STEP 1: Keyword Extraction # --------------------- - # We'll assume 'extract_keywords_only(...)' returns (hl_keywords, ll_keywords). hl_keywords, ll_keywords = await extract_keywords_only( text=query, param=param, From bae21a6fadbc6093bb4094ec6e151fff9592d721 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 12:57:25 +0100 Subject: [PATCH 46/65] added max parallel insert --- lightrag/lightrag.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index a34ae20d..22c32770 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -347,6 +347,9 @@ class LightRAG: """If True, enables caching for entity extraction steps to reduce LLM costs.""" # Extensions + max_parallel_insert: int = field(default_factory=lambda: int(os.getenv("MAX_PARALLEL_INSERT", 20))) + """Maximum number of parallel insert operations.""" + addon_params: dict[str, Any] = field(default_factory=dict) # Storages Management @@ -786,10 +789,9 @@ class LightRAG: return # 2. split docs into chunks, insert chunks, update doc status - batch_size = self.addon_params.get("insert_batch_size", 10) docs_batches = [ - list(to_process_docs.items())[i : i + batch_size] - for i in range(0, len(to_process_docs), batch_size) + list(to_process_docs.items())[i : i + self.max_parallel_insert] + for i in range(0, len(to_process_docs), self.max_parallel_insert) ] logger.info(f"Number of batches to process: {len(docs_batches)}.") From 37addb7c01682d1e273c74532bbf9ceada0020f2 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:05:35 +0100 Subject: [PATCH 47/65] added final --- lightrag/lightrag.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 22c32770..f2d48444 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -6,7 +6,7 @@ import configparser from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Any, AsyncIterator, Callable, Iterator, cast +from typing import Any, AsyncIterator, Callable, Iterator, cast, final from .base import ( BaseGraphStorage, BaseKVStorage, @@ -225,7 +225,7 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop: asyncio.set_event_loop(new_loop) return new_loop - +@final @dataclass class LightRAG: """LightRAG: Simple and Fast Retrieval-Augmented Generation.""" From 2370a4336b0e16387db942a6e4f03c29d32ee5e7 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:05:59 +0100 Subject: [PATCH 48/65] added field --- lightrag/lightrag.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 
f2d48444..28d5d078 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -231,12 +231,12 @@ class LightRAG: """LightRAG: Simple and Fast Retrieval-Augmented Generation.""" working_dir: str = field( - default_factory=lambda: f"./lightrag_cache_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}" + default=f"./lightrag_cache_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}" ) """Directory where cache and temporary files are stored.""" embedding_cache_config: dict[str, Any] = field( - default_factory=lambda: { + default={ "enabled": False, "similarity_threshold": 0.95, "use_llm_check": False, @@ -261,32 +261,31 @@ class LightRAG: """Storage type for tracking document processing statuses.""" # Logging - current_log_level = logger.level - log_level: int = field(default=current_log_level) + log_level: int = field(default=logger.level) """Logging level for the system (e.g., 'DEBUG', 'INFO', 'WARNING').""" log_dir: str = field(default=os.getcwd()) """Directory where logs are stored. Defaults to the current working directory.""" # Text chunking - chunk_token_size: int = int(os.getenv("CHUNK_SIZE", "1200")) + chunk_token_size: int = field(default=int(os.getenv("CHUNK_SIZE", 1200))) """Maximum number of tokens per text chunk when splitting documents.""" - chunk_overlap_token_size: int = int(os.getenv("CHUNK_OVERLAP_SIZE", "100")) + chunk_overlap_token_size: int = field(default=int(os.getenv("CHUNK_OVERLAP_SIZE", 100))) """Number of overlapping tokens between consecutive text chunks to preserve context.""" - tiktoken_model_name: str = "gpt-4o-mini" + tiktoken_model_name: str = field(default="gpt-4o-mini") """Model name used for tokenization when chunking text.""" # Entity extraction - entity_extract_max_gleaning: int = 1 + entity_extract_max_gleaning: int = field(default=1) """Maximum number of entity extraction attempts for ambiguous content.""" - entity_summary_to_max_tokens: int = int(os.getenv("MAX_TOKEN_SUMMARY", "500")) + entity_summary_to_max_tokens: int = field(default=int(os.getenv("MAX_TOKEN_SUMMARY", 500))) """Maximum number of tokens used for summarizing extracted entities.""" # Node embedding - node_embedding_algorithm: str = "node2vec" + node_embedding_algorithm: str = field(default="node2vec") """Algorithm used for node embedding in knowledge graphs.""" node2vec_params: dict[str, int] = field( From f5a93c7bb5d78f5b6a0c94e96ead46c7cb3bd147 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:06:16 +0100 Subject: [PATCH 49/65] added fields --- lightrag/lightrag.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 28d5d078..5b01c18b 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -307,26 +307,26 @@ class LightRAG: - random_seed: Seed value for reproducibility. """ - embedding_func: EmbeddingFunc | None = None + embedding_func: EmbeddingFunc | None = field(default=None) """Function for computing text embeddings. Must be set before use.""" - embedding_batch_num: int = 32 + embedding_batch_num: int = field(default=32) """Batch size for embedding computations.""" - embedding_func_max_async: int = 16 + embedding_func_max_async: int = field(default=16) """Maximum number of concurrent embedding function calls.""" # LLM Configuration - llm_model_func: Callable[..., object] | None = None + llm_model_func: Callable[..., object] | None = field(default=None) """Function for interacting with the large language model (LLM). 
Must be set before use.""" - llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct" + llm_model_name: str = field(default="gpt-4o-mini") """Name of the LLM model used for generating responses.""" - llm_model_max_token_size: int = int(os.getenv("MAX_TOKENS", "32768")) + llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32768))) """Maximum number of tokens allowed per LLM response.""" - llm_model_max_async: int = int(os.getenv("MAX_ASYNC", "16")) + llm_model_max_async: int = field(default=int(os.getenv("MAX_ASYNC", 16))) """Maximum number of concurrent LLM calls.""" llm_model_kwargs: dict[str, Any] = field(default_factory=dict) From 4b478d1c0ff521cd25d607f3ff1a5c3a66e238d3 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:06:34 +0100 Subject: [PATCH 50/65] added fields --- lightrag/lightrag.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 5b01c18b..5706e189 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -339,20 +339,20 @@ class LightRAG: namespace_prefix: str = field(default="") """Prefix for namespacing stored data across different environments.""" - enable_llm_cache: bool = True + enable_llm_cache: bool = field(default=True) """Enables caching for LLM responses to avoid redundant computations.""" - enable_llm_cache_for_entity_extract: bool = True + enable_llm_cache_for_entity_extract: bool = field(default=True) """If True, enables caching for entity extraction steps to reduce LLM costs.""" # Extensions - max_parallel_insert: int = field(default_factory=lambda: int(os.getenv("MAX_PARALLEL_INSERT", 20))) + max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 20))) """Maximum number of parallel insert operations.""" addon_params: dict[str, Any] = field(default_factory=dict) # Storages Management - auto_manage_storages_states: bool = True + auto_manage_storages_states: bool = field(default=True) """If True, lightrag will automatically calls initialize_storages and finalize_storages at the appropriate times.""" """Dictionary for additional parameters and extensions.""" From 32d0f1acb04c9499024b7d953957736cef0c850c Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:09:33 +0100 Subject: [PATCH 51/65] added docs and fields --- lightrag/kg/postgres_impl.py | 4 +--- lightrag/lightrag.py | 44 ++++++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 52370821..d7ace41a 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -54,9 +54,7 @@ class PostgreSQLDB: self.pool: Pool | None = None if self.user is None or self.password is None or self.database is None: - raise ValueError( - "Missing database user, password, or database" - ) + raise ValueError("Missing database user, password, or database") async def initdb(self): try: diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 5706e189..247e09ec 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -225,6 +225,7 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop: asyncio.set_event_loop(new_loop) return new_loop + @final @dataclass class LightRAG: @@ -271,7 +272,9 @@ class LightRAG: chunk_token_size: int = field(default=int(os.getenv("CHUNK_SIZE", 1200))) """Maximum number of tokens per text chunk when splitting documents.""" - chunk_overlap_token_size: 
int = field(default=int(os.getenv("CHUNK_OVERLAP_SIZE", 100))) + chunk_overlap_token_size: int = field( + default=int(os.getenv("CHUNK_OVERLAP_SIZE", 100)) + ) """Number of overlapping tokens between consecutive text chunks to preserve context.""" tiktoken_model_name: str = field(default="gpt-4o-mini") @@ -281,11 +284,13 @@ class LightRAG: entity_extract_max_gleaning: int = field(default=1) """Maximum number of entity extraction attempts for ambiguous content.""" - entity_summary_to_max_tokens: int = field(default=int(os.getenv("MAX_TOKEN_SUMMARY", 500))) + entity_summary_to_max_tokens: int = field( + default=int(os.getenv("MAX_TOKEN_SUMMARY", 500)) + ) """Maximum number of tokens used for summarizing extracted entities.""" # Node embedding - node_embedding_algorithm: str = field(default="node2vec") + node_embedding_algorithm: str = field(default="node2vec") """Algorithm used for node embedding in knowledge graphs.""" node2vec_params: dict[str, int] = field( @@ -348,19 +353,22 @@ class LightRAG: # Extensions max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 20))) """Maximum number of parallel insert operations.""" - + addon_params: dict[str, Any] = field(default_factory=dict) # Storages Management auto_manage_storages_states: bool = field(default=True) """If True, lightrag will automatically calls initialize_storages and finalize_storages at the appropriate times.""" - """Dictionary for additional parameters and extensions.""" - convert_response_to_json_func: Callable[[str], dict[str, Any]] = ( - convert_response_to_json + convert_response_to_json_func: Callable[[str], dict[str, Any]] = field( + default_factory=lambda: convert_response_to_json ) + """ + Custom function for converting LLM responses to JSON format. + + The default function is :func:`.utils.convert_response_to_json`. + """ - # Custom Chunking Function chunking_func: Callable[ [ str, @@ -371,7 +379,25 @@ class LightRAG: str, ], list[dict[str, Any]], - ] = chunking_by_token_size + ] = field(default_factory=lambda: chunking_by_token_size) + """ + Custom chunking function for splitting text into chunks before processing. + + The function should take the following parameters: + + - `content`: The text to be split into chunks. + - `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens. + - `split_by_character_only`: If True, the text is split only on the specified character. + - `chunk_token_size`: The maximum number of tokens per chunk. + - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks. + - `tiktoken_model_name`: The name of the tiktoken model to use for tokenization. + + The function should return a list of dictionaries, where each dictionary contains the following keys: + - `tokens`: The number of tokens in the chunk. + - `content`: The text content of the chunk. + + Defaults to `chunking_by_token_size` if not specified. 
+ """ def verify_storage_implementation( self, storage_type: str, storage_name: str From 72b978d6d5dec60f18431ddf7f2488b6908a2d32 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:13:38 +0100 Subject: [PATCH 52/65] cleanup --- lightrag/lightrag.py | 227 ++++++++++++++++++++++++------------------- 1 file changed, 128 insertions(+), 99 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 247e09ec..481025af 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -231,23 +231,16 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop: class LightRAG: """LightRAG: Simple and Fast Retrieval-Augmented Generation.""" + # Directory + # --- + working_dir: str = field( default=f"./lightrag_cache_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}" ) """Directory where cache and temporary files are stored.""" - embedding_cache_config: dict[str, Any] = field( - default={ - "enabled": False, - "similarity_threshold": 0.95, - "use_llm_check": False, - } - ) - """Configuration for embedding cache. - - enabled: If True, enables caching to avoid redundant computations. - - similarity_threshold: Minimum similarity score to use cached embeddings. - - use_llm_check: If True, validates cached embeddings using an LLM. - """ + # Storage + # --- kv_storage: str = field(default="JsonKVStorage") """Storage backend for key-value data.""" @@ -262,13 +255,27 @@ class LightRAG: """Storage type for tracking document processing statuses.""" # Logging + # --- + log_level: int = field(default=logger.level) """Logging level for the system (e.g., 'DEBUG', 'INFO', 'WARNING').""" log_dir: str = field(default=os.getcwd()) """Directory where logs are stored. Defaults to the current working directory.""" + # Entity extraction + # --- + + entity_extract_max_gleaning: int = field(default=1) + """Maximum number of entity extraction attempts for ambiguous content.""" + + entity_summary_to_max_tokens: int = field( + default=int(os.getenv("MAX_TOKEN_SUMMARY", 500)) + ) + # Text chunking + # --- + chunk_token_size: int = field(default=int(os.getenv("CHUNK_SIZE", 1200))) """Maximum number of tokens per text chunk when splitting documents.""" @@ -280,95 +287,8 @@ class LightRAG: tiktoken_model_name: str = field(default="gpt-4o-mini") """Model name used for tokenization when chunking text.""" - # Entity extraction - entity_extract_max_gleaning: int = field(default=1) - """Maximum number of entity extraction attempts for ambiguous content.""" - - entity_summary_to_max_tokens: int = field( - default=int(os.getenv("MAX_TOKEN_SUMMARY", 500)) - ) """Maximum number of tokens used for summarizing extracted entities.""" - # Node embedding - node_embedding_algorithm: str = field(default="node2vec") - """Algorithm used for node embedding in knowledge graphs.""" - - node2vec_params: dict[str, int] = field( - default_factory=lambda: { - "dimensions": 1536, - "num_walks": 10, - "walk_length": 40, - "window_size": 2, - "iterations": 3, - "random_seed": 3, - } - ) - """Configuration for the node2vec embedding algorithm: - - dimensions: Number of dimensions for embeddings. - - num_walks: Number of random walks per node. - - walk_length: Number of steps per random walk. - - window_size: Context window size for training. - - iterations: Number of iterations for training. - - random_seed: Seed value for reproducibility. - """ - - embedding_func: EmbeddingFunc | None = field(default=None) - """Function for computing text embeddings. 
Must be set before use.""" - - embedding_batch_num: int = field(default=32) - """Batch size for embedding computations.""" - - embedding_func_max_async: int = field(default=16) - """Maximum number of concurrent embedding function calls.""" - - # LLM Configuration - llm_model_func: Callable[..., object] | None = field(default=None) - """Function for interacting with the large language model (LLM). Must be set before use.""" - - llm_model_name: str = field(default="gpt-4o-mini") - """Name of the LLM model used for generating responses.""" - - llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32768))) - """Maximum number of tokens allowed per LLM response.""" - - llm_model_max_async: int = field(default=int(os.getenv("MAX_ASYNC", 16))) - """Maximum number of concurrent LLM calls.""" - - llm_model_kwargs: dict[str, Any] = field(default_factory=dict) - """Additional keyword arguments passed to the LLM model function.""" - - # Storage - vector_db_storage_cls_kwargs: dict[str, Any] = field(default_factory=dict) - """Additional parameters for vector database storage.""" - - namespace_prefix: str = field(default="") - """Prefix for namespacing stored data across different environments.""" - - enable_llm_cache: bool = field(default=True) - """Enables caching for LLM responses to avoid redundant computations.""" - - enable_llm_cache_for_entity_extract: bool = field(default=True) - """If True, enables caching for entity extraction steps to reduce LLM costs.""" - - # Extensions - max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 20))) - """Maximum number of parallel insert operations.""" - - addon_params: dict[str, Any] = field(default_factory=dict) - - # Storages Management - auto_manage_storages_states: bool = field(default=True) - """If True, lightrag will automatically calls initialize_storages and finalize_storages at the appropriate times.""" - - convert_response_to_json_func: Callable[[str], dict[str, Any]] = field( - default_factory=lambda: convert_response_to_json - ) - """ - Custom function for converting LLM responses to JSON format. - - The default function is :func:`.utils.convert_response_to_json`. - """ - chunking_func: Callable[ [ str, @@ -399,6 +319,115 @@ class LightRAG: Defaults to `chunking_by_token_size` if not specified. """ + # Node embedding + # --- + + node_embedding_algorithm: str = field(default="node2vec") + """Algorithm used for node embedding in knowledge graphs.""" + + node2vec_params: dict[str, int] = field( + default_factory=lambda: { + "dimensions": 1536, + "num_walks": 10, + "walk_length": 40, + "window_size": 2, + "iterations": 3, + "random_seed": 3, + } + ) + """Configuration for the node2vec embedding algorithm: + - dimensions: Number of dimensions for embeddings. + - num_walks: Number of random walks per node. + - walk_length: Number of steps per random walk. + - window_size: Context window size for training. + - iterations: Number of iterations for training. + - random_seed: Seed value for reproducibility. + """ + + # Embedding + # --- + + embedding_func: EmbeddingFunc | None = field(default=None) + """Function for computing text embeddings. 
Must be set before use.""" + + embedding_batch_num: int = field(default=32) + """Batch size for embedding computations.""" + + embedding_func_max_async: int = field(default=16) + """Maximum number of concurrent embedding function calls.""" + + embedding_cache_config: dict[str, Any] = field( + default={ + "enabled": False, + "similarity_threshold": 0.95, + "use_llm_check": False, + } + ) + """Configuration for embedding cache. + - enabled: If True, enables caching to avoid redundant computations. + - similarity_threshold: Minimum similarity score to use cached embeddings. + - use_llm_check: If True, validates cached embeddings using an LLM. + """ + + # LLM Configuration + # --- + + llm_model_func: Callable[..., object] | None = field(default=None) + """Function for interacting with the large language model (LLM). Must be set before use.""" + + llm_model_name: str = field(default="gpt-4o-mini") + """Name of the LLM model used for generating responses.""" + + llm_model_max_token_size: int = field(default=int(os.getenv("MAX_TOKENS", 32768))) + """Maximum number of tokens allowed per LLM response.""" + + llm_model_max_async: int = field(default=int(os.getenv("MAX_ASYNC", 16))) + """Maximum number of concurrent LLM calls.""" + + llm_model_kwargs: dict[str, Any] = field(default_factory=dict) + """Additional keyword arguments passed to the LLM model function.""" + + # Storage + # --- + + vector_db_storage_cls_kwargs: dict[str, Any] = field(default_factory=dict) + """Additional parameters for vector database storage.""" + + namespace_prefix: str = field(default="") + """Prefix for namespacing stored data across different environments.""" + + enable_llm_cache: bool = field(default=True) + """Enables caching for LLM responses to avoid redundant computations.""" + + enable_llm_cache_for_entity_extract: bool = field(default=True) + """If True, enables caching for entity extraction steps to reduce LLM costs.""" + + # Extensions + # --- + + max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 20))) + """Maximum number of parallel insert operations.""" + + addon_params: dict[str, Any] = field(default_factory=dict) + + # Storages Management + # --- + + auto_manage_storages_states: bool = field(default=True) + """If True, lightrag will automatically calls initialize_storages and finalize_storages at the appropriate times.""" + + # Storages Management + # --- + + convert_response_to_json_func: Callable[[str], dict[str, Any]] = field( + default_factory=lambda: convert_response_to_json + ) + """ + Custom function for converting LLM responses to JSON format. + + The default function is :func:`.utils.convert_response_to_json`. + """ + def verify_storage_implementation( self, storage_type: str, storage_name: str ) -> None: From 32e489865c0378c61fe240151b3b1572f3b7e1ae Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:18:17 +0100 Subject: [PATCH 53/65] cleanup code --- docker-compose.yml | 2 - examples/lightrag_api_oracle_demo.py | 1 - .../lightrag_openai_compatible_stream_demo.py | 7 - examples/lightrag_tidb_demo.py | 1 - lightrag/lightrag.py | 121 ++++++------------ lightrag/utils.py | 44 +++++++ reproduce/Step_3.py | 10 +- reproduce/Step_3_openai_compatible.py | 11 +- 8 files changed, 89 insertions(+), 108 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index b5659692..4ced24ca 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: lightrag: build: . 
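The docker-compose.yml hunk above drops the top-level `version` key. Under the Compose Specification (Docker Compose v2) this key is obsolete and ignored, and recent releases warn when it is present, so removing it has no runtime effect. A minimal sketch of a trimmed compose file under that assumption follows; the service settings shown are illustrative placeholders, not the repository's actual configuration:

```yaml
# Compose Specification format: no top-level `version` key is needed.
# The service below is an illustrative placeholder, not LightRAG's
# actual configuration.
services:
  lightrag:
    build: .              # build the image from the local Dockerfile
    ports:
      - "9621:9621"       # hypothetical host:container port mapping
    env_file:
      - .env              # hypothetical env file (API keys, storage settings)
    restart: unless-stopped
```

Validating with `docker compose config` after such a change is a quick way to confirm the file still parses cleanly without the `version` key.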
diff --git a/examples/lightrag_api_oracle_demo.py b/examples/lightrag_api_oracle_demo.py index e66e3f94..3675795e 100644 --- a/examples/lightrag_api_oracle_demo.py +++ b/examples/lightrag_api_oracle_demo.py @@ -98,7 +98,6 @@ async def init(): # Initialize LightRAG # We use Oracle DB as the KV/vector/graph storage - # You can add `addon_params={"example_number": 1, "language": "Simplfied Chinese"}` to control the prompt rag = LightRAG( enable_llm_cache=False, working_dir=WORKING_DIR, diff --git a/examples/lightrag_openai_compatible_stream_demo.py b/examples/lightrag_openai_compatible_stream_demo.py index 93c4297c..7509e4dc 100644 --- a/examples/lightrag_openai_compatible_stream_demo.py +++ b/examples/lightrag_openai_compatible_stream_demo.py @@ -1,9 +1,7 @@ import os -import inspect from lightrag import LightRAG from lightrag.llm import openai_complete, openai_embed from lightrag.utils import EmbeddingFunc -from lightrag.lightrag import always_get_an_event_loop from lightrag import QueryParam # WorkingDir @@ -48,8 +46,3 @@ async def print_stream(stream): print(chunk, end="", flush=True) -loop = always_get_an_event_loop() -if inspect.isasyncgen(resp): - loop.run_until_complete(print_stream(resp)) -else: - print(resp) diff --git a/examples/lightrag_tidb_demo.py b/examples/lightrag_tidb_demo.py index f4004f84..f2ee9ad8 100644 --- a/examples/lightrag_tidb_demo.py +++ b/examples/lightrag_tidb_demo.py @@ -63,7 +63,6 @@ async def main(): # Initialize LightRAG # We use TiDB DB as the KV/vector - # You can add `addon_params={"example_number": 1, "language": "Simplfied Chinese"}` to control the prompt rag = LightRAG( enable_llm_cache=False, working_dir=WORKING_DIR, diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 481025af..8b695883 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -32,8 +32,10 @@ from .operate import ( from .prompt import GRAPH_FIELD_SEP from .utils import ( EmbeddingFunc, + always_get_an_event_loop, compute_mdhash_id, convert_response_to_json, + lazy_external_import, limit_async_func_call, logger, set_logger, @@ -182,48 +184,9 @@ STORAGES = { } -def lazy_external_import(module_name: str, class_name: str) -> Callable[..., Any]: - """Lazily import a class from an external module based on the package of the caller.""" - # Get the caller's module and package - import inspect - - caller_frame = inspect.currentframe().f_back - module = inspect.getmodule(caller_frame) - package = module.__package__ if module else None - - def import_class(*args: Any, **kwargs: Any): - import importlib - - module = importlib.import_module(module_name, package=package) - cls = getattr(module, class_name) - return cls(*args, **kwargs) - - return import_class -def always_get_an_event_loop() -> asyncio.AbstractEventLoop: - """ - Ensure that there is always an event loop available. - This function tries to get the current event loop. If the current event loop is closed or does not exist, - it creates a new event loop and sets it as the current event loop. - - Returns: - asyncio.AbstractEventLoop: The current or newly created event loop. 
- """ - try: - # Try to get the current event loop - current_loop = asyncio.get_event_loop() - if current_loop.is_closed(): - raise RuntimeError("Event loop is closed.") - return current_loop - - except RuntimeError: - # If no event loop exists or it is closed, create a new one - logger.info("Creating a new event loop in main thread.") - new_loop = asyncio.new_event_loop() - asyncio.set_event_loop(new_loop) - return new_loop @final @@ -428,46 +391,6 @@ class LightRAG: The default function is :func:`.utils.convert_response_to_json`. """ - def verify_storage_implementation( - self, storage_type: str, storage_name: str - ) -> None: - """Verify if storage implementation is compatible with specified storage type - - Args: - storage_type: Storage type (KV_STORAGE, GRAPH_STORAGE etc.) - storage_name: Storage implementation name - - Raises: - ValueError: If storage implementation is incompatible or missing required methods - """ - if storage_type not in STORAGE_IMPLEMENTATIONS: - raise ValueError(f"Unknown storage type: {storage_type}") - - storage_info = STORAGE_IMPLEMENTATIONS[storage_type] - if storage_name not in storage_info["implementations"]: - raise ValueError( - f"Storage implementation '{storage_name}' is not compatible with {storage_type}. " - f"Compatible implementations are: {', '.join(storage_info['implementations'])}" - ) - - def check_storage_env_vars(self, storage_name: str) -> None: - """Check if all required environment variables for storage implementation exist - - Args: - storage_name: Storage implementation name - - Raises: - ValueError: If required environment variables are missing - """ - required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, []) - missing_vars = [var for var in required_vars if var not in os.environ] - - if missing_vars: - raise ValueError( - f"Storage implementation '{storage_name}' requires the following " - f"environment variables: {', '.join(missing_vars)}" - ) - def __post_init__(self): os.makedirs(self.log_dir, exist_ok=True) log_file = os.path.join(self.log_dir, "lightrag.log") @@ -1681,3 +1604,43 @@ class LightRAG: result["vector_data"] = vector_data[0] if vector_data else None return result + + def verify_storage_implementation( + self, storage_type: str, storage_name: str + ) -> None: + """Verify if storage implementation is compatible with specified storage type + + Args: + storage_type: Storage type (KV_STORAGE, GRAPH_STORAGE etc.) + storage_name: Storage implementation name + + Raises: + ValueError: If storage implementation is incompatible or missing required methods + """ + if storage_type not in STORAGE_IMPLEMENTATIONS: + raise ValueError(f"Unknown storage type: {storage_type}") + + storage_info = STORAGE_IMPLEMENTATIONS[storage_type] + if storage_name not in storage_info["implementations"]: + raise ValueError( + f"Storage implementation '{storage_name}' is not compatible with {storage_type}. 
" + f"Compatible implementations are: {', '.join(storage_info['implementations'])}" + ) + + def check_storage_env_vars(self, storage_name: str) -> None: + """Check if all required environment variables for storage implementation exist + + Args: + storage_name: Storage implementation name + + Raises: + ValueError: If required environment variables are missing + """ + required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, []) + missing_vars = [var for var in required_vars if var not in os.environ] + + if missing_vars: + raise ValueError( + f"Storage implementation '{storage_name}' requires the following " + f"environment variables: {', '.join(missing_vars)}" + ) \ No newline at end of file diff --git a/lightrag/utils.py b/lightrag/utils.py index d932f149..62f62d4d 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -713,3 +713,47 @@ def get_conversation_turns( ) return "\n".join(formatted_turns) + +def always_get_an_event_loop() -> asyncio.AbstractEventLoop: + """ + Ensure that there is always an event loop available. + + This function tries to get the current event loop. If the current event loop is closed or does not exist, + it creates a new event loop and sets it as the current event loop. + + Returns: + asyncio.AbstractEventLoop: The current or newly created event loop. + """ + try: + # Try to get the current event loop + current_loop = asyncio.get_event_loop() + if current_loop.is_closed(): + raise RuntimeError("Event loop is closed.") + return current_loop + + except RuntimeError: + # If no event loop exists or it is closed, create a new one + logger.info("Creating a new event loop in main thread.") + new_loop = asyncio.new_event_loop() + asyncio.set_event_loop(new_loop) + return new_loop + + +def lazy_external_import(module_name: str, class_name: str) -> Callable[..., Any]: + """Lazily import a class from an external module based on the package of the caller.""" + # Get the caller's module and package + import inspect + + caller_frame = inspect.currentframe().f_back + module = inspect.getmodule(caller_frame) + package = module.__package__ if module else None + + def import_class(*args: Any, **kwargs: Any): + import importlib + + module = importlib.import_module(module_name, package=package) + cls = getattr(module, class_name) + return cls(*args, **kwargs) + + return import_class + \ No newline at end of file diff --git a/reproduce/Step_3.py b/reproduce/Step_3.py index f9ee3257..be5ba99d 100644 --- a/reproduce/Step_3.py +++ b/reproduce/Step_3.py @@ -1,7 +1,7 @@ import re import json -import asyncio from lightrag import LightRAG, QueryParam +from lightrag.utils import always_get_an_event_loop def extract_queries(file_path): @@ -23,14 +23,6 @@ async def process_query(query_text, rag_instance, query_param): return None, {"query": query_text, "error": str(e)} -def always_get_an_event_loop() -> asyncio.AbstractEventLoop: - try: - loop = asyncio.get_event_loop() - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - return loop - def run_queries_and_save_to_json( queries, rag_instance, query_param, output_file, error_file diff --git a/reproduce/Step_3_openai_compatible.py b/reproduce/Step_3_openai_compatible.py index e4833adf..b1d33f93 100644 --- a/reproduce/Step_3_openai_compatible.py +++ b/reproduce/Step_3_openai_compatible.py @@ -1,10 +1,9 @@ import os import re import json -import asyncio from lightrag import LightRAG, QueryParam from lightrag.llm.openai import openai_complete_if_cache, openai_embed -from lightrag.utils import EmbeddingFunc 
+from lightrag.utils import EmbeddingFunc, always_get_an_event_loop import numpy as np @@ -55,13 +54,7 @@ async def process_query(query_text, rag_instance, query_param): return None, {"query": query_text, "error": str(e)} -def always_get_an_event_loop() -> asyncio.AbstractEventLoop: - try: - loop = asyncio.get_event_loop() - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - return loop + def run_queries_and_save_to_json( From c7bc2c63cfaab68a263ebe14a626c24079a123b0 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:21:41 +0100 Subject: [PATCH 54/65] cleanup storages --- .../lightrag_openai_compatible_stream_demo.py | 2 - lightrag/kg/__init__.py | 137 +++++++++++++++- lightrag/lightrag.py | 147 +----------------- lightrag/utils.py | 6 +- reproduce/Step_3.py | 1 - reproduce/Step_3_openai_compatible.py | 3 - 6 files changed, 142 insertions(+), 154 deletions(-) diff --git a/examples/lightrag_openai_compatible_stream_demo.py b/examples/lightrag_openai_compatible_stream_demo.py index 7509e4dc..750f139e 100644 --- a/examples/lightrag_openai_compatible_stream_demo.py +++ b/examples/lightrag_openai_compatible_stream_demo.py @@ -44,5 +44,3 @@ async def print_stream(stream): async for chunk in stream: if chunk: print(chunk, end="", flush=True) - - diff --git a/lightrag/kg/__init__.py b/lightrag/kg/__init__.py index 087eaac9..2f3eae87 100644 --- a/lightrag/kg/__init__.py +++ b/lightrag/kg/__init__.py @@ -1 +1,136 @@ -# print ("init package vars here. ......") +STORAGE_IMPLEMENTATIONS = { + "KV_STORAGE": { + "implementations": [ + "JsonKVStorage", + "MongoKVStorage", + "RedisKVStorage", + "TiDBKVStorage", + "PGKVStorage", + "OracleKVStorage", + ], + "required_methods": ["get_by_id", "upsert"], + }, + "GRAPH_STORAGE": { + "implementations": [ + "NetworkXStorage", + "Neo4JStorage", + "MongoGraphStorage", + "TiDBGraphStorage", + "AGEStorage", + "GremlinStorage", + "PGGraphStorage", + "OracleGraphStorage", + ], + "required_methods": ["upsert_node", "upsert_edge"], + }, + "VECTOR_STORAGE": { + "implementations": [ + "NanoVectorDBStorage", + "MilvusVectorDBStorage", + "ChromaVectorDBStorage", + "TiDBVectorDBStorage", + "PGVectorStorage", + "FaissVectorDBStorage", + "QdrantVectorDBStorage", + "OracleVectorDBStorage", + "MongoVectorDBStorage", + ], + "required_methods": ["query", "upsert"], + }, + "DOC_STATUS_STORAGE": { + "implementations": [ + "JsonDocStatusStorage", + "PGDocStatusStorage", + "PGDocStatusStorage", + "MongoDocStatusStorage", + ], + "required_methods": ["get_docs_by_status"], + }, +} + +# Storage implementation environment variable without default value +STORAGE_ENV_REQUIREMENTS: dict[str, list[str]] = { + # KV Storage Implementations + "JsonKVStorage": [], + "MongoKVStorage": [], + "RedisKVStorage": ["REDIS_URI"], + "TiDBKVStorage": ["TIDB_USER", "TIDB_PASSWORD", "TIDB_DATABASE"], + "PGKVStorage": ["POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_DATABASE"], + "OracleKVStorage": [ + "ORACLE_DSN", + "ORACLE_USER", + "ORACLE_PASSWORD", + "ORACLE_CONFIG_DIR", + ], + # Graph Storage Implementations + "NetworkXStorage": [], + "Neo4JStorage": ["NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD"], + "MongoGraphStorage": [], + "TiDBGraphStorage": ["TIDB_USER", "TIDB_PASSWORD", "TIDB_DATABASE"], + "AGEStorage": [ + "AGE_POSTGRES_DB", + "AGE_POSTGRES_USER", + "AGE_POSTGRES_PASSWORD", + ], + "GremlinStorage": ["GREMLIN_HOST", "GREMLIN_PORT", "GREMLIN_GRAPH"], + "PGGraphStorage": [ + "POSTGRES_USER", + "POSTGRES_PASSWORD", + 
"POSTGRES_DATABASE", + ], + "OracleGraphStorage": [ + "ORACLE_DSN", + "ORACLE_USER", + "ORACLE_PASSWORD", + "ORACLE_CONFIG_DIR", + ], + # Vector Storage Implementations + "NanoVectorDBStorage": [], + "MilvusVectorDBStorage": [], + "ChromaVectorDBStorage": [], + "TiDBVectorDBStorage": ["TIDB_USER", "TIDB_PASSWORD", "TIDB_DATABASE"], + "PGVectorStorage": ["POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_DATABASE"], + "FaissVectorDBStorage": [], + "QdrantVectorDBStorage": ["QDRANT_URL"], # QDRANT_API_KEY has default value None + "OracleVectorDBStorage": [ + "ORACLE_DSN", + "ORACLE_USER", + "ORACLE_PASSWORD", + "ORACLE_CONFIG_DIR", + ], + "MongoVectorDBStorage": [], + # Document Status Storage Implementations + "JsonDocStatusStorage": [], + "PGDocStatusStorage": ["POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_DATABASE"], + "MongoDocStatusStorage": [], +} + +# Storage implementation module mapping +STORAGES = { + "NetworkXStorage": ".kg.networkx_impl", + "JsonKVStorage": ".kg.json_kv_impl", + "NanoVectorDBStorage": ".kg.nano_vector_db_impl", + "JsonDocStatusStorage": ".kg.json_doc_status_impl", + "Neo4JStorage": ".kg.neo4j_impl", + "OracleKVStorage": ".kg.oracle_impl", + "OracleGraphStorage": ".kg.oracle_impl", + "OracleVectorDBStorage": ".kg.oracle_impl", + "MilvusVectorDBStorage": ".kg.milvus_impl", + "MongoKVStorage": ".kg.mongo_impl", + "MongoDocStatusStorage": ".kg.mongo_impl", + "MongoGraphStorage": ".kg.mongo_impl", + "MongoVectorDBStorage": ".kg.mongo_impl", + "RedisKVStorage": ".kg.redis_impl", + "ChromaVectorDBStorage": ".kg.chroma_impl", + "TiDBKVStorage": ".kg.tidb_impl", + "TiDBVectorDBStorage": ".kg.tidb_impl", + "TiDBGraphStorage": ".kg.tidb_impl", + "PGKVStorage": ".kg.postgres_impl", + "PGVectorStorage": ".kg.postgres_impl", + "AGEStorage": ".kg.age_impl", + "PGGraphStorage": ".kg.postgres_impl", + "GremlinStorage": ".kg.gremlin_impl", + "PGDocStatusStorage": ".kg.postgres_impl", + "FaissVectorDBStorage": ".kg.faiss_impl", + "QdrantVectorDBStorage": ".kg.qdrant_impl", +} diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 8b695883..174947f3 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -8,6 +8,8 @@ from datetime import datetime from functools import partial from typing import Any, AsyncIterator, Callable, Iterator, cast, final +from lightrag.kg import STORAGE_ENV_REQUIREMENTS, STORAGE_IMPLEMENTATIONS, STORAGES + from .base import ( BaseGraphStorage, BaseKVStorage, @@ -45,149 +47,6 @@ from .utils import ( config = configparser.ConfigParser() config.read("config.ini", "utf-8") -# Storage type and implementation compatibility validation table -STORAGE_IMPLEMENTATIONS = { - "KV_STORAGE": { - "implementations": [ - "JsonKVStorage", - "MongoKVStorage", - "RedisKVStorage", - "TiDBKVStorage", - "PGKVStorage", - "OracleKVStorage", - ], - "required_methods": ["get_by_id", "upsert"], - }, - "GRAPH_STORAGE": { - "implementations": [ - "NetworkXStorage", - "Neo4JStorage", - "MongoGraphStorage", - "TiDBGraphStorage", - "AGEStorage", - "GremlinStorage", - "PGGraphStorage", - "OracleGraphStorage", - ], - "required_methods": ["upsert_node", "upsert_edge"], - }, - "VECTOR_STORAGE": { - "implementations": [ - "NanoVectorDBStorage", - "MilvusVectorDBStorage", - "ChromaVectorDBStorage", - "TiDBVectorDBStorage", - "PGVectorStorage", - "FaissVectorDBStorage", - "QdrantVectorDBStorage", - "OracleVectorDBStorage", - "MongoVectorDBStorage", - ], - "required_methods": ["query", "upsert"], - }, - "DOC_STATUS_STORAGE": { - "implementations": [ - "JsonDocStatusStorage", - 
"PGDocStatusStorage", - "PGDocStatusStorage", - "MongoDocStatusStorage", - ], - "required_methods": ["get_docs_by_status"], - }, -} - -# Storage implementation environment variable without default value -STORAGE_ENV_REQUIREMENTS: dict[str, list[str]] = { - # KV Storage Implementations - "JsonKVStorage": [], - "MongoKVStorage": [], - "RedisKVStorage": ["REDIS_URI"], - "TiDBKVStorage": ["TIDB_USER", "TIDB_PASSWORD", "TIDB_DATABASE"], - "PGKVStorage": ["POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_DATABASE"], - "OracleKVStorage": [ - "ORACLE_DSN", - "ORACLE_USER", - "ORACLE_PASSWORD", - "ORACLE_CONFIG_DIR", - ], - # Graph Storage Implementations - "NetworkXStorage": [], - "Neo4JStorage": ["NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD"], - "MongoGraphStorage": [], - "TiDBGraphStorage": ["TIDB_USER", "TIDB_PASSWORD", "TIDB_DATABASE"], - "AGEStorage": [ - "AGE_POSTGRES_DB", - "AGE_POSTGRES_USER", - "AGE_POSTGRES_PASSWORD", - ], - "GremlinStorage": ["GREMLIN_HOST", "GREMLIN_PORT", "GREMLIN_GRAPH"], - "PGGraphStorage": [ - "POSTGRES_USER", - "POSTGRES_PASSWORD", - "POSTGRES_DATABASE", - ], - "OracleGraphStorage": [ - "ORACLE_DSN", - "ORACLE_USER", - "ORACLE_PASSWORD", - "ORACLE_CONFIG_DIR", - ], - # Vector Storage Implementations - "NanoVectorDBStorage": [], - "MilvusVectorDBStorage": [], - "ChromaVectorDBStorage": [], - "TiDBVectorDBStorage": ["TIDB_USER", "TIDB_PASSWORD", "TIDB_DATABASE"], - "PGVectorStorage": ["POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_DATABASE"], - "FaissVectorDBStorage": [], - "QdrantVectorDBStorage": ["QDRANT_URL"], # QDRANT_API_KEY has default value None - "OracleVectorDBStorage": [ - "ORACLE_DSN", - "ORACLE_USER", - "ORACLE_PASSWORD", - "ORACLE_CONFIG_DIR", - ], - "MongoVectorDBStorage": [], - # Document Status Storage Implementations - "JsonDocStatusStorage": [], - "PGDocStatusStorage": ["POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_DATABASE"], - "MongoDocStatusStorage": [], -} - -# Storage implementation module mapping -STORAGES = { - "NetworkXStorage": ".kg.networkx_impl", - "JsonKVStorage": ".kg.json_kv_impl", - "NanoVectorDBStorage": ".kg.nano_vector_db_impl", - "JsonDocStatusStorage": ".kg.json_doc_status_impl", - "Neo4JStorage": ".kg.neo4j_impl", - "OracleKVStorage": ".kg.oracle_impl", - "OracleGraphStorage": ".kg.oracle_impl", - "OracleVectorDBStorage": ".kg.oracle_impl", - "MilvusVectorDBStorage": ".kg.milvus_impl", - "MongoKVStorage": ".kg.mongo_impl", - "MongoDocStatusStorage": ".kg.mongo_impl", - "MongoGraphStorage": ".kg.mongo_impl", - "MongoVectorDBStorage": ".kg.mongo_impl", - "RedisKVStorage": ".kg.redis_impl", - "ChromaVectorDBStorage": ".kg.chroma_impl", - "TiDBKVStorage": ".kg.tidb_impl", - "TiDBVectorDBStorage": ".kg.tidb_impl", - "TiDBGraphStorage": ".kg.tidb_impl", - "PGKVStorage": ".kg.postgres_impl", - "PGVectorStorage": ".kg.postgres_impl", - "AGEStorage": ".kg.age_impl", - "PGGraphStorage": ".kg.postgres_impl", - "GremlinStorage": ".kg.gremlin_impl", - "PGDocStatusStorage": ".kg.postgres_impl", - "FaissVectorDBStorage": ".kg.faiss_impl", - "QdrantVectorDBStorage": ".kg.qdrant_impl", -} - - - - - - @final @dataclass @@ -1643,4 +1502,4 @@ class LightRAG: raise ValueError( f"Storage implementation '{storage_name}' requires the following " f"environment variables: {', '.join(missing_vars)}" - ) \ No newline at end of file + ) diff --git a/lightrag/utils.py b/lightrag/utils.py index 62f62d4d..d402d14c 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -714,6 +714,7 @@ def get_conversation_turns( return "\n".join(formatted_turns) + def 
always_get_an_event_loop() -> asyncio.AbstractEventLoop: """ Ensure that there is always an event loop available. @@ -737,8 +738,8 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop: new_loop = asyncio.new_event_loop() asyncio.set_event_loop(new_loop) return new_loop - - + + def lazy_external_import(module_name: str, class_name: str) -> Callable[..., Any]: """Lazily import a class from an external module based on the package of the caller.""" # Get the caller's module and package @@ -756,4 +757,3 @@ def lazy_external_import(module_name: str, class_name: str) -> Callable[..., Any return cls(*args, **kwargs) return import_class - \ No newline at end of file diff --git a/reproduce/Step_3.py b/reproduce/Step_3.py index be5ba99d..facb913e 100644 --- a/reproduce/Step_3.py +++ b/reproduce/Step_3.py @@ -23,7 +23,6 @@ async def process_query(query_text, rag_instance, query_param): return None, {"query": query_text, "error": str(e)} - def run_queries_and_save_to_json( queries, rag_instance, query_param, output_file, error_file ): diff --git a/reproduce/Step_3_openai_compatible.py b/reproduce/Step_3_openai_compatible.py index b1d33f93..885220fa 100644 --- a/reproduce/Step_3_openai_compatible.py +++ b/reproduce/Step_3_openai_compatible.py @@ -54,9 +54,6 @@ async def process_query(query_text, rag_instance, query_param): return None, {"query": query_text, "error": str(e)} - - - def run_queries_and_save_to_json( queries, rag_instance, query_param, output_file, error_file ): From 59bb75d4a1b3552381a6f53a02753fd818608ec3 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:27:55 +0100 Subject: [PATCH 55/65] added log path --- lightrag/lightrag.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 174947f3..9f4db5ab 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -82,8 +82,8 @@ class LightRAG: log_level: int = field(default=logger.level) """Logging level for the system (e.g., 'DEBUG', 'INFO', 'WARNING').""" - log_dir: str = field(default=os.getcwd()) - """Directory where logs are stored. Defaults to the current working directory.""" + log_file_path: str = field(default=os.path.join(os.getcwd(), "lightrag.log")) + """Log file path.""" # Entity extraction # --- @@ -251,9 +251,8 @@ class LightRAG: """ def __post_init__(self): - os.makedirs(self.log_dir, exist_ok=True) - log_file = os.path.join(self.log_dir, "lightrag.log") - set_logger(log_file) + os.makedirs(os.path.dirname(self.log_file_path), exist_ok=True) + set_logger(self.log_file_path) logger.setLevel(self.log_level) logger.info(f"Logger initialized for working directory: {self.working_dir}") From 60717fd6be185cc3d4a93f57473a7091adc5b327 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:30:30 +0100 Subject: [PATCH 56/65] cleanup storage state --- lightrag/lightrag.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 9f4db5ab..1a8dcf5c 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -250,12 +250,14 @@ class LightRAG: The default function is :func:`.utils.convert_response_to_json`. 
""" + _storages_status: StoragesStatus = field(default=StoragesStatus.NOT_CREATED) + def __post_init__(self): + logger.setLevel(self.log_level) os.makedirs(os.path.dirname(self.log_file_path), exist_ok=True) set_logger(self.log_file_path) - - logger.setLevel(self.log_level) logger.info(f"Logger initialized for working directory: {self.working_dir}") + if not os.path.exists(self.working_dir): logger.info(f"Creating working directory {self.working_dir}") os.makedirs(self.working_dir) @@ -283,9 +285,6 @@ class LightRAG: **self.vector_db_storage_cls_kwargs, } - # Life cycle - self.storages_status = StoragesStatus.NOT_CREATED - # Show config global_config = asdict(self) _print_config = ",\n ".join([f"{k} = {v}" for k, v in global_config.items()]) @@ -393,7 +392,7 @@ class LightRAG: ) ) - self.storages_status = StoragesStatus.CREATED + self._storages_status = StoragesStatus.CREATED # Initialize storages if self.auto_manage_storages_states: @@ -408,7 +407,7 @@ class LightRAG: async def initialize_storages(self): """Asynchronously initialize the storages""" - if self.storages_status == StoragesStatus.CREATED: + if self._storages_status == StoragesStatus.CREATED: tasks = [] for storage in ( @@ -426,12 +425,12 @@ class LightRAG: await asyncio.gather(*tasks) - self.storages_status = StoragesStatus.INITIALIZED + self._storages_status = StoragesStatus.INITIALIZED logger.debug("Initialized Storages") async def finalize_storages(self): """Asynchronously finalize the storages""" - if self.storages_status == StoragesStatus.INITIALIZED: + if self._storages_status == StoragesStatus.INITIALIZED: tasks = [] for storage in ( @@ -449,7 +448,7 @@ class LightRAG: await asyncio.gather(*tasks) - self.storages_status = StoragesStatus.FINALIZED + self._storages_status = StoragesStatus.FINALIZED logger.debug("Finalized Storages") def _get_storage_class(self, storage_name: str) -> Callable[..., Any]: From 38dc2466dade429e1a54c07103779aff071a2012 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:34:47 +0100 Subject: [PATCH 57/65] cleanup --- examples/lightrag_openai_compatible_stream_demo.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/lightrag_openai_compatible_stream_demo.py b/examples/lightrag_openai_compatible_stream_demo.py index 750f139e..a974ca14 100644 --- a/examples/lightrag_openai_compatible_stream_demo.py +++ b/examples/lightrag_openai_compatible_stream_demo.py @@ -1,7 +1,8 @@ +import inspect import os from lightrag import LightRAG from lightrag.llm import openai_complete, openai_embed -from lightrag.utils import EmbeddingFunc +from lightrag.utils import EmbeddingFunc, always_get_an_event_loop from lightrag import QueryParam # WorkingDir @@ -44,3 +45,10 @@ async def print_stream(stream): async for chunk in stream: if chunk: print(chunk, end="", flush=True) + + +loop = always_get_an_event_loop() +if inspect.isasyncgen(resp): + loop.run_until_complete(print_stream(resp)) +else: + print(resp) From 4923f5c1d794e0293aeeea5050780cd62249b05e Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:39:46 +0100 Subject: [PATCH 58/65] cleanup kg --- lightrag/kg/__init__.py | 22 ++++++++++++++++++++++ lightrag/lightrag.py | 26 +++----------------------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/lightrag/kg/__init__.py b/lightrag/kg/__init__.py index 2f3eae87..18aa2796 100644 --- a/lightrag/kg/__init__.py +++ b/lightrag/kg/__init__.py @@ -134,3 +134,25 @@ STORAGES = { 
"FaissVectorDBStorage": ".kg.faiss_impl", "QdrantVectorDBStorage": ".kg.qdrant_impl", } + +def verify_storage_implementation( + storage_type: str, storage_name: str + ) -> None: + """Verify if storage implementation is compatible with specified storage type + + Args: + storage_type: Storage type (KV_STORAGE, GRAPH_STORAGE etc.) + storage_name: Storage implementation name + + Raises: + ValueError: If storage implementation is incompatible or missing required methods + """ + if storage_type not in STORAGE_IMPLEMENTATIONS: + raise ValueError(f"Unknown storage type: {storage_type}") + + storage_info = STORAGE_IMPLEMENTATIONS[storage_type] + if storage_name not in storage_info["implementations"]: + raise ValueError( + f"Storage implementation '{storage_name}' is not compatible with {storage_type}. " + f"Compatible implementations are: {', '.join(storage_info['implementations'])}" + ) \ No newline at end of file diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 1a8dcf5c..ca9733ce 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -8,7 +8,7 @@ from datetime import datetime from functools import partial from typing import Any, AsyncIterator, Callable, Iterator, cast, final -from lightrag.kg import STORAGE_ENV_REQUIREMENTS, STORAGE_IMPLEMENTATIONS, STORAGES +from lightrag.kg import STORAGE_ENV_REQUIREMENTS, STORAGES, verify_storage_implementation from .base import ( BaseGraphStorage, @@ -44,6 +44,7 @@ from .utils import ( encode_string_by_tiktoken, ) +# TODO: TO REMOVE @Yannick config = configparser.ConfigParser() config.read("config.ini", "utf-8") @@ -272,7 +273,7 @@ class LightRAG: for storage_type, storage_name in storage_configs: # Verify storage implementation compatibility - self.verify_storage_implementation(storage_type, storage_name) + verify_storage_implementation(storage_type, storage_name) # Check environment variables # self.check_storage_env_vars(storage_name) @@ -1462,27 +1463,6 @@ class LightRAG: return result - def verify_storage_implementation( - self, storage_type: str, storage_name: str - ) -> None: - """Verify if storage implementation is compatible with specified storage type - - Args: - storage_type: Storage type (KV_STORAGE, GRAPH_STORAGE etc.) - storage_name: Storage implementation name - - Raises: - ValueError: If storage implementation is incompatible or missing required methods - """ - if storage_type not in STORAGE_IMPLEMENTATIONS: - raise ValueError(f"Unknown storage type: {storage_type}") - - storage_info = STORAGE_IMPLEMENTATIONS[storage_type] - if storage_name not in storage_info["implementations"]: - raise ValueError( - f"Storage implementation '{storage_name}' is not compatible with {storage_type}. 
" - f"Compatible implementations are: {', '.join(storage_info['implementations'])}" - ) def check_storage_env_vars(self, storage_name: str) -> None: """Check if all required environment variables for storage implementation exist From 8413537ad2879c830d6769704db22c1d2987f7ef Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 13:44:17 +0100 Subject: [PATCH 59/65] cleanup --- lightrag/kg/__init__.py | 35 +++++++++++++++++------------------ lightrag/lightrag.py | 16 ++++++++++------ 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/lightrag/kg/__init__.py b/lightrag/kg/__init__.py index 18aa2796..4943fc1d 100644 --- a/lightrag/kg/__init__.py +++ b/lightrag/kg/__init__.py @@ -135,24 +135,23 @@ STORAGES = { "QdrantVectorDBStorage": ".kg.qdrant_impl", } -def verify_storage_implementation( - storage_type: str, storage_name: str - ) -> None: - """Verify if storage implementation is compatible with specified storage type - Args: - storage_type: Storage type (KV_STORAGE, GRAPH_STORAGE etc.) - storage_name: Storage implementation name +def verify_storage_implementation(storage_type: str, storage_name: str) -> None: + """Verify if storage implementation is compatible with specified storage type - Raises: - ValueError: If storage implementation is incompatible or missing required methods - """ - if storage_type not in STORAGE_IMPLEMENTATIONS: - raise ValueError(f"Unknown storage type: {storage_type}") + Args: + storage_type: Storage type (KV_STORAGE, GRAPH_STORAGE etc.) + storage_name: Storage implementation name - storage_info = STORAGE_IMPLEMENTATIONS[storage_type] - if storage_name not in storage_info["implementations"]: - raise ValueError( - f"Storage implementation '{storage_name}' is not compatible with {storage_type}. " - f"Compatible implementations are: {', '.join(storage_info['implementations'])}" - ) \ No newline at end of file + Raises: + ValueError: If storage implementation is incompatible or missing required methods + """ + if storage_type not in STORAGE_IMPLEMENTATIONS: + raise ValueError(f"Unknown storage type: {storage_type}") + + storage_info = STORAGE_IMPLEMENTATIONS[storage_type] + if storage_name not in storage_info["implementations"]: + raise ValueError( + f"Storage implementation '{storage_name}' is not compatible with {storage_type}. " + f"Compatible implementations are: {', '.join(storage_info['implementations'])}" + ) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index ca9733ce..5bb05764 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -8,7 +8,11 @@ from datetime import datetime from functools import partial from typing import Any, AsyncIterator, Callable, Iterator, cast, final -from lightrag.kg import STORAGE_ENV_REQUIREMENTS, STORAGES, verify_storage_implementation +from lightrag.kg import ( + STORAGE_ENV_REQUIREMENTS, + STORAGES, + verify_storage_implementation, +) from .base import ( BaseGraphStorage, @@ -251,6 +255,10 @@ class LightRAG: The default function is :func:`.utils.convert_response_to_json`. 
""" + cosine_better_than_threshold: float = field( + default=float(os.getenv("COSINE_THRESHOLD", 0.2)) + ) + _storages_status: StoragesStatus = field(default=StoragesStatus.NOT_CREATED) def __post_init__(self): @@ -278,11 +286,8 @@ class LightRAG: # self.check_storage_env_vars(storage_name) # Ensure vector_db_storage_cls_kwargs has required fields - default_vector_db_kwargs = { - "cosine_better_than_threshold": float(os.getenv("COSINE_THRESHOLD", "0.2")) - } self.vector_db_storage_cls_kwargs = { - **default_vector_db_kwargs, + "cosine_better_than_threshold": self.cosine_better_than_threshold, **self.vector_db_storage_cls_kwargs, } @@ -1463,7 +1468,6 @@ class LightRAG: return result - def check_storage_env_vars(self, storage_name: str) -> None: """Check if all required environment variables for storage implementation exist From 3e820cc68ea08127c821f806eb822c96f4cc21b1 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 14:04:59 +0100 Subject: [PATCH 60/65] fixed default factory --- lightrag/lightrag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 5bb05764..990c1bcf 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -184,7 +184,7 @@ class LightRAG: """Maximum number of concurrent embedding function calls.""" embedding_cache_config: dict[str, Any] = field( - default={ + default_factory= lambda: { "enabled": False, "similarity_threshold": 0.95, "use_llm_check": False, From 214e3e8ad5c4d479d73afc3aee72ecdbfd3b0bf3 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 14:12:19 +0100 Subject: [PATCH 61/65] fixed last update --- examples/test_faiss.py | 2 +- lightrag/__init__.py | 2 +- lightrag/kg/networkx_impl.py | 4 ++-- lightrag/lightrag.py | 5 ++--- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/examples/test_faiss.py b/examples/test_faiss.py index ab0ef9f7..c3ac6f47 100644 --- a/examples/test_faiss.py +++ b/examples/test_faiss.py @@ -70,7 +70,7 @@ def main(): ), vector_storage="FaissVectorDBStorage", vector_db_storage_cls_kwargs={ - "cosine_better_than_threshold": 0.3 # Your desired threshold + "cosine_better_than_threshold": 0.2 # Your desired threshold }, ) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index 025fb73b..99f4052f 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam -__version__ = "1.1.7" +__version__ = "1.1.10" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" diff --git a/lightrag/kg/networkx_impl.py b/lightrag/kg/networkx_impl.py index 614715c4..853bd369 100644 --- a/lightrag/kg/networkx_impl.py +++ b/lightrag/kg/networkx_impl.py @@ -16,12 +16,12 @@ import pipmaster as pm if not pm.is_installed("networkx"): pm.install("networkx") + if not pm.is_installed("graspologic"): pm.install("graspologic") -from graspologic import embed import networkx as nx - +from graspologic import embed @final @dataclass diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 990c1bcf..38a6e835 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -738,9 +738,8 @@ class LightRAG: if new_kg is None: logger.info("No new entities or relationships extracted.") else: - async with self._entity_lock: - logger.info("New entities or relationships extracted.") - self.chunk_entity_relation_graph = new_kg + logger.info("New entities or relationships extracted.") + 
self.chunk_entity_relation_graph = new_kg except Exception as e: logger.error("Failed to extract entities and relationships") From c4562f71b9dcac80fd95b5e5c32dae7d6fba3a67 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 14:17:26 +0100 Subject: [PATCH 62/65] cleanup extraction --- lightrag/kg/networkx_impl.py | 3 ++- lightrag/lightrag.py | 10 ++-------- lightrag/operate.py | 24 ++++++++++++------------ 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/lightrag/kg/networkx_impl.py b/lightrag/kg/networkx_impl.py index 853bd369..1874719f 100644 --- a/lightrag/kg/networkx_impl.py +++ b/lightrag/kg/networkx_impl.py @@ -16,13 +16,14 @@ import pipmaster as pm if not pm.is_installed("networkx"): pm.install("networkx") - + if not pm.is_installed("graspologic"): pm.install("graspologic") import networkx as nx from graspologic import embed + @final @dataclass class NetworkXStorage(BaseGraphStorage): diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 38a6e835..71784a8b 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -184,7 +184,7 @@ class LightRAG: """Maximum number of concurrent embedding function calls.""" embedding_cache_config: dict[str, Any] = field( - default_factory= lambda: { + default_factory=lambda: { "enabled": False, "similarity_threshold": 0.95, "use_llm_check": False, @@ -727,7 +727,7 @@ class LightRAG: async def _process_entity_relation_graph(self, chunk: dict[str, Any]) -> None: try: - new_kg = await extract_entities( + await extract_entities( chunk, knowledge_graph_inst=self.chunk_entity_relation_graph, entity_vdb=self.entities_vdb, @@ -735,12 +735,6 @@ class LightRAG: llm_response_cache=self.llm_response_cache, global_config=asdict(self), ) - if new_kg is None: - logger.info("No new entities or relationships extracted.") - else: - logger.info("New entities or relationships extracted.") - self.chunk_entity_relation_graph = new_kg - except Exception as e: logger.error("Failed to extract entities and relationships") raise e diff --git a/lightrag/operate.py b/lightrag/operate.py index 27950b7d..a79192ac 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -329,7 +329,7 @@ async def extract_entities( relationships_vdb: BaseVectorStorage, global_config: dict[str, str], llm_response_cache: BaseKVStorage | None = None, -) -> BaseGraphStorage | None: +) -> None: use_llm_func: callable = global_config["llm_model_func"] entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"] enable_llm_cache_for_entity_extract: bool = global_config[ @@ -522,16 +522,18 @@ async def extract_entities( ] ) - if not len(all_entities_data) and not len(all_relationships_data): - logger.warning( - "Didn't extract any entities and relationships, maybe your LLM is not working" - ) - return None + if not (all_entities_data or all_relationships_data): + logger.info("Didn't extract any entities and relationships.") + return - if not len(all_entities_data): - logger.warning("Didn't extract any entities") - if not len(all_relationships_data): - logger.warning("Didn't extract any relationships") + if not all_entities_data: + logger.info("Didn't extract any entities") + if not all_relationships_data: + logger.info("Didn't extract any relationships") + + logger.info( + f"New entities or relationships extracted, entities:{all_entities_data}, relationships:{all_relationships_data}" + ) if entity_vdb is not None: data_for_vdb = { @@ -560,8 +562,6 @@ async def extract_entities( } await 
relationships_vdb.upsert(data_for_vdb) - return knowledge_graph_inst - async def kg_query( query: str, From 439685e69c2a12931fd38ebfd31517ae0c0f5e13 Mon Sep 17 00:00:00 2001 From: Yannick Stephan <stephan.yannick@me.com> Date: Thu, 20 Feb 2025 14:29:36 +0100 Subject: [PATCH 63/65] Revert "removed get_knowledge_graph" --- lightrag/api/lightrag_server.py | 4 + lightrag/base.py | 7 ++ lightrag/kg/age_impl.py | 6 ++ lightrag/kg/gremlin_impl.py | 6 ++ lightrag/kg/mongo_impl.py | 174 ++++++++++++++++++++++++++++++++ lightrag/kg/neo4j_impl.py | 94 +++++++++++++++++ lightrag/kg/networkx_impl.py | 6 ++ lightrag/kg/oracle_impl.py | 6 ++ lightrag/kg/postgres_impl.py | 6 ++ lightrag/kg/tidb_impl.py | 7 ++ lightrag/lightrag.py | 8 ++ 11 files changed, 324 insertions(+) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 58931eec..96315b82 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -1683,6 +1683,10 @@ def create_app(args): raise HTTPException(status_code=500, detail=str(e)) # query all graph + @app.get("/graphs") + async def get_knowledge_graph(label: str): + return await rag.get_knowledge_graph(nodel_label=label, max_depth=100) + # Add Ollama API routes ollama_api = OllamaAPI(rag, top_k=args.top_k) app.include_router(ollama_api.router, prefix="/api") diff --git a/lightrag/base.py b/lightrag/base.py index 5f6f8850..af060435 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -13,6 +13,7 @@ from typing import ( ) import numpy as np from .utils import EmbeddingFunc +from .types import KnowledgeGraph load_dotenv() @@ -197,6 +198,12 @@ class BaseGraphStorage(StorageNameSpace, ABC): ) -> tuple[np.ndarray[Any, Any], list[str]]: """Get all labels in the graph.""" + @abstractmethod + async def get_knowledge_graph( + self, node_label: str, max_depth: int = 5 + ) -> KnowledgeGraph: + """Retrieve a subgraph of the knowledge graph starting from a given node.""" + class DocStatus(str, Enum): """Document processing status""" diff --git a/lightrag/kg/age_impl.py b/lightrag/kg/age_impl.py index 583423bb..077c7321 100644 --- a/lightrag/kg/age_impl.py +++ b/lightrag/kg/age_impl.py @@ -8,6 +8,7 @@ from dataclasses import dataclass from typing import Any, Dict, List, NamedTuple, Optional, Union, final import numpy as np import pipmaster as pm +from lightrag.types import KnowledgeGraph from tenacity import ( retry, @@ -615,6 +616,11 @@ class AGEStorage(BaseGraphStorage): ) -> tuple[np.ndarray[Any, Any], list[str]]: raise NotImplementedError + async def get_knowledge_graph( + self, node_label: str, max_depth: int = 5 + ) -> KnowledgeGraph: + raise NotImplementedError + async def index_done_callback(self) -> None: # AGES handles persistence automatically pass diff --git a/lightrag/kg/gremlin_impl.py b/lightrag/kg/gremlin_impl.py index 45bc1fab..39077b5f 100644 --- a/lightrag/kg/gremlin_impl.py +++ b/lightrag/kg/gremlin_impl.py @@ -16,6 +16,7 @@ from tenacity import ( wait_exponential, ) +from lightrag.types import KnowledgeGraph from lightrag.utils import logger from ..base import BaseGraphStorage @@ -401,3 +402,8 @@ class GremlinStorage(BaseGraphStorage): self, algorithm: str ) -> tuple[np.ndarray[Any, Any], list[str]]: raise NotImplementedError + + async def get_knowledge_graph( + self, node_label: str, max_depth: int = 5 + ) -> KnowledgeGraph: + raise NotImplementedError diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index cfae4abd..07b48f8b 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -16,6 
+16,7 @@ from ..base import ( ) from ..namespace import NameSpace, is_namespace from ..utils import logger +from ..types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge import pipmaster as pm if not pm.is_installed("pymongo"): @@ -598,6 +599,179 @@ class MongoGraphStorage(BaseGraphStorage): # ------------------------------------------------------------------------- # QUERY # ------------------------------------------------------------------------- + # + + async def get_knowledge_graph( + self, node_label: str, max_depth: int = 5 + ) -> KnowledgeGraph: + """ + Get complete connected subgraph for specified node (including the starting node itself) + + Args: + node_label: Label of the nodes to start from + max_depth: Maximum depth of traversal (default: 5) + + Returns: + KnowledgeGraph object containing nodes and edges of the subgraph + """ + label = node_label + result = KnowledgeGraph() + seen_nodes = set() + seen_edges = set() + + try: + if label == "*": + # Get all nodes and edges + async for node_doc in self.collection.find({}): + node_id = str(node_doc["_id"]) + if node_id not in seen_nodes: + result.nodes.append( + KnowledgeGraphNode( + id=node_id, + labels=[node_doc.get("_id")], + properties={ + k: v + for k, v in node_doc.items() + if k not in ["_id", "edges"] + }, + ) + ) + seen_nodes.add(node_id) + + # Process edges + for edge in node_doc.get("edges", []): + edge_id = f"{node_id}-{edge['target']}" + if edge_id not in seen_edges: + result.edges.append( + KnowledgeGraphEdge( + id=edge_id, + type=edge.get("relation", ""), + source=node_id, + target=edge["target"], + properties={ + k: v + for k, v in edge.items() + if k not in ["target", "relation"] + }, + ) + ) + seen_edges.add(edge_id) + else: + # Verify if starting node exists + start_nodes = self.collection.find({"_id": label}) + start_nodes_exist = await start_nodes.to_list(length=1) + if not start_nodes_exist: + logger.warning(f"Starting node with label {label} does not exist!") + return result + + # Use $graphLookup for traversal + pipeline = [ + { + "$match": {"_id": label} + }, # Start with nodes having the specified label + { + "$graphLookup": { + "from": self._collection_name, + "startWith": "$edges.target", + "connectFromField": "edges.target", + "connectToField": "_id", + "maxDepth": max_depth, + "depthField": "depth", + "as": "connected_nodes", + } + }, + ] + + async for doc in self.collection.aggregate(pipeline): + # Add the start node + node_id = str(doc["_id"]) + if node_id not in seen_nodes: + result.nodes.append( + KnowledgeGraphNode( + id=node_id, + labels=[ + doc.get( + "_id", + ) + ], + properties={ + k: v + for k, v in doc.items() + if k + not in [ + "_id", + "edges", + "connected_nodes", + "depth", + ] + }, + ) + ) + seen_nodes.add(node_id) + + # Add edges from start node + for edge in doc.get("edges", []): + edge_id = f"{node_id}-{edge['target']}" + if edge_id not in seen_edges: + result.edges.append( + KnowledgeGraphEdge( + id=edge_id, + type=edge.get("relation", ""), + source=node_id, + target=edge["target"], + properties={ + k: v + for k, v in edge.items() + if k not in ["target", "relation"] + }, + ) + ) + seen_edges.add(edge_id) + + # Add connected nodes and their edges + for connected in doc.get("connected_nodes", []): + node_id = str(connected["_id"]) + if node_id not in seen_nodes: + result.nodes.append( + KnowledgeGraphNode( + id=node_id, + labels=[connected.get("_id")], + properties={ + k: v + for k, v in connected.items() + if k not in ["_id", "edges", "depth"] + }, + ) + ) + 
seen_nodes.add(node_id) + + # Add edges from connected nodes + for edge in connected.get("edges", []): + edge_id = f"{node_id}-{edge['target']}" + if edge_id not in seen_edges: + result.edges.append( + KnowledgeGraphEdge( + id=edge_id, + type=edge.get("relation", ""), + source=node_id, + target=edge["target"], + properties={ + k: v + for k, v in edge.items() + if k not in ["target", "relation"] + }, + ) + ) + seen_edges.add(edge_id) + + logger.info( + f"Subgraph query successful | Node count: {len(result.nodes)} | Edge count: {len(result.edges)}" + ) + + except PyMongoError as e: + logger.error(f"MongoDB query failed: {str(e)}") + + return result async def index_done_callback(self) -> None: # Mongo handles persistence automatically diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 9754ffc5..de0273ad 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -17,6 +17,7 @@ from tenacity import ( from ..utils import logger from ..base import BaseGraphStorage +from ..types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge import pipmaster as pm if not pm.is_installed("neo4j"): @@ -468,6 +469,99 @@ class Neo4JStorage(BaseGraphStorage): async def _node2vec_embed(self): print("Implemented but never called.") + async def get_knowledge_graph( + self, node_label: str, max_depth: int = 5 + ) -> KnowledgeGraph: + """ + Get complete connected subgraph for specified node (including the starting node itself) + + Key fixes: + 1. Include the starting node itself + 2. Handle multi-label nodes + 3. Clarify relationship directions + 4. Add depth control + """ + label = node_label.strip('"') + result = KnowledgeGraph() + seen_nodes = set() + seen_edges = set() + + async with self._driver.session(database=self._DATABASE) as session: + try: + main_query = "" + if label == "*": + main_query = """ + MATCH (n) + WITH collect(DISTINCT n) AS nodes + MATCH ()-[r]-() + RETURN nodes, collect(DISTINCT r) AS relationships; + """ + else: + # Critical debug step: first verify if starting node exists + validate_query = f"MATCH (n:`{label}`) RETURN n LIMIT 1" + validate_result = await session.run(validate_query) + if not await validate_result.single(): + logger.warning(f"Starting node {label} does not exist!") + return result + + # Optimized query (including direction handling and self-loops) + main_query = f""" + MATCH (start:`{label}`) + WITH start + CALL apoc.path.subgraphAll(start, {{ + relationshipFilter: '>', + minLevel: 0, + maxLevel: {max_depth}, + bfs: true + }}) + YIELD nodes, relationships + RETURN nodes, relationships + """ + result_set = await session.run(main_query) + record = await result_set.single() + + if record: + # Handle nodes (compatible with multi-label cases) + for node in record["nodes"]: + # Use node ID + label combination as unique identifier + node_id = node.id + if node_id not in seen_nodes: + result.nodes.append( + KnowledgeGraphNode( + id=f"{node_id}", + labels=list(node.labels), + properties=dict(node), + ) + ) + seen_nodes.add(node_id) + + # Handle relationships (including direction information) + for rel in record["relationships"]: + edge_id = rel.id + if edge_id not in seen_edges: + start = rel.start_node + end = rel.end_node + result.edges.append( + KnowledgeGraphEdge( + id=f"{edge_id}", + type=rel.type, + source=f"{start.id}", + target=f"{end.id}", + properties=dict(rel), + ) + ) + seen_edges.add(edge_id) + + logger.info( + f"Subgraph query successful | Node count: {len(result.nodes)} | Edge count: {len(result.edges)}" + ) + + except 
neo4jExceptions.ClientError as e: + logger.error(f"APOC query failed: {str(e)}") + return await self._robust_fallback(label, max_depth) + + return result + async def _robust_fallback( self, label: str, max_depth: int ) -> Dict[str, List[Dict]]: diff --git a/lightrag/kg/networkx_impl.py b/lightrag/kg/networkx_impl.py index 1874719f..3e7a08fd 100644 --- a/lightrag/kg/networkx_impl.py +++ b/lightrag/kg/networkx_impl.py @@ -5,6 +5,7 @@ from typing import Any, final import numpy as np +from lightrag.types import KnowledgeGraph from lightrag.utils import ( logger, ) @@ -166,3 +167,8 @@ class NetworkXStorage(BaseGraphStorage): for source, target in edges: if self._graph.has_edge(source, target): self._graph.remove_edge(source, target) + + async def get_knowledge_graph( + self, node_label: str, max_depth: int = 5 + ) -> KnowledgeGraph: + raise NotImplementedError diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index 35983ad3..d65688da 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -8,6 +8,7 @@ from typing import Any, Union, final import numpy as np import configparser +from lightrag.types import KnowledgeGraph from ..base import ( BaseGraphStorage, @@ -669,6 +670,11 @@ class OracleGraphStorage(BaseGraphStorage): async def delete_node(self, node_id: str) -> None: raise NotImplementedError + async def get_knowledge_graph( + self, node_label: str, max_depth: int = 5 + ) -> KnowledgeGraph: + raise NotImplementedError + N_T = { NameSpace.KV_STORE_FULL_DOCS: "LIGHTRAG_DOC_FULL", diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index d7ace41a..a0e0f184 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -7,6 +7,7 @@ from typing import Any, Union, final import numpy as np import configparser +from lightrag.types import KnowledgeGraph import sys from tenacity import ( @@ -1084,6 +1085,11 @@ class PGGraphStorage(BaseGraphStorage): ) -> tuple[np.ndarray[Any, Any], list[str]]: raise NotImplementedError + async def get_knowledge_graph( + self, node_label: str, max_depth: int = 5 + ) -> KnowledgeGraph: + raise NotImplementedError + async def drop(self) -> None: """Drop the storage""" drop_sql = SQL_TEMPLATES["drop_vdb_entity"] diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index 2feb782a..7ba2cf66 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -5,6 +5,8 @@ from typing import Any, Union, final import numpy as np +from lightrag.types import KnowledgeGraph + from ..base import BaseGraphStorage, BaseKVStorage, BaseVectorStorage from ..namespace import NameSpace, is_namespace @@ -558,6 +560,11 @@ class TiDBGraphStorage(BaseGraphStorage): async def delete_node(self, node_id: str) -> None: raise NotImplementedError + async def get_knowledge_graph( + self, node_label: str, max_depth: int = 5 + ) -> KnowledgeGraph: + raise NotImplementedError + N_T = { NameSpace.KV_STORE_FULL_DOCS: "LIGHTRAG_DOC_FULL", diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 71784a8b..0ba34ef7 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -47,6 +47,7 @@ from .utils import ( set_logger, encode_string_by_tiktoken, ) +from .types import KnowledgeGraph # TODO: TO REMOVE @Yannick config = configparser.ConfigParser() @@ -457,6 +458,13 @@ class LightRAG: self._storages_status = StoragesStatus.FINALIZED logger.debug("Finalized Storages") + async def get_knowledge_graph( + self, nodel_label: str, max_depth: int + ) -> KnowledgeGraph: + return await 
From 3647bc9b11588b89647e0d32a12c500d45f16a23 Mon Sep 17 00:00:00 2001
From: Yannick Stephan <stephan.yannick@me.com>
Date: Thu, 20 Feb 2025 14:32:24 +0100
Subject: [PATCH 64/65] updated version to 1.1.11

---
 lightrag/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightrag/__init__.py b/lightrag/__init__.py
index 99f4052f..2a78af9b 100644
--- a/lightrag/__init__.py
+++ b/lightrag/__init__.py
@@ -1,5 +1,5 @@
 from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
 
-__version__ = "1.1.10"
+__version__ = "1.1.11"
 __author__ = "Zirui Guo"
 __url__ = "https://github.com/HKUDS/LightRAG"

From 678e0f9aea4dece46afab81ed4e8c2005d1de9f7 Mon Sep 17 00:00:00 2001
From: Yannick Stephan <stephan.yannick@me.com>
Date: Thu, 20 Feb 2025 15:09:43 +0100
Subject: [PATCH 65/65] Revert "Cleanup of code"

---
 lightrag/api/lightrag_server.py             |  5 ++++
 lightrag/base.py                            |  4 +++
 lightrag/kg/age_impl.py                     |  7 ++++++
 lightrag/kg/gremlin_impl.py                 |  3 +++
 lightrag/kg/mongo_impl.py                   | 18 +++++++++++++
 lightrag/kg/neo4j_impl.py                   | 25 +++++++++++++++++++
 lightrag/kg/networkx_impl.py                |  3 +++
 lightrag/kg/oracle_impl.py                  |  3 +++
 lightrag/kg/postgres_impl.py                | 11 +++++---
 lightrag/kg/tidb_impl.py                    |  3 +++
 lightrag/lightrag.py                        |  4 +++
 .../lightrag_visualizer/graph_visualizer.py |  5 +---
 12 files changed, 84 insertions(+), 7 deletions(-)

diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
index 96315b82..0cf1d01e 100644
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@@ -1682,6 +1682,11 @@ def create_app(args):
             trace_exception(e)
             raise HTTPException(status_code=500, detail=str(e))
 
+    # query all graph labels
+    @app.get("/graph/label/list")
+    async def get_graph_labels():
+        return await rag.get_graph_labels()
+
     # query all graph
     @app.get("/graphs")
     async def get_knowledge_graph(label: str):
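With the route above registered, the labels can be fetched over HTTP; since the /graphs handler declares `label: str` with no body model, FastAPI exposes it as a query parameter. A sketch with the requests library, where the base address is an assumption about your deployment rather than anything this series configures:

import requests

BASE = "http://localhost:9621"  # assumed server address and port

labels = requests.get(f"{BASE}/graph/label/list").json()
graph = requests.get(f"{BASE}/graphs", params={"label": labels[0]}).json()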
diff --git a/lightrag/base.py b/lightrag/base.py
index af060435..5f6a1bf1 100644
--- a/lightrag/base.py
+++ b/lightrag/base.py
@@ -198,6 +198,10 @@ class BaseGraphStorage(StorageNameSpace, ABC):
     ) -> tuple[np.ndarray[Any, Any], list[str]]:
         """Get all labels in the graph."""
 
+    @abstractmethod
+    async def get_all_labels(self) -> list[str]:
+        """Get all node labels in the graph."""
+
     @abstractmethod
     async def get_knowledge_graph(
         self, node_label: str, max_depth: int = 5
diff --git a/lightrag/kg/age_impl.py b/lightrag/kg/age_impl.py
index 077c7321..97b3825d 100644
--- a/lightrag/kg/age_impl.py
+++ b/lightrag/kg/age_impl.py
@@ -60,6 +60,10 @@ class AGEQueryException(Exception):
 @final
 @dataclass
 class AGEStorage(BaseGraphStorage):
+    @staticmethod
+    def load_nx_graph(file_name):
+        print("no preloading of graph with AGE in production")
+
     def __init__(self, namespace, global_config, embedding_func):
         super().__init__(
             namespace=namespace,
@@ -616,6 +620,9 @@ class AGEStorage(BaseGraphStorage):
     ) -> tuple[np.ndarray[Any, Any], list[str]]:
         raise NotImplementedError
 
+    async def get_all_labels(self) -> list[str]:
+        raise NotImplementedError
+
     async def get_knowledge_graph(
         self, node_label: str, max_depth: int = 5
     ) -> KnowledgeGraph:
diff --git a/lightrag/kg/gremlin_impl.py b/lightrag/kg/gremlin_impl.py
index 39077b5f..3a26401d 100644
--- a/lightrag/kg/gremlin_impl.py
+++ b/lightrag/kg/gremlin_impl.py
@@ -403,6 +403,9 @@ class GremlinStorage(BaseGraphStorage):
     ) -> tuple[np.ndarray[Any, Any], list[str]]:
         raise NotImplementedError
 
+    async def get_all_labels(self) -> list[str]:
+        raise NotImplementedError
+
     async def get_knowledge_graph(
         self, node_label: str, max_depth: int = 5
     ) -> KnowledgeGraph:
diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py
index 07b48f8b..0048b384 100644
--- a/lightrag/kg/mongo_impl.py
+++ b/lightrag/kg/mongo_impl.py
@@ -601,6 +601,24 @@ class MongoGraphStorage(BaseGraphStorage):
     # -------------------------------------------------------------------------
     #
 
+    async def get_all_labels(self) -> list[str]:
+        """
+        Get all existing node _ids in the database
+        Returns:
+            [id1, id2, ...]  # Alphabetically sorted id list
+        """
+        # Use MongoDB's aggregation to get all unique labels
+        pipeline = [
+            {"$group": {"_id": "$_id"}},  # Group by _id
+            {"$sort": {"_id": 1}},  # Sort alphabetically
+        ]
+
+        cursor = self.collection.aggregate(pipeline)
+        labels = []
+        async for doc in cursor:
+            labels.append(doc["_id"])
+        return labels
+
     async def get_knowledge_graph(
         self, node_label: str, max_depth: int = 5
     ) -> KnowledgeGraph:
diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py
index de0273ad..0ddc611d 100644
--- a/lightrag/kg/neo4j_impl.py
+++ b/lightrag/kg/neo4j_impl.py
@@ -628,6 +628,31 @@ class Neo4JStorage(BaseGraphStorage):
         await traverse(label, 0)
         return result
 
+    async def get_all_labels(self) -> list[str]:
+        """
+        Get all existing node labels in the database
+        Returns:
+            ["Person", "Company", ...]  # Alphabetically sorted label list
+        """
+        async with self._driver.session(database=self._DATABASE) as session:
+            # Method 1: Direct metadata query (available for Neo4j 4.3+)
+            # query = "CALL db.labels() YIELD label RETURN label"
+
+            # Method 2: Query compatible with older versions
+            query = """
+                MATCH (n)
+                WITH DISTINCT labels(n) AS node_labels
+                UNWIND node_labels AS label
+                RETURN DISTINCT label
+                ORDER BY label
+            """
+
+            result = await session.run(query)
+            labels = []
+            async for record in result:
+                labels.append(record["label"])
+            return labels
+
     async def delete_node(self, node_id: str) -> None:
         raise NotImplementedError
 
diff --git a/lightrag/kg/networkx_impl.py b/lightrag/kg/networkx_impl.py
index 3e7a08fd..9850b8c4 100644
--- a/lightrag/kg/networkx_impl.py
+++ b/lightrag/kg/networkx_impl.py
@@ -168,6 +168,9 @@ class NetworkXStorage(BaseGraphStorage):
             if self._graph.has_edge(source, target):
                 self._graph.remove_edge(source, target)
 
+    async def get_all_labels(self) -> list[str]:
+        raise NotImplementedError
+
     async def get_knowledge_graph(
         self, node_label: str, max_depth: int = 5
     ) -> KnowledgeGraph:
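The NetworkX backend only stubs the new hook. One possible implementation, not part of this series, under the assumption that node ids serve as labels (as the Mongo backend above treats its _id values):

# Hypothetical NetworkXStorage method, not included in the patch:
async def get_all_labels(self) -> list[str]:
    # self._graph is the backend's underlying networkx graph; sort the
    # node ids for a stable, alphabetical result like the other backends.
    return sorted(str(node) for node in self._graph.nodes())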
diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py
index d65688da..af2ededb 100644
--- a/lightrag/kg/oracle_impl.py
+++ b/lightrag/kg/oracle_impl.py
@@ -670,6 +670,9 @@ class OracleGraphStorage(BaseGraphStorage):
     async def delete_node(self, node_id: str) -> None:
         raise NotImplementedError
 
+    async def get_all_labels(self) -> list[str]:
+        raise NotImplementedError
+
     async def get_knowledge_graph(
         self, node_label: str, max_depth: int = 5
     ) -> KnowledgeGraph:
diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py
index a0e0f184..cbbd98c7 100644
--- a/lightrag/kg/postgres_impl.py
+++ b/lightrag/kg/postgres_impl.py
@@ -178,10 +178,12 @@ class PostgreSQLDB:
                 asyncpg.exceptions.UniqueViolationError,
                 asyncpg.exceptions.DuplicateTableError,
             ) as e:
-                if not upsert:
-                    logger.error(f"PostgreSQL, upsert error: {e}")
+                if upsert:
+                    print("Key value duplicate, but upsert succeeded.")
+                else:
+                    logger.error(f"Upsert error: {e}")
             except Exception as e:
-                logger.error(f"PostgreSQL database, sql:{sql}, data:{data}, error:{e}")
+                logger.error(f"PostgreSQL database,\nsql:{sql},\ndata:{data},\nerror:{e}")
                 raise
@@ -1085,6 +1087,9 @@ class PGGraphStorage(BaseGraphStorage):
     ) -> tuple[np.ndarray[Any, Any], list[str]]:
         raise NotImplementedError
 
+    async def get_all_labels(self) -> list[str]:
+        raise NotImplementedError
+
     async def get_knowledge_graph(
         self, node_label: str, max_depth: int = 5
     ) -> KnowledgeGraph:
diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py
index 7ba2cf66..4adb0141 100644
--- a/lightrag/kg/tidb_impl.py
+++ b/lightrag/kg/tidb_impl.py
@@ -560,6 +560,9 @@ class TiDBGraphStorage(BaseGraphStorage):
     async def delete_node(self, node_id: str) -> None:
         raise NotImplementedError
 
+    async def get_all_labels(self) -> list[str]:
+        raise NotImplementedError
+
     async def get_knowledge_graph(
         self, node_label: str, max_depth: int = 5
     ) -> KnowledgeGraph:
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 0ba34ef7..db61788a 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -458,6 +458,10 @@ class LightRAG:
             self._storages_status = StoragesStatus.FINALIZED
             logger.debug("Finalized Storages")
 
+    async def get_graph_labels(self):
+        text = await self.chunk_entity_relation_graph.get_all_labels()
+        return text
+
     async def get_knowledge_graph(
         self, node_label: str, max_depth: int
     ) -> KnowledgeGraph:
diff --git a/lightrag/tools/lightrag_visualizer/graph_visualizer.py b/lightrag/tools/lightrag_visualizer/graph_visualizer.py
index 9950041f..8a6f0976 100644
--- a/lightrag/tools/lightrag_visualizer/graph_visualizer.py
+++ b/lightrag/tools/lightrag_visualizer/graph_visualizer.py
@@ -1,6 +1,6 @@
 from typing import Optional, Tuple, Dict, List
 import numpy as np
-
+import networkx as nx
 import pipmaster as pm
 
 # Added automatic libraries install using pipmaster
@@ -12,10 +12,7 @@ if not pm.is_installed("pyglm"):
     pm.install("pyglm")
 if not pm.is_installed("python-louvain"):
     pm.install("python-louvain")
-if not pm.is_installed("networkx"):
-    pm.install("networkx")
 
-import networkx as nx
 import moderngl
 from imgui_bundle import imgui, immapp, hello_imgui
 import community
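With the full series applied, label listing and subgraph retrieval compose naturally at the Python level. A short sketch, assuming an already-initialized `rag` instance with a Mongo or Neo4j graph backend (the other backends stub both methods with NotImplementedError):

# Inside an async context; `rag` is an assumed, already-configured LightRAG.
labels = await rag.get_graph_labels()  # e.g. ["Company", "Person"]
if labels:
    kg = await rag.get_knowledge_graph(node_label=labels[0], max_depth=2)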