From ff0bd1512eac3c7bcfc4940542d658ec0f60a7f3 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sat, 8 Feb 2025 22:57:37 +0100 Subject: [PATCH 01/42] cleaned base and added missing status --- lightrag/base.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index e71cac3f..661f1a79 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -56,7 +56,7 @@ class QueryParam: @dataclass class StorageNameSpace: namespace: str - global_config: dict + global_config: dict[str, Any] async def index_done_callback(self): """commit the storage operations after indexing""" @@ -92,22 +92,24 @@ class BaseKVStorage(Generic[T], StorageNameSpace): async def get_by_id(self, id: str) -> Union[T, None]: raise NotImplementedError - async def get_by_ids( - self, ids: list[str], fields: Union[set[str], None] = None - ) -> list[Union[T, None]]: + async def get_by_ids(self, ids: list[str]) -> list[Union[T, None]]: raise NotImplementedError async def filter_keys(self, data: list[str]) -> set[str]: """return un-exist keys""" raise NotImplementedError - async def upsert(self, data: dict[str, T]): + async def upsert(self, data: dict[str, T]) -> None: raise NotImplementedError - async def drop(self): + async def drop(self) -> None: raise NotImplementedError - - + + async def get_by_status_and_ids( + self, status: str, ids: list[str] + ) -> list[dict[str, Any]]: + raise NotImplementedError + @dataclass class BaseGraphStorage(StorageNameSpace): embedding_func: EmbeddingFunc = None From fe3050adcea36c67bac854f20faa762d6b187e8b Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sat, 8 Feb 2025 23:02:40 +0100 Subject: [PATCH 02/42] updated json kv --- lightrag/kg/json_kv_impl.py | 46 +++++++++---------------------------- 1 file changed, 11 insertions(+), 35 deletions(-) diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 2fb753fe..f19463ce 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -51,6 +51,7 @@ Usage: import asyncio import os from dataclasses import dataclass +from typing import Any, Union from lightrag.utils import ( logger, @@ -68,7 +69,7 @@ class JsonKVStorage(BaseKVStorage): def __post_init__(self): working_dir = self.global_config["working_dir"] self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json") - self._data = load_json(self._file_name) or {} + self._data: dict[str, Any] = load_json(self._file_name) or {} self._lock = asyncio.Lock() logger.info(f"Load KV {self.namespace} with {len(self._data)} data") @@ -78,15 +79,13 @@ class JsonKVStorage(BaseKVStorage): async def index_done_callback(self): write_json(self._data, self._file_name) - async def get_by_id(self, id): + async def get_by_id(self, id: str): return self._data.get(id, None) - async def get_by_ids(self, ids, fields=None): - if fields is None: - return [self._data.get(id, None) for id in ids] + async def get_by_ids(self, ids: list[str]): return [ ( - {k: v for k, v in self._data[id].items() if k in fields} + {k: v for k, v in self._data[id].items() } if self._data.get(id, None) else None ) @@ -96,7 +95,7 @@ class JsonKVStorage(BaseKVStorage): async def filter_keys(self, data: list[str]) -> set[str]: return set([s for s in data if s not in self._data]) - async def upsert(self, data: dict[str, dict]): + async def upsert(self, data: dict[str, dict[str, Any]]): left_data = {k: v for k, v in data.items() if k not in self._data} self._data.update(left_data) return left_data @@ -104,31 +103,8 @@ class 
JsonKVStorage(BaseKVStorage): async def drop(self): self._data = {} - async def filter(self, filter_func): - """Filter key-value pairs based on a filter function - - Args: - filter_func: The filter function, which takes a value as an argument and returns a boolean value - - Returns: - Dict: Key-value pairs that meet the condition - """ - result = {} - async with self._lock: - for key, value in self._data.items(): - if filter_func(value): - result[key] = value - return result - - async def delete(self, ids: list[str]): - """Delete data with specified IDs - - Args: - ids: List of IDs to delete - """ - async with self._lock: - for id in ids: - if id in self._data: - del self._data[id] - await self.index_done_callback() - logger.info(f"Successfully deleted {len(ids)} items from {self.namespace}") + async def get_by_status_and_ids( + self, status: str + ) -> Union[list[dict[str, Any]], None]: + result = [v for _, v in self._data.items() if v["status"] == status] + return result if result else None From eb552afcdc04ec7fc429712bf275f65c60fbc92d Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sat, 8 Feb 2025 23:16:04 +0100 Subject: [PATCH 03/42] fixed base and generic --- lightrag/base.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index 661f1a79..23acb7ad 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -4,7 +4,6 @@ from typing import ( TypedDict, Union, Literal, - Generic, TypeVar, Optional, Dict, @@ -83,30 +82,30 @@ class BaseVectorStorage(StorageNameSpace): @dataclass -class BaseKVStorage(Generic[T], StorageNameSpace): +class BaseKVStorage(StorageNameSpace): embedding_func: EmbeddingFunc async def all_keys(self) -> list[str]: raise NotImplementedError - async def get_by_id(self, id: str) -> Union[T, None]: + async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: raise NotImplementedError - async def get_by_ids(self, ids: list[str]) -> list[Union[T, None]]: + async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: raise NotImplementedError async def filter_keys(self, data: list[str]) -> set[str]: """return un-exist keys""" raise NotImplementedError - async def upsert(self, data: dict[str, T]) -> None: + async def upsert(self, data: dict[str, Any]) -> None: raise NotImplementedError async def drop(self) -> None: raise NotImplementedError async def get_by_status_and_ids( - self, status: str, ids: list[str] + self, status: str ) -> list[dict[str, Any]]: raise NotImplementedError From cff415d91f2d0f86c77087a7f21d2f03d0b7b760 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sat, 8 Feb 2025 23:18:12 +0100 Subject: [PATCH 04/42] implemented method and cleaned the mess --- lightrag/kg/json_kv_impl.py | 63 +++---------------------------- lightrag/kg/jsondocstatus_impl.py | 6 +-- lightrag/kg/mongo_impl.py | 30 +++++++-------- lightrag/kg/oracle_impl.py | 31 ++++----------- lightrag/kg/postgres_impl.py | 17 ++++++--- lightrag/kg/redis_impl.py | 28 +++++++------- lightrag/kg/tidb_impl.py | 16 +++++--- 7 files changed, 66 insertions(+), 125 deletions(-) diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index f19463ce..6ee49f7c 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -1,53 +1,3 @@ -""" -JsonDocStatus Storage Module -======================= - -This module provides a storage interface for graphs using NetworkX, a popular Python library for creating, manipulating, and studying the structure, dynamics, and functions of complex 
networks. - -The `NetworkXStorage` class extends the `BaseGraphStorage` class from the LightRAG library, providing methods to load, save, manipulate, and query graphs using NetworkX. - -Author: lightrag team -Created: 2024-01-25 -License: MIT - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -Version: 1.0.0 - -Dependencies: - - NetworkX - - NumPy - - LightRAG - - graspologic - -Features: - - Load and save graphs in various formats (e.g., GEXF, GraphML, JSON) - - Query graph nodes and edges - - Calculate node and edge degrees - - Embed nodes using various algorithms (e.g., Node2Vec) - - Remove nodes and edges from the graph - -Usage: - from lightrag.storage.networkx_storage import NetworkXStorage - -""" - import asyncio import os from dataclasses import dataclass @@ -58,12 +8,10 @@ from lightrag.utils import ( load_json, write_json, ) - from lightrag.base import ( BaseKVStorage, ) - @dataclass class JsonKVStorage(BaseKVStorage): def __post_init__(self): @@ -79,13 +27,13 @@ class JsonKVStorage(BaseKVStorage): async def index_done_callback(self): write_json(self._data, self._file_name) - async def get_by_id(self, id: str): + async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: return self._data.get(id, None) - async def get_by_ids(self, ids: list[str]): + async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: return [ ( - {k: v for k, v in self._data[id].items() } + {k: v for k, v in self._data[id].items()} if self._data.get(id, None) else None ) @@ -95,12 +43,11 @@ class JsonKVStorage(BaseKVStorage): async def filter_keys(self, data: list[str]) -> set[str]: return set([s for s in data if s not in self._data]) - async def upsert(self, data: dict[str, dict[str, Any]]): + async def upsert(self, data: dict[str, dict[str, Any]]) -> None: left_data = {k: v for k, v in data.items() if k not in self._data} self._data.update(left_data) - return left_data - async def drop(self): + async def drop(self) -> None: self._data = {} async def get_by_status_and_ids( diff --git a/lightrag/kg/jsondocstatus_impl.py b/lightrag/kg/jsondocstatus_impl.py index 8f326170..8bd972c6 100644 --- a/lightrag/kg/jsondocstatus_impl.py +++ b/lightrag/kg/jsondocstatus_impl.py @@ -50,7 +50,7 @@ Usage: import os from dataclasses import dataclass -from typing import Union, Dict +from typing import Any, Union, Dict from lightrag.utils import ( logger, @@ -104,7 +104,7 @@ class JsonDocStatusStorage(DocStatusStorage): """Save data to file after indexing""" write_json(self._data, self._file_name) - async def upsert(self, 
data: dict[str, dict]): + async def upsert(self, data: dict[str, Any]) -> None: """Update or insert document status Args: @@ -114,7 +114,7 @@ class JsonDocStatusStorage(DocStatusStorage): await self.index_done_callback() return data - async def get_by_id(self, id: str): + async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: return self._data.get(id) async def get(self, doc_id: str) -> Union[DocProcessingStatus, None]: diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index 7afc4240..d0598ca4 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -12,7 +12,7 @@ if not pm.is_installed("motor"): from pymongo import MongoClient from motor.motor_asyncio import AsyncIOMotorClient -from typing import Union, List, Tuple +from typing import Any, TypeVar, Union, List, Tuple from ..utils import logger from ..base import BaseKVStorage, BaseGraphStorage @@ -32,18 +32,11 @@ class MongoKVStorage(BaseKVStorage): async def all_keys(self) -> list[str]: return [x["_id"] for x in self._data.find({}, {"_id": 1})] - async def get_by_id(self, id): + async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: return self._data.find_one({"_id": id}) - async def get_by_ids(self, ids, fields=None): - if fields is None: - return list(self._data.find({"_id": {"$in": ids}})) - return list( - self._data.find( - {"_id": {"$in": ids}}, - {field: 1 for field in fields}, - ) - ) + async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: + return list(self._data.find({"_id": {"$in": ids}})) async def filter_keys(self, data: list[str]) -> set[str]: existing_ids = [ @@ -51,7 +44,7 @@ class MongoKVStorage(BaseKVStorage): ] return set([s for s in data if s not in existing_ids]) - async def upsert(self, data: dict[str, dict]): + async def upsert(self, data: dict[str, dict[str, Any]]) -> None: if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE): for mode, items in data.items(): for k, v in tqdm_async(items.items(), desc="Upserting"): @@ -66,7 +59,6 @@ class MongoKVStorage(BaseKVStorage): for k, v in tqdm_async(data.items(), desc="Upserting"): self._data.update_one({"_id": k}, {"$set": v}, upsert=True) data[k]["_id"] = k - return data async def get_by_mode_and_id(self, mode: str, id: str) -> Union[dict, None]: if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE): @@ -81,9 +73,15 @@ class MongoKVStorage(BaseKVStorage): else: return None - async def drop(self): - """ """ - pass + async def drop(self) -> None: + """Drop the collection""" + await self._data.drop() + + async def get_by_status_and_ids( + self, status: str + ) -> Union[list[dict[str, Any]], None]: + """Get documents by status and ids""" + return self._data.find({"status": status}) @dataclass diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index a1a05759..9438c323 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -4,7 +4,7 @@ import asyncio # import html # import os from dataclasses import dataclass -from typing import Union +from typing import Any, TypeVar, Union import numpy as np import array import pipmaster as pm @@ -181,7 +181,7 @@ class OracleKVStorage(BaseKVStorage): ################ QUERY METHODS ################ - async def get_by_id(self, id: str) -> Union[dict, None]: + async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: """get doc_full data based on id.""" SQL = SQL_TEMPLATES["get_by_id_" + self.namespace] params = {"workspace": self.db.workspace, "id": id} @@ -211,7 +211,7 @@ class 
OracleKVStorage(BaseKVStorage): else: return None - async def get_by_ids(self, ids: list[str], fields=None) -> Union[list[dict], None]: + async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: """get doc_chunks data based on id""" SQL = SQL_TEMPLATES["get_by_ids_" + self.namespace].format( ids=",".join([f"'{id}'" for id in ids]) @@ -238,15 +238,10 @@ class OracleKVStorage(BaseKVStorage): return None async def get_by_status_and_ids( - self, status: str, ids: list[str] - ) -> Union[list[dict], None]: + self, status: str + ) -> Union[list[dict[str, Any]], None]: """Specifically for llm_response_cache.""" - if ids is not None: - SQL = SQL_TEMPLATES["get_by_status_ids_" + self.namespace].format( - ids=",".join([f"'{id}'" for id in ids]) - ) - else: - SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] + SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] params = {"workspace": self.db.workspace, "status": status} res = await self.db.query(SQL, params, multirows=True) if res: @@ -270,7 +265,7 @@ class OracleKVStorage(BaseKVStorage): return set(keys) ################ INSERT METHODS ################ - async def upsert(self, data: dict[str, dict]): + async def upsert(self, data: dict[str, Any]) -> None: if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS): list_data = [ { @@ -328,14 +323,6 @@ class OracleKVStorage(BaseKVStorage): } await self.db.execute(upsert_sql, _data) - return None - - async def change_status(self, id: str, status: str): - SQL = SQL_TEMPLATES["change_status"].format( - table_name=namespace_to_table_name(self.namespace) - ) - params = {"workspace": self.db.workspace, "id": id, "status": status} - await self.db.execute(SQL, params) async def index_done_callback(self): if is_namespace( @@ -343,8 +330,7 @@ class OracleKVStorage(BaseKVStorage): (NameSpace.KV_STORE_FULL_DOCS, NameSpace.KV_STORE_TEXT_CHUNKS), ): logger.info("full doc and chunk data had been saved into oracle db!") - - + @dataclass class OracleVectorDBStorage(BaseVectorStorage): # should pass db object to self.db @@ -745,7 +731,6 @@ SQL_TEMPLATES = { "get_by_status_full_docs": "select id,status from LIGHTRAG_DOC_FULL t where workspace=:workspace AND status=:status", "get_by_status_text_chunks": "select id,status from LIGHTRAG_DOC_CHUNKS where workspace=:workspace and status=:status", "filter_keys": "select id from {table_name} where workspace=:workspace and id in ({ids})", - "change_status": "update {table_name} set status=:status,updatetime=SYSDATE where workspace=:workspace and id=:id", "merge_doc_full": """MERGE INTO LIGHTRAG_DOC_FULL a USING DUAL ON (a.id = :id and a.workspace = :workspace) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 8884d92e..dccb2d7f 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -30,7 +30,6 @@ from ..base import ( DocStatus, DocProcessingStatus, BaseGraphStorage, - T, ) from ..namespace import NameSpace, is_namespace @@ -184,7 +183,7 @@ class PGKVStorage(BaseKVStorage): ################ QUERY METHODS ################ - async def get_by_id(self, id: str) -> Union[dict, None]: + async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: """Get doc_full data by id.""" sql = SQL_TEMPLATES["get_by_id_" + self.namespace] params = {"workspace": self.db.workspace, "id": id} @@ -214,7 +213,7 @@ class PGKVStorage(BaseKVStorage): return None # Query by id - async def get_by_ids(self, ids: List[str], fields=None) -> Union[List[dict], None]: + async def get_by_ids(self, ids: list[str]) -> 
list[Union[dict[str, Any], None]]: """Get doc_chunks data by id""" sql = SQL_TEMPLATES["get_by_ids_" + self.namespace].format( ids=",".join([f"'{id}'" for id in ids]) @@ -238,6 +237,14 @@ class PGKVStorage(BaseKVStorage): return res else: return None + + async def get_by_status_and_ids( + self, status: str + ) -> Union[list[dict[str, Any]], None]: + """Specifically for llm_response_cache.""" + SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] + params = {"workspace": self.db.workspace, "status": status} + return await self.db.query(SQL, params, multirows=True) async def all_keys(self) -> list[dict]: if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE): @@ -270,7 +277,7 @@ class PGKVStorage(BaseKVStorage): print(params) ################ INSERT METHODS ################ - async def upsert(self, data: Dict[str, dict]): + async def upsert(self, data: dict[str, Any]) -> None: if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS): pass elif is_namespace(self.namespace, NameSpace.KV_STORE_FULL_DOCS): @@ -447,7 +454,7 @@ class PGDocStatusStorage(DocStatusStorage): existed = set([element["id"] for element in result]) return set(data) - existed - async def get_by_id(self, id: str) -> Union[T, None]: + async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: sql = "select * from LIGHTRAG_DOC_STATUS where workspace=$1 and id=$2" params = {"workspace": self.db.workspace, "id": id} result = await self.db.query(sql, params, True) diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index 147ea5f3..15faa843 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -1,4 +1,5 @@ import os +from typing import Any, TypeVar, Union from tqdm.asyncio import tqdm as tqdm_async from dataclasses import dataclass import pipmaster as pm @@ -28,21 +29,11 @@ class RedisKVStorage(BaseKVStorage): data = await self._redis.get(f"{self.namespace}:{id}") return json.loads(data) if data else None - async def get_by_ids(self, ids, fields=None): + async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: pipe = self._redis.pipeline() for id in ids: pipe.get(f"{self.namespace}:{id}") results = await pipe.execute() - - if fields: - # Filter fields if specified - return [ - {field: value.get(field) for field in fields if field in value} - if (value := json.loads(result)) - else None - for result in results - ] - return [json.loads(result) if result else None for result in results] async def filter_keys(self, data: list[str]) -> set[str]: @@ -54,7 +45,7 @@ class RedisKVStorage(BaseKVStorage): existing_ids = {data[i] for i, exists in enumerate(results) if exists} return set(data) - existing_ids - async def upsert(self, data: dict[str, dict]): + async def upsert(self, data: dict[str, Any]) -> None: pipe = self._redis.pipeline() for k, v in tqdm_async(data.items(), desc="Upserting"): pipe.set(f"{self.namespace}:{k}", json.dumps(v)) @@ -62,9 +53,18 @@ class RedisKVStorage(BaseKVStorage): for k in data: data[k]["_id"] = k - return data - async def drop(self): + async def drop(self) -> None: keys = await self._redis.keys(f"{self.namespace}:*") if keys: await self._redis.delete(*keys) + + async def get_by_status_and_ids( + self, status: str, + ) -> Union[list[dict[str, Any]], None]: + pipe = self._redis.pipeline() + for key in await self._redis.keys(f"{self.namespace}:*"): + pipe.hgetall(key) + results = await pipe.execute() + return [data for data in results if data.get("status") == status] or None + diff --git a/lightrag/kg/tidb_impl.py 
b/lightrag/kg/tidb_impl.py index cb819d47..97d5794f 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -1,7 +1,7 @@ import asyncio import os from dataclasses import dataclass -from typing import Union +from typing import Any, TypeVar, Union import numpy as np import pipmaster as pm @@ -108,7 +108,7 @@ class TiDBKVStorage(BaseKVStorage): ################ QUERY METHODS ################ - async def get_by_id(self, id: str) -> Union[dict, None]: + async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: """根据 id 获取 doc_full 数据.""" SQL = SQL_TEMPLATES["get_by_id_" + self.namespace] params = {"id": id} @@ -122,16 +122,14 @@ class TiDBKVStorage(BaseKVStorage): return None # Query by id - async def get_by_ids(self, ids: list[str], fields=None) -> Union[list[dict], None]: + async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: """根据 id 获取 doc_chunks 数据""" SQL = SQL_TEMPLATES["get_by_ids_" + self.namespace].format( ids=",".join([f"'{id}'" for id in ids]) ) - # print("get_by_ids:"+SQL) res = await self.db.query(SQL, multirows=True) if res: data = res # [{"data":i} for i in res] - # print(data) return data else: return None @@ -158,7 +156,7 @@ class TiDBKVStorage(BaseKVStorage): return data ################ INSERT full_doc AND chunks ################ - async def upsert(self, data: dict[str, dict]): + async def upsert(self, data: dict[str, Any]) -> None: left_data = {k: v for k, v in data.items() if k not in self._data} self._data.update(left_data) if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS): @@ -335,6 +333,12 @@ class TiDBVectorDBStorage(BaseVectorStorage): merge_sql = SQL_TEMPLATES["insert_relationship"] await self.db.execute(merge_sql, data) + async def get_by_status_and_ids( + self, status: str + ) -> Union[list[dict], None]: + SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] + params = {"workspace": self.db.workspace, "status": status} + return await self.db.query(SQL, params, multirows=True) @dataclass class TiDBGraphStorage(BaseGraphStorage): From 5a082a0052cc008bb51a30fbc25f72d0aaee683e Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sat, 8 Feb 2025 23:20:37 +0100 Subject: [PATCH 05/42] cleaned code --- lightrag/kg/json_kv_impl.py | 4 +--- lightrag/kg/mongo_impl.py | 6 ++---- lightrag/kg/oracle_impl.py | 8 ++------ lightrag/kg/postgres_impl.py | 4 +--- lightrag/kg/redis_impl.py | 6 ++---- lightrag/kg/tidb_impl.py | 6 ++---- 6 files changed, 10 insertions(+), 24 deletions(-) diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 6ee49f7c..927ffe32 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -50,8 +50,6 @@ class JsonKVStorage(BaseKVStorage): async def drop(self) -> None: self._data = {} - async def get_by_status_and_ids( - self, status: str - ) -> Union[list[dict[str, Any]], None]: + async def get_by_status_and_ids(self, status: str) -> Union[list[dict[str, Any]], None]: result = [v for _, v in self._data.items() if v["status"] == status] return result if result else None diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index d0598ca4..7cfdb994 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -12,7 +12,7 @@ if not pm.is_installed("motor"): from pymongo import MongoClient from motor.motor_asyncio import AsyncIOMotorClient -from typing import Any, TypeVar, Union, List, Tuple +from typing import Any, Union, List, Tuple from ..utils import logger from ..base import BaseKVStorage, BaseGraphStorage @@ -77,9 +77,7 @@ 
class MongoKVStorage(BaseKVStorage): """Drop the collection""" await self._data.drop() - async def get_by_status_and_ids( - self, status: str - ) -> Union[list[dict[str, Any]], None]: + async def get_by_status_and_ids(self, status: str) -> Union[list[dict[str, Any]], None]: """Get documents by status and ids""" return self._data.find({"status": status}) diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index 9438c323..f51a5eb8 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -4,7 +4,7 @@ import asyncio # import html # import os from dataclasses import dataclass -from typing import Any, TypeVar, Union +from typing import Any, Union import numpy as np import array import pipmaster as pm @@ -243,11 +243,7 @@ class OracleKVStorage(BaseKVStorage): """Specifically for llm_response_cache.""" SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] params = {"workspace": self.db.workspace, "status": status} - res = await self.db.query(SQL, params, multirows=True) - if res: - return res - else: - return None + return await self.db.query(SQL, params, multirows=True) async def filter_keys(self, keys: list[str]) -> set[str]: """Return keys that don't exist in storage""" diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index dccb2d7f..c6757765 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -238,9 +238,7 @@ class PGKVStorage(BaseKVStorage): else: return None - async def get_by_status_and_ids( - self, status: str - ) -> Union[list[dict[str, Any]], None]: + async def get_by_status_and_ids(self, status: str) -> Union[list[dict[str, Any]], None]: """Specifically for llm_response_cache.""" SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] params = {"workspace": self.db.workspace, "status": status} diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index 15faa843..b3ff890f 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -1,5 +1,5 @@ import os -from typing import Any, TypeVar, Union +from typing import Any, Union from tqdm.asyncio import tqdm as tqdm_async from dataclasses import dataclass import pipmaster as pm @@ -59,9 +59,7 @@ class RedisKVStorage(BaseKVStorage): if keys: await self._redis.delete(*keys) - async def get_by_status_and_ids( - self, status: str, - ) -> Union[list[dict[str, Any]], None]: + async def get_by_status_and_ids(self, status: str) -> Union[list[dict[str, Any]], None]: pipe = self._redis.pipeline() for key in await self._redis.keys(f"{self.namespace}:*"): pipe.hgetall(key) diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index 97d5794f..81450e87 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -1,7 +1,7 @@ import asyncio import os from dataclasses import dataclass -from typing import Any, TypeVar, Union +from typing import Any, Union import numpy as np import pipmaster as pm @@ -333,9 +333,7 @@ class TiDBVectorDBStorage(BaseVectorStorage): merge_sql = SQL_TEMPLATES["insert_relationship"] await self.db.execute(merge_sql, data) - async def get_by_status_and_ids( - self, status: str - ) -> Union[list[dict], None]: + async def get_by_status_and_ids(self, status: str) -> Union[list[dict[str, Any]], None]: SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] params = {"workspace": self.db.workspace, "status": status} return await self.db.query(SQL, params, multirows=True) From 677013e9d57ab0aa379154c27adff1c72ce8f474 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sat, 8 Feb 2025 23:24:00 +0100 Subject: 
[PATCH 06/42] cleaned code --- lightrag/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index 23acb7ad..2b655549 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -104,9 +104,7 @@ class BaseKVStorage(StorageNameSpace): async def drop(self) -> None: raise NotImplementedError - async def get_by_status_and_ids( - self, status: str - ) -> list[dict[str, Any]]: + async def get_by_status_and_ids(self, status: str) -> Union[list[dict[str, Any]], None]: raise NotImplementedError @dataclass From d2db250ee78e64d3f4e5e6e7fb35ed56d905338f Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sat, 8 Feb 2025 23:25:42 +0100 Subject: [PATCH 07/42] added type to be more clear --- lightrag/lightrag.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 6b925be3..f2b3ba68 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -245,19 +245,19 @@ class LightRAG: #### # add embedding func by walter #### - self.full_docs = self.key_string_value_json_storage_cls( + self.full_docs: BaseKVStorage = self.key_string_value_json_storage_cls( namespace=make_namespace( self.namespace_prefix, NameSpace.KV_STORE_FULL_DOCS ), embedding_func=self.embedding_func, ) - self.text_chunks = self.key_string_value_json_storage_cls( + self.text_chunks: BaseVectorStorage = self.key_string_value_json_storage_cls( namespace=make_namespace( self.namespace_prefix, NameSpace.KV_STORE_TEXT_CHUNKS ), embedding_func=self.embedding_func, ) - self.chunk_entity_relation_graph = self.graph_storage_cls( + self.chunk_entity_relation_graph: BaseGraphStorage = self.graph_storage_cls( namespace=make_namespace( self.namespace_prefix, NameSpace.GRAPH_STORE_CHUNK_ENTITY_RELATION ), From 020fdecc733947d2018378cdb3b973ae40c691e0 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sat, 8 Feb 2025 23:33:11 +0100 Subject: [PATCH 08/42] cleaned typed not useful --- lightrag/operate.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index c8c50f61..d746b789 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -568,7 +568,7 @@ async def kg_query( knowledge_graph_inst: BaseGraphStorage, entities_vdb: BaseVectorStorage, relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], + text_chunks_db: BaseKVStorage, query_param: QueryParam, global_config: dict, hashing_kv: BaseKVStorage = None, @@ -777,7 +777,7 @@ async def mix_kg_vector_query( entities_vdb: BaseVectorStorage, relationships_vdb: BaseVectorStorage, chunks_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], + text_chunks_db: BaseKVStorage, query_param: QueryParam, global_config: dict, hashing_kv: BaseKVStorage = None, @@ -969,7 +969,7 @@ async def _build_query_context( knowledge_graph_inst: BaseGraphStorage, entities_vdb: BaseVectorStorage, relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], + text_chunks_db: BaseKVStorage, query_param: QueryParam, ): # ll_entities_context, ll_relations_context, ll_text_units_context = "", "", "" @@ -1052,7 +1052,7 @@ async def _get_node_data( query, knowledge_graph_inst: BaseGraphStorage, entities_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], + text_chunks_db: BaseKVStorage, query_param: QueryParam, ): # get similar entities @@ -1145,7 +1145,7 @@ async def _get_node_data( async def 
_find_most_related_text_unit_from_entities( node_datas: list[dict], query_param: QueryParam, - text_chunks_db: BaseKVStorage[TextChunkSchema], + text_chunks_db: BaseKVStorage, knowledge_graph_inst: BaseGraphStorage, ): text_units = [ @@ -1268,7 +1268,7 @@ async def _get_edge_data( keywords, knowledge_graph_inst: BaseGraphStorage, relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], + text_chunks_db: BaseKVStorage, query_param: QueryParam, ): results = await relationships_vdb.query(keywords, top_k=query_param.top_k) @@ -1421,7 +1421,7 @@ async def _find_most_related_entities_from_relationships( async def _find_related_text_unit_from_relationships( edge_datas: list[dict], query_param: QueryParam, - text_chunks_db: BaseKVStorage[TextChunkSchema], + text_chunks_db: BaseKVStorage, knowledge_graph_inst: BaseGraphStorage, ): text_units = [ @@ -1496,7 +1496,7 @@ def combine_contexts(entities, relationships, sources): async def naive_query( query, chunks_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], + text_chunks_db: BaseKVStorage, query_param: QueryParam, global_config: dict, hashing_kv: BaseKVStorage = None, @@ -1599,7 +1599,7 @@ async def kg_query_with_keywords( knowledge_graph_inst: BaseGraphStorage, entities_vdb: BaseVectorStorage, relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], + text_chunks_db: BaseKVStorage, query_param: QueryParam, global_config: dict, hashing_kv: BaseKVStorage = None, From 2929d1fc390e2a89c7ca0951817460cca08670e2 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sat, 8 Feb 2025 23:52:27 +0100 Subject: [PATCH 09/42] fixed pipe --- lightrag/lightrag.py | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index f2b3ba68..0ae47d1f 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -586,7 +586,7 @@ class LightRAG: if update_storage: await self._insert_done() - async def apipeline_process_documents(self, string_or_strings): + async def apipeline_process_documents(self, string_or_strings: str | list[str]): """Input list remove duplicates, generate document IDs and initial pendding status, filter out already stored documents, store docs Args: string_or_strings: Single document string or list of document strings @@ -628,20 +628,24 @@ class LightRAG: # 4. Store original document for doc_id, doc in new_docs.items(): - await self.full_docs.upsert({doc_id: {"content": doc["content"]}}) - await self.full_docs.change_status(doc_id, DocStatus.PENDING) + await self.full_docs.upsert( + { + doc_id: { + "content": doc["content"], + "status": DocStatus.PENDING + } + } + ) logger.info(f"Stored {len(new_docs)} new unique documents") async def apipeline_process_chunks(self): """Get pendding documents, split into chunks,insert chunks""" # 1. 
get all pending and failed documents _todo_doc_keys = [] - _failed_doc = await self.full_docs.get_by_status_and_ids( - status=DocStatus.FAILED, ids=None - ) - _pendding_doc = await self.full_docs.get_by_status_and_ids( - status=DocStatus.PENDING, ids=None - ) + + _failed_doc = await self.full_docs.get_by_status_and_ids(status=DocStatus.FAILED) + _pendding_doc = await self.full_docs.get_by_status_and_ids(status=DocStatus.PENDING) + if _failed_doc: _todo_doc_keys.extend([doc["id"] for doc in _failed_doc]) if _pendding_doc: @@ -671,7 +675,7 @@ class LightRAG: compute_mdhash_id(dp["content"], prefix="chunk-"): { **dp, "full_doc_id": doc_id, - "status": DocStatus.PENDING, + "status": DocStatus.PROCESSED, } for dp in chunking_by_token_size( doc["content"], @@ -681,17 +685,15 @@ class LightRAG: ) } chunk_cnt += len(chunks) - await self.text_chunks.upsert(chunks) - await self.text_chunks.change_status(doc_id, DocStatus.PROCESSING) - + try: # Store chunks in vector database await self.chunks_vdb.upsert(chunks) # Update doc status - await self.full_docs.change_status(doc_id, DocStatus.PROCESSED) + await self.text_chunks.upsert({**chunks, "status": DocStatus.PENDING}) except Exception as e: # Mark as failed if any step fails - await self.full_docs.change_status(doc_id, DocStatus.FAILED) + await self.text_chunks.upsert({**chunks, "status": DocStatus.FAILED}) raise e except Exception as e: import traceback @@ -705,12 +707,8 @@ class LightRAG: """Get pendding or failed chunks, extract entities and relationships from each chunk""" # 1. get all pending and failed chunks _todo_chunk_keys = [] - _failed_chunks = await self.text_chunks.get_by_status_and_ids( - status=DocStatus.FAILED, ids=None - ) - _pendding_chunks = await self.text_chunks.get_by_status_and_ids( - status=DocStatus.PENDING, ids=None - ) + _failed_chunks = await self.text_chunks.get_by_status_and_ids(status=DocStatus.FAILED) + _pendding_chunks = await self.text_chunks.get_by_status_and_ids(status=DocStatus.PENDING) if _failed_chunks: _todo_chunk_keys.extend([doc["id"] for doc in _failed_chunks]) if _pendding_chunks: @@ -744,11 +742,11 @@ class LightRAG: if maybe_new_kg is None: logger.info("No entities or relationships extracted!") # Update status to processed - await self.text_chunks.change_status(chunk_id, DocStatus.PROCESSED) + await self.text_chunks.upsert({chunk_id: {"status": DocStatus.PROCESSED}}) except Exception as e: logger.error("Failed to extract entities and relationships") # Mark as failed if any step fails - await self.text_chunks.change_status(chunk_id, DocStatus.FAILED) + await self.text_chunks.upsert({chunk_id: {"status": DocStatus.FAILED}}) raise e with tqdm_async( From 50c7f262621b23c362b89d8e91780b047d9b8aa6 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sat, 8 Feb 2025 23:58:15 +0100 Subject: [PATCH 10/42] cleanup code --- lightrag/base.py | 11 +++++---- lightrag/kg/json_kv_impl.py | 7 ++++-- lightrag/kg/mongo_impl.py | 6 +++-- lightrag/kg/oracle_impl.py | 3 ++- lightrag/kg/postgres_impl.py | 10 ++++---- lightrag/kg/redis_impl.py | 9 ++++---- lightrag/kg/tidb_impl.py | 9 +++++--- lightrag/lightrag.py | 45 ++++++++++++++++++++++-------------- 8 files changed, 63 insertions(+), 37 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index 2b655549..a53c8a83 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -91,7 +91,7 @@ class BaseKVStorage(StorageNameSpace): async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: raise NotImplementedError - async def get_by_ids(self, ids: list[str]) -> 
list[Union[dict[str, Any], None]]: + async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: raise NotImplementedError async def filter_keys(self, data: list[str]) -> set[str]: @@ -103,10 +103,13 @@ class BaseKVStorage(StorageNameSpace): async def drop(self) -> None: raise NotImplementedError - - async def get_by_status_and_ids(self, status: str) -> Union[list[dict[str, Any]], None]: + + async def get_by_status_and_ids( + self, status: str + ) -> Union[list[dict[str, Any]], None]: raise NotImplementedError - + + @dataclass class BaseGraphStorage(StorageNameSpace): embedding_func: EmbeddingFunc = None diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 927ffe32..70a60aa2 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -12,6 +12,7 @@ from lightrag.base import ( BaseKVStorage, ) + @dataclass class JsonKVStorage(BaseKVStorage): def __post_init__(self): @@ -30,7 +31,7 @@ class JsonKVStorage(BaseKVStorage): async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: return self._data.get(id, None) - async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: + async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: return [ ( {k: v for k, v in self._data[id].items()} @@ -50,6 +51,8 @@ class JsonKVStorage(BaseKVStorage): async def drop(self) -> None: self._data = {} - async def get_by_status_and_ids(self, status: str) -> Union[list[dict[str, Any]], None]: + async def get_by_status_and_ids( + self, status: str + ) -> Union[list[dict[str, Any]], None]: result = [v for _, v in self._data.items() if v["status"] == status] return result if result else None diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index 7cfdb994..ce703dfb 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -35,7 +35,7 @@ class MongoKVStorage(BaseKVStorage): async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: return self._data.find_one({"_id": id}) - async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: + async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: return list(self._data.find({"_id": {"$in": ids}})) async def filter_keys(self, data: list[str]) -> set[str]: @@ -77,7 +77,9 @@ class MongoKVStorage(BaseKVStorage): """Drop the collection""" await self._data.drop() - async def get_by_status_and_ids(self, status: str) -> Union[list[dict[str, Any]], None]: + async def get_by_status_and_ids( + self, status: str + ) -> Union[list[dict[str, Any]], None]: """Get documents by status and ids""" return self._data.find({"status": status}) diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index f51a5eb8..3c064eba 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -326,7 +326,8 @@ class OracleKVStorage(BaseKVStorage): (NameSpace.KV_STORE_FULL_DOCS, NameSpace.KV_STORE_TEXT_CHUNKS), ): logger.info("full doc and chunk data had been saved into oracle db!") - + + @dataclass class OracleVectorDBStorage(BaseVectorStorage): # should pass db object to self.db diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index c6757765..ba11fea7 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -213,7 +213,7 @@ class PGKVStorage(BaseKVStorage): return None # Query by id - async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: + async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], 
None]]: """Get doc_chunks data by id""" sql = SQL_TEMPLATES["get_by_ids_" + self.namespace].format( ids=",".join([f"'{id}'" for id in ids]) @@ -237,12 +237,14 @@ class PGKVStorage(BaseKVStorage): return res else: return None - - async def get_by_status_and_ids(self, status: str) -> Union[list[dict[str, Any]], None]: + + async def get_by_status_and_ids( + self, status: str + ) -> Union[list[dict[str, Any]], None]: """Specifically for llm_response_cache.""" SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] params = {"workspace": self.db.workspace, "status": status} - return await self.db.query(SQL, params, multirows=True) + return await self.db.query(SQL, params, multirows=True) async def all_keys(self) -> list[dict]: if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE): diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index b3ff890f..095cc3b6 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -29,7 +29,7 @@ class RedisKVStorage(BaseKVStorage): data = await self._redis.get(f"{self.namespace}:{id}") return json.loads(data) if data else None - async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: + async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: pipe = self._redis.pipeline() for id in ids: pipe.get(f"{self.namespace}:{id}") @@ -58,11 +58,12 @@ class RedisKVStorage(BaseKVStorage): keys = await self._redis.keys(f"{self.namespace}:*") if keys: await self._redis.delete(*keys) - - async def get_by_status_and_ids(self, status: str) -> Union[list[dict[str, Any]], None]: + + async def get_by_status_and_ids( + self, status: str + ) -> Union[list[dict[str, Any]], None]: pipe = self._redis.pipeline() for key in await self._redis.keys(f"{self.namespace}:*"): pipe.hgetall(key) results = await pipe.execute() return [data for data in results if data.get("status") == status] or None - diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index 81450e87..b8e6e985 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -122,7 +122,7 @@ class TiDBKVStorage(BaseKVStorage): return None # Query by id - async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: + async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: """根据 id 获取 doc_chunks 数据""" SQL = SQL_TEMPLATES["get_by_ids_" + self.namespace].format( ids=",".join([f"'{id}'" for id in ids]) @@ -333,10 +333,13 @@ class TiDBVectorDBStorage(BaseVectorStorage): merge_sql = SQL_TEMPLATES["insert_relationship"] await self.db.execute(merge_sql, data) - async def get_by_status_and_ids(self, status: str) -> Union[list[dict[str, Any]], None]: + async def get_by_status_and_ids( + self, status: str + ) -> Union[list[dict[str, Any]], None]: SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] params = {"workspace": self.db.workspace, "status": status} - return await self.db.query(SQL, params, multirows=True) + return await self.db.query(SQL, params, multirows=True) + @dataclass class TiDBGraphStorage(BaseGraphStorage): diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 0ae47d1f..7a87e0e7 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -629,12 +629,7 @@ class LightRAG: # 4. 
Store original document for doc_id, doc in new_docs.items(): await self.full_docs.upsert( - { - doc_id: { - "content": doc["content"], - "status": DocStatus.PENDING - } - } + {doc_id: {"content": doc["content"], "status": DocStatus.PENDING}} ) logger.info(f"Stored {len(new_docs)} new unique documents") @@ -642,10 +637,14 @@ class LightRAG: """Get pendding documents, split into chunks,insert chunks""" # 1. get all pending and failed documents _todo_doc_keys = [] - - _failed_doc = await self.full_docs.get_by_status_and_ids(status=DocStatus.FAILED) - _pendding_doc = await self.full_docs.get_by_status_and_ids(status=DocStatus.PENDING) - + + _failed_doc = await self.full_docs.get_by_status_and_ids( + status=DocStatus.FAILED + ) + _pendding_doc = await self.full_docs.get_by_status_and_ids( + status=DocStatus.PENDING + ) + if _failed_doc: _todo_doc_keys.extend([doc["id"] for doc in _failed_doc]) if _pendding_doc: @@ -685,15 +684,19 @@ class LightRAG: ) } chunk_cnt += len(chunks) - + try: # Store chunks in vector database await self.chunks_vdb.upsert(chunks) # Update doc status - await self.text_chunks.upsert({**chunks, "status": DocStatus.PENDING}) + await self.text_chunks.upsert( + {**chunks, "status": DocStatus.PENDING} + ) except Exception as e: # Mark as failed if any step fails - await self.text_chunks.upsert({**chunks, "status": DocStatus.FAILED}) + await self.text_chunks.upsert( + {**chunks, "status": DocStatus.FAILED} + ) raise e except Exception as e: import traceback @@ -707,8 +710,12 @@ class LightRAG: """Get pendding or failed chunks, extract entities and relationships from each chunk""" # 1. get all pending and failed chunks _todo_chunk_keys = [] - _failed_chunks = await self.text_chunks.get_by_status_and_ids(status=DocStatus.FAILED) - _pendding_chunks = await self.text_chunks.get_by_status_and_ids(status=DocStatus.PENDING) + _failed_chunks = await self.text_chunks.get_by_status_and_ids( + status=DocStatus.FAILED + ) + _pendding_chunks = await self.text_chunks.get_by_status_and_ids( + status=DocStatus.PENDING + ) if _failed_chunks: _todo_chunk_keys.extend([doc["id"] for doc in _failed_chunks]) if _pendding_chunks: @@ -742,11 +749,15 @@ class LightRAG: if maybe_new_kg is None: logger.info("No entities or relationships extracted!") # Update status to processed - await self.text_chunks.upsert({chunk_id: {"status": DocStatus.PROCESSED}}) + await self.text_chunks.upsert( + {chunk_id: {"status": DocStatus.PROCESSED}} + ) except Exception as e: logger.error("Failed to extract entities and relationships") # Mark as failed if any step fails - await self.text_chunks.upsert({chunk_id: {"status": DocStatus.FAILED}}) + await self.text_chunks.upsert( + {chunk_id: {"status": DocStatus.FAILED}} + ) raise e with tqdm_async( From d267f066023a0bce760a3ea20939cfad6d9833cd Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 10:32:59 +0100 Subject: [PATCH 11/42] cleaned base and optional unuseful --- lightrag/base.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index a53c8a83..3fe58a77 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -5,14 +5,9 @@ from typing import ( Union, Literal, TypeVar, - Optional, - Dict, Any, - List, ) -from enum import Enum -import numpy as np from .utils import EmbeddingFunc @@ -88,10 +83,10 @@ class BaseKVStorage(StorageNameSpace): async def all_keys(self) -> list[str]: raise NotImplementedError - async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: + async def 
get_by_id(self, id: str) -> dict[str, Any]: raise NotImplementedError - async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: + async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]: raise NotImplementedError async def filter_keys(self, data: list[str]) -> set[str]: @@ -104,7 +99,7 @@ class BaseKVStorage(StorageNameSpace): async def drop(self) -> None: raise NotImplementedError - async def get_by_status_and_ids( + async def get_by_status( self, status: str ) -> Union[list[dict[str, Any]], None]: raise NotImplementedError From 31fe96d74a9727a89cb51d9e6d9e23cf4acc8a66 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 10:33:15 +0100 Subject: [PATCH 12/42] cleaned optional not used --- lightrag/kg/json_kv_impl.py | 8 ++++---- lightrag/kg/jsondocstatus_impl.py | 7 +++---- lightrag/kg/mongo_impl.py | 6 +++--- lightrag/kg/oracle_impl.py | 18 +++++------------- lightrag/kg/postgres_impl.py | 23 ++++++++--------------- lightrag/kg/redis_impl.py | 4 ++-- lightrag/kg/tidb_impl.py | 25 +++++++------------------ 7 files changed, 32 insertions(+), 59 deletions(-) diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 70a60aa2..59da1b54 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -28,10 +28,10 @@ class JsonKVStorage(BaseKVStorage): async def index_done_callback(self): write_json(self._data, self._file_name) - async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: - return self._data.get(id, None) + async def get_by_id(self, id: str) -> dict[str, Any]: + return self._data.get(id, {}) - async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: + async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]: return [ ( {k: v for k, v in self._data[id].items()} @@ -51,7 +51,7 @@ class JsonKVStorage(BaseKVStorage): async def drop(self) -> None: self._data = {} - async def get_by_status_and_ids( + async def get_by_status( self, status: str ) -> Union[list[dict[str, Any]], None]: result = [v for _, v in self._data.items() if v["status"] == status] diff --git a/lightrag/kg/jsondocstatus_impl.py b/lightrag/kg/jsondocstatus_impl.py index 8bd972c6..603487bc 100644 --- a/lightrag/kg/jsondocstatus_impl.py +++ b/lightrag/kg/jsondocstatus_impl.py @@ -72,7 +72,7 @@ class JsonDocStatusStorage(DocStatusStorage): def __post_init__(self): working_dir = self.global_config["working_dir"] self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json") - self._data = load_json(self._file_name) or {} + self._data: dict[str, Any] = load_json(self._file_name) or {} logger.info(f"Loaded document status storage with {len(self._data)} records") async def filter_keys(self, data: list[str]) -> set[str]: @@ -112,10 +112,9 @@ class JsonDocStatusStorage(DocStatusStorage): """ self._data.update(data) await self.index_done_callback() - return data - async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: - return self._data.get(id) + async def get_by_id(self, id: str) -> dict[str, Any]: + return self._data.get(id, {}) async def get(self, doc_id: str) -> Union[DocProcessingStatus, None]: """Get document status by ID""" diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index ce703dfb..eb896b63 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -32,10 +32,10 @@ class MongoKVStorage(BaseKVStorage): async def all_keys(self) -> list[str]: return [x["_id"] for x in self._data.find({}, {"_id": 1})] - async def get_by_id(self, id: 
str) -> Union[dict[str, Any], None]: + async def get_by_id(self, id: str) -> dict[str, Any]: return self._data.find_one({"_id": id}) - async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: + async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]: return list(self._data.find({"_id": {"$in": ids}})) async def filter_keys(self, data: list[str]) -> set[str]: @@ -77,7 +77,7 @@ class MongoKVStorage(BaseKVStorage): """Drop the collection""" await self._data.drop() - async def get_by_status_and_ids( + async def get_by_status( self, status: str ) -> Union[list[dict[str, Any]], None]: """Get documents by status and ids""" diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index 3c064eba..0e55194d 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -181,7 +181,7 @@ class OracleKVStorage(BaseKVStorage): ################ QUERY METHODS ################ - async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: + async def get_by_id(self, id: str) -> dict[str, Any]: """get doc_full data based on id.""" SQL = SQL_TEMPLATES["get_by_id_" + self.namespace] params = {"workspace": self.db.workspace, "id": id} @@ -191,12 +191,9 @@ class OracleKVStorage(BaseKVStorage): res = {} for row in array_res: res[row["id"]] = row - else: - res = await self.db.query(SQL, params) - if res: return res else: - return None + return await self.db.query(SQL, params) async def get_by_mode_and_id(self, mode: str, id: str) -> Union[dict, None]: """Specifically for llm_response_cache.""" @@ -211,7 +208,7 @@ class OracleKVStorage(BaseKVStorage): else: return None - async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: + async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]: """get doc_chunks data based on id""" SQL = SQL_TEMPLATES["get_by_ids_" + self.namespace].format( ids=",".join([f"'{id}'" for id in ids]) @@ -230,14 +227,9 @@ class OracleKVStorage(BaseKVStorage): for row in res: dict_res[row["mode"]][row["id"]] = row res = [{k: v} for k, v in dict_res.items()] - if res: - data = res # [{"data":i} for i in res] - # print(data) - return data - else: - return None + return res - async def get_by_status_and_ids( + async def get_by_status( self, status: str ) -> Union[list[dict[str, Any]], None]: """Specifically for llm_response_cache.""" diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index ba11fea7..d966fd85 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -183,7 +183,7 @@ class PGKVStorage(BaseKVStorage): ################ QUERY METHODS ################ - async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: + async def get_by_id(self, id: str) -> dict[str, Any]: """Get doc_full data by id.""" sql = SQL_TEMPLATES["get_by_id_" + self.namespace] params = {"workspace": self.db.workspace, "id": id} @@ -192,12 +192,9 @@ class PGKVStorage(BaseKVStorage): res = {} for row in array_res: res[row["id"]] = row - else: - res = await self.db.query(sql, params) - if res: return res else: - return None + return await self.db.query(sql, params) async def get_by_mode_and_id(self, mode: str, id: str) -> Union[dict, None]: """Specifically for llm_response_cache.""" @@ -213,7 +210,7 @@ class PGKVStorage(BaseKVStorage): return None # Query by id - async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: + async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]: """Get doc_chunks data by id""" sql = 
SQL_TEMPLATES["get_by_ids_" + self.namespace].format( ids=",".join([f"'{id}'" for id in ids]) @@ -230,15 +227,11 @@ class PGKVStorage(BaseKVStorage): dict_res[mode] = {} for row in array_res: dict_res[row["mode"]][row["id"]] = row - res = [{k: v} for k, v in dict_res.items()] + return [{k: v} for k, v in dict_res.items()] else: - res = await self.db.query(sql, params, multirows=True) - if res: - return res - else: - return None + return await self.db.query(sql, params, multirows=True) - async def get_by_status_and_ids( + async def get_by_status( self, status: str ) -> Union[list[dict[str, Any]], None]: """Specifically for llm_response_cache.""" @@ -454,12 +447,12 @@ class PGDocStatusStorage(DocStatusStorage): existed = set([element["id"] for element in result]) return set(data) - existed - async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: + async def get_by_id(self, id: str) -> dict[str, Any]: sql = "select * from LIGHTRAG_DOC_STATUS where workspace=$1 and id=$2" params = {"workspace": self.db.workspace, "id": id} result = await self.db.query(sql, params, True) if result is None or result == []: - return None + return {} else: return DocProcessingStatus( content_length=result[0]["content_length"], diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index 095cc3b6..7c5c7030 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -29,7 +29,7 @@ class RedisKVStorage(BaseKVStorage): data = await self._redis.get(f"{self.namespace}:{id}") return json.loads(data) if data else None - async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: + async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]: pipe = self._redis.pipeline() for id in ids: pipe.get(f"{self.namespace}:{id}") @@ -59,7 +59,7 @@ class RedisKVStorage(BaseKVStorage): if keys: await self._redis.delete(*keys) - async def get_by_status_and_ids( + async def get_by_status( self, status: str ) -> Union[list[dict[str, Any]], None]: pipe = self._redis.pipeline() diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index b8e6e985..55dbe303 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -108,31 +108,20 @@ class TiDBKVStorage(BaseKVStorage): ################ QUERY METHODS ################ - async def get_by_id(self, id: str) -> Union[dict[str, Any], None]: - """根据 id 获取 doc_full 数据.""" + async def get_by_id(self, id: str) -> dict[str, Any]: + """Fetch doc_full data by id.""" SQL = SQL_TEMPLATES["get_by_id_" + self.namespace] params = {"id": id} # print("get_by_id:"+SQL) - res = await self.db.query(SQL, params) - if res: - data = res # {"data":res} - # print (data) - return data - else: - return None + return await self.db.query(SQL, params) # Query by id - async def get_by_ids(self, ids: list[str]) -> list[Union[dict[str, Any], None]]: - """根据 id 获取 doc_chunks 数据""" + async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]: + """Fetch doc_chunks data by id""" SQL = SQL_TEMPLATES["get_by_ids_" + self.namespace].format( ids=",".join([f"'{id}'" for id in ids]) ) - res = await self.db.query(SQL, multirows=True) - if res: - data = res # [{"data":i} for i in res] - return data - else: - return None + return await self.db.query(SQL, multirows=True) async def filter_keys(self, keys: list[str]) -> set[str]: """过滤掉重复内容""" @@ -333,7 +322,7 @@ class TiDBVectorDBStorage(BaseVectorStorage): merge_sql = SQL_TEMPLATES["insert_relationship"] await self.db.execute(merge_sql, data) - async def get_by_status_and_ids( + async def 
get_by_status( self, status: str ) -> Union[list[dict[str, Any]], None]: SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] From f4287804cef3a920785503bf469a77594c75be10 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 10:39:48 +0100 Subject: [PATCH 13/42] added typing --- lightrag/operate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index d746b789..ec896cc4 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -2,7 +2,7 @@ import asyncio import json import re from tqdm.asyncio import tqdm as tqdm_async -from typing import Union +from typing import Any, Union from collections import Counter, defaultdict from .utils import ( logger, @@ -42,9 +42,9 @@ def chunking_by_token_size( max_token_size=1024, tiktoken_model="gpt-4o", **kwargs, -): +) -> list[dict[str, Any]]: tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) - results = [] + results: list[dict[str, Any]] = [] if split_by_character: raw_chunks = content.split(split_by_character) new_chunks = [] From af477e8a262f201726c2b51a8b2f42bf848950f5 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 11:00:04 +0100 Subject: [PATCH 14/42] cleaned typing --- lightrag/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index 3fe58a77..a91595b2 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -66,10 +66,10 @@ class BaseVectorStorage(StorageNameSpace): embedding_func: EmbeddingFunc meta_fields: set = field(default_factory=set) - async def query(self, query: str, top_k: int) -> list[dict]: + async def query(self, query: str, top_k: int) -> list[dict[str, Any]]: raise NotImplementedError - async def upsert(self, data: dict[str, dict]): + async def upsert(self, data: dict[str, dict[str, Any]]) -> None: """Use 'content' field from value for embedding, use key as id. 
If embedding_func is None, use 'embedding' field from value """ From fd77099af50fbdd55b9b5480fcab43c7c069a245 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 11:10:46 +0100 Subject: [PATCH 15/42] cleaned insert by using pipe --- lightrag/lightrag.py | 336 ++++++++++++++----------------------------- 1 file changed, 105 insertions(+), 231 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 7a87e0e7..b2bdfaac 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -4,17 +4,12 @@ from tqdm.asyncio import tqdm as tqdm_async from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Type, cast, Dict - +from typing import Any, Type, Union +import traceback from .operate import ( chunking_by_token_size, - extract_entities, - # local_query,global_query,hybrid_query, - kg_query, - naive_query, - mix_kg_vector_query, - extract_keywords_only, - kg_query_with_keywords, + extract_entities + # local_query,global_query,hybrid_query,, ) from .utils import ( @@ -30,8 +25,6 @@ from .base import ( BaseGraphStorage, BaseKVStorage, BaseVectorStorage, - StorageNameSpace, - QueryParam, DocStatus, ) @@ -176,7 +169,7 @@ class LightRAG: enable_llm_cache_for_entity_extract: bool = True # extension - addon_params: dict = field(default_factory=dict) + addon_params: dict[str, Any] = field(default_factory=dict) convert_response_to_json_func: callable = convert_response_to_json # Add new field for document status storage type @@ -251,7 +244,7 @@ class LightRAG: ), embedding_func=self.embedding_func, ) - self.text_chunks: BaseVectorStorage = self.key_string_value_json_storage_cls( + self.text_chunks: BaseKVStorage = self.key_string_value_json_storage_cls( namespace=make_namespace( self.namespace_prefix, NameSpace.KV_STORE_TEXT_CHUNKS ), @@ -281,7 +274,7 @@ class LightRAG: embedding_func=self.embedding_func, meta_fields={"src_id", "tgt_id"}, ) - self.chunks_vdb = self.vector_db_storage_cls( + self.chunks_vdb: BaseVectorStorage = self.vector_db_storage_cls( namespace=make_namespace( self.namespace_prefix, NameSpace.VECTOR_STORE_CHUNKS ), @@ -310,7 +303,7 @@ class LightRAG: # Initialize document status storage self.doc_status_storage_cls = self._get_storage_class(self.doc_status_storage) - self.doc_status = self.doc_status_storage_cls( + self.doc_status: BaseKVStorage = self.doc_status_storage_cls( namespace=make_namespace(self.namespace_prefix, NameSpace.DOC_STATUS), global_config=global_config, embedding_func=None, @@ -359,7 +352,9 @@ class LightRAG: ) async def ainsert( - self, string_or_strings, split_by_character=None, split_by_character_only=False + self, string_or_strings: Union[str, list[str]], + split_by_character: str | None = None, + split_by_character_only: bool = False ): """Insert documents with checkpoint support @@ -370,154 +365,10 @@ class LightRAG: split_by_character_only: if split_by_character_only is True, split the string by character only, when split_by_character is None, this parameter is ignored. """ - if isinstance(string_or_strings, str): - string_or_strings = [string_or_strings] + await self.apipeline_process_documents(string_or_strings) + await self.apipeline_process_chunks(split_by_character, split_by_character_only) + await self.apipeline_process_extract_graph() - # 1. Remove duplicate contents from the list - unique_contents = list(set(doc.strip() for doc in string_or_strings)) - - # 2. 
Generate document IDs and initial status - new_docs = { - compute_mdhash_id(content, prefix="doc-"): { - "content": content, - "content_summary": self._get_content_summary(content), - "content_length": len(content), - "status": DocStatus.PENDING, - "created_at": datetime.now().isoformat(), - "updated_at": datetime.now().isoformat(), - } - for content in unique_contents - } - - # 3. Filter out already processed documents - # _add_doc_keys = await self.doc_status.filter_keys(list(new_docs.keys())) - _add_doc_keys = set() - for doc_id in new_docs.keys(): - current_doc = await self.doc_status.get_by_id(doc_id) - - if current_doc is None: - _add_doc_keys.add(doc_id) - continue # skip to the next doc_id - - status = None - if isinstance(current_doc, dict): - status = current_doc["status"] - else: - status = current_doc.status - - if status == DocStatus.FAILED: - _add_doc_keys.add(doc_id) - - new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} - - if not new_docs: - logger.info("All documents have been processed or are duplicates") - return - - logger.info(f"Processing {len(new_docs)} new unique documents") - - # Process documents in batches - batch_size = self.addon_params.get("insert_batch_size", 10) - for i in range(0, len(new_docs), batch_size): - batch_docs = dict(list(new_docs.items())[i : i + batch_size]) - - for doc_id, doc in tqdm_async( - batch_docs.items(), desc=f"Processing batch {i // batch_size + 1}" - ): - try: - # Update status to processing - doc_status = { - "content_summary": doc["content_summary"], - "content_length": doc["content_length"], - "status": DocStatus.PROCESSING, - "created_at": doc["created_at"], - "updated_at": datetime.now().isoformat(), - } - await self.doc_status.upsert({doc_id: doc_status}) - - # Generate chunks from document - chunks = { - compute_mdhash_id(dp["content"], prefix="chunk-"): { - **dp, - "full_doc_id": doc_id, - } - for dp in self.chunking_func( - doc["content"], - split_by_character=split_by_character, - split_by_character_only=split_by_character_only, - overlap_token_size=self.chunk_overlap_token_size, - max_token_size=self.chunk_token_size, - tiktoken_model=self.tiktoken_model_name, - **self.chunking_func_kwargs, - ) - } - - # Update status with chunks information - doc_status.update( - { - "chunks_count": len(chunks), - "updated_at": datetime.now().isoformat(), - } - ) - await self.doc_status.upsert({doc_id: doc_status}) - - try: - # Store chunks in vector database - await self.chunks_vdb.upsert(chunks) - - # Extract and store entities and relationships - maybe_new_kg = await extract_entities( - chunks, - knowledge_graph_inst=self.chunk_entity_relation_graph, - entity_vdb=self.entities_vdb, - relationships_vdb=self.relationships_vdb, - llm_response_cache=self.llm_response_cache, - global_config=asdict(self), - ) - - if maybe_new_kg is None: - raise Exception( - "Failed to extract entities and relationships" - ) - - self.chunk_entity_relation_graph = maybe_new_kg - - # Store original document and chunks - await self.full_docs.upsert( - {doc_id: {"content": doc["content"]}} - ) - await self.text_chunks.upsert(chunks) - - # Update status to processed - doc_status.update( - { - "status": DocStatus.PROCESSED, - "updated_at": datetime.now().isoformat(), - } - ) - await self.doc_status.upsert({doc_id: doc_status}) - - except Exception as e: - # Mark as failed if any step fails - doc_status.update( - { - "status": DocStatus.FAILED, - "error": str(e), - "updated_at": datetime.now().isoformat(), - } - ) - await 
self.doc_status.upsert({doc_id: doc_status}) - raise e - - except Exception as e: - import traceback - - error_msg = f"Failed to process document {doc_id}: {str(e)}\n{traceback.format_exc()}" - logger.error(error_msg) - continue - else: - # Only update index when processing succeeds - await self._insert_done() def insert_custom_chunks(self, full_text: str, text_chunks: list[str]): loop = always_get_an_event_loop() @@ -597,34 +448,32 @@ class LightRAG: # 1. Remove duplicate contents from the list unique_contents = list(set(doc.strip() for doc in string_or_strings)) - logger.info( - f"Received {len(string_or_strings)} docs, contains {len(unique_contents)} new unique documents" - ) - # 2. Generate document IDs and initial status - new_docs = { + new_docs: dict[str, Any] = { compute_mdhash_id(content, prefix="doc-"): { "content": content, "content_summary": self._get_content_summary(content), "content_length": len(content), "status": DocStatus.PENDING, "created_at": datetime.now().isoformat(), - "updated_at": None, + "updated_at": datetime.now().isoformat(), } for content in unique_contents } # 3. Filter out already processed documents - _not_stored_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys())) - if len(_not_stored_doc_keys) < len(new_docs): - logger.info( - f"Skipping {len(new_docs) - len(_not_stored_doc_keys)} already existing documents" - ) - new_docs = {k: v for k, v in new_docs.items() if k in _not_stored_doc_keys} + _add_doc_keys: set[str] = set() + for doc_id in new_docs.keys(): + current_doc = await self.doc_status.get_by_id(doc_id) + + if not current_doc or current_doc["status"] == DocStatus.FAILED: + _add_doc_keys.add(doc_id) + + new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} if not new_docs: logger.info("All documents have been processed or are duplicates") - return None + return # 4. Store original document for doc_id, doc in new_docs.items(): @@ -633,96 +482,121 @@ class LightRAG: ) logger.info(f"Stored {len(new_docs)} new unique documents") - async def apipeline_process_chunks(self): + async def apipeline_process_chunks( + self, + split_by_character: str | None = None, + split_by_character_only: bool = False + ) -> None: """Get pendding documents, split into chunks,insert chunks""" # 1. 
get all pending and failed documents - _todo_doc_keys = [] + to_process_doc_keys: list[str] = [] - _failed_doc = await self.full_docs.get_by_status_and_ids( + # Process failes + to_process_docs = await self.full_docs.get_by_status( status=DocStatus.FAILED ) - _pendding_doc = await self.full_docs.get_by_status_and_ids( + if to_process_docs: + to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) + + # Process Pending + to_process_docs = await self.full_docs.get_by_status( status=DocStatus.PENDING ) + if to_process_docs: + to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) - if _failed_doc: - _todo_doc_keys.extend([doc["id"] for doc in _failed_doc]) - if _pendding_doc: - _todo_doc_keys.extend([doc["id"] for doc in _pendding_doc]) - if not _todo_doc_keys: + if not to_process_doc_keys: logger.info("All documents have been processed or are duplicates") - return None - else: - logger.info(f"Filtered out {len(_todo_doc_keys)} not processed documents") + return - new_docs = { - doc["id"]: doc for doc in await self.full_docs.get_by_ids(_todo_doc_keys) - } + full_docs_ids = await self.full_docs.get_by_ids(to_process_doc_keys) + new_docs = {} + if full_docs_ids: + new_docs = {doc["id"]: doc for doc in full_docs_ids or []} + if not new_docs: + logger.info("All documents have been processed or are duplicates") + return + # 2. split docs into chunks, insert chunks, update doc status - chunk_cnt = 0 batch_size = self.addon_params.get("insert_batch_size", 10) for i in range(0, len(new_docs), batch_size): batch_docs = dict(list(new_docs.items())[i : i + batch_size]) + for doc_id, doc in tqdm_async( - batch_docs.items(), - desc=f"Level 1 - Spliting doc in batch {i // batch_size + 1}", + batch_docs.items(), desc=f"Processing batch {i // batch_size + 1}" ): + doc_status: dict[str, Any] = { + "content_summary": doc["content_summary"], + "content_length": doc["content_length"], + "status": DocStatus.PROCESSING, + "created_at": doc["created_at"], + "updated_at": datetime.now().isoformat(), + } try: + await self.doc_status.upsert({doc_id: doc_status}) + # Generate chunks from document - chunks = { + chunks: dict[str, Any] = { compute_mdhash_id(dp["content"], prefix="chunk-"): { **dp, "full_doc_id": doc_id, - "status": DocStatus.PROCESSED, } - for dp in chunking_by_token_size( + for dp in self.chunking_func( doc["content"], + split_by_character=split_by_character, + split_by_character_only=split_by_character_only, overlap_token_size=self.chunk_overlap_token_size, max_token_size=self.chunk_token_size, tiktoken_model=self.tiktoken_model_name, + **self.chunking_func_kwargs, ) } - chunk_cnt += len(chunks) - try: - # Store chunks in vector database - await self.chunks_vdb.upsert(chunks) - # Update doc status - await self.text_chunks.upsert( - {**chunks, "status": DocStatus.PENDING} - ) - except Exception as e: - # Mark as failed if any step fails - await self.text_chunks.upsert( - {**chunks, "status": DocStatus.FAILED} - ) - raise e + # Update status with chunks information + doc_status.update( + { + "chunks_count": len(chunks), + "updated_at": datetime.now().isoformat(), + } + ) + await self.doc_status.upsert({doc_id: doc_status}) + await self.chunks_vdb.upsert(chunks) + except Exception as e: - import traceback - - error_msg = f"Failed to process document {doc_id}: {str(e)}\n{traceback.format_exc()}" - logger.error(error_msg) + doc_status.update( + { + "status": DocStatus.FAILED, + "error": str(e), + "updated_at": datetime.now().isoformat(), + } + ) + await self.doc_status.upsert({doc_id: 
doc_status}) + logger.error(f"Failed to process document {doc_id}: {str(e)}\n{traceback.format_exc()}") continue - logger.info(f"Stored {chunk_cnt} chunks from {len(new_docs)} documents") async def apipeline_process_extract_graph(self): """Get pendding or failed chunks, extract entities and relationships from each chunk""" # 1. get all pending and failed chunks - _todo_chunk_keys = [] - _failed_chunks = await self.text_chunks.get_by_status_and_ids( + to_process_doc_keys: list[str] = [] + + # Process failes + to_process_docs = await self.full_docs.get_by_status( status=DocStatus.FAILED ) - _pendding_chunks = await self.text_chunks.get_by_status_and_ids( + if to_process_docs: + to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) + + # Process Pending + to_process_docs = await self.full_docs.get_by_status( status=DocStatus.PENDING ) - if _failed_chunks: - _todo_chunk_keys.extend([doc["id"] for doc in _failed_chunks]) - if _pendding_chunks: - _todo_chunk_keys.extend([doc["id"] for doc in _pendding_chunks]) - if not _todo_chunk_keys: - logger.info("All chunks have been processed or are duplicates") - return None + if to_process_docs: + to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) + + if not to_process_doc_keys: + logger.info("All documents have been processed or are duplicates") + return # Process documents in batches batch_size = self.addon_params.get("insert_batch_size", 10) @@ -731,9 +605,9 @@ class LightRAG: batch_size ) # Control the number of tasks that are processed simultaneously - async def process_chunk(chunk_id): + async def process_chunk(chunk_id: str): async with semaphore: - chunks = { + chunks:dict[str, Any] = { i["id"]: i for i in await self.text_chunks.get_by_ids([chunk_id]) } # Extract and store entities and relationships @@ -761,13 +635,13 @@ class LightRAG: raise e with tqdm_async( - total=len(_todo_chunk_keys), + total=len(to_process_doc_keys), desc="\nLevel 1 - Processing chunks", unit="chunk", position=0, ) as progress: - tasks = [] - for chunk_id in _todo_chunk_keys: + tasks: list[asyncio.Task[None]] = [] + for chunk_id in to_process_doc_keys: task = asyncio.create_task(process_chunk(chunk_id)) tasks.append(task) From 61fd3e6127eaf8f7755dc1ff482b7a7fd9dd34cf Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 11:21:32 +0100 Subject: [PATCH 16/42] cleaned code --- lightrag/lightrag.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index b2bdfaac..69dd85e9 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -19,7 +19,7 @@ from .utils import ( convert_response_to_json, logger, set_logger, - statistic_data, + statistic_data ) from .base import ( BaseGraphStorage, @@ -31,7 +31,6 @@ from .base import ( from .namespace import NameSpace, make_namespace from .prompt import GRAPH_FIELD_SEP - STORAGES = { "NetworkXStorage": ".kg.networkx_impl", "JsonKVStorage": ".kg.json_kv_impl", @@ -560,8 +559,8 @@ class LightRAG: "updated_at": datetime.now().isoformat(), } ) - await self.doc_status.upsert({doc_id: doc_status}) await self.chunks_vdb.upsert(chunks) + await self.doc_status.upsert({doc_id: doc_status}) except Exception as e: doc_status.update( @@ -621,7 +620,7 @@ class LightRAG: global_config=asdict(self), ) if maybe_new_kg is None: - logger.info("No entities or relationships extracted!") + logger.warning("No entities or relationships extracted!") # Update status to processed await self.text_chunks.upsert( {chunk_id: {"status": DocStatus.PROCESSED}} 
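
The three apipeline_* stages introduced by the patches above are what ainsert now delegates to. A minimal driver sketch, assuming an already-configured LightRAG instance named `rag`; the `ingest` wrapper and the instance name are illustrative only, not part of any patch:

import asyncio

async def ingest(rag, docs: list[str]) -> None:
    # Stage 1: de-duplicate inputs, assign doc ids, persist PENDING status entries
    await rag.apipeline_process_documents(docs)
    # Stage 2: split pending/failed docs into chunks and upsert them into chunk storage
    await rag.apipeline_process_chunks(split_by_character=None, split_by_character_only=False)
    # Stage 3: extract entities/relationships for chunks still marked pending or failed
    await rag.apipeline_process_extract_graph()

# asyncio.run(ingest(rag, ["first document ...", "second document ..."]))
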
From 4cce14e65ea124a9a087ad1495dd38b0cd3a03c1 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 11:24:08 +0100 Subject: [PATCH 17/42] cleaned import --- lightrag/base.py | 20 ++++---- lightrag/kg/json_kv_impl.py | 4 +- lightrag/kg/mongo_impl.py | 4 +- lightrag/kg/oracle_impl.py | 4 +- lightrag/kg/postgres_impl.py | 4 +- lightrag/kg/redis_impl.py | 4 +- lightrag/kg/tidb_impl.py | 4 +- lightrag/lightrag.py | 88 ++++++++++++++++++------------------ 8 files changed, 62 insertions(+), 70 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index a91595b2..4b963b43 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -1,6 +1,8 @@ +from enum import Enum import os from dataclasses import dataclass, field from typing import ( + Optional, TypedDict, Union, Literal, @@ -8,6 +10,8 @@ from typing import ( Any, ) +import numpy as np + from .utils import EmbeddingFunc @@ -99,9 +103,7 @@ class BaseKVStorage(StorageNameSpace): async def drop(self) -> None: raise NotImplementedError - async def get_by_status( - self, status: str - ) -> Union[list[dict[str, Any]], None]: + async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: raise NotImplementedError @@ -148,12 +150,12 @@ class BaseGraphStorage(StorageNameSpace): async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: raise NotImplementedError("Node embedding is not used in lightrag.") - async def get_all_labels(self) -> List[str]: + async def get_all_labels(self) -> list[str]: raise NotImplementedError async def get_knowledge_graph( self, node_label: str, max_depth: int = 5 - ) -> Dict[str, List[Dict]]: + ) -> dict[str, list[dict]]: raise NotImplementedError @@ -177,20 +179,20 @@ class DocProcessingStatus: updated_at: str # ISO format timestamp chunks_count: Optional[int] = None # Number of chunks after splitting error: Optional[str] = None # Error message if failed - metadata: Dict[str, Any] = field(default_factory=dict) # Additional metadata + metadata: dict[str, Any] = field(default_factory=dict) # Additional metadata class DocStatusStorage(BaseKVStorage): """Base class for document status storage""" - async def get_status_counts(self) -> Dict[str, int]: + async def get_status_counts(self) -> dict[str, int]: """Get counts of documents in each status""" raise NotImplementedError - async def get_failed_docs(self) -> Dict[str, DocProcessingStatus]: + async def get_failed_docs(self) -> dict[str, DocProcessingStatus]: """Get all failed documents""" raise NotImplementedError - async def get_pending_docs(self) -> Dict[str, DocProcessingStatus]: + async def get_pending_docs(self) -> dict[str, DocProcessingStatus]: """Get all pending documents""" raise NotImplementedError diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 59da1b54..e9225375 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -51,8 +51,6 @@ class JsonKVStorage(BaseKVStorage): async def drop(self) -> None: self._data = {} - async def get_by_status( - self, status: str - ) -> Union[list[dict[str, Any]], None]: + async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: result = [v for _, v in self._data.items() if v["status"] == status] return result if result else None diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index eb896b63..b7b438bd 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -77,9 +77,7 @@ class MongoKVStorage(BaseKVStorage): """Drop the collection""" await self._data.drop() - async def 
get_by_status( - self, status: str - ) -> Union[list[dict[str, Any]], None]: + async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: """Get documents by status and ids""" return self._data.find({"status": status}) diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index 0e55194d..c82db9a6 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -229,9 +229,7 @@ class OracleKVStorage(BaseKVStorage): res = [{k: v} for k, v in dict_res.items()] return res - async def get_by_status( - self, status: str - ) -> Union[list[dict[str, Any]], None]: + async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: """Specifically for llm_response_cache.""" SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] params = {"workspace": self.db.workspace, "status": status} diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index d966fd85..01e3688a 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -231,9 +231,7 @@ class PGKVStorage(BaseKVStorage): else: return await self.db.query(sql, params, multirows=True) - async def get_by_status( - self, status: str - ) -> Union[list[dict[str, Any]], None]: + async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: """Specifically for llm_response_cache.""" SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] params = {"workspace": self.db.workspace, "status": status} diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index 7c5c7030..f9283dda 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -59,9 +59,7 @@ class RedisKVStorage(BaseKVStorage): if keys: await self._redis.delete(*keys) - async def get_by_status( - self, status: str - ) -> Union[list[dict[str, Any]], None]: + async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: pipe = self._redis.pipeline() for key in await self._redis.keys(f"{self.namespace}:*"): pipe.hgetall(key) diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index 55dbe303..1f454639 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -322,9 +322,7 @@ class TiDBVectorDBStorage(BaseVectorStorage): merge_sql = SQL_TEMPLATES["insert_relationship"] await self.db.execute(merge_sql, data) - async def get_by_status( - self, status: str - ) -> Union[list[dict[str, Any]], None]: + async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] params = {"workspace": self.db.workspace, "status": status} return await self.db.query(SQL, params, multirows=True) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 69dd85e9..87018b53 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -4,11 +4,16 @@ from tqdm.asyncio import tqdm as tqdm_async from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Any, Type, Union +from typing import Any, Type, Union, cast import traceback from .operate import ( chunking_by_token_size, - extract_entities + extract_entities, + extract_keywords_only, + kg_query, + kg_query_with_keywords, + mix_kg_vector_query, + naive_query, # local_query,global_query,hybrid_query,, ) @@ -19,18 +24,21 @@ from .utils import ( convert_response_to_json, logger, set_logger, - statistic_data + statistic_data, ) from .base import ( BaseGraphStorage, BaseKVStorage, BaseVectorStorage, DocStatus, + QueryParam, + StorageNameSpace, ) from 
.namespace import NameSpace, make_namespace from .prompt import GRAPH_FIELD_SEP + STORAGES = { "NetworkXStorage": ".kg.networkx_impl", "JsonKVStorage": ".kg.json_kv_impl", @@ -351,9 +359,10 @@ class LightRAG: ) async def ainsert( - self, string_or_strings: Union[str, list[str]], - split_by_character: str | None = None, - split_by_character_only: bool = False + self, + string_or_strings: Union[str, list[str]], + split_by_character: str | None = None, + split_by_character_only: bool = False, ): """Insert documents with checkpoint support @@ -368,7 +377,6 @@ class LightRAG: await self.apipeline_process_chunks(split_by_character, split_by_character_only) await self.apipeline_process_extract_graph() - def insert_custom_chunks(self, full_text: str, text_chunks: list[str]): loop = always_get_an_event_loop() return loop.run_until_complete( @@ -482,31 +490,27 @@ class LightRAG: logger.info(f"Stored {len(new_docs)} new unique documents") async def apipeline_process_chunks( - self, - split_by_character: str | None = None, - split_by_character_only: bool = False - ) -> None: + self, + split_by_character: str | None = None, + split_by_character_only: bool = False, + ) -> None: """Get pendding documents, split into chunks,insert chunks""" # 1. get all pending and failed documents to_process_doc_keys: list[str] = [] # Process failes - to_process_docs = await self.full_docs.get_by_status( - status=DocStatus.FAILED - ) + to_process_docs = await self.full_docs.get_by_status(status=DocStatus.FAILED) if to_process_docs: to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) - + # Process Pending - to_process_docs = await self.full_docs.get_by_status( - status=DocStatus.PENDING - ) + to_process_docs = await self.full_docs.get_by_status(status=DocStatus.PENDING) if to_process_docs: to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) if not to_process_doc_keys: logger.info("All documents have been processed or are duplicates") - return + return full_docs_ids = await self.full_docs.get_by_ids(to_process_doc_keys) new_docs = {} @@ -515,8 +519,8 @@ class LightRAG: if not new_docs: logger.info("All documents have been processed or are duplicates") - return - + return + # 2. 
split docs into chunks, insert chunks, update doc status batch_size = self.addon_params.get("insert_batch_size", 10) for i in range(0, len(new_docs), batch_size): @@ -526,11 +530,11 @@ class LightRAG: batch_docs.items(), desc=f"Processing batch {i // batch_size + 1}" ): doc_status: dict[str, Any] = { - "content_summary": doc["content_summary"], - "content_length": doc["content_length"], - "status": DocStatus.PROCESSING, - "created_at": doc["created_at"], - "updated_at": datetime.now().isoformat(), + "content_summary": doc["content_summary"], + "content_length": doc["content_length"], + "status": DocStatus.PROCESSING, + "created_at": doc["created_at"], + "updated_at": datetime.now().isoformat(), } try: await self.doc_status.upsert({doc_id: doc_status}) @@ -564,14 +568,16 @@ class LightRAG: except Exception as e: doc_status.update( - { - "status": DocStatus.FAILED, - "error": str(e), - "updated_at": datetime.now().isoformat(), - } - ) + { + "status": DocStatus.FAILED, + "error": str(e), + "updated_at": datetime.now().isoformat(), + } + ) await self.doc_status.upsert({doc_id: doc_status}) - logger.error(f"Failed to process document {doc_id}: {str(e)}\n{traceback.format_exc()}") + logger.error( + f"Failed to process document {doc_id}: {str(e)}\n{traceback.format_exc()}" + ) continue async def apipeline_process_extract_graph(self): @@ -580,22 +586,18 @@ class LightRAG: to_process_doc_keys: list[str] = [] # Process failes - to_process_docs = await self.full_docs.get_by_status( - status=DocStatus.FAILED - ) + to_process_docs = await self.full_docs.get_by_status(status=DocStatus.FAILED) if to_process_docs: to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) - + # Process Pending - to_process_docs = await self.full_docs.get_by_status( - status=DocStatus.PENDING - ) + to_process_docs = await self.full_docs.get_by_status(status=DocStatus.PENDING) if to_process_docs: to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) if not to_process_doc_keys: logger.info("All documents have been processed or are duplicates") - return + return # Process documents in batches batch_size = self.addon_params.get("insert_batch_size", 10) @@ -606,7 +608,7 @@ class LightRAG: async def process_chunk(chunk_id: str): async with semaphore: - chunks:dict[str, Any] = { + chunks: dict[str, Any] = { i["id"]: i for i in await self.text_chunks.get_by_ids([chunk_id]) } # Extract and store entities and relationships @@ -1051,7 +1053,7 @@ class LightRAG: return content return content[:max_length] + "..." 
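# Illustrative sketch, not part of this patch: the per-status counts that
# get_processing_status() reports can also be derived from the get_by_status()
# accessor these patches add to the storage layer. `storage` stands in for any
# configured doc-status backend and is an assumption of this sketch.
from lightrag.base import DocStatus

async def count_statuses(storage) -> dict[DocStatus, int]:
    counts: dict[DocStatus, int] = {}
    for status in (DocStatus.PENDING, DocStatus.PROCESSING, DocStatus.PROCESSED, DocStatus.FAILED):
        # get_by_status() may return None when nothing matches, hence the "or []"
        docs = await storage.get_by_status(status=status)
        counts[status] = len(docs or [])
    return counts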
- async def get_processing_status(self) -> Dict[str, int]: + async def get_processing_status(self) -> dict[str, int]: """Get current document processing status counts Returns: From c7c565287ae3df0b38f72486459eccf0bc3b83a8 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 11:29:05 +0100 Subject: [PATCH 18/42] added docs --- lightrag/lightrag.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 87018b53..60527486 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -351,8 +351,20 @@ class LightRAG: storage.db = db_client def insert( - self, string_or_strings, split_by_character=None, split_by_character_only=False + self, + string_or_strings: Union[str, list[str]], + split_by_character: str | None = None, + split_by_character_only: bool = False, ): + """Sync Insert documents with checkpoint support + + Args: + string_or_strings: Single document string or list of document strings + split_by_character: if split_by_character is not None, split the string by character, if chunk longer than + chunk_size, split the sub chunk by token size. + split_by_character_only: if split_by_character_only is True, split the string by character only, when + split_by_character is None, this parameter is ignored. + """ loop = always_get_an_event_loop() return loop.run_until_complete( self.ainsert(string_or_strings, split_by_character, split_by_character_only) @@ -364,7 +376,7 @@ class LightRAG: split_by_character: str | None = None, split_by_character_only: bool = False, ): - """Insert documents with checkpoint support + """Async Insert documents with checkpoint support Args: string_or_strings: Single document string or list of document strings From 1159a69e4ef19731baf4e18ca61cf88aff728071 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 11:30:54 +0100 Subject: [PATCH 19/42] added docs --- lightrag/lightrag.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 60527486..819a33e1 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -457,7 +457,13 @@ class LightRAG: await self._insert_done() async def apipeline_process_documents(self, string_or_strings: str | list[str]): - """Input list remove duplicates, generate document IDs and initial pendding status, filter out already stored documents, store docs + """Pipeline process documents + + 1. Remove duplicate contents from the list + 2. Generate document IDs and initial status + 3. Filter out already stored documents + 4. Store docs + Args: string_or_strings: Single document string or list of document strings """ @@ -506,7 +512,18 @@ class LightRAG: split_by_character: str | None = None, split_by_character_only: bool = False, ) -> None: - """Get pendding documents, split into chunks,insert chunks""" + """Pipeline process chunks + + 1. Get pending documents + 2. Split documents into chunks + 3. Insert chunks + + Args: + split_by_character (str | None): If not None, split the string by character, if chunk longer than + chunk_size, split the sub chunk by token size. + split_by_character_only (bool): If split_by_character_only is True, split the string by character only, + when split_by_character is None, this parameter is ignored. + """ # 1. 
get all pending and failed documents to_process_doc_keys: list[str] = [] From 4acf92dfd97b665d2e498a8289b205d10ecf00ba Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 11:35:31 +0100 Subject: [PATCH 20/42] cleaned code --- lightrag/lightrag.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 819a33e1..f33427cf 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -487,14 +487,13 @@ class LightRAG: } # 3. Filter out already processed documents - _add_doc_keys: set[str] = set() + add_doc_keys: set[str] = set() for doc_id in new_docs.keys(): current_doc = await self.doc_status.get_by_id(doc_id) - if not current_doc or current_doc["status"] == DocStatus.FAILED: - _add_doc_keys.add(doc_id) + add_doc_keys.add(doc_id) - new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} + new_docs = {k: v for k, v in new_docs.items() if k in add_doc_keys} if not new_docs: logger.info("All documents have been processed or are duplicates") @@ -503,7 +502,7 @@ class LightRAG: # 4. Store original document for doc_id, doc in new_docs.items(): await self.full_docs.upsert( - {doc_id: {"content": doc["content"], "status": DocStatus.PENDING}} + {doc_id: doc} ) logger.info(f"Stored {len(new_docs)} new unique documents") @@ -610,7 +609,23 @@ class LightRAG: continue async def apipeline_process_extract_graph(self): - """Get pendding or failed chunks, extract entities and relationships from each chunk""" + """ + Process pending or failed chunks to extract entities and relationships. + + This method retrieves all chunks that are currently marked as pending or have previously failed. + It then extracts entities and relationships from each chunk and updates the status accordingly. + + Steps: + 1. Retrieve all pending and failed chunks. + 2. For each chunk, attempt to extract entities and relationships. + 3. Update the chunk's status to processed if successful, or failed if an error occurs. + + Raises: + Exception: If there is an error during the extraction process. + + Returns: + None + """ # 1. get all pending and failed chunks to_process_doc_keys: list[str] = [] From 572a75b14148ff617106558f031371fc19798e5b Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 11:36:33 +0100 Subject: [PATCH 21/42] cleaned code --- lightrag/lightrag.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index f33427cf..b190054d 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -500,10 +500,7 @@ class LightRAG: return # 4. 
Store original document - for doc_id, doc in new_docs.items(): - await self.full_docs.upsert( - {doc_id: doc} - ) + await self.full_docs.upsert(new_docs) logger.info(f"Stored {len(new_docs)} new unique documents") async def apipeline_process_chunks( From 1f8fc4459122bfc10f64f374fe1d7177e1f395d1 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 11:46:01 +0100 Subject: [PATCH 22/42] cleaned type --- lightrag/lightrag.py | 21 +++++++++++++++------ lightrag/operate.py | 11 +++++------ lightrag/utils.py | 2 +- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index b190054d..c2f45fe7 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -4,7 +4,7 @@ from tqdm.asyncio import tqdm as tqdm_async from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Any, Type, Union, cast +from typing import Any, Callable, Optional, Type, Union, cast import traceback from .operate import ( chunking_by_token_size, @@ -177,13 +177,24 @@ class LightRAG: # extension addon_params: dict[str, Any] = field(default_factory=dict) - convert_response_to_json_func: callable = convert_response_to_json + convert_response_to_json_func: Callable[[str], dict[str, Any]] = convert_response_to_json # Add new field for document status storage type doc_status_storage: str = field(default="JsonDocStatusStorage") # Custom Chunking Function - chunking_func: callable = chunking_by_token_size + chunking_func: Callable[ + [ + str, + Optional[str], + bool, + int, + int, + str, + ], + list[dict[str, Any]], + ] = chunking_by_token_size + chunking_func_kwargs: dict = field(default_factory=dict) def __post_init__(self): @@ -538,9 +549,7 @@ class LightRAG: return full_docs_ids = await self.full_docs.get_by_ids(to_process_doc_keys) - new_docs = {} - if full_docs_ids: - new_docs = {doc["id"]: doc for doc in full_docs_ids or []} + new_docs = {doc["id"]: doc for doc in full_docs_ids or []} if not new_docs: logger.info("All documents have been processed or are duplicates") diff --git a/lightrag/operate.py b/lightrag/operate.py index ec896cc4..7c70d948 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -36,12 +36,11 @@ import time def chunking_by_token_size( content: str, - split_by_character=None, - split_by_character_only=False, - overlap_token_size=128, - max_token_size=1024, - tiktoken_model="gpt-4o", - **kwargs, + split_by_character: Union[str, None]=None, + split_by_character_only: bool =False, + overlap_token_size: int =128, + max_token_size: int =1024, + tiktoken_model: str="gpt-4o" ) -> list[dict[str, Any]]: tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) results: list[dict[str, Any]] = [] diff --git a/lightrag/utils.py b/lightrag/utils.py index ed0b6c06..28d9bfaa 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -98,7 +98,7 @@ def locate_json_string_body_from_string(content: str) -> Union[str, None]: return None -def convert_response_to_json(response: str) -> dict: +def convert_response_to_json(response: str) -> dict[str, Any]: json_str = locate_json_string_body_from_string(response) assert json_str is not None, f"Unable to parse JSON from response: {response}" try: From 914c8ffcd7a3bcc542fe4e83216ae9c2061a26f2 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 11:48:27 +0100 Subject: [PATCH 23/42] cleaned code --- lightrag/lightrag.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git 
a/lightrag/lightrag.py b/lightrag/lightrag.py index c2f45fe7..2fbba5ac 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -194,8 +194,6 @@ class LightRAG: ], list[dict[str, Any]], ] = chunking_by_token_size - - chunking_func_kwargs: dict = field(default_factory=dict) def __post_init__(self): os.makedirs(self.log_dir, exist_ok=True) @@ -581,12 +579,11 @@ class LightRAG: } for dp in self.chunking_func( doc["content"], - split_by_character=split_by_character, - split_by_character_only=split_by_character_only, - overlap_token_size=self.chunk_overlap_token_size, - max_token_size=self.chunk_token_size, - tiktoken_model=self.tiktoken_model_name, - **self.chunking_func_kwargs, + split_by_character, + split_by_character_only, + self.chunk_overlap_token_size, + self.chunk_token_size, + self.tiktoken_model_name, ) } From bf89dc18b7eed06cc7bdef3aff30f4e982c6d6cf Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 13:03:50 +0100 Subject: [PATCH 24/42] fixed the processed --- lightrag/lightrag.py | 113 ++++++++++++++++++++++++++----------------- 1 file changed, 68 insertions(+), 45 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 2fbba5ac..e6a0f50f 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -497,11 +497,8 @@ class LightRAG: # 3. Filter out already processed documents add_doc_keys: set[str] = set() - for doc_id in new_docs.keys(): - current_doc = await self.doc_status.get_by_id(doc_id) - if not current_doc or current_doc["status"] == DocStatus.FAILED: - add_doc_keys.add(doc_id) - + excluded_ids = await self.doc_status.all_keys() + add_doc_keys = new_docs.keys() - excluded_ids new_docs = {k: v for k, v in new_docs.items() if k in add_doc_keys} if not new_docs: @@ -509,7 +506,7 @@ class LightRAG: return # 4. Store original document - await self.full_docs.upsert(new_docs) + await self.doc_status.upsert(new_docs) logger.info(f"Stored {len(new_docs)} new unique documents") async def apipeline_process_chunks( @@ -533,21 +530,22 @@ class LightRAG: to_process_doc_keys: list[str] = [] # Process failes - to_process_docs = await self.full_docs.get_by_status(status=DocStatus.FAILED) + to_process_docs = await self.doc_status.get_by_status(status=DocStatus.FAILED) if to_process_docs: to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) # Process Pending - to_process_docs = await self.full_docs.get_by_status(status=DocStatus.PENDING) + to_process_docs = await self.doc_status.get_by_status(status=DocStatus.PENDING) if to_process_docs: to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) if not to_process_doc_keys: logger.info("All documents have been processed or are duplicates") return - - full_docs_ids = await self.full_docs.get_by_ids(to_process_doc_keys) - new_docs = {doc["id"]: doc for doc in full_docs_ids or []} + + # If included in text_chunks is all processed, return + new_docs_ids = await self.text_chunks.filter_keys(to_process_doc_keys) + new_docs = await self.doc_status.get_by_ids(list(new_docs_ids)) if not new_docs: logger.info("All documents have been processed or are duplicates") @@ -555,12 +553,10 @@ class LightRAG: # 2. 
split docs into chunks, insert chunks, update doc status batch_size = self.addon_params.get("insert_batch_size", 10) - for i in range(0, len(new_docs), batch_size): - batch_docs = dict(list(new_docs.items())[i : i + batch_size]) - - for doc_id, doc in tqdm_async( - batch_docs.items(), desc=f"Processing batch {i // batch_size + 1}" - ): + batch_docs_list = [new_docs[i:i+batch_size] for i in range(0, len(new_docs), batch_size)] + for i, el in enumerate(batch_docs_list): + items = ((k, v) for d in el for k, v in d.items()) + for doc_id, doc in tqdm_async(items, desc=f"Level 1 - Spliting doc in batch {i // len(batch_docs_list) + 1}"): doc_status: dict[str, Any] = { "content_summary": doc["content_summary"], "content_length": doc["content_length"], @@ -570,7 +566,7 @@ class LightRAG: } try: await self.doc_status.upsert({doc_id: doc_status}) - + # Generate chunks from document chunks: dict[str, Any] = { compute_mdhash_id(dp["content"], prefix="chunk-"): { @@ -588,16 +584,21 @@ class LightRAG: } # Update status with chunks information + + await self._process_entity_relation_graph(chunks) + await self.chunks_vdb.upsert(chunks) + await self.text_chunks.upsert(chunks) doc_status.update( { + "status": DocStatus.PROCESSED, "chunks_count": len(chunks), "updated_at": datetime.now().isoformat(), } - ) - await self.chunks_vdb.upsert(chunks) + ) await self.doc_status.upsert({doc_id: doc_status}) except Exception as e: + # Update status with failed information doc_status.update( { "status": DocStatus.FAILED, @@ -611,6 +612,26 @@ class LightRAG: ) continue + + async def _process_entity_relation_graph(self, chunk: dict[str, Any]) -> None: + try: + new_kg = await extract_entities( + chunk, + knowledge_graph_inst=self.chunk_entity_relation_graph, + entity_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + llm_response_cache=self.llm_response_cache, + global_config=asdict(self), + ) + if new_kg is None: + logger.info("No entities or relationships extracted!") + else: + self.chunk_entity_relation_graph = new_kg + + except Exception as e: + logger.error("Failed to extract entities and relationships") + raise e + async def apipeline_process_extract_graph(self): """ Process pending or failed chunks to extract entities and relationships. 
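# Sketch of the chunk mapping the batch loop above builds before calling
# chunks_vdb.upsert() and _process_entity_relation_graph(). It reuses helpers
# these patches already depend on (chunking_by_token_size from lightrag.operate,
# compute_mdhash_id from lightrag.utils); the standalone build_chunks() wrapper
# and its literal defaults are illustrative assumptions, not patch content.
from typing import Any
from lightrag.operate import chunking_by_token_size
from lightrag.utils import compute_mdhash_id

def build_chunks(doc_id: str, content: str) -> dict[str, dict[str, Any]]:
    # Keys are content hashes prefixed with "chunk-"; each value keeps the chunk
    # payload plus a back-reference to the originating document id.
    return {
        compute_mdhash_id(dp["content"], prefix="chunk-"): {**dp, "full_doc_id": doc_id}
        for dp in chunking_by_token_size(
            content,
            None,      # split_by_character
            False,     # split_by_character_only
            128,       # overlap_token_size (chunk_overlap_token_size on LightRAG)
            1024,      # max_token_size (chunk_token_size on LightRAG)
            "gpt-4o",  # tiktoken_model (tiktoken_model_name on LightRAG)
        )
    }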
@@ -633,12 +654,12 @@ class LightRAG: to_process_doc_keys: list[str] = [] # Process failes - to_process_docs = await self.full_docs.get_by_status(status=DocStatus.FAILED) + to_process_docs = await self.doc_status.get_by_status(status=DocStatus.FAILED) if to_process_docs: to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) # Process Pending - to_process_docs = await self.full_docs.get_by_status(status=DocStatus.PENDING) + to_process_docs = await self.doc_status.get_by_status(status=DocStatus.PENDING) if to_process_docs: to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) @@ -658,29 +679,31 @@ class LightRAG: chunks: dict[str, Any] = { i["id"]: i for i in await self.text_chunks.get_by_ids([chunk_id]) } - # Extract and store entities and relationships - try: - maybe_new_kg = await extract_entities( - chunks, - knowledge_graph_inst=self.chunk_entity_relation_graph, - entity_vdb=self.entities_vdb, - relationships_vdb=self.relationships_vdb, - llm_response_cache=self.llm_response_cache, - global_config=asdict(self), - ) - if maybe_new_kg is None: - logger.warning("No entities or relationships extracted!") - # Update status to processed - await self.text_chunks.upsert( - {chunk_id: {"status": DocStatus.PROCESSED}} - ) - except Exception as e: - logger.error("Failed to extract entities and relationships") - # Mark as failed if any step fails - await self.text_chunks.upsert( - {chunk_id: {"status": DocStatus.FAILED}} - ) - raise e + async def _process_chunk(chunk_id: str): + chunks: dict[str, Any] = { + i["id"]: i for i in await self.text_chunks.get_by_ids([chunk_id]) + } + + # Extract and store entities and relationships + try: + maybe_new_kg = await extract_entities( + chunks, + knowledge_graph_inst=self.chunk_entity_relation_graph, + entity_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + llm_response_cache=self.llm_response_cache, + global_config=asdict(self), + ) + if maybe_new_kg is None: + logger.warning("No entities or relationships extracted!") + # Update status to processed + await self.text_chunks.upsert(chunks) + await self.doc_status.upsert({chunk_id: {"status": DocStatus.PROCESSED}}) + except Exception as e: + logger.error("Failed to extract entities and relationships") + # Mark as failed if any step fails + await self.doc_status.upsert({chunk_id: {"status": DocStatus.FAILED}}) + raise e with tqdm_async( total=len(to_process_doc_keys), From 263a3011792c94b4ff3d03c23c364345bd1bb9dc Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 13:16:21 +0100 Subject: [PATCH 25/42] fixed filtering --- lightrag/lightrag.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index e6a0f50f..77797a5b 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -396,7 +396,6 @@ class LightRAG: """ await self.apipeline_process_documents(string_or_strings) await self.apipeline_process_chunks(split_by_character, split_by_character_only) - await self.apipeline_process_extract_graph() def insert_custom_chunks(self, full_text: str, text_chunks: list[str]): loop = always_get_an_event_loop() @@ -544,9 +543,11 @@ class LightRAG: return # If included in text_chunks is all processed, return - new_docs_ids = await self.text_chunks.filter_keys(to_process_doc_keys) - new_docs = await self.doc_status.get_by_ids(list(new_docs_ids)) + new_docs = await self.doc_status.get_by_ids(to_process_doc_keys) + text_chunks_new_docs_ids = await 
self.text_chunks.filter_keys(to_process_doc_keys) + full_docs_new_docs_ids = await self.full_docs.filter_keys(to_process_doc_keys) + if not new_docs: logger.info("All documents have been processed or are duplicates") return @@ -582,12 +583,19 @@ class LightRAG: self.tiktoken_model_name, ) } - - # Update status with chunks information - - await self._process_entity_relation_graph(chunks) await self.chunks_vdb.upsert(chunks) - await self.text_chunks.upsert(chunks) + + # Update status with chunks information + await self._process_entity_relation_graph(chunks) + + if not doc_id in full_docs_new_docs_ids: + await self.full_docs.upsert( + {doc_id: {"content": doc["content"]}} + ) + + if not doc_id in text_chunks_new_docs_ids: + await self.text_chunks.upsert(chunks) + doc_status.update( { "status": DocStatus.PROCESSED, From acbe3e2ff2e8be547835fac9971fe75e07541707 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 13:18:47 +0100 Subject: [PATCH 26/42] cleaned code --- lightrag/lightrag.py | 250 ++++++++++++++++++++++--------------------- lightrag/operate.py | 10 +- 2 files changed, 133 insertions(+), 127 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 77797a5b..f37b4e09 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -24,7 +24,6 @@ from .utils import ( convert_response_to_json, logger, set_logger, - statistic_data, ) from .base import ( BaseGraphStorage, @@ -177,7 +176,9 @@ class LightRAG: # extension addon_params: dict[str, Any] = field(default_factory=dict) - convert_response_to_json_func: Callable[[str], dict[str, Any]] = convert_response_to_json + convert_response_to_json_func: Callable[[str], dict[str, Any]] = ( + convert_response_to_json + ) # Add new field for document status storage type doc_status_storage: str = field(default="JsonDocStatusStorage") @@ -360,7 +361,7 @@ class LightRAG: storage.db = db_client def insert( - self, + self, string_or_strings: Union[str, list[str]], split_by_character: str | None = None, split_by_character_only: bool = False, @@ -373,7 +374,7 @@ class LightRAG: chunk_size, split the sub chunk by token size. split_by_character_only: if split_by_character_only is True, split the string by character only, when split_by_character is None, this parameter is ignored. - """ + """ loop = always_get_an_event_loop() return loop.run_until_complete( self.ainsert(string_or_strings, split_by_character, split_by_character_only) @@ -505,7 +506,7 @@ class LightRAG: return # 4. Store original document - await self.doc_status.upsert(new_docs) + await self.doc_status.upsert(new_docs) logger.info(f"Stored {len(new_docs)} new unique documents") async def apipeline_process_chunks( @@ -541,23 +542,29 @@ class LightRAG: if not to_process_doc_keys: logger.info("All documents have been processed or are duplicates") return - + # If included in text_chunks is all processed, return new_docs = await self.doc_status.get_by_ids(to_process_doc_keys) - - text_chunks_new_docs_ids = await self.text_chunks.filter_keys(to_process_doc_keys) + text_chunks_new_docs_ids = await self.text_chunks.filter_keys( + to_process_doc_keys + ) full_docs_new_docs_ids = await self.full_docs.filter_keys(to_process_doc_keys) - + if not new_docs: logger.info("All documents have been processed or are duplicates") return # 2. 
split docs into chunks, insert chunks, update doc status batch_size = self.addon_params.get("insert_batch_size", 10) - batch_docs_list = [new_docs[i:i+batch_size] for i in range(0, len(new_docs), batch_size)] + batch_docs_list = [ + new_docs[i : i + batch_size] for i in range(0, len(new_docs), batch_size) + ] for i, el in enumerate(batch_docs_list): items = ((k, v) for d in el for k, v in d.items()) - for doc_id, doc in tqdm_async(items, desc=f"Level 1 - Spliting doc in batch {i // len(batch_docs_list) + 1}"): + for doc_id, doc in tqdm_async( + items, + desc=f"Level 1 - Spliting doc in batch {i // len(batch_docs_list) + 1}", + ): doc_status: dict[str, Any] = { "content_summary": doc["content_summary"], "content_length": doc["content_length"], @@ -567,7 +574,7 @@ class LightRAG: } try: await self.doc_status.upsert({doc_id: doc_status}) - + # Generate chunks from document chunks: dict[str, Any] = { compute_mdhash_id(dp["content"], prefix="chunk-"): { @@ -584,26 +591,27 @@ class LightRAG: ) } await self.chunks_vdb.upsert(chunks) - + # Update status with chunks information await self._process_entity_relation_graph(chunks) - - if not doc_id in full_docs_new_docs_ids: + + if doc_id not in full_docs_new_docs_ids: await self.full_docs.upsert( - {doc_id: {"content": doc["content"]}} - ) - - if not doc_id in text_chunks_new_docs_ids: + {doc_id: {"content": doc["content"]}} + ) + + if doc_id not in text_chunks_new_docs_ids: await self.text_chunks.upsert(chunks) - + doc_status.update( { "status": DocStatus.PROCESSED, "chunks_count": len(chunks), "updated_at": datetime.now().isoformat(), } - ) + ) await self.doc_status.upsert({doc_id: doc_status}) + await self._insert_done() except Exception as e: # Update status with failed information @@ -620,122 +628,120 @@ class LightRAG: ) continue - async def _process_entity_relation_graph(self, chunk: dict[str, Any]) -> None: - try: - new_kg = await extract_entities( - chunk, - knowledge_graph_inst=self.chunk_entity_relation_graph, - entity_vdb=self.entities_vdb, - relationships_vdb=self.relationships_vdb, - llm_response_cache=self.llm_response_cache, - global_config=asdict(self), - ) - if new_kg is None: - logger.info("No entities or relationships extracted!") - else: - self.chunk_entity_relation_graph = new_kg - - except Exception as e: - logger.error("Failed to extract entities and relationships") - raise e - - async def apipeline_process_extract_graph(self): - """ - Process pending or failed chunks to extract entities and relationships. + try: + new_kg = await extract_entities( + chunk, + knowledge_graph_inst=self.chunk_entity_relation_graph, + entity_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + llm_response_cache=self.llm_response_cache, + global_config=asdict(self), + ) + if new_kg is None: + logger.info("No entities or relationships extracted!") + else: + self.chunk_entity_relation_graph = new_kg - This method retrieves all chunks that are currently marked as pending or have previously failed. - It then extracts entities and relationships from each chunk and updates the status accordingly. + except Exception as e: + logger.error("Failed to extract entities and relationships") + raise e - Steps: - 1. Retrieve all pending and failed chunks. - 2. For each chunk, attempt to extract entities and relationships. - 3. Update the chunk's status to processed if successful, or failed if an error occurs. + # async def apipeline_process_extract_graph(self): + # """ + # Process pending or failed chunks to extract entities and relationships. 
- Raises: - Exception: If there is an error during the extraction process. + # This method retrieves all chunks that are currently marked as pending or have previously failed. + # It then extracts entities and relationships from each chunk and updates the status accordingly. - Returns: - None - """ - # 1. get all pending and failed chunks - to_process_doc_keys: list[str] = [] + # Steps: + # 1. Retrieve all pending and failed chunks. + # 2. For each chunk, attempt to extract entities and relationships. + # 3. Update the chunk's status to processed if successful, or failed if an error occurs. - # Process failes - to_process_docs = await self.doc_status.get_by_status(status=DocStatus.FAILED) - if to_process_docs: - to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) + # Raises: + # Exception: If there is an error during the extraction process. - # Process Pending - to_process_docs = await self.doc_status.get_by_status(status=DocStatus.PENDING) - if to_process_docs: - to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) + # Returns: + # None + # """ + # # 1. get all pending and failed chunks + # to_process_doc_keys: list[str] = [] - if not to_process_doc_keys: - logger.info("All documents have been processed or are duplicates") - return + # # Process failes + # to_process_docs = await self.doc_status.get_by_status(status=DocStatus.FAILED) + # if to_process_docs: + # to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) - # Process documents in batches - batch_size = self.addon_params.get("insert_batch_size", 10) + # # Process Pending + # to_process_docs = await self.doc_status.get_by_status(status=DocStatus.PENDING) + # if to_process_docs: + # to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) - semaphore = asyncio.Semaphore( - batch_size - ) # Control the number of tasks that are processed simultaneously + # if not to_process_doc_keys: + # logger.info("All documents have been processed or are duplicates") + # return - async def process_chunk(chunk_id: str): - async with semaphore: - chunks: dict[str, Any] = { - i["id"]: i for i in await self.text_chunks.get_by_ids([chunk_id]) - } - async def _process_chunk(chunk_id: str): - chunks: dict[str, Any] = { - i["id"]: i for i in await self.text_chunks.get_by_ids([chunk_id]) - } + # # Process documents in batches + # batch_size = self.addon_params.get("insert_batch_size", 10) - # Extract and store entities and relationships - try: - maybe_new_kg = await extract_entities( - chunks, - knowledge_graph_inst=self.chunk_entity_relation_graph, - entity_vdb=self.entities_vdb, - relationships_vdb=self.relationships_vdb, - llm_response_cache=self.llm_response_cache, - global_config=asdict(self), - ) - if maybe_new_kg is None: - logger.warning("No entities or relationships extracted!") - # Update status to processed - await self.text_chunks.upsert(chunks) - await self.doc_status.upsert({chunk_id: {"status": DocStatus.PROCESSED}}) - except Exception as e: - logger.error("Failed to extract entities and relationships") - # Mark as failed if any step fails - await self.doc_status.upsert({chunk_id: {"status": DocStatus.FAILED}}) - raise e + # semaphore = asyncio.Semaphore( + # batch_size + # ) # Control the number of tasks that are processed simultaneously - with tqdm_async( - total=len(to_process_doc_keys), - desc="\nLevel 1 - Processing chunks", - unit="chunk", - position=0, - ) as progress: - tasks: list[asyncio.Task[None]] = [] - for chunk_id in to_process_doc_keys: - task = 
asyncio.create_task(process_chunk(chunk_id)) - tasks.append(task) + # async def process_chunk(chunk_id: str): + # async with semaphore: + # chunks: dict[str, Any] = { + # i["id"]: i for i in await self.text_chunks.get_by_ids([chunk_id]) + # } + # async def _process_chunk(chunk_id: str): + # chunks: dict[str, Any] = { + # i["id"]: i for i in await self.text_chunks.get_by_ids([chunk_id]) + # } - for future in asyncio.as_completed(tasks): - await future - progress.update(1) - progress.set_postfix( - { - "LLM call": statistic_data["llm_call"], - "LLM cache": statistic_data["llm_cache"], - } - ) + # # Extract and store entities and relationships + # try: + # maybe_new_kg = await extract_entities( + # chunks, + # knowledge_graph_inst=self.chunk_entity_relation_graph, + # entity_vdb=self.entities_vdb, + # relationships_vdb=self.relationships_vdb, + # llm_response_cache=self.llm_response_cache, + # global_config=asdict(self), + # ) + # if maybe_new_kg is None: + # logger.warning("No entities or relationships extracted!") + # # Update status to processed + # await self.text_chunks.upsert(chunks) + # await self.doc_status.upsert({chunk_id: {"status": DocStatus.PROCESSED}}) + # except Exception as e: + # logger.error("Failed to extract entities and relationships") + # # Mark as failed if any step fails + # await self.doc_status.upsert({chunk_id: {"status": DocStatus.FAILED}}) + # raise e - # Ensure all indexes are updated after each document - await self._insert_done() + # with tqdm_async( + # total=len(to_process_doc_keys), + # desc="\nLevel 1 - Processing chunks", + # unit="chunk", + # position=0, + # ) as progress: + # tasks: list[asyncio.Task[None]] = [] + # for chunk_id in to_process_doc_keys: + # task = asyncio.create_task(process_chunk(chunk_id)) + # tasks.append(task) + + # for future in asyncio.as_completed(tasks): + # await future + # progress.update(1) + # progress.set_postfix( + # { + # "LLM call": statistic_data["llm_call"], + # "LLM cache": statistic_data["llm_cache"], + # } + # ) + + # # Ensure all indexes are updated after each document async def _insert_done(self): tasks = [] diff --git a/lightrag/operate.py b/lightrag/operate.py index 7c70d948..811b4194 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -36,11 +36,11 @@ import time def chunking_by_token_size( content: str, - split_by_character: Union[str, None]=None, - split_by_character_only: bool =False, - overlap_token_size: int =128, - max_token_size: int =1024, - tiktoken_model: str="gpt-4o" + split_by_character: Union[str, None] = None, + split_by_character_only: bool = False, + overlap_token_size: int = 128, + max_token_size: int = 1024, + tiktoken_model: str = "gpt-4o", ) -> list[dict[str, Any]]: tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) results: list[dict[str, Any]] = [] From 1c7d14ef763a76662ca5b8471c17feeed60c3428 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 13:54:04 +0100 Subject: [PATCH 27/42] updated the pipe --- lightrag/lightrag.py | 128 +++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 52 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index f37b4e09..f3a3ac9a 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -4,7 +4,7 @@ from tqdm.asyncio import tqdm as tqdm_async from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Any, Callable, Optional, Type, Union, cast +from typing import Any, Callable, Coroutine, Optional, 
Type, Union, cast import traceback from .operate import ( chunking_by_token_size, @@ -561,72 +561,96 @@ class LightRAG: ] for i, el in enumerate(batch_docs_list): items = ((k, v) for d in el for k, v in d.items()) + + tasks: dict[str, list[Coroutine[Any, Any, None]]] = {} + + doc_status: dict[str, Any] = { + "status": DocStatus.PROCESSING, + "updated_at": datetime.now().isoformat(), + } + for doc_id, doc in tqdm_async( items, desc=f"Level 1 - Spliting doc in batch {i // len(batch_docs_list) + 1}", ): - doc_status: dict[str, Any] = { - "content_summary": doc["content_summary"], - "content_length": doc["content_length"], - "status": DocStatus.PROCESSING, - "created_at": doc["created_at"], - "updated_at": datetime.now().isoformat(), + doc_status.update( + { + "content_summary": doc["content_summary"], + "content_length": doc["content_length"], + "created_at": doc["created_at"], + } + ) + + await self.doc_status.upsert({doc_id: doc_status}) + + # Generate chunks from document + chunks: dict[str, Any] = { + compute_mdhash_id(dp["content"], prefix="chunk-"): { + **dp, + "full_doc_id": doc_id, + } + for dp in self.chunking_func( + doc["content"], + split_by_character, + split_by_character_only, + self.chunk_overlap_token_size, + self.chunk_token_size, + self.tiktoken_model_name, + ) } try: - await self.doc_status.upsert({doc_id: doc_status}) - - # Generate chunks from document - chunks: dict[str, Any] = { - compute_mdhash_id(dp["content"], prefix="chunk-"): { - **dp, - "full_doc_id": doc_id, - } - for dp in self.chunking_func( - doc["content"], - split_by_character, - split_by_character_only, - self.chunk_overlap_token_size, - self.chunk_token_size, - self.tiktoken_model_name, - ) - } - await self.chunks_vdb.upsert(chunks) - - # Update status with chunks information - await self._process_entity_relation_graph(chunks) - - if doc_id not in full_docs_new_docs_ids: - await self.full_docs.upsert( - {doc_id: {"content": doc["content"]}} - ) - - if doc_id not in text_chunks_new_docs_ids: - await self.text_chunks.upsert(chunks) - - doc_status.update( - { - "status": DocStatus.PROCESSED, - "chunks_count": len(chunks), - "updated_at": datetime.now().isoformat(), - } - ) - await self.doc_status.upsert({doc_id: doc_status}) - await self._insert_done() - + # If fails it's failed on full doc and text chunks upset + if doc["status"] != DocStatus.FAILED: + # Ensure chunk insertion and graph processing happen sequentially + await self._process_entity_relation_graph(chunks) + await self.chunks_vdb.upsert(chunks) except Exception as e: - # Update status with failed information doc_status.update( { - "status": DocStatus.FAILED, + "status": DocStatus.PENDING, "error": str(e), "updated_at": datetime.now().isoformat(), } ) await self.doc_status.upsert({doc_id: doc_status}) - logger.error( - f"Failed to process document {doc_id}: {str(e)}\n{traceback.format_exc()}" + + if doc_id not in full_docs_new_docs_ids: + tasks[doc_id].append( + self.full_docs.upsert({doc_id: {"content": doc["content"]}}) ) - continue + + if doc_id not in text_chunks_new_docs_ids: + tasks[doc_id].append(self.text_chunks.upsert(chunks)) + + for doc_id, task in tasks.items(): + try: + await asyncio.gather(*task) + + # Update document status + doc_status.update( + { + "status": DocStatus.PROCESSED, + "chunks_count": len(chunks), + "updated_at": datetime.now().isoformat(), + } + ) + await self.doc_status.upsert({doc_id: doc_status}) + await self._insert_done() + + except Exception as e: + # Update status with failed information + doc_status.update( + 
{ + "status": DocStatus.FAILED, + "error": str(e), + "updated_at": datetime.now().isoformat(), + } + ) + await self.doc_status.upsert({doc_id: doc_status}) + logger.error( + f"Failed to process document {doc_id}: {str(e)}\n{traceback.format_exc()}" + ) + continue async def _process_entity_relation_graph(self, chunk: dict[str, Any]) -> None: try: From 6a4a77bfe95a7afdbc35e1f950e12e3181ec9dee Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 14:11:11 +0100 Subject: [PATCH 28/42] make more clear --- lightrag/lightrag.py | 75 +++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index f3a3ac9a..f9196cf3 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -509,6 +509,26 @@ class LightRAG: await self.doc_status.upsert(new_docs) logger.info(f"Stored {len(new_docs)} new unique documents") + async def _get_pending_documents(self) -> list[str]: + """Fetch all pending and failed documents.""" + to_process_doc_keys: list[str] = [] + + # Fetch failed documents + failed_docs = await self.doc_status.get_by_status(status=DocStatus.FAILED) + if failed_docs: + to_process_doc_keys.extend([doc["id"] for doc in failed_docs]) + + # Fetch pending documents + pending_docs = await self.doc_status.get_by_status(status=DocStatus.PENDING) + if pending_docs: + to_process_doc_keys.extend([doc["id"] for doc in pending_docs]) + + if not to_process_doc_keys: + logger.info("All documents have been processed or are duplicates") + return [] + + return to_process_doc_keys + async def apipeline_process_chunks( self, split_by_character: str | None = None, @@ -527,52 +547,36 @@ class LightRAG: when split_by_character is None, this parameter is ignored. """ # 1. get all pending and failed documents - to_process_doc_keys: list[str] = [] - - # Process failes - to_process_docs = await self.doc_status.get_by_status(status=DocStatus.FAILED) - if to_process_docs: - to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) - - # Process Pending - to_process_docs = await self.doc_status.get_by_status(status=DocStatus.PENDING) - if to_process_docs: - to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) - - if not to_process_doc_keys: - logger.info("All documents have been processed or are duplicates") - return - - # If included in text_chunks is all processed, return - new_docs = await self.doc_status.get_by_ids(to_process_doc_keys) - text_chunks_new_docs_ids = await self.text_chunks.filter_keys( - to_process_doc_keys - ) - full_docs_new_docs_ids = await self.full_docs.filter_keys(to_process_doc_keys) - - if not new_docs: + pending_doc_ids = await self._get_pending_documents() + + if not pending_doc_ids: logger.info("All documents have been processed or are duplicates") return + + # Get allready processed documents (text chunks and full docs) + text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(pending_doc_ids) + full_docs_processed_doc_ids = await self.full_docs.filter_keys(pending_doc_ids) # 2. 
split docs into chunks, insert chunks, update doc status batch_size = self.addon_params.get("insert_batch_size", 10) batch_docs_list = [ - new_docs[i : i + batch_size] for i in range(0, len(new_docs), batch_size) + pending_doc_ids[i : i + batch_size] for i in range(0, len(pending_doc_ids), batch_size) ] - for i, el in enumerate(batch_docs_list): - items = ((k, v) for d in el for k, v in d.items()) - - tasks: dict[str, list[Coroutine[Any, Any, None]]] = {} - + batch_len = len(batch_docs_list) + 1 + # 3. iterate over batches + tasks: dict[str, list[Coroutine[Any, Any, None]]] = {} + for batch_idx, doc_ids in enumerate(batch_docs_list): + doc_status: dict[str, Any] = { "status": DocStatus.PROCESSING, "updated_at": datetime.now().isoformat(), } - for doc_id, doc in tqdm_async( - items, - desc=f"Level 1 - Spliting doc in batch {i // len(batch_docs_list) + 1}", + for doc_id in tqdm_async( + doc_ids, + desc=f"Level 1 - Batch {batch_idx} / {batch_len}", ): + doc = await self.doc_status.get_by_id(doc_id) doc_status.update( { "content_summary": doc["content_summary"], @@ -580,7 +584,6 @@ class LightRAG: "created_at": doc["created_at"], } ) - await self.doc_status.upsert({doc_id: doc_status}) # Generate chunks from document @@ -614,12 +617,12 @@ class LightRAG: ) await self.doc_status.upsert({doc_id: doc_status}) - if doc_id not in full_docs_new_docs_ids: + if doc_id not in full_docs_processed_doc_ids: tasks[doc_id].append( self.full_docs.upsert({doc_id: {"content": doc["content"]}}) ) - if doc_id not in text_chunks_new_docs_ids: + if doc_id not in text_chunks_processed_doc_ids: tasks[doc_id].append(self.text_chunks.upsert(chunks)) for doc_id, task in tasks.items(): From 5faae814177c8f0518311678efa3b31c4f5f4e0b Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 14:24:35 +0100 Subject: [PATCH 29/42] simplified process --- lightrag/lightrag.py | 178 +++++++++---------------------------------- 1 file changed, 36 insertions(+), 142 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index f9196cf3..985fd329 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -563,29 +563,29 @@ class LightRAG: pending_doc_ids[i : i + batch_size] for i in range(0, len(pending_doc_ids), batch_size) ] batch_len = len(batch_docs_list) + 1 + # 3. iterate over batches tasks: dict[str, list[Coroutine[Any, Any, None]]] = {} for batch_idx, doc_ids in enumerate(batch_docs_list): - - doc_status: dict[str, Any] = { - "status": DocStatus.PROCESSING, - "updated_at": datetime.now().isoformat(), - } + # 4. 
iterate over batch for doc_id in tqdm_async( doc_ids, desc=f"Level 1 - Batch {batch_idx} / {batch_len}", ): - doc = await self.doc_status.get_by_id(doc_id) - doc_status.update( + # Update status in processing + status_doc = await self.doc_status.get_by_id(doc_id) + await self.doc_status.upsert( { - "content_summary": doc["content_summary"], - "content_length": doc["content_length"], - "created_at": doc["created_at"], + doc_id: { + "status": DocStatus.PROCESSING, + "updated_at": datetime.now().isoformat(), + "content_summary": status_doc["content_summary"], + "content_length": status_doc["content_length"], + "created_at": status_doc["created_at"], + } } ) - await self.doc_status.upsert({doc_id: doc_status}) - # Generate chunks from document chunks: dict[str, Any] = { compute_mdhash_id(dp["content"], prefix="chunk-"): { @@ -593,7 +593,7 @@ class LightRAG: "full_doc_id": doc_id, } for dp in self.chunking_func( - doc["content"], + status_doc["content"], split_by_character, split_by_character_only, self.chunk_overlap_token_size, @@ -601,57 +601,47 @@ class LightRAG: self.tiktoken_model_name, ) } - try: - # If fails it's failed on full doc and text chunks upset - if doc["status"] != DocStatus.FAILED: - # Ensure chunk insertion and graph processing happen sequentially - await self._process_entity_relation_graph(chunks) - await self.chunks_vdb.upsert(chunks) - except Exception as e: - doc_status.update( - { - "status": DocStatus.PENDING, - "error": str(e), - "updated_at": datetime.now().isoformat(), - } - ) - await self.doc_status.upsert({doc_id: doc_status}) + + # Ensure chunk insertion and graph processing happen sequentially, not in parallel + await self._process_entity_relation_graph(chunks) + await self.chunks_vdb.upsert(chunks) + # Check if document already processed the doc if doc_id not in full_docs_processed_doc_ids: tasks[doc_id].append( - self.full_docs.upsert({doc_id: {"content": doc["content"]}}) + self.full_docs.upsert({doc_id: {"content": status_doc["content"]}}) ) - + + # check if chunks already processed the doc if doc_id not in text_chunks_processed_doc_ids: tasks[doc_id].append(self.text_chunks.upsert(chunks)) for doc_id, task in tasks.items(): try: await asyncio.gather(*task) - - # Update document status - doc_status.update( + await self.doc_status.upsert( { - "status": DocStatus.PROCESSED, - "chunks_count": len(chunks), - "updated_at": datetime.now().isoformat(), + doc_id: { + "status": DocStatus.PROCESSED, + "chunks_count": len(chunks), + "updated_at": datetime.now().isoformat(), + } } ) - await self.doc_status.upsert({doc_id: doc_status}) await self._insert_done() except Exception as e: - # Update status with failed information - doc_status.update( - { - "status": DocStatus.FAILED, - "error": str(e), - "updated_at": datetime.now().isoformat(), - } - ) - await self.doc_status.upsert({doc_id: doc_status}) logger.error( f"Failed to process document {doc_id}: {str(e)}\n{traceback.format_exc()}" + ) + await self.doc_status.upsert( + { + doc_id: { + "status": DocStatus.FAILED, + "error": str(e), + "updated_at": datetime.now().isoformat(), + } + } ) continue @@ -674,102 +664,6 @@ class LightRAG: logger.error("Failed to extract entities and relationships") raise e - # async def apipeline_process_extract_graph(self): - # """ - # Process pending or failed chunks to extract entities and relationships. - - # This method retrieves all chunks that are currently marked as pending or have previously failed. 
- # It then extracts entities and relationships from each chunk and updates the status accordingly. - - # Steps: - # 1. Retrieve all pending and failed chunks. - # 2. For each chunk, attempt to extract entities and relationships. - # 3. Update the chunk's status to processed if successful, or failed if an error occurs. - - # Raises: - # Exception: If there is an error during the extraction process. - - # Returns: - # None - # """ - # # 1. get all pending and failed chunks - # to_process_doc_keys: list[str] = [] - - # # Process failes - # to_process_docs = await self.doc_status.get_by_status(status=DocStatus.FAILED) - # if to_process_docs: - # to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) - - # # Process Pending - # to_process_docs = await self.doc_status.get_by_status(status=DocStatus.PENDING) - # if to_process_docs: - # to_process_doc_keys.extend([doc["id"] for doc in to_process_docs]) - - # if not to_process_doc_keys: - # logger.info("All documents have been processed or are duplicates") - # return - - # # Process documents in batches - # batch_size = self.addon_params.get("insert_batch_size", 10) - - # semaphore = asyncio.Semaphore( - # batch_size - # ) # Control the number of tasks that are processed simultaneously - - # async def process_chunk(chunk_id: str): - # async with semaphore: - # chunks: dict[str, Any] = { - # i["id"]: i for i in await self.text_chunks.get_by_ids([chunk_id]) - # } - # async def _process_chunk(chunk_id: str): - # chunks: dict[str, Any] = { - # i["id"]: i for i in await self.text_chunks.get_by_ids([chunk_id]) - # } - - # # Extract and store entities and relationships - # try: - # maybe_new_kg = await extract_entities( - # chunks, - # knowledge_graph_inst=self.chunk_entity_relation_graph, - # entity_vdb=self.entities_vdb, - # relationships_vdb=self.relationships_vdb, - # llm_response_cache=self.llm_response_cache, - # global_config=asdict(self), - # ) - # if maybe_new_kg is None: - # logger.warning("No entities or relationships extracted!") - # # Update status to processed - # await self.text_chunks.upsert(chunks) - # await self.doc_status.upsert({chunk_id: {"status": DocStatus.PROCESSED}}) - # except Exception as e: - # logger.error("Failed to extract entities and relationships") - # # Mark as failed if any step fails - # await self.doc_status.upsert({chunk_id: {"status": DocStatus.FAILED}}) - # raise e - - # with tqdm_async( - # total=len(to_process_doc_keys), - # desc="\nLevel 1 - Processing chunks", - # unit="chunk", - # position=0, - # ) as progress: - # tasks: list[asyncio.Task[None]] = [] - # for chunk_id in to_process_doc_keys: - # task = asyncio.create_task(process_chunk(chunk_id)) - # tasks.append(task) - - # for future in asyncio.as_completed(tasks): - # await future - # progress.update(1) - # progress.set_postfix( - # { - # "LLM call": statistic_data["llm_call"], - # "LLM cache": statistic_data["llm_cache"], - # } - # ) - - # # Ensure all indexes are updated after each document - async def _insert_done(self): tasks = [] for storage_inst in [ From f8779cb193986e8f615c496cc1ce1d06b2c45cb8 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 14:32:48 +0100 Subject: [PATCH 30/42] updated naming --- lightrag/lightrag.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 985fd329..5d608208 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -396,7 +396,7 @@ class LightRAG: split_by_character is None, this 
parameter is ignored. """ await self.apipeline_process_documents(string_or_strings) - await self.apipeline_process_chunks(split_by_character, split_by_character_only) + await self.apipeline_process_enqueue_documents(split_by_character, split_by_character_only) def insert_custom_chunks(self, full_text: str, text_chunks: list[str]): loop = always_get_an_event_loop() @@ -465,7 +465,7 @@ class LightRAG: if update_storage: await self._insert_done() - async def apipeline_process_documents(self, string_or_strings: str | list[str]): + async def apipeline_enqueue_documents(self, string_or_strings: str | list[str]): """Pipeline process documents 1. Remove duplicate contents from the list @@ -505,7 +505,7 @@ class LightRAG: logger.info("All documents have been processed or are duplicates") return - # 4. Store original document + # 4. Store status document await self.doc_status.upsert(new_docs) logger.info(f"Stored {len(new_docs)} new unique documents") @@ -529,23 +529,21 @@ class LightRAG: return to_process_doc_keys - async def apipeline_process_chunks( + async def apipeline_process_enqueue_documents( self, split_by_character: str | None = None, split_by_character_only: bool = False, ) -> None: - """Pipeline process chunks - - 1. Get pending documents - 2. Split documents into chunks - 3. Insert chunks - - Args: - split_by_character (str | None): If not None, split the string by character, if chunk longer than - chunk_size, split the sub chunk by token size. - split_by_character_only (bool): If split_by_character_only is True, split the string by character only, - when split_by_character is None, this parameter is ignored. """ + Process pending documents by splitting them into chunks, processing + each chunk for entity and relation extraction, and updating the + document status. + + 1. Get all pending and failed documents + 2. Split document content into chunks + 3. Process each chunk for entity and relation extraction + 4. Update the document status + """ # 1. get all pending and failed documents pending_doc_ids = await self._get_pending_documents() @@ -612,10 +610,11 @@ class LightRAG: self.full_docs.upsert({doc_id: {"content": status_doc["content"]}}) ) - # check if chunks already processed the doc + # Check if chunks already processed the doc if doc_id not in text_chunks_processed_doc_ids: tasks[doc_id].append(self.text_chunks.upsert(chunks)) + # Process document (text chunks and full docs) in parallel for doc_id, task in tasks.items(): try: await asyncio.gather(*task) From 37943a65a325804c55a6a9f89cc44b4dbec2a099 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 14:36:00 +0100 Subject: [PATCH 31/42] cleaned code --- lightrag/lightrag.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 5d608208..b2049d7f 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -5,7 +5,6 @@ from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial from typing import Any, Callable, Coroutine, Optional, Type, Union, cast -import traceback from .operate import ( chunking_by_token_size, extract_entities, @@ -560,16 +559,17 @@ class LightRAG: batch_docs_list = [ pending_doc_ids[i : i + batch_size] for i in range(0, len(pending_doc_ids), batch_size) ] - batch_len = len(batch_docs_list) + 1 - + # 3. 
iterate over batches tasks: dict[str, list[Coroutine[Any, Any, None]]] = {} - for batch_idx, doc_ids in enumerate(batch_docs_list): - + for batch_idx, doc_ids in tqdm_async( + enumerate(batch_docs_list), + desc=f"Process Batches", + ): # 4. iterate over batch for doc_id in tqdm_async( doc_ids, - desc=f"Level 1 - Batch {batch_idx} / {batch_len}", + desc=f"Process Batch {batch_idx}", ): # Update status in processing status_doc = await self.doc_status.get_by_id(doc_id) @@ -631,7 +631,7 @@ class LightRAG: except Exception as e: logger.error( - f"Failed to process document {doc_id}: {str(e)}\n{traceback.format_exc()}" + f"Failed to process document {doc_id}: {str(e)}" ) await self.doc_status.upsert( { From abcdcd5a73b15bb49b2b6c5d302a78901f6c6b37 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 14:36:49 +0100 Subject: [PATCH 32/42] cleaned docs --- examples/lightrag_oracle_demo.py | 5 ++- lightrag/lightrag.py | 53 +++++++++++++++++--------------- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/examples/lightrag_oracle_demo.py b/examples/lightrag_oracle_demo.py index 47020fd6..f5269fae 100644 --- a/examples/lightrag_oracle_demo.py +++ b/examples/lightrag_oracle_demo.py @@ -121,9 +121,8 @@ async def main(): texts = [x for x in all_text.split("\n") if x] # New mode use pipeline - await rag.apipeline_process_documents(texts) - await rag.apipeline_process_chunks() - await rag.apipeline_process_extract_graph() + await rag.apipeline_enqueue_documents(texts) + await rag.apipeline_process_enqueue_documents() # Old method use ainsert # await rag.ainsert(texts) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index b2049d7f..ef4a9db5 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -395,7 +395,9 @@ class LightRAG: split_by_character is None, this parameter is ignored. """ await self.apipeline_process_documents(string_or_strings) - await self.apipeline_process_enqueue_documents(split_by_character, split_by_character_only) + await self.apipeline_process_enqueue_documents( + split_by_character, split_by_character_only + ) def insert_custom_chunks(self, full_text: str, text_chunks: list[str]): loop = always_get_an_event_loop() @@ -511,23 +513,23 @@ class LightRAG: async def _get_pending_documents(self) -> list[str]: """Fetch all pending and failed documents.""" to_process_doc_keys: list[str] = [] - + # Fetch failed documents failed_docs = await self.doc_status.get_by_status(status=DocStatus.FAILED) if failed_docs: to_process_doc_keys.extend([doc["id"] for doc in failed_docs]) - + # Fetch pending documents pending_docs = await self.doc_status.get_by_status(status=DocStatus.PENDING) if pending_docs: to_process_doc_keys.extend([doc["id"] for doc in pending_docs]) - + if not to_process_doc_keys: logger.info("All documents have been processed or are duplicates") return [] - + return to_process_doc_keys - + async def apipeline_process_enqueue_documents( self, split_by_character: str | None = None, @@ -535,36 +537,39 @@ class LightRAG: ) -> None: """ Process pending documents by splitting them into chunks, processing - each chunk for entity and relation extraction, and updating the + each chunk for entity and relation extraction, and updating the document status. - + 1. Get all pending and failed documents 2. Split document content into chunks 3. Process each chunk for entity and relation extraction 4. Update the document status - """ + """ # 1. 
get all pending and failed documents pending_doc_ids = await self._get_pending_documents() - + if not pending_doc_ids: logger.info("All documents have been processed or are duplicates") return - + # Get allready processed documents (text chunks and full docs) - text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(pending_doc_ids) + text_chunks_processed_doc_ids = await self.text_chunks.filter_keys( + pending_doc_ids + ) full_docs_processed_doc_ids = await self.full_docs.filter_keys(pending_doc_ids) # 2. split docs into chunks, insert chunks, update doc status batch_size = self.addon_params.get("insert_batch_size", 10) batch_docs_list = [ - pending_doc_ids[i : i + batch_size] for i in range(0, len(pending_doc_ids), batch_size) + pending_doc_ids[i : i + batch_size] + for i in range(0, len(pending_doc_ids), batch_size) ] - + # 3. iterate over batches tasks: dict[str, list[Coroutine[Any, Any, None]]] = {} for batch_idx, doc_ids in tqdm_async( enumerate(batch_docs_list), - desc=f"Process Batches", + desc="Process Batches", ): # 4. iterate over batch for doc_id in tqdm_async( @@ -580,7 +585,7 @@ class LightRAG: "updated_at": datetime.now().isoformat(), "content_summary": status_doc["content_summary"], "content_length": status_doc["content_length"], - "created_at": status_doc["created_at"], + "created_at": status_doc["created_at"], } } ) @@ -599,22 +604,24 @@ class LightRAG: self.tiktoken_model_name, ) } - - # Ensure chunk insertion and graph processing happen sequentially, not in parallel + + # Ensure chunk insertion and graph processing happen sequentially, not in parallel await self._process_entity_relation_graph(chunks) await self.chunks_vdb.upsert(chunks) # Check if document already processed the doc if doc_id not in full_docs_processed_doc_ids: tasks[doc_id].append( - self.full_docs.upsert({doc_id: {"content": status_doc["content"]}}) + self.full_docs.upsert( + {doc_id: {"content": status_doc["content"]}} + ) ) - + # Check if chunks already processed the doc if doc_id not in text_chunks_processed_doc_ids: tasks[doc_id].append(self.text_chunks.upsert(chunks)) - # Process document (text chunks and full docs) in parallel + # Process document (text chunks and full docs) in parallel for doc_id, task in tasks.items(): try: await asyncio.gather(*task) @@ -630,9 +637,7 @@ class LightRAG: await self._insert_done() except Exception as e: - logger.error( - f"Failed to process document {doc_id}: {str(e)}" - ) + logger.error(f"Failed to process document {doc_id}: {str(e)}") await self.doc_status.upsert( { doc_id: { From 58d776561d6382ef88b940c7f425f52cd397c51c Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 14:39:32 +0100 Subject: [PATCH 33/42] cleaned docs --- lightrag/lightrag.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index ef4a9db5..aaae68c9 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -467,16 +467,14 @@ class LightRAG: await self._insert_done() async def apipeline_enqueue_documents(self, string_or_strings: str | list[str]): - """Pipeline process documents - + """ + Pipeline for Processing Documents + 1. Remove duplicate contents from the list 2. Generate document IDs and initial status - 3. Filter out already stored documents - 4. Store docs - - Args: - string_or_strings: Single document string or list of document strings - """ + 3. Filter out already processed documents + 4. 
Enqueue document in status + """ if isinstance(string_or_strings, str): string_or_strings = [string_or_strings] From 82481ecf28d6c0d42e4dbc5b81abd04b44763657 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 14:55:52 +0100 Subject: [PATCH 34/42] cleaned code --- lightrag/base.py | 6 ------ lightrag/kg/json_kv_impl.py | 13 +++---------- lightrag/kg/mongo_impl.py | 8 -------- lightrag/kg/oracle_impl.py | 6 ------ lightrag/kg/postgres_impl.py | 10 ---------- lightrag/kg/redis_impl.py | 17 +++-------------- lightrag/lightrag.py | 20 ++++++++++++-------- 7 files changed, 18 insertions(+), 62 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index 4b963b43..60b9b3f1 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -84,9 +84,6 @@ class BaseVectorStorage(StorageNameSpace): class BaseKVStorage(StorageNameSpace): embedding_func: EmbeddingFunc - async def all_keys(self) -> list[str]: - raise NotImplementedError - async def get_by_id(self, id: str) -> dict[str, Any]: raise NotImplementedError @@ -103,9 +100,6 @@ class BaseKVStorage(StorageNameSpace): async def drop(self) -> None: raise NotImplementedError - async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: - raise NotImplementedError - @dataclass class BaseGraphStorage(StorageNameSpace): diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index e9225375..14565c86 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -1,7 +1,7 @@ import asyncio import os from dataclasses import dataclass -from typing import Any, Union +from typing import Any from lightrag.utils import ( logger, @@ -21,10 +21,7 @@ class JsonKVStorage(BaseKVStorage): self._data: dict[str, Any] = load_json(self._file_name) or {} self._lock = asyncio.Lock() logger.info(f"Load KV {self.namespace} with {len(self._data)} data") - - async def all_keys(self) -> list[str]: - return list(self._data.keys()) - + async def index_done_callback(self): write_json(self._data, self._file_name) @@ -49,8 +46,4 @@ class JsonKVStorage(BaseKVStorage): self._data.update(left_data) async def drop(self) -> None: - self._data = {} - - async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: - result = [v for _, v in self._data.items() if v["status"] == status] - return result if result else None + self._data = {} \ No newline at end of file diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index b7b438bd..45d4bb07 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -29,9 +29,6 @@ class MongoKVStorage(BaseKVStorage): self._data = database.get_collection(self.namespace) logger.info(f"Use MongoDB as KV {self.namespace}") - async def all_keys(self) -> list[str]: - return [x["_id"] for x in self._data.find({}, {"_id": 1})] - async def get_by_id(self, id: str) -> dict[str, Any]: return self._data.find_one({"_id": id}) @@ -77,11 +74,6 @@ class MongoKVStorage(BaseKVStorage): """Drop the collection""" await self._data.drop() - async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: - """Get documents by status and ids""" - return self._data.find({"status": status}) - - @dataclass class MongoGraphStorage(BaseGraphStorage): """ diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index c82db9a6..b648c9bc 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -229,12 +229,6 @@ class OracleKVStorage(BaseKVStorage): res = [{k: v} for k, v in dict_res.items()] return res - async def 
get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: - """Specifically for llm_response_cache.""" - SQL = SQL_TEMPLATES["get_by_status_" + self.namespace] - params = {"workspace": self.db.workspace, "status": status} - return await self.db.query(SQL, params, multirows=True) - async def filter_keys(self, keys: list[str]) -> set[str]: """Return keys that don't exist in storage""" SQL = SQL_TEMPLATES["filter_keys"].format( diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 01e3688a..b37f8434 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -237,16 +237,6 @@ class PGKVStorage(BaseKVStorage): params = {"workspace": self.db.workspace, "status": status} return await self.db.query(SQL, params, multirows=True) - async def all_keys(self) -> list[dict]: - if is_namespace(self.namespace, NameSpace.KV_STORE_LLM_RESPONSE_CACHE): - sql = "select workspace,mode,id from lightrag_llm_cache" - res = await self.db.query(sql, multirows=True) - return res - else: - logger.error( - f"all_keys is only implemented for llm_response_cache, not for {self.namespace}" - ) - async def filter_keys(self, keys: List[str]) -> Set[str]: """Filter out duplicated content""" sql = SQL_TEMPLATES["filter_keys"].format( diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index f9283dda..025f293f 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -1,5 +1,5 @@ import os -from typing import Any, Union +from typing import Any from tqdm.asyncio import tqdm as tqdm_async from dataclasses import dataclass import pipmaster as pm @@ -20,11 +20,7 @@ class RedisKVStorage(BaseKVStorage): redis_url = os.environ.get("REDIS_URI", "redis://localhost:6379") self._redis = Redis.from_url(redis_url, decode_responses=True) logger.info(f"Use Redis as KV {self.namespace}") - - async def all_keys(self) -> list[str]: - keys = await self._redis.keys(f"{self.namespace}:*") - return [key.split(":", 1)[-1] for key in keys] - + async def get_by_id(self, id): data = await self._redis.get(f"{self.namespace}:{id}") return json.loads(data) if data else None @@ -57,11 +53,4 @@ class RedisKVStorage(BaseKVStorage): async def drop(self) -> None: keys = await self._redis.keys(f"{self.namespace}:*") if keys: - await self._redis.delete(*keys) - - async def get_by_status(self, status: str) -> Union[list[dict[str, Any]], None]: - pipe = self._redis.pipeline() - for key in await self._redis.keys(f"{self.namespace}:*"): - pipe.hgetall(key) - results = await pipe.execute() - return [data for data in results if data.get("status") == status] or None + await self._redis.delete(*keys) \ No newline at end of file diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index aaae68c9..00174fcd 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -29,6 +29,7 @@ from .base import ( BaseKVStorage, BaseVectorStorage, DocStatus, + DocStatusStorage, QueryParam, StorageNameSpace, ) @@ -319,7 +320,7 @@ class LightRAG: # Initialize document status storage self.doc_status_storage_cls = self._get_storage_class(self.doc_status_storage) - self.doc_status: BaseKVStorage = self.doc_status_storage_cls( + self.doc_status: DocStatusStorage = self.doc_status_storage_cls( namespace=make_namespace(self.namespace_prefix, NameSpace.DOC_STATUS), global_config=global_config, embedding_func=None, @@ -394,10 +395,8 @@ class LightRAG: split_by_character_only: if split_by_character_only is True, split the string by character only, when split_by_character is None, this parameter is 
ignored. """ - await self.apipeline_process_documents(string_or_strings) - await self.apipeline_process_enqueue_documents( - split_by_character, split_by_character_only - ) + await self.apipeline_enqueue_documents(string_or_strings) + await self.apipeline_process_enqueue_documents(split_by_character, split_by_character_only) def insert_custom_chunks(self, full_text: str, text_chunks: list[str]): loop = always_get_an_event_loop() @@ -496,8 +495,13 @@ class LightRAG: # 3. Filter out already processed documents add_doc_keys: set[str] = set() - excluded_ids = await self.doc_status.all_keys() + # Get docs ids + in_process_keys = list(new_docs.keys()) + # Get in progress docs ids + excluded_ids = await self.doc_status.get_by_ids(in_process_keys) + # Exclude already in process add_doc_keys = new_docs.keys() - excluded_ids + # Filter new_docs = {k: v for k, v in new_docs.items() if k in add_doc_keys} if not new_docs: @@ -513,12 +517,12 @@ class LightRAG: to_process_doc_keys: list[str] = [] # Fetch failed documents - failed_docs = await self.doc_status.get_by_status(status=DocStatus.FAILED) + failed_docs = await self.doc_status.get_failed_docs() if failed_docs: to_process_doc_keys.extend([doc["id"] for doc in failed_docs]) # Fetch pending documents - pending_docs = await self.doc_status.get_by_status(status=DocStatus.PENDING) + pending_docs = await self.doc_status.get_pending_docs() if pending_docs: to_process_doc_keys.extend([doc["id"] for doc in pending_docs]) From 28b53144d9c1da12535003b4183441823551c7f6 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 15:00:07 +0100 Subject: [PATCH 35/42] updated readme --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 456d9a72..edd33fb0 100644 --- a/README.md +++ b/README.md @@ -408,6 +408,21 @@ rag = LightRAG( with open("./newText.txt") as f: rag.insert(f.read()) ``` + +### Insert using Pipeline +The `apipeline_enqueue_documents` and `apipeline_process_enqueue_documents` functions allow you to perform incremental insertion of documents into the graph. + +This is useful for scenarios where you want to process documents in the background while still allowing the main thread to continue executing. + +And using a routine to process news documents. + +```python +rag = LightRAG(..) +await rag.apipeline_enqueue_documents(string_or_strings) +# Your routine in loop +await rag.apipeline_process_enqueue_documents(string_or_strings) +``` + ### Separate Keyword Extraction We've introduced a new function `query_with_separate_keyword_extraction` to enhance the keyword extraction capabilities. This function separates the keyword extraction process from the user's prompt, focusing solely on the query to improve the relevance of extracted keywords. 
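The README snippet added above stops at the two pipeline calls. A minimal sketch of the background-ingestion routine it alludes to is shown below; the `working_dir` argument, the queue, and the polling interval are illustrative assumptions, and a real `LightRAG` instance would also need its embedding and LLM functions configured.

```python
import asyncio
import contextlib

from lightrag import LightRAG


async def ingest_forever(rag: LightRAG, queue: "asyncio.Queue[str]") -> None:
    """Background routine: drain the queue, enqueue docs, process pending ones."""
    while True:
        docs: list[str] = []
        while not queue.empty():
            docs.append(queue.get_nowait())
        if docs:
            # Deduplicate, assign IDs and store the documents as PENDING.
            await rag.apipeline_enqueue_documents(docs)
        # Chunk PENDING/FAILED documents and run entity/relation extraction.
        await rag.apipeline_process_enqueue_documents()
        await asyncio.sleep(5)  # assumed polling interval


async def main() -> None:
    rag = LightRAG(working_dir="./rag_storage")  # embedding/LLM kwargs omitted in this sketch
    queue: asyncio.Queue[str] = asyncio.Queue()
    worker = asyncio.create_task(ingest_forever(rag, queue))

    await queue.put("Some newly arrived document text ...")
    await asyncio.sleep(30)  # the caller keeps doing other work meanwhile

    worker.cancel()
    with contextlib.suppress(asyncio.CancelledError):
        await worker


if __name__ == "__main__":
    asyncio.run(main())
```

The enqueue call only records the documents with a PENDING status; the processing call is what chunks them and runs entity/relation extraction, so it is safe to invoke repeatedly from the loop.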
From 7116bd18721906e07e6a8ac7e1fa917586c8f972 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 15:24:30 +0100 Subject: [PATCH 36/42] cleaned code --- lightrag/kg/json_kv_impl.py | 4 ++-- lightrag/kg/jsondocstatus_impl.py | 8 ++++---- lightrag/kg/mongo_impl.py | 1 + lightrag/kg/redis_impl.py | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 14565c86..ff184dbd 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -21,7 +21,7 @@ class JsonKVStorage(BaseKVStorage): self._data: dict[str, Any] = load_json(self._file_name) or {} self._lock = asyncio.Lock() logger.info(f"Load KV {self.namespace} with {len(self._data)} data") - + async def index_done_callback(self): write_json(self._data, self._file_name) @@ -46,4 +46,4 @@ class JsonKVStorage(BaseKVStorage): self._data.update(left_data) async def drop(self) -> None: - self._data = {} \ No newline at end of file + self._data = {} diff --git a/lightrag/kg/jsondocstatus_impl.py b/lightrag/kg/jsondocstatus_impl.py index 603487bc..31aa836a 100644 --- a/lightrag/kg/jsondocstatus_impl.py +++ b/lightrag/kg/jsondocstatus_impl.py @@ -50,7 +50,7 @@ Usage: import os from dataclasses import dataclass -from typing import Any, Union, Dict +from typing import Any, Union from lightrag.utils import ( logger, @@ -85,18 +85,18 @@ class JsonDocStatusStorage(DocStatusStorage): ] ) - async def get_status_counts(self) -> Dict[str, int]: + async def get_status_counts(self) -> dict[str, int]: """Get counts of documents in each status""" counts = {status: 0 for status in DocStatus} for doc in self._data.values(): counts[doc["status"]] += 1 return counts - async def get_failed_docs(self) -> Dict[str, DocProcessingStatus]: + async def get_failed_docs(self) -> dict[str, DocProcessingStatus]: """Get all failed documents""" return {k: v for k, v in self._data.items() if v["status"] == DocStatus.FAILED} - async def get_pending_docs(self) -> Dict[str, DocProcessingStatus]: + async def get_pending_docs(self) -> dict[str, DocProcessingStatus]: """Get all pending documents""" return {k: v for k, v in self._data.items() if v["status"] == DocStatus.PENDING} diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index 45d4bb07..35902d37 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -74,6 +74,7 @@ class MongoKVStorage(BaseKVStorage): """Drop the collection""" await self._data.drop() + @dataclass class MongoGraphStorage(BaseGraphStorage): """ diff --git a/lightrag/kg/redis_impl.py b/lightrag/kg/redis_impl.py index 025f293f..05da41b7 100644 --- a/lightrag/kg/redis_impl.py +++ b/lightrag/kg/redis_impl.py @@ -20,7 +20,7 @@ class RedisKVStorage(BaseKVStorage): redis_url = os.environ.get("REDIS_URI", "redis://localhost:6379") self._redis = Redis.from_url(redis_url, decode_responses=True) logger.info(f"Use Redis as KV {self.namespace}") - + async def get_by_id(self, id): data = await self._redis.get(f"{self.namespace}:{id}") return json.loads(data) if data else None @@ -53,4 +53,4 @@ class RedisKVStorage(BaseKVStorage): async def drop(self) -> None: keys = await self._redis.keys(f"{self.namespace}:*") if keys: - await self._redis.delete(*keys) \ No newline at end of file + await self._redis.delete(*keys) From 948d21b41d0d6c1203a1518abf39bd5d1015b124 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 15:24:37 +0100 Subject: [PATCH 37/42] added docs and content --- lightrag/base.py | 27 ++++++++++++++++++--------- 
1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index 60b9b3f1..c7f77e0b 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -165,15 +165,24 @@ class DocStatus(str, Enum): @dataclass class DocProcessingStatus: """Document processing status data structure""" - - content_summary: str # First 100 chars of document content - content_length: int # Total length of document - status: DocStatus # Current processing status - created_at: str # ISO format timestamp - updated_at: str # ISO format timestamp - chunks_count: Optional[int] = None # Number of chunks after splitting - error: Optional[str] = None # Error message if failed - metadata: dict[str, Any] = field(default_factory=dict) # Additional metadata + content: str + """Original content of the document""" + content_summary: str + """First 100 chars of document content, used for preview""" + content_length: int + """Total length of document""" + status: DocStatus + """Current processing status""" + created_at: str + """ISO format timestamp when document was created""" + updated_at: str + """ISO format timestamp when document was last updated""" + chunks_count: Optional[int] = None + """Number of chunks after splitting, used for processing""" + error: Optional[str] = None + """Error message if failed""" + metadata: dict[str, Any] = field(default_factory=dict) + """Additional metadata""" class DocStatusStorage(BaseKVStorage): From 1949c4a2c635898fbae4d7bfb481f2b5d0bbedb7 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 15:24:52 +0100 Subject: [PATCH 38/42] improved get status --- lightrag/lightrag.py | 96 ++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 53 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 00174fcd..3bfa2649 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -13,7 +13,6 @@ from .operate import ( kg_query_with_keywords, mix_kg_vector_query, naive_query, - # local_query,global_query,hybrid_query,, ) from .utils import ( @@ -28,6 +27,7 @@ from .base import ( BaseGraphStorage, BaseKVStorage, BaseVectorStorage, + DocProcessingStatus, DocStatus, DocStatusStorage, QueryParam, @@ -396,7 +396,9 @@ class LightRAG: split_by_character is None, this parameter is ignored. """ await self.apipeline_enqueue_documents(string_or_strings) - await self.apipeline_process_enqueue_documents(split_by_character, split_by_character_only) + await self.apipeline_process_enqueue_documents( + split_by_character, split_by_character_only + ) def insert_custom_chunks(self, full_text: str, text_chunks: list[str]): loop = always_get_an_event_loop() @@ -468,12 +470,12 @@ class LightRAG: async def apipeline_enqueue_documents(self, string_or_strings: str | list[str]): """ Pipeline for Processing Documents - + 1. Remove duplicate contents from the list 2. Generate document IDs and initial status 3. Filter out already processed documents - 4. Enqueue document in status - """ + 4. 
Enqueue document in status + """ if isinstance(string_or_strings, str): string_or_strings = [string_or_strings] @@ -512,26 +514,6 @@ class LightRAG: await self.doc_status.upsert(new_docs) logger.info(f"Stored {len(new_docs)} new unique documents") - async def _get_pending_documents(self) -> list[str]: - """Fetch all pending and failed documents.""" - to_process_doc_keys: list[str] = [] - - # Fetch failed documents - failed_docs = await self.doc_status.get_failed_docs() - if failed_docs: - to_process_doc_keys.extend([doc["id"] for doc in failed_docs]) - - # Fetch pending documents - pending_docs = await self.doc_status.get_pending_docs() - if pending_docs: - to_process_doc_keys.extend([doc["id"] for doc in pending_docs]) - - if not to_process_doc_keys: - logger.info("All documents have been processed or are duplicates") - return [] - - return to_process_doc_keys - async def apipeline_process_enqueue_documents( self, split_by_character: str | None = None, @@ -548,46 +530,53 @@ class LightRAG: 4. Update the document status """ # 1. get all pending and failed documents - pending_doc_ids = await self._get_pending_documents() + to_process_docs: dict[str, DocProcessingStatus] = {} - if not pending_doc_ids: + # Fetch failed documents + failed_docs = await self.doc_status.get_failed_docs() + to_process_docs.update(failed_docs) + + pending_docs = await self.doc_status.get_pending_docs() + to_process_docs.update(pending_docs) + + if not to_process_docs: logger.info("All documents have been processed or are duplicates") - return + return + to_process_docs_ids = list(to_process_docs.keys()) # Get allready processed documents (text chunks and full docs) - text_chunks_processed_doc_ids = await self.text_chunks.filter_keys( - pending_doc_ids - ) - full_docs_processed_doc_ids = await self.full_docs.filter_keys(pending_doc_ids) + text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(to_process_docs_ids) + full_docs_processed_doc_ids = await self.full_docs.filter_keys(to_process_docs_ids) # 2. split docs into chunks, insert chunks, update doc status batch_size = self.addon_params.get("insert_batch_size", 10) batch_docs_list = [ - pending_doc_ids[i : i + batch_size] - for i in range(0, len(pending_doc_ids), batch_size) + list(to_process_docs.items())[i : i + batch_size] + for i in range(0, len(to_process_docs), batch_size) ] # 3. iterate over batches tasks: dict[str, list[Coroutine[Any, Any, None]]] = {} - for batch_idx, doc_ids in tqdm_async( + for batch_idx, ids_doc_processing_status in tqdm_async( enumerate(batch_docs_list), desc="Process Batches", ): # 4. 
iterate over batch - for doc_id in tqdm_async( - doc_ids, + for id_doc_processing_status in tqdm_async( + ids_doc_processing_status, desc=f"Process Batch {batch_idx}", ): # Update status in processing - status_doc = await self.doc_status.get_by_id(doc_id) + id_doc, status_doc = id_doc_processing_status + await self.doc_status.upsert( { - doc_id: { + id_doc: { "status": DocStatus.PROCESSING, "updated_at": datetime.now().isoformat(), - "content_summary": status_doc["content_summary"], - "content_length": status_doc["content_length"], - "created_at": status_doc["created_at"], + "content_summary": status_doc.content_summary, + "content_length": status_doc.content_length, + "created_at": status_doc.created_at, } } ) @@ -595,10 +584,10 @@ class LightRAG: chunks: dict[str, Any] = { compute_mdhash_id(dp["content"], prefix="chunk-"): { **dp, - "full_doc_id": doc_id, + "full_doc_id": id_doc_processing_status, } for dp in self.chunking_func( - status_doc["content"], + status_doc.content, split_by_character, split_by_character_only, self.chunk_overlap_token_size, @@ -611,25 +600,26 @@ class LightRAG: await self._process_entity_relation_graph(chunks) await self.chunks_vdb.upsert(chunks) + tasks[id_doc] = [] # Check if document already processed the doc - if doc_id not in full_docs_processed_doc_ids: - tasks[doc_id].append( + if id_doc not in full_docs_processed_doc_ids: + tasks[id_doc].append( self.full_docs.upsert( - {doc_id: {"content": status_doc["content"]}} + {id_doc: {"content": status_doc.content}} ) ) # Check if chunks already processed the doc - if doc_id not in text_chunks_processed_doc_ids: - tasks[doc_id].append(self.text_chunks.upsert(chunks)) + if id_doc not in text_chunks_processed_doc_ids: + tasks[id_doc].append(self.text_chunks.upsert(chunks)) # Process document (text chunks and full docs) in parallel - for doc_id, task in tasks.items(): + for id_doc_processing_status, task in tasks.items(): try: await asyncio.gather(*task) await self.doc_status.upsert( { - doc_id: { + id_doc_processing_status: { "status": DocStatus.PROCESSED, "chunks_count": len(chunks), "updated_at": datetime.now().isoformat(), @@ -639,10 +629,10 @@ class LightRAG: await self._insert_done() except Exception as e: - logger.error(f"Failed to process document {doc_id}: {str(e)}") + logger.error(f"Failed to process document {id_doc_processing_status}: {str(e)}") await self.doc_status.upsert( { - doc_id: { + id_doc_processing_status: { "status": DocStatus.FAILED, "error": str(e), "updated_at": datetime.now().isoformat(), From d1d422e5e42a7cd1259d6f644174835ebd192a22 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 15:25:02 +0100 Subject: [PATCH 39/42] cleaned readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index edd33fb0..62dc032b 100644 --- a/README.md +++ b/README.md @@ -410,7 +410,7 @@ with open("./newText.txt") as f: ``` ### Insert using Pipeline -The `apipeline_enqueue_documents` and `apipeline_process_enqueue_documents` functions allow you to perform incremental insertion of documents into the graph. +The `apipeline_enqueue_documents` and `apipeline_process_enqueue_documents` functions allow you to perform incremental insertion of documents into the graph. This is useful for scenarios where you want to process documents in the background while still allowing the main thread to continue executing. 
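The processing loop reworked across the patches above settles on one control-flow pattern: slice the pending document IDs into fixed-size batches, collect each document's storage coroutines, run them together with `asyncio.gather`, and flip the per-document status to PROCESSED or FAILED. The stand-alone sketch below reproduces only that skeleton; the three async stubs are hypothetical stand-ins for the real storage upserts, not LightRAG APIs.

```python
import asyncio
from typing import Any, Coroutine

BATCH_SIZE = 10  # mirrors addon_params.get("insert_batch_size", 10)


async def upsert_full_doc(doc_id: str) -> None:       # stand-in for full_docs.upsert
    await asyncio.sleep(0)


async def upsert_chunks(doc_id: str) -> None:         # stand-in for text_chunks.upsert
    await asyncio.sleep(0)


async def set_status(doc_id: str, status: str) -> None:  # stand-in for doc_status.upsert
    print(f"{doc_id}: {status}")


async def process_pending(pending_doc_ids: list[str]) -> None:
    batches = [
        pending_doc_ids[i : i + BATCH_SIZE]
        for i in range(0, len(pending_doc_ids), BATCH_SIZE)
    ]
    for batch in batches:
        # Collect the per-document storage coroutines for this batch.
        tasks: dict[str, list[Coroutine[Any, Any, None]]] = {
            doc_id: [upsert_full_doc(doc_id), upsert_chunks(doc_id)]
            for doc_id in batch
        }
        # Run each document's coroutines together; a failure only marks that
        # one document FAILED so it can be retried on the next pipeline run.
        for doc_id, coros in tasks.items():
            try:
                await asyncio.gather(*coros)
                await set_status(doc_id, "PROCESSED")
            except Exception as exc:
                await set_status(doc_id, f"FAILED: {exc}")


if __name__ == "__main__":
    asyncio.run(process_pending([f"doc-{i}" for i in range(25)]))
```

Updating the status per document rather than per batch is what lets a single failing document be picked up again later as FAILED without blocking the rest of its batch.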
From c36c6743d62ca068ba3cae04d4939d97c2690ab3 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 15:25:58 +0100 Subject: [PATCH 40/42] cleaned code --- lightrag/base.py | 1 + lightrag/lightrag.py | 24 ++++++++++++++---------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index c7f77e0b..0a98c2d5 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -165,6 +165,7 @@ class DocStatus(str, Enum): @dataclass class DocProcessingStatus: """Document processing status data structure""" + content: str """Original content of the document""" content_summary: str diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 3bfa2649..79eecef7 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -535,18 +535,22 @@ class LightRAG: # Fetch failed documents failed_docs = await self.doc_status.get_failed_docs() to_process_docs.update(failed_docs) - + pending_docs = await self.doc_status.get_pending_docs() to_process_docs.update(pending_docs) - + if not to_process_docs: logger.info("All documents have been processed or are duplicates") - return + return to_process_docs_ids = list(to_process_docs.keys()) # Get allready processed documents (text chunks and full docs) - text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(to_process_docs_ids) - full_docs_processed_doc_ids = await self.full_docs.filter_keys(to_process_docs_ids) + text_chunks_processed_doc_ids = await self.text_chunks.filter_keys( + to_process_docs_ids + ) + full_docs_processed_doc_ids = await self.full_docs.filter_keys( + to_process_docs_ids + ) # 2. split docs into chunks, insert chunks, update doc status batch_size = self.addon_params.get("insert_batch_size", 10) @@ -568,7 +572,7 @@ class LightRAG: ): # Update status in processing id_doc, status_doc = id_doc_processing_status - + await self.doc_status.upsert( { id_doc: { @@ -604,9 +608,7 @@ class LightRAG: # Check if document already processed the doc if id_doc not in full_docs_processed_doc_ids: tasks[id_doc].append( - self.full_docs.upsert( - {id_doc: {"content": status_doc.content}} - ) + self.full_docs.upsert({id_doc: {"content": status_doc.content}}) ) # Check if chunks already processed the doc @@ -629,7 +631,9 @@ class LightRAG: await self._insert_done() except Exception as e: - logger.error(f"Failed to process document {id_doc_processing_status}: {str(e)}") + logger.error( + f"Failed to process document {id_doc_processing_status}: {str(e)}" + ) await self.doc_status.upsert( { id_doc_processing_status: { From 93717e6705958599cfefed76b74ca0c2ba436535 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 15:36:01 +0100 Subject: [PATCH 41/42] cleaned code --- lightrag/kg/postgres_impl.py | 6 ++++-- lightrag/lightrag.py | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index b37f8434..77fe6198 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -443,6 +443,7 @@ class PGDocStatusStorage(DocStatusStorage): return {} else: return DocProcessingStatus( + content=result[0]["content"], content_length=result[0]["content_length"], content_summary=result[0]["content_summary"], status=result[0]["status"], @@ -471,10 +472,9 @@ class PGDocStatusStorage(DocStatusStorage): sql = "select * from LIGHTRAG_DOC_STATUS where workspace=$1 and status=$1" params = {"workspace": self.db.workspace, "status": status} result = await self.db.query(sql, params, True) - # Result is like [{'id': 
'id1', 'status': 'PENDING', 'updated_at': '2023-07-01 00:00:00'}, {'id': 'id2', 'status': 'PENDING', 'updated_at': '2023-07-01 00:00:00'}, ...] - # Converting to be a dict return { element["id"]: DocProcessingStatus( + content=result[0]["content"], content_summary=element["content_summary"], content_length=element["content_length"], status=element["status"], @@ -506,6 +506,7 @@ class PGDocStatusStorage(DocStatusStorage): sql = """insert into LIGHTRAG_DOC_STATUS(workspace,id,content_summary,content_length,chunks_count,status) values($1,$2,$3,$4,$5,$6) on conflict(id,workspace) do update set + content = EXCLUDED.content, content_summary = EXCLUDED.content_summary, content_length = EXCLUDED.content_length, chunks_count = EXCLUDED.chunks_count, @@ -518,6 +519,7 @@ class PGDocStatusStorage(DocStatusStorage): { "workspace": self.db.workspace, "id": k, + "content": v["content"], "content_summary": v["content_summary"], "content_length": v["content_length"], "chunks_count": v["chunks_count"] if "chunks_count" in v else -1, diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 79eecef7..5d00c508 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -544,6 +544,7 @@ class LightRAG: return to_process_docs_ids = list(to_process_docs.keys()) + # Get allready processed documents (text chunks and full docs) text_chunks_processed_doc_ids = await self.text_chunks.filter_keys( to_process_docs_ids @@ -570,9 +571,8 @@ class LightRAG: ids_doc_processing_status, desc=f"Process Batch {batch_idx}", ): - # Update status in processing id_doc, status_doc = id_doc_processing_status - + # Update status in processing await self.doc_status.upsert( { id_doc: { @@ -601,8 +601,8 @@ class LightRAG: } # Ensure chunk insertion and graph processing happen sequentially, not in parallel - await self._process_entity_relation_graph(chunks) await self.chunks_vdb.upsert(chunks) + await self._process_entity_relation_graph(chunks) tasks[id_doc] = [] # Check if document already processed the doc From f84be33d5c2d32995d61d5841cb50757582feaf4 Mon Sep 17 00:00:00 2001 From: Yannick Stephan Date: Sun, 9 Feb 2025 15:54:15 +0100 Subject: [PATCH 42/42] cleaned code --- lightrag/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/base.py b/lightrag/base.py index 0a98c2d5..7a3b4f5f 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -43,7 +43,7 @@ class QueryParam: hl_keywords: list[str] = field(default_factory=list) ll_keywords: list[str] = field(default_factory=list) # Conversation history support - conversation_history: list[dict] = field( + conversation_history: list[dict[str, str]] = field( default_factory=list ) # Format: [{"role": "user/assistant", "content": "message"}] history_turns: int = (