From d904f2abd55063d639202704a327d1f2458ff4ab Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Tue, 12 Nov 2024 15:38:16 +0800 Subject: [PATCH 01/22] Update lightrag_api_oracle_demo..py --- examples/lightrag_api_oracle_demo..py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/lightrag_api_oracle_demo..py b/examples/lightrag_api_oracle_demo..py index 3bfae452..9b4e2741 100644 --- a/examples/lightrag_api_oracle_demo..py +++ b/examples/lightrag_api_oracle_demo..py @@ -149,13 +149,13 @@ class Response(BaseModel): # API routes -rag = None # 定义为全局对象 +rag = None @asynccontextmanager async def lifespan(app: FastAPI): global rag - rag = await init() # 在应用启动时初始化 `rag` + rag = await init() print("done!") yield From 83f8a5139c8f076bcb893eb91ac4f0c660364fb6 Mon Sep 17 00:00:00 2001 From: Rick Battle Date: Tue, 12 Nov 2024 09:30:21 -0700 Subject: [PATCH 02/22] Only update storage if there was something to insert Before, the `finally` block would always call `_insert_done()`, which writes out the `vdb_*` and `kv_store_*` files ... even if there was nothing to insert (because all docs had already been inserted). This was causing the speed of skippable inserts to become very slow as the graph grew. --- lightrag/lightrag.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 50e33405..67337098 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -227,6 +227,7 @@ class LightRAG: return loop.run_until_complete(self.ainsert(string_or_strings)) async def ainsert(self, string_or_strings): + update_storage = False try: if isinstance(string_or_strings, str): string_or_strings = [string_or_strings] @@ -240,6 +241,7 @@ class LightRAG: if not len(new_docs): logger.warning("All docs are already in the storage") return + update_storage = True logger.info(f"[New Docs] inserting {len(new_docs)} docs") inserting_chunks = {} @@ -286,7 +288,8 @@ class LightRAG: await self.full_docs.upsert(new_docs) await self.text_chunks.upsert(inserting_chunks) finally: - await self._insert_done() + if update_storage: + await self._insert_done() async def _insert_done(self): tasks = [] From 38e1956395f9d4925b51e52ccce3ad1f42ffd3e7 Mon Sep 17 00:00:00 2001 From: david Date: Wed, 13 Nov 2024 14:20:36 +0800 Subject: [PATCH 03/22] fix hf embedding to support loading to different device --- lightrag/llm.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lightrag/llm.py b/lightrag/llm.py index f4045e80..6cc46c85 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -693,13 +693,17 @@ async def bedrock_embedding( async def hf_embedding(texts: list[str], tokenizer, embed_model) -> np.ndarray: + device = next(embed_model.parameters()).device input_ids = tokenizer( texts, return_tensors="pt", padding=True, truncation=True - ).input_ids + ).input_ids.to(device) with torch.no_grad(): outputs = embed_model(input_ids) embeddings = outputs.last_hidden_state.mean(dim=1) - return embeddings.detach().numpy() + if embeddings.dtype == torch.bfloat16: + return embeddings.detach().to(torch.float32).cpu().numpy() + else: + return embeddings.detach().cpu().numpy() async def ollama_embedding(texts: list[str], embed_model, **kwargs) -> np.ndarray: From 30cf5a62ee5dd8d9c922bd852e26186ad1c47519 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Thu, 14 Nov 2024 15:59:37 +0800 Subject: [PATCH 04/22] fix cache bug --- lightrag/__init__.py | 2 +- lightrag/operate.py | 24 
++++++++++++++++++------ lightrag/utils.py | 18 ++++++++++++------ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index 6d9003ff..c8b61765 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam -__version__ = "1.0.0" +__version__ = "1.0.1" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" diff --git a/lightrag/operate.py b/lightrag/operate.py index db7c9401..b11e14fe 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -630,10 +630,16 @@ async def _find_most_related_edges_from_entities( all_related_edges = await asyncio.gather( *[knowledge_graph_inst.get_node_edges(dp["entity_name"]) for dp in node_datas] ) - all_edges = set() + all_edges = [] + seen = set() + for this_edges in all_related_edges: - all_edges.update([tuple(sorted(e)) for e in this_edges]) - all_edges = list(all_edges) + for e in this_edges: + sorted_edge = tuple(sorted(e)) + if sorted_edge not in seen: + seen.add(sorted_edge) + all_edges.append(sorted_edge) + all_edges_pack = await asyncio.gather( *[knowledge_graph_inst.get_edge(e[0], e[1]) for e in all_edges] ) @@ -833,10 +839,16 @@ async def _find_most_related_entities_from_relationships( query_param: QueryParam, knowledge_graph_inst: BaseGraphStorage, ): - entity_names = set() + entity_names = [] + seen = set() + for e in edge_datas: - entity_names.add(e["src_id"]) - entity_names.add(e["tgt_id"]) + if e["src_id"] not in seen: + entity_names.append(e["src_id"]) + seen.add(e["src_id"]) + if e["tgt_id"] not in seen: + entity_names.append(e["tgt_id"]) + seen.add(e["tgt_id"]) node_datas = await asyncio.gather( *[knowledge_graph_inst.get_node(entity_name) for entity_name in entity_names] diff --git a/lightrag/utils.py b/lightrag/utils.py index 104c9fec..a849f804 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -274,13 +274,19 @@ def process_combine_contexts(hl, ll): if list_ll: list_ll = [",".join(item[1:]) for item in list_ll if item] - combined_sources_set = set(filter(None, list_hl + list_ll)) + combined_sources = [] + seen = set() - combined_sources = [",\t".join(header)] + for item in list_hl + list_ll: + if item and item not in seen: + combined_sources.append(item) + seen.add(item) - for i, item in enumerate(combined_sources_set, start=1): - combined_sources.append(f"{i},\t{item}") + combined_sources_result = [",\t".join(header)] - combined_sources = "\n".join(combined_sources) + for i, item in enumerate(combined_sources, start=1): + combined_sources_result.append(f"{i},\t{item}") - return combined_sources + combined_sources_result = "\n".join(combined_sources_result) + + return combined_sources_result From 371e44499871563d6a52c735c2b1efe357eeec20 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:02:30 +0800 Subject: [PATCH 05/22] Update Discord Link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 74c40f15..be6c1bb5 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ - +

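A note on the "fix cache bug" patch above (PATCH 04): it replaces set-based accumulation with order-preserving deduplication in three places across operate.py and utils.py. Python's set iteration order for strings varies between interpreter runs, so prompts and combined contexts built from sets came out in a different order each time, defeating caching. A minimal standalone sketch of the idiom, assuming nothing from LightRAG (dedup_preserving_order is a hypothetical name, not part of the codebase):

    def dedup_preserving_order(items):
        # A bare set() would deduplicate, but its iteration order depends on
        # string hashing and can change between interpreter runs; tracking
        # membership in a side set keeps first-occurrence order stable.
        seen = set()
        ordered = []
        for item in items:
            if item not in seen:
                seen.add(item)
                ordered.append(item)
        return ordered

    assert dedup_preserving_order(["b", "a", "b", "c"]) == ["b", "a", "c"]
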
From 186cd34a033d13c13317cba356e0076cffc113fe Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:21:20 +0800 Subject: [PATCH 06/22] Update Discord Link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index be6c1bb5..b62f01a1 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ This repository hosts the code of LightRAG. The structure of this code is based - [x] [2024.10.29]🎯📢LightRAG now supports multiple file types, including PDF, DOC, PPT, and CSV via `textract`. - [x] [2024.10.20]🎯📢We’ve added a new feature to LightRAG: Graph Visualization. - [x] [2024.10.18]🎯📢We’ve added a link to a [LightRAG Introduction Video](https://youtu.be/oageL-1I0GE). Thanks to the author! -- [x] [2024.10.17]🎯📢We have created a [Discord channel](https://discord.gg/mvsfu2Tg)! Welcome to join for sharing and discussions! 🎉🎉 +- [x] [2024.10.17]🎯📢We have created a [Discord channel](https://discord.gg/E4HgTnck)! Welcome to join for sharing and discussions! 🎉🎉 - [x] [2024.10.16]🎯📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! - [x] [2024.10.15]🎯📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! From 5661d76860436f7bf5aef2e50d9ee4a59660146c Mon Sep 17 00:00:00 2001 From: Richard <164130786@qq.com> Date: Fri, 15 Nov 2024 13:11:43 +0800 Subject: [PATCH 07/22] fix neo4j bug --- lightrag/kg/neo4j_impl.py | 1 + lightrag/lightrag.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index e6b33a9b..32bfbe2e 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -214,6 +214,7 @@ class Neo4JStorage(BaseGraphStorage): neo4jExceptions.ServiceUnavailable, neo4jExceptions.TransientError, neo4jExceptions.WriteServiceUnavailable, + neo4jExceptions.ClientError ) ), ) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 67337098..ce27e76d 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -174,8 +174,7 @@ class LightRAG: ) self.chunk_entity_relation_graph = self.graph_storage_cls( namespace="chunk_entity_relation", - global_config=asdict(self), - embedding_func=self.embedding_func, + global_config=asdict(self) ) #### # add embedding func by walter over From 8e16f0815ce75a81797fb45d2f755204af9cd638 Mon Sep 17 00:00:00 2001 From: tmuife <43266626@qq.com> Date: Mon, 18 Nov 2024 10:00:06 +0800 Subject: [PATCH 08/22] change the type of binding parameters in Oracle23AI --- lightrag/kg/oracle_impl.py | 320 +++++++++++++++++++++---------------- 1 file changed, 178 insertions(+), 142 deletions(-) diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index 96a9e795..fd8bf536 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -114,16 +114,17 @@ class OracleDB: logger.info("Finished check all tables in Oracle database") - async def query(self, sql: str, multirows: bool = False) -> Union[dict, None]: + async def query(self, sql: str, params: dict = None, multirows: bool = False) -> Union[dict, None]: async with self.pool.acquire() as connection: connection.inputtypehandler = self.input_type_handler connection.outputtypehandler = self.output_type_handler with connection.cursor() as cursor: try: - await cursor.execute(sql) + await cursor.execute(sql, params) except Exception as e: logger.error(f"Oracle database error: {e}") print(sql) + print(params) raise columns = 
[column[0].lower() for column in cursor.description] if multirows: @@ -140,7 +141,7 @@ class OracleDB: data = None return data - async def execute(self, sql: str, data: list = None): + async def execute(self, sql: str, data: list | dict = None): # logger.info("go into OracleDB execute method") try: async with self.pool.acquire() as connection: @@ -172,11 +173,10 @@ class OracleKVStorage(BaseKVStorage): async def get_by_id(self, id: str) -> Union[dict, None]: """根据 id 获取 doc_full 数据.""" - SQL = SQL_TEMPLATES["get_by_id_" + self.namespace].format( - workspace=self.db.workspace, id=id - ) + SQL = SQL_TEMPLATES["get_by_id_" + self.namespace] + params = {"workspace":self.db.workspace, "id":id} # print("get_by_id:"+SQL) - res = await self.db.query(SQL) + res = await self.db.query(SQL,params) if res: data = res # {"data":res} # print (data) @@ -187,11 +187,11 @@ class OracleKVStorage(BaseKVStorage): # Query by id async def get_by_ids(self, ids: list[str], fields=None) -> Union[list[dict], None]: """根据 id 获取 doc_chunks 数据""" - SQL = SQL_TEMPLATES["get_by_ids_" + self.namespace].format( - workspace=self.db.workspace, ids=",".join([f"'{id}'" for id in ids]) - ) - # print("get_by_ids:"+SQL) - res = await self.db.query(SQL, multirows=True) + SQL = SQL_TEMPLATES["get_by_ids_" + self.namespace].format(ids=",".join([f"'{id}'" for id in ids])) + params = {"workspace":self.db.workspace} + #print("get_by_ids:"+SQL) + #print(params) + res = await self.db.query(SQL,params, multirows=True) if res: data = res # [{"data":i} for i in res] # print(data) @@ -201,12 +201,16 @@ class OracleKVStorage(BaseKVStorage): async def filter_keys(self, keys: list[str]) -> set[str]: """过滤掉重复内容""" - SQL = SQL_TEMPLATES["filter_keys"].format( - table_name=N_T[self.namespace], - workspace=self.db.workspace, - ids=",".join([f"'{k}'" for k in keys]), - ) - res = await self.db.query(SQL, multirows=True) + SQL = SQL_TEMPLATES["filter_keys"].format(table_name=N_T[self.namespace], + ids=",".join([f"'{id}'" for id in keys])) + params = {"workspace":self.db.workspace} + try: + await self.db.query(SQL, params) + except Exception as e: + logger.error(f"Oracle database error: {e}") + print(SQL) + print(params) + res = await self.db.query(SQL, params,multirows=True) data = None if res: exist_keys = [key["id"] for key in res] @@ -243,29 +247,31 @@ class OracleKVStorage(BaseKVStorage): d["__vector__"] = embeddings[i] # print(list_data) for item in list_data: - merge_sql = SQL_TEMPLATES["merge_chunk"].format(check_id=item["__id__"]) - - values = [ - item["__id__"], - item["content"], - self.db.workspace, - item["tokens"], - item["chunk_order_index"], - item["full_doc_id"], - item["__vector__"], - ] + merge_sql = SQL_TEMPLATES["merge_chunk"] + data = {"check_id":item["__id__"], + "id":item["__id__"], + "content":item["content"], + "workspace":self.db.workspace, + "tokens":item["tokens"], + "chunk_order_index":item["chunk_order_index"], + "full_doc_id":item["full_doc_id"], + "content_vector":item["__vector__"] + } # print(merge_sql) - await self.db.execute(merge_sql, values) + await self.db.execute(merge_sql, data) if self.namespace == "full_docs": for k, v in self._data.items(): # values.clear() - merge_sql = SQL_TEMPLATES["merge_doc_full"].format( - check_id=k, - ) - values = [k, self._data[k]["content"], self.db.workspace] + merge_sql = SQL_TEMPLATES["merge_doc_full"] + data = { + "check_id":k, + "id":k, + "content":v["content"], + "workspace":self.db.workspace + } # print(merge_sql) - await self.db.execute(merge_sql, values) + await 
self.db.execute(merge_sql, data) return left_data async def index_done_callback(self): @@ -295,18 +301,17 @@ class OracleVectorDBStorage(BaseVectorStorage): # 转换精度 dtype = str(embedding.dtype).upper() dimension = embedding.shape[0] - embedding_string = ", ".join(map(str, embedding.tolist())) + embedding_string = "["+", ".join(map(str, embedding.tolist()))+"]" - SQL = SQL_TEMPLATES[self.namespace].format( - embedding_string=embedding_string, - dimension=dimension, - dtype=dtype, - workspace=self.db.workspace, - top_k=top_k, - better_than_threshold=self.cosine_better_than_threshold, - ) + SQL = SQL_TEMPLATES[self.namespace].format(dimension=dimension, dtype=dtype) + params = { + "embedding_string": embedding_string, + "workspace": self.db.workspace, + "top_k": top_k, + "better_than_threshold": self.cosine_better_than_threshold, + } # print(SQL) - results = await self.db.query(SQL, multirows=True) + results = await self.db.query(SQL,params=params, multirows=True) # print("vector search result:",results) return results @@ -328,6 +333,8 @@ class OracleGraphStorage(BaseGraphStorage): entity_type = node_data["entity_type"] description = node_data["description"] source_id = node_data["source_id"] + logger.debug(f"entity_name:{entity_name}, entity_type:{entity_type}") + content = entity_name + description contents = [content] batches = [ @@ -339,22 +346,18 @@ class OracleGraphStorage(BaseGraphStorage): ) embeddings = np.concatenate(embeddings_list) content_vector = embeddings[0] - merge_sql = SQL_TEMPLATES["merge_node"].format( - workspace=self.db.workspace, name=entity_name, source_chunk_id=source_id - ) + merge_sql = SQL_TEMPLATES["merge_node"] + data = { + "workspace":self.db.workspace, + "name":entity_name, + "entity_type":entity_type, + "description":description, + "source_chunk_id":source_id, + "content":content, + "content_vector":content_vector + } # print(merge_sql) - await self.db.execute( - merge_sql, - [ - self.db.workspace, - entity_name, - entity_type, - description, - source_id, - content, - content_vector, - ], - ) + await self.db.execute(merge_sql,data) # self._graph.add_node(node_id, **node_data) async def upsert_edge( @@ -368,6 +371,8 @@ class OracleGraphStorage(BaseGraphStorage): keywords = edge_data["keywords"] description = edge_data["description"] source_chunk_id = edge_data["source_id"] + logger.debug(f"source_name:{source_name}, target_name:{target_name}, keywords: {keywords}") + content = keywords + source_name + target_name + description contents = [content] batches = [ @@ -379,27 +384,20 @@ class OracleGraphStorage(BaseGraphStorage): ) embeddings = np.concatenate(embeddings_list) content_vector = embeddings[0] - merge_sql = SQL_TEMPLATES["merge_edge"].format( - workspace=self.db.workspace, - source_name=source_name, - target_name=target_name, - source_chunk_id=source_chunk_id, - ) + merge_sql = SQL_TEMPLATES["merge_edge"] + data = { + "workspace":self.db.workspace, + "source_name":source_name, + "target_name":target_name, + "weight":weight, + "keywords":keywords, + "description":description, + "source_chunk_id":source_chunk_id, + "content":content, + "content_vector":content_vector + } # print(merge_sql) - await self.db.execute( - merge_sql, - [ - self.db.workspace, - source_name, - target_name, - weight, - keywords, - description, - source_chunk_id, - content, - content_vector, - ], - ) + await self.db.execute(merge_sql,data) # self._graph.add_edge(source_node_id, target_node_id, **edge_data) async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: 
@@ -429,12 +427,14 @@ class OracleGraphStorage(BaseGraphStorage): #################### query method ################# async def has_node(self, node_id: str) -> bool: """根据节点id检查节点是否存在""" - SQL = SQL_TEMPLATES["has_node"].format( - workspace=self.db.workspace, node_id=node_id - ) + SQL = SQL_TEMPLATES["has_node"] + params = { + "workspace":self.db.workspace, + "node_id":node_id + } # print(SQL) # print(self.db.workspace, node_id) - res = await self.db.query(SQL) + res = await self.db.query(SQL,params) if res: # print("Node exist!",res) return True @@ -444,13 +444,14 @@ class OracleGraphStorage(BaseGraphStorage): async def has_edge(self, source_node_id: str, target_node_id: str) -> bool: """根据源和目标节点id检查边是否存在""" - SQL = SQL_TEMPLATES["has_edge"].format( - workspace=self.db.workspace, - source_node_id=source_node_id, - target_node_id=target_node_id, - ) + SQL = SQL_TEMPLATES["has_edge"] + params = { + "workspace":self.db.workspace, + "source_node_id":source_node_id, + "target_node_id":target_node_id + } # print(SQL) - res = await self.db.query(SQL) + res = await self.db.query(SQL,params) if res: # print("Edge exist!",res) return True @@ -460,11 +461,13 @@ class OracleGraphStorage(BaseGraphStorage): async def node_degree(self, node_id: str) -> int: """根据节点id获取节点的度""" - SQL = SQL_TEMPLATES["node_degree"].format( - workspace=self.db.workspace, node_id=node_id - ) + SQL = SQL_TEMPLATES["node_degree"] + params = { + "workspace":self.db.workspace, + "node_id":node_id + } # print(SQL) - res = await self.db.query(SQL) + res = await self.db.query(SQL,params) if res: # print("Node degree",res["degree"]) return res["degree"] @@ -480,12 +483,14 @@ class OracleGraphStorage(BaseGraphStorage): async def get_node(self, node_id: str) -> Union[dict, None]: """根据节点id获取节点数据""" - SQL = SQL_TEMPLATES["get_node"].format( - workspace=self.db.workspace, node_id=node_id - ) + SQL = SQL_TEMPLATES["get_node"] + params = { + "workspace":self.db.workspace, + "node_id":node_id + } # print(self.db.workspace, node_id) # print(SQL) - res = await self.db.query(SQL) + res = await self.db.query(SQL,params) if res: # print("Get node!",self.db.workspace, node_id,res) return res @@ -497,12 +502,13 @@ class OracleGraphStorage(BaseGraphStorage): self, source_node_id: str, target_node_id: str ) -> Union[dict, None]: """根据源和目标节点id获取边""" - SQL = SQL_TEMPLATES["get_edge"].format( - workspace=self.db.workspace, - source_node_id=source_node_id, - target_node_id=target_node_id, - ) - res = await self.db.query(SQL) + SQL = SQL_TEMPLATES["get_edge"] + params = { + "workspace":self.db.workspace, + "source_node_id":source_node_id, + "target_node_id":target_node_id + } + res = await self.db.query(SQL,params) if res: # print("Get edge!",self.db.workspace, source_node_id, target_node_id,res[0]) return res @@ -513,10 +519,12 @@ class OracleGraphStorage(BaseGraphStorage): async def get_node_edges(self, source_node_id: str): """根据节点id获取节点的所有边""" if await self.has_node(source_node_id): - SQL = SQL_TEMPLATES["get_node_edges"].format( - workspace=self.db.workspace, source_node_id=source_node_id - ) - res = await self.db.query(sql=SQL, multirows=True) + SQL = SQL_TEMPLATES["get_node_edges"] + params = { + "workspace":self.db.workspace, + "source_node_id":source_node_id + } + res = await self.db.query(sql=SQL, params=params, multirows=True) if res: data = [(i["source_name"], i["target_name"]) for i in res] # print("Get node edge!",self.db.workspace, source_node_id,data) @@ -524,8 +532,22 @@ class OracleGraphStorage(BaseGraphStorage): else: # print("Node 
Edge not exist!",self.db.workspace, source_node_id) return [] + + async def get_all_nodes(self, limit: int): + """查询所有节点""" + SQL = SQL_TEMPLATES["get_all_nodes"] + params = {"workspace":self.db.workspace, "limit":str(limit)} + res = await self.db.query(sql=SQL,params=params, multirows=True) + if res: + return res - + async def get_all_edges(self, limit: int): + """查询所有边""" + SQL = SQL_TEMPLATES["get_all_edges"] + params = {"workspace":self.db.workspace, "limit":str(limit)} + res = await self.db.query(sql=SQL,params=params, multirows=True) + if res: + return res N_T = { "full_docs": "LIGHTRAG_DOC_FULL", "text_chunks": "LIGHTRAG_DOC_CHUNKS", @@ -619,82 +641,96 @@ TABLES = { SQL_TEMPLATES = { # SQL for KVStorage - "get_by_id_full_docs": "select ID,NVL(content,'') as content from LIGHTRAG_DOC_FULL where workspace='{workspace}' and ID='{id}'", - "get_by_id_text_chunks": "select ID,TOKENS,NVL(content,'') as content,CHUNK_ORDER_INDEX,FULL_DOC_ID from LIGHTRAG_DOC_CHUNKS where workspace='{workspace}' and ID='{id}'", - "get_by_ids_full_docs": "select ID,NVL(content,'') as content from LIGHTRAG_DOC_FULL where workspace='{workspace}' and ID in ({ids})", - "get_by_ids_text_chunks": "select ID,TOKENS,NVL(content,'') as content,CHUNK_ORDER_INDEX,FULL_DOC_ID from LIGHTRAG_DOC_CHUNKS where workspace='{workspace}' and ID in ({ids})", - "filter_keys": "select id from {table_name} where workspace='{workspace}' and id in ({ids})", + "get_by_id_full_docs": "select ID,NVL(content,'') as content from LIGHTRAG_DOC_FULL where workspace=:workspace and ID=:id", + "get_by_id_text_chunks": "select ID,TOKENS,NVL(content,'') as content,CHUNK_ORDER_INDEX,FULL_DOC_ID from LIGHTRAG_DOC_CHUNKS where workspace=:workspace and ID=:id", + "get_by_ids_full_docs": "select ID,NVL(content,'') as content from LIGHTRAG_DOC_FULL where workspace=:workspace and ID in ({ids})", + "get_by_ids_text_chunks": "select ID,TOKENS,NVL(content,'') as content,CHUNK_ORDER_INDEX,FULL_DOC_ID from LIGHTRAG_DOC_CHUNKS where workspace=:workspace and ID in ({ids})", + "filter_keys": "select id from {table_name} where workspace=:workspace and id in ({ids})", "merge_doc_full": """ MERGE INTO LIGHTRAG_DOC_FULL a USING DUAL - ON (a.id = '{check_id}') + ON (a.id = :check_id) WHEN NOT MATCHED THEN - INSERT(id,content,workspace) values(:1,:2,:3) + INSERT(id,content,workspace) values(:id,:content,:workspace) """, "merge_chunk": """MERGE INTO LIGHTRAG_DOC_CHUNKS a USING DUAL - ON (a.id = '{check_id}') + ON (a.id = :check_id) WHEN NOT MATCHED THEN INSERT(id,content,workspace,tokens,chunk_order_index,full_doc_id,content_vector) - values (:1,:2,:3,:4,:5,:6,:7) """, + values (:id,:content,:workspace,:tokens,:chunk_order_index,:full_doc_id,:content_vector) """, # SQL for VectorStorage "entities": """SELECT name as entity_name FROM - (SELECT id,name,VECTOR_DISTANCE(content_vector,vector('[{embedding_string}]',{dimension},{dtype}),COSINE) as distance - FROM LIGHTRAG_GRAPH_NODES WHERE workspace='{workspace}') - WHERE distance>{better_than_threshold} ORDER BY distance ASC FETCH FIRST {top_k} ROWS ONLY""", + (SELECT id,name,VECTOR_DISTANCE(content_vector,vector(:embedding_string,{dimension},{dtype}),COSINE) as distance + FROM LIGHTRAG_GRAPH_NODES WHERE workspace=:workspace) + WHERE distance>:better_than_threshold ORDER BY distance ASC FETCH FIRST :top_k ROWS ONLY""", "relationships": """SELECT source_name as src_id, target_name as tgt_id FROM - (SELECT id,source_name,target_name,VECTOR_DISTANCE(content_vector,vector('[{embedding_string}]',{dimension},{dtype}),COSINE) as 
distance - FROM LIGHTRAG_GRAPH_EDGES WHERE workspace='{workspace}') - WHERE distance>{better_than_threshold} ORDER BY distance ASC FETCH FIRST {top_k} ROWS ONLY""", + (SELECT id,source_name,target_name,VECTOR_DISTANCE(content_vector,vector(:embedding_string,{dimension},{dtype}),COSINE) as distance + FROM LIGHTRAG_GRAPH_EDGES WHERE workspace=:workspace) + WHERE distance>:better_than_threshold ORDER BY distance ASC FETCH FIRST :top_k ROWS ONLY""", "chunks": """SELECT id FROM - (SELECT id,VECTOR_DISTANCE(content_vector,vector('[{embedding_string}]',{dimension},{dtype}),COSINE) as distance - FROM LIGHTRAG_DOC_CHUNKS WHERE workspace='{workspace}') - WHERE distance>{better_than_threshold} ORDER BY distance ASC FETCH FIRST {top_k} ROWS ONLY""", + (SELECT id,VECTOR_DISTANCE(content_vector,vector(:embedding_string,{dimension},{dtype}),COSINE) as distance + FROM LIGHTRAG_DOC_CHUNKS WHERE workspace=:workspace) + WHERE distance>:better_than_threshold ORDER BY distance ASC FETCH FIRST :top_k ROWS ONLY""", # SQL for GraphStorage "has_node": """SELECT * FROM GRAPH_TABLE (lightrag_graph MATCH (a) - WHERE a.workspace='{workspace}' AND a.name='{node_id}' + WHERE a.workspace=:workspace AND a.name=:node_id COLUMNS (a.name))""", "has_edge": """SELECT * FROM GRAPH_TABLE (lightrag_graph MATCH (a) -[e]-> (b) - WHERE e.workspace='{workspace}' and a.workspace='{workspace}' and b.workspace='{workspace}' - AND a.name='{source_node_id}' AND b.name='{target_node_id}' + WHERE e.workspace=:workspace and a.workspace=:workspace and b.workspace=:workspace + AND a.name=:source_node_id AND b.name=:target_node_id COLUMNS (e.source_name,e.target_name) )""", "node_degree": """SELECT count(1) as degree FROM GRAPH_TABLE (lightrag_graph MATCH (a)-[e]->(b) - WHERE a.workspace='{workspace}' and a.workspace='{workspace}' and b.workspace='{workspace}' - AND a.name='{node_id}' or b.name = '{node_id}' + WHERE a.workspace=:workspace and a.workspace=:workspace and b.workspace=:workspace + AND a.name=:node_id or b.name = :node_id COLUMNS (a.name))""", "get_node": """SELECT t1.name,t2.entity_type,t2.source_chunk_id as source_id,NVL(t2.description,'') AS description FROM GRAPH_TABLE (lightrag_graph MATCH (a) - WHERE a.workspace='{workspace}' AND a.name='{node_id}' + WHERE a.workspace=:workspace AND a.name=:node_id COLUMNS (a.name) ) t1 JOIN LIGHTRAG_GRAPH_NODES t2 on t1.name=t2.name - WHERE t2.workspace='{workspace}'""", + WHERE t2.workspace=:workspace""", "get_edge": """SELECT t1.source_id,t2.weight,t2.source_chunk_id as source_id,t2.keywords, NVL(t2.description,'') AS description,NVL(t2.KEYWORDS,'') AS keywords FROM GRAPH_TABLE (lightrag_graph MATCH (a)-[e]->(b) - WHERE e.workspace='{workspace}' and a.workspace='{workspace}' and b.workspace='{workspace}' - AND a.name='{source_node_id}' and b.name = '{target_node_id}' + WHERE e.workspace=:workspace and a.workspace=:workspace and b.workspace=:workspace + AND a.name=:source_node_id and b.name = :target_node_id COLUMNS (e.id,a.name as source_id) ) t1 JOIN LIGHTRAG_GRAPH_EDGES t2 on t1.id=t2.id""", "get_node_edges": """SELECT source_name,target_name FROM GRAPH_TABLE (lightrag_graph MATCH (a)-[e]->(b) - WHERE e.workspace='{workspace}' and a.workspace='{workspace}' and b.workspace='{workspace}' - AND a.name='{source_node_id}' + WHERE e.workspace=:workspace and a.workspace=:workspace and b.workspace=:workspace + AND a.name=:source_node_id COLUMNS (a.name as source_name,b.name as target_name))""", "merge_node": """MERGE INTO LIGHTRAG_GRAPH_NODES a USING DUAL - ON (a.workspace = '{workspace}' and 
a.name='{name}' and a.source_chunk_id='{source_chunk_id}') + ON (a.workspace = :workspace and a.name=:name and a.source_chunk_id=:source_chunk_id) WHEN NOT MATCHED THEN INSERT(workspace,name,entity_type,description,source_chunk_id,content,content_vector) - values (:1,:2,:3,:4,:5,:6,:7) """, + values (:workspace,:name,:entity_type,:description,:source_chunk_id,:content,:content_vector) """, "merge_edge": """MERGE INTO LIGHTRAG_GRAPH_EDGES a USING DUAL - ON (a.workspace = '{workspace}' and a.source_name='{source_name}' and a.target_name='{target_name}' and a.source_chunk_id='{source_chunk_id}') + ON (a.workspace = :workspace and a.source_name=:source_name and a.target_name=:target_name and a.source_chunk_id=:source_chunk_id) WHEN NOT MATCHED THEN INSERT(workspace,source_name,target_name,weight,keywords,description,source_chunk_id,content,content_vector) - values (:1,:2,:3,:4,:5,:6,:7,:8,:9) """, + values (:workspace,:source_name,:target_name,:weight,:keywords,:description,:source_chunk_id,:content,:content_vector) """, + "get_all_nodes":"""SELECT t1.name as id,t1.entity_type as label,t1.DESCRIPTION,t2.content + FROM LIGHTRAG_GRAPH_NODES t1 + LEFT JOIN LIGHTRAG_DOC_CHUNKS t2 on t1.source_chunk_id=t2.id + WHERE t1.workspace=:workspace + order by t1.CREATETIME DESC + fetch first :limit rows only + """, + "get_all_edges":"""SELECT t1.id,t1.keywords as label,t1.keywords, t1.source_name as source, t1.target_name as target, + t1.weight,t1.DESCRIPTION,t2.content + FROM LIGHTRAG_GRAPH_EDGES t1 + LEFT JOIN LIGHTRAG_DOC_CHUNKS t2 on t1.source_chunk_id=t2.id + WHERE t1.workspace=:workspace + order by t1.CREATETIME DESC + fetch first :limit rows only""" } From 89d7967349fe30c1bf682935ce95d54082b7d9fe Mon Sep 17 00:00:00 2001 From: tmuife <43266626@qq.com> Date: Mon, 18 Nov 2024 13:52:49 +0800 Subject: [PATCH 09/22] use pre-commit reformat --- lightrag/kg/oracle_impl.py | 176 ++++++++++++++++++------------------- 1 file changed, 87 insertions(+), 89 deletions(-) diff --git a/lightrag/kg/oracle_impl.py b/lightrag/kg/oracle_impl.py index fd8bf536..b46d36d8 100644 --- a/lightrag/kg/oracle_impl.py +++ b/lightrag/kg/oracle_impl.py @@ -114,7 +114,9 @@ class OracleDB: logger.info("Finished check all tables in Oracle database") - async def query(self, sql: str, params: dict = None, multirows: bool = False) -> Union[dict, None]: + async def query( + self, sql: str, params: dict = None, multirows: bool = False + ) -> Union[dict, None]: async with self.pool.acquire() as connection: connection.inputtypehandler = self.input_type_handler connection.outputtypehandler = self.output_type_handler @@ -174,9 +176,9 @@ class OracleKVStorage(BaseKVStorage): async def get_by_id(self, id: str) -> Union[dict, None]: """根据 id 获取 doc_full 数据.""" SQL = SQL_TEMPLATES["get_by_id_" + self.namespace] - params = {"workspace":self.db.workspace, "id":id} + params = {"workspace": self.db.workspace, "id": id} # print("get_by_id:"+SQL) - res = await self.db.query(SQL,params) + res = await self.db.query(SQL, params) if res: data = res # {"data":res} # print (data) @@ -187,11 +189,13 @@ class OracleKVStorage(BaseKVStorage): # Query by id async def get_by_ids(self, ids: list[str], fields=None) -> Union[list[dict], None]: """根据 id 获取 doc_chunks 数据""" - SQL = SQL_TEMPLATES["get_by_ids_" + self.namespace].format(ids=",".join([f"'{id}'" for id in ids])) - params = {"workspace":self.db.workspace} - #print("get_by_ids:"+SQL) - #print(params) - res = await self.db.query(SQL,params, multirows=True) + SQL = SQL_TEMPLATES["get_by_ids_" + 
self.namespace].format( + ids=",".join([f"'{id}'" for id in ids]) + ) + params = {"workspace": self.db.workspace} + # print("get_by_ids:"+SQL) + # print(params) + res = await self.db.query(SQL, params, multirows=True) if res: data = res # [{"data":i} for i in res] # print(data) @@ -201,16 +205,17 @@ class OracleKVStorage(BaseKVStorage): async def filter_keys(self, keys: list[str]) -> set[str]: """过滤掉重复内容""" - SQL = SQL_TEMPLATES["filter_keys"].format(table_name=N_T[self.namespace], - ids=",".join([f"'{id}'" for id in keys])) - params = {"workspace":self.db.workspace} + SQL = SQL_TEMPLATES["filter_keys"].format( + table_name=N_T[self.namespace], ids=",".join([f"'{id}'" for id in keys]) + ) + params = {"workspace": self.db.workspace} try: await self.db.query(SQL, params) except Exception as e: logger.error(f"Oracle database error: {e}") print(SQL) print(params) - res = await self.db.query(SQL, params,multirows=True) + res = await self.db.query(SQL, params, multirows=True) data = None if res: exist_keys = [key["id"] for key in res] @@ -248,15 +253,16 @@ class OracleKVStorage(BaseKVStorage): # print(list_data) for item in list_data: merge_sql = SQL_TEMPLATES["merge_chunk"] - data = {"check_id":item["__id__"], - "id":item["__id__"], - "content":item["content"], - "workspace":self.db.workspace, - "tokens":item["tokens"], - "chunk_order_index":item["chunk_order_index"], - "full_doc_id":item["full_doc_id"], - "content_vector":item["__vector__"] - } + data = { + "check_id": item["__id__"], + "id": item["__id__"], + "content": item["content"], + "workspace": self.db.workspace, + "tokens": item["tokens"], + "chunk_order_index": item["chunk_order_index"], + "full_doc_id": item["full_doc_id"], + "content_vector": item["__vector__"], + } # print(merge_sql) await self.db.execute(merge_sql, data) @@ -265,11 +271,11 @@ class OracleKVStorage(BaseKVStorage): # values.clear() merge_sql = SQL_TEMPLATES["merge_doc_full"] data = { - "check_id":k, - "id":k, - "content":v["content"], - "workspace":self.db.workspace - } + "check_id": k, + "id": k, + "content": v["content"], + "workspace": self.db.workspace, + } # print(merge_sql) await self.db.execute(merge_sql, data) return left_data @@ -301,7 +307,7 @@ class OracleVectorDBStorage(BaseVectorStorage): # 转换精度 dtype = str(embedding.dtype).upper() dimension = embedding.shape[0] - embedding_string = "["+", ".join(map(str, embedding.tolist()))+"]" + embedding_string = "[" + ", ".join(map(str, embedding.tolist())) + "]" SQL = SQL_TEMPLATES[self.namespace].format(dimension=dimension, dtype=dtype) params = { @@ -309,9 +315,9 @@ class OracleVectorDBStorage(BaseVectorStorage): "workspace": self.db.workspace, "top_k": top_k, "better_than_threshold": self.cosine_better_than_threshold, - } + } # print(SQL) - results = await self.db.query(SQL,params=params, multirows=True) + results = await self.db.query(SQL, params=params, multirows=True) # print("vector search result:",results) return results @@ -346,18 +352,18 @@ class OracleGraphStorage(BaseGraphStorage): ) embeddings = np.concatenate(embeddings_list) content_vector = embeddings[0] - merge_sql = SQL_TEMPLATES["merge_node"] + merge_sql = SQL_TEMPLATES["merge_node"] data = { - "workspace":self.db.workspace, - "name":entity_name, - "entity_type":entity_type, - "description":description, - "source_chunk_id":source_id, - "content":content, - "content_vector":content_vector - } + "workspace": self.db.workspace, + "name": entity_name, + "entity_type": entity_type, + "description": description, + "source_chunk_id": source_id, + 
"content": content, + "content_vector": content_vector, + } # print(merge_sql) - await self.db.execute(merge_sql,data) + await self.db.execute(merge_sql, data) # self._graph.add_node(node_id, **node_data) async def upsert_edge( @@ -371,7 +377,9 @@ class OracleGraphStorage(BaseGraphStorage): keywords = edge_data["keywords"] description = edge_data["description"] source_chunk_id = edge_data["source_id"] - logger.debug(f"source_name:{source_name}, target_name:{target_name}, keywords: {keywords}") + logger.debug( + f"source_name:{source_name}, target_name:{target_name}, keywords: {keywords}" + ) content = keywords + source_name + target_name + description contents = [content] @@ -384,20 +392,20 @@ class OracleGraphStorage(BaseGraphStorage): ) embeddings = np.concatenate(embeddings_list) content_vector = embeddings[0] - merge_sql = SQL_TEMPLATES["merge_edge"] + merge_sql = SQL_TEMPLATES["merge_edge"] data = { - "workspace":self.db.workspace, - "source_name":source_name, - "target_name":target_name, - "weight":weight, - "keywords":keywords, - "description":description, - "source_chunk_id":source_chunk_id, - "content":content, - "content_vector":content_vector - } + "workspace": self.db.workspace, + "source_name": source_name, + "target_name": target_name, + "weight": weight, + "keywords": keywords, + "description": description, + "source_chunk_id": source_chunk_id, + "content": content, + "content_vector": content_vector, + } # print(merge_sql) - await self.db.execute(merge_sql,data) + await self.db.execute(merge_sql, data) # self._graph.add_edge(source_node_id, target_node_id, **edge_data) async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: @@ -428,13 +436,10 @@ class OracleGraphStorage(BaseGraphStorage): async def has_node(self, node_id: str) -> bool: """根据节点id检查节点是否存在""" SQL = SQL_TEMPLATES["has_node"] - params = { - "workspace":self.db.workspace, - "node_id":node_id - } + params = {"workspace": self.db.workspace, "node_id": node_id} # print(SQL) # print(self.db.workspace, node_id) - res = await self.db.query(SQL,params) + res = await self.db.query(SQL, params) if res: # print("Node exist!",res) return True @@ -446,12 +451,12 @@ class OracleGraphStorage(BaseGraphStorage): """根据源和目标节点id检查边是否存在""" SQL = SQL_TEMPLATES["has_edge"] params = { - "workspace":self.db.workspace, - "source_node_id":source_node_id, - "target_node_id":target_node_id - } + "workspace": self.db.workspace, + "source_node_id": source_node_id, + "target_node_id": target_node_id, + } # print(SQL) - res = await self.db.query(SQL,params) + res = await self.db.query(SQL, params) if res: # print("Edge exist!",res) return True @@ -462,12 +467,9 @@ class OracleGraphStorage(BaseGraphStorage): async def node_degree(self, node_id: str) -> int: """根据节点id获取节点的度""" SQL = SQL_TEMPLATES["node_degree"] - params = { - "workspace":self.db.workspace, - "node_id":node_id - } + params = {"workspace": self.db.workspace, "node_id": node_id} # print(SQL) - res = await self.db.query(SQL,params) + res = await self.db.query(SQL, params) if res: # print("Node degree",res["degree"]) return res["degree"] @@ -484,13 +486,10 @@ class OracleGraphStorage(BaseGraphStorage): async def get_node(self, node_id: str) -> Union[dict, None]: """根据节点id获取节点数据""" SQL = SQL_TEMPLATES["get_node"] - params = { - "workspace":self.db.workspace, - "node_id":node_id - } + params = {"workspace": self.db.workspace, "node_id": node_id} # print(self.db.workspace, node_id) # print(SQL) - res = await self.db.query(SQL,params) + res = await self.db.query(SQL, 
params) if res: # print("Get node!",self.db.workspace, node_id,res) return res @@ -504,11 +503,11 @@ class OracleGraphStorage(BaseGraphStorage): """根据源和目标节点id获取边""" SQL = SQL_TEMPLATES["get_edge"] params = { - "workspace":self.db.workspace, - "source_node_id":source_node_id, - "target_node_id":target_node_id - } - res = await self.db.query(SQL,params) + "workspace": self.db.workspace, + "source_node_id": source_node_id, + "target_node_id": target_node_id, + } + res = await self.db.query(SQL, params) if res: # print("Get edge!",self.db.workspace, source_node_id, target_node_id,res[0]) return res @@ -520,10 +519,7 @@ class OracleGraphStorage(BaseGraphStorage): """根据节点id获取节点的所有边""" if await self.has_node(source_node_id): SQL = SQL_TEMPLATES["get_node_edges"] - params = { - "workspace":self.db.workspace, - "source_node_id":source_node_id - } + params = {"workspace": self.db.workspace, "source_node_id": source_node_id} res = await self.db.query(sql=SQL, params=params, multirows=True) if res: data = [(i["source_name"], i["target_name"]) for i in res] @@ -532,22 +528,24 @@ class OracleGraphStorage(BaseGraphStorage): else: # print("Node Edge not exist!",self.db.workspace, source_node_id) return [] - + async def get_all_nodes(self, limit: int): """查询所有节点""" SQL = SQL_TEMPLATES["get_all_nodes"] - params = {"workspace":self.db.workspace, "limit":str(limit)} - res = await self.db.query(sql=SQL,params=params, multirows=True) + params = {"workspace": self.db.workspace, "limit": str(limit)} + res = await self.db.query(sql=SQL, params=params, multirows=True) if res: return res async def get_all_edges(self, limit: int): """查询所有边""" SQL = SQL_TEMPLATES["get_all_edges"] - params = {"workspace":self.db.workspace, "limit":str(limit)} - res = await self.db.query(sql=SQL,params=params, multirows=True) + params = {"workspace": self.db.workspace, "limit": str(limit)} + res = await self.db.query(sql=SQL, params=params, multirows=True) if res: return res + + N_T = { "full_docs": "LIGHTRAG_DOC_FULL", "text_chunks": "LIGHTRAG_DOC_CHUNKS", @@ -719,18 +717,18 @@ SQL_TEMPLATES = { WHEN NOT MATCHED THEN INSERT(workspace,source_name,target_name,weight,keywords,description,source_chunk_id,content,content_vector) values (:workspace,:source_name,:target_name,:weight,:keywords,:description,:source_chunk_id,:content,:content_vector) """, - "get_all_nodes":"""SELECT t1.name as id,t1.entity_type as label,t1.DESCRIPTION,t2.content + "get_all_nodes": """SELECT t1.name as id,t1.entity_type as label,t1.DESCRIPTION,t2.content FROM LIGHTRAG_GRAPH_NODES t1 LEFT JOIN LIGHTRAG_DOC_CHUNKS t2 on t1.source_chunk_id=t2.id WHERE t1.workspace=:workspace order by t1.CREATETIME DESC fetch first :limit rows only """, - "get_all_edges":"""SELECT t1.id,t1.keywords as label,t1.keywords, t1.source_name as source, t1.target_name as target, + "get_all_edges": """SELECT t1.id,t1.keywords as label,t1.keywords, t1.source_name as source, t1.target_name as target, t1.weight,t1.DESCRIPTION,t2.content FROM LIGHTRAG_GRAPH_EDGES t1 LEFT JOIN LIGHTRAG_DOC_CHUNKS t2 on t1.source_chunk_id=t2.id WHERE t1.workspace=:workspace order by t1.CREATETIME DESC - fetch first :limit rows only""" + fetch first :limit rows only""", } From e203aad3de6fe36c61a6cba73911e7df8311feff Mon Sep 17 00:00:00 2001 From: WinstonCHEN1 <1281838223@qq.com> Date: Mon, 18 Nov 2024 14:24:04 -0800 Subject: [PATCH 10/22] fix:error working directory name in Step_1.py --- reproduce/Step_1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reproduce/Step_1.py b/reproduce/Step_1.py 
index 43c44056..e318c145 100644 --- a/reproduce/Step_1.py +++ b/reproduce/Step_1.py @@ -24,7 +24,7 @@ def insert_text(rag, file_path): cls = "agriculture" -WORKING_DIR = "../{cls}" +WORKING_DIR = f"../{cls}" if not os.path.exists(WORKING_DIR): os.mkdir(WORKING_DIR) From c9becdf5f40e6f3b291b9a3d50a9624d677e5d65 Mon Sep 17 00:00:00 2001 From: luoyifan <1625370020@qq.com> Date: Tue, 19 Nov 2024 14:02:38 +0800 Subject: [PATCH 11/22] A more robust approach for result to json. --- lightrag/operate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index b11e14fe..eb600c4b 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -418,7 +418,7 @@ async def local_query( .replace("model", "") .strip() ) - result = "{" + result.split("{")[1].split("}")[0] + "}" + result = "{" + result.split("{")[-1].split("}")[0] + "}" keywords_data = json.loads(result) keywords = keywords_data.get("low_level_keywords", []) @@ -691,7 +691,7 @@ async def global_query( .replace("model", "") .strip() ) - result = "{" + result.split("{")[1].split("}")[0] + "}" + result = "{" + result.split("{")[-1].split("}")[0] + "}" keywords_data = json.loads(result) keywords = keywords_data.get("high_level_keywords", []) @@ -940,7 +940,7 @@ async def hybrid_query( .replace("model", "") .strip() ) - result = "{" + result.split("{")[1].split("}")[0] + "}" + result = "{" + result.split("{")[-1].split("}")[0] + "}" keywords_data = json.loads(result) hl_keywords = keywords_data.get("high_level_keywords", []) ll_keywords = keywords_data.get("low_level_keywords", []) From bcaaaad9598cf89b248af84d228b68a7220d1096 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 19 Nov 2024 16:52:26 +0800 Subject: [PATCH 12/22] Update --- README.md | 8 ++++++-- lightrag/kg/neo4j_impl.py | 2 +- lightrag/lightrag.py | 3 +-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b62f01a1..d7e3d418 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,9 @@ + +

@@ -35,8 +37,10 @@ This repository hosts the code of LightRAG. The structure of this code is based ## Algorithm Flowchart -![LightRAG_Self excalidraw](https://github.com/user-attachments/assets/aa5c4892-2e44-49e6-a116-2403ed80a1a3) - +![LightRAG Indexing Flowchart](https://learnopencv.com/wp-content/uploads/2024/11/LightRAG-VectorDB-Json-KV-Store-Indexing-Flowchart-scaled.jpg) +*Figure 1: LightRAG Indexing Flowchart* +![LightRAG Retrieval and Querying Flowchart](https://learnopencv.com/wp-content/uploads/2024/11/LightRAG-Querying-Flowchart-Dual-Level-Retrieval-Generation-Knowledge-Graphs-scaled.jpg) +*Figure 2: LightRAG Retrieval and Querying Flowchart* ## Install diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 32bfbe2e..f9fcb46d 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -214,7 +214,7 @@ class Neo4JStorage(BaseGraphStorage): neo4jExceptions.ServiceUnavailable, neo4jExceptions.TransientError, neo4jExceptions.WriteServiceUnavailable, - neo4jExceptions.ClientError + neo4jExceptions.ClientError, ) ), ) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index ce27e76d..7fafadcf 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -173,8 +173,7 @@ class LightRAG: embedding_func=self.embedding_func, ) self.chunk_entity_relation_graph = self.graph_storage_cls( - namespace="chunk_entity_relation", - global_config=asdict(self) + namespace="chunk_entity_relation", global_config=asdict(self) ) #### # add embedding func by walter over From 9d871fbc71aff5c270b255ae742e3f66578c1789 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 19 Nov 2024 16:54:14 +0800 Subject: [PATCH 13/22] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d7e3d418..36e1b2a9 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,6 @@ - -

@@ -18,6 +16,10 @@
This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). ![LightRAG Diagram](https://i-blog.csdnimg.cn/direct/b2aaf634151b4706892693ffb43d9093.png) From a50ed0b164ddb2679f2d37a17f35fe1c3fa959d5 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:57:20 +0800 Subject: [PATCH 14/22] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 36e1b2a9..21bc1e7a 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News +- [x] [2024.11.19]🎯📢We have added a detailed blog link introducing LightRAG on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author! - [x] [2024.11.12]🎯📢You can [use Oracle Database 23ai for all storage types (kv/vector/graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py) now. - [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity). - [x] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge. From 2a3d92b51502794c4e10a76ddf31e848ac6827a9 Mon Sep 17 00:00:00 2001 From: Magic_yuan <72277650+magicyuan876@users.noreply.github.com> Date: Thu, 21 Nov 2024 10:37:09 +0800 Subject: [PATCH 15/22] =?UTF-8?q?=E4=BD=BF=E7=94=A8AzureOpenAI=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=EF=BC=8C=E6=94=AF=E6=8C=81RPM/TPM=E9=99=90=E5=88=B6?= =?UTF-8?q?=E3=80=82=E4=BF=AE=E5=A4=8D=E5=8E=9F=E5=85=88429=E5=93=8D?= =?UTF-8?q?=E5=BA=94=E5=8D=B3=E6=8A=9B=E5=87=BA=E5=BC=82=E5=B8=B8=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/lightrag_azure_openai_demo.py | 63 +++++++++++--------------- 1 file changed, 27 insertions(+), 36 deletions(-) diff --git a/examples/lightrag_azure_openai_demo.py b/examples/lightrag_azure_openai_demo.py index e29a6a9d..98d7c0e0 100644 --- a/examples/lightrag_azure_openai_demo.py +++ b/examples/lightrag_azure_openai_demo.py @@ -6,6 +6,7 @@ import numpy as np from dotenv import load_dotenv import aiohttp import logging +from openai import AzureOpenAI logging.basicConfig(level=logging.INFO) @@ -32,11 +33,12 @@ os.mkdir(WORKING_DIR) async def llm_model_func( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: - headers = { - "Content-Type": "application/json", - "api-key": AZURE_OPENAI_API_KEY, - } - endpoint = f"{AZURE_OPENAI_ENDPOINT}openai/deployments/{AZURE_OPENAI_DEPLOYMENT}/chat/completions?api-version={AZURE_OPENAI_API_VERSION}" + + client = AzureOpenAI( + api_key=AZURE_OPENAI_API_KEY, + api_version=AZURE_OPENAI_API_VERSION, + azure_endpoint=AZURE_OPENAI_ENDPOINT + ) messages = [] if system_prompt: @@ -45,41 +47,30 @@ async def llm_model_func( messages.extend(history_messages) messages.append({"role": "user", "content": prompt}) - payload = { - "messages": messages, - "temperature": kwargs.get("temperature", 0), - "top_p": kwargs.get("top_p", 1), - "n": kwargs.get("n", 1), - } - - async with aiohttp.ClientSession() as session: - async with session.post(endpoint, headers=headers, json=payload) as response: - if response.status != 200: - raise ValueError( - f"Request failed with status {response.status}: {await response.text()}" - ) - result = await 
response.json() - return result["choices"][0]["message"]["content"] + chat_completion = client.chat.completions.create( + model=AZURE_OPENAI_DEPLOYMENT, # model = "deployment_name". + messages=messages, + temperature=kwargs.get("temperature", 0), + top_p=kwargs.get("top_p", 1), + n=kwargs.get("n", 1), + ) + return chat_completion.choices[0].message.content async def embedding_func(texts: list[str]) -> np.ndarray: - headers = { - "Content-Type": "application/json", - "api-key": AZURE_OPENAI_API_KEY, - } - endpoint = f"{AZURE_OPENAI_ENDPOINT}openai/deployments/{AZURE_EMBEDDING_DEPLOYMENT}/embeddings?api-version={AZURE_EMBEDDING_API_VERSION}" - payload = {"input": texts} - - async with aiohttp.ClientSession() as session: - async with session.post(endpoint, headers=headers, json=payload) as response: - if response.status != 200: - raise ValueError( - f"Request failed with status {response.status}: {await response.text()}" - ) - result = await response.json() - embeddings = [item["embedding"] for item in result["data"]] - return np.array(embeddings) + client = AzureOpenAI( + api_key=AZURE_OPENAI_API_KEY, + api_version=AZURE_EMBEDDING_API_VERSION, + azure_endpoint=AZURE_OPENAI_ENDPOINT + ) + embedding = client.embeddings.create( + model=AZURE_EMBEDDING_DEPLOYMENT, + input=texts + ) + + embeddings = [item.embedding for item in embedding.data] + return np.array(embeddings) async def test_funcs(): From e0488bc9fca8cb917430c8e2adc5a07e49de45f0 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Thu, 21 Nov 2024 10:56:15 +0800 Subject: [PATCH 16/22] Update Discord Link --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 21bc1e7a..ecfb0ebc 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@


@@ -34,7 +34,7 @@ This repository hosts the code of LightRAG. The structure of this code is based - [x] [2024.10.29]🎯📢LightRAG now supports multiple file types, including PDF, DOC, PPT, and CSV via `textract`. - [x] [2024.10.20]🎯📢We’ve added a new feature to LightRAG: Graph Visualization. - [x] [2024.10.18]🎯📢We’ve added a link to a [LightRAG Introduction Video](https://youtu.be/oageL-1I0GE). Thanks to the author! -- [x] [2024.10.17]🎯📢We have created a [Discord channel](https://discord.gg/E4HgTnck)! Welcome to join for sharing and discussions! 🎉🎉 +- [x] [2024.10.17]🎯📢We have created a [Discord channel](https://discord.gg/yF2MmDJyGJ)! Welcome to join for sharing and discussions! 🎉🎉 - [x] [2024.10.16]🎯📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! - [x] [2024.10.15]🎯📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! From 116e1d20a9142c673bf70e15d430652a4a75be57 Mon Sep 17 00:00:00 2001 From: zzzcccxx <865637742@qq.com> Date: Thu, 21 Nov 2024 14:35:18 +0800 Subject: [PATCH 17/22] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86relation=5Fcou?= =?UTF-8?q?nts=E8=AE=A1=E6=95=B0=E8=A7=84=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lightrag/operate.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index eb600c4b..cf236633 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -578,24 +578,20 @@ async def _find_most_related_text_unit_from_entities( all_text_units_lookup = {} for index, (this_text_units, this_edges) in enumerate(zip(text_units, edges)): for c_id in this_text_units: - if c_id in all_text_units_lookup: - continue - relation_counts = 0 - if this_edges: # Add check for None edges + if c_id not in all_text_units_lookup: + all_text_units_lookup[c_id] = { + "data": await text_chunks_db.get_by_id(c_id), + "order": index, + "relation_counts": 0, + } + + if this_edges: for e in this_edges: if ( e[1] in all_one_hop_text_units_lookup and c_id in all_one_hop_text_units_lookup[e[1]] ): - relation_counts += 1 - - chunk_data = await text_chunks_db.get_by_id(c_id) - if chunk_data is not None and "content" in chunk_data: # Add content check - all_text_units_lookup[c_id] = { - "data": chunk_data, - "order": index, - "relation_counts": relation_counts, - } + all_text_units_lookup[c_id]["relation_counts"] += 1 # Filter out None values and ensure data has content all_text_units = [ From 717afe4d59d7c7addfe5985c002268e492995f24 Mon Sep 17 00:00:00 2001 From: Magic_yuan <72277650+magicyuan876@users.noreply.github.com> Date: Thu, 21 Nov 2024 18:44:38 +0800 Subject: [PATCH 18/22] Update lightrag_azure_openai_demo.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 更新代码格式 --- examples/lightrag_azure_openai_demo.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/examples/lightrag_azure_openai_demo.py b/examples/lightrag_azure_openai_demo.py index 98d7c0e0..4e134830 100644 --- a/examples/lightrag_azure_openai_demo.py +++ b/examples/lightrag_azure_openai_demo.py @@ -31,13 +31,12 @@ os.mkdir(WORKING_DIR) async def llm_model_func( - prompt, system_prompt=None, history_messages=[], **kwargs + prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: - client = AzureOpenAI( - api_key=AZURE_OPENAI_API_KEY, - api_version=AZURE_OPENAI_API_VERSION, - 
azure_endpoint=AZURE_OPENAI_ENDPOINT + api_key=LLM_AZURE_OPENAI_KEY, + api_version=LLM_AZURE_OPENAI_VERSION, + azure_endpoint=LLM_AZURE_OPENAI_API ) messages = [] @@ -48,7 +47,7 @@ async def llm_model_func( messages.append({"role": "user", "content": prompt}) chat_completion = client.chat.completions.create( - model=AZURE_OPENAI_DEPLOYMENT, # model = "deployment_name". + model=LLM_AZURE_OPENAI_DEPLOYMENT, # model = "deployment_name". messages=messages, temperature=kwargs.get("temperature", 0), top_p=kwargs.get("top_p", 1), @@ -58,7 +57,6 @@ async def llm_model_func( async def embedding_func(texts: list[str]) -> np.ndarray: - client = AzureOpenAI( api_key=AZURE_OPENAI_API_KEY, api_version=AZURE_EMBEDDING_API_VERSION, @@ -68,7 +66,7 @@ async def embedding_func(texts: list[str]) -> np.ndarray: model=AZURE_EMBEDDING_DEPLOYMENT, input=texts ) - + embeddings = [item.embedding for item in embedding.data] return np.array(embeddings) From 143f82d820b268ab8816ce170013373af0ea4be8 Mon Sep 17 00:00:00 2001 From: Magic_yuan <72277650+magicyuan876@users.noreply.github.com> Date: Thu, 21 Nov 2024 18:49:29 +0800 Subject: [PATCH 19/22] Update lightrag_azure_openai_demo.py --- examples/lightrag_azure_openai_demo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/lightrag_azure_openai_demo.py b/examples/lightrag_azure_openai_demo.py index 4e134830..7073e3cb 100644 --- a/examples/lightrag_azure_openai_demo.py +++ b/examples/lightrag_azure_openai_demo.py @@ -34,9 +34,9 @@ async def llm_model_func( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: client = AzureOpenAI( - api_key=LLM_AZURE_OPENAI_KEY, - api_version=LLM_AZURE_OPENAI_VERSION, - azure_endpoint=LLM_AZURE_OPENAI_API + api_key=AZURE_OPENAI_API_KEY, + api_version=AZURE_OPENAI_API_VERSION, + azure_endpoint=AZURE_OPENAI_DEPLOYMENT ) messages = [] From f51b5c68f59ad9911ad680e33f9fcd81de170522 Mon Sep 17 00:00:00 2001 From: Magic_yuan <72277650+magicyuan876@users.noreply.github.com> Date: Thu, 21 Nov 2024 11:11:23 +0000 Subject: [PATCH 20/22] =?UTF-8?q?=E6=9B=B4=E6=96=B0format?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/lightrag_azure_openai_demo.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/examples/lightrag_azure_openai_demo.py b/examples/lightrag_azure_openai_demo.py index 7073e3cb..6a3fafc1 100644 --- a/examples/lightrag_azure_openai_demo.py +++ b/examples/lightrag_azure_openai_demo.py @@ -4,7 +4,6 @@ from lightrag import LightRAG, QueryParam from lightrag.utils import EmbeddingFunc import numpy as np from dotenv import load_dotenv -import aiohttp import logging from openai import AzureOpenAI @@ -31,12 +30,12 @@ os.mkdir(WORKING_DIR) async def llm_model_func( - prompt, system_prompt=None, history_messages=[], **kwargs + prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: client = AzureOpenAI( api_key=AZURE_OPENAI_API_KEY, api_version=AZURE_OPENAI_API_VERSION, - azure_endpoint=AZURE_OPENAI_DEPLOYMENT + azure_endpoint=AZURE_OPENAI_ENDPOINT, ) messages = [] @@ -47,7 +46,7 @@ async def llm_model_func( messages.append({"role": "user", "content": prompt}) chat_completion = client.chat.completions.create( - model=LLM_AZURE_OPENAI_DEPLOYMENT, # model = "deployment_name". + model=AZURE_OPENAI_DEPLOYMENT, # model = "deployment_name". 
messages=messages, temperature=kwargs.get("temperature", 0), top_p=kwargs.get("top_p", 1), @@ -60,12 +59,9 @@ async def embedding_func(texts: list[str]) -> np.ndarray: client = AzureOpenAI( api_key=AZURE_OPENAI_API_KEY, api_version=AZURE_EMBEDDING_API_VERSION, - azure_endpoint=AZURE_OPENAI_ENDPOINT - ) - embedding = client.embeddings.create( - model=AZURE_EMBEDDING_DEPLOYMENT, - input=texts + azure_endpoint=AZURE_OPENAI_ENDPOINT, ) + embedding = client.embeddings.create(model=AZURE_EMBEDDING_DEPLOYMENT, input=texts) embeddings = [item.embedding for item in embedding.data] return np.array(embeddings) From e6f380fc7a008fc2da1fc3246680425522fc723d Mon Sep 17 00:00:00 2001 From: lzd <2428183599@qq.com> Date: Fri, 22 Nov 2024 13:59:14 +0800 Subject: [PATCH 21/22] =?UTF-8?q?=E5=88=A0=E9=99=A4Neo4JStorage.has=5Fedge?= =?UTF-8?q?=E4=B8=AD=E5=AE=9A=E4=B9=89=E7=9A=84=E6=B2=A1=E6=9C=89=E7=94=A8?= =?UTF-8?q?=E5=88=B0=E7=9A=84=E5=90=8C=E6=AD=A5close=E5=87=BD=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lightrag/kg/neo4j_impl.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index f9fcb46d..84efae81 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -86,9 +86,6 @@ class Neo4JStorage(BaseGraphStorage): ) return single_result["edgeExists"] - def close(self): - self._driver.close() - async def get_node(self, node_id: str) -> Union[dict, None]: async with self._driver.session() as session: entity_name_label = node_id.strip('"') From 8161bd19f958b5c8ceba728b0ed14205e8d3d26a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=9C=A8Data=20Intelligence=20Lab=40HKU=E2=9C=A8?= <118165258+HKUDS@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:39:36 +0800 Subject: [PATCH 22/22] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ecfb0ebc..6d5af135 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,8 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News -- [x] [2024.11.19]🎯📢We have added a detailed blog link introducing LightRAG on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author! -- [x] [2024.11.12]🎯📢You can [use Oracle Database 23ai for all storage types (kv/vector/graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py) now. +- [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author! +- [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py). - [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity). - [x] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge. - [x] [2024.11.04]🎯📢You can now [use Neo4J for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-neo4j-for-storage).
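A note on the Oracle storage rework (PATCH 08 and PATCH 09): queries move from str.format() string interpolation to Oracle bind variables, i.e. the SQL text keeps :name placeholders and the values are passed separately as a dict. That sidesteps quoting and SQL-injection problems and lets the database reuse the parsed statement. A minimal sketch of the pattern with python-oracledb, assuming a connection pool created as in the OracleDB class (get_doc_by_id is a hypothetical helper):

    async def get_doc_by_id(pool, workspace: str, doc_id: str):
        # Bind values travel separately from the SQL text and are never
        # spliced into the statement string.
        sql = (
            "select ID, NVL(content, '') as content "
            "from LIGHTRAG_DOC_FULL where workspace = :workspace and ID = :id"
        )
        params = {"workspace": workspace, "id": doc_id}
        async with pool.acquire() as connection:
            with connection.cursor() as cursor:
                await cursor.execute(sql, params)
                return await cursor.fetchone()

One limitation visible in the patch: IN (...) lists are still assembled with .format(), since a single bind variable carries exactly one scalar value.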
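Likewise, the Azure demo changes (PATCH 15 and PATCHES 18-20) settle on the official AzureOpenAI client in place of hand-built aiohttp requests; the SDK's built-in retry handling is what keeps a single 429 (rate-limit) response from raising immediately, which is the RPM/TPM behavior the commit message describes. A condensed sketch of the final call pattern, with placeholder configuration values (the demo reads them from environment variables):

    import numpy as np
    from openai import AzureOpenAI

    # Placeholders -- substitute your own Azure resource values.
    AZURE_OPENAI_API_KEY = "<key>"
    AZURE_OPENAI_API_VERSION = "<api-version>"
    AZURE_OPENAI_ENDPOINT = "https://<resource>.openai.azure.com/"
    AZURE_OPENAI_DEPLOYMENT = "<chat-deployment>"
    AZURE_EMBEDDING_DEPLOYMENT = "<embedding-deployment>"

    def chat(prompt, system_prompt=None):
        client = AzureOpenAI(
            api_key=AZURE_OPENAI_API_KEY,
            api_version=AZURE_OPENAI_API_VERSION,
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
        )
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})
        completion = client.chat.completions.create(
            model=AZURE_OPENAI_DEPLOYMENT,  # on Azure, "model" is the deployment name
            messages=messages,
            temperature=0,
        )
        return completion.choices[0].message.content

    def embed(texts):
        client = AzureOpenAI(
            api_key=AZURE_OPENAI_API_KEY,
            api_version=AZURE_OPENAI_API_VERSION,
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
        )
        resp = client.embeddings.create(model=AZURE_EMBEDDING_DEPLOYMENT, input=texts)
        return np.array([item.embedding for item in resp.data])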