diff --git a/.gitignore b/.gitignore index 5a41ae32..5eabc386 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ dickens/ book.txt lightrag-dev/ .idea/ -dist/ \ No newline at end of file +dist/ +env/ \ No newline at end of file diff --git a/lightrag/kg/__init__.py b/lightrag/kg/__init__.py index db81e005..ddd2bb79 100644 --- a/lightrag/kg/__init__.py +++ b/lightrag/kg/__init__.py @@ -1,5 +1,27 @@ -from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam +print ("init package vars here. ......") +from .neo4j import GraphStorage as Neo4JStorage + + +# import sys +# import importlib +# # Specify the path to the directory containing the module +# # Add the directory to the system path +# module_dir = '/Users/kenwiltshire/documents/dev/LightRag/lightrag/kg' +# sys.path.append(module_dir) +# # Specify the module name +# module_name = 'neo4j' +# # Import the module +# spec = importlib.util.spec_from_file_location(module_name, f'{module_dir}/{module_name}.py') + +# Neo4JStorage = importlib.util.module_from_spec(spec) +# spec.loader.exec_module(Neo4JStorage) + + + +# Relative imports are still possible by adding a leading period to the module name when using the from ... import form: + +# # Import names from pkg.string +# from .string import name1, name2 +# # Import pkg.string +# from . import string -__version__ = "0.0.7" -__author__ = "Zirui Guo" -__url__ = "https://github.com/HKUDS/LightRAG" diff --git a/lightrag/kg/neo4j.py b/lightrag/kg/neo4j.py index 5ec5b0cc..7205d9dc 100644 --- a/lightrag/kg/neo4j.py +++ b/lightrag/kg/neo4j.py @@ -3,11 +3,15 @@ import html import os from dataclasses import dataclass from typing import Any, Union, cast -import networkx as nx import numpy as np from nano_vectordb import NanoVectorDB -from .utils import load_json, logger, write_json + + +# import package.common.utils as utils + + +from lightrag.utils import load_json, logger, write_json from ..base import ( BaseGraphStorage ) @@ -22,10 +26,10 @@ PASSWORD = "your_password" @dataclass class GraphStorage(BaseGraphStorage): @staticmethod - def load_nx_graph(file_name) -> nx.Graph: - if os.path.exists(file_name): - return nx.read_graphml(file_name) - return None + # def load_nx_graph(file_name) -> nx.Graph: + # if os.path.exists(file_name): + # return nx.read_graphml(file_name) + # return None def __post_init__(self): # self._graph = preloaded_graph or nx.Graph() @@ -102,7 +106,7 @@ class GraphStorage(BaseGraphStorage): result = session.run( """MATCH (n1:{node_label1})-[r]-(n2:{node_label2}) RETURN count(r) AS degree""" - .format(node_label1=node_label1, node_label2=node_label2) + .format(entity_name__label_source=entity_name__label_source, entity_name_label_target=entity_name_label_target) ) record = result.single() return record["degree"] @@ -263,7 +267,7 @@ class GraphStorage(BaseGraphStorage): with self._driver.session() as session: #Define the Cypher query options = self.global_config["node2vec_params"] - query = f"""CALL gds.node2vec.stream('myGraph', {**options}) + query = f"""CALL gds.node2vec.stream('myGraph', {options}) # **options YIELD nodeId, embedding RETURN nodeId, embedding""" # Run the query and process the results diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 5137af42..28a4af19 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1,5 +1,6 @@ import asyncio import os +import importlib from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial @@ -23,6 +24,11 @@ from .storage import ( NanoVectorDBStorage, NetworkXStorage, ) + +from .kg.neo4j import ( + GraphStorage as Neo4JStorage +) + from .utils import ( EmbeddingFunc, compute_mdhash_id, @@ -93,7 +99,14 @@ class LightRAG: key_string_value_json_storage_cls: Type[BaseKVStorage] = JsonKVStorage vector_db_storage_cls: Type[BaseVectorStorage] = NanoVectorDBStorage vector_db_storage_cls_kwargs: dict = field(default_factory=dict) - graph_storage_cls: Type[BaseGraphStorage] = NetworkXStorage + + # module = importlib.import_module('kg.neo4j') + # Neo4JStorage = getattr(module, 'GraphStorage') + + if True==False: + graph_storage_cls: Type[BaseGraphStorage] = Neo4JStorage + else: + graph_storage_cls: Type[BaseGraphStorage] = NetworkXStorage enable_llm_cache: bool = True # extension diff --git a/lightrag/llm.py b/lightrag/llm.py index 4dcf535c..208b22a5 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -72,7 +72,9 @@ async def openai_complete_if_cache( @retry( stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=4, max=10), + #kw_ + wait=wait_exponential(multiplier=1, min=4, max=60), + # wait=wait_exponential(multiplier=1, min=4, max=10), retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), ) async def azure_openai_complete_if_cache(model, diff --git a/lightrag/storage.py b/lightrag/storage.py index 85ba2aaa..1f22fc56 100644 --- a/lightrag/storage.py +++ b/lightrag/storage.py @@ -6,8 +6,6 @@ from typing import Any, Union, cast import networkx as nx import numpy as np from nano_vectordb import NanoVectorDB -from kg.neo4j import GraphStorage - from .utils import load_json, logger, write_json from .base import ( @@ -99,66 +97,14 @@ class NanoVectorDBStorage(BaseVectorStorage): d["__vector__"] = embeddings[i] results = self._client.upsert(datas=list_data) return results - - -@dataclass -class PineConeVectorDBStorage(BaseVectorStorage): - cosine_better_than_threshold: float = 0.2 - - def __post_init__(self): - self._client_file_name = os.path.join( - self.global_config["working_dir"], f"vdb_{self.namespace}.json" - ) - self._max_batch_size = self.global_config["embedding_batch_num"] - self._client = NanoVectorDB( - self.embedding_func.embedding_dim, storage_file=self._client_file_name - ) - import os - from pinecone import Pinecone - - pc = Pinecone() #api_key=os.environ.get('PINECONE_API_KEY')) - # From here on, everything is identical to the REST-based SDK. - self._client = pc.Index(host=self._client_pinecone_host)#'my-index-8833ca1.svc.us-east1-gcp.pinecone.io') - - self.cosine_better_than_threshold = self.global_config.get( - "cosine_better_than_threshold", self.cosine_better_than_threshold - ) - - async def upsert(self, data: dict[str, dict]): - logger.info(f"Inserting {len(data)} vectors to {self.namespace}") - if not len(data): - logger.warning("You insert an empty data to vector DB") - return [] - list_data = [ - { - "__id__": k, - **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields}, - } - for k, v in data.items() - ] - contents = [v["content"] for v in data.values()] - batches = [ - contents[i : i + self._max_batch_size] - for i in range(0, len(contents), self._max_batch_size) - ] - embeddings_list = await asyncio.gather( - *[self.embedding_func(batch) for batch in batches] - ) - embeddings = np.concatenate(embeddings_list) - for i, d in enumerate(list_data): - d["__vector__"] = embeddings[i] - # self._client.upsert(vectors=[]) pinecone - results = self._client.upsert(datas=list_data) - return results async def query(self, query: str, top_k=5): embedding = await self.embedding_func([query]) embedding = embedding[0] - # self._client.query(vector=[...], top_key=10) pinecone results = self._client.query( - vector=embedding, + query=embedding, top_k=top_k, - better_than_threshold=self.cosine_better_than_threshold, ??? + better_than_threshold=self.cosine_better_than_threshold, ) results = [ {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results @@ -166,8 +112,7 @@ class PineConeVectorDBStorage(BaseVectorStorage): return results async def index_done_callback(self): - print("self._client.save()") - # self._client.save() + self._client.save() @dataclass @@ -298,5 +243,3 @@ class NetworkXStorage(BaseGraphStorage): nodes_ids = [self._graph.nodes[node_id]["id"] for node_id in nodes] return embeddings, nodes_ids - - diff --git a/requirements.txt b/requirements.txt index 5b3396fb..897c53f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ torch transformers xxhash pyvis -aiohttp \ No newline at end of file +aiohttp +neo4j diff --git a/test.py b/test.py new file mode 100644 index 00000000..4167222b --- /dev/null +++ b/test.py @@ -0,0 +1,36 @@ +import os +from lightrag import LightRAG, QueryParam +from lightrag.llm import gpt_4o_mini_complete, gpt_4o_complete + +######### +# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert() +# import nest_asyncio +# nest_asyncio.apply() +######### + +WORKING_DIR = "./dickens" + + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=gpt_4o_mini_complete # Use gpt_4o_mini_complete LLM model + # llm_model_func=gpt_4o_complete # Optionally, use a stronger model +) + +with open("./book.txt") as f: + rag.insert(f.read()) + +# Perform naive search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) + +# Perform local search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) + +# Perform global search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) + +# Perform hybrid search +print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) \ No newline at end of file