From 465c9a13d984ab8c9bdf4cc20c5e628f285e09e8 Mon Sep 17 00:00:00 2001 From: Ken Wiltshire Date: Fri, 1 Nov 2024 16:29:36 -0400 Subject: [PATCH] cleaning code for pull --- Dockerfile | 56 +++++++++++++++++++++++++++++ README.md | 5 ++- lightrag/kg/neo4j_impl.py | 75 ++------------------------------------- test.py | 22 +----------- test_neo4j.py | 2 +- 5 files changed, 64 insertions(+), 96 deletions(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..1b60c089 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,56 @@ +FROM debian:bullseye-slim +ENV JAVA_HOME=/opt/java/openjdk +COPY --from=eclipse-temurin:17 $JAVA_HOME $JAVA_HOME +ENV PATH="${JAVA_HOME}/bin:${PATH}" \ + NEO4J_SHA256=7ce97bd9a4348af14df442f00b3dc5085b5983d6f03da643744838c7a1bc8ba7 \ + NEO4J_TARBALL=neo4j-enterprise-5.24.2-unix.tar.gz \ + NEO4J_EDITION=enterprise \ + NEO4J_HOME="/var/lib/neo4j" \ + LANG=C.UTF-8 +ARG NEO4J_URI=https://dist.neo4j.org/neo4j-enterprise-5.24.2-unix.tar.gz + +RUN addgroup --gid 7474 --system neo4j && adduser --uid 7474 --system --no-create-home --home "${NEO4J_HOME}" --ingroup neo4j neo4j + +COPY ./local-package/* /startup/ + +RUN apt update \ + && apt-get install -y curl gcc git jq make procps tini wget \ + && curl --fail --silent --show-error --location --remote-name ${NEO4J_URI} \ + && echo "${NEO4J_SHA256} ${NEO4J_TARBALL}" | sha256sum -c --strict --quiet \ + && tar --extract --file ${NEO4J_TARBALL} --directory /var/lib \ + && mv /var/lib/neo4j-* "${NEO4J_HOME}" \ + && rm ${NEO4J_TARBALL} \ + && sed -i 's/Package Type:.*/Package Type: docker bullseye/' $NEO4J_HOME/packaging_info \ + && mv /startup/neo4j-admin-report.sh "${NEO4J_HOME}"/bin/neo4j-admin-report \ + && mv "${NEO4J_HOME}"/data /data \ + && mv "${NEO4J_HOME}"/logs /logs \ + && chown -R neo4j:neo4j /data \ + && chmod -R 777 /data \ + && chown -R neo4j:neo4j /logs \ + && chmod -R 777 /logs \ + && chown -R neo4j:neo4j "${NEO4J_HOME}" \ + && chmod -R 777 "${NEO4J_HOME}" \ 
+ && chmod -R 755 "${NEO4J_HOME}/bin" \ + && ln -s /data "${NEO4J_HOME}"/data \ + && ln -s /logs "${NEO4J_HOME}"/logs \ + && git clone https://github.com/ncopa/su-exec.git \ + && cd su-exec \ + && git checkout 4c3bb42b093f14da70d8ab924b487ccfbb1397af \ + && echo d6c40440609a23483f12eb6295b5191e94baf08298a856bab6e15b10c3b82891 su-exec.c | sha256sum -c \ + && echo 2a87af245eb125aca9305a0b1025525ac80825590800f047419dc57bba36b334 Makefile | sha256sum -c \ + && make \ + && mv /su-exec/su-exec /usr/bin/su-exec \ + && apt-get -y purge --auto-remove curl gcc git make \ + && rm -rf /var/lib/apt/lists/* /su-exec + + +ENV PATH "${NEO4J_HOME}"/bin:$PATH + +WORKDIR "${NEO4J_HOME}" + +VOLUME /data /logs + +EXPOSE 7474 7473 7687 + +ENTRYPOINT ["tini", "-g", "--", "/startup/docker-entrypoint.sh"] +CMD ["neo4j"] \ No newline at end of file diff --git a/README.md b/README.md index cc54fcd0..33abb13b 100644 --- a/README.md +++ b/README.md @@ -160,7 +160,10 @@ rag = LightRAG( Using Neo4J for Storage * For production level scenarios you will most likely want to leverage an enterprise solution -for KG storage. +for KG storage. Running Neo4J in Docker is recommended for seamless local testing. 
+* See: https://hub.docker.com/_/neo4j + + ```python export NEO4J_URI="neo4j://localhost:7687" export NEO4J_USERNAME="neo4j" diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index a7c39fec..9ad725b7 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -74,9 +74,6 @@ class GraphStorage(BaseGraphStorage): ) result = tx.run(query) single_result = result.single() - # if result.single() == None: - # print (f"this should not happen: ---- {label1}/{label2} {query}") - logger.debug( f'{inspect.currentframe().f_code.co_name}:query:{query}:result:{single_result["edgeExists"]}' ) @@ -84,7 +81,7 @@ class GraphStorage(BaseGraphStorage): return single_result["edgeExists"] def close(self): self._driver.close() - #hard code relaitionship type + #hard code relationship type, directed. with self._driver.session() as session: result = session.read_transaction(_check_edge_existence, entity_name_label_source, entity_name_label_target) return result @@ -111,7 +108,6 @@ class GraphStorage(BaseGraphStorage): def _find_node_degree(session, label): with session.begin_transaction() as tx: - # query = "MATCH (n:`{label}`) RETURN n, size((n)--()) AS degree".format(label=label) query = f""" MATCH (n:`{label}`) RETURN COUNT{{ (n)--() }} AS totalEdgeCount @@ -132,7 +128,6 @@ class GraphStorage(BaseGraphStorage): return degree - # degree = session.read_transaction(get_edge_degree, 1, 2) async def edge_degree(self, src_id: str, tgt_id: str) -> int: entity_name_label_source = src_id.strip('\"') entity_name_label_target = tgt_id.strip('\"') @@ -208,7 +203,6 @@ class GraphStorage(BaseGraphStorage): target_label = list(connected_node.labels)[0] if connected_node and connected_node.labels else None if source_label and target_label: - print (f"appending: {(source_label, target_label)}") edges.append((source_label, target_label)) return edges @@ -218,44 +212,6 @@ class GraphStorage(BaseGraphStorage): return edges - - # from typing import List, Tuple - # async def 
get_node_connections(driver: GraphDatabase.driver, label: str) -> List[Tuple[str, str]]: - # def get_connections_for_node(tx): - # query = f""" - # MATCH (n:`{label}`) - # OPTIONAL MATCH (n)-[r]-(connected) - # RETURN n, r, connected - # """ - # results = tx.run(query) - - - # connections = [] - # for record in results: - # source_node = record['n'] - # connected_node = record['connected'] - - # source_label = list(source_node.labels)[0] if source_node.labels else None - # target_label = list(connected_node.labels)[0] if connected_node and connected_node.labels else None - - # if source_label and target_label: - # connections.append((source_label, target_label)) - - # logger.debug( - # f'{inspect.currentframe().f_code.co_name}:query:{query}:result:{connections}' - # ) - # return connections - - # with driver.session() as session: - - # return session.read_transaction(get_connections_for_node) - - - - - - #upsert_node - @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), @@ -366,32 +322,5 @@ class GraphStorage(BaseGraphStorage): # return result async def _node2vec_embed(self): - print ("this is never called. 
checking to be sure.") + print ("Implemented but never called.") - # async def _node2vec_embed(self): - with self._driver.session() as session: - #Define the Cypher query - options = self.global_config["node2vec_params"] - logger.debug(f"building embeddings with options {options}") - query = f"""CALL gds.node2vec.write('91fbae6c', { - options - }) - YIELD nodeId, labels, embedding - RETURN - nodeId AS id, - labels[0] AS distinctLabel, - embedding AS nodeToVecEmbedding - """ - # Run the query and process the results - results = session.run(query) - embeddings = [] - node_labels = [] - for record in results: - node_id = record["id"] - embedding = record["nodeToVecEmbedding"] - label = record["distinctLabel"] - print(f"Node id/label: {label}/{node_id}, Embedding: {embedding}") - embeddings.append(embedding) - node_labels.append(label) - return embeddings, node_labels - diff --git a/test.py b/test.py index aa7179c1..219e23eb 100644 --- a/test.py +++ b/test.py @@ -8,27 +8,7 @@ from pprint import pprint # nest_asyncio.apply() ######### -WORKING_DIR = "./dickensTestEmbedcall" - - -# G = nx.read_graphml('./dickensTestEmbedcall/graph_chunk_entity_relation.graphml') -# nx.write_gexf(G, "graph_chunk_entity_relation.gefx") - -import networkx as nx -from networkx_query import search_nodes, search_edges -G = nx.read_graphml('./dickensTestEmbedcall/graph_chunk_entity_relation.graphml') -query = {} # Empty query matches all nodes -result = search_nodes(G, query) - -# Extract node IDs from the result -node_ids = sorted([node for node in result]) - -print("All node IDs in the graph:") -pprint(node_ids) -raise Exception - - -# raise Exception +WORKING_DIR = "./dickens" if not os.path.exists(WORKING_DIR): os.mkdir(WORKING_DIR) diff --git a/test_neo4j.py b/test_neo4j.py index 6c789479..044c12e9 100644 --- a/test_neo4j.py +++ b/test_neo4j.py @@ -17,7 +17,7 @@ rag = LightRAG( working_dir=WORKING_DIR, llm_model_func=gpt_4o_mini_complete, # Use gpt_4o_mini_complete LLM model 
kg="Neo4JStorage", - log_level="DEBUG" + log_level="INFO" # llm_model_func=gpt_4o_complete # Optionally, use a stronger model )