cleaning code for pull

This commit is contained in:
Ken Wiltshire
2024-11-01 16:29:36 -04:00
parent 8547dd4941
commit 465c9a13d9
5 changed files with 64 additions and 96 deletions

56
Dockerfile Normal file
View File

@@ -0,0 +1,56 @@
# Neo4j Enterprise 5.24.2 image on Debian bullseye (slim) with a Temurin 17 JDK.
FROM debian:bullseye-slim

# The JDK is copied out of the eclipse-temurin:17 image rather than installed
# through apt; JAVA_HOME is the same path in both images.
ENV JAVA_HOME=/opt/java/openjdk
COPY --from=eclipse-temurin:17 $JAVA_HOME $JAVA_HOME

# NEO4J_SHA256 must match the tarball named by NEO4J_TARBALL / NEO4J_URI;
# all three have to be updated together when bumping the Neo4j version.
ENV PATH="${JAVA_HOME}/bin:${PATH}" \
    NEO4J_SHA256=7ce97bd9a4348af14df442f00b3dc5085b5983d6f03da643744838c7a1bc8ba7 \
    NEO4J_TARBALL=neo4j-enterprise-5.24.2-unix.tar.gz \
    NEO4J_EDITION=enterprise \
    NEO4J_HOME="/var/lib/neo4j" \
    LANG=C.UTF-8

# Build-time override for the download location of the tarball.
ARG NEO4J_URI=https://dist.neo4j.org/neo4j-enterprise-5.24.2-unix.tar.gz

# Dedicated system user/group with fixed ids (7474) and no home directory created.
RUN addgroup --gid 7474 --system neo4j && adduser --uid 7474 --system --no-create-home --home "${NEO4J_HOME}" --ingroup neo4j neo4j

# Startup scripts; this build expects ./local-package to provide at least
# neo4j-admin-report.sh (moved below) and docker-entrypoint.sh (the ENTRYPOINT).
COPY ./local-package/* /startup/

# Single layer that:
#   1. installs build and runtime tools (apt-get, not apt: apt's CLI is not
#      intended for scripts),
#   2. downloads the Neo4j tarball, verifies its SHA-256 and unpacks it to NEO4J_HOME,
#   3. moves data/ and logs/ to /data and /logs and symlinks them back,
#   4. sets ownership to neo4j:neo4j; the trees are world-writable (777) except
#      bin/ (755) -- NOTE(review): presumably so the container can run under an
#      arbitrary UID; confirm before tightening,
#   5. builds su-exec from a pinned commit after checking the SHA-256 of its
#      source file and Makefile,
#   6. purges the build-only packages (curl gcc git make) and the apt lists;
#      jq, procps, tini and wget stay in the image.
RUN apt-get update \
    && apt-get install -y curl gcc git jq make procps tini wget \
    && curl --fail --silent --show-error --location --remote-name ${NEO4J_URI} \
    && echo "${NEO4J_SHA256} ${NEO4J_TARBALL}" | sha256sum -c --strict --quiet \
    && tar --extract --file ${NEO4J_TARBALL} --directory /var/lib \
    && mv /var/lib/neo4j-* "${NEO4J_HOME}" \
    && rm ${NEO4J_TARBALL} \
    && sed -i 's/Package Type:.*/Package Type: docker bullseye/' $NEO4J_HOME/packaging_info \
    && mv /startup/neo4j-admin-report.sh "${NEO4J_HOME}"/bin/neo4j-admin-report \
    && mv "${NEO4J_HOME}"/data /data \
    && mv "${NEO4J_HOME}"/logs /logs \
    && chown -R neo4j:neo4j /data \
    && chmod -R 777 /data \
    && chown -R neo4j:neo4j /logs \
    && chmod -R 777 /logs \
    && chown -R neo4j:neo4j "${NEO4J_HOME}" \
    && chmod -R 777 "${NEO4J_HOME}" \
    && chmod -R 755 "${NEO4J_HOME}/bin" \
    && ln -s /data "${NEO4J_HOME}"/data \
    && ln -s /logs "${NEO4J_HOME}"/logs \
    && git clone https://github.com/ncopa/su-exec.git \
    && cd su-exec \
    && git checkout 4c3bb42b093f14da70d8ab924b487ccfbb1397af \
    && echo d6c40440609a23483f12eb6295b5191e94baf08298a856bab6e15b10c3b82891 su-exec.c | sha256sum -c \
    && echo 2a87af245eb125aca9305a0b1025525ac80825590800f047419dc57bba36b334 Makefile | sha256sum -c \
    && make \
    && mv /su-exec/su-exec /usr/bin/su-exec \
    && apt-get -y purge --auto-remove curl gcc git make \
    && rm -rf /var/lib/apt/lists/* /su-exec

# Put the Neo4j binaries on PATH (key=value form; the space-separated ENV
# syntax is legacy).
ENV PATH="${NEO4J_HOME}/bin:${PATH}"
WORKDIR "${NEO4J_HOME}"

# Database files and logs live outside the image layers.
VOLUME /data /logs

# 7474 = HTTP, 7473 = HTTPS, 7687 = Bolt.
EXPOSE 7474 7473 7687

# tini runs as PID 1; -g forwards signals to the whole child process group.
ENTRYPOINT ["tini", "-g", "--", "/startup/docker-entrypoint.sh"]
CMD ["neo4j"]

View File

@@ -160,7 +160,10 @@ rag = LightRAG(
<summary> Using Neo4J for Storage </summary>
* For production level scenarios you will most likely want to leverage an enterprise solution
for KG storage.
  for KG storage. Running Neo4J in Docker is recommended for seamless local testing.
* See: https://hub.docker.com/_/neo4j
```bash
export NEO4J_URI="neo4j://localhost:7687"
export NEO4J_USERNAME="neo4j"

View File

@@ -74,9 +74,6 @@ class GraphStorage(BaseGraphStorage):
)
result = tx.run(query)
single_result = result.single()
# if result.single() == None:
# print (f"this should not happen: ---- {label1}/{label2} {query}")
logger.debug(
f'{inspect.currentframe().f_code.co_name}:query:{query}:result:{single_result["edgeExists"]}'
)
@@ -84,7 +81,7 @@ class GraphStorage(BaseGraphStorage):
return single_result["edgeExists"]
def close(self):
self._driver.close()
#hard code relaitionship type
# hard-code relationship type, directed.
with self._driver.session() as session:
result = session.read_transaction(_check_edge_existence, entity_name_label_source, entity_name_label_target)
return result
@@ -111,7 +108,6 @@ class GraphStorage(BaseGraphStorage):
def _find_node_degree(session, label):
with session.begin_transaction() as tx:
# query = "MATCH (n:`{label}`) RETURN n, size((n)--()) AS degree".format(label=label)
query = f"""
MATCH (n:`{label}`)
RETURN COUNT{{ (n)--() }} AS totalEdgeCount
@@ -132,7 +128,6 @@ class GraphStorage(BaseGraphStorage):
return degree
# degree = session.read_transaction(get_edge_degree, 1, 2)
async def edge_degree(self, src_id: str, tgt_id: str) -> int:
entity_name_label_source = src_id.strip('\"')
entity_name_label_target = tgt_id.strip('\"')
@@ -208,7 +203,6 @@ class GraphStorage(BaseGraphStorage):
target_label = list(connected_node.labels)[0] if connected_node and connected_node.labels else None
if source_label and target_label:
print (f"appending: {(source_label, target_label)}")
edges.append((source_label, target_label))
return edges
@@ -218,44 +212,6 @@ class GraphStorage(BaseGraphStorage):
return edges
# from typing import List, Tuple
# async def get_node_connections(driver: GraphDatabase.driver, label: str) -> List[Tuple[str, str]]:
# def get_connections_for_node(tx):
# query = f"""
# MATCH (n:`{label}`)
# OPTIONAL MATCH (n)-[r]-(connected)
# RETURN n, r, connected
# """
# results = tx.run(query)
# connections = []
# for record in results:
# source_node = record['n']
# connected_node = record['connected']
# source_label = list(source_node.labels)[0] if source_node.labels else None
# target_label = list(connected_node.labels)[0] if connected_node and connected_node.labels else None
# if source_label and target_label:
# connections.append((source_label, target_label))
# logger.debug(
# f'{inspect.currentframe().f_code.co_name}:query:{query}:result:{connections}'
# )
# return connections
# with driver.session() as session:
# return session.read_transaction(get_connections_for_node)
#upsert_node
@retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=4, max=10),
@@ -366,32 +322,5 @@ class GraphStorage(BaseGraphStorage):
# return result
async def _node2vec_embed(self):
print ("this is never called. checking to be sure.")
# async def _node2vec_embed(self):
with self._driver.session() as session:
#Define the Cypher query
options = self.global_config["node2vec_params"]
logger.debug(f"building embeddings with options {options}")
query = f"""CALL gds.node2vec.write('91fbae6c', {
options
})
YIELD nodeId, labels, embedding
RETURN
nodeId AS id,
labels[0] AS distinctLabel,
embedding AS nodeToVecEmbedding
"""
# Run the query and process the results
results = session.run(query)
embeddings = []
node_labels = []
for record in results:
node_id = record["id"]
embedding = record["nodeToVecEmbedding"]
label = record["distinctLabel"]
print(f"Node id/label: {label}/{node_id}, Embedding: {embedding}")
embeddings.append(embedding)
node_labels.append(label)
return embeddings, node_labels
print ("Implemented but never called.")

22
test.py
View File

@@ -8,27 +8,7 @@ from pprint import pprint
# nest_asyncio.apply()
#########
WORKING_DIR = "./dickensTestEmbedcall"
# G = nx.read_graphml('./dickensTestEmbedcall/graph_chunk_entity_relation.graphml')
# nx.write_gexf(G, "graph_chunk_entity_relation.gefx")
import networkx as nx
from networkx_query import search_nodes, search_edges
G = nx.read_graphml('./dickensTestEmbedcall/graph_chunk_entity_relation.graphml')
query = {} # Empty query matches all nodes
result = search_nodes(G, query)
# Extract node IDs from the result
node_ids = sorted([node for node in result])
print("All node IDs in the graph:")
pprint(node_ids)
raise Exception
# raise Exception
WORKING_DIR = "./dickens"
if not os.path.exists(WORKING_DIR):
os.mkdir(WORKING_DIR)

View File

@@ -17,7 +17,7 @@ rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=gpt_4o_mini_complete, # Use gpt_4o_mini_complete LLM model
kg="Neo4JStorage",
log_level="DEBUG"
log_level="INFO"
# llm_model_func=gpt_4o_complete # Optionally, use a stronger model
)