Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9,808 changes: 175 additions & 9,633 deletions examples/gds-example.ipynb

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion justfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,23 @@ py-test-gds:
trap "cd $ENV_DIR && docker compose down" EXIT
cd $ENV_DIR && docker compose up -d
cd -
cd python-wrapper && \
NEO4J_URI=bolt://localhost:7687 \
NEO4J_USER=neo4j \
NEO4J_PASSWORD=password \
NEO4J_DB=neo4j \
cd python-wrapper && uv run --group dev --extra gds pytest tests --include-neo4j-and-gds
uv run --group dev --extra gds pytest tests --include-neo4j-and-gds
cd ..

py-test-gds-sessions:
#!/usr/bin/env bash
cd python-wrapper && \
GDS_SESSION_URI=bolt://localhost:7688 \
NEO4J_URI=bolt://localhost:7687 \
NEO4J_USER=neo4j \
NEO4J_PASSWORD=password \
uv run --group dev --extra gds pytest tests --include-neo4j-and-gds

local-neo4j-setup:
#!/usr/bin/env bash
set -e
Expand Down
10 changes: 2 additions & 8 deletions python-wrapper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ requires-python = ">=3.10"

[project.optional-dependencies]
pandas = ["pandas>=2, <3", "pandas-stubs>=2, <3"]
gds = ["graphdatascience>=1, <2"]
gds = ["graphdatascience>=1.20, <2"]
neo4j = ["neo4j"]
snowflake = ["snowflake-snowpark-python>=1, <2"]

Expand Down Expand Up @@ -76,9 +76,9 @@ notebook = [
"palettable>=3.3.3",
"matplotlib>=3.9.4",
"snowflake-snowpark-python==1.42.0",
"dotenv",
"requests",
"marimo",
"python-dotenv"
]

[project.urls]
Expand Down Expand Up @@ -174,9 +174,3 @@ exclude = [
]
plugins = ['pydantic.mypy']
untyped_calls_exclude=["nbconvert"]

[tool.marimo.runtime]
output_max_bytes = 20_000_000
#
#[tool.marimo.server]
#follow_symlink = true
72 changes: 33 additions & 39 deletions python-wrapper/src/neo4j_viz/gds.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

import warnings
from itertools import chain
from typing import Optional, cast
from typing import Optional, cast, Collection
from uuid import uuid4

import pandas as pd
from graphdatascience import Graph, GraphDataScience
from graphdatascience.graph.v2 import GraphV2
from graphdatascience.session import AuraGraphDataScience

from neo4j_viz.colors import NEO4J_COLORS_DISCRETE, ColorSpace

Expand All @@ -15,48 +17,38 @@


def _fetch_node_dfs(
gds: GraphDataScience,
G: Graph,
gds: GraphDataScience | AuraGraphDataScience,
G: GraphV2,
node_properties_by_label: dict[str, list[str]],
node_labels: list[str],
node_labels: Collection[str],
additional_db_node_properties: list[str],
) -> dict[str, pd.DataFrame]:
return {
lbl: gds.graph.nodeProperties.stream(
lbl: gds.v2.graph.node_properties.stream(
G,
node_properties=node_properties_by_label[lbl],
node_labels=[lbl],
separate_property_columns=True,
db_node_properties=additional_db_node_properties,
)
for lbl in node_labels
}


def _fetch_rel_dfs(gds: GraphDataScience, G: Graph) -> list[pd.DataFrame]:
rel_types = G.relationship_types()

rel_props = {rel_type: G.relationship_properties(rel_type) for rel_type in rel_types}
def _fetch_rel_dfs(gds: GraphDataScience, G: GraphV2) -> list[pd.DataFrame]:
rel_props = G.relationship_properties()

rel_dfs: list[pd.DataFrame] = []
# Have to call per stream per relationship type as there was a bug in GDS < 2.21
for rel_type, props in rel_props.items():
assert isinstance(props, list)
if len(props) > 0:
rel_df = gds.graph.relationshipProperties.stream(
G, relationship_types=rel_type, relationship_properties=list(props), separate_property_columns=True
)
else:
rel_df = gds.graph.relationships.stream(G, relationship_types=[rel_type])

rel_df = gds.v2.graph.relationships.stream(G, relationship_types=[rel_type], relationship_properties=list(props))
rel_dfs.append(rel_df)

return rel_dfs


def from_gds(
gds: GraphDataScience,
G: Graph,
gds: GraphDataScience | AuraGraphDataScience,
G: Graph | GraphV2,
node_properties: Optional[list[str]] = None,
db_node_properties: Optional[list[str]] = None,
max_node_count: int = 10_000,
Expand All @@ -76,9 +68,9 @@ def from_gds(

Parameters
----------
gds : GraphDataScience
GraphDataScience object.
G : Graph
gds
GraphDataScience object. AuraGraphDataScience object if using Aura Graph Analytics.
G
Graph object.
node_properties : list[str], optional
Additional properties to include in the visualization node, by default None which means that all node
Expand All @@ -91,50 +83,52 @@ def from_gds(
"""
if db_node_properties is None:
db_node_properties = []
if isinstance(G, Graph):
G_v2 = gds.v2.graph.get(G.name())
else:
G_v2 = G

node_properties_from_gds = G.node_properties()
assert isinstance(node_properties_from_gds, pd.Series)
actual_node_properties: dict[str, list[str]] = cast(dict[str, list[str]], node_properties_from_gds.to_dict())
all_actual_node_properties = list(chain.from_iterable(actual_node_properties.values()))
gds_properties_per_label = G_v2.node_properties()
all_gds_properties = list(chain.from_iterable(gds_properties_per_label.values()))

node_properties_by_label_sets: dict[str, set[str]] = dict()
if node_properties is None:
node_properties_by_label_sets = {k: set(v) for k, v in actual_node_properties.items()}
node_properties_by_label_sets = {k: set(v) for k, v in gds_properties_per_label.items()}
else:
for prop in node_properties:
if prop not in all_actual_node_properties:
if prop not in all_gds_properties:
raise ValueError(f"There is no node property '{prop}' in graph '{G.name()}'")

for label, props in actual_node_properties.items():
for label, props in gds_properties_per_label.items():
node_properties_by_label_sets[label] = {
prop for prop in actual_node_properties[label] if prop in node_properties
prop for prop in gds_properties_per_label[label] if prop in node_properties
}

node_properties_by_label = {k: list(v) for k, v in node_properties_by_label_sets.items()}

node_count = G.node_count()
node_count = G_v2.node_count()
if node_count > max_node_count:
warnings.warn(
f"The '{G.name()}' projection's node count ({G.node_count()}) exceeds `max_node_count` ({max_node_count}), so subsampling will be applied. Increase `max_node_count` if needed"
f"The '{G_v2.name()}' projection's node count ({G_v2.node_count()}) exceeds `max_node_count` ({max_node_count}), so subsampling will be applied. Increase `max_node_count` if needed"
)
sampling_ratio = float(max_node_count) / node_count
sample_name = f"neo4j-viz_sample_{uuid4()}"
G_fetched, _ = gds.graph.sample.rwr(sample_name, G, samplingRatio=sampling_ratio, nodeLabelStratification=True)
G_fetched, _ = gds.v2.graph.sample.rwr(G_v2, sample_name, sampling_ratio=sampling_ratio, node_label_stratification=True)
else:
G_fetched = G
G_fetched = G_v2

property_name = None
try:
# Since GDS does not allow us to only fetch node IDs, we add the degree property
# as a temporary property to ensure that we have at least one property for each label to fetch
if sum([len(props) == 0 for props in node_properties_by_label.values()]) > 0:
property_name = f"neo4j-viz_property_{uuid4()}"
gds.degree.mutate(G_fetched, mutateProperty=property_name)
gds.v2.degree_centrality.mutate(G_fetched, mutate_property=property_name)
for props in node_properties_by_label.values():
props.append(property_name)

node_dfs = _fetch_node_dfs(
gds, G_fetched, node_properties_by_label, G_fetched.node_labels(), db_node_properties
gds, G_fetched, node_properties_by_label, node_properties_by_label.keys(), db_node_properties
)
if property_name is not None:
for df in node_dfs.values():
Expand All @@ -145,7 +139,7 @@ def from_gds(
if G_fetched.name() != G.name():
G_fetched.drop()
elif property_name is not None:
gds.graph.nodeProperties.drop(G_fetched, node_properties=[property_name])
gds.v2.graph.node_properties.drop(G_fetched, node_properties=[property_name])

for df in node_dfs.values():
if property_name is not None and property_name in df.columns:
Expand All @@ -154,7 +148,7 @@ def from_gds(
node_props_df = pd.concat(node_dfs.values(), ignore_index=True, axis=0).drop_duplicates(subset=["nodeId"])

for lbl, df in node_dfs.items():
if "labels" in all_actual_node_properties:
if "labels" in all_gds_properties:
df.rename(columns={"labels": "__labels"}, inplace=True)
df["labels"] = lbl

Expand Down
56 changes: 37 additions & 19 deletions python-wrapper/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import random
from typing import Any, Generator

import pytest
Expand Down Expand Up @@ -31,56 +32,73 @@ def pytest_collection_modifyitems(config: Any, items: Any) -> None:


@pytest.fixture(scope="package")
def aura_ds_instance() -> Generator[Any, None, None]:
def aura_db_instance() -> Generator[Any, None, None]:
if os.environ.get("AURA_API_CLIENT_ID", None) is None:
yield None
return

from tests.gds_helper import aura_api, create_aurads_instance
from tests.gds_helper import aura_api, create_auradb_instance

api = aura_api()
id, dbms_connection_info = create_aurads_instance(api)
dbms_connection_info = create_auradb_instance(api)

old_uri = os.environ.get("NEO4J_URI", "")
# setting as environment variables to run notebooks with this connection
os.environ["NEO4J_URI"] = dbms_connection_info.get_uri()
assert isinstance(dbms_connection_info.username, str)
os.environ["NEO4J_USER"] = dbms_connection_info.username
assert isinstance(dbms_connection_info.password, str)
os.environ["NEO4J_PASSWORD"] = dbms_connection_info.password
old_instance = os.environ.get("AURA_INSTANCEID", "")
if dbms_connection_info.aura_instance_id:
os.environ["AURA_INSTANCEID"] = dbms_connection_info.aura_instance_id

yield dbms_connection_info

# Clear Neo4j_URI after test (rerun should create a new instance)
os.environ["NEO4J_URI"] = ""
api.delete_instance(id)
os.environ["NEO4J_URI"] = old_uri
os.environ["AURA_INSTANCEID"] = old_instance
assert dbms_connection_info.aura_instance_id is not None
api.delete_instance(dbms_connection_info.aura_instance_id)


@pytest.fixture(scope="package")
def gds(aura_ds_instance: Any) -> Generator[Any, None, None]:
from graphdatascience import GraphDataScience
def gds(aura_db_instance: Any) -> Generator[Any, None, None]:
from graphdatascience.session import SessionMemory

from tests.gds_helper import connect_to_plugin_gds, gds_sessions, connect_to_local_gds_session

from tests.gds_helper import connect_to_plugin_gds
if aura_db_instance:
sessions = gds_sessions()

if aura_ds_instance:
yield GraphDataScience(
endpoint=aura_ds_instance.uri,
auth=(aura_ds_instance.username, aura_ds_instance.password),
aura_ds=True,
database="neo4j",
gds = sessions.get_or_create(
f"neo4j-viz-ci-{os.environ.get('GITHUB_RUN_ID', random.randint(0, 10**6))}",
memory=SessionMemory.m_2GB,
db_connection=aura_db_instance,
)

yield gds
gds.delete()
else:
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
gds = connect_to_plugin_gds(NEO4J_URI)
neo4j_uri = os.environ["NEO4J_URI"]
neo4j_auth = (os.environ.get("NEO4J_USER", "neo4j"), os.environ.get("NEO4J_PASSWORD", "password"))

session_uri = os.environ.get("GDS_SESSION_URI")
if session_uri:
gds = connect_to_local_gds_session(session_uri, neo4j_uri, neo4j_auth) # type: ignore
else:
gds = connect_to_plugin_gds(neo4j_uri, neo4j_auth) # type: ignore
yield gds
gds.close()


@pytest.fixture(scope="package")
def neo4j_driver(aura_ds_instance: Any) -> Generator[Any, None, None]:
def neo4j_driver(aura_db_instance: Any) -> Generator[Any, None, None]:
import neo4j

if aura_ds_instance:
if aura_db_instance:
driver = neo4j.GraphDatabase.driver(
aura_ds_instance.uri, auth=(aura_ds_instance.username, aura_ds_instance.password)
aura_db_instance.uri, auth=(aura_db_instance.username, aura_db_instance.password)
)
else:
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
Expand Down
Loading
Loading