diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cc92791..0b73c91 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,74 +1,93 @@
 name: CI
-on:
+on:
   push:
-    path:
-      - 'ge/*'
-      - 'tests/*'
+    paths:
+      - "ge/**"
+      - "tests/**"
+      - "examples/**"
+      - ".github/workflows/**"
+      - "setup.py"
+      - "README.md"
   pull_request:
-    path:
-      - 'ge/*'
-      - 'tests/*'
-
+    paths:
+      - "ge/**"
+      - "tests/**"
+      - "examples/**"
+      - ".github/workflows/**"
+      - "setup.py"
+      - "README.md"
+
 jobs:
   build:
-
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     timeout-minutes: 180
     strategy:
+      fail-fast: false
       matrix:
-        python-version: [3.6,3.7,3.8]
-        tf-version: [1.4.0,1.15.0,2.5.0,2.6.0,2.7.0,2.8.0,2.9.0]
-
-        exclude:
-          - python-version: 3.7
-            tf-version: 1.4.0
-          - python-version: 3.7
-            tf-version: 1.15.0
-          - python-version: 3.8
-            tf-version: 1.4.0
-          - python-version: 3.8
-            tf-version: 1.14.0
-          - python-version: 3.8
-            tf-version: 1.15.0
-          - python-version: 3.6
-            tf-version: 2.7.0
-          - python-version: 3.6
-            tf-version: 2.8.0
-          - python-version: 3.6
-            tf-version: 2.9.0
-          - python-version: 3.9
-            tf-version: 1.4.0
-          - python-version: 3.9
-            tf-version: 1.15.0
-          - python-version: 3.9
-            tf-version: 2.2.0
+        include:
+          - python-version: "3.7"
+            tf-version: "1.15.5"
+            use-legacy-keras: "0"
+          - python-version: "3.10"
+            tf-version: "2.10.0"
+            use-legacy-keras: "0"
+          - python-version: "3.10"
+            tf-version: "2.15.0"
+            use-legacy-keras: "0"
+          - python-version: "3.11"
+            tf-version: "2.15.0"
+            use-legacy-keras: "0"
+          - python-version: "3.10"
+            tf-version: "2.20.0"
+            use-legacy-keras: "1"
+          - python-version: "3.11"
+            tf-version: "2.20.0"
+            use-legacy-keras: "1"
+          - python-version: "3.12"
+            tf-version: "2.20.0"
+            use-legacy-keras: "0"
+          - python-version: "3.13"
+            tf-version: "2.20.0"
+            use-legacy-keras: "0"
+
     steps:
-
-      - uses: actions/checkout@v3
-
-      - name: Setup python environment
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
+      - uses: actions/checkout@v4
+
+      - name: Setup Python environment
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          if [[ "${{ matrix.tf-version }}" == "2.10.0" ]]; then
+            python -m pip install -q "numpy<2"
+          fi
+          python -m pip install -q "tensorflow==${{ matrix.tf-version }}"
+          if [[ "${{ matrix.tf-version }}" == 1.* ]]; then
+            python -m pip install -q "protobuf==3.20.3"
+          fi
+          if [[ "${{ matrix.use-legacy-keras }}" == "1" ]]; then
+            python -m pip install -q "tf-keras~=2.20"
+          fi
+          python -m pip install -e ".[test]"
+          if [[ "${{ matrix.tf-version }}" == "2.10.0" ]]; then
+            python -m pip install -q "numpy<2"
+          fi
+
+      - name: Test with pytest
+        timeout-minutes: 180
+        env:
+          TF_USE_LEGACY_KERAS: ${{ matrix.use-legacy-keras }}
+        run: |
+          pytest --cov=ge --cov=examples --cov-report=xml
 
-      - name: Install dependencies
-        run: |
-          pip3 install -q tensorflow==${{ matrix.tf-version }}
-          pip install -q protobuf==3.19.0
-          pip install -q requests
-          pip install -e .
-      - name: Test with pytest
-        timeout-minutes: 180
-        run: |
-          pip install -q pytest
-          pip install -q pytest-cov
-          pip install -q python-coveralls
-          pytest --cov=ge --cov-report=xml
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v3.1.0
-        with:
-          token: ${{secrets.CODECOV_TOKEN}}
-          file: ./coverage.xml
-          flags: pytest
-          name: py${{ matrix.python-version }}-tf${{ matrix.tf-version }}
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          file: ./coverage.xml
+          flags: pytest
+          name: py${{ matrix.python-version }}-tf${{ matrix.tf-version }}
diff --git a/README.md b/README.md
index 2a17812..91ab902 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 [![GitHub Issues](https://img.shields.io/github/issues/shenweichen/graphembedding.svg )](https://github.com/shenweichen/graphembedding/issues)
-![CI status](https://github.com/shenweichen/graphembedding/workflows/CI/badge.svg)
+[![CI status](https://github.com/shenweichen/graphembedding/actions/workflows/ci.yml/badge.svg)](https://github.com/shenweichen/graphembedding/actions/workflows/ci.yml)
 [![codecov](https://codecov.io/gh/shenweichen/graphembedding/branch/master/graph/badge.svg)](https://codecov.io/gh/shenweichen/graphembedding)
 [![Codacy Badge](https://app.codacy.com/project/badge/Grade/c46407f5931f40048e28860dccf7dabc)](https://www.codacy.com/gh/shenweichen/GraphEmbedding/dashboard?utm_source=github.com&utm_medium=referral&utm_content=shenweichen/GraphEmbedding&utm_campaign=Badge_Grade)
 [![Disscussion](https://img.shields.io/badge/chat-wechat-brightgreen?style=flat)](./README.md#disscussiongroup--related-projects)
@@ -21,13 +21,16 @@
 | Struc2Vec | [KDD 2017][struc2vec: Learning Node Representations from Structural Identity](https://arxiv.org/pdf/1704.03165.pdf) | [【Graph Embedding】Struc2Vec:算法原理,实现和应用](https://zhuanlan.zhihu.com/p/56733145) |
+
+
 # How to run examples
-1. clone the repo and make sure you have installed `tensorflow` or `tensorflow-gpu` on your local machine.
-2. run following commands
+
+1. Clone the repo and install dependencies.
+2. Run one example script.
+
 ```bash
-python setup.py install
-cd examples
-python deepwalk_wiki.py
+pip install -e .[tf]
+python examples/deepwalk_wiki.py
 ```
 
 ## DisscussionGroup & Related Projects
diff --git a/examples/alias.py b/examples/alias.py
index 8d2324e..b345a14 100644
--- a/examples/alias.py
+++ b/examples/alias.py
@@ -1,30 +1,47 @@
+import sys
+from pathlib import Path
+
 import matplotlib.pyplot as plt
 import numpy as np
 
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+
 from ge.alias import alias_sample, create_alias_table
 
 
-def gen_prob_dist(N):
-    p = np.random.randint(0, 100, N)
-    return p/np.sum(p)
+def gen_prob_dist(size):
+    probabilities = np.random.randint(0, 100, size)
+    return probabilities / np.sum(probabilities)
+
+
+def simulate(size=100, sample_count=10000):
+    truth = gen_prob_dist(size)
+    accept, alias = create_alias_table(truth)
+
+    sampled = np.zeros(size)
+    for _ in range(sample_count):
+        sampled[alias_sample(accept, alias)] += 1
+    return sampled / np.sum(sampled), truth
 
-def simulate(N=100, k=10000,):
+
+def main(smoke=False, show=True):
+    size = 20 if smoke else 100
+    sample_count = 300 if smoke else 10000
+    alias_result, truth = simulate(size=size, sample_count=sample_count)
 
-    truth = gen_prob_dist(N)
+    assert np.isclose(alias_result.sum(), 1.0)
+    assert np.isclose(truth.sum(), 1.0)
 
-    area_ratio = truth
-    accept, alias = create_alias_table(area_ratio)
+    if show:
+        plt.bar(list(range(len(alias_result))), alias_result, label="alias_result")
+        plt.bar(list(range(len(truth))), truth, label="truth")
+        plt.legend()
+        plt.show()
 
-    ans = np.zeros(N)
-    for _ in range(k):
-        i = alias_sample(accept, alias)
-        ans[i] += 1
-    return ans/np.sum(ans), truth
+    return alias_result, truth
 
 
 if __name__ == "__main__":
-    alias_result, truth = simulate()
-    plt.bar(list(range(len(alias_result))), alias_result, label='alias_result')
-    plt.bar(list(range(len(truth))), truth, label='truth')
-    plt.legend()
+    main()
diff --git a/examples/deepwalk_wiki.py b/examples/deepwalk_wiki.py
index e81e025..a452d60 100644
--- a/examples/deepwalk_wiki.py
+++ b/examples/deepwalk_wiki.py
@@ -1,53 +1,77 @@
+from pathlib import Path
+import sys
+import matplotlib.pyplot as plt
+import networkx as nx
 import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.manifold import TSNE
+
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
 
-from ge.classify import read_node_label, Classifier
 from ge import DeepWalk
-from sklearn.linear_model import LogisticRegression
+from ge.classify import Classifier, read_node_label
 
-import matplotlib.pyplot as plt
-import networkx as nx
-from sklearn.manifold import TSNE
+WIKI_GRAPH_PATH = PROJECT_ROOT / "data" / "wiki" / "Wiki_edgelist.txt"
+WIKI_LABEL_PATH = PROJECT_ROOT / "data" / "wiki" / "wiki_labels.txt"
+SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt"
 
 
-def evaluate_embeddings(embeddings):
-    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
-    tr_frac = 0.8
-    print("Training classifier using {:.2f}% nodes...".format(
-        tr_frac * 100))
+def evaluate_embeddings(embeddings, label_path):
+    x_data, y_data = read_node_label(str(label_path))
+    train_fraction = 0.8
+    print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100))
     clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
-    clf.split_train_evaluate(X, Y, tr_frac)
-
+    clf.split_train_evaluate(x_data, y_data,
train_fraction) -def plot_embeddings(embeddings,): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - emb_list = [] - for k in X: - emb_list.append(embeddings[k]) - emb_list = np.array(emb_list) +def plot_embeddings(embeddings, label_path, show=True): + x_data, y_data = read_node_label(str(label_path)) - model = TSNE(n_components=2) - node_pos = model.fit_transform(emb_list) + embedding_list = np.array([embeddings[node] for node in x_data]) + node_pos = TSNE(n_components=2).fit_transform(embedding_list) color_idx = {} - for i in range(len(X)): - color_idx.setdefault(Y[i][0], []) - color_idx[Y[i][0]].append(i) + for index, label in enumerate(y_data): + color_idx.setdefault(label[0], []) + color_idx[label[0]].append(index) - for c, idx in color_idx.items(): - plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) + for label, indexes in color_idx.items(): + plt.scatter(node_pos[indexes, 0], node_pos[indexes, 1], label=label) plt.legend() - plt.show() + if show: + plt.show() + else: + plt.close() -if __name__ == "__main__": - G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) +def main(smoke=False, show=True): + graph_path = SMOKE_GRAPH_PATH if smoke else WIKI_GRAPH_PATH + graph = nx.read_edgelist( + str(graph_path), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = DeepWalk(G, walk_length=10, num_walks=80, workers=1) - model.train(window_size=5, iter=3) + model = DeepWalk( + graph, + walk_length=3 if smoke else 10, + num_walks=2 if smoke else 80, + workers=1, + ) + model.train(window_size=2 if smoke else 5, iter=1 if smoke else 3, workers=1) embeddings = model.get_embeddings() + assert len(embeddings) > 0 + + if not smoke: + evaluate_embeddings(embeddings, WIKI_LABEL_PATH) + plot_embeddings(embeddings, WIKI_LABEL_PATH, show=show) - evaluate_embeddings(embeddings) - plot_embeddings(embeddings) + return embeddings + + +if __name__ == "__main__": + main() diff --git a/examples/line_wiki.py b/examples/line_wiki.py index 5771d99..bc30233 100644 --- a/examples/line_wiki.py +++ b/examples/line_wiki.py @@ -1,53 +1,78 @@ - -import numpy as np - -from ge.classify import read_node_label, Classifier -from ge import LINE -from sklearn.linear_model import LogisticRegression +from pathlib import Path +import sys import matplotlib.pyplot as plt import networkx as nx +import numpy as np +from sklearn.linear_model import LogisticRegression from sklearn.manifold import TSNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) -def evaluate_embeddings(embeddings): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - tr_frac = 0.8 - print("Training classifier using {:.2f}% nodes...".format( - tr_frac * 100)) - clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) - clf.split_train_evaluate(X, Y, tr_frac) +try: + from ge import LINE +except ImportError as exc: + raise ImportError( + "Unable to import LINE. Use a supported Python/TensorFlow environment " + "(for example Python 3.10-3.12 with tensorflow installed)." 
+ ) from exc +from ge.classify import Classifier, read_node_label +WIKI_GRAPH_PATH = PROJECT_ROOT / "data" / "wiki" / "Wiki_edgelist.txt" +WIKI_LABEL_PATH = PROJECT_ROOT / "data" / "wiki" / "wiki_labels.txt" +SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" -def plot_embeddings(embeddings,): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - emb_list = [] - for k in X: - emb_list.append(embeddings[k]) - emb_list = np.array(emb_list) +def evaluate_embeddings(embeddings, label_path): + x_data, y_data = read_node_label(str(label_path)) + train_fraction = 0.8 + print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100)) + clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) + clf.split_train_evaluate(x_data, y_data, train_fraction) - model = TSNE(n_components=2) - node_pos = model.fit_transform(emb_list) + +def plot_embeddings(embeddings, label_path, show=True): + x_data, y_data = read_node_label(str(label_path)) + + embedding_list = np.array([embeddings[node] for node in x_data]) + node_pos = TSNE(n_components=2).fit_transform(embedding_list) color_idx = {} - for i in range(len(X)): - color_idx.setdefault(Y[i][0], []) - color_idx[Y[i][0]].append(i) + for index, label in enumerate(y_data): + color_idx.setdefault(label[0], []) + color_idx[label[0]].append(index) - for c, idx in color_idx.items(): - plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) + for label, indexes in color_idx.items(): + plt.scatter(node_pos[indexes, 0], node_pos[indexes, 1], label=label) plt.legend() - plt.show() + if show: + plt.show() + else: + plt.close() -if __name__ == "__main__": - G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) +def main(smoke=False, show=True): + graph_path = SMOKE_GRAPH_PATH if smoke else WIKI_GRAPH_PATH + graph = nx.read_edgelist( + str(graph_path), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = LINE(G, embedding_size=128, order='second') - model.train(batch_size=1024, epochs=50, verbose=2) + model = LINE(graph, embedding_size=8 if smoke else 128, order="second") + model.train(batch_size=2 if smoke else 1024, epochs=1 if smoke else 50, verbose=0 if smoke else 2) embeddings = model.get_embeddings() + assert len(embeddings) > 0 + + if not smoke: + evaluate_embeddings(embeddings, WIKI_LABEL_PATH) + plot_embeddings(embeddings, WIKI_LABEL_PATH, show=show) - evaluate_embeddings(embeddings) - plot_embeddings(embeddings) + return embeddings + + +if __name__ == "__main__": + main() diff --git a/examples/node2vec_flight.py b/examples/node2vec_flight.py index a37a880..e3127ef 100644 --- a/examples/node2vec_flight.py +++ b/examples/node2vec_flight.py @@ -1,88 +1,80 @@ -import numpy as np - - - -from ge.classify import read_node_label,Classifier - -from ge import Node2Vec - -from sklearn.linear_model import LogisticRegression - - +from pathlib import Path +import sys import matplotlib.pyplot as plt - import networkx as nx - +import numpy as np +from sklearn.linear_model import LogisticRegression from sklearn.manifold import TSNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) +from ge import Node2Vec +from ge.classify import Classifier, read_node_label -def evaluate_embeddings(embeddings): - - X, Y = read_node_label('../data/flight/labels-brazil-airports.txt',skip_head=True) - - tr_frac = 0.8 - - print("Training classifier using {:.2f}% 
nodes...".format( +FLIGHT_GRAPH_PATH = PROJECT_ROOT / "data" / "flight" / "brazil-airports.edgelist" +FLIGHT_LABEL_PATH = PROJECT_ROOT / "data" / "flight" / "labels-brazil-airports.txt" +SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" - tr_frac * 100)) +def evaluate_embeddings(embeddings, label_path): + x_data, y_data = read_node_label(str(label_path), skip_head=True) + train_fraction = 0.8 + print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100)) clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) + clf.split_train_evaluate(x_data, y_data, train_fraction) - clf.split_train_evaluate(X, Y, tr_frac) - - - - - -def plot_embeddings(embeddings,): - - X, Y = read_node_label('../data/flight/labels-brazil-airports.txt',skip_head=True) - - - - emb_list = [] - - for k in X: - - emb_list.append(embeddings[k]) - - emb_list = np.array(emb_list) - - - - model = TSNE(n_components=2) - - node_pos = model.fit_transform(emb_list) +def plot_embeddings(embeddings, label_path, show=True): + x_data, y_data = read_node_label(str(label_path), skip_head=True) + embedding_list = np.array([embeddings[node] for node in x_data]) + node_pos = TSNE(n_components=2).fit_transform(embedding_list) color_idx = {} + for index, label in enumerate(y_data): + color_idx.setdefault(label[0], []) + color_idx[label[0]].append(index) - for i in range(len(X)): - - color_idx.setdefault(Y[i][0], []) - - color_idx[Y[i][0]].append(i) - - - - for c, idx in color_idx.items(): - - plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) # c=node_colors) - + for label, indexes in color_idx.items(): + plt.scatter(node_pos[indexes, 0], node_pos[indexes, 1], label=label) plt.legend() + if show: + plt.show() + else: + plt.close() + + +def main(smoke=False, show=True): + graph_path = SMOKE_GRAPH_PATH if smoke else FLIGHT_GRAPH_PATH + graph = nx.read_edgelist( + str(graph_path), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) + + model = Node2Vec( + graph, + walk_length=3 if smoke else 10, + num_walks=2 if smoke else 80, + workers=1, + p=0.25, + q=2, + use_rejection_sampling=False, + ) + model.train(window_size=2 if smoke else 5, iter=1 if smoke else 3, workers=1) + embeddings = model.get_embeddings() + assert len(embeddings) > 0 - plt.show() + if not smoke: + evaluate_embeddings(embeddings, FLIGHT_LABEL_PATH) + plot_embeddings(embeddings, FLIGHT_LABEL_PATH, show=show) -if __name__ == "__main__": - G = nx.read_edgelist('../data/flight/brazil-airports.edgelist', create_using=nx.DiGraph(), nodetype=None, - data=[('weight', int)]) + return embeddings - model = Node2Vec(G, 10, 80, workers=1, p=0.25, q=2, use_rejection_sampling=0) - model.train() - embeddings = model.get_embeddings() - evaluate_embeddings(embeddings) - plot_embeddings(embeddings) +if __name__ == "__main__": + main() diff --git a/examples/node2vec_wiki.py b/examples/node2vec_wiki.py index 45ea1c0..fb6db4d 100644 --- a/examples/node2vec_wiki.py +++ b/examples/node2vec_wiki.py @@ -1,53 +1,80 @@ +from pathlib import Path +import sys +import matplotlib.pyplot as plt +import networkx as nx import numpy as np +from sklearn.linear_model import LogisticRegression +from sklearn.manifold import TSNE + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) -from ge.classify import read_node_label, Classifier from ge import Node2Vec -from sklearn.linear_model import LogisticRegression +from ge.classify import Classifier, read_node_label 
-import matplotlib.pyplot as plt -import networkx as nx -from sklearn.manifold import TSNE +WIKI_GRAPH_PATH = PROJECT_ROOT / "data" / "wiki" / "Wiki_edgelist.txt" +WIKI_LABEL_PATH = PROJECT_ROOT / "data" / "wiki" / "wiki_labels.txt" +SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" -def evaluate_embeddings(embeddings): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - tr_frac = 0.8 - print("Training classifier using {:.2f}% nodes...".format( - tr_frac * 100)) +def evaluate_embeddings(embeddings, label_path): + x_data, y_data = read_node_label(str(label_path)) + train_fraction = 0.8 + print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100)) clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) - clf.split_train_evaluate(X, Y, tr_frac) - + clf.split_train_evaluate(x_data, y_data, train_fraction) -def plot_embeddings(embeddings,): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - emb_list = [] - for k in X: - emb_list.append(embeddings[k]) - emb_list = np.array(emb_list) +def plot_embeddings(embeddings, label_path, show=True): + x_data, y_data = read_node_label(str(label_path)) - model = TSNE(n_components=2) - node_pos = model.fit_transform(emb_list) + embedding_list = np.array([embeddings[node] for node in x_data]) + node_pos = TSNE(n_components=2).fit_transform(embedding_list) color_idx = {} - for i in range(len(X)): - color_idx.setdefault(Y[i][0], []) - color_idx[Y[i][0]].append(i) + for index, label in enumerate(y_data): + color_idx.setdefault(label[0], []) + color_idx[label[0]].append(index) - for c, idx in color_idx.items(): - plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) + for label, indexes in color_idx.items(): + plt.scatter(node_pos[indexes, 0], node_pos[indexes, 1], label=label) plt.legend() - plt.show() + if show: + plt.show() + else: + plt.close() + + +def main(smoke=False, show=True): + graph_path = SMOKE_GRAPH_PATH if smoke else WIKI_GRAPH_PATH + graph = nx.read_edgelist( + str(graph_path), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) + + model = Node2Vec( + graph, + walk_length=3 if smoke else 10, + num_walks=2 if smoke else 80, + p=0.25, + q=4, + workers=1, + use_rejection_sampling=False, + ) + model.train(window_size=2 if smoke else 5, iter=1 if smoke else 3, workers=1) + embeddings = model.get_embeddings() + assert len(embeddings) > 0 + + if not smoke: + evaluate_embeddings(embeddings, WIKI_LABEL_PATH) + plot_embeddings(embeddings, WIKI_LABEL_PATH, show=show) + + return embeddings if __name__ == "__main__": - G=nx.read_edgelist('../data/wiki/Wiki_edgelist.txt', - create_using = nx.DiGraph(), nodetype = None, data = [('weight', int)]) - model = Node2Vec(G, walk_length=10, num_walks=80, - p=0.25, q=4, workers=1, use_rejection_sampling=0) - model.train(window_size = 5, iter = 3) - embeddings=model.get_embeddings() - - evaluate_embeddings(embeddings) - plot_embeddings(embeddings) + main() diff --git a/examples/sdne_wiki.py b/examples/sdne_wiki.py index 9cfc467..09553ae 100644 --- a/examples/sdne_wiki.py +++ b/examples/sdne_wiki.py @@ -1,54 +1,82 @@ - -import numpy as np - -from ge.classify import read_node_label, Classifier -from ge import SDNE -from sklearn.linear_model import LogisticRegression +from pathlib import Path +import sys import matplotlib.pyplot as plt import networkx as nx +import numpy as np +from sklearn.linear_model import LogisticRegression from sklearn.manifold import TSNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not 
in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) -def evaluate_embeddings(embeddings): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - tr_frac = 0.8 - print("Training classifier using {:.2f}% nodes...".format( - tr_frac * 100)) - clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) - clf.split_train_evaluate(X, Y, tr_frac) +try: + from ge import SDNE +except ImportError as exc: + raise ImportError( + "Unable to import SDNE. Use a supported Python/TensorFlow environment " + "(for example Python 3.10-3.12 with tensorflow installed)." + ) from exc +from ge.classify import Classifier, read_node_label +WIKI_GRAPH_PATH = PROJECT_ROOT / "data" / "wiki" / "Wiki_edgelist.txt" +WIKI_LABEL_PATH = PROJECT_ROOT / "data" / "wiki" / "wiki_labels.txt" +SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" -def plot_embeddings(embeddings,): - X, Y = read_node_label('../data/wiki/wiki_labels.txt') - emb_list = [] - for k in X: - emb_list.append(embeddings[k]) - emb_list = np.array(emb_list) +def evaluate_embeddings(embeddings, label_path): + x_data, y_data = read_node_label(str(label_path)) + train_fraction = 0.8 + print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100)) + clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) + clf.split_train_evaluate(x_data, y_data, train_fraction) - model = TSNE(n_components=2) - node_pos = model.fit_transform(emb_list) + +def plot_embeddings(embeddings, label_path, show=True): + x_data, y_data = read_node_label(str(label_path)) + + embedding_list = np.array([embeddings[node] for node in x_data]) + node_pos = TSNE(n_components=2).fit_transform(embedding_list) color_idx = {} - for i in range(len(X)): - color_idx.setdefault(Y[i][0], []) - color_idx[Y[i][0]].append(i) + for index, label in enumerate(y_data): + color_idx.setdefault(label[0], []) + color_idx[label[0]].append(index) - for c, idx in color_idx.items(): - plt.scatter(node_pos[idx, 0], node_pos[idx, 1], - label=c) # c=node_colors) + for label, indexes in color_idx.items(): + plt.scatter(node_pos[indexes, 0], node_pos[indexes, 1], label=label) plt.legend() - plt.show() + if show: + plt.show() + else: + plt.close() -if __name__ == "__main__": - G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) +def main(smoke=False, show=True): + graph_path = SMOKE_GRAPH_PATH if smoke else WIKI_GRAPH_PATH + graph = nx.read_edgelist( + str(graph_path), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = SDNE(G, hidden_size=[256, 128],) - model.train(batch_size=3000, epochs=40, verbose=2) + model = SDNE(graph, hidden_size=[8, 4] if smoke else [256, 128]) + model.train( + batch_size=2 if smoke else 3000, + epochs=1 if smoke else 40, + verbose=0 if smoke else 2, + ) embeddings = model.get_embeddings() + assert len(embeddings) > 0 + + if not smoke: + evaluate_embeddings(embeddings, WIKI_LABEL_PATH) + plot_embeddings(embeddings, WIKI_LABEL_PATH, show=show) - evaluate_embeddings(embeddings) - plot_embeddings(embeddings) + return embeddings + + +if __name__ == "__main__": + main() diff --git a/examples/struc2vec_flight.py b/examples/struc2vec_flight.py index 8863675..dedb578 100644 --- a/examples/struc2vec_flight.py +++ b/examples/struc2vec_flight.py @@ -1,88 +1,87 @@ -import numpy as np - - - -from ge.classify import read_node_label,Classifier - -from ge import Struc2Vec - -from sklearn.linear_model import LogisticRegression - - +from pathlib import Path +import 
sys +import tempfile import matplotlib.pyplot as plt - import networkx as nx - +import numpy as np +from sklearn.linear_model import LogisticRegression from sklearn.manifold import TSNE +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) +from ge import Struc2Vec +from ge.classify import Classifier, read_node_label -def evaluate_embeddings(embeddings): - - X, Y = read_node_label('../data/flight/labels-brazil-airports.txt',skip_head=True) - - tr_frac = 0.8 - - print("Training classifier using {:.2f}% nodes...".format( +FLIGHT_GRAPH_PATH = PROJECT_ROOT / "data" / "flight" / "brazil-airports.edgelist" +FLIGHT_LABEL_PATH = PROJECT_ROOT / "data" / "flight" / "labels-brazil-airports.txt" +SMOKE_GRAPH_PATH = PROJECT_ROOT / "tests" / "Wiki_edgelist.txt" - tr_frac * 100)) +def evaluate_embeddings(embeddings, label_path): + x_data, y_data = read_node_label(str(label_path), skip_head=True) + train_fraction = 0.8 + print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100)) clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) - - clf.split_train_evaluate(X, Y, tr_frac) - - - + clf.split_train_evaluate(x_data, y_data, train_fraction) -def plot_embeddings(embeddings,): - - X, Y = read_node_label('../data/flight/labels-brazil-airports.txt',skip_head=True) - - - - emb_list = [] - - for k in X: - - emb_list.append(embeddings[k]) - - emb_list = np.array(emb_list) - - - - model = TSNE(n_components=2) - - node_pos = model.fit_transform(emb_list) - +def plot_embeddings(embeddings, label_path, show=True): + x_data, y_data = read_node_label(str(label_path), skip_head=True) + embedding_list = np.array([embeddings[node] for node in x_data]) + node_pos = TSNE(n_components=2).fit_transform(embedding_list) color_idx = {} + for index, label in enumerate(y_data): + color_idx.setdefault(label[0], []) + color_idx[label[0]].append(index) - for i in range(len(X)): - - color_idx.setdefault(Y[i][0], []) - - color_idx[Y[i][0]].append(i) - - - - for c, idx in color_idx.items(): - - plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) # c=node_colors) - + for label, indexes in color_idx.items(): + plt.scatter(node_pos[indexes, 0], node_pos[indexes, 1], label=label) plt.legend() + if show: + plt.show() + else: + plt.close() + + +def main(smoke=False, show=True): + graph_path = SMOKE_GRAPH_PATH if smoke else FLIGHT_GRAPH_PATH + graph = nx.read_edgelist( + str(graph_path), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) + + with tempfile.TemporaryDirectory(prefix="struc2vec-") as temp_dir: + model = Struc2Vec( + graph, + walk_length=3 if smoke else 10, + num_walks=1 if smoke else 80, + workers=1 if smoke else 4, + verbose=0 if smoke else 40, + temp_path=temp_dir + "/", + ) + model.train( + embed_size=8 if smoke else 128, + window_size=2 if smoke else 5, + workers=1, + iter=1 if smoke else 3, + ) + embeddings = model.get_embeddings() + + assert len(embeddings) > 0 + + if not smoke: + evaluate_embeddings(embeddings, FLIGHT_LABEL_PATH) + plot_embeddings(embeddings, FLIGHT_LABEL_PATH, show=show) + + return embeddings - plt.show() if __name__ == "__main__": - G = nx.read_edgelist('../data/flight/brazil-airports.edgelist', create_using=nx.DiGraph(), nodetype=None, - data=[('weight', int)]) - - model = Struc2Vec(G, 10, 80, workers=4, verbose=40, ) - model.train() - embeddings = model.get_embeddings() - - evaluate_embeddings(embeddings) - plot_embeddings(embeddings) \ No newline at end of file + 
main() diff --git a/ge/__init__.py b/ge/__init__.py index cf4f59d..89d2335 100644 --- a/ge/__init__.py +++ b/ge/__init__.py @@ -1 +1,8 @@ -from .models import * \ No newline at end of file +from .alias import alias_sample, create_alias_table + +__all__ = ["alias_sample", "create_alias_table"] + +try: + from .models import * # noqa: F401,F403 +except ImportError: + pass diff --git a/ge/models/__init__.py b/ge/models/__init__.py index d2375e9..c008e9f 100644 --- a/ge/models/__init__.py +++ b/ge/models/__init__.py @@ -1,8 +1,19 @@ from .deepwalk import DeepWalk from .node2vec import Node2Vec -from .line import LINE -from .sdne import SDNE from .struc2vec import Struc2Vec +__all__ = ["DeepWalk", "Node2Vec", "Struc2Vec"] -__all__ = ["DeepWalk", "Node2Vec", "LINE", "SDNE", "Struc2Vec"] +try: + from .line import LINE + + __all__.append("LINE") +except ImportError: + LINE = None + +try: + from .sdne import SDNE + + __all__.append("SDNE") +except ImportError: + SDNE = None diff --git a/ge/models/line.py b/ge/models/line.py index 993a5aa..6bae314 100644 --- a/ge/models/line.py +++ b/ge/models/line.py @@ -21,17 +21,17 @@ import random import numpy as np -from deepctr.layers.utils import reduce_sum -from tensorflow.python.keras import backend as K -from tensorflow.python.keras.layers import Embedding, Input, Lambda -from tensorflow.python.keras.models import Model +import tensorflow as tf +from tensorflow.keras import backend as K +from tensorflow.keras.layers import Embedding, Input, Lambda +from tensorflow.keras.models import Model from ..alias import create_alias_table, alias_sample from ..utils import preprocess_nxgraph def line_loss(y_true, y_pred): - return -K.mean(K.log(K.sigmoid(y_true * y_pred))) + return -K.mean(tf.math.log_sigmoid(y_true * y_pred)) def create_model(numNodes, embedding_size, order='second'): @@ -48,10 +48,14 @@ def create_model(numNodes, embedding_size, order='second'): v_i_emb_second = second_emb(v_i) v_j_context_emb = context_emb(v_j) - first = Lambda(lambda x: reduce_sum( - x[0] * x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb]) - second = Lambda(lambda x: reduce_sum( - x[0] * x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb]) + first = Lambda( + lambda x: tf.reduce_sum(x[0] * x[1], axis=-1, keepdims=False), + name='first_order', + )([v_i_emb, v_j_emb]) + second = Lambda( + lambda x: tf.reduce_sum(x[0] * x[1], axis=-1, keepdims=False), + name='second_order', + )([v_i_emb_second, v_j_context_emb]) if order == 'first': output_list = [first] @@ -162,18 +166,20 @@ def batch_iter(self, node2idx): cur_t = edges[shuffle_indices[i]][1] h.append(cur_h) t.append(cur_t) - sign = np.ones(len(h)) + sign = np.ones(len(h), dtype=np.float32) else: - sign = np.ones(len(h)) * -1 + sign = np.ones(len(h), dtype=np.float32) * -1 t = [] for i in range(len(h)): t.append(alias_sample( self.node_accept, self.node_alias)) + heads = np.asarray(h, dtype=np.int32) + tails = np.asarray(t, dtype=np.int32) if self.order == 'all': - yield ([np.array(h), np.array(t)], [sign, sign]) + yield ((heads, tails), (sign, sign)) else: - yield ([np.array(h), np.array(t)], [sign]) + yield ((heads, tails), (sign,)) mod += 1 mod %= mod_size if mod == 0: @@ -205,8 +211,23 @@ def get_embeddings(self, ): def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1): self.reset_training_config(batch_size, times) - hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch, - steps_per_epoch=self.steps_per_epoch, 
- verbose=verbose) + try: + hist = self.model.fit( + self.batch_it, + epochs=epochs, + initial_epoch=initial_epoch, + steps_per_epoch=self.steps_per_epoch, + verbose=verbose, + ) + except TypeError: + if not hasattr(self.model, "fit_generator"): + raise + hist = self.model.fit_generator( + self.batch_it, + epochs=epochs, + initial_epoch=initial_epoch, + steps_per_epoch=self.steps_per_epoch, + verbose=verbose, + ) return hist diff --git a/ge/models/sdne.py b/ge/models/sdne.py index 923586d..4e55bb0 100644 --- a/ge/models/sdne.py +++ b/ge/models/sdne.py @@ -22,39 +22,43 @@ import numpy as np import scipy.sparse as sp import tensorflow as tf -from tensorflow.python.keras import backend as K -from tensorflow.python.keras.callbacks import History -from tensorflow.python.keras.layers import Dense, Input -from tensorflow.python.keras.models import Model -from tensorflow.python.keras.regularizers import l1_l2 +from tensorflow.keras.callbacks import History +from tensorflow.keras.layers import Dense, Input +from tensorflow.keras.models import Model +from tensorflow.keras.regularizers import l1_l2 from ..utils import preprocess_nxgraph def l_2nd(beta): def loss_2nd(y_true, y_pred): - b_ = np.ones_like(y_true) - b_[y_true != 0] = beta - x = K.square((y_true - y_pred) * b_) - t = K.sum(x, axis=-1, ) - return K.mean(t) + beta_weight = tf.cast(beta, y_true.dtype) + ones = tf.ones_like(y_true) + b_ = tf.where(tf.not_equal(y_true, 0), beta_weight * ones, ones) + x = tf.square((y_true - y_pred) * b_) + return tf.reduce_mean(tf.reduce_sum(x, axis=-1)) return loss_2nd def l_1st(alpha): def loss_1st(y_true, y_pred): - L = y_true - Y = y_pred - batch_size = tf.to_float(K.shape(L)[0]) - return alpha * 2 * tf.linalg.trace(tf.matmul(tf.matmul(Y, L, transpose_a=True), Y)) / batch_size + laplacian = y_true + embeddings = y_pred + batch_size = tf.cast(tf.shape(laplacian)[0], embeddings.dtype) + alpha_weight = tf.cast(alpha, embeddings.dtype) + return ( + alpha_weight + * 2.0 + * tf.linalg.trace(tf.matmul(tf.matmul(embeddings, laplacian, transpose_a=True), embeddings)) + / batch_size + ) return loss_1st def create_model(node_size, hidden_size=[256, 128], l1=1e-5, l2=1e-4): A = Input(shape=(node_size,)) - L = Input(shape=(None,)) fc = A for i in range(len(hidden_size)): if i == len(hidden_size) - 1: @@ -69,7 +73,7 @@ def create_model(node_size, hidden_size=[256, 128], l1=1e-5, l2=1e-4): kernel_regularizer=l1_l2(l1, l2))(fc) A_ = Dense(node_size, 'relu', name='2nd')(fc) - model = Model(inputs=[A, L], outputs=[A_, Y]) + model = Model(inputs=A, outputs=[A_, Y]) emb = Model(inputs=A, outputs=Y) return model, emb @@ -90,7 +94,6 @@ def __init__(self, graph, hidden_size=[32, 16], alpha=1e-6, beta=5., nu1=1e-5, n self.A, self.L = _create_A_L(self.graph, self.node2idx) # Adj Matrix,L Matrix self.reset_model() - self.inputs = [self.A, self.L] self._embeddings = {} def reset_model(self, opt='adam'): @@ -101,17 +104,26 @@ def reset_model(self, opt='adam'): self.get_embeddings() def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1): + adjacency = self.A.toarray().astype(np.float32) + laplacian = self.L.toarray().astype(np.float32) if batch_size >= self.node_size: if batch_size > self.node_size: print('batch_size({0}) > node_size({1}),set batch_size = {1}'.format( batch_size, self.node_size)) batch_size = self.node_size - return self.model.fit([self.A.todense(), self.L.todense()], [self.A.todense(), self.L.todense()], - batch_size=batch_size, epochs=epochs, initial_epoch=initial_epoch, verbose=verbose, - 
shuffle=False, ) + return self.model.fit( + adjacency, + [adjacency, laplacian], + batch_size=batch_size, + epochs=epochs, + initial_epoch=initial_epoch, + verbose=verbose, + shuffle=False, + ) else: steps_per_epoch = (self.node_size - 1) // batch_size + 1 hist = History() + hist.set_model(self.model) hist.on_train_begin() logs = {} for epoch in range(initial_epoch, epochs): @@ -120,10 +132,9 @@ def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1): for i in range(steps_per_epoch): index = np.arange( i * batch_size, min((i + 1) * batch_size, self.node_size)) - A_train = self.A[index, :].todense() - L_mat_train = self.L[index][:, index].todense() - inp = [A_train, L_mat_train] - batch_losses = self.model.train_on_batch(inp, inp) + A_train = adjacency[index, :] + L_mat_train = laplacian[index][:, index] + batch_losses = np.asarray(self.model.train_on_batch(A_train, [A_train, L_mat_train])) losses += batch_losses losses = losses / steps_per_epoch @@ -139,11 +150,14 @@ def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1): return hist def evaluate(self, ): - return self.model.evaluate(x=self.inputs, y=self.inputs, batch_size=self.node_size) + adjacency = self.A.toarray().astype(np.float32) + laplacian = self.L.toarray().astype(np.float32) + return self.model.evaluate(x=adjacency, y=[adjacency, laplacian], batch_size=self.node_size) def get_embeddings(self): self._embeddings = {} - embeddings = self.emb_model.predict(self.A.todense(), batch_size=self.node_size) + adjacency = self.A.toarray().astype(np.float32) + embeddings = self.emb_model.predict(adjacency, batch_size=self.node_size, verbose=0) look_back = self.idx2node for i, embedding in enumerate(embeddings): self._embeddings[look_back[i]] = embedding diff --git a/setup.py b/setup.py index 616afb6..990b387 100644 --- a/setup.py +++ b/setup.py @@ -2,55 +2,39 @@ with open("README.md", "r") as fh: - long_description = fh.read() REQUIRED_PACKAGES = [ - # 'tensorflow>=1.4.0', - 'gensim>=4.0.0', - 'networkx', - 'joblib', - 'fastdtw', - 'tqdm', - 'numpy', - 'scikit-learn', - 'pandas', - 'matplotlib', - 'deepctr' + "gensim>=4.0.0", + "networkx", + "joblib", + "fastdtw", + "tqdm", + "numpy", + "scikit-learn", + "pandas", + "matplotlib", ] setuptools.setup( - name="ge", - - version="0.0.0", - + version="0.1.0", author="Weichen Shen", - author_email="weichenswc@163.com", - url="https://github.com/shenweichen/GraphEmbedding", - packages=setuptools.find_packages(exclude=[]), - - python_requires='>=3.5', # 3.4.6 - + python_requires=">=3.7", install_requires=REQUIRED_PACKAGES, - extras_require={ - - "cpu": ['tensorflow>=1.4.0,!=1.7.*,!=1.8.*'], - - "gpu": ['tensorflow-gpu>=1.4.0,!=1.7.*,!=1.8.*'], - - }, - - entry_points={ - + "tf": ["tensorflow>=1.15.5"], + "test": [ + "pytest>=7.0.0", + "pytest-cov>=4.0.0", + "python-coveralls>=2.9.3", + ], }, + entry_points={}, license="MIT license", - - ) diff --git a/tests/deepwalk_test.py b/tests/deepwalk_test.py index 10a83a6..d0c034d 100644 --- a/tests/deepwalk_test.py +++ b/tests/deepwalk_test.py @@ -1,15 +1,28 @@ +from pathlib import Path + import networkx as nx +import pytest +pytest.importorskip("gensim") +pytest.importorskip("pandas") from ge import DeepWalk +TEST_GRAPH_PATH = Path(__file__).resolve().parent / "Wiki_edgelist.txt" + def test_DeepWalk(): - G = nx.read_edgelist('./tests/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) + graph = nx.read_edgelist( + str(TEST_GRAPH_PATH), + create_using=nx.DiGraph(), + nodetype=None, + 
data=[("weight", int)], + ) - model = DeepWalk(G, walk_length=3, num_walks=2, workers=1) - model.train(window_size=3, iter=1) + model = DeepWalk(graph, walk_length=3, num_walks=2, workers=1) + model.train(embed_size=8, window_size=2, iter=1, workers=1) embeddings = model.get_embeddings() + assert len(embeddings) == graph.number_of_nodes() + assert all(len(vector) == 8 for vector in embeddings.values()) if __name__ == "__main__": diff --git a/tests/examples_test.py b/tests/examples_test.py new file mode 100644 index 0000000..aec32ed --- /dev/null +++ b/tests/examples_test.py @@ -0,0 +1,47 @@ +import importlib.util +from pathlib import Path + +import pytest + +EXAMPLES_DIR = Path(__file__).resolve().parents[1] / "examples" +EXAMPLE_FILES = [ + "alias.py", + "deepwalk_wiki.py", + "line_wiki.py", + "node2vec_flight.py", + "node2vec_wiki.py", + "sdne_wiki.py", + "struc2vec_flight.py", +] +TF_EXAMPLES = {"line_wiki.py", "sdne_wiki.py"} +GENSIM_EXAMPLES = {"deepwalk_wiki.py", "node2vec_flight.py", "node2vec_wiki.py", "struc2vec_flight.py"} + + +def load_example_module(example_file): + module_path = EXAMPLES_DIR / example_file + spec = importlib.util.spec_from_file_location(f"example_{module_path.stem}", module_path) + assert spec is not None and spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +@pytest.mark.parametrize("example_file", EXAMPLE_FILES) +def test_examples_smoke(example_file): + if example_file in TF_EXAMPLES: + pytest.importorskip("tensorflow") + if example_file in GENSIM_EXAMPLES: + pytest.importorskip("gensim") + pytest.importorskip("pandas") + if example_file == "struc2vec_flight.py": + pytest.importorskip("fastdtw") + + module = load_example_module(example_file) + result = module.main(smoke=True, show=False) + + if isinstance(result, dict): + assert len(result) > 0 + elif isinstance(result, tuple): + assert all(item is not None for item in result) + else: + assert result is not None diff --git a/tests/line_test.py b/tests/line_test.py index 2b2e2b7..832320d 100644 --- a/tests/line_test.py +++ b/tests/line_test.py @@ -1,15 +1,27 @@ +from pathlib import Path + import networkx as nx +import pytest +pytest.importorskip("tensorflow") from ge import LINE +TEST_GRAPH_PATH = Path(__file__).resolve().parent / "Wiki_edgelist.txt" + def test_LINE(): - G = nx.read_edgelist('./tests/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) + graph = nx.read_edgelist( + str(TEST_GRAPH_PATH), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = LINE(G, embedding_size=2, order='second') - model.train(batch_size=2, epochs=1, verbose=2) + model = LINE(graph, embedding_size=4, order="second") + model.train(batch_size=2, epochs=1, verbose=0) embeddings = model.get_embeddings() + assert len(embeddings) == graph.number_of_nodes() + assert all(len(vector) == 4 for vector in embeddings.values()) if __name__ == "__main__": diff --git a/tests/node2vec_test.py b/tests/node2vec_test.py index 3ca9756..26215c1 100644 --- a/tests/node2vec_test.py +++ b/tests/node2vec_test.py @@ -1,21 +1,39 @@ +from pathlib import Path + import networkx as nx import pytest +pytest.importorskip("gensim") +pytest.importorskip("pandas") from ge import Node2Vec +TEST_GRAPH_PATH = Path(__file__).resolve().parent / "Wiki_edgelist.txt" + @pytest.mark.parametrize( - 'use_rejection_sampling', - [True, False - ] + "use_rejection_sampling", + [True, False], ) def 
test_Node2Vec(use_rejection_sampling): - G = nx.read_edgelist('./tests/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) - model = Node2Vec(G, walk_length=10, num_walks=80, - p=0.25, q=4, workers=1, use_rejection_sampling=use_rejection_sampling) - model.train(window_size=5, iter=3) + graph = nx.read_edgelist( + str(TEST_GRAPH_PATH), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) + model = Node2Vec( + graph, + walk_length=3, + num_walks=2, + p=0.25, + q=4, + workers=1, + use_rejection_sampling=use_rejection_sampling, + ) + model.train(embed_size=8, window_size=2, iter=1, workers=1) embeddings = model.get_embeddings() + assert len(embeddings) == graph.number_of_nodes() + assert all(len(vector) == 8 for vector in embeddings.values()) if __name__ == "__main__": diff --git a/tests/sdne_test.py b/tests/sdne_test.py index 5393414..1dac226 100644 --- a/tests/sdne_test.py +++ b/tests/sdne_test.py @@ -1,18 +1,27 @@ +from pathlib import Path + import networkx as nx -import tensorflow as tf +import pytest +pytest.importorskip("tensorflow") from ge import SDNE +TEST_GRAPH_PATH = Path(__file__).resolve().parent / "Wiki_edgelist.txt" + def test_SDNE(): - if tf.__version__ >= '1.15.0': - return #todo - G = nx.read_edgelist('./tests/Wiki_edgelist.txt', - create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) + graph = nx.read_edgelist( + str(TEST_GRAPH_PATH), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = SDNE(G, hidden_size=[8, 4], ) - model.train(batch_size=2, epochs=1, verbose=2) + model = SDNE(graph, hidden_size=[8, 4]) + model.train(batch_size=2, epochs=1, verbose=0) embeddings = model.get_embeddings() + assert len(embeddings) == graph.number_of_nodes() + assert all(len(vector) == 4 for vector in embeddings.values()) if __name__ == "__main__": diff --git a/tests/struct2vec_test.py b/tests/struct2vec_test.py index 4bf408e..ce3685d 100644 --- a/tests/struct2vec_test.py +++ b/tests/struct2vec_test.py @@ -1,15 +1,38 @@ +from pathlib import Path +import tempfile + import networkx as nx +import pytest +pytest.importorskip("fastdtw") +pytest.importorskip("gensim") +pytest.importorskip("pandas") from ge import Struc2Vec +TEST_GRAPH_PATH = Path(__file__).resolve().parent / "Wiki_edgelist.txt" + def test_Struc2Vec(): - G = nx.read_edgelist('./tests/Wiki_edgelist.txt', create_using=nx.DiGraph(), nodetype=None, - data=[('weight', int)]) + graph = nx.read_edgelist( + str(TEST_GRAPH_PATH), + create_using=nx.DiGraph(), + nodetype=None, + data=[("weight", int)], + ) - model = Struc2Vec(G, 3, 1, workers=1, verbose=40, ) - model.train() - embeddings = model.get_embeddings() + with tempfile.TemporaryDirectory(prefix="struc2vec-test-") as temp_dir: + model = Struc2Vec( + graph, + walk_length=3, + num_walks=1, + workers=1, + verbose=0, + temp_path=temp_dir + "/", + ) + model.train(embed_size=8, window_size=2, workers=1, iter=1) + embeddings = model.get_embeddings() + assert len(embeddings) == graph.number_of_nodes() + assert all(len(vector) == 8 for vector in embeddings.values()) if __name__ == "__main__":