From 1269b1b52b166c75b7bb714a5cdf3a12a895103e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Thu, 4 May 2017 15:40:32 +0000 Subject: [PATCH 01/23] plot filters --- nips2016/mnist.ipynb | 120 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 110 insertions(+), 10 deletions(-) diff --git a/nips2016/mnist.ipynb b/nips2016/mnist.ipynb index 8d594f6..73846be 100644 --- a/nips2016/mnist.ipynb +++ b/nips2016/mnist.ipynb @@ -116,7 +116,7 @@ "val_data = coarsening.perm_data(val_data, perm)\n", "test_data = coarsening.perm_data(test_data, perm)\n", "print('Execution time: {:.2f}s'.format(time.process_time() - t_start))\n", - "del perm" + "#del perm" ] }, { @@ -164,10 +164,10 @@ "source": [ "common = {}\n", "common['dir_name'] = 'mnist/'\n", - "common['num_epochs'] = 20\n", + "common['num_epochs'] = 4 #20\n", "common['batch_size'] = 100\n", "common['decay_steps'] = mnist.train.num_examples / common['batch_size']\n", - "common['eval_frequency'] = 30 * common['num_epochs']\n", + "common['eval_frequency'] = 100 #30 * common['num_epochs']\n", "common['brelu'] = 'b1relu'\n", "common['pool'] = 'mpool1'\n", "C = max(mnist.train.labels) + 1 # number of classes\n", @@ -183,7 +183,7 @@ }, "outputs": [], "source": [ - "if True:\n", + "if False:\n", " name = 'softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", @@ -228,13 +228,14 @@ }, "outputs": [], "source": [ - "if True:\n", + "if False:\n", " name = 'fgconv_softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", " params['filter'] = 'fourier'\n", " params['K'] = [L[0].shape[0]]\n", - " model_perf.test(models.cgcnn(L, **params), name, params,\n", + " model_f = models.cgcnn(L, **params)\n", + " model_perf.test(model_f, name, params,\n", " train_data, train_labels, val_data, val_labels, test_data, test_labels)" ] }, @@ -246,12 +247,13 @@ }, "outputs": [], "source": [ - "if True:\n", + "if False:\n", " name = 'sgconv_softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", " params['filter'] = 'spline'\n", - " model_perf.test(models.cgcnn(L, **params), name, params,\n", + " model_s = models.cgcnn(L, **params)\n", + " model_perf.test(model_s, name, params,\n", " train_data, train_labels, val_data, val_labels, test_data, test_labels)" ] }, @@ -264,14 +266,15 @@ "outputs": [], "source": [ "# With 'chebyshev2' and 'b2relu', it corresponds to cgcnn2_2(L[0], F=10, K=20).\n", - "if True:\n", + "if False:\n", " name = 'cgconv_softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", " params['filter'] = 'chebyshev5'\n", "# params['filter'] = 'chebyshev2'\n", "# params['brelu'] = 'b2relu'\n", - " model_perf.test(models.cgcnn(L, **params), name, params,\n", + " model_c = models.cgcnn(L, **params)\n", + " model_perf.test(model_c, name, params,\n", " train_data, train_labels, val_data, val_labels, test_data, test_labels)" ] }, @@ -361,6 +364,103 @@ "model_perf.show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def plot_filters(coeffs):\n", + " fig = plt.figure(figsize=(15,5))\n", + " ax = fig.add_subplot(1,1,1)\n", + " for coeff in coeffs:\n", + " c = eval(coeff)\n", + " label = '{}: L={:1.2e}, |dL|={:1.2e}'.format(coeff, L(c), np.linalg.norm(dL(X,Y,c)))\n", + " ax.plot(lamb, c, '.-', label=label)\n", + "# np.testing.assert_allclose(np.linalg.norm(c)**2, E, rtol=1e-2)\n", + " ax.set_xlim(lamb[0], lamb[-1])\n", + " ax.set_title('Filter coefficients, M={}, N={}, 
eps={}'.format(M, N, eps))\n", + " ax.set_xlabel('frequency')\n", + " ax.set_ylabel('amplitude')\n", + " ax.legend(loc='best')\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "a = np.array([1,2,3])\n", + "print(a[[2,0,1]][:2])\n", + "\n", + "print(mnist.train.labels[0])\n", + "plt.imshow(mnist.train.images[0].reshape(28,28))\n", + "\n", + "a = np.random.permutation(range(len(perm)))\n", + "b = a[perm]\n", + "c = b[idx]\n", + "assert np.all(a == c)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model = model_f\n", + "\n", + "sess = tf.Session(graph=model.graph)\n", + "filename = tf.train.latest_checkpoint(os.path.join('..', 'checkpoints', model.dir_name))\n", + "model.op_saver.restore(sess, filename)\n", + "var = model.graph.get_tensor_by_name('conv1/weights' + ':0')\n", + "val = sess.run(var)\n", + "sess.close()\n", + "\n", + "lamb, U = graph.fourier(L[0])\n", + "\n", + "#filters = model_f.get_var('conv1/weights')\n", + "filters = val\n", + "#filt_fourier = filt\n", + "\n", + "i = 6\n", + "\n", + "print(filters.shape)\n", + "plt.figure(figsize=(15,5))\n", + "plt.plot(lamb, filters[:,i,0])\n", + "\n", + "print(lamb[0], lamb[-1])\n", + "\n", + "filt = U.dot(filters[:,i,0])\n", + "plt.figure(figsize=(15,5))\n", + "plt.plot(filt)\n", + "\n", + "print(len(lamb)-28**2)\n", + "indices = np.array(perm) >= 28**2\n", + "\n", + "print(train_data[0,indices])\n", + "\n", + "idx = np.argsort(perm)\n", + "filt = filt[idx]\n", + "plt.figure(figsize=(15,5))\n", + "plt.plot(train_data[0,perm])\n", + "\n", + "plt.figure(figsize=(15,5))\n", + "img = train_data[0,idx][:28**2].reshape(28,28)\n", + "plt.imshow(train_data[0,idx][:28**2].reshape(28,28))\n", + "assert np.allclose(train_data[0,idx][:28**2].reshape(28,28), mnist.train.images[0].reshape(28,28))" + ] + }, { "cell_type": "code", "execution_count": null, From a1aaf9b73a22dcb46310b1e2f440a74eb23a05e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Thu, 4 May 2017 15:53:58 +0000 Subject: [PATCH 02/23] gitignore: local configuration --- .gitignore | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index d170d81..0dc54b7 100644 --- a/.gitignore +++ b/.gitignore @@ -5,11 +5,13 @@ __pycache__/ # IPython checkpoints .ipynb_checkpoints/ -# Datasets +# Local configuration +.env +.python-version + +# Data data/ -# Tensorflow summaries +# Tensorflow summaries & model parameters summaries/ - -# Model parameters checkpoints/ From 76cebe61187dae9d0516cc58c2e7a5627f4c4434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Tue, 16 May 2017 13:00:29 +0200 Subject: [PATCH 03/23] makefile: clean notebook JSON --- experiments/makefile | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 experiments/makefile diff --git a/experiments/makefile b/experiments/makefile new file mode 100644 index 0000000..6bfd9c2 --- /dev/null +++ b/experiments/makefile @@ -0,0 +1,18 @@ +NB = $(sort $(wildcard *.ipynb)) + +run: $(NB) + +$(NB): + jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout=-1 $@ + +clean: + @for nb in $(NB); do \ + echo "$$(jq --indent 1 ' \ + .metadata = {} \ + | (.cells[] | select(has("outputs")) | .outputs) = [] \ + | (.cells[] | select(has("execution_count")) | .execution_count) = 
null \ + | .cells[].metadata = {} \ + ' $$nb)" > $$nb; \ + done + +.PHONY: run $(NB) clean From f3064f746b0a2c428ad22c896b5220431cfb3678 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Tue, 16 May 2017 12:29:24 +0000 Subject: [PATCH 04/23] wikipedia: hyperlink graph --- experiments/4_wikipedia_traffic.ipynb | 220 ++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 experiments/4_wikipedia_traffic.ipynb diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb new file mode 100644 index 0000000..73b0a6d --- /dev/null +++ b/experiments/4_wikipedia_traffic.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Wikipedia Traffic +", + " +", + "This notebook aims at modeling user traffic on [Wikipedia](https://wikipedia.org) using a recurrent graph convolutional neural network. +", + " +", + "Goal: anomaly detection. Can be used to detect events in the real world. Other applications: +", + "* intrusion detection on telecomunnication networks, +", + "* anomaly detection on energy networks, +", + "* accident detection on transporation networks. +", + " +", + "Events: Super Bowl, Academy Awards, Grammy, Miss Universe, Golden Globe. Mostly December-February. +", + "Missed: Charlie Hebdo, Ebola +", + " +", + "Network is very large: 5M nodes, 300M edges. Downsampling ideas: +", + "* Choose a category, e.g. science. +", + "* Take most active ones. +", + "* Concatenate in modules / communities / super-nodes. +", + " +", + "Raw data +", + "* [Wikimedia SQL dumps](https://dumps.wikimedia.org/enwiki/), to construct the hyperlink graph. +", + " * Network size: 5M nodes, 300M edges. +", + "* [Pagecounts](https://dumps.wikimedia.org/other/pagecounts-raw/) as activations on the graph. +", + " * Data from 2014-09-23 0h to 2015-06-05 22h. +", + " * 6142 hours in total." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline +", + " +", + "import os +", + " +", + "import numpy as np +", + "import pandas as pd +", + "import matplotlib.pyplot as plt +", + "import seaborn as sns +", + "import graph_tool.all as gt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext dotenv +", + "%dotenv .env +", + " +", + "WIKI_RAW = os.environ.get('WIKI_RAW') # Downloaded from dumps.wikimedia.org. +", + "WIKI_CLEAN = os.environ.get('WIKI_CLEAN') # Processed by Kirell Benzi." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.set_context(\"notebook\", font_scale=1.5) +", + "plt.rcParams['figure.figsize'] = (17, 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1 Hyperlink graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = gt.load_graph(os.path.join(WIKI_CLEAN, 'enwiki-20150403-graph.gt'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g.is_directed() +", + "#g.set_directed(False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('{:.2e} vertices'.format(g.num_vertices())) +", + "print('{:.2e} edges'.format(g.num_edges())) +", + " +", + "g.list_properties()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "idx = 42 +", + "page_title = g.vertex_properties['page_title'][idx] +", + "page_id = g.vertex_properties['page_id'][idx] +", + "print('{}: {}'.format(page_id, page_title))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hist = gt.vertex_hist(g, 'total') +", + "plt.loglog(hist[1][:-1], hist[0]) +", + "plt.xlabel('#edges') +", + "plt.ylabel('#nodes');" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Too large to be drawn in full. +", + "#gt.sfdp_layout +", + "#gt.graph_draw(g)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove uninteresting pages. +", + "#g.set_vertex_filter() +", + "#g.remove_vertex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "A = gt.adjacency(g) +", + "A" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} From 2ca8bc1caff284073eb7f13e5bfacd11aa68334e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Tue, 16 May 2017 19:38:31 +0000 Subject: [PATCH 05/23] wikipedia: pages & page views --- experiments/4_wikipedia_traffic.ipynb | 250 +++++++++++++++++++++++++- 1 file changed, 249 insertions(+), 1 deletion(-) diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb index 73b0a6d..7fb854a 100644 --- a/experiments/4_wikipedia_traffic.ipynb +++ b/experiments/4_wikipedia_traffic.ipynb @@ -44,7 +44,7 @@ ", " * Network size: 5M nodes, 300M edges. ", - "* [Pagecounts](https://dumps.wikimedia.org/other/pagecounts-raw/) as activations on the graph. + "* [Pagecounts](https://dumps.wikimedia.org/other/pagecounts-all-sites/) as activations on the graph. ", " * Data from 2014-09-23 0h to 2015-06-05 22h. ", @@ -62,8 +62,14 @@ " ", "import os +", + "import datetime ", " +", + "import IPython.display as ipd +", + "from tqdm import tqdm_notebook ", "import numpy as np ", @@ -212,6 +218,248 @@ ", "A" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2 Pages +", + " +", + "A lot of pages in `pagecounts` are redirections to actual pages. We need to merge the hits." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filepath = os.path.join(WIKI_CLEAN, 'enwiki-20150403-page-redirect.csv.gz') +", + "redirect = pd.read_csv(filepath, compression='gzip', sep='|', encoding='utf-8', quoting=3, index_col=1) +", + " +", + "redirect.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#assert len(redirect) == len(redirect['page_id'].unique()) +", + "print('{:.2e} unique pages, {:.2e} pages including redirections'.format( +", + " len(redirect['fix_page_id'].unique()), +", + " len(redirect)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "redirect.loc[page_id]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def id2title(page_id): +", + " return redirect.at[page_id, 'fix_page_title'] +", + " #return redirect[redirect['page_id'] == page_id]['fix_page_title'].values[0] +", + "id2title(330)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def find_in_title(string): +", + " +", + " def find(page_title, string): +", + " try: +", + " return string.lower() in page_title.lower() +", + " except: +", + " return False +", + " +", + " #b = redirect['fix_page_title'].apply(find, string=string) +", + " b = redirect['page_title'].apply(find, string=string) +", + " #return redirect[b] +", + " return redirect[b & (redirect['is_redirect'] == 0)] +", + " +", + "#find_in_title('ebola') +", + "find_in_title('zirka')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3 Page views / counts +", + " +", + "Graph has 4M nodes but lot of pages are not seen much. `signal_500.h5` lists only 118k pages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Kirell's signal which includes views when greater than 500. 
+", + "filepath = os.path.join(WIKI_CLEAN, 'signal_500.h5') +", + "signal = pd.read_hdf(filepath, 'data') +", + "signal['count_views'].plot(kind='hist', logy=True) +", + "print(len(signal), len(signal['page_id'].unique()), len(signal['layer'].unique()), signal['count_views'].max()) +", + "signal.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filepath = '../data/wikipedia/activations_all.h5' +", + " +", + "if os.path.exists(filepath): +", + " activations = pd.read_hdf(filepath, 'activations') +", + " +", + "else: +", + " START = datetime.datetime(2014, 9, 23, 2) +", + " #END = datetime.datetime(2014, 9, 24, 2) +", + " END = datetime.datetime(2015, 6, 5, 20) +", + " +", + " activations = pd.DataFrame(columns=pd.date_range(START, END, freq='H')) +", + " +", + " folder = os.path.join(WIKI_CLEAN, 'pagecounts_clean') +", + " for date in tqdm_notebook(activations.columns): +", + " filename = 'pagecounts-{:4d}{:02d}{:02d}-{:02d}0000.csv.gz'.format(date.year, date.month, date.day, date.hour) +", + " filename = os.path.join(folder, filename) +", + " pagecounts = pd.read_csv(filename, compression='gzip', index_col=0, squeeze=True) +", + " #print(len(pagecounts), filename) +", + " print(date) +", + " activations[date] = pagecounts +", + " activations[date] = activations[date].fillna(0).astype(np.int32) +", + " +", + " activations.to_hdf(filepath, 'activations') +", + " +", + "print(activations.shape) +", + "activations.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Predictable fluctuations with unpredictable spikes. Those are outliers. +", + "* Anomalies should be outliers persisting for many hours." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "page_id = 40817806 +", + "page_id = 25 +", + "title = '{} ({})'.format(id2title(page_id), page_id) +", + "activations.loc[page_id].plot(title=title) +", + "plt.ylabel('#hits per hour');" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#activations.plot(kind='hist', logy=True);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TO_REMOVE = [15580374, 42727860] # page ids to remove (Main page, Undefined)" + ] } ], "metadata": {}, From 38c33e0f3c3c627183f92640447e646cf5bf7f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Tue, 16 May 2017 21:40:11 +0200 Subject: [PATCH 06/23] requirements: update --- requirements.txt | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8217513..15ce4fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,22 @@ numpy scipy +pandas +tables scikit-learn matplotlib +seaborn -gensim tensorflow-gpu #tensorflow jupyter ipython + +python-dotenv +tqdm + +# Only needed for NLP experiments. +gensim + +# Cannot be installed with pip. 
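+# For example, install it from your distribution's packages or via conda instead
+# (see the graph-tool installation instructions).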
+#graph-tool From 4c4f2b510f86f102763e26ffc33c2f9ec6c32bc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Tue, 16 May 2017 20:10:29 +0000 Subject: [PATCH 07/23] makefile: some echo implementations interpret \n --- experiments/4_wikipedia_traffic.ipynb | 347 +++++++++----------------- experiments/makefile | 2 +- 2 files changed, 117 insertions(+), 232 deletions(-) diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb index 7fb854a..83e18df 100644 --- a/experiments/4_wikipedia_traffic.ipynb +++ b/experiments/4_wikipedia_traffic.ipynb @@ -4,50 +4,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Wikipedia Traffic -", - " -", - "This notebook aims at modeling user traffic on [Wikipedia](https://wikipedia.org) using a recurrent graph convolutional neural network. -", - " -", - "Goal: anomaly detection. Can be used to detect events in the real world. Other applications: -", - "* intrusion detection on telecomunnication networks, -", - "* anomaly detection on energy networks, -", - "* accident detection on transporation networks. -", - " -", - "Events: Super Bowl, Academy Awards, Grammy, Miss Universe, Golden Globe. Mostly December-February. -", - "Missed: Charlie Hebdo, Ebola -", - " -", - "Network is very large: 5M nodes, 300M edges. Downsampling ideas: -", - "* Choose a category, e.g. science. -", - "* Take most active ones. -", - "* Concatenate in modules / communities / super-nodes. -", - " -", - "Raw data -", - "* [Wikimedia SQL dumps](https://dumps.wikimedia.org/enwiki/), to construct the hyperlink graph. -", - " * Network size: 5M nodes, 300M edges. -", - "* [Pagecounts](https://dumps.wikimedia.org/other/pagecounts-all-sites/) as activations on the graph. -", - " * Data from 2014-09-23 0h to 2015-06-05 22h. -", + "# Wikipedia Traffic\n", + "\n", + "This notebook aims at modeling user traffic on [Wikipedia](https://wikipedia.org) using a recurrent graph convolutional neural network.\n", + "\n", + "Goal: anomaly detection. Can be used to detect events in the real world. Other applications:\n", + "* intrusion detection on telecomunnication networks,\n", + "* anomaly detection on energy networks,\n", + "* accident detection on transporation networks.\n", + "\n", + "Events: Super Bowl, Academy Awards, Grammy, Miss Universe, Golden Globe. Mostly December-February.\n", + "Missed: Charlie Hebdo, Ebola\n", + "\n", + "Network is very large: 5M nodes, 300M edges. Downsampling ideas:\n", + "* Choose a category, e.g. science.\n", + "* Take most active ones.\n", + "* Concatenate in modules / communities / super-nodes.\n", + "\n", + "Raw data\n", + "* [Wikimedia SQL dumps](https://dumps.wikimedia.org/enwiki/), to construct the hyperlink graph.\n", + " * Network size: 5M nodes, 300M edges.\n", + "* [Pagecounts](https://dumps.wikimedia.org/other/pagecounts-all-sites/) as activations on the graph.\n", + " * Data from 2014-09-23 0h to 2015-06-05 22h.\n", " * 6142 hours in total." 
] }, @@ -57,28 +35,17 @@ "metadata": {}, "outputs": [], "source": [ - "%matplotlib inline -", - " -", - "import os -", - "import datetime -", - " -", - "import IPython.display as ipd -", - "from tqdm import tqdm_notebook -", - "import numpy as np -", - "import pandas as pd -", - "import matplotlib.pyplot as plt -", - "import seaborn as sns -", + "%matplotlib inline\n", + "\n", + "import os\n", + "import datetime\n", + "\n", + "import IPython.display as ipd\n", + "from tqdm import tqdm_notebook\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", "import graph_tool.all as gt" ] }, @@ -88,14 +55,10 @@ "metadata": {}, "outputs": [], "source": [ - "%load_ext dotenv -", - "%dotenv .env -", - " -", - "WIKI_RAW = os.environ.get('WIKI_RAW') # Downloaded from dumps.wikimedia.org. -", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "WIKI_RAW = os.environ.get('WIKI_RAW') # Downloaded from dumps.wikimedia.org.\n", "WIKI_CLEAN = os.environ.get('WIKI_CLEAN') # Processed by Kirell Benzi." ] }, @@ -105,8 +68,7 @@ "metadata": {}, "outputs": [], "source": [ - "sns.set_context(\"notebook\", font_scale=1.5) -", + "sns.set_context(\"notebook\", font_scale=1.5)\n", "plt.rcParams['figure.figsize'] = (17, 5)" ] }, @@ -132,8 +94,7 @@ "metadata": {}, "outputs": [], "source": [ - "g.is_directed() -", + "g.is_directed()\n", "#g.set_directed(False)" ] }, @@ -143,12 +104,9 @@ "metadata": {}, "outputs": [], "source": [ - "print('{:.2e} vertices'.format(g.num_vertices())) -", - "print('{:.2e} edges'.format(g.num_edges())) -", - " -", + "print('{:.2e} vertices'.format(g.num_vertices()))\n", + "print('{:.2e} edges'.format(g.num_edges()))\n", + "\n", "g.list_properties()" ] }, @@ -158,12 +116,9 @@ "metadata": {}, "outputs": [], "source": [ - "idx = 42 -", - "page_title = g.vertex_properties['page_title'][idx] -", - "page_id = g.vertex_properties['page_id'][idx] -", + "idx = 42\n", + "page_title = g.vertex_properties['page_title'][idx]\n", + "page_id = g.vertex_properties['page_id'][idx]\n", "print('{}: {}'.format(page_id, page_title))" ] }, @@ -173,12 +128,9 @@ "metadata": {}, "outputs": [], "source": [ - "hist = gt.vertex_hist(g, 'total') -", - "plt.loglog(hist[1][:-1], hist[0]) -", - "plt.xlabel('#edges') -", + "hist = gt.vertex_hist(g, 'total')\n", + "plt.loglog(hist[1][:-1], hist[0])\n", + "plt.xlabel('#edges')\n", "plt.ylabel('#nodes');" ] }, @@ -188,10 +140,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Too large to be drawn in full. -", - "#gt.sfdp_layout -", + "# Too large to be drawn in full.\n", + "#gt.sfdp_layout\n", "#gt.graph_draw(g)" ] }, @@ -201,10 +151,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Remove uninteresting pages. -", - "#g.set_vertex_filter() -", + "# Remove uninteresting pages.\n", + "#g.set_vertex_filter()\n", "#g.remove_vertex" ] }, @@ -214,8 +162,7 @@ "metadata": {}, "outputs": [], "source": [ - "A = gt.adjacency(g) -", + "A = gt.adjacency(g)\n", "A" ] }, @@ -223,10 +170,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2 Pages -", - " -", + "## 2 Pages\n", + "\n", "A lot of pages in `pagecounts` are redirections to actual pages. We need to merge the hits." 
] }, @@ -236,12 +181,9 @@ "metadata": {}, "outputs": [], "source": [ - "filepath = os.path.join(WIKI_CLEAN, 'enwiki-20150403-page-redirect.csv.gz') -", - "redirect = pd.read_csv(filepath, compression='gzip', sep='|', encoding='utf-8', quoting=3, index_col=1) -", - " -", + "filepath = os.path.join(WIKI_CLEAN, 'enwiki-20150403-page-redirect.csv.gz')\n", + "redirect = pd.read_csv(filepath, compression='gzip', sep='|', encoding='utf-8', quoting=3, index_col=1)\n", + "\n", "redirect.head()" ] }, @@ -251,12 +193,9 @@ "metadata": {}, "outputs": [], "source": [ - "#assert len(redirect) == len(redirect['page_id'].unique()) -", - "print('{:.2e} unique pages, {:.2e} pages including redirections'.format( -", - " len(redirect['fix_page_id'].unique()), -", + "#assert len(redirect) == len(redirect['page_id'].unique())\n", + "print('{:.2e} unique pages, {:.2e} pages including redirections'.format(\n", + " len(redirect['fix_page_id'].unique()),\n", " len(redirect)))" ] }, @@ -275,12 +214,9 @@ "metadata": {}, "outputs": [], "source": [ - "def id2title(page_id): -", - " return redirect.at[page_id, 'fix_page_title'] -", - " #return redirect[redirect['page_id'] == page_id]['fix_page_title'].values[0] -", + "def id2title(page_id):\n", + " return redirect.at[page_id, 'fix_page_title']\n", + " #return redirect[redirect['page_id'] == page_id]['fix_page_title'].values[0]\n", "id2title(330)" ] }, @@ -290,34 +226,20 @@ "metadata": {}, "outputs": [], "source": [ - "def find_in_title(string): -", - " -", - " def find(page_title, string): -", - " try: -", - " return string.lower() in page_title.lower() -", - " except: -", - " return False -", - " -", - " #b = redirect['fix_page_title'].apply(find, string=string) -", - " b = redirect['page_title'].apply(find, string=string) -", - " #return redirect[b] -", - " return redirect[b & (redirect['is_redirect'] == 0)] -", - " -", - "#find_in_title('ebola') -", + "def find_in_title(string):\n", + "\n", + " def find(page_title, string):\n", + " try:\n", + " return string.lower() in page_title.lower()\n", + " except:\n", + " return False\n", + "\n", + " #b = redirect['fix_page_title'].apply(find, string=string)\n", + " b = redirect['page_title'].apply(find, string=string)\n", + " #return redirect[b]\n", + " return redirect[b & (redirect['is_redirect'] == 0)]\n", + "\n", + "#find_in_title('ebola')\n", "find_in_title('zirka')" ] }, @@ -325,10 +247,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3 Page views / counts -", - " -", + "## 3 Page views / counts\n", + "\n", "Graph has 4M nodes but lot of pages are not seen much. `signal_500.h5` lists only 118k pages." ] }, @@ -338,16 +258,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Kirell's signal which includes views when greater than 500. 
-", - "filepath = os.path.join(WIKI_CLEAN, 'signal_500.h5') -", - "signal = pd.read_hdf(filepath, 'data') -", - "signal['count_views'].plot(kind='hist', logy=True) -", - "print(len(signal), len(signal['page_id'].unique()), len(signal['layer'].unique()), signal['count_views'].max()) -", + "# Kirell's signal which includes views when greater than 500.\n", + "filepath = os.path.join(WIKI_CLEAN, 'signal_500.h5')\n", + "signal = pd.read_hdf(filepath, 'data')\n", + "signal['count_views'].plot(kind='hist', logy=True)\n", + "print(len(signal), len(signal['page_id'].unique()), len(signal['layer'].unique()), signal['count_views'].max())\n", "signal.head()" ] }, @@ -357,56 +272,31 @@ "metadata": {}, "outputs": [], "source": [ - "filepath = '../data/wikipedia/activations_all.h5' -", - " -", - "if os.path.exists(filepath): -", - " activations = pd.read_hdf(filepath, 'activations') -", - " -", - "else: -", - " START = datetime.datetime(2014, 9, 23, 2) -", - " #END = datetime.datetime(2014, 9, 24, 2) -", - " END = datetime.datetime(2015, 6, 5, 20) -", - " -", - " activations = pd.DataFrame(columns=pd.date_range(START, END, freq='H')) -", - " -", - " folder = os.path.join(WIKI_CLEAN, 'pagecounts_clean') -", - " for date in tqdm_notebook(activations.columns): -", - " filename = 'pagecounts-{:4d}{:02d}{:02d}-{:02d}0000.csv.gz'.format(date.year, date.month, date.day, date.hour) -", - " filename = os.path.join(folder, filename) -", - " pagecounts = pd.read_csv(filename, compression='gzip', index_col=0, squeeze=True) -", - " #print(len(pagecounts), filename) -", - " print(date) -", - " activations[date] = pagecounts -", - " activations[date] = activations[date].fillna(0).astype(np.int32) -", - " -", - " activations.to_hdf(filepath, 'activations') -", - " -", - "print(activations.shape) -", + "filepath = '../data/wikipedia/activations_all.h5'\n", + "\n", + "if os.path.exists(filepath):\n", + " activations = pd.read_hdf(filepath, 'activations')\n", + "\n", + "else:\n", + " START = datetime.datetime(2014, 9, 23, 2)\n", + " #END = datetime.datetime(2014, 9, 24, 2)\n", + " END = datetime.datetime(2015, 6, 5, 20)\n", + "\n", + " activations = pd.DataFrame(columns=pd.date_range(START, END, freq='H'))\n", + "\n", + " folder = os.path.join(WIKI_CLEAN, 'pagecounts_clean')\n", + " for date in tqdm_notebook(activations.columns):\n", + " filename = 'pagecounts-{:4d}{:02d}{:02d}-{:02d}0000.csv.gz'.format(date.year, date.month, date.day, date.hour)\n", + " filename = os.path.join(folder, filename)\n", + " pagecounts = pd.read_csv(filename, compression='gzip', index_col=0, squeeze=True)\n", + " #print(len(pagecounts), filename)\n", + " print(date)\n", + " activations[date] = pagecounts\n", + " activations[date] = activations[date].fillna(0).astype(np.int32)\n", + "\n", + " activations.to_hdf(filepath, 'activations')\n", + "\n", + "print(activations.shape)\n", "activations.head()" ] }, @@ -414,8 +304,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "* Predictable fluctuations with unpredictable spikes. Those are outliers. -", + "* Predictable fluctuations with unpredictable spikes. Those are outliers.\n", "* Anomalies should be outliers persisting for many hours." 
] }, @@ -425,14 +314,10 @@ "metadata": {}, "outputs": [], "source": [ - "page_id = 40817806 -", - "page_id = 25 -", - "title = '{} ({})'.format(id2title(page_id), page_id) -", - "activations.loc[page_id].plot(title=title) -", + "page_id = 40817806\n", + "page_id = 25\n", + "title = '{} ({})'.format(id2title(page_id), page_id)\n", + "activations.loc[page_id].plot(title=title)\n", "plt.ylabel('#hits per hour');" ] }, @@ -465,4 +350,4 @@ "metadata": {}, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/experiments/makefile b/experiments/makefile index 6bfd9c2..22a1c95 100644 --- a/experiments/makefile +++ b/experiments/makefile @@ -7,7 +7,7 @@ $(NB): clean: @for nb in $(NB); do \ - echo "$$(jq --indent 1 ' \ + printf "%s" "$$(jq --indent 1 ' \ .metadata = {} \ | (.cells[] | select(has("outputs")) | .outputs) = [] \ | (.cells[] | select(has("execution_count")) | .execution_count) = null \ From a270cda7c336d3064f68d4d0c260c29df9a4de98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Wed, 17 May 2017 13:53:31 +0000 Subject: [PATCH 08/23] wikipedia: compute average page views then select pages and load data --- experiments/4_wikipedia_traffic.ipynb | 157 ++++++++++++++++++++------ 1 file changed, 125 insertions(+), 32 deletions(-) diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb index 83e18df..661240e 100644 --- a/experiments/4_wikipedia_traffic.ipynb +++ b/experiments/4_wikipedia_traffic.ipynb @@ -69,7 +69,8 @@ "outputs": [], "source": [ "sns.set_context(\"notebook\", font_scale=1.5)\n", - "plt.rcParams['figure.figsize'] = (17, 5)" + "plt.rcParams['figure.figsize'] = (17, 5)\n", + "plt.rcParams['agg.path.chunksize'] = 10000 # OverflowError when plotting large series." 
] }, { @@ -239,8 +240,7 @@ " #return redirect[b]\n", " return redirect[b & (redirect['is_redirect'] == 0)]\n", "\n", - "#find_in_title('ebola')\n", - "find_in_title('zirka')" + "find_in_title('ebola')" ] }, { @@ -272,32 +272,97 @@ "metadata": {}, "outputs": [], "source": [ - "filepath = '../data/wikipedia/activations_all.h5'\n", + "def get_pagecounts(date):\n", + " filename = 'pagecounts-{:4d}{:02d}{:02d}-{:02d}0000.csv.gz'.format(date.year, date.month, date.day, date.hour)\n", + " filepath = os.path.join('..', 'data', 'wikipedia', 'pagecounts_clean', filename)\n", + " return pd.read_csv(filepath, compression='gzip', index_col=0, squeeze=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "START = datetime.datetime(2014, 9, 23, 2)\n", + "END = datetime.datetime(2014, 9, 24, 3)\n", + "END = datetime.datetime(2015, 6, 5, 20)\n", + "dates = pd.date_range(START, END, freq='H')\n", + "\n", + "activations_tot = pd.Series(\n", + " data=0,\n", + " index=g.vp['page_id'].get_array(),\n", + " dtype=np.int64\n", + ")\n", "\n", - "if os.path.exists(filepath):\n", - " activations = pd.read_hdf(filepath, 'activations')\n", + "for date in tqdm_notebook(dates):\n", + " pagecounts = get_pagecounts(date)\n", + " activations_tot += pagecounts.reindex(activations_tot.index).fillna(0).astype(np.int32)\n", "\n", - "else:\n", - " START = datetime.datetime(2014, 9, 23, 2)\n", - " #END = datetime.datetime(2014, 9, 24, 2)\n", - " END = datetime.datetime(2015, 6, 5, 20)\n", + "print(activations_tot.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The largest is the main page.\n", + "plt.semilogy(np.sort(activations_tot.values)[::-1])\n", + "\n", + "main_page = activations_tot.argmax()\n", + "print('{} ({}): {:.2e} views in total'.format(id2title(main_page), main_page, activations_tot[main_page]))\n", + "\n", + "print('{:.2e} views in total'.format(activations_tot.sum()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Power law.\n", + "activations_tot.drop(main_page).plot(kind='hist', logy=True, bins=100);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MIN_AVG_VIEWS = 100\n", "\n", - " activations = pd.DataFrame(columns=pd.date_range(START, END, freq='H'))\n", + "keep = activations_tot.index[activations_tot >= MIN_AVG_VIEWS * len(dates)]\n", + "print('{} pages have more than {} views in total ({:.0f} per hour on average)'.format(\n", + " len(keep), MIN_AVG_VIEWS * len(dates), MIN_AVG_VIEWS))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "activations = pd.DataFrame(\n", + " data=0,\n", + " index=keep,\n", + " columns=dates,\n", + " dtype=np.int32\n", + ")\n", "\n", - " folder = os.path.join(WIKI_CLEAN, 'pagecounts_clean')\n", - " for date in tqdm_notebook(activations.columns):\n", - " filename = 'pagecounts-{:4d}{:02d}{:02d}-{:02d}0000.csv.gz'.format(date.year, date.month, date.day, date.hour)\n", - " filename = os.path.join(folder, filename)\n", - " pagecounts = pd.read_csv(filename, compression='gzip', index_col=0, squeeze=True)\n", - " #print(len(pagecounts), filename)\n", - " print(date)\n", - " activations[date] = pagecounts\n", - " activations[date] = activations[date].fillna(0).astype(np.int32)\n", + "for date in tqdm_notebook(dates):\n", + " pagecounts = 
get_pagecounts(date)\n", + " activations[date] = pagecounts.reindex(activations.index).fillna(0).astype(np.int32)\n", "\n", - " activations.to_hdf(filepath, 'activations')\n", + "filepath = os.path.join('..', 'data', 'wikipedia', 'activations_{}.h5'.format(MIN_AVG_VIEWS))\n", + "activations.to_hdf(filepath, 'activations')\n", "\n", - "print(activations.shape)\n", - "activations.head()" + "print('activations: {} x {} = {}'.format(*activations.shape, activations.size))\n", + "ipd.display(activations.head())\n", + "ipd.display(activations.info())" ] }, { @@ -314,11 +379,15 @@ "metadata": {}, "outputs": [], "source": [ - "page_id = 40817806\n", - "page_id = 25\n", - "title = '{} ({})'.format(id2title(page_id), page_id)\n", - "activations.loc[page_id].plot(title=title)\n", - "plt.ylabel('#hits per hour');" + "activations = pd.read_hdf(filepath, 'activations')\n", + "\n", + "DROP = [\n", + " 15580374, # Main page draws ~10% traffic\n", + " 42727860, # Undefined has the largest peaks of traffic while being inactive after 2014-10\n", + "# 8063851, # Feynman point has a very large traffic peak which is probably an error.\n", + "# 2697304, # Gold_as_an_investment has many traffic peaks.\n", + "]\n", + "activations.drop(DROP, inplace=True)" ] }, { @@ -327,14 +396,18 @@ "metadata": {}, "outputs": [], "source": [ - "#activations.plot(kind='hist', logy=True);" + "print('Max of {0} views at page id {2} and time {1}'.format(\n", + " activations.unstack().max(), *activations.unstack().argmax())) \n", + "plt.plot(activations.values.reshape(-1));" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "## Cleanup" + "plt.hist(activations.values.reshape(-1), bins=100, log=True);" ] }, { @@ -343,7 +416,27 @@ "metadata": {}, "outputs": [], "source": [ - "TO_REMOVE = [15580374, 42727860] # page ids to remove (Main page, Undefined)" + "# Events.\n", + "page_id = 40817806 # Ebola\n", + "page_id = 44635 # Grammy\n", + "page_id = 150340 # Miss Universe\n", + "page_id = 27718 # Super Bowl\n", + "#page_id = 324 # Academy Awards\n", + "#page_id = 44969225 # Charlie Hebdo shooting\n", + "#page_id = 2251390 # Charlie Hebdo\n", + "\n", + "# Remarkable things.\n", + "#page_id = 25\n", + "#page_id = 15580374 # Main Page --> largest traffic (~10%)\n", + "#page_id = 42727860 # Undefined --> hits only before mid-oct 2014\n", + "#page_id = 670 # Alphabet --> strange drop\n", + "#page_id = 8063851 # Shall distinguish outliers (counting errors?) 
from real events\n", + "#page_id = 2697304 # Lots of peaks --> correlated with fluctuations on market?\n", + "\n", + "page_title = id2title(page_id)\n", + "activations.loc[page_id].plot(title='{} ({})'.format(page_title, page_id), logy=True)\n", + "plt.ylabel('#views per hour');\n", + "#plt.savefig('{}_{}.png'.format(page_id, page_title.lower()), dpi=300)" ] } ], From 995c86b06f140d0ccd43b12f01ee5409b98a57a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Wed, 17 May 2017 22:42:24 +0000 Subject: [PATCH 09/23] wikipedia: match graph & activations --- experiments/4_wikipedia_traffic.ipynb | 165 +++++++++++++++++++------- 1 file changed, 119 insertions(+), 46 deletions(-) diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb index 661240e..b818bd2 100644 --- a/experiments/4_wikipedia_traffic.ipynb +++ b/experiments/4_wikipedia_traffic.ipynb @@ -105,9 +105,11 @@ "metadata": {}, "outputs": [], "source": [ - "print('{:.2e} vertices'.format(g.num_vertices()))\n", - "print('{:.2e} edges'.format(g.num_edges()))\n", + "def print_graph(graph):\n", + " print('{} vertices, {} edges'.format(\n", + " graph.num_vertices(), graph.num_edges()))\n", "\n", + "print_graph(g)\n", "g.list_properties()" ] }, @@ -135,38 +137,6 @@ "plt.ylabel('#nodes');" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Too large to be drawn in full.\n", - "#gt.sfdp_layout\n", - "#gt.graph_draw(g)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Remove uninteresting pages.\n", - "#g.set_vertex_filter()\n", - "#g.remove_vertex" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "A = gt.adjacency(g)\n", - "A" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -216,9 +186,11 @@ "outputs": [], "source": [ "def id2title(page_id):\n", - " return redirect.at[page_id, 'fix_page_title']\n", - " #return redirect[redirect['page_id'] == page_id]['fix_page_title'].values[0]\n", - "id2title(330)" + " page_title = redirect.at[page_id, 'fix_page_title']\n", + " #page_title = g.vp['page_title'][page_id]\n", + " print('{}: https://en.wikipedia.org/?curid={}'.format(page_title, page_id))\n", + " return page_title\n", + "id2title(12)" ] }, { @@ -357,10 +329,11 @@ " pagecounts = get_pagecounts(date)\n", " activations[date] = pagecounts.reindex(activations.index).fillna(0).astype(np.int32)\n", "\n", + "activations.sort_index(inplace=True)\n", + "\n", "filepath = os.path.join('..', 'data', 'wikipedia', 'activations_{}.h5'.format(MIN_AVG_VIEWS))\n", "activations.to_hdf(filepath, 'activations')\n", "\n", - "print('activations: {} x {} = {}'.format(*activations.shape, activations.size))\n", "ipd.display(activations.head())\n", "ipd.display(activations.info())" ] @@ -379,15 +352,21 @@ "metadata": {}, "outputs": [], "source": [ - "activations = pd.read_hdf(filepath, 'activations')\n", + "def load_activations(filepath):\n", + " activations = pd.read_hdf(filepath, 'activations')\n", + "\n", + " DROP = [\n", + " 15580374, # Main page draws ~10% traffic.\n", + " 42727860, # Undefined has the largest peaks of traffic while being inactive after 2014-10.\n", + " # 8063851, # Feynman point has a very large traffic peak which is probably an error.\n", + " # 2697304, # Gold_as_an_investment has many traffic peaks.\n", + " ]\n", + " activations.drop(DROP, inplace=True)\n", + " \n", + " print('activations: {} 
page ids x {} hours = {}'.format(*activations.shape, activations.size))\n", + " return activations\n", "\n", - "DROP = [\n", - " 15580374, # Main page draws ~10% traffic\n", - " 42727860, # Undefined has the largest peaks of traffic while being inactive after 2014-10\n", - "# 8063851, # Feynman point has a very large traffic peak which is probably an error.\n", - "# 2697304, # Gold_as_an_investment has many traffic peaks.\n", - "]\n", - "activations.drop(DROP, inplace=True)" + "activations = load_activations(filepath)" ] }, { @@ -438,6 +417,100 @@ "plt.ylabel('#views per hour');\n", "#plt.savefig('{}_{}.png'.format(page_id, page_title.lower()), dpi=300)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4 Matching graph & activations\n", + "\n", + "Further analysis\n", + "* Ratio of in / out neighbors.\n", + "* Proportion of bidirectional hyperlinks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "activations = load_activations(os.path.join('..', 'data', 'wikipedia', 'activations_100.h5'))\n", + "graph = gt.load_graph(os.path.join('..', 'data', 'wikipedia', 'enwiki-20150403-graph.gt'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_graph(graph)\n", + "\n", + "mask = np.in1d(graph.vp['page_id'].get_array(), activations.index)\n", + "g = gt.GraphView(graph, vfilt=mask)\n", + "print_graph(g)\n", + "\n", + "l = gt.label_largest_component(g)\n", + "g = gt.GraphView(g, vfilt=l)\n", + "print_graph(g)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = gt.Graph(g, prune=True)\n", + "\n", + "sort = np.argsort(g.vp['page_id'].get_array())\n", + "sort = np.argsort(sort)\n", + "sort = g.new_vertex_property('int64_t', sort)\n", + "\n", + "g = gt.Graph(g, vorder=sort) # directed=False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "activations = activations.loc[g.vp['page_id'].get_array()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.testing.assert_equal(g.vp['page_id'].get_array(), activations.index)\n", + "\n", + "g.save(os.path.join('..', 'data', 'wikipedia', 'graph.gt'))\n", + "activations.to_hdf(os.path.join('..', 'data', 'wikipedia', 'activations.h5'), 'activations')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#gt.sfdp_layout()\n", + "#gt.graph_draw(g)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "A = gt.adjacency(g)\n", + "ipd.display(A)" + ] } ], "metadata": {}, From 9d204607c2e36ebecc9992cbe39a60e0fdf3b9fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Wed, 17 May 2017 23:28:36 +0000 Subject: [PATCH 10/23] layers.py: move layers in a class hierarchy --- lib/layers.py | 243 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 lib/layers.py diff --git a/lib/layers.py b/lib/layers.py new file mode 100644 index 0000000..c47bae5 --- /dev/null +++ b/lib/layers.py @@ -0,0 +1,243 @@ +from . 
import graph + +import numpy as np +import scipy.sparse +import tensorflow as tf + + +class Layer: + pass + + +class Fourier(Layer): + """Graph convolutional layers that filter in Fourier.""" + + def __init__(self, Fout, K): + self.Fout = Fout + self.K = K + + def __call__(self, x, L): + assert K == L.shape[0] # artificial but useful to compute number of parameters + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + # Fourier basis + _, U = graph.fourier(L) + U = tf.constant(U.T, dtype=tf.float32) + # Weights + W = self._weight_variable([M, self.Fout, Fin], regularization=False) + return self._filter_in_fourier(x, L, self.Fout, self.K, U, W) + + def _filter_in_fourier(self, x, L, Fout, K, U, W): + # TODO: N x F x M would avoid the permutations + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + x = tf.transpose(x, perm=[1, 2, 0]) # M x Fin x N + # Transform to Fourier domain + x = tf.reshape(x, [M, Fin*N]) # M x Fin*N + x = tf.matmul(U, x) # M x Fin*N + x = tf.reshape(x, [M, Fin, N]) # M x Fin x N + # Filter + x = tf.matmul(W, x) # for each feature + x = tf.transpose(x) # N x Fout x M + x = tf.reshape(x, [N*Fout, M]) # N*Fout x M + # Transform back to graph domain + x = tf.matmul(x, U) # N*Fout x M + x = tf.reshape(x, [N, Fout, M]) # N x Fout x M + return tf.transpose(x, perm=[0, 2, 1]) # N x M x Fout + + +class Spline(Fourier): + + def __call__(self, x, L): + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + # Fourier basis + lamb, U = graph.fourier(L) + U = tf.constant(U.T, dtype=tf.float32) # M x M + # Spline basis + B = self._bspline_basis(self.K, lamb, degree=3) # M x K + # B = _bspline_basis(K, len(lamb), degree=3) # M x K + B = tf.constant(B, dtype=tf.float32) + # Weights + W = self._weight_variable([self.K, self.Fout*Fin], regularization=False) + W = tf.matmul(B, W) # M x Fout*Fin + W = tf.reshape(W, [M, self.Fout, Fin]) + return self._filter_in_fourier(x, L, self.Fout, self.K, U, W) + + def _bspline_basis(self, K, x, degree=3): + """ + Return the B-spline basis. + + K: number of control points. + x: evaluation points + or number of evenly distributed evaluation points. + degree: degree of the spline. Cubic spline by default. + """ + if np.isscalar(x): + x = np.linspace(0, 1, x) + + # Evenly distributed knot vectors. + kv1 = x.min() * np.ones(degree) + kv2 = np.linspace(x.min(), x.max(), K-degree+1) + kv3 = x.max() * np.ones(degree) + kv = np.concatenate((kv1, kv2, kv3)) + + # Cox - DeBoor recursive function to compute one spline over x. + def cox_deboor(k, d): + # Test for end conditions, the rectangular degree zero spline. + if (d == 0): + return ((x - kv[k] >= 0) & (x - kv[k + 1] < 0)).astype(int) + + denom1 = kv[k + d] - kv[k] + term1 = 0 + if denom1 > 0: + term1 = ((x - kv[k]) / denom1) * cox_deboor(k, d - 1) + + denom2 = kv[k + d + 1] - kv[k + 1] + term2 = 0 + if denom2 > 0: + term2 = ((-(x - kv[k + d + 1]) / denom2) * cox_deboor(k + 1, d - 1)) + + return term1 + term2 + + # Compute basis for each point + basis = np.column_stack([cox_deboor(k, degree) for k in range(K)]) + basis[-1, -1] = 1 + return basis + + +class Chebyshev(Layer): + + def __init__(self, Fout, K): + self.Fout = Fout + self.K = K + + +class Chebyshev2(Chebyshev): + + def __call__(self, x, L): + """ + Filtering with Chebyshev interpolation + Implementation: numpy. 
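+        The Chebyshev basis is computed by graph.chebyshev (NumPy) wrapped in
+        tf.py_func, i.e. outside the TensorFlow graph.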
+ + Data: x of size N x M x F + N: number of signals + M: number of vertices + F: number of features per signal per vertex + """ + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + # Rescale Laplacian. Copy to not modify the shared L. + L = scipy.sparse.csr_matrix(L) + L = graph.rescale_L(L, lmax=2) + # Transform to Chebyshev basis + x = tf.transpose(x, perm=[1, 2, 0]) # M x Fin x N + x = tf.reshape(x, [M, Fin*N]) # M x Fin*N + def chebyshev(x): + return graph.chebyshev(L, x, self.K) + x = tf.py_func(chebyshev, [x], [tf.float32])[0] # K x M x Fin*N + x = tf.reshape(x, [self.K, M, Fin, N]) # K x M x Fin x N + x = tf.transpose(x, perm=[3, 1, 2, 0]) # N x M x Fin x K + x = tf.reshape(x, [N*M, Fin*self.K]) # N*M x Fin*K + # Filter: Fin*Fout filters of order K, i.e. one filterbank per feature. + W = self._weight_variable([Fin*K, self.Fout], regularization=False) + x = tf.matmul(x, W) # N*M x Fout + return tf.reshape(x, [N, M, self.Fout]) # N x M x Fout + + +def Chebyshev5(Chebyshev): + + def __call__(self, x, L): + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + # Rescale Laplacian and store as a TF sparse tensor. Copy to not modify the shared L. + L = scipy.sparse.csr_matrix(L) + L = graph.rescale_L(L, lmax=2) + L = L.tocoo() + indices = np.column_stack((L.row, L.col)) + L = tf.SparseTensor(indices, L.data, L.shape) + L = tf.sparse_reorder(L) + # Transform to Chebyshev basis + x0 = tf.transpose(x, perm=[1, 2, 0]) # M x Fin x N + x0 = tf.reshape(x0, [M, Fin*N]) # M x Fin*N + x = tf.expand_dims(x0, 0) # 1 x M x Fin*N + def concat(x, x_): + x_ = tf.expand_dims(x_, 0) # 1 x M x Fin*N + return tf.concat([x, x_], axis=0) # K x M x Fin*N + if self.K > 1: + x1 = tf.sparse_tensor_dense_matmul(L, x0) + x = concat(x, x1) + for k in range(2, self.K): + x2 = 2 * tf.sparse_tensor_dense_matmul(L, x1) - x0 # M x Fin*N + x = concat(x, x2) + x0, x1 = x1, x2 + x = tf.reshape(x, [self.K, M, Fin, N]) # K x M x Fin x N + x = tf.transpose(x, perm=[3, 1, 2, 0]) # N x M x Fin x K + x = tf.reshape(x, [N*M, Fin*self.K]) # N*M x Fin*K + # Filter: Fin*Fout filters of order K, i.e. one filterbank per feature pair. + W = self._weight_variable([Fin*self.K, self.Fout], regularization=False) + x = tf.matmul(x, W) # N*M x Fout + return tf.reshape(x, [N, M, self.Fout]) # N x M x Fout + + +class Bias(Layer): + pass + + +class Bias1Relu(Bias): + """Bias and ReLU. One bias per filter.""" + def __call__(self, x): + N, M, F = x.get_shape() + b = self._bias_variable([1, 1, int(F)], regularization=False) + return tf.nn.relu(x + b) + + +class Bias2Relu(Bias): + """Bias and ReLU. One bias per vertex per filter.""" + def __call__(self, x): + N, M, F = x.get_shape() + b = self._bias_variable([1, int(M), int(F)], regularization=False) + return tf.nn.relu(x + b) + + +class Pooling(Layer): + def __init__(self, p): + self.p = p + + +class MaxPooling(Pooling): + def __call__(self, x): + """Max pooling of size p. Should be a power of 2.""" + if self.p > 1: + x = tf.expand_dims(x, 3) # N x M x F x 1 + x = tf.nn.max_pool(x, ksize=[1,self.p,1,1], strides=[1,self.p,1,1], padding='SAME') + #tf.maximum + return tf.squeeze(x, [3]) # N x M/p x F + else: + return x + + +class AvgPooling(Pooling): + def __call__(self, x): + """Average pooling of size p. 
Should be a power of 2.""" + if self.p > 1: + x = tf.expand_dims(x, 3) # N x M x F x 1 + x = tf.nn.avg_pool(x, ksize=[1,self.p,1,1], strides=[1,self.p,1,1], padding='SAME') + return tf.squeeze(x, [3]) # N x M/p x F + else: + return x + + +class Dense(Layer): + + def __init__(self, Mout, relu=True): + self.Mout = Mout + self.relu = relu + + def __call__(self, x): + """Fully connected layer with Mout features.""" + N, Min = x.get_shape() + W = self._weight_variable([int(Min), self.Mout], regularization=True) + b = self._bias_variable([self.Mout], regularization=True) + x = tf.matmul(x, W) + b + return tf.nn.relu(x) if self.relu else x From 4955b8f32352be643b2bc4d418de71ff3468e146 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Thu, 18 May 2017 08:13:28 +0000 Subject: [PATCH 11/23] wikipedia: compute graph diameter and don't keep copies in memory --- experiments/4_wikipedia_traffic.ipynb | 67 ++++++++++++++++----------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb index b818bd2..fca6a33 100644 --- a/experiments/4_wikipedia_traffic.ipynb +++ b/experiments/4_wikipedia_traffic.ipynb @@ -86,7 +86,7 @@ "metadata": {}, "outputs": [], "source": [ - "g = gt.load_graph(os.path.join(WIKI_CLEAN, 'enwiki-20150403-graph.gt'))" + "graph = gt.load_graph(os.path.join(WIKI_CLEAN, 'enwiki-20150403-graph.gt'))" ] }, { @@ -95,8 +95,8 @@ "metadata": {}, "outputs": [], "source": [ - "g.is_directed()\n", - "#g.set_directed(False)" + "graph.is_directed()\n", + "#graph.set_directed(False)" ] }, { @@ -109,8 +109,8 @@ " print('{} vertices, {} edges'.format(\n", " graph.num_vertices(), graph.num_edges()))\n", "\n", - "print_graph(g)\n", - "g.list_properties()" + "print_graph(graph)\n", + "graph.list_properties()" ] }, { @@ -120,8 +120,8 @@ "outputs": [], "source": [ "idx = 42\n", - "page_title = g.vertex_properties['page_title'][idx]\n", - "page_id = g.vertex_properties['page_id'][idx]\n", + "page_title = graph.vertex_properties['page_title'][idx]\n", + "page_id = graph.vertex_properties['page_id'][idx]\n", "print('{}: {}'.format(page_id, page_title))" ] }, @@ -131,7 +131,7 @@ "metadata": {}, "outputs": [], "source": [ - "hist = gt.vertex_hist(g, 'total')\n", + "hist = gt.vertex_hist(graph, 'total')\n", "plt.loglog(hist[1][:-1], hist[0])\n", "plt.xlabel('#edges')\n", "plt.ylabel('#nodes');" @@ -187,7 +187,7 @@ "source": [ "def id2title(page_id):\n", " page_title = redirect.at[page_id, 'fix_page_title']\n", - " #page_title = g.vp['page_title'][page_id]\n", + " #page_title = graph.vp['page_title'][id]\n", " print('{}: https://en.wikipedia.org/?curid={}'.format(page_title, page_id))\n", " return page_title\n", "id2title(12)" @@ -263,7 +263,7 @@ "\n", "activations_tot = pd.Series(\n", " data=0,\n", - " index=g.vp['page_id'].get_array(),\n", + " index=graph.vp['page_id'].get_array(),\n", " dtype=np.int64\n", ")\n", "\n", @@ -435,8 +435,19 @@ "metadata": {}, "outputs": [], "source": [ - "activations = load_activations(os.path.join('..', 'data', 'wikipedia', 'activations_100.h5'))\n", - "graph = gt.load_graph(os.path.join('..', 'data', 'wikipedia', 'enwiki-20150403-graph.gt'))" + "#activations = load_activations(os.path.join('..', 'data', 'wikipedia', 'activations_100.h5'))\n", + "#graph = gt.load_graph(os.path.join('..', 'data', 'wikipedia', 'enwiki-20150403-graph.gt'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def 
compute_diameter(graph):\n", + " d = gt.pseudo_diameter(graph)[0]\n", + " print('Pseudo-diameter: {}'.format(int(d)))" ] }, { @@ -446,14 +457,16 @@ "outputs": [], "source": [ "print_graph(graph)\n", + "compute_diameter(graph)\n", "\n", "mask = np.in1d(graph.vp['page_id'].get_array(), activations.index)\n", - "g = gt.GraphView(graph, vfilt=mask)\n", - "print_graph(g)\n", + "graph = gt.GraphView(graph, vfilt=mask)\n", + "print_graph(graph)\n", "\n", - "l = gt.label_largest_component(g)\n", - "g = gt.GraphView(g, vfilt=l)\n", - "print_graph(g)" + "l = gt.label_largest_component(graph)\n", + "graph = gt.GraphView(graph, vfilt=l)\n", + "print_graph(graph)\n", + "compute_diameter(graph)" ] }, { @@ -462,13 +475,13 @@ "metadata": {}, "outputs": [], "source": [ - "g = gt.Graph(g, prune=True)\n", + "graph = gt.Graph(graph, prune=True)\n", "\n", - "sort = np.argsort(g.vp['page_id'].get_array())\n", + "sort = np.argsort(graph.vp['page_id'].get_array())\n", "sort = np.argsort(sort)\n", - "sort = g.new_vertex_property('int64_t', sort)\n", + "sort = graph.new_vertex_property('int64_t', sort)\n", "\n", - "g = gt.Graph(g, vorder=sort) # directed=False" + "graph = gt.Graph(graph, vorder=sort) # directed=False" ] }, { @@ -477,7 +490,7 @@ "metadata": {}, "outputs": [], "source": [ - "activations = activations.loc[g.vp['page_id'].get_array()]" + "activations = activations.loc[graph.vp['page_id'].get_array()]" ] }, { @@ -486,9 +499,9 @@ "metadata": {}, "outputs": [], "source": [ - "np.testing.assert_equal(g.vp['page_id'].get_array(), activations.index)\n", + "np.testing.assert_equal(graph.vp['page_id'].get_array(), activations.index)\n", "\n", - "g.save(os.path.join('..', 'data', 'wikipedia', 'graph.gt'))\n", + "graph.save(os.path.join('..', 'data', 'wikipedia', 'graph.gt'))\n", "activations.to_hdf(os.path.join('..', 'data', 'wikipedia', 'activations.h5'), 'activations')" ] }, @@ -499,7 +512,7 @@ "outputs": [], "source": [ "#gt.sfdp_layout()\n", - "#gt.graph_draw(g)" + "#gt.graph_draw(graph)" ] }, { @@ -508,7 +521,7 @@ "metadata": {}, "outputs": [], "source": [ - "A = gt.adjacency(g)\n", + "A = gt.adjacency(graph)\n", "ipd.display(A)" ] } @@ -516,4 +529,4 @@ "metadata": {}, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} From df328f7a2ad0114e14b86fe05a54fb8cfab98503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Thu, 18 May 2017 11:55:10 +0200 Subject: [PATCH 12/23] trials: clean JSON metadata --- trials/1_learning_filters.ipynb | 164 ++++++++------------------------ trials/2_classification.ipynb | 145 +++++++--------------------- trials/3_tensorflow.ipynb | 44 ++------- trials/4_coarsening.ipynb | 60 +++--------- trials/makefile | 9 +- 5 files changed, 99 insertions(+), 323 deletions(-) diff --git a/trials/1_learning_filters.ipynb b/trials/1_learning_filters.ipynb index 7f29d95..33ed47b 100644 --- a/trials/1_learning_filters.ipynb +++ b/trials/1_learning_filters.ipynb @@ -2,9 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "# Trial 1: learning graph filters\n", "\n", @@ -24,9 +22,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import time\n", @@ -39,9 +35,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "## Problem setting\n", "\n", @@ -54,9 +48,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + 
"metadata": {}, "outputs": [], "source": [ "M = 100 # nodes\n", @@ -142,9 +134,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def laplacian(W, normalized=True):\n", @@ -190,9 +180,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def fourier(L):\n", @@ -264,9 +252,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def gen_filter(type='step', t=2):\n", @@ -333,9 +319,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "N = 200 # signals\n", @@ -389,9 +373,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def L(c):\n", @@ -434,9 +416,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -490,9 +470,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sgd(c0, L, dL, learning_rate=.1, batch_size=100, crit=1e-3, maxit=100, window=10):\n", @@ -541,9 +519,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sgd_plot_convergence(c0, L, dL, params, crit, maxit):\n", @@ -619,9 +595,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def plot_filters(coeffs):\n", @@ -661,9 +635,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "K = 5\n", @@ -722,9 +694,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "K = 10\n", @@ -788,9 +758,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def polynomial_order(K):\n", @@ -851,9 +819,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "K = 15\n", @@ -880,9 +846,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def filter_chebyshev(X, c):\n", @@ -924,9 +888,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -953,9 +915,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "c0 = np.random.uniform(0, 1, K)\n", @@ -975,9 +935,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def plot_coefficients(coeffs):\n", @@ -1037,9 +995,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def rescale_L(L):\n", @@ -1088,9 +1044,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, 
+ "metadata": {}, "outputs": [], "source": [ "def eval_clenshaw(x, c):\n", @@ -1136,9 +1090,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test(c):\n", @@ -1187,9 +1139,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def vectorize(Xt, Y):\n", @@ -1237,9 +1187,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def cheby_coeff_direct(X, Y, K, svd=False):\n", @@ -1269,9 +1217,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = []\n", @@ -1298,9 +1244,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "plot_coefficients(['c_crs', 'c_crd', 'c_cro', 'c_cs', 'c_co', 'c_cg'])\n", @@ -1319,9 +1263,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lanczos(L, X, K):\n", @@ -1468,9 +1410,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lanczos_basis_eval_f(L, X, K):\n", @@ -1554,9 +1494,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lanczos_basis_eval(L, X, K, ret_q=False, impl=2):\n", @@ -1645,9 +1583,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "c0 = np.random.uniform(0, 1, K)\n", @@ -1666,9 +1602,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "plot_coefficients(['c_ls', 'c_ld', 'c_lo', 'c_lf'])" @@ -1686,9 +1620,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def c_l(n):\n", @@ -1722,9 +1654,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def polynomial_order(K, step=1):\n", @@ -1799,25 +1729,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, + "metadata": {}, "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/trials/2_classification.ipynb b/trials/2_classification.ipynb index d06d46d..03b6105 100644 --- a/trials/2_classification.ipynb +++ b/trials/2_classification.ipynb @@ -12,9 +12,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import time\n", @@ -50,9 +48,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def mnist(a, b, N):\n", @@ -109,9 +105,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": 
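The `rescale_L` and `eval_clenshaw` cells above evaluate Chebyshev filters through a three-term recurrence. As a reminder of the idea, here is a compact standalone sketch for a single dense signal; it is an illustration under simplified assumptions (dense `L`, known largest eigenvalue `lmax`), not the notebook's exact implementation:

```python
import numpy as np

def cheby_filter(L, x, coeffs, lmax):
    """Evaluate sum_k c_k T_k(L_rescaled) x with the three-term recurrence.

    Simplified restatement for illustration; the notebook versions also handle
    sparse Laplacians, batches of signals, and fitting of the coefficients.
    """
    n = L.shape[0]
    L_res = 2 * L / lmax - np.identity(n)  # map spectrum from [0, lmax] to [-1, 1]
    t_prev, t_curr = x, L_res @ x          # T_0 x and T_1 x
    y = coeffs[0] * t_prev + coeffs[1] * t_curr
    for c in coeffs[2:]:
        # T_k x = 2 L_res T_{k-1} x - T_{k-2} x
        t_prev, t_curr = t_curr, 2 * L_res @ t_curr - t_prev
        y += c * t_curr
    return y

# Toy check on a 3-node path graph whose Laplacian has eigenvalues 0, 1, 3.
L = np.array([[1., -1., 0.], [-1., 2., -1.], [0., -1., 1.]])
x = np.array([1., 0., -1.])
print(cheby_filter(L, x, coeffs=np.array([0.5, 1.0, 0.25]), lmax=3.0))
```

The rescaling maps the Laplacian spectrum into [-1, 1], where the Chebyshev polynomials are well behaved; the notebooks learn the coefficients instead of fixing them by hand.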
false - }, + "metadata": {}, "outputs": [], "source": [ "def test_sklearn(tauR):\n", @@ -145,9 +139,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test_optim(clf, X, y, ax=None):\n", @@ -177,9 +169,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class rls:\n", @@ -222,9 +212,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -246,9 +234,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lanczos(L, X, K):\n", @@ -301,9 +287,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test():\n", @@ -350,9 +334,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class gflc_noweights:\n", @@ -417,10 +399,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false, - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "class gflc_weights():\n", @@ -516,9 +495,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class gflc_split():\n", @@ -637,9 +614,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "lamb, U = graph.fourier(L)\n", @@ -649,9 +624,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def plot_filters(C, spectrum=False):\n", @@ -700,9 +673,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def plot_features(C, x):\n", @@ -741,9 +712,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def scorer(clf, X, y):\n", @@ -758,9 +727,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def perf(clf, nfolds=3):\n", @@ -813,9 +780,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def cross_validation(clf, nfolds, nvalidations):\n", @@ -839,9 +804,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test_classification(clf, params, param, values, nfolds=10, nvalidations=1):\n", @@ -867,9 +830,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "test_classification(rls, {}, 'tauR', [1e8,1e7,1e6,1e5,1e4,1e3,1e-5,1e-8], 10, 10)" @@ -878,9 +839,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':1, 'K':2, 'tauR':1e3, 'niter':5, 'algo':'direct'}\n", @@ -890,9 +849,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], 
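The `perf`, `cross_validation` and `test_classification` helpers above repeat a k-fold split several times and report the mean and standard deviation of the accuracy. A rough scikit-learn equivalent is sketched below; the `RidgeClassifier` merely stands in for the notebook's custom `rls` and `gflc_*` classifiers and is an assumption, not the actual code:

```python
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import RidgeClassifier

def cross_validate_sketch(X, y, nfolds=10, nvalidations=3, seed=0):
    """Repeated stratified k-fold accuracy, in the spirit of
    cross_validation(clf, nfolds, nvalidations)."""
    accuracies = []
    for rep in range(nvalidations):
        skf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed + rep)
        for train_idx, test_idx in skf.split(X, y):
            clf = RidgeClassifier(alpha=1e3)  # stand-in for rls(tauR=1e3)
            clf.fit(X[train_idx], y[train_idx])
            accuracies.append(clf.score(X[test_idx], y[test_idx]))
    return np.mean(accuracies), np.std(accuracies)

# Toy data with the same shape conventions (N samples x M features).
X = np.random.RandomState(0).normal(size=(200, 30))
y = (X[:, 0] > 0).astype(int)
print('{:.2f} +- {:.2f}'.format(*cross_validate_sketch(X, y)))
```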
"source": [ "params = {'F':2, 'K':10, 'tauR':1e4, 'niter':5, 'algo':'direct'}\n", @@ -902,9 +859,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':4, 'tauR':1e4, 'niter':5, 'algo':'direct'}\n", @@ -921,9 +876,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "Xfull = X" @@ -932,9 +885,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sample(X, p, seed=None):\n", @@ -984,9 +935,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "#clf_weights = gflc_weights(F=3, K=4, tauR=1e-3, niter=5, algo='direct')\n", @@ -997,9 +946,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "#test_classification(rls, {}, 'tauR', [1e1,1e0])\n", @@ -1010,9 +957,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "test_classification(rls, {}, 'tauR', [1e8,1e7,1e6,1e5,1e4,1e3,1e-5,1e-8], 10, 10)" @@ -1021,9 +966,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':2, 'tauR':1e3, 'niter':5, 'algo':'direct'}\n", @@ -1033,9 +976,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':10, 'tauR':1e5, 'niter':5, 'algo':'direct'}\n", @@ -1045,9 +986,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':4, 'tauR':1e5, 'niter':5, 'algo':'direct'}\n", @@ -1055,25 +994,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, + "metadata": {}, "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/trials/3_tensorflow.ipynb b/trials/3_tensorflow.ipynb index 361ece6..21de92a 100644 --- a/trials/3_tensorflow.ipynb +++ b/trials/3_tensorflow.ipynb @@ -12,9 +12,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf" @@ -30,9 +28,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from tensorflow.examples.tutorials.mnist import input_data\n", @@ -51,9 +47,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "x = tf.placeholder(tf.float32, [None, 784])\n", @@ -72,9 +66,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "y_ = tf.placeholder(tf.float32, [None, 10])\n", @@ -100,9 +92,7 @@ { "cell_type": "code", "execution_count": 
null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))\n", @@ -111,25 +101,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, + "metadata": {}, "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/trials/4_coarsening.ipynb b/trials/4_coarsening.ipynb index 414b138..46924b5 100644 --- a/trials/4_coarsening.ipynb +++ b/trials/4_coarsening.ipynb @@ -2,9 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "# Trial 4: graph coarsening\n", "\n", @@ -27,9 +25,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import os\n", @@ -41,9 +37,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if False:\n", @@ -75,9 +69,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# INPUT\n", @@ -170,9 +162,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "#http://nbviewer.ipython.org/gist/Midnighter/9992103\n", @@ -202,9 +192,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Coarsen a graph given by rr,cc,vv. 
rr is assumed to be ordered\n", @@ -258,9 +246,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "maxsize = 200\n", @@ -299,9 +285,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import sys\n", @@ -387,9 +371,7 @@ }, { "cell_type": "raw", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "# Matlab results\n", "\n", @@ -458,25 +440,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, + "metadata": {}, "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/trials/makefile b/trials/makefile index 3a29a42..22a1c95 100644 --- a/trials/makefile +++ b/trials/makefile @@ -6,6 +6,13 @@ $(NB): jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout=-1 $@ clean: - jupyter nbconvert --inplace --ClearOutputPreprocessor.enabled=True $(NB) + @for nb in $(NB); do \ + printf "%s" "$$(jq --indent 1 ' \ + .metadata = {} \ + | (.cells[] | select(has("outputs")) | .outputs) = [] \ + | (.cells[] | select(has("execution_count")) | .execution_count) = null \ + | .cells[].metadata = {} \ + ' $$nb)" > $$nb; \ + done .PHONY: run $(NB) clean From b53363a67a36fce58d555e8d272588467663b392 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Thu, 18 May 2017 11:55:46 +0200 Subject: [PATCH 13/23] trials: play with graph-tool --- trials/5_graph_tool.ipynb | 118 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 trials/5_graph_tool.ipynb diff --git a/trials/5_graph_tool.ipynb b/trials/5_graph_tool.ipynb new file mode 100644 index 0000000..fad0765 --- /dev/null +++ b/trials/5_graph_tool.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trial 5: graph-tool\n", + "\n", + "Learn and experiment with [graph-tool](https://graph-tool.skewed.de).\n", + "\n", + "Alternatives for graph analysis:\n", + "* [NetworkX](http://networkx.github.io)\n", + "* [NetworKit](https://networkit.iti.kit.edu)\n", + "* [igraph](http://igraph.org)\n", + "* [GraphLab](https://turi.com)\n", + "* [GraphX](https://spark.apache.org/graphx)\n", + "* [Giraph](https://giraph.apache.org)\n", + "\n", + "Alternatives for graph visualization:\n", + "* [Gephi](https://gephi.org)\n", + "* [Graphviz](http://www.graphviz.org)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "import graph_tool.all as gt\n", + "#import networkx as nx\n", + "#import networkit as nk\n", + "\n", + "gt.openmp_enabled(), gt.openmp_get_num_threads()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Graph filters and plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g, pos = gt.triangulation(np.random.random_sample((500, 2)) * 4, type='delaunay')\n", + "\n", + "tree = 
gt.min_spanning_tree(g2)\n", + "tv = gt.GraphView(g, efilt=tree)\n", + "\n", + "bv, be = gt.betweenness(tv)\n", + "be.a /= be.a.max() / 5\n", + "gt.graph_draw(tv, pos, vertex_fill_color=bv, edge_pen_width=be);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gt.adjacency(g)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#g = nk.readGraph('graph.gt', nk.Format.GraphToolBinary)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Graph models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = gt.collection.data['football']\n", + "state = gt.minimize_blockmodel_dl(g, deg_corr=False)\n", + "state.draw(pos=g.vp.pos)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = gt.collection.data['celegansneural']\n", + "state = gt.minimize_nested_blockmodel_dl(g, deg_corr=True)\n", + "state.draw()\n", + "state.print_summary()" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file From ed98c9e27506135135e2c1651c5539a2cb1d5257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Tue, 20 Jun 2017 14:55:03 +0000 Subject: [PATCH 14/23] wikipedia: more and better visualizations --- experiments/4_wikipedia_traffic.ipynb | 175 +++++++++++++++++++------- 1 file changed, 130 insertions(+), 45 deletions(-) diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb index fca6a33..0f674ec 100644 --- a/experiments/4_wikipedia_traffic.ipynb +++ b/experiments/4_wikipedia_traffic.ipynb @@ -58,8 +58,10 @@ "%load_ext dotenv\n", "%dotenv .env\n", "\n", - "WIKI_RAW = os.environ.get('WIKI_RAW') # Downloaded from dumps.wikimedia.org.\n", - "WIKI_CLEAN = os.environ.get('WIKI_CLEAN') # Processed by Kirell Benzi." 
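The graph-tool trial above ends by extracting the adjacency matrix with `gt.adjacency`, which is how a graph built or loaded with graph-tool gets handed to the SciPy/NumPy spectral code. A minimal sketch on a toy graph (the 4-node cycle is made up for illustration):

```python
import numpy as np
import graph_tool.all as gt

# Hypothetical 4-node cycle, just to show the graph-tool -> SciPy hand-off.
g = gt.Graph(directed=False)
g.add_edge_list([(0, 1), (1, 2), (2, 3), (3, 0)])
print('{} vertices, {} edges'.format(g.num_vertices(), g.num_edges()))

A = gt.adjacency(g)                            # SciPy sparse matrix
degrees = np.asarray(A.sum(axis=1)).squeeze()  # every node of the cycle has degree 2
print(A.shape, degrees)
```

Because `gt.adjacency` returns a SciPy sparse matrix, the degree and Laplacian computations used elsewhere in the repository apply unchanged.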
+ "#WIKI_RAW = os.environ.get('WIKI_RAW') # Downloaded from dumps.wikimedia.org.\n", + "#WIKI_CLEAN = os.environ.get('WIKI_CLEAN') # Processed by Kirell Benzi.\n", + "\n", + "DATA_DIR = os.path.join('..', 'data', 'wikipedia')" ] }, { @@ -86,7 +88,7 @@ "metadata": {}, "outputs": [], "source": [ - "graph = gt.load_graph(os.path.join(WIKI_CLEAN, 'enwiki-20150403-graph.gt'))" + "graph = gt.load_graph(os.path.join(DATA_DIR, 'enwiki-20150403-graph.gt'))" ] }, { @@ -131,10 +133,13 @@ "metadata": {}, "outputs": [], "source": [ - "hist = gt.vertex_hist(graph, 'total')\n", - "plt.loglog(hist[1][:-1], hist[0])\n", - "plt.xlabel('#edges')\n", - "plt.ylabel('#nodes');" + "def plot_degree_distribution(graph):\n", + " hist = gt.vertex_hist(graph, 'total')\n", + " plt.loglog(hist[1][:-1], hist[0])\n", + " plt.xlabel('#edges')\n", + " plt.ylabel('#nodes')\n", + " #plt.savefig('degree_distribution.pdf')\n", + "plot_degree_distribution(graph)" ] }, { @@ -152,7 +157,7 @@ "metadata": {}, "outputs": [], "source": [ - "filepath = os.path.join(WIKI_CLEAN, 'enwiki-20150403-page-redirect.csv.gz')\n", + "filepath = os.path.join(DATA_DIR, 'enwiki-20150403-page-redirect.csv.gz')\n", "redirect = pd.read_csv(filepath, compression='gzip', sep='|', encoding='utf-8', quoting=3, index_col=1)\n", "\n", "redirect.head()" @@ -231,7 +236,7 @@ "outputs": [], "source": [ "# Kirell's signal which includes views when greater than 500.\n", - "filepath = os.path.join(WIKI_CLEAN, 'signal_500.h5')\n", + "filepath = os.path.join(DATA_DIR, 'signal_500.h5')\n", "signal = pd.read_hdf(filepath, 'data')\n", "signal['count_views'].plot(kind='hist', logy=True)\n", "print(len(signal), len(signal['page_id'].unique()), len(signal['layer'].unique()), signal['count_views'].max())\n", @@ -296,7 +301,9 @@ "outputs": [], "source": [ "# Power law.\n", - "activations_tot.drop(main_page).plot(kind='hist', logy=True, bins=100);" + "activations_tot.drop(main_page).plot(kind='hist', logy=True, bins=100);\n", + "plt.figure()\n", + "activations_tot.drop(main_page)[activations_tot < 1e7].plot(kind='hist', logy=True, bins=100);" ] }, { @@ -352,16 +359,18 @@ "metadata": {}, "outputs": [], "source": [ - "def load_activations(filepath):\n", + "DROP = [\n", + " 15580374, # Main page draws ~10% traffic.\n", + " 42727860, # Undefined has the largest peaks of traffic while being inactive after 2014-10.\n", + "# 8063851, # Feynman point has a very large traffic peak which is probably an error.\n", + "# 2697304, # Gold_as_an_investment has many traffic peaks.\n", + "]\n", + "\n", + "def load_activations(filepath, drop=DROP):\n", " activations = pd.read_hdf(filepath, 'activations')\n", "\n", - " DROP = [\n", - " 15580374, # Main page draws ~10% traffic.\n", - " 42727860, # Undefined has the largest peaks of traffic while being inactive after 2014-10.\n", - " # 8063851, # Feynman point has a very large traffic peak which is probably an error.\n", - " # 2697304, # Gold_as_an_investment has many traffic peaks.\n", - " ]\n", - " activations.drop(DROP, inplace=True)\n", + " if drop:\n", + " activations.drop(drop, inplace=True)\n", " \n", " print('activations: {} page ids x {} hours = {}'.format(*activations.shape, activations.size))\n", " return activations\n", @@ -395,27 +404,39 @@ "metadata": {}, "outputs": [], "source": [ + "def plot_activation(page_id):\n", + " page_title = id2title(page_id)\n", + " ax = activations.loc[page_id].plot(label='{} ({})'.format(page_title, page_id), logy=True)\n", + " ax.set_ylabel('#views per hour');\n", + " ax.legend()\n", + " 
#plt.savefig('{}_{}.png'.format(page_id, page_title.lower()), dpi=300)\n", + " #plt.savefig('{}_{}.pdf'.format(page_id, page_title.lower()))\n", + "\n", "# Events.\n", - "page_id = 40817806 # Ebola\n", - "page_id = 44635 # Grammy\n", - "page_id = 150340 # Miss Universe\n", - "page_id = 27718 # Super Bowl\n", - "#page_id = 324 # Academy Awards\n", - "#page_id = 44969225 # Charlie Hebdo shooting\n", - "#page_id = 2251390 # Charlie Hebdo\n", + "plot_activation(2251390) # Charlie Hebdo\n", + "plot_activation(44969225) # Charlie Hebdo shooting\n", + "plt.figure()\n", + "plot_activation(27718) # Super Bowl\n", + "plt.figure()\n", + "#plot_activation(40817806) # Ebola\n", + "plot_activation(44635) # Grammy\n", + "plot_activation(150340) # Miss Universe\n", + "#plot_activation(324) # Academy Awards\n", + "\n", + "# Neighbors of Charlie Hebdo.\n", + "#plot_activation(44969610) # Charb\n", + "#plot_activation(206682) # Caricature\n", + "#plot_activation(15012) # Islamism\n", + "#plot_activation(7826589) # Jihadism\n", + "#plot_activation(50100) # Journalist\n", "\n", "# Remarkable things.\n", - "#page_id = 25\n", - "#page_id = 15580374 # Main Page --> largest traffic (~10%)\n", - "#page_id = 42727860 # Undefined --> hits only before mid-oct 2014\n", - "#page_id = 670 # Alphabet --> strange drop\n", - "#page_id = 8063851 # Shall distinguish outliers (counting errors?) from real events\n", - "#page_id = 2697304 # Lots of peaks --> correlated with fluctuations on market?\n", - "\n", - "page_title = id2title(page_id)\n", - "activations.loc[page_id].plot(title='{} ({})'.format(page_title, page_id), logy=True)\n", - "plt.ylabel('#views per hour');\n", - "#plt.savefig('{}_{}.png'.format(page_id, page_title.lower()), dpi=300)" + "#plot_activation(25)\n", + "#plot_activation(15580374) # Main Page --> largest traffic (~10%)\n", + "#plot_activation(42727860) # Undefined --> hits only before mid-oct 2014\n", + "#plot_activation(670) # Alphabet --> strange drop\n", + "#plot_activation(8063851) # Shall distinguish outliers (counting errors?) from real events\n", + "#plot_activation(2697304) # Lots of peaks --> correlated with fluctuations on market?" 
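The pages plotted above (Charlie Hebdo, the Super Bowl, the Grammys) show up as short spikes over a fairly flat baseline. Below is a hedged sketch of how such event-driven pages could be surfaced from an `activations`-like table; the toy DataFrame only mimics the real one's layout (page ids as rows, hourly timestamps as columns) with synthetic counts:

```python
import numpy as np
import pandas as pd

# Toy stand-in for the real `activations` table.
hours = pd.date_range('2014-09-01', periods=24 * 7, freq='H')
rng = np.random.RandomState(0)
activations = pd.DataFrame(
    rng.poisson(20, size=(3, len(hours))),
    index=[2251390, 27718, 44635], columns=hours)
activations.iloc[0, 100] += 5000  # inject one artificial traffic spike

# Rank pages by how far their busiest hour deviates from their median hour,
# a crude way to surface event-driven pages like the ones plotted above.
spikiness = activations.max(axis=1) / activations.median(axis=1)
print(spikiness.sort_values(ascending=False).head())
```

Ranking by the max-to-median ratio is only one crude heuristic; the notebook drops pathological pages (the `DROP` list) before any such analysis.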
] }, { @@ -435,8 +456,8 @@ "metadata": {}, "outputs": [], "source": [ - "#activations = load_activations(os.path.join('..', 'data', 'wikipedia', 'activations_100.h5'))\n", - "#graph = gt.load_graph(os.path.join('..', 'data', 'wikipedia', 'enwiki-20150403-graph.gt'))" + "activations = load_activations(os.path.join(DAT_DIR, 'activations_100.h5'))\n", + "graph = gt.load_graph(os.path.join(DATA_DIR, 'enwiki-20150403-graph.gt'))" ] }, { @@ -477,11 +498,14 @@ "source": [ "graph = gt.Graph(graph, prune=True)\n", "\n", - "sort = np.argsort(graph.vp['page_id'].get_array())\n", - "sort = np.argsort(sort)\n", - "sort = graph.new_vertex_property('int64_t', sort)\n", + "def sort_vertices(graph, vp):\n", + " sort = np.argsort(vp.get_array())\n", + " sort = np.argsort(sort)\n", + " sort = graph.new_vertex_property('int64_t', sort)\n", + " return gt.Graph(graph, vorder=sort)\n", "\n", - "graph = gt.Graph(graph, vorder=sort) # directed=False" + "graph = sort_vertices(graph, graph.vp['page_id'])\n", + "# directed=False" ] }, { @@ -502,6 +526,7 @@ "np.testing.assert_equal(graph.vp['page_id'].get_array(), activations.index)\n", "\n", "graph.save(os.path.join('..', 'data', 'wikipedia', 'graph.gt'))\n", + "graph.save(os.path.join('..', 'data', 'wikipedia', 'graph.graphml'))\n", "activations.to_hdf(os.path.join('..', 'data', 'wikipedia', 'activations.h5'), 'activations')" ] }, @@ -515,18 +540,78 @@ "#gt.graph_draw(graph)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5 Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "graph = gt.load_graph(os.path.join(DATA_DIR, 'graph.gt'))\n", + "activations = load_activations(os.path.join(DATA_DIR, 'activations.h5'), drop=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_degree_distribution(graph)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_adjacency(graph, ax=None):\n", + " A = gt.adjacency(graph)\n", + " if not ax:\n", + " fig, ax = plt.subplots(figsize=(10, 10))\n", + " ax.spy(A[:10000,:10000], markersize=0.2)\n", + " ax.set_title('{} nodes, {} edges ({:.2%})'.format(\n", + " A.shape[0], A.nnz, A.nnz / np.multiply(*A.shape)))\n", + "\n", + "plot_adjacency(graph)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def order_adjacency_plot(graph, ax=None, **kwargs):\n", + " state = gt.minimize_blockmodel_dl(graph, **kwargs)\n", + " graph = sort_vertices(graph, state.get_blocks())\n", + " plot_adjacency(graph, ax)\n", + "\n", + "fig, axes = plt.subplots(1, 3)\n", + "for ax, n_blocks in zip(axes, [10, 20, 30]):\n", + " order_adjacency_plot(graph, ax=ax, B_max=n_blocks)" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "A = gt.adjacency(graph)\n", - "ipd.display(A)" + "plt.hist(activations.values.reshape(-1), bins=100, log=True);\n", + "plt.figure()\n", + "plt.hist(activations.sum(axis=1).values.reshape(-1), bins=100, log=True);" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file From 68fcb5d4fc8ba1bc696d0d931a09c66eb0d657b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Mon, 26 Jun 2017 15:01:33 +0200 Subject: [PATCH 15/23] structured sequence trial: TF input pipeline --- trials/6_structured_sequence.ipynb | 246 
+++++++++++++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 trials/6_structured_sequence.ipynb diff --git a/trials/6_structured_sequence.ipynb b/trials/6_structured_sequence.ipynb new file mode 100644 index 0000000..3f94bb0 --- /dev/null +++ b/trials/6_structured_sequence.ipynb @@ -0,0 +1,246 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trial 6: structured sequence modeling\n", + "\n", + "* Create simple parametric time series and try to model them.\n", + "* Add structure by constructing a graph between the series and see how it improves.\n", + "* Usage of `tflearn` inspired by [How to do time series prediction using RNNs, TensorFlow and Cloud ML Engine](https://medium.com/google-cloud/how-to-do-time-series-prediction-using-rnns-and-tensorflow-and-cloud-ml-engine-2ad2eeb189e8)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import shutil\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "import tensorflow as tf\n", + "import tensorflow.contrib.learn as tflearn\n", + "\n", + "plt.rcParams['figure.figsize'] = (17, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = os.path.join('..', 'data', 'structured_sequence_trial')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1 Data generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SEQ_LEN = 100\n", + "N_SEQ = 4\n", + "\n", + "def create_time_series(seq_len, random_state):\n", + " freq = random_state.uniform(0.1, 0.6)\n", + " ampl = random_state.uniform(0.5, 1.5)\n", + " offset = random_state.uniform(-1, 1)\n", + " return np.sin(np.arange(seq_len) * freq) * ampl + offset\n", + "\n", + "rs = np.random.RandomState(42)\n", + "data = np.empty((N_SEQ, SEQ_LEN))\n", + "for i in range(N_SEQ):\n", + " data[i] = create_time_series(SEQ_LEN, rs)\n", + "data = pd.DataFrame(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.T.plot();\n", + "plt.savefig('time_series.pdf')\n", + "# hist" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2 Graph construction\n", + "\n", + "k-NN graph between the time series." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3 Data preparation\n", + "\n", + "* Store data in TFRecords files which will be read by the input pipeline.\n", + "* Preprocessing can be done here.\n", + "* Data augmentation should be done in input pipeline (to save disk space).\n", + "* We are doing full batch, i.e. we feed data on the whole graph at once." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "N_INPUTS = 10 # Number of samples used for prediction, i.e. 
unrolling length.\n", + "N_OUTPUTS = 1 # Number of samples in the time series the model tries to predict.\n", + "\n", + "def feature(array):\n", + " array = array.reshape(-1)\n", + " return tf.train.Feature(float_list=tf.train.FloatList(value=list(array)))\n", + "\n", + "def save_dataset(data, filename):\n", + " \"\"\"Save dataset as TFRecords.\"\"\"\n", + " filename = os.path.join(DATA_DIR, filename)\n", + " num_examples = data.shape[1] - N_INPUTS - N_OUTPUTS + 1\n", + " assert num_examples > 0\n", + " tf.logging.info('Writing {} examples to {}'.format(num_examples, filename))\n", + " with tf.python_io.TFRecordWriter(filename) as writer:\n", + " for idx in range(num_examples):\n", + " inputs = data[:, idx:idx+N_INPUTS]\n", + " targets = data[:, idx+N_INPUTS:idx+N_INPUTS+N_OUTPUTS]\n", + " example = tf.train.Example(features=tf.train.Features(feature={\n", + " #'graph': feature(graph), # Adjacency matrix or Laplacian can be stored here.\n", + " 'inputs': feature(inputs),\n", + " 'targets': feature(targets)}))\n", + " writer.write(example.SerializeToString())\n", + "\n", + "TRAINING_LEN = int(0.8 * SEQ_LEN)\n", + "save_dataset(data.iloc[:, :TRAINING_LEN].values, 'train.tfrecords')\n", + "save_dataset(data.iloc[:, TRAINING_LEN:].values, 'validation.tfrecords')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4 Data loading\n", + "\n", + "Two training schemes:\n", + "* Load whole data for training up to a certain point in time. That is what is done for text (the whole vocabulary graph is used).\n", + "* Use some time series (some part of the graph) as training and the others as evaluation.\n", + "\n", + "TF alternative:\n", + "* [tf.contrib.slim.dataset](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class DataLoader:\n", + "\n", + " def __init__(s, filenames, num_epochs=1, batch_size=1, read_threads=1, seed=None):\n", + " #if mode == tflearn.ModeKeys.TRAIN:\n", + " s.filenames = filenames\n", + " s.batch_size = batch_size\n", + " s.num_epochs = num_epochs\n", + " s.read_threads = read_threads\n", + " s.seed = seed\n", + "\n", + " def _read_and_decode(s, filename_queue):\n", + " reader = tf.TFRecordReader()\n", + " _, example = reader.read(filename_queue)\n", + " features={\n", + " 'inputs': tf.FixedLenFeature([N_SEQ * N_INPUTS], tf.float32),\n", + " 'targets': tf.FixedLenFeature([N_SEQ * N_OUTPUTS], tf.float32),\n", + " }\n", + " example = tf.parse_single_example(example, features)\n", + " inputs = tf.reshape(example['inputs'], [N_SEQ, N_INPUTS])\n", + " targets = tf.reshape(example['targets'], [N_SEQ, N_OUTPUTS])\n", + " return inputs, targets\n", + "\n", + " def __call__(s):\n", + " with tf.name_scope('input_queues'):\n", + " #with tf.device(\"/cpu:0\"): # Input queues are on CPU.\n", + " filenames = [os.path.join(DATA_DIR, filename) for filename in s.filenames]\n", + " filename_queue = tf.train.string_input_producer(filenames, s.num_epochs, shuffle=True)\n", + "\n", + " examples = [s._read_and_decode(filename_queue) for _ in range(s.read_threads)]\n", + "\n", + " # Shuffle examples.\n", + " if True:\n", + " min_after_dequeue = 10 #10000\n", + " capacity = min_after_dequeue + (s.read_threads + 2) * s.batch_size\n", + " input_batch, target_batch = tf.train.shuffle_batch_join(\n", + " examples, batch_size=s.batch_size, seed=s.seed, capacity=capacity,\n", + " min_after_dequeue=min_after_dequeue, 
allow_smaller_final_batch=True)\n", + " else:\n", + " assert s.read_threads == 1\n", + " input_batch, target_batch = examples[0]\n", + " return {'inputs': input_batch}, target_batch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make one pass over the dataset to make sure the input pipeline works." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inputs = DataLoader(['train.tfrecords'])()[0]['inputs']\n", + "\n", + "sess = tf.Session()\n", + "#sess.run(tf.global_variables_initializer())\n", + "sess.run(tf.local_variables_initializer())\n", + "\n", + "coord = tf.train.Coordinator()\n", + "threads = tf.train.start_queue_runners(sess, coord)\n", + "\n", + "idx = 0\n", + "training_data = np.empty((N_SEQ, TRAINING_LEN-N_OUTPUTS))\n", + "try:\n", + " while not coord.should_stop():\n", + " training_data[:, idx:idx+N_INPUTS] = sess.run(inputs)\n", + " idx += 1\n", + "\n", + "except tf.errors.OutOfRangeError:\n", + " print('Done: {} steps'.format(idx))\n", + "finally:\n", + " coord.request_stop()\n", + "\n", + "coord.join(threads)\n", + "sess.close()\n", + "\n", + "#np.testing.assert_allclose(training_data, data.iloc[:, :TRAINING_LEN-N_OUTPUTS])" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file From 987cf49d10a4221a36528342a7906bbfe5a85e00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Wed, 28 Jun 2017 15:10:14 +0200 Subject: [PATCH 16/23] structured sequence: model & experiment --- trials/6_structured_sequence.ipynb | 104 ++++++++++++++++++++++++++--- 1 file changed, 96 insertions(+), 8 deletions(-) diff --git a/trials/6_structured_sequence.ipynb b/trials/6_structured_sequence.ipynb index 3f94bb0..004666c 100644 --- a/trials/6_structured_sequence.ipynb +++ b/trials/6_structured_sequence.ipynb @@ -158,10 +158,9 @@ "source": [ "class DataLoader:\n", "\n", - " def __init__(s, filenames, num_epochs=1, batch_size=1, read_threads=1, seed=None):\n", + " def __init__(s, filenames, num_epochs=1, read_threads=1, seed=None):\n", " #if mode == tflearn.ModeKeys.TRAIN:\n", " s.filenames = filenames\n", - " s.batch_size = batch_size\n", " s.num_epochs = num_epochs\n", " s.read_threads = read_threads\n", " s.seed = seed\n", @@ -179,7 +178,7 @@ " return inputs, targets\n", "\n", " def __call__(s):\n", - " with tf.name_scope('input_queues'):\n", + " with tf.name_scope('input_pipeline'):\n", " #with tf.device(\"/cpu:0\"): # Input queues are on CPU.\n", " filenames = [os.path.join(DATA_DIR, filename) for filename in s.filenames]\n", " filename_queue = tf.train.string_input_producer(filenames, s.num_epochs, shuffle=True)\n", @@ -189,14 +188,19 @@ " # Shuffle examples.\n", " if True:\n", " min_after_dequeue = 10 #10000\n", - " capacity = min_after_dequeue + (s.read_threads + 2) * s.batch_size\n", - " input_batch, target_batch = tf.train.shuffle_batch_join(\n", - " examples, batch_size=s.batch_size, seed=s.seed, capacity=capacity,\n", + " capacity = min_after_dequeue + (s.read_threads + 2) # * s.batch_size\n", + " inputs, targets = tf.train.shuffle_batch_join(\n", + " examples, batch_size=1, seed=s.seed, capacity=capacity,\n", " min_after_dequeue=min_after_dequeue, allow_smaller_final_batch=True)\n", + " # We read full batch.\n", + " inputs = inputs[0, ...]\n", + " targets = targets[0, ...]\n", " else:\n", " assert s.read_threads == 1\n", - " input_batch, target_batch = examples[0]\n", - " return {'inputs': input_batch}, target_batch" + " inputs, 
targets = examples[0]\n", + "\n", + " # Can return a fixed graph or a per-sample graph in the features.\n", + " return {'inputs': inputs}, targets" ] }, { @@ -238,6 +242,90 @@ "\n", "#np.testing.assert_allclose(training_data, data.iloc[:, :TRAINING_LEN-N_OUTPUTS])" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5 Sequence modeling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Number of hidden units in each of the LSTM cells.\n", + "# Number of filters in case of GCN.\n", + "LSTM_SIZE = 3\n", + "\n", + "def model(features, targets, mode):\n", + " # Reformat input shape to become a sequence.\n", + " x = tf.split(features['inputs'], N_INPUTS, axis=1)\n", + " \n", + " # Recurrent neural network followed by linear transform.\n", + " lstm_cell = tf.contrib.rnn.BasicLSTMCell(LSTM_SIZE, forget_bias=1.0)\n", + " outputs, _ = tf.contrib.rnn.static_rnn(lstm_cell, x, dtype=tf.float32)\n", + " #outputs, _ = tf.contrib.rnn.dynamic_rnn(lstm_cell, x, dtype=tf.float32)\n", + " with tf.name_scope('output_layer'):\n", + " outputs = outputs[-1]\n", + " weight = tf.Variable(tf.random_normal([LSTM_SIZE, N_OUTPUTS]))\n", + " bias = tf.Variable(tf.random_normal([N_SEQ, N_OUTPUTS]))\n", + " predictions = tf.matmul(outputs, weight) + bias\n", + " \n", + " # Loss function and metric for training and evaluation.\n", + " loss = tf.losses.mean_squared_error(targets, predictions)\n", + " eval_metric_ops = {\n", + " 'rmse': tf.metrics.root_mean_squared_error(targets, predictions)\n", + " }\n", + " \n", + " # Training operations.\n", + " train_op = tf.contrib.layers.optimize_loss(\n", + " loss=loss,\n", + " global_step=tf.train.get_global_step(),\n", + " learning_rate=0.01,\n", + " #learning_rate_decay_fn=lambda lr, gs: tf.train.exponential_decay(lr, gs, 100e3, 0.96, staircase=True),\n", + " optimizer=lambda lr: tf.train.GradientDescentOptimizer(lr),\n", + " #optimizer=lambda lr: tf.train.MomentumOptimizer(lr, 0.9),\n", + " )\n", + " \n", + " return tflearn.ModelFnOps(\n", + " mode=mode,\n", + " predictions={'predictions': predictions},\n", + " loss=loss,\n", + " train_op=train_op,\n", + " eval_metric_ops=eval_metric_ops,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6 Experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "estimator = tflearn.Estimator(model_fn=model, model_dir='structured_sequence')\n", + "#estimator.fit(input_fn=DataLoader(filenames=['train.tfrecords']))\n", + "#estimator.evaluate(input_fn=DataLoader(filenames=['test.tfrecords']))\n", + "\n", + "experiment = tflearn.Experiment(\n", + " estimator,\n", + " eval_steps=None,\n", + " train_input_fn=DataLoader(['train.tfrecords'], num_epochs=10),\n", + " eval_input_fn=DataLoader(['validation.tfrecords']),\n", + ")\n", + "\n", + "shutil.rmtree('structured_sequence', ignore_errors=True) # Start fresh each time.\n", + "experiment.train_and_evaluate()" + ] } ], "metadata": {}, From 84c994b5fc034b73eeac877ae6d684189e5539e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Thu, 18 May 2017 12:14:33 +0200 Subject: [PATCH 17/23] to be finished and merged --- README.md | 59 +++++++++++ rcv1_dev.ipynb | 278 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 337 insertions(+) create mode 100644 rcv1_dev.ipynb diff --git a/README.md b/README.md index 61f4184..1a25c97 100644 --- a/README.md +++ b/README.md @@ -13,10 
+13,15 @@ cite the above paper if you use it.
 
 Additional material:
 
 * [NIPS2016 spotlight video][video], 2016-11-22.
+* [NIPS2016 poster][poster]
 * [Deep Learning on Graphs][slides_ntds], a lecture for EPFL's master course
   [A Network Tour of Data Science][ntds], 2016-12-21.
 * [Deep Learning on Graphs][slides_dlid], an invited talk at the [Deep Learning on Irregular Domains][dlid] workshop of BMVC, 2017-09-17.
+* most general
+* Specific to the algorithm: Presentation at the Swiss Machine Learning Day
+* More previous work: candidacy exam
+* That [blog post] is a gentle introduction to the model.
 
 [video]: https://www.youtube.com/watch?v=cIA_m7vwOVQ
 [slides_ntds]: https://doi.org/10.6084/m9.figshare.4491686
@@ -63,6 +68,15 @@ cd nips2016
 make
 ```
 
+## Experiments
+
+* MNIST (NIPS2016)
+* 20NEWS (NIPS2016)
+* RCV1
+* Wikipedia (NIPS2017)
+
+Moving MNIST and PTB experiments were not conducted by me.
+
 ## Using the model
 
 To use our graph ConvNet on your data, you need:
@@ -76,3 +90,48 @@
 Please get in touch if you are unsure about applying the model to a different setting.
 
 [usage]: http://nbviewer.jupyter.org/github/mdeff/cnn_graph/blob/outputs/usage.ipynb
+
+## Applications
+
+* [Kipf & Welling '16] applied a first-order approximation of that model to
+  a supervised learning task. A [blog post] by the author shows an interesting
+  connection to the Weisfeiler-Lehman algorithm. A [blog post] by Ferenc Huszár provides a critical
+  analysis of the method.
+
+[kipf_paper]:
+[kipf_blog]:
+
+## Repository organization
+
+See https://github.com/drivendata/cookiecutter-data-science/tree/master/%7B%7B%20cookiecutter.repo_name%20%7D%7D
+
+* The models (the introduced model and some reference models) are contained in [models.py](models.py).
+* Various side functions are implemented in [graph.py](graph.py), [coarsening.py](coarsening.py) and [utils.py](utils.py).
+* We did experiments on three datasets: MNIST ([notebook](mnist.ipynb)), 20NEWS ([notebook](20news.ipynb)) and RCV1 ([notebook](rcv1.ipynb)).
+* TensorBoard summaries are saved in the `summaries` folder.
+* Model parameters are saved in the `checkpoints` folder.
+* Data is placed in the `data` folder.
+  * [MNIST](http://yann.lecun.com/exdb/mnist/) is downloaded automatically.
+  * [20NEWS](http://qwone.com/~jason/20Newsgroups/) (`20news-bydate.tar.gz`) is downloaded automatically.
+  * [RCV1](http://trec.nist.gov/data/reuters/reuters.html) should be downloaded manually and placed in TODO.
+  * [pre-trained word2vec embeddings](https://code.google.com/archive/p/word2vec/) (`GoogleNews-vectors-negative300.bin.gz`).
+  * Wikipedia graph and activations are available here. Please cite .. if you use it.
+* The [trials](trials) folder contains various small experiments in the form of IPython notebooks.
+  1. [Learning graph filters][trial1]: first experiments on learning
+     synthesized graph filters through observations of filtered and source
+     graph signals. The Chebyshev and Lanczos methods as well as optimization
+     methods are compared there.
+  2. [Classification][trial2]: learning filters that extract good features for
+     classification.
+  3. [TensorFlow][trial3]: first experience with TensorFlow.
+  4. [Coarsening][trial4]: implementation of the Graclus coarsening algorithm
+     and comparison with a previous matlab implementation.
+* A [makefile](makefile) that runs every notebook as a sanity check. It only runs the code, there is no check on the results.
+
+[trial1]: h
+
+## Contributing
+
+* Please file a GitHub issue if you encounter any problem. Issues are better than contacting the authors as the community can respond and everyone benefits from the answer.
Issues are better than contacting the authors as the community can respond and +* Pull requests are welcome ! +* You can contact me for any help regarding how to apply our model to your problem. diff --git a/rcv1_dev.ipynb b/rcv1_dev.ipynb new file mode 100644 index 0000000..88b8ea3 --- /dev/null +++ b/rcv1_dev.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sklearn.datasets\n", + "import scipy.sparse\n", + "import matplotlib.pyplot as plt\n", + "import tensorflow as tf\n", + "import os\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "flags = tf.app.flags\n", + "FLAGS = flags.FLAGS\n", + "\n", + "flags.DEFINE_string('dir_data', 'data_rcv1', 'Directory to store data.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**From Dropout (Bruna did the same)**\n", + "\n", + "We took the dataset and split it into 63 classes based on the the 63 categories at the second-level of the category tree. We removed 11 categories that did not have any data and one category that had only 4 training examples. We also removed one category that covered a huge chunk (25%) of the examples. This left us with 50 classes and 402,738 documents. We divided the documents into equal-sized training and test sets randomly. Each document was represented\n", + "using the 2000 most frequent non-stopwords in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Get dataset.\n", + "rcv1 = sklearn.datasets.fetch_rcv1('data_rcv1')\n", + "N, C = rcv1.target.shape\n", + "print('N={} documents, C={} classes'.format(N, C))\n", + "\n", + "#def select_classes\n", + "\n", + "# All classes.\n", + "class_names = ['C11', 'C12', 'C13','C14','C15','C151','C1511','C152','C16','C17',\n", + " 'C171','C172','C173','C174','C18','C181','C182','C183','C21','C22',\n", + " 'C23','C24','C31', 'C311','C312','C313','C32','C33','C331','C34',\n", + " 'C41','C411','C42','CCAT','E11', 'E12','E121','E13','E131','E132',\n", + " 'E14','E141','E142','E143','E21', 'E211','E212','E31','E311','E312',\n", + " 'E313','E41','E411','E51','E511','E512','E513','E61','E71','ECAT',\n", + " 'G15','G151','G152','G153','G154','G155','G156','G157','G158','G159',\n", + " 'GCAT','GCRIM','GDEF','GDIP','GDIS','GENT','GENV','GFAS','GHEA',\n", + " 'GJOB','GMIL','GOBIT','GODD','GPOL','GPRO','GREL','GSCI','GSPO',\n", + " 'GTOUR','GVIO','GVOTE','GWEA','GWELF','M11','M12','M13','M131',\n", + " 'M132','M14','M141','M142','M143','MCAT']\n", + "assert len(class_names) == 103 # There is 103 categories according to LYRL2004.\n", + "\n", + "# Second-level classes.\n", + "keep = ['C11','C12','C13','C14','C15','C16','C17','C18','C21','C22','C23','C24',\n", + " 'C31','C32','C33','C34','C41','C42','E11','E12','E13','E14','E21','E31',\n", + " 'E41','E51','E61','E71','G15','GCRIM','GDEF','GDIP','GDIS','GENT','GENV',\n", + " 'GFAS','GHEA','GJOB','GMIL','GOBIT','GODD','GPOL','GPRO','GREL','GSCI',\n", + " 'GSPO','GTOUR','GVIO','GVOTE','GWEA','GWELF','M11','M12','M13','M14']\n", + "assert len(keep) == 55 # There is 55 second-level categories according to LYRL2004.\n", + "keep.remove('C15') # 151785 documents\n", + "keep.remove('GMIL') # 5 documents only\n", + "\n", + "# Construct a lookup table for 
labels.\n", + "labels_row = []\n", + "labels_col = []\n", + "class_lookup = {}\n", + "for i,name in enumerate(class_names):\n", + " class_lookup[name] = i\n", + "\n", + "# Index of classes to keep.\n", + "idx_keep = np.empty(len(keep))\n", + "for i,cat in enumerate(keep):\n", + " idx_keep[i] = class_lookup[cat]\n", + "target = rcv1.target[:,idx_keep]\n", + "\n", + "# Number of documents per class.\n", + "def show_doc_per_class(names, target, print_=False):\n", + " docs_per_class = np.array(target.astype(np.uint64).sum(axis=0)).squeeze()\n", + " print('categories ({} assignments in total)'.format(docs_per_class.sum()))\n", + " if print_:\n", + " for i,cat in enumerate(names):\n", + " print(' {:5s}: {:6d} documents'.format(cat, docs_per_class[i]))\n", + " plt.figure(figsize=(17,5))\n", + " plt.plot(sorted(docs_per_class[::-1]),'.')\n", + "show_doc_per_class(rcv1.target_names, rcv1.target)\n", + "show_doc_per_class(keep, target, True)\n", + "\n", + "#def select_documents\n", + "\n", + "# Number of classes per document.\n", + "def show_classes_per_doc(target):\n", + " classes_per_doc = np.array(target.sum(axis=1)).squeeze()\n", + " plt.figure(figsize=(17,5))\n", + " plt.plot(sorted(classes_per_doc[::-1]),'.')\n", + " return classes_per_doc\n", + "classes_per_doc = show_classes_per_doc(rcv1.target)\n", + "classes_per_doc = show_classes_per_doc(target)\n", + "\n", + "target = target[classes_per_doc==1]\n", + "data = rcv1.data[classes_per_doc==1, :]\n", + "\n", + "# Convert labels from indicator form to single value.\n", + "N, C = target.shape\n", + "assert C == len(keep)\n", + "target = target.tocoo()\n", + "target = target.col\n", + "assert target.min() == 0\n", + "assert target.max() == C - 1\n", + "\n", + "# Bruna and Dropout used 2 * 201369 = 402738 documents. 
Probably the difference btw v1 and v2.\n", + "print('N = {} documents and C = {} classes left'.format(N, C))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dates = []\n", + "n = 0\n", + "for path, subdirs, files in os.walk('data_rcv1/rcv1/'):\n", + " for file in files:\n", + " if 'newsML.xml' in file:\n", + " root = ET.parse(os.path.join(path, file)).getroot()\n", + " date = root.attrib['date']\n", + " dates.append(date)\n", + " n+=1\n", + "print(n)\n", + "print(len(dates))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import xml.etree.ElementTree as ET\n", + "\n", + "root = ET.parse('data_rcv1/rcv1/19960820/2286newsML.xml').getroot()\n", + "date = root.attrib['date']\n", + "\n", + "# Fetch textual content.\n", + "text = root.find('title').text\n", + "for p in root.find('text').findall('p'):\n", + " text = ' '.join((text, p.text))\n", + "print(text)\n", + "\n", + "# Find the labels of a document.\n", + "classes = []\n", + "doc = 0\n", + "for codes in root.find('metadata').findall('codes'):\n", + " if codes.attrib['class'] == 'bip:topics:1.0':\n", + " for code in codes.findall('code'):\n", + " labels_row.append(doc)\n", + " labels_col.append(class_lookup[code.attrib['code']])\n", + " classes.append(code.attrib['code'])\n", + "\n", + "assert len(labels_row) == len(labels_col)\n", + "labels_val = np.ones(len(labels_row), dtype=np.bool)\n", + "labels = scipy.sparse.csr_matrix((labels_val, (labels_row, labels_col)))\n", + "\n", + "print(labels)\n", + "labels.sum()" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "From LYRL2004 Appendix 3\n", + "http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a03-expanded-topics-hierarchy/rcv1.topics.hier.expanded\n", + "\n", + "parent: C1 child: C11 child-description: STRATEGY/PLANS\n", + "parent: C1 child: C12 child-description: LEGAL/JUDICIAL\n", + "parent: C1 child: C13 child-description: REGULATION/POLICY\n", + "parent: C1 child: C14 child-description: SHARE LISTINGS\n", + "parent: C1 child: C15 child-description: PERFORMANCE\n", + "parent: C1 child: C16 child-description: INSOLVENCY/LIQUIDITY\n", + "parent: C1 child: C17 child-description: FUNDING/CAPITAL\n", + "parent: C1 child: C18 child-description: OWNERSHIP CHANGES\n", + "parent: C2 child: C21 child-description: PRODUCTION/SERVICES\n", + "parent: C2 child: C22 child-description: NEW PRODUCTS/SERVICES\n", + "parent: C2 child: C23 child-description: RESEARCH/DEVELOPMENT\n", + "parent: C2 child: C24 child-description: CAPACITY/FACILITIES\n", + "parent: C3 child: C31 child-description: MARKETS/MARKETING\n", + "parent: C3 child: C32 child-description: ADVERTISING/PROMOTION\n", + "parent: C3 child: C33 child-description: CONTRACTS/ORDERS\n", + "parent: C3 child: C34 child-description: MONOPOLIES/COMPETITION\n", + "parent: C4 child: C41 child-description: MANAGEMENT\n", + "parent: C4 child: C42 child-description: LABOUR\n", + "parent: E1 child: E11 child-description: ECONOMIC PERFORMANCE\n", + "parent: E1 child: E12 child-description: MONETARY/ECONOMIC\n", + "parent: E1 child: E13 child-description: INFLATION/PRICES\n", + "parent: E1 child: E14 child-description: CONSUMER FINANCE\n", + "parent: E2 child: E21 child-description: GOVERNMENT FINANCE\n", + "parent: E3 child: E31 child-description: OUTPUT/CAPACITY\n", + "parent: E4 child: E41 child-description: EMPLOYMENT/LABOUR\n", + "parent: E5 child: E51 
child-description: TRADE/RESERVES\n", + "parent: E6 child: E61 child-description: HOUSING STARTS\n", + "parent: E7 child: E71 child-description: LEADING INDICATORS\n", + "parent: G1 child: G15 child-description: EUROPEAN COMMUNITY\n", + "parent: GCAT child: GCRIM child-description: CRIME, LAW ENFORCEMENT\n", + "parent: GCAT child: GDEF child-description: DEFENCE\n", + "parent: GCAT child: GDIP child-description: INTERNATIONAL RELATIONS\n", + "parent: GCAT child: GDIS child-description: DISASTERS AND ACCIDENTS\n", + "parent: GCAT child: GENT child-description: ARTS, CULTURE, ENTERTAINMENT\n", + "parent: GCAT child: GENV child-description: ENVIRONMENT AND NATURAL WORLD\n", + "parent: GCAT child: GFAS child-description: FASHION\n", + "parent: GCAT child: GHEA child-description: HEALTH\n", + "parent: GCAT child: GJOB child-description: LABOUR ISSUES\n", + "parent: GCAT child: GMIL child-description: MILLENNIUM ISSUES\n", + "parent: GCAT child: GOBIT child-description: OBITUARIES\n", + "parent: GCAT child: GODD child-description: HUMAN INTEREST\n", + "parent: GCAT child: GPOL child-description: DOMESTIC POLITICS\n", + "parent: GCAT child: GPRO child-description: BIOGRAPHIES, PERSONALITIES, PEOPLE\n", + "parent: GCAT child: GREL child-description: RELIGION\n", + "parent: GCAT child: GSCI child-description: SCIENCE AND TECHNOLOGY\n", + "parent: GCAT child: GSPO child-description: SPORTS\n", + "parent: GCAT child: GTOUR child-description: TRAVEL AND TOURISM\n", + "parent: GCAT child: GVIO child-description: WAR, CIVIL WAR\n", + "parent: GCAT child: GVOTE child-description: ELECTIONS\n", + "parent: GCAT child: GWEA child-description: WEATHER\n", + "parent: GCAT child: GWELF child-description: WELFARE, SOCIAL SERVICES\n", + "parent: M1 child: M11 child-description: EQUITY MARKETS\n", + "parent: M1 child: M12 child-description: BOND MARKETS\n", + "parent: M1 child: M13 child-description: MONEY MARKETS\n", + "parent: M1 child: M14 child-description: COMMODITY MARKETS" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From f6ea33f38b9473365b110c744c094d50fae98e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Mon, 18 Dec 2017 15:31:35 +0100 Subject: [PATCH 18/23] work in progress --- lib/layers.py | 16 +++ trials/6_structured_sequence.ipynb | 208 +++++++++++++++++++++++------ usage.ipynb | 48 ++----- 3 files changed, 199 insertions(+), 73 deletions(-) diff --git a/lib/layers.py b/lib/layers.py index c47bae5..28237dc 100644 --- a/lib/layers.py +++ b/lib/layers.py @@ -241,3 +241,19 @@ def __call__(self, x): b = self._bias_variable([self.Mout], regularization=True) x = tf.matmul(x, W) + b return tf.nn.relu(x) if self.relu else x + + +class RNN(Layer): + pass + + +class LSTM(RNN): + pass + + +class ConvLSTM(RNN): + pass + + +class GRU(RNN): + pass diff --git a/trials/6_structured_sequence.ipynb b/trials/6_structured_sequence.ipynb index 004666c..66a5f8f 100644 --- a/trials/6_structured_sequence.ipynb +++ b/trials/6_structured_sequence.ipynb @@ -54,8 +54,8 @@ "metadata": {}, "outputs": [], "source": [ - "SEQ_LEN = 100\n", - "N_SEQ = 4\n", + "SEQ_LEN = 1000\n", + "N_SEQ = 40\n", "\n", "def 
create_time_series(seq_len, random_state):\n", " freq = random_state.uniform(0.1, 0.6)\n", @@ -76,7 +76,7 @@ "metadata": {}, "outputs": [], "source": [ - "data.T.plot();\n", + "data.iloc[:5, :100].T.plot();\n", "plt.savefig('time_series.pdf')\n", "# hist" ] @@ -108,7 +108,7 @@ "metadata": {}, "outputs": [], "source": [ - "N_INPUTS = 10 # Number of samples used for prediction, i.e. unrolling length.\n", + "N_INPUTS = 50 # Number of samples used for prediction, i.e. unrolling length.\n", "N_OUTPUTS = 1 # Number of samples in the time series the model tries to predict.\n", "\n", "def feature(array):\n", @@ -179,28 +179,28 @@ "\n", " def __call__(s):\n", " with tf.name_scope('input_pipeline'):\n", - " #with tf.device(\"/cpu:0\"): # Input queues are on CPU.\n", - " filenames = [os.path.join(DATA_DIR, filename) for filename in s.filenames]\n", - " filename_queue = tf.train.string_input_producer(filenames, s.num_epochs, shuffle=True)\n", - "\n", - " examples = [s._read_and_decode(filename_queue) for _ in range(s.read_threads)]\n", - "\n", - " # Shuffle examples.\n", - " if True:\n", - " min_after_dequeue = 10 #10000\n", - " capacity = min_after_dequeue + (s.read_threads + 2) # * s.batch_size\n", - " inputs, targets = tf.train.shuffle_batch_join(\n", - " examples, batch_size=1, seed=s.seed, capacity=capacity,\n", - " min_after_dequeue=min_after_dequeue, allow_smaller_final_batch=True)\n", - " # We read full batch.\n", - " inputs = inputs[0, ...]\n", - " targets = targets[0, ...]\n", - " else:\n", - " assert s.read_threads == 1\n", - " inputs, targets = examples[0]\n", - "\n", - " # Can return a fixed graph or a per-sample graph in the features.\n", - " return {'inputs': inputs}, targets" + " with tf.device(\"/cpu:0\"): # Input queues are on CPU.\n", + " filenames = [os.path.join(DATA_DIR, filename) for filename in s.filenames]\n", + " filename_queue = tf.train.string_input_producer(filenames, s.num_epochs, shuffle=True)\n", + "\n", + " examples = [s._read_and_decode(filename_queue) for _ in range(s.read_threads)]\n", + "\n", + " # Shuffle examples.\n", + " if True:\n", + " min_after_dequeue = 10 #10000\n", + " capacity = min_after_dequeue + (s.read_threads + 2) # * s.batch_size\n", + " inputs, targets = tf.train.shuffle_batch_join(\n", + " examples, batch_size=1, seed=s.seed, capacity=capacity,\n", + " min_after_dequeue=min_after_dequeue, allow_smaller_final_batch=True)\n", + " # We read full batch.\n", + " inputs = inputs[0, ...]\n", + " targets = targets[0, ...]\n", + " else:\n", + " assert s.read_threads == 1\n", + " inputs, targets = examples[0]\n", + "\n", + " # Can return a fixed graph or a per-sample graph in the features.\n", + " return {'inputs': inputs}, targets" ] }, { @@ -247,7 +247,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 5 Sequence modeling" + "## 5 Sequence modeling\n", + "\n", + "We can either:\n", + "* assume the same dynamic on all time series and train a shared model\n", + "* train a model for each time series (which still has access to its neighbors)\n", + "* mix: e.g. 
per times series bias or last layer" ] }, { @@ -260,7 +265,7 @@ "# Number of filters in case of GCN.\n", "LSTM_SIZE = 3\n", "\n", - "def model(features, targets, mode):\n", + "def model(features, targets, mode, params):\n", " # Reformat input shape to become a sequence.\n", " x = tf.split(features['inputs'], N_INPUTS, axis=1)\n", " \n", @@ -268,11 +273,9 @@ " lstm_cell = tf.contrib.rnn.BasicLSTMCell(LSTM_SIZE, forget_bias=1.0)\n", " outputs, _ = tf.contrib.rnn.static_rnn(lstm_cell, x, dtype=tf.float32)\n", " #outputs, _ = tf.contrib.rnn.dynamic_rnn(lstm_cell, x, dtype=tf.float32)\n", - " with tf.name_scope('output_layer'):\n", - " outputs = outputs[-1]\n", - " weight = tf.Variable(tf.random_normal([LSTM_SIZE, N_OUTPUTS]))\n", - " bias = tf.Variable(tf.random_normal([N_SEQ, N_OUTPUTS]))\n", - " predictions = tf.matmul(outputs, weight) + bias\n", + " \n", + " tf.summary.histogram('hidden', outputs[-1])\n", + " predictions = tf.contrib.layers.fully_connected(outputs[-1], N_OUTPUTS, activation_fn=None)\n", " \n", " # Loss function and metric for training and evaluation.\n", " loss = tf.losses.mean_squared_error(targets, predictions)\n", @@ -284,7 +287,7 @@ " train_op = tf.contrib.layers.optimize_loss(\n", " loss=loss,\n", " global_step=tf.train.get_global_step(),\n", - " learning_rate=0.01,\n", + " learning_rate=params['learning_rate'],\n", " #learning_rate_decay_fn=lambda lr, gs: tf.train.exponential_decay(lr, gs, 100e3, 0.96, staircase=True),\n", " optimizer=lambda lr: tf.train.GradientDescentOptimizer(lr),\n", " #optimizer=lambda lr: tf.train.MomentumOptimizer(lr, 0.9),\n", @@ -312,9 +315,70 @@ "metadata": {}, "outputs": [], "source": [ - "estimator = tflearn.Estimator(model_fn=model, model_dir='structured_sequence')\n", + "# Observing variables.\n", + "#tflearn.monitors.ValidationMonitor\n", + "#tf.train.SessionRunHook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Tuning the hyper-parameters.\n", + "#tflearn.learn_runner.run()\n", + "#tflearn.learn_runner.tune()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TF debugger.\n", + "from tensorflow.python import debug as tfdbg\n", + "\n", + "hooks = [tfdbg.LocalCLIDebugHook()]\n", + "hooks = [tfdbg.DumpingDebugHook('tfdbg_dumps')]\n", + "# python -m tensorflow.python.debug.cli.offline_analyzer --dump_dir=\"tfdbg_dumps/run__\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Statistics like compute time or memory.\n", + "# Need to pass run_options and run_metadata to sess.run().\n", + "# Not possible with Experiment and Estimator API.\n", + "#run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)\n", + "#run_metadata = tf.RunMetadata()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#MODEL_DIR = os.path.join('..', 'logdir', 'structured_sequence', 'run1')\n", + "MODEL_DIR = 'structured_sequence'\n", + "config = tflearn.RunConfig(\n", + " save_checkpoints_secs=60,\n", + " # save_summary_steps=100,\n", + " model_dir=MODEL_DIR,\n", + " # To see device placement. 
It unfortunately only shows up in stderr, not Tensorboard (explicit placement only).\n", + " # session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True),\n", + ")\n", + "hparams = {\n", + " 'learning_rate': 0.01\n", + "}\n", + "estimator = tflearn.Estimator(model_fn=model, config=config, params=hparams)\n", "#estimator.fit(input_fn=DataLoader(filenames=['train.tfrecords']))\n", - "#estimator.evaluate(input_fn=DataLoader(filenames=['test.tfrecords']))\n", + "#estimator.evaluate(input_fn=DataLoader(filenames=['validation.tfrecords']))\n", "\n", "experiment = tflearn.Experiment(\n", " estimator,\n", @@ -323,8 +387,76 @@ " eval_input_fn=DataLoader(['validation.tfrecords']),\n", ")\n", "\n", - "shutil.rmtree('structured_sequence', ignore_errors=True) # Start fresh each time.\n", - "experiment.train_and_evaluate()" + "shutil.rmtree(MODEL_DIR, ignore_errors=True) # Start fresh each time.\n", + "experiment.train_and_evaluate()\n", + "#experiment.continuous_train_and_eval() # Takes less ressources.\n", + "\n", + "#estimator.evaluate(input_fn=DataLoader(filenames=['test.tfrecords']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## XXX" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class RNN:\n", + " \n", + " def __init__(self, units):\n", + " pass\n", + " \n", + " def __call__(self, inputs, states, laplacian):\n", + " \"\"\"Fully connected layer with Mout features.\"\"\"\n", + " N, Min = x.get_shape()\n", + " W = self._weight_variable([int(Min), self.Mout], regularization=True)\n", + " b = self._bias_variable([self.Mout], regularization=True)\n", + " x = tf.matmul(x, W) + b\n", + " return tf.nn.relu(x) if self.relu else x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Inherit from RNNCell to use high level TF machinery like `tf.dynamic_rnn()`." 
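The note above about inheriting from RNNCell can be made concrete with a short sketch. The following is an editor's illustration, not the repository's implementation: it shows the minimal interface a graph-recurrent cell has to expose in TensorFlow 1.x (the `state_size` and `output_size` properties plus `__call__`) so that it can be unrolled by `tf.nn.dynamic_rnn()`. The first-order filter x -> matmul(x, L) is a deliberately simple stand-in for the Chebyshev convolution used elsewhere in the library, and `laplacian` is assumed to be a dense, already rescaled N x N numpy array (`L_dense` and `signals` in the usage comment are hypothetical placeholders).

import numpy as np
import tensorflow as tf

class GraphRNNCell(tf.contrib.rnn.RNNCell):
    """Editor's sketch: a vanilla recurrent cell whose input-to-state and
    state-to-state transforms are first-order graph filters. State and
    output carry one value per vertex, i.e. have shape [batch, N]."""

    def __init__(self, laplacian):
        self._N = laplacian.shape[0]
        self._L = tf.constant(laplacian, dtype=tf.float32)  # dense N x N

    @property
    def state_size(self):
        return self._N

    @property
    def output_size(self):
        return self._N

    def __call__(self, inputs, state, scope=None):
        with tf.variable_scope(scope or 'graph_rnn_cell'):
            wx = tf.get_variable('wx', [], initializer=tf.constant_initializer(0.1))
            wh = tf.get_variable('wh', [], initializer=tf.constant_initializer(0.1))
            b = tf.get_variable('bias', [self._N], initializer=tf.zeros_initializer())
            # Diffuse the input and the state over the graph, then mix and squash.
            new_state = tf.tanh(wx * tf.matmul(inputs, self._L) +
                                wh * tf.matmul(state, self._L) + b)
        return new_state, new_state

# Usage: signals has shape [batch, time, N]; dynamic_rnn does the unrolling.
# outputs, final_state = tf.nn.dynamic_rnn(GraphRNNCell(L_dense), signals, dtype=tf.float32)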
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LSTM:\n", + " \"\"\"The network is not unrolled.\"\"\"\n", + " \n", + " def _input_conv(self, x, w, b=None):\n", + " pass\n", + " \n", + " def _reccurent_conv(self, x, w, b=None):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tensorflow.python.ops import control_flow_ops\n", + "control_flow_ops.while_loop(\n", + " cond=lambda time, *_: time < time_steps,\n", + " body=_step,\n", + " loop_vars=(time, output_ta) + states,\n", + " parallel_iterations=32,\n", + " swap_memory=True)" ] } ], diff --git a/usage.ipynb b/usage.ipynb index 2c1ff13..0febd8d 100644 --- a/usage.ipynb +++ b/usage.ipynb @@ -26,9 +26,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from lib import models, graph, coarsening, utils\n", @@ -49,9 +47,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "d = 100 # Dimensionality.\n", @@ -86,9 +82,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "n_train = n // 2\n", @@ -125,9 +119,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "dist, idx = graph.distance_scipy_spatial(X_train.T, k=10, metric='euclidean')\n", @@ -154,9 +146,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "graphs, perm = coarsening.coarsen(A, levels=3, self_connections=False)\n", @@ -176,9 +166,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "L = [graph.laplacian(A, normalized=True) for A in graphs]\n", @@ -199,9 +187,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = dict()\n", @@ -237,9 +223,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "model = models.cgcnn(L, **params)\n", @@ -262,9 +246,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "fig, ax1 = plt.subplots(figsize=(15, 5))\n", @@ -279,9 +261,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "print('Time per step: {:.2f} ms'.format(t_step*1000))" @@ -290,9 +270,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "res = model.evaluate(X_test, y_test)\n", @@ -316,9 +294,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3" + "version": "3.6.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From aef2edcf9e7e8e4d3f193b4b9da65416e8e585d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Fri, 21 Feb 2020 03:19:48 +0100 Subject: [PATCH 19/23] keep original parameters --- nips2016/mnist.ipynb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/nips2016/mnist.ipynb b/nips2016/mnist.ipynb index 73846be..2b9d2f1 100644 --- a/nips2016/mnist.ipynb +++ b/nips2016/mnist.ipynb @@ -164,10 +164,10 @@ "source": [ "common = {}\n", "common['dir_name'] = 'mnist/'\n", - "common['num_epochs'] = 4 #20\n", + "common['num_epochs'] = 20\n", "common['batch_size'] = 100\n", "common['decay_steps'] = mnist.train.num_examples / common['batch_size']\n", - "common['eval_frequency'] = 100 #30 * common['num_epochs']\n", + "common['eval_frequency'] = 30 * common['num_epochs']\n", "common['brelu'] = 'b1relu'\n", "common['pool'] = 'mpool1'\n", "C = max(mnist.train.labels) + 1 # number of classes\n", @@ -183,7 +183,7 @@ }, "outputs": [], "source": [ - "if False:\n", + "if True:\n", " name = 'softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", @@ -228,7 +228,7 @@ }, "outputs": [], "source": [ - "if False:\n", + "if True:\n", " name = 'fgconv_softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", @@ -247,7 +247,7 @@ }, "outputs": [], "source": [ - "if False:\n", + "if True:\n", " name = 'sgconv_softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", @@ -266,7 +266,7 @@ "outputs": [], "source": [ "# With 'chebyshev2' and 'b2relu', it corresponds to cgcnn2_2(L[0], F=10, K=20).\n", - "if False:\n", + "if True:\n", " name = 'cgconv_softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", From 416f8143cf30418711ce57e66dda03bb13ac87f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Fri, 21 Feb 2020 03:36:23 +0100 Subject: [PATCH 20/23] add title and disclaimer to mnist filter viz --- nips2016/mnist.ipynb | 111 ++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 71 deletions(-) diff --git a/nips2016/mnist.ipynb b/nips2016/mnist.ipynb index 2b9d2f1..47372ef 100644 --- a/nips2016/mnist.ipynb +++ b/nips2016/mnist.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -25,9 +23,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "flags = tf.app.flags\n", @@ -54,9 +50,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def grid_graph(m, corners=False):\n", @@ -96,9 +90,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from tensorflow.examples.tutorials.mnist import input_data\n", @@ -129,9 +121,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "#model = fc1()\n", @@ -157,9 +147,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "common = {}\n", @@ -178,9 +166,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -203,9 +189,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Common hyper-parameters for networks with one convolutional layer.\n", @@ -223,9 +207,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], 
"source": [ "if True:\n", @@ -242,9 +224,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -260,9 +240,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# With 'chebyshev2' and 'b2relu', it corresponds to cgcnn2_2(L[0], F=10, K=20).\n", @@ -281,9 +259,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Common hyper-parameters for LeNet5-like networks.\n", @@ -301,9 +277,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Architecture of TF MNIST conv model (LeNet-5-like).\n", @@ -322,9 +296,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -339,9 +311,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -356,9 +326,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "model_perf.show()" @@ -367,9 +335,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_filters(coeffs):\n", @@ -393,9 +359,28 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, + "outputs": [], + "source": [ + "if False:\n", + " grid_params = {}\n", + " data = (train_data, train_labels, val_data, val_labels, test_data, test_labels)\n", + " utils.grid_search(params, grid_params, *data, model=lambda x: models.cgcnn(L,**x))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Filter visualization (experimental)\n", + "\n", + "**Disclaimer**: left as is, not sure if it works. To be checked before usage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "a = np.array([1,2,3])\n", @@ -413,9 +398,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "model = model_f\n", @@ -460,20 +443,6 @@ "plt.imshow(train_data[0,idx][:28**2].reshape(28,28))\n", "assert np.allclose(train_data[0,idx][:28**2].reshape(28,28), mnist.train.images[0].reshape(28,28))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "if False:\n", - " grid_params = {}\n", - " data = (train_data, train_labels, val_data, val_labels, test_data, test_labels)\n", - " utils.grid_search(params, grid_params, *data, model=lambda x: models.cgcnn(L,**x))" - ] } ], "metadata": { From eb245e61f44dcde481000bfde62a251706f90b8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Fri, 21 Feb 2020 04:25:28 +0100 Subject: [PATCH 21/23] make clean --- nips2016/20news.ipynb | 92 +++++++++++-------------------------------- rcv1.ipynb | 84 ++++++++++----------------------------- rcv1_dev.ipynb | 20 +++------- 3 files changed, 49 insertions(+), 147 deletions(-) diff --git a/nips2016/20news.ipynb b/nips2016/20news.ipynb index dfabbd8..67d26c6 100644 --- a/nips2016/20news.ipynb +++ b/nips2016/20news.ipynb @@ -12,9 +12,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -36,9 +34,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "flags = tf.app.flags\n", @@ -65,9 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Fetch dataset. 
Scikit-learn already performs some cleaning.\n", @@ -89,9 +83,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Remove short documents.\n", @@ -118,9 +110,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Word embedding\n", @@ -135,9 +125,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Feature selection.\n", @@ -156,9 +144,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "train.normalize(norm='l1')\n", @@ -168,9 +154,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Test dataset.\n", @@ -187,9 +171,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -225,9 +207,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -245,9 +225,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -267,9 +245,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Training set is shuffled already.\n", @@ -291,9 +267,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -303,9 +277,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "common = {}\n", @@ -325,9 +297,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -350,9 +320,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -375,9 +343,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -400,9 +366,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -426,9 +390,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -452,9 +414,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -477,9 +437,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -502,9 +460,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "model_perf.show()" @@ -513,9 +469,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ 
"if False:\n", diff --git a/rcv1.ipynb b/rcv1.ipynb index 8266209..7bd85d1 100644 --- a/rcv1.ipynb +++ b/rcv1.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -25,9 +23,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "flags = tf.app.flags\n", @@ -58,9 +54,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Fetch dataset from Scikit-learn.\n", @@ -81,9 +75,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Selection of classes.\n", @@ -106,9 +98,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Remove documents with multiple classes.\n", @@ -119,9 +109,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Remove short documents.\n", @@ -136,9 +124,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Feature selection.\n", @@ -157,9 +143,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "#dataset.normalize(norm='l1')\n", @@ -169,9 +153,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Word embedding\n", @@ -186,9 +168,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "perm = np.random.RandomState(seed=42).permutation(dataset.data.shape[0])\n", @@ -218,9 +198,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -238,9 +216,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "assert FLAGS.coarsening_levels is 0\n", @@ -261,9 +237,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Training set is shuffled already.\n", @@ -285,9 +259,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if False:\n", @@ -297,9 +269,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "common = {}\n", @@ -319,9 +289,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -344,9 +312,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -369,9 +335,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -394,9 +358,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - 
}, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -419,9 +381,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -444,9 +404,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "model_perf.show()" diff --git a/rcv1_dev.ipynb b/rcv1_dev.ipynb index 88b8ea3..2794a57 100644 --- a/rcv1_dev.ipynb +++ b/rcv1_dev.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -20,9 +18,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "flags = tf.app.flags\n", @@ -44,9 +40,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Get dataset.\n", @@ -134,9 +128,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "dates = []\n", @@ -155,9 +147,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import xml.etree.ElementTree as ET\n", From f731393c78ad6a423ff249733707abf0b1698ade Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Fri, 21 Feb 2020 12:45:46 +0100 Subject: [PATCH 22/23] move all experiments in single folder --- nips2016/mnist.ipynb => experiments/1_mnist.ipynb | 0 nips2016/20news.ipynb => experiments/2_20news.ipynb | 0 rcv1.ipynb => experiments/3_rcv1.ipynb | 0 rcv1_dev.ipynb => experiments/3_rcv1_dev.ipynb | 0 experiments/makefile | 9 +-------- nips2016/makefile | 11 ----------- 6 files changed, 1 insertion(+), 19 deletions(-) rename nips2016/mnist.ipynb => experiments/1_mnist.ipynb (100%) rename nips2016/20news.ipynb => experiments/2_20news.ipynb (100%) rename rcv1.ipynb => experiments/3_rcv1.ipynb (100%) rename rcv1_dev.ipynb => experiments/3_rcv1_dev.ipynb (100%) delete mode 100644 nips2016/makefile diff --git a/nips2016/mnist.ipynb b/experiments/1_mnist.ipynb similarity index 100% rename from nips2016/mnist.ipynb rename to experiments/1_mnist.ipynb diff --git a/nips2016/20news.ipynb b/experiments/2_20news.ipynb similarity index 100% rename from nips2016/20news.ipynb rename to experiments/2_20news.ipynb diff --git a/rcv1.ipynb b/experiments/3_rcv1.ipynb similarity index 100% rename from rcv1.ipynb rename to experiments/3_rcv1.ipynb diff --git a/rcv1_dev.ipynb b/experiments/3_rcv1_dev.ipynb similarity index 100% rename from rcv1_dev.ipynb rename to experiments/3_rcv1_dev.ipynb diff --git a/experiments/makefile b/experiments/makefile index 22a1c95..3a29a42 100644 --- a/experiments/makefile +++ b/experiments/makefile @@ -6,13 +6,6 @@ $(NB): jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout=-1 $@ clean: - @for nb in $(NB); do \ - printf "%s" "$$(jq --indent 1 ' \ - .metadata = {} \ - | (.cells[] | select(has("outputs")) | .outputs) = [] \ - | (.cells[] | select(has("execution_count")) | .execution_count) = null \ - | .cells[].metadata = {} \ - ' $$nb)" > $$nb; \ - done + jupyter nbconvert --inplace --ClearOutputPreprocessor.enabled=True $(NB) .PHONY: run $(NB) clean diff --git a/nips2016/makefile b/nips2016/makefile deleted file mode 100644 index 3a29a42..0000000 --- 
a/nips2016/makefile +++ /dev/null @@ -1,11 +0,0 @@ -NB = $(sort $(wildcard *.ipynb)) - -run: $(NB) - -$(NB): - jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout=-1 $@ - -clean: - jupyter nbconvert --inplace --ClearOutputPreprocessor.enabled=True $(NB) - -.PHONY: run $(NB) clean From 3265c79bfc35748772a0b58c078f11fbd47d688f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Sat, 22 Feb 2020 02:01:23 +0100 Subject: [PATCH 23/23] requirements: add version numbers and dependencies --- requirements.txt | 83 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 18 deletions(-) diff --git a/requirements.txt b/requirements.txt index 15ce4fb..7193ed6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,69 @@ -numpy -scipy -pandas -tables -scikit-learn -matplotlib -seaborn +# Version numbers have been retrieved from a range of machines and environments. +# Take them with a grain of salt. -tensorflow-gpu -#tensorflow +# Direct dependencies +#python==3.5 # 3.4 / 3.6 +#pip==1.5.4 +#setuptools==21.0.0 +numpy==1.11.0 # 1.12.1 +scipy==0.17.0 # 0.19.0 +pandas==0.20.0 +scikit-learn==0.18.1 +matplotlib==1.5.1 # 2.0.1 +seaborn==0.7.1 +tqdm==4.11.2 +gensim==2.1.0 # 0.12.4 / 2.0.0 # Only for NLP experiments. +#graph-tool==2.26 # Cannot be installed with pip. +tensorflow-gpu==1.1.0 # 0.8.0 # Or tensorflow if not running on GPU. +#networkx==1.11 # Only considered at some point. +#networkit==4.2 # Only considered at some point. -jupyter -ipython +# Dependencies of the above. +boto==2.46.1 # 2.40.0 +bz2file==0.98 +cycler==0.10.0 # 0.9.0 +protobuf==3.3.0 # 3.0.0 +pyparsing==2.2.0 # 2.1.4 +python-dateutil==2.6.0 # 2.5.3 +pytz==2016.4 # 2017.2 +requests==2.13.0 # 2.9.1 / 2.10.0 +six==1.10.0 +smart-open==1.5.2 # 1.3.3 +Werkzeug==0.12.1 -python-dotenv -tqdm +# Jupyter notebook and its dependencies. +notebook==5.0.0 # 4.2.0 +bleach==2.0.0 # 3.1.1 +decorator==4.0.9 # 4.0.6 / 4.0.11 / 4.4.1 +entrypoints==0.2.2 +html5lib==0.999999999 +ipykernel==4.6.1 # 4.3.1 +ipython==6.0.0 # 4.2.0 +ipython-genutils==0.2.0 # 0.1.0 +jedi==0.10.2 +Jinja2==2.8 # 2.9.6 +jsonschema==2.6.0 # 2.5.1 +jupyter-client==5.0.1 # 4.2.2 +jupyter-core==4.3.0 # 4.1.0 +MarkupSafe==0.23 # 1.0 +mistune==0.7.4 +nbconvert==5.1.1 # 4.2.0 +nbformat==4.3.0 # 4.0.1 +pandocfilters==1.4.1 +pexpect==4.2.1 # 4.0.1 +pickleshare==0.7.4 # 0.7.2 +prompt-toolkit==1.0.10 +ptyprocess==0.5.1 +Pygments==2.2.0 # 2.1.3 +pyzmq==16.0.2 # 15.2.0 +simplegeneric==0.8.1 +terminado==0.6 +testpath==0.3 +tornado==4.3 # 4.2.1 / 4.4.2 / 4.5.1 +traitlets==4.3.2 # 4.2.1 +wcwidth==0.1.7 +webencodings==0.5.1 -# Only needed for NLP experiments. -gensim - -# Cannot be installed with pip. -#graph-tool +# dotenv and its dependency. +#python-dotenv==0.6.4 +#click==6.7
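The header of requirements.txt above warns that the pinned versions were collected from a range of machines and should be taken with a grain of salt. A quick way to see how a given environment deviates from those pins is to compare them with what is actually installed. The helper below is an editor's sketch, not part of the repository; it only assumes the requirements.txt format shown above (name==version entries, with # starting a comment) and the pkg_resources module shipped with setuptools.

import pkg_resources

def check_requirements(path='requirements.txt'):
    """Report packages whose installed version differs from the pinned one."""
    for line in open(path):
        line = line.split('#')[0].strip()   # drop comments and blank lines
        if '==' not in line:
            continue
        name, pinned = (s.strip() for s in line.split('=='))
        try:
            installed = pkg_resources.get_distribution(name).version
        except pkg_resources.DistributionNotFound:
            print('{:20} missing, pinned {}'.format(name, pinned))
            continue
        if installed != pinned:
            print('{:20} installed {}, pinned {}'.format(name, installed, pinned))

check_requirements()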