From 1269b1b52b166c75b7bb714a5cdf3a12a895103e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Thu, 4 May 2017 15:40:32 +0000 Subject: [PATCH 01/23] plot filters --- nips2016/mnist.ipynb | 120 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 110 insertions(+), 10 deletions(-) diff --git a/nips2016/mnist.ipynb b/nips2016/mnist.ipynb index 8d594f6..73846be 100644 --- a/nips2016/mnist.ipynb +++ b/nips2016/mnist.ipynb @@ -116,7 +116,7 @@ "val_data = coarsening.perm_data(val_data, perm)\n", "test_data = coarsening.perm_data(test_data, perm)\n", "print('Execution time: {:.2f}s'.format(time.process_time() - t_start))\n", - "del perm" + "#del perm" ] }, { @@ -164,10 +164,10 @@ "source": [ "common = {}\n", "common['dir_name'] = 'mnist/'\n", - "common['num_epochs'] = 20\n", + "common['num_epochs'] = 4 #20\n", "common['batch_size'] = 100\n", "common['decay_steps'] = mnist.train.num_examples / common['batch_size']\n", - "common['eval_frequency'] = 30 * common['num_epochs']\n", + "common['eval_frequency'] = 100 #30 * common['num_epochs']\n", "common['brelu'] = 'b1relu'\n", "common['pool'] = 'mpool1'\n", "C = max(mnist.train.labels) + 1 # number of classes\n", @@ -183,7 +183,7 @@ }, "outputs": [], "source": [ - "if True:\n", + "if False:\n", " name = 'softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", @@ -228,13 +228,14 @@ }, "outputs": [], "source": [ - "if True:\n", + "if False:\n", " name = 'fgconv_softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", " params['filter'] = 'fourier'\n", " params['K'] = [L[0].shape[0]]\n", - " model_perf.test(models.cgcnn(L, **params), name, params,\n", + " model_f = models.cgcnn(L, **params)\n", + " model_perf.test(model_f, name, params,\n", " train_data, train_labels, val_data, val_labels, test_data, test_labels)" ] }, @@ -246,12 +247,13 @@ }, "outputs": [], "source": [ - "if True:\n", + "if False:\n", " name = 'sgconv_softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", " params['filter'] = 'spline'\n", - " model_perf.test(models.cgcnn(L, **params), name, params,\n", + " model_s = models.cgcnn(L, **params)\n", + " model_perf.test(model_s, name, params,\n", " train_data, train_labels, val_data, val_labels, test_data, test_labels)" ] }, @@ -264,14 +266,15 @@ "outputs": [], "source": [ "# With 'chebyshev2' and 'b2relu', it corresponds to cgcnn2_2(L[0], F=10, K=20).\n", - "if True:\n", + "if False:\n", " name = 'cgconv_softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", " params['filter'] = 'chebyshev5'\n", "# params['filter'] = 'chebyshev2'\n", "# params['brelu'] = 'b2relu'\n", - " model_perf.test(models.cgcnn(L, **params), name, params,\n", + " model_c = models.cgcnn(L, **params)\n", + " model_perf.test(model_c, name, params,\n", " train_data, train_labels, val_data, val_labels, test_data, test_labels)" ] }, @@ -361,6 +364,103 @@ "model_perf.show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def plot_filters(coeffs):\n", + " fig = plt.figure(figsize=(15,5))\n", + " ax = fig.add_subplot(1,1,1)\n", + " for coeff in coeffs:\n", + " c = eval(coeff)\n", + " label = '{}: L={:1.2e}, |dL|={:1.2e}'.format(coeff, L(c), np.linalg.norm(dL(X,Y,c)))\n", + " ax.plot(lamb, c, '.-', label=label)\n", + "# np.testing.assert_allclose(np.linalg.norm(c)**2, E, rtol=1e-2)\n", + " ax.set_xlim(lamb[0], lamb[-1])\n", + " ax.set_title('Filter coefficients, M={}, N={}, 
eps={}'.format(M, N, eps))\n", + " ax.set_xlabel('frequency')\n", + " ax.set_ylabel('amplitude')\n", + " ax.legend(loc='best')\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "a = np.array([1,2,3])\n", + "print(a[[2,0,1]][:2])\n", + "\n", + "print(mnist.train.labels[0])\n", + "plt.imshow(mnist.train.images[0].reshape(28,28))\n", + "\n", + "a = np.random.permutation(range(len(perm)))\n", + "b = a[perm]\n", + "c = b[idx]\n", + "assert np.all(a == c)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model = model_f\n", + "\n", + "sess = tf.Session(graph=model.graph)\n", + "filename = tf.train.latest_checkpoint(os.path.join('..', 'checkpoints', model.dir_name))\n", + "model.op_saver.restore(sess, filename)\n", + "var = model.graph.get_tensor_by_name('conv1/weights' + ':0')\n", + "val = sess.run(var)\n", + "sess.close()\n", + "\n", + "lamb, U = graph.fourier(L[0])\n", + "\n", + "#filters = model_f.get_var('conv1/weights')\n", + "filters = val\n", + "#filt_fourier = filt\n", + "\n", + "i = 6\n", + "\n", + "print(filters.shape)\n", + "plt.figure(figsize=(15,5))\n", + "plt.plot(lamb, filters[:,i,0])\n", + "\n", + "print(lamb[0], lamb[-1])\n", + "\n", + "filt = U.dot(filters[:,i,0])\n", + "plt.figure(figsize=(15,5))\n", + "plt.plot(filt)\n", + "\n", + "print(len(lamb)-28**2)\n", + "indices = np.array(perm) >= 28**2\n", + "\n", + "print(train_data[0,indices])\n", + "\n", + "idx = np.argsort(perm)\n", + "filt = filt[idx]\n", + "plt.figure(figsize=(15,5))\n", + "plt.plot(train_data[0,perm])\n", + "\n", + "plt.figure(figsize=(15,5))\n", + "img = train_data[0,idx][:28**2].reshape(28,28)\n", + "plt.imshow(train_data[0,idx][:28**2].reshape(28,28))\n", + "assert np.allclose(train_data[0,idx][:28**2].reshape(28,28), mnist.train.images[0].reshape(28,28))" + ] + }, { "cell_type": "code", "execution_count": null, From a1aaf9b73a22dcb46310b1e2f440a74eb23a05e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Thu, 4 May 2017 15:53:58 +0000 Subject: [PATCH 02/23] gitignore: local configuration --- .gitignore | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index d170d81..0dc54b7 100644 --- a/.gitignore +++ b/.gitignore @@ -5,11 +5,13 @@ __pycache__/ # IPython checkpoints .ipynb_checkpoints/ -# Datasets +# Local configuration +.env +.python-version + +# Data data/ -# Tensorflow summaries +# Tensorflow summaries & model parameters summaries/ - -# Model parameters checkpoints/ From 76cebe61187dae9d0516cc58c2e7a5627f4c4434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Tue, 16 May 2017 13:00:29 +0200 Subject: [PATCH 03/23] makefile: clean notebook JSON --- experiments/makefile | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 experiments/makefile diff --git a/experiments/makefile b/experiments/makefile new file mode 100644 index 0000000..6bfd9c2 --- /dev/null +++ b/experiments/makefile @@ -0,0 +1,18 @@ +NB = $(sort $(wildcard *.ipynb)) + +run: $(NB) + +$(NB): + jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout=-1 $@ + +clean: + @for nb in $(NB); do \ + echo "$$(jq --indent 1 ' \ + .metadata = {} \ + | (.cells[] | select(has("outputs")) | .outputs) = [] \ + | (.cells[] | select(has("execution_count")) | .execution_count) = 
null \ + | .cells[].metadata = {} \ + ' $$nb)" > $$nb; \ + done + +.PHONY: run $(NB) clean From f3064f746b0a2c428ad22c896b5220431cfb3678 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Tue, 16 May 2017 12:29:24 +0000 Subject: [PATCH 04/23] wikipedia: hyperlink graph --- experiments/4_wikipedia_traffic.ipynb | 220 ++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 experiments/4_wikipedia_traffic.ipynb diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb new file mode 100644 index 0000000..73b0a6d --- /dev/null +++ b/experiments/4_wikipedia_traffic.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Wikipedia Traffic +", + " +", + "This notebook aims at modeling user traffic on [Wikipedia](https://wikipedia.org) using a recurrent graph convolutional neural network. +", + " +", + "Goal: anomaly detection. Can be used to detect events in the real world. Other applications: +", + "* intrusion detection on telecomunnication networks, +", + "* anomaly detection on energy networks, +", + "* accident detection on transporation networks. +", + " +", + "Events: Super Bowl, Academy Awards, Grammy, Miss Universe, Golden Globe. Mostly December-February. +", + "Missed: Charlie Hebdo, Ebola +", + " +", + "Network is very large: 5M nodes, 300M edges. Downsampling ideas: +", + "* Choose a category, e.g. science. +", + "* Take most active ones. +", + "* Concatenate in modules / communities / super-nodes. +", + " +", + "Raw data +", + "* [Wikimedia SQL dumps](https://dumps.wikimedia.org/enwiki/), to construct the hyperlink graph. +", + " * Network size: 5M nodes, 300M edges. +", + "* [Pagecounts](https://dumps.wikimedia.org/other/pagecounts-raw/) as activations on the graph. +", + " * Data from 2014-09-23 0h to 2015-06-05 22h. +", + " * 6142 hours in total." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline +", + " +", + "import os +", + " +", + "import numpy as np +", + "import pandas as pd +", + "import matplotlib.pyplot as plt +", + "import seaborn as sns +", + "import graph_tool.all as gt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext dotenv +", + "%dotenv .env +", + " +", + "WIKI_RAW = os.environ.get('WIKI_RAW') # Downloaded from dumps.wikimedia.org. +", + "WIKI_CLEAN = os.environ.get('WIKI_CLEAN') # Processed by Kirell Benzi." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.set_context(\"notebook\", font_scale=1.5) +", + "plt.rcParams['figure.figsize'] = (17, 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1 Hyperlink graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = gt.load_graph(os.path.join(WIKI_CLEAN, 'enwiki-20150403-graph.gt'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g.is_directed() +", + "#g.set_directed(False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('{:.2e} vertices'.format(g.num_vertices())) +", + "print('{:.2e} edges'.format(g.num_edges())) +", + " +", + "g.list_properties()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "idx = 42 +", + "page_title = g.vertex_properties['page_title'][idx] +", + "page_id = g.vertex_properties['page_id'][idx] +", + "print('{}: {}'.format(page_id, page_title))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hist = gt.vertex_hist(g, 'total') +", + "plt.loglog(hist[1][:-1], hist[0]) +", + "plt.xlabel('#edges') +", + "plt.ylabel('#nodes');" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Too large to be drawn in full. +", + "#gt.sfdp_layout +", + "#gt.graph_draw(g)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove uninteresting pages. +", + "#g.set_vertex_filter() +", + "#g.remove_vertex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "A = gt.adjacency(g) +", + "A" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} From 2ca8bc1caff284073eb7f13e5bfacd11aa68334e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Tue, 16 May 2017 19:38:31 +0000 Subject: [PATCH 05/23] wikipedia: pages & page views --- experiments/4_wikipedia_traffic.ipynb | 250 +++++++++++++++++++++++++- 1 file changed, 249 insertions(+), 1 deletion(-) diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb index 73b0a6d..7fb854a 100644 --- a/experiments/4_wikipedia_traffic.ipynb +++ b/experiments/4_wikipedia_traffic.ipynb @@ -44,7 +44,7 @@ ", " * Network size: 5M nodes, 300M edges. ", - "* [Pagecounts](https://dumps.wikimedia.org/other/pagecounts-raw/) as activations on the graph. + "* [Pagecounts](https://dumps.wikimedia.org/other/pagecounts-all-sites/) as activations on the graph. ", " * Data from 2014-09-23 0h to 2015-06-05 22h. ", @@ -62,8 +62,14 @@ " ", "import os +", + "import datetime ", " +", + "import IPython.display as ipd +", + "from tqdm import tqdm_notebook ", "import numpy as np ", @@ -212,6 +218,248 @@ ", "A" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2 Pages +", + " +", + "A lot of pages in `pagecounts` are redirections to actual pages. We need to merge the hits." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filepath = os.path.join(WIKI_CLEAN, 'enwiki-20150403-page-redirect.csv.gz') +", + "redirect = pd.read_csv(filepath, compression='gzip', sep='|', encoding='utf-8', quoting=3, index_col=1) +", + " +", + "redirect.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#assert len(redirect) == len(redirect['page_id'].unique()) +", + "print('{:.2e} unique pages, {:.2e} pages including redirections'.format( +", + " len(redirect['fix_page_id'].unique()), +", + " len(redirect)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "redirect.loc[page_id]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def id2title(page_id): +", + " return redirect.at[page_id, 'fix_page_title'] +", + " #return redirect[redirect['page_id'] == page_id]['fix_page_title'].values[0] +", + "id2title(330)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def find_in_title(string): +", + " +", + " def find(page_title, string): +", + " try: +", + " return string.lower() in page_title.lower() +", + " except: +", + " return False +", + " +", + " #b = redirect['fix_page_title'].apply(find, string=string) +", + " b = redirect['page_title'].apply(find, string=string) +", + " #return redirect[b] +", + " return redirect[b & (redirect['is_redirect'] == 0)] +", + " +", + "#find_in_title('ebola') +", + "find_in_title('zirka')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3 Page views / counts +", + " +", + "Graph has 4M nodes but lot of pages are not seen much. `signal_500.h5` lists only 118k pages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Kirell's signal which includes views when greater than 500. 
+", + "filepath = os.path.join(WIKI_CLEAN, 'signal_500.h5') +", + "signal = pd.read_hdf(filepath, 'data') +", + "signal['count_views'].plot(kind='hist', logy=True) +", + "print(len(signal), len(signal['page_id'].unique()), len(signal['layer'].unique()), signal['count_views'].max()) +", + "signal.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filepath = '../data/wikipedia/activations_all.h5' +", + " +", + "if os.path.exists(filepath): +", + " activations = pd.read_hdf(filepath, 'activations') +", + " +", + "else: +", + " START = datetime.datetime(2014, 9, 23, 2) +", + " #END = datetime.datetime(2014, 9, 24, 2) +", + " END = datetime.datetime(2015, 6, 5, 20) +", + " +", + " activations = pd.DataFrame(columns=pd.date_range(START, END, freq='H')) +", + " +", + " folder = os.path.join(WIKI_CLEAN, 'pagecounts_clean') +", + " for date in tqdm_notebook(activations.columns): +", + " filename = 'pagecounts-{:4d}{:02d}{:02d}-{:02d}0000.csv.gz'.format(date.year, date.month, date.day, date.hour) +", + " filename = os.path.join(folder, filename) +", + " pagecounts = pd.read_csv(filename, compression='gzip', index_col=0, squeeze=True) +", + " #print(len(pagecounts), filename) +", + " print(date) +", + " activations[date] = pagecounts +", + " activations[date] = activations[date].fillna(0).astype(np.int32) +", + " +", + " activations.to_hdf(filepath, 'activations') +", + " +", + "print(activations.shape) +", + "activations.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Predictable fluctuations with unpredictable spikes. Those are outliers. +", + "* Anomalies should be outliers persisting for many hours." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "page_id = 40817806 +", + "page_id = 25 +", + "title = '{} ({})'.format(id2title(page_id), page_id) +", + "activations.loc[page_id].plot(title=title) +", + "plt.ylabel('#hits per hour');" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#activations.plot(kind='hist', logy=True);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TO_REMOVE = [15580374, 42727860] # page ids to remove (Main page, Undefined)" + ] } ], "metadata": {}, From 38c33e0f3c3c627183f92640447e646cf5bf7f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Tue, 16 May 2017 21:40:11 +0200 Subject: [PATCH 06/23] requirements: update --- requirements.txt | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8217513..15ce4fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,22 @@ numpy scipy +pandas +tables scikit-learn matplotlib +seaborn -gensim tensorflow-gpu #tensorflow jupyter ipython + +python-dotenv +tqdm + +# Only needed for NLP experiments. +gensim + +# Cannot be installed with pip. 
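+# For example, install it from your distribution's packages or via conda instead
+# (see the graph-tool installation instructions).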
+#graph-tool From 4c4f2b510f86f102763e26ffc33c2f9ec6c32bc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Tue, 16 May 2017 20:10:29 +0000 Subject: [PATCH 07/23] makefile: some echo implementations interpret \n --- experiments/4_wikipedia_traffic.ipynb | 347 +++++++++----------------- experiments/makefile | 2 +- 2 files changed, 117 insertions(+), 232 deletions(-) diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb index 7fb854a..83e18df 100644 --- a/experiments/4_wikipedia_traffic.ipynb +++ b/experiments/4_wikipedia_traffic.ipynb @@ -4,50 +4,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Wikipedia Traffic -", - " -", - "This notebook aims at modeling user traffic on [Wikipedia](https://wikipedia.org) using a recurrent graph convolutional neural network. -", - " -", - "Goal: anomaly detection. Can be used to detect events in the real world. Other applications: -", - "* intrusion detection on telecomunnication networks, -", - "* anomaly detection on energy networks, -", - "* accident detection on transporation networks. -", - " -", - "Events: Super Bowl, Academy Awards, Grammy, Miss Universe, Golden Globe. Mostly December-February. -", - "Missed: Charlie Hebdo, Ebola -", - " -", - "Network is very large: 5M nodes, 300M edges. Downsampling ideas: -", - "* Choose a category, e.g. science. -", - "* Take most active ones. -", - "* Concatenate in modules / communities / super-nodes. -", - " -", - "Raw data -", - "* [Wikimedia SQL dumps](https://dumps.wikimedia.org/enwiki/), to construct the hyperlink graph. -", - " * Network size: 5M nodes, 300M edges. -", - "* [Pagecounts](https://dumps.wikimedia.org/other/pagecounts-all-sites/) as activations on the graph. -", - " * Data from 2014-09-23 0h to 2015-06-05 22h. -", + "# Wikipedia Traffic\n", + "\n", + "This notebook aims at modeling user traffic on [Wikipedia](https://wikipedia.org) using a recurrent graph convolutional neural network.\n", + "\n", + "Goal: anomaly detection. Can be used to detect events in the real world. Other applications:\n", + "* intrusion detection on telecomunnication networks,\n", + "* anomaly detection on energy networks,\n", + "* accident detection on transporation networks.\n", + "\n", + "Events: Super Bowl, Academy Awards, Grammy, Miss Universe, Golden Globe. Mostly December-February.\n", + "Missed: Charlie Hebdo, Ebola\n", + "\n", + "Network is very large: 5M nodes, 300M edges. Downsampling ideas:\n", + "* Choose a category, e.g. science.\n", + "* Take most active ones.\n", + "* Concatenate in modules / communities / super-nodes.\n", + "\n", + "Raw data\n", + "* [Wikimedia SQL dumps](https://dumps.wikimedia.org/enwiki/), to construct the hyperlink graph.\n", + " * Network size: 5M nodes, 300M edges.\n", + "* [Pagecounts](https://dumps.wikimedia.org/other/pagecounts-all-sites/) as activations on the graph.\n", + " * Data from 2014-09-23 0h to 2015-06-05 22h.\n", " * 6142 hours in total." 
] }, @@ -57,28 +35,17 @@ "metadata": {}, "outputs": [], "source": [ - "%matplotlib inline -", - " -", - "import os -", - "import datetime -", - " -", - "import IPython.display as ipd -", - "from tqdm import tqdm_notebook -", - "import numpy as np -", - "import pandas as pd -", - "import matplotlib.pyplot as plt -", - "import seaborn as sns -", + "%matplotlib inline\n", + "\n", + "import os\n", + "import datetime\n", + "\n", + "import IPython.display as ipd\n", + "from tqdm import tqdm_notebook\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", "import graph_tool.all as gt" ] }, @@ -88,14 +55,10 @@ "metadata": {}, "outputs": [], "source": [ - "%load_ext dotenv -", - "%dotenv .env -", - " -", - "WIKI_RAW = os.environ.get('WIKI_RAW') # Downloaded from dumps.wikimedia.org. -", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "WIKI_RAW = os.environ.get('WIKI_RAW') # Downloaded from dumps.wikimedia.org.\n", "WIKI_CLEAN = os.environ.get('WIKI_CLEAN') # Processed by Kirell Benzi." ] }, @@ -105,8 +68,7 @@ "metadata": {}, "outputs": [], "source": [ - "sns.set_context(\"notebook\", font_scale=1.5) -", + "sns.set_context(\"notebook\", font_scale=1.5)\n", "plt.rcParams['figure.figsize'] = (17, 5)" ] }, @@ -132,8 +94,7 @@ "metadata": {}, "outputs": [], "source": [ - "g.is_directed() -", + "g.is_directed()\n", "#g.set_directed(False)" ] }, @@ -143,12 +104,9 @@ "metadata": {}, "outputs": [], "source": [ - "print('{:.2e} vertices'.format(g.num_vertices())) -", - "print('{:.2e} edges'.format(g.num_edges())) -", - " -", + "print('{:.2e} vertices'.format(g.num_vertices()))\n", + "print('{:.2e} edges'.format(g.num_edges()))\n", + "\n", "g.list_properties()" ] }, @@ -158,12 +116,9 @@ "metadata": {}, "outputs": [], "source": [ - "idx = 42 -", - "page_title = g.vertex_properties['page_title'][idx] -", - "page_id = g.vertex_properties['page_id'][idx] -", + "idx = 42\n", + "page_title = g.vertex_properties['page_title'][idx]\n", + "page_id = g.vertex_properties['page_id'][idx]\n", "print('{}: {}'.format(page_id, page_title))" ] }, @@ -173,12 +128,9 @@ "metadata": {}, "outputs": [], "source": [ - "hist = gt.vertex_hist(g, 'total') -", - "plt.loglog(hist[1][:-1], hist[0]) -", - "plt.xlabel('#edges') -", + "hist = gt.vertex_hist(g, 'total')\n", + "plt.loglog(hist[1][:-1], hist[0])\n", + "plt.xlabel('#edges')\n", "plt.ylabel('#nodes');" ] }, @@ -188,10 +140,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Too large to be drawn in full. -", - "#gt.sfdp_layout -", + "# Too large to be drawn in full.\n", + "#gt.sfdp_layout\n", "#gt.graph_draw(g)" ] }, @@ -201,10 +151,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Remove uninteresting pages. -", - "#g.set_vertex_filter() -", + "# Remove uninteresting pages.\n", + "#g.set_vertex_filter()\n", "#g.remove_vertex" ] }, @@ -214,8 +162,7 @@ "metadata": {}, "outputs": [], "source": [ - "A = gt.adjacency(g) -", + "A = gt.adjacency(g)\n", "A" ] }, @@ -223,10 +170,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2 Pages -", - " -", + "## 2 Pages\n", + "\n", "A lot of pages in `pagecounts` are redirections to actual pages. We need to merge the hits." 
] }, @@ -236,12 +181,9 @@ "metadata": {}, "outputs": [], "source": [ - "filepath = os.path.join(WIKI_CLEAN, 'enwiki-20150403-page-redirect.csv.gz') -", - "redirect = pd.read_csv(filepath, compression='gzip', sep='|', encoding='utf-8', quoting=3, index_col=1) -", - " -", + "filepath = os.path.join(WIKI_CLEAN, 'enwiki-20150403-page-redirect.csv.gz')\n", + "redirect = pd.read_csv(filepath, compression='gzip', sep='|', encoding='utf-8', quoting=3, index_col=1)\n", + "\n", "redirect.head()" ] }, @@ -251,12 +193,9 @@ "metadata": {}, "outputs": [], "source": [ - "#assert len(redirect) == len(redirect['page_id'].unique()) -", - "print('{:.2e} unique pages, {:.2e} pages including redirections'.format( -", - " len(redirect['fix_page_id'].unique()), -", + "#assert len(redirect) == len(redirect['page_id'].unique())\n", + "print('{:.2e} unique pages, {:.2e} pages including redirections'.format(\n", + " len(redirect['fix_page_id'].unique()),\n", " len(redirect)))" ] }, @@ -275,12 +214,9 @@ "metadata": {}, "outputs": [], "source": [ - "def id2title(page_id): -", - " return redirect.at[page_id, 'fix_page_title'] -", - " #return redirect[redirect['page_id'] == page_id]['fix_page_title'].values[0] -", + "def id2title(page_id):\n", + " return redirect.at[page_id, 'fix_page_title']\n", + " #return redirect[redirect['page_id'] == page_id]['fix_page_title'].values[0]\n", "id2title(330)" ] }, @@ -290,34 +226,20 @@ "metadata": {}, "outputs": [], "source": [ - "def find_in_title(string): -", - " -", - " def find(page_title, string): -", - " try: -", - " return string.lower() in page_title.lower() -", - " except: -", - " return False -", - " -", - " #b = redirect['fix_page_title'].apply(find, string=string) -", - " b = redirect['page_title'].apply(find, string=string) -", - " #return redirect[b] -", - " return redirect[b & (redirect['is_redirect'] == 0)] -", - " -", - "#find_in_title('ebola') -", + "def find_in_title(string):\n", + "\n", + " def find(page_title, string):\n", + " try:\n", + " return string.lower() in page_title.lower()\n", + " except:\n", + " return False\n", + "\n", + " #b = redirect['fix_page_title'].apply(find, string=string)\n", + " b = redirect['page_title'].apply(find, string=string)\n", + " #return redirect[b]\n", + " return redirect[b & (redirect['is_redirect'] == 0)]\n", + "\n", + "#find_in_title('ebola')\n", "find_in_title('zirka')" ] }, @@ -325,10 +247,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3 Page views / counts -", - " -", + "## 3 Page views / counts\n", + "\n", "Graph has 4M nodes but lot of pages are not seen much. `signal_500.h5` lists only 118k pages." ] }, @@ -338,16 +258,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Kirell's signal which includes views when greater than 500. 
-", - "filepath = os.path.join(WIKI_CLEAN, 'signal_500.h5') -", - "signal = pd.read_hdf(filepath, 'data') -", - "signal['count_views'].plot(kind='hist', logy=True) -", - "print(len(signal), len(signal['page_id'].unique()), len(signal['layer'].unique()), signal['count_views'].max()) -", + "# Kirell's signal which includes views when greater than 500.\n", + "filepath = os.path.join(WIKI_CLEAN, 'signal_500.h5')\n", + "signal = pd.read_hdf(filepath, 'data')\n", + "signal['count_views'].plot(kind='hist', logy=True)\n", + "print(len(signal), len(signal['page_id'].unique()), len(signal['layer'].unique()), signal['count_views'].max())\n", "signal.head()" ] }, @@ -357,56 +272,31 @@ "metadata": {}, "outputs": [], "source": [ - "filepath = '../data/wikipedia/activations_all.h5' -", - " -", - "if os.path.exists(filepath): -", - " activations = pd.read_hdf(filepath, 'activations') -", - " -", - "else: -", - " START = datetime.datetime(2014, 9, 23, 2) -", - " #END = datetime.datetime(2014, 9, 24, 2) -", - " END = datetime.datetime(2015, 6, 5, 20) -", - " -", - " activations = pd.DataFrame(columns=pd.date_range(START, END, freq='H')) -", - " -", - " folder = os.path.join(WIKI_CLEAN, 'pagecounts_clean') -", - " for date in tqdm_notebook(activations.columns): -", - " filename = 'pagecounts-{:4d}{:02d}{:02d}-{:02d}0000.csv.gz'.format(date.year, date.month, date.day, date.hour) -", - " filename = os.path.join(folder, filename) -", - " pagecounts = pd.read_csv(filename, compression='gzip', index_col=0, squeeze=True) -", - " #print(len(pagecounts), filename) -", - " print(date) -", - " activations[date] = pagecounts -", - " activations[date] = activations[date].fillna(0).astype(np.int32) -", - " -", - " activations.to_hdf(filepath, 'activations') -", - " -", - "print(activations.shape) -", + "filepath = '../data/wikipedia/activations_all.h5'\n", + "\n", + "if os.path.exists(filepath):\n", + " activations = pd.read_hdf(filepath, 'activations')\n", + "\n", + "else:\n", + " START = datetime.datetime(2014, 9, 23, 2)\n", + " #END = datetime.datetime(2014, 9, 24, 2)\n", + " END = datetime.datetime(2015, 6, 5, 20)\n", + "\n", + " activations = pd.DataFrame(columns=pd.date_range(START, END, freq='H'))\n", + "\n", + " folder = os.path.join(WIKI_CLEAN, 'pagecounts_clean')\n", + " for date in tqdm_notebook(activations.columns):\n", + " filename = 'pagecounts-{:4d}{:02d}{:02d}-{:02d}0000.csv.gz'.format(date.year, date.month, date.day, date.hour)\n", + " filename = os.path.join(folder, filename)\n", + " pagecounts = pd.read_csv(filename, compression='gzip', index_col=0, squeeze=True)\n", + " #print(len(pagecounts), filename)\n", + " print(date)\n", + " activations[date] = pagecounts\n", + " activations[date] = activations[date].fillna(0).astype(np.int32)\n", + "\n", + " activations.to_hdf(filepath, 'activations')\n", + "\n", + "print(activations.shape)\n", "activations.head()" ] }, @@ -414,8 +304,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "* Predictable fluctuations with unpredictable spikes. Those are outliers. -", + "* Predictable fluctuations with unpredictable spikes. Those are outliers.\n", "* Anomalies should be outliers persisting for many hours." 
] }, @@ -425,14 +314,10 @@ "metadata": {}, "outputs": [], "source": [ - "page_id = 40817806 -", - "page_id = 25 -", - "title = '{} ({})'.format(id2title(page_id), page_id) -", - "activations.loc[page_id].plot(title=title) -", + "page_id = 40817806\n", + "page_id = 25\n", + "title = '{} ({})'.format(id2title(page_id), page_id)\n", + "activations.loc[page_id].plot(title=title)\n", "plt.ylabel('#hits per hour');" ] }, @@ -465,4 +350,4 @@ "metadata": {}, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/experiments/makefile b/experiments/makefile index 6bfd9c2..22a1c95 100644 --- a/experiments/makefile +++ b/experiments/makefile @@ -7,7 +7,7 @@ $(NB): clean: @for nb in $(NB); do \ - echo "$$(jq --indent 1 ' \ + printf "%s" "$$(jq --indent 1 ' \ .metadata = {} \ | (.cells[] | select(has("outputs")) | .outputs) = [] \ | (.cells[] | select(has("execution_count")) | .execution_count) = null \ From a270cda7c336d3064f68d4d0c260c29df9a4de98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Wed, 17 May 2017 13:53:31 +0000 Subject: [PATCH 08/23] wikipedia: compute average page views then select pages and load data --- experiments/4_wikipedia_traffic.ipynb | 157 ++++++++++++++++++++------ 1 file changed, 125 insertions(+), 32 deletions(-) diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb index 83e18df..661240e 100644 --- a/experiments/4_wikipedia_traffic.ipynb +++ b/experiments/4_wikipedia_traffic.ipynb @@ -69,7 +69,8 @@ "outputs": [], "source": [ "sns.set_context(\"notebook\", font_scale=1.5)\n", - "plt.rcParams['figure.figsize'] = (17, 5)" + "plt.rcParams['figure.figsize'] = (17, 5)\n", + "plt.rcParams['agg.path.chunksize'] = 10000 # OverflowError when plotting large series." 
] }, { @@ -239,8 +240,7 @@ " #return redirect[b]\n", " return redirect[b & (redirect['is_redirect'] == 0)]\n", "\n", - "#find_in_title('ebola')\n", - "find_in_title('zirka')" + "find_in_title('ebola')" ] }, { @@ -272,32 +272,97 @@ "metadata": {}, "outputs": [], "source": [ - "filepath = '../data/wikipedia/activations_all.h5'\n", + "def get_pagecounts(date):\n", + " filename = 'pagecounts-{:4d}{:02d}{:02d}-{:02d}0000.csv.gz'.format(date.year, date.month, date.day, date.hour)\n", + " filepath = os.path.join('..', 'data', 'wikipedia', 'pagecounts_clean', filename)\n", + " return pd.read_csv(filepath, compression='gzip', index_col=0, squeeze=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "START = datetime.datetime(2014, 9, 23, 2)\n", + "END = datetime.datetime(2014, 9, 24, 3)\n", + "END = datetime.datetime(2015, 6, 5, 20)\n", + "dates = pd.date_range(START, END, freq='H')\n", + "\n", + "activations_tot = pd.Series(\n", + " data=0,\n", + " index=g.vp['page_id'].get_array(),\n", + " dtype=np.int64\n", + ")\n", "\n", - "if os.path.exists(filepath):\n", - " activations = pd.read_hdf(filepath, 'activations')\n", + "for date in tqdm_notebook(dates):\n", + " pagecounts = get_pagecounts(date)\n", + " activations_tot += pagecounts.reindex(activations_tot.index).fillna(0).astype(np.int32)\n", "\n", - "else:\n", - " START = datetime.datetime(2014, 9, 23, 2)\n", - " #END = datetime.datetime(2014, 9, 24, 2)\n", - " END = datetime.datetime(2015, 6, 5, 20)\n", + "print(activations_tot.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The largest is the main page.\n", + "plt.semilogy(np.sort(activations_tot.values)[::-1])\n", + "\n", + "main_page = activations_tot.argmax()\n", + "print('{} ({}): {:.2e} views in total'.format(id2title(main_page), main_page, activations_tot[main_page]))\n", + "\n", + "print('{:.2e} views in total'.format(activations_tot.sum()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Power law.\n", + "activations_tot.drop(main_page).plot(kind='hist', logy=True, bins=100);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MIN_AVG_VIEWS = 100\n", "\n", - " activations = pd.DataFrame(columns=pd.date_range(START, END, freq='H'))\n", + "keep = activations_tot.index[activations_tot >= MIN_AVG_VIEWS * len(dates)]\n", + "print('{} pages have more than {} views in total ({:.0f} per hour on average)'.format(\n", + " len(keep), MIN_AVG_VIEWS * len(dates), MIN_AVG_VIEWS))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "activations = pd.DataFrame(\n", + " data=0,\n", + " index=keep,\n", + " columns=dates,\n", + " dtype=np.int32\n", + ")\n", "\n", - " folder = os.path.join(WIKI_CLEAN, 'pagecounts_clean')\n", - " for date in tqdm_notebook(activations.columns):\n", - " filename = 'pagecounts-{:4d}{:02d}{:02d}-{:02d}0000.csv.gz'.format(date.year, date.month, date.day, date.hour)\n", - " filename = os.path.join(folder, filename)\n", - " pagecounts = pd.read_csv(filename, compression='gzip', index_col=0, squeeze=True)\n", - " #print(len(pagecounts), filename)\n", - " print(date)\n", - " activations[date] = pagecounts\n", - " activations[date] = activations[date].fillna(0).astype(np.int32)\n", + "for date in tqdm_notebook(dates):\n", + " pagecounts = 
get_pagecounts(date)\n", + " activations[date] = pagecounts.reindex(activations.index).fillna(0).astype(np.int32)\n", "\n", - " activations.to_hdf(filepath, 'activations')\n", + "filepath = os.path.join('..', 'data', 'wikipedia', 'activations_{}.h5'.format(MIN_AVG_VIEWS))\n", + "activations.to_hdf(filepath, 'activations')\n", "\n", - "print(activations.shape)\n", - "activations.head()" + "print('activations: {} x {} = {}'.format(*activations.shape, activations.size))\n", + "ipd.display(activations.head())\n", + "ipd.display(activations.info())" ] }, { @@ -314,11 +379,15 @@ "metadata": {}, "outputs": [], "source": [ - "page_id = 40817806\n", - "page_id = 25\n", - "title = '{} ({})'.format(id2title(page_id), page_id)\n", - "activations.loc[page_id].plot(title=title)\n", - "plt.ylabel('#hits per hour');" + "activations = pd.read_hdf(filepath, 'activations')\n", + "\n", + "DROP = [\n", + " 15580374, # Main page draws ~10% traffic\n", + " 42727860, # Undefined has the largest peaks of traffic while being inactive after 2014-10\n", + "# 8063851, # Feynman point has a very large traffic peak which is probably an error.\n", + "# 2697304, # Gold_as_an_investment has many traffic peaks.\n", + "]\n", + "activations.drop(DROP, inplace=True)" ] }, { @@ -327,14 +396,18 @@ "metadata": {}, "outputs": [], "source": [ - "#activations.plot(kind='hist', logy=True);" + "print('Max of {0} views at page id {2} and time {1}'.format(\n", + " activations.unstack().max(), *activations.unstack().argmax())) \n", + "plt.plot(activations.values.reshape(-1));" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "## Cleanup" + "plt.hist(activations.values.reshape(-1), bins=100, log=True);" ] }, { @@ -343,7 +416,27 @@ "metadata": {}, "outputs": [], "source": [ - "TO_REMOVE = [15580374, 42727860] # page ids to remove (Main page, Undefined)" + "# Events.\n", + "page_id = 40817806 # Ebola\n", + "page_id = 44635 # Grammy\n", + "page_id = 150340 # Miss Universe\n", + "page_id = 27718 # Super Bowl\n", + "#page_id = 324 # Academy Awards\n", + "#page_id = 44969225 # Charlie Hebdo shooting\n", + "#page_id = 2251390 # Charlie Hebdo\n", + "\n", + "# Remarkable things.\n", + "#page_id = 25\n", + "#page_id = 15580374 # Main Page --> largest traffic (~10%)\n", + "#page_id = 42727860 # Undefined --> hits only before mid-oct 2014\n", + "#page_id = 670 # Alphabet --> strange drop\n", + "#page_id = 8063851 # Shall distinguish outliers (counting errors?) 
from real events\n", + "#page_id = 2697304 # Lots of peaks --> correlated with fluctuations on market?\n", + "\n", + "page_title = id2title(page_id)\n", + "activations.loc[page_id].plot(title='{} ({})'.format(page_title, page_id), logy=True)\n", + "plt.ylabel('#views per hour');\n", + "#plt.savefig('{}_{}.png'.format(page_id, page_title.lower()), dpi=300)" ] } ], From 995c86b06f140d0ccd43b12f01ee5409b98a57a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Wed, 17 May 2017 22:42:24 +0000 Subject: [PATCH 09/23] wikipedia: match graph & activations --- experiments/4_wikipedia_traffic.ipynb | 165 +++++++++++++++++++------- 1 file changed, 119 insertions(+), 46 deletions(-) diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb index 661240e..b818bd2 100644 --- a/experiments/4_wikipedia_traffic.ipynb +++ b/experiments/4_wikipedia_traffic.ipynb @@ -105,9 +105,11 @@ "metadata": {}, "outputs": [], "source": [ - "print('{:.2e} vertices'.format(g.num_vertices()))\n", - "print('{:.2e} edges'.format(g.num_edges()))\n", + "def print_graph(graph):\n", + " print('{} vertices, {} edges'.format(\n", + " graph.num_vertices(), graph.num_edges()))\n", "\n", + "print_graph(g)\n", "g.list_properties()" ] }, @@ -135,38 +137,6 @@ "plt.ylabel('#nodes');" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Too large to be drawn in full.\n", - "#gt.sfdp_layout\n", - "#gt.graph_draw(g)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Remove uninteresting pages.\n", - "#g.set_vertex_filter()\n", - "#g.remove_vertex" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "A = gt.adjacency(g)\n", - "A" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -216,9 +186,11 @@ "outputs": [], "source": [ "def id2title(page_id):\n", - " return redirect.at[page_id, 'fix_page_title']\n", - " #return redirect[redirect['page_id'] == page_id]['fix_page_title'].values[0]\n", - "id2title(330)" + " page_title = redirect.at[page_id, 'fix_page_title']\n", + " #page_title = g.vp['page_title'][page_id]\n", + " print('{}: https://en.wikipedia.org/?curid={}'.format(page_title, page_id))\n", + " return page_title\n", + "id2title(12)" ] }, { @@ -357,10 +329,11 @@ " pagecounts = get_pagecounts(date)\n", " activations[date] = pagecounts.reindex(activations.index).fillna(0).astype(np.int32)\n", "\n", + "activations.sort_index(inplace=True)\n", + "\n", "filepath = os.path.join('..', 'data', 'wikipedia', 'activations_{}.h5'.format(MIN_AVG_VIEWS))\n", "activations.to_hdf(filepath, 'activations')\n", "\n", - "print('activations: {} x {} = {}'.format(*activations.shape, activations.size))\n", "ipd.display(activations.head())\n", "ipd.display(activations.info())" ] @@ -379,15 +352,21 @@ "metadata": {}, "outputs": [], "source": [ - "activations = pd.read_hdf(filepath, 'activations')\n", + "def load_activations(filepath):\n", + " activations = pd.read_hdf(filepath, 'activations')\n", + "\n", + " DROP = [\n", + " 15580374, # Main page draws ~10% traffic.\n", + " 42727860, # Undefined has the largest peaks of traffic while being inactive after 2014-10.\n", + " # 8063851, # Feynman point has a very large traffic peak which is probably an error.\n", + " # 2697304, # Gold_as_an_investment has many traffic peaks.\n", + " ]\n", + " activations.drop(DROP, inplace=True)\n", + " \n", + " print('activations: {} 
page ids x {} hours = {}'.format(*activations.shape, activations.size))\n", + " return activations\n", "\n", - "DROP = [\n", - " 15580374, # Main page draws ~10% traffic\n", - " 42727860, # Undefined has the largest peaks of traffic while being inactive after 2014-10\n", - "# 8063851, # Feynman point has a very large traffic peak which is probably an error.\n", - "# 2697304, # Gold_as_an_investment has many traffic peaks.\n", - "]\n", - "activations.drop(DROP, inplace=True)" + "activations = load_activations(filepath)" ] }, { @@ -438,6 +417,100 @@ "plt.ylabel('#views per hour');\n", "#plt.savefig('{}_{}.png'.format(page_id, page_title.lower()), dpi=300)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4 Matching graph & activations\n", + "\n", + "Further analysis\n", + "* Ratio of in / out neighbors.\n", + "* Proportion of bidirectional hyperlinks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "activations = load_activations(os.path.join('..', 'data', 'wikipedia', 'activations_100.h5'))\n", + "graph = gt.load_graph(os.path.join('..', 'data', 'wikipedia', 'enwiki-20150403-graph.gt'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_graph(graph)\n", + "\n", + "mask = np.in1d(graph.vp['page_id'].get_array(), activations.index)\n", + "g = gt.GraphView(graph, vfilt=mask)\n", + "print_graph(g)\n", + "\n", + "l = gt.label_largest_component(g)\n", + "g = gt.GraphView(g, vfilt=l)\n", + "print_graph(g)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = gt.Graph(g, prune=True)\n", + "\n", + "sort = np.argsort(g.vp['page_id'].get_array())\n", + "sort = np.argsort(sort)\n", + "sort = g.new_vertex_property('int64_t', sort)\n", + "\n", + "g = gt.Graph(g, vorder=sort) # directed=False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "activations = activations.loc[g.vp['page_id'].get_array()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.testing.assert_equal(g.vp['page_id'].get_array(), activations.index)\n", + "\n", + "g.save(os.path.join('..', 'data', 'wikipedia', 'graph.gt'))\n", + "activations.to_hdf(os.path.join('..', 'data', 'wikipedia', 'activations.h5'), 'activations')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#gt.sfdp_layout()\n", + "#gt.graph_draw(g)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "A = gt.adjacency(g)\n", + "ipd.display(A)" + ] } ], "metadata": {}, From 9d204607c2e36ebecc9992cbe39a60e0fdf3b9fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Wed, 17 May 2017 23:28:36 +0000 Subject: [PATCH 10/23] layers.py: move layers in a class hierarchy --- lib/layers.py | 243 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 lib/layers.py diff --git a/lib/layers.py b/lib/layers.py new file mode 100644 index 0000000..c47bae5 --- /dev/null +++ b/lib/layers.py @@ -0,0 +1,243 @@ +from . 
import graph + +import numpy as np +import scipy.sparse +import tensorflow as tf + + +class Layer: + pass + + +class Fourier(Layer): + """Graph convolutional layers that filter in Fourier.""" + + def __init__(self, Fout, K): + self.Fout = Fout + self.K = K + + def __call__(self, x, L): + assert K == L.shape[0] # artificial but useful to compute number of parameters + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + # Fourier basis + _, U = graph.fourier(L) + U = tf.constant(U.T, dtype=tf.float32) + # Weights + W = self._weight_variable([M, self.Fout, Fin], regularization=False) + return self._filter_in_fourier(x, L, self.Fout, self.K, U, W) + + def _filter_in_fourier(self, x, L, Fout, K, U, W): + # TODO: N x F x M would avoid the permutations + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + x = tf.transpose(x, perm=[1, 2, 0]) # M x Fin x N + # Transform to Fourier domain + x = tf.reshape(x, [M, Fin*N]) # M x Fin*N + x = tf.matmul(U, x) # M x Fin*N + x = tf.reshape(x, [M, Fin, N]) # M x Fin x N + # Filter + x = tf.matmul(W, x) # for each feature + x = tf.transpose(x) # N x Fout x M + x = tf.reshape(x, [N*Fout, M]) # N*Fout x M + # Transform back to graph domain + x = tf.matmul(x, U) # N*Fout x M + x = tf.reshape(x, [N, Fout, M]) # N x Fout x M + return tf.transpose(x, perm=[0, 2, 1]) # N x M x Fout + + +class Spline(Fourier): + + def __call__(self, x, L): + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + # Fourier basis + lamb, U = graph.fourier(L) + U = tf.constant(U.T, dtype=tf.float32) # M x M + # Spline basis + B = self._bspline_basis(self.K, lamb, degree=3) # M x K + # B = _bspline_basis(K, len(lamb), degree=3) # M x K + B = tf.constant(B, dtype=tf.float32) + # Weights + W = self._weight_variable([self.K, self.Fout*Fin], regularization=False) + W = tf.matmul(B, W) # M x Fout*Fin + W = tf.reshape(W, [M, self.Fout, Fin]) + return self._filter_in_fourier(x, L, self.Fout, self.K, U, W) + + def _bspline_basis(self, K, x, degree=3): + """ + Return the B-spline basis. + + K: number of control points. + x: evaluation points + or number of evenly distributed evaluation points. + degree: degree of the spline. Cubic spline by default. + """ + if np.isscalar(x): + x = np.linspace(0, 1, x) + + # Evenly distributed knot vectors. + kv1 = x.min() * np.ones(degree) + kv2 = np.linspace(x.min(), x.max(), K-degree+1) + kv3 = x.max() * np.ones(degree) + kv = np.concatenate((kv1, kv2, kv3)) + + # Cox - DeBoor recursive function to compute one spline over x. + def cox_deboor(k, d): + # Test for end conditions, the rectangular degree zero spline. + if (d == 0): + return ((x - kv[k] >= 0) & (x - kv[k + 1] < 0)).astype(int) + + denom1 = kv[k + d] - kv[k] + term1 = 0 + if denom1 > 0: + term1 = ((x - kv[k]) / denom1) * cox_deboor(k, d - 1) + + denom2 = kv[k + d + 1] - kv[k + 1] + term2 = 0 + if denom2 > 0: + term2 = ((-(x - kv[k + d + 1]) / denom2) * cox_deboor(k + 1, d - 1)) + + return term1 + term2 + + # Compute basis for each point + basis = np.column_stack([cox_deboor(k, degree) for k in range(K)]) + basis[-1, -1] = 1 + return basis + + +class Chebyshev(Layer): + + def __init__(self, Fout, K): + self.Fout = Fout + self.K = K + + +class Chebyshev2(Chebyshev): + + def __call__(self, x, L): + """ + Filtering with Chebyshev interpolation + Implementation: numpy. 
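+        The Chebyshev basis is computed by graph.chebyshev (NumPy) wrapped in
+        tf.py_func, i.e. outside the TensorFlow graph.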
+ + Data: x of size N x M x F + N: number of signals + M: number of vertices + F: number of features per signal per vertex + """ + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + # Rescale Laplacian. Copy to not modify the shared L. + L = scipy.sparse.csr_matrix(L) + L = graph.rescale_L(L, lmax=2) + # Transform to Chebyshev basis + x = tf.transpose(x, perm=[1, 2, 0]) # M x Fin x N + x = tf.reshape(x, [M, Fin*N]) # M x Fin*N + def chebyshev(x): + return graph.chebyshev(L, x, self.K) + x = tf.py_func(chebyshev, [x], [tf.float32])[0] # K x M x Fin*N + x = tf.reshape(x, [self.K, M, Fin, N]) # K x M x Fin x N + x = tf.transpose(x, perm=[3, 1, 2, 0]) # N x M x Fin x K + x = tf.reshape(x, [N*M, Fin*self.K]) # N*M x Fin*K + # Filter: Fin*Fout filters of order K, i.e. one filterbank per feature. + W = self._weight_variable([Fin*K, self.Fout], regularization=False) + x = tf.matmul(x, W) # N*M x Fout + return tf.reshape(x, [N, M, self.Fout]) # N x M x Fout + + +def Chebyshev5(Chebyshev): + + def __call__(self, x, L): + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + # Rescale Laplacian and store as a TF sparse tensor. Copy to not modify the shared L. + L = scipy.sparse.csr_matrix(L) + L = graph.rescale_L(L, lmax=2) + L = L.tocoo() + indices = np.column_stack((L.row, L.col)) + L = tf.SparseTensor(indices, L.data, L.shape) + L = tf.sparse_reorder(L) + # Transform to Chebyshev basis + x0 = tf.transpose(x, perm=[1, 2, 0]) # M x Fin x N + x0 = tf.reshape(x0, [M, Fin*N]) # M x Fin*N + x = tf.expand_dims(x0, 0) # 1 x M x Fin*N + def concat(x, x_): + x_ = tf.expand_dims(x_, 0) # 1 x M x Fin*N + return tf.concat([x, x_], axis=0) # K x M x Fin*N + if self.K > 1: + x1 = tf.sparse_tensor_dense_matmul(L, x0) + x = concat(x, x1) + for k in range(2, self.K): + x2 = 2 * tf.sparse_tensor_dense_matmul(L, x1) - x0 # M x Fin*N + x = concat(x, x2) + x0, x1 = x1, x2 + x = tf.reshape(x, [self.K, M, Fin, N]) # K x M x Fin x N + x = tf.transpose(x, perm=[3, 1, 2, 0]) # N x M x Fin x K + x = tf.reshape(x, [N*M, Fin*self.K]) # N*M x Fin*K + # Filter: Fin*Fout filters of order K, i.e. one filterbank per feature pair. + W = self._weight_variable([Fin*self.K, self.Fout], regularization=False) + x = tf.matmul(x, W) # N*M x Fout + return tf.reshape(x, [N, M, self.Fout]) # N x M x Fout + + +class Bias(Layer): + pass + + +class Bias1Relu(Bias): + """Bias and ReLU. One bias per filter.""" + def __call__(self, x): + N, M, F = x.get_shape() + b = self._bias_variable([1, 1, int(F)], regularization=False) + return tf.nn.relu(x + b) + + +class Bias2Relu(Bias): + """Bias and ReLU. One bias per vertex per filter.""" + def __call__(self, x): + N, M, F = x.get_shape() + b = self._bias_variable([1, int(M), int(F)], regularization=False) + return tf.nn.relu(x + b) + + +class Pooling(Layer): + def __init__(self, p): + self.p = p + + +class MaxPooling(Pooling): + def __call__(self, x): + """Max pooling of size p. Should be a power of 2.""" + if self.p > 1: + x = tf.expand_dims(x, 3) # N x M x F x 1 + x = tf.nn.max_pool(x, ksize=[1,self.p,1,1], strides=[1,self.p,1,1], padding='SAME') + #tf.maximum + return tf.squeeze(x, [3]) # N x M/p x F + else: + return x + + +class AvgPooling(Pooling): + def __call__(self, x): + """Average pooling of size p. 
Should be a power of 2.""" + if self.p > 1: + x = tf.expand_dims(x, 3) # N x M x F x 1 + x = tf.nn.avg_pool(x, ksize=[1,self.p,1,1], strides=[1,self.p,1,1], padding='SAME') + return tf.squeeze(x, [3]) # N x M/p x F + else: + return x + + +class Dense(Layer): + + def __init__(self, Mout, relu=True): + self.Mout = Mout + self.relu = relu + + def __call__(self, x): + """Fully connected layer with Mout features.""" + N, Min = x.get_shape() + W = self._weight_variable([int(Min), self.Mout], regularization=True) + b = self._bias_variable([self.Mout], regularization=True) + x = tf.matmul(x, W) + b + return tf.nn.relu(x) if self.relu else x From 4955b8f32352be643b2bc4d418de71ff3468e146 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Thu, 18 May 2017 08:13:28 +0000 Subject: [PATCH 11/23] wikipedia: compute graph diameter and don't keep copies in memory --- experiments/4_wikipedia_traffic.ipynb | 67 ++++++++++++++++----------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb index b818bd2..fca6a33 100644 --- a/experiments/4_wikipedia_traffic.ipynb +++ b/experiments/4_wikipedia_traffic.ipynb @@ -86,7 +86,7 @@ "metadata": {}, "outputs": [], "source": [ - "g = gt.load_graph(os.path.join(WIKI_CLEAN, 'enwiki-20150403-graph.gt'))" + "graph = gt.load_graph(os.path.join(WIKI_CLEAN, 'enwiki-20150403-graph.gt'))" ] }, { @@ -95,8 +95,8 @@ "metadata": {}, "outputs": [], "source": [ - "g.is_directed()\n", - "#g.set_directed(False)" + "graph.is_directed()\n", + "#graph.set_directed(False)" ] }, { @@ -109,8 +109,8 @@ " print('{} vertices, {} edges'.format(\n", " graph.num_vertices(), graph.num_edges()))\n", "\n", - "print_graph(g)\n", - "g.list_properties()" + "print_graph(graph)\n", + "graph.list_properties()" ] }, { @@ -120,8 +120,8 @@ "outputs": [], "source": [ "idx = 42\n", - "page_title = g.vertex_properties['page_title'][idx]\n", - "page_id = g.vertex_properties['page_id'][idx]\n", + "page_title = graph.vertex_properties['page_title'][idx]\n", + "page_id = graph.vertex_properties['page_id'][idx]\n", "print('{}: {}'.format(page_id, page_title))" ] }, @@ -131,7 +131,7 @@ "metadata": {}, "outputs": [], "source": [ - "hist = gt.vertex_hist(g, 'total')\n", + "hist = gt.vertex_hist(graph, 'total')\n", "plt.loglog(hist[1][:-1], hist[0])\n", "plt.xlabel('#edges')\n", "plt.ylabel('#nodes');" @@ -187,7 +187,7 @@ "source": [ "def id2title(page_id):\n", " page_title = redirect.at[page_id, 'fix_page_title']\n", - " #page_title = g.vp['page_title'][page_id]\n", + " #page_title = graph.vp['page_title'][id]\n", " print('{}: https://en.wikipedia.org/?curid={}'.format(page_title, page_id))\n", " return page_title\n", "id2title(12)" @@ -263,7 +263,7 @@ "\n", "activations_tot = pd.Series(\n", " data=0,\n", - " index=g.vp['page_id'].get_array(),\n", + " index=graph.vp['page_id'].get_array(),\n", " dtype=np.int64\n", ")\n", "\n", @@ -435,8 +435,19 @@ "metadata": {}, "outputs": [], "source": [ - "activations = load_activations(os.path.join('..', 'data', 'wikipedia', 'activations_100.h5'))\n", - "graph = gt.load_graph(os.path.join('..', 'data', 'wikipedia', 'enwiki-20150403-graph.gt'))" + "#activations = load_activations(os.path.join('..', 'data', 'wikipedia', 'activations_100.h5'))\n", + "#graph = gt.load_graph(os.path.join('..', 'data', 'wikipedia', 'enwiki-20150403-graph.gt'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def 
compute_diameter(graph):\n", + " d = gt.pseudo_diameter(graph)[0]\n", + " print('Pseudo-diameter: {}'.format(int(d)))" ] }, { @@ -446,14 +457,16 @@ "outputs": [], "source": [ "print_graph(graph)\n", + "compute_diameter(graph)\n", "\n", "mask = np.in1d(graph.vp['page_id'].get_array(), activations.index)\n", - "g = gt.GraphView(graph, vfilt=mask)\n", - "print_graph(g)\n", + "graph = gt.GraphView(graph, vfilt=mask)\n", + "print_graph(graph)\n", "\n", - "l = gt.label_largest_component(g)\n", - "g = gt.GraphView(g, vfilt=l)\n", - "print_graph(g)" + "l = gt.label_largest_component(graph)\n", + "graph = gt.GraphView(graph, vfilt=l)\n", + "print_graph(graph)\n", + "compute_diameter(graph)" ] }, { @@ -462,13 +475,13 @@ "metadata": {}, "outputs": [], "source": [ - "g = gt.Graph(g, prune=True)\n", + "graph = gt.Graph(graph, prune=True)\n", "\n", - "sort = np.argsort(g.vp['page_id'].get_array())\n", + "sort = np.argsort(graph.vp['page_id'].get_array())\n", "sort = np.argsort(sort)\n", - "sort = g.new_vertex_property('int64_t', sort)\n", + "sort = graph.new_vertex_property('int64_t', sort)\n", "\n", - "g = gt.Graph(g, vorder=sort) # directed=False" + "graph = gt.Graph(graph, vorder=sort) # directed=False" ] }, { @@ -477,7 +490,7 @@ "metadata": {}, "outputs": [], "source": [ - "activations = activations.loc[g.vp['page_id'].get_array()]" + "activations = activations.loc[graph.vp['page_id'].get_array()]" ] }, { @@ -486,9 +499,9 @@ "metadata": {}, "outputs": [], "source": [ - "np.testing.assert_equal(g.vp['page_id'].get_array(), activations.index)\n", + "np.testing.assert_equal(graph.vp['page_id'].get_array(), activations.index)\n", "\n", - "g.save(os.path.join('..', 'data', 'wikipedia', 'graph.gt'))\n", + "graph.save(os.path.join('..', 'data', 'wikipedia', 'graph.gt'))\n", "activations.to_hdf(os.path.join('..', 'data', 'wikipedia', 'activations.h5'), 'activations')" ] }, @@ -499,7 +512,7 @@ "outputs": [], "source": [ "#gt.sfdp_layout()\n", - "#gt.graph_draw(g)" + "#gt.graph_draw(graph)" ] }, { @@ -508,7 +521,7 @@ "metadata": {}, "outputs": [], "source": [ - "A = gt.adjacency(g)\n", + "A = gt.adjacency(graph)\n", "ipd.display(A)" ] } @@ -516,4 +529,4 @@ "metadata": {}, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} From df328f7a2ad0114e14b86fe05a54fb8cfab98503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Thu, 18 May 2017 11:55:10 +0200 Subject: [PATCH 12/23] trials: clean JSON metadata --- trials/1_learning_filters.ipynb | 164 ++++++++------------------------ trials/2_classification.ipynb | 145 +++++++--------------------- trials/3_tensorflow.ipynb | 44 ++------- trials/4_coarsening.ipynb | 60 +++--------- trials/makefile | 9 +- 5 files changed, 99 insertions(+), 323 deletions(-) diff --git a/trials/1_learning_filters.ipynb b/trials/1_learning_filters.ipynb index 7f29d95..33ed47b 100644 --- a/trials/1_learning_filters.ipynb +++ b/trials/1_learning_filters.ipynb @@ -2,9 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "# Trial 1: learning graph filters\n", "\n", @@ -24,9 +22,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import time\n", @@ -39,9 +35,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "## Problem setting\n", "\n", @@ -54,9 +48,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + 
"metadata": {}, "outputs": [], "source": [ "M = 100 # nodes\n", @@ -142,9 +134,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def laplacian(W, normalized=True):\n", @@ -190,9 +180,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def fourier(L):\n", @@ -264,9 +252,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def gen_filter(type='step', t=2):\n", @@ -333,9 +319,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "N = 200 # signals\n", @@ -389,9 +373,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def L(c):\n", @@ -434,9 +416,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -490,9 +470,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sgd(c0, L, dL, learning_rate=.1, batch_size=100, crit=1e-3, maxit=100, window=10):\n", @@ -541,9 +519,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sgd_plot_convergence(c0, L, dL, params, crit, maxit):\n", @@ -619,9 +595,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def plot_filters(coeffs):\n", @@ -661,9 +635,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "K = 5\n", @@ -722,9 +694,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "K = 10\n", @@ -788,9 +758,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def polynomial_order(K):\n", @@ -851,9 +819,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "K = 15\n", @@ -880,9 +846,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def filter_chebyshev(X, c):\n", @@ -924,9 +888,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -953,9 +915,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "c0 = np.random.uniform(0, 1, K)\n", @@ -975,9 +935,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def plot_coefficients(coeffs):\n", @@ -1037,9 +995,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def rescale_L(L):\n", @@ -1088,9 +1044,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, 
+ "metadata": {}, "outputs": [], "source": [ "def eval_clenshaw(x, c):\n", @@ -1136,9 +1090,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test(c):\n", @@ -1187,9 +1139,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def vectorize(Xt, Y):\n", @@ -1237,9 +1187,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def cheby_coeff_direct(X, Y, K, svd=False):\n", @@ -1269,9 +1217,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = []\n", @@ -1298,9 +1244,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "plot_coefficients(['c_crs', 'c_crd', 'c_cro', 'c_cs', 'c_co', 'c_cg'])\n", @@ -1319,9 +1263,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lanczos(L, X, K):\n", @@ -1468,9 +1410,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lanczos_basis_eval_f(L, X, K):\n", @@ -1554,9 +1494,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lanczos_basis_eval(L, X, K, ret_q=False, impl=2):\n", @@ -1645,9 +1583,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "c0 = np.random.uniform(0, 1, K)\n", @@ -1666,9 +1602,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "plot_coefficients(['c_ls', 'c_ld', 'c_lo', 'c_lf'])" @@ -1686,9 +1620,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def c_l(n):\n", @@ -1722,9 +1654,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def polynomial_order(K, step=1):\n", @@ -1799,25 +1729,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, + "metadata": {}, "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/trials/2_classification.ipynb b/trials/2_classification.ipynb index d06d46d..03b6105 100644 --- a/trials/2_classification.ipynb +++ b/trials/2_classification.ipynb @@ -12,9 +12,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import time\n", @@ -50,9 +48,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def mnist(a, b, N):\n", @@ -109,9 +105,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": 
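The `rescale_L` and `eval_clenshaw` cells above evaluate Chebyshev filters through a three-term recurrence. As a reminder of the idea, here is a compact standalone sketch for a single dense signal; it is an illustration under simplified assumptions (dense `L`, known largest eigenvalue `lmax`), not the notebook's exact implementation:

```python
import numpy as np

def cheby_filter(L, x, coeffs, lmax):
    """Evaluate sum_k c_k T_k(L_rescaled) x with the three-term recurrence.

    Simplified restatement for illustration; the notebook versions also handle
    sparse Laplacians, batches of signals, and fitting of the coefficients.
    """
    n = L.shape[0]
    L_res = 2 * L / lmax - np.identity(n)  # map spectrum from [0, lmax] to [-1, 1]
    t_prev, t_curr = x, L_res @ x          # T_0 x and T_1 x
    y = coeffs[0] * t_prev + coeffs[1] * t_curr
    for c in coeffs[2:]:
        # T_k x = 2 L_res T_{k-1} x - T_{k-2} x
        t_prev, t_curr = t_curr, 2 * L_res @ t_curr - t_prev
        y += c * t_curr
    return y

# Toy check on a 3-node path graph whose Laplacian has eigenvalues 0, 1, 3.
L = np.array([[1., -1., 0.], [-1., 2., -1.], [0., -1., 1.]])
x = np.array([1., 0., -1.])
print(cheby_filter(L, x, coeffs=np.array([0.5, 1.0, 0.25]), lmax=3.0))
```

The rescaling maps the Laplacian spectrum into [-1, 1], where the Chebyshev polynomials are well behaved; the notebooks learn the coefficients instead of fixing them by hand.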
false - }, + "metadata": {}, "outputs": [], "source": [ "def test_sklearn(tauR):\n", @@ -145,9 +139,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test_optim(clf, X, y, ax=None):\n", @@ -177,9 +169,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class rls:\n", @@ -222,9 +212,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -246,9 +234,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lanczos(L, X, K):\n", @@ -301,9 +287,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test():\n", @@ -350,9 +334,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class gflc_noweights:\n", @@ -417,10 +399,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false, - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "class gflc_weights():\n", @@ -516,9 +495,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class gflc_split():\n", @@ -637,9 +614,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "lamb, U = graph.fourier(L)\n", @@ -649,9 +624,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def plot_filters(C, spectrum=False):\n", @@ -700,9 +673,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def plot_features(C, x):\n", @@ -741,9 +712,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def scorer(clf, X, y):\n", @@ -758,9 +727,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def perf(clf, nfolds=3):\n", @@ -813,9 +780,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def cross_validation(clf, nfolds, nvalidations):\n", @@ -839,9 +804,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test_classification(clf, params, param, values, nfolds=10, nvalidations=1):\n", @@ -867,9 +830,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "test_classification(rls, {}, 'tauR', [1e8,1e7,1e6,1e5,1e4,1e3,1e-5,1e-8], 10, 10)" @@ -878,9 +839,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':1, 'K':2, 'tauR':1e3, 'niter':5, 'algo':'direct'}\n", @@ -890,9 +849,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], 
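The `perf`, `cross_validation` and `test_classification` helpers above repeat a k-fold split several times and report the mean and standard deviation of the accuracy. A rough scikit-learn equivalent is sketched below; the `RidgeClassifier` merely stands in for the notebook's custom `rls` and `gflc_*` classifiers and is an assumption, not the actual code:

```python
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import RidgeClassifier

def cross_validate_sketch(X, y, nfolds=10, nvalidations=3, seed=0):
    """Repeated stratified k-fold accuracy, in the spirit of
    cross_validation(clf, nfolds, nvalidations)."""
    accuracies = []
    for rep in range(nvalidations):
        skf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed + rep)
        for train_idx, test_idx in skf.split(X, y):
            clf = RidgeClassifier(alpha=1e3)  # stand-in for rls(tauR=1e3)
            clf.fit(X[train_idx], y[train_idx])
            accuracies.append(clf.score(X[test_idx], y[test_idx]))
    return np.mean(accuracies), np.std(accuracies)

# Toy data with the same shape conventions (N samples x M features).
X = np.random.RandomState(0).normal(size=(200, 30))
y = (X[:, 0] > 0).astype(int)
print('{:.2f} +- {:.2f}'.format(*cross_validate_sketch(X, y)))
```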
"source": [ "params = {'F':2, 'K':10, 'tauR':1e4, 'niter':5, 'algo':'direct'}\n", @@ -902,9 +859,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':4, 'tauR':1e4, 'niter':5, 'algo':'direct'}\n", @@ -921,9 +876,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "Xfull = X" @@ -932,9 +885,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sample(X, p, seed=None):\n", @@ -984,9 +935,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "#clf_weights = gflc_weights(F=3, K=4, tauR=1e-3, niter=5, algo='direct')\n", @@ -997,9 +946,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "#test_classification(rls, {}, 'tauR', [1e1,1e0])\n", @@ -1010,9 +957,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "test_classification(rls, {}, 'tauR', [1e8,1e7,1e6,1e5,1e4,1e3,1e-5,1e-8], 10, 10)" @@ -1021,9 +966,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':2, 'tauR':1e3, 'niter':5, 'algo':'direct'}\n", @@ -1033,9 +976,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':10, 'tauR':1e5, 'niter':5, 'algo':'direct'}\n", @@ -1045,9 +986,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':4, 'tauR':1e5, 'niter':5, 'algo':'direct'}\n", @@ -1055,25 +994,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, + "metadata": {}, "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/trials/3_tensorflow.ipynb b/trials/3_tensorflow.ipynb index 361ece6..21de92a 100644 --- a/trials/3_tensorflow.ipynb +++ b/trials/3_tensorflow.ipynb @@ -12,9 +12,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf" @@ -30,9 +28,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from tensorflow.examples.tutorials.mnist import input_data\n", @@ -51,9 +47,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "x = tf.placeholder(tf.float32, [None, 784])\n", @@ -72,9 +66,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "y_ = tf.placeholder(tf.float32, [None, 10])\n", @@ -100,9 +92,7 @@ { "cell_type": "code", "execution_count": 
null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))\n", @@ -111,25 +101,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, + "metadata": {}, "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/trials/4_coarsening.ipynb b/trials/4_coarsening.ipynb index 414b138..46924b5 100644 --- a/trials/4_coarsening.ipynb +++ b/trials/4_coarsening.ipynb @@ -2,9 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "# Trial 4: graph coarsening\n", "\n", @@ -27,9 +25,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import os\n", @@ -41,9 +37,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if False:\n", @@ -75,9 +69,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# INPUT\n", @@ -170,9 +162,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "#http://nbviewer.ipython.org/gist/Midnighter/9992103\n", @@ -202,9 +192,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Coarsen a graph given by rr,cc,vv. 
rr is assumed to be ordered\n", @@ -258,9 +246,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "maxsize = 200\n", @@ -299,9 +285,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import sys\n", @@ -387,9 +371,7 @@ }, { "cell_type": "raw", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "# Matlab results\n", "\n", @@ -458,25 +440,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, + "metadata": {}, "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/trials/makefile b/trials/makefile index 3a29a42..22a1c95 100644 --- a/trials/makefile +++ b/trials/makefile @@ -6,6 +6,13 @@ $(NB): jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout=-1 $@ clean: - jupyter nbconvert --inplace --ClearOutputPreprocessor.enabled=True $(NB) + @for nb in $(NB); do \ + printf "%s" "$$(jq --indent 1 ' \ + .metadata = {} \ + | (.cells[] | select(has("outputs")) | .outputs) = [] \ + | (.cells[] | select(has("execution_count")) | .execution_count) = null \ + | .cells[].metadata = {} \ + ' $$nb)" > $$nb; \ + done .PHONY: run $(NB) clean From b53363a67a36fce58d555e8d272588467663b392 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Thu, 18 May 2017 11:55:46 +0200 Subject: [PATCH 13/23] trials: play with graph-tool --- trials/5_graph_tool.ipynb | 118 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 trials/5_graph_tool.ipynb diff --git a/trials/5_graph_tool.ipynb b/trials/5_graph_tool.ipynb new file mode 100644 index 0000000..fad0765 --- /dev/null +++ b/trials/5_graph_tool.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trial 5: graph-tool\n", + "\n", + "Learn and experiment with [graph-tool](https://graph-tool.skewed.de).\n", + "\n", + "Alternatives for graph analysis:\n", + "* [NetworkX](http://networkx.github.io)\n", + "* [NetworKit](https://networkit.iti.kit.edu)\n", + "* [igraph](http://igraph.org)\n", + "* [GraphLab](https://turi.com)\n", + "* [GraphX](https://spark.apache.org/graphx)\n", + "* [Giraph](https://giraph.apache.org)\n", + "\n", + "Alternatives for graph visualization:\n", + "* [Gephi](https://gephi.org)\n", + "* [Graphviz](http://www.graphviz.org)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "import graph_tool.all as gt\n", + "#import networkx as nx\n", + "#import networkit as nk\n", + "\n", + "gt.openmp_enabled(), gt.openmp_get_num_threads()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Graph filters and plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g, pos = gt.triangulation(np.random.random_sample((500, 2)) * 4, type='delaunay')\n", + "\n", + "tree = 
gt.min_spanning_tree(g2)\n", + "tv = gt.GraphView(g, efilt=tree)\n", + "\n", + "bv, be = gt.betweenness(tv)\n", + "be.a /= be.a.max() / 5\n", + "gt.graph_draw(tv, pos, vertex_fill_color=bv, edge_pen_width=be);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gt.adjacency(g)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#g = nk.readGraph('graph.gt', nk.Format.GraphToolBinary)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Graph models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = gt.collection.data['football']\n", + "state = gt.minimize_blockmodel_dl(g, deg_corr=False)\n", + "state.draw(pos=g.vp.pos)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = gt.collection.data['celegansneural']\n", + "state = gt.minimize_nested_blockmodel_dl(g, deg_corr=True)\n", + "state.draw()\n", + "state.print_summary()" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file From ed98c9e27506135135e2c1651c5539a2cb1d5257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Tue, 20 Jun 2017 14:55:03 +0000 Subject: [PATCH 14/23] wikipedia: more and better visualizations --- experiments/4_wikipedia_traffic.ipynb | 175 +++++++++++++++++++------- 1 file changed, 130 insertions(+), 45 deletions(-) diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb index fca6a33..0f674ec 100644 --- a/experiments/4_wikipedia_traffic.ipynb +++ b/experiments/4_wikipedia_traffic.ipynb @@ -58,8 +58,10 @@ "%load_ext dotenv\n", "%dotenv .env\n", "\n", - "WIKI_RAW = os.environ.get('WIKI_RAW') # Downloaded from dumps.wikimedia.org.\n", - "WIKI_CLEAN = os.environ.get('WIKI_CLEAN') # Processed by Kirell Benzi." 
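The graph-tool trial above ends by extracting the adjacency matrix with `gt.adjacency`, which is how a graph built or loaded with graph-tool gets handed to the SciPy/NumPy spectral code. A minimal sketch on a toy graph (the 4-node cycle is made up for illustration):

```python
import numpy as np
import graph_tool.all as gt

# Hypothetical 4-node cycle, just to show the graph-tool -> SciPy hand-off.
g = gt.Graph(directed=False)
g.add_edge_list([(0, 1), (1, 2), (2, 3), (3, 0)])
print('{} vertices, {} edges'.format(g.num_vertices(), g.num_edges()))

A = gt.adjacency(g)                            # SciPy sparse matrix
degrees = np.asarray(A.sum(axis=1)).squeeze()  # every node of the cycle has degree 2
print(A.shape, degrees)
```

Because `gt.adjacency` returns a SciPy sparse matrix, the degree and Laplacian computations used elsewhere in the repository apply unchanged.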
+ "#WIKI_RAW = os.environ.get('WIKI_RAW') # Downloaded from dumps.wikimedia.org.\n", + "#WIKI_CLEAN = os.environ.get('WIKI_CLEAN') # Processed by Kirell Benzi.\n", + "\n", + "DATA_DIR = os.path.join('..', 'data', 'wikipedia')" ] }, { @@ -86,7 +88,7 @@ "metadata": {}, "outputs": [], "source": [ - "graph = gt.load_graph(os.path.join(WIKI_CLEAN, 'enwiki-20150403-graph.gt'))" + "graph = gt.load_graph(os.path.join(DATA_DIR, 'enwiki-20150403-graph.gt'))" ] }, { @@ -131,10 +133,13 @@ "metadata": {}, "outputs": [], "source": [ - "hist = gt.vertex_hist(graph, 'total')\n", - "plt.loglog(hist[1][:-1], hist[0])\n", - "plt.xlabel('#edges')\n", - "plt.ylabel('#nodes');" + "def plot_degree_distribution(graph):\n", + " hist = gt.vertex_hist(graph, 'total')\n", + " plt.loglog(hist[1][:-1], hist[0])\n", + " plt.xlabel('#edges')\n", + " plt.ylabel('#nodes')\n", + " #plt.savefig('degree_distribution.pdf')\n", + "plot_degree_distribution(graph)" ] }, { @@ -152,7 +157,7 @@ "metadata": {}, "outputs": [], "source": [ - "filepath = os.path.join(WIKI_CLEAN, 'enwiki-20150403-page-redirect.csv.gz')\n", + "filepath = os.path.join(DATA_DIR, 'enwiki-20150403-page-redirect.csv.gz')\n", "redirect = pd.read_csv(filepath, compression='gzip', sep='|', encoding='utf-8', quoting=3, index_col=1)\n", "\n", "redirect.head()" @@ -231,7 +236,7 @@ "outputs": [], "source": [ "# Kirell's signal which includes views when greater than 500.\n", - "filepath = os.path.join(WIKI_CLEAN, 'signal_500.h5')\n", + "filepath = os.path.join(DATA_DIR, 'signal_500.h5')\n", "signal = pd.read_hdf(filepath, 'data')\n", "signal['count_views'].plot(kind='hist', logy=True)\n", "print(len(signal), len(signal['page_id'].unique()), len(signal['layer'].unique()), signal['count_views'].max())\n", @@ -296,7 +301,9 @@ "outputs": [], "source": [ "# Power law.\n", - "activations_tot.drop(main_page).plot(kind='hist', logy=True, bins=100);" + "activations_tot.drop(main_page).plot(kind='hist', logy=True, bins=100);\n", + "plt.figure()\n", + "activations_tot.drop(main_page)[activations_tot < 1e7].plot(kind='hist', logy=True, bins=100);" ] }, { @@ -352,16 +359,18 @@ "metadata": {}, "outputs": [], "source": [ - "def load_activations(filepath):\n", + "DROP = [\n", + " 15580374, # Main page draws ~10% traffic.\n", + " 42727860, # Undefined has the largest peaks of traffic while being inactive after 2014-10.\n", + "# 8063851, # Feynman point has a very large traffic peak which is probably an error.\n", + "# 2697304, # Gold_as_an_investment has many traffic peaks.\n", + "]\n", + "\n", + "def load_activations(filepath, drop=DROP):\n", " activations = pd.read_hdf(filepath, 'activations')\n", "\n", - " DROP = [\n", - " 15580374, # Main page draws ~10% traffic.\n", - " 42727860, # Undefined has the largest peaks of traffic while being inactive after 2014-10.\n", - " # 8063851, # Feynman point has a very large traffic peak which is probably an error.\n", - " # 2697304, # Gold_as_an_investment has many traffic peaks.\n", - " ]\n", - " activations.drop(DROP, inplace=True)\n", + " if drop:\n", + " activations.drop(drop, inplace=True)\n", " \n", " print('activations: {} page ids x {} hours = {}'.format(*activations.shape, activations.size))\n", " return activations\n", @@ -395,27 +404,39 @@ "metadata": {}, "outputs": [], "source": [ + "def plot_activation(page_id):\n", + " page_title = id2title(page_id)\n", + " ax = activations.loc[page_id].plot(label='{} ({})'.format(page_title, page_id), logy=True)\n", + " ax.set_ylabel('#views per hour');\n", + " ax.legend()\n", + " 
#plt.savefig('{}_{}.png'.format(page_id, page_title.lower()), dpi=300)\n", + " #plt.savefig('{}_{}.pdf'.format(page_id, page_title.lower()))\n", + "\n", "# Events.\n", - "page_id = 40817806 # Ebola\n", - "page_id = 44635 # Grammy\n", - "page_id = 150340 # Miss Universe\n", - "page_id = 27718 # Super Bowl\n", - "#page_id = 324 # Academy Awards\n", - "#page_id = 44969225 # Charlie Hebdo shooting\n", - "#page_id = 2251390 # Charlie Hebdo\n", + "plot_activation(2251390) # Charlie Hebdo\n", + "plot_activation(44969225) # Charlie Hebdo shooting\n", + "plt.figure()\n", + "plot_activation(27718) # Super Bowl\n", + "plt.figure()\n", + "#plot_activation(40817806) # Ebola\n", + "plot_activation(44635) # Grammy\n", + "plot_activation(150340) # Miss Universe\n", + "#plot_activation(324) # Academy Awards\n", + "\n", + "# Neighbors of Charlie Hebdo.\n", + "#plot_activation(44969610) # Charb\n", + "#plot_activation(206682) # Caricature\n", + "#plot_activation(15012) # Islamism\n", + "#plot_activation(7826589) # Jihadism\n", + "#plot_activation(50100) # Journalist\n", "\n", "# Remarkable things.\n", - "#page_id = 25\n", - "#page_id = 15580374 # Main Page --> largest traffic (~10%)\n", - "#page_id = 42727860 # Undefined --> hits only before mid-oct 2014\n", - "#page_id = 670 # Alphabet --> strange drop\n", - "#page_id = 8063851 # Shall distinguish outliers (counting errors?) from real events\n", - "#page_id = 2697304 # Lots of peaks --> correlated with fluctuations on market?\n", - "\n", - "page_title = id2title(page_id)\n", - "activations.loc[page_id].plot(title='{} ({})'.format(page_title, page_id), logy=True)\n", - "plt.ylabel('#views per hour');\n", - "#plt.savefig('{}_{}.png'.format(page_id, page_title.lower()), dpi=300)" + "#plot_activation(25)\n", + "#plot_activation(15580374) # Main Page --> largest traffic (~10%)\n", + "#plot_activation(42727860) # Undefined --> hits only before mid-oct 2014\n", + "#plot_activation(670) # Alphabet --> strange drop\n", + "#plot_activation(8063851) # Shall distinguish outliers (counting errors?) from real events\n", + "#plot_activation(2697304) # Lots of peaks --> correlated with fluctuations on market?" 
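The pages plotted above (Charlie Hebdo, the Super Bowl, the Grammys) show up as short spikes over a fairly flat baseline. Below is a hedged sketch of how such event-driven pages could be surfaced from an `activations`-like table; the toy DataFrame only mimics the real one's layout (page ids as rows, hourly timestamps as columns) with synthetic counts:

```python
import numpy as np
import pandas as pd

# Toy stand-in for the real `activations` table.
hours = pd.date_range('2014-09-01', periods=24 * 7, freq='H')
rng = np.random.RandomState(0)
activations = pd.DataFrame(
    rng.poisson(20, size=(3, len(hours))),
    index=[2251390, 27718, 44635], columns=hours)
activations.iloc[0, 100] += 5000  # inject one artificial traffic spike

# Rank pages by how far their busiest hour deviates from their median hour,
# a crude way to surface event-driven pages like the ones plotted above.
spikiness = activations.max(axis=1) / activations.median(axis=1)
print(spikiness.sort_values(ascending=False).head())
```

Ranking by the max-to-median ratio is only one crude heuristic; the notebook drops pathological pages (the `DROP` list) before any such analysis.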
] }, { @@ -435,8 +456,8 @@ "metadata": {}, "outputs": [], "source": [ - "#activations = load_activations(os.path.join('..', 'data', 'wikipedia', 'activations_100.h5'))\n", - "#graph = gt.load_graph(os.path.join('..', 'data', 'wikipedia', 'enwiki-20150403-graph.gt'))" + "activations = load_activations(os.path.join(DAT_DIR, 'activations_100.h5'))\n", + "graph = gt.load_graph(os.path.join(DATA_DIR, 'enwiki-20150403-graph.gt'))" ] }, { @@ -477,11 +498,14 @@ "source": [ "graph = gt.Graph(graph, prune=True)\n", "\n", - "sort = np.argsort(graph.vp['page_id'].get_array())\n", - "sort = np.argsort(sort)\n", - "sort = graph.new_vertex_property('int64_t', sort)\n", + "def sort_vertices(graph, vp):\n", + " sort = np.argsort(vp.get_array())\n", + " sort = np.argsort(sort)\n", + " sort = graph.new_vertex_property('int64_t', sort)\n", + " return gt.Graph(graph, vorder=sort)\n", "\n", - "graph = gt.Graph(graph, vorder=sort) # directed=False" + "graph = sort_vertices(graph, graph.vp['page_id'])\n", + "# directed=False" ] }, { @@ -502,6 +526,7 @@ "np.testing.assert_equal(graph.vp['page_id'].get_array(), activations.index)\n", "\n", "graph.save(os.path.join('..', 'data', 'wikipedia', 'graph.gt'))\n", + "graph.save(os.path.join('..', 'data', 'wikipedia', 'graph.graphml'))\n", "activations.to_hdf(os.path.join('..', 'data', 'wikipedia', 'activations.h5'), 'activations')" ] }, @@ -515,18 +540,78 @@ "#gt.graph_draw(graph)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5 Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "graph = gt.load_graph(os.path.join(DATA_DIR, 'graph.gt'))\n", + "activations = load_activations(os.path.join(DATA_DIR, 'activations.h5'), drop=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_degree_distribution(graph)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_adjacency(graph, ax=None):\n", + " A = gt.adjacency(graph)\n", + " if not ax:\n", + " fig, ax = plt.subplots(figsize=(10, 10))\n", + " ax.spy(A[:10000,:10000], markersize=0.2)\n", + " ax.set_title('{} nodes, {} edges ({:.2%})'.format(\n", + " A.shape[0], A.nnz, A.nnz / np.multiply(*A.shape)))\n", + "\n", + "plot_adjacency(graph)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def order_adjacency_plot(graph, ax=None, **kwargs):\n", + " state = gt.minimize_blockmodel_dl(graph, **kwargs)\n", + " graph = sort_vertices(graph, state.get_blocks())\n", + " plot_adjacency(graph, ax)\n", + "\n", + "fig, axes = plt.subplots(1, 3)\n", + "for ax, n_blocks in zip(axes, [10, 20, 30]):\n", + " order_adjacency_plot(graph, ax=ax, B_max=n_blocks)" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "A = gt.adjacency(graph)\n", - "ipd.display(A)" + "plt.hist(activations.values.reshape(-1), bins=100, log=True);\n", + "plt.figure()\n", + "plt.hist(activations.sum(axis=1).values.reshape(-1), bins=100, log=True);" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file From 68fcb5d4fc8ba1bc696d0d931a09c66eb0d657b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Mon, 26 Jun 2017 15:01:33 +0200 Subject: [PATCH 15/23] structured sequence trial: TF input pipeline --- trials/6_structured_sequence.ipynb | 246 
+++++++++++++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 trials/6_structured_sequence.ipynb diff --git a/trials/6_structured_sequence.ipynb b/trials/6_structured_sequence.ipynb new file mode 100644 index 0000000..3f94bb0 --- /dev/null +++ b/trials/6_structured_sequence.ipynb @@ -0,0 +1,246 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trial 6: structured sequence modeling\n", + "\n", + "* Create simple parametric time series and try to model them.\n", + "* Add structure by constructing a graph between the series and see how it improves.\n", + "* Usage of `tflearn` inspired by [How to do time series prediction using RNNs, TensorFlow and Cloud ML Engine](https://medium.com/google-cloud/how-to-do-time-series-prediction-using-rnns-and-tensorflow-and-cloud-ml-engine-2ad2eeb189e8)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import shutil\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "import tensorflow as tf\n", + "import tensorflow.contrib.learn as tflearn\n", + "\n", + "plt.rcParams['figure.figsize'] = (17, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = os.path.join('..', 'data', 'structured_sequence_trial')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1 Data generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SEQ_LEN = 100\n", + "N_SEQ = 4\n", + "\n", + "def create_time_series(seq_len, random_state):\n", + " freq = random_state.uniform(0.1, 0.6)\n", + " ampl = random_state.uniform(0.5, 1.5)\n", + " offset = random_state.uniform(-1, 1)\n", + " return np.sin(np.arange(seq_len) * freq) * ampl + offset\n", + "\n", + "rs = np.random.RandomState(42)\n", + "data = np.empty((N_SEQ, SEQ_LEN))\n", + "for i in range(N_SEQ):\n", + " data[i] = create_time_series(SEQ_LEN, rs)\n", + "data = pd.DataFrame(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.T.plot();\n", + "plt.savefig('time_series.pdf')\n", + "# hist" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2 Graph construction\n", + "\n", + "k-NN graph between the time series." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3 Data preparation\n", + "\n", + "* Store data in TFRecords files which will be read by the input pipeline.\n", + "* Preprocessing can be done here.\n", + "* Data augmentation should be done in input pipeline (to save disk space).\n", + "* We are doing full batch, i.e. we feed data on the whole graph at once." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "N_INPUTS = 10 # Number of samples used for prediction, i.e. 
unrolling length.\n", + "N_OUTPUTS = 1 # Number of samples in the time series the model tries to predict.\n", + "\n", + "def feature(array):\n", + " array = array.reshape(-1)\n", + " return tf.train.Feature(float_list=tf.train.FloatList(value=list(array)))\n", + "\n", + "def save_dataset(data, filename):\n", + " \"\"\"Save dataset as TFRecords.\"\"\"\n", + " filename = os.path.join(DATA_DIR, filename)\n", + " num_examples = data.shape[1] - N_INPUTS - N_OUTPUTS + 1\n", + " assert num_examples > 0\n", + " tf.logging.info('Writing {} examples to {}'.format(num_examples, filename))\n", + " with tf.python_io.TFRecordWriter(filename) as writer:\n", + " for idx in range(num_examples):\n", + " inputs = data[:, idx:idx+N_INPUTS]\n", + " targets = data[:, idx+N_INPUTS:idx+N_INPUTS+N_OUTPUTS]\n", + " example = tf.train.Example(features=tf.train.Features(feature={\n", + " #'graph': feature(graph), # Adjacency matrix or Laplacian can be stored here.\n", + " 'inputs': feature(inputs),\n", + " 'targets': feature(targets)}))\n", + " writer.write(example.SerializeToString())\n", + "\n", + "TRAINING_LEN = int(0.8 * SEQ_LEN)\n", + "save_dataset(data.iloc[:, :TRAINING_LEN].values, 'train.tfrecords')\n", + "save_dataset(data.iloc[:, TRAINING_LEN:].values, 'validation.tfrecords')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4 Data loading\n", + "\n", + "Two training schemes:\n", + "* Load whole data for training up to a certain point in time. That is what is done for text (the whole vocabulary graph is used).\n", + "* Use some time series (some part of the graph) as training and the others as evaluation.\n", + "\n", + "TF alternative:\n", + "* [tf.contrib.slim.dataset](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class DataLoader:\n", + "\n", + " def __init__(s, filenames, num_epochs=1, batch_size=1, read_threads=1, seed=None):\n", + " #if mode == tflearn.ModeKeys.TRAIN:\n", + " s.filenames = filenames\n", + " s.batch_size = batch_size\n", + " s.num_epochs = num_epochs\n", + " s.read_threads = read_threads\n", + " s.seed = seed\n", + "\n", + " def _read_and_decode(s, filename_queue):\n", + " reader = tf.TFRecordReader()\n", + " _, example = reader.read(filename_queue)\n", + " features={\n", + " 'inputs': tf.FixedLenFeature([N_SEQ * N_INPUTS], tf.float32),\n", + " 'targets': tf.FixedLenFeature([N_SEQ * N_OUTPUTS], tf.float32),\n", + " }\n", + " example = tf.parse_single_example(example, features)\n", + " inputs = tf.reshape(example['inputs'], [N_SEQ, N_INPUTS])\n", + " targets = tf.reshape(example['targets'], [N_SEQ, N_OUTPUTS])\n", + " return inputs, targets\n", + "\n", + " def __call__(s):\n", + " with tf.name_scope('input_queues'):\n", + " #with tf.device(\"/cpu:0\"): # Input queues are on CPU.\n", + " filenames = [os.path.join(DATA_DIR, filename) for filename in s.filenames]\n", + " filename_queue = tf.train.string_input_producer(filenames, s.num_epochs, shuffle=True)\n", + "\n", + " examples = [s._read_and_decode(filename_queue) for _ in range(s.read_threads)]\n", + "\n", + " # Shuffle examples.\n", + " if True:\n", + " min_after_dequeue = 10 #10000\n", + " capacity = min_after_dequeue + (s.read_threads + 2) * s.batch_size\n", + " input_batch, target_batch = tf.train.shuffle_batch_join(\n", + " examples, batch_size=s.batch_size, seed=s.seed, capacity=capacity,\n", + " min_after_dequeue=min_after_dequeue, 
allow_smaller_final_batch=True)\n", + " else:\n", + " assert s.read_threads == 1\n", + " input_batch, target_batch = examples[0]\n", + " return {'inputs': input_batch}, target_batch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make one pass over the dataset to make sure the input pipeline works." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inputs = DataLoader(['train.tfrecords'])()[0]['inputs']\n", + "\n", + "sess = tf.Session()\n", + "#sess.run(tf.global_variables_initializer())\n", + "sess.run(tf.local_variables_initializer())\n", + "\n", + "coord = tf.train.Coordinator()\n", + "threads = tf.train.start_queue_runners(sess, coord)\n", + "\n", + "idx = 0\n", + "training_data = np.empty((N_SEQ, TRAINING_LEN-N_OUTPUTS))\n", + "try:\n", + " while not coord.should_stop():\n", + " training_data[:, idx:idx+N_INPUTS] = sess.run(inputs)\n", + " idx += 1\n", + "\n", + "except tf.errors.OutOfRangeError:\n", + " print('Done: {} steps'.format(idx))\n", + "finally:\n", + " coord.request_stop()\n", + "\n", + "coord.join(threads)\n", + "sess.close()\n", + "\n", + "#np.testing.assert_allclose(training_data, data.iloc[:, :TRAINING_LEN-N_OUTPUTS])" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file From 987cf49d10a4221a36528342a7906bbfe5a85e00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Wed, 28 Jun 2017 15:10:14 +0200 Subject: [PATCH 16/23] structured sequence: model & experiment --- trials/6_structured_sequence.ipynb | 104 ++++++++++++++++++++++++++--- 1 file changed, 96 insertions(+), 8 deletions(-) diff --git a/trials/6_structured_sequence.ipynb b/trials/6_structured_sequence.ipynb index 3f94bb0..004666c 100644 --- a/trials/6_structured_sequence.ipynb +++ b/trials/6_structured_sequence.ipynb @@ -158,10 +158,9 @@ "source": [ "class DataLoader:\n", "\n", - " def __init__(s, filenames, num_epochs=1, batch_size=1, read_threads=1, seed=None):\n", + " def __init__(s, filenames, num_epochs=1, read_threads=1, seed=None):\n", " #if mode == tflearn.ModeKeys.TRAIN:\n", " s.filenames = filenames\n", - " s.batch_size = batch_size\n", " s.num_epochs = num_epochs\n", " s.read_threads = read_threads\n", " s.seed = seed\n", @@ -179,7 +178,7 @@ " return inputs, targets\n", "\n", " def __call__(s):\n", - " with tf.name_scope('input_queues'):\n", + " with tf.name_scope('input_pipeline'):\n", " #with tf.device(\"/cpu:0\"): # Input queues are on CPU.\n", " filenames = [os.path.join(DATA_DIR, filename) for filename in s.filenames]\n", " filename_queue = tf.train.string_input_producer(filenames, s.num_epochs, shuffle=True)\n", @@ -189,14 +188,19 @@ " # Shuffle examples.\n", " if True:\n", " min_after_dequeue = 10 #10000\n", - " capacity = min_after_dequeue + (s.read_threads + 2) * s.batch_size\n", - " input_batch, target_batch = tf.train.shuffle_batch_join(\n", - " examples, batch_size=s.batch_size, seed=s.seed, capacity=capacity,\n", + " capacity = min_after_dequeue + (s.read_threads + 2) # * s.batch_size\n", + " inputs, targets = tf.train.shuffle_batch_join(\n", + " examples, batch_size=1, seed=s.seed, capacity=capacity,\n", " min_after_dequeue=min_after_dequeue, allow_smaller_final_batch=True)\n", + " # We read full batch.\n", + " inputs = inputs[0, ...]\n", + " targets = targets[0, ...]\n", " else:\n", " assert s.read_threads == 1\n", - " input_batch, target_batch = examples[0]\n", - " return {'inputs': input_batch}, target_batch" + " inputs, 
targets = examples[0]\n", + "\n", + " # Can return a fixed graph or a per-sample graph in the features.\n", + " return {'inputs': inputs}, targets" ] }, { @@ -238,6 +242,90 @@ "\n", "#np.testing.assert_allclose(training_data, data.iloc[:, :TRAINING_LEN-N_OUTPUTS])" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5 Sequence modeling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Number of hidden units in each of the LSTM cells.\n", + "# Number of filters in case of GCN.\n", + "LSTM_SIZE = 3\n", + "\n", + "def model(features, targets, mode):\n", + " # Reformat input shape to become a sequence.\n", + " x = tf.split(features['inputs'], N_INPUTS, axis=1)\n", + " \n", + " # Recurrent neural network followed by linear transform.\n", + " lstm_cell = tf.contrib.rnn.BasicLSTMCell(LSTM_SIZE, forget_bias=1.0)\n", + " outputs, _ = tf.contrib.rnn.static_rnn(lstm_cell, x, dtype=tf.float32)\n", + " #outputs, _ = tf.contrib.rnn.dynamic_rnn(lstm_cell, x, dtype=tf.float32)\n", + " with tf.name_scope('output_layer'):\n", + " outputs = outputs[-1]\n", + " weight = tf.Variable(tf.random_normal([LSTM_SIZE, N_OUTPUTS]))\n", + " bias = tf.Variable(tf.random_normal([N_SEQ, N_OUTPUTS]))\n", + " predictions = tf.matmul(outputs, weight) + bias\n", + " \n", + " # Loss function and metric for training and evaluation.\n", + " loss = tf.losses.mean_squared_error(targets, predictions)\n", + " eval_metric_ops = {\n", + " 'rmse': tf.metrics.root_mean_squared_error(targets, predictions)\n", + " }\n", + " \n", + " # Training operations.\n", + " train_op = tf.contrib.layers.optimize_loss(\n", + " loss=loss,\n", + " global_step=tf.train.get_global_step(),\n", + " learning_rate=0.01,\n", + " #learning_rate_decay_fn=lambda lr, gs: tf.train.exponential_decay(lr, gs, 100e3, 0.96, staircase=True),\n", + " optimizer=lambda lr: tf.train.GradientDescentOptimizer(lr),\n", + " #optimizer=lambda lr: tf.train.MomentumOptimizer(lr, 0.9),\n", + " )\n", + " \n", + " return tflearn.ModelFnOps(\n", + " mode=mode,\n", + " predictions={'predictions': predictions},\n", + " loss=loss,\n", + " train_op=train_op,\n", + " eval_metric_ops=eval_metric_ops,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6 Experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "estimator = tflearn.Estimator(model_fn=model, model_dir='structured_sequence')\n", + "#estimator.fit(input_fn=DataLoader(filenames=['train.tfrecords']))\n", + "#estimator.evaluate(input_fn=DataLoader(filenames=['test.tfrecords']))\n", + "\n", + "experiment = tflearn.Experiment(\n", + " estimator,\n", + " eval_steps=None,\n", + " train_input_fn=DataLoader(['train.tfrecords'], num_epochs=10),\n", + " eval_input_fn=DataLoader(['validation.tfrecords']),\n", + ")\n", + "\n", + "shutil.rmtree('structured_sequence', ignore_errors=True) # Start fresh each time.\n", + "experiment.train_and_evaluate()" + ] } ], "metadata": {}, From 84c994b5fc034b73eeac877ae6d684189e5539e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Thu, 18 May 2017 12:14:33 +0200 Subject: [PATCH 17/23] to be finished and merged --- README.md | 59 +++++++++++ rcv1_dev.ipynb | 278 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 337 insertions(+) create mode 100644 rcv1_dev.ipynb diff --git a/README.md b/README.md index 61f4184..1a25c97 100644 --- a/README.md +++ b/README.md @@ -13,10 
+13,15 @@ cite the above paper if you use it.
 
 Additional material:
 
 * [NIPS2016 spotlight video][video], 2016-11-22.
+* [NIPS2016 poster][poster]
 * [Deep Learning on Graphs][slides_ntds], a lecture for EPFL's master course
   [A Network Tour of Data Science][ntds], 2016-12-21.
 * [Deep Learning on Graphs][slides_dlid], an invited talk at the [Deep Learning on Irregular Domains][dlid] workshop of BMVC, 2017-09-17.
+* most general
+* Specific to the algorithm: Presentation at the Swiss Machine Learning Day
+* More previous work: candidacy exam
+* That [blog post] is a gentle introduction to the model.
 
 [video]: https://www.youtube.com/watch?v=cIA_m7vwOVQ
 [slides_ntds]: https://doi.org/10.6084/m9.figshare.4491686
@@ -63,6 +68,15 @@ cd nips2016
 make
 ```
 
+## Experiments
+
+* MNIST (NIPS2016)
+* 20NEWS (NIPS2016)
+* RCV1
+* Wikipedia (NIPS2017)
+
+Moving MNIST and PTB experiments were not conducted by me.
+
 ## Using the model
 
 To use our graph ConvNet on your data, you need:
@@ -76,3 +90,48 @@
 Please get in touch if you are unsure about applying the model to a different setting.
 
 [usage]: http://nbviewer.jupyter.org/github/mdeff/cnn_graph/blob/outputs/usage.ipynb
+
+## Applications
+
+* [Kipf & Welling '16] applied a first-order approximation of that model to
+  a supervised learning task. A [blog post] by the author shows an interesting
+  connection to the Weisfeiler-Lehman algorithm. A [blog post] by Ferenc Huszár provides a critical
+  analysis of the method.
+
+[kipf_paper]:
+[kipf_blog]:
+
+## Repository organization
+
+See https://github.com/drivendata/cookiecutter-data-science/tree/master/%7B%7B%20cookiecutter.repo_name%20%7D%7D
+
+* The models (the introduced model and some reference models) are contained in [models.py](models.py).
+* Various side functions are implemented in [graph.py](graph.py), [coarsening.py](coarsening.py) and [utils.py](utils.py).
+* We did experiments on three datasets: MNIST ([notebook](mnist.ipynb)), 20NEWS ([notebook](20news.ipynb)) and RCV1 ([notebook](rcv1.ipynb)).
+* TensorBoard summaries are saved in the `summaries` folder.
+* Model parameters are saved in the `checkpoints` folder.
+* Data is placed in the `data` folder.
+  * [MNIST](http://yann.lecun.com/exdb/mnist/) is downloaded automatically.
+  * [20NEWS](http://qwone.com/~jason/20Newsgroups/) (`20news-bydate.tar.gz`) is downloaded automatically.
+  * [RCV1](http://trec.nist.gov/data/reuters/reuters.html) should be downloaded manually and placed in TODO.
+  * [pre-trained word2vec embeddings](https://code.google.com/archive/p/word2vec/) (`GoogleNews-vectors-negative300.bin.gz`).
+  * Wikipedia graph and activations are available here. Please cite .. if you use it.
+* The [trials](trials) folder contains various small experiments in the form of IPython notebooks.
+  1. [Learning graph filters][trial1]: first experiments on learning
+     synthesized graph filters through observations of filtered and source
+     graph signals. The Chebyshev and Lanczos methods as well as optimization
+     methods are compared there.
+  2. [Classification][trial2]: learning filters that extract good features for
+     classification.
+  3. [TensorFlow][trial3]: first experience with TensorFlow.
+  4. [Coarsening][trial4]: implementation of the Graclus coarsening algorithm
+     and comparison with a previous matlab implementation.
+* A [makefile](makefile) that runs every notebook as a sanity check. It only runs the code, there is no check on the results.
+
+[trial1]: h
+
+## Contributing
+
+* Please file a GitHub issue if you encounter any problem. Issues are better than contacting the authors as the community can respond and everyone benefits from the answer.
Issues are better than contacting the authors as the community can respond and +* Pull requests are welcome ! +* You can contact me for any help regarding how to apply our model to your problem. diff --git a/rcv1_dev.ipynb b/rcv1_dev.ipynb new file mode 100644 index 0000000..88b8ea3 --- /dev/null +++ b/rcv1_dev.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sklearn.datasets\n", + "import scipy.sparse\n", + "import matplotlib.pyplot as plt\n", + "import tensorflow as tf\n", + "import os\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "flags = tf.app.flags\n", + "FLAGS = flags.FLAGS\n", + "\n", + "flags.DEFINE_string('dir_data', 'data_rcv1', 'Directory to store data.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**From Dropout (Bruna did the same)**\n", + "\n", + "We took the dataset and split it into 63 classes based on the the 63 categories at the second-level of the category tree. We removed 11 categories that did not have any data and one category that had only 4 training examples. We also removed one category that covered a huge chunk (25%) of the examples. This left us with 50 classes and 402,738 documents. We divided the documents into equal-sized training and test sets randomly. Each document was represented\n", + "using the 2000 most frequent non-stopwords in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Get dataset.\n", + "rcv1 = sklearn.datasets.fetch_rcv1('data_rcv1')\n", + "N, C = rcv1.target.shape\n", + "print('N={} documents, C={} classes'.format(N, C))\n", + "\n", + "#def select_classes\n", + "\n", + "# All classes.\n", + "class_names = ['C11', 'C12', 'C13','C14','C15','C151','C1511','C152','C16','C17',\n", + " 'C171','C172','C173','C174','C18','C181','C182','C183','C21','C22',\n", + " 'C23','C24','C31', 'C311','C312','C313','C32','C33','C331','C34',\n", + " 'C41','C411','C42','CCAT','E11', 'E12','E121','E13','E131','E132',\n", + " 'E14','E141','E142','E143','E21', 'E211','E212','E31','E311','E312',\n", + " 'E313','E41','E411','E51','E511','E512','E513','E61','E71','ECAT',\n", + " 'G15','G151','G152','G153','G154','G155','G156','G157','G158','G159',\n", + " 'GCAT','GCRIM','GDEF','GDIP','GDIS','GENT','GENV','GFAS','GHEA',\n", + " 'GJOB','GMIL','GOBIT','GODD','GPOL','GPRO','GREL','GSCI','GSPO',\n", + " 'GTOUR','GVIO','GVOTE','GWEA','GWELF','M11','M12','M13','M131',\n", + " 'M132','M14','M141','M142','M143','MCAT']\n", + "assert len(class_names) == 103 # There is 103 categories according to LYRL2004.\n", + "\n", + "# Second-level classes.\n", + "keep = ['C11','C12','C13','C14','C15','C16','C17','C18','C21','C22','C23','C24',\n", + " 'C31','C32','C33','C34','C41','C42','E11','E12','E13','E14','E21','E31',\n", + " 'E41','E51','E61','E71','G15','GCRIM','GDEF','GDIP','GDIS','GENT','GENV',\n", + " 'GFAS','GHEA','GJOB','GMIL','GOBIT','GODD','GPOL','GPRO','GREL','GSCI',\n", + " 'GSPO','GTOUR','GVIO','GVOTE','GWEA','GWELF','M11','M12','M13','M14']\n", + "assert len(keep) == 55 # There is 55 second-level categories according to LYRL2004.\n", + "keep.remove('C15') # 151785 documents\n", + "keep.remove('GMIL') # 5 documents only\n", + "\n", + "# Construct a lookup table for 
labels.\n", + "labels_row = []\n", + "labels_col = []\n", + "class_lookup = {}\n", + "for i,name in enumerate(class_names):\n", + " class_lookup[name] = i\n", + "\n", + "# Index of classes to keep.\n", + "idx_keep = np.empty(len(keep))\n", + "for i,cat in enumerate(keep):\n", + " idx_keep[i] = class_lookup[cat]\n", + "target = rcv1.target[:,idx_keep]\n", + "\n", + "# Number of documents per class.\n", + "def show_doc_per_class(names, target, print_=False):\n", + " docs_per_class = np.array(target.astype(np.uint64).sum(axis=0)).squeeze()\n", + " print('categories ({} assignments in total)'.format(docs_per_class.sum()))\n", + " if print_:\n", + " for i,cat in enumerate(names):\n", + " print(' {:5s}: {:6d} documents'.format(cat, docs_per_class[i]))\n", + " plt.figure(figsize=(17,5))\n", + " plt.plot(sorted(docs_per_class[::-1]),'.')\n", + "show_doc_per_class(rcv1.target_names, rcv1.target)\n", + "show_doc_per_class(keep, target, True)\n", + "\n", + "#def select_documents\n", + "\n", + "# Number of classes per document.\n", + "def show_classes_per_doc(target):\n", + " classes_per_doc = np.array(target.sum(axis=1)).squeeze()\n", + " plt.figure(figsize=(17,5))\n", + " plt.plot(sorted(classes_per_doc[::-1]),'.')\n", + " return classes_per_doc\n", + "classes_per_doc = show_classes_per_doc(rcv1.target)\n", + "classes_per_doc = show_classes_per_doc(target)\n", + "\n", + "target = target[classes_per_doc==1]\n", + "data = rcv1.data[classes_per_doc==1, :]\n", + "\n", + "# Convert labels from indicator form to single value.\n", + "N, C = target.shape\n", + "assert C == len(keep)\n", + "target = target.tocoo()\n", + "target = target.col\n", + "assert target.min() == 0\n", + "assert target.max() == C - 1\n", + "\n", + "# Bruna and Dropout used 2 * 201369 = 402738 documents. 
Probably the difference btw v1 and v2.\n", + "print('N = {} documents and C = {} classes left'.format(N, C))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dates = []\n", + "n = 0\n", + "for path, subdirs, files in os.walk('data_rcv1/rcv1/'):\n", + " for file in files:\n", + " if 'newsML.xml' in file:\n", + " root = ET.parse(os.path.join(path, file)).getroot()\n", + " date = root.attrib['date']\n", + " dates.append(date)\n", + " n+=1\n", + "print(n)\n", + "print(len(dates))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import xml.etree.ElementTree as ET\n", + "\n", + "root = ET.parse('data_rcv1/rcv1/19960820/2286newsML.xml').getroot()\n", + "date = root.attrib['date']\n", + "\n", + "# Fetch textual content.\n", + "text = root.find('title').text\n", + "for p in root.find('text').findall('p'):\n", + " text = ' '.join((text, p.text))\n", + "print(text)\n", + "\n", + "# Find the labels of a document.\n", + "classes = []\n", + "doc = 0\n", + "for codes in root.find('metadata').findall('codes'):\n", + " if codes.attrib['class'] == 'bip:topics:1.0':\n", + " for code in codes.findall('code'):\n", + " labels_row.append(doc)\n", + " labels_col.append(class_lookup[code.attrib['code']])\n", + " classes.append(code.attrib['code'])\n", + "\n", + "assert len(labels_row) == len(labels_col)\n", + "labels_val = np.ones(len(labels_row), dtype=np.bool)\n", + "labels = scipy.sparse.csr_matrix((labels_val, (labels_row, labels_col)))\n", + "\n", + "print(labels)\n", + "labels.sum()" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "From LYRL2004 Appendix 3\n", + "http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a03-expanded-topics-hierarchy/rcv1.topics.hier.expanded\n", + "\n", + "parent: C1 child: C11 child-description: STRATEGY/PLANS\n", + "parent: C1 child: C12 child-description: LEGAL/JUDICIAL\n", + "parent: C1 child: C13 child-description: REGULATION/POLICY\n", + "parent: C1 child: C14 child-description: SHARE LISTINGS\n", + "parent: C1 child: C15 child-description: PERFORMANCE\n", + "parent: C1 child: C16 child-description: INSOLVENCY/LIQUIDITY\n", + "parent: C1 child: C17 child-description: FUNDING/CAPITAL\n", + "parent: C1 child: C18 child-description: OWNERSHIP CHANGES\n", + "parent: C2 child: C21 child-description: PRODUCTION/SERVICES\n", + "parent: C2 child: C22 child-description: NEW PRODUCTS/SERVICES\n", + "parent: C2 child: C23 child-description: RESEARCH/DEVELOPMENT\n", + "parent: C2 child: C24 child-description: CAPACITY/FACILITIES\n", + "parent: C3 child: C31 child-description: MARKETS/MARKETING\n", + "parent: C3 child: C32 child-description: ADVERTISING/PROMOTION\n", + "parent: C3 child: C33 child-description: CONTRACTS/ORDERS\n", + "parent: C3 child: C34 child-description: MONOPOLIES/COMPETITION\n", + "parent: C4 child: C41 child-description: MANAGEMENT\n", + "parent: C4 child: C42 child-description: LABOUR\n", + "parent: E1 child: E11 child-description: ECONOMIC PERFORMANCE\n", + "parent: E1 child: E12 child-description: MONETARY/ECONOMIC\n", + "parent: E1 child: E13 child-description: INFLATION/PRICES\n", + "parent: E1 child: E14 child-description: CONSUMER FINANCE\n", + "parent: E2 child: E21 child-description: GOVERNMENT FINANCE\n", + "parent: E3 child: E31 child-description: OUTPUT/CAPACITY\n", + "parent: E4 child: E41 child-description: EMPLOYMENT/LABOUR\n", + "parent: E5 child: E51 
child-description: TRADE/RESERVES\n", + "parent: E6 child: E61 child-description: HOUSING STARTS\n", + "parent: E7 child: E71 child-description: LEADING INDICATORS\n", + "parent: G1 child: G15 child-description: EUROPEAN COMMUNITY\n", + "parent: GCAT child: GCRIM child-description: CRIME, LAW ENFORCEMENT\n", + "parent: GCAT child: GDEF child-description: DEFENCE\n", + "parent: GCAT child: GDIP child-description: INTERNATIONAL RELATIONS\n", + "parent: GCAT child: GDIS child-description: DISASTERS AND ACCIDENTS\n", + "parent: GCAT child: GENT child-description: ARTS, CULTURE, ENTERTAINMENT\n", + "parent: GCAT child: GENV child-description: ENVIRONMENT AND NATURAL WORLD\n", + "parent: GCAT child: GFAS child-description: FASHION\n", + "parent: GCAT child: GHEA child-description: HEALTH\n", + "parent: GCAT child: GJOB child-description: LABOUR ISSUES\n", + "parent: GCAT child: GMIL child-description: MILLENNIUM ISSUES\n", + "parent: GCAT child: GOBIT child-description: OBITUARIES\n", + "parent: GCAT child: GODD child-description: HUMAN INTEREST\n", + "parent: GCAT child: GPOL child-description: DOMESTIC POLITICS\n", + "parent: GCAT child: GPRO child-description: BIOGRAPHIES, PERSONALITIES, PEOPLE\n", + "parent: GCAT child: GREL child-description: RELIGION\n", + "parent: GCAT child: GSCI child-description: SCIENCE AND TECHNOLOGY\n", + "parent: GCAT child: GSPO child-description: SPORTS\n", + "parent: GCAT child: GTOUR child-description: TRAVEL AND TOURISM\n", + "parent: GCAT child: GVIO child-description: WAR, CIVIL WAR\n", + "parent: GCAT child: GVOTE child-description: ELECTIONS\n", + "parent: GCAT child: GWEA child-description: WEATHER\n", + "parent: GCAT child: GWELF child-description: WELFARE, SOCIAL SERVICES\n", + "parent: M1 child: M11 child-description: EQUITY MARKETS\n", + "parent: M1 child: M12 child-description: BOND MARKETS\n", + "parent: M1 child: M13 child-description: MONEY MARKETS\n", + "parent: M1 child: M14 child-description: COMMODITY MARKETS" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From f6ea33f38b9473365b110c744c094d50fae98e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Mon, 18 Dec 2017 15:31:35 +0100 Subject: [PATCH 18/23] work in progress --- lib/layers.py | 16 +++ trials/6_structured_sequence.ipynb | 208 +++++++++++++++++++++++------ usage.ipynb | 48 ++----- 3 files changed, 199 insertions(+), 73 deletions(-) diff --git a/lib/layers.py b/lib/layers.py index c47bae5..28237dc 100644 --- a/lib/layers.py +++ b/lib/layers.py @@ -241,3 +241,19 @@ def __call__(self, x): b = self._bias_variable([self.Mout], regularization=True) x = tf.matmul(x, W) + b return tf.nn.relu(x) if self.relu else x + + +class RNN(Layer): + pass + + +class LSTM(RNN): + pass + + +class ConvLSTM(RNN): + pass + + +class GRU(RNN): + pass diff --git a/trials/6_structured_sequence.ipynb b/trials/6_structured_sequence.ipynb index 004666c..66a5f8f 100644 --- a/trials/6_structured_sequence.ipynb +++ b/trials/6_structured_sequence.ipynb @@ -54,8 +54,8 @@ "metadata": {}, "outputs": [], "source": [ - "SEQ_LEN = 100\n", - "N_SEQ = 4\n", + "SEQ_LEN = 1000\n", + "N_SEQ = 40\n", "\n", "def 
create_time_series(seq_len, random_state):\n", " freq = random_state.uniform(0.1, 0.6)\n", @@ -76,7 +76,7 @@ "metadata": {}, "outputs": [], "source": [ - "data.T.plot();\n", + "data.iloc[:5, :100].T.plot();\n", "plt.savefig('time_series.pdf')\n", "# hist" ] @@ -108,7 +108,7 @@ "metadata": {}, "outputs": [], "source": [ - "N_INPUTS = 10 # Number of samples used for prediction, i.e. unrolling length.\n", + "N_INPUTS = 50 # Number of samples used for prediction, i.e. unrolling length.\n", "N_OUTPUTS = 1 # Number of samples in the time series the model tries to predict.\n", "\n", "def feature(array):\n", @@ -179,28 +179,28 @@ "\n", " def __call__(s):\n", " with tf.name_scope('input_pipeline'):\n", - " #with tf.device(\"/cpu:0\"): # Input queues are on CPU.\n", - " filenames = [os.path.join(DATA_DIR, filename) for filename in s.filenames]\n", - " filename_queue = tf.train.string_input_producer(filenames, s.num_epochs, shuffle=True)\n", - "\n", - " examples = [s._read_and_decode(filename_queue) for _ in range(s.read_threads)]\n", - "\n", - " # Shuffle examples.\n", - " if True:\n", - " min_after_dequeue = 10 #10000\n", - " capacity = min_after_dequeue + (s.read_threads + 2) # * s.batch_size\n", - " inputs, targets = tf.train.shuffle_batch_join(\n", - " examples, batch_size=1, seed=s.seed, capacity=capacity,\n", - " min_after_dequeue=min_after_dequeue, allow_smaller_final_batch=True)\n", - " # We read full batch.\n", - " inputs = inputs[0, ...]\n", - " targets = targets[0, ...]\n", - " else:\n", - " assert s.read_threads == 1\n", - " inputs, targets = examples[0]\n", - "\n", - " # Can return a fixed graph or a per-sample graph in the features.\n", - " return {'inputs': inputs}, targets" + " with tf.device(\"/cpu:0\"): # Input queues are on CPU.\n", + " filenames = [os.path.join(DATA_DIR, filename) for filename in s.filenames]\n", + " filename_queue = tf.train.string_input_producer(filenames, s.num_epochs, shuffle=True)\n", + "\n", + " examples = [s._read_and_decode(filename_queue) for _ in range(s.read_threads)]\n", + "\n", + " # Shuffle examples.\n", + " if True:\n", + " min_after_dequeue = 10 #10000\n", + " capacity = min_after_dequeue + (s.read_threads + 2) # * s.batch_size\n", + " inputs, targets = tf.train.shuffle_batch_join(\n", + " examples, batch_size=1, seed=s.seed, capacity=capacity,\n", + " min_after_dequeue=min_after_dequeue, allow_smaller_final_batch=True)\n", + " # We read full batch.\n", + " inputs = inputs[0, ...]\n", + " targets = targets[0, ...]\n", + " else:\n", + " assert s.read_threads == 1\n", + " inputs, targets = examples[0]\n", + "\n", + " # Can return a fixed graph or a per-sample graph in the features.\n", + " return {'inputs': inputs}, targets" ] }, { @@ -247,7 +247,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 5 Sequence modeling" + "## 5 Sequence modeling\n", + "\n", + "We can either:\n", + "* assume the same dynamic on all time series and train a shared model\n", + "* train a model for each time series (which still has access to its neighbors)\n", + "* mix: e.g. 
per times series bias or last layer" ] }, { @@ -260,7 +265,7 @@ "# Number of filters in case of GCN.\n", "LSTM_SIZE = 3\n", "\n", - "def model(features, targets, mode):\n", + "def model(features, targets, mode, params):\n", " # Reformat input shape to become a sequence.\n", " x = tf.split(features['inputs'], N_INPUTS, axis=1)\n", " \n", @@ -268,11 +273,9 @@ " lstm_cell = tf.contrib.rnn.BasicLSTMCell(LSTM_SIZE, forget_bias=1.0)\n", " outputs, _ = tf.contrib.rnn.static_rnn(lstm_cell, x, dtype=tf.float32)\n", " #outputs, _ = tf.contrib.rnn.dynamic_rnn(lstm_cell, x, dtype=tf.float32)\n", - " with tf.name_scope('output_layer'):\n", - " outputs = outputs[-1]\n", - " weight = tf.Variable(tf.random_normal([LSTM_SIZE, N_OUTPUTS]))\n", - " bias = tf.Variable(tf.random_normal([N_SEQ, N_OUTPUTS]))\n", - " predictions = tf.matmul(outputs, weight) + bias\n", + " \n", + " tf.summary.histogram('hidden', outputs[-1])\n", + " predictions = tf.contrib.layers.fully_connected(outputs[-1], N_OUTPUTS, activation_fn=None)\n", " \n", " # Loss function and metric for training and evaluation.\n", " loss = tf.losses.mean_squared_error(targets, predictions)\n", @@ -284,7 +287,7 @@ " train_op = tf.contrib.layers.optimize_loss(\n", " loss=loss,\n", " global_step=tf.train.get_global_step(),\n", - " learning_rate=0.01,\n", + " learning_rate=params['learning_rate'],\n", " #learning_rate_decay_fn=lambda lr, gs: tf.train.exponential_decay(lr, gs, 100e3, 0.96, staircase=True),\n", " optimizer=lambda lr: tf.train.GradientDescentOptimizer(lr),\n", " #optimizer=lambda lr: tf.train.MomentumOptimizer(lr, 0.9),\n", @@ -312,9 +315,70 @@ "metadata": {}, "outputs": [], "source": [ - "estimator = tflearn.Estimator(model_fn=model, model_dir='structured_sequence')\n", + "# Observing variables.\n", + "#tflearn.monitors.ValidationMonitor\n", + "#tf.train.SessionRunHook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Tuning the hyper-parameters.\n", + "#tflearn.learn_runner.run()\n", + "#tflearn.learn_runner.tune()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TF debugger.\n", + "from tensorflow.python import debug as tfdbg\n", + "\n", + "hooks = [tfdbg.LocalCLIDebugHook()]\n", + "hooks = [tfdbg.DumpingDebugHook('tfdbg_dumps')]\n", + "# python -m tensorflow.python.debug.cli.offline_analyzer --dump_dir=\"tfdbg_dumps/run__\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Statistics like compute time or memory.\n", + "# Need to pass run_options and run_metadata to sess.run().\n", + "# Not possible with Experiment and Estimator API.\n", + "#run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)\n", + "#run_metadata = tf.RunMetadata()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#MODEL_DIR = os.path.join('..', 'logdir', 'structured_sequence', 'run1')\n", + "MODEL_DIR = 'structured_sequence'\n", + "config = tflearn.RunConfig(\n", + " save_checkpoints_secs=60,\n", + " # save_summary_steps=100,\n", + " model_dir=MODEL_DIR,\n", + " # To see device placement. 
It unfortunately only shows up in stderr, not Tensorboard (explicit placement only).\n", + " # session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True),\n", + ")\n", + "hparams = {\n", + " 'learning_rate': 0.01\n", + "}\n", + "estimator = tflearn.Estimator(model_fn=model, config=config, params=hparams)\n", "#estimator.fit(input_fn=DataLoader(filenames=['train.tfrecords']))\n", - "#estimator.evaluate(input_fn=DataLoader(filenames=['test.tfrecords']))\n", + "#estimator.evaluate(input_fn=DataLoader(filenames=['validation.tfrecords']))\n", "\n", "experiment = tflearn.Experiment(\n", " estimator,\n", @@ -323,8 +387,76 @@ " eval_input_fn=DataLoader(['validation.tfrecords']),\n", ")\n", "\n", - "shutil.rmtree('structured_sequence', ignore_errors=True) # Start fresh each time.\n", - "experiment.train_and_evaluate()" + "shutil.rmtree(MODEL_DIR, ignore_errors=True) # Start fresh each time.\n", + "experiment.train_and_evaluate()\n", + "#experiment.continuous_train_and_eval() # Takes less ressources.\n", + "\n", + "#estimator.evaluate(input_fn=DataLoader(filenames=['test.tfrecords']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## XXX" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class RNN:\n", + " \n", + " def __init__(self, units):\n", + " pass\n", + " \n", + " def __call__(self, inputs, states, laplacian):\n", + " \"\"\"Fully connected layer with Mout features.\"\"\"\n", + " N, Min = x.get_shape()\n", + " W = self._weight_variable([int(Min), self.Mout], regularization=True)\n", + " b = self._bias_variable([self.Mout], regularization=True)\n", + " x = tf.matmul(x, W) + b\n", + " return tf.nn.relu(x) if self.relu else x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Inherit from RNNCell to use high level TF machinery like `tf.dynamic_rnn()`." 
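The note above about inheriting from RNNCell can be made concrete with a short sketch. The following is an editor's illustration, not the repository's implementation: it shows the minimal interface a graph-recurrent cell has to expose in TensorFlow 1.x (the `state_size` and `output_size` properties plus `__call__`) so that it can be unrolled by `tf.nn.dynamic_rnn()`. The first-order filter x -> matmul(x, L) is a deliberately simple stand-in for the Chebyshev convolution used elsewhere in the library, and `laplacian` is assumed to be a dense, already rescaled N x N numpy array (`L_dense` and `signals` in the usage comment are hypothetical placeholders).

import numpy as np
import tensorflow as tf

class GraphRNNCell(tf.contrib.rnn.RNNCell):
    """Editor's sketch: a vanilla recurrent cell whose input-to-state and
    state-to-state transforms are first-order graph filters. State and
    output carry one value per vertex, i.e. have shape [batch, N]."""

    def __init__(self, laplacian):
        self._N = laplacian.shape[0]
        self._L = tf.constant(laplacian, dtype=tf.float32)  # dense N x N

    @property
    def state_size(self):
        return self._N

    @property
    def output_size(self):
        return self._N

    def __call__(self, inputs, state, scope=None):
        with tf.variable_scope(scope or 'graph_rnn_cell'):
            wx = tf.get_variable('wx', [], initializer=tf.constant_initializer(0.1))
            wh = tf.get_variable('wh', [], initializer=tf.constant_initializer(0.1))
            b = tf.get_variable('bias', [self._N], initializer=tf.zeros_initializer())
            # Diffuse the input and the state over the graph, then mix and squash.
            new_state = tf.tanh(wx * tf.matmul(inputs, self._L) +
                                wh * tf.matmul(state, self._L) + b)
        return new_state, new_state

# Usage: signals has shape [batch, time, N]; dynamic_rnn does the unrolling.
# outputs, final_state = tf.nn.dynamic_rnn(GraphRNNCell(L_dense), signals, dtype=tf.float32)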
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LSTM:\n", + " \"\"\"The network is not unrolled.\"\"\"\n", + " \n", + " def _input_conv(self, x, w, b=None):\n", + " pass\n", + " \n", + " def _reccurent_conv(self, x, w, b=None):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tensorflow.python.ops import control_flow_ops\n", + "control_flow_ops.while_loop(\n", + " cond=lambda time, *_: time < time_steps,\n", + " body=_step,\n", + " loop_vars=(time, output_ta) + states,\n", + " parallel_iterations=32,\n", + " swap_memory=True)" ] } ], diff --git a/usage.ipynb b/usage.ipynb index 2c1ff13..0febd8d 100644 --- a/usage.ipynb +++ b/usage.ipynb @@ -26,9 +26,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from lib import models, graph, coarsening, utils\n", @@ -49,9 +47,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "d = 100 # Dimensionality.\n", @@ -86,9 +82,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "n_train = n // 2\n", @@ -125,9 +119,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "dist, idx = graph.distance_scipy_spatial(X_train.T, k=10, metric='euclidean')\n", @@ -154,9 +146,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "graphs, perm = coarsening.coarsen(A, levels=3, self_connections=False)\n", @@ -176,9 +166,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "L = [graph.laplacian(A, normalized=True) for A in graphs]\n", @@ -199,9 +187,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = dict()\n", @@ -237,9 +223,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "model = models.cgcnn(L, **params)\n", @@ -262,9 +246,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "fig, ax1 = plt.subplots(figsize=(15, 5))\n", @@ -279,9 +261,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "print('Time per step: {:.2f} ms'.format(t_step*1000))" @@ -290,9 +270,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "res = model.evaluate(X_test, y_test)\n", @@ -316,9 +294,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3" + "version": "3.6.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From aef2edcf9e7e8e4d3f193b4b9da65416e8e585d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Fri, 21 Feb 2020 03:19:48 +0100 Subject: [PATCH 19/23] keep original parameters --- nips2016/mnist.ipynb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/nips2016/mnist.ipynb b/nips2016/mnist.ipynb index 73846be..2b9d2f1 100644 --- a/nips2016/mnist.ipynb +++ b/nips2016/mnist.ipynb @@ -164,10 +164,10 @@ "source": [ "common = {}\n", "common['dir_name'] = 'mnist/'\n", - "common['num_epochs'] = 4 #20\n", + "common['num_epochs'] = 20\n", "common['batch_size'] = 100\n", "common['decay_steps'] = mnist.train.num_examples / common['batch_size']\n", - "common['eval_frequency'] = 100 #30 * common['num_epochs']\n", + "common['eval_frequency'] = 30 * common['num_epochs']\n", "common['brelu'] = 'b1relu'\n", "common['pool'] = 'mpool1'\n", "C = max(mnist.train.labels) + 1 # number of classes\n", @@ -183,7 +183,7 @@ }, "outputs": [], "source": [ - "if False:\n", + "if True:\n", " name = 'softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", @@ -228,7 +228,7 @@ }, "outputs": [], "source": [ - "if False:\n", + "if True:\n", " name = 'fgconv_softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", @@ -247,7 +247,7 @@ }, "outputs": [], "source": [ - "if False:\n", + "if True:\n", " name = 'sgconv_softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", @@ -266,7 +266,7 @@ "outputs": [], "source": [ "# With 'chebyshev2' and 'b2relu', it corresponds to cgcnn2_2(L[0], F=10, K=20).\n", - "if False:\n", + "if True:\n", " name = 'cgconv_softmax'\n", " params = common.copy()\n", " params['dir_name'] += name\n", From 416f8143cf30418711ce57e66dda03bb13ac87f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Fri, 21 Feb 2020 03:36:23 +0100 Subject: [PATCH 20/23] add title and disclaimer to mnist filter viz --- nips2016/mnist.ipynb | 111 ++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 71 deletions(-) diff --git a/nips2016/mnist.ipynb b/nips2016/mnist.ipynb index 2b9d2f1..47372ef 100644 --- a/nips2016/mnist.ipynb +++ b/nips2016/mnist.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -25,9 +23,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "flags = tf.app.flags\n", @@ -54,9 +50,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def grid_graph(m, corners=False):\n", @@ -96,9 +90,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from tensorflow.examples.tutorials.mnist import input_data\n", @@ -129,9 +121,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "#model = fc1()\n", @@ -157,9 +147,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "common = {}\n", @@ -178,9 +166,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -203,9 +189,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Common hyper-parameters for networks with one convolutional layer.\n", @@ -223,9 +207,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], 
"source": [ "if True:\n", @@ -242,9 +224,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -260,9 +240,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# With 'chebyshev2' and 'b2relu', it corresponds to cgcnn2_2(L[0], F=10, K=20).\n", @@ -281,9 +259,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Common hyper-parameters for LeNet5-like networks.\n", @@ -301,9 +277,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Architecture of TF MNIST conv model (LeNet-5-like).\n", @@ -322,9 +296,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -339,9 +311,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -356,9 +326,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "model_perf.show()" @@ -367,9 +335,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_filters(coeffs):\n", @@ -393,9 +359,28 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, + "outputs": [], + "source": [ + "if False:\n", + " grid_params = {}\n", + " data = (train_data, train_labels, val_data, val_labels, test_data, test_labels)\n", + " utils.grid_search(params, grid_params, *data, model=lambda x: models.cgcnn(L,**x))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Filter visualization (experimental)\n", + "\n", + "**Disclaimer**: left as is, not sure if it works. To be checked before usage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "a = np.array([1,2,3])\n", @@ -413,9 +398,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "model = model_f\n", @@ -460,20 +443,6 @@ "plt.imshow(train_data[0,idx][:28**2].reshape(28,28))\n", "assert np.allclose(train_data[0,idx][:28**2].reshape(28,28), mnist.train.images[0].reshape(28,28))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "if False:\n", - " grid_params = {}\n", - " data = (train_data, train_labels, val_data, val_labels, test_data, test_labels)\n", - " utils.grid_search(params, grid_params, *data, model=lambda x: models.cgcnn(L,**x))" - ] } ], "metadata": { From eb245e61f44dcde481000bfde62a251706f90b8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Fri, 21 Feb 2020 04:25:28 +0100 Subject: [PATCH 21/23] make clean --- nips2016/20news.ipynb | 92 +++++++++++-------------------------------- rcv1.ipynb | 84 ++++++++++----------------------------- rcv1_dev.ipynb | 20 +++------- 3 files changed, 49 insertions(+), 147 deletions(-) diff --git a/nips2016/20news.ipynb b/nips2016/20news.ipynb index dfabbd8..67d26c6 100644 --- a/nips2016/20news.ipynb +++ b/nips2016/20news.ipynb @@ -12,9 +12,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -36,9 +34,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "flags = tf.app.flags\n", @@ -65,9 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Fetch dataset. 
Scikit-learn already performs some cleaning.\n", @@ -89,9 +83,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Remove short documents.\n", @@ -118,9 +110,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Word embedding\n", @@ -135,9 +125,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Feature selection.\n", @@ -156,9 +144,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "train.normalize(norm='l1')\n", @@ -168,9 +154,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Test dataset.\n", @@ -187,9 +171,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -225,9 +207,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -245,9 +225,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -267,9 +245,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Training set is shuffled already.\n", @@ -291,9 +267,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -303,9 +277,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "common = {}\n", @@ -325,9 +297,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -350,9 +320,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -375,9 +343,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -400,9 +366,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -426,9 +390,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -452,9 +414,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -477,9 +437,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -502,9 +460,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "model_perf.show()" @@ -513,9 +469,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ 
"if False:\n", diff --git a/rcv1.ipynb b/rcv1.ipynb index 8266209..7bd85d1 100644 --- a/rcv1.ipynb +++ b/rcv1.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -25,9 +23,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "flags = tf.app.flags\n", @@ -58,9 +54,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Fetch dataset from Scikit-learn.\n", @@ -81,9 +75,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Selection of classes.\n", @@ -106,9 +98,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Remove documents with multiple classes.\n", @@ -119,9 +109,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Remove short documents.\n", @@ -136,9 +124,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Feature selection.\n", @@ -157,9 +143,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "#dataset.normalize(norm='l1')\n", @@ -169,9 +153,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Word embedding\n", @@ -186,9 +168,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "perm = np.random.RandomState(seed=42).permutation(dataset.data.shape[0])\n", @@ -218,9 +198,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -238,9 +216,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "assert FLAGS.coarsening_levels is 0\n", @@ -261,9 +237,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Training set is shuffled already.\n", @@ -285,9 +259,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if False:\n", @@ -297,9 +269,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "common = {}\n", @@ -319,9 +289,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -344,9 +312,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -369,9 +335,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -394,9 +358,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - 
}, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -419,9 +381,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -444,9 +404,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "model_perf.show()" diff --git a/rcv1_dev.ipynb b/rcv1_dev.ipynb index 88b8ea3..2794a57 100644 --- a/rcv1_dev.ipynb +++ b/rcv1_dev.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -20,9 +18,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "flags = tf.app.flags\n", @@ -44,9 +40,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Get dataset.\n", @@ -134,9 +128,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "dates = []\n", @@ -155,9 +147,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import xml.etree.ElementTree as ET\n", From f731393c78ad6a423ff249733707abf0b1698ade Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Fri, 21 Feb 2020 12:45:46 +0100 Subject: [PATCH 22/23] move all experiments in single folder --- nips2016/mnist.ipynb => experiments/1_mnist.ipynb | 0 nips2016/20news.ipynb => experiments/2_20news.ipynb | 0 rcv1.ipynb => experiments/3_rcv1.ipynb | 0 rcv1_dev.ipynb => experiments/3_rcv1_dev.ipynb | 0 experiments/makefile | 9 +-------- nips2016/makefile | 11 ----------- 6 files changed, 1 insertion(+), 19 deletions(-) rename nips2016/mnist.ipynb => experiments/1_mnist.ipynb (100%) rename nips2016/20news.ipynb => experiments/2_20news.ipynb (100%) rename rcv1.ipynb => experiments/3_rcv1.ipynb (100%) rename rcv1_dev.ipynb => experiments/3_rcv1_dev.ipynb (100%) delete mode 100644 nips2016/makefile diff --git a/nips2016/mnist.ipynb b/experiments/1_mnist.ipynb similarity index 100% rename from nips2016/mnist.ipynb rename to experiments/1_mnist.ipynb diff --git a/nips2016/20news.ipynb b/experiments/2_20news.ipynb similarity index 100% rename from nips2016/20news.ipynb rename to experiments/2_20news.ipynb diff --git a/rcv1.ipynb b/experiments/3_rcv1.ipynb similarity index 100% rename from rcv1.ipynb rename to experiments/3_rcv1.ipynb diff --git a/rcv1_dev.ipynb b/experiments/3_rcv1_dev.ipynb similarity index 100% rename from rcv1_dev.ipynb rename to experiments/3_rcv1_dev.ipynb diff --git a/experiments/makefile b/experiments/makefile index 22a1c95..3a29a42 100644 --- a/experiments/makefile +++ b/experiments/makefile @@ -6,13 +6,6 @@ $(NB): jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout=-1 $@ clean: - @for nb in $(NB); do \ - printf "%s" "$$(jq --indent 1 ' \ - .metadata = {} \ - | (.cells[] | select(has("outputs")) | .outputs) = [] \ - | (.cells[] | select(has("execution_count")) | .execution_count) = null \ - | .cells[].metadata = {} \ - ' $$nb)" > $$nb; \ - done + jupyter nbconvert --inplace --ClearOutputPreprocessor.enabled=True $(NB) .PHONY: run $(NB) clean diff --git a/nips2016/makefile b/nips2016/makefile deleted file mode 100644 index 3a29a42..0000000 --- 
a/nips2016/makefile +++ /dev/null @@ -1,11 +0,0 @@ -NB = $(sort $(wildcard *.ipynb)) - -run: $(NB) - -$(NB): - jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout=-1 $@ - -clean: - jupyter nbconvert --inplace --ClearOutputPreprocessor.enabled=True $(NB) - -.PHONY: run $(NB) clean From 3265c79bfc35748772a0b58c078f11fbd47d688f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Sat, 22 Feb 2020 02:01:23 +0100 Subject: [PATCH 23/23] requirements: add version numbers and dependencies --- requirements.txt | 83 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 18 deletions(-) diff --git a/requirements.txt b/requirements.txt index 15ce4fb..7193ed6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,69 @@ -numpy -scipy -pandas -tables -scikit-learn -matplotlib -seaborn +# Version numbers have been retrieved from a range of machines and environments. +# Take them with a grain of salt. -tensorflow-gpu -#tensorflow +# Direct dependencies +#python==3.5 # 3.4 / 3.6 +#pip==1.5.4 +#setuptools==21.0.0 +numpy==1.11.0 # 1.12.1 +scipy==0.17.0 # 0.19.0 +pandas==0.20.0 +scikit-learn==0.18.1 +matplotlib==1.5.1 # 2.0.1 +seaborn==0.7.1 +tqdm==4.11.2 +gensim==2.1.0 # 0.12.4 / 2.0.0 # Only for NLP experiments. +#graph-tool==2.26 # Cannot be installed with pip. +tensorflow-gpu==1.1.0 # 0.8.0 # Or tensorflow if not running on GPU. +#networkx==1.11 # Only considered at some point. +#networkit==4.2 # Only considered at some point. -jupyter -ipython +# Dependencies of the above. +boto==2.46.1 # 2.40.0 +bz2file==0.98 +cycler==0.10.0 # 0.9.0 +protobuf==3.3.0 # 3.0.0 +pyparsing==2.2.0 # 2.1.4 +python-dateutil==2.6.0 # 2.5.3 +pytz==2016.4 # 2017.2 +requests==2.13.0 # 2.9.1 / 2.10.0 +six==1.10.0 +smart-open==1.5.2 # 1.3.3 +Werkzeug==0.12.1 -python-dotenv -tqdm +# Jupyter notebook and its dependencies. +notebook==5.0.0 # 4.2.0 +bleach==2.0.0 # 3.1.1 +decorator==4.0.9 # 4.0.6 / 4.0.11 / 4.4.1 +entrypoints==0.2.2 +html5lib==0.999999999 +ipykernel==4.6.1 # 4.3.1 +ipython==6.0.0 # 4.2.0 +ipython-genutils==0.2.0 # 0.1.0 +jedi==0.10.2 +Jinja2==2.8 # 2.9.6 +jsonschema==2.6.0 # 2.5.1 +jupyter-client==5.0.1 # 4.2.2 +jupyter-core==4.3.0 # 4.1.0 +MarkupSafe==0.23 # 1.0 +mistune==0.7.4 +nbconvert==5.1.1 # 4.2.0 +nbformat==4.3.0 # 4.0.1 +pandocfilters==1.4.1 +pexpect==4.2.1 # 4.0.1 +pickleshare==0.7.4 # 0.7.2 +prompt-toolkit==1.0.10 +ptyprocess==0.5.1 +Pygments==2.2.0 # 2.1.3 +pyzmq==16.0.2 # 15.2.0 +simplegeneric==0.8.1 +terminado==0.6 +testpath==0.3 +tornado==4.3 # 4.2.1 / 4.4.2 / 4.5.1 +traitlets==4.3.2 # 4.2.1 +wcwidth==0.1.7 +webencodings==0.5.1 -# Only needed for NLP experiments. -gensim - -# Cannot be installed with pip. -#graph-tool +# dotenv and its dependency. +#python-dotenv==0.6.4 +#click==6.7
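The header of requirements.txt above warns that the pinned versions were collected from a range of machines and should be taken with a grain of salt. A quick way to see how a given environment deviates from those pins is to compare them with what is actually installed. The helper below is an editor's sketch, not part of the repository; it only assumes the requirements.txt format shown above (name==version entries, with # starting a comment) and the pkg_resources module shipped with setuptools.

import pkg_resources

def check_requirements(path='requirements.txt'):
    """Report packages whose installed version differs from the pinned one."""
    for line in open(path):
        line = line.split('#')[0].strip()   # drop comments and blank lines
        if '==' not in line:
            continue
        name, pinned = (s.strip() for s in line.split('=='))
        try:
            installed = pkg_resources.get_distribution(name).version
        except pkg_resources.DistributionNotFound:
            print('{:20} missing, pinned {}'.format(name, pinned))
            continue
        if installed != pinned:
            print('{:20} installed {}, pinned {}'.format(name, installed, pinned))

check_requirements()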