diff --git a/.gitignore b/.gitignore index d170d81..0dc54b7 100644 --- a/.gitignore +++ b/.gitignore @@ -5,11 +5,13 @@ __pycache__/ # IPython checkpoints .ipynb_checkpoints/ -# Datasets +# Local configuration +.env +.python-version + +# Data data/ -# Tensorflow summaries +# TensorFlow summaries & model parameters summaries/ - -# Model parameters checkpoints/ diff --git a/README.md b/README.md index 8ca207f..fd82045 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,33 @@ setting. [usage]: http://nbviewer.jupyter.org/github/mdeff/cnn_graph/blob/outputs/usage.ipynb +## Repository organization + +* The models (the introduced model and some reference models) are contained in [models.py](models.py). +* Various supporting functions are implemented in [graph.py](graph.py), [coarsening.py](coarsening.py) and [utils.py](utils.py). +* We ran experiments on three datasets: MNIST ([notebook](mnist.ipynb)), 20NEWS ([notebook](20news.ipynb)) and RCV1 ([notebook](rcv1.ipynb)). +* TensorBoard summaries are saved in the `summaries` folder. +* Model parameters are saved in the `checkpoints` folder. +* Data is placed in the `data` folder. + * [MNIST](http://yann.lecun.com/exdb/mnist/) is downloaded automatically. + * [20NEWS](http://qwone.com/~jason/20Newsgroups/) (`20news-bydate.tar.gz`) is downloaded automatically. + * [RCV1](http://trec.nist.gov/data/reuters/reuters.html) should be downloaded manually and placed in TODO. + * [pre-trained word2vec embeddings](https://code.google.com/archive/p/word2vec/) (`GoogleNews-vectors-negative300.bin.gz`) should be downloaded manually. + * Wikipedia graph and activations are available here. Please cite .. if you use it. +* The [trials](trials) folder contains various small experiments in the form of IPython notebooks. + 1. [Learning graph filters][trial1]: first experiments on learning + synthesized graph filters through observations of filtered and source + graph signals. The Chebyshev and Lanczos methods as well as optimization + methods are compared there. + 2. [Classification][trial2]: learning filters that extract good features for + classification. + 3. [TensorFlow][trial3]: first experience with TensorFlow. + 4. [Coarsening][trial4]: implementation of the Graclus coarsening algorithm + and comparison with a previous MATLAB implementation. +* A [makefile](makefile) that runs every notebook as a sanity check. It only runs the code; the results are not checked. + +[trial1]: h + ## License & co The code in this repository is released under the terms of the [MIT license](LICENSE.txt). 
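As a reading aid for the "Repository organization" section above, here is a minimal sketch of how the pieces fit together, assuming the interfaces used in the experiment notebooks below (`graph`, `coarsening`, `models`, here imported from the `lib` package that this commit's `lib/layers.py` implies) and hyper-parameter names taken from the MNIST notebook; exact signatures should be checked against `models.py`.

```python
# Sketch only: wiring of graph construction, coarsening and model training,
# following the calls visible in experiments/1_mnist.ipynb. The helper names
# (distance_sklearn_metrics, adjacency, coarsen, laplacian) are assumed from
# lib/graph.py and lib/coarsening.py at this commit.
import numpy as np

from lib import models, graph, coarsening

# k-NN graph on random 2-D points (a stand-in for the 28x28 MNIST pixel grid).
z = np.random.uniform(size=(784, 2))
dist, idx = graph.distance_sklearn_metrics(z, k=8, metric='euclidean')
A = graph.adjacency(dist, idx)

# Multi-level coarsening: L holds one Laplacian per level, perm the vertex reordering.
graphs, perm = coarsening.coarsen(A, levels=4, self_connections=False)
L = [graph.laplacian(g, normalized=True) for g in graphs]

# Hyper-parameters as in the MNIST notebook (values illustrative).
params = dict(dir_name='demo', num_epochs=20, batch_size=100, eval_frequency=200,
              filter='chebyshev5', brelu='b1relu', pool='mpool1',
              F=[32], K=[20], p=[4], M=[512, 10], regularization=5e-4,
              dropout=0.5, learning_rate=0.02, decay_rate=0.95,
              decay_steps=100, momentum=0.9)
model = models.cgcnn(L, **params)

# Input signals must be permuted to match the coarsened vertex ordering:
# train_data = coarsening.perm_data(train_data, perm)
# accuracy, loss, t_step = model.fit(train_data, train_labels, val_data, val_labels)
```

In the notebooks, `utils.model_perf` collects such runs so that several filter types (`'fourier'`, `'spline'`, `'chebyshev5'`) can be compared on the same split.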
diff --git a/nips2016/mnist.ipynb b/experiments/1_mnist.ipynb similarity index 75% rename from nips2016/mnist.ipynb rename to experiments/1_mnist.ipynb index 8d594f6..47372ef 100644 --- a/nips2016/mnist.ipynb +++ b/experiments/1_mnist.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -25,9 +23,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "flags = tf.app.flags\n", @@ -54,9 +50,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def grid_graph(m, corners=False):\n", @@ -96,9 +90,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from tensorflow.examples.tutorials.mnist import input_data\n", @@ -116,7 +108,7 @@ "val_data = coarsening.perm_data(val_data, perm)\n", "test_data = coarsening.perm_data(test_data, perm)\n", "print('Execution time: {:.2f}s'.format(time.process_time() - t_start))\n", - "del perm" + "#del perm" ] }, { @@ -129,9 +121,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "#model = fc1()\n", @@ -157,9 +147,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "common = {}\n", @@ -178,9 +166,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -203,9 +189,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Common hyper-parameters for networks with one convolutional layer.\n", @@ -223,9 +207,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -234,16 +216,15 @@ " params['dir_name'] += name\n", " params['filter'] = 'fourier'\n", " params['K'] = [L[0].shape[0]]\n", - " model_perf.test(models.cgcnn(L, **params), name, params,\n", + " model_f = models.cgcnn(L, **params)\n", + " model_perf.test(model_f, name, params,\n", " train_data, train_labels, val_data, val_labels, test_data, test_labels)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -251,16 +232,15 @@ " params = common.copy()\n", " params['dir_name'] += name\n", " params['filter'] = 'spline'\n", - " model_perf.test(models.cgcnn(L, **params), name, params,\n", + " model_s = models.cgcnn(L, **params)\n", + " model_perf.test(model_s, name, params,\n", " train_data, train_labels, val_data, val_labels, test_data, test_labels)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# With 'chebyshev2' and 'b2relu', it corresponds to cgcnn2_2(L[0], F=10, K=20).\n", @@ -271,16 +251,15 @@ " params['filter'] = 'chebyshev5'\n", "# params['filter'] = 'chebyshev2'\n", "# params['brelu'] = 'b2relu'\n", - " model_perf.test(models.cgcnn(L, **params), name, params,\n", + " model_c = models.cgcnn(L, **params)\n", + " model_perf.test(model_c, name, params,\n", " train_data, 
train_labels, val_data, val_labels, test_data, test_labels)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Common hyper-parameters for LeNet5-like networks.\n", @@ -298,9 +277,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Architecture of TF MNIST conv model (LeNet-5-like).\n", @@ -319,9 +296,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -336,9 +311,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -353,9 +326,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "model_perf.show()" @@ -364,9 +335,31 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_filters(coeffs):\n", + " fig = plt.figure(figsize=(15,5))\n", + " ax = fig.add_subplot(1,1,1)\n", + " for coeff in coeffs:\n", + " c = eval(coeff)\n", + " label = '{}: L={:1.2e}, |dL|={:1.2e}'.format(coeff, L(c), np.linalg.norm(dL(X,Y,c)))\n", + " ax.plot(lamb, c, '.-', label=label)\n", + "# np.testing.assert_allclose(np.linalg.norm(c)**2, E, rtol=1e-2)\n", + " ax.set_xlim(lamb[0], lamb[-1])\n", + " ax.set_title('Filter coefficients, M={}, N={}, eps={}'.format(M, N, eps))\n", + " ax.set_xlabel('frequency')\n", + " ax.set_ylabel('amplitude')\n", + " ax.legend(loc='best')\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "if False:\n", @@ -374,6 +367,82 @@ " data = (train_data, train_labels, val_data, val_labels, test_data, test_labels)\n", " utils.grid_search(params, grid_params, *data, model=lambda x: models.cgcnn(L,**x))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Filter visualization (experimental)\n", + "\n", + "**Disclaimer**: left as is, not sure if it works. To be checked before usage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a = np.array([1,2,3])\n", + "print(a[[2,0,1]][:2])\n", + "\n", + "print(mnist.train.labels[0])\n", + "plt.imshow(mnist.train.images[0].reshape(28,28))\n", + "\n", + "a = np.random.permutation(range(len(perm)))\n", + "b = a[perm]\n", + "c = b[idx]\n", + "assert np.all(a == c)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = model_f\n", + "\n", + "sess = tf.Session(graph=model.graph)\n", + "filename = tf.train.latest_checkpoint(os.path.join('..', 'checkpoints', model.dir_name))\n", + "model.op_saver.restore(sess, filename)\n", + "var = model.graph.get_tensor_by_name('conv1/weights' + ':0')\n", + "val = sess.run(var)\n", + "sess.close()\n", + "\n", + "lamb, U = graph.fourier(L[0])\n", + "\n", + "#filters = model_f.get_var('conv1/weights')\n", + "filters = val\n", + "#filt_fourier = filt\n", + "\n", + "i = 6\n", + "\n", + "print(filters.shape)\n", + "plt.figure(figsize=(15,5))\n", + "plt.plot(lamb, filters[:,i,0])\n", + "\n", + "print(lamb[0], lamb[-1])\n", + "\n", + "filt = U.dot(filters[:,i,0])\n", + "plt.figure(figsize=(15,5))\n", + "plt.plot(filt)\n", + "\n", + "print(len(lamb)-28**2)\n", + "indices = np.array(perm) >= 28**2\n", + "\n", + "print(train_data[0,indices])\n", + "\n", + "idx = np.argsort(perm)\n", + "filt = filt[idx]\n", + "plt.figure(figsize=(15,5))\n", + "plt.plot(train_data[0,perm])\n", + "\n", + "plt.figure(figsize=(15,5))\n", + "img = train_data[0,idx][:28**2].reshape(28,28)\n", + "plt.imshow(train_data[0,idx][:28**2].reshape(28,28))\n", + "assert np.allclose(train_data[0,idx][:28**2].reshape(28,28), mnist.train.images[0].reshape(28,28))" + ] } ], "metadata": { diff --git a/nips2016/20news.ipynb b/experiments/2_20news.ipynb similarity index 93% rename from nips2016/20news.ipynb rename to experiments/2_20news.ipynb index dfabbd8..67d26c6 100644 --- a/nips2016/20news.ipynb +++ b/experiments/2_20news.ipynb @@ -12,9 +12,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -36,9 +34,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "flags = tf.app.flags\n", @@ -65,9 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Fetch dataset. 
Scikit-learn already performs some cleaning.\n", @@ -89,9 +83,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Remove short documents.\n", @@ -118,9 +110,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Word embedding\n", @@ -135,9 +125,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Feature selection.\n", @@ -156,9 +144,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "train.normalize(norm='l1')\n", @@ -168,9 +154,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Test dataset.\n", @@ -187,9 +171,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -225,9 +207,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -245,9 +225,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -267,9 +245,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Training set is shuffled already.\n", @@ -291,9 +267,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -303,9 +277,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "common = {}\n", @@ -325,9 +297,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -350,9 +320,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -375,9 +343,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -400,9 +366,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -426,9 +390,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -452,9 +414,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -477,9 +437,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -502,9 +460,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "model_perf.show()" @@ -513,9 +469,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ 
"if False:\n", diff --git a/rcv1.ipynb b/experiments/3_rcv1.ipynb similarity index 93% rename from rcv1.ipynb rename to experiments/3_rcv1.ipynb index 8266209..7bd85d1 100644 --- a/rcv1.ipynb +++ b/experiments/3_rcv1.ipynb @@ -3,9 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -25,9 +23,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "flags = tf.app.flags\n", @@ -58,9 +54,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Fetch dataset from Scikit-learn.\n", @@ -81,9 +75,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Selection of classes.\n", @@ -106,9 +98,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Remove documents with multiple classes.\n", @@ -119,9 +109,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Remove short documents.\n", @@ -136,9 +124,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Feature selection.\n", @@ -157,9 +143,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "#dataset.normalize(norm='l1')\n", @@ -169,9 +153,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Word embedding\n", @@ -186,9 +168,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "perm = np.random.RandomState(seed=42).permutation(dataset.data.shape[0])\n", @@ -218,9 +198,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -238,9 +216,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "assert FLAGS.coarsening_levels is 0\n", @@ -261,9 +237,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Training set is shuffled already.\n", @@ -285,9 +259,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if False:\n", @@ -297,9 +269,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "common = {}\n", @@ -319,9 +289,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -344,9 +312,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -369,9 +335,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", 
@@ -394,9 +358,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -419,9 +381,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if True:\n", @@ -444,9 +404,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "model_perf.show()" diff --git a/experiments/3_rcv1_dev.ipynb b/experiments/3_rcv1_dev.ipynb new file mode 100644 index 0000000..2794a57 --- /dev/null +++ b/experiments/3_rcv1_dev.ipynb @@ -0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sklearn.datasets\n", + "import scipy.sparse\n", + "import matplotlib.pyplot as plt\n", + "import tensorflow as tf\n", + "import os\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "flags = tf.app.flags\n", + "FLAGS = flags.FLAGS\n", + "\n", + "flags.DEFINE_string('dir_data', 'data_rcv1', 'Directory to store data.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**From Dropout (Bruna did the same)**\n", + "\n", + "We took the dataset and split it into 63 classes based on the 63 categories at the second level of the category tree. We removed 11 categories that did not have any data and one category that had only 4 training examples. We also removed one category that covered a huge chunk (25%) of the examples. This left us with 50 classes and 402,738 documents. We divided the documents into equal-sized training and test sets randomly. Each document was represented\n", + "using the 2000 most frequent non-stopwords in the dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get dataset.\n", + "rcv1 = sklearn.datasets.fetch_rcv1('data_rcv1')\n", + "N, C = rcv1.target.shape\n", + "print('N={} documents, C={} classes'.format(N, C))\n", + "\n", + "#def select_classes\n", + "\n", + "# All classes.\n", + "class_names = ['C11', 'C12', 'C13','C14','C15','C151','C1511','C152','C16','C17',\n", + " 'C171','C172','C173','C174','C18','C181','C182','C183','C21','C22',\n", + " 'C23','C24','C31', 'C311','C312','C313','C32','C33','C331','C34',\n", + " 'C41','C411','C42','CCAT','E11', 'E12','E121','E13','E131','E132',\n", + " 'E14','E141','E142','E143','E21', 'E211','E212','E31','E311','E312',\n", + " 'E313','E41','E411','E51','E511','E512','E513','E61','E71','ECAT',\n", + " 'G15','G151','G152','G153','G154','G155','G156','G157','G158','G159',\n", + " 'GCAT','GCRIM','GDEF','GDIP','GDIS','GENT','GENV','GFAS','GHEA',\n", + " 'GJOB','GMIL','GOBIT','GODD','GPOL','GPRO','GREL','GSCI','GSPO',\n", + " 'GTOUR','GVIO','GVOTE','GWEA','GWELF','M11','M12','M13','M131',\n", + " 'M132','M14','M141','M142','M143','MCAT']\n", + "assert len(class_names) == 103 # There are 103 categories according to LYRL2004.\n", + "\n", + "# Second-level classes.\n", + "keep = ['C11','C12','C13','C14','C15','C16','C17','C18','C21','C22','C23','C24',\n", + " 'C31','C32','C33','C34','C41','C42','E11','E12','E13','E14','E21','E31',\n", + " 'E41','E51','E61','E71','G15','GCRIM','GDEF','GDIP','GDIS','GENT','GENV',\n", + " 'GFAS','GHEA','GJOB','GMIL','GOBIT','GODD','GPOL','GPRO','GREL','GSCI',\n", + " 'GSPO','GTOUR','GVIO','GVOTE','GWEA','GWELF','M11','M12','M13','M14']\n", + "assert len(keep) == 55 # There are 55 second-level categories according to LYRL2004.\n", + "keep.remove('C15') # 151785 documents\n", + "keep.remove('GMIL') # 5 documents only\n", + "\n", + "# Construct a lookup table for labels.\n", + "labels_row = []\n", + "labels_col = []\n", + "class_lookup = {}\n", + "for i,name in enumerate(class_names):\n", + " class_lookup[name] = i\n", + "\n", + "# Index of classes to keep.\n", + "idx_keep = np.empty(len(keep), dtype=int) # Integer dtype as these are used as column indices.\n", + "for i,cat in enumerate(keep):\n", + " idx_keep[i] = class_lookup[cat]\n", + "target = rcv1.target[:,idx_keep]\n", + "\n", + "# Number of documents per class.\n", + "def show_doc_per_class(names, target, print_=False):\n", + " docs_per_class = np.array(target.astype(np.uint64).sum(axis=0)).squeeze()\n", + " print('categories ({} assignments in total)'.format(docs_per_class.sum()))\n", + " if print_:\n", + " for i,cat in enumerate(names):\n", + " print(' {:5s}: {:6d} documents'.format(cat, docs_per_class[i]))\n", + " plt.figure(figsize=(17,5))\n", + " plt.plot(sorted(docs_per_class[::-1]),'.')\n", + "show_doc_per_class(rcv1.target_names, rcv1.target)\n", + "show_doc_per_class(keep, target, True)\n", + "\n", + "#def select_documents\n", + "\n", + "# Number of classes per document.\n", + "def show_classes_per_doc(target):\n", + " classes_per_doc = np.array(target.sum(axis=1)).squeeze()\n", + " plt.figure(figsize=(17,5))\n", + " plt.plot(sorted(classes_per_doc[::-1]),'.')\n", + " return classes_per_doc\n", + "classes_per_doc = show_classes_per_doc(rcv1.target)\n", + "classes_per_doc = show_classes_per_doc(target)\n", + "\n", + "target = target[classes_per_doc==1]\n", + "data = rcv1.data[classes_per_doc==1, :]\n", + "\n", + "# Convert labels from indicator form to single value.\n", + "N, C = target.shape\n", + "assert C == len(keep)\n", + "target = target.tocoo()\n", + "target = 
target.col\n", + "assert target.min() == 0\n", + "assert target.max() == C - 1\n", + "\n", + "# Bruna and Dropout used 2 * 201369 = 402738 documents. Probably the difference between v1 and v2.\n", + "print('N = {} documents and C = {} classes left'.format(N, C))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xml.etree.ElementTree as ET\n", + "\n", + "dates = []\n", + "n = 0\n", + "for path, subdirs, files in os.walk('data_rcv1/rcv1/'):\n", + " for file in files:\n", + " if 'newsML.xml' in file:\n", + " root = ET.parse(os.path.join(path, file)).getroot()\n", + " date = root.attrib['date']\n", + " dates.append(date)\n", + " n += 1\n", + "print(n)\n", + "print(len(dates))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xml.etree.ElementTree as ET\n", + "\n", + "root = ET.parse('data_rcv1/rcv1/19960820/2286newsML.xml').getroot()\n", + "date = root.attrib['date']\n", + "\n", + "# Fetch textual content.\n", + "text = root.find('title').text\n", + "for p in root.find('text').findall('p'):\n", + " text = ' '.join((text, p.text))\n", + "print(text)\n", + "\n", + "# Find the labels of a document.\n", + "classes = []\n", + "doc = 0\n", + "for codes in root.find('metadata').findall('codes'):\n", + " if codes.attrib['class'] == 'bip:topics:1.0':\n", + " for code in codes.findall('code'):\n", + " labels_row.append(doc)\n", + " labels_col.append(class_lookup[code.attrib['code']])\n", + " classes.append(code.attrib['code'])\n", + "\n", + "assert len(labels_row) == len(labels_col)\n", + "labels_val = np.ones(len(labels_row), dtype=np.bool)\n", + "labels = scipy.sparse.csr_matrix((labels_val, (labels_row, labels_col)))\n", + "\n", + "print(labels)\n", + "labels.sum()" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "From LYRL2004 Appendix 3\n", + "http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a03-expanded-topics-hierarchy/rcv1.topics.hier.expanded\n", + "\n", + "parent: C1 child: C11 child-description: STRATEGY/PLANS\n", + "parent: C1 child: C12 child-description: LEGAL/JUDICIAL\n", + "parent: C1 child: C13 child-description: REGULATION/POLICY\n", + "parent: C1 child: C14 child-description: SHARE LISTINGS\n", + "parent: C1 child: C15 child-description: PERFORMANCE\n", + "parent: C1 child: C16 child-description: INSOLVENCY/LIQUIDITY\n", + "parent: C1 child: C17 child-description: FUNDING/CAPITAL\n", + "parent: C1 child: C18 child-description: OWNERSHIP CHANGES\n", + "parent: C2 child: C21 child-description: PRODUCTION/SERVICES\n", + "parent: C2 child: C22 child-description: NEW PRODUCTS/SERVICES\n", + "parent: C2 child: C23 child-description: RESEARCH/DEVELOPMENT\n", + "parent: C2 child: C24 child-description: CAPACITY/FACILITIES\n", + "parent: C3 child: C31 child-description: MARKETS/MARKETING\n", + "parent: C3 child: C32 child-description: ADVERTISING/PROMOTION\n", + "parent: C3 child: C33 child-description: CONTRACTS/ORDERS\n", + "parent: C3 child: C34 child-description: MONOPOLIES/COMPETITION\n", + "parent: C4 child: C41 child-description: MANAGEMENT\n", + "parent: C4 child: C42 child-description: LABOUR\n", + "parent: E1 child: E11 child-description: ECONOMIC PERFORMANCE\n", + "parent: E1 child: E12 child-description: MONETARY/ECONOMIC\n", + "parent: E1 child: E13 child-description: INFLATION/PRICES\n", + "parent: E1 child: E14 child-description: CONSUMER FINANCE\n", + "parent: E2 child: E21 child-description: GOVERNMENT FINANCE\n", + "parent: E3 child: E31 child-description: 
OUTPUT/CAPACITY\n", + "parent: E4 child: E41 child-description: EMPLOYMENT/LABOUR\n", + "parent: E5 child: E51 child-description: TRADE/RESERVES\n", + "parent: E6 child: E61 child-description: HOUSING STARTS\n", + "parent: E7 child: E71 child-description: LEADING INDICATORS\n", + "parent: G1 child: G15 child-description: EUROPEAN COMMUNITY\n", + "parent: GCAT child: GCRIM child-description: CRIME, LAW ENFORCEMENT\n", + "parent: GCAT child: GDEF child-description: DEFENCE\n", + "parent: GCAT child: GDIP child-description: INTERNATIONAL RELATIONS\n", + "parent: GCAT child: GDIS child-description: DISASTERS AND ACCIDENTS\n", + "parent: GCAT child: GENT child-description: ARTS, CULTURE, ENTERTAINMENT\n", + "parent: GCAT child: GENV child-description: ENVIRONMENT AND NATURAL WORLD\n", + "parent: GCAT child: GFAS child-description: FASHION\n", + "parent: GCAT child: GHEA child-description: HEALTH\n", + "parent: GCAT child: GJOB child-description: LABOUR ISSUES\n", + "parent: GCAT child: GMIL child-description: MILLENNIUM ISSUES\n", + "parent: GCAT child: GOBIT child-description: OBITUARIES\n", + "parent: GCAT child: GODD child-description: HUMAN INTEREST\n", + "parent: GCAT child: GPOL child-description: DOMESTIC POLITICS\n", + "parent: GCAT child: GPRO child-description: BIOGRAPHIES, PERSONALITIES, PEOPLE\n", + "parent: GCAT child: GREL child-description: RELIGION\n", + "parent: GCAT child: GSCI child-description: SCIENCE AND TECHNOLOGY\n", + "parent: GCAT child: GSPO child-description: SPORTS\n", + "parent: GCAT child: GTOUR child-description: TRAVEL AND TOURISM\n", + "parent: GCAT child: GVIO child-description: WAR, CIVIL WAR\n", + "parent: GCAT child: GVOTE child-description: ELECTIONS\n", + "parent: GCAT child: GWEA child-description: WEATHER\n", + "parent: GCAT child: GWELF child-description: WELFARE, SOCIAL SERVICES\n", + "parent: M1 child: M11 child-description: EQUITY MARKETS\n", + "parent: M1 child: M12 child-description: BOND MARKETS\n", + "parent: M1 child: M13 child-description: MONEY MARKETS\n", + "parent: M1 child: M14 child-description: COMMODITY MARKETS" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/experiments/4_wikipedia_traffic.ipynb b/experiments/4_wikipedia_traffic.ipynb new file mode 100644 index 0000000..0f674ec --- /dev/null +++ b/experiments/4_wikipedia_traffic.ipynb @@ -0,0 +1,617 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Wikipedia Traffic\n", + "\n", + "This notebook aims at modeling user traffic on [Wikipedia](https://wikipedia.org) using a recurrent graph convolutional neural network.\n", + "\n", + "Goal: anomaly detection. Can be used to detect events in the real world. Other applications:\n", + "* intrusion detection on telecommunication networks,\n", + "* anomaly detection on energy networks,\n", + "* accident detection on transportation networks.\n", + "\n", + "Events: Super Bowl, Academy Awards, Grammy, Miss Universe, Golden Globe. Mostly December-February.\n", + "Missed: Charlie Hebdo, Ebola\n", + "\n", + "Network is very large: 5M nodes, 300M edges. Downsampling ideas:\n", + "* Choose a category, e.g. 
science.\n", + "* Take the most active ones.\n", + "* Aggregate pages into modules / communities / super-nodes.\n", + "\n", + "Raw data\n", + "* [Wikimedia SQL dumps](https://dumps.wikimedia.org/enwiki/), to construct the hyperlink graph.\n", + " * Network size: 5M nodes, 300M edges.\n", + "* [Pagecounts](https://dumps.wikimedia.org/other/pagecounts-all-sites/) as activations on the graph.\n", + " * Data from 2014-09-23 0h to 2015-06-05 22h.\n", + " * 6142 hours in total." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import os\n", + "import datetime\n", + "\n", + "import IPython.display as ipd\n", + "from tqdm import tqdm_notebook\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import graph_tool.all as gt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "#WIKI_RAW = os.environ.get('WIKI_RAW') # Downloaded from dumps.wikimedia.org.\n", + "#WIKI_CLEAN = os.environ.get('WIKI_CLEAN') # Processed by Kirell Benzi.\n", + "\n", + "DATA_DIR = os.path.join('..', 'data', 'wikipedia')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.set_context(\"notebook\", font_scale=1.5)\n", + "plt.rcParams['figure.figsize'] = (17, 5)\n", + "plt.rcParams['agg.path.chunksize'] = 10000 # OverflowError when plotting large series." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1 Hyperlink graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "graph = gt.load_graph(os.path.join(DATA_DIR, 'enwiki-20150403-graph.gt'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "graph.is_directed()\n", + "#graph.set_directed(False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def print_graph(graph):\n", + " print('{} vertices, {} edges'.format(\n", + " graph.num_vertices(), graph.num_edges()))\n", + "\n", + "print_graph(graph)\n", + "graph.list_properties()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "idx = 42\n", + "page_title = graph.vertex_properties['page_title'][idx]\n", + "page_id = graph.vertex_properties['page_id'][idx]\n", + "print('{}: {}'.format(page_id, page_title))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_degree_distribution(graph):\n", + " hist = gt.vertex_hist(graph, 'total')\n", + " plt.loglog(hist[1][:-1], hist[0])\n", + " plt.xlabel('#edges')\n", + " plt.ylabel('#nodes')\n", + " #plt.savefig('degree_distribution.pdf')\n", + "plot_degree_distribution(graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2 Pages\n", + "\n", + "A lot of pages in `pagecounts` are redirections to actual pages. We need to merge the hits." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filepath = os.path.join(DATA_DIR, 'enwiki-20150403-page-redirect.csv.gz')\n", + "redirect = pd.read_csv(filepath, compression='gzip', sep='|', encoding='utf-8', quoting=3, index_col=1)\n", + "\n", + "redirect.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#assert len(redirect) == len(redirect['page_id'].unique())\n", + "print('{:.2e} unique pages, {:.2e} pages including redirections'.format(\n", + " len(redirect['fix_page_id'].unique()),\n", + " len(redirect)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "redirect.loc[page_id]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def id2title(page_id):\n", + " page_title = redirect.at[page_id, 'fix_page_title']\n", + " #page_title = graph.vp['page_title'][id]\n", + " print('{}: https://en.wikipedia.org/?curid={}'.format(page_title, page_id))\n", + " return page_title\n", + "id2title(12)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def find_in_title(string):\n", + "\n", + " def find(page_title, string):\n", + " try:\n", + " return string.lower() in page_title.lower()\n", + " except AttributeError: # Some titles are NaN (floats).\n", + " return False\n", + "\n", + " #b = redirect['fix_page_title'].apply(find, string=string)\n", + " b = redirect['page_title'].apply(find, string=string)\n", + " #return redirect[b]\n", + " return redirect[b & (redirect['is_redirect'] == 0)]\n", + "\n", + "find_in_title('ebola')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3 Page views / counts\n", + "\n", + "The graph has 4M nodes but a lot of pages are not viewed much. `signal_500.h5` lists only 118k pages." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Kirell's signal which includes views when greater than 500.\n", + "filepath = os.path.join(DATA_DIR, 'signal_500.h5')\n", + "signal = pd.read_hdf(filepath, 'data')\n", + "signal['count_views'].plot(kind='hist', logy=True)\n", + "print(len(signal), len(signal['page_id'].unique()), len(signal['layer'].unique()), signal['count_views'].max())\n", + "signal.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_pagecounts(date):\n", + " filename = 'pagecounts-{:4d}{:02d}{:02d}-{:02d}0000.csv.gz'.format(date.year, date.month, date.day, date.hour)\n", + " filepath = os.path.join('..', 'data', 'wikipedia', 'pagecounts_clean', filename)\n", + " return pd.read_csv(filepath, compression='gzip', index_col=0, squeeze=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "START = datetime.datetime(2014, 9, 23, 2)\n", + "END = datetime.datetime(2014, 9, 24, 3)\n", + "END = datetime.datetime(2015, 6, 5, 20)\n", + "dates = pd.date_range(START, END, freq='H')\n", + "\n", + "activations_tot = pd.Series(\n", + " data=0,\n", + " index=graph.vp['page_id'].get_array(),\n", + " dtype=np.int64\n", + ")\n", + "\n", + "for date in tqdm_notebook(dates):\n", + " pagecounts = get_pagecounts(date)\n", + " activations_tot += pagecounts.reindex(activations_tot.index).fillna(0).astype(np.int32)\n", + "\n", + "print(activations_tot.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The largest is the main page.\n", + "plt.semilogy(np.sort(activations_tot.values)[::-1])\n", + "\n", + "main_page = activations_tot.argmax()\n", + "print('{} ({}): {:.2e} views in total'.format(id2title(main_page), main_page, activations_tot[main_page]))\n", + "\n", + "print('{:.2e} views in total'.format(activations_tot.sum()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Power law.\n", + "activations_tot.drop(main_page).plot(kind='hist', logy=True, bins=100);\n", + "plt.figure()\n", + "activations_tot.drop(main_page)[activations_tot < 1e7].plot(kind='hist', logy=True, bins=100);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MIN_AVG_VIEWS = 100\n", + "\n", + "keep = activations_tot.index[activations_tot >= MIN_AVG_VIEWS * len(dates)]\n", + "print('{} pages have more than {} views in total ({:.0f} per hour on average)'.format(\n", + " len(keep), MIN_AVG_VIEWS * len(dates), MIN_AVG_VIEWS))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "activations = pd.DataFrame(\n", + " data=0,\n", + " index=keep,\n", + " columns=dates,\n", + " dtype=np.int32\n", + ")\n", + "\n", + "for date in tqdm_notebook(dates):\n", + " pagecounts = get_pagecounts(date)\n", + " activations[date] = pagecounts.reindex(activations.index).fillna(0).astype(np.int32)\n", + "\n", + "activations.sort_index(inplace=True)\n", + "\n", + "filepath = os.path.join('..', 'data', 'wikipedia', 'activations_{}.h5'.format(MIN_AVG_VIEWS))\n", + "activations.to_hdf(filepath, 'activations')\n", + "\n", + "ipd.display(activations.head())\n", + "ipd.display(activations.info())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* 
Predictable fluctuations with unpredictable spikes. Those are outliers.\n", + "* Anomalies should be outliers persisting for many hours." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DROP = [\n", + " 15580374, # Main page draws ~10% traffic.\n", + " 42727860, # Undefined has the largest peaks of traffic while being inactive after 2014-10.\n", + "# 8063851, # Feynman point has a very large traffic peak which is probably an error.\n", + "# 2697304, # Gold_as_an_investment has many traffic peaks.\n", + "]\n", + "\n", + "def load_activations(filepath, drop=DROP):\n", + " activations = pd.read_hdf(filepath, 'activations')\n", + "\n", + " if drop:\n", + " activations.drop(drop, inplace=True)\n", + " \n", + " print('activations: {} page ids x {} hours = {}'.format(*activations.shape, activations.size))\n", + " return activations\n", + "\n", + "activations = load_activations(filepath)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('Max of {0} views at page id {2} and time {1}'.format(\n", + " activations.unstack().max(), *activations.unstack().argmax())) \n", + "plt.plot(activations.values.reshape(-1));" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(activations.values.reshape(-1), bins=100, log=True);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_activation(page_id):\n", + " page_title = id2title(page_id)\n", + " ax = activations.loc[page_id].plot(label='{} ({})'.format(page_title, page_id), logy=True)\n", + " ax.set_ylabel('#views per hour');\n", + " ax.legend()\n", + " #plt.savefig('{}_{}.png'.format(page_id, page_title.lower()), dpi=300)\n", + " #plt.savefig('{}_{}.pdf'.format(page_id, page_title.lower()))\n", + "\n", + "# Events.\n", + "plot_activation(2251390) # Charlie Hebdo\n", + "plot_activation(44969225) # Charlie Hebdo shooting\n", + "plt.figure()\n", + "plot_activation(27718) # Super Bowl\n", + "plt.figure()\n", + "#plot_activation(40817806) # Ebola\n", + "plot_activation(44635) # Grammy\n", + "plot_activation(150340) # Miss Universe\n", + "#plot_activation(324) # Academy Awards\n", + "\n", + "# Neighbors of Charlie Hebdo.\n", + "#plot_activation(44969610) # Charb\n", + "#plot_activation(206682) # Caricature\n", + "#plot_activation(15012) # Islamism\n", + "#plot_activation(7826589) # Jihadism\n", + "#plot_activation(50100) # Journalist\n", + "\n", + "# Remarkable things.\n", + "#plot_activation(25)\n", + "#plot_activation(15580374) # Main Page --> largest traffic (~10%)\n", + "#plot_activation(42727860) # Undefined --> hits only before mid-oct 2014\n", + "#plot_activation(670) # Alphabet --> strange drop\n", + "#plot_activation(8063851) # Shall distinguish outliers (counting errors?) from real events\n", + "#plot_activation(2697304) # Lots of peaks --> correlated with fluctuations on market?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4 Matching graph & activations\n", + "\n", + "Further analysis\n", + "* Ratio of in / out neighbors.\n", + "* Proportion of bidirectional hyperlinks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "activations = load_activations(os.path.join(DATA_DIR, 'activations_100.h5'))\n", + "graph = gt.load_graph(os.path.join(DATA_DIR, 'enwiki-20150403-graph.gt'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_diameter(graph):\n", + " d = gt.pseudo_diameter(graph)[0]\n", + " print('Pseudo-diameter: {}'.format(int(d)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_graph(graph)\n", + "compute_diameter(graph)\n", + "\n", + "mask = np.in1d(graph.vp['page_id'].get_array(), activations.index)\n", + "graph = gt.GraphView(graph, vfilt=mask)\n", + "print_graph(graph)\n", + "\n", + "l = gt.label_largest_component(graph)\n", + "graph = gt.GraphView(graph, vfilt=l)\n", + "print_graph(graph)\n", + "compute_diameter(graph)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "graph = gt.Graph(graph, prune=True)\n", + "\n", + "def sort_vertices(graph, vp):\n", + " sort = np.argsort(vp.get_array())\n", + " sort = np.argsort(sort)\n", + " sort = graph.new_vertex_property('int64_t', sort)\n", + " return gt.Graph(graph, vorder=sort)\n", + "\n", + "graph = sort_vertices(graph, graph.vp['page_id'])\n", + "# directed=False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "activations = activations.loc[graph.vp['page_id'].get_array()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.testing.assert_equal(graph.vp['page_id'].get_array(), activations.index)\n", + "\n", + "graph.save(os.path.join('..', 'data', 'wikipedia', 'graph.gt'))\n", + "graph.save(os.path.join('..', 'data', 'wikipedia', 'graph.graphml'))\n", + "activations.to_hdf(os.path.join('..', 'data', 'wikipedia', 'activations.h5'), 'activations')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#gt.sfdp_layout()\n", + "#gt.graph_draw(graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5 Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "graph = gt.load_graph(os.path.join(DATA_DIR, 'graph.gt'))\n", + "activations = load_activations(os.path.join(DATA_DIR, 'activations.h5'), drop=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_degree_distribution(graph)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_adjacency(graph, ax=None):\n", + " A = gt.adjacency(graph)\n", + " if not ax:\n", + " fig, ax = plt.subplots(figsize=(10, 10))\n", + " ax.spy(A[:10000,:10000], markersize=0.2)\n", + " ax.set_title('{} nodes, {} edges ({:.2%})'.format(\n", + " A.shape[0], A.nnz, A.nnz / np.multiply(*A.shape)))\n", + "\n", + "plot_adjacency(graph)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def order_adjacency_plot(graph, ax=None, **kwargs):\n", + " state = gt.minimize_blockmodel_dl(graph, **kwargs)\n", + " graph = sort_vertices(graph, state.get_blocks())\n", + " plot_adjacency(graph, ax)\n", + "\n", + "fig, axes = 
plt.subplots(1, 3)\n", + "for ax, n_blocks in zip(axes, [10, 20, 30]):\n", + " order_adjacency_plot(graph, ax=ax, B_max=n_blocks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(activations.values.reshape(-1), bins=100, log=True);\n", + "plt.figure()\n", + "plt.hist(activations.sum(axis=1).values.reshape(-1), bins=100, log=True);" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/nips2016/makefile b/experiments/makefile similarity index 100% rename from nips2016/makefile rename to experiments/makefile diff --git a/lib/layers.py b/lib/layers.py new file mode 100644 index 0000000..28237dc --- /dev/null +++ b/lib/layers.py @@ -0,0 +1,259 @@ +from . import graph + +import numpy as np +import scipy.sparse +import tensorflow as tf + + +class Layer: + """Base class for layers. The variable helpers below are a minimal version of those in models.base_model (summaries and weight decay omitted); subclasses rely on them.""" + + def _weight_variable(self, shape, regularization=True): + initial = tf.truncated_normal_initializer(0, 0.1) + return tf.get_variable('weights', shape, tf.float32, initializer=initial) + + def _bias_variable(self, shape, regularization=True): + initial = tf.constant_initializer(0.1) + return tf.get_variable('bias', shape, tf.float32, initializer=initial) + + +class Fourier(Layer): + """Graph convolutional layers that filter in Fourier.""" + + def __init__(self, Fout, K): + self.Fout = Fout + self.K = K + + def __call__(self, x, L): + assert self.K == L.shape[0] # artificial but useful to compute number of parameters + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + # Fourier basis + _, U = graph.fourier(L) + U = tf.constant(U.T, dtype=tf.float32) + # Weights + W = self._weight_variable([M, self.Fout, Fin], regularization=False) + return self._filter_in_fourier(x, L, self.Fout, self.K, U, W) + + def _filter_in_fourier(self, x, L, Fout, K, U, W): + # TODO: N x F x M would avoid the permutations + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + x = tf.transpose(x, perm=[1, 2, 0]) # M x Fin x N + # Transform to Fourier domain + x = tf.reshape(x, [M, Fin*N]) # M x Fin*N + x = tf.matmul(U, x) # M x Fin*N + x = tf.reshape(x, [M, Fin, N]) # M x Fin x N + # Filter + x = tf.matmul(W, x) # for each feature + x = tf.transpose(x) # N x Fout x M + x = tf.reshape(x, [N*Fout, M]) # N*Fout x M + # Transform back to graph domain + x = tf.matmul(x, U) # N*Fout x M + x = tf.reshape(x, [N, Fout, M]) # N x Fout x M + return tf.transpose(x, perm=[0, 2, 1]) # N x M x Fout + + +class Spline(Fourier): + + def __call__(self, x, L): + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + # Fourier basis + lamb, U = graph.fourier(L) + U = tf.constant(U.T, dtype=tf.float32) # M x M + # Spline basis + B = self._bspline_basis(self.K, lamb, degree=3) # M x K + # B = _bspline_basis(K, len(lamb), degree=3) # M x K + B = tf.constant(B, dtype=tf.float32) + # Weights + W = self._weight_variable([self.K, self.Fout*Fin], regularization=False) + W = tf.matmul(B, W) # M x Fout*Fin + W = tf.reshape(W, [M, self.Fout, Fin]) + return self._filter_in_fourier(x, L, self.Fout, self.K, U, W) + + def _bspline_basis(self, K, x, degree=3): + """ + Return the B-spline basis. + + K: number of control points. + x: evaluation points + or number of evenly distributed evaluation points. + degree: degree of the spline. Cubic spline by default. + """ + if np.isscalar(x): + x = np.linspace(0, 1, x) + + # Evenly distributed knot vectors. + kv1 = x.min() * np.ones(degree) + kv2 = np.linspace(x.min(), x.max(), K-degree+1) + kv3 = x.max() * np.ones(degree) + kv = np.concatenate((kv1, kv2, kv3)) + + # Cox - DeBoor recursive function to compute one spline over x. + def cox_deboor(k, d): + # Test for end conditions, the rectangular degree zero spline. 
+ if (d == 0): + return ((x - kv[k] >= 0) & (x - kv[k + 1] < 0)).astype(int) + + denom1 = kv[k + d] - kv[k] + term1 = 0 + if denom1 > 0: + term1 = ((x - kv[k]) / denom1) * cox_deboor(k, d - 1) + + denom2 = kv[k + d + 1] - kv[k + 1] + term2 = 0 + if denom2 > 0: + term2 = ((-(x - kv[k + d + 1]) / denom2) * cox_deboor(k + 1, d - 1)) + + return term1 + term2 + + # Compute basis for each point + basis = np.column_stack([cox_deboor(k, degree) for k in range(K)]) + basis[-1, -1] = 1 + return basis + + +class Chebyshev(Layer): + + def __init__(self, Fout, K): + self.Fout = Fout + self.K = K + + +class Chebyshev2(Chebyshev): + + def __call__(self, x, L): + """ + Filtering with Chebyshev interpolation + Implementation: numpy. + + Data: x of size N x M x F + N: number of signals + M: number of vertices + F: number of features per signal per vertex + """ + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + # Rescale Laplacian. Copy to not modify the shared L. + L = scipy.sparse.csr_matrix(L) + L = graph.rescale_L(L, lmax=2) + # Transform to Chebyshev basis + x = tf.transpose(x, perm=[1, 2, 0]) # M x Fin x N + x = tf.reshape(x, [M, Fin*N]) # M x Fin*N + def chebyshev(x): + return graph.chebyshev(L, x, self.K) + x = tf.py_func(chebyshev, [x], [tf.float32])[0] # K x M x Fin*N + x = tf.reshape(x, [self.K, M, Fin, N]) # K x M x Fin x N + x = tf.transpose(x, perm=[3, 1, 2, 0]) # N x M x Fin x K + x = tf.reshape(x, [N*M, Fin*self.K]) # N*M x Fin*K + # Filter: Fin*Fout filters of order K, i.e. one filterbank per feature. + W = self._weight_variable([Fin*self.K, self.Fout], regularization=False) + x = tf.matmul(x, W) # N*M x Fout + return tf.reshape(x, [N, M, self.Fout]) # N x M x Fout + + +class Chebyshev5(Chebyshev): + + def __call__(self, x, L): + N, M, Fin = x.get_shape() + N, M, Fin = int(N), int(M), int(Fin) + # Rescale Laplacian and store as a TF sparse tensor. Copy to not modify the shared L. + L = scipy.sparse.csr_matrix(L) + L = graph.rescale_L(L, lmax=2) + L = L.tocoo() + indices = np.column_stack((L.row, L.col)) + L = tf.SparseTensor(indices, L.data, L.shape) + L = tf.sparse_reorder(L) + # Transform to Chebyshev basis + x0 = tf.transpose(x, perm=[1, 2, 0]) # M x Fin x N + x0 = tf.reshape(x0, [M, Fin*N]) # M x Fin*N + x = tf.expand_dims(x0, 0) # 1 x M x Fin*N + def concat(x, x_): + x_ = tf.expand_dims(x_, 0) # 1 x M x Fin*N + return tf.concat([x, x_], axis=0) # K x M x Fin*N + if self.K > 1: + x1 = tf.sparse_tensor_dense_matmul(L, x0) + x = concat(x, x1) + for k in range(2, self.K): + x2 = 2 * tf.sparse_tensor_dense_matmul(L, x1) - x0 # M x Fin*N + x = concat(x, x2) + x0, x1 = x1, x2 + x = tf.reshape(x, [self.K, M, Fin, N]) # K x M x Fin x N + x = tf.transpose(x, perm=[3, 1, 2, 0]) # N x M x Fin x K + x = tf.reshape(x, [N*M, Fin*self.K]) # N*M x Fin*K + # Filter: Fin*Fout filters of order K, i.e. one filterbank per feature pair. + W = self._weight_variable([Fin*self.K, self.Fout], regularization=False) + x = tf.matmul(x, W) # N*M x Fout + return tf.reshape(x, [N, M, self.Fout]) # N x M x Fout + + +class Bias(Layer): + pass + + +class Bias1Relu(Bias): + """Bias and ReLU. One bias per filter.""" + def __call__(self, x): + N, M, F = x.get_shape() + b = self._bias_variable([1, 1, int(F)], regularization=False) + return tf.nn.relu(x + b) + + +class Bias2Relu(Bias): + """Bias and ReLU. 
One bias per vertex per filter.""" + def __call__(self, x): + N, M, F = x.get_shape() + b = self._bias_variable([1, int(M), int(F)], regularization=False) + return tf.nn.relu(x + b) + + +class Pooling(Layer): + def __init__(self, p): + self.p = p + + +class MaxPooling(Pooling): + def __call__(self, x): + """Max pooling of size p. Should be a power of 2.""" + if self.p > 1: + x = tf.expand_dims(x, 3) # N x M x F x 1 + x = tf.nn.max_pool(x, ksize=[1,self.p,1,1], strides=[1,self.p,1,1], padding='SAME') + #tf.maximum + return tf.squeeze(x, [3]) # N x M/p x F + else: + return x + + +class AvgPooling(Pooling): + def __call__(self, x): + """Average pooling of size p. Should be a power of 2.""" + if self.p > 1: + x = tf.expand_dims(x, 3) # N x M x F x 1 + x = tf.nn.avg_pool(x, ksize=[1,self.p,1,1], strides=[1,self.p,1,1], padding='SAME') + return tf.squeeze(x, [3]) # N x M/p x F + else: + return x + + +class Dense(Layer): + + def __init__(self, Mout, relu=True): + self.Mout = Mout + self.relu = relu + + def __call__(self, x): + """Fully connected layer with Mout features.""" + N, Min = x.get_shape() + W = self._weight_variable([int(Min), self.Mout], regularization=True) + b = self._bias_variable([self.Mout], regularization=True) + x = tf.matmul(x, W) + b + return tf.nn.relu(x) if self.relu else x + + +class RNN(Layer): + pass + + +class LSTM(RNN): + pass + + +class ConvLSTM(RNN): + pass + + +class GRU(RNN): + pass diff --git a/requirements.txt b/requirements.txt index 8217513..7193ed6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,69 @@ -numpy -scipy -scikit-learn -matplotlib +# Version numbers have been retrieved from a range of machines and environments. +# Take them with a grain of salt. -gensim -tensorflow-gpu -#tensorflow +# Direct dependencies +#python==3.5 # 3.4 / 3.6 +#pip==1.5.4 +#setuptools==21.0.0 +numpy==1.11.0 # 1.12.1 +scipy==0.17.0 # 0.19.0 +pandas==0.20.0 +scikit-learn==0.18.1 +matplotlib==1.5.1 # 2.0.1 +seaborn==0.7.1 +tqdm==4.11.2 +gensim==2.1.0 # 0.12.4 / 2.0.0 # Only for NLP experiments. +#graph-tool==2.26 # Cannot be installed with pip. +tensorflow-gpu==1.1.0 # 0.8.0 # Or tensorflow if not running on GPU. +#networkx==1.11 # Only considered at some point. +#networkit==4.2 # Only considered at some point. -jupyter -ipython +# Dependencies of the above. +boto==2.46.1 # 2.40.0 +bz2file==0.98 +cycler==0.10.0 # 0.9.0 +protobuf==3.3.0 # 3.0.0 +pyparsing==2.2.0 # 2.1.4 +python-dateutil==2.6.0 # 2.5.3 +pytz==2016.4 # 2017.2 +requests==2.13.0 # 2.9.1 / 2.10.0 +six==1.10.0 +smart-open==1.5.2 # 1.3.3 +Werkzeug==0.12.1 + +# Jupyter notebook and its dependencies. +notebook==5.0.0 # 4.2.0 +bleach==2.0.0 # 3.1.1 +decorator==4.0.9 # 4.0.6 / 4.0.11 / 4.4.1 +entrypoints==0.2.2 +html5lib==0.999999999 +ipykernel==4.6.1 # 4.3.1 +ipython==6.0.0 # 4.2.0 +ipython-genutils==0.2.0 # 0.1.0 +jedi==0.10.2 +Jinja2==2.8 # 2.9.6 +jsonschema==2.6.0 # 2.5.1 +jupyter-client==5.0.1 # 4.2.2 +jupyter-core==4.3.0 # 4.1.0 +MarkupSafe==0.23 # 1.0 +mistune==0.7.4 +nbconvert==5.1.1 # 4.2.0 +nbformat==4.3.0 # 4.0.1 +pandocfilters==1.4.1 +pexpect==4.2.1 # 4.0.1 +pickleshare==0.7.4 # 0.7.2 +prompt-toolkit==1.0.10 +ptyprocess==0.5.1 +Pygments==2.2.0 # 2.1.3 +pyzmq==16.0.2 # 15.2.0 +simplegeneric==0.8.1 +terminado==0.6 +testpath==0.3 +tornado==4.3 # 4.2.1 / 4.4.2 / 4.5.1 +traitlets==4.3.2 # 4.2.1 +wcwidth==0.1.7 +webencodings==0.5.1 + +# dotenv and its dependency. 
+#python-dotenv==0.6.4 +#click==6.7 diff --git a/trials/1_learning_filters.ipynb b/trials/1_learning_filters.ipynb index 7f29d95..33ed47b 100644 --- a/trials/1_learning_filters.ipynb +++ b/trials/1_learning_filters.ipynb @@ -2,9 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "# Trial 1: learning graph filters\n", "\n", @@ -24,9 +22,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import time\n", @@ -39,9 +35,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "## Problem setting\n", "\n", @@ -54,9 +48,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "M = 100 # nodes\n", @@ -142,9 +134,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def laplacian(W, normalized=True):\n", @@ -190,9 +180,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def fourier(L):\n", @@ -264,9 +252,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def gen_filter(type='step', t=2):\n", @@ -333,9 +319,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "N = 200 # signals\n", @@ -389,9 +373,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def L(c):\n", @@ -434,9 +416,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -490,9 +470,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sgd(c0, L, dL, learning_rate=.1, batch_size=100, crit=1e-3, maxit=100, window=10):\n", @@ -541,9 +519,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sgd_plot_convergence(c0, L, dL, params, crit, maxit):\n", @@ -619,9 +595,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def plot_filters(coeffs):\n", @@ -661,9 +635,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "K = 5\n", @@ -722,9 +694,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "K = 10\n", @@ -788,9 +758,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def polynomial_order(K):\n", @@ -851,9 +819,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "K = 15\n", @@ -880,9 +846,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def filter_chebyshev(X, c):\n", @@ -924,9 +888,7 @@ { "cell_type": "code", 
"execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -953,9 +915,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "c0 = np.random.uniform(0, 1, K)\n", @@ -975,9 +935,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def plot_coefficients(coeffs):\n", @@ -1037,9 +995,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def rescale_L(L):\n", @@ -1088,9 +1044,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def eval_clenshaw(x, c):\n", @@ -1136,9 +1090,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test(c):\n", @@ -1187,9 +1139,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def vectorize(Xt, Y):\n", @@ -1237,9 +1187,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def cheby_coeff_direct(X, Y, K, svd=False):\n", @@ -1269,9 +1217,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = []\n", @@ -1298,9 +1244,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "plot_coefficients(['c_crs', 'c_crd', 'c_cro', 'c_cs', 'c_co', 'c_cg'])\n", @@ -1319,9 +1263,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lanczos(L, X, K):\n", @@ -1468,9 +1410,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lanczos_basis_eval_f(L, X, K):\n", @@ -1554,9 +1494,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lanczos_basis_eval(L, X, K, ret_q=False, impl=2):\n", @@ -1645,9 +1583,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "c0 = np.random.uniform(0, 1, K)\n", @@ -1666,9 +1602,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "plot_coefficients(['c_ls', 'c_ld', 'c_lo', 'c_lf'])" @@ -1686,9 +1620,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def c_l(n):\n", @@ -1722,9 +1654,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def polynomial_order(K, step=1):\n", @@ -1799,25 +1729,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - 
"nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, + "metadata": {}, "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/trials/2_classification.ipynb b/trials/2_classification.ipynb index d06d46d..03b6105 100644 --- a/trials/2_classification.ipynb +++ b/trials/2_classification.ipynb @@ -12,9 +12,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import time\n", @@ -50,9 +48,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def mnist(a, b, N):\n", @@ -109,9 +105,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test_sklearn(tauR):\n", @@ -145,9 +139,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test_optim(clf, X, y, ax=None):\n", @@ -177,9 +169,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class rls:\n", @@ -222,9 +212,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "t_start = time.process_time()\n", @@ -246,9 +234,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def lanczos(L, X, K):\n", @@ -301,9 +287,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test():\n", @@ -350,9 +334,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class gflc_noweights:\n", @@ -417,10 +399,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false, - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "class gflc_weights():\n", @@ -516,9 +495,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "class gflc_split():\n", @@ -637,9 +614,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "lamb, U = graph.fourier(L)\n", @@ -649,9 +624,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def plot_filters(C, spectrum=False):\n", @@ -700,9 +673,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def plot_features(C, x):\n", @@ -741,9 +712,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def scorer(clf, X, y):\n", @@ -758,9 +727,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def perf(clf, nfolds=3):\n", @@ -813,9 +780,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def cross_validation(clf, nfolds, nvalidations):\n", @@ -839,9 +804,7 @@ 
{ "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def test_classification(clf, params, param, values, nfolds=10, nvalidations=1):\n", @@ -867,9 +830,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "test_classification(rls, {}, 'tauR', [1e8,1e7,1e6,1e5,1e4,1e3,1e-5,1e-8], 10, 10)" @@ -878,9 +839,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':1, 'K':2, 'tauR':1e3, 'niter':5, 'algo':'direct'}\n", @@ -890,9 +849,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':10, 'tauR':1e4, 'niter':5, 'algo':'direct'}\n", @@ -902,9 +859,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':4, 'tauR':1e4, 'niter':5, 'algo':'direct'}\n", @@ -921,9 +876,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "Xfull = X" @@ -932,9 +885,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "def sample(X, p, seed=None):\n", @@ -984,9 +935,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "#clf_weights = gflc_weights(F=3, K=4, tauR=1e-3, niter=5, algo='direct')\n", @@ -997,9 +946,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "#test_classification(rls, {}, 'tauR', [1e1,1e0])\n", @@ -1010,9 +957,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "test_classification(rls, {}, 'tauR', [1e8,1e7,1e6,1e5,1e4,1e3,1e-5,1e-8], 10, 10)" @@ -1021,9 +966,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':2, 'tauR':1e3, 'niter':5, 'algo':'direct'}\n", @@ -1033,9 +976,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':10, 'tauR':1e5, 'niter':5, 'algo':'direct'}\n", @@ -1045,9 +986,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = {'F':2, 'K':4, 'tauR':1e5, 'niter':5, 'algo':'direct'}\n", @@ -1055,25 +994,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, + "metadata": {}, "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/trials/3_tensorflow.ipynb b/trials/3_tensorflow.ipynb index 361ece6..21de92a 100644 --- a/trials/3_tensorflow.ipynb +++ b/trials/3_tensorflow.ipynb @@ -12,9 +12,7 @@ { "cell_type": 
"code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf" @@ -30,9 +28,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from tensorflow.examples.tutorials.mnist import input_data\n", @@ -51,9 +47,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "x = tf.placeholder(tf.float32, [None, 784])\n", @@ -72,9 +66,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "y_ = tf.placeholder(tf.float32, [None, 10])\n", @@ -100,9 +92,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))\n", @@ -111,25 +101,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, + "metadata": {}, "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/trials/4_coarsening.ipynb b/trials/4_coarsening.ipynb index 414b138..46924b5 100644 --- a/trials/4_coarsening.ipynb +++ b/trials/4_coarsening.ipynb @@ -2,9 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "# Trial 4: graph coarsening\n", "\n", @@ -27,9 +25,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import os\n", @@ -41,9 +37,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "if False:\n", @@ -75,9 +69,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# INPUT\n", @@ -170,9 +162,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "#http://nbviewer.ipython.org/gist/Midnighter/9992103\n", @@ -202,9 +192,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# Coarsen a graph given by rr,cc,vv. 
rr is assumed to be ordered\n", @@ -258,9 +246,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "maxsize = 200\n", @@ -299,9 +285,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import sys\n", @@ -387,9 +371,7 @@ }, { "cell_type": "raw", - "metadata": { - "collapsed": true - }, + "metadata": {}, "source": [ "# Matlab results\n", "\n", @@ -458,25 +440,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, + "metadata": {}, "nbformat": 4, - "nbformat_minor": 0 -} + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/trials/5_graph_tool.ipynb b/trials/5_graph_tool.ipynb new file mode 100644 index 0000000..fad0765 --- /dev/null +++ b/trials/5_graph_tool.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trial 5: graph-tool\n", + "\n", + "Learn and experiment with [graph-tool](https://graph-tool.skewed.de).\n", + "\n", + "Alternatives for graph analysis:\n", + "* [NetworkX](http://networkx.github.io)\n", + "* [NetworKit](https://networkit.iti.kit.edu)\n", + "* [igraph](http://igraph.org)\n", + "* [GraphLab](https://turi.com)\n", + "* [GraphX](https://spark.apache.org/graphx)\n", + "* [Giraph](https://giraph.apache.org)\n", + "\n", + "Alternatives for graph visualization:\n", + "* [Gephi](https://gephi.org)\n", + "* [Graphviz](http://www.graphviz.org)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "import graph_tool.all as gt\n", + "#import networkx as nx\n", + "#import networkit as nk\n", + "\n", + "gt.openmp_enabled(), gt.openmp_get_num_threads()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Graph filters and plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g, pos = gt.triangulation(np.random.random_sample((500, 2)) * 4, type='delaunay')\n", + "\n", + "tree = gt.min_spanning_tree(g)\n", + "tv = gt.GraphView(g, efilt=tree)\n", + "\n", + "bv, be = gt.betweenness(tv)\n", + "be.a /= be.a.max() / 5\n", + "gt.graph_draw(tv, pos, vertex_fill_color=bv, edge_pen_width=be);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gt.adjacency(g)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#g = nk.readGraph('graph.gt', nk.Format.GraphToolBinary)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Graph models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = gt.collection.data['football']\n", + "state = gt.minimize_blockmodel_dl(g, deg_corr=False)\n", + "state.draw(pos=g.vp.pos)" + ] + },
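Since `gt.adjacency(g)` above already returns a SciPy sparse matrix, a graph built with graph-tool can feed the pipeline used elsewhere in this repository. A sketch (assuming `lib/graph.py` is importable, as it is from usage.ipynb):

```python
import scipy.sparse

from lib import graph  # repository helper

W = scipy.sparse.csr_matrix(gt.adjacency(g))  # weighted adjacency from graph-tool
W.setdiag(0)            # the repository's Laplacians assume no self-loops
W.eliminate_zeros()
L = graph.laplacian(W, normalized=True)
```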
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "g = gt.collection.data['celegansneural']\n", + "state = gt.minimize_nested_blockmodel_dl(g, deg_corr=True)\n", + "state.draw()\n", + "state.print_summary()" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/trials/6_structured_sequence.ipynb b/trials/6_structured_sequence.ipynb new file mode 100644 index 0000000..66a5f8f --- /dev/null +++ b/trials/6_structured_sequence.ipynb @@ -0,0 +1,467 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trial 6: structured sequence modeling\n", + "\n", + "* Create simple parametric time series and try to model them.\n", + "* Add structure by constructing a graph between the series and see how it improves.\n", + "* Usage of `tflearn` inspired by [How to do time series prediction using RNNs, TensorFlow and Cloud ML Engine](https://medium.com/google-cloud/how-to-do-time-series-prediction-using-rnns-and-tensorflow-and-cloud-ml-engine-2ad2eeb189e8)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import os\n", + "import shutil\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "import tensorflow as tf\n", + "import tensorflow.contrib.learn as tflearn\n", + "\n", + "plt.rcParams['figure.figsize'] = (17, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = os.path.join('..', 'data', 'structured_sequence_trial')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1 Data generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SEQ_LEN = 1000\n", + "N_SEQ = 40\n", + "\n", + "def create_time_series(seq_len, random_state):\n", + " freq = random_state.uniform(0.1, 0.6)\n", + " ampl = random_state.uniform(0.5, 1.5)\n", + " offset = random_state.uniform(-1, 1)\n", + " return np.sin(np.arange(seq_len) * freq) * ampl + offset\n", + "\n", + "rs = np.random.RandomState(42)\n", + "data = np.empty((N_SEQ, SEQ_LEN))\n", + "for i in range(N_SEQ):\n", + " data[i] = create_time_series(SEQ_LEN, rs)\n", + "data = pd.DataFrame(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.iloc[:5, :100].T.plot();\n", + "plt.savefig('time_series.pdf')\n", + "# hist" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2 Graph construction\n", + "\n", + "k-NN graph between the time series." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3 Data preparation\n", + "\n", + "* Store data in TFRecords files which will be read by the input pipeline.\n", + "* Preprocessing can be done here.\n", + "* Data augmentation should be done in the input pipeline (to save disk space).\n", + "* We are doing full batch, i.e. we feed data for the whole graph at once." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "N_INPUTS = 50 # Number of samples used for prediction, i.e. 
unrolling length.\n", + "N_OUTPUTS = 1 # Number of samples in the time series the model tries to predict.\n", + "\n", + "def feature(array):\n", + " array = array.reshape(-1)\n", + " return tf.train.Feature(float_list=tf.train.FloatList(value=list(array)))\n", + "\n", + "def save_dataset(data, filename):\n", + " \"\"\"Save dataset as TFRecords.\"\"\"\n", + " filename = os.path.join(DATA_DIR, filename)\n", + " num_examples = data.shape[1] - N_INPUTS - N_OUTPUTS + 1\n", + " assert num_examples > 0\n", + " tf.logging.info('Writing {} examples to {}'.format(num_examples, filename))\n", + " with tf.python_io.TFRecordWriter(filename) as writer:\n", + " for idx in range(num_examples):\n", + " inputs = data[:, idx:idx+N_INPUTS]\n", + " targets = data[:, idx+N_INPUTS:idx+N_INPUTS+N_OUTPUTS]\n", + " example = tf.train.Example(features=tf.train.Features(feature={\n", + " #'graph': feature(graph), # Adjacency matrix or Laplacian can be stored here.\n", + " 'inputs': feature(inputs),\n", + " 'targets': feature(targets)}))\n", + " writer.write(example.SerializeToString())\n", + "\n", + "TRAINING_LEN = int(0.8 * SEQ_LEN)\n", + "save_dataset(data.iloc[:, :TRAINING_LEN].values, 'train.tfrecords')\n", + "save_dataset(data.iloc[:, TRAINING_LEN:].values, 'validation.tfrecords')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4 Data loading\n", + "\n", + "Two training schemes:\n", + "* Load whole data for training up to a certain point in time. That is what is done for text (the whole vocabulary graph is used).\n", + "* Use some time series (some part of the graph) as training and the others as evaluation.\n", + "\n", + "TF alternative:\n", + "* [tf.contrib.slim.dataset](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class DataLoader:\n", + "\n", + " def __init__(s, filenames, num_epochs=1, read_threads=1, seed=None):\n", + " #if mode == tflearn.ModeKeys.TRAIN:\n", + " s.filenames = filenames\n", + " s.num_epochs = num_epochs\n", + " s.read_threads = read_threads\n", + " s.seed = seed\n", + "\n", + " def _read_and_decode(s, filename_queue):\n", + " reader = tf.TFRecordReader()\n", + " _, example = reader.read(filename_queue)\n", + " features={\n", + " 'inputs': tf.FixedLenFeature([N_SEQ * N_INPUTS], tf.float32),\n", + " 'targets': tf.FixedLenFeature([N_SEQ * N_OUTPUTS], tf.float32),\n", + " }\n", + " example = tf.parse_single_example(example, features)\n", + " inputs = tf.reshape(example['inputs'], [N_SEQ, N_INPUTS])\n", + " targets = tf.reshape(example['targets'], [N_SEQ, N_OUTPUTS])\n", + " return inputs, targets\n", + "\n", + " def __call__(s):\n", + " with tf.name_scope('input_pipeline'):\n", + " with tf.device(\"/cpu:0\"): # Input queues are on CPU.\n", + " filenames = [os.path.join(DATA_DIR, filename) for filename in s.filenames]\n", + " filename_queue = tf.train.string_input_producer(filenames, s.num_epochs, shuffle=True)\n", + "\n", + " examples = [s._read_and_decode(filename_queue) for _ in range(s.read_threads)]\n", + "\n", + " # Shuffle examples.\n", + " if True:\n", + " min_after_dequeue = 10 #10000\n", + " capacity = min_after_dequeue + (s.read_threads + 2) # * s.batch_size\n", + " inputs, targets = tf.train.shuffle_batch_join(\n", + " examples, batch_size=1, seed=s.seed, capacity=capacity,\n", + " min_after_dequeue=min_after_dequeue, allow_smaller_final_batch=True)\n", + " # We read full batch.\n", + " inputs = inputs[0, 
...]\n", + " targets = targets[0, ...]\n", + " else:\n", + " assert s.read_threads == 1\n", + " inputs, targets = examples[0]\n", + "\n", + " # Can return a fixed graph or a per-sample graph in the features.\n", + " return {'inputs': inputs}, targets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make one pass over the dataset to make sure the input pipeline works." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inputs = DataLoader(['train.tfrecords'])()[0]['inputs']\n", + "\n", + "sess = tf.Session()\n", + "#sess.run(tf.global_variables_initializer())\n", + "sess.run(tf.local_variables_initializer())\n", + "\n", + "coord = tf.train.Coordinator()\n", + "threads = tf.train.start_queue_runners(sess, coord)\n", + "\n", + "idx = 0\n", + "training_data = np.empty((N_SEQ, TRAINING_LEN-N_OUTPUTS))\n", + "try:\n", + " while not coord.should_stop():\n", + " training_data[:, idx:idx+N_INPUTS] = sess.run(inputs)\n", + " idx += 1\n", + "\n", + "except tf.errors.OutOfRangeError:\n", + " print('Done: {} steps'.format(idx))\n", + "finally:\n", + " coord.request_stop()\n", + "\n", + "coord.join(threads)\n", + "sess.close()\n", + "\n", + "#np.testing.assert_allclose(training_data, data.iloc[:, :TRAINING_LEN-N_OUTPUTS])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5 Sequence modeling\n", + "\n", + "We can either:\n", + "* assume the same dynamic on all time series and train a shared model\n", + "* train a model for each time series (which still has access to its neighbors)\n", + "* mix: e.g. per times series bias or last layer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Number of hidden units in each of the LSTM cells.\n", + "# Number of filters in case of GCN.\n", + "LSTM_SIZE = 3\n", + "\n", + "def model(features, targets, mode, params):\n", + " # Reformat input shape to become a sequence.\n", + " x = tf.split(features['inputs'], N_INPUTS, axis=1)\n", + " \n", + " # Recurrent neural network followed by linear transform.\n", + " lstm_cell = tf.contrib.rnn.BasicLSTMCell(LSTM_SIZE, forget_bias=1.0)\n", + " outputs, _ = tf.contrib.rnn.static_rnn(lstm_cell, x, dtype=tf.float32)\n", + " #outputs, _ = tf.contrib.rnn.dynamic_rnn(lstm_cell, x, dtype=tf.float32)\n", + " \n", + " tf.summary.histogram('hidden', outputs[-1])\n", + " predictions = tf.contrib.layers.fully_connected(outputs[-1], N_OUTPUTS, activation_fn=None)\n", + " \n", + " # Loss function and metric for training and evaluation.\n", + " loss = tf.losses.mean_squared_error(targets, predictions)\n", + " eval_metric_ops = {\n", + " 'rmse': tf.metrics.root_mean_squared_error(targets, predictions)\n", + " }\n", + " \n", + " # Training operations.\n", + " train_op = tf.contrib.layers.optimize_loss(\n", + " loss=loss,\n", + " global_step=tf.train.get_global_step(),\n", + " learning_rate=params['learning_rate'],\n", + " #learning_rate_decay_fn=lambda lr, gs: tf.train.exponential_decay(lr, gs, 100e3, 0.96, staircase=True),\n", + " optimizer=lambda lr: tf.train.GradientDescentOptimizer(lr),\n", + " #optimizer=lambda lr: tf.train.MomentumOptimizer(lr, 0.9),\n", + " )\n", + " \n", + " return tflearn.ModelFnOps(\n", + " mode=mode,\n", + " predictions={'predictions': predictions},\n", + " loss=loss,\n", + " train_op=train_op,\n", + " eval_metric_ops=eval_metric_ops,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6 
Experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Observing variables.\n", + "#tflearn.monitors.ValidationMonitor\n", + "#tf.train.SessionRunHook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Tuning the hyper-parameters.\n", + "#tflearn.learn_runner.run()\n", + "#tflearn.learn_runner.tune()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TF debugger.\n", + "from tensorflow.python import debug as tfdbg\n", + "\n", + "hooks = [tfdbg.LocalCLIDebugHook()]\n", + "hooks = [tfdbg.DumpingDebugHook('tfdbg_dumps')]\n", + "# python -m tensorflow.python.debug.cli.offline_analyzer --dump_dir=\"tfdbg_dumps/run__\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Statistics like compute time or memory.\n", + "# Need to pass run_options and run_metadata to sess.run().\n", + "# Not possible with Experiment and Estimator API.\n", + "#run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)\n", + "#run_metadata = tf.RunMetadata()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#MODEL_DIR = os.path.join('..', 'logdir', 'structured_sequence', 'run1')\n", + "MODEL_DIR = 'structured_sequence'\n", + "config = tflearn.RunConfig(\n", + " save_checkpoints_secs=60,\n", + " # save_summary_steps=100,\n", + " model_dir=MODEL_DIR,\n", + " # To see device placement. It unfortunately only shows up in stderr, not Tensorboard (explicit placement only).\n", + " # session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True),\n", + ")\n", + "hparams = {\n", + " 'learning_rate': 0.01\n", + "}\n", + "estimator = tflearn.Estimator(model_fn=model, config=config, params=hparams)\n", + "#estimator.fit(input_fn=DataLoader(filenames=['train.tfrecords']))\n", + "#estimator.evaluate(input_fn=DataLoader(filenames=['validation.tfrecords']))\n", + "\n", + "experiment = tflearn.Experiment(\n", + " estimator,\n", + " eval_steps=None,\n", + " train_input_fn=DataLoader(['train.tfrecords'], num_epochs=10),\n", + " eval_input_fn=DataLoader(['validation.tfrecords']),\n", + ")\n", + "\n", + "shutil.rmtree(MODEL_DIR, ignore_errors=True) # Start fresh each time.\n", + "experiment.train_and_evaluate()\n", + "#experiment.continuous_train_and_eval() # Takes less ressources.\n", + "\n", + "#estimator.evaluate(input_fn=DataLoader(filenames=['test.tfrecords']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## XXX" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class RNN:\n", + " \n", + " def __init__(self, units):\n", + " pass\n", + " \n", + " def __call__(self, inputs, states, laplacian):\n", + " \"\"\"Fully connected layer with Mout features.\"\"\"\n", + " N, Min = x.get_shape()\n", + " W = self._weight_variable([int(Min), self.Mout], regularization=True)\n", + " b = self._bias_variable([self.Mout], regularization=True)\n", + " x = tf.matmul(x, W) + b\n", + " return tf.nn.relu(x) if self.relu else x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Inherit from RNNCell to use high level TF machinery like `tf.dynamic_rnn()`." 
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LSTM:\n", + " \"\"\"The network is not unrolled.\"\"\"\n", + " \n", + " def _input_conv(self, x, w, b=None):\n", + " pass\n", + " \n", + " def _recurrent_conv(self, x, w, b=None):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tensorflow.python.ops import control_flow_ops\n", + "control_flow_ops.while_loop(\n", + " cond=lambda time, *_: time < time_steps,\n", + " body=_step,\n", + " loop_vars=(time, output_ta) + states,\n", + " parallel_iterations=32,\n", + " swap_memory=True)" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/trials/makefile b/trials/makefile index 3a29a42..22a1c95 100644 --- a/trials/makefile +++ b/trials/makefile @@ -6,6 +6,13 @@ $(NB): jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout=-1 $@ clean: - jupyter nbconvert --inplace --ClearOutputPreprocessor.enabled=True $(NB) + @for nb in $(NB); do \ + printf "%s" "$$(jq --indent 1 ' \ + .metadata = {} \ + | (.cells[] | select(has("outputs")) | .outputs) = [] \ + | (.cells[] | select(has("execution_count")) | .execution_count) = null \ + | .cells[].metadata = {} \ + ' $$nb)" > $$nb; \ + done .PHONY: run $(NB) clean diff --git a/usage.ipynb b/usage.ipynb index 2c1ff13..0febd8d 100644 --- a/usage.ipynb +++ b/usage.ipynb @@ -26,9 +26,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from lib import models, graph, coarsening, utils\n", @@ -49,9 +47,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "d = 100 # Dimensionality.\n", @@ -86,9 +82,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "n_train = n // 2\n", @@ -125,9 +119,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "dist, idx = graph.distance_scipy_spatial(X_train.T, k=10, metric='euclidean')\n", @@ -154,9 +146,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "graphs, perm = coarsening.coarsen(A, levels=3, self_connections=False)\n", @@ -176,9 +166,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "L = [graph.laplacian(A, normalized=True) for A in graphs]\n", @@ -199,9 +187,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "params = dict()\n", @@ -237,9 +223,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "model = models.cgcnn(L, **params)\n", @@ -262,9 +246,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "fig, ax1 = plt.subplots(figsize=(15, 5))\n", @@ -279,9 +261,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "print('Time per step: {:.2f} ms'.format(t_step*1000))" @@ -290,9 +270,7 @@ { 
"cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "res = model.evaluate(X_test, y_test)\n", @@ -316,9 +294,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3" + "version": "3.6.2" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 }