Merge branch 'master' of https://github.com/lukas/ml-class

lukas · lukas · commit d6c317c2977d · 2019-10-16T21:25:23.000Z
diff --git a/examples/keras-perf/wandb/settings b/examples/keras-perf/wandb/settings
@@ -1,4 +1,4 @@
 [default]
 entity = qualcomm
-project = perf-sep26
+project = perf-sep27
 base_url = https://api.wandb.ai
diff --git a/examples/lstm/imdb-classifier/imdb-attention.py b/examples/lstm/imdb-classifier/imdb-attention.py
@@ -1,10 +1,10 @@
 import wandb
-import imdb
 import numpy as np
-import tensorflow as tf
-from tensorflow.keras.preprocessing import sequence, text
-from tensorflow.keras import initializers, regularizers, constraints
-import tensorflow.keras.backend as K
+from util import load_imdb
+import keras
+from keras.preprocessing import sequence, text
+from keras import initializers, regularizers, constraints
+import keras.backend as K
 
 # from https://gist.github.com/cbaziotis/7ef97ccf71cbc14366835198c09809d2
 
@@ -20,7 +20,7 @@ def dot_product(x, kernel):
     return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
 
 
-class AttentionWithContext(tf.keras.layers.Layer):
+class AttentionWithContext(keras.layers.Layer):
     """
     Attention operation, with a context/query vector, for temporal data.
     Supports Masking.
@@ -128,7 +128,7 @@ def compute_output_shape(self, input_shape):
 config.hidden_dims = 100
 config.epochs = 10
 
-(X_train, y_train), (X_test, y_test) = imdb.load_imdb()
+(X_train, y_train), (X_test, y_test) = load_imdb()
 
 tokenizer = text.Tokenizer(num_words=config.vocab_size)
 tokenizer.fit_on_texts(X_train)
@@ -138,13 +138,13 @@ def compute_output_shape(self, input_shape):
 X_train = sequence.pad_sequences(X_train, maxlen=config.maxlen)
 X_test = sequence.pad_sequences(X_test, maxlen=config.maxlen)
 
-model = tf.keras.models.Sequential()
-model.add(tf.keras.layers.Embedding(config.vocab_size,
-                                    config.embedding_dims,
-                                    input_length=config.maxlen))
-model.add(tf.keras.layers.CuDNNLSTM(config.hidden_dims, return_sequences=True))
+model = keras.models.Sequential()
+model.add(keras.layers.Embedding(config.vocab_size,
+                                 config.embedding_dims,
+                                 input_length=config.maxlen))
+model.add(keras.layers.CuDNNLSTM(config.hidden_dims, return_sequences=True))
 model.add(AttentionWithContext())
-model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
+model.add(keras.layers.Dense(1, activation='sigmoid'))
 model.compile(loss='binary_crossentropy',
               optimizer='rmsprop',
               metrics=['accuracy'])
diff --git a/examples/lstm/imdb-classifier/imdb.ipynb b/examples/lstm/imdb-classifier/imdb.ipynb
@@ -2,11 +2,14 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
     "import util\n",
+    "import os\n",
+    "import math\n",
+    "import subprocess\n",
     "import numpy as np\n",
     "from tensorflow.keras.datasets import imdb\n",
     "(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=1000)"
@@ -54,13 +57,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading glove embeddings...\n"
+     ]
+    }
+   ],
    "source": [
     "# Load embeddings\n",
+    "if not os.path.exists(\"glove.6B.100d.txt\"):\n",
+    "    print(\"Downloading glove embeddings...\")\n",
+    "    subprocess.check_output(\n",
+    "        \"curl -OL http://nlp.stanford.edu/data/glove.6B.zip && unzip glove.6B.zip\", shell=True)\n",
     "embeddings_index = dict()\n",
     "f = open('glove.6B.100d.txt')\n",
+    "print(\"Loading globe embeddings...\")\n",
     "for line in f:\n",
     "    values = line.split()\n",
     "    word = values[0]\n",
@@ -71,9 +87,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.21388251764217375"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "def cosine_sim(v1,v2):\n",
     "    \"compute cosine similarity of v1 to v2: (v1 dot v2)/(||v1||*||v2||)\"\n",
@@ -90,28 +117,47 @@
     "car = embeddings_index[\"car\"]\n",
     "truck = embeddings_index[\"truck\"]\n",
     "plane = embeddings_index[\"plane\"]\n",
-    "cosine_sim(plane, book)"
+    "cosine_sim(film, truck)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
-     "ename": "NameError",
-     "evalue": "name 'embeddings_index' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-6-5ea6505cec73>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0membeddings_index\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m14\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m: name 'embeddings_index' is not defined"
-     ]
+     "data": {
+      "text/plain": [
+       "array([-1.9744e-01,  4.4831e-01,  1.3689e-01, -1.5595e-01,  9.3600e-01,\n",
+       "        7.2986e-01,  3.4099e-01, -3.3896e-01, -8.9569e-02, -4.7706e-01,\n",
+       "        3.5112e-01, -4.2198e-01, -1.2221e-01, -6.3375e-02, -4.5820e-01,\n",
+       "        7.8723e-01,  9.4045e-01,  8.1101e-02, -2.3224e-01,  4.0778e-01,\n",
+       "        3.3258e-01, -4.4458e-01, -4.7117e-01,  1.4852e-01,  9.6308e-01,\n",
+       "       -6.5267e-02, -5.3661e-02, -6.7474e-01, -4.2364e-01,  9.4392e-02,\n",
+       "       -3.8668e-01,  1.8237e-01, -1.2846e-01, -2.1952e-01, -5.8993e-01,\n",
+       "        7.3602e-01, -2.4009e-01,  3.2392e-01, -2.4663e-01, -4.0684e-01,\n",
+       "       -5.2468e-01,  4.6174e-01, -1.4936e-01, -1.1999e-01, -1.3990e-01,\n",
+       "       -4.4944e-01, -2.6565e-01, -7.0061e-01,  3.0188e-01, -1.1209e-01,\n",
+       "        6.6323e-01,  3.9698e-01,  6.9158e-01,  8.3442e-01, -5.2717e-01,\n",
+       "       -2.5314e+00,  1.3281e-01,  3.0253e-01,  1.1062e+00,  7.2221e-03,\n",
+       "        2.6031e-01,  1.1584e+00, -7.9330e-02, -7.6659e-01,  1.2623e+00,\n",
+       "       -6.2071e-01,  5.9821e-01,  7.3539e-01,  3.8573e-01, -4.0293e-01,\n",
+       "       -3.1440e-02,  7.7863e-01,  3.1525e-01,  1.9003e-01, -6.5821e-01,\n",
+       "        4.0548e-01,  5.3596e-03,  5.5274e-02, -1.2238e+00, -4.8912e-02,\n",
+       "       -3.0511e-01,  4.4473e-01, -3.3826e-01, -2.2133e-01, -1.3214e+00,\n",
+       "       -6.4761e-01, -4.4021e-01, -1.4910e+00, -2.2495e-02,  6.0346e-02,\n",
+       "        1.4833e-01,  4.4162e-01,  7.9787e-01, -2.8076e-01, -2.9400e-02,\n",
+       "       -1.5656e-01, -1.2650e-01, -5.6968e-01,  1.5374e-03,  6.6600e-01],\n",
+       "      dtype=float32)"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "embeddings_index[14]"
+    "embeddings_index[\"book\"]"
    ]
   },
   {
diff --git a/examples/lstm/imdb-classifier/wandb/settings b/examples/lstm/imdb-classifier/wandb/settings
@@ -1,5 +1,5 @@
 [default]
-entity = qualcomm
-project = imdb-sep26
+entity = bloomberg-class
+project = imdb-classifier
 base_url = https://api.wandb.ai
 
diff --git a/examples/lstm/time-series/plotutil.py b/examples/lstm/time-series/plotutil.py
@@ -57,7 +57,7 @@ def on_epoch_end(self, epoch, logs):
             preds = self.model.predict(self.testX)
 
         # Generate a figure with matplotlib
-        figure = matplotlib.pyplot.figure(figsize=(10, 10))
+        figure = matplotlib.pyplot.figure(figsize=(5, 5))
         plot = figure.add_subplot(111)
 
         plot.plot(self.trainY)
diff --git a/examples/lstm/time-series/rnn.py b/examples/lstm/time-series/rnn.py
@@ -7,7 +7,9 @@
 wandb.init()
 config = wandb.config
 config.repeated_predictions = False
+config.batch_size = 40
 config.look_back = 4
+config.epochs = 500
 
 
 def load_data(data_type="airline"):
@@ -44,7 +46,7 @@ def create_dataset(dataset):
 # split into train and test sets
 split = int(len(data) * 0.70)
 train = data[:split]
-test = data[split:]
+test = data[split-config.look_back-2:]
 
 trainX, trainY = create_dataset(train)
 testX, testY = create_dataset(test)
@@ -56,9 +58,9 @@ def create_dataset(dataset):
 # create and fit the RNN
 model = tf.keras.models.Sequential()
 model.add(tf.keras.layers.SimpleRNN(5, input_shape=(config.look_back, 1)))
-model.add(tf.keras.layers.Dense(1))
+model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
 model.compile(loss='mae', optimizer='rmsprop')
-model.fit(trainX, trainY, epochs=1000, batch_size=40, validation_data=(testX, testY),  callbacks=[
+model.fit(trainX, trainY, epochs=config.epochs, batch_size=config.batch_size, validation_data=(testX, testY),  callbacks=[
           PlotCallback(trainX, trainY, testX, testY,
                        config.look_back, config.repeated_predictions),
           wandb.keras.WandbCallback(input_type="time")])
diff --git a/examples/lstm/time-series/wandb/settings b/examples/lstm/time-series/wandb/settings
@@ -1,4 +1,5 @@
 [default]
-entity: qualcomm
-project: timeseries-sep26
-base_url: https://api.wandb.ai
+entity = qualcomm
+project = timeseries-sep26
+base_url = https://api.wandb.ai
+