
Commit abb5d37

Merge pull request rasmusbergpalm#20 from rasmusbergpalm/master-experimental
Optimal tanh activation functions, bias included in weights, numerically test most permutations of parameters
2 parents: 2966a54 + 5be78ce · commit abb5d37

16 files changed: +181 −112 lines
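
The headline changes: hidden layers now default to a scaled ("optimal") tanh activation, each layer's bias vector is folded into the first column of its weight matrix `nn.W{i}`, and the numerical gradient check is run over most permutations of the parameters. For orientation, here is a minimal sketch of the activation the new code relies on; the constants 1.7159 and 2/3 are taken from the derivative used in `NN/nnbp.m` below, since `util/tanh_opt.m` itself is not part of this diff:

```matlab
function y = tanh_opt(x)
% Scaled hyperbolic tangent, y = 1.7159 * tanh(2/3 * x) (LeCun et al., "Efficient BackProp").
% Sketch only -- the shipped util/tanh_opt.m may differ in details.
    y = 1.7159 * tanh(2/3 .* x);
end
```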

DBN/dbnunfoldtonn.m

Lines changed: 1 addition & 2 deletions
```diff
@@ -9,8 +9,7 @@
     end
     nn = nnsetup(size);
     for i = 1 : numel(dbn.rbm)
-        nn.W{i} = dbn.rbm{i}.W;
-        nn.b{i} = dbn.rbm{i}.c;
+        nn.W{i} = [dbn.rbm{i}.c dbn.rbm{i}.W];
     end
 end
 
```

NN/nnapplygrads.m

Lines changed: 0 additions & 4 deletions
```diff
@@ -11,16 +11,12 @@
         end
 
         dW = nn.learningRate * dW;
-        db = nn.learningRate * nn.db{i};
 
         if(nn.momentum>0)
             nn.vW{i} = nn.momentum*nn.vW{i} + dW;
-            nn.vb{i} = nn.momentum*nn.vb{i} + db;
             dW = nn.vW{i};
-            db = nn.vb{i};
         end
 
         nn.W{i} = nn.W{i} - dW;
-        nn.b{i} = nn.b{i} - db;
     end
 end
```
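
With the bias column living inside `nn.W{i}`, the separate `db`/`vb` updates are redundant, and the remaining update is plain momentum SGD on the combined matrix. An illustrative stand-alone version of that rule for a single layer (variable names mirror the toolbox, but this is a sketch; the full `nnapplygrads.m` also applies L2 weight decay on the lines not shown in this hunk):

```matlab
learningRate = 2; momentum = 0.5;
W  = randn(100, 785);            % [bias, weights] for one 784 -> 100 layer
vW = zeros(size(W));             % momentum buffer
dW = randn(size(W));             % gradient from nnbp (placeholder values)

dW = learningRate * dW;
if momentum > 0
    vW = momentum * vW + dW;     % velocity accumulates the scaled gradient
    dW = vW;
end
W = W - dW;                      % bias and weights move in a single step
```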

NN/nnbp.m

Lines changed: 28 additions & 7 deletions
```diff
@@ -1,26 +1,47 @@
 function nn = nnbp(nn)
 %NNBP performs backpropagation
-% nn = nnbp(nn) returns an neural network structure with updated weight
-% and bias gradients (nn.dW and nn.db)
+% nn = nnbp(nn) returns an neural network structure with updated weights
 
     n = nn.n;
     sparsityError = 0;
     switch nn.output
         case 'sigm'
             d{n} = - nn.e .* (nn.a{n} .* (1 - nn.a{n}));
         case {'softmax','linear'}
-            d{n} = - nn.e;
+            d{n} = - nn.e;
     end
     for i = (n - 1) : -1 : 2
+        % Derivative of the activation function
+        switch nn.activation_function
+            case 'sigm'
+                d_act = nn.a{i} .* (1 - nn.a{i});
+            case 'tanh_opt'
+                d_act = 1.7159 * 2/3 * (1 - 1/(1.7159)^2 * nn.a{i}.^2);
+        end
+
         if(nn.nonSparsityPenalty>0)
             pi = repmat(nn.p{i}, size(nn.a{i}, 1), 1);
-            sparsityError = nn.nonSparsityPenalty * (-nn.sparsityTarget ./ pi + (1 - nn.sparsityTarget) ./ (1 - pi));
+            sparsityError = [zeros(size(nn.a{i},1),1) nn.nonSparsityPenalty * (-nn.sparsityTarget ./ pi + (1 - nn.sparsityTarget) ./ (1 - pi))];
+        end
+
+        % Backpropagate first derivatives
+        if i+1==n % in this case in d{n} there is not the bias term to be removed
+            d{i} = (d{i + 1} * nn.W{i} + sparsityError) .* d_act; % Bishop (5.56)
+        else % in this case in d{i} the bias term has to be removed
+            d{i} = (d{i + 1}(:,2:end) * nn.W{i} + sparsityError) .* d_act;
+        end
+
+        if(nn.dropoutFraction>0)
+            d{i} = d{i} .* [ones(size(d{i},1),1) nn.dropOutMask{i}];
         end
-        d{i} = (d{i + 1} * nn.W{i} + sparsityError) .* (nn.a{i} .* (1 - nn.a{i}));
+
     end
 
     for i = 1 : (n - 1)
-        nn.dW{i} = (d{i + 1}' * nn.a{i}) / size(d{i + 1}, 1);
-        nn.db{i} = sum(d{i + 1}, 1)' / size(d{i + 1}, 1);
+        if i+1==n
+            nn.dW{i} = (d{i + 1}' * nn.a{i}) / size(d{i + 1}, 1);
+        else
+            nn.dW{i} = (d{i + 1}(:,2:end)' * nn.a{i}) / size(d{i + 1}, 1);
+        end
     end
 end
```
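
Because `nn.a{i}` stores the activation f(z) = 1.7159*tanh(2/3*z) rather than the pre-activation z, the new `d_act` line evaluates f'(z) = 1.7159 * 2/3 * (1 - (f(z)/1.7159)^2) directly from the stored activations. A quick finite-difference check of that identity (illustrative only, not part of the toolbox):

```matlab
f      = @(z) 1.7159 * tanh(2/3 .* z);             % tanh_opt
z      = randn(1, 5);
a      = f(z);                                     % what nnff stores in nn.a{i}
d_act  = 1.7159 * 2/3 * (1 - 1/(1.7159)^2 * a.^2); % line from nnbp.m
h      = 1e-6;
numder = (f(z + h) - f(z - h)) / (2 * h);          % central difference
assert(max(abs(d_act - numder)) < 1e-6);
```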

NN/nnchecknumgrad.m

Lines changed: 1 addition & 14 deletions
```diff
@@ -1,6 +1,6 @@
 function nnchecknumgrad(nn, x, y)
     epsilon = 1e-6;
-    er = 1e-8;
+    er = 1e-7;
     n = nn.n;
     for l = 1 : (n - 1)
         for i = 1 : size(nn.W{l}, 1)
@@ -18,18 +18,5 @@ function nnchecknumgrad(nn, x, y)
                 assert(e < er, 'numerical gradient checking failed');
             end
         end
-
-        for i = 1 : size(nn.b{l}, 1)
-            nn_m = nn; nn_p = nn;
-            nn_m.b{l}(i) = nn.b{l}(i) - epsilon;
-            nn_p.b{l}(i) = nn.b{l}(i) + epsilon;
-            rng(0);
-            nn_m = nnff(nn_m, x, y);
-            rng(0);
-            nn_p = nnff(nn_p, x, y);
-            db = (nn_p.L - nn_m.L) / (2 * epsilon);
-            e = abs(db - nn.db{l}(i));
-            assert(e < er, 'numerical gradient checking failed');
-        end
     end
 end
```
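
Since the biases are now the first column of `nn.W{l}`, the deleted loop over `nn.b{l}` is covered by the surviving loop over `nn.W{l}`, which already perturbs the bias entries. The retained per-weight check (the unchanged lines 7–17 are not shown in this hunk) is a central-difference comparison; a self-contained toy illustration of that idea, with a made-up loss standing in for `nnff`:

```matlab
% Central-difference check of dL/dW(i,j) for one parameter of a toy model.
epsilon = 1e-6;
W = randn(3, 4); x = randn(5, 4); y = randn(5, 3);
loss = @(W) 0.5 * sum(sum((y - tanh(x * W')).^2));   % stand-in for nnff's nn.L
a = tanh(x * W');
dW_analytic = -((y - a) .* (1 - a.^2))' * x;         % backprop gradient of the toy loss
i = 2; j = 3;
Wp = W; Wp(i,j) = W(i,j) + epsilon;
Wm = W; Wm(i,j) = W(i,j) - epsilon;
dW_numeric = (loss(Wp) - loss(Wm)) / (2 * epsilon);
assert(abs(dW_numeric - dW_analytic(i,j)) < 1e-6, 'numerical gradient checking failed');
```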

NN/nnff.m

Lines changed: 22 additions & 7 deletions
```diff
@@ -5,39 +5,54 @@
 
     n = nn.n;
     m = size(x, 1);
-
+
+    x = [ones(m,1) x];
     nn.a{1} = x;
 
     %feedforward pass
     for i = 2 : n-1
-        nn.a{i} = sigm(repmat(nn.b{i - 1}', m, 1) + nn.a{i - 1} * nn.W{i - 1}');
+        switch nn.activation_function
+            case 'sigm'
+                % Calculate the unit's outputs (including the bias term)
+                nn.a{i} = sigm(nn.a{i - 1} * nn.W{i - 1}');
+            case 'tanh_opt'
+                nn.a{i} = tanh_opt(nn.a{i - 1} * nn.W{i - 1}');
+        end
+
+        %dropout
         if(nn.dropoutFraction > 0)
             if(nn.testing)
                 nn.a{i} = nn.a{i}.*(1 - nn.dropoutFraction);
             else
-                nn.a{i} = nn.a{i}.*(rand(size(nn.a{i}))>nn.dropoutFraction);
+                nn.dropOutMask{i} = (rand(size(nn.a{i}))>nn.dropoutFraction);
+                nn.a{i} = nn.a{i}.*nn.dropOutMask{i};
             end
         end
+
         %calculate running exponential activations for use with sparsity
         if(nn.nonSparsityPenalty>0)
             nn.p{i} = 0.99 * nn.p{i} + 0.01 * mean(nn.a{i}, 1);
         end
+
+        %Add the bias term
+        nn.a{i} = [ones(m,1) nn.a{i}];
     end
     switch nn.output
         case 'sigm'
-            nn.a{n} = sigm(repmat(nn.b{n - 1}', m, 1) + nn.a{n - 1} * nn.W{n - 1}');
+            nn.a{n} = sigm(nn.a{n - 1} * nn.W{n - 1}');
         case 'linear'
-            nn.a{n} = repmat(nn.b{n - 1}', m, 1) + nn.a{n - 1} * nn.W{n - 1}';
+            nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
         case 'softmax'
-            nn.a{n} = repmat(nn.b{n - 1}', m, 1) + nn.a{n - 1} * nn.W{n - 1}';
+            nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
             nn.a{n} = exp(bsxfun(@minus, nn.a{n}, max(nn.a{n},[],2)));
             nn.a{n} = bsxfun(@rdivide, nn.a{n}, sum(nn.a{n}, 2));
     end
 
     %error and loss
     nn.e = y - nn.a{n};
+
     switch nn.output
-        case {'sigm','linear'}
+        case {'sigm', 'linear'}
             nn.L = 1/2 * sum(sum(nn.e .^ 2)) / m;
         case 'softmax'
             nn.L = -sum(sum(y .* log(nn.a{n}))) / m;
```
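
The `repmat` of the bias vector disappears because `nnff` now prepends a column of ones to each layer's activations, so the bias is applied through the first column of `W`. A tiny equivalence check with made-up sizes (illustrative only):

```matlab
m = 4; vis = 3; hid = 2;
x     = randn(m, vis);
W_old = randn(hid, vis);            % old-style weights
b     = randn(hid, 1);              % old-style bias
W_new = [b W_old];                  % bias folded in, as in nnsetup/dbnunfoldtonn
a_old = repmat(b', m, 1) + x * W_old';
a_new = [ones(m,1) x] * W_new';
assert(max(abs(a_old(:) - a_new(:))) < 1e-12);
```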

NN/nnsetup.m

Lines changed: 15 additions & 17 deletions
```diff
@@ -1,28 +1,26 @@
 function nn = nnsetup(architecture)
 %NNSETUP creates a Feedforward Backpropagate Neural Network
-% nn = nnsetup(size) returns an neural network structure with n=numel(size)
-% layers, size being a n x 1 vector of layer sizes e.g. [784 100 10]
+% nn = nnsetup(architecture) returns an neural network structure with n=numel(architecture)
+% layers, architecture being a n x 1 vector of layer sizes e.g. [784 100 10]
 
     nn.size = architecture;
     nn.n = numel(nn.size);
 
-    nn.learningRate = 0.1;               % learning rate
-    nn.momentum = 0.5;                   % Momentum
-    nn.weightPenaltyL2 = 0;              % L2 regularization
-    nn.nonSparsityPenalty = 0;           % Non sparsity penalty
-    nn.sparsityTarget = 0.05;            % Sparsity target
-    nn.inputZeroMaskedFraction = 0;      % Used for Denoising AutoEncoders
-    nn.dropoutFraction = 0;              % Dropout level (http://www.cs.toronto.edu/~hinton/absps/dropout.pdf)
-    nn.testing = 0;                      % Internal variable. nntest sets this to one.
-    nn.output = 'sigm';                  % output unit 'sigm' (=logistic), 'softmax' and 'linear'
+    nn.normalize_input = 1;              % normalize input elements to be between [-1 1]. Note: use a linear output function if training auto-encoders with normalized inputs
+    nn.activation_function = 'tanh_opt'; % Activation functions of hidden layers: 'sigm' (sigmoid) or 'tanh_opt' (optimal tanh).
+    nn.learningRate = 2;                 % learning rate Note: typically needs to be lower when using 'sigm' activation function and non-normalized inputs.
+    nn.momentum = 0.5;                   % Momentum
+    nn.weightPenaltyL2 = 0;              % L2 regularization
+    nn.nonSparsityPenalty = 0;           % Non sparsity penalty
+    nn.sparsityTarget = 0.05;            % Sparsity target
+    nn.inputZeroMaskedFraction = 0;      % Used for Denoising AutoEncoders
+    nn.dropoutFraction = 0;              % Dropout level (http://www.cs.toronto.edu/~hinton/absps/dropout.pdf)
+    nn.testing = 0;                      % Internal variable. nntest sets this to one.
+    nn.output = 'sigm';                  % output unit 'sigm' (=logistic), 'softmax' and 'linear'
 
-    for i = 2 : nn.n
-        % biases and bias momentum
-        nn.b{i - 1} = zeros(nn.size(i), 1);
-        nn.vb{i - 1} = zeros(size(nn.b{i - 1}));
-
+    for i = 2 : nn.n
        % weights and weight momentum
-        nn.W{i - 1} = (rand(nn.size(i), nn.size(i - 1)) - 0.5) * 2 * 4 * sqrt(6 / (nn.size(i) + nn.size(i - 1)));
+        nn.W{i - 1} = (rand(nn.size(i), nn.size(i - 1)+1) - 0.5) * 2 * 4 * sqrt(6 / (nn.size(i) + nn.size(i - 1)));
         nn.vW{i - 1} = zeros(size(nn.W{i - 1}));
 
         % average activations (for use with sparsity)
```
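
With these defaults a freshly created network uses `tanh_opt` hidden units, z-scored inputs and a learning rate of 2; the README examples below opt back into the old behaviour explicitly. A minimal configuration sketch (values chosen for illustration, mirroring the README):

```matlab
nn = nnsetup([784 100 10]);        % W{i} is now size(i) x (size(i-1)+1); bias sits in column 1
nn.activation_function = 'sigm';   % opt out of the 'tanh_opt' default
nn.normalize_input     = 0;        % skip the zscore step in nntrain/nntest
nn.learningRate        = 1;        % lower rate suggested for sigm + non-normalized inputs
```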

NN/nntest.m

Lines changed: 4 additions & 0 deletions
```diff
@@ -1,4 +1,8 @@
 function [er, bad] = nntest(nn, x, y)
+    if nn.normalize_input==1;
+        x = zscore(x);
+    end
+
     nn.testing = 1;
     nn = nnff(nn, x, y);
     nn.testing = 0;
```

NN/nntrain.m

Lines changed: 4 additions & 0 deletions
```diff
@@ -9,6 +9,10 @@
     assert(isfloat(x), 'x must be a float');
     m = size(x, 1);
 
+    if nn.normalize_input==1
+        x = zscore(x);
+    end
+
     batchsize = opts.batchsize;
     numepochs = opts.numepochs;
 
```
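
When `nn.normalize_input` is 1, both `nntrain` and `nntest` standardize the input with `zscore`, i.e. each column (feature) is shifted to zero mean and scaled to unit standard deviation. Roughly equivalent column-wise code, for reference (a sketch; the zero-std guard is added here so constant features do not produce NaNs):

```matlab
mu    = mean(x, 1);
sigma = std(x, 0, 1);
sigma(sigma == 0) = 1;             % guard: leave constant columns untouched
x     = bsxfun(@rdivide, bsxfun(@minus, x, mu), sigma);
```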

README.md

Lines changed: 39 additions & 18 deletions
````diff
@@ -15,9 +15,14 @@ For a more informal introduction, see the following videos by Geoffrey Hinton an
 * [Recent Developments in Deep Learning](http://www.youtube.com/watch?v=VdIURAu1-aU) (Hinton, 2010)
 * [Unsupervised Feature Learning and Deep Learning](http://www.youtube.com/watch?v=ZmNOAtZIgIk) (Ng, 2011)
 
-If you use this toolbox in your research please cite:
+If you use this toolbox in your research please cite [Prediction as a candidate for learning deep hierarchical models of data](http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6284)
 
-[Prediction as a candidate for learning deep hierarchical models of data](http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6284) (Palm, 2012)
+```
+@MASTERSTHESIS\{IMM2012-06284,
+    author = "R. B. Palm",
+    title = "Prediction as a candidate for learning deep hierarchical models of data",
+    year = "2012",
+```
 
 Directories included in the toolbox
 -----------------------------------
@@ -85,15 +90,16 @@ dbn = dbntrain(dbn, train_x, opts);
 
 %unfold dbn to nn
 nn = dbnunfoldtonn(dbn, 10);
+nn.normalize_input = 0;
+nn.activation_function = 'sigm';
 
 %train nn
-nn.learningRate = 1;
 opts.numepochs = 1;
 opts.batchsize = 100;
 nn = nntrain(nn, train_x, train_y, opts);
 [er, bad] = nntest(nn, test_x, test_y);
 
-assert(er < 0.12, 'Too big error');
+assert(er < 0.10, 'Too big error');
 
 ```
 
@@ -114,25 +120,28 @@ test_y = double(test_y);
 % Setup and train a stacked denoising autoencoder (SDAE)
 rng(0);
 sae = saesetup([784 100]);
+sae.ae{1}.normalize_input = 0;
+sae.ae{1}.activation_function = 'sigm';
 sae.ae{1}.learningRate = 1;
 sae.ae{1}.inputZeroMaskedFraction = 0.5;
 opts.numepochs = 1;
 opts.batchsize = 100;
 sae = saetrain(sae, train_x, opts);
-visualize(sae.ae{1}.W{1}')
+visualize(sae.ae{1}.W{1}(:,2:end)')
 
 % Use the SDAE to initialize a FFNN
 nn = nnsetup([784 100 10]);
+nn.normalize_input = 0;
+nn.activation_function = 'sigm';
+nn.learningRate = 1;
 nn.W{1} = sae.ae{1}.W{1};
-nn.b{1} = sae.ae{1}.b{1};
 
 % Train the FFNN
-nn.learningRate = 1;
 opts.numepochs = 1;
 opts.batchsize = 100;
 nn = nntrain(nn, train_x, train_y, opts);
 [er, bad] = nntest(nn, test_x, test_y);
-assert(er < 0.21, 'Too big error');
+assert(er < 0.16, 'Too big error');
 
 ```
 
@@ -193,44 +202,56 @@ test_y = double(test_y);
 %% ex1 vanilla neural net
 rng(0);
 nn = nnsetup([784 100 10]);
-
-nn.learningRate = 1;   % Learning rate
 opts.numepochs = 1;    % Number of full sweeps through data
 opts.batchsize = 100;  % Take a mean gradient step over this many samples
-opts.silent = 1;
-nn = nntrain(nn, train_x, train_y, opts);
+[nn, L] = nntrain(nn, train_x, train_y, opts);
 
 [er, bad] = nntest(nn, test_x, test_y);
-assert(er < 0.1, 'Too big error');
+
+assert(er < 0.08, 'Too big error');
+
 
 %% ex2 neural net with L2 weight decay
 rng(0);
 nn = nnsetup([784 100 10]);
 
 nn.weightPenaltyL2 = 1e-4;  % L2 weight decay
-nn.learningRate = 1;        % Learning rate
 opts.numepochs = 1;         % Number of full sweeps through data
 opts.batchsize = 100;       % Take a mean gradient step over this many samples
-opts.silent = 1;
+
 nn = nntrain(nn, train_x, train_y, opts);
 
 [er, bad] = nntest(nn, test_x, test_y);
 assert(er < 0.1, 'Too big error');
 
+
 %% ex3 neural net with dropout
 rng(0);
 nn = nnsetup([784 100 10]);
 
 nn.dropoutFraction = 0.5;   % Dropout fraction
-nn.learningRate = 1;        % Learning rate
 opts.numepochs = 1;         % Number of full sweeps through data
 opts.batchsize = 100;       % Take a mean gradient step over this many samples
-opts.silent = 1;
+
 nn = nntrain(nn, train_x, train_y, opts);
 
 [er, bad] = nntest(nn, test_x, test_y);
-assert(er < 0.16, 'Too big error');
+assert(er < 0.1, 'Too big error');
+
+%% ex4 neural net with sigmoid activation function, and without normalizing inputs
+rng(0);
+nn = nnsetup([784 100 10]);
 
+nn.activation_function = 'sigm';  % Sigmoid activation function
+nn.normalize_input = 0;           % Don't normalize inputs
+nn.learningRate = 1;              % Sigm and non-normalized inputs require a lower learning rate
+opts.numepochs = 1;               % Number of full sweeps through data
+opts.batchsize = 100;             % Take a mean gradient step over this many samples
+
+nn = nntrain(nn, train_x, train_y, opts);
+
+[er, bad] = nntest(nn, test_x, test_y);
+assert(er < 0.1, 'Too big error');
 ```
````
README_header.md

Lines changed: 8 additions & 3 deletions
````diff
@@ -14,9 +14,14 @@ For a more informal introduction, see the following videos by Geoffrey Hinton an
 * [Recent Developments in Deep Learning](http://www.youtube.com/watch?v=VdIURAu1-aU) (Hinton, 2010)
 * [Unsupervised Feature Learning and Deep Learning](http://www.youtube.com/watch?v=ZmNOAtZIgIk) (Ng, 2011)
 
-If you use this toolbox in your research please cite:
-
-[Prediction as a candidate for learning deep hierarchical models of data](http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6284) (Palm, 2012)
+If you use this toolbox in your research please cite [Prediction as a candidate for learning deep hierarchical models of data](http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6284)
+
+```
+@MASTERSTHESIS\{IMM2012-06284,
+    author = "R. B. Palm",
+    title = "Prediction as a candidate for learning deep hierarchical models of data",
+    year = "2012",
+```
 
 Directories included in the toolbox
 -----------------------------------
````
