
Commit c900fae

Removed explicit bias term, added tanh activation function
Changes to the neural network: the bias matrix nn.b is folded into the weight matrix nn.W; inputs are normalized; the tanh activation function is used with the parameters recommended in LeCun's "Efficient BackProp"; tests are updated accordingly.
1 parent: 2966a54

10 files changed: 65 additions, 31 deletions

NN/nnapplygrads.m (0 additions, 4 deletions)

@@ -11,16 +11,12 @@
         end
 
         dW = nn.learningRate * dW;
-        db = nn.learningRate * nn.db{i};
 
         if(nn.momentum>0)
             nn.vW{i} = nn.momentum*nn.vW{i} + dW;
-            nn.vb{i} = nn.momentum*nn.vb{i} + db;
             dW = nn.vW{i};
-            db = nn.vb{i};
         end
 
         nn.W{i} = nn.W{i} - dW;
-        nn.b{i} = nn.b{i} - db;
     end
 end
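
For reference, a minimal sketch (not part of the commit; sizes and hyperparameters are made up) of the update rule that remains after this change: because the biases now live inside nn.W{i}, a single momentum step updates weights and biases together.

% Illustrative only; stands in for one call to nnapplygrads on one layer.
learningRate = 0.1; momentum = 0.5;
W  = rand(3, 6);            % 3 units, 5 inputs + 1 bias column
vW = zeros(size(W));
dW_grad = rand(size(W));    % stands in for the gradient computed in nnbp
dW = learningRate * dW_grad;
vW = momentum * vW + dW;    % momentum accumulation, as in nnapplygrads.m
W  = W - vW;                % one step now covers both weights and biases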

NN/nnbp.m (20 additions, 3 deletions)

@@ -12,15 +12,32 @@
         d{n} = - nn.e;
     end
     for i = (n - 1) : -1 : 2
+        % Derivative of the activation function
+        switch nn.activation_function
+            case 'sigm'
+                d_act = nn.a{i} .* (ones(size(nn.a{i})) - nn.a{i});
+            case 'tanh_opt'
+                d_act = 1.7159*2/3*(ones(size(nn.a{i})) - 1/(1.7159)^2 * nn.a{i}.^2);
+        end
         if(nn.nonSparsityPenalty>0)
             pi = repmat(nn.p{i}, size(nn.a{i}, 1), 1);
             sparsityError = nn.nonSparsityPenalty * (-nn.sparsityTarget ./ pi + (1 - nn.sparsityTarget) ./ (1 - pi));
         end
-        d{i} = (d{i + 1} * nn.W{i} + sparsityError) .* (nn.a{i} .* (1 - nn.a{i}));
+
+        % Backpropagate first derivatives
+        if i+1==n % d{n} has no bias column to strip
+            d{i} = (d{i + 1} * nn.W{i} + sparsityError) .* d_act; % Bishop (5.56)
+        else % strip the bias column from d{i + 1}
+            d{i} = (d{i + 1}(:,2:end) * nn.W{i} + sparsityError) .* d_act;
+        end
+
    end
 
     for i = 1 : (n - 1)
-        nn.dW{i} = (d{i + 1}' * nn.a{i}) / size(d{i + 1}, 1);
-        nn.db{i} = sum(d{i + 1}, 1)' / size(d{i + 1}, 1);
+        if i+1==n
+            nn.dW{i} = (d{i + 1}' * nn.a{i}) / size(d{i + 1}, 1);
+        else
+            nn.dW{i} = (d{i + 1}(:,2:end)' * nn.a{i}) / size(d{i + 1}, 1);
+        end
     end
 end
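
A quick way to sanity-check the new tanh_opt branch of d_act is a finite-difference comparison. The sketch below is illustrative only (it is not part of the commit) and assumes util/tanh_opt.m, added in this commit, is on the path.

% Compare the analytic derivative used in nnbp.m against central differences.
A     = linspace(-3, 3, 7);                 % arbitrary pre-activations
a     = tanh_opt(A);                        % a = 1.7159*tanh(2/3*A)
d_act = 1.7159*2/3*(1 - a.^2/1.7159^2);     % analytic form, as in nnbp.m
eps   = 1e-6;
d_num = (tanh_opt(A + eps) - tanh_opt(A - eps)) / (2*eps);
max(abs(d_act - d_num))                     % should be ~1e-9 or smaller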

NN/nnchecknumgrad.m (0 additions, 13 deletions)

@@ -18,18 +18,5 @@ function nnchecknumgrad(nn, x, y)
                 assert(e < er, 'numerical gradient checking failed');
             end
         end
-
-        for i = 1 : size(nn.b{l}, 1)
-            nn_m = nn; nn_p = nn;
-            nn_m.b{l}(i) = nn.b{l}(i) - epsilon;
-            nn_p.b{l}(i) = nn.b{l}(i) + epsilon;
-            rng(0);
-            nn_m = nnff(nn_m, x, y);
-            rng(0);
-            nn_p = nnff(nn_p, x, y);
-            db = (nn_p.L - nn_m.L) / (2 * epsilon);
-            e = abs(db - nn.db{l}(i));
-            assert(e < er, 'numerical gradient checking failed');
-        end
     end
 end

NN/nnff.m (10 additions, 4 deletions)

@@ -10,7 +10,13 @@
 
     %feedforward pass
     for i = 2 : n-1
-        nn.a{i} = sigm(repmat(nn.b{i - 1}', m, 1) + nn.a{i - 1} * nn.W{i - 1}');
+        switch nn.activation_function
+            case 'sigm'
+                % Calculate the unit's outputs (including the bias term)
+                nn.a{i} = [ones(m,1) sigm(nn.a{i - 1} * nn.W{i - 1}')];
+            case 'tanh_opt'
+                nn.a{i} = [ones(m,1) tanh_opt(nn.a{i - 1} * nn.W{i - 1}')];
+        end
         if(nn.dropoutFraction > 0)
             if(nn.testing)
                 nn.a{i} = nn.a{i}.*(1 - nn.dropoutFraction);
@@ -25,11 +31,11 @@
     end
     switch nn.output
         case 'sigm'
-            nn.a{n} = sigm(repmat(nn.b{n - 1}', m, 1) + nn.a{n - 1} * nn.W{n - 1}');
+            nn.a{n} = sigm(nn.a{n - 1} * nn.W{n - 1}');
         case 'linear'
-            nn.a{n} = repmat(nn.b{n - 1}', m, 1) + nn.a{n - 1} * nn.W{n - 1}';
+            nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
         case 'softmax'
-            nn.a{n} = repmat(nn.b{n - 1}', m, 1) + nn.a{n - 1} * nn.W{n - 1}';
+            nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
             nn.a{n} = exp(bsxfun(@minus, nn.a{n}, max(nn.a{n},[],2)));
             nn.a{n} = bsxfun(@rdivide, nn.a{n}, sum(nn.a{n}, 2));
     end
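
The compact forward pass can be checked against the old explicit-bias form on toy data. The sketch below is not part of the commit and uses made-up matrices; it relies on the bias being stored in the first column of W, matching the leading ones column prepended to the activations.

% Verify that folding b into W reproduces repmat(b', m, 1) + x * W_old'.
m = 4; n_in = 3; n_out = 2;
x     = rand(m, n_in);
W_old = rand(n_out, n_in);
b     = rand(n_out, 1);
old_z = repmat(b', m, 1) + x * W_old';   % pre-commit pre-activation
W     = [b W_old];                       % bias stored as the first column of W
new_z = [ones(m,1) x] * W';              % compact form used in nnff.m
max(abs(old_z(:) - new_z(:)))            % ~0 up to floating-point error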

NN/nnsetup.m (4 additions, 6 deletions)

@@ -6,6 +6,8 @@
     nn.size = architecture;
     nn.n = numel(nn.size);
 
+    nn.normalize_input = 0; % Normalize input elements. Set to 1 to normalize, 0 otherwise
+    nn.activation_function = 'sigm'; % 'sigm', 'tanh_opt'
     nn.learningRate = 0.1; % learning rate
     nn.momentum = 0.5; % Momentum
     nn.weightPenaltyL2 = 0; % L2 regularization
@@ -16,13 +18,9 @@
     nn.testing = 0; % Internal variable. nntest sets this to one.
     nn.output = 'sigm'; % output unit 'sigm' (=logistic), 'softmax' and 'linear'
 
-    for i = 2 : nn.n
-        % biases and bias momentum
-        nn.b{i - 1} = zeros(nn.size(i), 1);
-        nn.vb{i - 1} = zeros(size(nn.b{i - 1}));
-
+    for i = 2 : nn.n
         % weights and weight momentum
-        nn.W{i - 1} = (rand(nn.size(i), nn.size(i - 1)) - 0.5) * 2 * 4 * sqrt(6 / (nn.size(i) + nn.size(i - 1)));
+        nn.W{i - 1} = (rand(nn.size(i), nn.size(i - 1)+1) - 0.5) * 2 * 4 * sqrt(6 / (nn.size(i) + nn.size(i - 1)));
         nn.vW{i - 1} = zeros(size(nn.W{i - 1}));
 
         % average activations (for use with sparsity)
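
As a concrete illustration (not part of the commit), the extra column changes the weight shapes; for the [784 100 10] network used in the tests:

% Expected weight shapes after this change; first column holds the biases.
nn = nnsetup([784 100 10]);   % nnsetup as modified above
size(nn.W{1})                 % [100 785]: 784 inputs + 1 bias column
size(nn.W{2})                 % [10 101]:  100 hidden units + 1 bias column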

NN/nntest.m (8 additions, 0 deletions)

@@ -1,4 +1,12 @@
 function [er, bad] = nntest(nn, x, y)
+
+    m = size(x, 1);
+    if nn.normalize_input==1
+        x=zscore(x);
+    end
+    % Compact notation to include biases in the weight vector
+    x=[ones(m,1) x];
+
     nn.testing = 1;
     nn = nnff(nn, x, y);
     nn.testing = 0;

NN/nntrain.m (7 additions, 0 deletions)

@@ -9,6 +9,13 @@
     assert(isfloat(x), 'x must be a float');
     m = size(x, 1);
 
+    if nn.normalize_input==1
+        x=zscore(x);
+    end
+
+    % Compact notation to include biases in the weight vector
+    x=[ones(m,1) x];
+
     batchsize = opts.batchsize;
     numepochs = opts.numepochs;
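
The new preprocessing standardizes each input column and then prepends the ones column that carries the bias. A small illustrative example (not part of the commit; made-up data, and assuming zscore from the Statistics Toolbox is available):

x  = [1 10; 2 20; 3 30];            % 3 samples, 2 features
xz = zscore(x);                     % each column: zero mean, unit std
mean(xz)                            % ~[0 0]
std(xz)                             % ~[1 1]
x_aug = [ones(size(xz,1),1) xz];    % what nntrain/nntest pass on to nnff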

tests/test_example_NN.m (10 additions, 1 deletion)

@@ -10,7 +10,14 @@
 rng(0);
 nn = nnsetup([784 100 10]);
 
-nn.learningRate = 1;  % Learning rate
+nn.activation_function='tanh_opt';
+if strcmp(nn.activation_function,'sigm') == 1
+    nn.learningRate = 1;
+elseif strcmp(nn.activation_function,'tanh_opt') == 1
+    nn.learningRate = 3;
+    nn.normalize_input = 1;
+end
+
 opts.numepochs = 1;  % Number of full sweeps through data
 opts.batchsize = 100;  % Take a mean gradient step over this many samples
 opts.silent = 1;
@@ -19,6 +26,7 @@
 [er, bad] = nntest(nn, test_x, test_y);
 assert(er < 0.1, 'Too big error');
 
+
 %% ex2 neural net with L2 weight decay
 rng(0);
 nn = nnsetup([784 100 10]);
@@ -33,6 +41,7 @@
 [er, bad] = nntest(nn, test_x, test_y);
 assert(er < 0.1, 'Too big error');
 
+
 %% ex3 neural net with dropout
 rng(0);
 nn = nnsetup([784 100 10]);

tests/test_nn_gradients_are_numerically_correct.m (3 additions, 0 deletions)

@@ -2,8 +2,11 @@
 batch_x = rand(20, 5);
 batch_y = rand(20, 2);
 
+batch_x=[ones(size(batch_x,1),1) batch_x];
+
 nn = nnsetup([5 3 2]);
 nn.output='sigm';
+nn.activation_function='tanh_opt';
 nn = nnff(nn, batch_x, batch_y);
 nn = nnbp(nn);
 nnchecknumgrad(nn, batch_x, batch_y);

util/tanh_opt.m (3 additions, 0 deletions)

@@ -0,0 +1,3 @@
+function f=tanh_opt(A)
+    f=1.7159*tanh(2/3.*A);
+end
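
The constants 1.7159 and 2/3 follow the recommendation in LeCun's "Efficient BackProp": they are chosen so the activation maps ±1 to roughly ±1, which keeps unit outputs in a useful range when inputs are normalized. A quick check (illustrative, not part of the commit):

tanh_opt(1)     % ≈  1.0000
tanh_opt(-1)    % ≈ -1.0000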
