
Commit c900fae

Removed explicit bias term, added tanh activation function
Changes to the neural network: the bias matrix nn.b is folded into the weight matrix nn.W; inputs are normalized; the tanh activation function is used with the parameters recommended in LeCun's "Efficient BackProp"; tests are updated accordingly.
1 parent: 2966a54

10 files changed: 65 additions, 31 deletions

NN/nnapplygrads.m (0 additions, 4 deletions)

@@ -11,16 +11,12 @@
         end
 
         dW = nn.learningRate * dW;
-        db = nn.learningRate * nn.db{i};
 
         if(nn.momentum>0)
             nn.vW{i} = nn.momentum*nn.vW{i} + dW;
-            nn.vb{i} = nn.momentum*nn.vb{i} + db;
             dW = nn.vW{i};
-            db = nn.vb{i};
         end
 
         nn.W{i} = nn.W{i} - dW;
-        nn.b{i} = nn.b{i} - db;
     end
 end
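
For reference, a minimal sketch (not part of the commit; sizes and hyperparameters are made up) of the update rule that remains after this change: because the biases now live inside nn.W{i}, a single momentum step updates weights and biases together.

% Illustrative only; stands in for one call to nnapplygrads on one layer.
learningRate = 0.1; momentum = 0.5;
W  = rand(3, 6);            % 3 units, 5 inputs + 1 bias column
vW = zeros(size(W));
dW_grad = rand(size(W));    % stands in for the gradient computed in nnbp
dW = learningRate * dW_grad;
vW = momentum * vW + dW;    % momentum accumulation, as in nnapplygrads.m
W  = W - vW;                % one step now covers both weights and biases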

NN/nnbp.m (20 additions, 3 deletions)

@@ -12,15 +12,32 @@
         d{n} = - nn.e;
     end
     for i = (n - 1) : -1 : 2
+        % Derivative of the activation function
+        switch nn.activation_function
+            case 'sigm'
+                d_act = nn.a{i} .* (ones(size(nn.a{i})) - nn.a{i});
+            case 'tanh_opt'
+                d_act = 1.7159*2/3*(ones(size(nn.a{i})) - 1/(1.7159)^2 * nn.a{i}.^2);
+        end
         if(nn.nonSparsityPenalty>0)
             pi = repmat(nn.p{i}, size(nn.a{i}, 1), 1);
             sparsityError = nn.nonSparsityPenalty * (-nn.sparsityTarget ./ pi + (1 - nn.sparsityTarget) ./ (1 - pi));
         end
-        d{i} = (d{i + 1} * nn.W{i} + sparsityError) .* (nn.a{i} .* (1 - nn.a{i}));
+
+        % Backpropagate first derivatives
+        if i+1==n % d{n} has no bias column to strip
+            d{i} = (d{i + 1} * nn.W{i} + sparsityError) .* d_act; % Bishop (5.56)
+        else % strip the bias column from d{i + 1}
+            d{i} = (d{i + 1}(:,2:end) * nn.W{i} + sparsityError) .* d_act;
+        end
+
    end
 
     for i = 1 : (n - 1)
-        nn.dW{i} = (d{i + 1}' * nn.a{i}) / size(d{i + 1}, 1);
-        nn.db{i} = sum(d{i + 1}, 1)' / size(d{i + 1}, 1);
+        if i+1==n
+            nn.dW{i} = (d{i + 1}' * nn.a{i}) / size(d{i + 1}, 1);
+        else
+            nn.dW{i} = (d{i + 1}(:,2:end)' * nn.a{i}) / size(d{i + 1}, 1);
+        end
     end
 end
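
A quick way to sanity-check the new tanh_opt branch of d_act is a finite-difference comparison. The sketch below is illustrative only (it is not part of the commit) and assumes util/tanh_opt.m, added in this commit, is on the path.

% Compare the analytic derivative used in nnbp.m against central differences.
A     = linspace(-3, 3, 7);                 % arbitrary pre-activations
a     = tanh_opt(A);                        % a = 1.7159*tanh(2/3*A)
d_act = 1.7159*2/3*(1 - a.^2/1.7159^2);     % analytic form, as in nnbp.m
eps   = 1e-6;
d_num = (tanh_opt(A + eps) - tanh_opt(A - eps)) / (2*eps);
max(abs(d_act - d_num))                     % should be ~1e-9 or smaller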

NN/nnchecknumgrad.m (0 additions, 13 deletions)

@@ -18,18 +18,5 @@ function nnchecknumgrad(nn, x, y)
                 assert(e < er, 'numerical gradient checking failed');
             end
         end
-
-        for i = 1 : size(nn.b{l}, 1)
-            nn_m = nn; nn_p = nn;
-            nn_m.b{l}(i) = nn.b{l}(i) - epsilon;
-            nn_p.b{l}(i) = nn.b{l}(i) + epsilon;
-            rng(0);
-            nn_m = nnff(nn_m, x, y);
-            rng(0);
-            nn_p = nnff(nn_p, x, y);
-            db = (nn_p.L - nn_m.L) / (2 * epsilon);
-            e = abs(db - nn.db{l}(i));
-            assert(e < er, 'numerical gradient checking failed');
-        end
     end
 end

NN/nnff.m (10 additions, 4 deletions)

@@ -10,7 +10,13 @@
 
     %feedforward pass
     for i = 2 : n-1
-        nn.a{i} = sigm(repmat(nn.b{i - 1}', m, 1) + nn.a{i - 1} * nn.W{i - 1}');
+        switch nn.activation_function
+            case 'sigm'
+                % Calculate the unit's outputs (including the bias term)
+                nn.a{i} = [ones(m,1) sigm(nn.a{i - 1} * nn.W{i - 1}')];
+            case 'tanh_opt'
+                nn.a{i} = [ones(m,1) tanh_opt(nn.a{i - 1} * nn.W{i - 1}')];
+        end
         if(nn.dropoutFraction > 0)
             if(nn.testing)
                 nn.a{i} = nn.a{i}.*(1 - nn.dropoutFraction);
@@ -25,11 +31,11 @@
     end
     switch nn.output
         case 'sigm'
-            nn.a{n} = sigm(repmat(nn.b{n - 1}', m, 1) + nn.a{n - 1} * nn.W{n - 1}');
+            nn.a{n} = sigm(nn.a{n - 1} * nn.W{n - 1}');
         case 'linear'
-            nn.a{n} = repmat(nn.b{n - 1}', m, 1) + nn.a{n - 1} * nn.W{n - 1}';
+            nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
         case 'softmax'
-            nn.a{n} = repmat(nn.b{n - 1}', m, 1) + nn.a{n - 1} * nn.W{n - 1}';
+            nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
             nn.a{n} = exp(bsxfun(@minus, nn.a{n}, max(nn.a{n},[],2)));
             nn.a{n} = bsxfun(@rdivide, nn.a{n}, sum(nn.a{n}, 2));
     end
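
The compact forward pass can be checked against the old explicit-bias form on toy data. The sketch below is not part of the commit and uses made-up matrices; it relies on the bias being stored in the first column of W, matching the leading ones column prepended to the activations.

% Verify that folding b into W reproduces repmat(b', m, 1) + x * W_old'.
m = 4; n_in = 3; n_out = 2;
x     = rand(m, n_in);
W_old = rand(n_out, n_in);
b     = rand(n_out, 1);
old_z = repmat(b', m, 1) + x * W_old';   % pre-commit pre-activation
W     = [b W_old];                       % bias stored as the first column of W
new_z = [ones(m,1) x] * W';              % compact form used in nnff.m
max(abs(old_z(:) - new_z(:)))            % ~0 up to floating-point error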

NN/nnsetup.m (4 additions, 6 deletions)

@@ -6,6 +6,8 @@
     nn.size = architecture;
     nn.n = numel(nn.size);
 
+    nn.normalize_input = 0; % Normalize input elements. Set to 1 to normalize, 0 otherwise
+    nn.activation_function = 'sigm'; % 'sigm', 'tanh_opt'
     nn.learningRate = 0.1; % learning rate
     nn.momentum = 0.5; % Momentum
     nn.weightPenaltyL2 = 0; % L2 regularization
@@ -16,13 +18,9 @@
     nn.testing = 0; % Internal variable. nntest sets this to one.
     nn.output = 'sigm'; % output unit 'sigm' (=logistic), 'softmax' and 'linear'
 
-    for i = 2 : nn.n
-        % biases and bias momentum
-        nn.b{i - 1} = zeros(nn.size(i), 1);
-        nn.vb{i - 1} = zeros(size(nn.b{i - 1}));
-
+    for i = 2 : nn.n
         % weights and weight momentum
-        nn.W{i - 1} = (rand(nn.size(i), nn.size(i - 1)) - 0.5) * 2 * 4 * sqrt(6 / (nn.size(i) + nn.size(i - 1)));
+        nn.W{i - 1} = (rand(nn.size(i), nn.size(i - 1)+1) - 0.5) * 2 * 4 * sqrt(6 / (nn.size(i) + nn.size(i - 1)));
         nn.vW{i - 1} = zeros(size(nn.W{i - 1}));
 
         % average activations (for use with sparsity)
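
As a concrete illustration (not part of the commit), the extra column changes the weight shapes; for the [784 100 10] network used in the tests:

% Expected weight shapes after this change; first column holds the biases.
nn = nnsetup([784 100 10]);   % nnsetup as modified above
size(nn.W{1})                 % [100 785]: 784 inputs + 1 bias column
size(nn.W{2})                 % [10 101]:  100 hidden units + 1 bias column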

NN/nntest.m (8 additions, 0 deletions)

@@ -1,4 +1,12 @@
 function [er, bad] = nntest(nn, x, y)
+
+    m = size(x, 1);
+    if nn.normalize_input==1
+        x=zscore(x);
+    end
+    % Compact notation to include biases in the weight vector
+    x=[ones(m,1) x];
+
     nn.testing = 1;
     nn = nnff(nn, x, y);
     nn.testing = 0;

NN/nntrain.m (7 additions, 0 deletions)

@@ -9,6 +9,13 @@
     assert(isfloat(x), 'x must be a float');
     m = size(x, 1);
 
+    if nn.normalize_input==1
+        x=zscore(x);
+    end
+
+    % Compact notation to include biases in the weight vector
+    x=[ones(m,1) x];
+
     batchsize = opts.batchsize;
     numepochs = opts.numepochs;
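
The new preprocessing standardizes each input column and then prepends the ones column that carries the bias. A small illustrative example (not part of the commit; made-up data, and assuming zscore from the Statistics Toolbox is available):

x  = [1 10; 2 20; 3 30];            % 3 samples, 2 features
xz = zscore(x);                     % each column: zero mean, unit std
mean(xz)                            % ~[0 0]
std(xz)                             % ~[1 1]
x_aug = [ones(size(xz,1),1) xz];    % what nntrain/nntest pass on to nnff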

tests/test_example_NN.m (10 additions, 1 deletion)

@@ -10,7 +10,14 @@
 rng(0);
 nn = nnsetup([784 100 10]);
 
-nn.learningRate = 1;  % Learning rate
+nn.activation_function='tanh_opt';
+if strcmp(nn.activation_function,'sigm') == 1
+    nn.learningRate = 1;
+elseif strcmp(nn.activation_function,'tanh_opt') == 1
+    nn.learningRate = 3;
+    nn.normalize_input = 1;
+end
+
 opts.numepochs = 1;  % Number of full sweeps through data
 opts.batchsize = 100;  % Take a mean gradient step over this many samples
 opts.silent = 1;
@@ -19,6 +26,7 @@
 [er, bad] = nntest(nn, test_x, test_y);
 assert(er < 0.1, 'Too big error');
 
+
 %% ex2 neural net with L2 weight decay
 rng(0);
 nn = nnsetup([784 100 10]);
@@ -33,6 +41,7 @@
 [er, bad] = nntest(nn, test_x, test_y);
 assert(er < 0.1, 'Too big error');
 
+
 %% ex3 neural net with dropout
 rng(0);
 nn = nnsetup([784 100 10]);

tests/test_nn_gradients_are_numerically_correct.m (3 additions, 0 deletions)

@@ -2,8 +2,11 @@
 batch_x = rand(20, 5);
 batch_y = rand(20, 2);
 
+batch_x=[ones(size(batch_x,1),1) batch_x];
+
 nn = nnsetup([5 3 2]);
 nn.output='sigm';
+nn.activation_function='tanh_opt';
 nn = nnff(nn, batch_x, batch_y);
 nn = nnbp(nn);
 nnchecknumgrad(nn, batch_x, batch_y);

util/tanh_opt.m (3 additions, 0 deletions)

@@ -0,0 +1,3 @@
+function f=tanh_opt(A)
+    f=1.7159*tanh(2/3.*A);
+end
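
The constants 1.7159 and 2/3 follow the recommendation in LeCun's "Efficient BackProp": they are chosen so the activation maps ±1 to roughly ±1, which keeps unit outputs in a useful range when inputs are normalized. A quick check (illustrative, not part of the commit):

tanh_opt(1)     % ≈  1.0000
tanh_opt(-1)    % ≈ -1.0000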
