
Commit abb5d37

Merge pull request rasmusbergpalm#20 from rasmusbergpalm/master-experimental
Optimal tanh activation functions, bias included in weights, numerically test most permutations of parameters
2 parents: 2966a54 + 5be78ce · commit abb5d37

16 files changed: +181 −112 lines
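
The headline changes: hidden layers now default to a scaled ("optimal") tanh activation, each layer's bias vector is folded into the first column of its weight matrix `nn.W{i}`, and the numerical gradient check is run over most permutations of the parameters. For orientation, here is a minimal sketch of the activation the new code relies on; the constants 1.7159 and 2/3 are taken from the derivative used in `NN/nnbp.m` below, since `util/tanh_opt.m` itself is not part of this diff:

```matlab
function y = tanh_opt(x)
% Scaled hyperbolic tangent, y = 1.7159 * tanh(2/3 * x) (LeCun et al., "Efficient BackProp").
% Sketch only -- the shipped util/tanh_opt.m may differ in details.
    y = 1.7159 * tanh(2/3 .* x);
end
```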

DBN/dbnunfoldtonn.m

Lines changed: 1 addition & 2 deletions
```diff
@@ -9,8 +9,7 @@
     end
     nn = nnsetup(size);
     for i = 1 : numel(dbn.rbm)
-        nn.W{i} = dbn.rbm{i}.W;
-        nn.b{i} = dbn.rbm{i}.c;
+        nn.W{i} = [dbn.rbm{i}.c dbn.rbm{i}.W];
     end
 end
 
```

NN/nnapplygrads.m

Lines changed: 0 additions & 4 deletions
```diff
@@ -11,16 +11,12 @@
         end
 
         dW = nn.learningRate * dW;
-        db = nn.learningRate * nn.db{i};
 
         if(nn.momentum>0)
             nn.vW{i} = nn.momentum*nn.vW{i} + dW;
-            nn.vb{i} = nn.momentum*nn.vb{i} + db;
             dW = nn.vW{i};
-            db = nn.vb{i};
         end
 
         nn.W{i} = nn.W{i} - dW;
-        nn.b{i} = nn.b{i} - db;
     end
 end
```
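
With the bias column living inside `nn.W{i}`, the separate `db`/`vb` updates are redundant, and the remaining update is plain momentum SGD on the combined matrix. An illustrative stand-alone version of that rule for a single layer (variable names mirror the toolbox, but this is a sketch; the full `nnapplygrads.m` also applies L2 weight decay on the lines not shown in this hunk):

```matlab
learningRate = 2; momentum = 0.5;
W  = randn(100, 785);            % [bias, weights] for one 784 -> 100 layer
vW = zeros(size(W));             % momentum buffer
dW = randn(size(W));             % gradient from nnbp (placeholder values)

dW = learningRate * dW;
if momentum > 0
    vW = momentum * vW + dW;     % velocity accumulates the scaled gradient
    dW = vW;
end
W = W - dW;                      % bias and weights move in a single step
```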

NN/nnbp.m

Lines changed: 28 additions & 7 deletions
```diff
@@ -1,26 +1,47 @@
 function nn = nnbp(nn)
 %NNBP performs backpropagation
-% nn = nnbp(nn) returns an neural network structure with updated weight
-% and bias gradients (nn.dW and nn.db)
+% nn = nnbp(nn) returns an neural network structure with updated weights
 
     n = nn.n;
     sparsityError = 0;
     switch nn.output
         case 'sigm'
             d{n} = - nn.e .* (nn.a{n} .* (1 - nn.a{n}));
         case {'softmax','linear'}
-            d{n} = - nn.e;
+            d{n} = - nn.e;
     end
     for i = (n - 1) : -1 : 2
+        % Derivative of the activation function
+        switch nn.activation_function
+            case 'sigm'
+                d_act = nn.a{i} .* (1 - nn.a{i});
+            case 'tanh_opt'
+                d_act = 1.7159 * 2/3 * (1 - 1/(1.7159)^2 * nn.a{i}.^2);
+        end
+
         if(nn.nonSparsityPenalty>0)
             pi = repmat(nn.p{i}, size(nn.a{i}, 1), 1);
-            sparsityError = nn.nonSparsityPenalty * (-nn.sparsityTarget ./ pi + (1 - nn.sparsityTarget) ./ (1 - pi));
+            sparsityError = [zeros(size(nn.a{i},1),1) nn.nonSparsityPenalty * (-nn.sparsityTarget ./ pi + (1 - nn.sparsityTarget) ./ (1 - pi))];
+        end
+
+        % Backpropagate first derivatives
+        if i+1==n % in this case in d{n} there is not the bias term to be removed
+            d{i} = (d{i + 1} * nn.W{i} + sparsityError) .* d_act; % Bishop (5.56)
+        else % in this case in d{i} the bias term has to be removed
+            d{i} = (d{i + 1}(:,2:end) * nn.W{i} + sparsityError) .* d_act;
+        end
+
+        if(nn.dropoutFraction>0)
+            d{i} = d{i} .* [ones(size(d{i},1),1) nn.dropOutMask{i}];
         end
-        d{i} = (d{i + 1} * nn.W{i} + sparsityError) .* (nn.a{i} .* (1 - nn.a{i}));
+
     end
 
     for i = 1 : (n - 1)
-        nn.dW{i} = (d{i + 1}' * nn.a{i}) / size(d{i + 1}, 1);
-        nn.db{i} = sum(d{i + 1}, 1)' / size(d{i + 1}, 1);
+        if i+1==n
+            nn.dW{i} = (d{i + 1}' * nn.a{i}) / size(d{i + 1}, 1);
+        else
+            nn.dW{i} = (d{i + 1}(:,2:end)' * nn.a{i}) / size(d{i + 1}, 1);
+        end
     end
 end
```
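
Because `nn.a{i}` stores the activation f(z) = 1.7159*tanh(2/3*z) rather than the pre-activation z, the new `d_act` line evaluates f'(z) = 1.7159 * 2/3 * (1 - (f(z)/1.7159)^2) directly from the stored activations. A quick finite-difference check of that identity (illustrative only, not part of the toolbox):

```matlab
f      = @(z) 1.7159 * tanh(2/3 .* z);             % tanh_opt
z      = randn(1, 5);
a      = f(z);                                     % what nnff stores in nn.a{i}
d_act  = 1.7159 * 2/3 * (1 - 1/(1.7159)^2 * a.^2); % line from nnbp.m
h      = 1e-6;
numder = (f(z + h) - f(z - h)) / (2 * h);          % central difference
assert(max(abs(d_act - numder)) < 1e-6);
```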

NN/nnchecknumgrad.m

Lines changed: 1 addition & 14 deletions
```diff
@@ -1,6 +1,6 @@
 function nnchecknumgrad(nn, x, y)
     epsilon = 1e-6;
-    er = 1e-8;
+    er = 1e-7;
     n = nn.n;
     for l = 1 : (n - 1)
         for i = 1 : size(nn.W{l}, 1)
@@ -18,18 +18,5 @@ function nnchecknumgrad(nn, x, y)
                 assert(e < er, 'numerical gradient checking failed');
             end
         end
-
-        for i = 1 : size(nn.b{l}, 1)
-            nn_m = nn; nn_p = nn;
-            nn_m.b{l}(i) = nn.b{l}(i) - epsilon;
-            nn_p.b{l}(i) = nn.b{l}(i) + epsilon;
-            rng(0);
-            nn_m = nnff(nn_m, x, y);
-            rng(0);
-            nn_p = nnff(nn_p, x, y);
-            db = (nn_p.L - nn_m.L) / (2 * epsilon);
-            e = abs(db - nn.db{l}(i));
-            assert(e < er, 'numerical gradient checking failed');
-        end
     end
 end
```
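
Since the biases are now the first column of `nn.W{l}`, the deleted loop over `nn.b{l}` is covered by the surviving loop over `nn.W{l}`, which already perturbs the bias entries. The retained per-weight check (the unchanged lines 7–17 are not shown in this hunk) is a central-difference comparison; a self-contained toy illustration of that idea, with a made-up loss standing in for `nnff`:

```matlab
% Central-difference check of dL/dW(i,j) for one parameter of a toy model.
epsilon = 1e-6;
W = randn(3, 4); x = randn(5, 4); y = randn(5, 3);
loss = @(W) 0.5 * sum(sum((y - tanh(x * W')).^2));   % stand-in for nnff's nn.L
a = tanh(x * W');
dW_analytic = -((y - a) .* (1 - a.^2))' * x;         % backprop gradient of the toy loss
i = 2; j = 3;
Wp = W; Wp(i,j) = W(i,j) + epsilon;
Wm = W; Wm(i,j) = W(i,j) - epsilon;
dW_numeric = (loss(Wp) - loss(Wm)) / (2 * epsilon);
assert(abs(dW_numeric - dW_analytic(i,j)) < 1e-6, 'numerical gradient checking failed');
```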

NN/nnff.m

Lines changed: 22 additions & 7 deletions
```diff
@@ -5,39 +5,54 @@
 
     n = nn.n;
     m = size(x, 1);
-
+
+    x = [ones(m,1) x];
     nn.a{1} = x;
 
     %feedforward pass
     for i = 2 : n-1
-        nn.a{i} = sigm(repmat(nn.b{i - 1}', m, 1) + nn.a{i - 1} * nn.W{i - 1}');
+        switch nn.activation_function
+            case 'sigm'
+                % Calculate the unit's outputs (including the bias term)
+                nn.a{i} = sigm(nn.a{i - 1} * nn.W{i - 1}');
+            case 'tanh_opt'
+                nn.a{i} = tanh_opt(nn.a{i - 1} * nn.W{i - 1}');
+        end
+
+        %dropout
         if(nn.dropoutFraction > 0)
             if(nn.testing)
                 nn.a{i} = nn.a{i}.*(1 - nn.dropoutFraction);
             else
-                nn.a{i} = nn.a{i}.*(rand(size(nn.a{i}))>nn.dropoutFraction);
+                nn.dropOutMask{i} = (rand(size(nn.a{i}))>nn.dropoutFraction);
+                nn.a{i} = nn.a{i}.*nn.dropOutMask{i};
             end
         end
+
         %calculate running exponential activations for use with sparsity
         if(nn.nonSparsityPenalty>0)
             nn.p{i} = 0.99 * nn.p{i} + 0.01 * mean(nn.a{i}, 1);
         end
+
+        %Add the bias term
+        nn.a{i} = [ones(m,1) nn.a{i}];
     end
     switch nn.output
         case 'sigm'
-            nn.a{n} = sigm(repmat(nn.b{n - 1}', m, 1) + nn.a{n - 1} * nn.W{n - 1}');
+            nn.a{n} = sigm(nn.a{n - 1} * nn.W{n - 1}');
         case 'linear'
-            nn.a{n} = repmat(nn.b{n - 1}', m, 1) + nn.a{n - 1} * nn.W{n - 1}';
+            nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
         case 'softmax'
-            nn.a{n} = repmat(nn.b{n - 1}', m, 1) + nn.a{n - 1} * nn.W{n - 1}';
+            nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
             nn.a{n} = exp(bsxfun(@minus, nn.a{n}, max(nn.a{n},[],2)));
             nn.a{n} = bsxfun(@rdivide, nn.a{n}, sum(nn.a{n}, 2));
     end
 
     %error and loss
     nn.e = y - nn.a{n};
+
     switch nn.output
-        case {'sigm','linear'}
+        case {'sigm', 'linear'}
             nn.L = 1/2 * sum(sum(nn.e .^ 2)) / m;
         case 'softmax'
             nn.L = -sum(sum(y .* log(nn.a{n}))) / m;
```
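
The `repmat` of the bias vector disappears because `nnff` now prepends a column of ones to each layer's activations, so the bias is applied through the first column of `W`. A tiny equivalence check with made-up sizes (illustrative only):

```matlab
m = 4; vis = 3; hid = 2;
x     = randn(m, vis);
W_old = randn(hid, vis);            % old-style weights
b     = randn(hid, 1);              % old-style bias
W_new = [b W_old];                  % bias folded in, as in nnsetup/dbnunfoldtonn
a_old = repmat(b', m, 1) + x * W_old';
a_new = [ones(m,1) x] * W_new';
assert(max(abs(a_old(:) - a_new(:))) < 1e-12);
```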

NN/nnsetup.m

Lines changed: 15 additions & 17 deletions
```diff
@@ -1,28 +1,26 @@
 function nn = nnsetup(architecture)
 %NNSETUP creates a Feedforward Backpropagate Neural Network
-% nn = nnsetup(size) returns an neural network structure with n=numel(size)
-% layers, size being a n x 1 vector of layer sizes e.g. [784 100 10]
+% nn = nnsetup(architecture) returns an neural network structure with n=numel(architecture)
+% layers, architecture being a n x 1 vector of layer sizes e.g. [784 100 10]
 
     nn.size = architecture;
     nn.n = numel(nn.size);
 
-    nn.learningRate = 0.1;               % learning rate
-    nn.momentum = 0.5;                   % Momentum
-    nn.weightPenaltyL2 = 0;              % L2 regularization
-    nn.nonSparsityPenalty = 0;           % Non sparsity penalty
-    nn.sparsityTarget = 0.05;            % Sparsity target
-    nn.inputZeroMaskedFraction = 0;      % Used for Denoising AutoEncoders
-    nn.dropoutFraction = 0;              % Dropout level (http://www.cs.toronto.edu/~hinton/absps/dropout.pdf)
-    nn.testing = 0;                      % Internal variable. nntest sets this to one.
-    nn.output = 'sigm';                  % output unit 'sigm' (=logistic), 'softmax' and 'linear'
+    nn.normalize_input = 1;              % normalize input elements to be between [-1 1]. Note: use a linear output function if training auto-encoders with normalized inputs
+    nn.activation_function = 'tanh_opt'; % Activation functions of hidden layers: 'sigm' (sigmoid) or 'tanh_opt' (optimal tanh).
+    nn.learningRate = 2;                 % learning rate Note: typically needs to be lower when using 'sigm' activation function and non-normalized inputs.
+    nn.momentum = 0.5;                   % Momentum
+    nn.weightPenaltyL2 = 0;              % L2 regularization
+    nn.nonSparsityPenalty = 0;           % Non sparsity penalty
+    nn.sparsityTarget = 0.05;            % Sparsity target
+    nn.inputZeroMaskedFraction = 0;      % Used for Denoising AutoEncoders
+    nn.dropoutFraction = 0;              % Dropout level (http://www.cs.toronto.edu/~hinton/absps/dropout.pdf)
+    nn.testing = 0;                      % Internal variable. nntest sets this to one.
+    nn.output = 'sigm';                  % output unit 'sigm' (=logistic), 'softmax' and 'linear'
 
-    for i = 2 : nn.n
-        % biases and bias momentum
-        nn.b{i - 1} = zeros(nn.size(i), 1);
-        nn.vb{i - 1} = zeros(size(nn.b{i - 1}));
-
+    for i = 2 : nn.n
        % weights and weight momentum
-        nn.W{i - 1} = (rand(nn.size(i), nn.size(i - 1)) - 0.5) * 2 * 4 * sqrt(6 / (nn.size(i) + nn.size(i - 1)));
+        nn.W{i - 1} = (rand(nn.size(i), nn.size(i - 1)+1) - 0.5) * 2 * 4 * sqrt(6 / (nn.size(i) + nn.size(i - 1)));
         nn.vW{i - 1} = zeros(size(nn.W{i - 1}));
 
         % average activations (for use with sparsity)
```
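
With these defaults a freshly created network uses `tanh_opt` hidden units, z-scored inputs and a learning rate of 2; the README examples below opt back into the old behaviour explicitly. A minimal configuration sketch (values chosen for illustration, mirroring the README):

```matlab
nn = nnsetup([784 100 10]);        % W{i} is now size(i) x (size(i-1)+1); bias sits in column 1
nn.activation_function = 'sigm';   % opt out of the 'tanh_opt' default
nn.normalize_input     = 0;        % skip the zscore step in nntrain/nntest
nn.learningRate        = 1;        % lower rate suggested for sigm + non-normalized inputs
```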

NN/nntest.m

Lines changed: 4 additions & 0 deletions
```diff
@@ -1,4 +1,8 @@
 function [er, bad] = nntest(nn, x, y)
+    if nn.normalize_input==1;
+        x = zscore(x);
+    end
+
     nn.testing = 1;
     nn = nnff(nn, x, y);
     nn.testing = 0;
```

NN/nntrain.m

Lines changed: 4 additions & 0 deletions
```diff
@@ -9,6 +9,10 @@
     assert(isfloat(x), 'x must be a float');
     m = size(x, 1);
 
+    if nn.normalize_input==1
+        x = zscore(x);
+    end
+
     batchsize = opts.batchsize;
     numepochs = opts.numepochs;
 
```
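
When `nn.normalize_input` is 1, both `nntrain` and `nntest` standardize the input with `zscore`, i.e. each column (feature) is shifted to zero mean and scaled to unit standard deviation. Roughly equivalent column-wise code, for reference (a sketch; the zero-std guard is added here so constant features do not produce NaNs):

```matlab
mu    = mean(x, 1);
sigma = std(x, 0, 1);
sigma(sigma == 0) = 1;             % guard: leave constant columns untouched
x     = bsxfun(@rdivide, bsxfun(@minus, x, mu), sigma);
```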

README.md

Lines changed: 39 additions & 18 deletions
````diff
@@ -15,9 +15,14 @@ For a more informal introduction, see the following videos by Geoffrey Hinton an
 * [Recent Developments in Deep Learning](http://www.youtube.com/watch?v=VdIURAu1-aU) (Hinton, 2010)
 * [Unsupervised Feature Learning and Deep Learning](http://www.youtube.com/watch?v=ZmNOAtZIgIk) (Ng, 2011)
 
-If you use this toolbox in your research please cite:
+If you use this toolbox in your research please cite [Prediction as a candidate for learning deep hierarchical models of data](http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6284)
 
-[Prediction as a candidate for learning deep hierarchical models of data](http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6284) (Palm, 2012)
+```
+@MASTERSTHESIS\{IMM2012-06284,
+    author = "R. B. Palm",
+    title = "Prediction as a candidate for learning deep hierarchical models of data",
+    year = "2012",
+```
 
 Directories included in the toolbox
 -----------------------------------
@@ -85,15 +90,16 @@ dbn = dbntrain(dbn, train_x, opts);
 
 %unfold dbn to nn
 nn = dbnunfoldtonn(dbn, 10);
+nn.normalize_input = 0;
+nn.activation_function = 'sigm';
 
 %train nn
-nn.learningRate = 1;
 opts.numepochs = 1;
 opts.batchsize = 100;
 nn = nntrain(nn, train_x, train_y, opts);
 [er, bad] = nntest(nn, test_x, test_y);
 
-assert(er < 0.12, 'Too big error');
+assert(er < 0.10, 'Too big error');
 
 ```
 
@@ -114,25 +120,28 @@ test_y = double(test_y);
 % Setup and train a stacked denoising autoencoder (SDAE)
 rng(0);
 sae = saesetup([784 100]);
+sae.ae{1}.normalize_input = 0;
+sae.ae{1}.activation_function = 'sigm';
 sae.ae{1}.learningRate = 1;
 sae.ae{1}.inputZeroMaskedFraction = 0.5;
 opts.numepochs = 1;
 opts.batchsize = 100;
 sae = saetrain(sae, train_x, opts);
-visualize(sae.ae{1}.W{1}')
+visualize(sae.ae{1}.W{1}(:,2:end)')
 
 % Use the SDAE to initialize a FFNN
 nn = nnsetup([784 100 10]);
+nn.normalize_input = 0;
+nn.activation_function = 'sigm';
+nn.learningRate = 1;
 nn.W{1} = sae.ae{1}.W{1};
-nn.b{1} = sae.ae{1}.b{1};
 
 % Train the FFNN
-nn.learningRate = 1;
 opts.numepochs = 1;
 opts.batchsize = 100;
 nn = nntrain(nn, train_x, train_y, opts);
 [er, bad] = nntest(nn, test_x, test_y);
-assert(er < 0.21, 'Too big error');
+assert(er < 0.16, 'Too big error');
 
 ```
 
@@ -193,44 +202,56 @@ test_y = double(test_y);
 %% ex1 vanilla neural net
 rng(0);
 nn = nnsetup([784 100 10]);
-
-nn.learningRate = 1;   % Learning rate
 opts.numepochs = 1;    % Number of full sweeps through data
 opts.batchsize = 100;  % Take a mean gradient step over this many samples
-opts.silent = 1;
-nn = nntrain(nn, train_x, train_y, opts);
+[nn, L] = nntrain(nn, train_x, train_y, opts);
 
 [er, bad] = nntest(nn, test_x, test_y);
-assert(er < 0.1, 'Too big error');
+
+assert(er < 0.08, 'Too big error');
+
 
 %% ex2 neural net with L2 weight decay
 rng(0);
 nn = nnsetup([784 100 10]);
 
 nn.weightPenaltyL2 = 1e-4;  % L2 weight decay
-nn.learningRate = 1;        % Learning rate
 opts.numepochs = 1;         % Number of full sweeps through data
 opts.batchsize = 100;       % Take a mean gradient step over this many samples
-opts.silent = 1;
+
 nn = nntrain(nn, train_x, train_y, opts);
 
 [er, bad] = nntest(nn, test_x, test_y);
 assert(er < 0.1, 'Too big error');
 
+
 %% ex3 neural net with dropout
 rng(0);
 nn = nnsetup([784 100 10]);
 
 nn.dropoutFraction = 0.5;   % Dropout fraction
-nn.learningRate = 1;        % Learning rate
 opts.numepochs = 1;         % Number of full sweeps through data
 opts.batchsize = 100;       % Take a mean gradient step over this many samples
-opts.silent = 1;
+
 nn = nntrain(nn, train_x, train_y, opts);
 
 [er, bad] = nntest(nn, test_x, test_y);
-assert(er < 0.16, 'Too big error');
+assert(er < 0.1, 'Too big error');
+
+%% ex4 neural net with sigmoid activation function, and without normalizing inputs
+rng(0);
+nn = nnsetup([784 100 10]);
 
+nn.activation_function = 'sigm';  % Sigmoid activation function
+nn.normalize_input = 0;           % Don't normalize inputs
+nn.learningRate = 1;              % Sigm and non-normalized inputs require a lower learning rate
+opts.numepochs = 1;               % Number of full sweeps through data
+opts.batchsize = 100;             % Take a mean gradient step over this many samples
+
+nn = nntrain(nn, train_x, train_y, opts);
+
+[er, bad] = nntest(nn, test_x, test_y);
+assert(er < 0.1, 'Too big error');
 ```
````
README_header.md

Lines changed: 8 additions & 3 deletions
````diff
@@ -14,9 +14,14 @@ For a more informal introduction, see the following videos by Geoffrey Hinton an
 * [Recent Developments in Deep Learning](http://www.youtube.com/watch?v=VdIURAu1-aU) (Hinton, 2010)
 * [Unsupervised Feature Learning and Deep Learning](http://www.youtube.com/watch?v=ZmNOAtZIgIk) (Ng, 2011)
 
-If you use this toolbox in your research please cite:
-
-[Prediction as a candidate for learning deep hierarchical models of data](http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6284) (Palm, 2012)
+If you use this toolbox in your research please cite [Prediction as a candidate for learning deep hierarchical models of data](http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6284)
+
+```
+@MASTERSTHESIS\{IMM2012-06284,
+    author = "R. B. Palm",
+    title = "Prediction as a candidate for learning deep hierarchical models of data",
+    year = "2012",
+```
 
 Directories included in the toolbox
 -----------------------------------
````
