
Commit 9f8da27

implement dropout
1 parent d3de998 commit 9f8da27

File tree

10 files changed: +277 -8 lines changed


include/dropout.h

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
/*
    Copyright (c) 2013, Taiga Nomi
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the <organization> nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
    EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "layer.h"
#include "product.h"
#include "util.h"

namespace tiny_cnn {

// pass-through filter: the default, a no-op for layers without dropout
class filter_none {
public:
    explicit filter_none(int out_dim) {
        CNN_UNREFERENCED_PARAMETER(out_dim);
    }

    const vec_t& filter_fprop(const vec_t& out, int index) {
        CNN_UNREFERENCED_PARAMETER(index);
        return out;
    }

    const vec_t& filter_bprop(const vec_t& delta, int index) {
        CNN_UNREFERENCED_PARAMETER(index);
        return delta;
    }
};

class dropout {
public:
    enum context {
        train_phase,
        test_phase
    };

    enum dropout_mode {
        per_data,  // resample the mask for every training sample
        per_batch  // resample the mask once per weight update
    };

    explicit dropout(int out_dim)
        : out_dim_(out_dim), mask_(out_dim), ctx_(train_phase), mode_(per_data), dropout_rate_(0.5) {
        for (int i = 0; i < CNN_TASK_SIZE; i++) {
            masked_out_[i].resize(out_dim);
            masked_delta_[i].resize(out_dim);
        }
        shuffle();
    }

    void set_dropout_rate(double rate) {
        if (rate < 0.0 || rate >= 1.0)
            throw nn_error("0.0 <= dropout-rate < 1.0");
        dropout_rate_ = rate;
    }

    void set_mode(dropout_mode mode) {
        mode_ = mode;
    }

    void set_context(context ctx) {
        ctx_ = ctx;
    }

    // mask the output vector
    const vec_t& filter_fprop(const vec_t& out, int index) {
        if (ctx_ == train_phase) {
            for (int i = 0; i < out_dim_; i++)
                masked_out_[index][i] = out[i] * mask_[i];
        }
        else if (ctx_ == test_phase) {
            // scale by the retention probability so the expected activation
            // matches the train phase (units are kept with probability 1 - dropout_rate_)
            for (int i = 0; i < out_dim_; i++)
                masked_out_[index][i] = out[i] * (1.0 - dropout_rate_);
        }
        else {
            throw nn_error("invalid context");
        }
        return masked_out_[index];
    }

    // mask the delta
    const vec_t& filter_bprop(const vec_t& delta, int index) {
        for (int i = 0; i < out_dim_; i++)
            masked_delta_[index][i] = delta[i] * mask_[i];

        if (mode_ == per_data) shuffle();

        return masked_delta_[index];
    }

    void shuffle() {
        for (auto& m : mask_)
            m = bernoulli(1.0 - dropout_rate_);
    }

    void end_batch() {
        if (mode_ == per_batch) shuffle();
    }

private:
    int out_dim_;
    std::vector<uint8_t> mask_;
    vec_t masked_out_[CNN_TASK_SIZE];
    vec_t masked_delta_[CNN_TASK_SIZE];
    context ctx_;
    dropout_mode mode_;
    double dropout_rate_;
};

} // namespace tiny_cnn
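
The scaling convention is easy to sanity-check: shuffle() keeps each unit with probability 1 - dropout_rate_, so the expected train-phase output is out[i] * (1 - dropout_rate_), which is exactly what the test phase computes deterministically. Below is a minimal standalone sketch of that identity; it uses plain C++11 <random> instead of the library's uniform_rand and is illustrative only, not part of the commit.

#include <iostream>
#include <random>

int main() {
    const double dropout_rate = 0.3;
    std::mt19937 gen(0);
    std::bernoulli_distribution keep(1.0 - dropout_rate); // retention probability

    const double out = 2.0;  // some fixed activation value
    const int trials = 100000;
    double sum = 0.0;
    for (int i = 0; i < trials; i++)
        sum += keep(gen) ? out : 0.0;  // train-phase masking

    std::cout << "train-phase mean: " << sum / trials                  // ~1.4
              << ", test-phase output: " << out * (1.0 - dropout_rate) // 1.4
              << std::endl;
}
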
include/fully_connected_dropout_layer.h

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
/*
    Copyright (c) 2013, Taiga Nomi
    All rights reserved.

    (BSD 3-clause license text, identical to the header of include/dropout.h above)
*/
#pragma once
#include "fully_connected_layer.h"

namespace tiny_cnn {

template<typename N, typename Activation>
class fully_connected_dropout_layer : public fully_connected_layer<N, Activation, dropout> {
public:
    typedef fully_connected_layer<N, Activation, dropout> Base;
    typedef typename Base::Optimizer Optimizer;

    fully_connected_dropout_layer(int in_dim, int out_dim, dropout::dropout_mode mode = dropout::per_data)
        : Base(in_dim, out_dim)
    {
        this->filter_.set_mode(mode);
    }

    void set_dropout_rate(double rate) {
        this->filter_.set_dropout_rate(rate);
    }

    /**
     * set dropout context (training phase or test phase)
     **/
    void set_context(dropout::context ctx) {
        this->filter_.set_context(ctx);
    }

private:
    // resample the per-batch mask after each weight update
    void post_update() {
        this->filter_.end_batch();
    }
};

} // namespace tiny_cnn
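
A sketch of how a caller would combine a custom rate with the per_batch mode; the layer sizes here are illustrative, and the training code is elided just as in sample4_dropout (src/main.cpp) below.

#include "tiny_cnn.h"
using namespace tiny_cnn;

void dropout_per_batch_sketch() {
    typedef network<mse, gradient_descent> Network;
    Network nn;

    fully_connected_dropout_layer<Network, tanh_activation> hidden(28 * 28, 300, dropout::per_batch);
    fully_connected_layer<Network, tanh_activation> output(300, 10);
    hidden.set_dropout_rate(0.4);  // keep each hidden unit with probability 0.6

    nn.add(&hidden);
    nn.add(&output);

    // ... train; per_batch resamples the mask via post_update() once per weight update

    // switch to deterministic scaling before evaluation
    hidden.set_context(dropout::test_phase);
}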

include/fully_connected_layer.h

Lines changed: 16 additions & 8 deletions
@@ -27,17 +27,19 @@
 #pragma once
 #include "layer.h"
 #include "product.h"
+#include "dropout.h"

 namespace tiny_cnn {

 // normal
-template<typename N, typename Activation>
+template<typename N, typename Activation, typename Filter = filter_none>
 class fully_connected_layer : public layer<N, Activation> {
 public:
     typedef layer<N, Activation> Base;
     typedef typename Base::Optimizer Optimizer;

-    fully_connected_layer(int in_dim, int out_dim) : layer<N, Activation>(in_dim, out_dim, in_dim * out_dim, out_dim) {}
+    fully_connected_layer(int in_dim, int out_dim)
+        : layer<N, Activation>(in_dim, out_dim, in_dim * out_dim, out_dim), filter_(out_dim) {}

     int connection_size() const {
         return this->in_size_ * this->out_size_ + this->out_size_;
@@ -58,10 +60,13 @@ class fully_connected_layer : public layer<N, Activation> {
             this->output_[index][r] = this->a_.f(z);
         }

-        return this->next_ ? this->next_->forward_propagation(this->output_[index], index) : this->output_[index];
+        auto& this_out = this->filter_.filter_fprop(this->output_[index], index);
+
+        return this->next_ ? this->next_->forward_propagation(this_out, index) : this_out;
     }

     const vec_t& back_propagation(const vec_t& current_delta, int index) {
+        const vec_t& curr_delta = this->filter_.filter_bprop(current_delta, index);
         const vec_t& prev_out = this->prev_->output(index);
         const activation& prev_h = this->prev_->activation_function();
         vec_t& prev_delta = this->prev_delta_[index];
@@ -73,7 +78,7 @@ class fully_connected_layer : public layer<N, Activation> {
             //for (int r = 0; r < this->out_size_; r++)
             //    prev_delta[c] += current_delta[r] * this->W_[c*this->out_size_+r];

-            prev_delta[c] = vectorize::dot(&current_delta[0], &this->W_[c*this->out_size_], this->out_size_);
+            prev_delta[c] = vectorize::dot(&curr_delta[0], &this->W_[c*this->out_size_], this->out_size_);
             prev_delta[c] *= prev_h.df(prev_out[c]);
         }

@@ -83,11 +88,11 @@ class fully_connected_layer : public layer<N, Activation> {
                 dW[c*this->out_size_+i] += current_delta[i] * prev_out[c];*/

             for (int c = 0; c < this->in_size_; c++) {
-                vectorize::muladd(&current_delta[0], prev_out[c], r.end() - r.begin(), &dW[c*this->out_size_ + r.begin()]);
+                vectorize::muladd(&curr_delta[0], prev_out[c], r.end() - r.begin(), &dW[c*this->out_size_ + r.begin()]);
             }

             for (int i = r.begin(); i < r.end(); i++)
-                db[i] += current_delta[i];
+                db[i] += curr_delta[i];
         });

         return this->prev_->back_propagation(this->prev_delta_[index], index);
@@ -101,7 +106,7 @@ class fully_connected_layer : public layer<N, Activation> {

         for (int c = 0; c < this->in_size_; c++)
             for (int r = 0; r < this->out_size_; r++)
-                this->Whessian_[c*this->out_size_+r] += current_delta2[r] * prev_out[c] * prev_out[c];
+                this->Whessian_[c*this->out_size_ + r] += current_delta2[r] * prev_out[c] * prev_out[c];

         for (int r = 0; r < this->out_size_; r++)
             this->bhessian_[r] += current_delta2[r];
@@ -110,13 +115,16 @@ class fully_connected_layer : public layer<N, Activation> {
             this->prev_delta2_[c] = 0.0;

             for (int r = 0; r < this->out_size_; r++)
-                this->prev_delta2_[c] += current_delta2[r] * this->W_[c*this->out_size_+r] * this->W_[c*this->out_size_+r];
+                this->prev_delta2_[c] += current_delta2[r] * this->W_[c*this->out_size_ + r] * this->W_[c*this->out_size_ + r];

             this->prev_delta2_[c] *= prev_h.df(prev_out[c]) * prev_h.df(prev_out[c]);
         }

         return this->prev_->back_propagation_2nd(this->prev_delta2_);
     }
+
+protected:
+    Filter filter_;
 };

 } // namespace tiny_cnn
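
The Filter template parameter acts as a compile-time policy: the default filter_none hands back the vectors untouched, so plain fully connected layers pay nothing for the hook, while fully_connected_dropout_layer instantiates the same layer code with the dropout filter. Because filter_bprop runs before the gradient accumulation, dropped units contribute neither to prev_delta nor to dW and db.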

include/layer.h

Lines changed: 4 additions & 0 deletions
@@ -96,6 +96,9 @@ class layer_base {
     virtual const vec_t& back_propagation(const vec_t& current_delta, int worker_index) = 0;
     virtual const vec_t& back_propagation_2nd(const vec_t& current_delta2) = 0;

+    // called after updating the weights
+    virtual void post_update() {}
+
     layer_base<N>* next() { return next_; }
     layer_base<N>* prev() { return prev_; }

@@ -116,6 +119,7 @@ class layer_base {
             o->update(db_[0][i], bhessian_[i], &b_[i]);

         clear_diff(worker_size);
+        post_update();
     }

     vec_t& weight_diff(int index) { return dW_[index]; }
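
post_update() fires only after the optimizer has consumed the accumulated gradients and clear_diff() has run, which is exactly the point at which dropout's per_batch mode wants to resample its mask (see end_batch() in include/dropout.h above).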

include/tiny_cnn.h

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@
 #include "average_pooling_layer.h"
 #include "convolutional_layer.h"
 #include "fully_connected_layer.h"
+#include "fully_connected_dropout_layer.h"

 #include "activation_function.h"
 #include "loss_function.h"

include/util.h

Lines changed: 4 additions & 0 deletions
@@ -94,6 +94,10 @@ inline T uniform_rand(T min, T max) {
     return dst(gen);
 }

+inline bool bernoulli(double p) {
+    return uniform_rand(0.0, 1.0) <= p;
+}
+
 template<typename Iter>
 void uniform_rand(Iter begin, Iter end, float_t min, float_t max) {
     for (Iter it = begin; it != end; ++it)
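
Since uniform_rand(0.0, 1.0) draws from a continuous uniform distribution, the comparison returns true with probability p (the boundary value contributes nothing). A quick sanity check, illustrative only and assuming util.h and <iostream> are included:

int hits = 0;
for (int i = 0; i < 1000; i++)
    if (tiny_cnn::bernoulli(0.3)) hits++;
std::cout << hits << " / 1000" << std::endl;  // expect a count near 300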

src/main.cpp

Lines changed: 31 additions & 0 deletions
@@ -35,6 +35,7 @@
 void sample1_convnet();
 void sample2_mlp();
 void sample3_dae();
+void sample4_dropout();

 using namespace tiny_cnn;

@@ -219,3 +220,33 @@ void sample3_dae()
     nn.train(train_data_corrupted, train_data_original);
 }

+///////////////////////////////////////////////////////////////////////////////
+// dropout-learning
+
+void sample4_dropout()
+{
+    typedef network<mse, gradient_descent> Network;
+    Network nn;
+    int input_dim = 10;
+    int hidden_units = 100;
+    int output_dim = 10;
+
+    fully_connected_dropout_layer<Network, tanh_activation> f1(input_dim, hidden_units, dropout::per_data);
+    fully_connected_layer<Network, tanh_activation> f2(hidden_units, output_dim);
+    nn.add(&f1); nn.add(&f2);
+
+    std::vector<vec_t> train_data, test_data;
+    std::vector<label_t> train_label, test_label;
+
+    // load train-data and label-data here (omitted)
+
+    // learning
+    nn.train(train_data, train_label);
+
+    // change context to enable all hidden units
+    f1.set_context(dropout::test_phase);
+
+    tiny_cnn::result res = nn.test(test_data, test_label);
+
+    std::cout << res.num_success << "/" << res.num_total << std::endl;
+}

src/test.cpp

Lines changed: 24 additions & 0 deletions
@@ -440,6 +440,30 @@ TEST(multi_layer, gradient_check3) { // mixture - mse
     EXPECT_TRUE(nn.gradient_check(&a, &t, 1, 1e-4, GRAD_CHECK_RANDOM));
 }

+TEST(multi_layer4, gradient_check) { // sigmoid - cross-entropy
+    typedef cross_entropy loss_func;
+    typedef sigmoid_activation activation;
+    typedef network<loss_func, gradient_descent_levenberg_marquardt> network;
+
+    network nn;
+    fully_connected_layer<network, activation> l1(10, 14 * 14 * 3);
+    convolutional_layer<network, activation> l2(14, 14, 5, 3, 6);
+    average_pooling_layer<network, activation> l3(10, 10, 6, 2);
+    fully_connected_dropout_layer<network, activation> l4(5 * 5 * 6, 3);
+
+    nn.add(&l1);
+    nn.add(&l2);
+    nn.add(&l3);
+    nn.add(&l4);
+
+    vec_t a(10, 0.0);
+    label_t t = 2;
+
+    uniform_rand(a.begin(), a.end(), -1, 1);
+    nn.init_weight();
+    EXPECT_TRUE(nn.gradient_check(&a, &t, 1, 1e-4, GRAD_CHECK_RANDOM));
+}
+
 template <typename N>
 void serialization_test(const layer_base<N>& src, layer_base<N>& dst)
 {
