
Commit 295c204

implement gradient checking
1 parent 5f45cdc commit 295c204

4 files changed (+120, -16 lines)


include/activation.h

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ class sigmoid_activation : public activation {
 class rectified_linear : public activation {
 public:
     float_t f(float_t x) const { return std::max(0.0, x); }
-    float_t df(float_t f_x) const { return f_x == 0.0 ? 0.0 : 1.0; }
+    float_t df(float_t f_x) const { return f_x > 0.0 ? 1.0 : 0.0; }
     std::pair<float_t, float_t> scale() const { return std::make_pair(0.1, 0.9); }
 };
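Note that df receives the already-computed activation value f(x) rather than x itself; since f(x) = max(0, x) is never negative, the rewritten condition states the intended step function directly (1 where the unit is active, 0 otherwise) instead of relying on an exact equality test against 0.0. A minimal standalone sketch, using illustrative names (relu_f, relu_df) rather than the library's class, that compares this derivative with a central difference away from the kink at x = 0:

    #include <algorithm>
    #include <cassert>
    #include <cmath>

    typedef double float_t;   // assumption: matches the 0.0 / 1.0 literals used above

    float_t relu_f(float_t x)    { return std::max(0.0, x); }
    float_t relu_df(float_t f_x) { return f_x > 0.0 ? 1.0 : 0.0; }   // takes f(x), not x

    int main() {
        const float_t eps = 1e-6;
        const float_t xs[] = { -2.0, -0.5, 0.5, 3.0 };   // skip x == 0, where ReLU has a kink
        for (int i = 0; i < 4; i++) {
            float_t numeric = (relu_f(xs[i] + eps) - relu_f(xs[i] - eps)) / (2.0 * eps);
            assert(std::abs(relu_df(relu_f(xs[i])) - numeric) < 1e-4);
        }
        return 0;
    }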

include/layer.h

Lines changed: 8 additions & 9 deletions
@@ -113,9 +113,15 @@ class layer_base {
         clear_diff(worker_size);
     }

+    vec_t& weight_diff(int index) { return dW_[index]; }
+    vec_t& bias_diff(int index) { return db_[index]; }

-    vec_t& get_weight(int index) { return dW_[index]; }
-    vec_t& get_bias(int index) { return db_[index]; }
+    void clear_diff(int worker_size) {
+        for (int i = 0; i < worker_size; i++) {
+            std::fill(dW_[i].begin(), dW_[i].end(), 0.0);
+            std::fill(db_[i].begin(), db_[i].end(), 0.0);
+        }
+    }

 protected:
     int in_size_;

@@ -135,13 +141,6 @@ class layer_base {
     vec_t prev_delta2_; // d^2E/da^2

 private:
-    void clear_diff(int worker_size) {
-        for (int i = 0; i < worker_size; i++) {
-            std::fill(dW_[i].begin(), dW_[i].end(), 0.0);
-            std::fill(db_[i].begin(), db_[i].end(), 0.0);
-        }
-    }
-
     void merge(int worker_size, int batch_size) {
         for (int i = 1; i < worker_size; i++) {
             std::transform(dW_[0].begin(), dW_[0].end(), dW_[i].begin(), dW_[0].begin(), std::plus<float_t>());
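The renamed accessors weight_diff / bias_diff make explicit that they return the gradient buffers dW_ / db_ rather than the weights themselves, and clear_diff moves from the private section into the public interface; the new gradient check in network.h reads these buffers through weight_diff(0) / bias_diff(0). For context, a standalone sketch, with illustrative names rather than the library's types, of the per-worker buffers that clear_diff zeroes and that merge later sums into worker 0:

    #include <algorithm>
    #include <functional>
    #include <vector>

    typedef double float_t;
    typedef std::vector<float_t> vec_t;

    int main() {
        const int worker_size = 4, num_weights = 8;
        std::vector<vec_t> dW(worker_size, vec_t(num_weights, 0.0));

        // what clear_diff(worker_size) does: zero every worker's gradient buffer
        for (int i = 0; i < worker_size; i++)
            std::fill(dW[i].begin(), dW[i].end(), 0.0);

        // ... each worker then accumulates its own gradients during back-propagation ...

        // what merge() does after a mini-batch: sum the worker buffers into dW[0]
        for (int i = 1; i < worker_size; i++)
            std::transform(dW[0].begin(), dW[0].end(), dW[i].begin(),
                           dW[0].begin(), std::plus<float_t>());
        return 0;
    }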

include/network.h

Lines changed: 104 additions & 5 deletions
@@ -83,6 +83,11 @@ struct result {
     std::map<label_t, std::map<label_t, int> > confusion_matrix;
 };

+enum grad_check_mode {
+    GRAD_CHECK_ALL,    ///< check all elements of weights
+    GRAD_CHECK_FIRST,  ///< check first element of weights
+    GRAD_CHECK_RANDOM  ///< check 10 randomly selected weights
+};

 template<typename L, typename U>
 class network {
@@ -142,8 +147,7 @@ class network {

     template<typename T>
     void train(const std::vector<vec_t>& in, const std::vector<T>& t, size_t batch_size = 1, int epoch = 1) {
-        init_weight();
-        train(in, t, epoch, batch_size, nop, nop);
+        train(in, t, batch_size, epoch, nop, nop);
     }

     result test(const std::vector<vec_t>& in, const std::vector<label_t>& t) {
@@ -163,15 +167,99 @@
         return test_result;
     }

+
+    float_t calc_delta_diff(const vec_t* in, const vec_t* v, int data_size, vec_t& w, vec_t& dw, int check_index) {
+        static const float_t delta = 1e-10;
+
+        std::fill(dw.begin(), dw.end(), 0.0);
+
+        // calculate dw/dE by bprop
+        for (int i = 0; i < data_size; i++) {
+            const vec_t& out = forward_propagation(in[i]);
+            back_propagation(out, v[i]);
+        }
+        float_t delta_by_bprop = dw[check_index];
+
+        // calculate dw/dE by numeric
+        float_t prev_w = w[check_index];
+        w[check_index] = prev_w + delta;
+        float_t f_p = 0.0;
+        for (int i = 0; i < data_size; i++) {
+            const vec_t& out = forward_propagation(in[i]);
+            f_p += get_loss(out, v[i]);
+        }
+
+        float_t f_m = 0.0;
+        w[check_index] = prev_w - delta;
+        for (int i = 0; i < data_size; i++) {
+            const vec_t& out = forward_propagation(in[i]);
+            f_m += get_loss(out, v[i]);
+        }
+
+        float_t delta_by_numerical = (f_p - f_m) / (2.0 * delta);
+
+        w[check_index] = prev_w;
+
+        return std::abs(delta_by_bprop - delta_by_numerical);
+    }
+
+    bool gradient_check(const vec_t* in, const label_t* t, int data_size, float_t eps, grad_check_mode mode = GRAD_CHECK_FIRST) {
+        assert(!layers_.empty());
+        std::vector<vec_t> v;
+        label2vector(t, data_size, &v);
+
+        auto current = layers_.head();
+
+        while (current = current->next()) { // ignore first input layer
+            vec_t& w = current->weight();
+            vec_t& b = current->bias();
+            vec_t& dw = current->weight_diff(0);
+            vec_t& db = current->bias_diff(0);
+
+            if (w.empty()) continue;
+
+            switch (mode) {
+            case GRAD_CHECK_ALL:
+                for (int i = 0; i < (int)w.size(); i++) {
+                    if (calc_delta_diff(in, &v[0], data_size, w, dw, i) > eps) return false;
+                }
+                for (int i = 0; i < (int)b.size(); i++) {
+                    if (calc_delta_diff(in, &v[0], data_size, b, db, i) > eps) return false;
+                }
+                break;
+            case GRAD_CHECK_FIRST:
+                if (calc_delta_diff(in, &v[0], data_size, w, dw, 0) > eps) return false;
+                if (calc_delta_diff(in, &v[0], data_size, b, db, 0) > eps) return false;
+                break;
+            case GRAD_CHECK_RANDOM:
+                for (int i = 0; i < 10; i++) {
+                    int index = uniform_rand(0, (int)w.size() - 1);
+                    if (calc_delta_diff(in, &v[0], data_size, w, dw, index) > eps) return false;
+                }
+                for (int i = 0; i < 10; i++) {
+                    int index = uniform_rand(0, (int)b.size() - 1);
+                    if (calc_delta_diff(in, &v[0], data_size, b, db, index) > eps) return false;
+                }
+                break;
+            default:
+                throw nn_error("unknown grad-check type");
+            }
+
+        }
+        return true;
+    }
+
 private:

     void label2vector(const label_t* t, int num, std::vector<vec_t> *vec) const {
-        assert(num > 0);
         int outdim = out_dim();

-        vec->reserve(num);
+        assert(num > 0);
+        assert(outdim > 0);

+        vec->reserve(num);
         for (int i = 0; i < num; i++) {
+            assert(t[i] < outdim);
             vec->emplace_back(outdim, target_value_min());
             vec->back()[t[i]] = target_value_max();
         }
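calc_delta_diff checks a single weight or bias element: it first accumulates dE/dw by running forward_propagation and back_propagation over the given samples, then re-estimates the same derivative with the symmetric difference quotient (E(w + delta) - E(w - delta)) / (2 * delta) around the current value, and returns the absolute gap between the two estimates. gradient_check applies this to every element, only the first element, or ten randomly chosen elements per layer, and reports failure as soon as the gap exceeds eps. A self-contained sketch of the same comparison on a toy one-parameter model (plain C++, no library types, all names illustrative):

    #include <cassert>
    #include <cmath>
    #include <iostream>

    // toy model: out = sigmoid(w * x), loss E = (out - t)^2 / 2
    double loss(double w, double x, double t) {
        double out = 1.0 / (1.0 + std::exp(-w * x));
        return (out - t) * (out - t) / 2.0;
    }

    // analytic dE/dw, i.e. what back_propagation computes in the library
    double analytic_grad(double w, double x, double t) {
        double out = 1.0 / (1.0 + std::exp(-w * x));
        return (out - t) * out * (1.0 - out) * x;
    }

    int main() {
        const double w = 0.7, x = 1.3, t = 1.0, delta = 1e-6;

        // central difference, i.e. what the f_p / f_m loops above compute
        double numerical = (loss(w + delta, x, t) - loss(w - delta, x, t)) / (2.0 * delta);
        double gap = std::abs(analytic_grad(w, x, t) - numerical);

        std::cout << "gradient gap: " << gap << std::endl;
        assert(gap < 1e-6);   // the two estimates should agree closely
        return 0;
    }

Note that the step size is a trade-off: a smaller delta reduces truncation error but makes f_p - f_m increasingly dominated by floating-point cancellation, so eps has to be chosen with the fixed delta = 1e-10 used above in mind.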

@@ -215,7 +303,6 @@
         }
     }

-
     void calc_hessian(const std::vector<vec_t>& in, int size_initialize_hessian = 500) {
         int size = std::min((int)in.size(), size_initialize_hessian);
@@ -240,6 +327,18 @@
         return layers_.head()->forward_propagation(in, idx);
     }

+    float_t get_loss(const vec_t& out, const vec_t& t) {
+        int dim = out.size();
+        float_t e = 0.0;
+
+        assert(dim == (int)t.size());
+
+        for (int i = 0; i < dim; i++)
+            e += E_.f(out[i], t[i]);
+
+        return e;
+    }
+
     void back_propagation_2nd(const vec_t& out) {
         vec_t delta(out_dim());
         const activation& h = layers_.tail()->activation_function();
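get_loss sums the per-output loss E_.f(out[i], t[i]), so the numerical side of the gradient check evaluates exactly the objective that back-propagation differentiates. A hedged usage sketch of the new public API; the network construction and the library includes are assumed to exist elsewhere, and check_gradients is an illustrative helper rather than part of the library:

    #include <iostream>
    #include <vector>

    // Network stands for whatever instantiation of the network<L, U> template is in use.
    template <typename Network>
    bool check_gradients(Network& nn,
                         const std::vector<vec_t>& data,
                         const std::vector<label_t>& labels) {
        // eps: allowed gap between the bprop gradient and the central-difference
        // estimate; GRAD_CHECK_RANDOM samples 10 weights and 10 biases per layer.
        const float_t eps = 1e-4;
        bool ok = nn.gradient_check(&data[0], &labels[0], (int)data.size(),
                                    eps, GRAD_CHECK_RANDOM);
        if (!ok)
            std::cout << "bprop and numerical gradients disagree" << std::endl;
        return ok;
    }

Running such a check on a handful of samples before a long training run is a cheap way to catch sign errors or missing terms in a layer's back_propagation.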

include/util.h

Lines changed: 7 additions & 1 deletion
@@ -80,8 +80,14 @@ inline fixed_point<Q> uniform_rand(fixed_point<Q> min, fixed_point<Q> max) {
     return dst(gen);
 }

+inline int uniform_rand(int min, int max) {
+    static boost::mt19937 gen(0);
+    boost::uniform_smallint<> dst(min, max);
+    return dst(gen);
+}
+
 template<typename T>
-inline double uniform_rand(T min, T max) {
+inline T uniform_rand(T min, T max) {
     static boost::mt19937 gen(0);
     boost::uniform_real<T> dst(min, max);
     return dst(gen);
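The new integer overload is what GRAD_CHECK_RANDOM uses to draw weight indices (boost::uniform_smallint samples the inclusive range [min, max]), and the generic overload now returns T instead of always returning double. An illustrative fragment; the include path is an assumption and namespace qualification is omitted:

    #include "util.h"   // assumed include path for the overloads above

    void example() {
        int    idx = uniform_rand(0, 9);         // int overload: uniform_smallint, inclusive range
        double v   = uniform_rand(-1.0, 1.0);    // template overload: uniform_real<double>, returns T
        (void)idx; (void)v;                      // silence unused-variable warnings
    }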
