@@ -83,6 +83,11 @@ struct result {
     std::map<label_t, std::map<label_t, int> > confusion_matrix;
 };
 
+enum grad_check_mode {
+    GRAD_CHECK_ALL,    ///< check all elements of weights
+    GRAD_CHECK_FIRST,  ///< check first element of weights
+    GRAD_CHECK_RANDOM  ///< check 10 randomly selected weights
+};
 
 template <typename L, typename U>
 class network {
@@ -142,8 +147,7 @@ class network {
 
     template <typename T>
     void train(const std::vector<vec_t>& in, const std::vector<T>& t, size_t batch_size = 1, int epoch = 1) {
-        init_weight();
-        train(in, t, epoch, batch_size, nop, nop);
+        train(in, t, batch_size, epoch, nop, nop);
     }
 
     result test(const std::vector<vec_t>& in, const std::vector<label_t>& t) {
@@ -163,15 +167,109 @@ class network {
         return test_result;
     }
 
+
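+    // Returns |analytical - numerical| for one weight element. The analytical
+    // gradient dE/dw is taken from back_propagation(); the numerical one uses
+    // the central difference (E(w + delta) - E(w - delta)) / (2 * delta).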
+    float_t calc_delta_diff(const vec_t* in, const vec_t* v, int data_size, vec_t& w, vec_t& dw, int check_index) {
+        static const float_t delta = 1e-10;
+
+        std::fill(dw.begin(), dw.end(), 0.0);
+
+        // calculate dE/dw by backpropagation
+        for (int i = 0; i < data_size; i++) {
+            const vec_t& out = forward_propagation(in[i]);
+            back_propagation(out, v[i]);
+        }
+        float_t delta_by_bprop = dw[check_index];
+
+        // calculate dE/dw numerically (central difference)
+        float_t prev_w = w[check_index];
+        w[check_index] = prev_w + delta;
+        float_t f_p = 0.0;
+        for (int i = 0; i < data_size; i++) {
+            const vec_t& out = forward_propagation(in[i]);
+            f_p += get_loss(out, v[i]);
+        }
+
+        float_t f_m = 0.0;
+        w[check_index] = prev_w - delta;
+        for (int i = 0; i < data_size; i++) {
+            const vec_t& out = forward_propagation(in[i]);
+            f_m += get_loss(out, v[i]);
+        }
+
+        float_t delta_by_numerical = (f_p - f_m) / (2.0 * delta);
+
+        w[check_index] = prev_w;
+
+        return std::abs(delta_by_bprop - delta_by_numerical);
+    }
+
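+    // Checks the gradients produced by back_propagation() against calc_delta_diff(),
+    // returning false as soon as any tested weight/bias gradient differs by more
+    // than eps. A minimal usage sketch (assuming a constructed network `nn` and a
+    // labeled sample `in`/`t`; the tolerance 1e-4 is only an example value):
+    //
+    //   if (!nn.gradient_check(&in[0], &t[0], (int)in.size(), 1e-4, GRAD_CHECK_RANDOM))
+    //       std::cerr << "gradient check failed" << std::endl;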
+    bool gradient_check(const vec_t* in, const label_t* t, int data_size, float_t eps, grad_check_mode mode = GRAD_CHECK_FIRST) {
+        assert(!layers_.empty());
+        std::vector<vec_t> v;
+        label2vector(t, data_size, &v);
+
+        auto current = layers_.head();
+
+        while ((current = current->next()) != 0) { // ignore first input layer
+            vec_t& w = current->weight();
+            vec_t& b = current->bias();
+            vec_t& dw = current->weight_diff(0);
+            vec_t& db = current->bias_diff(0);
+
+            if (w.empty()) continue;
+
+            switch (mode) {
+            case GRAD_CHECK_ALL:
+                for (int i = 0; i < (int)w.size(); i++) {
+                    if (calc_delta_diff(in, &v[0], data_size, w, dw, i) > eps) return false;
+                }
+                for (int i = 0; i < (int)b.size(); i++) {
+                    if (calc_delta_diff(in, &v[0], data_size, b, db, i) > eps) return false;
+                }
+                break;
+            case GRAD_CHECK_FIRST:
+                if (calc_delta_diff(in, &v[0], data_size, w, dw, 0) > eps) return false;
+                if (calc_delta_diff(in, &v[0], data_size, b, db, 0) > eps) return false;
+                break;
+            case GRAD_CHECK_RANDOM:
+                for (int i = 0; i < 10; i++) {
+                    int index = uniform_rand(0, (int)w.size() - 1);
+                    if (calc_delta_diff(in, &v[0], data_size, w, dw, index) > eps) return false;
+                }
+                for (int i = 0; i < 10; i++) {
+                    int index = uniform_rand(0, (int)b.size() - 1);
+                    if (calc_delta_diff(in, &v[0], data_size, b, db, index) > eps) return false;
+                }
+                break;
+            default:
+                throw nn_error("unknown grad-check type");
+            }
+        }
+        return true;
+    }
+
 private:
 
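+    // expands each scalar label into a one-hot target vector of out_dim() elements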
     void label2vector(const label_t* t, int num, std::vector<vec_t>* vec) const {
-        assert(num > 0);
         int outdim = out_dim();
 
-        vec->reserve(num);
+        assert(num > 0);
+        assert(outdim > 0);
 
+        vec->reserve(num);
         for (int i = 0; i < num; i++) {
+            assert(t[i] < outdim);
             vec->emplace_back(outdim, target_value_min());
             vec->back()[t[i]] = target_value_max();
         }
@@ -215,7 +313,6 @@ class network {
         }
     }
 
-
     void calc_hessian(const std::vector<vec_t>& in, int size_initialize_hessian = 500) {
         int size = std::min((int)in.size(), size_initialize_hessian);
 
@@ -240,6 +337,19 @@ class network {
         return layers_.head()->forward_propagation(in, idx);
     }
 
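+    // total loss for one sample: sum of the per-element loss E_.f(out[i], t[i])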
+    float_t get_loss(const vec_t& out, const vec_t& t) {
+        int dim = (int)out.size();
+        float_t e = 0.0;
+
+        assert(dim == (int)t.size());
+
+        for (int i = 0; i < dim; i++)
+            e += E_.f(out[i], t[i]);
+
+        return e;
+    }
+
     void back_propagation_2nd(const vec_t& out) {
         vec_t delta(out_dim());
         const activation& h = layers_.tail()->activation_function();