
Commit 9f8da27

implement dropout
1 parent d3de998 commit 9f8da27

File tree

10 files changed: +277 -8 lines changed


include/dropout.h

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
/*
    Copyright (c) 2013, Taiga Nomi
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the <organization> nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
    EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "layer.h"
#include "product.h"
#include "util.h"

namespace tiny_cnn {

// pass-through filter: the default, a no-op for layers without dropout
class filter_none {
public:
    explicit filter_none(int out_dim) {
        CNN_UNREFERENCED_PARAMETER(out_dim);
    }

    const vec_t& filter_fprop(const vec_t& out, int index) {
        CNN_UNREFERENCED_PARAMETER(index);
        return out;
    }

    const vec_t& filter_bprop(const vec_t& delta, int index) {
        CNN_UNREFERENCED_PARAMETER(index);
        return delta;
    }
};

class dropout {
public:
    enum context {
        train_phase,
        test_phase
    };

    enum dropout_mode {
        per_data,  // resample the mask for every training sample
        per_batch  // resample the mask once per weight update
    };

    explicit dropout(int out_dim)
        : out_dim_(out_dim), mask_(out_dim), ctx_(train_phase), mode_(per_data), dropout_rate_(0.5) {
        for (int i = 0; i < CNN_TASK_SIZE; i++) {
            masked_out_[i].resize(out_dim);
            masked_delta_[i].resize(out_dim);
        }
        shuffle();
    }

    void set_dropout_rate(double rate) {
        if (rate < 0.0 || rate >= 1.0)
            throw nn_error("0.0 <= dropout-rate < 1.0");
        dropout_rate_ = rate;
    }

    void set_mode(dropout_mode mode) {
        mode_ = mode;
    }

    void set_context(context ctx) {
        ctx_ = ctx;
    }

    // mask the output vector
    const vec_t& filter_fprop(const vec_t& out, int index) {
        if (ctx_ == train_phase) {
            for (int i = 0; i < out_dim_; i++)
                masked_out_[index][i] = out[i] * mask_[i];
        }
        else if (ctx_ == test_phase) {
            // scale by the retention probability so the expected activation
            // matches the train phase (units are kept with probability 1 - dropout_rate_)
            for (int i = 0; i < out_dim_; i++)
                masked_out_[index][i] = out[i] * (1.0 - dropout_rate_);
        }
        else {
            throw nn_error("invalid context");
        }
        return masked_out_[index];
    }

    // mask the delta
    const vec_t& filter_bprop(const vec_t& delta, int index) {
        for (int i = 0; i < out_dim_; i++)
            masked_delta_[index][i] = delta[i] * mask_[i];

        if (mode_ == per_data) shuffle();

        return masked_delta_[index];
    }

    void shuffle() {
        for (auto& m : mask_)
            m = bernoulli(1.0 - dropout_rate_);
    }

    void end_batch() {
        if (mode_ == per_batch) shuffle();
    }

private:
    int out_dim_;
    std::vector<uint8_t> mask_;
    vec_t masked_out_[CNN_TASK_SIZE];
    vec_t masked_delta_[CNN_TASK_SIZE];
    context ctx_;
    dropout_mode mode_;
    double dropout_rate_;
};

} // namespace tiny_cnn
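
The scaling convention is easy to sanity-check: shuffle() keeps each unit with probability 1 - dropout_rate_, so the expected train-phase output is out[i] * (1 - dropout_rate_), which is exactly what the test phase computes deterministically. Below is a minimal standalone sketch of that identity; it uses plain C++11 <random> instead of the library's uniform_rand and is illustrative only, not part of the commit.

#include <iostream>
#include <random>

int main() {
    const double dropout_rate = 0.3;
    std::mt19937 gen(0);
    std::bernoulli_distribution keep(1.0 - dropout_rate); // retention probability

    const double out = 2.0;  // some fixed activation value
    const int trials = 100000;
    double sum = 0.0;
    for (int i = 0; i < trials; i++)
        sum += keep(gen) ? out : 0.0;  // train-phase masking

    std::cout << "train-phase mean: " << sum / trials                  // ~1.4
              << ", test-phase output: " << out * (1.0 - dropout_rate) // 1.4
              << std::endl;
}
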
include/fully_connected_dropout_layer.h

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
/*
    Copyright (c) 2013, Taiga Nomi
    All rights reserved.

    (BSD 3-clause license text, identical to the header of include/dropout.h above)
*/
#pragma once
#include "fully_connected_layer.h"

namespace tiny_cnn {

template<typename N, typename Activation>
class fully_connected_dropout_layer : public fully_connected_layer<N, Activation, dropout> {
public:
    typedef fully_connected_layer<N, Activation, dropout> Base;
    typedef typename Base::Optimizer Optimizer;

    fully_connected_dropout_layer(int in_dim, int out_dim, dropout::dropout_mode mode = dropout::per_data)
        : Base(in_dim, out_dim)
    {
        this->filter_.set_mode(mode);
    }

    void set_dropout_rate(double rate) {
        this->filter_.set_dropout_rate(rate);
    }

    /**
     * set dropout context (training phase or test phase)
     **/
    void set_context(dropout::context ctx) {
        this->filter_.set_context(ctx);
    }

private:
    // resample the per-batch mask after each weight update
    void post_update() {
        this->filter_.end_batch();
    }
};

} // namespace tiny_cnn
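
A sketch of how a caller would combine a custom rate with the per_batch mode; the layer sizes here are illustrative, and the training code is elided just as in sample4_dropout (src/main.cpp) below.

#include "tiny_cnn.h"
using namespace tiny_cnn;

void dropout_per_batch_sketch() {
    typedef network<mse, gradient_descent> Network;
    Network nn;

    fully_connected_dropout_layer<Network, tanh_activation> hidden(28 * 28, 300, dropout::per_batch);
    fully_connected_layer<Network, tanh_activation> output(300, 10);
    hidden.set_dropout_rate(0.4);  // keep each hidden unit with probability 0.6

    nn.add(&hidden);
    nn.add(&output);

    // ... train; per_batch resamples the mask via post_update() once per weight update

    // switch to deterministic scaling before evaluation
    hidden.set_context(dropout::test_phase);
}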

include/fully_connected_layer.h

Lines changed: 16 additions & 8 deletions
@@ -27,17 +27,19 @@
 #pragma once
 #include "layer.h"
 #include "product.h"
+#include "dropout.h"

 namespace tiny_cnn {

 // normal
-template<typename N, typename Activation>
+template<typename N, typename Activation, typename Filter = filter_none>
 class fully_connected_layer : public layer<N, Activation> {
 public:
     typedef layer<N, Activation> Base;
     typedef typename Base::Optimizer Optimizer;

-    fully_connected_layer(int in_dim, int out_dim) : layer<N, Activation>(in_dim, out_dim, in_dim * out_dim, out_dim) {}
+    fully_connected_layer(int in_dim, int out_dim)
+        : layer<N, Activation>(in_dim, out_dim, in_dim * out_dim, out_dim), filter_(out_dim) {}

     int connection_size() const {
         return this->in_size_ * this->out_size_ + this->out_size_;
@@ -58,10 +60,13 @@ class fully_connected_layer : public layer<N, Activation> {
             this->output_[index][r] = this->a_.f(z);
         }

-        return this->next_ ? this->next_->forward_propagation(this->output_[index], index) : this->output_[index];
+        auto& this_out = this->filter_.filter_fprop(this->output_[index], index);
+
+        return this->next_ ? this->next_->forward_propagation(this_out, index) : this_out;
     }

     const vec_t& back_propagation(const vec_t& current_delta, int index) {
+        const vec_t& curr_delta = this->filter_.filter_bprop(current_delta, index);
         const vec_t& prev_out = this->prev_->output(index);
         const activation& prev_h = this->prev_->activation_function();
         vec_t& prev_delta = this->prev_delta_[index];
@@ -73,7 +78,7 @@ class fully_connected_layer : public layer<N, Activation> {
             //for (int r = 0; r < this->out_size_; r++)
             //    prev_delta[c] += current_delta[r] * this->W_[c*this->out_size_+r];

-            prev_delta[c] = vectorize::dot(&current_delta[0], &this->W_[c*this->out_size_], this->out_size_);
+            prev_delta[c] = vectorize::dot(&curr_delta[0], &this->W_[c*this->out_size_], this->out_size_);
             prev_delta[c] *= prev_h.df(prev_out[c]);
         }

@@ -83,11 +88,11 @@ class fully_connected_layer : public layer<N, Activation> {
                 dW[c*this->out_size_+i] += current_delta[i] * prev_out[c];*/

             for (int c = 0; c < this->in_size_; c++) {
-                vectorize::muladd(&current_delta[0], prev_out[c], r.end() - r.begin(), &dW[c*this->out_size_ + r.begin()]);
+                vectorize::muladd(&curr_delta[0], prev_out[c], r.end() - r.begin(), &dW[c*this->out_size_ + r.begin()]);
             }

             for (int i = r.begin(); i < r.end(); i++)
-                db[i] += current_delta[i];
+                db[i] += curr_delta[i];
         });

         return this->prev_->back_propagation(this->prev_delta_[index], index);
@@ -101,7 +106,7 @@ class fully_connected_layer : public layer<N, Activation> {

         for (int c = 0; c < this->in_size_; c++)
             for (int r = 0; r < this->out_size_; r++)
-                this->Whessian_[c*this->out_size_+r] += current_delta2[r] * prev_out[c] * prev_out[c];
+                this->Whessian_[c*this->out_size_ + r] += current_delta2[r] * prev_out[c] * prev_out[c];

         for (int r = 0; r < this->out_size_; r++)
             this->bhessian_[r] += current_delta2[r];
@@ -110,13 +115,16 @@ class fully_connected_layer : public layer<N, Activation> {
             this->prev_delta2_[c] = 0.0;

             for (int r = 0; r < this->out_size_; r++)
-                this->prev_delta2_[c] += current_delta2[r] * this->W_[c*this->out_size_+r] * this->W_[c*this->out_size_+r];
+                this->prev_delta2_[c] += current_delta2[r] * this->W_[c*this->out_size_ + r] * this->W_[c*this->out_size_ + r];

             this->prev_delta2_[c] *= prev_h.df(prev_out[c]) * prev_h.df(prev_out[c]);
         }

         return this->prev_->back_propagation_2nd(this->prev_delta2_);
     }
+
+protected:
+    Filter filter_;
 };

 } // namespace tiny_cnn
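
The Filter template parameter acts as a compile-time policy: the default filter_none hands back the vectors untouched, so plain fully connected layers pay nothing for the hook, while fully_connected_dropout_layer instantiates the same layer code with the dropout filter. Because filter_bprop runs before the gradient accumulation, dropped units contribute neither to prev_delta nor to dW and db.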

include/layer.h

Lines changed: 4 additions & 0 deletions
@@ -96,6 +96,9 @@ class layer_base {
     virtual const vec_t& back_propagation(const vec_t& current_delta, int worker_index) = 0;
     virtual const vec_t& back_propagation_2nd(const vec_t& current_delta2) = 0;

+    // called after updating the weights
+    virtual void post_update() {}
+
     layer_base<N>* next() { return next_; }
     layer_base<N>* prev() { return prev_; }

@@ -116,6 +119,7 @@ class layer_base {
             o->update(db_[0][i], bhessian_[i], &b_[i]);

         clear_diff(worker_size);
+        post_update();
     }

     vec_t& weight_diff(int index) { return dW_[index]; }
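
post_update() fires only after the optimizer has consumed the accumulated gradients and clear_diff() has run, which is exactly the point at which dropout's per_batch mode wants to resample its mask (see end_batch() in include/dropout.h above).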

include/tiny_cnn.h

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@
 #include "average_pooling_layer.h"
 #include "convolutional_layer.h"
 #include "fully_connected_layer.h"
+#include "fully_connected_dropout_layer.h"

 #include "activation_function.h"
 #include "loss_function.h"

include/util.h

Lines changed: 4 additions & 0 deletions
@@ -94,6 +94,10 @@ inline T uniform_rand(T min, T max) {
     return dst(gen);
 }

+inline bool bernoulli(double p) {
+    return uniform_rand(0.0, 1.0) <= p;
+}
+
 template<typename Iter>
 void uniform_rand(Iter begin, Iter end, float_t min, float_t max) {
     for (Iter it = begin; it != end; ++it)
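
Since uniform_rand(0.0, 1.0) draws from a continuous uniform distribution, the comparison returns true with probability p (the boundary value contributes nothing). A quick sanity check, illustrative only and assuming util.h and <iostream> are included:

int hits = 0;
for (int i = 0; i < 1000; i++)
    if (tiny_cnn::bernoulli(0.3)) hits++;
std::cout << hits << " / 1000" << std::endl;  // expect a count near 300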

src/main.cpp

Lines changed: 31 additions & 0 deletions
@@ -35,6 +35,7 @@
 void sample1_convnet();
 void sample2_mlp();
 void sample3_dae();
+void sample4_dropout();

 using namespace tiny_cnn;

@@ -219,3 +220,33 @@ void sample3_dae()
     nn.train(train_data_corrupted, train_data_original);
 }

+///////////////////////////////////////////////////////////////////////////////
+// dropout-learning
+
+void sample4_dropout()
+{
+    typedef network<mse, gradient_descent> Network;
+    Network nn;
+    int input_dim = 10;
+    int hidden_units = 100;
+    int output_dim = 10;
+
+    fully_connected_dropout_layer<Network, tanh_activation> f1(input_dim, hidden_units, dropout::per_data);
+    fully_connected_layer<Network, tanh_activation> f2(hidden_units, output_dim);
+    nn.add(&f1); nn.add(&f2);
+
+    std::vector<vec_t> train_data, test_data;
+    std::vector<label_t> train_label, test_label;
+
+    // load train-data and label-data here (omitted)
+
+    // learning
+    nn.train(train_data, train_label);
+
+    // change context to enable all hidden units
+    f1.set_context(dropout::test_phase);
+
+    tiny_cnn::result res = nn.test(test_data, test_label);
+
+    std::cout << res.num_success << "/" << res.num_total << std::endl;
+}

src/test.cpp

Lines changed: 24 additions & 0 deletions
@@ -440,6 +440,30 @@ TEST(multi_layer, gradient_check3) { // mixture - mse
     EXPECT_TRUE(nn.gradient_check(&a, &t, 1, 1e-4, GRAD_CHECK_RANDOM));
 }

+TEST(multi_layer4, gradient_check) { // sigmoid - cross-entropy
+    typedef cross_entropy loss_func;
+    typedef sigmoid_activation activation;
+    typedef network<loss_func, gradient_descent_levenberg_marquardt> network;
+
+    network nn;
+    fully_connected_layer<network, activation> l1(10, 14 * 14 * 3);
+    convolutional_layer<network, activation> l2(14, 14, 5, 3, 6);
+    average_pooling_layer<network, activation> l3(10, 10, 6, 2);
+    fully_connected_dropout_layer<network, activation> l4(5 * 5 * 6, 3);
+
+    nn.add(&l1);
+    nn.add(&l2);
+    nn.add(&l3);
+    nn.add(&l4);
+
+    vec_t a(10, 0.0);
+    label_t t = 2;
+
+    uniform_rand(a.begin(), a.end(), -1, 1);
+    nn.init_weight();
+    EXPECT_TRUE(nn.gradient_check(&a, &t, 1, 1e-4, GRAD_CHECK_RANDOM));
+}
+
 template <typename N>
 void serialization_test(const layer_base<N>& src, layer_base<N>& dst)
 {
