optimize mish in yolov4

wang-xinyu · wang-xinyu · commit 9001986d089d · 2020-05-05T23:01:09.000+08:00
diff --git a/yolov4/mish.cu b/yolov4/mish.cu
@@ -44,14 +44,29 @@ namespace nvinfer1
         return DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
     }
 
-    __device__ float softplus(float x) { return (x > 20.0) ? x : log(1.0 + exp(x)); }
+    __device__ float tanh_activate_kernel(float x){ const float e = expf(-2.0f*x); return 2.0f/(1.0f + e) - 1.0f; }  // tanh(x) written as 2*sigmoid(2x) - 1
+
+    __device__ float softplus_kernel(float x, float threshold = 20) {
+        // softplus(x) = log(1 + e^x). With the default threshold, x > threshold already equals the result in float precision.
+        if (x > threshold) return x;
+        return log1pf(expf(x));  // log1pf stays accurate when e^x is below float epsilon (x < ~-16.6); logf(expf(x) + 1) would round to 0 there
+    }
 
     __global__ void mish_kernel(const float *input, float *output, int num_elem) {
 
         int idx = threadIdx.x + blockDim.x * blockIdx.x;
         if (idx >= num_elem) return;
 
-        output[idx] = input[idx] * tanh(softplus(input[idx]));
+        //float t = exp(input[idx]);                  // disabled closed-form variant, kept for reference; t = e^x
+        //if (input[idx] > 20.0) {                    // for large x, softplus(x) ~= x, so tanh(softplus(x)) ~= tanh(x)
+        //    t *= t;                                 // t = e^(2x); NOTE(review): exceeds float range for x > ~44 (inf/inf -> NaN below)
+        //    output[idx] = (t - 1.0) / (t + 1.0);    // (e^(2x) - 1) / (e^(2x) + 1) = tanh(x)
+        //} else {
+        //    float tt = t * t;                       // tt = e^(2x)
+        //    output[idx] = (tt + 2.0 * t) / (tt + 2.0 * t + 2.0);  // ((1+t)^2 - 1) / ((1+t)^2 + 1) = tanh(log(1 + t))
+        //}
+        //output[idx] *= input[idx];                  // multiply by x to finish x * tanh(softplus(x))
+        output[idx] = input[idx] * tanh_activate_kernel(softplus_kernel(input[idx]));  // mish(x) = x * tanh(softplus(x)), one element per thread
     }
 
     void MishPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
@@ -69,5 +84,5 @@ namespace nvinfer1
         forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
         return 0;
     }
-
 }
+
diff --git a/yolov4/yolov4.cpp b/yolov4/yolov4.cpp
@@ -14,7 +14,7 @@
 #include <opencv2/opencv.hpp>
 #include <dirent.h>
 
-//#define USE_FP16  // comment out this if want to use FP32
+#define USE_FP16  // comment out this if want to use FP32
 #define DEVICE 0  // GPU id
 #define NMS_THRESH 0.4
 #define BBOX_CONF_THRESH 0.5
@@ -649,13 +649,13 @@ int main(int argc, char** argv) {
         // Run inference
         auto start = std::chrono::system_clock::now();
         doInference(*context, data, prob, BATCH_SIZE);
+        auto end = std::chrono::system_clock::now();
+        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
         std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
         for (int b = 0; b < fcount; b++) {
             auto& res = batch_res[b];
             nms(res, &prob[b * OUTPUT_SIZE]);
         }
-        auto end = std::chrono::system_clock::now();
-        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
         for (int b = 0; b < fcount; b++) {
             auto& res = batch_res[b];
             //std::cout << res.size() << std::endl;