retinaface ssh, decode plugin

wang-xinyu · wang-xinyu · commit 35d59c91b381 · 2020-04-08T14:42:02.000Z
diff --git a/retinaface/CMakeLists.txt b/retinaface/CMakeLists.txt
@@ -18,16 +18,15 @@ link_directories(/usr/local/cuda-9.0/targets/aarch64-linux/lib)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
 
-#cuda_add_library(leaky ${PROJECT_SOURCE_DIR}/leaky.cu)
-#cuda_add_library(yololayer ${PROJECT_SOURCE_DIR}/yololayer.cu)
+cuda_add_library(decodeplugin SHARED ${PROJECT_SOURCE_DIR}/decode.cu)
 
 find_package(OpenCV)
 include_directories(OpenCV_INCLUDE_DIRS)
 
-add_executable(retina_50 ${PROJECT_SOURCE_DIR}/retina_r50.cpp)
+add_executable(retina_50 ${PROJECT_SOURCE_DIR}/plugin_factory.cpp ${PROJECT_SOURCE_DIR}/retina_r50.cpp)
 target_link_libraries(retina_50 nvinfer nvinfer_plugin)
 target_link_libraries(retina_50 cudart)
-#target_link_libraries(retina yololayer)
+target_link_libraries(retina_50 decodeplugin)
 target_link_libraries(retina_50 ${OpenCV_LIBRARIES})
 
 add_definitions(-O2 -pthread)
diff --git a/retinaface/decode.cu b/retinaface/decode.cu
@@ -0,0 +1,125 @@
+#include "decode.h"
+#include "stdio.h"
+
+namespace nvinfer1
+{
+    // Build-time constructor. cudaThread is stored in thread_count_, which
+    // forwardGpu() uses as the upper bound on threads per block.
+    DecodePlugin::DecodePlugin(const int cudaThread)
+        : thread_count_(cudaThread)
+    {}
+    
+    // Destructor: the body is empty, nothing is released here.
+    DecodePlugin::~DecodePlugin() {}
+    
+    // Runtime constructor used when an engine is deserialized. Neither `data`
+    // nor `length` is read, so every member keeps its in-class default value.
+    DecodePlugin::DecodePlugin(const void* data, size_t length) {}
+
+    // Writes nothing into `buffer`: this plugin has no serialized state.
+    void DecodePlugin::serialize(void* buffer) {}
+    
+    // Number of bytes serialize() needs; always zero for this plugin.
+    size_t DecodePlugin::getSerializationSize()
+    {
+        const size_t serializedBytes = 0;
+        return serializedBytes;
+    }
+
+    // No set-up work is performed; always reports success (0).
+    int DecodePlugin::initialize()
+    {
+        const int kSuccess = 0;
+        return kSuccess;
+    }
+    
+    // Reports the shape of the single output tensor as (N + 1, 1, 1) floats.
+    // N is the room needed for two Detection records per cell of the
+    // stride-8, stride-16 and stride-32 grids, each record counted as
+    // sizeof(Detection) / sizeof(float) floats; the extra "+ 1" is the leading
+    // float that the decode kernel uses as its detection counter.
+    // The arguments are not consulted: sizes come from input_h_ / input_w_.
+    Dims DecodePlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
+    {
+        const int strides[3] = {8, 16, 32};
+        int floatCount = 0;
+        for (int s = 0; s < 3; ++s)
+        {
+            // Evaluated strictly left to right, exactly like the unrolled form.
+            floatCount += input_h_ / strides[s] * input_w_ / strides[s] * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
+        }
+
+        return Dims3(floatCount + 1, 1, 1);
+    }
+
+    // Logistic sigmoid, 1 / (1 + e^-x), evaluated entirely in single precision.
+    // The float literals and expf() keep the arithmetic from being promoted to
+    // double on the device.
+    __device__ float Logist(float data)
+    {
+        return 1.0f / (1.0f + expf(-data));
+    }
+
+    // Decodes one output scale of the detector into Detection records.
+    //
+    // Launch layout: 1D grid of 1D blocks, one thread per feature-map cell;
+    // threads whose index is >= num_elem return immediately. No shared memory.
+    //
+    // input    : planar data for this scale. As indexed below, it starts with
+    //            2 * 4 * num_elem box regressions (for each anchor k, four
+    //            planes of num_elem floats: dx, dy, dw, dh) followed by
+    //            2 * 2 * num_elem class scores (for each anchor k, two planes).
+    // output   : output[0] is a float detection counter, incremented atomically
+    //            for every accepted detection; it must be zero before the first
+    //            scale is decoded. Detection records are packed right after it,
+    //            in no particular order.
+    // num_elem : number of cells in this scale's feature map.
+    // input_h, input_w : network input size in pixels.
+    // step     : stride of this scale; the grid is (input_h/step, input_w/step).
+    // anchor   : side of the first anchor in pixels; the second anchor is twice
+    //            that size.
+    //
+    // Only bbox and class_confidence are written; Detection::landmark is left
+    // untouched by this kernel.
+    __global__ void CalDetection(const float *input, float *output, int num_elem, int input_h, int input_w, int step, int anchor) {
+
+        int idx = threadIdx.x + blockDim.x * blockIdx.x;
+        if (idx >= num_elem) return;
+
+        const int h = input_h / step;
+        const int w = input_w / step;
+        const int y = idx / w;
+        const int x = idx % w;
+        const float *bbox_reg = &input[0];
+        const float *cls_reg = &input[2 * 4 * num_elem];
+
+        for (int k = 0; k < 2; ++k) {
+            const float conf1 = cls_reg[idx + k * num_elem * 2];
+            const float conf2 = cls_reg[idx + k * num_elem * 2 + num_elem];
+            // Two-class softmax of conf2, written as 1 / (1 + e^(conf1 - conf2))
+            // so that large logits cannot produce inf / inf.
+            const float score = 1.0f / (1.0f + expf(conf1 - conf2));
+            if (score <= 0.002f) continue;
+
+            // Reserve the next free record; the counter is kept as a float.
+            const int slot = (int)atomicAdd(output, 1.0f);
+            char *data = (char *)output + sizeof(float) + slot * sizeof(decodeplugin::Detection);
+            decodeplugin::Detection *det = (decodeplugin::Detection *)(data);
+
+            // Derive the anchor size from k rather than mutating `anchor`, so
+            // skipping anchor 0 above cannot leave anchor 1 with the wrong size.
+            const float anchor_size = (float)(anchor << k);
+
+            // Prior box: normalised centre (x, y) and size (w, h).
+            float prior[4];
+            prior[0] = ((float)x + 0.5f) / w;
+            prior[1] = ((float)y + 0.5f) / h;
+            prior[2] = anchor_size / input_w;
+            prior[3] = anchor_size / input_h;
+
+            //Location
+            const float *reg = bbox_reg + idx + k * num_elem * 4;
+            det->bbox[0] = prior[0] + reg[0] * 0.1f * prior[2];
+            det->bbox[1] = prior[1] + reg[num_elem] * 0.1f * prior[3];
+            det->bbox[2] = prior[2] * expf(reg[num_elem * 2] * 0.2f);
+            det->bbox[3] = prior[3] * expf(reg[num_elem * 3] * 0.2f);
+            // Centre/size -> top-left and bottom-right corners.
+            det->bbox[0] -= det->bbox[2] / 2;
+            det->bbox[1] -= det->bbox[3] / 2;
+            det->bbox[2] += det->bbox[0];
+            det->bbox[3] += det->bbox[1];
+            // Normalised -> pixel coordinates of the network input.
+            det->bbox[0] *= input_w;
+            det->bbox[1] *= input_h;
+            det->bbox[2] *= input_w;
+            det->bbox[3] *= input_h;
+            det->class_confidence = score;
+        }
+    }
+   
+    // Decodes the three detector output scales (strides 8, 16 and 32, with
+    // base anchors 16, 64 and 256) into `output`, launching CalDetection once
+    // per scale on `stream`.
+    //
+    // inputs    : device pointers; inputs[0], inputs[1] and inputs[2] are read,
+    //             one per scale.
+    // output    : device buffer laid out as [float counter][Detection records].
+    //             The counter is reset to zero here so every call starts from
+    //             an empty list instead of appending to stale results.
+    // stream    : CUDA stream on which the reset and all launches are queued;
+    //             nothing here waits for completion.
+    // batchSize : currently unused; a single image's worth of data is decoded.
+    //             NOTE(review): confirm callers never pass batchSize > 1.
+    //
+    // CUDA failures are reported on stderr; the last error is only peeked at,
+    // not cleared, so a caller can still query it.
+    void DecodePlugin::forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize) 
+    {
+        // An all-zero byte pattern is 0.0f, so a byte-wise memset is sufficient.
+        cudaError_t status = cudaMemsetAsync(output, 0, sizeof(float), stream);
+        if (status != cudaSuccess)
+        {
+            fprintf(stderr, "DecodePlugin: failed to reset detection counter: %s\n", cudaGetErrorString(status));
+            return;
+        }
+
+        int base_step = 8;
+        int base_anchor = 16;
+        for (unsigned int i = 0; i < 3; ++i)
+        {
+            // Same grid the kernel derives: (input_h / step) x (input_w / step).
+            const int num_elem = (input_h_ / base_step) * (input_w_ / base_step);
+            if (num_elem > 0)
+            {
+                int thread_count = (num_elem < thread_count_) ? num_elem : thread_count_;
+                if (thread_count < 1) thread_count = 1;  // guards the division below
+                const int block_count = (num_elem + thread_count - 1) / thread_count;
+                CalDetection<<<block_count, thread_count, 0, stream>>>
+                    (inputs[i], output, num_elem, input_h_, input_w_, base_step, base_anchor);
+            }
+            base_step *= 2;
+            base_anchor *= 4;
+        }
+
+        status = cudaPeekAtLastError();
+        if (status != cudaSuccess)
+        {
+            fprintf(stderr, "DecodePlugin: CalDetection launch failed: %s\n", cudaGetErrorString(status));
+        }
+    }
+
+
+    // Plugin execution entry point: queues the GPU decode of `inputs` into
+    // outputs[0] on `stream`. `workspace` is not used.
+    //
+    // Returns 0 when CUDA reports no error after the work has been queued and
+    // 1 otherwise, so that a failed launch is not reported as success.
+    int DecodePlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
+    {
+        //assert(batchSize == 1);
+        //GPU
+        forwardGpu((const float *const *)inputs,(float *)outputs[0],stream,batchSize);
+
+        // Reads and clears the last runtime error recorded on this host thread.
+        return (cudaGetLastError() == cudaSuccess) ? 0 : 1;
+    }
+
+}
diff --git a/retinaface/decode.h b/retinaface/decode.h
@@ -0,0 +1,61 @@
+#ifndef _DECODE_CU_H
+#define _DECODE_CU_H
+
+#include "NvInfer.h"
+
+namespace decodeplugin
+{
+    // One decoded detection as written by the CalDetection kernel. The struct
+    // is 15 floats with float alignment, so records can be packed back to back
+    // inside a float buffer.
+    struct alignas(float) Detection{
+        // Box corners in pixels of the network input: x1, y1, x2, y2
+        // (top-left then bottom-right), as produced by CalDetection.
+        float bbox[4];
+        // Softmax score of the second class logit for this anchor.
+        float class_confidence;
+        // Space for 10 landmark values; the CalDetection kernel in decode.cu
+        // does not write this field.
+        float landmark[10];
+    };
+}
+
+
+namespace nvinfer1
+{
+    // IPluginExt layer that decodes three detector output tensors on the GPU
+    // into one packed output: a float detection counter followed by
+    // decodeplugin::Detection records.
+    class DecodePlugin: public IPluginExt
+    {
+    public:
+        // Build-time constructor; cudaThread caps the threads per block.
+        explicit DecodePlugin(const int cudaThread = 256);
+        // Runtime constructor used when the plugin is created from serialized data.
+        DecodePlugin(const void* data, size_t length);
+
+        ~DecodePlugin();
+
+        // The layer produces exactly one output tensor.
+        int getNbOutputs() const override { return 1; }
+
+        Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;
+
+        // Only FP32 data in NCHW layout is accepted.
+        bool supportsFormat(DataType type, PluginFormat format) const override
+        {
+            return type == DataType::kFLOAT && format == PluginFormat::kNCHW;
+        }
+
+        // Nothing is recorded from the negotiated configuration.
+        void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override
+        {
+        }
+
+        int initialize() override;
+
+        void terminate() override {}
+
+        // No scratch memory is requested.
+        size_t getWorkspaceSize(int maxBatchSize) const override
+        {
+            return 0;
+        }
+
+        int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;
+
+        size_t getSerializationSize() override;
+
+        void serialize(void* buffer) override;
+
+        // Launches the decode kernels; called by enqueue().
+        void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1);
+
+    private:
+        // Fixed network input size in pixels.
+        const int input_h_ = 384;
+        const int input_w_ = 640;
+        // Upper bound on threads per block for kernel launches.
+        int thread_count_ = 256;
+    };
+};
+
+#endif 
diff --git a/retinaface/plugin_factory.cpp b/retinaface/plugin_factory.cpp
@@ -0,0 +1,17 @@
+#include "plugin_factory.h"
+#include "NvInferPlugin.h"
+#include "decode.h"
+#include "common.h"
+
+using namespace nvinfer1;
+using nvinfer1::PluginFactory;
+
+// Recreates a plugin layer from its serialized form, chosen by a substring of
+// the layer name: "leaky" yields the PReLU plugin, "decode" yields a
+// DecodePlugin. "leaky" is tested first. Any other name yields nullptr.
+IPlugin* PluginFactory::createPlugin(const char* layerName, const void* serialData, size_t serialLength) {
+    if (strstr(layerName, "leaky") != NULL) {
+        return plugin::createPReLUPlugin(serialData, serialLength);
+    }
+    if (strstr(layerName, "decode") != NULL) {
+        return new DecodePlugin(serialData, serialLength);
+    }
+    return nullptr;
+}
diff --git a/retinaface/plugin_factory.h b/retinaface/plugin_factory.h
@@ -0,0 +1,12 @@
+#ifndef MY_PLUGIN_FACTORY_H
+#define MY_PLUGIN_FACTORY_H
+#include <NvInfer.h>
+
+namespace nvinfer1 {
+// Factory handed to the runtime so that custom layers can be rebuilt from
+// their serialized data while an engine is being deserialized.
+class PluginFactory : public IPluginFactory
+{
+public:
+    // Returns the plugin matching layerName; defined in plugin_factory.cpp.
+    IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override;
+};
+
+}
+#endif
diff --git a/retinaface/retina_r50.cpp b/retinaface/retina_r50.cpp