Commit bd1c1ca

retinaface multi-batch
1 parent: ff364db

4 files changed: 66 additions, 50 deletions

README.md

Lines changed: 0 additions & 4 deletions

@@ -88,12 +88,8 @@ Some tricky operations encountered in these models, already solved, but might ha
 | RetinaFace(resnet50) | Xeon E5-2620/GTX1080 | 1 | FP32 | 928x1600 | 15 |
 | ArcFace(LResNet50E-IR) | Xeon E5-2620/GTX1080 | 1 | FP32 | 112x112 | 333 |
 
-Detection net FPS test including inference and nms time, excluding image preprocess time.
-
 Help wanted, if you got speed results, please add an issue or PR.
 
-Thanks @Kmarconi for yolov3(darknet53) speed test.
-
 ## Acknowledgments & Contact
 
 Currently, This repo is funded by Alleyes-THU AI Lab([aboutus in Chinese](http://www.alleyes.com.cn/aboutus.html)). We are based in Tsinghua University, Beijing, and seeking for talented interns for CV R&D. Contact me if you are interested.

retinaface/README.md

Lines changed: 1 addition & 0 deletions

@@ -42,6 +42,7 @@ sudo ./retina_r50 -d // deserialize model file and run inference.
 - Input shape `INPUT_H`, `INPUT_W` defined in `decode.h`
 - FP16/FP32 can be selected by the macro `USE_FP16` in `retina_r50.cpp`
 - GPU id can be selected by the macro `DEVICE` in `retina_r50.cpp`
+- Batch size can be selected by the macro `BATCH_SIZE` in `retina_r50.cpp`
 
 ## More Information
 
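
A note on the new macro: the batch size is fixed when the engine is built, so after editing `BATCH_SIZE` the engine has to be serialized again before inference. A minimal sketch of the build-time knobs, assuming the usual tensorrtx flow in which `APIToModel` forwards `BATCH_SIZE` to the TensorRT builder as the engine's max batch size (all names taken from the diffs in this commit):

// Build-time configuration in retina_r50.cpp (sketch):
#define USE_FP16      // comment out to build the engine in FP32
#define DEVICE 0      // GPU id
#define BATCH_SIZE 1  // max batch size baked into retina_r50.engine

// After changing BATCH_SIZE, re-serialize before running inference:
//   sudo ./retina_r50 -s   // rebuild and serialize the engine
//   sudo ./retina_r50 -d   // deserialize and run batched inference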

retinaface/decode.cu

Lines changed: 30 additions & 20 deletions

@@ -107,26 +107,30 @@ namespace nvinfer1
 
     __device__ float Logist(float data){ return 1./(1. + expf(-data)); };
 
-    __global__ void CalDetection(const float *input, float *output, int num_elem, int step, int anchor) {
+    __global__ void CalDetection(const float *input, float *output, int num_elem, int step, int anchor, int output_elem) {
 
         int idx = threadIdx.x + blockDim.x * blockIdx.x;
         if (idx >= num_elem) return;
 
         int h = decodeplugin::INPUT_H / step;
         int w = decodeplugin::INPUT_W / step;
+        int total_grid = h * w;
+        int bn_idx = idx / total_grid;
+        idx = idx - bn_idx * total_grid;
         int y = idx / w;
         int x = idx % w;
-        const float *bbox_reg = &input[0];
-        const float *cls_reg = &input[2 * 4 * num_elem];
-        const float *lmk_reg = &input[2 * 4 * num_elem + 2 * 2 * num_elem];
+        const float* cur_input = input + bn_idx * (4 + 2 + 10) * 2 * total_grid;
+        const float *bbox_reg = &cur_input[0];
+        const float *cls_reg = &cur_input[2 * 4 * total_grid];
+        const float *lmk_reg = &cur_input[2 * 4 * total_grid + 2 * 2 * total_grid];
 
         for (int k = 0; k < 2; ++k) {
-            float conf1 = cls_reg[idx + k * num_elem * 2];
-            float conf2 = cls_reg[idx + k * num_elem * 2 + num_elem];
+            float conf1 = cls_reg[idx + k * total_grid * 2];
+            float conf2 = cls_reg[idx + k * total_grid * 2 + total_grid];
             conf2 = expf(conf2) / (expf(conf1) + expf(conf2));
             if (conf2 <= 0.02) continue;
 
-            float *res_count = output;
+            float *res_count = output + bn_idx * output_elem;
             int count = (int)atomicAdd(res_count, 1);
             char* data = (char *)res_count + sizeof(float) + count * sizeof(decodeplugin::Detection);
             decodeplugin::Detection* det = (decodeplugin::Detection*)(data);
@@ -138,10 +142,10 @@ namespace nvinfer1
             prior[3] = (float)anchor * (k + 1) / decodeplugin::INPUT_H;
 
             //Location
-            det->bbox[0] = prior[0] + bbox_reg[idx + k * num_elem * 4] * 0.1 * prior[2];
-            det->bbox[1] = prior[1] + bbox_reg[idx + k * num_elem * 4 + num_elem] * 0.1 * prior[3];
-            det->bbox[2] = prior[2] * expf(bbox_reg[idx + k * num_elem * 4 + num_elem * 2] * 0.2);
-            det->bbox[3] = prior[3] * expf(bbox_reg[idx + k * num_elem * 4 + num_elem * 3] * 0.2);
+            det->bbox[0] = prior[0] + bbox_reg[idx + k * total_grid * 4] * 0.1 * prior[2];
+            det->bbox[1] = prior[1] + bbox_reg[idx + k * total_grid * 4 + total_grid] * 0.1 * prior[3];
+            det->bbox[2] = prior[2] * expf(bbox_reg[idx + k * total_grid * 4 + total_grid * 2] * 0.2);
+            det->bbox[3] = prior[3] * expf(bbox_reg[idx + k * total_grid * 4 + total_grid * 3] * 0.2);
             det->bbox[0] -= det->bbox[2] / 2;
             det->bbox[1] -= det->bbox[3] / 2;
             det->bbox[2] += det->bbox[0];
@@ -152,39 +156,45 @@ namespace nvinfer1
             det->bbox[3] *= decodeplugin::INPUT_H;
             det->class_confidence = conf2;
             for (int i = 0; i < 10; i += 2) {
-                det->landmark[i] = prior[0] + lmk_reg[idx + k * num_elem * 10 + num_elem * i] * 0.1 * prior[2];
-                det->landmark[i+1] = prior[1] + lmk_reg[idx + k * num_elem * 10 + num_elem * (i + 1)] * 0.1 * prior[3];
+                det->landmark[i] = prior[0] + lmk_reg[idx + k * total_grid * 10 + total_grid * i] * 0.1 * prior[2];
+                det->landmark[i+1] = prior[1] + lmk_reg[idx + k * total_grid * 10 + total_grid * (i + 1)] * 0.1 * prior[3];
                 det->landmark[i] *= decodeplugin::INPUT_W;
                 det->landmark[i+1] *= decodeplugin::INPUT_H;
             }
         }
     }
 
-    void DecodePlugin::forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize)
+    void DecodePlugin::forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize)
     {
         int num_elem = 0;
         int base_step = 8;
         int base_anchor = 16;
         int thread_count;
-        cudaMemset(output, 0, sizeof(float));
+
+        int totalCount = 1;
+        totalCount += decodeplugin::INPUT_H / 8 * decodeplugin::INPUT_W / 8 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
+        totalCount += decodeplugin::INPUT_H / 16 * decodeplugin::INPUT_W / 16 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
+        totalCount += decodeplugin::INPUT_H / 32 * decodeplugin::INPUT_W / 32 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
+        for(int idx = 0 ; idx < batchSize; ++idx) {
+            cudaMemset(output + idx * totalCount, 0, sizeof(float));
+        }
+
         for (unsigned int i = 0; i < 3; ++i)
        {
-            num_elem = decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step;
+            num_elem = batchSize * decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step;
             thread_count = (num_elem < thread_count_) ? num_elem : thread_count_;
             CalDetection<<< (num_elem + thread_count - 1) / thread_count, thread_count>>>
-                (inputs[i], output, num_elem, base_step, base_anchor);
+                (inputs[i], output, num_elem, base_step, base_anchor, totalCount);
             base_step *= 2;
             base_anchor *= 4;
         }
     }
 
     int DecodePlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
     {
-        //assert(batchSize == 1);
         //GPU
         //CUDA_CHECK(cudaStreamSynchronize(stream));
-        forwardGpu((const float *const *)inputs,(float *)outputs[0],stream,batchSize);
-
+        forwardGpu((const float *const *)inputs, (float *)outputs[0], stream, batchSize);
         return 0;
     };
 
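
To make the new bookkeeping concrete: each image in the batch owns a fixed slot of `totalCount` floats in the plugin output, one counter float (bumped by `atomicAdd`) followed by room for every possible detection across the three strides, and `CalDetection` recovers the image index as `bn_idx = idx / total_grid`. Below is a minimal standalone sketch of that arithmetic, assuming `decodeplugin::Detection` is the packed 15-float struct implied by the kernel (`bbox[4]`, `class_confidence`, `landmark[10]`) and using hypothetical input dimensions:

#include <cstdio>

// Assumed packed layout from decode.h: 15 floats, no padding.
struct Detection {
    float bbox[4];
    float class_confidence;
    float landmark[10];
};

int main() {
    // Example dimensions; the real values are decodeplugin::INPUT_H/INPUT_W in decode.h.
    const int INPUT_H = 480, INPUT_W = 640;

    int totalCount = 1;  // one float up front for the atomicAdd detection counter
    for (int step : {8, 16, 32}) {                           // the three decode levels
        totalCount += (INPUT_H / step) * (INPUT_W / step)    // grid cells at this stride
                      * 2                                    // two anchors per cell
                      * sizeof(Detection) / sizeof(float);   // 15 floats per detection
    }
    // CalDetection receives this as output_elem: image b's counter and detections
    // live at output + b * totalCount, so batch slots never collide.
    printf("floats reserved per image: %d\n", totalCount);
    return 0;
}

With the example 480x640 dimensions this prints 189001; the real per-image size depends on `INPUT_H`/`INPUT_W` in `decode.h`, and it is the stride that the host-side `OUTPUT_SIZE` in `retina_r50.cpp` has to equal for the `prob[b * OUTPUT_SIZE]` indexing below to line up.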

retinaface/retina_r50.cpp

Lines changed: 35 additions & 26 deletions

@@ -23,6 +23,7 @@
 
 #define USE_FP16  // comment out this if want to use FP32
 #define DEVICE 0  // GPU id
+#define BATCH_SIZE 1
 
 // stuff we know about the network and the input/output blobs
 static const int INPUT_H = decodeplugin::INPUT_H;  // H, W must be able to be divided by 32.
@@ -482,7 +483,7 @@ int main(int argc, char** argv) {
 
     if (std::string(argv[1]) == "-s") {
         IHostMemory* modelStream{nullptr};
-        APIToModel(1, &modelStream);
+        APIToModel(BATCH_SIZE, &modelStream);
         assert(modelStream != nullptr);
 
         std::ofstream p("retina_r50.engine", std::ios::binary);
@@ -509,17 +510,23 @@ int main(int argc, char** argv) {
     }
 
     // prepare input data ---------------------------
-    static float data[3 * INPUT_H * INPUT_W];
+    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
     //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
     //    data[i] = 1.0;
 
     cv::Mat img = cv::imread("worlds-largest-selfie.jpg");
     cv::Mat pr_img = preprocess_img(img);
     //cv::imwrite("preprocessed.jpg", pr_img);
-    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
-        data[i] = pr_img.at<cv::Vec3b>(i)[0] - 104.0;
-        data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] - 117.0;
-        data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[2] - 123.0;
+
+    // For multi-batch, I feed the same image multiple times.
+    // If you want to process different images in a batch, you need to adapt it.
+    for (int b = 0; b < BATCH_SIZE; b++) {
+        float *p_data = &data[b * 3 * INPUT_H * INPUT_W];
+        for (int i = 0; i < INPUT_H * INPUT_W; i++) {
+            p_data[i] = pr_img.at<cv::Vec3b>(i)[0] - 104.0;
+            p_data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] - 117.0;
+            p_data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[2] - 123.0;
+        }
     }
 
     IRuntime* runtime = createInferRuntime(gLogger);
@@ -531,28 +538,30 @@ int main(int argc, char** argv) {
     assert(context != nullptr);
 
     // Run inference
-    static float prob[OUTPUT_SIZE];
-    std::vector<decodeplugin::Detection> res;
-    for (int i = 0; i < 20; i++) {
-        res.clear();
-        auto start = std::chrono::system_clock::now();
-        doInference(*context, data, prob, 1);
-        nms(res, prob);
-        auto end = std::chrono::system_clock::now();
-        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
-    }
-    std::cout << "detected before nms -> " << prob[0] << std::endl;
-    std::cout << "after nms -> " << res.size() << std::endl;
-    for (size_t j = 0; j < res.size(); j++) {
-        if (res[j].class_confidence < 0.1) continue;
-        cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].landmark);
-        cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
-        //cv::putText(img, std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1);
-        for (int k = 0; k < 10; k += 2) {
-            cv::circle(img, cv::Point(res[j].landmark[k], res[j].landmark[k + 1]), 1, cv::Scalar(255 * (k > 2), 255 * (k > 0 && k < 8), 255 * (k < 6)), 4);
+    static float prob[BATCH_SIZE * OUTPUT_SIZE];
+    auto start = std::chrono::system_clock::now();
+    doInference(*context, data, prob, BATCH_SIZE);
+    auto end = std::chrono::system_clock::now();
+    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+
+    for (int b = 0; b < BATCH_SIZE; b++) {
+        std::vector<decodeplugin::Detection> res;
+        nms(res, &prob[b * OUTPUT_SIZE]);
+        std::cout << "number of detections -> " << prob[b * OUTPUT_SIZE] << std::endl;
+        std::cout << " -> " << prob[b * OUTPUT_SIZE + 10] << std::endl;
+        std::cout << "after nms -> " << res.size() << std::endl;
+        cv::Mat tmp = img.clone();
+        for (size_t j = 0; j < res.size(); j++) {
+            if (res[j].class_confidence < 0.1) continue;
+            cv::Rect r = get_rect_adapt_landmark(tmp, res[j].bbox, res[j].landmark);
+            cv::rectangle(tmp, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
+            //cv::putText(tmp, std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1);
+            for (int k = 0; k < 10; k += 2) {
+                cv::circle(tmp, cv::Point(res[j].landmark[k], res[j].landmark[k + 1]), 1, cv::Scalar(255 * (k > 2), 255 * (k > 0 && k < 8), 255 * (k < 6)), 4);
+            }
         }
+        cv::imwrite(std::to_string(b) + "_result.jpg", tmp);
     }
-    cv::imwrite("result.jpg", img);
 
     // Destroy the engine
     context->destroy();