retinaface tested on 1080/trt7

wang-xinyu · wang-xinyu · commit a53ec18e1618 · 2020-04-09T21:36:52.000+08:00
diff --git a/README.md b/README.md
@@ -14,9 +14,9 @@ There is a guide for quickly getting started, taking lenet5 as a demo. [Getting_
 
 ## Test Environment
 
-1. Jetson TX1 / Ubuntu16.04 / cuda9.0 / cudnn7.1.5 / tensorrt4.0.2 / nvinfer4.1.3
+1. Jetson TX1 / Ubuntu16.04 / cuda9.0 / cudnn7.1.5 / tensorrt4.0.2 / nvinfer4.1.3 / opencv3.3
 
-2. GTX1080 / Ubuntu16.04 / cuda10.0 / cudnn7.6.5 / tensorrt7.0.0 / nvinfer7.0.0
+2. GTX1080 / Ubuntu16.04 / cuda10.0 / cudnn7.6.5 / tensorrt7.0.0 / nvinfer7.0.0 / opencv3.3
 
 Currently, TX1 and x86 GTX1080 have been tested. The trt4 API is used; some of its APIs are deprecated in trt7, but the code still compiles successfully.
 
@@ -67,8 +67,11 @@ Some tricky operations encountered in these models, already solved, but might ha
 | Models | Device | BatchSize | Mode | Input Shape(HxW) | FPS |
 |-|-|:-:|:-:|:-:|:-:|
 | YOLOv3(darknet53) | Xavier | 1 | FP16 | 320x320 | 55 |
-| YOLOv3-spp(darknet53) | GTX1080 | 1 | FP32 | 256x416 | 94 |
+| YOLOv3-spp(darknet53) | Xeon E5-2620/GTX1080 | 1 | FP32 | 256x416 | 94 |
 | RetinaFace(resnet50) | TX2 | 1 | FP16 | 384x640 | 15 |
+| RetinaFace(resnet50) | Xeon E5-2620/GTX1080 | 1 | FP32 | 928x1600 | 15 |
+
+Detection net FPS tests include inference and NMS time, but exclude image preprocessing time.
 
 Help wanted, if you got speed results, please add an issue or PR.
 
diff --git a/retinaface/CMakeLists.txt b/retinaface/CMakeLists.txt
@@ -12,9 +12,15 @@ find_package(CUDA REQUIRED)
 
 set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30)
 
-include_directories(${PROJECT_SOURCE_DIR}/include)
-include_directories(/usr/local/cuda-9.0/targets/aarch64-linux/include)
-link_directories(/usr/local/cuda-9.0/targets/aarch64-linux/lib)
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+    message("embed_platform on")
+    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
+    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
+else()
+    message("embed_platform off")
+    include_directories(/usr/local/cuda/include)
+    link_directories(/usr/local/cuda/lib64)
+endif()
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
 
diff --git a/retinaface/README.md b/retinaface/README.md
@@ -2,7 +2,7 @@
 
 ## Notice
 
-- Only tested on TensorRT4
+- Tested on TX2/TensorRT4 and GTX1080/TensorRT7
 - The pytorch implementation is [biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface); I forked it into 
 [wang-xinyu/Pytorch_Retinaface](https://github.com/wang-xinyu/Pytorch_Retinaface) and added genwts.py
 
diff --git a/retinaface/decode.cu b/retinaface/decode.cu
@@ -102,6 +102,7 @@ namespace nvinfer1
         int base_step = 8;
         int base_anchor = 16;
         int thread_count;
+        cudaMemset(output, 0, sizeof(float));
         for (unsigned int i = 0; i < 3; ++i)
         {
             num_elem = decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step;
diff --git a/retinaface/decode.h b/retinaface/decode.h
@@ -10,8 +10,8 @@ namespace decodeplugin
         float class_confidence;
         float landmark[10];
     };
-    static const int INPUT_H = 384;
-    static const int INPUT_W = 640;
+    static const int INPUT_H = 928;
+    static const int INPUT_W = 1600;
 }
 
 
diff --git a/retinaface/retina_r50.cpp b/retinaface/retina_r50.cpp
@@ -12,7 +12,7 @@
 #include "decode.h"
 #include <opencv2/opencv.hpp>
 
-#define USE_FP16  // comment out this if want to use FP32
+//#define USE_FP16  // comment out this if want to use FP32
 #define DEVICE 0  // GPU id
 
 // stuff we know about the network and the input/output blobs
@@ -101,6 +101,7 @@ void nms(std::vector<decodeplugin::Detection>& res, float *output, float nms_thr
         dets.push_back(det);
     }
     std::sort(dets.begin(), dets.end(), cmp);
+    if (dets.size() > 5000) dets.erase(dets.begin() + 5000, dets.end());
     for (size_t m = 0; m < dets.size(); ++m) {
         auto& item = dets[m];
         res.push_back(item);
@@ -497,7 +498,7 @@ int main(int argc, char** argv) {
     }
 
     // prepare input data ---------------------------
-    float data[3 * INPUT_H * INPUT_W];
+    static float data[3 * INPUT_H * INPUT_W];
     //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
     //    data[i] = 1.0;
 
@@ -536,7 +537,7 @@ int main(int argc, char** argv) {
         if (res[j].class_confidence < 0.1) continue;
         cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].landmark);
         cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
-        cv::putText(img, std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1);
+        //cv::putText(img, std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1);
         for (int k = 0; k < 10; k += 2) {
             cv::circle(img, cv::Point(res[j].landmark[k], res[j].landmark[k + 1]), 1, cv::Scalar(255 * (k > 2), 255 * (k > 0 && k < 8), 255 * (k < 6)), 4);
         }

Original file line number	Diff line number	Diff line change
`@@ -102,6 +102,7 @@ namespace nvinfer1`
`102`	`102`	`int base_step = 8;`
`103`	`103`	`int base_anchor = 16;`
`104`	`104`	`int thread_count;`
	`105`	`+ cudaMemset(output, 0, sizeof(float));`
`105`	`106`	`for (unsigned int i = 0; i < 3; ++i)`
`106`	`107`	`{`
`107`	`108`	`num_elem = decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step;`
Original file line number	Diff line number	Diff line change
`@@ -10,8 +10,8 @@ namespace decodeplugin`
`10`	`10`	`float class_confidence;`
`11`	`11`	`float landmark[10];`
`12`	`12`	`};`
`13`		`- static const int INPUT_H = 384;`
`14`		`- static const int INPUT_W = 640;`
	`13`	`+ static const int INPUT_H = 928;`
	`14`	`+ static const int INPUT_W = 1600;`
`15`	`15`	`}`
`16`	`16`
`17`	`17`