Skip to content

Commit a53ec18

Browse files
committed
retinaface tested on 1080/trt7
1 parent 2923026 commit a53ec18

File tree

6 files changed

+23
-12
lines changed

6 files changed

+23
-12
lines changed

README.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ There is a guide for quickly getting started, taking lenet5 as a demo. [Getting_
1414

1515
## Test Environment
1616

17-
1. Jetson TX1 / Ubuntu16.04 / cuda9.0 / cudnn7.1.5 / tensorrt4.0.2 / nvinfer4.1.3
17+
1. Jetson TX1 / Ubuntu16.04 / cuda9.0 / cudnn7.1.5 / tensorrt4.0.2 / nvinfer4.1.3 / opencv3.3
1818

19-
2. GTX1080 / Ubuntu16.04 / cuda10.0 / cudnn7.6.5 / tensorrt7.0.0 / nvinfer7.0.0
19+
2. GTX1080 / Ubuntu16.04 / cuda10.0 / cudnn7.6.5 / tensorrt7.0.0 / nvinfer7.0.0 / opencv3.3
2020

2121
Currently, TX1 and x86 GTX1080 have been tested. The trt4 API is used; some of these APIs are deprecated in trt7, but the code still compiles successfully.
2222

@@ -67,8 +67,11 @@ Some tricky operations encountered in these models, already solved, but might ha
6767
| Models | Device | BatchSize | Mode | Input Shape(HxW) | FPS |
6868
|-|-|:-:|:-:|:-:|:-:|
6969
| YOLOv3(darknet53) | Xavier | 1 | FP16 | 320x320 | 55 |
70-
| YOLOv3-spp(darknet53) | GTX1080 | 1 | FP32 | 256x416 | 94 |
70+
| YOLOv3-spp(darknet53) | Xeon E5-2620/GTX1080 | 1 | FP32 | 256x416 | 94 |
7171
| RetinaFace(resnet50) | TX2 | 1 | FP16 | 384x640 | 15 |
72+
| RetinaFace(resnet50) | Xeon E5-2620/GTX1080 | 1 | FP32 | 928x1600 | 15 |
73+
74+
The detection net FPS tests include inference and NMS time, but exclude image preprocessing time.
7275

7376
Help wanted: if you have speed results, please open an issue or PR.
7477

retinaface/CMakeLists.txt

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,15 @@ find_package(CUDA REQUIRED)
1212

1313
set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30)
1414

15-
include_directories(${PROJECT_SOURCE_DIR}/include)
16-
include_directories(/usr/local/cuda-9.0/targets/aarch64-linux/include)
17-
link_directories(/usr/local/cuda-9.0/targets/aarch64-linux/lib)
15+
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
16+
message("embed_platform on")
17+
include_directories(/usr/local/cuda/targets/aarch64-linux/include)
18+
link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
19+
else()
20+
message("embed_platform off")
21+
include_directories(/usr/local/cuda/include)
22+
link_directories(/usr/local/cuda/lib64)
23+
endif()
1824

1925
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
2026

retinaface/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
## Notice
44

5-
- Only tested on TensorRT4
5+
- Tested on TX2/TensorRT4 and GTX1080/TensorRT7
66
- The pytorch implementation is [biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface), I forked it into
77
[wang-xinyu/Pytorch_Retinaface](https://github.com/wang-xinyu/Pytorch_Retinaface) and added genwts.py
88

retinaface/decode.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ namespace nvinfer1
102102
int base_step = 8;
103103
int base_anchor = 16;
104104
int thread_count;
105+
cudaMemset(output, 0, sizeof(float));
105106
for (unsigned int i = 0; i < 3; ++i)
106107
{
107108
num_elem = decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step;

retinaface/decode.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ namespace decodeplugin
1010
float class_confidence;
1111
float landmark[10];
1212
};
13-
static const int INPUT_H = 384;
14-
static const int INPUT_W = 640;
13+
static const int INPUT_H = 928;
14+
static const int INPUT_W = 1600;
1515
}
1616

1717

retinaface/retina_r50.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#include "decode.h"
1313
#include <opencv2/opencv.hpp>
1414

15-
#define USE_FP16 // comment out this if want to use FP32
15+
//#define USE_FP16 // comment out this if want to use FP32
1616
#define DEVICE 0 // GPU id
1717

1818
// stuff we know about the network and the input/output blobs
@@ -101,6 +101,7 @@ void nms(std::vector<decodeplugin::Detection>& res, float *output, float nms_thr
101101
dets.push_back(det);
102102
}
103103
std::sort(dets.begin(), dets.end(), cmp);
104+
if (dets.size() > 5000) dets.erase(dets.begin() + 5000, dets.end());
104105
for (size_t m = 0; m < dets.size(); ++m) {
105106
auto& item = dets[m];
106107
res.push_back(item);
@@ -497,7 +498,7 @@ int main(int argc, char** argv) {
497498
}
498499

499500
// prepare input data ---------------------------
500-
float data[3 * INPUT_H * INPUT_W];
501+
static float data[3 * INPUT_H * INPUT_W];
501502
//for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
502503
// data[i] = 1.0;
503504

@@ -536,7 +537,7 @@ int main(int argc, char** argv) {
536537
if (res[j].class_confidence < 0.1) continue;
537538
cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].landmark);
538539
cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
539-
cv::putText(img, std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1);
540+
//cv::putText(img, std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1);
540541
for (int k = 0; k < 10; k += 2) {
541542
cv::circle(img, cv::Point(res[j].landmark[k], res[j].landmark[k + 1]), 1, cv::Scalar(255 * (k > 2), 255 * (k > 0 && k < 8), 255 * (k < 6)), 4);
542543
}

0 commit comments

Comments
 (0)