Commit bd1c1ca

retinaface multi-batch
1 parent: ff364db

4 files changed: 66 additions, 50 deletions

README.md

Lines changed: 0 additions & 4 deletions

@@ -88,12 +88,8 @@ Some tricky operations encountered in these models, already solved, but might ha
 | RetinaFace(resnet50) | Xeon E5-2620/GTX1080 | 1 | FP32 | 928x1600 | 15 |
 | ArcFace(LResNet50E-IR) | Xeon E5-2620/GTX1080 | 1 | FP32 | 112x112 | 333 |
 
-Detection net FPS test including inference and nms time, excluding image preprocess time.
-
 Help wanted, if you got speed results, please add an issue or PR.
 
-Thanks @Kmarconi for yolov3(darknet53) speed test.
-
 ## Acknowledgments & Contact
 
 Currently, This repo is funded by Alleyes-THU AI Lab([aboutus in Chinese](http://www.alleyes.com.cn/aboutus.html)). We are based in Tsinghua University, Beijing, and seeking for talented interns for CV R&D. Contact me if you are interested.

retinaface/README.md

Lines changed: 1 addition & 0 deletions

@@ -42,6 +42,7 @@ sudo ./retina_r50 -d // deserialize model file and run inference.
 - Input shape `INPUT_H`, `INPUT_W` defined in `decode.h`
 - FP16/FP32 can be selected by the macro `USE_FP16` in `retina_r50.cpp`
 - GPU id can be selected by the macro `DEVICE` in `retina_r50.cpp`
+- Batch size can be selected by the macro `BATCH_SIZE` in `retina_r50.cpp`
 
 ## More Information
 
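
A note on the new macro: the batch size is fixed when the engine is built, so after editing `BATCH_SIZE` the engine has to be serialized again before inference. A minimal sketch of the build-time knobs, assuming the usual tensorrtx flow in which `APIToModel` forwards `BATCH_SIZE` to the TensorRT builder as the engine's max batch size (all names taken from the diffs in this commit):

// Build-time configuration in retina_r50.cpp (sketch):
#define USE_FP16      // comment out to build the engine in FP32
#define DEVICE 0      // GPU id
#define BATCH_SIZE 1  // max batch size baked into retina_r50.engine

// After changing BATCH_SIZE, re-serialize before running inference:
//   sudo ./retina_r50 -s   // rebuild and serialize the engine
//   sudo ./retina_r50 -d   // deserialize and run batched inference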

retinaface/decode.cu

Lines changed: 30 additions & 20 deletions

@@ -107,26 +107,30 @@ namespace nvinfer1
 
     __device__ float Logist(float data){ return 1./(1. + expf(-data)); };
 
-    __global__ void CalDetection(const float *input, float *output, int num_elem, int step, int anchor) {
+    __global__ void CalDetection(const float *input, float *output, int num_elem, int step, int anchor, int output_elem) {
 
         int idx = threadIdx.x + blockDim.x * blockIdx.x;
         if (idx >= num_elem) return;
 
         int h = decodeplugin::INPUT_H / step;
         int w = decodeplugin::INPUT_W / step;
+        int total_grid = h * w;
+        int bn_idx = idx / total_grid;
+        idx = idx - bn_idx * total_grid;
         int y = idx / w;
         int x = idx % w;
-        const float *bbox_reg = &input[0];
-        const float *cls_reg = &input[2 * 4 * num_elem];
-        const float *lmk_reg = &input[2 * 4 * num_elem + 2 * 2 * num_elem];
+        const float* cur_input = input + bn_idx * (4 + 2 + 10) * 2 * total_grid;
+        const float *bbox_reg = &cur_input[0];
+        const float *cls_reg = &cur_input[2 * 4 * total_grid];
+        const float *lmk_reg = &cur_input[2 * 4 * total_grid + 2 * 2 * total_grid];
 
         for (int k = 0; k < 2; ++k) {
-            float conf1 = cls_reg[idx + k * num_elem * 2];
-            float conf2 = cls_reg[idx + k * num_elem * 2 + num_elem];
+            float conf1 = cls_reg[idx + k * total_grid * 2];
+            float conf2 = cls_reg[idx + k * total_grid * 2 + total_grid];
             conf2 = expf(conf2) / (expf(conf1) + expf(conf2));
             if (conf2 <= 0.02) continue;
 
-            float *res_count = output;
+            float *res_count = output + bn_idx * output_elem;
             int count = (int)atomicAdd(res_count, 1);
             char* data = (char *)res_count + sizeof(float) + count * sizeof(decodeplugin::Detection);
             decodeplugin::Detection* det = (decodeplugin::Detection*)(data);
@@ -138,10 +142,10 @@ namespace nvinfer1
             prior[3] = (float)anchor * (k + 1) / decodeplugin::INPUT_H;
 
             //Location
-            det->bbox[0] = prior[0] + bbox_reg[idx + k * num_elem * 4] * 0.1 * prior[2];
-            det->bbox[1] = prior[1] + bbox_reg[idx + k * num_elem * 4 + num_elem] * 0.1 * prior[3];
-            det->bbox[2] = prior[2] * expf(bbox_reg[idx + k * num_elem * 4 + num_elem * 2] * 0.2);
-            det->bbox[3] = prior[3] * expf(bbox_reg[idx + k * num_elem * 4 + num_elem * 3] * 0.2);
+            det->bbox[0] = prior[0] + bbox_reg[idx + k * total_grid * 4] * 0.1 * prior[2];
+            det->bbox[1] = prior[1] + bbox_reg[idx + k * total_grid * 4 + total_grid] * 0.1 * prior[3];
+            det->bbox[2] = prior[2] * expf(bbox_reg[idx + k * total_grid * 4 + total_grid * 2] * 0.2);
+            det->bbox[3] = prior[3] * expf(bbox_reg[idx + k * total_grid * 4 + total_grid * 3] * 0.2);
             det->bbox[0] -= det->bbox[2] / 2;
             det->bbox[1] -= det->bbox[3] / 2;
             det->bbox[2] += det->bbox[0];
@@ -152,39 +156,45 @@ namespace nvinfer1
             det->bbox[3] *= decodeplugin::INPUT_H;
             det->class_confidence = conf2;
             for (int i = 0; i < 10; i += 2) {
-                det->landmark[i] = prior[0] + lmk_reg[idx + k * num_elem * 10 + num_elem * i] * 0.1 * prior[2];
-                det->landmark[i+1] = prior[1] + lmk_reg[idx + k * num_elem * 10 + num_elem * (i + 1)] * 0.1 * prior[3];
+                det->landmark[i] = prior[0] + lmk_reg[idx + k * total_grid * 10 + total_grid * i] * 0.1 * prior[2];
+                det->landmark[i+1] = prior[1] + lmk_reg[idx + k * total_grid * 10 + total_grid * (i + 1)] * 0.1 * prior[3];
                 det->landmark[i] *= decodeplugin::INPUT_W;
                 det->landmark[i+1] *= decodeplugin::INPUT_H;
             }
         }
     }
 
-    void DecodePlugin::forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize)
+    void DecodePlugin::forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize)
     {
         int num_elem = 0;
         int base_step = 8;
         int base_anchor = 16;
         int thread_count;
-        cudaMemset(output, 0, sizeof(float));
+
+        int totalCount = 1;
+        totalCount += decodeplugin::INPUT_H / 8 * decodeplugin::INPUT_W / 8 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
+        totalCount += decodeplugin::INPUT_H / 16 * decodeplugin::INPUT_W / 16 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
+        totalCount += decodeplugin::INPUT_H / 32 * decodeplugin::INPUT_W / 32 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
+        for(int idx = 0 ; idx < batchSize; ++idx) {
+            cudaMemset(output + idx * totalCount, 0, sizeof(float));
+        }
+
         for (unsigned int i = 0; i < 3; ++i)
        {
-            num_elem = decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step;
+            num_elem = batchSize * decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step;
             thread_count = (num_elem < thread_count_) ? num_elem : thread_count_;
             CalDetection<<< (num_elem + thread_count - 1) / thread_count, thread_count>>>
-                (inputs[i], output, num_elem, base_step, base_anchor);
+                (inputs[i], output, num_elem, base_step, base_anchor, totalCount);
             base_step *= 2;
             base_anchor *= 4;
         }
     }
 
     int DecodePlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
     {
-        //assert(batchSize == 1);
         //GPU
         //CUDA_CHECK(cudaStreamSynchronize(stream));
-        forwardGpu((const float *const *)inputs,(float *)outputs[0],stream,batchSize);
-
+        forwardGpu((const float *const *)inputs, (float *)outputs[0], stream, batchSize);
         return 0;
     };
 
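
To make the new bookkeeping concrete: each image in the batch owns a fixed slot of `totalCount` floats in the plugin output, one counter float (bumped by `atomicAdd`) followed by room for every possible detection across the three strides, and `CalDetection` recovers the image index as `bn_idx = idx / total_grid`. Below is a minimal standalone sketch of that arithmetic, assuming `decodeplugin::Detection` is the packed 15-float struct implied by the kernel (`bbox[4]`, `class_confidence`, `landmark[10]`) and using hypothetical input dimensions:

#include <cstdio>

// Assumed packed layout from decode.h: 15 floats, no padding.
struct Detection {
    float bbox[4];
    float class_confidence;
    float landmark[10];
};

int main() {
    // Example dimensions; the real values are decodeplugin::INPUT_H/INPUT_W in decode.h.
    const int INPUT_H = 480, INPUT_W = 640;

    int totalCount = 1;  // one float up front for the atomicAdd detection counter
    for (int step : {8, 16, 32}) {                           // the three decode levels
        totalCount += (INPUT_H / step) * (INPUT_W / step)    // grid cells at this stride
                      * 2                                    // two anchors per cell
                      * sizeof(Detection) / sizeof(float);   // 15 floats per detection
    }
    // CalDetection receives this as output_elem: image b's counter and detections
    // live at output + b * totalCount, so batch slots never collide.
    printf("floats reserved per image: %d\n", totalCount);
    return 0;
}

With the example 480x640 dimensions this prints 189001; the real per-image size depends on `INPUT_H`/`INPUT_W` in `decode.h`, and it is the stride that the host-side `OUTPUT_SIZE` in `retina_r50.cpp` has to equal for the `prob[b * OUTPUT_SIZE]` indexing below to line up.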

retinaface/retina_r50.cpp

Lines changed: 35 additions & 26 deletions

@@ -23,6 +23,7 @@
 
 #define USE_FP16  // comment out this if want to use FP32
 #define DEVICE 0  // GPU id
+#define BATCH_SIZE 1
 
 // stuff we know about the network and the input/output blobs
 static const int INPUT_H = decodeplugin::INPUT_H;  // H, W must be able to be divided by 32.
@@ -482,7 +483,7 @@ int main(int argc, char** argv) {
 
     if (std::string(argv[1]) == "-s") {
         IHostMemory* modelStream{nullptr};
-        APIToModel(1, &modelStream);
+        APIToModel(BATCH_SIZE, &modelStream);
         assert(modelStream != nullptr);
 
         std::ofstream p("retina_r50.engine", std::ios::binary);
@@ -509,17 +510,23 @@ int main(int argc, char** argv) {
     }
 
     // prepare input data ---------------------------
-    static float data[3 * INPUT_H * INPUT_W];
+    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
     //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
     //    data[i] = 1.0;
 
     cv::Mat img = cv::imread("worlds-largest-selfie.jpg");
     cv::Mat pr_img = preprocess_img(img);
     //cv::imwrite("preprocessed.jpg", pr_img);
-    for (int i = 0; i < INPUT_H * INPUT_W; i++) {
-        data[i] = pr_img.at<cv::Vec3b>(i)[0] - 104.0;
-        data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] - 117.0;
-        data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[2] - 123.0;
+
+    // For multi-batch, I feed the same image multiple times.
+    // If you want to process different images in a batch, you need to adapt it.
+    for (int b = 0; b < BATCH_SIZE; b++) {
+        float *p_data = &data[b * 3 * INPUT_H * INPUT_W];
+        for (int i = 0; i < INPUT_H * INPUT_W; i++) {
+            p_data[i] = pr_img.at<cv::Vec3b>(i)[0] - 104.0;
+            p_data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] - 117.0;
+            p_data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[2] - 123.0;
+        }
     }
 
     IRuntime* runtime = createInferRuntime(gLogger);
@@ -531,28 +538,30 @@ int main(int argc, char** argv) {
     assert(context != nullptr);
 
     // Run inference
-    static float prob[OUTPUT_SIZE];
-    std::vector<decodeplugin::Detection> res;
-    for (int i = 0; i < 20; i++) {
-        res.clear();
-        auto start = std::chrono::system_clock::now();
-        doInference(*context, data, prob, 1);
-        nms(res, prob);
-        auto end = std::chrono::system_clock::now();
-        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
-    }
-    std::cout << "detected before nms -> " << prob[0] << std::endl;
-    std::cout << "after nms -> " << res.size() << std::endl;
-    for (size_t j = 0; j < res.size(); j++) {
-        if (res[j].class_confidence < 0.1) continue;
-        cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].landmark);
-        cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
-        //cv::putText(img, std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1);
-        for (int k = 0; k < 10; k += 2) {
-            cv::circle(img, cv::Point(res[j].landmark[k], res[j].landmark[k + 1]), 1, cv::Scalar(255 * (k > 2), 255 * (k > 0 && k < 8), 255 * (k < 6)), 4);
+    static float prob[BATCH_SIZE * OUTPUT_SIZE];
+    auto start = std::chrono::system_clock::now();
+    doInference(*context, data, prob, BATCH_SIZE);
+    auto end = std::chrono::system_clock::now();
+    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+
+    for (int b = 0; b < BATCH_SIZE; b++) {
+        std::vector<decodeplugin::Detection> res;
+        nms(res, &prob[b * OUTPUT_SIZE]);
+        std::cout << "number of detections -> " << prob[b * OUTPUT_SIZE] << std::endl;
+        std::cout << " -> " << prob[b * OUTPUT_SIZE + 10] << std::endl;
+        std::cout << "after nms -> " << res.size() << std::endl;
+        cv::Mat tmp = img.clone();
+        for (size_t j = 0; j < res.size(); j++) {
+            if (res[j].class_confidence < 0.1) continue;
+            cv::Rect r = get_rect_adapt_landmark(tmp, res[j].bbox, res[j].landmark);
+            cv::rectangle(tmp, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
+            //cv::putText(tmp, std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1);
+            for (int k = 0; k < 10; k += 2) {
+                cv::circle(tmp, cv::Point(res[j].landmark[k], res[j].landmark[k + 1]), 1, cv::Scalar(255 * (k > 2), 255 * (k > 0 && k < 8), 255 * (k < 6)), 4);
+            }
         }
+        cv::imwrite(std::to_string(b) + "_result.jpg", tmp);
     }
-    cv::imwrite("result.jpg", img);
 
     // Destroy the engine
     context->destroy();