Skip to content

Commit 01690fc

Browse files
committed
yolov4 support batchsize
1 parent 9d39797 commit 01690fc

File tree

7 files changed

+81
-78
lines changed

7 files changed

+81
-78
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,9 @@ Some tricky operations encountered in these models, already solved, but might ha
7474
|-|-|:-:|:-:|:-:|:-:|
7575
| YOLOv3(darknet53) | Xavier | 1 | FP16 | 320x320 | 55 |
7676
| YOLOv3-spp(darknet53) | Xeon E5-2620/GTX1080 | 1 | FP32 | 256x416 | 94 |
77-
| YOLOv4(CSPDarknet53) | Xeon E5-2620/GTX1080 | 1 | FP32 | 256x416 | 67 |
77+
| YOLOv4(CSPDarknet53) | Xeon E5-2620/GTX1080 | 1 | FP32 | 256x416 | 59 |
78+
| YOLOv4(CSPDarknet53) | Xeon E5-2620/GTX1080 | 4 | FP32 | 256x416 | 74 |
79+
| YOLOv4(CSPDarknet53) | Xeon E5-2620/GTX1080 | 8 | FP32 | 256x416 | 83 |
7880
| RetinaFace(resnet50) | TX2 | 1 | FP16 | 384x640 | 15 |
7981
| RetinaFace(resnet50) | Xeon E5-2620/GTX1080 | 1 | FP32 | 928x1600 | 15 |
8082

yolov4/README.md

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,14 @@ sudo ./yolov4 -d ../../yolov3-spp/samples // deserialize plan file and run infe
4646

4747
## Config
4848

49-
- Input shape defined in yololayer.h
50-
- Number of classes defined in yololayer.h
51-
- FP16/FP32 can be selected by the macro in yolov4.cpp
52-
- GPU id can be selected by the macro in yolov4.cpp
53-
- NMS thresh in yolov4.cpp
54-
- BBox confidence thresh in yolov4.cpp
49+
- Input shape `INPUT_H`, `INPUT_W` defined in yololayer.h
50+
- Number of classes `CLASS_NUM` defined in yololayer.h
51+
- FP16/FP32 can be selected by the macro `USE_FP16` in yolov4.cpp
52+
- GPU id can be selected by the macro `DEVICE` in yolov4.cpp
53+
- NMS threshold `NMS_THRESH` defined in yolov4.cpp
54+
- BBox confidence threshold `BBOX_CONF_THRESH` defined in yolov4.cpp
55+
- Batch size `BATCH_SIZE` defined in yolov4.cpp
5556

5657
## More Information
5758

58-
See the [readme](../README.md) in home page
59+
See the [readme](../) on the home page

yolov4/mish.cu

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ namespace nvinfer1
5454
output[idx] = input[idx] * tanh(softplus(input[idx]));
5555
}
5656

57-
void MishPlugin::forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize) {
57+
void MishPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
5858
int block_size = thread_count_;
59-
int grid_size = (input_size_ + block_size - 1) / block_size;
60-
mish_kernel<<<grid_size, block_size>>>(inputs[0], output, input_size_);
59+
int grid_size = (input_size_ * batchSize + block_size - 1) / block_size;
60+
mish_kernel<<<grid_size, block_size>>>(inputs[0], output, input_size_ * batchSize);
6161
}
6262

6363

@@ -66,8 +66,8 @@ namespace nvinfer1
6666
//assert(batchSize == 1);
6767
//GPU
6868
//CUDA_CHECK(cudaStreamSynchronize(stream));
69-
forwardGpu((const float *const *)inputs,(float *)outputs[0],stream,batchSize);
69+
forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
7070
return 0;
71-
};
71+
}
7272

7373
}

yolov4/mish.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ namespace nvinfer1
3838

3939
virtual void serialize(void* buffer) override;
4040

41-
void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1);
41+
void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1);
4242

4343
private:
4444
int thread_count_ = 256;

yolov4/yololayer.cu

Lines changed: 22 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ namespace nvinfer1
1818
YoloLayerPlugin::~YoloLayerPlugin()
1919
{
2020
}
21-
21+
2222
// create the plugin at runtime from a byte stream
2323
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
2424
{
@@ -56,24 +56,15 @@ namespace nvinfer1
5656

5757
int YoloLayerPlugin::initialize()
5858
{
59-
int totalCount = 0;
60-
for(const auto& yolo : mYoloKernel)
61-
totalCount += (LOCATIONS + 1) * yolo.width*yolo.height * CHECK_COUNT;
62-
63-
totalCount = 0;//detection count
64-
for(const auto& yolo : mYoloKernel)
65-
totalCount += yolo.width*yolo.height * CHECK_COUNT;
6659
return 0;
6760
}
6861

6962
Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
7063
{
7164
//output the result to channel
72-
int totalCount = 0;
73-
for(const auto& yolo : mYoloKernel)
74-
totalCount += yolo.width*yolo.height * CHECK_COUNT * sizeof(Detection) / sizeof(float);
65+
int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);
7566

76-
return Dims3(totalCount + 1, 1, 1);
67+
return Dims3(totalsize + 1, 1, 1);
7768
}
7869

7970
// Logistic sigmoid of `data`: 1 / (1 + e^(-data)).
// Written with float literals and expf so the whole expression is evaluated in
// single precision instead of being promoted to double and truncated back.
__device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }
@@ -85,64 +76,60 @@ namespace nvinfer1
8576
if (idx >= noElements) return;
8677

8778
int total_grid = yoloWidth * yoloHeight;
79+
int bnIdx = idx / total_grid;
80+
idx = idx - total_grid*bnIdx;
8881
int info_len_i = 5 + classes;
89-
//int info_len_o = 7;
90-
int input_col = idx;
91-
//int out_row = input_col;
82+
const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);
9283

9384
for (int k = 0; k < 3; ++k) {
9485
int class_id = 0;
9586
float max_cls_prob = 0.0;
9687
for (int i = 5; i < info_len_i; ++i) {
97-
float p = Logist(input[input_col + k * info_len_i * total_grid + i * total_grid]);
88+
float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]);
9889
if (p > max_cls_prob) {
9990
max_cls_prob = p;
10091
class_id = i - 5;
10192
}
10293
}
103-
float box_prob = Logist(input[input_col + k * info_len_i * total_grid + 4 * total_grid]);
94+
float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]);
10495
if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue;
10596

106-
float *res_count = output;
97+
float *res_count = output + bnIdx*outputElem;
10798
int count = (int)atomicAdd(res_count, 1);
99+
if (count >= MAX_OUTPUT_BBOX_COUNT) return;
108100
char* data = (char * )res_count + sizeof(float) + count*sizeof(Detection);
109101
Detection* det = (Detection*)(data);
110102

111103
int row = idx / yoloWidth;
112104
int col = idx % yoloWidth;
113105

114106
//Location
115-
det->bbox[0] = (col + Logist(input[input_col + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth;
116-
det->bbox[1] = (row + Logist(input[input_col + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight;
117-
det->bbox[2] = exp(input[input_col + k * info_len_i * total_grid + 2 * total_grid]) * anchors[2*k];
118-
det->bbox[3] = exp(input[input_col + k * info_len_i * total_grid + 3 * total_grid]) * anchors[2*k + 1];
107+
det->bbox[0] = (col + Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth;
108+
det->bbox[1] = (row + Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight;
109+
det->bbox[2] = exp(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]) * anchors[2*k];
110+
det->bbox[3] = exp(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]) * anchors[2*k + 1];
119111
det->det_confidence = box_prob;
120112
det->class_id = class_id;
121113
det->class_confidence = max_cls_prob;
122114
}
123115
}
124-
125-
void YoloLayerPlugin::forwardGpu(const float *const * inputs,float * output,cudaStream_t stream,int batchSize) {
116+
117+
void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
126118
void* devAnchor;
127119
size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
128120
CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen));
129121

130-
int outputElem = 1;
131-
for (unsigned int i = 0;i< mYoloKernel.size();++i)
132-
{
133-
const auto& yolo = mYoloKernel[i];
134-
outputElem += yolo.width*yolo.height * CHECK_COUNT * sizeof(Detection) / sizeof(float);
135-
}
122+
int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);
136123

137-
for(int idx = 0 ;idx < batchSize;++idx)
124+
for(int idx = 0 ; idx < batchSize; ++idx) {
138125
CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float)));
139-
126+
}
140127
int numElem = 0;
141128
for (unsigned int i = 0;i< mYoloKernel.size();++i)
142129
{
143130
const auto& yolo = mYoloKernel[i];
144131
numElem = yolo.width*yolo.height*batchSize;
145-
if (numElem < 256)
132+
if (numElem < mThreadCount)
146133
mThreadCount = numElem;
147134
CUDA_CHECK(cudaMemcpy(devAnchor, yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
148135
CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>>
@@ -158,9 +145,9 @@ namespace nvinfer1
158145
//assert(batchSize == 1);
159146
//GPU
160147
//CUDA_CHECK(cudaStreamSynchronize(stream));
161-
forwardGpu((const float *const *)inputs,(float *)outputs[0],stream,batchSize);
148+
forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
162149

163150
return 0;
164-
};
151+
}
165152

166153
}

yolov4/yololayer.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ namespace Yolo
1414
{
1515
static constexpr int CHECK_COUNT = 3;
1616
static constexpr float IGNORE_THRESH = 0.1f;
17+
static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
1718
static constexpr int CLASS_NUM = 80;
1819
static constexpr int INPUT_H = 608;
1920
static constexpr int INPUT_W = 608;

yolov4/yolov4.cpp

Lines changed: 41 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,15 @@
1818
#define DEVICE 0 // GPU id
1919
#define NMS_THRESH 0.4
2020
#define BBOX_CONF_THRESH 0.5
21+
#define BATCH_SIZE 1
2122

2223
using namespace nvinfer1;
2324

2425
// stuff we know about the network and the input/output blobs
2526
static const int INPUT_H = Yolo::INPUT_H;
2627
static const int INPUT_W = Yolo::INPUT_W;
27-
static const int OUTPUT_SIZE = 1000 * 7 + 1; // we assume the yololayer outputs no more than 1000 boxes that conf >= 0.1
28+
static const int DETECTION_SIZE = sizeof(Yolo::Detection) / sizeof(float);
29+
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * DETECTION_SIZE + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
2830
const char* INPUT_BLOB_NAME = "data";
2931
const char* OUTPUT_BLOB_NAME = "prob";
3032
static Logger gLogger;
@@ -98,10 +100,10 @@ bool cmp(Yolo::Detection& a, Yolo::Detection& b) {
98100

99101
void nms(std::vector<Yolo::Detection>& res, float *output, float nms_thresh = NMS_THRESH) {
100102
std::map<float, std::vector<Yolo::Detection>> m;
101-
for (int i = 0; i < output[0] && i < 1000; i++) {
102-
if (output[1 + 7 * i + 4] <= BBOX_CONF_THRESH) continue;
103+
for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
104+
if (output[1 + DETECTION_SIZE * i + 4] <= BBOX_CONF_THRESH) continue;
103105
Yolo::Detection det;
104-
memcpy(&det, &output[1 + 7 * i], 7 * sizeof(float));
106+
memcpy(&det, &output[1 + DETECTION_SIZE * i], DETECTION_SIZE * sizeof(float));
105107
if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Yolo::Detection>());
106108
m[det.class_id].push_back(det);
107109
}
@@ -582,7 +584,7 @@ int main(int argc, char** argv) {
582584

583585
if (argc == 2 && std::string(argv[1]) == "-s") {
584586
IHostMemory* modelStream{nullptr};
585-
APIToModel(1, &modelStream);
587+
APIToModel(BATCH_SIZE, &modelStream);
586588
assert(modelStream != nullptr);
587589
std::ofstream p("yolov4.engine");
588590
if (!p) {
@@ -617,10 +619,10 @@ int main(int argc, char** argv) {
617619
}
618620

619621
// prepare input data ---------------------------
620-
float data[3 * INPUT_H * INPUT_W];
622+
static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
621623
//for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
622624
// data[i] = 1.0;
623-
static float prob[OUTPUT_SIZE];
625+
static float prob[BATCH_SIZE * OUTPUT_SIZE];
624626
PluginFactory pf;
625627
IRuntime* runtime = createInferRuntime(gLogger);
626628
assert(runtime != nullptr);
@@ -630,37 +632,47 @@ int main(int argc, char** argv) {
630632
assert(context != nullptr);
631633

632634
int fcount = 0;
633-
for (auto f: file_names) {
635+
for (int f = 0; f < file_names.size(); f++) {
634636
fcount++;
635-
std::cout << fcount << " " << f << std::endl;
636-
cv::Mat img = cv::imread(std::string(argv[2]) + "/" + f);
637-
if (img.empty()) continue;
638-
cv::Mat pr_img = preprocess_img(img);
639-
for (int i = 0; i < INPUT_H * INPUT_W; i++) {
640-
data[i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0;
641-
data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0;
642-
data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0;
637+
if (fcount < BATCH_SIZE && f + 1 != file_names.size()) continue;
638+
for (int b = 0; b < fcount; b++) {
639+
cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - BATCH_SIZE + 1 + b]);
640+
if (img.empty()) continue;
641+
cv::Mat pr_img = preprocess_img(img);
642+
for (int i = 0; i < INPUT_H * INPUT_W; i++) {
643+
data[b * 3 * INPUT_H * INPUT_W + i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0;
644+
data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0;
645+
data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0;
646+
}
643647
}
644648

645649
// Run inference
646650
auto start = std::chrono::system_clock::now();
647-
doInference(*context, data, prob, 1);
648-
std::vector<Yolo::Detection> res;
649-
nms(res, prob);
651+
doInference(*context, data, prob, BATCH_SIZE);
652+
std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
653+
for (int b = 0; b < fcount; b++) {
654+
auto& res = batch_res[b];
655+
nms(res, &prob[b * OUTPUT_SIZE]);
656+
}
650657
auto end = std::chrono::system_clock::now();
651658
std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
652-
std::cout << res.size() << std::endl;
653-
for (size_t j = 0; j < res.size(); j++) {
654-
float *p = (float*)&res[j];
655-
for (size_t k = 0; k < 7; k++) {
656-
std::cout << p[k] << ", ";
659+
for (int b = 0; b < fcount; b++) {
660+
auto& res = batch_res[b];
661+
//std::cout << res.size() << std::endl;
662+
cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - BATCH_SIZE + 1 + b]);
663+
for (size_t j = 0; j < res.size(); j++) {
664+
float *p = (float*)&res[j];
665+
for (size_t k = 0; k < 7; k++) {
666+
// std::cout << p[k] << ", ";
667+
}
668+
//std::cout << std::endl;
669+
cv::Rect r = get_rect(img, res[j].bbox);
670+
cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
671+
cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
657672
}
658-
std::cout << std::endl;
659-
cv::Rect r = get_rect(img, res[j].bbox);
660-
cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
661-
cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
673+
cv::imwrite("_" + file_names[f - BATCH_SIZE + 1 + b], img);
662674
}
663-
cv::imwrite("_" + f, img);
675+
fcount = 0;
664676
}
665677

666678
// Destroy the engine

0 commit comments

Comments
 (0)