yolov4: add batch size support

wang-xinyu · wang-xinyu · commit 01690fc2a25d · 2020-05-03T14:56:43.000+08:00
diff --git a/README.md b/README.md
@@ -74,7 +74,9 @@ Some tricky operations encountered in these models, already solved, but might ha
 |-|-|:-:|:-:|:-:|:-:|
 | YOLOv3(darknet53) | Xavier | 1 | FP16 | 320x320 | 55 |
 | YOLOv3-spp(darknet53) | Xeon E5-2620/GTX1080 | 1 | FP32 | 256x416 | 94 |
-| YOLOv4(CSPDarknet53) | Xeon E5-2620/GTX1080 | 1 | FP32 | 256x416 | 67 |
+| YOLOv4(CSPDarknet53) | Xeon E5-2620/GTX1080 | 1 | FP32 | 256x416 | 59 |
+| YOLOv4(CSPDarknet53) | Xeon E5-2620/GTX1080 | 4 | FP32 | 256x416 | 74 |
+| YOLOv4(CSPDarknet53) | Xeon E5-2620/GTX1080 | 8 | FP32 | 256x416 | 83 |
 | RetinaFace(resnet50) | TX2 | 1 | FP16 | 384x640 | 15 |
 | RetinaFace(resnet50) | Xeon E5-2620/GTX1080 | 1 | FP32 | 928x1600 | 15 |
 
diff --git a/yolov4/README.md b/yolov4/README.md
@@ -46,13 +46,14 @@ sudo ./yolov4 -d  ../../yolov3-spp/samples // deserialize plan file and run infe
 
 ## Config
 
-- Input shape defined in yololayer.h
-- Number of classes defined in yololayer.h
-- FP16/FP32 can be selected by the macro in yolov4.cpp
-- GPU id can be selected by the macro in yolov4.cpp
-- NMS thresh in yolov4.cpp
-- BBox confidence thresh in yolov4.cpp
+- Input shape `INPUT_H`, `INPUT_W` defined in yololayer.h
+- Number of classes `CLASS_NUM` defined in yololayer.h
+- FP16/FP32 can be selected by the macro `USE_FP16` in yolov4.cpp
+- GPU id can be selected by the macro `DEVICE` in yolov4.cpp
+- NMS threshold `NMS_THRESH` in yolov4.cpp
+- bbox confidence threshold `BBOX_CONF_THRESH` in yolov4.cpp
+- Batch size `BATCH_SIZE` in yolov4.cpp
 
 ## More Information
 
-See the [readme](../README.md) in home page
+See the [readme](../) on the home page
diff --git a/yolov4/mish.cu b/yolov4/mish.cu
@@ -54,10 +54,10 @@ namespace nvinfer1
         output[idx] = input[idx] * tanh(softplus(input[idx]));
     }
 
-    void MishPlugin::forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize) {
+    void MishPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
         int block_size = thread_count_;
-        int grid_size = (input_size_ + block_size - 1) / block_size;
-        mish_kernel<<<grid_size, block_size>>>(inputs[0], output, input_size_);
+        int grid_size = (input_size_ * batchSize + block_size - 1) / block_size;
+        mish_kernel<<<grid_size, block_size>>>(inputs[0], output, input_size_ * batchSize);
     }
 
 
@@ -66,8 +66,8 @@ namespace nvinfer1
         //assert(batchSize == 1);
         //GPU
         //CUDA_CHECK(cudaStreamSynchronize(stream));
-        forwardGpu((const float *const *)inputs,(float *)outputs[0],stream,batchSize);
+        forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
         return 0;
-    };
+    }
 
 }
diff --git a/yolov4/mish.h b/yolov4/mish.h
@@ -38,7 +38,7 @@ namespace nvinfer1
 
         virtual void serialize(void* buffer) override;
 
-        void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1);
+        void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1);
 
     private:
         int thread_count_ = 256;
diff --git a/yolov4/yololayer.cu b/yolov4/yololayer.cu
@@ -18,7 +18,7 @@ namespace nvinfer1
     YoloLayerPlugin::~YoloLayerPlugin()
     {
     }
-    
+
     // create the plugin at runtime from a byte stream
     YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
     {
@@ -56,24 +56,15 @@ namespace nvinfer1
 
     int YoloLayerPlugin::initialize()
     { 
-        int totalCount = 0;
-        for(const auto& yolo : mYoloKernel)
-            totalCount += (LOCATIONS + 1) * yolo.width*yolo.height * CHECK_COUNT;
-
-        totalCount = 0;//detection count
-        for(const auto& yolo : mYoloKernel)
-            totalCount += yolo.width*yolo.height * CHECK_COUNT;
         return 0;
     }
     
     Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
     {
         //output the result to channel
-        int totalCount = 0;
-        for(const auto& yolo : mYoloKernel)
-            totalCount += yolo.width*yolo.height * CHECK_COUNT * sizeof(Detection) / sizeof(float);
+        int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);
 
-        return Dims3(totalCount + 1, 1, 1);
+        return Dims3(totalsize + 1, 1, 1);
     }
 
     __device__ float Logist(float data){ return 1./(1. + exp(-data)); };
@@ -85,64 +76,60 @@ namespace nvinfer1
         if (idx >= noElements) return;
 
         int total_grid = yoloWidth * yoloHeight;
+        int bnIdx = idx / total_grid;
+        idx = idx - total_grid*bnIdx;
         int info_len_i = 5 + classes;
-        //int info_len_o = 7;
-        int input_col = idx;
-        //int out_row = input_col;
+        const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);
 
         for (int k = 0; k < 3; ++k) {
             int class_id = 0;
             float max_cls_prob = 0.0;
             for (int i = 5; i < info_len_i; ++i) {
-                float p = Logist(input[input_col + k * info_len_i * total_grid + i * total_grid]);
+                float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]);
                 if (p > max_cls_prob) {
                     max_cls_prob = p;
                     class_id = i - 5;
                 }
             }
-            float box_prob = Logist(input[input_col + k * info_len_i * total_grid + 4 * total_grid]);
+            float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]);
             if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue;
 
-            float *res_count = output;
+            float *res_count = output + bnIdx*outputElem;
             int count = (int)atomicAdd(res_count, 1);
+            if (count >= MAX_OUTPUT_BBOX_COUNT) return;
             char* data = (char * )res_count + sizeof(float) + count*sizeof(Detection);
             Detection* det =  (Detection*)(data);
 
             int row = idx / yoloWidth;
             int col = idx % yoloWidth;
 
             //Location
-            det->bbox[0] = (col + Logist(input[input_col + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth;
-            det->bbox[1] = (row + Logist(input[input_col + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight;
-            det->bbox[2] = exp(input[input_col + k * info_len_i * total_grid + 2 * total_grid]) * anchors[2*k];
-            det->bbox[3] = exp(input[input_col + k * info_len_i * total_grid + 3 * total_grid]) * anchors[2*k + 1];
+            det->bbox[0] = (col + Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth;
+            det->bbox[1] = (row + Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight;
+            det->bbox[2] = exp(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]) * anchors[2*k];
+            det->bbox[3] = exp(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]) * anchors[2*k + 1];
             det->det_confidence = box_prob;
             det->class_id = class_id;
             det->class_confidence = max_cls_prob;
         }
     }
-   
-    void YoloLayerPlugin::forwardGpu(const float *const * inputs,float * output,cudaStream_t stream,int batchSize) {
+
+    void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
         void* devAnchor;
         size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
         CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen));
 
-        int outputElem = 1;
-        for (unsigned int i = 0;i< mYoloKernel.size();++i)
-        {
-            const auto& yolo = mYoloKernel[i];
-            outputElem += yolo.width*yolo.height * CHECK_COUNT * sizeof(Detection) / sizeof(float);
-        }
+        int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);
 
-        for(int idx = 0 ;idx < batchSize;++idx)
+        for(int idx = 0 ; idx < batchSize; ++idx) {
             CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float)));
-
+        }
         int numElem = 0;
         for (unsigned int i = 0;i< mYoloKernel.size();++i)
         {
             const auto& yolo = mYoloKernel[i];
             numElem = yolo.width*yolo.height*batchSize;
-            if (numElem < 256)
+            if (numElem < mThreadCount)
                 mThreadCount = numElem;
             CUDA_CHECK(cudaMemcpy(devAnchor, yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
             CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>>
@@ -158,9 +145,9 @@ namespace nvinfer1
         //assert(batchSize == 1);
         //GPU
         //CUDA_CHECK(cudaStreamSynchronize(stream));
-        forwardGpu((const float *const *)inputs,(float *)outputs[0],stream,batchSize);
+        forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
 
         return 0;
-    };
+    }
 
 }
diff --git a/yolov4/yololayer.h b/yolov4/yololayer.h
@@ -14,6 +14,7 @@ namespace Yolo
 {
     static constexpr int CHECK_COUNT = 3;
     static constexpr float IGNORE_THRESH = 0.1f;
+    static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
     static constexpr int CLASS_NUM = 80;
     static constexpr int INPUT_H = 608;
     static constexpr int INPUT_W = 608;
diff --git a/yolov4/yolov4.cpp b/yolov4/yolov4.cpp
@@ -18,13 +18,15 @@
 #define DEVICE 0  // GPU id
 #define NMS_THRESH 0.4
 #define BBOX_CONF_THRESH 0.5
+#define BATCH_SIZE 1
 
 using namespace nvinfer1;
 
 // stuff we know about the network and the input/output blobs
 static const int INPUT_H = Yolo::INPUT_H;
 static const int INPUT_W = Yolo::INPUT_W;
-static const int OUTPUT_SIZE = 1000 * 7 + 1;  // we assume the yololayer outputs no more than 1000 boxes that conf >= 0.1
+static const int DETECTION_SIZE = sizeof(Yolo::Detection) / sizeof(float);
+static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * DETECTION_SIZE + 1;  // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
 const char* INPUT_BLOB_NAME = "data";
 const char* OUTPUT_BLOB_NAME = "prob";
 static Logger gLogger;
@@ -98,10 +100,10 @@ bool cmp(Yolo::Detection& a, Yolo::Detection& b) {
 
 void nms(std::vector<Yolo::Detection>& res, float *output, float nms_thresh = NMS_THRESH) {
     std::map<float, std::vector<Yolo::Detection>> m;
-    for (int i = 0; i < output[0] && i < 1000; i++) {
-        if (output[1 + 7 * i + 4] <= BBOX_CONF_THRESH) continue;
+    for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
+        if (output[1 + DETECTION_SIZE * i + 4] <= BBOX_CONF_THRESH) continue;
         Yolo::Detection det;
-        memcpy(&det, &output[1 + 7 * i], 7 * sizeof(float));
+        memcpy(&det, &output[1 + DETECTION_SIZE * i], DETECTION_SIZE * sizeof(float));
         if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Yolo::Detection>());
         m[det.class_id].push_back(det);
     }
@@ -582,7 +584,7 @@ int main(int argc, char** argv) {
 
     if (argc == 2 && std::string(argv[1]) == "-s") {
         IHostMemory* modelStream{nullptr};
-        APIToModel(1, &modelStream);
+        APIToModel(BATCH_SIZE, &modelStream);
         assert(modelStream != nullptr);
         std::ofstream p("yolov4.engine");
         if (!p) {
@@ -617,10 +619,10 @@ int main(int argc, char** argv) {
     }
 
     // prepare input data ---------------------------
-    float data[3 * INPUT_H * INPUT_W];
+    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
     //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
     //    data[i] = 1.0;
-    static float prob[OUTPUT_SIZE];
+    static float prob[BATCH_SIZE * OUTPUT_SIZE];
     PluginFactory pf;
     IRuntime* runtime = createInferRuntime(gLogger);
     assert(runtime != nullptr);
@@ -630,37 +632,47 @@ int main(int argc, char** argv) {
     assert(context != nullptr);
 
     int fcount = 0;
-    for (auto f: file_names) {
+    for (int f = 0; f < file_names.size(); f++) {
         fcount++;
-        std::cout << fcount << "  " << f << std::endl;
-        cv::Mat img = cv::imread(std::string(argv[2]) + "/" + f);
-        if (img.empty()) continue;
-        cv::Mat pr_img = preprocess_img(img);
-        for (int i = 0; i < INPUT_H * INPUT_W; i++) {
-            data[i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0;
-            data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0;
-            data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0;
+        if (fcount < BATCH_SIZE && f + 1 != file_names.size()) continue;
+        for (int b = 0; b < fcount; b++) {
+            cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - BATCH_SIZE + 1 + b]);
+            if (img.empty()) continue;
+            cv::Mat pr_img = preprocess_img(img);
+            for (int i = 0; i < INPUT_H * INPUT_W; i++) {
+                data[b * 3 * INPUT_H * INPUT_W + i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0;
+                data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0;
+                data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0;
+            }
         }
 
         // Run inference
         auto start = std::chrono::system_clock::now();
-        doInference(*context, data, prob, 1);
-        std::vector<Yolo::Detection> res;
-        nms(res, prob);
+        doInference(*context, data, prob, BATCH_SIZE);
+        std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
+        for (int b = 0; b < fcount; b++) {
+            auto& res = batch_res[b];
+            nms(res, &prob[b * OUTPUT_SIZE]);
+        }
         auto end = std::chrono::system_clock::now();
         std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
-        std::cout << res.size() << std::endl;
-        for (size_t j = 0; j < res.size(); j++) {
-            float *p = (float*)&res[j];
-            for (size_t k = 0; k < 7; k++) {
-                std::cout << p[k] << ", ";
+        for (int b = 0; b < fcount; b++) {
+            auto& res = batch_res[b];
+            //std::cout << res.size() << std::endl;
+            cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - BATCH_SIZE + 1 + b]);
+            for (size_t j = 0; j < res.size(); j++) {
+                float *p = (float*)&res[j];
+                for (size_t k = 0; k < 7; k++) {
+                //    std::cout << p[k] << ", ";
+                }
+                //std::cout << std::endl;
+                cv::Rect r = get_rect(img, res[j].bbox);
+                cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
+                cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
             }
-            std::cout << std::endl;
-            cv::Rect r = get_rect(img, res[j].bbox);
-            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
-            cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
+            cv::imwrite("_" + file_names[f - BATCH_SIZE + 1 + b], img);
         }
-        cv::imwrite("_" + f, img);
+        fcount = 0;
     }
 
     // Destroy the engine

Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,7 @@ namespace Yolo`
`14`	`14`	`{`
`15`	`15`	`static constexpr int CHECK_COUNT = 3;`
`16`	`16`	`static constexpr float IGNORE_THRESH = 0.1f;`
	`17`	`+ static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;`
`17`	`18`	`static constexpr int CLASS_NUM = 80;`
`18`	`19`	`static constexpr int INPUT_H = 608;`
`19`	`20`	`static constexpr int INPUT_W = 608;`