fix yololayer

wang-xinyu · wang-xinyu · commit e56ec6ebbec2 · 2020-04-02T22:54:03.000+08:00
diff --git a/README.md b/README.md
@@ -34,11 +34,12 @@ Following models are implemented, each one also has a readme inside.
 |[mnasnet](./mnasnet)| MNASNet with depth multiplier of 0.5 from the paper |
 |[mobilenet](./mobilenetv2)| MobileNet V2, V3-small, V3-large. |
 |[resnet](./resnet)| resnet-18, resnet-50 and resnext50-32x4d are implemented |
-|[senet](./senet)| se_resnet50 |
+|[senet](./senet)| se-resnet50 |
 |[shufflenet](./shufflenetv2)| ShuffleNetV2 with 0.5x output channels |
 |[squeezenet](./squeezenet)| SqueezeNet 1.1 model |
 |[vgg](./vgg)| VGG 11-layer model |
 |[yolov3](./yolov3)| darknet-53, weights from yolov3 authors |
+|[yolov3-spp](./yolov3-spp)| darknet-53, weights from [ultralytics/yolov3](https://github.com/ultralytics/yolov3) |
 
 ## Tricky Operations
 
@@ -54,7 +55,18 @@ Some tricky operations encountered in these models, already solved, but might ha
 |channel shuffle| use two shuffle layers to implement `channel_shuffle`, see shufflenet. |
 |adaptive pool| use fixed input dimension, and use regular average pooling, see shufflenet. |
 |leaky relu| I wrote a leaky relu plugin, but PRelu in `NvInferPlugin.h` can be used, see yolov3. |
-|yolo layer| yolo layer is implemented as a plugin, see yolov3. |
+|yolo layer v1| yolo layer is implemented as a plugin, see yolov3. |
+|yolo layer v2| three yolo layers implemented in one plugin, see yolov3-spp. |
 |upsample| replaced by a deconvolution layer, see yolov3. |
 |hsigmoid| hard sigmoid is implemented as a plugin, hsigmoid and hswish are used in mobilenetv3 |
 
+## Speed Benchmark
+
+| Models | Device | BatchSize | Mode | Input Shape(HxW) | FPS |
+|-|-|:-:|:-:|:-:|:-:|
+| yolov3(darknet53) | Xavier | 1 | FP16 | 320x320 | 55 |
+| yolov3-spp(darknet53) | GTX1080 | 1 | FP32 | 256x416 | 94 |
+
+Help wanted: if you have speed results, please open an issue or PR.
+
+Thanks to @Kmarconi for the yolov3(darknet53) speed test.
diff --git a/yolov3-spp/README.md b/yolov3-spp/README.md
@@ -1,6 +1,6 @@
 # yolov3-spp
 
-The Pytorch implementation is [ultralytics/yolov3](https://github.com/ultralytics/yolov3)
+The PyTorch implementation is [ultralytics/yolov3](https://github.com/ultralytics/yolov3). It provides two sets of trained weights for yolov3-spp: `yolov3-spp.pt` and `yolov3-spp-ultralytics.pt` (originally named `ultralytics68.pt`).
 
 Following tricks are used in this yolov3-spp:
 
@@ -10,14 +10,14 @@ Following tricks are used in this yolov3-spp:
 ## Excute:
 
 ```
-1. generate yolov3-spp_ultralytics68.wts from pytorch implementation with yolov3-spp.cfg and ultralytics68.pt
+1. generate yolov3-spp_ultralytics68.wts from pytorch implementation with yolov3-spp.cfg and yolov3-spp-ultralytics.pt
 
 git clone https://github.com/wang-xinyu/tensorrtx.git
 git clone https://github.com/ultralytics/yolov3.git
-// download its weights 'ultralytics68.pt'
+// download its weights 'yolov3-spp-ultralytics.pt'
 cd yolov3
 cp ../tensorrtx/yolov3-spp/gen_wts.py .
-python gen_wts.py ultralytics68.pt
+python gen_wts.py yolov3-spp-ultralytics.pt
 // a file 'yolov3-spp_ultralytics68.wts' will be generated.
 // the master branch of yolov3 should work, if not, you can checkout 4ac60018f6e6c1e24b496485f126a660d9c793d8
 
diff --git a/yolov3-spp/yololayer.cu b/yolov3-spp/yololayer.cu
@@ -195,33 +195,34 @@ namespace nvinfer1
         //int out_row = input_col;
 
         for (int k = 0; k < 3; ++k) {
-            float *res_count = output;
-            if(*res_count > 1000) break;
-            int count = (int)atomicAdd(res_count, 1);
-            char* data = (char * )res_count + sizeof(float) + count*sizeof(Detection);
-            Detection* det =  (Detection*)(data);
-
             int class_id = 0;
-            float max_prob = 0.0;
+            float max_cls_prob = 0.0;
             for (int i = 5; i < info_len_i; ++i) {
                 float p = Logist(input[input_col + k * info_len_i * total_grid + i * total_grid]);
-                if (p > max_prob) {
-                    max_prob = p;
+                if (p > max_cls_prob) {
+                    max_cls_prob = p;
                     class_id = i - 5;
                 }
             }
+            float box_prob = Logist(input[input_col + k * info_len_i * total_grid + 4 * total_grid]);
+            if (max_cls_prob < 0.1 || box_prob < 0.1) continue;
+
+            float *res_count = output;
+            int count = (int)atomicAdd(res_count, 1);
+            char* data = (char * )res_count + sizeof(float) + count*sizeof(Detection);
+            Detection* det =  (Detection*)(data);
 
             int row = idx / yoloWidth;
             int col = idx % yoloWidth;
 
-        //Location
+            //Location
             det->bbox[0] = (col + Logist(input[input_col + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth;
             det->bbox[1] = (row + Logist(input[input_col + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight;
             det->bbox[2] = exp(input[input_col + k * info_len_i * total_grid + 2 * total_grid]) * anchors[2*k];
             det->bbox[3] = exp(input[input_col + k * info_len_i * total_grid + 3 * total_grid]) * anchors[2*k + 1];
-            det->det_confidence =  Logist(input[input_col + k * info_len_i * total_grid + 4 * total_grid]);
-            det->class_id =  class_id;
-            det->class_confidence =  max_prob;
+            det->det_confidence = box_prob;
+            det->class_id = class_id;
+            det->class_confidence = max_cls_prob;
         }
     }
    
@@ -247,9 +248,9 @@ namespace nvinfer1
             numElem = yolo.width*yolo.height*batchSize;
             if (numElem < 256)
                 mThreadCount = numElem;
-        CUDA_CHECK(cudaMemcpy(devAnchor, yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
+            CUDA_CHECK(cudaMemcpy(devAnchor, yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
             CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>>
-                    (inputs[i],output, numElem, yolo.width, yolo.height, (float *)devAnchor, mClassCount ,outputElem);
+                (inputs[i],output, numElem, yolo.width, yolo.height, (float *)devAnchor, mClassCount ,outputElem);
         }
 
         CUDA_CHECK(cudaFree(devAnchor));
diff --git a/yolov3-spp/yolov3-spp.cpp b/yolov3-spp/yolov3-spp.cpp
@@ -22,7 +22,7 @@ using namespace Yolo;
 // stuff we know about the network and the input/output blobs
 static const int INPUT_H = 256;
 static const int INPUT_W = 416;
-static const int OUTPUT_SIZE = 1000 * 7 + 1;
+static const int OUTPUT_SIZE = 1000 * 7 + 1;  // 1 count slot + 7 floats per box; we assume the yololayer outputs no more than 1000 boxes with conf >= 0.1
 const char* INPUT_BLOB_NAME = "data";
 const char* OUTPUT_BLOB_NAME = "prob";
 static Logger gLogger;
@@ -96,7 +96,7 @@ bool cmp(Detection& a, Detection& b) {
 
 void nms(std::vector<Detection>& res, float *output, float nms_thresh = 0.4) {
     std::map<float, std::vector<Detection>> m;
-    for (int i = 0; i < output[0]; i++) {
+    for (int i = 0; i < output[0] && i < 1000; i++) {
         if (output[1 + 7 * i + 4] <= 0.5) continue;
         Detection det;
         memcpy(&det, &output[1 + 7 * i], 7 * sizeof(float));
@@ -537,6 +537,8 @@ int main(int argc, char** argv) {
         doInference(*context, data, prob, 1);
         std::vector<Detection> res;
         nms(res, prob);
+        auto end = std::chrono::system_clock::now();
+        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
         for (int i=0; i<20; i++) {
             std::cout << prob[i] << ",";
         }
@@ -551,8 +553,6 @@ int main(int argc, char** argv) {
             cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
             cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
         }
-        auto end = std::chrono::system_clock::now();
-        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
         cv::imwrite("_" + f, img);
     }