Skip to content

Commit 2c4fdea

Browse files
authored
update rcnn (wang-xinyu#521)
* add MaskRcnn(C4) * add MaskRcnnInference plugin for mask selecting * split ROIHeads to BOXHead and MaskHead * remove unuseful parameters in createEngine_rcnn and BuildRcnnModel * change the type of scores_h, boxes_h and classes_h from unique_ptr to vector * add doInference * add maskrcnn postprocess * update README.md * update rcnn * fix bugs with R18 and R34 add BasicBlock for R18 and R34 add STRIDE_IN_1X1, MakeStage is same with detectron2 now. * update README.md * update rcnn replace picture with url
1 parent f4c384d commit 2c4fdea

File tree

3 files changed

+102
-29
lines changed

3 files changed

+102
-29
lines changed

rcnn/README.md

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ TensorRT7.2 is recommended because Resize layer in 7.0 with kLINEAR mode is a lit
2626
// go to facebookresearch/detectron2
2727
python setup.py build develop // more install information see https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md
2828
// download https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl
29-
// copy tensorrtx/rcnn/(gen_wts.py,demo.jpg) into facebookresearch/detectron2
29+
// download https://raw.githubusercontent.com/freedenS/TestImage/main/demo.jpg
30+
// copy tensorrtx/rcnn/gen_wts.py and demo.jpg into facebookresearch/detectron2
3031
// ensure cfg.MODEL.WEIGHTS in gen_wts.py is correct
3132
// go to facebookresearch/detectron2
3233
python gen_wts.py
@@ -52,33 +53,39 @@ sudo ./rcnn -d faster.engine ../samples
5253
// sudo ./rcnn -d mask.engine ../samples m
5354
```
5455

55-
3. check the images generated, as follows. _zidane.jpg and _bus.jpg
56+
3. check the images generated, as follows. _demo.jpg and so on.
5657

5758
## Backbone
5859

5960
#### R18, R34, R152
6061

6162
```
63+
// python
6264
1.download pretrained model
6365
R18: https://download.pytorch.org/models/resnet18-f37072fd.pth
6466
R34: https://download.pytorch.org/models/resnet34-b627a593.pth
67+
R50: https://download.pytorch.org/models/resnet50-0676ba61.pth
68+
R101: https://download.pytorch.org/models/resnet101-63fe2227.pth
6569
R152: https://download.pytorch.org/models/resnet152-394f9c45.pth
6670
2.convert pth to pkl by facebookresearch/detectron2/tools/convert-torchvision-to-d2.py
6771
3.set merge_from_file in gen_wts.py
6872
./configs/COCO-Detections/faster_rcnn_R_50_C4_1x.yaml for fasterRcnn
6973
./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml for maskRcnn
70-
4.set cfg.MODEL.RESNETS.DEPTH = 18(34,152),
74+
4.set cfg.MODEL.RESNETS.DEPTH = 18(34,50,101,152),
7175
cfg.MODEL.RESNETS.STRIDE_IN_1X1 = False,
72-
cfg.MODEL.RESNETS.RES2_OUT_CHANNELS = 64, // for R18, R34
76+
cfg.MODEL.RESNETS.RES2_OUT_CHANNELS = 64, // for R18, R34; 256 for others
7377
cfg.MODEL.PIXEL_MEAN = [123.675, 116.280, 103.530],
7478
cfg.MODEL.PIXEL_STD = [58.395, 57.120, 57.375],
7579
cfg.INPUT.FORMAT = "RGB"
7680
and then train your own model
77-
5.set BACKBONE_RESNETTYPE = R18(R34, R152) in rcnn.cpp line 13
78-
6.modify PIXEL_MEAN and PIXEL_STD in rcnn.cpp
79-
7.set res2_out_channels=64 in BuildResNet in rcnn.cpp line 239 // for R18, R34
80-
8.generate wts file from your own model and build your engine, refer to how to run
81-
9.convert your image to RGB before inference
81+
5.generate your wts file.
82+
// c++
83+
6.set BACKBONE_RESNETTYPE = R18(R34,R50,R101,R152) in rcnn.cpp line 14
84+
7.modify PIXEL_MEAN and PIXEL_STD in rcnn.cpp
85+
8.set STRIDE_IN_1X1=false in backbone.hpp line 9
86+
9.set other parameters if it's not same with default
87+
10.build your engine, refer to how to run
88+
11.convert your image to RGB before inference
8289
```
8390

8491
#### R50, R101
@@ -95,7 +102,8 @@ sudo ./rcnn -d faster.engine ../samples
95102
R50-mask: ./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml
96103
R101-mask: ./configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml
97104
3.set BACKBONE_RESNETTYPE = R50(R101) rcnn.cpp line 13
98-
4.follow how to run
105+
4.set STRIDE_IN_1X1=true in backbone.hpp
106+
5.follow how to run
99107
```
100108

101109
## NOTE
@@ -114,7 +122,7 @@ sudo ./rcnn -d faster.engine ../samples
114122

115123
- if you want to use maskrcnn with cuda10.2, please be sure that you have upgraded cuda to the latest patch. see https://github.com/NVIDIA/TensorRT/issues/1151 for detail.
116124

117-
- you can only build fasterRcnn part with maskRcnn weights file.
125+
- you can build fasterRcnn with a maskRcnn weights file.
118126

119127
## Quantization
120128

rcnn/backbone.hpp

Lines changed: 69 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
#include <string>
55
#include "common.hpp"
66

7+
/* when stride>1, whether to put the stride in the first 1x1 convolution or the bottleneck 3x3 convolution.
8+
set false when using a backbone from torchvision*/
9+
#define STRIDE_IN_1X1 true
10+
711
enum RESNETTYPE {
812
R18 = 0,
913
R34,
@@ -44,6 +48,55 @@ int group_num = 1) {
4448
return max_pool2d;
4549
}
4650

51+
ITensor* BasicBlock(INetworkDefinition *network,
52+
std::map<std::string, Weights>& weightMap,
53+
const std::string& lname,
54+
ITensor& input,
55+
int in_channels,
56+
int out_channels,
57+
int stride = 1) {
58+
// conv1
59+
IConvolutionLayer* conv1 = network->addConvolutionNd(input, out_channels, DimsHW{ 3, 3 },
60+
weightMap[lname + ".conv1.weight"],
61+
weightMap[lname + ".conv1.bias"]);
62+
assert(conv1);
63+
conv1->setStrideNd(DimsHW{ stride, stride });
64+
conv1->setPaddingNd(DimsHW{ 1, 1 });
65+
66+
auto r1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
67+
assert(r1);
68+
69+
// conv2
70+
IConvolutionLayer* conv2 = network->addConvolutionNd(*r1->getOutput(0), out_channels, DimsHW{ 3, 3 },
71+
weightMap[lname + ".conv2.weight"],
72+
weightMap[lname + ".conv2.bias"]);
73+
assert(conv2);
74+
conv2->setStrideNd(DimsHW{ 1, 1 });
75+
conv2->setPaddingNd(DimsHW{ 1, 1 });
76+
77+
// shortcut
78+
ITensor* shortcut_value = nullptr;
79+
if (in_channels != out_channels) {
80+
auto shortcut = network->addConvolutionNd(input, out_channels, DimsHW{ 1, 1 },
81+
weightMap[lname + ".shortcut.weight"],
82+
weightMap[lname + ".shortcut.bias"]);
83+
assert(shortcut);
84+
shortcut->setStrideNd(DimsHW{ stride, stride });
85+
shortcut_value = shortcut->getOutput(0);
86+
} else {
87+
shortcut_value = &input;
88+
}
89+
90+
// add
91+
auto ew = network->addElementWise(*conv2->getOutput(0), *shortcut_value, ElementWiseOperation::kSUM);
92+
assert(ew);
93+
94+
auto r3 = network->addActivation(*ew->getOutput(0), ActivationType::kRELU);
95+
assert(r3);
96+
97+
return r3->getOutput(0);
98+
}
99+
47100
ITensor* BottleneckBlock(INetworkDefinition *network,
48101
std::map<std::string, Weights>& weightMap,
49102
const std::string& lname,
@@ -54,12 +107,14 @@ int out_channels,
54107
int stride = 1,
55108
int dilation = 1,
56109
int group_num = 1) {
110+
int stride_1x1 = STRIDE_IN_1X1 ? stride : 1;
111+
int stride_3x3 = STRIDE_IN_1X1 ? 1 : stride;
57112
// conv1
58113
IConvolutionLayer* conv1 = network->addConvolutionNd(input, bottleneck_channels, DimsHW{ 1, 1 },
59114
weightMap[lname + ".conv1.weight"],
60115
weightMap[lname + ".conv1.bias"]);
61116
assert(conv1);
62-
conv1->setStrideNd(DimsHW{ stride, stride });
117+
conv1->setStrideNd(DimsHW{ stride_1x1, stride_1x1 });
63118
conv1->setNbGroups(group_num);
64119

65120
auto r1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
@@ -70,7 +125,7 @@ int group_num = 1) {
70125
weightMap[lname + ".conv2.weight"],
71126
weightMap[lname + ".conv2.bias"]);
72127
assert(conv2);
73-
conv2->setStrideNd(DimsHW{ 1, 1 });
128+
conv2->setStrideNd(DimsHW{ stride_3x3, stride_3x3 });
74129
conv2->setPaddingNd(DimsHW{ 1 * dilation, 1 * dilation });
75130
conv2->setDilationNd(DimsHW{ dilation, dilation });
76131
conv2->setNbGroups(group_num);
@@ -115,21 +170,23 @@ std::map<std::string, Weights>& weightMap,
115170
const std::string& lname,
116171
ITensor& input,
117172
int stage,
173+
RESNETTYPE resnet_type,
118174
int in_channels,
119175
int bottleneck_channels,
120176
int out_channels,
121177
int first_stride = 1,
122178
int dilation = 1) {
123179
ITensor* out = &input;
124180
for (int i = 0; i < stage; i++) {
125-
if (i == 0)
126-
out = BottleneckBlock(network, weightMap,
127-
lname + "." + std::to_string(i), *out, in_channels,
128-
bottleneck_channels, out_channels, first_stride, dilation);
181+
std::string layerName = lname + "." + std::to_string(i);
182+
int stride = i == 0 ? first_stride : 1;
183+
184+
if (resnet_type == R18 || resnet_type == R34)
185+
out = BasicBlock(network, weightMap, layerName, *out, in_channels, out_channels, stride);
129186
else
130-
out = BottleneckBlock(network, weightMap,
131-
lname + "." + std::to_string(i), *out, in_channels,
132-
bottleneck_channels, out_channels, 1, dilation);
187+
out = BottleneckBlock(network, weightMap, layerName, *out,
188+
in_channels, bottleneck_channels, out_channels, stride, dilation);
189+
133190
in_channels = out_channels;
134191
}
135192
return out;
@@ -161,8 +218,9 @@ int res5_dilation = 1) {
161218
int first_stride = (i == 0 || (i == 3 && dilation == 2)) ? 1 : 2;
162219
out = MakeStage(network, weightMap,
163220
"backbone.res" + std::to_string(i + 2), *out,
164-
num_blocks_per_stage.at(resnet_type)[i], stem_out_channels,
165-
bottleneck_channels, out_channels, first_stride, dilation);
221+
num_blocks_per_stage.at(resnet_type)[i], resnet_type,
222+
stem_out_channels, bottleneck_channels, out_channels,
223+
first_stride, dilation);
166224
stem_out_channels = out_channels;
167225
bottleneck_channels *= 2;
168226
out_channels *= 2;

rcnn/rcnn.cpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ static constexpr int INPUT_H = 480;
2222
static constexpr int INPUT_W = 640;
2323
static int IMAGE_HEIGHT = 800;
2424
static int IMAGE_WIDTH = 1333;
25+
// backbone
26+
static const int RES2_OUT_CHANNELS = (BACKBONE_RESNETTYPE == R18 ||
27+
BACKBONE_RESNETTYPE == R34) ? 64 : 256;
2528
// rpn
2629
static const std::vector<float> ANCHOR_SIZES = { 32, 64, 128, 256, 512 };
2730
static const std::vector<float> ASPECT_RATIOS = { 0.5, 1.0, 2.0 };
@@ -132,14 +135,13 @@ void calculateRatio() {
132135
}
133136

134137
ITensor* RPN(INetworkDefinition *network,
135-
std::map<std::string, Weights>& weightMap, ITensor& features,
136-
int out_channels = 256) {
138+
std::map<std::string, Weights>& weightMap, ITensor& features) {
137139
int num_anchors = ANCHOR_SIZES.size() * ASPECT_RATIOS.size();
138140
int box_dim = 4;
139141

140142
// rpn head conv
141-
auto rpn_head_conv = network->addConvolutionNd(features, out_channels,
142-
DimsHW{ 3, 3 }, weightMap["proposal_generator.rpn_head.conv.weight"],
143+
auto rpn_head_conv = network->addConvolutionNd(features, features.getDimensions().d[0], DimsHW{ 3, 3 },
144+
weightMap["proposal_generator.rpn_head.conv.weight"],
143145
weightMap["proposal_generator.rpn_head.conv.bias"]);
144146
assert(rpn_head_conv);
145147
rpn_head_conv->setStrideNd(DimsHW{ 1, 1 });
@@ -185,8 +187,13 @@ ITensor* proposals, ITensor* features, int num_proposals) {
185187
auto roiAlignLayer = network->addPluginV2(roi_inputs.data(), roi_inputs.size(), roiAlignPlugin);
186188

187189
// res5
190+
/* same with https://github.com/facebookresearch/detectron2/
191+
blob/9246ebc3af1c023cfbdae77e5d976edbcf9a2933/detectron2/modeling/roi_heads/roi_heads.py#L430,
192+
use bottleneck here, so pass R50*/
188193
auto box_features = MakeStage(network, weightMap, "roi_heads.res5",
189-
*roiAlignLayer->getOutput(0), 3, 1024, 512, 2048, 2);
194+
*roiAlignLayer->getOutput(0), 3, R50,
195+
roiAlignLayer->getOutput(0)->getDimensions().d[1],
196+
512, RES2_OUT_CHANNELS * 8, 2);
190197
return box_features;
191198
}
192199

@@ -293,9 +300,9 @@ ICudaEngine* createEngine_rcnn(unsigned int maxBatchSize,
293300
loadWeights(wtsfile, weightMap);
294301

295302
// backbone
296-
ITensor* features = BuildResNet(network, weightMap, *data, BACKBONE_RESNETTYPE, 64, 64, 256);
303+
ITensor* features = BuildResNet(network, weightMap, *data, BACKBONE_RESNETTYPE, 64, 64, RES2_OUT_CHANNELS);
297304

298-
auto proposals = RPN(network, weightMap, *features, 1024);
305+
auto proposals = RPN(network, weightMap, *features);
299306
auto results = ROIHeads(network, weightMap, proposals, features);
300307

301308
// build output

0 commit comments

Comments
 (0)