
Commit f2521a5

nengwp and wang-xinyu authored

RCNN upgrade to support TensorRT 7&8 (wang-xinyu#1256)

* rcnn upgrade to support TensorRT 8.
* Update RpnDecodePlugin.h: remove Chinese comments
* Update rcnn.cpp: remove Chinese comments
* Update backbone.hpp: remove Chinese comments
* Update backbone.hpp
* Update rcnn.cpp
* Update rcnn.cpp
* Update rcnn.cpp
* Update MaskRcnnInferencePlugin.h
* rcnn upgrade to support TensorRT 8.x
* rcnn upgrade to support TensorRT 8.x
* Update macros.h
* Update README.md

Co-authored-by: nengwp <[email protected]>
Co-authored-by: Wang Xinyu <[email protected]>
1 parent ef22b1d commit f2521a5

23 files changed: +286 −210 lines
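Note for readers: the TensorRT 7/8 compatibility story below rests on a small set of macros from rcnn/macros.h, which this commit updates but which none of the hunks shown here display. A minimal sketch of what such a header plausibly defines, inferred from the TRT_NOEXCEPT, TRT_CONST_ENQUEUE, and CUDA_11 guards used in the changed files (an assumption, not the verbatim file):

// macros.h (sketch; assumes TensorRT's NV_TENSORRT_MAJOR and the toolkit's
// CUDA_VERSION macros, and may differ in detail from the committed file)
#ifndef __MACROS_H
#define __MACROS_H

#include <NvInfer.h>  // defines NV_TENSORRT_MAJOR
#include <cuda.h>     // defines CUDA_VERSION

// TensorRT 8 marked the IPluginV2* virtual methods noexcept and made the
// output pointers of enqueue() const; TensorRT 7 did neither.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

// CUDA 11 ships CUB as a standalone library; older toolkits expose it only
// through Thrust's internal detail headers.
#if CUDA_VERSION >= 11000
#define CUDA_11
#endif

#endif  // __MACROS_H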

README.md (1 addition, 2 deletions)

@@ -14,7 +14,7 @@ The basic workflow of TensorRTx is:
 4. Load the TensorRT engine and run inference.

 ## News
-
+- `1 Mar 2023`. [Nengwp](https://github.com/nengwp): [RCNN](./rcnn) and [UNet](./unet) upgrade to support TensorRT 8.
 - `18 Dec 2022`. [YOLOv5](./yolov5) upgrade to support v7.0, including instance segmentation.
 - `12 Dec 2022`. [East-Face](https://github.com/East-Face): [UNet](./unet) upgrade to support v3.0 of [Pytorch-UNet](https://github.com/milesial/Pytorch-UNet).
 - `26 Oct 2022`. [ausk](https://github.com/ausk): YoloP (You Only Look Once for Panoptic Driving Perception).
@@ -29,7 +29,6 @@ The basic workflow of TensorRTx is:
 - `19 Oct 2021`. [liuqi123123](https://github.com/liuqi123123) added cuda preprocessing for yolov5, preprocessing + inference is 3x faster when batchsize=8.
 - `18 Oct 2021`. [xupengao](https://github.com/xupengao): YOLOv5 updated to v6.0, supporting n/s/m/l/x/n6/s6/m6/l6/x6.
 - `31 Aug 2021`. [FamousDirector](https://github.com/FamousDirector): update retinaface to support TensorRT 8.0.
-- `27 Aug 2021`. [HaiyangPeng](https://github.com/HaiyangPeng): add a python wrapper for hrnet segmentation.

 ## Tutorials

rcnn/BatchedNms.cu (mode 100644 → 100755; 15 additions, 5 deletions)

@@ -3,7 +3,6 @@
 #include <thrust/sequence.h>
 #include <thrust/execution_policy.h>
 #include <thrust/gather.h>
-#include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>
 #include <cmath>
 #include <algorithm>
 #include <iostream>
@@ -12,6 +11,17 @@
 #include <vector>
 #include "BatchedNmsPlugin.h"
 #include "./cuda_utils.h"
+#include "macros.h"
+
+#ifdef CUDA_11
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#else
+#include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>
+#include <thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh>
+namespace cub = thrust::cuda_cub::cub;
+
+#endif

 namespace nvinfer1 {

@@ -52,7 +62,7 @@ __global__ void batched_nms_kernel(
 }

 int batchedNms(int batch_size,
-    const void *const *inputs, void **outputs,
+    const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
     size_t count, int detections_per_im, float nms_thresh,
     void *workspace, size_t workspace_size, cudaStream_t stream) {

@@ -63,7 +73,7 @@ int batchedNms(int batch_size,
   workspace_size += get_size_aligned<float>(count);  // scores_sorted

   size_t temp_size_sort = 0;
-  thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(
+  cub::DeviceRadixSort::SortPairsDescending(
     static_cast<void*>(nullptr), temp_size_sort,
     static_cast<float*>(nullptr),
     static_cast<float*>(nullptr),
@@ -95,7 +105,7 @@ int batchedNms(int batch_size,

   // Sort scores and corresponding indices
   int num_detections = count;
-  thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
+  cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
     in_scores, scores_sorted, indices, indices_sorted, num_detections, 0, sizeof(*scores_sorted) * 8, stream);

   // Launch actual NMS kernel - 1 block with each thread handling n detections
@@ -106,7 +116,7 @@ int batchedNms(int batch_size,
     indices_sorted, scores_sorted, in_classes, in_boxes);

   // Re-sort with updated scores
-  thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
+  cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
    scores_sorted, scores_sorted, indices_sorted, indices,
    num_detections, 0, sizeof(*scores_sorted) * 8, stream);
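Every SortPairsDescending call above follows CUB's two-phase convention: call once with a null workspace pointer so CUB reports the temporary-storage size it needs, then call again with a real buffer to do the sort. A minimal standalone sketch of the pattern (buffer names are illustrative, not from this file):

// Descending sort of (score, index) pairs, two-phase CUB style.
#include <cub/device/device_radix_sort.cuh>
#include <cuda_runtime.h>

void sortScoresDescending(const float* d_scores_in, float* d_scores_out,
                          const int* d_indices_in, int* d_indices_out,
                          int n, cudaStream_t stream) {
  size_t temp_bytes = 0;
  // Phase 1: null workspace, so CUB only writes the required size.
  cub::DeviceRadixSort::SortPairsDescending(
      nullptr, temp_bytes, d_scores_in, d_scores_out,
      d_indices_in, d_indices_out, n, 0, sizeof(float) * 8, stream);

  void* d_temp = nullptr;
  cudaMalloc(&d_temp, temp_bytes);

  // Phase 2: the actual sort, keys descending, values carried along.
  cub::DeviceRadixSort::SortPairsDescending(
      d_temp, temp_bytes, d_scores_in, d_scores_out,
      d_indices_in, d_indices_out, n, 0, sizeof(float) * 8, stream);

  cudaFree(d_temp);
}

batchedNms() itself avoids the cudaMalloc by carving this temporary storage, along with its other buffers, out of the single workspace TensorRT passes to enqueue().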

rcnn/BatchedNmsPlugin.h (mode 100644 → 100755; 29 additions, 28 deletions)

@@ -4,6 +4,7 @@

 #include <vector>
 #include <cassert>
+#include "macros.h"

 using namespace nvinfer1;

@@ -13,7 +14,7 @@ using namespace nvinfer1;

 namespace nvinfer1 {
 int batchedNms(int batchSize,
-    const void *const *inputs, void **outputs,
+    const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
     size_t count, int detections_per_im, float nms_thresh,
     void *workspace, size_t workspace_size, cudaStream_t stream);

@@ -40,12 +41,12 @@ class BatchedNmsPlugin : public IPluginV2Ext {
     read(d, _count);
   }

-  size_t getSerializationSize() const override {
+  size_t getSerializationSize() const TRT_NOEXCEPT override {
     return sizeof(_nms_thresh) + sizeof(_detections_per_im)
       + sizeof(_count);
   }

-  void serialize(void *buffer) const override {
+  void serialize(void *buffer) const TRT_NOEXCEPT override {
     char* d = static_cast<char*>(buffer);
     write(d, _nms_thresh);
     write(d, _detections_per_im);
@@ -70,34 +71,34 @@ class BatchedNmsPlugin : public IPluginV2Ext {
     this->deserialize(data, length);
   }

-  const char *getPluginType() const override {
+  const char *getPluginType() const TRT_NOEXCEPT override {
     return PLUGIN_NAME;
   }

-  const char *getPluginVersion() const override {
+  const char *getPluginVersion() const TRT_NOEXCEPT override {
     return PLUGIN_VERSION;
   }

-  int getNbOutputs() const override {
+  int getNbOutputs() const TRT_NOEXCEPT override {
     return 3;
   }

   Dims getOutputDimensions(int index,
-    const Dims *inputs, int nbInputDims) override {
+    const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override {
     assert(nbInputDims == 3);
     assert(index < this->getNbOutputs());
     return Dims2(_detections_per_im, index == 1 ? 4 : 1);
   }

-  bool supportsFormat(DataType type, PluginFormat format) const override {
+  bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override {
     return type == DataType::kFLOAT && format == PluginFormat::kLINEAR;
   }

-  int initialize() override { return 0; }
+  int initialize() TRT_NOEXCEPT override { return 0; }

-  void terminate() override {}
+  void terminate() TRT_NOEXCEPT override {}

-  size_t getWorkspaceSize(int maxBatchSize) const override {
+  size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override {
     static int size = -1;
     if (size < 0) {
       size = batchedNms(maxBatchSize, nullptr, nullptr, _count,
@@ -108,40 +109,40 @@ class BatchedNmsPlugin : public IPluginV2Ext {
   }

   int enqueue(int batchSize,
-    const void *const *inputs, void **outputs,
-    void *workspace, cudaStream_t stream) override {
+    const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
+    void *workspace, cudaStream_t stream) TRT_NOEXCEPT override {
     return batchedNms(batchSize, inputs, outputs, _count,
       _detections_per_im, _nms_thresh,
       workspace, getWorkspaceSize(batchSize), stream);
   }

-  void destroy() override {
+  void destroy() TRT_NOEXCEPT override {
     delete this;
   }

-  const char *getPluginNamespace() const override {
+  const char *getPluginNamespace() const TRT_NOEXCEPT override {
     return PLUGIN_NAMESPACE;
   }

-  void setPluginNamespace(const char *N) override {
+  void setPluginNamespace(const char *N) TRT_NOEXCEPT override {
   }

   // IPluginV2Ext Methods
-  DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const {
+  DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override {
     assert(index < 3);
     return DataType::kFLOAT;
   }

   bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
-    int nbInputs) const {
+    int nbInputs) const TRT_NOEXCEPT override {
     return false;
   }

-  bool canBroadcastInputAcrossBatch(int inputIndex) const { return false; }
+  bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; }

   void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs,
     const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast,
-    const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) {
+    const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override {
     assert(*inputTypes == nvinfer1::DataType::kFLOAT &&
       floatFormat == nvinfer1::PluginFormat::kLINEAR);
     assert(nbInputs == 3);
@@ -150,7 +151,7 @@ class BatchedNmsPlugin : public IPluginV2Ext {
     _count = inputDims[0].d[0];
   }

-  IPluginV2Ext *clone() const override {
+  IPluginV2Ext *clone() const TRT_NOEXCEPT override {
     return new BatchedNmsPlugin(_nms_thresh, _detections_per_im, _count);
   }

@@ -170,24 +171,24 @@ class BatchedNmsPluginCreator : public IPluginCreator {
 public:
   BatchedNmsPluginCreator() {}

-  const char *getPluginNamespace() const override {
+  const char *getPluginNamespace() const TRT_NOEXCEPT override {
     return PLUGIN_NAMESPACE;
   }
-  const char *getPluginName() const override {
+  const char *getPluginName() const TRT_NOEXCEPT override {
     return PLUGIN_NAME;
   }

-  const char *getPluginVersion() const override {
+  const char *getPluginVersion() const TRT_NOEXCEPT override {
     return PLUGIN_VERSION;
   }

-  IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) override {
+  IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override {
     return new BatchedNmsPlugin(serialData, serialLength);
   }

-  void setPluginNamespace(const char *N) override {}
-  const PluginFieldCollection *getFieldNames() override { return nullptr; }
-  IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) override { return nullptr; }
+  void setPluginNamespace(const char *N) TRT_NOEXCEPT override {}
+  const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return nullptr; }
+  IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override { return nullptr; }
 };

 REGISTER_TENSORRT_PLUGIN(BatchedNmsPluginCreator);
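Two things worth calling out in this header. First, the IPluginV2Ext methods (getOutputDataType, isOutputBroadcastAcrossBatch, canBroadcastInputAcrossBatch, configurePlugin) previously lacked the override specifier; adding it means any future signature drift against the base class fails at compile time instead of silently declaring a new method. Second, serialize() and deserialize() lean on small cursor-style write/read helpers defined elsewhere in the plugin code; a sketch of that common pattern (a hypothetical reconstruction, not the verbatim helpers):

// Cursor-style POD (de)serialization helpers, as used by serialize() and
// deserialize() above (hypothetical reconstruction).
template <typename T>
void write(char*& buffer, const T& val) {
  *reinterpret_cast<T*>(buffer) = val;  // copy the field into the buffer
  buffer += sizeof(T);                  // advance the cursor
}

template <typename T>
void read(const char*& buffer, T& val) {
  val = *reinterpret_cast<const T*>(buffer);
  buffer += sizeof(T);
}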

rcnn/CMakeLists.txt (mode 100644 → 100755; 8 additions, 8 deletions)

@@ -1,11 +1,11 @@
-cmake_minimum_required(VERSION 2.6)
+cmake_minimum_required(VERSION 3.1)

 project(rcnn)

-add_definitions(-std=c++11)
+add_definitions(-std=c++14)

 option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_BUILD_TYPE Debug)

 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--extended-lambda)
@@ -15,13 +15,13 @@ find_package(CUDA REQUIRED)
 include_directories(${PROJECT_SOURCE_DIR}/include)
 # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
 # cuda
-include_directories(/usr/local/cuda-10.2/include)
-link_directories(/usr/local/cuda-10.2/lib64)
+include_directories(/usr/local/cuda/include)
+link_directories(/usr/local/cuda/lib64)
 # tensorrt
-include_directories(/home/jushi/TensorRT-7.2.1.6/include)
-link_directories(/home/jushi/TensorRT-7.2.1.6/lib)
+include_directories(/home/jushi/TensorRT-8.2.1.6/include)
+link_directories(/home/jushi/TensorRT-8.2.1.6/lib)

-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

 cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/BatchedNms.cu ${PROJECT_SOURCE_DIR}/PredictorDecode.cu ${PROJECT_SOURCE_DIR}/RoiAlign.cu ${PROJECT_SOURCE_DIR}/RpnDecode.cu ${PROJECT_SOURCE_DIR}/RpnNms.cu ${PROJECT_SOURCE_DIR}/MaskRcnnInference.cu)
 target_link_libraries(myplugins nvinfer cudart)
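The C++ standard bump from 11 to 14 is applied in all three places the standard is set (add_definitions, CMAKE_CXX_STANDARD, and CMAKE_CXX_FLAGS); the Thrust/CUB versions bundled with CUDA 11 deprecate C++11, so the bump is load-bearing rather than cosmetic. An illustrative guard (not part of the commit) that would catch a stale flag early:

// Fails the build if any translation unit is still compiled as pre-C++14.
static_assert(__cplusplus >= 201402L, "rcnn plugins expect C++14 or newer");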

rcnn/MaskRcnnInference.cu (mode 100644 → 100755; 2 additions, 1 deletion)

@@ -1,4 +1,5 @@
 #include "MaskRcnnInferencePlugin.h"
+#include "macros.h"

 namespace nvinfer1 {

@@ -31,7 +32,7 @@ __global__ void MaskRcnnInferenceKernel(
 }

 int maskRcnnInference(int batchSize,
-    const void *const *inputs, void **outputs,
+    const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
     int detections_per_im, int output_size, int num_classes, cudaStream_t stream) {

   for (int batch = 0; batch < batchSize; batch++) {
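The void *TRT_CONST_ENQUEUE*outputs rewrite recurs in every plugin because TensorRT 8 changed the enqueue() signature a plugin must match. For reference, paraphrased from NvInfer.h (treat the exact spelling as approximate):

// TensorRT 7, IPluginV2::enqueue:
//   virtual int enqueue(int batchSize, const void* const* inputs,
//                       void** outputs, void* workspace,
//                       cudaStream_t stream) = 0;
//
// TensorRT 8, IPluginV2::enqueue:
//   virtual int32_t enqueue(int32_t batchSize, void const* const* inputs,
//                           void* const* outputs, void* workspace,
//                           cudaStream_t stream) noexcept = 0;
//
// So "void *TRT_CONST_ENQUEUE*outputs" expands to "void** outputs" on
// TensorRT 7 and "void* const* outputs" on TensorRT 8, and TRT_NOEXCEPT
// supplies the trailing noexcept, letting one definition satisfy both.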
