#include <cmath>

#include "cuda_runtime_api.h"
#include "logging.h"
#include "common.hpp"
6+
67#define DEVICE 0
7- #define NET s // s m l x
8- #define NETSTRUCT (str ) createEngine_##str
9- #define CREATENET (net ) NETSTRUCT(net)
10- #define STR1 (x ) #x
11- #define STR2 (x ) STR1(x)
128// #define USE_FP16 // comment out this if want to use FP16
139#define CONF_THRESH 0.5
1410#define BATCH_SIZE 1
11+
12+ using namespace nvinfer1 ;
13+
1514// stuff we know about the network and the input/output blobs
1615static const int INPUT_H = 816 ;
1716static const int INPUT_W = 672 ;
1817static const int OUTPUT_SIZE = 672 *816 ;
19-
2018const char * INPUT_BLOB_NAME = " data" ;
2119const char * OUTPUT_BLOB_NAME = " prob" ;
22-
23- using namespace nvinfer1 ;
24-
2520static Logger gLogger ;
2621
27-
2822cv::Mat preprocess_img (cv::Mat& img) {
2923 int w, h, x, y;
3024 float r_w = INPUT_W / (img.cols *1.0 );
@@ -47,8 +41,6 @@ cv::Mat preprocess_img(cv::Mat& img) {
4741 return out;
4842}
4943
50-
51-
5244ILayer* doubleConv (INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, std::string lname, int midch){
5345 // Weights emptywts{DataType::kFLOAT, nullptr, 0};
5446 // int p = ksize / 2;
@@ -97,28 +89,26 @@ ILayer* up(INetworkDefinition *network, std::map<std::string, Weights>& weightMa
9789 // IPoolingLayer* pool1 = network->addPooling(dcov1, PoolingType::kMAX, DimsHW{2, 2});
9890 // pool1->setStrideNd(DimsHW{2, 2});
9991 // dcov1->add_pading
100- ILayer* pad1 = network->addPaddingNd (*deconv1->getOutput (0 ),DimsHW{diffx / 2 , diffy / 2 },DimsHW{diffx - (diffx / 2 ), diffy - (diffy / 2 )});
92+ ILayer* pad1 = network->addPaddingNd (*deconv1->getOutput (0 ), DimsHW{diffx / 2 , diffy / 2 }, DimsHW{diffx - (diffx / 2 ), diffy - (diffy / 2 )});
10193 // dcov1->setPaddingNd(DimsHW{diffx / 2, diffx - diffx / 2},DimsHW{diffy / 2, diffy - diffy / 2});
102- ITensor* inputTensors[] = {&input2,pad1->getOutput (0 )};
94+ ITensor* inputTensors[] = {&input2, pad1->getOutput (0 )};
10395 auto cat = network->addConcatenation (inputTensors, 2 );
10496 assert (cat);
105- if (midch== 64 ){
97+ if (midch == 64 ) {
10698 ILayer* dcov1 = doubleConv (network,weightMap,*cat->getOutput (0 ),outch,3 ,lname+" .conv" ,outch);
10799 assert (dcov1);
108100 return dcov1;
109- }else {
101+ } else {
110102 int midch1 = outch/2 ;
111103 ILayer* dcov1 = doubleConv (network,weightMap,*cat->getOutput (0 ),midch1,3 ,lname+" .conv" ,outch);
112104 assert (dcov1);
113105 return dcov1;
114106 }
115-
116107 // assert(dcov1);
117-
118108 // return dcov1;
119109}
120110
121- ILayer* outConv (INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, std::string lname){
111+ ILayer* outConv (INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, std::string lname) {
122112 // Weights emptywts{DataType::kFLOAT, nullptr, 0};
123113
124114 IConvolutionLayer* conv1 = network->addConvolutionNd (input, 1 , DimsHW{1 , 1 }, weightMap[lname + " .conv.weight" ], weightMap[lname + " .conv.bias" ]);
@@ -129,16 +119,14 @@ ILayer* outConv(INetworkDefinition *network, std::map<std::string, Weights>& wei
129119 return conv1;
130120}
131121
132-
133-
134122ICudaEngine* createEngine_l (unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
135123 INetworkDefinition* network = builder->createNetworkV2 (0U );
136124
137125 // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
138126 ITensor* data = network->addInput (INPUT_BLOB_NAME, dt, Dims3{ 3 , INPUT_H, INPUT_W });
139127 assert (data);
140128
141- std::map<std::string, Weights> weightMap = loadWeights (" /home/sycv/workplace/pengyuzhou/tensorrtx/ unet/unet_816_672 .wts" );
129+ std::map<std::string, Weights> weightMap = loadWeights (" ../ unet.wts" );
142130 Weights emptywts{DataType::kFLOAT , nullptr , 0 };
143131
144132 // build network
@@ -170,22 +158,19 @@ ICudaEngine* createEngine_l(unsigned int maxBatchSize, IBuilder* builder, IBuild
170158 network->destroy ();
171159
172160 // Release host memory
173- for (auto & mem : weightMap)
174- {
161+ for (auto & mem : weightMap) {
175162 free ((void *)(mem.second .values ));
176163 }
177164
178165 return engine;
179166}
180167
181-
182168void APIToModel (unsigned int maxBatchSize, IHostMemory** modelStream) {
183169 // Create builder
184170 IBuilder* builder = createInferBuilder (gLogger );
185171 IBuilderConfig* config = builder->createBuilderConfig ();
186172
187173 // Create model to populate the network, then set the outputs and create an engine
188- // ICudaEngine* engine = (CREATENET(NET))(maxBatchSize, builder, config, DataType::kFLOAT);
189174 ICudaEngine* engine = createEngine_l (maxBatchSize, builder, config, DataType::kFLOAT );
190175 assert (engine != nullptr );
191176
@@ -222,7 +207,7 @@ void doInference(IExecutionContext& context, float* input, float* output, int ba
222207 CHECK (cudaMemcpyAsync (buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof (float ), cudaMemcpyHostToDevice, stream));
223208 context.enqueue (batchSize, buffers, stream, nullptr );
224209 CHECK (cudaMemcpyAsync (output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof (float ), cudaMemcpyDeviceToHost, stream));
// Stream synchronization: coordinate host and device via cudaStreamSynchronize().
210+
226211 cudaStreamSynchronize (stream);
227212
228213 // Release stream and buffers
@@ -231,20 +216,19 @@ void doInference(IExecutionContext& context, float* input, float* output, int ba
231216 CHECK (cudaFree (buffers[outputIndex]));
232217}
233218
234- struct Detection {
219+ struct Detection {
235220 float mask[INPUT_W*INPUT_H*1 ];
236- };
221+ };
237222
// Logistic (sigmoid) function: maps any real x into the open interval (0, 1).
// Used to turn the network's raw mask logits into per-pixel probabilities.
float sigmoid(float x) {
    // Use expf and float literals: the original `exp(-x)` with integer/double
    // literals silently promoted the whole expression to double precision.
    return 1.0f / (1.0f + expf(-x));
}
242226
243227void process_cls_result (Detection &res, float *output) {
244- for (int i= 0 ;i< INPUT_W* INPUT_H* 1 ; i++){
228+ for (int i = 0 ; i < INPUT_W * INPUT_H * 1 ; i++) {
245229 res.mask [i] = sigmoid (*(output+i));
246- }
247230 }
231+ }
248232
249233int main (int argc, char ** argv) {
250234 cudaSetDevice (DEVICE);
@@ -329,8 +313,6 @@ int main(int argc, char** argv) {
329313 auto end = std::chrono::system_clock::now ();
330314 std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count () << " ms" << std::endl;
331315
332-
333-
334316 std::vector<Detection> batch_res (fcount);
335317 for (int b = 0 ; b < fcount; b++) {
336318 auto & res = batch_res[b];
0 commit comments