
Commit 70bec40

Merge pull request wang-xinyu#53 from qiuyunzhe/anchor
Prepare anchor during initialization and add multi-gpu tutorial
2 parents bd1c1ca + 5d1dc21 commit 70bec40

3 files changed: +50 -6 lines changed


tutorials/multi_GPU_processing.md

Lines changed: 30 additions & 0 deletions
# How to Implement Multi-GPU Processing

Maybe you hope to take advantage of multiple GPUs to make inference even faster. Here are a few tips to help you do it, taking **YOLO V4** as an example.
## 1. Make custom plugins (i.e. the YOLO layer and Mish layer for YOLO V4) run asynchronously.

To do this, we need to pass the CUDA stream parameter into the kernels of all custom layers and use asynchronous functions.
For example, in the function `forwardGpu()` of **yololayer.cu**, you need to make the following changes so that the engine runs on a specific CUDA stream (a sketch of where `stream` comes from follows the list).

1) Change `cudaMemset(output + idx*outputElem, 0, sizeof(float))` to `cudaMemsetAsync(output + idx*outputElem, 0, sizeof(float), stream)`
2) Change `CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>>(inputs[i],output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount ,outputElem)` to `CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>(inputs[i],output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount ,outputElem)`
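
The `stream` used above is the one TensorRT hands to the plugin's `enqueue()` method. Here is a minimal sketch of that plumbing; the `enqueue()` signature is the standard `IPluginV2` one, and the `forwardGpu()` call mirrors this repo's helper:

```
// Sketch: forward the caller's CUDA stream through the plugin so every kernel
// launch and async copy runs on it, rather than on the default stream.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs,
                             void** outputs, void* workspace, cudaStream_t stream)
{
    forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, batchSize);
    return 0;
}
```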
## 2. Create an engine for each device you want to use.

It may be a good idea to create a struct that stores the engine, context, and buffers of each device individually. For example,
```
struct Plan{
    IRuntime* runtime;           // runtime used to deserialize this device's engine
    ICudaEngine* engine;         // engine optimized for this device
    IExecutionContext* context;  // execution context bound to the engine
    void* buffers[2];            // device-side input/output bindings
    cudaStream_t stream;         // stream this device's inference runs on
};
```
And then use `cudaSetDevice()` to make each engine you create run on a specific device. Moreover, to maximize performance, make sure that the engine file you deserialize on a device is the one TensorRT optimized for that device. A per-device setup sketch follows.
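
For illustration, here is a minimal setup sketch built on the `Plan` struct above. The helper name `initPlan`, the `engineData`/`engineSize` buffer (the serialized engine built for this device), the byte sizes, and the `gLogger` instance are assumptions for the example, not code from this commit:

```
// Sketch: build one Plan per GPU. Every CUDA/TensorRT call issued after
// cudaSetDevice() targets that GPU.
void initPlan(Plan& plan, int deviceId, const void* engineData, size_t engineSize,
              size_t inputBytes, size_t outputBytes)
{
    cudaSetDevice(deviceId);
    plan.runtime = createInferRuntime(gLogger);
    plan.engine  = plan.runtime->deserializeCudaEngine(engineData, engineSize);
    plan.context = plan.engine->createExecutionContext();
    cudaMalloc(&plan.buffers[0], inputBytes);   // input binding
    cudaMalloc(&plan.buffers[1], outputBytes);  // output binding
    cudaStreamCreate(&plan.stream);             // per-device stream
}
```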
## 3. Use functions wisely

Here are some things I learned when trying to parallelize the inference (see the sketch after this list).
1) Do not use synchronizing functions, like `cudaFree()`, during inference; they stall the streams on the device.
2) Use `cudaMallocHost()` instead of `malloc()` when allocating memory on the host side, so that host-device copies can run asynchronously.
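
To make tip 2 concrete, here is a minimal sketch of the pinned-memory pattern, reusing `plan` and `inputBytes` from the previous sketch. `cudaMemcpyAsync()` only overlaps with other work when the host buffer is page-locked, which is exactly what `cudaMallocHost()` provides:

```
// Sketch: pinned host memory makes host<->device copies truly asynchronous.
float* hostInput;
cudaMallocHost(&hostInput, inputBytes);   // page-locked allocation

// ... fill hostInput with preprocessed input data ...

cudaMemcpyAsync(plan.buffers[0], hostInput, inputBytes,
                cudaMemcpyHostToDevice, plan.stream);   // returns immediately
// enqueue inference and the device->host copy on plan.stream, then:
cudaStreamSynchronize(plan.stream);       // wait once, outside the hot path

cudaFreeHost(hostInput);                  // free after inference, not during it
```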

yolov4/yololayer.cu

Lines changed: 19 additions & 6 deletions

```
@@ -13,6 +13,15 @@ namespace nvinfer1
         mYoloKernel.push_back(yolo3);

         mKernelCount = mYoloKernel.size();
+
+        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
+        size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
+        for(int ii = 0; ii < mKernelCount; ii ++)
+        {
+            CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen));
+            const auto& yolo = mYoloKernel[ii];
+            CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
+        }
     }

     YoloLayerPlugin::~YoloLayerPlugin()
@@ -32,6 +41,15 @@ namespace nvinfer1
         memcpy(mYoloKernel.data(),d,kernelSize);
         d += kernelSize;

+        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
+        size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
+        for(int ii = 0; ii < mKernelCount; ii ++)
+        {
+            CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen));
+            const auto& yolo = mYoloKernel[ii];
+            CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
+        }
+
         assert(d == a + length);
     }

@@ -179,9 +197,6 @@ namespace nvinfer1
     }

     void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
-        void* devAnchor;
-        size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
-        CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen));

         int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);

@@ -195,12 +210,10 @@ namespace nvinfer1
             numElem = yolo.width*yolo.height*batchSize;
             if (numElem < mThreadCount)
                 mThreadCount = numElem;
-            CUDA_CHECK(cudaMemcpy(devAnchor, yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
             CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>>
-                (inputs[i],output, numElem, yolo.width, yolo.height, (float *)devAnchor, mClassCount ,outputElem);
+                (inputs[i],output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount ,outputElem);
         }

-        CUDA_CHECK(cudaFree(devAnchor));
     }
```
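
The net effect of this diff: the per-scale anchor tables are copied to the GPU once, at construction/deserialization time, instead of being allocated, copied, and freed on every forward pass (those synchronous `cudaMalloc`/`cudaMemcpy`/`cudaFree` calls also block streams, per tip 1 of the tutorial). Below is a standalone sketch of the same pattern; the names mirror the diff, but the helper functions and the teardown are illustrative assumptions, not code this commit adds:

```
// Sketch: pre-allocate per-scale anchor tables on the device once and reuse
// them in every forward pass. Teardown shown for completeness; the commit
// itself does not add it.
void** mAnchor;  // pinned host array of device pointers, one per YOLO scale

void setupAnchors(const std::vector<Yolo::YoloKernel>& kernels) {
    size_t anchorLen = sizeof(float) * CHECK_COUNT * 2;
    CUDA_CHECK(cudaMallocHost(&mAnchor, kernels.size() * sizeof(void*)));
    for (size_t i = 0; i < kernels.size(); ++i) {
        CUDA_CHECK(cudaMalloc(&mAnchor[i], anchorLen));             // device table
        CUDA_CHECK(cudaMemcpy(mAnchor[i], kernels[i].anchors,
                              anchorLen, cudaMemcpyHostToDevice));  // copy once
    }
}

void teardownAnchors(size_t kernelCount) {
    for (size_t i = 0; i < kernelCount; ++i)
        CUDA_CHECK(cudaFree(mAnchor[i]));
    CUDA_CHECK(cudaFreeHost(mAnchor));
}
```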

yolov4/yololayer.h

Lines changed: 1 addition & 0 deletions

```
@@ -116,6 +116,7 @@ namespace nvinfer1
         int mKernelCount;
         std::vector<Yolo::YoloKernel> mYoloKernel;
         int mThreadCount = 256;
+        void** mAnchor;
         const char* mPluginNamespace;
     };
```