Skip to content

Commit 2fe6dba

Browse files
authored
Merge pull request wang-xinyu#1117 from LiberiFatali/master
Attach the decode kernel to the same CUDA stream
2 parents 269501d + 4f61d7c commit 2fe6dba

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

retinaface/decode.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -176,14 +176,14 @@ namespace nvinfer1
176176
totalCount += decodeplugin::INPUT_H / 16 * decodeplugin::INPUT_W / 16 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
177177
totalCount += decodeplugin::INPUT_H / 32 * decodeplugin::INPUT_W / 32 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
178178
for(int idx = 0 ; idx < batchSize; ++idx) {
179-
cudaMemset(output + idx * totalCount, 0, sizeof(float));
179+
cudaMemsetAsync(output + idx * totalCount, 0, sizeof(float), stream);
180180
}
181181

182182
for (unsigned int i = 0; i < 3; ++i)
183183
{
184184
num_elem = batchSize * decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step;
185185
thread_count = (num_elem < thread_count_) ? num_elem : thread_count_;
186-
CalDetection<<< (num_elem + thread_count - 1) / thread_count, thread_count>>>
186+
CalDetection<<< (num_elem + thread_count - 1) / thread_count, thread_count, 0, stream>>>
187187
(inputs[i], output, num_elem, base_step, base_anchor, totalCount);
188188
base_step *= 2;
189189
base_anchor *= 4;

0 commit comments

Comments (0)