Skip to content

Commit 8461cb3

Browse files
author
Vladislav Vinogradov
committed
refactored gpu::convolve function:
* converted it to Algorithm
* old API can still be used for source compatibility (marked as deprecated)
1 parent 26a4be8 commit 8461cb3

File tree

5 files changed

+162
-128
lines changed

5 files changed

+162
-128
lines changed

modules/gpuarithm/include/opencv2/gpuarithm.hpp

+24-8
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,23 @@ CV_EXPORTS void mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArr
374374
//! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.
375375
CV_EXPORTS void dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
376376

377-
struct CV_EXPORTS ConvolveBuf
377+
//! computes convolution (or cross-correlation) of two images using discrete Fourier transform
378+
//! supports source images of 32FC1 type only
379+
//! result matrix will have 32FC1 type
380+
class CV_EXPORTS Convolution : public Algorithm
{
public:
    //! Convolves image with templ and stores the outcome in result.
    //! When ccorr is true, cross-correlation is computed instead of convolution.
    //! The work is issued on the given stream (the default is Stream::Null()).
    virtual void convolve(InputArray image,
                          InputArray templ,
                          OutputArray result,
                          bool ccorr = false,
                          Stream& stream = Stream::Null()) = 0;
};
385+
CV_EXPORTS Ptr<Convolution> createConvolution(Size user_block_size = Size());
386+
387+
__OPENCV_GPUARITHM_DEPR_BEFORE__ void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__;
388+
inline void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr , Stream& stream)
389+
{
390+
createConvolution()->convolve(image, templ, result, ccorr, stream);
391+
}
392+
393+
struct ConvolveBuf
378394
{
379395
Size result_size;
380396
Size block_size;
@@ -385,15 +401,15 @@ struct CV_EXPORTS ConvolveBuf
385401
GpuMat image_spect, templ_spect, result_spect;
386402
GpuMat image_block, templ_block, result_data;
387403

388-
void create(Size image_size, Size templ_size);
389-
static Size estimateBlockSize(Size result_size, Size templ_size);
404+
void create(Size, Size){}
405+
static Size estimateBlockSize(Size, Size){ return Size(); }
390406
};
391407

392-
//! computes convolution (or cross-correlation) of two images using discrete Fourier transform
393-
//! supports source images of 32FC1 type only
394-
//! result matrix will have 32FC1 type
395-
CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false);
396-
CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null());
408+
__OPENCV_GPUARITHM_DEPR_BEFORE__ void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__;
409+
inline void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr, ConvolveBuf& buf, Stream& stream)
410+
{
411+
createConvolution(buf.user_block_size)->convolve(image, templ, result, ccorr, stream);
412+
}
397413

398414
}} // namespace cv { namespace gpu {
399415

modules/gpuarithm/perf/perf_arithm.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -228,10 +228,11 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
228228
cv::gpu::GpuMat d_templ = cv::gpu::createContinuous(templ_size, templ_size, CV_32FC1);
229229
d_templ.upload(templ);
230230

231+
cv::Ptr<cv::gpu::Convolution> convolution = cv::gpu::createConvolution();
232+
231233
cv::gpu::GpuMat dst;
232-
cv::gpu::ConvolveBuf d_buf;
233234

234-
TEST_CYCLE() cv::gpu::convolve(d_image, d_templ, dst, ccorr, d_buf);
235+
TEST_CYCLE() convolution->convolve(d_image, d_templ, dst, ccorr);
235236

236237
GPU_SANITY_CHECK(dst);
237238
}

modules/gpuarithm/src/arithm.cpp

+127-113
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,7 @@ void cv::gpu::mulAndScaleSpectrums(InputArray, InputArray, OutputArray, int, flo
5454

5555
void cv::gpu::dft(InputArray, OutputArray, Size, int, Stream&) { throw_no_cuda(); }
5656

57-
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_no_cuda(); }
58-
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_no_cuda(); }
59-
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_no_cuda(); }
57+
// Stub for builds without CUDA: calls throw_no_cuda(); the return statement
// only matters if that call comes back.
Ptr<Convolution> cv::gpu::createConvolution(Size)
{
    throw_no_cuda();
    return Ptr<Convolution>();
}
6058

6159
#else /* !defined (HAVE_CUDA) */
6260

@@ -486,136 +484,152 @@ void cv::gpu::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, S
486484
}
487485

488486
//////////////////////////////////////////////////////////////////////////////
489-
// convolve
487+
// Convolution
490488

491-
void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size)
489+
#ifdef HAVE_CUFFT
490+
491+
namespace
492492
{
493-
result_size = Size(image_size.width - templ_size.width + 1,
494-
image_size.height - templ_size.height + 1);
495-
496-
block_size = user_block_size;
497-
if (user_block_size.width == 0 || user_block_size.height == 0)
498-
block_size = estimateBlockSize(result_size, templ_size);
499-
500-
dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.)));
501-
dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.)));
502-
503-
// CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
504-
// see CUDA Toolkit 4.1 CUFFT Library Programming Guide
505-
if (dft_size.width > 8192)
506-
dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1);
507-
if (dft_size.height > 8192)
508-
dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);
509-
510-
// To avoid wasting time doing small DFTs
511-
dft_size.width = std::max(dft_size.width, 512);
512-
dft_size.height = std::max(dft_size.height, 512);
513-
514-
createContinuous(dft_size, CV_32F, image_block);
515-
createContinuous(dft_size, CV_32F, templ_block);
516-
createContinuous(dft_size, CV_32F, result_data);
517-
518-
spect_len = dft_size.height * (dft_size.width / 2 + 1);
519-
createContinuous(1, spect_len, CV_32FC2, image_spect);
520-
createContinuous(1, spect_len, CV_32FC2, templ_spect);
521-
createContinuous(1, spect_len, CV_32FC2, result_spect);
522-
523-
// Use maximum result matrix block size for the estimated DFT block size
524-
block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
525-
block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);
526-
}
493+
class ConvolutionImpl : public Convolution
494+
{
495+
public:
496+
explicit ConvolutionImpl(Size user_block_size_) : user_block_size(user_block_size_) {}
527497

498+
void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null());
528499

529-
Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/)
530-
{
531-
int width = (result_size.width + 2) / 3;
532-
int height = (result_size.height + 2) / 3;
533-
width = std::min(width, result_size.width);
534-
height = std::min(height, result_size.height);
535-
return Size(width, height);
536-
}
500+
private:
501+
void create(Size image_size, Size templ_size);
502+
static Size estimateBlockSize(Size result_size);
537503

504+
Size result_size;
505+
Size block_size;
506+
Size user_block_size;
507+
Size dft_size;
508+
int spect_len;
538509

539-
void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr)
540-
{
541-
ConvolveBuf buf;
542-
gpu::convolve(image, templ, result, ccorr, buf);
543-
}
510+
GpuMat image_spect, templ_spect, result_spect;
511+
GpuMat image_block, templ_block, result_data;
512+
};
544513

545-
void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
546-
{
547-
#ifndef HAVE_CUFFT
548-
(void) image;
549-
(void) templ;
550-
(void) result;
551-
(void) ccorr;
552-
(void) buf;
553-
(void) stream;
554-
throw_no_cuda();
555-
#else
556-
CV_Assert(image.type() == CV_32F);
557-
CV_Assert(templ.type() == CV_32F);
514+
// Derives result_size, block_size, dft_size and spect_len from the image and
// template sizes, and (re)allocates the six work buffers accordingly.
void ConvolutionImpl::create(Size image_size, Size templ_size)
{
    result_size = Size(image_size.width - templ_size.width + 1,
                       image_size.height - templ_size.height + 1);

    // A zero in either dimension of the user block size means "pick one for me".
    const bool auto_block = (user_block_size.width == 0 || user_block_size.height == 0);
    block_size = auto_block ? estimateBlockSize(result_size) : user_block_size;

    // Extent one block needs once the template overlap is added.
    const int padded_w = block_size.width + templ_size.width - 1;
    const int padded_h = block_size.height + templ_size.height - 1;

    // Start from the next power of two that covers the padded extent.
    const double ln2 = std::log(2.);
    dft_size.width = 1 << int(std::ceil(std::log(static_cast<double>(padded_w)) / ln2));
    dft_size.height = 1 << int(std::ceil(std::log(static_cast<double>(padded_h)) / ln2));

    // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
    // see CUDA Toolkit 4.1 CUFFT Library Programming Guide.
    // Beyond that limit fall back to getOptimalDFTSize().
    if (dft_size.width > 8192)
        dft_size.width = getOptimalDFTSize(padded_w);
    if (dft_size.height > 8192)
        dft_size.height = getOptimalDFTSize(padded_h);

    // To avoid wasting time doing small DFTs
    dft_size.width = std::max(dft_size.width, 512);
    dft_size.height = std::max(dft_size.height, 512);

    // Real-valued work buffers, each the size of one DFT.
    createContinuous(dft_size, CV_32F, image_block);
    createContinuous(dft_size, CV_32F, templ_block);
    createContinuous(dft_size, CV_32F, result_data);

    // Spectrum buffers: one row of spect_len two-channel float elements each.
    spect_len = dft_size.height * (dft_size.width / 2 + 1);
    createContinuous(1, spect_len, CV_32FC2, image_spect);
    createContinuous(1, spect_len, CV_32FC2, templ_spect);
    createContinuous(1, spect_len, CV_32FC2, result_spect);

    // Use maximum result matrix block size for the estimated DFT block size
    block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
    block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);
}
550+
551+
// Suggests a block size of about one third of the result in each dimension
// ((n + 2) / 3), capped at the result size itself.
Size ConvolutionImpl::estimateBlockSize(Size result_size)
{
    const int third_w = (result_size.width + 2) / 3;
    const int third_h = (result_size.height + 2) / 3;

    return Size(std::min(third_w, result_size.width),
                std::min(third_h, result_size.height));
}
558559

559-
buf.create(image.size(), templ.size());
560-
result.create(buf.result_size, CV_32F);
560+
void ConvolutionImpl::convolve(InputArray _image, InputArray _templ, OutputArray _result, bool ccorr, Stream& _stream)
561+
{
562+
GpuMat image = _image.getGpuMat();
563+
GpuMat templ = _templ.getGpuMat();
561564

562-
Size& block_size = buf.block_size;
563-
Size& dft_size = buf.dft_size;
565+
CV_Assert( image.type() == CV_32FC1 );
566+
CV_Assert( templ.type() == CV_32FC1 );
564567

565-
GpuMat& image_block = buf.image_block;
566-
GpuMat& templ_block = buf.templ_block;
567-
GpuMat& result_data = buf.result_data;
568+
create(image.size(), templ.size());
568569

569-
GpuMat& image_spect = buf.image_spect;
570-
GpuMat& templ_spect = buf.templ_spect;
571-
GpuMat& result_spect = buf.result_spect;
570+
_result.create(result_size, CV_32FC1);
571+
GpuMat result = _result.getGpuMat();
572572

573-
cufftHandle planR2C, planC2R;
574-
cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
575-
cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
573+
cudaStream_t stream = StreamAccessor::getStream(_stream);
576574

577-
cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
578-
cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
575+
cufftHandle planR2C, planC2R;
576+
cufftSafeCall( cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R) );
577+
cufftSafeCall( cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C) );
579578

580-
GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
581-
gpu::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
582-
templ_block.cols - templ_roi.cols, 0, Scalar(), stream);
579+
cufftSafeCall( cufftSetStream(planR2C, stream) );
580+
cufftSafeCall( cufftSetStream(planC2R, stream) );
583581

584-
cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(),
585-
templ_spect.ptr<cufftComplex>()));
582+
GpuMat templ_roi(templ.size(), CV_32FC1, templ.data, templ.step);
583+
gpu::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
584+
templ_block.cols - templ_roi.cols, 0, Scalar(), _stream);
586585

587-
// Process all blocks of the result matrix
588-
for (int y = 0; y < result.rows; y += block_size.height)
589-
{
590-
for (int x = 0; x < result.cols; x += block_size.width)
586+
cufftSafeCall( cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(), templ_spect.ptr<cufftComplex>()) );
587+
588+
// Process all blocks of the result matrix
589+
for (int y = 0; y < result.rows; y += block_size.height)
591590
{
592-
Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
593-
std::min(y + dft_size.height, image.rows) - y);
594-
GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
595-
image.step);
596-
gpu::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
597-
0, image_block.cols - image_roi.cols, 0, Scalar(), stream);
598-
599-
cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
600-
image_spect.ptr<cufftComplex>()));
601-
gpu::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
602-
1.f / dft_size.area(), ccorr, stream);
603-
cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
604-
result_data.ptr<cufftReal>()));
605-
606-
Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
607-
std::min(y + block_size.height, result.rows) - y);
608-
GpuMat result_roi(result_roi_size, result.type(),
609-
(void*)(result.ptr<float>(y) + x), result.step);
610-
GpuMat result_block(result_roi_size, result_data.type(),
611-
result_data.ptr(), result_data.step);
612-
613-
result_block.copyTo(result_roi, stream);
591+
for (int x = 0; x < result.cols; x += block_size.width)
592+
{
593+
Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
594+
std::min(y + dft_size.height, image.rows) - y);
595+
GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
596+
image.step);
597+
gpu::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
598+
0, image_block.cols - image_roi.cols, 0, Scalar(), _stream);
599+
600+
cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
601+
image_spect.ptr<cufftComplex>()));
602+
gpu::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
603+
1.f / dft_size.area(), ccorr, _stream);
604+
cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
605+
result_data.ptr<cufftReal>()));
606+
607+
Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
608+
std::min(y + block_size.height, result.rows) - y);
609+
GpuMat result_roi(result_roi_size, result.type(),
610+
(void*)(result.ptr<float>(y) + x), result.step);
611+
GpuMat result_block(result_roi_size, result_data.type(),
612+
result_data.ptr(), result_data.step);
613+
614+
result_block.copyTo(result_roi, _stream);
615+
}
614616
}
617+
618+
cufftSafeCall( cufftDestroy(planR2C) );
619+
cufftSafeCall( cufftDestroy(planC2R) );
615620
}
621+
}
622+
623+
#endif
616624

617-
cufftSafeCall(cufftDestroy(planR2C));
618-
cufftSafeCall(cufftDestroy(planC2R));
625+
// Factory for the DFT-based convolution algorithm.
// user_block_size is handed to ConvolutionImpl unchanged.
// ConvolutionImpl is only compiled under HAVE_CUFFT, so this function must be
// guarded by the same macro: without CUFFT it raises StsNotImplemented and the
// trailing return is an empty Ptr<Convolution>.
Ptr<Convolution> cv::gpu::createConvolution(Size user_block_size)
{
#ifndef HAVE_CUFFT
    (void) user_block_size;
    CV_Error(cv::Error::StsNotImplemented, "The library was build without CUFFT");
    return Ptr<Convolution>();
#else
    return new ConvolutionImpl(user_block_size);
#endif
}
621635

modules/gpuarithm/test/test_arithm.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -419,8 +419,10 @@ GPU_TEST_P(Convolve, Accuracy)
419419
cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0);
420420
cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0);
421421

422+
cv::Ptr<cv::gpu::Convolution> conv = cv::gpu::createConvolution();
423+
422424
cv::gpu::GpuMat dst;
423-
cv::gpu::convolve(loadMat(src), loadMat(kernel), dst, ccorr);
425+
conv->convolve(loadMat(src), loadMat(kernel), dst, ccorr);
424426

425427
cv::Mat dst_gold;
426428
convolveDFT(src, kernel, dst_gold, ccorr);

modules/gpuimgproc/src/match_template.cpp

+5-4
Original file line numberDiff line numberDiff line change
@@ -172,15 +172,16 @@ namespace
172172
return;
173173
}
174174

175-
gpu::ConvolveBuf convolve_buf;
176-
convolve_buf.user_block_size = buf.user_block_size;
175+
Ptr<gpu::Convolution> conv = gpu::createConvolution(buf.user_block_size);
177176

178177
if (image.channels() == 1)
179-
gpu::convolve(image.reshape(1), templ.reshape(1), result, true, convolve_buf, stream);
178+
{
179+
conv->convolve(image.reshape(1), templ.reshape(1), result, true, stream);
180+
}
180181
else
181182
{
182183
GpuMat result_;
183-
gpu::convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf, stream);
184+
conv->convolve(image.reshape(1), templ.reshape(1), result_, true, stream);
184185
extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
185186
}
186187
}

0 commit comments

Comments
 (0)