@@ -54,9 +54,7 @@ void cv::gpu::mulAndScaleSpectrums(InputArray, InputArray, OutputArray, int, flo
54
54
55
55
// Stub variant of dft(): every argument is ignored and the call is handed
// straight to throw_no_cuda() (presumably the build-without-CUDA branch,
// judging by the "#else /* !defined (HAVE_CUDA) */" that follows these stubs).
void cv::gpu::dft(InputArray, OutputArray, Size, int, Stream&)
{
    throw_no_cuda();
}
56
56
57
- void cv::gpu::ConvolveBuf::create (Size , Size ) { throw_no_cuda (); }
58
- void cv::gpu::convolve (const GpuMat&, const GpuMat&, GpuMat&, bool ) { throw_no_cuda (); }
59
- void cv::gpu::convolve (const GpuMat&, const GpuMat&, GpuMat&, bool , ConvolveBuf&, Stream&) { throw_no_cuda (); }
57
+ Ptr <Convolution> cv::gpu::createConvolution (Size ) { throw_no_cuda (); return Ptr <Convolution>(); }
60
58
61
59
#else /* !defined (HAVE_CUDA) */
62
60
@@ -486,136 +484,152 @@ void cv::gpu::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, S
486
484
}
487
485
488
486
// ////////////////////////////////////////////////////////////////////////////
489
- // convolve
487
+ // Convolution
490
488
491
- void cv::gpu::ConvolveBuf::create (Size image_size, Size templ_size)
489
+ #ifdef HAVE_CUFFT
490
+
491
+ namespace
492
492
{
493
- result_size = Size (image_size.width - templ_size.width + 1 ,
494
- image_size.height - templ_size.height + 1 );
495
-
496
- block_size = user_block_size;
497
- if (user_block_size.width == 0 || user_block_size.height == 0 )
498
- block_size = estimateBlockSize (result_size, templ_size);
499
-
500
- dft_size.width = 1 << int (ceil (std::log (block_size.width + templ_size.width - 1 .) / std::log (2 .)));
501
- dft_size.height = 1 << int (ceil (std::log (block_size.height + templ_size.height - 1 .) / std::log (2 .)));
502
-
503
- // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
504
- // see CUDA Toolkit 4.1 CUFFT Library Programming Guide
505
- if (dft_size.width > 8192 )
506
- dft_size.width = getOptimalDFTSize (block_size.width + templ_size.width - 1 );
507
- if (dft_size.height > 8192 )
508
- dft_size.height = getOptimalDFTSize (block_size.height + templ_size.height - 1 );
509
-
510
- // To avoid wasting time doing small DFTs
511
- dft_size.width = std::max (dft_size.width , 512 );
512
- dft_size.height = std::max (dft_size.height , 512 );
513
-
514
- createContinuous (dft_size, CV_32F, image_block);
515
- createContinuous (dft_size, CV_32F, templ_block);
516
- createContinuous (dft_size, CV_32F, result_data);
517
-
518
- spect_len = dft_size.height * (dft_size.width / 2 + 1 );
519
- createContinuous (1 , spect_len, CV_32FC2, image_spect);
520
- createContinuous (1 , spect_len, CV_32FC2, templ_spect);
521
- createContinuous (1 , spect_len, CV_32FC2, result_spect);
522
-
523
- // Use maximum result matrix block size for the estimated DFT block size
524
- block_size.width = std::min (dft_size.width - templ_size.width + 1 , result_size.width );
525
- block_size.height = std::min (dft_size.height - templ_size.height + 1 , result_size.height );
526
- }
493
+ class ConvolutionImpl : public Convolution
494
+ {
495
+ public:
496
+ explicit ConvolutionImpl (Size user_block_size_) : user_block_size(user_block_size_) {}
527
497
498
+ void convolve (InputArray image, InputArray templ, OutputArray result, bool ccorr = false , Stream& stream = Stream::Null());
528
499
529
- Size cv::gpu::ConvolveBuf::estimateBlockSize (Size result_size, Size /* templ_size*/ )
530
- {
531
- int width = (result_size.width + 2 ) / 3 ;
532
- int height = (result_size.height + 2 ) / 3 ;
533
- width = std::min (width, result_size.width );
534
- height = std::min (height, result_size.height );
535
- return Size (width, height);
536
- }
500
+ private:
501
+ void create (Size image_size, Size templ_size);
502
+ static Size estimateBlockSize (Size result_size);
537
503
504
+ Size result_size;
505
+ Size block_size;
506
+ Size user_block_size;
507
+ Size dft_size;
508
+ int spect_len;
538
509
539
- void cv::gpu::convolve (const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr)
540
- {
541
- ConvolveBuf buf;
542
- gpu::convolve (image, templ, result, ccorr, buf);
543
- }
510
+ GpuMat image_spect, templ_spect, result_spect;
511
+ GpuMat image_block, templ_block, result_data;
512
+ };
544
513
545
- void cv::gpu::convolve (const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
546
- {
547
- #ifndef HAVE_CUFFT
548
- (void ) image;
549
- (void ) templ;
550
- (void ) result;
551
- (void ) ccorr;
552
- (void ) buf;
553
- (void ) stream;
554
- throw_no_cuda ();
555
- #else
556
- CV_Assert (image.type () == CV_32F);
557
- CV_Assert (templ.type () == CV_32F);
514
+ void ConvolutionImpl::create (Size image_size, Size templ_size)
515
+ {
516
+ result_size = Size (image_size.width - templ_size.width + 1 ,
517
+ image_size.height - templ_size.height + 1 );
518
+
519
+ block_size = user_block_size;
520
+ if (user_block_size.width == 0 || user_block_size.height == 0 )
521
+ block_size = estimateBlockSize (result_size);
522
+
523
+ dft_size.width = 1 << int (ceil (std::log (block_size.width + templ_size.width - 1 .) / std::log (2 .)));
524
+ dft_size.height = 1 << int (ceil (std::log (block_size.height + templ_size.height - 1 .) / std::log (2 .)));
525
+
526
+ // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
527
+ // see CUDA Toolkit 4.1 CUFFT Library Programming Guide
528
+ if (dft_size.width > 8192 )
529
+ dft_size.width = getOptimalDFTSize (block_size.width + templ_size.width - 1 );
530
+ if (dft_size.height > 8192 )
531
+ dft_size.height = getOptimalDFTSize (block_size.height + templ_size.height - 1 );
532
+
533
+ // To avoid wasting time doing small DFTs
534
+ dft_size.width = std::max (dft_size.width , 512 );
535
+ dft_size.height = std::max (dft_size.height , 512 );
536
+
537
+ createContinuous (dft_size, CV_32F, image_block);
538
+ createContinuous (dft_size, CV_32F, templ_block);
539
+ createContinuous (dft_size, CV_32F, result_data);
540
+
541
+ spect_len = dft_size.height * (dft_size.width / 2 + 1 );
542
+ createContinuous (1 , spect_len, CV_32FC2, image_spect);
543
+ createContinuous (1 , spect_len, CV_32FC2, templ_spect);
544
+ createContinuous (1 , spect_len, CV_32FC2, result_spect);
545
+
546
+ // Use maximum result matrix block size for the estimated DFT block size
547
+ block_size.width = std::min (dft_size.width - templ_size.width + 1 , result_size.width );
548
+ block_size.height = std::min (dft_size.height - templ_size.height + 1 , result_size.height );
549
+ }
550
+
551
+ Size ConvolutionImpl::estimateBlockSize (Size result_size)
552
+ {
553
+ int width = (result_size.width + 2 ) / 3 ;
554
+ int height = (result_size.height + 2 ) / 3 ;
555
+ width = std::min (width, result_size.width );
556
+ height = std::min (height, result_size.height );
557
+ return Size (width, height);
558
+ }
558
559
559
- buf.create (image.size (), templ.size ());
560
- result.create (buf.result_size , CV_32F);
560
+ void ConvolutionImpl::convolve (InputArray _image, InputArray _templ, OutputArray _result, bool ccorr, Stream& _stream)
561
+ {
562
+ GpuMat image = _image.getGpuMat ();
563
+ GpuMat templ = _templ.getGpuMat ();
561
564
562
- Size & block_size = buf. block_size ;
563
- Size & dft_size = buf. dft_size ;
565
+ CV_Assert ( image. type () == CV_32FC1 ) ;
566
+ CV_Assert ( templ. type () == CV_32FC1 ) ;
564
567
565
- GpuMat& image_block = buf.image_block ;
566
- GpuMat& templ_block = buf.templ_block ;
567
- GpuMat& result_data = buf.result_data ;
568
+ create (image.size (), templ.size ());
568
569
569
- GpuMat& image_spect = buf.image_spect ;
570
- GpuMat& templ_spect = buf.templ_spect ;
571
- GpuMat& result_spect = buf.result_spect ;
570
+ _result.create (result_size, CV_32FC1);
571
+ GpuMat result = _result.getGpuMat ();
572
572
573
- cufftHandle planR2C, planC2R;
574
- cufftSafeCall (cufftPlan2d (&planC2R, dft_size.height , dft_size.width , CUFFT_C2R));
575
- cufftSafeCall (cufftPlan2d (&planR2C, dft_size.height , dft_size.width , CUFFT_R2C));
573
+ cudaStream_t stream = StreamAccessor::getStream (_stream);
576
574
577
- cufftSafeCall ( cufftSetStream (planR2C, StreamAccessor::getStream (stream)) );
578
- cufftSafeCall ( cufftSetStream (planC2R, StreamAccessor::getStream (stream)) );
575
+ cufftHandle planR2C, planC2R;
576
+ cufftSafeCall ( cufftPlan2d (&planC2R, dft_size.height , dft_size.width , CUFFT_C2R) );
577
+ cufftSafeCall ( cufftPlan2d (&planR2C, dft_size.height , dft_size.width , CUFFT_R2C) );
579
578
580
- GpuMat templ_roi (templ.size (), CV_32F, templ.data , templ.step );
581
- gpu::copyMakeBorder (templ_roi, templ_block, 0 , templ_block.rows - templ_roi.rows , 0 ,
582
- templ_block.cols - templ_roi.cols , 0 , Scalar (), stream);
579
+ cufftSafeCall ( cufftSetStream (planR2C, stream) );
580
+ cufftSafeCall ( cufftSetStream (planC2R, stream) );
583
581
584
- cufftSafeCall (cufftExecR2C (planR2C, templ_block.ptr <cufftReal>(),
585
- templ_spect.ptr <cufftComplex>()));
582
+ GpuMat templ_roi (templ.size (), CV_32FC1, templ.data , templ.step );
583
+ gpu::copyMakeBorder (templ_roi, templ_block, 0 , templ_block.rows - templ_roi.rows , 0 ,
584
+ templ_block.cols - templ_roi.cols , 0 , Scalar (), _stream);
586
585
587
- // Process all blocks of the result matrix
588
- for ( int y = 0 ; y < result. rows ; y += block_size. height )
589
- {
590
- for (int x = 0 ; x < result.cols ; x += block_size.width )
586
+ cufftSafeCall ( cufftExecR2C (planR2C, templ_block. ptr <cufftReal>(), templ_spect. ptr <cufftComplex>()) );
587
+
588
+ // Process all blocks of the result matrix
589
+ for (int y = 0 ; y < result.rows ; y += block_size.height )
591
590
{
592
- Size image_roi_size (std::min (x + dft_size.width , image.cols ) - x,
593
- std::min (y + dft_size.height , image.rows ) - y);
594
- GpuMat image_roi (image_roi_size, CV_32F, (void *)(image.ptr <float >(y) + x),
595
- image.step );
596
- gpu::copyMakeBorder (image_roi, image_block, 0 , image_block.rows - image_roi.rows ,
597
- 0 , image_block.cols - image_roi.cols , 0 , Scalar (), stream);
598
-
599
- cufftSafeCall (cufftExecR2C (planR2C, image_block.ptr <cufftReal>(),
600
- image_spect.ptr <cufftComplex>()));
601
- gpu::mulAndScaleSpectrums (image_spect, templ_spect, result_spect, 0 ,
602
- 1 .f / dft_size.area (), ccorr, stream);
603
- cufftSafeCall (cufftExecC2R (planC2R, result_spect.ptr <cufftComplex>(),
604
- result_data.ptr <cufftReal>()));
605
-
606
- Size result_roi_size (std::min (x + block_size.width , result.cols ) - x,
607
- std::min (y + block_size.height , result.rows ) - y);
608
- GpuMat result_roi (result_roi_size, result.type (),
609
- (void *)(result.ptr <float >(y) + x), result.step );
610
- GpuMat result_block (result_roi_size, result_data.type (),
611
- result_data.ptr (), result_data.step );
612
-
613
- result_block.copyTo (result_roi, stream);
591
+ for (int x = 0 ; x < result.cols ; x += block_size.width )
592
+ {
593
+ Size image_roi_size (std::min (x + dft_size.width , image.cols ) - x,
594
+ std::min (y + dft_size.height , image.rows ) - y);
595
+ GpuMat image_roi (image_roi_size, CV_32F, (void *)(image.ptr <float >(y) + x),
596
+ image.step );
597
+ gpu::copyMakeBorder (image_roi, image_block, 0 , image_block.rows - image_roi.rows ,
598
+ 0 , image_block.cols - image_roi.cols , 0 , Scalar (), _stream);
599
+
600
+ cufftSafeCall (cufftExecR2C (planR2C, image_block.ptr <cufftReal>(),
601
+ image_spect.ptr <cufftComplex>()));
602
+ gpu::mulAndScaleSpectrums (image_spect, templ_spect, result_spect, 0 ,
603
+ 1 .f / dft_size.area (), ccorr, _stream);
604
+ cufftSafeCall (cufftExecC2R (planC2R, result_spect.ptr <cufftComplex>(),
605
+ result_data.ptr <cufftReal>()));
606
+
607
+ Size result_roi_size (std::min (x + block_size.width , result.cols ) - x,
608
+ std::min (y + block_size.height , result.rows ) - y);
609
+ GpuMat result_roi (result_roi_size, result.type (),
610
+ (void *)(result.ptr <float >(y) + x), result.step );
611
+ GpuMat result_block (result_roi_size, result_data.type (),
612
+ result_data.ptr (), result_data.step );
613
+
614
+ result_block.copyTo (result_roi, _stream);
615
+ }
614
616
}
617
+
618
+ cufftSafeCall ( cufftDestroy (planR2C) );
619
+ cufftSafeCall ( cufftDestroy (planC2R) );
615
620
}
621
+ }
622
+
623
+ #endif
616
624
617
- cufftSafeCall (cufftDestroy (planR2C));
618
- cufftSafeCall (cufftDestroy (planC2R));
625
+ Ptr <Convolution> cv::gpu::createConvolution (Size user_block_size)
626
+ {
627
+ #ifndef HAVE_CUBLAS
628
+ (void ) user_block_size;
629
+ CV_Error (cv::Error::StsNotImplemented, " The library was build without CUFFT" );
630
+ return Ptr <BLAS>();
631
+ #else
632
+ return new ConvolutionImpl (user_block_size);
619
633
#endif
620
634
}
621
635
0 commit comments