Commit 539f367

Author: Vladislav Vinogradov
refactored gpu::LUT function:

* converted it to an Algorithm, because the implementation uses internal buffers and requires a preprocessing step
* the new implementation splits preprocessing and transform, which is more efficient
* the old API can still be used for source compatibility (marked as deprecated)
1 parent 0c50d08 commit 539f367
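A minimal usage sketch of the new API (illustrative only, not part of the commit; the frame loop and helper name are hypothetical, only createLookUpTable and LookUpTable::transform come from this change). The costly preprocessing — converting the table to CV_32S and uploading it — happens once at construction, while transform only performs the per-image work:

#include <vector>

#include "opencv2/core/gpu.hpp"
#include "opencv2/gpuarithm.hpp"

// Hypothetical helper: apply the same 8-bit LUT to many frames on the GPU.
void applyLutToFrames(const std::vector<cv::Mat>& frames, const cv::Mat& lut)
{
    // Preprocessing (convert LUT to CV_32S, upload to the device) runs once here.
    cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);

    cv::gpu::GpuMat d_src, d_dst;
    for (size_t i = 0; i < frames.size(); ++i)
    {
        d_src.upload(frames[i]);
        lutAlg->transform(d_src, d_dst);   // only the per-frame work is repeated

        // Deprecated, source-compatible form; redoes the preprocessing on every call:
        // cv::gpu::LUT(d_src, lut, d_dst);
    }
}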

File tree: 4 files changed (+219, -65 lines)


modules/gpuarithm/include/opencv2/gpuarithm.hpp (+30, -5)

@@ -49,6 +49,17 @@
 
 #include "opencv2/core/gpu.hpp"
 
+#if defined __GNUC__
+    #define __OPENCV_GPUARITHM_DEPR_BEFORE__
+    #define __OPENCV_GPUARITHM_DEPR_AFTER__ __attribute__ ((deprecated))
+#elif (defined WIN32 || defined _WIN32)
+    #define __OPENCV_GPUARITHM_DEPR_BEFORE__ __declspec(deprecated)
+    #define __OPENCV_GPUARITHM_DEPR_AFTER__
+#else
+    #define __OPENCV_GPUARITHM_DEPR_BEFORE__
+    #define __OPENCV_GPUARITHM_DEPR_AFTER__
+#endif
+
 namespace cv { namespace gpu {
 
 //! adds one matrix to another (dst = src1 + src2)

@@ -178,14 +189,25 @@ CV_EXPORTS void transpose(InputArray src1, OutputArray dst, Stream& stream = Str
 //! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth
 CV_EXPORTS void flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null());
 
-//! implements generalized matrix product algorithm GEMM from BLAS
-CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
-    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
-
 //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
 //! destination array will have the depth type as lut and the same channels number as source
 //! supports CV_8UC1, CV_8UC3 types
-CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null());
+class CV_EXPORTS LookUpTable : public Algorithm
+{
+public:
+    virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
+};
+CV_EXPORTS Ptr<LookUpTable> createLookUpTable(InputArray lut);
+
+__OPENCV_GPUARITHM_DEPR_BEFORE__ void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__;
+inline void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream)
+{
+    createLookUpTable(lut)->transform(src, dst, stream);
+}
+
+//! implements generalized matrix product algorithm GEMM from BLAS
+CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
+    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
 
 //! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
 CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0,

@@ -311,4 +333,7 @@ CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& resul
 
 }} // namespace cv { namespace gpu {
 
+#undef __OPENCV_GPUARITHM_DEPR_BEFORE__
+#undef __OPENCV_GPUARITHM_DEPR_AFTER__
+
 #endif /* __OPENCV_GPUARITHM_HPP__ */
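As a side note on the header (an illustration, not part of the diff): the deprecation macros simply attach the compiler-specific attribute to the old LUT declaration, so existing call sites keep compiling but emit a warning, and the inline wrapper forwards them to the new Algorithm:

// How the deprecated declaration above expands, depending on the compiler:
//   GCC/Clang:  void LUT(InputArray, InputArray, OutputArray, Stream&) __attribute__ ((deprecated));
//   MSVC:       __declspec(deprecated) void LUT(InputArray, InputArray, OutputArray, Stream&);
//   otherwise:  plain declaration, no warning
// The inline definition that follows forwards to createLookUpTable(lut)->transform(...),
// which is what keeps old call sites source compatible.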

modules/gpuarithm/perf/perf_core.cpp (+6, -2)

@@ -224,10 +224,12 @@ PERF_TEST_P(Sz_Type, LutOneChannel,
 
     if (PERF_RUN_GPU())
     {
+        cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst);
+        TEST_CYCLE() lutAlg->transform(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }

@@ -259,10 +261,12 @@ PERF_TEST_P(Sz_Type, LutMultiChannel,
 
     if (PERF_RUN_GPU())
     {
+        cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst);
+        TEST_CYCLE() lutAlg->transform(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }

modules/gpuarithm/src/core.cpp (+177, -56)

@@ -57,7 +57,7 @@ void cv::gpu::transpose(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
 void cv::gpu::flip(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<LookUpTable> cv::gpu::createLookUpTable(InputArray) { throw_no_cuda(); return Ptr<LookUpTable>(); }
 
 void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_no_cuda(); }
 

@@ -290,93 +290,214 @@ void cv::gpu::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stre
 ////////////////////////////////////////////////////////////////////////
 // LUT
 
-void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
+#if (CUDA_VERSION >= 5000)
+
+namespace
 {
-    const int cn = src.channels();
+    class LookUpTableImpl : public LookUpTable
+    {
+    public:
+        LookUpTableImpl(InputArray lut);
 
-    CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
-    CV_Assert( lut.depth() == CV_8U );
-    CV_Assert( lut.channels() == 1 || lut.channels() == cn );
-    CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
+        void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-    dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));
+    private:
+        int lut_cn;
 
-    NppiSize sz;
-    sz.height = src.rows;
-    sz.width = src.cols;
+        int nValues3[3];
+        const Npp32s* pValues3[3];
+        const Npp32s* pLevels3[3];
 
-    Mat nppLut;
-    lut.convertTo(nppLut, CV_32S);
+        GpuMat d_pLevels;
+        GpuMat d_nppLut;
+        GpuMat d_nppLut3[3];
+    };
 
-    int nValues3[] = {256, 256, 256};
+    LookUpTableImpl::LookUpTableImpl(InputArray _lut)
+    {
+        nValues3[0] = nValues3[1] = nValues3[2] = 256;
 
-    Npp32s pLevels[256];
-    for (int i = 0; i < 256; ++i)
-        pLevels[i] = i;
+        Npp32s pLevels[256];
+        for (int i = 0; i < 256; ++i)
+            pLevels[i] = i;
 
-    const Npp32s* pLevels3[3];
+        d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
+        pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
 
-#if (CUDA_VERSION <= 4020)
-    pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
-#else
-    GpuMat d_pLevels;
-    d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
-    pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
-#endif
+        GpuMat lut;
+        if (_lut.kind() == _InputArray::GPU_MAT)
+        {
+            lut = _lut.getGpuMat();
+        }
+        else
+        {
+            Mat hLut = _lut.getMat();
+            CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
+            lut.upload(Mat(1, 256, hLut.type(), hLut.data));
+        }
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    NppStreamHandler h(stream);
+        lut_cn = lut.channels();
+
+        CV_Assert( lut.depth() == CV_8U );
+        CV_Assert( lut.rows == 1 && lut.cols == 256 );
+
+        lut.convertTo(d_nppLut, CV_32S);
+
+        if (lut_cn == 1)
+        {
+            pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
+        }
+        else
+        {
+            gpu::split(d_nppLut, d_nppLut3);
+
+            pValues3[0] = d_nppLut3[0].ptr<Npp32s>();
+            pValues3[1] = d_nppLut3[1].ptr<Npp32s>();
+            pValues3[2] = d_nppLut3[2].ptr<Npp32s>();
+        }
+    }
 
-    if (src.type() == CV_8UC1)
+    void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
     {
-#if (CUDA_VERSION <= 4020)
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
-#else
-        GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
-#endif
+        GpuMat src = _src.getGpuMat();
+
+        const int cn = src.channels();
+
+        CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
+        CV_Assert( lut_cn == 1 || lut_cn == cn );
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        NppStreamHandler h(stream);
+
+        NppiSize sz;
+        sz.height = src.rows;
+        sz.width = src.cols;
+
+        if (src.type() == CV_8UC1)
+        {
+            nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
+        }
+        else
+        {
+            nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
+        }
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
-    else
+}
+
+#else // (CUDA_VERSION >= 5000)
+
+namespace
+{
+    class LookUpTableImpl : public LookUpTable
     {
+    public:
+        LookUpTableImpl(InputArray lut);
+
+        void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        int lut_cn;
+
+        Npp32s pLevels[256];
+        int nValues3[3];
         const Npp32s* pValues3[3];
+        const Npp32s* pLevels3[3];
 
+        Mat nppLut;
         Mat nppLut3[3];
-        if (nppLut.channels() == 1)
+    };
+
+    LookUpTableImpl::LookUpTableImpl(InputArray _lut)
+    {
+        nValues3[0] = nValues3[1] = nValues3[2] = 256;
+
+        for (int i = 0; i < 256; ++i)
+            pLevels[i] = i;
+        pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
+
+        Mat lut;
+        if (_lut.kind() == _InputArray::GPU_MAT)
+        {
+            lut = Mat(_lut.getGpuMat());
+        }
+        else
+        {
+            Mat hLut = _lut.getMat();
+            CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
+            lut = hLut;
+        }
+
+        lut_cn = lut.channels();
+
+        CV_Assert( lut.depth() == CV_8U );
+        CV_Assert( lut.rows == 1 && lut.cols == 256 );
+
+        lut.convertTo(nppLut, CV_32S);
+
+        if (lut_cn == 1)
        {
-#if (CUDA_VERSION <= 4020)
            pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
-#else
-            GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
-            pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
-#endif
        }
        else
        {
            cv::split(nppLut, nppLut3);
 
-#if (CUDA_VERSION <= 4020)
            pValues3[0] = nppLut3[0].ptr<Npp32s>();
            pValues3[1] = nppLut3[1].ptr<Npp32s>();
            pValues3[2] = nppLut3[2].ptr<Npp32s>();
-#else
-            GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data));
-            GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data));
-            GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data));
+        }
+    }
 
-            pValues3[0] = d_nppLut0.ptr<Npp32s>();
-            pValues3[1] = d_nppLut1.ptr<Npp32s>();
-            pValues3[2] = d_nppLut2.ptr<Npp32s>();
-#endif
+    void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+
+        const int cn = src.channels();
+
+        CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
+        CV_Assert( lut_cn == 1 || lut_cn == cn );
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        NppStreamHandler h(stream);
+
+        NppiSize sz;
+        sz.height = src.rows;
+        sz.width = src.cols;
+
+        if (src.type() == CV_8UC1)
+        {
+            nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
+        }
+        else
+        {
+            nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
        }
 
-        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
+}
 
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
+#endif // (CUDA_VERSION >= 5000)
+
+Ptr<LookUpTable> cv::gpu::createLookUpTable(InputArray lut)
+{
+    return new LookUpTableImpl(lut);
 }
 
 ////////////////////////////////////////////////////////////////////////

modules/gpuarithm/test/test_core.cpp (+6, -2)

@@ -323,8 +323,10 @@ GPU_TEST_P(LUT, OneChannel)
     cv::Mat src = randomMat(size, type);
     cv::Mat lut = randomMat(cv::Size(256, 1), CV_8UC1);
 
+    cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
     cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()));
-    cv::gpu::LUT(loadMat(src, useRoi), lut, dst);
+    lutAlg->transform(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::LUT(src, lut, dst_gold);

@@ -337,8 +339,10 @@ GPU_TEST_P(LUT, MultiChannel)
     cv::Mat src = randomMat(size, type);
     cv::Mat lut = randomMat(cv::Size(256, 1), CV_MAKE_TYPE(CV_8U, src.channels()));
 
+    cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
     cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()), useRoi);
-    cv::gpu::LUT(loadMat(src, useRoi), lut, dst);
+    lutAlg->transform(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::LUT(src, lut, dst_gold);
