Skip to content

Commit dba7186

Browse files
authored
Merge pull request opencv#24271 from Kumataro:fix24163
Fix to convert float32 to int32/uint32 with rounding to nearest (ties to even). opencv#24271

Fix opencv#24163

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable. Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake

(carotene is BSD)
1 parent d9d4029 commit dba7186

File tree

13 files changed

+323
-164
lines changed

13 files changed

+323
-164
lines changed

3rdparty/carotene/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,14 @@ endif()
4242

4343
if(WITH_NEON)
4444
target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
45+
if(NOT DEFINED CAROTENE_NEON_ARCH )
46+
elseif(CAROTENE_NEON_ARCH EQUAL 8)
47+
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=8")
48+
elseif(CAROTENE_NEON_ARCH EQUAL 7)
49+
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=7")
50+
else()
51+
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=0")
52+
endif()
4553
endif()
4654

4755
# we add dummy file to fix XCode build

3rdparty/carotene/src/add_weighted.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939

4040
#include "common.hpp"
4141
#include "vtransform.hpp"
42+
#include "vround_helper.hpp"
4243

4344
namespace CAROTENE_NS {
4445

@@ -106,7 +107,7 @@ template <> struct wAdd<s32>
106107
{
107108
valpha = vdupq_n_f32(_alpha);
108109
vbeta = vdupq_n_f32(_beta);
109-
vgamma = vdupq_n_f32(_gamma + 0.5);
110+
vgamma = vdupq_n_f32(_gamma);
110111
}
111112

112113
void operator() (const VecTraits<s32>::vec128 & v_src0,
@@ -118,7 +119,7 @@ template <> struct wAdd<s32>
118119

119120
vs1 = vmlaq_f32(vgamma, vs1, valpha);
120121
vs1 = vmlaq_f32(vs1, vs2, vbeta);
121-
v_dst = vcvtq_s32_f32(vs1);
122+
v_dst = vroundq_s32_f32(vs1);
122123
}
123124

124125
void operator() (const VecTraits<s32>::vec64 & v_src0,
@@ -130,7 +131,7 @@ template <> struct wAdd<s32>
130131

131132
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
132133
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
133-
v_dst = vcvt_s32_f32(vs1);
134+
v_dst = vround_s32_f32(vs1);
134135
}
135136

136137
void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
@@ -150,7 +151,7 @@ template <> struct wAdd<u32>
150151
{
151152
valpha = vdupq_n_f32(_alpha);
152153
vbeta = vdupq_n_f32(_beta);
153-
vgamma = vdupq_n_f32(_gamma + 0.5);
154+
vgamma = vdupq_n_f32(_gamma);
154155
}
155156

156157
void operator() (const VecTraits<u32>::vec128 & v_src0,
@@ -162,7 +163,7 @@ template <> struct wAdd<u32>
162163

163164
vs1 = vmlaq_f32(vgamma, vs1, valpha);
164165
vs1 = vmlaq_f32(vs1, vs2, vbeta);
165-
v_dst = vcvtq_u32_f32(vs1);
166+
v_dst = vroundq_u32_f32(vs1);
166167
}
167168

168169
void operator() (const VecTraits<u32>::vec64 & v_src0,
@@ -174,7 +175,7 @@ template <> struct wAdd<u32>
174175

175176
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
176177
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
177-
v_dst = vcvt_u32_f32(vs1);
178+
v_dst = vround_u32_f32(vs1);
178179
}
179180

180181
void operator() (const u32 * src0, const u32 * src1, u32 * dst) const

3rdparty/carotene/src/blur.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141

4242
#include "common.hpp"
4343
#include "saturate_cast.hpp"
44+
#include "vround_helper.hpp"
4445

4546
namespace CAROTENE_NS {
4647

@@ -198,7 +199,6 @@ void blur3x3(const Size2D &size, s32 cn,
198199
//#define FLOAT_VARIANT_1_9
199200
#ifdef FLOAT_VARIANT_1_9
200201
float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
201-
float32x4_t v0_5 = vdupq_n_f32 (.5);
202202
#else
203203
const int16x8_t vScale = vmovq_n_s16(3640);
204204
#endif
@@ -283,8 +283,8 @@ void blur3x3(const Size2D &size, s32 cn,
283283
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
284284
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
285285
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
286-
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
287-
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
286+
tres1 = internal::vroundq_u32_f32(vf1);
287+
tres2 = internal::vroundq_u32_f32(vf2);
288288
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
289289
vst1_u8(drow + x - 8, vmovn_u16(t0));
290290
#else
@@ -445,8 +445,8 @@ void blur3x3(const Size2D &size, s32 cn,
445445
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
446446
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
447447
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
448-
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
449-
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
448+
tres1 = internal::vroundq_u32_f32(vf1);
449+
tres2 = internal::vroundq_u32_f32(vf2);
450450
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
451451
vst1_u8(drow + x - 8, vmovn_u16(t0));
452452
#else
@@ -508,7 +508,6 @@ void blur5x5(const Size2D &size, s32 cn,
508508
#define FLOAT_VARIANT_1_25
509509
#ifdef FLOAT_VARIANT_1_25
510510
float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
511-
float32x4_t v0_5 = vdupq_n_f32 (.5f);
512511
#else
513512
const int16x8_t vScale = vmovq_n_s16(1310);
514513
#endif
@@ -752,8 +751,8 @@ void blur5x5(const Size2D &size, s32 cn,
752751
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
753752
float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
754753
float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
755-
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
756-
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
754+
tres1 = internal::vroundq_u32_f32(vf1);
755+
tres2 = internal::vroundq_u32_f32(vf2);
757756
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
758757
vst1_u8(drow + x - 8, vmovn_u16(t0));
759758
#else

3rdparty/carotene/src/colorconvert.cpp

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
#include "common.hpp"
4141

4242
#include "saturate_cast.hpp"
43+
#include "vround_helper.hpp"
4344

4445
namespace CAROTENE_NS {
4546

@@ -1166,17 +1167,10 @@ inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const ui
11661167
vSt3 = vmulq_f32(vHF1, vDivTab);
11671168
vSt4 = vmulq_f32(vHF2, vDivTab);
11681169

1169-
float32x4_t bias = vdupq_n_f32(0.5f);
1170-
1171-
vSt1 = vaddq_f32(vSt1, bias);
1172-
vSt2 = vaddq_f32(vSt2, bias);
1173-
vSt3 = vaddq_f32(vSt3, bias);
1174-
vSt4 = vaddq_f32(vSt4, bias);
1175-
1176-
uint32x4_t vRes1 = vcvtq_u32_f32(vSt1);
1177-
uint32x4_t vRes2 = vcvtq_u32_f32(vSt2);
1178-
uint32x4_t vRes3 = vcvtq_u32_f32(vSt3);
1179-
uint32x4_t vRes4 = vcvtq_u32_f32(vSt4);
1170+
uint32x4_t vRes1 = internal::vroundq_u32_f32(vSt1);
1171+
uint32x4_t vRes2 = internal::vroundq_u32_f32(vSt2);
1172+
uint32x4_t vRes3 = internal::vroundq_u32_f32(vSt3);
1173+
uint32x4_t vRes4 = internal::vroundq_u32_f32(vSt4);
11801174

11811175
int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
11821176
int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));

3rdparty/carotene/src/common.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,17 @@
5858

5959
namespace CAROTENE_NS { namespace internal {
6060

61+
#ifndef CAROTENE_NEON_ARCH
62+
# if defined(__aarch64__) || defined(__aarch32__)
63+
# define CAROTENE_NEON_ARCH 8
64+
# else
65+
# define CAROTENE_NEON_ARCH 7
66+
# endif
67+
#endif
68+
#if ( !defined(__aarch64__) && !defined(__aarch32__) ) && (CAROTENE_NEON_ARCH == 8 )
69+
# error("ARMv7 doesn't support A32/A64 Neon instructions")
70+
#endif
71+
6172
inline void prefetch(const void *ptr, size_t offset = 32*10)
6273
{
6374
#if defined __GNUC__

0 commit comments

Comments
 (0)