@@ -131,7 +131,11 @@ extern "C" {
131131
132132#ifndef TINYEXR_USE_THREAD
133133#define TINYEXR_USE_THREAD (0 ) // No threaded loading.
134- // http://computation.llnl.gov/projects/floating-point-compression
134+ #else
135+ // When using threading, a custom upper bound on the number of worker threads can be specified by defining TINYEXR_MAX_THREADS.
136+ #ifndef TINYEXR_MAX_THREADS // If not defined, it defaults to 0, meaning the upper limit is taken from std::thread::hardware_concurrency().
137+ #define TINYEXR_MAX_THREADS (0 )
138+ #endif
135139#endif
136140
137141#ifndef TINYEXR_USE_OPENMP
@@ -142,6 +146,41 @@ extern "C" {
142146#endif
143147#endif
144148
149+ #ifndef TINYEXR_USE_COMPILER_FP16
150+ #define TINYEXR_USE_COMPILER_FP16 (0 )
151+ #endif
152+
153+ #if TINYEXR_USE_COMPILER_FP16
154+ #ifndef _MSC_VER
155+ #if defined( __GNUC__ ) || defined( __clang__ )
156+ #if defined( __SSE2__ )
157+ #if ( __GNUC__ > 11 ) || ( __clang_major__ > 14 )
158+ #ifndef __STDC_WANT_IEC_60559_TYPES_EXT__
159+ #define __STDC_WANT_IEC_60559_TYPES_EXT__
160+ #endif
161+ #include < float.h>
162+ #include < math.h>
163+ #define TINYEXR_FP16_COMPILER_TYPE _Float16
164+ #endif
165+ #endif
166+ #if defined( __ARM_NEON__ ) || defined( __ARM_NEON )
167+ #define TINYEXR_FP16_COMPILER_TYPE __fp16
168+ #endif
169+ #endif
170+ #else
171+ #if (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
172+ #include < intrin.h>
173+ #define TINYEXR_FP16_COMPILER_TYPE uint16_t
174+ #endif
175+ #endif
176+ #endif
177+
178+ #ifdef TINYEXR_FP16_COMPILER_TYPE
179+ #define TINYEXR_HAS_FP16_COMPILER_TYPE (1 )
180+ #else
181+ #define TINYEXR_HAS_FP16_COMPILER_TYPE (0 )
182+ #endif
183+
145184#define TINYEXR_SUCCESS (0 )
146185#define TINYEXR_ERROR_INVALID_MAGIC_NUMBER (-1 )
147186#define TINYEXR_ERROR_INVALID_EXR_VERSION (-2 )
@@ -771,15 +810,15 @@ static void SetWarningMessage(const std::string &msg, const char **warn) {
771810
772811static const int kEXRVersionSize = 8 ;
773812
774- static void cpy2 (unsigned short *dst_val, const unsigned short *src_val) {
813+ static void inline cpy2 (unsigned short *dst_val, const unsigned short *src_val) {
775814 unsigned char *dst = reinterpret_cast <unsigned char *>(dst_val);
776815 const unsigned char *src = reinterpret_cast <const unsigned char *>(src_val);
777816
778817 dst[0 ] = src[0 ];
779818 dst[1 ] = src[1 ];
780819}
781820
782- static void swap2 (unsigned short *val) {
821+ static void inline swap2 (unsigned short *val) {
783822#if TINYEXR_LITTLE_ENDIAN
784823 (void )val;
785824#else
@@ -801,7 +840,7 @@ static void swap2(unsigned short *val) {
801840#pragma GCC diagnostic push
802841#pragma GCC diagnostic ignored "-Wunused-function"
803842#endif
804- static void cpy4 (int *dst_val, const int *src_val) {
843+ static void inline cpy4 (int *dst_val, const int *src_val) {
805844 unsigned char *dst = reinterpret_cast <unsigned char *>(dst_val);
806845 const unsigned char *src = reinterpret_cast <const unsigned char *>(src_val);
807846
@@ -811,7 +850,7 @@ static void cpy4(int *dst_val, const int *src_val) {
811850 dst[3 ] = src[3 ];
812851}
813852
814- static void cpy4 (unsigned int *dst_val, const unsigned int *src_val) {
853+ static void inline cpy4 (unsigned int *dst_val, const unsigned int *src_val) {
815854 unsigned char *dst = reinterpret_cast <unsigned char *>(dst_val);
816855 const unsigned char *src = reinterpret_cast <const unsigned char *>(src_val);
817856
@@ -821,7 +860,7 @@ static void cpy4(unsigned int *dst_val, const unsigned int *src_val) {
821860 dst[3 ] = src[3 ];
822861}
823862
824- static void cpy4 (float *dst_val, const float *src_val) {
863+ static void inline cpy4 (float *dst_val, const float *src_val) {
825864 unsigned char *dst = reinterpret_cast <unsigned char *>(dst_val);
826865 const unsigned char *src = reinterpret_cast <const unsigned char *>(src_val);
827866
@@ -838,7 +877,7 @@ static void cpy4(float *dst_val, const float *src_val) {
838877#pragma GCC diagnostic pop
839878#endif
840879
841- static void swap4 (unsigned int *val) {
880+ static void inline swap4 (unsigned int *val) {
842881#if TINYEXR_LITTLE_ENDIAN
843882 (void )val;
844883#else
@@ -853,7 +892,7 @@ static void swap4(unsigned int *val) {
853892#endif
854893}
855894
856- static void swap4 (int *val) {
895+ static void inline swap4 (int *val) {
857896#if TINYEXR_LITTLE_ENDIAN
858897 (void )val;
859898#else
@@ -868,7 +907,7 @@ static void swap4(int *val) {
868907#endif
869908}
870909
871- static void swap4 (float *val) {
910+ static void inline swap4 (float *val) {
872911#if TINYEXR_LITTLE_ENDIAN
873912 (void )val;
874913#else
@@ -884,7 +923,7 @@ static void swap4(float *val) {
884923}
885924
886925#if 0
887- static void cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64 *src_val) {
926+ static void inline cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64 *src_val) {
888927 unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
889928 const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
890929
@@ -899,7 +938,7 @@ static void cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64
899938}
900939#endif
901940
902- static void swap8 (tinyexr::tinyexr_uint64 *val) {
941+ static void inline swap8 (tinyexr::tinyexr_uint64 *val) {
903942#if TINYEXR_LITTLE_ENDIAN
904943 (void )val;
905944#else
@@ -919,6 +958,11 @@ static void swap8(tinyexr::tinyexr_uint64 *val) {
919958}
920959
921960// https://gist.github.com/rygorous/2156668
961+ #if TINYEXR_HAS_FP16_COMPILER_TYPE && (TINYEXR_USE_COMPILER_FP16 > 0)
962+ union FP32 {
963+ float f;
964+ };
965+ #else
922966union FP32 {
923967 unsigned int u;
924968 float f;
@@ -934,12 +978,21 @@ union FP32 {
934978#endif
935979 } s;
936980};
981+ #endif
937982
938983#ifdef __clang__
939984#pragma clang diagnostic push
940985#pragma clang diagnostic ignored "-Wpadded"
941986#endif
942987
988+ #if TINYEXR_HAS_FP16_COMPILER_TYPE && (TINYEXR_USE_COMPILER_FP16 > 0)
989+ union FP16 {
990+ TINYEXR_FP16_COMPILER_TYPE f;
991+ unsigned short u;
992+ };
993+
994+ #else
995+
943996union FP16 {
944997 unsigned short u;
945998 struct {
@@ -954,11 +1007,32 @@ union FP16 {
9541007#endif
9551008 } s;
9561009};
1010+ #endif
9571011
9581012#ifdef __clang__
9591013#pragma clang diagnostic pop
9601014#endif
9611015
1016+ #if TINYEXR_HAS_FP16_COMPILER_TYPE && (TINYEXR_USE_COMPILER_FP16 > 0)
1017+ static inline FP32 half_to_float (FP16 h) {
1018+ FP32 o;
1019+ #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
1020+ o.f =_mm_cvtss_f32 (_mm_cvtph_ps (_mm_cvtsi32_si128 (static_cast <int > (h.u ))));
1021+ #else
1022+ o.f = static_cast <float > (h.f );
1023+ #endif
1024+ return o;
1025+ }
1026+ static inline FP16 float_to_half_full (FP32 f) {
1027+ FP16 o;
1028+ #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
1029+ o.f = static_cast <TINYEXR_FP16_COMPILER_TYPE> (_mm_cvtsi128_si32 (_mm_cvtps_ph (_mm_set_ss (f.f ), _MM_FROUND_CUR_DIRECTION)));
1030+ #else
1031+ o.f = static_cast <TINYEXR_FP16_COMPILER_TYPE> (f.f );
1032+ #endif
1033+ return o;
1034+ }
1035+ #else
9621036static FP32 half_to_float (FP16 h) {
9631037 static const FP32 magic = {113 << 23 };
9641038 static const unsigned int shifted_exp = 0x7c00
@@ -1018,7 +1092,7 @@ static FP16 float_to_half_full(FP32 f) {
10181092 o.s .Sign = f.s .Sign ;
10191093 return o;
10201094}
1021-
1095+ # endif
10221096// NOTE: From OpenEXR code
10231097// #define IMF_INCREASING_Y 0
10241098// #define IMF_DECREASING_Y 1
@@ -4930,10 +5004,12 @@ static int DecodeTiledLevel(EXRImage* exr_image, const EXRHeader* exr_header,
49305004 std::atomic<int > tile_count (0 );
49315005
49325006 int num_threads = std::max (1 , int (std::thread::hardware_concurrency ()));
5007+ #if (TINYEXR_MAX_THREADS > 0)
5008+ num_threads = std::min (num_threads,TINYEXR_MAX_THREADS);
5009+ #endif
49335010 if (num_threads > int (num_tiles)) {
49345011 num_threads = int (num_tiles);
49355012 }
4936-
49375013 for (int t = 0 ; t < num_threads; t++) {
49385014 workers.emplace_back (std::thread ([&]()
49395015 {
@@ -5286,10 +5362,12 @@ static int DecodeChunk(EXRImage *exr_image, const EXRHeader *exr_header,
52865362 std::atomic<int > y_count (0 );
52875363
52885364 int num_threads = std::max (1 , int (std::thread::hardware_concurrency ()));
5365+ #if (TINYEXR_MAX_THREADS > 0)
5366+ num_threads = std::min (num_threads,TINYEXR_MAX_THREADS);
5367+ #endif
52895368 if (num_threads > int (num_blocks)) {
52905369 num_threads = int (num_blocks);
52915370 }
5292-
52935371 for (int t = 0 ; t < num_threads; t++) {
52945372 workers.emplace_back (std::thread ([&]() {
52955373 int y = 0 ;
@@ -7268,6 +7346,9 @@ static int EncodeTiledLevel(const EXRImage* level_image, const EXRHeader* exr_he
72687346 std::atomic<int > tile_count (0 );
72697347
72707348 int num_threads = std::max (1 , int (std::thread::hardware_concurrency ()));
7349+ #if (TINYEXR_MAX_THREADS > 0)
7350+ num_threads = std::min (num_threads,TINYEXR_MAX_THREADS);
7351+ #endif
72717352 if (num_threads > int (num_tiles)) {
72727353 num_threads = int (num_tiles);
72737354 }
@@ -7517,7 +7598,9 @@ static int EncodeChunk(const EXRImage* exr_image, const EXRHeader* exr_header,
75177598 std::atomic<int > block_count (0 );
75187599
75197600 int num_threads = std::min (std::max (1 , int (std::thread::hardware_concurrency ())), num_blocks);
7520-
7601+ #if (TINYEXR_MAX_THREADS > 0)
7602+ num_threads = std::min (num_threads,TINYEXR_MAX_THREADS);
7603+ #endif
75217604 for (int t = 0 ; t < num_threads; t++) {
75227605 workers.emplace_back (std::thread ([&]() {
75237606 int i = 0 ;
@@ -9047,13 +9130,19 @@ int SaveEXRToMemory(const float *data, int width, int height, int components,
90479130 images[3 ].resize (static_cast <size_t >(width * height));
90489131
90499132 // Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers
9050- for (size_t i = 0 ; i < static_cast <size_t >(width * height); i++) {
9051- images[0 ][i] = data[static_cast <size_t >(components) * i + 0 ];
9052- images[1 ][i] = data[static_cast <size_t >(components) * i + 1 ];
9053- images[2 ][i] = data[static_cast <size_t >(components) * i + 2 ];
9054- if (components == 4 ) {
9055- images[3 ][i] = data[static_cast <size_t >(components) * i + 3 ];
9056- }
9133+ if (components == 4 ) {
9134+ for (size_t i = 0 ; i < static_cast <size_t >(width * height); i++) {
9135+ images[0 ][i] = data[static_cast <size_t >(components) * i + 0 ];
9136+ images[1 ][i] = data[static_cast <size_t >(components) * i + 1 ];
9137+ images[2 ][i] = data[static_cast <size_t >(components) * i + 2 ];
9138+ images[3 ][i] = data[static_cast <size_t >(components) * i + 3 ];
9139+ }
9140+ } else {
9141+ for (size_t i = 0 ; i < static_cast <size_t >(width * height); i++) {
9142+ images[0 ][i] = data[static_cast <size_t >(components) * i + 0 ];
9143+ images[1 ][i] = data[static_cast <size_t >(components) * i + 1 ];
9144+ images[2 ][i] = data[static_cast <size_t >(components) * i + 2 ];
9145+ }
90579146 }
90589147 }
90599148
@@ -9198,13 +9287,19 @@ int SaveEXR(const float *data, int width, int height, int components,
91989287 images[3 ].resize (pixel_count);
91999288
92009289 // Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers
9201- for (size_t i = 0 ; i < pixel_count; i++) {
9202- images[0 ][i] = data[static_cast <size_t >(components) * i + 0 ];
9203- images[1 ][i] = data[static_cast <size_t >(components) * i + 1 ];
9204- images[2 ][i] = data[static_cast <size_t >(components) * i + 2 ];
9205- if (components == 4 ) {
9206- images[3 ][i] = data[static_cast <size_t >(components) * i + 3 ];
9207- }
9290+ if (components == 4 ) {
9291+ for (size_t i = 0 ; i < pixel_count; i++) {
9292+ images[0 ][i] = data[static_cast <size_t >(components) * i + 0 ];
9293+ images[1 ][i] = data[static_cast <size_t >(components) * i + 1 ];
9294+ images[2 ][i] = data[static_cast <size_t >(components) * i + 2 ];
9295+ images[3 ][i] = data[static_cast <size_t >(components) * i + 3 ];
9296+ }
9297+ } else {
9298+ for (size_t i = 0 ; i < pixel_count; i++) {
9299+ images[0 ][i] = data[static_cast <size_t >(components) * i + 0 ];
9300+ images[1 ][i] = data[static_cast <size_t >(components) * i + 1 ];
9301+ images[2 ][i] = data[static_cast <size_t >(components) * i + 2 ];
9302+ }
92089303 }
92099304 }
92109305
0 commit comments