Skip to content

Commit 648e47b

Browse files
authored
Merge pull request syoyo#214 from dbPhilips/perf_enhancements
Perf enhancements
2 parents 756f7d3 + 33e0472 commit 648e47b

File tree

2 files changed

+128
-31
lines changed

2 files changed

+128
-31
lines changed

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,10 +159,12 @@ Include `tinyexr.h` with `TINYEXR_IMPLEMENTATION` flag (do this only for **one**
159159
* `TINYEXR_USE_MINIZ` Use miniz (default = 1). Please include `zlib.h` header before `tinyexr.h` if you disable miniz support(e.g. use system's zlib).
160160
* `TINYEXR_USE_STB_ZLIB` Use zlib from `stb_image[_write].h` instead of miniz or the system's zlib (default = 0).
161161
* `TINYEXR_USE_PIZ` Enable PIZ compression support (default = 1)
162-
* `TINYEXR_USE_ZFP` Enable ZFP compression supoort (TinyEXR extension, default = 0)
163-
* `TINYEXR_USE_THREAD` Enable threaded loading using C++11 thread (Requires C++11 compiler, default = 0)
162+
* `TINYEXR_USE_ZFP` Enable ZFP compression support (TinyEXR extension, default = 0)
163+
* `TINYEXR_USE_THREAD` Enable threaded loading/storing using C++11 thread (Requires C++11 compiler, default = 0)
164+
* Set `TINYEXR_MAX_THREADS` to a value greater than 0 to use MIN(TINYEXR_MAX_THREADS, hardware_concurrency()) threads instead of hardware_concurrency(). (default = 0)
164165
* `TINYEXR_USE_OPENMP` Enable OpenMP threading support (default = 1 if `_OPENMP` is defined)
165166
* Use `TINYEXR_USE_OPENMP=0` to force disable OpenMP code path even if OpenMP is available/enabled in the compiler.
167+
* `TINYEXR_USE_COMPILER_FP16` Enable use of compiler provided FP16<>FP32 conversions when available (default = 0)
166168

167169
### Quickly reading RGB(A) EXR file.
168170

tinyexr.h

Lines changed: 124 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,11 @@ extern "C" {
131131

132132
#ifndef TINYEXR_USE_THREAD
133133
#define TINYEXR_USE_THREAD (0) // No threaded loading.
134-
// http://computation.llnl.gov/projects/floating-point-compression
134+
#else
135+
// TINYEXR_USE_THREAD was defined by the user: an optional upper bound on the
// number of worker threads can be supplied through TINYEXR_MAX_THREADS.
136+
#ifndef TINYEXR_MAX_THREADS // Default to 0, meaning the thread count is limited only by hardware_concurrency().
137+
#define TINYEXR_MAX_THREADS (0)
138+
#endif
135139
#endif
136140

137141
#ifndef TINYEXR_USE_OPENMP
@@ -142,6 +146,41 @@ extern "C" {
142146
#endif
143147
#endif
144148

149+
#ifndef TINYEXR_USE_COMPILER_FP16
150+
#define TINYEXR_USE_COMPILER_FP16 (0)
151+
#endif
152+
153+
// When TINYEXR_USE_COMPILER_FP16 is enabled, try to select a compiler-provided
// 16-bit float representation and expose it as TINYEXR_FP16_COMPILER_TYPE:
//   - non-MSVC GCC/clang with __SSE2__ and (__GNUC__ > 11 or __clang_major__ > 14): _Float16
//   - non-MSVC GCC/clang with __ARM_NEON__ / __ARM_NEON:                           __fp16
//   - MSVC targeting x86/x64 with __AVX2__: uint16_t (raw half bits; <intrin.h> is
//     included so the conversions can be done with intrinsics)
// If no branch matches, TINYEXR_FP16_COMPILER_TYPE is left undefined.
#if TINYEXR_USE_COMPILER_FP16
154+
#ifndef _MSC_VER
155+
#if defined( __GNUC__ ) || defined( __clang__ )
156+
#if defined( __SSE2__ )
157+
#if ( __GNUC__ > 11 ) || ( __clang_major__ > 14 )
158+
#ifndef __STDC_WANT_IEC_60559_TYPES_EXT__
159+
#define __STDC_WANT_IEC_60559_TYPES_EXT__
160+
#endif
161+
#include <float.h>
162+
#include <math.h>
163+
#define TINYEXR_FP16_COMPILER_TYPE _Float16
164+
#endif // compiler version check
165+
#endif // __SSE2__
166+
#if defined( __ARM_NEON__ ) || defined( __ARM_NEON )
167+
#define TINYEXR_FP16_COMPILER_TYPE __fp16
168+
#endif // ARM NEON
169+
#endif // __GNUC__ || __clang__
170+
#else // _MSC_VER
171+
#if (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
172+
#include <intrin.h>
173+
#define TINYEXR_FP16_COMPILER_TYPE uint16_t
174+
#endif
175+
#endif // !_MSC_VER
176+
#endif // TINYEXR_USE_COMPILER_FP16
177+
178+
// TINYEXR_HAS_FP16_COMPILER_TYPE is 1 when a compiler half type was selected
// (TINYEXR_FP16_COMPILER_TYPE is defined) and 0 otherwise, so later code can
// test it with a plain #if.
#ifdef TINYEXR_FP16_COMPILER_TYPE
179+
#define TINYEXR_HAS_FP16_COMPILER_TYPE (1)
180+
#else
181+
#define TINYEXR_HAS_FP16_COMPILER_TYPE (0)
182+
#endif
183+
145184
#define TINYEXR_SUCCESS (0)
146185
#define TINYEXR_ERROR_INVALID_MAGIC_NUMBER (-1)
147186
#define TINYEXR_ERROR_INVALID_EXR_VERSION (-2)
@@ -771,15 +810,15 @@ static void SetWarningMessage(const std::string &msg, const char **warn) {
771810

772811
static const int kEXRVersionSize = 8;
773812

774-
static void cpy2(unsigned short *dst_val, const unsigned short *src_val) {
813+
// Copies one 16-bit value from src_val to dst_val one byte at a time through
// unsigned char pointers (presumably so the copy does not rely on either
// pointer being 2-byte aligned -- confirm against callers).
static void inline cpy2(unsigned short *dst_val, const unsigned short *src_val) {
775814
unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
776815
const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
777816

778817
dst[0] = src[0];
779818
dst[1] = src[1];
780819
}
781820

782-
static void swap2(unsigned short *val) {
821+
static void inline swap2(unsigned short *val) {
783822
#if TINYEXR_LITTLE_ENDIAN
784823
(void)val;
785824
#else
@@ -801,7 +840,7 @@ static void swap2(unsigned short *val) {
801840
#pragma GCC diagnostic push
802841
#pragma GCC diagnostic ignored "-Wunused-function"
803842
#endif
804-
static void cpy4(int *dst_val, const int *src_val) {
843+
static void inline cpy4(int *dst_val, const int *src_val) {
805844
unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
806845
const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
807846

@@ -811,7 +850,7 @@ static void cpy4(int *dst_val, const int *src_val) {
811850
dst[3] = src[3];
812851
}
813852

814-
static void cpy4(unsigned int *dst_val, const unsigned int *src_val) {
853+
static void inline cpy4(unsigned int *dst_val, const unsigned int *src_val) {
815854
unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
816855
const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
817856

@@ -821,7 +860,7 @@ static void cpy4(unsigned int *dst_val, const unsigned int *src_val) {
821860
dst[3] = src[3];
822861
}
823862

824-
static void cpy4(float *dst_val, const float *src_val) {
863+
static void inline cpy4(float *dst_val, const float *src_val) {
825864
unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
826865
const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
827866

@@ -838,7 +877,7 @@ static void cpy4(float *dst_val, const float *src_val) {
838877
#pragma GCC diagnostic pop
839878
#endif
840879

841-
static void swap4(unsigned int *val) {
880+
static void inline swap4(unsigned int *val) {
842881
#if TINYEXR_LITTLE_ENDIAN
843882
(void)val;
844883
#else
@@ -853,7 +892,7 @@ static void swap4(unsigned int *val) {
853892
#endif
854893
}
855894

856-
static void swap4(int *val) {
895+
static void inline swap4(int *val) {
857896
#if TINYEXR_LITTLE_ENDIAN
858897
(void)val;
859898
#else
@@ -868,7 +907,7 @@ static void swap4(int *val) {
868907
#endif
869908
}
870909

871-
static void swap4(float *val) {
910+
static void inline swap4(float *val) {
872911
#if TINYEXR_LITTLE_ENDIAN
873912
(void)val;
874913
#else
@@ -884,7 +923,7 @@ static void swap4(float *val) {
884923
}
885924

886925
#if 0
887-
static void cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64 *src_val) {
926+
static void inline cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64 *src_val) {
888927
unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
889928
const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
890929

@@ -899,7 +938,7 @@ static void cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64
899938
}
900939
#endif
901940

902-
static void swap8(tinyexr::tinyexr_uint64 *val) {
941+
static void inline swap8(tinyexr::tinyexr_uint64 *val) {
903942
#if TINYEXR_LITTLE_ENDIAN
904943
(void)val;
905944
#else
@@ -919,6 +958,11 @@ static void swap8(tinyexr::tinyexr_uint64 *val) {
919958
}
920959

921960
// https://gist.github.com/rygorous/2156668
961+
#if TINYEXR_HAS_FP16_COMPILER_TYPE && (TINYEXR_USE_COMPILER_FP16 > 0)
962+
// FP32 variant used when a compiler-provided half type is available: only the
// float value is exposed. The fallback variant in the #else branch additionally
// provides the raw-bits and bit-field views needed by the software conversion.
union FP32 {
963+
float f;
964+
};
965+
#else
922966
union FP32 {
923967
unsigned int u;
924968
float f;
@@ -934,12 +978,21 @@ union FP32 {
934978
#endif
935979
} s;
936980
};
981+
#endif
937982

938983
#ifdef __clang__
939984
#pragma clang diagnostic push
940985
#pragma clang diagnostic ignored "-Wpadded"
941986
#endif
942987

988+
#if TINYEXR_HAS_FP16_COMPILER_TYPE && (TINYEXR_USE_COMPILER_FP16 > 0)
989+
// FP16 variant used when a compiler-provided half type is available.
union FP16 {
990+
TINYEXR_FP16_COMPILER_TYPE f;  // value as the compiler half type (uint16_t on the MSVC path)
991+
unsigned short u;  // raw 16-bit pattern
992+
};
993+
994+
#else
995+
943996
union FP16 {
944997
unsigned short u;
945998
struct {
@@ -954,11 +1007,32 @@ union FP16 {
9541007
#endif
9551008
} s;
9561009
};
1010+
#endif
9571011

9581012
#ifdef __clang__
9591013
#pragma clang diagnostic pop
9601014
#endif
9611015

1016+
#if TINYEXR_HAS_FP16_COMPILER_TYPE && (TINYEXR_USE_COMPILER_FP16 > 0)
1017+
// Converts a half-precision value to single precision using compiler/hardware
// support instead of the software bit-manipulation fallback.
//   h: half value to convert.
//   returns: FP32 whose `f` member holds the converted value.
static inline FP32 half_to_float(FP16 h) {
1018+
FP32 o;
1019+
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
1020+
// MSVC path: load the raw half bits into an SSE register, convert with
// _mm_cvtph_ps and extract the lowest float lane.
o.f =_mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(static_cast<int> (h.u))));
1021+
#else
1022+
// GCC/clang path: h.f is a native half type, so a plain cast performs the conversion.
o.f = static_cast<float> (h.f);
1023+
#endif
1024+
return o;
1025+
}
1026+
// Converts a single-precision value to half precision using compiler/hardware
// support instead of the software bit-manipulation fallback.
//   f: float value to convert.
//   returns: FP16 holding the converted half value.
static inline FP16 float_to_half_full(FP32 f) {
1027+
FP16 o;
1028+
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
1029+
// MSVC path: convert with _mm_cvtps_ph, passing _MM_FROUND_CUR_DIRECTION as the
// rounding control, then keep the low 16 bits of the result as the raw half pattern.
o.f = static_cast<TINYEXR_FP16_COMPILER_TYPE> (_mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(f.f), _MM_FROUND_CUR_DIRECTION)));
1030+
#else
1031+
// GCC/clang path: cast directly to the native half type.
o.f = static_cast<TINYEXR_FP16_COMPILER_TYPE> (f.f);
1032+
#endif
1033+
return o;
1034+
}
1035+
#else
9621036
static FP32 half_to_float(FP16 h) {
9631037
static const FP32 magic = {113 << 23};
9641038
static const unsigned int shifted_exp = 0x7c00
@@ -1018,7 +1092,7 @@ static FP16 float_to_half_full(FP32 f) {
10181092
o.s.Sign = f.s.Sign;
10191093
return o;
10201094
}
1021-
1095+
#endif
10221096
// NOTE: From OpenEXR code
10231097
// #define IMF_INCREASING_Y 0
10241098
// #define IMF_DECREASING_Y 1
@@ -4930,10 +5004,12 @@ static int DecodeTiledLevel(EXRImage* exr_image, const EXRHeader* exr_header,
49305004
std::atomic<int> tile_count(0);
49315005

49325006
int num_threads = std::max(1, int(std::thread::hardware_concurrency()));
5007+
#if (TINYEXR_MAX_THREADS > 0)
5008+
num_threads = std::min(num_threads,TINYEXR_MAX_THREADS);
5009+
#endif
49335010
if (num_threads > int(num_tiles)) {
49345011
num_threads = int(num_tiles);
49355012
}
4936-
49375013
for (int t = 0; t < num_threads; t++) {
49385014
workers.emplace_back(std::thread([&]()
49395015
{
@@ -5286,10 +5362,12 @@ static int DecodeChunk(EXRImage *exr_image, const EXRHeader *exr_header,
52865362
std::atomic<int> y_count(0);
52875363

52885364
int num_threads = std::max(1, int(std::thread::hardware_concurrency()));
5365+
#if (TINYEXR_MAX_THREADS > 0)
5366+
num_threads = std::min(num_threads,TINYEXR_MAX_THREADS);
5367+
#endif
52895368
if (num_threads > int(num_blocks)) {
52905369
num_threads = int(num_blocks);
52915370
}
5292-
52935371
for (int t = 0; t < num_threads; t++) {
52945372
workers.emplace_back(std::thread([&]() {
52955373
int y = 0;
@@ -7268,6 +7346,9 @@ static int EncodeTiledLevel(const EXRImage* level_image, const EXRHeader* exr_he
72687346
std::atomic<int> tile_count(0);
72697347

72707348
int num_threads = std::max(1, int(std::thread::hardware_concurrency()));
7349+
#if (TINYEXR_MAX_THREADS > 0)
7350+
num_threads = std::min(num_threads,TINYEXR_MAX_THREADS);
7351+
#endif
72717352
if (num_threads > int(num_tiles)) {
72727353
num_threads = int(num_tiles);
72737354
}
@@ -7517,7 +7598,9 @@ static int EncodeChunk(const EXRImage* exr_image, const EXRHeader* exr_header,
75177598
std::atomic<int> block_count(0);
75187599

75197600
int num_threads = std::min(std::max(1, int(std::thread::hardware_concurrency())), num_blocks);
7520-
7601+
#if (TINYEXR_MAX_THREADS > 0)
7602+
num_threads = std::min(num_threads,TINYEXR_MAX_THREADS);
7603+
#endif
75217604
for (int t = 0; t < num_threads; t++) {
75227605
workers.emplace_back(std::thread([&]() {
75237606
int i = 0;
@@ -9047,13 +9130,19 @@ int SaveEXRToMemory(const float *data, int width, int height, int components,
90479130
images[3].resize(static_cast<size_t>(width * height));
90489131

90499132
// Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers
9050-
for (size_t i = 0; i < static_cast<size_t>(width * height); i++) {
9051-
images[0][i] = data[static_cast<size_t>(components) * i + 0];
9052-
images[1][i] = data[static_cast<size_t>(components) * i + 1];
9053-
images[2][i] = data[static_cast<size_t>(components) * i + 2];
9054-
if (components == 4) {
9055-
images[3][i] = data[static_cast<size_t>(components) * i + 3];
9056-
}
9133+
if (components == 4) {
9134+
for (size_t i = 0; i < static_cast<size_t>(width * height); i++) {
9135+
images[0][i] = data[static_cast<size_t>(components) * i + 0];
9136+
images[1][i] = data[static_cast<size_t>(components) * i + 1];
9137+
images[2][i] = data[static_cast<size_t>(components) * i + 2];
9138+
images[3][i] = data[static_cast<size_t>(components) * i + 3];
9139+
}
9140+
} else {
9141+
for (size_t i = 0; i < static_cast<size_t>(width * height); i++) {
9142+
images[0][i] = data[static_cast<size_t>(components) * i + 0];
9143+
images[1][i] = data[static_cast<size_t>(components) * i + 1];
9144+
images[2][i] = data[static_cast<size_t>(components) * i + 2];
9145+
}
90579146
}
90589147
}
90599148

@@ -9198,13 +9287,19 @@ int SaveEXR(const float *data, int width, int height, int components,
91989287
images[3].resize(pixel_count);
91999288

92009289
// Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers
9201-
for (size_t i = 0; i < pixel_count; i++) {
9202-
images[0][i] = data[static_cast<size_t>(components) * i + 0];
9203-
images[1][i] = data[static_cast<size_t>(components) * i + 1];
9204-
images[2][i] = data[static_cast<size_t>(components) * i + 2];
9205-
if (components == 4) {
9206-
images[3][i] = data[static_cast<size_t>(components) * i + 3];
9207-
}
9290+
if (components == 4) {
9291+
for (size_t i = 0; i < pixel_count; i++) {
9292+
images[0][i] = data[static_cast<size_t>(components) * i + 0];
9293+
images[1][i] = data[static_cast<size_t>(components) * i + 1];
9294+
images[2][i] = data[static_cast<size_t>(components) * i + 2];
9295+
images[3][i] = data[static_cast<size_t>(components) * i + 3];
9296+
}
9297+
} else {
9298+
for (size_t i = 0; i < pixel_count; i++) {
9299+
images[0][i] = data[static_cast<size_t>(components) * i + 0];
9300+
images[1][i] = data[static_cast<size_t>(components) * i + 1];
9301+
images[2][i] = data[static_cast<size_t>(components) * i + 2];
9302+
}
92089303
}
92099304
}
92109305

0 commit comments

Comments
 (0)