Skip to content

Commit ad45783

Browse files
authored
Optimize *.test_sse4_1 test runtime. NFC (#24397)
LLVM struggles with compilation time on large functions, and counterproductively inlines functions, which makes things even worse for itself. Outline each SSE function under test into a separate non-inlineable function. Before: Total core time: 487.852s. Wallclock time: 323.379s. Parallelization: 1.51x. After: Total core time: 243.685s. Wallclock time: 120.836s. Parallelization: 2.02x.
1 parent 2ca9ace commit ad45783

File tree

2 files changed

+154
-62
lines changed

2 files changed

+154
-62
lines changed

test/sse/test_sse.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,14 @@
2727
#define align1_double double
2828
#endif
2929

30+
// NOINLINE: function attribute that asks the compiler not to inline the function.
// Expands to the GCC-style attribute when __GNUC__ is defined, to the MSVC
// declspec when _MSC_VER is defined, and to nothing on any other compiler.
#ifdef __GNUC__
31+
#define NOINLINE __attribute__((noinline))
32+
#elif defined(_MSC_VER)
33+
#define NOINLINE __declspec(noinline)
34+
#else
35+
#define NOINLINE
36+
#endif
37+
3038
// Recasts floating point representation of f to an integer.
3139
// Returns the raw 32-bit representation of f.
// The value is read back through a union instead of dereferencing a cast
// pointer: accessing a float object through a uint32_t lvalue violates the
// strict aliasing rule and is undefined behavior. Union punning is defined in
// C and is a documented extension of GCC/Clang when compiled as C++.
uint32_t fcastu(float f) {
  union { float value; uint32_t bits; } pun;
  pun.value = f;
  return pun.bits;
}
3240
// Returns the raw 64-bit representation of f.
// The value is read back through a union instead of dereferencing a cast
// pointer: accessing a double object through a uint64_t lvalue violates the
// strict aliasing rule and is undefined behavior. Union punning is defined in
// C and is a documented extension of GCC/Clang when compiled as C++.
uint64_t dcastu(double f) {
  union { double value; uint64_t bits; } pun;
  pun.value = f;
  return pun.bits;
}

test/sse/test_sse4_1.cpp

Lines changed: 146 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -19,94 +19,178 @@ int numInterestingInts = sizeof(interesting_ints_)/sizeof(interesting_ints_[0]);
1919
// Pointer to the double test inputs returned by get_interesting_doubles(), and
// the element count of the array interesting_doubles_ (declared outside this view;
// presumably the backing storage for the same inputs - confirm in test_sse.h).
double *interesting_doubles = get_interesting_doubles();
2020
int numInterestingDoubles = sizeof(interesting_doubles_)/sizeof(interesting_doubles_[0]);
2121

22-
void test_round() {
23-
Ret_M128d(__m128d, _mm_ceil_pd);
24-
Ret_M128(__m128, _mm_ceil_ps);
25-
Ret_M128d_M128d(__m128d, _mm_ceil_sd);
26-
Ret_M128_M128(__m128, _mm_ceil_ss);
27-
Ret_M128d(__m128d, _mm_floor_pd);
28-
Ret_M128(__m128, _mm_floor_ps);
29-
Ret_M128d_M128d(__m128d, _mm_floor_sd);
30-
Ret_M128_M128(__m128, _mm_floor_ss);
31-
Ret_M128d_Tint(__m128d, _mm_round_pd);
32-
Ret_M128_Tint(__m128, _mm_round_ps);
33-
Ret_M128d_M128d_Tint(__m128d, _mm_round_sd);
34-
Ret_M128_M128_Tint(__m128, _mm_round_ss);
35-
}
36-
37-
int main() {
38-
assert(numInterestingFloats % 4 == 0);
39-
assert(numInterestingInts % 4 == 0);
40-
assert(numInterestingDoubles % 4 == 0);
22+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128d test macro on _mm_ceil_pd.
void NOINLINE test_ceil_pd()
{
  Ret_M128d(__m128d, _mm_ceil_pd);
}
23+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128 test macro on _mm_ceil_ps.
void NOINLINE test_ceil_ps()
{
  Ret_M128(__m128, _mm_ceil_ps);
}
24+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128d_M128d test macro on _mm_ceil_sd.
void NOINLINE test_ceil_sd()
{
  Ret_M128d_M128d(__m128d, _mm_ceil_sd);
}
25+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128_M128 test macro on _mm_ceil_ss.
void NOINLINE test_ceil_ss()
{
  Ret_M128_M128(__m128, _mm_ceil_ss);
}
26+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128d test macro on _mm_floor_pd.
void NOINLINE test_floor_pd()
{
  Ret_M128d(__m128d, _mm_floor_pd);
}
27+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128 test macro on _mm_floor_ps.
void NOINLINE test_floor_ps()
{
  Ret_M128(__m128, _mm_floor_ps);
}
28+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128d_M128d test macro on _mm_floor_sd.
void NOINLINE test_floor_sd()
{
  Ret_M128d_M128d(__m128d, _mm_floor_sd);
}
29+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128_M128 test macro on _mm_floor_ss.
void NOINLINE test_floor_ss()
{
  Ret_M128_M128(__m128, _mm_floor_ss);
}
30+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128d_Tint test macro on _mm_round_pd.
void NOINLINE test_round_pd()
{
  Ret_M128d_Tint(__m128d, _mm_round_pd);
}
31+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128_Tint test macro on _mm_round_ps.
void NOINLINE test_round_ps()
{
  Ret_M128_Tint(__m128, _mm_round_ps);
}
32+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128d_M128d_Tint test macro on _mm_round_sd.
void NOINLINE test_round_sd()
{
  Ret_M128d_M128d_Tint(__m128d, _mm_round_sd);
}
33+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128_M128_Tint test macro on _mm_round_ss.
void NOINLINE test_round_ss()
{
  Ret_M128_M128_Tint(__m128, _mm_round_ss);
}
34+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i_Tint test macro on _mm_blend_epi16.
void NOINLINE test_blend_epi16()
{
  Ret_M128i_M128i_Tint(__m128i, _mm_blend_epi16);
}
35+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128d_M128d_Tint test macro on _mm_blend_pd.
void NOINLINE test_blend_pd()
{
  Ret_M128d_M128d_Tint(__m128d, _mm_blend_pd);
}
36+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128_M128_Tint test macro on _mm_blend_ps.
void NOINLINE test_blend_ps()
{
  Ret_M128_M128_Tint(__m128, _mm_blend_ps);
}
37+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i_M128i test macro on _mm_blendv_epi8.
void NOINLINE test_blendv_epi8()
{
  Ret_M128i_M128i_M128i(__m128i, _mm_blendv_epi8);
}
38+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128d_M128d_M128d test macro on _mm_blendv_pd.
void NOINLINE test_blendv_pd()
{
  Ret_M128d_M128d_M128d(__m128d, _mm_blendv_pd);
}
39+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128_M128_M128 test macro on _mm_blendv_ps.
void NOINLINE test_blendv_ps()
{
  Ret_M128_M128_M128(__m128, _mm_blendv_ps);
}
40+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_cvtepi16_epi32.
void NOINLINE test_cvtepi16_epi32()
{
  Ret_M128i(__m128i, _mm_cvtepi16_epi32);
}
41+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_cvtepi16_epi64.
void NOINLINE test_cvtepi16_epi64()
{
  Ret_M128i(__m128i, _mm_cvtepi16_epi64);
}
42+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_cvtepi32_epi64.
void NOINLINE test_cvtepi32_epi64()
{
  Ret_M128i(__m128i, _mm_cvtepi32_epi64);
}
43+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_cvtepi8_epi16.
void NOINLINE test_cvtepi8_epi16()
{
  Ret_M128i(__m128i, _mm_cvtepi8_epi16);
}
44+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_cvtepi8_epi32.
void NOINLINE test_cvtepi8_epi32()
{
  Ret_M128i(__m128i, _mm_cvtepi8_epi32);
}
45+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_cvtepi8_epi64.
void NOINLINE test_cvtepi8_epi64()
{
  Ret_M128i(__m128i, _mm_cvtepi8_epi64);
}
46+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_cvtepu16_epi32.
void NOINLINE test_cvtepu16_epi32()
{
  Ret_M128i(__m128i, _mm_cvtepu16_epi32);
}
47+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_cvtepu16_epi64.
void NOINLINE test_cvtepu16_epi64()
{
  Ret_M128i(__m128i, _mm_cvtepu16_epi64);
}
48+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_cvtepu32_epi64.
void NOINLINE test_cvtepu32_epi64()
{
  Ret_M128i(__m128i, _mm_cvtepu32_epi64);
}
49+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_cvtepu8_epi16.
void NOINLINE test_cvtepu8_epi16()
{
  Ret_M128i(__m128i, _mm_cvtepu8_epi16);
}
50+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_cvtepu8_epi32.
void NOINLINE test_cvtepu8_epi32()
{
  Ret_M128i(__m128i, _mm_cvtepu8_epi32);
}
51+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_cvtepu8_epi64.
void NOINLINE test_cvtepu8_epi64()
{
  Ret_M128i(__m128i, _mm_cvtepu8_epi64);
}
52+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_Tint test macro on
// _mm_extract_epi32, with int as the result type argument.
void NOINLINE test_extract_epi32()
{
  Ret_M128i_Tint(int, _mm_extract_epi32);
}
53+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_Tint test macro on
// _mm_extract_epi64, with int64_t as the result type argument.
void NOINLINE test_extract_epi64()
{
  Ret_M128i_Tint(int64_t, _mm_extract_epi64);
}
54+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_Tint test macro on
// _mm_extract_epi8, with int as the result type argument.
void NOINLINE test_extract_epi8()
{
  Ret_M128i_Tint(int, _mm_extract_epi8);
}
55+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128_Tint test macro on
// _mm_extract_ps, with float as the result type argument.
void NOINLINE test_extract_ps()
{
  Ret_M128_Tint(float, _mm_extract_ps);
}
56+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_int_Tint test macro on _mm_insert_epi32.
void NOINLINE test_insert_epi32()
{
  Ret_M128i_int_Tint(__m128i, _mm_insert_epi32);
}
57+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_int_Tint test macro on _mm_insert_epi64.
void NOINLINE test_insert_epi64()
{
  Ret_M128i_int_Tint(__m128i, _mm_insert_epi64);
}
58+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128_M128_Tint test macro on _mm_insert_ps.
void NOINLINE test_insert_ps()
{
  Ret_M128_M128_Tint(__m128, _mm_insert_ps);
}
59+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i test macro on _mm_max_epi32.
void NOINLINE test_max_epi32()
{
  Ret_M128i_M128i(__m128i, _mm_max_epi32);
}
60+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i test macro on _mm_max_epi8.
void NOINLINE test_max_epi8()
{
  Ret_M128i_M128i(__m128i, _mm_max_epi8);
}
61+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i test macro on _mm_max_epu16.
void NOINLINE test_max_epu16()
{
  Ret_M128i_M128i(__m128i, _mm_max_epu16);
}
62+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i test macro on _mm_max_epu32.
void NOINLINE test_max_epu32()
{
  Ret_M128i_M128i(__m128i, _mm_max_epu32);
}
63+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i test macro on _mm_min_epi32.
void NOINLINE test_min_epi32()
{
  Ret_M128i_M128i(__m128i, _mm_min_epi32);
}
64+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i test macro on _mm_min_epi8.
void NOINLINE test_min_epi8()
{
  Ret_M128i_M128i(__m128i, _mm_min_epi8);
}
65+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i test macro on _mm_min_epu16.
void NOINLINE test_min_epu16()
{
  Ret_M128i_M128i(__m128i, _mm_min_epu16);
}
66+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i test macro on _mm_min_epu32.
void NOINLINE test_min_epu32()
{
  Ret_M128i_M128i(__m128i, _mm_min_epu32);
}
67+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i test macro on _mm_cmpeq_epi64.
// NOTE(review): the name carries a doubled "test_" prefix although the intrinsic
// has none; it is kept because main() calls the function by this name.
void NOINLINE test_test_cmpeq_epi64()
{
  Ret_M128i_M128i(__m128i, _mm_cmpeq_epi64);
}
68+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i test macro on _mm_minpos_epu16.
// NOTE(review): the name carries a doubled "test_" prefix although the intrinsic
// has none; it is kept because main() calls the function by this name.
void NOINLINE test_test_minpos_epu16()
{
  Ret_M128i(__m128i, _mm_minpos_epu16);
}
69+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i_Tint test macro on _mm_mpsadbw_epu8.
// NOTE(review): the name carries a doubled "test_" prefix although the intrinsic
// has none; it is kept because main() calls the function by this name.
void NOINLINE test_test_mpsadbw_epu8()
{
  Ret_M128i_M128i_Tint(__m128i, _mm_mpsadbw_epu8);
}
70+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i test macro on _mm_mul_epi32.
// NOTE(review): the name looks like a typo for test_mul_epi32; it is kept
// because main() calls the function by this name.
void NOINLINE test_testmul_epi32()
{
  Ret_M128i_M128i(__m128i, _mm_mul_epi32);
}
71+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i test macro on _mm_mullo_epi32.
// NOTE(review): the name carries a doubled "test_" prefix although the intrinsic
// has none; it is kept because main() calls the function by this name.
void NOINLINE test_test_mullo_epi32()
{
  Ret_M128i_M128i(__m128i, _mm_mullo_epi32);
}
72+
// Out-of-line (NOINLINE) wrapper that runs the Ret_M128i_M128i test macro on _mm_packus_epi32.
// NOTE(review): the name carries a doubled "test_" prefix although the intrinsic
// has none; it is kept because main() calls the function by this name.
void NOINLINE test_test_packus_epi32()
{
  Ret_M128i_M128i(__m128i, _mm_packus_epi32);
}
73+
// Out-of-line (NOINLINE) wrapper that runs the Ret_IntPtr test macro on
// _mm_stream_load_si128, passing __m128i* and the two trailing arguments 4, 4
// (their meaning is defined by the macro, which is not visible here).
// NOTE(review): the name carries a doubled "test_" prefix although the intrinsic
// has none; it is kept because main() calls the function by this name.
void NOINLINE test_test_stream_load_si128()
{
  Ret_IntPtr(__m128i, _mm_stream_load_si128, __m128i*, 4, 4);
}
4174

42-
test_round();
43-
44-
Ret_M128i_M128i_Tint(__m128i, _mm_blend_epi16);
45-
Ret_M128d_M128d_Tint(__m128d, _mm_blend_pd);
46-
Ret_M128_M128_Tint(__m128, _mm_blend_ps);
47-
Ret_M128i_M128i_M128i(__m128i, _mm_blendv_epi8);
48-
Ret_M128d_M128d_M128d(__m128d, _mm_blendv_pd);
49-
Ret_M128_M128_M128(__m128, _mm_blendv_ps);
50-
Ret_M128i_M128i(__m128i, _mm_cmpeq_epi64);
51-
Ret_M128i(__m128i, _mm_cvtepi16_epi32);
52-
Ret_M128i(__m128i, _mm_cvtepi16_epi64);
53-
Ret_M128i(__m128i, _mm_cvtepi32_epi64);
54-
Ret_M128i(__m128i, _mm_cvtepi8_epi16);
55-
Ret_M128i(__m128i, _mm_cvtepi8_epi32);
56-
Ret_M128i(__m128i, _mm_cvtepi8_epi64);
57-
Ret_M128i(__m128i, _mm_cvtepu16_epi32);
58-
Ret_M128i(__m128i, _mm_cvtepu16_epi64);
59-
Ret_M128i(__m128i, _mm_cvtepu32_epi64);
60-
Ret_M128i(__m128i, _mm_cvtepu8_epi16);
61-
Ret_M128i(__m128i, _mm_cvtepu8_epi32);
62-
Ret_M128i(__m128i, _mm_cvtepu8_epi64);
75+
// Runs the Ret_M128d_M128d_Tint test macro on _mm_dp_pd with the global
// testNaNBits forced to false, then restores the value it had on entry.
void NOINLINE test_dp_pd() {
76+
bool oldTestNaNBits = testNaNBits;
6377
testNaNBits = false;
6478
Ret_M128d_M128d_Tint(__m128d, _mm_dp_pd);
79+
testNaNBits = oldTestNaNBits;
80+
}
81+
82+
// Runs the Ret_M128_M128_Tint test macro on _mm_dp_ps with the global
// testNaNBits forced to false, then restores the value it had on entry.
// The lines whose gutter number ends in "-" below are the removed side of the
// diff (the old inline body of main()), not part of this function.
void NOINLINE test_dp_ps() {
83+
bool oldTestNaNBits = testNaNBits;
84+
testNaNBits = false;
6585
Ret_M128_M128_Tint(__m128, _mm_dp_ps); // _mm_dp_ps emulation does not match NaN bit selection rules (seems to be unspecified)
66-
testNaNBits = true;
67-
Ret_M128i_Tint(int, _mm_extract_epi32);
68-
Ret_M128i_Tint(int64_t, _mm_extract_epi64);
69-
Ret_M128i_Tint(int, _mm_extract_epi8);
70-
Ret_M128_Tint(float, _mm_extract_ps);
71-
Ret_M128i_int_Tint(__m128i, _mm_insert_epi32);
72-
Ret_M128i_int_Tint(__m128i, _mm_insert_epi64);
73-
Ret_M128_M128_Tint(__m128, _mm_insert_ps);
74-
Ret_M128i_M128i(__m128i, _mm_max_epi32);
75-
Ret_M128i_M128i(__m128i, _mm_max_epi8);
76-
Ret_M128i_M128i(__m128i, _mm_max_epu16);
77-
Ret_M128i_M128i(__m128i, _mm_max_epu32);
78-
Ret_M128i_M128i(__m128i, _mm_min_epi32);
79-
Ret_M128i_M128i(__m128i, _mm_min_epi8);
80-
Ret_M128i_M128i(__m128i, _mm_min_epu16);
81-
Ret_M128i_M128i(__m128i, _mm_min_epu32);
82-
Ret_M128i(__m128i, _mm_minpos_epu16);
83-
Ret_M128i_M128i_Tint(__m128i, _mm_mpsadbw_epu8);
84-
Ret_M128i_M128i(__m128i, _mm_mul_epi32);
85-
Ret_M128i_M128i(__m128i, _mm_mullo_epi32);
86-
Ret_M128i_M128i(__m128i, _mm_packus_epi32);
87-
Ret_IntPtr(__m128i, _mm_stream_load_si128, __m128i*, 4, 4);
86+
testNaNBits = oldTestNaNBits;
87+
}
88+
89+
// Runs the Ret_M128i test macro on _mm_test_all_ones, then prints its result for
// three fixed inputs built with _mm_set1_epi64x: all bits set, lowest bit clear, and zero.
void NOINLINE test_test_all_ones() {
8890
Ret_M128i(int, _mm_test_all_ones);
8991
printf("_mm_test_all_ones(0xFFFFFFFFFFFFFFFFull): %d\n", _mm_test_all_ones(_mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
9092
printf("_mm_test_all_ones(0xFFFFFFFFFFFFFFFEull): %d\n", _mm_test_all_ones(_mm_set1_epi64x(0xFFFFFFFFFFFFFFFEull)));
9193
printf("_mm_test_all_ones(0): %d\n", _mm_test_all_ones(_mm_set1_epi64x(0)));
94+
}
95+
96+
// Runs the Ret_M128i_M128i test macro on _mm_test_all_zeros, then prints its
// result for three fixed first operands (all bits set, lowest bit clear, zero),
// each paired with an all-bits-set second operand.
void NOINLINE test_test_all_zeros() {
9297
Ret_M128i_M128i(int, _mm_test_all_zeros);
9398
printf("_mm_test_all_zeros(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_test_all_zeros(_mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
9499
printf("_mm_test_all_zeros(0xFFFFFFFFFFFFFFFEull, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_test_all_zeros(_mm_set1_epi64x(0xFFFFFFFFFFFFFFFEull), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
95100
printf("_mm_test_all_zeros(0, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_test_all_zeros(_mm_set1_epi64x(0), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
101+
}
102+
103+
// Runs the Ret_M128i_M128i test macro on _mm_test_mix_ones_zeros, then prints
// its result for three fixed first operands (all bits set, lowest bit clear,
// zero), each paired with an all-bits-set second operand.
void NOINLINE test_test_mix_ones_zeros() {
96104
Ret_M128i_M128i(int, _mm_test_mix_ones_zeros);
97105
printf("_mm_test_mix_ones_zeros(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_test_mix_ones_zeros(_mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
98106
printf("_mm_test_mix_ones_zeros(0xFFFFFFFFFFFFFFFEull, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_test_mix_ones_zeros(_mm_set1_epi64x(0xFFFFFFFFFFFFFFFEull), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
99107
printf("_mm_test_mix_ones_zeros(0, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_test_mix_ones_zeros(_mm_set1_epi64x(0), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
108+
}
109+
110+
// Runs the Ret_M128i_M128i test macro on _mm_testc_si128, then prints its result
// for three fixed first operands (all bits set, lowest bit clear, zero), each
// paired with an all-bits-set second operand.
void NOINLINE test_testc() {
100111
Ret_M128i_M128i(int, _mm_testc_si128);
101112
printf("_mm_testc_si128(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_testc_si128(_mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
102113
printf("_mm_testc_si128(0xFFFFFFFFFFFFFFFEull, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_testc_si128(_mm_set1_epi64x(0xFFFFFFFFFFFFFFFEull), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
103114
printf("_mm_testc_si128(0, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_testc_si128(_mm_set1_epi64x(0), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
115+
}
116+
117+
// Runs the Ret_M128i_M128i test macro on _mm_testnzc_si128, then prints its
// result for three fixed first operands (all bits set, lowest bit clear, zero),
// each paired with an all-bits-set second operand.
void NOINLINE test_testnzc() {
104118
Ret_M128i_M128i(int, _mm_testnzc_si128);
105119
printf("_mm_testnzc_si128(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_testnzc_si128(_mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
106120
printf("_mm_testnzc_si128(0xFFFFFFFFFFFFFFFEull, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_testnzc_si128(_mm_set1_epi64x(0xFFFFFFFFFFFFFFFEull), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
107121
printf("_mm_testnzc_si128(0, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_testnzc_si128(_mm_set1_epi64x(0), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
122+
}
123+
124+
// Runs the Ret_M128i_M128i test macro on _mm_testz_si128, then prints its result
// for three fixed first operands (all bits set, lowest bit clear, zero), each
// paired with an all-bits-set second operand.
void NOINLINE test_testz() {
108125
Ret_M128i_M128i(int, _mm_testz_si128);
109126
printf("_mm_testz_si128(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_testz_si128(_mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
110127
printf("_mm_testz_si128(0xFFFFFFFFFFFFFFFEull, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_testz_si128(_mm_set1_epi64x(0xFFFFFFFFFFFFFFFEull), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
111128
printf("_mm_testz_si128(0, 0xFFFFFFFFFFFFFFFFull): %d\n", _mm_testz_si128(_mm_set1_epi64x(0), _mm_set1_epi64x(0xFFFFFFFFFFFFFFFFull)));
112129
}
130+
131+
// Test driver: asserts that each "interesting" input count is a multiple of 4,
// then calls every per-intrinsic test function once, in a fixed order.
int main() {
132+
assert(numInterestingFloats % 4 == 0);
133+
assert(numInterestingInts % 4 == 0);
134+
assert(numInterestingDoubles % 4 == 0);
135+
136+
test_ceil_pd();
137+
test_ceil_ps();
138+
test_ceil_sd();
139+
test_ceil_ss();
140+
test_floor_pd();
141+
test_floor_ps();
142+
test_floor_sd();
143+
test_floor_ss();
144+
test_round_pd();
145+
test_round_ps();
146+
test_round_sd();
147+
test_round_ss();
148+
test_blend_epi16();
149+
test_blend_pd();
150+
test_blend_ps();
151+
test_blendv_epi8();
152+
test_blendv_pd();
153+
test_blendv_ps();
154+
test_cvtepi16_epi32();
155+
test_cvtepi16_epi64();
156+
test_cvtepi32_epi64();
157+
test_cvtepi8_epi16();
158+
test_cvtepi8_epi32();
159+
test_cvtepi8_epi64();
160+
test_cvtepu16_epi32();
161+
test_cvtepu16_epi64();
162+
test_cvtepu32_epi64();
163+
test_cvtepu8_epi16();
164+
test_cvtepu8_epi32();
165+
test_cvtepu8_epi64();
166+
test_extract_epi32();
167+
test_extract_epi64();
168+
test_extract_epi8();
169+
test_extract_ps();
170+
test_insert_epi32();
171+
test_insert_epi64();
172+
test_insert_ps();
173+
test_max_epi32();
174+
test_max_epi8();
175+
test_max_epu16();
176+
test_max_epu32();
177+
test_min_epi32();
178+
test_min_epi8();
179+
test_min_epu16();
180+
test_min_epu32();
181+
test_test_cmpeq_epi64();
182+
test_test_minpos_epu16();
183+
test_test_mpsadbw_epu8();
184+
test_testmul_epi32();
185+
test_test_mullo_epi32();
186+
test_test_packus_epi32();
187+
test_test_stream_load_si128();
188+
test_dp_pd();
189+
test_dp_ps();
190+
test_test_all_ones();
191+
test_test_all_zeros();
192+
test_test_mix_ones_zeros();
193+
test_testc();
194+
test_testnzc();
195+
test_testz();
196+
}

0 commit comments

Comments
 (0)