|
2 | 2 | # Distributed under the terms of the GNU General Public License v2
|
3 | 3 |
|
4 | 4 | # notes about optimization
|
5 |
| - # CXXFLAGS="-O3" recommended |
| 5 | + # CXXFLAGS="-march=native -O3" recommended |
6 | 6 | # LTO recommended with clang, but hit or miss with gcc
|
7 | 7 |
|
8 | 8 | # USE=pgo implements traditional compile => train => recompile
|
9 | 9 | # trains on static data from an actual cmdock boinc job
|
10 |
| - # env PGO_TIMEOUT=2h to change training time limit |
| 10 | + # env PGO_TIMEOUT=2h to train longer but it does not help much |
| 11 | + # training data needs to be updated whenever sidock switches target disease |
11 | 12 |
|
12 | 13 | # perfdata-sample implements live sampling PGO
|
13 | 14 | # see https://clang.llvm.org/docs/UsersManual.html#using-sampling-profilers
|
14 | 15 | # clang only - gcc tooling is not really usable
|
15 | 16 | # may require special CPU features for branch sampling
|
16 | 17 | # traditional pgo builds can be sampled, but both profile types cannot be applied to the same build
|
17 | 18 | # can be repeated indefinitely, as any build with debug symbols can be sampled
|
18 |
| - # adds about 10% runtime sample conversion overhead (todo: reduce) |
| 19 | + # adds about 20% runtime sample conversion overhead (todo: reduce) |
| 20 | + # no noticeable overhead unless perfdata is actually sampling |
| 21 | + # todo: might get better results when gathering and applying samples on same CPU |
| 22 | + |
| 23 | + # top performers from tests on bdver2 - rough comparison with official project binaries |
| 24 | + # 1. 11% faster - gcc-13.3.1 USE="-clang pgo" CXXFLAGS="-march=native -O3 -flto -fno-profile-partial-training" |
| 25 | + # 2. 06% faster - gcc-13.3.1 USE="-clang -pgo" CXXFLAGS="-march=native -O3" |
| 26 | + # 3. 02% faster - clang-18.1.8 USE="clang perfdata-sample-use" CXXFLAGS="-march=native -O3 -flto -fno-profile-sample-accurate -fno-sample-profile-use-profi" with samples from skylake |
19 | 27 | #
|
20 | 28 |
|
21 | 29 | EAPI=8
|
@@ -161,7 +169,9 @@ src_configure() {
|
161 | 169 |
|
162 | 170 | if use pgo || use perfdata-sample-use; then
|
163 | 171 | # do not assume all code paths are exercised during pgo training
|
164 |
| - tc-is-clang && prepend-flags '-fno-profile-sample-accurate' || prepend-flags '-fprofile-partial-training' |
| 172 | + # without this flag, unused paths are optimized for size rather than speed |
| 173 | + # gcc has a similar flag, -fprofile-partial-training, but it hurts slightly rather than helping slightly |
| 174 | + tc-is-clang && prepend-flags '-fno-profile-sample-accurate' |
165 | 175 | fi
|
166 | 176 |
|
167 | 177 | if use perfdata-sample-gen; then
|
|
0 commit comments