Skip to content
This repository was archived by the owner on Jul 16, 2024. It is now read-only.

Commit 5cd28bc

Browse files
author
Mark Ryan
authored
Merge pull request #9 from intel/markdryan/jan-2023
Add AMX, FP16 and other examples
2 parents 61cdceb + a388c3c commit 5cd28bc

File tree

215 files changed

+8117
-89
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

215 files changed

+8117
-89
lines changed

.github/workflows/main.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
# This workflow contains a single job called "build"
1919
build:
2020
# The type of runner that the job will run on
21-
runs-on: ubuntu-20.04
21+
runs-on: ubuntu-22.04
2222

2323
# Steps represent a sequence of tasks that will be executed as part of the job
2424
steps:
@@ -41,4 +41,4 @@ jobs:
4141
mkdir clang-build
4242
cd clang-build
4343
CC=clang CXX=clang++ cmake -DENABLE_WERROR=ON ..
44-
make -j
44+
make -j 4

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ build
55
checkpatch.pl
66
const_structs.checkpatch
77
spelling.txt
8+
**/optimisation.tar

CMakeLists.txt

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
cmake_minimum_required (VERSION 3.16.3)
2+
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
3+
cmake_policy(SET CMP0135 NEW)
4+
endif()
25
project(optimization C CXX ASM)
36

7+
include(CheckCXXCompilerFlag)
8+
49
find_package(benchmark QUIET)
510

611
if (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
@@ -29,6 +34,7 @@ include_directories("common")
2934

3035
if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang)
3136
add_compile_options(-Wall -Winline -pedantic -march=haswell)
37+
add_compile_options($<$<COMPILE_LANGUAGE:ASM>:-x$<SEMICOLON>assembler-with-cpp>)
3238
endif()
3339

3440
if (CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang)
@@ -42,6 +48,8 @@ endif()
4248
enable_testing()
4349
add_subdirectory(common)
4450
add_subdirectory(chap5/ex15)
51+
add_subdirectory(chap7/ex3)
52+
add_subdirectory(chap7/ex4)
4553
add_subdirectory(chap8/ex1)
4654
add_subdirectory(chap8/ex2)
4755
add_subdirectory(chap8/ex4)
@@ -91,6 +99,7 @@ add_subdirectory(chap18/ex1)
9199
add_subdirectory(chap18/ex2)
92100
add_subdirectory(chap18/ex3)
93101
add_subdirectory(chap18/ex4)
102+
add_subdirectory(chap18/ex5)
94103
add_subdirectory(chap18/ex6)
95104
add_subdirectory(chap18/ex7)
96105
add_subdirectory(chap18/ex8)
@@ -121,3 +130,27 @@ add_subdirectory(chap18/ex32)
121130
add_subdirectory(chap18/ex33)
122131
add_subdirectory(chap18/ex34)
123132
add_subdirectory(chap18/ex35)
133+
add_subdirectory(chap19/ex1)
134+
add_subdirectory(chap19/ex2)
135+
add_subdirectory(chap19/ex3)
136+
add_subdirectory(chap19/ex4)
137+
add_subdirectory(chap19/ex5)
138+
add_subdirectory(chap20/ex4)
139+
add_subdirectory(chap20/ex5)
140+
add_subdirectory(chap20/ex6)
141+
add_subdirectory(chap20/ex7)
142+
add_subdirectory(chap20/ex8)
143+
add_subdirectory(chap20/ex10)
144+
add_subdirectory(chap20/ex14)
145+
add_subdirectory(chap20/ex16)
146+
add_subdirectory(chap20/ex17)
147+
add_subdirectory(chap20/ex18)
148+
add_subdirectory(chap20/ex19)
149+
add_subdirectory(chap20/ex20)
150+
add_subdirectory(chap20/ex21)
151+
add_subdirectory(chap20/ex22)
152+
add_subdirectory(chap20/ex23)
153+
add_subdirectory(chap20/ex24)
154+
add_subdirectory(chap20/ex25)
155+
add_subdirectory(chap20/ex27)
156+

CONTRIBUTORS.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,6 @@
33
44
55
6+
7+
8+

README.md

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ Intel Optimization Manual available here
66
is provided for GCC, Clang and MSVC, using the Intel syntax. Unit tests are
77
also provided for each of the samples.
88

9-
## Building on Linux
9+
## Building on Linux and macOS
1010

1111
To run the unit tests
1212

@@ -16,7 +16,13 @@ To run the unit tests
1616
4. cmake ..
1717
5. make && make test
1818

19-
GCC 8.1 or higher is required to build the unit tests. The unit tests are
19+
GCC 8.1 (or clang 12 on macOS) or higher is required to build the unit tests. However,
20+
many of the newer examples, e.g., those that use AMX or AVX-512 FP16 instructions, require newer
21+
versions of the compilers to build: GCC 12 or clang 14. No errors will be reported
22+
when building, but examples built with toolchains that do not support the instructions
23+
that they test will simply report an error when run and exit.
24+
25+
The unit tests are
2026
compiled with -march=haswell and so a fourth-generation Intel® Core™ (Haswell)
2127
CPU or later is required to run them. Tests that execute instructions not present
2228
on fourth-generation Intel® Core™ (Haswell) will be
@@ -34,13 +40,13 @@ The code samples can also be compiled with clang:
3440
## Building on Windows
3541

3642
To run the tests on a Windows machine:
37-
Dependency- Visual Studio 2019
43+
Dependency- Visual Studio 2022
3844

3945
1. go to optimization repo on your local machine.
4046
2. mkdir bld
4147
3. cd bld
4248
4. (inside x64 Native tools command prompt)
43-
"cmake -G "Visual Studio 16 2019" .." => this will generate visual studio solution files.
49+
"cmake -G "Visual Studio 17 2022" .." => this will generate visual studio solution files.
4450
open optimization.sln file using visual studio.
4551
5. To Build- build "ALL_BUILD" project
4652
6. To Run tests- build "RUN_TESTS" project.
@@ -52,6 +58,10 @@ built using [Google's Benchmark project](https://github.com/google/benchmark).
5258
If Benchmark is installed and discoverable by CMake, the benchmarks for the code
5359
samples will be automatically built when you type make.
5460

61+
On Windows, ensure you build the benchmark code with the same build type
62+
(Release/Debug) as Google's Benchmark to prevent debug level mismatch errors
63+
while linking.
64+
5565
## CPU Requirements
5666

5767
The code samples assume that they are being run on a fourth-generation Intel® Core™ (Haswell) processor

chap15/ex27/rsqrtps_newt_sse.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,11 @@ loop1:
5555
pop rbx
5656
ret
5757

58-
.data
58+
#ifdef __APPLE__
59+
.section __TEXT,__const
60+
#else
61+
.section .rodata
62+
#endif
5963
.p2align 4
6064

6165
minus_half:

chap15/ex27/vrsqrtps_newt_avx.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,11 @@ loop1:
5757
pop rbx
5858
ret
5959

60-
.data
60+
#ifdef __APPLE__
61+
.section __TEXT,__const
62+
#else
63+
.section .rodata
64+
#endif
6165
.p2align 5
6266

6367
half:

chap15/ex30/sqrt_rsqrtps_taylor_sse.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,11 @@ loop1:
6161
pop rbx
6262
ret
6363

64-
.data
64+
#ifdef __APPLE__
65+
.section __TEXT,__const
66+
#else
67+
.section .rodata
68+
#endif
6569
.p2align 4
6670

6771
minus_half:

chap15/ex30/sqrt_vrsqrtps_taylor_avx.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,11 @@ loop1:
5959
pop rbx
6060
ret
6161

62-
.data
62+
#ifdef __APPLE__
63+
.section __TEXT,__const
64+
#else
65+
.section .rodata
66+
#endif
6367
.p2align 5
6468

6569
minus_half:

chap15/ex46/avx2_vpgatherd.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,11 @@ loop:
6161
vzeroupper
6262
ret
6363

64-
.data
64+
#ifdef __APPLE__
65+
.section __TEXT,__const
66+
#else
67+
.section .rodata
68+
#endif
6569
.p2align 5
6670

6771
real_offset:

chap18/ex10/avx2_compress.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,11 @@ mainloop:
7272

7373
ret
7474

75-
.data
75+
#ifdef __APPLE__
76+
.section __TEXT,__const
77+
#else
78+
.section .rodata
79+
#endif
7680
.p2align 5
7781

7882
shuffle_LUT:

chap18/ex10/avx_compress.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,11 @@ mainloop:
7373

7474
ret
7575

76-
.data
76+
#ifdef __APPLE__
77+
.section __TEXT,__const
78+
#else
79+
.section .rodata
80+
#endif
7781
.p2align 4
7882

7983
shuffle_LUT:

chap18/ex11/expand_avx2.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,11 @@ mainloop:
6565
vzeroupper
6666
ret
6767

68-
.data
68+
#ifdef __APPLE__
69+
.section __TEXT,__const
70+
#else
71+
.section .rodata
72+
#endif
6973
.p2align 5
7074

7175
shuf2:

chap18/ex13/transpose_avx512.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,11 @@ matrix_loop:
5151
vzeroupper
5252
ret
5353

54-
.data
54+
#ifdef __APPLE__
55+
.section __TEXT,__const
56+
#else
57+
.section .rodata
58+
#endif
5559
.p2align 6
5660

5761
permMaskBuffer:

chap18/ex17/software_scatter.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,11 @@ mainloop:
100100
pop rbx
101101
ret
102102

103-
.data
103+
#ifdef __APPLE__
104+
.section __TEXT,__const
105+
#else
106+
.section .rodata
107+
#endif
104108
.p2align 5
105109

106110
shufMaskP:

chap18/ex18/qword_avx2_intrinsics.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ void qword_avx2_intrinsics(const int64_t *a, const int64_t *b, int64_t *c,
2424

2525
for (int i = 0; i < N; i += 32) {
2626
__m256i aa, bb, aah, bbh, mul, sum;
27-
//#pragma unroll(8)
27+
// #pragma unroll(8)
2828
for (int j = 0; j < 8; j++) {
2929
aa = _mm256_loadu_si256(
3030
(const __m256i *)(a + i + 4 * j));

chap18/ex20/avx512_vector_dp.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,11 @@ end:
165165

166166
ret
167167

168-
.data
168+
#ifdef __APPLE__
169+
.section __TEXT,__const
170+
#else
171+
.section .rodata
172+
#endif
169173
.p2align 6
170174
all_31s:
171175
.quad 0x0000001f0000001f

chap18/ex23/decompress_vbmi.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,11 @@ loop:
5151
vzeroupper
5252
ret
5353

54-
.data
54+
#ifdef __APPLE__
55+
.section __TEXT,__const
56+
#else
57+
.section .rodata
58+
#endif
5559
.p2align 6
5660

5761
permute_ctrl:

chap18/ex25/fma_only_tpt.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,11 @@ loop1:
6060

6161
ret
6262

63-
.data
63+
#ifdef __APPLE__
64+
.section __TEXT,__const
65+
#else
66+
.section .rodata
67+
#endif
6468
.p2align 6
6569
one_vec:
6670
.double 1, 1, 1, 1, 1, 1, 1, 1

chap18/ex25/fma_shuffle_tpt.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,11 @@ loop1:
8585

8686
ret
8787

88-
.data
88+
#ifdef __APPLE__
89+
.section __TEXT,__const
90+
#else
91+
.section .rodata
92+
#endif
8993
.p2align 6
9094
one_vec:
9195
.double 1, 1, 1, 1, 1, 1, 1, 1

chap18/ex26/g2s_vpermi2d.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,11 @@ loop:
6262

6363
ret
6464

65-
.data
65+
#ifdef __APPLE__
66+
.section __TEXT,__const
67+
#else
68+
.section .rodata
69+
#endif
6670
.p2align 6
6771

6872
gather_imag_index:

chap18/ex26/g2s_vpgatherdd.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,11 @@ loop:
7676

7777
ret
7878

79-
.data
79+
#ifdef __APPLE__
80+
.section __TEXT,__const
81+
#else
82+
.section .rodata
83+
#endif
8084
.p2align 6
8185

8286
gather_imag_index:

chap18/ex27/s2s_vpermi2d.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,11 @@ loop:
5858

5959
ret
6060

61-
.data
61+
#ifdef __APPLE__
62+
.section __TEXT,__const
63+
#else
64+
.section .rodata
65+
#endif
6266
.p2align 6
6367

6468
first_half:

chap18/ex27/s2s_vscatterdps.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,11 @@ loop:
5858

5959
ret
6060

61-
.data
61+
#ifdef __APPLE__
62+
.section __TEXT,__const
63+
#else
64+
.section .rodata
65+
#endif
6266
.p2align 6
6367

6468
gather_imag_index:

chap18/ex28/adj_vpgatherpd.s

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,11 @@ loop:
6666

6767
ret
6868

69-
.data
69+
#ifdef __APPLE__
70+
.section __TEXT,__const
71+
#else
72+
.section .rodata
73+
#endif
7074
.p2align 5
7175

7276
index_inc:

0 commit comments

Comments
 (0)