Add Intrinsics Code Sample (oneapi-src#17)

ethanhirsch · web-flow · commit f3c242844958 · 2020-07-09T10:25:59.000-07:00
* added source files for intrinsics sample

Signed-off-by: Ethan Hirsch &lt;ethan.hirsch@intel.com&gt;

* added sample.json

Signed-off-by: Ethan Hirsch &lt;ethan.hirsch@intel.com&gt;

* updated sample.json to comply with guidelines

Signed-off-by: Ethan Hirsch &lt;ethan.hirsch@intel.com&gt;

* added implementation deets to readme

Signed-off-by: Ethan Hirsch &lt;ethan.hirsch@intel.com&gt;

* added sample output to readme

Signed-off-by: Ethan Hirsch &lt;ethan.hirsch@intel.com&gt;

* removed old readme

Signed-off-by: Ethan Hirsch &lt;ethan.hirsch@intel.com&gt;

* renamed .c files to .cpp
includes renaming all file references

Signed-off-by: Ethan Hirsch &lt;ethan.hirsch@intel.com&gt;

* added debug ci config and documentation

Signed-off-by: Ethan Hirsch &lt;ethan.hirsch@intel.com&gt;

* styled src files with clang-format

Signed-off-by: Ethan Hirsch &lt;ethan.hirsch@intel.com&gt;

* update license.txt

Signed-off-by: Ethan Hirsch &lt;ethan.hirsch@intel.com&gt;
diff --git a/DirectProgramming/C++/Intrinsics/Makefile b/DirectProgramming/C++/Intrinsics/Makefile
@@ -0,0 +1,52 @@
+#==============================================================
+#
+# SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
+# http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
+#
+# Copyright Intel Corporation
+#
+# THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
+# NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+# PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
+#
+# =============================================================
+# Compiler driver used both to compile and to link every sample.
+CC = icc
+# Optimized executables and their unoptimized, debug-symbol counterparts.
+EXECS=intrin_dot_sample.exe intrin_double_sample.exe intrin_ftz_sample.exe
+DBG_EXECS=intrin_dot_sample_dbg.exe intrin_double_sample_dbg.exe intrin_ftz_sample_dbg.exe
+
+# These targets name actions rather than files. Declaring them phony keeps a
+# stray file called e.g. "clean" or "run" from making them look up to date.
+.PHONY: release debug run debug_run clean
+
+# Default goal: build the optimized executables.
+release: $(EXECS)
+
+debug: $(DBG_EXECS)
+
+# Build (if needed) and then execute each release sample in turn.
+run: release
+	@for i in $(EXECS); do ./$$i; done
+
+# Build (if needed) and then execute each debug sample in turn.
+debug_run: debug
+	@for i in $(DBG_EXECS); do ./$$i; done
+
+intrin_dot_sample.exe: intrin_dot_sample.o
+	$(CC) -O2 $^ -o $@
+
+intrin_double_sample.exe: intrin_double_sample.o
+	$(CC) -O2 $^ -o $@
+
+intrin_ftz_sample.exe: intrin_ftz_sample.o
+	$(CC) -O2 $^ -o $@
+
+intrin_dot_sample_dbg.exe: intrin_dot_sample_dbg.o
+	$(CC) -O0 -g $^ -o $@
+
+intrin_double_sample_dbg.exe: intrin_double_sample_dbg.o
+	$(CC) -O0 -g $^ -o $@
+
+intrin_ftz_sample_dbg.exe: intrin_ftz_sample_dbg.o
+	$(CC) -O0 -g $^ -o $@
+
+# Release objects: optimized compile of the matching file under src/.
+%.o: src/%.cpp
+	$(CC) -O2 -c -o $@  $<
+
+# Debug objects: same source file, no optimization, with debug symbols.
+%_dbg.o: src/%.cpp
+	$(CC) -O0 -g -c -o $@  $<
+
+# Remove core dumps, object files and executables.
+clean:
+	/bin/rm -f core.* *.o *.exe
diff --git a/DirectProgramming/C++/Intrinsics/README.md b/DirectProgramming/C++/Intrinsics/README.md
@@ -0,0 +1,76 @@
+# `Intrinsics` Sample
+
+The intrinsic samples are designed to show how to utilize the intrinsics supported by the Intel&reg; C++ compiler in a variety of applications. The src folder contains three .cpp source files, each demonstrating different functionality of the intrinsics, including vector operations, complex number computations, and FTZ/DAZ flags.
+
+| Optimized for                     | Description
+|:---                               |:---
+| OS                                | Linux* Ubuntu* 18.04; MacOS* Catalina* or newer
+| Hardware                          | Skylake with GEN9 or newer
+| Software                          | Intel&reg; C++ Compiler 2021.1 or newer;
+| What you will learn               | How to utilize intrinsics supported by the Intel&reg; C++ Compiler
+| Time to complete                  | 15 minutes
+
+
+## Purpose
+
+Intrinsics are assembly-coded functions that allow you to use C++ function calls and variables in place of assembly instructions. Intrinsics are expanded inline, eliminating function call overhead. While providing the same benefits as using inline assembly, intrinsics improve code readability, assist instruction scheduling, and help when debugging. They provide access to instructions that cannot be generated using the standard constructs of the C and C++ languages, and allow code to leverage performance enhancing features unique to specific processors.
+
+Further information on intrinsics can be found here: https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics.html#intrinsics_GUID-D70F9A9A-BAE1-4242-963E-C3A12DE296A1
+
+## Key Implementation Details 
+
+This sample makes use of intrinsic functions to perform common mathematical operations including:
+- Computing a dot product of two vectors
+- Computing the product of two complex numbers
+
+The implementations include multiple functions to accomplish these tasks, each one leveraging a different set of intrinsics available on Intel&reg; processors.
+
+ 
+## License  
+
+This code sample is licensed under MIT license. 
+
+
+## Building the `Intrinsics` Sample
+
+Perform the following steps:
+1. Build the program using the following `make` command:
+    ```
+    make (or "make debug" to compile with the -g flag)
+    ```
+
+2. Run the program:
+    ```
+    make run (or "make debug_run" to run the debug version)
+    ```
+
+3. Clean the program using:
+    ```
+    make clean
+    ```
+
+
+### Application Parameters 
+
+These intrinsics samples have relatively few modifiable parameters. However, certain options are available to the user:
+
+1. intrin_dot_sample: Line 35 defines the size of the vectors used in the dot product computation.
+
+2. intrin_double_sample: Lines 244-247 define the values of the two complex numbers used in the computation.
+
+3. intrin_ftz_sample: This sample has no modifiable parameters.
+
+
+```
+Dot Product computed by C:  4324.000000
+Dot Product computed by Intel(R) SSE3 intrinsics:  4324.000000
+Dot Product computed by Intel(R) AVX2 intrinsics:  4324.000000
+Dot Product computed by Intel(R) AVX intrinsics:  4324.000000
+Dot Product computed by Intel(R) MMX(TM) intrinsics:  4324
+Complex Product(C):             23.00+ -2.00i
+Complex Product(Intel(R) AVX2): 23.00+ -2.00i
+Complex Product(Intel(R) AVX):  23.00+ -2.00i
+Complex Product(Intel(R) SSE3): 23.00+ -2.00i
+Complex Product(Intel(R) SSE2): 23.00+ -2.00i
+FTZ is set.
+DAZ is set.
+```
diff --git a/DirectProgramming/C++/Intrinsics/license.txt b/DirectProgramming/C++/Intrinsics/license.txt
@@ -0,0 +1,8 @@
+Copyright 2020 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
diff --git a/DirectProgramming/C++/Intrinsics/sample.json b/DirectProgramming/C++/Intrinsics/sample.json
@@ -0,0 +1,20 @@
+{
+    "name": "Intrinsics C++",
+    "description": "Demonstrates the intrinsic functions of the Intel® C++ Compiler",
+    "categories": ["Toolkit/Intel® oneAPI HPC Toolkit"],
+    "os": ["linux", "darwin"],
+    "builder": ["cmake"],
+    "languages": [{"cpp":{}}],
+    "toolchain": ["icc"],
+    "guid": "ACD0E89E-67CC-4CB4-87AB-B12B84962EAF",
+    "ciTests": {
+        "linux": [
+            { "id": "standard", "steps": [ "make", "make run", "make clean" ] },
+            { "id": "debug", "steps": [ "make debug", "make debug_run", "make clean" ] }
+        ],
+        "darwin": [ 
+            { "id": "standard", "steps": [ "make", "make run", "make clean" ] },
+            { "id": "debug", "steps": [ "make debug", "make debug_run", "make clean" ] }
+        ]
+    }
+}
diff --git a/DirectProgramming/C++/Intrinsics/src/intrin_dot_sample.cpp b/DirectProgramming/C++/Intrinsics/src/intrin_dot_sample.cpp
@@ -0,0 +1,236 @@
+//==============================================================
+//
+// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
+// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
+//
+// Copyright 2016 Intel Corporation
+//
+// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS
+// FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
+//
+// =============================================================
+/* [DESCRIPTION]
+ * This C code sample demonstrates how to use C, Intel(R) MMX(TM),
+ * Intel(R) Streaming SIMD Extensions 3 (Intel(R) SSE3),
+ * Intel(R) Advanced Vector Extensions (Intel(R) AVX), and
+ * Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2)
+ * intrinsics to calculate the dot product of two vectors.
+ *
+ * Do not run the sample on systems using processors that do
+ * not support Intel(R) MMX(TM), Intel(R) SSE3; the application
+ * will fail.
+ *
+ * [Output]
+ * Dot Product computed by C:  4324.000000
+ * Dot Product computed by Intel(R) SSE3 intrinsics:  4324.000000
+ * Dot Product computed by Intel(R) AVX intrinsics:  4324.000000
+ * Dot Product computed by Intel(R) AVX2 intrinsics:  4324.000000
+ * Dot Product computed by Intel(R) MMX(TM) intrinsics:  4324
+ *
+ */
+#include <immintrin.h>
+#include <pmmintrin.h>
+#include <stdio.h>
+#define SIZE 24  // assumes size is a multiple of 8 because
+// Intel(R) AVX registers will store 8, 32bit elements.
+
+// Computes dot product using plain C (scalar reference implementation)
+float dot_product(float *a, float *b);
+// Computes dot product using Intel(R) SSE3 intrinsics
+float dot_product_intrin(float *a, float *b);
+// Computes dot product using Intel(R) AVX intrinsics
+float AVX_dot_product(float *a, float *b);
+// Computes dot product using Intel(R) AVX2 intrinsics
+float AVX2_dot_product(float *a, float *b);
+// Computes dot product using Intel(R) MMX(TM) intrinsics
+short MMX_dot_product(short *a, short *b);
+
+// Nonzero when building with the Intel(R) compiler, or with a Microsoft
+// compiler that is not targeting 64-bit Windows. Guards both the definition
+// of MMX_dot_product and its call in main.
+#define MMX_DOT_PROD_ENABLED (__INTEL_COMPILER || (_MSC_VER && !_WIN64))
+
+// Fills two float vectors and two short vectors with the values 0..SIZE-1,
+// then prints the dot product computed by every implementation that this
+// build (and, for AVX/AVX2, this processor) supports. Always returns 0.
+int main() {
+  float x[SIZE], y[SIZE];
+  short a[SIZE], b[SIZE];
+  float product;
+  short mmx_product;
+  for (int i = 0; i < SIZE; i++) {
+    x[i] = i;
+    y[i] = i;
+    a[i] = i;
+    b[i] = i;
+  }
+  product = dot_product(x, y);
+  printf("Dot Product computed by C:  %f\n", product);
+
+  product = dot_product_intrin(x, y);
+  printf("Dot Product computed by Intel(R) SSE3 intrinsics:  %f\n", product);
+
+  // The Visual Studio* editor will show the following section as disabled as it
+  // does not know that __INTEL_COMPILER is defined by the Intel (R) Compiler
+#if __INTEL_COMPILER
+  // Each wider code path is only taken after a runtime CPU feature check.
+  if (_may_i_use_cpu_feature(_FEATURE_AVX2)) {
+    product = AVX2_dot_product(x, y);
+    printf("Dot Product computed by Intel(R) AVX2 intrinsics:  %f\n", product);
+  } else {
+    printf("Your Processor does not support AVX2 intrinsics.\n");
+  }
+  if (_may_i_use_cpu_feature(_FEATURE_AVX)) {
+    product = AVX_dot_product(x, y);
+    printf("Dot Product computed by Intel(R) AVX intrinsics:  %f\n", product);
+  } else {
+    printf("Your Processor does not support AVX intrinsics.\n");
+  }
+#else
+  printf("Use Intel(R) Compiler to compute with Intel(R) AVX intrinsics\n");
+#endif
+
+#if MMX_DOT_PROD_ENABLED
+  mmx_product = MMX_dot_product(a, b);
+  // Clear the MMX(TM) state again before any further floating-point work.
+  _mm_empty();
+  printf("Dot Product computed by Intel(R) MMX(TM) intrinsics:  %d\n",
+         mmx_product);
+
+#else
+  printf(
+      "Use Intel(R) compiler in order to calculate dot product using Intel(R) "
+      "MMX(TM) intrinsics\n");
+#endif
+
+  return 0;
+}
+
+// Scalar reference implementation: returns the dot product of the first SIZE
+// elements of a and b.
+float dot_product(float *a, float *b) {
+  // Accumulate in float. An int accumulator would truncate the running total
+  // to a whole number after every addition, giving wrong results whenever a
+  // product has a fractional part.
+  float sum = 0.0f;
+  for (int i = 0; i < SIZE; i++) {
+    sum += a[i] * b[i];
+  }
+  return sum;
+}
+
+// The Visual Studio* editor will show the following section as disabled as it
+// does not know that __INTEL_COMPILER is defined by the Intel(R) Compiler
+#if __INTEL_COMPILER
+
+// Dot product of the first SIZE floats of a and b using 256-bit vectors and a
+// fused multiply-add per iteration. Eight floats are consumed per step, so
+// SIZE is expected to be a multiple of 8.
+float AVX2_dot_product(float *a, float *b) {
+  __m256 acc = _mm256_setzero_ps();  // eight running partial sums
+  for (int idx = 0; idx < SIZE; idx += 8) {
+    // Unaligned loads of eight consecutive elements from each input.
+    __m256 va = _mm256_loadu_ps(a + idx);
+    __m256 vb = _mm256_loadu_ps(b + idx);
+    // acc[k] = va[k] * vb[k] + acc[k] for every lane k.
+    acc = _mm256_fmadd_ps(va, vb, acc);
+  }
+
+  // Pairwise add inside each 128-bit half. With acc = 7 6 5 4 3 2 1 0
+  // (highest lane first) this gives 13 9 13 9 | 5 1 5 1.
+  acc = _mm256_hadd_ps(acc, acc);
+
+  // Split the register into its two 128-bit halves.
+  __m128 upper = _mm256_extractf128_ps(acc, 1);  // 13 9 13 9
+  __m128 lower = _mm256_extractf128_ps(acc, 0);  // 5 1 5 1
+
+  // Add the halves lane by lane, then pairwise once more so that every lane
+  // holds the complete sum.
+  __m128 folded = _mm_add_ps(upper, lower);  // 18 10 18 10
+  folded = _mm_hadd_ps(folded, folded);      // 28 28 28 28
+
+  float total;
+  _mm_store_ss(&total, folded);  // copy the lowest lane into total
+  return total;
+}
+
+// Dot product of the first SIZE floats of a and b using 256-bit vectors with
+// a separate multiply and add per iteration. Eight floats are consumed per
+// step, so SIZE is expected to be a multiple of 8.
+float AVX_dot_product(float *a, float *b) {
+  __m256 acc = _mm256_setzero_ps();  // eight running partial sums
+  for (int idx = 0; idx < SIZE; idx += 8) {
+    // Unaligned loads of eight consecutive elements from each input.
+    __m256 va = _mm256_loadu_ps(a + idx);
+    __m256 vb = _mm256_loadu_ps(b + idx);
+    // Lane-wise products, then lane-wise accumulation.
+    __m256 prod = _mm256_mul_ps(va, vb);
+    acc = _mm256_add_ps(acc, prod);
+  }
+
+  // Pairwise add inside each 128-bit half. With acc = 7 6 5 4 3 2 1 0
+  // (highest lane first) this gives 13 9 13 9 | 5 1 5 1.
+  acc = _mm256_hadd_ps(acc, acc);
+
+  // Split the register into its two 128-bit halves.
+  __m128 upper = _mm256_extractf128_ps(acc, 1);  // 13 9 13 9
+  __m128 lower = _mm256_extractf128_ps(acc, 0);  // 5 1 5 1
+
+  // Add the halves lane by lane, then pairwise once more so that every lane
+  // holds the complete sum.
+  __m128 folded = _mm_add_ps(upper, lower);  // 18 10 18 10
+  folded = _mm_hadd_ps(folded, folded);      // 28 28 28 28
+
+  float total;
+  _mm_store_ss(&total, folded);  // copy the lowest lane into total
+  return total;
+}
+#endif
+
+// Dot product of the first SIZE floats of a and b using 128-bit vectors.
+// Four floats are consumed per step.
+float dot_product_intrin(float *a, float *b) {
+  __m128 acc = _mm_setzero_ps();  // running partial sums
+  for (int idx = 0; idx < SIZE; idx += 4) {
+    // Unaligned loads: va = a[3] a[2] a[1] a[0], vb = b[3] b[2] b[1] b[0]
+    // (highest lane first, relative to idx).
+    __m128 va = _mm_loadu_ps(a + idx);
+    __m128 vb = _mm_loadu_ps(b + idx);
+    // Lane-wise products: a[3]*b[3]  a[2]*b[2]  a[1]*b[1]  a[0]*b[0].
+    __m128 prod = _mm_mul_ps(va, vb);
+    // Pairwise add; the two pair sums (a[3]*b[3]+a[2]*b[2] and
+    // a[1]*b[1]+a[0]*b[0]) each appear twice in the result.
+    prod = _mm_hadd_ps(prod, prod);
+    acc = _mm_add_ps(acc, prod);  // lane-wise accumulation
+  }
+
+  // A final pairwise add combines the two pair sums into the full total.
+  acc = _mm_hadd_ps(acc, acc);
+
+  float total;
+  _mm_store_ss(&total, acc);  // copy the lowest lane into total
+  return total;
+}
+
+// Intel(R) MMX(TM) technology cannot handle single precision floats
+#if MMX_DOT_PROD_ENABLED
+// Dot product of the first SIZE 16-bit integers of a and b using Intel(R)
+// MMX(TM) intrinsics. Four elements are consumed per step. The return type is
+// short, so only the low 16 bits of the total are returned.
+short MMX_dot_product(short *a, short *b) {
+  _m_empty();
+  __m64 sum = _mm_setzero_si64();  // two 32-bit running partial sums
+  for (int i = 0; i < SIZE; i += 4) {
+    // View four consecutive shorts of each array as one 64-bit MMX(TM) value.
+    __m64 *ptr1 = (__m64 *)&a[i];
+    __m64 *ptr2 = (__m64 *)&b[i];
+    // Multiplies the four 16-bit pairs and adds adjacent products, giving
+    // two 32-bit sums: a[1]*b[1]+a[0]*b[0] and a[3]*b[3]+a[2]*b[2].
+    __m64 num3 = _m_pmaddwd(*ptr1, *ptr2);
+    // Accumulate as 32-bit lanes to match the width produced by pmaddwd.
+    // A 16-bit add would lose the carry from the low half of each sum into
+    // its high half.
+    sum = _m_paddd(sum, num3);
+  }
+
+  short data = (short)_m_to_int(sum);    // low 32-bit partial sum
+  sum = _m_psrlqi(sum, 32);              // move the high partial sum down
+  short result = (short)_m_to_int(sum);  // high 32-bit partial sum
+  result = result + data;
+  _mm_empty();  // clears the Intel(R) MMX(TM) registers and
+  // Intel(R) MMX(TM) state.
+  return result;
+}
+#endif
diff --git a/DirectProgramming/C++/Intrinsics/src/intrin_double_sample.cpp b/DirectProgramming/C++/Intrinsics/src/intrin_double_sample.cpp
diff --git a/DirectProgramming/C++/Intrinsics/src/intrin_ftz_sample.cpp b/DirectProgramming/C++/Intrinsics/src/intrin_ftz_sample.cpp