hughes-c
diff --git a/‎DirectProgramming/C++/StructuredGrids/iso3dfd_omp_offload/CMakeLists.txt
Lines changed: 6 additions & 0 deletions b/‎DirectProgramming/C++/StructuredGrids/iso3dfd_omp_offload/CMakeLists.txt
Lines changed: 6 additions & 0 deletions
diff --git a/‎DirectProgramming/C++/StructuredGrids/iso3dfd_omp_offload/License.txt
Lines changed: 7 additions & 0 deletions b/‎DirectProgramming/C++/StructuredGrids/iso3dfd_omp_offload/License.txt
Lines changed: 7 additions & 0 deletions
diff --git a/‎DirectProgramming/C++/StructuredGrids/iso3dfd_omp_offload/README.md
Lines changed: 140 additions & 0 deletions b/‎DirectProgramming/C++/StructuredGrids/iso3dfd_omp_offload/README.md
Lines changed: 140 additions & 0 deletions
diff --git a/‎DirectProgramming/C++/StructuredGrids/iso3dfd_omp_offload/include/iso3dfd.h
Lines changed: 50 additions & 0 deletions b/‎DirectProgramming/C++/StructuredGrids/iso3dfd_omp_offload/include/iso3dfd.h
Lines changed: 50 additions & 0 deletions
diff --git a/‎DirectProgramming/C++/StructuredGrids/iso3dfd_omp_offload/sample.json
Lines changed: 22 additions & 0 deletions b/‎DirectProgramming/C++/StructuredGrids/iso3dfd_omp_offload/sample.json
Lines changed: 22 additions & 0 deletions
diff --git a/‎DirectProgramming/C++/StructuredGrids/iso3dfd_omp_offload/src/CMakeLists.txt
Lines changed: 37 additions & 0 deletions b/‎DirectProgramming/C++/StructuredGrids/iso3dfd_omp_offload/src/CMakeLists.txt
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,6 @@
+# Top-level CMakeLists.txt for the ISO3DFD_OMP_OFFLOAD project.
+#
+# CMake 4.x refuses minimum versions below 3.5, so 3.5 is the floor; the upper
+# bound of the range opts in to newer policy defaults when a newer CMake runs.
+cmake_minimum_required (VERSION 3.5...3.28)
+
+# Default to the Intel icpx compiler. The compiler has to be chosen before
+# project(), but an explicit -DCMAKE_CXX_COMPILER=... from the command line
+# (or an already configured build tree) is left untouched.
+if (NOT DEFINED CMAKE_CXX_COMPILER)
+  set(CMAKE_CXX_COMPILER "icpx")
+endif ()
+
+project (iso3dfd_omp_offload)
+
+# The executable and the `run` target are defined in src/CMakeLists.txt.
+add_subdirectory (src)
+
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,140 @@
+# `ISO3DFD OpenMP Offload` Sample
+
+The ISO3DFD sample refers to Three-Dimensional Finite-Difference Wave Propagation in Isotropic Media.  It is a three-dimensional stencil to simulate a wave propagating in a 3D isotropic medium and shows some of the more common challenges and techniques when targeting OMP Offload devices (GPU) in more complex applications to achieve good performance. 
+
+| Optimized for                       | Description
+|:---                               |:---
+| OS                                | Linux* Ubuntu* 18.04
+| Hardware                          | Skylake with GEN9 or newer
+| Software                          | Intel&reg; oneAPI DPC++/C++ Compiler;
+| What you will learn               | How to offload the computation to GPU using Intel&reg; oneAPI DPC++/C++ Compiler
+| Time to complete                  | 15 minutes
+
+Performance number tabulation
+
+| iso3dfd_omp_offload sample            | Performance data
+|:---                               	|:---
+| Default Baseline version              | 1.0
+| Optimized version 1	                | 1.11x
+| Optimized version 2	                | 1.48x
+| Optimized version 3	                | 1.60x
+
+
+## Purpose
+
+ISO3DFD is a finite difference stencil kernel for solving the 3D acoustic isotropic wave equation, which can be used as a proxy for propagating a seismic wave. Kernels in this sample are implemented as 16th order in space, with symmetric coefficients, and 2nd order in time scheme without boundary conditions. Using OpenMP Offload, the sample can explicitly run on the GPU to propagate a seismic wave, which is a compute-intensive task.
+
+The code will attempt to find an available GPU or OpenMP Offload capable device and exit if a compatible device is not detected. By default, the output will print the device name where the OpenMP Offload code ran along with the grid computation metrics - flops and effective throughput. For validating results, an OpenMP/CPU-only version of the application will be run on host/CPU and results will be compared to the OpenMP Offload version.
+
+The code also demonstrates some of the common optimization techniques which can be used to improve performance of 3D-stencil code running on a GPU device.
+ 
+## Key Implementation Details 
+
+The basic OpenMP Offload implementation explained in the code includes the use of the following : 
+* OpenMP offload target data map construct
+* Default Baseline version demonstrates use of OpenMP offload target parallel for construct with collapse 
+* Optimized version 1 demonstrates use of OpenMP offload teams distribute construct and use of num_teams and thread_limit clause
+* Incremental Optimized version 2 demonstrates use of OpenMP offload teams distribute construct with improved data-access pattern
+* Incremental Optimized version 3 demonstrates use of OpenMP CPU threads along with OpenMP offload target construct
+
+ 
+## License  
+
+This code sample is licensed under MIT license. 
+
+
+## Building the `ISO3DFD` Program for GPU
+
+### Running Samples In DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) and Intel® oneAPI HPC Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/hpc-toolkit/).
+
+### On a Linux* System
+Perform the following steps:
+1. Build the program using the following `cmake` commands. 
+``` 
+$ mkdir build
+$ cd build
+$ cmake ..
+$ make -j
+```
+
+> Note: by default, the executable is built with the default baseline version. You can build the kernel with one of the optimized versions using the following:
+```
+cmake -DUSE_OPT1=1 ..
+make -j
+```
+```
+cmake -DUSE_OPT2=1 ..
+make -j
+```
+```
+cmake -DUSE_OPT3=1 ..
+make -j
+```
+
+2. Run the program :
+    ```
+    make run
+    ```
+
+3. Clean the program using:
+    ```
+    make clean
+    ```
+
+## Running the Sample
+```
+make run
+```
+
+### Application Parameters 
+You can modify the ISO3DFD parameters from the command line.
+   * Configurable Application Parameters   
+	
+	Usage: src/iso3dfd n1 n2 n3 n1_block n2_block n3_block Iterations
+
+ 	n1 n2 n3                       	: Grid sizes for the stencil
+ 	n1_block n2_block n3_block     	: cache block sizes for CPU
+                                	: OR TILE sizes for OMP Offload
+ 	Iterations                     	: No. of timesteps.
+
+### Example of Output with default baseline version
+```
+Grid Sizes: 256 256 256
+Tile sizes ignored for OMP Offload
+--Using Baseline version with omp target with collapse
+Memory Usage (MBytes): 230
+--------------------------------------
+time         : 4.827 secs
+throughput   : 347.57 Mpts/s
+flops        : 21.2018 GFlops
+bytes        : 4.17084 GBytes/s
+
+--------------------------------------
+
+--------------------------------------
+Checking Results ...
+Final wavefields from OMP Offload device and CPU are equivalent: Success
+--------------------------------------
+```
+
+### Example of Output with Optimized version 3
+```
+Grid Sizes: 256 256 256
+Tile sizes: 16 8 64
+Using Optimized target code - version 3:
+--OMP Threads + OMP_Offload with Tiling and Z Window
+Memory Usage (MBytes): 230
+--------------------------------------
+time         : 3.014 secs
+throughput   : 556.643 Mpts/s
+flops        : 33.9552 GFlops
+bytes        : 6.67971 GBytes/s
+
+--------------------------------------
+
+--------------------------------------
+Checking Results ...
+Final wavefields from OMP Offload device and CPU are equivalent: Success
+
+```
@@ -0,0 +1,50 @@
+//==============================================================
+// Copyright © 2020 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#include <omp.h>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+
+// Compile-time constants shared by the translation units that include this
+// header. NOTE(review): the meanings below are inferred from the names only;
+// confirm against the kernels that use them.
+// presumably the simulation time step — TODO confirm units
+constexpr float dt = 0.002f;
+// presumably the grid spacing, same in x, y and z — TODO confirm units
+constexpr float dxyz = 50.0f;
+// presumably the stencil half-length (points on each side of the centre)
+constexpr unsigned int kHalfLength = 8;
+// presumably an upper bound on the offload team size — verify against callers
+constexpr unsigned int kMaxTeamSizeLimit = 256;
+
+// Expands to coeff[ir] multiplied by the sum of six ptr_prev samples located
+// at ix +/- ir, ix +/- ir * n1 and ix +/- ir * dimn1n2.
+// The macro takes no state as arguments: coeff, ptr_prev, ix, n1 and dimn1n2
+// must all be in scope at the expansion site.
+// NOTE(review): `ir` is not parenthesized inside the expansion, so only pass a
+// plain identifier or literal (an argument such as `a + 1` would mis-expand).
+#define STENCIL_LOOKUP(ir)                                          \
+  (coeff[ir] * ((ptr_prev[ix + ir] + ptr_prev[ix - ir]) +           \
+                (ptr_prev[ix + ir * n1] + ptr_prev[ix - ir * n1]) + \
+                (ptr_prev[ix + ir * dimn1n2] + ptr_prev[ix - ir * dimn1n2])))
+
+// Expands to coeff[ir] multiplied by the sum of front[ir], back[ir - 1] and
+// four ptr_prev_base samples located at gid +/- ir and gid +/- ir * n1.
+// coeff, front, back, ptr_prev_base, gid and n1 must all be in scope at the
+// expansion site.
+// NOTE(review): back is indexed with ir - 1 while front uses ir; this looks
+// intentional (presumably back[] is stored with a one-element offset) — confirm
+// against the kernel that fills these arrays.
+// NOTE(review): `ir` is not parenthesized inside the expansion, so only pass a
+// plain identifier or literal.
+#define STENCIL_LOOKUP_Z(ir)                                             \
+  (coeff[ir] * (front[ir] + back[ir - 1] + ptr_prev_base[gid + ir] +     \
+                ptr_prev_base[gid - ir] + ptr_prev_base[gid + ir * n1] + \
+                ptr_prev_base[gid - ir * n1]))
+
+// Declaration only; the definition is not part of this header. Presumably
+// prints command-line usage help for the program named by programName.
+// NOTE(review): std::string is used here but <string> is not included
+// directly by this header — it is only available transitively.
+void Usage(const std::string& programName);
+
+// Declaration only. Presumably reports performance figures derived from the
+// elapsed time, the grid extents n1/n2/n3 and the iteration count — the unit
+// of `time` is not visible from here; verify against the definition.
+void PrintStats(double time, unsigned int n1, unsigned int n2, unsigned int n3,
+                unsigned int num_iterations);
+
+// Declaration only. Presumably compares `output` against `reference` over a
+// dim_x * dim_y * dim_z grid and returns true when they agree within `delta`.
+// NOTE(review): the exact roles of `radius` and `zadjust` cannot be determined
+// from this header — confirm against the definition.
+bool WithinEpsilon(float* output, float* reference, unsigned int dim_x,
+                   unsigned int dim_y, unsigned int dim_z, unsigned int radius,
+                   const int zadjust, const float delta);
+
+// Declaration only. Presumably fills the three caller-provided arrays
+// (ptr_prev, ptr_next, ptr_vel) for an n1 x n2 x n3 grid; the initial values
+// and required array sizes are not visible here — verify against the definition.
+void Initialize(float* ptr_prev, float* ptr_next, float* ptr_vel,
+                unsigned int n1, unsigned int n2, unsigned int n3);
+
+// Declaration only. Presumably re-runs the computation on a reference path
+// and returns true when it matches the buffers passed in — the definition is
+// not visible here, and it may live in a source file that is only built when
+// result verification is enabled (TODO confirm).
+bool VerifyResults(float* next_base, float* prev_base, float* vel_base,
+                   float* coeff, unsigned int n1, unsigned int n2,
+                   unsigned int n3, unsigned int num_iterations,
+                   unsigned int n1_block, unsigned int n2_block,
+                   unsigned int n3_block);
+
+// Declaration only. Presumably checks the grid sizes, block sizes and
+// iteration count supplied by the user; which return value means "valid" is
+// not visible from this header — confirm against the definition.
+bool ValidateInput(unsigned int n1, unsigned int n2, unsigned int n3,
+                   unsigned int n1_block, unsigned int n2_block,
+                   unsigned int n3_block, unsigned int num_iterations);
@@ -0,0 +1,22 @@
+{
+  "guid": "E3407632-7F3D-4B5B-A956-5155408D7468",
+  "name": "iso3dfd_omp_offload",
+  "categories": [ "Toolkit/Intel® oneAPI HPC Toolkit" ],
+  "description": "A finite difference stencil kernel for solving 3D acoustic isotropic wave equation",
+  "toolchain": [ "icpx" ],
+  "targetDevice": [ "GPU" ],
+  "languages": [ { "cpp": {} } ],
+  "os": [ "linux" ],
+  "builder": [ "cmake" ],
+  "ciTests": {
+	"linux": [{
+            "steps": [
+                        "mkdir build",
+                        "cd build",
+                        "cmake ..",
+                        "make",
+                        "make run"
+                 ]
+        }]
+  }
+}
@@ -0,0 +1,37 @@
+# Build rules for the iso3dfd OpenMP offload executable and its `run` target.
+#
+# Options (pass as -D<OPTION>=ON/OFF on the cmake command line):
+#   VERIFY_RESULTS        also build iso3dfd_verify.cpp and define VERIFY_RESULTS
+#   USE_OPT1/2/3          select one optimized kernel variant; if several are
+#                         ON the highest-numbered one wins, and if none is ON
+#                         the baseline variant (USE_BASELINE) is built
+option(VERIFY_RESULTS "Use Results Validation" ON)
+option(USE_OPT1 "Select Optimized target code - version 1" OFF)
+option(USE_OPT2 "Select Optimized target code - version 2" OFF)
+option(USE_OPT3 "Select Optimized target code - version 3" OFF)
+
+# Default to RelWithDebInfo, but keep a build type the user asked for.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "RelWithDebInfo")
+endif()
+
+# The OpenMP offload flags are needed when compiling and when linking, so they
+# stay in CMAKE_CXX_FLAGS, which CMake passes to both steps of an executable.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fiopenmp -std=c++17 -fopenmp-targets=spir64 -O3 -D__STRICT_ANSI__ ")
+
+set(SOURCES iso3dfd.cpp utils.cpp)
+
+# Pick exactly one kernel-variant macro; it is attached to the target below.
+if(USE_OPT3)
+  set(iso3dfd_variant_define USE_OPT3)
+  message(STATUS "Using Optimized target code - version 3")
+elseif(USE_OPT2)
+  set(iso3dfd_variant_define USE_OPT2)
+  message(STATUS "Using Optimized target code - version 2")
+elseif(USE_OPT1)
+  set(iso3dfd_variant_define USE_OPT1)
+  message(STATUS "Using Optimized target code - version 1")
+else()
+  set(iso3dfd_variant_define USE_BASELINE)
+  message(STATUS "Using Baseline target code")
+endif()
+
+if(VERIFY_RESULTS)
+  list(APPEND SOURCES iso3dfd_verify.cpp)
+endif()
+
+add_executable(iso3dfd ${SOURCES})
+
+# Preprocessor switches are scoped to the target instead of the global flags.
+target_compile_definitions(iso3dfd PRIVATE ${iso3dfd_variant_define})
+if(VERIFY_RESULTS)
+  target_compile_definitions(iso3dfd PRIVATE VERIFY_RESULTS)
+endif()
+
+# `make run` builds iso3dfd if needed and runs it with the sample arguments
+# (n1 n2 n3 n1_block n2_block n3_block iterations).
+# The original WORKING_DIRECTORY used ${CMAKE_PROJECT_DIR}, which is not a
+# CMake variable and expanded to nothing; the directory it silently fell back
+# to (this build directory) is now named explicitly.
+add_custom_target(run
+  COMMAND iso3dfd 256 256 256 16 8 64 100
+  DEPENDS iso3dfd
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  VERBATIM
+)
+