diff --git a/.gitignore b/.gitignore
index faada9c8a..73df90509 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,8 @@ bld/
 # Visual Studio 2015/2017 cache/options directory
 .vs/
+# Visual Studio Code cache/options directory
+.vscode/
 # Uncomment if you have tasks that create the project's static files in wwwroot
 #wwwroot/
diff --git a/docker/images/runtime/README.md b/docker/images/runtime/README.md
new file mode 100644
index 000000000..b18610a83
--- /dev/null
+++ b/docker/images/runtime/README.md
@@ -0,0 +1,92 @@
+# .NET for Apache Spark runtime Docker image
+
+## Description
+
+This directory contains the source code to build the Docker runtime images for different versions of .NET for Apache Spark and [Apache Spark](https://spark.apache.org).
+
+By using these images, you can run and debug your .NET for Apache Spark projects inside a Docker container, without having to set up all of the required dependencies yourself. This means that you can, for example,
+
+- run a Spark master and multiple slaves within the same container.
+- run Spark master and slave instances in separate containers.
+- connect to a .NET for Apache Spark session in debug mode from Visual Studio or Visual Studio Code.
+
+If you do not want to build those images yourself, you can get our pre-built images directly from Docker Hub at [https://hub.docker.com/r/3rdman/dotnet-spark](https://hub.docker.com/r/3rdman/dotnet-spark).
+
+Additional information on how to use the images can be found at [3rdman.de](https://3rdman.de/tag/net-for-apache-spark/) or on the Docker Hub page mentioned above.
+
+## Building
+
+To build an image, just run the [build.sh](build.sh) bash script. By default, it builds an image with the latest supported versions of .NET Core, Apache Spark and .NET for Apache Spark installed.
+
+You can also build for different versions by specifying one of the following options:
+
+```bash
+ -a, --apache-spark
+ -d, --dotnet-spark
+```
+
+For more details, please run
+
+```bash
+build.sh -h
+```
+
+Please note, however, that not all version combinations are supported.
+
+## The image build stages
+
+Using different stages makes it possible to efficiently build multiple images that are based on the same .NET Core SDK but use different .NET for Apache Spark or Apache Spark versions. That way, dependencies (e.g. the .NET Core SDK) do not have to be downloaded again and again while building images for the different versions, which saves time and bandwidth.
+
+The three stages used in the build process are:
+
+- ### **dotnet-sdk**
+
+  Downloads and installs the specified .NET Core SDK into a base Ubuntu 18.04 image, along with some other tools that might be required by later stages or for debugging. The resulting image is tagged with the .NET Core version number.
+
+- ### **dotnet-spark-base (runtime)**
+
+  Adds the specified .NET for Apache Spark version to the dotnet-sdk image and also copies/builds the HelloSpark example into the image. HelloSpark is also used to install the correct microsoft-spark-*.jar version that is required for [debugging .NET for Apache Spark](https://docs.microsoft.com/en-us/dotnet/spark/how-to-guides/debug) via Visual Studio or Visual Studio Code.
+
+  ![Debug](img/dotnet-spark-vsc-debug.gif)
+
+- ### **dotnet-spark (runtime)**
+
+  Gets/installs the specified Apache Spark version and copies the related startup scripts into the image.
+
+## Docker Run Examples
+
+As mentioned earlier, the dotnet-spark runtime image can be used in multiple ways.
+Below are some examples that might be useful. The port mappings correspond to Spark's defaults: 8080 for the master web UI, 8081 for the worker web UI and 7077 for the master port that slaves connect to.
+
+- ### master and one slave in a single container
+
+  ```bash
+  docker run -d --name dotnet-spark -p 8080:8080 -p 8081:8081 -e SPARK_DEBUG_DISABLED=true 3rdman/dotnet-spark:latest
+  ```
+
+- ### master and two slaves in a single container
+
+  ```bash
+  docker run -d --name dotnet-spark -p 8080:8080 -p 8081:8081 -p 8082:8082 -e SPARK_DEBUG_DISABLED=true -e SPARK_WORKER_INSTANCES=2 3rdman/dotnet-spark:latest
+  ```
+
+- ### master only
+
+  ```bash
+  docker run -d --name dotnet-spark-master -p 8080:8080 -p 7077:7077 -e SPARK_DEBUG_DISABLED=true -e SPARK_WORKER_INSTANCES=0 3rdman/dotnet-spark:latest
+  ```
+
+- ### slave only, connecting to an external master
+
+  ```bash
+  docker run -d --name dotnet-spark-slave -p 8081:8081 -e SPARK_DEBUG_DISABLED=true -e SPARK_MASTER_DISABLED=true -e SPARK_MASTER_URL="spark://master-hostname:7077" 3rdman/dotnet-spark:latest
+  ```
+
+For details about how to use the image for .NET for Apache Spark debugging, please have a look at one of the following posts:
+
+- [.NET for Apache Spark – VSCode with Docker on Linux and df.Collect()](https://3rdman.de/2020/01/net-for-apache-spark-visual-studio-code-with-docker-on-linux/)
+- [.NET for Apache Spark – UDF, VS2019, Docker for Windows and a Christmas Puzzle](https://3rdman.de/2019/12/net-for-apache-spark-udf-vs2019-docker-for-windows-and-a-christmas-puzzle/)
+- [Debug .NET for Apache Spark with Visual Studio and docker](https://3rdman.de/2019/10/debug-net-for-apache-spark-with-visual-studio-and-docker/)
diff --git a/docker/images/runtime/build.sh b/docker/images/runtime/build.sh
new file mode 100755
index 000000000..a81fba68d
--- /dev/null
+++ b/docker/images/runtime/build.sh
@@ -0,0 +1,243 @@
+#!/usr/bin/env bash
+
+# Create different versions of the .NET for Apache Spark runtime docker image,
+# based on the Apache Spark and .NET for Apache Spark version.
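+#
+# Example usage (the versions must be in the supported_* lists below):
+#   ./build.sh                      # build with the default versions
+#   ./build.sh -a 2.4.7 -d 1.0.0    # Apache Spark 2.4.7, .NET for Apache Spark 1.0.0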
+
+set -o errexit   # abort on nonzero exit status
+set -o nounset   # abort on unbound variable
+set -o pipefail  # don't hide errors within pipes
+
+readonly image_repository='3rdman'
+readonly supported_apache_spark_versions=(
+    "2.3.0" "2.3.1" "2.3.2" "2.3.3" "2.3.4"
+    "2.4.0" "2.4.1" "2.4.3" "2.4.4" "2.4.5" "2.4.6" "2.4.7"
+    "3.0.0" "3.0.1"
+)
+readonly supported_dotnet_spark_versions=("1.0.0")
+readonly dotnet_core_version=3.1
+
+dotnet_spark_version=1.0.0
+apache_spark_version=3.0.1
+apache_spark_short_version="${apache_spark_version:0:3}"
+scala_version=2.11
+
+main() {
+    # Parse the options and set the related variables
+    while [[ "$#" -gt 0 ]]; do
+        case "${1}" in
+            -a|--apache-spark) opt_check_apache_spark_version "${2}"; shift ;;
+            -d|--dotnet-spark) opt_check_dotnet_spark_version "${2}"; shift ;;
+            -h|--help) print_help
+                exit 1 ;;
+            *) echo "Unknown parameter passed: ${1}"; exit 1 ;;
+        esac
+        shift
+    done
+
+    echo "Building .NET for Apache Spark ${dotnet_spark_version} runtime image with Apache Spark ${apache_spark_version}"
+
+    # Make sure the temporary build folders are removed again, whatever happens
+    trap finish EXIT ERR
+
+    # execute the different build stages
+    cleanup
+
+    set_scala_version
+    build_dotnet_sdk
+    build_dotnet_spark_base_runtime
+    build_dotnet_spark_runtime
+
+    exit 0
+}
+
+#######################################
+# Checks if the provided Apache Spark version number is supported
+# Arguments:
+#   The version number string
+# Result:
+#   Sets the global variable apache_spark_version if supported,
+#   otherwise exits with a related message
+#######################################
+opt_check_apache_spark_version() {
+    local provided_version="${1}"
+    local valid_version=""
+
+    for value in "${supported_apache_spark_versions[@]}"
+    do
+        [[ "${provided_version}" = "$value" ]] && valid_version="${provided_version}"
+    done
+
+    if [ -z "${valid_version}" ]
+    then
+        echo "${provided_version} is an unsupported Apache Spark version."
+        exit 1
+    else
+        apache_spark_version="${valid_version}"
+        apache_spark_short_version="${apache_spark_version:0:3}"
+    fi
+}
+
+#######################################
+# Checks if the provided .NET for Apache Spark version number is supported
+# Arguments:
+#   The version number string
+# Result:
+#   Sets the global variable dotnet_spark_version if supported,
+#   otherwise exits with a related message
+#######################################
+opt_check_dotnet_spark_version() {
+    local provided_version="${1}"
+    local valid_version=""
+
+    for value in "${supported_dotnet_spark_versions[@]}"
+    do
+        [[ "${provided_version}" = "$value" ]] && valid_version="${provided_version}"
+    done
+
+    if [ -z "${valid_version}" ]
+    then
+        echo "${provided_version} is an unsupported .NET for Apache Spark version."
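+        # abort immediately, so that no image is built with an unsupported version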
+        exit 1
+    else
+        dotnet_spark_version="${valid_version}"
+    fi
+}
+
+#######################################
+# Replaces every occurrence of search_string with replacement_string in a file
+# Arguments:
+#   The file name
+#   The string to search for
+#   The string to replace the search string with
+# Result:
+#   An updated file with the replaced string
+#######################################
+replace_text_in_file() {
+    local filename="${1}"
+    local search_string="${2}"
+    local replacement_string="${3}"
+
+    sh -c 'sed -i.bak "s/$1/$2/g" "$3" && rm "$3.bak"' _ "${search_string}" "${replacement_string}" "${filename}"
+}
+
+#######################################
+# Sets the Scala version depending on the Apache Spark version
+#######################################
+set_scala_version() {
+    case "${apache_spark_version:0:1}" in
+        2) scala_version=2.11 ;;
+        3) scala_version=2.12 ;;
+    esac
+}
+
+#######################################
+# Runs the docker build command with the related build arguments
+# Arguments:
+#   The image name (incl. tag)
+# Result:
+#   A local docker image with the specified name
+#######################################
+build_image() {
+    local image_name="${1}"
+    local build_args="--build-arg DOTNET_CORE_VERSION=${dotnet_core_version} --build-arg DOTNET_SPARK_VERSION=${dotnet_spark_version} --build-arg SPARK_VERSION=${apache_spark_version}"
+    local cmd="docker build ${build_args} -t ${image_name} ."
+
+    echo "Building ${image_name}"
+
+    ${cmd}
+}
+
+#######################################
+# Uses the Dockerfile in the sub-folder dotnet-sdk to build the image of the first stage
+# Result:
+#   A dotnet-sdk docker image tagged with the .NET Core version
+#######################################
+build_dotnet_sdk() {
+    local image_name="dotnet-sdk:${dotnet_core_version}"
+
+    cd dotnet-sdk
+    build_image "${image_name}"
+    cd ~-
+}
+
+#######################################
+# Uses the Dockerfile in the sub-folder dotnet-spark-base to build the image of the second stage
+# The image contains the specified .NET for Apache Spark version
+# Result:
+#   A dotnet-spark-base-runtime docker image tagged with the .NET for Apache Spark version
+#######################################
+build_dotnet_spark_base_runtime() {
+    local image_name="dotnet-spark-base-runtime:${dotnet_spark_version}"
+
+    cd dotnet-spark-base
+    build_image "${image_name}"
+    cd ~-
+}
+
+#######################################
+# Uses the Dockerfile in the sub-folder dotnet-spark to build the image of the last stage
+# The image contains the specified Apache Spark version
+# Result:
+#   A dotnet-spark docker image tagged with the .NET for Apache Spark version and the Apache Spark version
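+#   (e.g. 3rdman/dotnet-spark:1.0.0-3.0.1 for .NET for Apache Spark 1.0.0 on Apache Spark 3.0.1)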
+#######################################
+build_dotnet_spark_runtime() {
+    local image_name="${image_repository}/dotnet-spark:${dotnet_spark_version}-${apache_spark_version}"
+    local msspark_short_string=${apache_spark_short_version//./-}
+
+    cd dotnet-spark
+    cp --recursive templates/scripts ./bin
+    cp --recursive templates/HelloSpark ./HelloSpark
+
+    replace_text_in_file HelloSpark/HelloSpark.csproj "<\/TargetFramework>" "netcoreapp${dotnet_core_version}<\/TargetFramework>"
+    replace_text_in_file HelloSpark/HelloSpark.csproj "PackageReference Include=\"Microsoft.Spark\" Version=\"\"" "PackageReference Include=\"Microsoft.Spark\" Version=\"${dotnet_spark_version}\""
+
+    replace_text_in_file HelloSpark/README.txt "netcoreappX.X" "netcoreapp${dotnet_core_version}"
+    replace_text_in_file HelloSpark/README.txt "spark-X.X.X" "spark-${apache_spark_short_version}.x"
+    replace_text_in_file HelloSpark/README.txt "microsoft-spark-${apache_spark_short_version}.x-X.X.X.jar" "microsoft-spark-${msspark_short_string}_${scala_version}-${dotnet_spark_version}.jar"
+
+    replace_text_in_file bin/start-spark-debug.sh "microsoft-spark-X.X.X" "microsoft-spark-${msspark_short_string}_${scala_version}"
+
+    build_image "${image_name}"
+    cd ~-
+}
+
+#######################################
+# Removes the temporary folders created during the different build stages
+#######################################
+cleanup()
+{
+    (
+        cd dotnet-spark
+        rm --recursive --force bin
+        rm --recursive --force HelloSpark
+    )
+}
+
+#######################################
+# Cleans up and exits with the original exit status
+#######################################
+finish()
+{
+    result=$?
+    cleanup
+    exit ${result}
+}
+
+#######################################
+# Display the help text
+#######################################
+print_help() {
+    cat <<HELPMSG
+Usage: build.sh [OPTIONS]
+
+Builds a .NET for Apache Spark runtime docker image.
+
+Options:
+    -a, --apache-spark    one of the supported Apache Spark versions
+    -d, --dotnet-spark    one of the supported .NET for Apache Spark versions
+    -h, --help            display this help text
+HELPMSG
+}
+
+main "${@}"
diff --git a/docker/images/runtime/dotnet-spark/templates/HelloSpark/HelloSpark.csproj b/docker/images/runtime/dotnet-spark/templates/HelloSpark/HelloSpark.csproj
new file mode 100644
--- /dev/null
+++ b/docker/images/runtime/dotnet-spark/templates/HelloSpark/HelloSpark.csproj
@@ -0,0 +1,12 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework></TargetFramework>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <PackageReference Include="Microsoft.Spark" Version="" />
+  </ItemGroup>
+
+</Project>
diff --git a/docker/images/runtime/dotnet-spark/templates/HelloSpark/Program.cs b/docker/images/runtime/dotnet-spark/templates/HelloSpark/Program.cs
new file mode 100644
index 000000000..9be1e8b8d
--- /dev/null
+++ b/docker/images/runtime/dotnet-spark/templates/HelloSpark/Program.cs
@@ -0,0 +1,14 @@
+using Microsoft.Spark.Sql;
+
+namespace HelloSpark
+{
+    class Program
+    {
+        static void Main(string[] args)
+        {
+            var spark = SparkSession.Builder().GetOrCreate();
+            var df = spark.Read().Json("people.json");
+            df.Show();
+        }
+    }
+}
diff --git a/docker/images/runtime/dotnet-spark/templates/HelloSpark/README.txt b/docker/images/runtime/dotnet-spark/templates/HelloSpark/README.txt
new file mode 100644
index 000000000..31f707b62
--- /dev/null
+++ b/docker/images/runtime/dotnet-spark/templates/HelloSpark/README.txt
@@ -0,0 +1,13 @@
+Use the commands below to build and run the example as outlined at https://github.com/dotnet/spark/blob/master/docs/getting-started/ubuntu-instructions.md
+
+dotnet build
+
+cp people.json /dotnet/HelloSpark/bin/Debug/netcoreappX.X
+cd /dotnet/HelloSpark/bin/Debug/netcoreappX.X
+
+####### spark-X.X.X #######
+# Run locally
+spark-submit --class org.apache.spark.deploy.dotnet.DotnetRunner --master local microsoft-spark-X.X.X-X.X.X.jar dotnet HelloSpark.dll
+
+# To test out the example using the master and slave instances
+spark-submit --class org.apache.spark.deploy.dotnet.DotnetRunner --master spark://$HOSTNAME:$SPARK_MASTER_PORT microsoft-spark-X.X.X-X.X.X.jar dotnet HelloSpark.dll
diff --git a/docker/images/runtime/dotnet-spark/templates/HelloSpark/people.json b/docker/images/runtime/dotnet-spark/templates/HelloSpark/people.json
new file mode 100644
index 000000000..50a859cbd
--- /dev/null
+++ b/docker/images/runtime/dotnet-spark/templates/HelloSpark/people.json
@@ -0,0 +1,3 @@
+{"name":"Michael"}
+{"name":"Andy", "age":30}
+{"name":"Justin", "age":19}
diff --git a/docker/images/runtime/dotnet-spark/templates/scripts/start-spark-debug.sh b/docker/images/runtime/dotnet-spark/templates/scripts/start-spark-debug.sh
new file mode 100644
index 000000000..d93f0228e
--- /dev/null
+++ b/docker/images/runtime/dotnet-spark/templates/scripts/start-spark-debug.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+set -o errexit   # abort on nonzero exit status
+set -o nounset   # abort on unbound variable
+set -o pipefail  # don't hide errors within pipes
+
+if [ -z "${SPARK_DEBUG_DISABLED:-}" ] && [ -z "${SPARK_MASTER_DISABLED:-}" ]; then
+    socat tcp-l:5567,fork,reuseaddr tcp:127.0.0.1:5050 &
+    cd /dotnet/Debug/netcoreapp"${DOTNET_CORE_VERSION}"
+    "${SPARK_HOME}"/bin/spark-submit --packages "${SPARK_SUBMIT_PACKAGES}" --class org.apache.spark.deploy.dotnet.DotnetRunner --jars "/dotnet/Debug/netcoreapp${DOTNET_CORE_VERSION}/*.jar" --master local microsoft-spark-X.X.X-"${DOTNET_SPARK_VERSION}".jar debug 5050
+fi
diff --git a/docker/images/runtime/dotnet-spark/templates/scripts/start-spark-master.sh b/docker/images/runtime/dotnet-spark/templates/scripts/start-spark-master.sh
new file mode 100644
index 000000000..2d567d77d
--- /dev/null
+++ b/docker/images/runtime/dotnet-spark/templates/scripts/start-spark-master.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+set -o errexit   # abort on nonzero exit status
+set -o nounset   # abort on unbound variable
+set -o pipefail  # don't hide errors within pipes
+
+if [ -z "${SPARK_MASTER_DISABLED:-}" ]; then
+    "${SPARK_HOME}"/sbin/start-master.sh
+fi
diff --git a/docker/images/runtime/dotnet-spark/templates/scripts/start-spark-slave.sh b/docker/images/runtime/dotnet-spark/templates/scripts/start-spark-slave.sh
new file mode 100644
index 000000000..242a23251
--- /dev/null
+++ b/docker/images/runtime/dotnet-spark/templates/scripts/start-spark-slave.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+set -o errexit   # abort on nonzero exit status
+set -o nounset   # abort on unbound variable
+set -o pipefail  # don't hide errors within pipes
+
+if [ -z "${SPARK_MASTER_URL:-}" ]; then
+    "${SPARK_HOME}"/sbin/start-slave.sh spark://$(hostname):7077
+else
+    "${SPARK_HOME}"/sbin/start-slave.sh "${SPARK_MASTER_URL}"
+fi
diff --git a/docker/images/runtime/img/dotnet-spark-vsc-debug.gif b/docker/images/runtime/img/dotnet-spark-vsc-debug.gif
new file mode 100644
index 000000000..677e920e0
Binary files /dev/null and b/docker/images/runtime/img/dotnet-spark-vsc-debug.gif differ
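As a supplement to the README examples above, a minimal sketch of a debug session against this image (the container name is only an example; port 5567 is the port that start-spark-debug.sh forwards to the DotnetRunner debug backend):

```bash
# start a container with debugging enabled (the default, i.e. SPARK_DEBUG_DISABLED unset)
# and publish the backend port that socat forwards to the DotnetRunner
docker run -d --name dotnet-spark-debug -p 5567:5567 3rdman/dotnet-spark:latest

# then run or debug your .NET for Apache Spark application on the host;
# it connects to the backend published on port 5567
```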