Skip to content

Commit 981b21a

Browse files
committed
Merge branch 'bootstrap-vcs'
2 parents 4a0fb8c + 66d8ae4 commit 981b21a

File tree

7 files changed

+314
-47
lines changed

7 files changed

+314
-47
lines changed
Lines changed: 23 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,35 @@
11
#!/bin/bash
22
set -x -e
33

4-
# AWS EMR bootstrap script
5-
# Install hail
6-
7-
# check for master node
4+
INPUT_PATH=""
5+
HAIL_VERSION="0.1"
6+
SPARK_VERSION="2.2.1"
87
IS_MASTER=false
8+
99
if grep isMaster /mnt/var/lib/info/instance.json | grep true;
1010
then
1111
IS_MASTER=true
1212
fi
1313

14-
# error message
15-
error_msg ()
16-
{
17-
echo 1>&2 "Error: $1"
18-
}
19-
20-
# error message
21-
fatal_error_msg ()
22-
{
23-
echo 1>&2 "Fatal error: $1"
24-
exit 1
25-
}
26-
27-
VS_BUCKET="variant-spark"
28-
RELEASE_DIR=
29-
30-
# get input parameters
3114
while [ $# -gt 0 ]; do
3215
case "$1" in
33-
--release-url)
34-
shift
35-
HAIL_RELEASE_URL="$1"
16+
--input-path)
17+
shift
18+
INPUT_PATH=$1
19+
;;
20+
--hail-version)
21+
shift
22+
HAIL_VERSION=$1
23+
;;
24+
--spark-version)
25+
shift
26+
SPARK_VERSION=$1
27+
;;
28+
--path-prefix)
29+
# not used by this script
30+
shift
3631
;;
3732
-*)
38-
# do not exit out, just note failure
3933
error_msg "unrecognized option: $1"
4034
;;
4135
*)
@@ -45,21 +39,8 @@ while [ $# -gt 0 ]; do
4539
shift
4640
done
4741

48-
if [[ -z "${HAIL_RELEASE_URL}" ]]; then
49-
fatal_error_msg "Parameter: --release-url is required"
50-
fi
51-
52-
echo "Hail release location is: ${HAIL_RELEASE_URL}"
53-
54-
INST_VOL="${INST_VOL:-/mnt}"
55-
HAIL_INST_DIR="${INST_VOL}/hail"
56-
57-
echo "Bootstraping hail"
42+
# copy hail to both master and workers
43+
# as there is no shared dir and the bgz codec is needed on the classpath for both
5844

59-
echo "Installing hail in: ${HAIL_INST_DIR}"
60-
mkdir -p "${HAIL_INST_DIR}"
61-
#download and install variant spark
62-
cd ${HAIL_INST_DIR}
63-
aws s3 cp --recursive "${HAIL_RELEASE_URL}/" .
64-
echo "Installed variant-spark in: ${HAIL_INST_DIR}"
65-
echo "Finished bootstraping hail"
45+
aws s3 cp ${INPUT_PATH}/hail-python.zip ${HOME}
46+
aws s3 cp ${INPUT_PATH}/hail-all-spark.jar ${HOME}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/bin/bash
# AWS EMR bootstrap action: install miniconda and variant-spark on the
# master node. Accepts the common bootstrap options shared by all bootstrap
# scripts in this repo (most are parsed but not used here).
set -x -e

INPUT_PATH=""
HAIL_VERSION="0.1"
SPARK_VERSION="2.2.1"
IS_MASTER=false

# EMR writes instance metadata here; isMaster is true only on the master node.
if grep isMaster /mnt/var/lib/info/instance.json | grep true;
then
  IS_MASTER=true
fi

# Report a problem on stderr without aborting the bootstrap.
# NOTE(review): this was called but never defined in the original, which under
# `set -e` would abort the bootstrap with "command not found" on any
# unrecognized option.
error_msg ()
{
  echo 1>&2 "Error: $1"
}

# Parse command-line options; unknown flags are reported but not fatal.
while [ $# -gt 0 ]; do
  case "$1" in
    --input-path)
      shift
      INPUT_PATH="$1"
      ;;
    --hail-version)
      shift
      HAIL_VERSION="$1"
      ;;
    --spark-version)
      shift
      SPARK_VERSION="$1"
      ;;
    --notebookPath)
      shift
      NotebookPath="$1"
      ;;
    --path-prefix)
      # Passed in by default, but not used here
      shift
      ;;
    -*)
      error_msg "unrecognized option: $1"
      ;;
    *)
      break
      ;;
  esac
  shift
done


if [ "$IS_MASTER" = true ]; then
  # Install miniconda (batch mode; installs to ~/miniconda2)
  wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh
  sh Miniconda2-latest-Linux-x86_64.sh -b
  export PATH=~/miniconda2/bin:$PATH
  conda create -y -n jupyter python=2.7
  source activate jupyter
  # Install other packages
  # TODO: make these configurable
  pip install --upgrade matplotlib pandas click variant-spark
  # Give the versioned variant-spark jar a stable name for downstream config.
  ln -s "/home/hadoop/miniconda2/envs/jupyter/lib/python2.7/site-packages/varspark/jars/variant-spark"*.jar "/home/hadoop/miniconda2/envs/jupyter/lib/python2.7/site-packages/varspark/jars/varspark.jar"
fi
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
#!/bin/bash
# AWS EMR bootstrap action: on the master node, install miniconda, Jupyter
# (with notebooks stored in S3 via s3contents) and the JuSpark kernel, then
# daemonise Jupyter through puppet/upstart.
set -x -e

INPUT_PATH=""
HAIL_VERSION="0.1"
SPARK_VERSION="2.2.1"
IS_MASTER=false

# EMR writes instance metadata here; isMaster is true only on the master node.
if grep isMaster /mnt/var/lib/info/instance.json | grep true;
then
  IS_MASTER=true
fi

# Report a problem on stderr without aborting the bootstrap.
# NOTE(review): this was called but never defined in the original, which under
# `set -e` would abort the bootstrap with "command not found" on any
# unrecognized option.
error_msg ()
{
  echo 1>&2 "Error: $1"
}

# Parse command-line options; unknown flags are reported but not fatal.
while [ $# -gt 0 ]; do
  case "$1" in
    --input-path)
      shift
      INPUT_PATH="$1"
      ;;
    --hail-version)
      shift
      HAIL_VERSION="$1"
      ;;
    --spark-version)
      shift
      SPARK_VERSION="$1"
      ;;
    --notebookPath)
      shift
      NotebookPath="$1"
      ;;
    --path-prefix)
      shift
      PATH_PREFIX="$1"
      ;;
    -*)
      error_msg "unrecognized option: $1"
      ;;
    *)
      break
      ;;
  esac
  shift
done

# Split NotebookPath ("s3://<bucket>/<prefix>") into bucket and prefix:
# substr(XX,6) drops "s3://"; substr(XX,7+len(bucket)) drops "s3://<bucket>/".
BUCKET=$(awk -v XX="$NotebookPath" 'BEGIN{x=substr(XX,6); split(x,a,"/"); print(a[1])}')
PREFIX=$(awk -v XX="$NotebookPath" -v YY="$BUCKET" 'BEGIN{y=length(YY); print(substr(XX,7+y));}')

# Seed the notebook location with the example notebook from the repo/tag
# identified by --path-prefix.
wget "${PATH_PREFIX}/cloud/aws-emr/cf-templates/VariantSpark_example_with_Hail_library.ipynb"
aws s3 cp VariantSpark_example_with_Hail_library.ipynb "$NotebookPath"/

# Register Jupyter as an upstart job (via puppet) so it is respawned on
# failure and started on boot. The inner BASH_SCRIPT runs as the hadoop user.
upstart_jupyter() {
sudo puppet apply << PUPPET_SCRIPT
include 'upstart'
upstart::job { 'jupyter':
description => 'Jupyter',
respawn => true,
respawn_limit => '0 10',
start_on => 'runlevel [2345]',
stop_on => 'runlevel [016]',
console => 'output',
chdir => '/home/hadoop',
script => '
sudo su - hadoop > /home/hadoop/jupyter.log 2>&1 <<BASH_SCRIPT
export SPARK_HOME=/usr/lib/spark
export PYTHONPATH=$PYTHONPATH:/home/hadoop/hail-python.zip
/home/hadoop/miniconda2/envs/jupyter/bin/jupyter notebook
BASH_SCRIPT
',
}
PUPPET_SCRIPT
}


if [ "$IS_MASTER" = true ]; then
  # Install miniconda (batch mode; installs to ~/miniconda2)
  wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh
  sh Miniconda2-latest-Linux-x86_64.sh -b
  export PATH=~/miniconda2/bin:$PATH
  conda create -y -n jupyter python=2.7
  source activate jupyter
  # Install other packages
  # TODO: make these configurable
  pip install --upgrade matplotlib pandas click variant-spark
  # Give the versioned variant-spark jar a stable name for downstream config.
  ln -s "/home/hadoop/miniconda2/envs/jupyter/lib/python2.7/site-packages/varspark/jars/variant-spark"*.jar "/home/hadoop/miniconda2/envs/jupyter/lib/python2.7/site-packages/varspark/jars/varspark.jar"
  # Install jupyter components
  pip install --upgrade jupyter==1.0.0 s3contents==0.1.4 decorator==4.2.1 notebook==5.7.0 juspark
  mkdir -p ~/.jupyter
  # Store notebooks in S3 rather than on the (ephemeral) cluster filesystem.
  cat >> ~/.jupyter/jupyter_notebook_config.py << EOF
# S3ContentsManager
from s3contents import S3ContentsManager
c.NotebookApp.contents_manager_class = S3ContentsManager
c.S3ContentsManager.bucket_name = "$BUCKET"
c.S3ContentsManager.prefix = "$PREFIX"
EOF

  # NOTE(review): empty token/password + remote access exposes the notebook
  # unauthenticated; presumably access is restricted by security groups —
  # confirm before reuse.
  cat >> ~/.jupyter/jupyter_notebook_config.py << EOF
c.NotebookApp.token = ''
c.NotebookApp.password = ''
c.NotebookApp.ip = '*'
c.NotebookApp.open_browser = False
c.NotebookApp.allow_remote_access = True
EOF

  # Setup JuSpark kernel
  mkdir -p ~/.local/share/jupyter/kernels/juspark
  cat > ~/.local/share/jupyter/kernels/juspark/kernel.json << EOF
{
"display_name": "JuSpark",
"language": "python",
"argv": [
"/home/hadoop/miniconda2/envs/jupyter/bin/python",
"-m",
"ipykernel",
"-f",
"{connection_file}",
"--ext=juspark"
]
}
EOF

  # Setup profiles for juspark: puts the hail jar/zip (fetched by the
  # install-hail bootstrap) on the Spark classpath/pythonpath.
  mkdir -p ~/.juspark/profiles
  cat > ~/.juspark/profiles/hail << EOF
{
"spark.jars":"/home/hadoop/hail-all-spark.jar",
"spark.submit.pyFiles":"/home/hadoop/hail-python.zip"
}
EOF

  # Install puppet modules
  sudo puppet module install spantree-upstart

  # Setup daemons
  upstart_jupyter
fi
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#!/bin/bash
set -e -x

# Used to sit in an s3 bucket and will call an arbitrary script, passing in version information
# This is because AWS EMR requires bootstrap actions to be in a bucket, rather than in a VCS.
# e.g. /path/to/this_script.sh --path-prefix "https://github.com/<user>/<repo>/raw/<tag>" --bootstrap-file "/path/to/version/controlled/script.sh" -- <script_arguments>
# Will pass --path-prefix to the called script, so it can reference the relevant version in some VCS.

BOOTSTRAP_LOCAL_FILE="/tmp/bootstrap.sh"

# Print an error to stderr and abort the bootstrap.
error_out ()
{
  >&2 echo "Error: $1"
  exit 1
}

# util-linux getopt normalises the long options; everything after "--" is
# left in the positional parameters and forwarded to the fetched script.
options=$(getopt -o '' --longoptions path-prefix:,bootstrap-file: -- "$@")
eval set -- "$options"

while true; do
  case "$1" in
    --path-prefix)
      shift
      PATH_PREFIX="$1"
      ;;
    --bootstrap-file)
      shift
      BOOTSTRAP_FILE="$1"
      ;;
    --)
      shift
      break
      ;;
    *)
      error_out "unrecognised option: $1"
      ;;
  esac
  shift
done

if [ -z "$PATH_PREFIX" ]; then
  error_out "missing required option: --path-prefix"
fi
if [ -z "$BOOTSTRAP_FILE" ]; then
  error_out "missing required option: --bootstrap-file"
fi

# Fetch the version-controlled bootstrap script and run it, forwarding the
# remaining (post "--") arguments. FIX: "$@" must be quoted — the original
# unquoted $@ word-split any forwarded argument containing whitespace.
curl -L --output "$BOOTSTRAP_LOCAL_FILE" "${PATH_PREFIX}${BOOTSTRAP_FILE}"
sudo chmod +x "$BOOTSTRAP_LOCAL_FILE"
"$BOOTSTRAP_LOCAL_FILE" --path-prefix "$PATH_PREFIX" "$@"

cloud/aws-emr/cf-templates/VariantSpark_Hail_EMR_Notebook.yaml

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
---
22
AWSTemplateFormatVersion: '2010-09-09'
33
Description: Variant Spark CF template to spin EMR clusters with Jupyter Notebook.
4+
Mappings:
5+
Constants:
6+
GitHub:
7+
Repo: "https://github.com/bhosking/VariantSpark"
8+
Version: "bootstrap-vcs"
49
Parameters:
510
# Hardware Group
611
MasterNodePricing:
@@ -115,19 +120,33 @@ Resources:
115120
- Name: Install Jupyter
116121
ScriptBootstrapAction:
117122
Args:
123+
- "--path-prefix"
124+
- !Sub
125+
- "${repo}/raw/${version}"
126+
- {repo: !FindInMap [Constants, GitHub, Repo], version: !FindInMap [Constants, GitHub, Version]}
127+
- "--bootstrap-file"
128+
- "/cloud/aws-emr/bootstrap/install-jupyter.sh"
129+
- "--"
118130
- "--notebookPath"
119131
- Ref: NotebookDir
120-
Path: "s3://variant-spark/HailJupyter/install-jupyter.sh"
132+
Path: "s3://variant-spark/s3-bootstrap.sh"
121133
- Name: Install Hail
122134
ScriptBootstrapAction:
123135
Args:
136+
- "--path-prefix"
137+
- !Sub
138+
- "${repo}/raw/${version}"
139+
- {repo: !FindInMap [Constants, GitHub, Repo], version: !FindInMap [Constants, GitHub, Version]}
140+
- "--bootstrap-file"
141+
- "/cloud/aws-emr/bootstrap/install-hail.sh"
142+
- "--"
124143
- "--input-path"
125144
- "s3://variant-spark/HailJupyter/hail/0.1_2.2.1"
126145
- "--hail-version"
127146
- "0.1"
128147
- "--spark-version"
129148
- "2.2.1"
130-
Path: "s3://variant-spark/HailJupyter/install-hail.sh"
149+
Path: "s3://variant-spark/s3-bootstrap.sh"
131150
Configurations:
132151
- Classification: emrfs-site
133152
ConfigurationProperties:

0 commit comments

Comments
 (0)