# Directory containing this script; job configs are looked up next to it.
DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# $1 is the job identifier; its configuration lives in <DIR>/<id>.json.
JOB_CONFIG="$DIR/$1.json"

# Extend PATH before the first jq call so the tool is resolved the same way
# as for every later call in this script.
# NOTE(review): presumably needed for minimal environments such as cron - confirm.
export PATH=/usr/bin/:/usr/local/bin/:$PATH

# Region and job name are needed by both the cluster path and the
# notification path, so read them once up front. Only read when the config
# exists; the existence check that follows reports a missing file.
REGION=
JOB_NAME=
if [ -f "$JOB_CONFIG" ]; then
  REGION=$(jq -r '.region' < "$JOB_CONFIG")
  JOB_NAME=$(jq -r '.job_name' < "$JOB_CONFIG")
fi

710
811if [ ! -f " $JOB_CONFIG " ]; then
@@ -11,13 +14,11 @@ if [ ! -f "$JOB_CONFIG" ]; then
1114fi
1215
1316if [ " $( jq -r ' .num_workers|type' < $JOB_CONFIG ) " == " number" ]; then # Spark cluster
14- REGION=$( jq -r ' .region' < " $JOB_CONFIG " )
1517 EMR_RELEASE=$( jq -r ' .emr_release' < " $JOB_CONFIG " )
1618 N_WORKERS=$( jq -r ' .num_workers' < " $JOB_CONFIG " )
1719 MASTER_TYPE=$( jq -r ' .master_instance_type' < " $JOB_CONFIG " )
1820 SLAVE_TYPE=$( jq -r ' .slave_instance_type' < " $JOB_CONFIG " )
1921 CLUSTER_NAME=$( jq -r ' .cluster_name' < " $JOB_CONFIG " )
20- JOB_NAME=$( jq -r ' .job_name' < " $JOB_CONFIG " )
2122 CODE=$( jq -r ' .code_uri' < " $JOB_CONFIG " )
2223 TIMEOUT=$( jq -r ' .timeout_minutes' < " $JOB_CONFIG " )
2324 DATA_BUCKET=$( jq -r ' .data_bucket' < " $JOB_CONFIG " )
@@ -48,29 +49,55 @@ if [ "$(jq -r '.num_workers|type' < $JOB_CONFIG)" == "number" ]; then # Spark cl
4849 --bootstrap-actions Path=s3://${EMR_BUCKET} /bootstrap/telemetry.sh,Args=\[ " --timeout" ," $TIMEOUT " \] \
4950 --configurations https://s3-${REGION} .amazonaws.com/${EMR_BUCKET} /configuration/configuration.json \
5051 --steps Type=CUSTOM_JAR,Name=CustomJAR,ActionOnFailure=TERMINATE_JOB_FLOW,Jar=s3://${REGION} .elasticmapreduce/libs/script-runner/script-runner.jar,Args=" $STEP_ARGS "
# Spark-cluster branch: capture the status of the cluster-creation command
# immediately above. This must stay the first statement after that command,
# otherwise $? would be overwritten.
EXIT_CODE=$?
if [ "$EXIT_CODE" -ne 0 ]; then
  # Error creating emr cluster. Notify owner.
  NOTIFY_SUBJECT="Scheduled Spark job '$JOB_NAME' encountered an error"
  # A plain multi-line string is used because `VAR=<< END` only redirects
  # stdin and would leave the variable empty.
  NOTIFY_BODY="Scheduled Telemetry Spark job '$JOB_NAME' exited with a code of $EXIT_CODE which
indicates it probably encountered an error."
fi
5161else
# Non-Spark branch: launch a worker through the telemetry-server provisioning
# module and turn its exit status into a notification subject/body.
# If the cd fails the python module cannot be found either, which yields a
# non-zero exit code and therefore an error notification below.
cd ~/telemetry-server
python -m provisioning.aws.launch_worker "$JOB_CONFIG"
EXIT_CODE=$?

if [ "$EXIT_CODE" -eq 2 ]; then
  # Job timed out. Notify owner.
  JOB_TIMEOUT=$(jq -r '.job_timeout_minutes' < "$JOB_CONFIG")
  NOTIFY_SUBJECT="Scheduled Telemetry job '$JOB_NAME' timed out"
  # Plain multi-line strings are used for the bodies because `VAR=<< END`
  # only redirects stdin and would leave the variable empty.
  NOTIFY_BODY="Scheduled Telemetry job '$JOB_NAME' was forcibly terminated after the configured
timeout ($JOB_TIMEOUT minutes)."
elif [ "$EXIT_CODE" -ne 0 ]; then
  # Error running job. Notify owner.
  NOTIFY_SUBJECT="Scheduled Telemetry job '$JOB_NAME' encountered an error"
  NOTIFY_BODY="Scheduled Telemetry job '$JOB_NAME' exited with code $EXIT_CODE which indicates
it probably encountered an error."
fi
82+ fi
83+
# Send the notification prepared by the branches above, if any.
# An empty/unset NOTIFY_SUBJECT means the job finished without a reportable
# condition and no mail is sent.
if [ -n "${NOTIFY_SUBJECT:-}" ]; then
  # `// empty` makes jq print nothing for a missing or null owner; plain
  # `jq -r` would print the literal string "null" and defeat the -z test.
  TO=$(jq -r '.job_owner // empty' < "$JOB_CONFIG")
  if [ -z "$TO" ]; then
    # Send to a default address if the owner name is missing from the config.
    # NOTE(review): FROM is not set in the visible part of this script -
    # presumably defined earlier or in the environment; confirm.
    TO=$FROM
    NOTIFY_SUBJECT="$NOTIFY_SUBJECT (and had no owner)"
  fi
  # Append the common footer. A plain multi-line string is used because
  # `VAR=<< END` only redirects stdin and would leave the variable empty.
  NOTIFY_BODY="$NOTIFY_BODY

You can review the job's details at http://analysis.telemetry.mozilla.org"
  aws ses send-email \
    --region "$REGION" \
    --from "$FROM" \
    --to "$TO" \
    --subject "$NOTIFY_SUBJECT" \
    --text "$NOTIFY_BODY"
fi
0 commit comments