Skip to content

Commit f6286d3

Browse files
committed
Notify by email on job failures via aws cli.
Update to notify on failed Spark jobs too, and expand non-Spark notification to include any error, not just timeouts.
1 parent 008fc2e commit f6286d3

File tree

1 file changed

+42
-15
lines changed
  • http/analysis-service/jobs

1 file changed

+42
-15
lines changed

http/analysis-service/jobs/run.sh

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
# Directory containing this script; job config files are looked up next to it.
DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
# The first positional argument names the job; its config is "<name>.json".
JOB_CONFIG="$DIR/$1.json"

# Extend PATH before the first jq call so tools installed in these
# directories can be found even when the inherited PATH is minimal.
export PATH=/usr/bin/:/usr/local/bin/:$PATH

# REGION and JOB_NAME are needed by both job types and by the failure
# notification, so they are read once up front. Only read them when the
# config exists: the existence check that follows this block reports a
# missing file, and reading first would just emit redirect errors.
REGION=""
JOB_NAME=""
if [ -f "$JOB_CONFIG" ]; then
    REGION=$(jq -r '.region' < "$JOB_CONFIG")
    JOB_NAME=$(jq -r '.job_name' < "$JOB_CONFIG")
fi
710

811
if [ ! -f "$JOB_CONFIG" ]; then
@@ -11,13 +14,11 @@ if [ ! -f "$JOB_CONFIG" ]; then
1114
fi
1215

1316
if [ "$(jq -r '.num_workers|type' < $JOB_CONFIG)" == "number" ]; then # Spark cluster
14-
REGION=$(jq -r '.region' < "$JOB_CONFIG")
1517
EMR_RELEASE=$(jq -r '.emr_release' < "$JOB_CONFIG")
1618
N_WORKERS=$(jq -r '.num_workers' < "$JOB_CONFIG")
1719
MASTER_TYPE=$(jq -r '.master_instance_type' < "$JOB_CONFIG")
1820
SLAVE_TYPE=$(jq -r '.slave_instance_type' < "$JOB_CONFIG")
1921
CLUSTER_NAME=$(jq -r '.cluster_name' < "$JOB_CONFIG")
20-
JOB_NAME=$(jq -r '.job_name' < "$JOB_CONFIG")
2122
CODE=$(jq -r '.code_uri' < "$JOB_CONFIG")
2223
TIMEOUT=$(jq -r '.timeout_minutes' < "$JOB_CONFIG")
2324
DATA_BUCKET=$(jq -r '.data_bucket' < "$JOB_CONFIG")
@@ -48,29 +49,55 @@ if [ "$(jq -r '.num_workers|type' < $JOB_CONFIG)" == "number" ]; then # Spark cl
4849
--bootstrap-actions Path=s3://${EMR_BUCKET}/bootstrap/telemetry.sh,Args=\["--timeout","$TIMEOUT"\] \
4950
--configurations https://s3-${REGION}.amazonaws.com/${EMR_BUCKET}/configuration/configuration.json \
5051
--steps Type=CUSTOM_JAR,Name=CustomJAR,ActionOnFailure=TERMINATE_JOB_FLOW,Jar=s3://${REGION}.elasticmapreduce/libs/script-runner/script-runner.jar,Args="$STEP_ARGS"
52+
# Capture the exit status of the command that immediately precedes this block.
EXIT_CODE=$?
if [ "$EXIT_CODE" -ne 0 ]; then
    # Error creating emr cluster. Notify owner.
    NOTIFY_SUBJECT="Scheduled Spark job '$JOB_NAME' encountered an error"
    # Build the body with a command substitution. "VAR=<<END" would leave the
    # variable empty, because the here-document becomes stdin of the
    # assignment rather than its value.
    NOTIFY_BODY=$(printf '%s\n%s' \
        "Scheduled Telemetry Spark job '$JOB_NAME' exited with a code of $EXIT_CODE which" \
        "indicates it probably encountered an error.")
fi
5161
else
5262
# Non-Spark job: launch it through the worker provisioning module and record
# its exit status so that a failure can be reported by email afterwards.
cd ~/telemetry-server
python -m provisioning.aws.launch_worker "$JOB_CONFIG"
EXIT_CODE=$?

# The notification text is built with command substitutions. "VAR=<<END"
# would leave the variable empty, because the here-document becomes stdin of
# the assignment rather than its value.
if [ "$EXIT_CODE" -eq 2 ]; then
    # Job timed out. Notify owner.
    JOB_TIMEOUT=$(jq -r '.job_timeout_minutes' < "$JOB_CONFIG")
    NOTIFY_SUBJECT="Scheduled Telemetry job '$JOB_NAME' timed out"
    NOTIFY_BODY=$(printf '%s\n%s' \
        "Scheduled Telemetry job '$JOB_NAME' was forcibly terminated after the configured" \
        "timeout ($JOB_TIMEOUT minutes).")
elif [ "$EXIT_CODE" -ne 0 ]; then
    # Error running job. Notify owner.
    NOTIFY_SUBJECT="Scheduled Telemetry job '$JOB_NAME' encountered an error"
    NOTIFY_BODY=$(printf '%s\n%s' \
        "Scheduled Telemetry job '$JOB_NAME' exited with code $EXIT_CODE which indicates" \
        "it probably encountered an error.")
fi
82+
fi
83+
84+
# Email a failure notice when one of the job branches set NOTIFY_SUBJECT.
if [ -n "$NOTIFY_SUBJECT" ]; then
    # "// empty" makes jq print nothing, instead of the literal string
    # "null", when the config has no job_owner, so the fallback below can
    # actually trigger.
    TO=$(jq -r '.job_owner // empty' < "$JOB_CONFIG")
    if [ -z "$TO" ]; then
        # Send to a default address if the owner name is missing from the config.
        # NOTE(review): FROM is not assigned in the visible part of this
        # script; presumably it comes from the environment - confirm.
        TO=$FROM
        NOTIFY_SUBJECT="$NOTIFY_SUBJECT (and had no owner)"
    fi

    # Append the footer with a command substitution. "VAR=<<END" would leave
    # the variable empty, because the here-document becomes stdin of the
    # assignment rather than its value.
    NOTIFY_BODY=$(printf '%s\n\n%s' \
        "$NOTIFY_BODY" \
        "You can review the job's details at http://analysis.telemetry.mozilla.org")

    aws ses send-email \
        --region "$REGION" \
        --from "$FROM" \
        --to "$TO" \
        --subject "$NOTIFY_SUBJECT" \
        --text "$NOTIFY_BODY"
fi

0 commit comments

Comments
 (0)