Skip to content

Commit f5ada55

Browse files
committed
Add exponential backoff when pulling the node exporter and DCGM exporter container images
1 parent 7f4a505 commit f5ada55

File tree

2 files changed

+85
-24
lines changed

2 files changed

+85
-24
lines changed
Lines changed: 47 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1,40 +1,69 @@
11
#!/bin/bash
22

3-
# Define the container name
3+
# Identity of the exporter: the container name, the pinned image tag, and the
# full image reference assembled from that tag.
CONTAINER_NAME='dcgm-exporter'
DCGM_EXPORTER_VERSION='3.3.8-3.6.0-ubuntu22.04'
IMAGE="nvcr.io/nvidia/k8s/dcgm-exporter:$DCGM_EXPORTER_VERSION"

# Image-pull retry policy: attempt ceiling and the first wait, in seconds.
MAX_RETRIES=5
RETRY_DELAY=5
511

612
# Make sure no stale exporter container is in the way.
#
# If a container named exactly $CONTAINER_NAME is already running, there is
# nothing to do and the script exits 0. Otherwise any leftover container with
# that name is removed so the later `docker run --name` cannot hit a name
# conflict.
#
# The `name=` filter of `docker ps` matches substrings, so the names are
# printed one per line and compared as whole lines (grep -Fx); a container
# such as "<name>-old" no longer counts as "already running".
if docker ps --filter "name=${CONTAINER_NAME}" --filter "status=running" \
  --format '{{.Names}}' | grep -Fxq -- "${CONTAINER_NAME}"; then
  echo "Container $CONTAINER_NAME is already running."
  exit 0
else
  echo "Container $CONTAINER_NAME is not running or does not exist..."
  echo "Checking if $CONTAINER_NAME container exists but is not running. If yes, removing it..."
  # Only attempt (and report) a removal when such a container really exists.
  if docker container inspect "${CONTAINER_NAME}" > /dev/null 2>&1; then
    docker rm -f "${CONTAINER_NAME}" && echo "Container $CONTAINER_NAME has been removed."
  fi
  echo "Proceeding with script..."
fi
22+
23+
# Check for GPU, then proceed with script
24+
if nvidia-smi > /dev/null 2>&1; then
25+
echo "NVIDIA GPU found. Proceeding with script..."
1326

14-
# Check for GPU, then proceed with script
15-
if nvidia-smi > /dev/null 2>&1; then
16-
echo "NVIDIA GPU found. Proceeding with script..."
27+
# Get the instance-type from EC2 instance metadata (IMDSv2: fetch a session
# token first, then query the metadata path with it).
#
# -s  keeps curl's progress meter out of the logs (the token request used to
#     print it), -f makes HTTP errors fail instead of returning an error page
#     as the "instance type", and the timeouts bound the wait when the
#     metadata endpoint is unreachable.
# In the visible script INSTANCE_TYPE is only echoed, so a failed lookup
# degrades to "unknown" rather than aborting.
TOKEN=$(curl -sf --connect-timeout 2 --max-time 5 -X PUT \
  "http://169.254.169.254/latest/api/token" \
  -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") || TOKEN=""
INSTANCE_TYPE=$(curl -sf --connect-timeout 2 --max-time 5 \
  -H "X-aws-ec2-metadata-token: ${TOKEN}" \
  "http://169.254.169.254/latest/meta-data/instance-type") || INSTANCE_TYPE=""
INSTANCE_TYPE="${INSTANCE_TYPE:-unknown}"
1730

18-
# Get the instance-type from EC2 instance metadata
19-
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
20-
INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/instance-type)
21-
22-
DCGM_EXPORTER_VERSION=3.3.8-3.6.0-ubuntu22.04
31+
echo "Instance Type is recognized as $INSTANCE_TYPE, setting DCGM_EXPORTER_VERSION to $DCGM_EXPORTER_VERSION"
2332

24-
echo "Instance Type is recognized as $INSTANCE_TYPE, setting DCGM_EXPORTER_VERSION to $DCGM_EXPORTER_VERSION"
33+
# Pull $IMAGE with retries and exponential backoff.
#
# Makes at most MAX_RETRIES attempts. After every failed attempt except the
# last one it sleeps, then doubles the wait (RETRY_DELAY, 2x, 4x, ...).
# Exits 1 when no attempt succeeded.
#
# The attempt counter is 1-based so the log reads "1/5 ... 5/5" (it used to
# start at "0/5"), and the backoff uses its own variable so RETRY_DELAY keeps
# its configured value.
attempt=1
delay=$RETRY_DELAY
pulled=false
while [ "$attempt" -le "$MAX_RETRIES" ]; do
  echo "Attempting to pull image ($attempt/$MAX_RETRIES)..."
  if sudo docker pull "$IMAGE"; then
    echo "Successfully pulled image."
    pulled=true
    break
  fi
  if [ "$attempt" -lt "$MAX_RETRIES" ]; then
    echo "Pull failed. Retrying in $delay seconds..."
    sleep "$delay"
    delay=$((delay * 2)) # Exponential backoff
  fi
  attempt=$((attempt + 1))
done

# Reached without a successful pull: every attempt failed (or none was made).
if [ "$pulled" != true ]; then
  echo "Failed to pull Docker image after $MAX_RETRIES attempts. Exiting..."
  exit 1
fi
2552

26-
# Run the DCGM Exporter Docker container
27-
sudo docker run -d --restart always \
53+
# Run the DCGM Exporter Docker container.
#
# Started detached with restart policy "always", on the host network, with
# all GPUs exposed and the SYS_ADMIN capability added. Everything after the
# image reference is passed to the container as its arguments (here: -f with
# the path of the metrics CSV). Expansions are quoted so a name or image
# reference can never be word-split or globbed.
# Exits 1 when `docker run` fails.
if sudo docker run -d --restart always \
  --name "$CONTAINER_NAME" \
  --gpus all \
  --net host \
  --cap-add SYS_ADMIN \
  "$IMAGE" \
  -f /etc/dcgm-exporter/dcp-metrics-included.csv; then
  echo "Running DCGM exporter in a Docker container on port 9400..."
else
  echo "Failed to run DCGM Exporter Docker container"
  exit 1
fi
40-
fi
66+
else
67+
echo "NVIDIA GPU not found. DCGM Exporter was not installed. If this is a controller node, you can safely ignore this warning. Exiting gracefully..."
68+
exit 0
69+
fi

1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/install_efa_node_exporter.sh

Lines changed: 38 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2,22 +2,54 @@
22

33
# Identity of the exporter: container name and the image it is started from.
CONTAINER_NAME='efa-node-exporter'
IMAGE='public.ecr.aws/hpc-cloud/efa-node-exporter:latest'

# Image-pull retry policy: attempt ceiling and the first wait, in seconds.
MAX_RETRIES=5
RETRY_DELAY=5
510

611
# Make sure no stale exporter container is in the way.
#
# If a container named exactly $CONTAINER_NAME is already running, there is
# nothing to do and the script exits 0. Otherwise any leftover container with
# that name is removed so the later `docker run --name` cannot hit a name
# conflict.
#
# The `name=` filter of `docker ps` matches substrings, so the names are
# printed one per line and compared as whole lines (grep -Fx); a container
# such as "<name>-old" no longer counts as "already running".
if docker ps --filter "name=${CONTAINER_NAME}" --filter "status=running" \
  --format '{{.Names}}' | grep -Fxq -- "${CONTAINER_NAME}"; then
  echo "Container $CONTAINER_NAME is already running."
  exit 0
else
  echo "Container $CONTAINER_NAME is not running or does not exist..."
  echo "Checking if $CONTAINER_NAME container exists but is not running. If yes, removing it..."
  # Only attempt (and report) a removal when such a container really exists.
  if docker container inspect "${CONTAINER_NAME}" > /dev/null 2>&1; then
    docker rm -f "${CONTAINER_NAME}" && echo "Container $CONTAINER_NAME has been removed."
  fi
  echo "Proceeding with script..."
fi
21+
22+
# Pull $IMAGE with retries and exponential backoff.
#
# Makes at most MAX_RETRIES attempts. After every failed attempt except the
# last one it sleeps, then doubles the wait (RETRY_DELAY, 2x, 4x, ...).
# Exits 1 when no attempt succeeded.
#
# The attempt counter is 1-based so the log reads "1/5 ... 5/5" (it used to
# start at "0/5"), and the backoff uses its own variable so RETRY_DELAY keeps
# its configured value.
attempt=1
delay=$RETRY_DELAY
pulled=false
while [ "$attempt" -le "$MAX_RETRIES" ]; do
  echo "Attempting to pull image ($attempt/$MAX_RETRIES)..."
  if sudo docker pull "$IMAGE"; then
    echo "Successfully pulled image."
    pulled=true
    break
  fi
  if [ "$attempt" -lt "$MAX_RETRIES" ]; then
    echo "Pull failed. Retrying in $delay seconds..."
    sleep "$delay"
    delay=$((delay * 2)) # Exponential backoff
  fi
  attempt=$((attempt + 1))
done

# Reached without a successful pull: every attempt failed (or none was made).
if [ "$pulled" != true ]; then
  echo "Failed to pull Docker image after $MAX_RETRIES attempts. Exiting..."
  exit 1
fi
41+
42+
# Run the Docker container with appropriate configurations.
#
# Started detached with restart policy "always", sharing the host's network
# and PID namespaces, with the host root filesystem bind-mounted read-only at
# /host. Everything after the image reference is passed to the container as
# its arguments (here: --path.rootfs=/host). Expansions are quoted so a name
# or image reference can never be word-split or globbed.
# Exits 0 on success and 1 when `docker run` fails.
if sudo docker run -d --restart always \
  --name="$CONTAINER_NAME" \
  --net="host" \
  --pid="host" \
  -v "/:/host:ro,rslave" \
  "$IMAGE" \
  --path.rootfs=/host; then
  echo "Successfully started EFA Node Exporter on node"
  exit 0
else
  echo "Failed to run Docker container"
  exit 1
fi

0 commit comments

Comments
 (0)