#!/bin/bash
#
# Start the NVIDIA DCGM exporter as a Docker container on GPU nodes.
#
# Flow:
#   1. If a container named exactly $CONTAINER_NAME is already running,
#      exit 0 without touching it.
#   2. Otherwise force-remove any leftover container with that name.
#   3. If `nvidia-smi` does not succeed, print a notice and exit 0
#      (nothing is installed on nodes without a usable GPU).
#   4. Pull $IMAGE, retrying up to $MAX_RETRIES times with a delay that
#      starts at $RETRY_DELAY seconds and doubles after each failure.
#   5. Run the exporter container; exit 1 if the pull or the run fails.
#
# NOTE(review): `docker ps` / `docker rm` are invoked without sudo while
# `docker pull` / `docker run` use sudo, exactly as before this rewrite.
# Presumably the script runs as root or as a docker-group member - confirm.

# Abort on use of an unset variable. `pipefail` is intentionally not
# enabled: no pipeline in this script needs it.
set -u

# Container name and image version
readonly CONTAINER_NAME="dcgm-exporter"
readonly DCGM_EXPORTER_VERSION="3.3.8-3.6.0-ubuntu22.04"
readonly IMAGE="nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_VERSION}"

# Maximum number of pull attempts and initial delay (seconds) between them
readonly MAX_RETRIES=5
readonly RETRY_DELAY=5

#######################################
# Test whether a container named exactly $CONTAINER_NAME is listed by
# `docker ps`.
# Globals:   CONTAINER_NAME (read)
# Arguments: extra `docker ps` options (e.g. --all, --filter status=running)
# Returns:   0 if listed, non-zero if not listed or `docker ps` failed
#######################################
container_listed() {
  local names
  # The name filter matches substrings, so print only the names column and
  # require an exact whole-line match below.
  names=$(docker ps "$@" --filter "name=${CONTAINER_NAME}" \
    --format '{{.Names}}') || return 1
  grep -Fxq -- "${CONTAINER_NAME}" <<<"${names}"
}

#######################################
# Pull $IMAGE, doubling the wait after every failed attempt.
# Globals:   IMAGE, MAX_RETRIES, RETRY_DELAY (read)
# Outputs:   progress on stdout, final failure message on stderr
# Returns:   0 once a pull succeeds, 1 after MAX_RETRIES failed attempts
#######################################
pull_image_with_retry() {
  local attempt
  local delay=${RETRY_DELAY}

  for (( attempt = 1; attempt <= MAX_RETRIES; attempt++ )); do
    echo "Attempting to pull image (${attempt}/${MAX_RETRIES})..."
    if sudo docker pull "${IMAGE}"; then
      echo "Successfully pulled image."
      return 0
    fi
    # Only wait when another attempt will follow.
    if (( attempt < MAX_RETRIES )); then
      echo "Pull failed. Retrying in ${delay} seconds..."
      sleep "${delay}"
      delay=$(( delay * 2 )) # Exponential backoff
    fi
  done

  echo "Failed to pull Docker image after ${MAX_RETRIES} attempts. Exiting..." >&2
  return 1
}

main() {
  # Check if the container exists and is running
  if container_listed --filter "status=running"; then
    echo "Container ${CONTAINER_NAME} is already running."
    exit 0
  fi

  echo "Container ${CONTAINER_NAME} is not running or does not exist..."
  echo "Checking if ${CONTAINER_NAME} container exists but is not running. If yes, removing it..."
  if container_listed --all; then
    docker rm -f "${CONTAINER_NAME}" \
      && echo "Container ${CONTAINER_NAME} has been removed."
  fi
  echo "Proceeding with script..."

  # Check for GPU, then proceed with script
  if ! nvidia-smi >/dev/null 2>&1; then
    echo "NVIDIA GPU not found. DCGM Exporter was not installed. If this is a controller node, you can safely ignore this warning. Exiting gracefully..."
    exit 0
  fi
  echo "NVIDIA GPU found. Proceeding with script..."

  # Get the instance-type from EC2 instance metadata. The value is only
  # logged, so a failed or slow metadata call must not abort or stall the
  # installation: bound each request and fall back to "unknown".
  local imds="http://169.254.169.254/latest"
  TOKEN=$(curl -sf --max-time 5 -X PUT "${imds}/api/token" \
    -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") || TOKEN=""
  INSTANCE_TYPE=$(curl -sf --max-time 5 \
    -H "X-aws-ec2-metadata-token: ${TOKEN}" \
    "${imds}/meta-data/instance-type") || INSTANCE_TYPE=""
  echo "Instance Type is recognized as ${INSTANCE_TYPE:-unknown}, setting DCGM_EXPORTER_VERSION to ${DCGM_EXPORTER_VERSION}"

  # Retry logic for pulling the image
  pull_image_with_retry || exit 1

  # Run the DCGM Exporter Docker container
  if sudo docker run -d --restart always \
    --name "${CONTAINER_NAME}" \
    --gpus all \
    --net host \
    --cap-add SYS_ADMIN \
    "${IMAGE}" \
    -f /etc/dcgm-exporter/dcp-metrics-included.csv; then
    echo "Running DCGM exporter in a Docker container on port 9400..."
  else
    echo "Failed to run DCGM Exporter Docker container" >&2
    exit 1
  fi
}

main "$@"