Skip to content

Commit 9fae22d

Browse files
authored
Release 2.3.1
Merge Release 2.3.1
2 parents 3f908d6 + 6dda05d commit 9fae22d

29 files changed

+1316
-1173
lines changed

CHANGELOG.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,35 @@ aws-parallelcluster-node CHANGELOG
33

44
This file is used to list changes made in each version of the aws-parallelcluster-node package.
55

6+
2.3.1
7+
-----
8+
9+
**BUG FIXES**
10+
- `sqswatcher`: Slurm - Fix host removal
11+
12+
13+
2.3.0
14+
-----
15+
16+
**CHANGES**
17+
- `sqswatcher`: Slurm - dynamically adjust max cluster size based on ASG settings
18+
- `sqswatcher`: Slurm - use FUTURE state for dummy nodes to prevent Slurm daemon from contacting unexisting nodes
19+
- `sqswatcher`: Slurm - dynamically change the number of configured FUTURE nodes based on the actual nodes that
20+
join the cluster. The max size of the cluster seen by the scheduler always matches the max capacity of the ASG.
21+
- `sqswatcher`: Slurm - process nodes added to or removed from the cluster in batches. This speeds up cluster scaling
22+
which is able to react with a delay of less than 1 minute to variations in the ASG capacity.
23+
- `sqswatcher`: Slurm - add support for job dependencies and pending reasons. The cluster won't scale up if the job
24+
cannot start due to an unsatisfied dependency.
25+
- Slurm - set `ReturnToService=1` in scheduler config in order to recover instances that were initially marked as down
26+
due to a transient issue.
27+
- `sqswatcher`: remove DynamoDB table creation
28+
- improve and standardize shell command execution
29+
- add retries on failures and exceptions
30+
31+
**BUG FIXES**
32+
- `sqswatcher`: Slurm - set compute nodes to DRAIN state before removing them from cluster. This prevents the scheduler
33+
from submitting a job to a node that is being terminated.
34+
635
2.2.1
736
-----
837

common/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
4+
# with the License. A copy of the License is located at
5+
#
6+
# http://aws.amazon.com/apache2.0/
7+
#
8+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
9+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
10+
# limitations under the License.

common/sge.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/usr/bin/env python2.6
2+
3+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
from common.utils import check_command_output, run_command
16+
17+
SGE_ROOT = "/opt/sge"
18+
SGE_BIN_PATH = SGE_ROOT + "/bin/lx-amd64"
19+
SGE_BIN_DIR = SGE_BIN_PATH + "/"
20+
SGE_ENV = {"SGE_ROOT": SGE_ROOT, "PATH": "{0}/bin:{1}:/bin:/usr/bin".format(SGE_ROOT, SGE_BIN_PATH)}
21+
22+
23+
def check_sge_command_output(command, log):
24+
"""
25+
Execute SGE shell command, by exporting the appropriate environment.
26+
27+
:param command: command to execute
28+
:param log: logger
29+
:raise: subprocess.CalledProcessError if the command fails
30+
"""
31+
command = _prepend_sge_bin_dir(command)
32+
return check_command_output(command, log, SGE_ENV)
33+
34+
35+
def run_sge_command(command, log):
36+
"""
37+
Execute SGE shell command, by exporting the appropriate environment.
38+
39+
:param command: command to execute
40+
:param log: logger
41+
:raise: subprocess.CalledProcessError if the command fails
42+
"""
43+
command = _prepend_sge_bin_dir(command)
44+
run_command(command, log, SGE_ENV)
45+
46+
47+
def _prepend_sge_bin_dir(command):
48+
if isinstance(command, str) or isinstance(command, unicode):
49+
command = SGE_BIN_DIR + command
50+
else:
51+
command[0] = SGE_BIN_DIR + command[0]
52+
53+
return command

common/slurm.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/usr/bin/env python2.6
2+
3+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
PENDING_RESOURCES_REASONS = [
16+
"Resources",
17+
"Nodes required for job are DOWN, DRAINED or reserved for jobs in higher priority partitions"
18+
]

common/utils.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
#!/usr/bin/env python2.6
2+
3+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
import os
16+
import shlex
17+
import subprocess
18+
import sys
19+
20+
import boto3
21+
from future.moves.subprocess import check_output
22+
from retrying import retry
23+
24+
25+
class CriticalError(Exception):
26+
"""Critical error for the daemon."""
27+
pass
28+
29+
30+
def load_module(module):
31+
"""
32+
Load python module.
33+
34+
:param module: module path, relative to the caller one.
35+
:return: the loaded scheduler module
36+
"""
37+
# import module
38+
__import__(module)
39+
# get module from the loaded maps
40+
scheduler_module = sys.modules[module]
41+
return scheduler_module
42+
43+
44+
@retry(
45+
stop_max_attempt_number=5,
46+
wait_exponential_multiplier=10000,
47+
wait_exponential_max=80000,
48+
retry_on_exception=lambda exception: isinstance(exception, IndexError)
49+
)
50+
def get_asg_name(stack_name, region, proxy_config, log):
51+
"""
52+
Get autoscaling group name associated to the given stack.
53+
54+
:param stack_name: stack name to search for
55+
:param region: AWS region
56+
:param proxy_config: Proxy configuration
57+
:param log: logger
58+
:raise ASGNotFoundError if the ASG is not found (after the timeout) or if an unexpected error occurs
59+
:return: the ASG name
60+
"""
61+
asg_client = boto3.client("autoscaling", region_name=region, config=proxy_config)
62+
try:
63+
response = asg_client.describe_tags(Filters=[{"Name": "Value", "Values": [stack_name]}])
64+
asg_name = response.get("Tags")[0].get("ResourceId")
65+
log.info("ASG %s found for the stack %s", asg_name, stack_name)
66+
return asg_name
67+
except IndexError:
68+
log.warning("Unable to get ASG for stack %s", stack_name)
69+
raise
70+
except Exception as e:
71+
raise CriticalError("Unable to get ASG for stack {0}. Failed with exception: {1}".format(stack_name, e))
72+
73+
74+
def get_asg_settings(region, proxy_config, asg_name, log):
75+
try:
76+
asg_client = boto3.client("autoscaling", region_name=region, config=proxy_config)
77+
asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get('AutoScalingGroups')[0]
78+
min_size = asg.get('MinSize')
79+
desired_capacity = asg.get('DesiredCapacity')
80+
max_size = asg.get('MaxSize')
81+
82+
log.info("min/desired/max %d/%d/%d" % (min_size, desired_capacity, max_size))
83+
return min_size, desired_capacity, max_size
84+
except Exception as e:
85+
log.error("Failed when retrieving data for ASG %s with exception %s", asg_name, e)
86+
raise
87+
88+
89+
def check_command_output(command, log, env=None, raise_on_error=True):
90+
"""
91+
Execute shell command and retrieve command output.
92+
93+
:param command: command to execute
94+
:param env: a dictionary containing environment variables
95+
:param log: logger
96+
:param raise_on_error: True to raise subprocess.CalledProcessError on errors
97+
:return: the command output
98+
:raise: subprocess.CalledProcessError if the command fails
99+
"""
100+
return _run_command(
101+
lambda _command, _env: check_output(_command, env=_env, stderr=subprocess.STDOUT, universal_newlines=True),
102+
command,
103+
log,
104+
env,
105+
raise_on_error,
106+
)
107+
108+
109+
def run_command(command, log, env=None, raise_on_error=True):
110+
"""
111+
Execute shell command.
112+
113+
:param command: command to execute
114+
:param env: a dictionary containing environment variables
115+
:param log: logger
116+
:param raise_on_error: True to raise subprocess.CalledProcessError on errors
117+
:raise: subprocess.CalledProcessError if the command fails
118+
"""
119+
_run_command(lambda _command, _env: subprocess.check_call(_command, env=_env), command, log, env, raise_on_error)
120+
121+
122+
def _run_command(command_function, command, log, env=None, raise_on_error=True):
123+
try:
124+
if isinstance(command, str) or isinstance(command, unicode):
125+
command = shlex.split(command.encode("ascii"))
126+
if env is None:
127+
env = {}
128+
129+
env.update(os.environ.copy())
130+
log.debug("Executing command: %s" % command)
131+
return command_function(command, env)
132+
except subprocess.CalledProcessError as e:
133+
# CalledProcessError.__str__ already produces a significant error message
134+
if raise_on_error:
135+
log.error(e)
136+
raise
137+
else:
138+
log.warning(e)
139+
return None
140+
except OSError as e:
141+
log.error("Unable to execute the command %s. Failed with exception: %s", command, e)
142+
raise

0 commit comments

Comments
 (0)