
Commit 5004513

Merge branch 'develop' into wip/pcluster3

Signed-off-by: Enrico Usai <[email protected]>

# Conflicts:
#	CHANGELOG.md
#	amis/build_ami.sh
#	amis/packer_alinux2.json
#	amis/packer_centos7.json
#	amis/packer_centos8.json
#	amis/packer_ubuntu1804.json
#	amis/packer_ubuntu2004.json
#	amis/packer_variables.json
#	attributes/conditions.rb
#	attributes/default.rb
#	files/default/configure-pat.sh
#	libraries/helpers.rb
#	metadata.rb
#	recipes/cluster_admin_user_install.rb
#	recipes/fsx_mount.rb
#	recipes/sge_install.rb
#	recipes/slurm_install.rb
#	recipes/tests.rb
#	templates/default/compute_ready.erb
#	templates/default/slurm/slurm.conf.erb

2 parents: 29cf9db + 22c7e21

File tree: 7 files changed, +115 / -46 lines

.github/workflows/codeql-analysis.yml

Lines changed: 30 additions & 0 deletions
```diff
@@ -0,0 +1,30 @@
+name: "CodeQL"
+
+on:
+  push:
+  pull_request:
+  schedule:
+    - cron: '0 10 * * 2'
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ 'python' ]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v1
+        with:
+          languages: ${{ matrix.language }}
+          queries: +security-and-quality
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v1
```

CHANGELOG.md

Lines changed: 18 additions & 4 deletions
```diff
@@ -12,19 +12,33 @@ This file is used to list changes made in each version of the aws-parallelcluste
 - Use tags prefix `parallelcluster:`.
 - Run Slurm command `scontrol` with sudo because clustermgtd is run as cluster admin user (not root).
 - Implement `computemgtd` self-termination via `shutdown` command instead of calling TerminateInstances.
+- Implement scaling protection mechanism with Slurm scheduler: compute fleet is automatically set to 'PROTECTED' state
+  in case recurrent failures are encountered when provisioning nodes.
 
-2.x.x
+2.11.0
 -----
 
 **ENHANCEMENTS**
-- SGE: make `qstat` command in nodewatcher more robust in case a custom DHCP option set is configured.
+- SGE: always use shortname as hostname filter with `qstat`. This will make nodewatcher more robust when using custom DHCP option, where the full hostname seen by `SGE` might differ from the hostname returned from EC2 metadata(local-hostname).
 - Transition from IMDSv1 to IMDSv2.
-- Implement scaling protection mechanism with Slurm scheduler: compute fleet is automatically set to 'PROTECTED' state
-  in case recurrent failures are encountered when provisioning nodes.
+- Have `computemgtd` reuse last available daemon configuration when the new one cannot be loaded.
+- Use methods with timeouts to read NFS shared files, which will prevent `computemgtd` from hanging when NFS filesystems are not available.
 
 **BUG FIXES**
 - Fix a bug that caused `clustermgtd` to not immediately replace instances with failed status check that are in replacement process.
 
+2.10.4
+-----
+
+**CHANGES**
+- There were no notable changes for this version.
+
+2.10.3
+-----
+
+**CHANGES**
+- There were no notable changes for this version.
+
 2.10.2
 -----
 
```

src/slurm_plugin/common.py

Lines changed: 22 additions & 10 deletions
```diff
@@ -15,13 +15,14 @@
 import logging
 from datetime import datetime
 
-from common.utils import time_is_up
+from common.utils import check_command_output, time_is_up
 
 logger = logging.getLogger(__name__)
 
 # timestamp used by clustermgtd and computemgtd should be in default ISO format
 # YYYY-MM-DDTHH:MM:SS.ffffff+HH:MM[:SS[.ffffff]]
 TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f%z"
+DEFAULT_COMMAND_TIMEOUT = 30
 
 
 def log_exception(
@@ -72,16 +73,27 @@ def retrieve_instance_type_mapping(file_path):
         raise
 
 
-def _get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path):
+def get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path):
     """Get clustermgtd's last heartbeat."""
-    with open(clustermgtd_heartbeat_file_path, "r") as timestamp_file:
-        # Note: heartbeat must be written with datetime.strftime to convert localized datetime into str
-        # datetime.strptime will not work with str(datetime)
-        # Example timestamp written to heartbeat file: 2020-07-30 19:34:02.613338+00:00
-        return datetime.strptime(timestamp_file.read().strip(), TIMESTAMP_FORMAT)
+    # Use subprocess based method to read shared file to prevent hanging when NFS is down
+    # Do not copy to local. Different users need to access the file, but file should be writable by root only
+    # Only use last line of output to avoid taking unexpected output in stdout
+    heartbeat = (
+        check_command_output(
+            f"cat {clustermgtd_heartbeat_file_path}",
+            timeout=DEFAULT_COMMAND_TIMEOUT,
+            shell=True,  # nosec
+        )
+        .splitlines()[-1]
+        .strip()
+    )
+    # Note: heartbeat must be written with datetime.strftime to convert localized datetime into str
+    # datetime.strptime will not work with str(datetime)
+    # Example timestamp written to heartbeat file: 2020-07-30 19:34:02.613338+00:00
+    return datetime.strptime(heartbeat, TIMESTAMP_FORMAT)
 
 
-def _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout):
+def expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout):
     """Test if clustermgtd heartbeat is expired."""
     if time_is_up(last_heartbeat, current_time, clustermgtd_timeout):
         logger.error(
@@ -96,9 +108,9 @@ def _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout):
 
 def is_clustermgtd_heartbeat_valid(current_time, clustermgtd_timeout, clustermgtd_heartbeat_file_path):
     try:
-        last_heartbeat = _get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path)
+        last_heartbeat = get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path)
         logger.info("Latest heartbeat from clustermgtd: %s", last_heartbeat)
-        return not _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout)
+        return not expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout)
     except Exception as e:
         logger.error("Unable to retrieve clustermgtd heartbeat with exception: %s", e)
         return False
```
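Why go through a subprocess to read a file at all? A plain `open()`/`read()` on a hard NFS mount can block inside the kernel indefinitely when the filesystem is unavailable, and no Python-level timeout can interrupt it; a child process doing the read can instead be abandoned after a timeout. Below is a minimal sketch of the idea. Note that `check_command_output` lives in `common.utils` and its implementation is not part of this diff, so `read_shared_file` is a hypothetical stand-in, not the real helper:

```python
import subprocess
from datetime import datetime, timezone

TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f%z"
DEFAULT_COMMAND_TIMEOUT = 30


def read_shared_file(path, timeout=DEFAULT_COMMAND_TIMEOUT):
    """Hypothetical stand-in for check_command_output: read a file via a child process.

    If the NFS mount hangs, subprocess.run raises TimeoutExpired after `timeout`
    seconds instead of blocking the calling daemon forever.
    """
    result = subprocess.run(
        ["cat", path],  # argument-list form; the real helper uses shell=True with a nosec marker
        capture_output=True,
        text=True,
        timeout=timeout,
        check=True,
    )
    return result.stdout


def parse_heartbeat(stdout):
    """Keep only the last stdout line before parsing, as get_clustermgtd_heartbeat does."""
    return datetime.strptime(stdout.splitlines()[-1].strip(), TIMESTAMP_FORMAT)


# Round trip: heartbeats are written with strftime so strptime can parse them back.
stamp = datetime.now(tz=timezone.utc).strftime(TIMESTAMP_FORMAT)
print(parse_heartbeat(f"unexpected banner output\n{stamp}"))
```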

src/slurm_plugin/computemgtd.py

Lines changed: 34 additions & 14 deletions
```diff
@@ -23,7 +23,12 @@
 from common.time_utils import seconds
 from common.utils import get_metadata, run_command, sleep_remaining_loop_time
 from retrying import retry
-from slurm_plugin.common import is_clustermgtd_heartbeat_valid, log_exception
+from slurm_plugin.common import (
+    DEFAULT_COMMAND_TIMEOUT,
+    expired_clustermgtd_heartbeat,
+    get_clustermgtd_heartbeat,
+    log_exception,
+)
 from slurm_plugin.slurm_resources import CONFIG_FILE_DIR
 
 LOOP_TIME = 60
@@ -54,14 +59,20 @@ def __repr__(self):
         attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()])
         return "{class_name}({attrs})".format(class_name=self.__class__.__name__, attrs=attrs)
 
-    @log_exception(log, "reading computemgtd config", catch_exception=IOError, raise_on_error=True)
+    @log_exception(log, "reading computemgtd config", catch_exception=Exception, raise_on_error=True)
     def _get_config(self, config_file_path):
         """Get computemgtd configuration."""
         log.info("Reading %s", config_file_path)
         config = ConfigParser()
         try:
-            config.read_file(open(config_file_path, "r"))
-        except IOError:
+            # Use subprocess based method to copy shared file to local to prevent hanging when NFS is down
+            run_command(
+                f"cat {config_file_path} > {CONFIG_FILE_DIR}/.computemgtd_config.local",
+                timeout=DEFAULT_COMMAND_TIMEOUT,
+                shell=True,  # nosec
+            )
+            config.read_file(open(f"{CONFIG_FILE_DIR}/.computemgtd_config.local", "r"))
+        except Exception:
             log.error(f"Cannot read computemgtd configuration file: {config_file_path}")
             raise
 
@@ -100,11 +111,10 @@ def _get_config(self, config_file_path):
     def _read_nodename_from_file(nodename_file_path):
         """Read self nodename from a file."""
         try:
-            log.info("Reading self nodename from %s", nodename_file_path)
             with open(nodename_file_path, "r") as nodename_file:
                 nodename = nodename_file.read()
                 return nodename
-        except IOError as e:
+        except Exception as e:
             log.error("Unable to read self nodename from %s with exception: %s\n", nodename_file_path, e)
             raise
 
@@ -169,22 +179,32 @@ def _run_computemgtd():
     # Initial default heartbeat time as computemgtd startup time
     last_heartbeat = datetime.now(tz=timezone.utc)
     log.info("Initializing clustermgtd heartbeat to be computemgtd startup time: %s", last_heartbeat)
-    computemgtd_config = None
-    reload_config_counter = 0
+    computemgtd_config = _load_daemon_config()
+    reload_config_counter = RELOAD_CONFIG_ITERATIONS
     while True:
         # Get current time
         current_time = datetime.now(tz=timezone.utc)
 
-        if not computemgtd_config or reload_config_counter <= 0:
-            computemgtd_config = _load_daemon_config()
-            reload_config_counter = RELOAD_CONFIG_ITERATIONS
+        if reload_config_counter <= 0:
+            try:
+                computemgtd_config = _load_daemon_config()
+                reload_config_counter = RELOAD_CONFIG_ITERATIONS
+            except Exception as e:
+                log.warning("Unable to reload daemon config, using previous one.\nException: %s", e)
         else:
             reload_config_counter -= 1
 
         # Check heartbeat
-        if not is_clustermgtd_heartbeat_valid(
-            current_time, computemgtd_config.clustermgtd_timeout, computemgtd_config.clustermgtd_heartbeat_file_path
-        ):
+        try:
+            last_heartbeat = get_clustermgtd_heartbeat(computemgtd_config.clustermgtd_heartbeat_file_path)
+            log.info("Latest heartbeat from clustermgtd: %s", last_heartbeat)
+        except Exception as e:
+            log.warning(
+                "Unable to retrieve clustermgtd heartbeat. Using last known heartbeat: %s with exception: %s",
+                last_heartbeat,
+                e,
+            )
+        if expired_clustermgtd_heartbeat(last_heartbeat, current_time, computemgtd_config.clustermgtd_timeout):
             if computemgtd_config.disable_computemgtd_actions:
                 log.info("All computemgtd actions currently disabled")
             elif _is_self_node_down(computemgtd_config.nodename):
```
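Two behavior changes in the `_run_computemgtd` hunk are worth spelling out: the config is now loaded once before the loop, so a broken config still fails fast at startup, and a failed periodic reload logs a warning and keeps the previous config instead of crashing the daemon. Below is a minimal standalone sketch of this keep-last-known-good pattern, using hypothetical names (`load_config`, `do_work`, `RELOAD_EVERY`) rather than the daemon's own:

```python
import logging

log = logging.getLogger(__name__)
RELOAD_EVERY = 10  # hypothetical stand-in for RELOAD_CONFIG_ITERATIONS


def run_loop(load_config, do_work):
    config = load_config()  # fail fast at startup: there is no previous config to fall back on
    reload_counter = RELOAD_EVERY
    while True:
        if reload_counter <= 0:
            try:
                config = load_config()
                reload_counter = RELOAD_EVERY  # reset only on success...
            except Exception as e:
                # ...so a failed reload is retried on the very next iteration
                log.warning("Unable to reload config, using previous one: %s", e)
        else:
            reload_counter -= 1
        do_work(config)
```

Because the counter is reset only inside the `try`, a failed reload is retried on the next iteration rather than after another full reload period; the diff above behaves the same way.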

tests/slurm_plugin/test_clustermgtd.py

Lines changed: 2 additions & 13 deletions
```diff
@@ -1183,12 +1183,6 @@ def test_manage_cluster(
                 "PrivateDnsName": "hostname",
                 "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc),
             },
-            {
-                "InstanceId": "i-6",
-                "PrivateIpAddress": "ip-6",
-                "PrivateDnsName": "hostname",
-                "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc),
-            },
             # Return an orphaned instance
             {
                 "InstanceId": "i-999",
@@ -1251,21 +1245,16 @@
             },
             generate_error=False,
         ),
-        # _maintain_nodes: _handle_powering_down_nodes
-        MockedBoto3Request(
-            method="terminate_instances",
-            response={},
-            expected_params={"InstanceIds": ["i-6"]},
-            generate_error=False,
-        ),
         # _maintain_nodes/delete_instances: terminate dynamic down nodes
+        # dynamic down nodes are handled with suspend script, and its boto3 call should not be reflected here
        MockedBoto3Request(
            method="terminate_instances",
            response={},
            expected_params={"InstanceIds": ["i-2"]},
            generate_error=False,
        ),
        # _maintain_nodes/delete_instances: terminate static down nodes
+        # dynamic down nodes are handled with suspend script, and its boto3 call should not be reflected here
        MockedBoto3Request(
            method="terminate_instances",
            response={},
```

tests/slurm_plugin/test_common.py

Lines changed: 6 additions & 4 deletions
```diff
@@ -11,12 +11,11 @@
 
 
 from datetime import datetime, timedelta, timezone
-from unittest.mock import mock_open
 
 import pytest
 from assertpy import assert_that
 from common.utils import time_is_up
-from slurm_plugin.common import TIMESTAMP_FORMAT, _get_clustermgtd_heartbeat
+from slurm_plugin.common import TIMESTAMP_FORMAT, get_clustermgtd_heartbeat
 
 
 @pytest.mark.parametrize(
@@ -72,5 +71,8 @@ def test_time_is_up(initial_time, current_time, grace_time, expected_result):
     ],
 )
 def test_get_clustermgtd_heartbeat(time, expected_parsed_time, mocker):
-    mocker.patch("slurm_plugin.common.open", mock_open(read_data=time.strftime(TIMESTAMP_FORMAT)))
-    assert_that(_get_clustermgtd_heartbeat("some file path")).is_equal_to(expected_parsed_time)
+    mocker.patch(
+        "slurm_plugin.common.check_command_output",
+        return_value=f"some_random_stdout\n{time.strftime(TIMESTAMP_FORMAT)}",
+    )
+    assert_that(get_clustermgtd_heartbeat("some file path")).is_equal_to(expected_parsed_time)
```
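The mocked `check_command_output` return value deliberately prepends a noise line to the timestamp, exercising the `.splitlines()[-1]` guard in `get_clustermgtd_heartbeat`. A quick illustration of what that guard does (note `%z` accepts the colon-separated UTC offset on Python 3.7+):

```python
from datetime import datetime

TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f%z"

# stdout as produced by the mock above: a noise line, then the heartbeat line
stdout = "some_random_stdout\n2020-07-30 19:34:02.613338+00:00"
heartbeat = datetime.strptime(stdout.splitlines()[-1].strip(), TIMESTAMP_FORMAT)
print(heartbeat)  # 2020-07-30 19:34:02.613338+00:00
```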

tests/slurm_plugin/test_computemgtd.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -62,7 +62,9 @@
 )
 def test_computemgtd_config(config_file, expected_attributes, test_datadir, mocker):
     mocker.patch("slurm_plugin.computemgtd.ComputemgtdConfig._read_nodename_from_file", return_value="some_nodename")
-    compute_config = ComputemgtdConfig(test_datadir / config_file)
+    mocker.patch("slurm_plugin.computemgtd.run_command")
+    mocker.patch("slurm_plugin.computemgtd.open", return_value=open(test_datadir / config_file, "r"))
+    compute_config = ComputemgtdConfig("mocked_config_path")
     for key in expected_attributes:
         assert_that(compute_config.__dict__.get(key)).is_equal_to(expected_attributes.get(key))
 
```
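A note on the `mocker.patch("slurm_plugin.computemgtd.open", ...)` line above: since Python 3.5, `mock.patch` can patch a builtin referenced inside a module without `create=True`, shadowing it for that module only; here it redirects the daemon's read of the local config copy back to the test fixture. A tiny self-contained sketch of the same trick, patching `open` in `__main__` (all names here are illustrative):

```python
from unittest import mock


def read_config(path):
    # looks up the global name `open`, which mock.patch can shadow per module
    with open(path, "r") as f:
        return f.read()


with mock.patch("__main__.open", mock.mock_open(read_data="fixture contents")):
    print(read_config("ignored_path"))  # -> fixture contents
```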
