Commit 8314671
Force nodes to down and power_save when stopping the cluster
If a cluster is stopped while a node is powering up (alloc# / idle#), the node is kept in the powering-up state when the cluster is started again. This makes the node unavailable for the entire ResumeTimeout, which is 60 minutes. Slurm ignores the transition to power_down unless the node is put into the down state first.

Copy Changelog from 2.11 branch.

Signed-off-by: Enrico Usai <[email protected]>
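To make the mechanism concrete, here is a minimal sketch (not code from this commit) of the two-step transition the fix relies on, assuming `scontrol` is on the PATH; the function name is illustrative:

```python
import subprocess


def force_down_then_power_save(nodenames: str, reason: str) -> None:
    """Illustrative only: put nodes DOWN, then request POWER_DOWN."""
    # Step 1: mark the nodes DOWN so Slurm abandons the in-flight
    # power-up (alloc#/idle#). Without this, the power_down request
    # below is ignored for nodes stuck in a powering-up state.
    subprocess.run(
        ["scontrol", "update", f"nodename={nodenames}", "state=down", f"reason={reason}"],
        check=True,
    )
    # Step 2: request power_down so the nodes return to power saving.
    subprocess.run(
        ["scontrol", "update", f"nodename={nodenames}", "state=power_down", f"reason={reason}"],
        check=True,
    )
```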
Parent: 683c69c

File tree: 3 files changed (+21 −8)

- CHANGELOG.md
- src/common/schedulers/slurm_commands.py
- tests/common/schedulers/test_slurm_commands.py

CHANGELOG.md (12 additions, 1 deletion)

```diff
@@ -7,14 +7,25 @@ This file is used to list changes made in each version of the aws-parallelcluste
 ------
 
 **CHANGES**
-
 - Drop support for SGE and Torque schedulers.
 - Use tags prefix `parallelcluster:`.
 - Run Slurm command `scontrol` with sudo because clustermgtd is run as cluster admin user (not root).
 - Implement `computemgtd` self-termination via `shutdown` command instead of calling TerminateInstances.
 - Implement scaling protection mechanism with Slurm scheduler: compute fleet is automatically set to 'PROTECTED' state
   in case recurrent failures are encountered when provisioning nodes.
 
+2.11.2
+-----
+
+**BUG FIXES**
+- Slurm: fix issue that prevented powering-up nodes to be correctly reset after a stop and start of the cluster.
+
+2.11.1
+-----
+
+**CHANGES**
+- There were no notable changes for this version.
+
 2.11.0
 -----
 
```
src/common/schedulers/slurm_commands.py (1 addition, 1 deletion)

```diff
@@ -138,7 +138,7 @@ def update_all_partitions(state, reset_node_addrs_hostname):
             log.info(f"Setting partition {part.name} state from {part.state} to {state}")
             if reset_node_addrs_hostname:
                 log.info(f"Resetting partition nodes {part.nodenames}")
-                reset_nodes(part.nodenames, state="power_down", reason="stopping cluster")
+                set_nodes_down_and_power_save(part.nodenames, reason="stopping cluster")
             partition_to_update.append(part.name)
     succeeded_partitions = update_partitions(partition_to_update, state)
     return succeeded_partitions == partition_to_update
```
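The new helper's body is not part of this hunk. As the test changes below show, it takes the node list and a `reason` keyword; a plausible sketch, assuming the module already provides `set_nodes_down` and `set_nodes_power_save` helpers as the name suggests:

```python
def set_nodes_down_and_power_save(node_list, reason):
    # First force the nodes DOWN so Slurm drops any in-flight power-up,
    # then request power save; Slurm ignores the power-down transition
    # for powering-up nodes unless they are set DOWN first.
    set_nodes_down(node_list, reason=reason)
    set_nodes_power_save(node_list, reason=reason)
```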

tests/common/schedulers/test_slurm_commands.py (8 additions, 6 deletions)

```diff
@@ -535,7 +535,7 @@ def test_update_partitions(
         ],
         PartitionStatus.INACTIVE,
         True,
-        [call("node-3,node-4", reason="stopping cluster", state="power_down")],
+        [call("node-3,node-4", reason="stopping cluster")],
         ["part-2"],
         ["part-2"],
         True,
@@ -548,8 +548,8 @@ def test_update_partitions(
         PartitionStatus.INACTIVE,
         True,
         [
-            call("node-1,node-2", reason="stopping cluster", state="power_down"),
-            call("node-3,node-4", reason="stopping cluster", state="power_down"),
+            call("node-1,node-2", reason="stopping cluster"),
+            call("node-3,node-4", reason="stopping cluster"),
         ],
         ["part-1", "part-2"],
         ["part-1", "part-2"],
@@ -603,7 +603,9 @@ def test_update_all_partitions(
     expected_results,
     mocker,
 ):
-    reset_node_spy = mocker.patch("common.schedulers.slurm_commands.reset_nodes", auto_spec=True)
+    set_nodes_down_and_power_save_spy = mocker.patch(
+        "common.schedulers.slurm_commands.set_nodes_down_and_power_save", auto_spec=True
+    )
     update_partitions_spy = mocker.patch(
         "common.schedulers.slurm_commands.update_partitions", return_value=mock_succeeded_partitions, auto_spec=True
     )
@@ -613,7 +615,7 @@ def test_update_all_partitions(
     assert_that(update_all_partitions(state, reset_node_addrs_hostname=reset_node_info)).is_equal_to(expected_results)
     get_part_spy.assert_called_with(get_all_nodes=True)
     if expected_reset_nodes_calls:
-        reset_node_spy.assert_has_calls(expected_reset_nodes_calls)
+        set_nodes_down_and_power_save_spy.assert_has_calls(expected_reset_nodes_calls)
     else:
-        reset_node_spy.assert_not_called()
+        set_nodes_down_and_power_save_spy.assert_not_called()
     update_partitions_spy.assert_called_with(partitions_to_update, state)
```
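For reference, the `call(...)` expectations in the parametrized cases above are matched by `assert_has_calls`, which checks that the expected calls appear, in order, among the calls recorded on the mock. A self-contained sketch of that stdlib behavior (not code from this commit):

```python
from unittest.mock import MagicMock, call

spy = MagicMock()
spy("node-1,node-2", reason="stopping cluster")
spy("node-3,node-4", reason="stopping cluster")

# Passes: both expected calls were recorded, in this order.
spy.assert_has_calls([
    call("node-1,node-2", reason="stopping cluster"),
    call("node-3,node-4", reason="stopping cluster"),
])
```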
