
Commit 29cf9db

chenwany authored and enrico-usai committed

3.0 Cherry-pick: Do not use cloud_reg_addrs option and manually reset nodeaddr on power_down

Signed-off-by: chenwany <[email protected]>

1 parent a9bb0c8 · commit 29cf9db
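For background: cloud_reg_addrs is a Slurm SlurmctldParameters option under which a cloud node's NodeAddr/NodeHostname are set from the slurmd registration and reset automatically after power_down; this is the behavior the removed docstrings below reference. An illustrative slurm.conf fragment, not taken from this repository:

    # slurm.conf (illustrative fragment)
    # With cloud_reg_addrs, Slurm resets a cloud node's NodeAddr/NodeHostname
    # back to the NodeName after power_down; this commit stops relying on that
    # and resets the address explicitly instead.
    SlurmctldParameters=cloud_reg_addrs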

File tree

6 files changed: +25 −33 lines changed

src/common/schedulers/slurm_commands.py

Lines changed: 2 additions & 7 deletions
@@ -193,13 +193,8 @@ def set_nodes_drain(nodes, reason):
 
 
 def set_nodes_power_down(nodes, reason=None):
-    """
-    Place slurm node into power_down state.
-
-    Do not reset the nodeaddr/nodehostname manually.
-    Nodeaddr/nodehostname will be reset automatically after power_down with cloud_reg_addrs.
-    """
-    update_nodes(nodes=nodes, state="power_down", reason=reason, raise_on_error=True)
+    """Place slurm node into power_down state and reset nodeaddr/nodehostname."""
+    reset_nodes(nodes=nodes, state="power_down", reason=reason, raise_on_error=True)
 
 
 def reset_nodes(nodes, state=None, reason=None, raise_on_error=False):
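set_nodes_power_down now routes through reset_nodes, which restores nodeaddr/nodehostname alongside the state change. As a minimal sketch of what such a reset amounts to at the scontrol level (hypothetical helper; the repository's update_nodes plumbing is not shown here):

    import subprocess


    def reset_nodes_sketch(node_list, state=None, reason=None, raise_on_error=False):
        """Reset nodeaddr/nodehostname back to the node names, optionally setting a state.

        node_list is a Slurm hostlist expression, e.g. "queue1-dy-c5xlarge-[1-5]".
        """
        # Pointing nodeaddr/nodehostname back at the nodename detaches the node
        # from its (possibly terminated) backing instance.
        cmd = ["scontrol", "update", f"nodename={node_list}", f"nodeaddr={node_list}", f"nodehostname={node_list}"]
        if state:
            cmd.append(f"state={state}")
        if reason:
            cmd.append(f"reason={reason}")
        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError:
            if raise_on_error:
                raise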

src/slurm_plugin/clustermgtd.py

Lines changed: 3 additions & 3 deletions
@@ -670,12 +670,12 @@ def _handle_powering_down_nodes(self, slurm_nodes):
         Handle nodes that are powering down.
 
         Terminate instances backing the powering down node if any.
-        Do not reset the nodeaddr/nodehostname manually.
-        Nodeaddr/nodehostname will be reset automatically after power_down with cloud_reg_addrs.
-        Node state is not changed.
+        Reset the nodeaddr for the powering down node. Node state is not changed.
         """
         powering_down_nodes = [node for node in slurm_nodes if node.is_powering_down_with_nodeaddr()]
         if powering_down_nodes:
+            log.info("Resetting powering down nodes: %s", print_with_count(powering_down_nodes))
+            reset_nodes(nodes=[node.name for node in powering_down_nodes])
             instances_to_terminate = [node.instance.id for node in powering_down_nodes if node.instance]
             log.info("Terminating instances that are backing powering down nodes")
             self._instance_manager.delete_instances(
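The handler resets the nodeaddr before terminating the backing instances, so a powering-down node stops advertising the address of an instance that is about to disappear. A condensed sketch of the new flow, with the collaborators passed in as stubs (not the plugin's real API):

    def handle_powering_down(slurm_nodes, reset_nodes, delete_instances, batch_size=4):
        # Powering-down nodes that still carry the backing instance's address.
        powering_down = [n for n in slurm_nodes if n.is_powering_down_with_nodeaddr()]
        if powering_down:
            # 1) Reset nodeaddr/nodehostname so the node falls back to its own name.
            reset_nodes(nodes=[n.name for n in powering_down])
            # 2) Terminate the instances still backing those nodes, in batches.
            ids = [n.instance.id for n in powering_down if n.instance]
            delete_instances(ids, terminate_batch_size=batch_size)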

src/slurm_plugin/slurm_resources.py

Lines changed: 3 additions & 15 deletions
@@ -374,12 +374,8 @@ def is_state_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=True):
 
     def is_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=True):
         """Check if a slurm node is considered healthy."""
-        return (
-            self.is_powering_down()
-            or self.is_backing_instance_valid(log_warn_if_unhealthy=log_warn_if_unhealthy)
-            and self.is_state_healthy(
-                terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=log_warn_if_unhealthy
-            )
+        return self.is_backing_instance_valid(log_warn_if_unhealthy=log_warn_if_unhealthy) and self.is_state_healthy(
+            terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=log_warn_if_unhealthy
         )
 
     def is_bootstrap_failure(self):

@@ -408,15 +404,7 @@ def is_bootstrap_failure(self):
         return False
 
     def is_powering_down_with_nodeaddr(self):
-        """
-        Check if a slurm node is a powering down node with instance backing.
-
-        Nodes in powering_down(i.e. down%) state will still have the nodeaddr of the backing instance.
-        Nodeaddr is reset to nodename automatically by slurm after the power_down process,
-        when node goes back into power_saving(i.e. idle~).
-        If instance is not terminated during powering_down for some reason,
-        it becomes orphaned once the nodeaddr is reset, and will be handled as an orphaned node.
-        """
+        """Check if a slurm node is a powering down node with instance backing."""
         return self.is_nodeaddr_set() and (self.is_power() or self.is_powering_down())
 
     def needs_reset_when_inactive(self):
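One subtlety in the removed is_healthy() body: Python's and binds tighter than or, so the old expression grouped as is_powering_down() or (is_backing_instance_valid(...) and is_state_healthy(...)), which made any powering-down node unconditionally healthy. A two-line illustration:

    # "a or b and c" groups as "a or (b and c)" in Python.
    a, b, c = True, False, False
    assert (a or b and c) == (a or (b and c))  # "a" alone decides the result here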

tests/common/schedulers/test_slurm_commands.py

Lines changed: 1 addition & 1 deletion
@@ -346,7 +346,7 @@ def test_set_nodes_down(nodes, reason, reset_addrs, update_call_kwargs, mocker):
     ],
 )
 def test_set_nodes_power_down(nodes, reason, reset_addrs, update_call_kwargs, mocker):
-    update_mock = mocker.patch("common.schedulers.slurm_commands.update_nodes", autospec=True)
+    update_mock = mocker.patch("common.schedulers.slurm_commands.reset_nodes", autospec=True)
     set_nodes_power_down(nodes, reason)
     update_mock.assert_called_with(**update_call_kwargs)

tests/slurm_plugin/slurm_resources/test_slurm_resources.py

Lines changed: 9 additions & 2 deletions
@@ -439,12 +439,18 @@ def test_slurm_node_is_bootstrap_failure(
             None,
             False,
         ),
-        # Powering_down nodes are handled separately, always considered healthy by this workflow
+        # Powering_down nodes with a backing instance are considered healthy
         (
            DynamicNode("queue-dy-c5xlarge-1", "ip-2", "hostname", "DOWN+CLOUD+POWERING_DOWN", "queue"),
             EC2Instance("id-2", "ip-2", "hostname", datetime(2020, 1, 1, 0, 0, 0)),
             True,
         ),
+        # Powering_down nodes without a backing instance are considered unhealthy
+        (
+            DynamicNode("queue-dy-c5xlarge-1", "ip-2", "hostname", "DOWN+CLOUD+POWERING_DOWN", "queue"),
+            None,
+            False,
+        ),
         # Node in POWER_SAVE, but still has ip associated should be considered unhealthy
         (
             DynamicNode("queue-dy-c5xlarge-1", "ip-2", "hostname", "IDLE+CLOUD+POWER", "queue"),

@@ -471,7 +477,8 @@ def test_slurm_node_is_bootstrap_failure(
         "dynamic_nodeaddr_not_set",
         "dynamic_unhealthy",
         "static_unhealthy",
-        "powering_down",
+        "powering_down_healthy",
+        "powering_down_unhealthy",
         "power_unhealthy1",
         "power_unhealthy2",
         "power_healthy",

tests/slurm_plugin/test_clustermgtd.py

Lines changed: 7 additions & 5 deletions
@@ -626,7 +626,7 @@ def test_handle_unhealthy_dynamic_nodes(
 
 
 @pytest.mark.parametrize(
-    "slurm_nodes, instances, instances_to_terminate",
+    "slurm_nodes, instances, instances_to_terminate, expected_powering_down_nodes",
     [
         (
             [
@@ -637,7 +637,7 @@ def test_handle_unhealthy_dynamic_nodes(
                 DynamicNode(
                     "queue1-dy-c5xlarge-5", "queue1-dy-c5xlarge-5", "queue1-dy-c5xlarge-5", "POWERING_DOWN", "queue1"
                 ),
-                DynamicNode("queue1-st-c5xlarge-6", "ip-6", "hostname", "POWERING_DOWN", "queue1"),
+                StaticNode("queue1-st-c5xlarge-6", "ip-6", "hostname", "POWERING_DOWN", "queue1"),
             ],
             [
                 EC2Instance("id-1", "ip-1", "hostname", "some_launch_time"),
@@ -648,12 +648,15 @@ def test_handle_unhealthy_dynamic_nodes(
                 None,
             ],
             ["id-3"],
+            ["queue1-dy-c5xlarge-2", "queue1-dy-c5xlarge-3"],
         )
     ],
     ids=["basic"],
 )
 @pytest.mark.usefixtures("initialize_instance_manager_mock", "initialize_compute_fleet_status_manager_mock")
-def test_handle_powering_down_nodes(slurm_nodes, instances, instances_to_terminate, mocker):
+def test_handle_powering_down_nodes(
+    slurm_nodes, instances, instances_to_terminate, expected_powering_down_nodes, mocker
+):
     for node, instance in zip(slurm_nodes, instances):
         node.instance = instance
     mock_sync_config = SimpleNamespace(terminate_max_batch_size=4)
@@ -662,8 +665,7 @@ def test_handle_powering_down_nodes(slurm_nodes, instances, instances_to_terminate, mocker):
     reset_nodes_mock = mocker.patch("slurm_plugin.clustermgtd.reset_nodes", auto_spec=True)
     cluster_manager._handle_powering_down_nodes(slurm_nodes)
     mock_instance_manager.delete_instances.assert_called_with(instances_to_terminate, terminate_batch_size=4)
-    # We don't need to reset nodes manually because cloud_reg_addrs option is specified
-    reset_nodes_mock.assert_not_called()
+    reset_nodes_mock.assert_called_with(nodes=expected_powering_down_nodes)
 
 
 @pytest.mark.parametrize(
