
Commit c76f681

Implement logic to disable compute resources with insufficient capacity
1. Parse queue_name together with the compute resource name (the `get_compute_resource_name` helper is replaced by attributes parsed from the node name), to handle the case where a node belongs to multiple partitions. Compute resources in different queues can share the same name, so including queue_name keeps the insufficient-capacity tracking per compute resource unambiguous (see the parsing sketch below).
2. Implement the logic that disables compute nodes failing due to insufficient capacity:
* Collect the compute-resource-to-nodes mapping for all nodes in DOWN state whose reason is one of: `InsufficientInstanceCapacity`, `InsufficientHostCapacity`, `InsufficientReservedInstanceCapacity`, `MaxSpotInstanceCountExceeded`.
* For each compute resource failing due to capacity problems, check whether clustermgtd marked it as disabled in a previous iteration. If it did, check whether the timeout has expired and, if so, restore the nodes to an enabled state and clean up the reason field. If it did not, disable all POWERED_DOWN nodes belonging to that compute resource and store this information in clustermgtd.

Signed-off-by: chenwany <[email protected]>
1 parent 523c366 commit c76f681
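The queue-aware tracking leans on the node-name convention that parse_nodename in slurm_resources.py already encodes. Below is a minimal Python sketch of why the queue name matters; the node names are hypothetical, and only the regex is taken from the source:

import re

def parse_nodename(nodename):
    # Node names follow <queue_name>-<st|dy>-<compute_resource_name>-<index>;
    # regex copied from slurm_plugin/slurm_resources.py.
    match = re.match(r"^([a-z0-9\-]+)-(st|dy)-([a-z0-9\-]+)-\d+$", nodename)
    if not match:
        raise ValueError(f"Invalid nodename format: {nodename}")
    return match.groups()  # (queue_name, node_type, compute_resource_name)

# Two queues can declare compute resources with the same name, so keying the
# insufficient-capacity bookkeeping on the compute resource name alone would
# conflate them; the queue name disambiguates.
print(parse_nodename("queue1-dy-c5xlarge-1"))  # ('queue1', 'dy', 'c5xlarge')
print(parse_nodename("queue2-dy-c5xlarge-3"))  # ('queue2', 'dy', 'c5xlarge')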

File tree: 4 files changed (+884 −26 lines)

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG
 
 This file is used to list changes made in each version of the aws-parallelcluster-node package.
 
+x.x.x
+------
+
+**ENHANCEMENTS**
+- Enable fast insufficient capacity fail-over with slurm scheduler.
+
 3.1.3
 ------
 
src/slurm_plugin/clustermgtd.py

Lines changed: 143 additions & 6 deletions
@@ -19,6 +19,7 @@
 from enum import Enum
 from logging.config import fileConfig
 from subprocess import CalledProcessError
+from typing import Dict, List
 
 from botocore.config import Config
 from common.schedulers.slurm_commands import (
@@ -36,7 +37,15 @@
 from retrying import retry
 from slurm_plugin.common import TIMESTAMP_FORMAT, log_exception, print_with_count, read_json
 from slurm_plugin.instance_manager import InstanceManager
-from slurm_plugin.slurm_resources import CONFIG_FILE_DIR, EC2InstanceHealthState, PartitionStatus, StaticNode
+from slurm_plugin.slurm_resources import (
+    CONFIG_FILE_DIR,
+    ComputeResourceFailureEvent,
+    DynamicNode,
+    EC2InstanceHealthState,
+    PartitionStatus,
+    SlurmNode,
+    StaticNode,
+)
 
 LOOP_TIME = 60
 log = logging.getLogger(__name__)
@@ -300,6 +309,7 @@ def __init__(self, config):
         self.static_nodes_in_replacement is persistent across multiple iteration of manage_cluster
         This state is required because we need to ignore static nodes that might have long bootstrap time
         """
+        self._insufficient_capacity_compute_resources = {}
         self._static_nodes_in_replacement = set()
         self._partitions_protected_failure_count_map = {}
         self._compute_fleet_status = ComputeFleetStatus.RUNNING
@@ -380,7 +390,7 @@ def manage_cluster(self):
                 log.info("Retrieving nodes info from the scheduler")
                 nodes = self._get_node_info_with_retry()
                 log.debug("Nodes: %s", nodes)
-                partitions_name_map = self._retrieve_scheduler_partitions(nodes)
+                partitions_name_map, compute_resource_nodes_map = self._retrieve_scheduler_partitions(nodes)
             except Exception as e:
                 log.error(
                     "Unable to get partition/node info from slurm, no other action can be performed. Sleeping... "
@@ -404,7 +414,7 @@
                 if not self._config.disable_all_health_checks:
                     self._perform_health_check_actions(partitions)
                 # Maintain slurm nodes
-                self._maintain_nodes(partitions_name_map)
+                self._maintain_nodes(partitions_name_map, compute_resource_nodes_map)
                 # Clean up orphaned instances
                 self._terminate_orphaned_instances(cluster_instances)
             elif self._compute_fleet_status in {
@@ -608,7 +618,7 @@ def _find_unhealthy_slurm_nodes(self, slurm_nodes):
     def _increase_partitions_protected_failure_count(self, bootstrap_failure_nodes):
         """Keep count of boostrap failures."""
         for node in bootstrap_failure_nodes:
-            compute_resource = node.get_compute_resource_name()
+            compute_resource = node.compute_resource_name
             for p in node.partitions:
                 if p in self._partitions_protected_failure_count_map:
                     self._partitions_protected_failure_count_map[p][compute_resource] = (
@@ -701,7 +711,7 @@ def _handle_unhealthy_static_nodes(self, unhealthy_static_nodes):
         )
 
     @log_exception(log, "maintaining slurm nodes", catch_exception=Exception, raise_on_error=False)
-    def _maintain_nodes(self, partitions_name_map):
+    def _maintain_nodes(self, partitions_name_map, compute_resource_nodes_map):
         """
         Call functions to maintain unhealthy nodes.
 
@@ -725,6 +735,8 @@ def _maintain_nodes(self, partitions_name_map):
         self._handle_unhealthy_static_nodes(unhealthy_static_nodes)
         if self._is_protected_mode_enabled():
             self._handle_protected_mode_process(active_nodes, partitions_name_map)
+        if self._config.disable_nodes_on_insufficient_capacity:
+            self._handle_ice_nodes(unhealthy_dynamic_nodes, compute_resource_nodes_map)
         self._handle_failed_health_check_nodes_in_replacement(active_nodes)
 
     @log_exception(log, "terminating orphaned instances", catch_exception=Exception, raise_on_error=False)
@@ -877,6 +889,7 @@ def _handle_health_check(self, unhealthy_instances_status, instance_id_to_active
     def _retrieve_scheduler_partitions(nodes):
         try:
             ignored_nodes = []
+            compute_resource_nodes_map = {}
             partitions_name_map = ClusterManager._get_partition_info_with_retry()
             log.debug("Partitions: %s", partitions_name_map)
             for node in nodes:
@@ -886,9 +899,12 @@ def _retrieve_scheduler_partitions(nodes):
                 else:
                     for p in node.partitions:
                         partitions_name_map[p].slurm_nodes.append(node)
+                    compute_resource_nodes_map.setdefault(node.queue_name, {}).setdefault(
+                        node.compute_resource_name, []
+                    ).append(node)
             if ignored_nodes:
                 log.warning("Ignoring following nodes because they do not belong to any partition: %s", ignored_nodes)
-            return partitions_name_map
+            return partitions_name_map, compute_resource_nodes_map
         except Exception as e:
             log.error("Failed when getting partition/node states from scheduler with exception %s", e)
             raise
@@ -950,6 +966,17 @@ def _find_active_nodes(partitions_name_map):
                 active_nodes += partition.slurm_nodes
         return active_nodes
 
+    @staticmethod
+    def _get_unhealthy_ice_nodes(unhealthy_dynamic_nodes: List[DynamicNode]) -> Dict[str, Dict[str, List[DynamicNode]]]:
+        """Get insufficient capacity compute resource and nodes, error code mapping."""
+        ice_compute_resources_and_nodes_map = {}
+        for node in unhealthy_dynamic_nodes:
+            if node.is_ice():
+                ice_compute_resources_and_nodes_map.setdefault(node.queue_name, {}).setdefault(
+                    node.compute_resource_name, []
+                ).append(node)
+        return ice_compute_resources_and_nodes_map
+
     def _is_node_in_replacement_valid(self, node, check_node_is_valid):
         """
         Check node is replacement timeout or in replacement.
@@ -964,6 +991,116 @@ def _is_node_in_replacement_valid(self, node, check_node_is_valid):
             return not time_is_expired if check_node_is_valid else time_is_expired
         return False
 
+    @log_exception(
+        log, "handling nodes failed due to insufficient capacity", catch_exception=Exception, raise_on_error=False
+    )
+    def _handle_ice_nodes(
+        self,
+        unhealthy_dynamic_nodes: List[DynamicNode],
+        compute_resource_nodes_map: Dict[str, Dict[str, List[SlurmNode]]],
+    ):
+        """Handle nodes failed with insufficient capacity."""
+        # get insufficient capacity compute resource and nodes mapping
+        ice_compute_resources_and_nodes_map = self._get_unhealthy_ice_nodes(unhealthy_dynamic_nodes)
+        if ice_compute_resources_and_nodes_map:
+            self._update_insufficient_capacity_compute_resources(ice_compute_resources_and_nodes_map)
+        self._reset_timeout_expired_compute_resources(ice_compute_resources_and_nodes_map)
+        self._set_ice_compute_resources_to_down(compute_resource_nodes_map)
+
+    def _update_insufficient_capacity_compute_resources(
+        self, ice_compute_resources_and_nodes_map: Dict[str, Dict[str, List[SlurmNode]]]
+    ):
+        """Add compute resource to insufficient_capacity_compute_resources if node is ICE node."""
+        for queue_name, compute_resources in ice_compute_resources_and_nodes_map.items():
+            for compute_resource, nodes in compute_resources.items():
+                if not self._insufficient_capacity_compute_resources.get(queue_name, {}).get(compute_resource):
+                    self._insufficient_capacity_compute_resources.setdefault(queue_name, {})[
+                        compute_resource
+                    ] = ComputeResourceFailureEvent(self._current_time, nodes[0].error_code)
+
+    def _reset_timeout_expired_compute_resources(
+        self, ice_compute_resources_and_nodes_map: Dict[str, Dict[str, List[SlurmNode]]]
+    ):
+        """Reset compute resources which insufficient_capacity_timeout expired."""
+        # Find insufficient_capacity_timeout compute resources
+        if not self._insufficient_capacity_compute_resources:
+            return
+        log.info(
+            "The following compute resources are in down state due to insufficient capacity: %s, "
+            "compute resources will be reset after insufficient capacity timeout (%s seconds) expired",
+            self._insufficient_capacity_compute_resources,
+            self._config.insufficient_capacity_timeout,
+        )
+        timeout_expired_compute_resources = self._find_insufficient_capacity_timeout_expired_compute_resources()
+
+        # Reset nodes which insufficient capacity timeout expired
+        if timeout_expired_compute_resources:
+            self._reset_insufficient_capacity_timeout_expired_nodes(
+                timeout_expired_compute_resources, ice_compute_resources_and_nodes_map
+            )
+
+    def _set_ice_compute_resources_to_down(self, compute_resource_nodes_map: Dict[str, Dict[str, List[SlurmNode]]]):
+        """Set powered_down nodes which belong to insufficient capacity compute resources to down."""
+        if not self._insufficient_capacity_compute_resources:
+            return
+        nodes_to_down = {}
+        for queue_name, compute_resources in self._insufficient_capacity_compute_resources.items():
+            for compute_resource, event in compute_resources.items():
+                nodes = compute_resource_nodes_map.get(queue_name, {}).get(compute_resource, [])
+                for node in nodes:
+                    if not node.is_ice() and node.is_power() and not node.is_nodeaddr_set():
+                        error_code = event.error_code
+                        nodes_to_down.setdefault(error_code, []).append(node.name)
+        if nodes_to_down:
+            for error_code, node_list in nodes_to_down.items():
+                log.info(
+                    "Setting following nodes into DOWN state due to insufficient capacity: %s",
+                    print_with_count(node_list),
+                )
+                set_nodes_down(
+                    node_list, reason=f"(Code:{error_code})Temporarily disabling node due to insufficient capacity"
+                )
+
+    def _find_insufficient_capacity_timeout_expired_compute_resources(
+        self,
+    ) -> Dict[str, Dict[str, ComputeResourceFailureEvent]]:
+        """Find compute resources which insufficient_capacity_timeout expired."""
+        timeout_expired_cr = dict()
+        for queue_name, compute_resources in self._insufficient_capacity_compute_resources.copy().items():
+            for compute_resource, event in compute_resources.copy().items():
+                if time_is_up(event.timestamp, self._current_time, self._config.insufficient_capacity_timeout):
+                    self._insufficient_capacity_compute_resources[queue_name].pop(compute_resource)
+                    timeout_expired_cr.setdefault(queue_name, []).append(compute_resource)
+                    if not self._insufficient_capacity_compute_resources.get(queue_name):
+                        self._insufficient_capacity_compute_resources.pop(queue_name)
+        return timeout_expired_cr
+
+    def _reset_insufficient_capacity_timeout_expired_nodes(
+        self,
+        timeout_expired_cr: Dict[str, Dict[str, ComputeResourceFailureEvent]],
+        ice_compute_resources_and_nodes_map: Dict[str, Dict[str, ComputeResourceFailureEvent]],
+    ):
+        """Reset nodes in the compute resource which insufficient_capacity_timeout expired."""
+        logging.info(
+            f"Reset the following compute resources because insufficient capacity timeout expired: {timeout_expired_cr}"
+        )
+        nodes_to_power_down = []
+        for queue, compute_resources in timeout_expired_cr.items():
+            for compute_resource in compute_resources:
+                nodes = ice_compute_resources_and_nodes_map.get(queue, {}).get(compute_resource, [])
+                nodes_to_power_down += nodes
+
+        if nodes_to_power_down:
+            node_names = [node.name for node in nodes_to_power_down]
+            log.info(
+                "Enabling the following nodes because insufficient capacity timeout expired: %s",
+                print_with_count(node_names),
+            )
+            set_nodes_power_down(
+                node_names,
+                reason="Enabling node since insufficient capacity timeout expired",
+            )
+
 
 def _run_clustermgtd(config_file):
     """Run clustermgtd actions."""

src/slurm_plugin/slurm_resources.py

Lines changed: 10 additions & 6 deletions
@@ -12,6 +12,8 @@
 import logging
 import re
 from abc import ABCMeta, abstractmethod
+from dataclasses import dataclass
+from datetime import datetime
 from enum import Enum
 
 from common.utils import time_is_up
@@ -95,7 +97,7 @@ def get_online_node_by_type(self, terminate_drain_nodes, terminate_down_nodes):
                     and node.is_online()
                 ):
                     logger.debug("Currently online node: %s, node state: %s", node.name, node.state_string)
-                    online_compute_resources.add(node.get_compute_resource_name())
+                    online_compute_resources.add(node.compute_resource_name)
         return online_compute_resources
 
     def __eq__(self, other):
@@ -139,6 +141,7 @@ def __init__(self, name, nodeaddr, nodehostname, state, partitions=None, reason=
         self._is_replacement_timeout = False
         self.is_failing_health_check = False
         self.error_code = self._parse_error_code()
+        self.queue_name, self._node_type, self.compute_resource_name = parse_nodename(name)
 
     def is_nodeaddr_set(self):
         """Check if nodeaddr(private ip) for the node is set."""
@@ -253,11 +256,6 @@ def needs_reset_when_inactive(self):
         """Check if the node need to be reset if node is inactive."""
         pass
 
-    def get_compute_resource_name(self):
-        """Get instance name of given node."""
-        _, _, compute_resource_name = parse_nodename(self.name)
-        return compute_resource_name
-
     def _parse_error_code(self):
         """Parse RunInstance error code from node reason."""
         if self.reason and self.reason.startswith("(Code:"):
@@ -498,6 +496,12 @@ class InvalidNodenameError(ValueError):
     pass
 
 
+@dataclass
+class ComputeResourceFailureEvent:
+    timestamp: datetime
+    error_code: str
+
+
 def parse_nodename(nodename):
     """Parse queue_name, node_type (st vs dy) and instance_type from nodename."""
     nodename_capture = re.match(r"^([a-z0-9\-]+)-(st|dy)-([a-z0-9\-]+)-\d+$", nodename)
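The ICE classification hinges on the error code embedded in the node reason, which _parse_error_code extracts from a "(Code:...)" prefix. Below is a rough standalone sketch of that check, not the plugin's exact implementation; the reason string and the helper function are illustrative, while the error-code list comes from the commit message above:

import re

# Error codes treated as insufficient-capacity failures (from the commit message).
ICE_ERROR_CODES = {
    "InsufficientInstanceCapacity",
    "InsufficientHostCapacity",
    "InsufficientReservedInstanceCapacity",
    "MaxSpotInstanceCountExceeded",
}

def parse_error_code(reason):
    # Assumes the reason field is prefixed like "(Code:InsufficientInstanceCapacity)..."
    match = re.match(r"^\(Code:(.+?)\)", reason) if reason else None
    return match.group(1) if match else None

reason = "(Code:InsufficientInstanceCapacity)Temporarily disabling node due to insufficient capacity"
code = parse_error_code(reason)
print(code in ICE_ERROR_CODES)  # True -> the node would be treated as an ICE node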
