
Commit c76f681

Implement logic to disable compute resources with insufficient capacity
1. Parse queue_name together with the compute resource name (the `get_compute_resource_name` helper is replaced by attributes parsed from the node name), to handle the case where a node belongs to multiple partitions. Compute resources in different queues can share the same name, so including queue_name keeps the insufficient-capacity tracking per compute resource unambiguous (see the parsing sketch below).
2. Implement the logic that disables compute nodes failing due to insufficient capacity:
* Collect the compute-resource-to-nodes mapping for all nodes in DOWN state whose reason is one of: `InsufficientInstanceCapacity`, `InsufficientHostCapacity`, `InsufficientReservedInstanceCapacity`, `MaxSpotInstanceCountExceeded`.
* For each compute resource failing due to capacity problems, check whether clustermgtd marked it as disabled in a previous iteration. If it did, check whether the timeout has expired and, if so, restore the nodes to an enabled state and clean up the reason field. If it did not, disable all POWERED_DOWN nodes belonging to that compute resource and store this information in clustermgtd.

Signed-off-by: chenwany <[email protected]>
1 parent 523c366 commit c76f681
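The queue-aware tracking leans on the node-name convention that parse_nodename in slurm_resources.py already encodes. Below is a minimal Python sketch of why the queue name matters; the node names are hypothetical, and only the regex is taken from the source:

import re

def parse_nodename(nodename):
    # Node names follow <queue_name>-<st|dy>-<compute_resource_name>-<index>;
    # regex copied from slurm_plugin/slurm_resources.py.
    match = re.match(r"^([a-z0-9\-]+)-(st|dy)-([a-z0-9\-]+)-\d+$", nodename)
    if not match:
        raise ValueError(f"Invalid nodename format: {nodename}")
    return match.groups()  # (queue_name, node_type, compute_resource_name)

# Two queues can declare compute resources with the same name, so keying the
# insufficient-capacity bookkeeping on the compute resource name alone would
# conflate them; the queue name disambiguates.
print(parse_nodename("queue1-dy-c5xlarge-1"))  # ('queue1', 'dy', 'c5xlarge')
print(parse_nodename("queue2-dy-c5xlarge-3"))  # ('queue2', 'dy', 'c5xlarge')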

File tree: 4 files changed (+884 −26 lines)

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG
 
 This file is used to list changes made in each version of the aws-parallelcluster-node package.
 
+x.x.x
+------
+
+**ENHANCEMENTS**
+- Enable fast insufficient capacity fail-over with slurm scheduler.
+
 3.1.3
 ------
 
src/slurm_plugin/clustermgtd.py

Lines changed: 143 additions & 6 deletions
@@ -19,6 +19,7 @@
 from enum import Enum
 from logging.config import fileConfig
 from subprocess import CalledProcessError
+from typing import Dict, List
 
 from botocore.config import Config
 from common.schedulers.slurm_commands import (
@@ -36,7 +37,15 @@
 from retrying import retry
 from slurm_plugin.common import TIMESTAMP_FORMAT, log_exception, print_with_count, read_json
 from slurm_plugin.instance_manager import InstanceManager
-from slurm_plugin.slurm_resources import CONFIG_FILE_DIR, EC2InstanceHealthState, PartitionStatus, StaticNode
+from slurm_plugin.slurm_resources import (
+    CONFIG_FILE_DIR,
+    ComputeResourceFailureEvent,
+    DynamicNode,
+    EC2InstanceHealthState,
+    PartitionStatus,
+    SlurmNode,
+    StaticNode,
+)
 
 LOOP_TIME = 60
 log = logging.getLogger(__name__)
@@ -300,6 +309,7 @@ def __init__(self, config):
         self.static_nodes_in_replacement is persistent across multiple iteration of manage_cluster
         This state is required because we need to ignore static nodes that might have long bootstrap time
         """
+        self._insufficient_capacity_compute_resources = {}
         self._static_nodes_in_replacement = set()
         self._partitions_protected_failure_count_map = {}
         self._compute_fleet_status = ComputeFleetStatus.RUNNING
@@ -380,7 +390,7 @@ def manage_cluster(self):
                 log.info("Retrieving nodes info from the scheduler")
                 nodes = self._get_node_info_with_retry()
                 log.debug("Nodes: %s", nodes)
-                partitions_name_map = self._retrieve_scheduler_partitions(nodes)
+                partitions_name_map, compute_resource_nodes_map = self._retrieve_scheduler_partitions(nodes)
             except Exception as e:
                 log.error(
                     "Unable to get partition/node info from slurm, no other action can be performed. Sleeping... "
@@ -404,7 +414,7 @@
                 if not self._config.disable_all_health_checks:
                     self._perform_health_check_actions(partitions)
                 # Maintain slurm nodes
-                self._maintain_nodes(partitions_name_map)
+                self._maintain_nodes(partitions_name_map, compute_resource_nodes_map)
                 # Clean up orphaned instances
                 self._terminate_orphaned_instances(cluster_instances)
             elif self._compute_fleet_status in {
@@ -608,7 +618,7 @@ def _find_unhealthy_slurm_nodes(self, slurm_nodes):
     def _increase_partitions_protected_failure_count(self, bootstrap_failure_nodes):
         """Keep count of boostrap failures."""
         for node in bootstrap_failure_nodes:
-            compute_resource = node.get_compute_resource_name()
+            compute_resource = node.compute_resource_name
             for p in node.partitions:
                 if p in self._partitions_protected_failure_count_map:
                     self._partitions_protected_failure_count_map[p][compute_resource] = (
@@ -701,7 +711,7 @@ def _handle_unhealthy_static_nodes(self, unhealthy_static_nodes):
         )
 
     @log_exception(log, "maintaining slurm nodes", catch_exception=Exception, raise_on_error=False)
-    def _maintain_nodes(self, partitions_name_map):
+    def _maintain_nodes(self, partitions_name_map, compute_resource_nodes_map):
         """
         Call functions to maintain unhealthy nodes.
 
@@ -725,6 +735,8 @@ def _maintain_nodes(self, partitions_name_map):
         self._handle_unhealthy_static_nodes(unhealthy_static_nodes)
         if self._is_protected_mode_enabled():
             self._handle_protected_mode_process(active_nodes, partitions_name_map)
+        if self._config.disable_nodes_on_insufficient_capacity:
+            self._handle_ice_nodes(unhealthy_dynamic_nodes, compute_resource_nodes_map)
         self._handle_failed_health_check_nodes_in_replacement(active_nodes)
 
     @log_exception(log, "terminating orphaned instances", catch_exception=Exception, raise_on_error=False)
@@ -877,6 +889,7 @@ def _handle_health_check(self, unhealthy_instances_status, instance_id_to_active
     def _retrieve_scheduler_partitions(nodes):
         try:
             ignored_nodes = []
+            compute_resource_nodes_map = {}
             partitions_name_map = ClusterManager._get_partition_info_with_retry()
             log.debug("Partitions: %s", partitions_name_map)
             for node in nodes:
@@ -886,9 +899,12 @@ def _retrieve_scheduler_partitions(nodes):
                 else:
                     for p in node.partitions:
                         partitions_name_map[p].slurm_nodes.append(node)
+                    compute_resource_nodes_map.setdefault(node.queue_name, {}).setdefault(
+                        node.compute_resource_name, []
+                    ).append(node)
             if ignored_nodes:
                 log.warning("Ignoring following nodes because they do not belong to any partition: %s", ignored_nodes)
-            return partitions_name_map
+            return partitions_name_map, compute_resource_nodes_map
         except Exception as e:
             log.error("Failed when getting partition/node states from scheduler with exception %s", e)
             raise
@@ -950,6 +966,17 @@ def _find_active_nodes(partitions_name_map):
                 active_nodes += partition.slurm_nodes
         return active_nodes
 
+    @staticmethod
+    def _get_unhealthy_ice_nodes(unhealthy_dynamic_nodes: List[DynamicNode]) -> Dict[str, Dict[str, List[DynamicNode]]]:
+        """Get insufficient capacity compute resource and nodes, error code mapping."""
+        ice_compute_resources_and_nodes_map = {}
+        for node in unhealthy_dynamic_nodes:
+            if node.is_ice():
+                ice_compute_resources_and_nodes_map.setdefault(node.queue_name, {}).setdefault(
+                    node.compute_resource_name, []
+                ).append(node)
+        return ice_compute_resources_and_nodes_map
+
     def _is_node_in_replacement_valid(self, node, check_node_is_valid):
         """
         Check node is replacement timeout or in replacement.
@@ -964,6 +991,116 @@ def _is_node_in_replacement_valid(self, node, check_node_is_valid):
             return not time_is_expired if check_node_is_valid else time_is_expired
         return False
 
+    @log_exception(
+        log, "handling nodes failed due to insufficient capacity", catch_exception=Exception, raise_on_error=False
+    )
+    def _handle_ice_nodes(
+        self,
+        unhealthy_dynamic_nodes: List[DynamicNode],
+        compute_resource_nodes_map: Dict[str, Dict[str, List[SlurmNode]]],
+    ):
+        """Handle nodes failed with insufficient capacity."""
+        # get insufficient capacity compute resource and nodes mapping
+        ice_compute_resources_and_nodes_map = self._get_unhealthy_ice_nodes(unhealthy_dynamic_nodes)
+        if ice_compute_resources_and_nodes_map:
+            self._update_insufficient_capacity_compute_resources(ice_compute_resources_and_nodes_map)
+        self._reset_timeout_expired_compute_resources(ice_compute_resources_and_nodes_map)
+        self._set_ice_compute_resources_to_down(compute_resource_nodes_map)
+
+    def _update_insufficient_capacity_compute_resources(
+        self, ice_compute_resources_and_nodes_map: Dict[str, Dict[str, List[SlurmNode]]]
+    ):
+        """Add compute resource to insufficient_capacity_compute_resources if node is ICE node."""
+        for queue_name, compute_resources in ice_compute_resources_and_nodes_map.items():
+            for compute_resource, nodes in compute_resources.items():
+                if not self._insufficient_capacity_compute_resources.get(queue_name, {}).get(compute_resource):
+                    self._insufficient_capacity_compute_resources.setdefault(queue_name, {})[
+                        compute_resource
+                    ] = ComputeResourceFailureEvent(self._current_time, nodes[0].error_code)
+
+    def _reset_timeout_expired_compute_resources(
+        self, ice_compute_resources_and_nodes_map: Dict[str, Dict[str, List[SlurmNode]]]
+    ):
+        """Reset compute resources which insufficient_capacity_timeout expired."""
+        # Find insufficient_capacity_timeout compute resources
+        if not self._insufficient_capacity_compute_resources:
+            return
+        log.info(
+            "The following compute resources are in down state due to insufficient capacity: %s, "
+            "compute resources will be reset after insufficient capacity timeout (%s seconds) expired",
+            self._insufficient_capacity_compute_resources,
+            self._config.insufficient_capacity_timeout,
+        )
+        timeout_expired_compute_resources = self._find_insufficient_capacity_timeout_expired_compute_resources()
+
+        # Reset nodes which insufficient capacity timeout expired
+        if timeout_expired_compute_resources:
+            self._reset_insufficient_capacity_timeout_expired_nodes(
+                timeout_expired_compute_resources, ice_compute_resources_and_nodes_map
+            )
+
+    def _set_ice_compute_resources_to_down(self, compute_resource_nodes_map: Dict[str, Dict[str, List[SlurmNode]]]):
+        """Set powered_down nodes which belong to insufficient capacity compute resources to down."""
+        if not self._insufficient_capacity_compute_resources:
+            return
+        nodes_to_down = {}
+        for queue_name, compute_resources in self._insufficient_capacity_compute_resources.items():
+            for compute_resource, event in compute_resources.items():
+                nodes = compute_resource_nodes_map.get(queue_name, {}).get(compute_resource, [])
+                for node in nodes:
+                    if not node.is_ice() and node.is_power() and not node.is_nodeaddr_set():
+                        error_code = event.error_code
+                        nodes_to_down.setdefault(error_code, []).append(node.name)
+        if nodes_to_down:
+            for error_code, node_list in nodes_to_down.items():
+                log.info(
+                    "Setting following nodes into DOWN state due to insufficient capacity: %s",
+                    print_with_count(node_list),
+                )
+                set_nodes_down(
+                    node_list, reason=f"(Code:{error_code})Temporarily disabling node due to insufficient capacity"
+                )
+
+    def _find_insufficient_capacity_timeout_expired_compute_resources(
+        self,
+    ) -> Dict[str, Dict[str, ComputeResourceFailureEvent]]:
+        """Find compute resources which insufficient_capacity_timeout expired."""
+        timeout_expired_cr = dict()
+        for queue_name, compute_resources in self._insufficient_capacity_compute_resources.copy().items():
+            for compute_resource, event in compute_resources.copy().items():
+                if time_is_up(event.timestamp, self._current_time, self._config.insufficient_capacity_timeout):
+                    self._insufficient_capacity_compute_resources[queue_name].pop(compute_resource)
+                    timeout_expired_cr.setdefault(queue_name, []).append(compute_resource)
+                    if not self._insufficient_capacity_compute_resources.get(queue_name):
+                        self._insufficient_capacity_compute_resources.pop(queue_name)
+        return timeout_expired_cr
+
+    def _reset_insufficient_capacity_timeout_expired_nodes(
+        self,
+        timeout_expired_cr: Dict[str, Dict[str, ComputeResourceFailureEvent]],
+        ice_compute_resources_and_nodes_map: Dict[str, Dict[str, ComputeResourceFailureEvent]],
+    ):
+        """Reset nodes in the compute resource which insufficient_capacity_timeout expired."""
+        logging.info(
+            f"Reset the following compute resources because insufficient capacity timeout expired: {timeout_expired_cr}"
+        )
+        nodes_to_power_down = []
+        for queue, compute_resources in timeout_expired_cr.items():
+            for compute_resource in compute_resources:
+                nodes = ice_compute_resources_and_nodes_map.get(queue, {}).get(compute_resource, [])
+                nodes_to_power_down += nodes
+
+        if nodes_to_power_down:
+            node_names = [node.name for node in nodes_to_power_down]
+            log.info(
+                "Enabling the following nodes because insufficient capacity timeout expired: %s",
+                print_with_count(node_names),
+            )
+            set_nodes_power_down(
+                node_names,
+                reason="Enabling node since insufficient capacity timeout expired",
+            )
+
 
 def _run_clustermgtd(config_file):
     """Run clustermgtd actions."""

src/slurm_plugin/slurm_resources.py

Lines changed: 10 additions & 6 deletions
@@ -12,6 +12,8 @@
 import logging
 import re
 from abc import ABCMeta, abstractmethod
+from dataclasses import dataclass
+from datetime import datetime
 from enum import Enum
 
 from common.utils import time_is_up
@@ -95,7 +97,7 @@ def get_online_node_by_type(self, terminate_drain_nodes, terminate_down_nodes):
                     and node.is_online()
                 ):
                     logger.debug("Currently online node: %s, node state: %s", node.name, node.state_string)
-                    online_compute_resources.add(node.get_compute_resource_name())
+                    online_compute_resources.add(node.compute_resource_name)
         return online_compute_resources
 
     def __eq__(self, other):
@@ -139,6 +141,7 @@ def __init__(self, name, nodeaddr, nodehostname, state, partitions=None, reason=
         self._is_replacement_timeout = False
         self.is_failing_health_check = False
         self.error_code = self._parse_error_code()
+        self.queue_name, self._node_type, self.compute_resource_name = parse_nodename(name)
 
     def is_nodeaddr_set(self):
         """Check if nodeaddr(private ip) for the node is set."""
@@ -253,11 +256,6 @@ def needs_reset_when_inactive(self):
         """Check if the node need to be reset if node is inactive."""
         pass
 
-    def get_compute_resource_name(self):
-        """Get instance name of given node."""
-        _, _, compute_resource_name = parse_nodename(self.name)
-        return compute_resource_name
-
     def _parse_error_code(self):
         """Parse RunInstance error code from node reason."""
         if self.reason and self.reason.startswith("(Code:"):
@@ -498,6 +496,12 @@ class InvalidNodenameError(ValueError):
     pass
 
 
+@dataclass
+class ComputeResourceFailureEvent:
+    timestamp: datetime
+    error_code: str
+
+
 def parse_nodename(nodename):
     """Parse queue_name, node_type (st vs dy) and instance_type from nodename."""
     nodename_capture = re.match(r"^([a-z0-9\-]+)-(st|dy)-([a-z0-9\-]+)-\d+$", nodename)
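The ICE classification hinges on the error code embedded in the node reason, which _parse_error_code extracts from a "(Code:...)" prefix. Below is a rough standalone sketch of that check, not the plugin's exact implementation; the reason string and the helper function are illustrative, while the error-code list comes from the commit message above:

import re

# Error codes treated as insufficient-capacity failures (from the commit message).
ICE_ERROR_CODES = {
    "InsufficientInstanceCapacity",
    "InsufficientHostCapacity",
    "InsufficientReservedInstanceCapacity",
    "MaxSpotInstanceCountExceeded",
}

def parse_error_code(reason):
    # Assumes the reason field is prefixed like "(Code:InsufficientInstanceCapacity)..."
    match = re.match(r"^\(Code:(.+?)\)", reason) if reason else None
    return match.group(1) if match else None

reason = "(Code:InsufficientInstanceCapacity)Temporarily disabling node due to insufficient capacity"
code = parse_error_code(reason)
print(code in ICE_ERROR_CODES)  # True -> the node would be treated as an ICE node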
