|
| 1 | +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with |
| 4 | +# the License. A copy of the License is located at |
| 5 | +# |
| 6 | +# http://aws.amazon.com/apache2.0/ |
| 7 | +# |
| 8 | +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES |
| 9 | +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and |
| 10 | +# limitations under the License. |
| 11 | + |
| 12 | + |
| 13 | +import os |
| 14 | +from types import SimpleNamespace |
| 15 | + |
| 16 | +import botocore |
| 17 | +import pytest |
| 18 | +import slurm_plugin |
| 19 | +from assertpy import assert_that |
| 20 | +from slurm_plugin.clustermgtd import ComputeFleetStatus |
| 21 | +from slurm_plugin.fleet_status_manager import ( |
| 22 | + SlurmFleetManagerConfig, |
| 23 | + _get_computefleet_status, |
| 24 | + _manage_fleet_status_transition, |
| 25 | + _start_partitions, |
| 26 | + _stop_partitions, |
| 27 | +) |
| 28 | +from slurm_plugin.slurm_resources import PartitionStatus |
| 29 | + |
| 30 | + |
@pytest.fixture()
def boto3_stubber_path(monkeypatch):
    """Return the dotted path of the boto3 module used by slurm_plugin.instance_manager.

    The AWS region is set through ``monkeypatch`` so that the environment variable is restored when the test ends
    instead of leaking into every test that runs afterwards.
    """
    # we need to set the region in the environment because the Boto3ClientFactory requires it.
    monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-2")
    return "slurm_plugin.instance_manager.boto3"
| 36 | + |
| 37 | + |
@pytest.mark.parametrize(
    ("config_file", "expected_attributes"),
    [
        (
            "default.conf",
            {
                "cluster_name": "test",
                "region": "us-east-2",
                "terminate_max_batch_size": 1000,
                "_boto3_config": {"retries": {"max_attempts": 5, "mode": "standard"}},
                "logging_config": os.path.join(
                    os.path.dirname(slurm_plugin.__file__),
                    "logging",
                    "parallelcluster_fleet_status_manager_logging.conf",
                ),
            },
        ),
        (
            "all_options.conf",
            {
                "cluster_name": "test_again",
                "region": "us-east-1",
                "terminate_max_batch_size": 50,
                "_boto3_config": {
                    "retries": {"max_attempts": 10, "mode": "standard"},
                    "proxies": {"https": "my.resume.proxy"},
                },
                "logging_config": "/path/to/fleet_status_manager_logging/config",
            },
        ),
    ],
)
def test_fleet_status_manager_config(config_file, expected_attributes, test_datadir):
    """Check that SlurmFleetManagerConfig built from a config file carries the expected attribute values."""
    fleet_manager_config = SlurmFleetManagerConfig(test_datadir / config_file)
    loaded_attributes = vars(fleet_manager_config)

    # An attribute missing from the instance is compared as None, exactly like a plain dict lookup.
    for attribute_name, expected_value in expected_attributes.items():
        assert_that(loaded_attributes.get(attribute_name)).is_equal_to(expected_value)
| 74 | + |
| 75 | + |
@pytest.mark.parametrize(
    ("computefleet_status_data_path", "status", "action"),
    [
        ("path_to_file_1", ComputeFleetStatus.STOPPED, None),
        ("path_to_file_2", ComputeFleetStatus.RUNNING, None),
        ("path_to_file_3", ComputeFleetStatus.STOPPING, None),
        ("path_to_file_4", ComputeFleetStatus.STARTING, None),
        ("path_to_file_5", ComputeFleetStatus.STOP_REQUESTED, "stop"),
        ("path_to_file_6", ComputeFleetStatus.START_REQUESTED, "start"),
        ("path_to_file_7", ComputeFleetStatus.PROTECTED, None),
    ],
)
def test_fleet_status_manager(mocker, test_datadir, computefleet_status_data_path, status, action):
    """Check which partition action _manage_fleet_status_transition triggers for each compute fleet status.

    ``action`` is "start", "stop" or None and names the single helper expected to be invoked (None means neither).
    """
    # mocks
    config = SimpleNamespace(some_key_1="some_value_1", some_key_2="some_value_2")
    get_computefleet_status_mocked = mocker.patch("slurm_plugin.fleet_status_manager._get_computefleet_status")
    get_computefleet_status_mocked.return_value = status
    stop_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager._stop_partitions")
    start_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager._start_partitions")

    # method to test
    _manage_fleet_status_transition(config, computefleet_status_data_path)

    # assertions
    get_computefleet_status_mocked.assert_called_once_with(computefleet_status_data_path)
    if action == "start":
        # Pin the exact call (no arguments), mirroring the argument check done for the stop branch.
        start_partitions_mocked.assert_called_once_with()
        stop_partitions_mocked.assert_not_called()
    elif action == "stop":
        stop_partitions_mocked.assert_called_once_with(config)
        start_partitions_mocked.assert_not_called()
    else:
        start_partitions_mocked.assert_not_called()
        stop_partitions_mocked.assert_not_called()
| 110 | + |
| 111 | + |
@pytest.mark.parametrize(
    ("config_file", "expected_status"),
    [
        ("correct_status.json", ComputeFleetStatus.RUNNING),
        ("no_status.json", Exception),
        ("malformed_status.json", Exception),
        ("wrong_status.json", Exception),
        (None, Exception),
    ],
)
def test_get_computefleet_status(test_datadir, config_file, expected_status):
    """Check that _get_computefleet_status returns the stored status or raises on unusable input.

    ``expected_status`` is either the ComputeFleetStatus expected back, or the ``Exception`` class itself as a marker
    meaning that the call must raise.
    """
    # Build the path before entering pytest.raises: joining the data dir with None would raise on its own and let
    # the None case pass without _get_computefleet_status ever being called.
    status_data_path = None if config_file is None else test_datadir / config_file

    if expected_status is Exception:
        with pytest.raises(Exception):
            _get_computefleet_status(status_data_path)
    else:
        status = _get_computefleet_status(status_data_path)
        assert_that(status).is_equal_to(expected_status)
| 129 | + |
| 130 | + |
def test_start_partitions(mocker):
    """Check that _start_partitions sets all partitions UP and resumes powering-down nodes."""
    # mocks
    partitions_updater = mocker.patch("slurm_plugin.fleet_status_manager.update_all_partitions")
    nodes_resumer = mocker.patch("slurm_plugin.fleet_status_manager.resume_powering_down_nodes")

    # method to test
    _start_partitions()

    # assertions
    partitions_updater.assert_called_once_with(PartitionStatus.UP, reset_node_addrs_hostname=False)
    nodes_resumer.assert_called_once()
| 139 | + |
| 140 | + |
def test_stop_partitions(mocker):
    """Check that _stop_partitions deactivates all partitions and terminates all compute nodes."""
    # mocks
    config = SimpleNamespace(
        terminate_max_batch_size="3", region="us-east-1", cluster_name="test", boto3_config=botocore.config.Config()
    )
    update_all_partitions_mocked = mocker.patch("slurm_plugin.fleet_status_manager.update_all_partitions")

    # The keyword is `autospec`; the previous `auto_spec` spelling was silently stored as a plain attribute on the
    # mock, so the method signature was never enforced.
    terminate_all_compute_nodes_mocked = mocker.patch.object(
        slurm_plugin.instance_manager.InstanceManager, "terminate_all_compute_nodes", autospec=True
    )

    # method to test
    _stop_partitions(config)

    # assertions
    update_all_partitions_mocked.assert_called_once_with(PartitionStatus.INACTIVE, reset_node_addrs_hostname=True)
    # An autospecced method patched on the class records the bound instance as the first positional argument.
    terminate_all_compute_nodes_mocked.assert_called_once_with(mocker.ANY, config.terminate_max_batch_size)
0 commit comments