
Commit aa565ec

Set failed nodes down with the error code as reason
* When setting a dynamic node to the down state, set the reason to the error code if the node fails with a ClientError.
* If the error type is a general Exception, set the error code to "Exception"; if the exception is a ClientError, set the error code equal to the ClientError error code.
* Insufficient capacity error codes: {"InsufficientInstanceCapacity", "MaxSpotInstanceCountExceeded", "InsufficientHostCapacity", "InsufficientReservedInstanceCapacity"}. When setting a dynamic node to the down state, set the reason to the error code if the node fails with insufficient capacity.

Signed-off-by: chenwany <[email protected]>
1 parent fc74dcf commit aa565ec
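
To illustrate the behavior the commit message describes, here is a minimal sketch of mapping a launch failure to a node DOWN reason. The helper name reason_for_failure is hypothetical, not code from this commit; only the error-code set and the reason format come from the diff below.

    from botocore.exceptions import ClientError

    # Error codes the commit message classifies as insufficient capacity.
    INSUFFICIENT_CAPACITY_ERROR_CODES = {
        "InsufficientInstanceCapacity",
        "MaxSpotInstanceCountExceeded",
        "InsufficientHostCapacity",
        "InsufficientReservedInstanceCapacity",
    }

    def reason_for_failure(exception):
        """Hypothetical helper: derive the DOWN reason from a launch failure."""
        if isinstance(exception, ClientError):
            # ClientError carries the service error code in its response payload.
            error_code = exception.response.get("Error", {}).get("Code")
        else:
            # Any other exception type is recorded under the generic "Exception" code.
            error_code = "Exception"
        return f"(Code:{error_code})Failure when resuming nodes"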

File tree

5 files changed (+220, −45 lines)

src/slurm_plugin/clustermgtd.py

Lines changed: 2 additions & 1 deletion
@@ -676,7 +676,8 @@ def _handle_unhealthy_static_nodes(self, unhealthy_static_nodes):
                 node_list, self._config.launch_max_batch_size, self._config.update_node_address
             )
             # Add launched nodes to list of nodes being replaced, excluding any nodes that failed to launch
-            launched_nodes = set(node_list) - set(self._instance_manager.failed_nodes)
+            failed_nodes = set().union(*self._instance_manager.failed_nodes.values())
+            launched_nodes = set(node_list) - failed_nodes
             self._static_nodes_in_replacement |= launched_nodes
             log.info(
                 "After node maintenance, following nodes are currently in replacement: %s",

src/slurm_plugin/instance_manager.py

Lines changed: 19 additions & 7 deletions
@@ -61,7 +61,7 @@ def __init__(
         self._region = region
         self._cluster_name = cluster_name
         self._boto3_config = boto3_config
-        self.failed_nodes = []
+        self.failed_nodes = {}
         self._ddb_resource = boto3.resource("dynamodb", region_name=region, config=boto3_config)
         self._table = self._ddb_resource.Table(table_name) if table_name else None
         self._hosted_zone = hosted_zone
@@ -74,7 +74,7 @@ def __init__(

     def _clear_failed_nodes(self):
         """Clear and reset failed nodes list."""
-        self.failed_nodes = []
+        self.failed_nodes = {}

     def add_instances_for_nodes(
         self, node_list, launch_batch_size, update_node_address=True, all_or_nothing_batch=False
@@ -102,14 +102,22 @@ def add_instances_for_nodes(
                         self._store_assigned_hostnames(assigned_nodes)
                         self._update_dns_hostnames(assigned_nodes)
                     except Exception:
-                        self.failed_nodes.extend(list(assigned_nodes.keys()))
+                        self._update_failed_nodes(set(assigned_nodes.keys()))
+            except ClientError as e:
+                logger.error(
+                    "Encountered exception when launching instances for nodes %s: %s",
+                    print_with_count(batch_nodes),
+                    e,
+                )
+                error_code = e.response.get("Error", {}).get("Code")
+                self._update_failed_nodes(set(batch_nodes), error_code)
             except Exception as e:
                 logger.error(
                     "Encountered exception when launching instances for nodes %s: %s",
                     print_with_count(batch_nodes),
                     e,
                 )
-                self.failed_nodes.extend(batch_nodes)
+                self._update_failed_nodes(set(batch_nodes))

     def _update_slurm_node_addrs(self, slurm_nodes, launched_instances):
         """Update node information in slurm with info from launched EC2 instance."""
@@ -139,7 +147,7 @@ def _update_slurm_node_addrs(self, slurm_nodes, launched_instances):
             )
             if fail_launch_nodes:
                 logger.info("Failed to launch instances for following nodes: %s", print_with_count(fail_launch_nodes))
-                self.failed_nodes.extend(fail_launch_nodes)
+                self._update_failed_nodes(set(fail_launch_nodes), "LimitedInstanceCapacity")

             return dict(zip(launched_nodes, launched_instances))

@@ -149,7 +157,7 @@ def _update_slurm_node_addrs(self, slurm_nodes, launched_instances):
                 print_with_count(slurm_nodes),
                 print_with_count(launched_instances),
             )
-            self.failed_nodes.extend(slurm_nodes)
+            self._update_failed_nodes(set(slurm_nodes))

     @log_exception(logger, "saving assigned hostnames in DynamoDB", raise_on_error=True)
     def _store_assigned_hostnames(self, nodes):
@@ -228,7 +236,7 @@ def _parse_requested_instances(self, node_list):
                 instances_to_launch[queue_name][compute_resource_name].append(node)
             except (InvalidNodenameError, KeyError):
                 logger.warning("Discarding NodeName with invalid format: %s", node)
-                self.failed_nodes.append(node)
+                self._update_failed_nodes({node}, "InvalidNodenameError")
         logger.debug("Launch configuration requested by nodes = %s", instances_to_launch)

         return instances_to_launch
@@ -362,3 +370,7 @@ def terminate_all_compute_nodes(self, terminate_batch_size):
         except Exception as e:
             logger.error("Failed when terminating compute fleet with error %s", e)
             return False
+
+    def _update_failed_nodes(self, nodeset, error_code="Exception"):
+        """Update failed nodes dict with error code as key and nodeset value."""
+        self.failed_nodes[error_code] = self.failed_nodes.get(error_code, set()).union(nodeset)
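
The helper merges node sets per error code, so repeated failures under the same code accumulate instead of overwriting earlier ones. A standalone sketch of the same logic, with sample node names:

    failed_nodes = {}

    def update_failed_nodes(nodeset, error_code="Exception"):
        # Same merge logic as _update_failed_nodes above, shown outside the class.
        failed_nodes[error_code] = failed_nodes.get(error_code, set()).union(nodeset)

    update_failed_nodes({"queue1-dy-c5xlarge-1"}, "InsufficientHostCapacity")
    update_failed_nodes({"queue1-dy-c5xlarge-2"}, "InsufficientHostCapacity")
    update_failed_nodes({"queue1-dy-c5xlarge-3"})  # falls back to "Exception"
    # failed_nodes == {
    #     "InsufficientHostCapacity": {"queue1-dy-c5xlarge-1", "queue1-dy-c5xlarge-2"},
    #     "Exception": {"queue1-dy-c5xlarge-3"},
    # }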

src/slurm_plugin/resume.py

Lines changed: 9 additions & 6 deletions
@@ -116,7 +116,7 @@ def _get_config(self, config_file_path):
         log.info(self.__repr__())


-def _handle_failed_nodes(node_list):
+def _handle_failed_nodes(node_list, reason="Failure when resuming nodes"):
     """
     Fall back mechanism to handle failure when launching instances.
@@ -132,7 +132,7 @@ def _handle_failed_nodes(node_list):
     """
     try:
         log.info("Setting following failed nodes into DOWN state: %s", print_with_count(node_list))
-        set_nodes_down(node_list, reason="Failure when resuming nodes")
+        set_nodes_down(node_list, reason=reason)
     except Exception as e:
         log.error("Failed to place nodes %s into down with exception: %s", print_with_count(node_list), e)
@@ -175,14 +175,17 @@ def _resume(arg_nodes, resume_config):
         update_node_address=resume_config.update_node_address,
         all_or_nothing_batch=resume_config.all_or_nothing_batch,
     )
-    success_nodes = [node for node in node_list if node not in instance_manager.failed_nodes]
+    failed_nodes = set().union(*instance_manager.failed_nodes.values())
+    success_nodes = [node for node in node_list if node not in failed_nodes]
     log.info("Successfully launched nodes %s", print_with_count(success_nodes))
-    if instance_manager.failed_nodes:
+
+    if failed_nodes:
         log.error(
             "Failed to launch following nodes, setting nodes to down: %s",
-            print_with_count(instance_manager.failed_nodes),
+            print_with_count(failed_nodes),
         )
-        _handle_failed_nodes(instance_manager.failed_nodes)
+        for error_code, node_list in instance_manager.failed_nodes.items():
+            _handle_failed_nodes(node_list, reason=f"(Code:{error_code})Failure when resuming nodes")


 def main():
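
Each error-code group is now set DOWN separately, with the code embedded in the reason string. A short sketch of the reasons this loop produces, using sample data only:

    failed_nodes = {
        "InsufficientInstanceCapacity": {"queue2-dy-c5xlarge-2"},
        "Exception": {"queue2-dy-c5xlarge-3"},
    }
    for error_code, nodes in failed_nodes.items():
        reason = f"(Code:{error_code})Failure when resuming nodes"
        print(sorted(nodes), "->", reason)
    # ['queue2-dy-c5xlarge-2'] -> (Code:InsufficientInstanceCapacity)Failure when resuming nodes
    # ['queue2-dy-c5xlarge-3'] -> (Code:Exception)Failure when resuming nodes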

tests/slurm_plugin/slurm_resources/test_instance_manager.py

Lines changed: 37 additions & 24 deletions
@@ -165,7 +165,7 @@ def instance_manager(self, mocker):
                 },
             ),
         ],
-        None,
+        {},
         [
             call(
                 ["queue1-st-c5xlarge-2"],
@@ -229,6 +229,7 @@ def instance_manager(self, mocker):
                     "LaunchTemplate": {"LaunchTemplateName": "hit-queue1-c5.2xlarge", "Version": "$Latest"},
                 },
                 generate_error=True,
+                error_code="some_error_code",
             ),
             MockedBoto3Request(
                 method="run_instances",
@@ -257,7 +258,7 @@ def instance_manager(self, mocker):
                 },
             ),
         ],
-        ["queue1-st-c52xlarge-1"],
+        {"some_error_code": {"queue1-st-c52xlarge-1"}},
         [
             call(
                 ["queue1-st-c5xlarge-2"],
@@ -303,7 +304,7 @@ def instance_manager(self, mocker):
                 },
             ),
         ],
-        None,
+        {},
         None,
     ),
     # batch_size1
@@ -353,6 +354,7 @@ def instance_manager(self, mocker):
                     "LaunchTemplate": {"LaunchTemplateName": "hit-queue1-c5.2xlarge", "Version": "$Latest"},
                 },
                 generate_error=True,
+                error_code="InsufficientHostCapacity",
             ),
             MockedBoto3Request(
                 method="run_instances",
@@ -363,14 +365,13 @@ def instance_manager(self, mocker):
                     "LaunchTemplate": {"LaunchTemplateName": "hit-queue2-c5.xlarge", "Version": "$Latest"},
                 },
                 generate_error=True,
+                error_code="ServiceUnavailable",
             ),
         ],
-        [
-            "queue1-st-c52xlarge-1",
-            "queue2-st-c5xlarge-1",
-            "queue2-st-c5xlarge-2",
-            "queue2-dy-c5xlarge-1",
-        ],
+        {
+            "InsufficientHostCapacity": {"queue1-st-c52xlarge-1"},
+            "ServiceUnavailable": {"queue2-st-c5xlarge-1", "queue2-dy-c5xlarge-1", "queue2-st-c5xlarge-2"},
+        },
         [
             call(
                 ["queue1-st-c5xlarge-2"],
@@ -425,6 +426,7 @@ def instance_manager(self, mocker):
                     "LaunchTemplate": {"LaunchTemplateName": "hit-queue1-c5.2xlarge", "Version": "$Latest"},
                 },
                 generate_error=True,
+                error_code="InsufficientVolumeCapacity",
             ),
             MockedBoto3Request(
                 method="run_instances",
@@ -454,6 +456,7 @@ def instance_manager(self, mocker):
                     "LaunchTemplate": {"LaunchTemplateName": "hit-queue2-c5.xlarge", "Version": "$Latest"},
                 },
                 generate_error=True,
+                error_code="InternalError",
             ),
             MockedBoto3Request(
                 method="run_instances",
@@ -475,7 +478,7 @@ def instance_manager(self, mocker):
                 },
             ),
         ],
-        ["queue1-st-c52xlarge-1", "queue2-st-c5xlarge-2"],
+        {"InsufficientVolumeCapacity": {"queue1-st-c52xlarge-1"}, "InternalError": {"queue2-st-c5xlarge-2"}},
         [
             call(
                 ["queue1-st-c5xlarge-2"],
@@ -525,7 +528,7 @@ def instance_manager(self, mocker):
                 "LaunchTemplate": {"LaunchTemplateName": "hit-queue2-c5.xlarge", "Version": "$Latest"},
             },
         ),
-        ["queue2-st-c5xlarge-2", "queue2-dy-c5xlarge-1"],
+        {"LimitedInstanceCapacity": {"queue2-st-c5xlarge-2", "queue2-dy-c5xlarge-1"}},
         [
             call(
                 ["queue2-st-c5xlarge-1", "queue2-st-c5xlarge-2", "queue2-dy-c5xlarge-1"],
@@ -592,9 +595,10 @@ def instance_manager(self, mocker):
                     "LaunchTemplate": {"LaunchTemplateName": "hit-queue2-c5.xlarge", "Version": "$Latest"},
                 },
                 generate_error=True,
+                error_code="InsufficientInstanceCapacity",
             ),
         ],
-        ["queue2-dy-c5xlarge-2", "queue2-dy-c5xlarge-3"],
+        {"InsufficientInstanceCapacity": {"queue2-dy-c5xlarge-2", "queue2-dy-c5xlarge-3"}},
        [
            call(
                ["queue2-st-c5xlarge-1", "queue2-st-c5xlarge-2", "queue2-dy-c5xlarge-1"],
@@ -716,7 +720,7 @@ def instance_manager(self, mocker):
                 },
             ),
         ],
-        None,
+        {},
         [
             call(
                 ["queue3-st-c5xlarge-2"],
@@ -941,35 +945,42 @@ def test_launch_ec2_instances(
             ["queue1-st-c5xlarge-1"],
             [EC2Instance("id-1", "ip-1", "hostname-1", "some_launch_time")],
             call(["queue1-st-c5xlarge-1"], nodeaddrs=["ip-1"], nodehostnames=None),
-            [],
+            {},
+            False,
+            "dns.domain",
+        ),
+        (
+            ["queue1-st-c5xlarge-1"],
+            {},
+            None,
+            {"LimitedInstanceCapacity": {"queue1-st-c5xlarge-1"}},
             False,
             "dns.domain",
         ),
-        (["queue1-st-c5xlarge-1"], [], None, ["queue1-st-c5xlarge-1"], False, "dns.domain"),
         (
             ["queue1-st-c5xlarge-1", "queue1-st-c5xlarge-2", "queue1-st-c5xlarge-3", "queue1-st-c5xlarge-4"],
             [
                 EC2Instance("id-1", "ip-1", "hostname-1", "some_launch_time"),
                 EC2Instance("id-2", "ip-2", "hostname-2", "some_launch_time"),
             ],
             call(["queue1-st-c5xlarge-1", "queue1-st-c5xlarge-2"], nodeaddrs=["ip-1", "ip-2"], nodehostnames=None),
-            ["queue1-st-c5xlarge-3", "queue1-st-c5xlarge-4"],
+            {"LimitedInstanceCapacity": {"queue1-st-c5xlarge-4", "queue1-st-c5xlarge-3"}},
             False,
             "dns.domain",
         ),
         (
             ["queue1-st-c5xlarge-1"],
             [EC2Instance("id-1", "ip-1", "hostname-1", "some_launch_time")],
             call(["queue1-st-c5xlarge-1"], nodeaddrs=["ip-1"], nodehostnames=["hostname-1"]),
-            [],
+            {},
             True,
             "dns.domain",
         ),
         (
             ["queue1-st-c5xlarge-1"],
             [EC2Instance("id-1", "ip-1", "hostname-1", "some_launch_time")],
             call(["queue1-st-c5xlarge-1"], nodeaddrs=["ip-1"], nodehostnames=None),
-            [],
+            {},
             False,
             "",
         ),
@@ -1306,12 +1317,14 @@ def test_update_dns_hostnames(
                     "u6tb1metal": ["queue2-st-u6tb1metal-1"],
                 },
             },
-            [
-                "in-valid/queue.name-st-c5xlarge-2",
-                "noBrackets-st-c5xlarge-[1-2]",
-                "queue2-invalidnodetype-c5xlarge-12",
-                "queuename-with-dash-and_underscore-st-i3enmetal2tb-1",
-            ],
+            {
+                "InvalidNodenameError": {
+                    "queue2-invalidnodetype-c5xlarge-12",
+                    "noBrackets-st-c5xlarge-[1-2]",
+                    "queuename-with-dash-and_underscore-st-i3enmetal2tb-1",
+                    "in-valid/queue.name-st-c5xlarge-2",
+                }
+            },
         ),
     ],
 )
