Skip to content

Commit be80ca0

Browse files
committed
Handle case when instances don't have all EC2 info yet
Handle the corner case where an instance has just been launched and the describe-instances call does not yet report all of its EC2 info, such as the private IP address. In this case, get_cluster_instances must not fail completely; it must only skip the instances for which not all the info is available yet. get_cluster_instances is used in the compute-fleet-stopped loop to keep the nodes down, and in the manage-cluster loop to update Slurm nodes with EC2 info and to terminate orphaned nodes. Signed-off-by: Luca Carrogu <[email protected]>
1 parent 6bf4278 commit be80ca0

File tree

2 files changed

+69
-14
lines changed

2 files changed

+69
-14
lines changed

src/slurm_plugin/instance_manager.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,11 @@ def get_unhealthy_cluster_instance_status(self, cluster_instance_ids):
342342

343343
@log_exception(logger, "getting cluster instances from EC2", raise_on_error=True)
344344
def get_cluster_instances(self, include_head_node=False, alive_states_only=True):
345-
"""Get instances that are associated with the cluster."""
345+
"""
346+
Get instances that are associated with the cluster.
347+
348+
Instances without all the info set are ignored and not returned
349+
"""
346350
ec2_client = boto3.client("ec2", region_name=self._region, config=self._boto3_config)
347351
paginator = ec2_client.get_paginator("describe_instances")
348352
args = {
@@ -354,15 +358,27 @@ def get_cluster_instances(self, include_head_node=False, alive_states_only=True)
354358
args["Filters"].append({"Name": "tag:parallelcluster:node-type", "Values": ["Compute"]})
355359
response_iterator = paginator.paginate(PaginationConfig={"PageSize": BOTO3_PAGINATION_PAGE_SIZE}, **args)
356360
filtered_iterator = response_iterator.search("Reservations[].Instances[]")
357-
return [
358-
EC2Instance(
359-
instance_info["InstanceId"],
360-
instance_info["PrivateIpAddress"],
361-
instance_info["PrivateDnsName"].split(".")[0],
362-
instance_info["LaunchTime"],
363-
)
364-
for instance_info in filtered_iterator
365-
]
361+
362+
instances = []
363+
for instance_info in filtered_iterator:
364+
try:
365+
instances.append(
366+
EC2Instance(
367+
instance_info["InstanceId"],
368+
instance_info["PrivateIpAddress"],
369+
instance_info["PrivateDnsName"].split(".")[0],
370+
instance_info["LaunchTime"],
371+
)
372+
)
373+
except Exception as e:
374+
logger.warning(
375+
"Ignoring instance %s because not all EC2 info are available, exception: %s, message: %s",
376+
instance_info["InstanceId"],
377+
type(e).__name__,
378+
e,
379+
)
380+
381+
return instances
366382

367383
def terminate_all_compute_nodes(self, terminate_batch_size):
368384
try:

tests/slurm_plugin/slurm_resources/test_instance_manager.py

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1514,7 +1514,7 @@ def get_unhealthy_cluster_instance_status(
15141514
@pytest.mark.parametrize(
15151515
"mock_kwargs, mocked_boto3_request, expected_parsed_result",
15161516
[
1517-
(
1517+
pytest.param(
15181518
{"include_head_node": False, "alive_states_only": True},
15191519
MockedBoto3Request(
15201520
method="describe_instances",
@@ -1552,8 +1552,9 @@ def get_unhealthy_cluster_instance_status(
15521552
EC2Instance("i-1", "ip-1", "hostname", datetime(2020, 1, 1, tzinfo=timezone.utc)),
15531553
EC2Instance("i-2", "ip-2", "hostname", datetime(2020, 1, 1, tzinfo=timezone.utc)),
15541554
],
1555+
id="default",
15551556
),
1556-
(
1557+
pytest.param(
15571558
{"include_head_node": False, "alive_states_only": True},
15581559
MockedBoto3Request(
15591560
method="describe_instances",
@@ -1569,8 +1570,9 @@ def get_unhealthy_cluster_instance_status(
15691570
generate_error=False,
15701571
),
15711572
[],
1573+
id="empty_response",
15721574
),
1573-
(
1575+
pytest.param(
15741576
{"include_head_node": True, "alive_states_only": False},
15751577
MockedBoto3Request(
15761578
method="describe_instances",
@@ -1595,9 +1597,46 @@ def get_unhealthy_cluster_instance_status(
15951597
generate_error=False,
15961598
),
15971599
[EC2Instance("i-1", "ip-1", "hostname", datetime(2020, 1, 1, tzinfo=timezone.utc))],
1600+
id="custom_args",
1601+
),
1602+
pytest.param(
1603+
{"include_head_node": False, "alive_states_only": True},
1604+
MockedBoto3Request(
1605+
method="describe_instances",
1606+
response={
1607+
"Reservations": [
1608+
{
1609+
"Instances": [
1610+
{
1611+
"InstanceId": "i-1",
1612+
"LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc),
1613+
},
1614+
{
1615+
"InstanceId": "i-2",
1616+
"PrivateIpAddress": "ip-2",
1617+
"PrivateDnsName": "hostname",
1618+
"LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc),
1619+
},
1620+
]
1621+
}
1622+
]
1623+
},
1624+
expected_params={
1625+
"Filters": [
1626+
{"Name": "tag:parallelcluster:cluster-name", "Values": ["hit"]},
1627+
{"Name": "instance-state-name", "Values": list(EC2_INSTANCE_ALIVE_STATES)},
1628+
{"Name": "tag:parallelcluster:node-type", "Values": ["Compute"]},
1629+
],
1630+
"MaxResults": 1000,
1631+
},
1632+
generate_error=False,
1633+
),
1634+
[
1635+
EC2Instance("i-2", "ip-2", "hostname", datetime(2020, 1, 1, tzinfo=timezone.utc)),
1636+
],
1637+
id="no_ec2_info",
15981638
),
15991639
],
1600-
ids=["default", "empty_response", "custom_args"],
16011640
)
16021641
def test_get_cluster_instances(
16031642
self, mock_kwargs, mocked_boto3_request, expected_parsed_result, instance_manager, boto3_stubber, mocker

0 commit comments

Comments
 (0)