Skip to content

Commit 3937224

Browse files
committed
Simplify generate_fleet_config_file by moving compute resource logic into a separate function
Signed-off-by: Enrico Usai <[email protected]>
1 parent a3de12a commit 3937224

File tree

2 files changed

+115
-65
lines changed

2 files changed

+115
-65
lines changed

cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py

Lines changed: 96 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -14,12 +14,20 @@
1414
import json
1515
import logging
1616
import traceback
17+
from typing import List
1718

1819
import yaml
1920

2021
log = logging.getLogger()
2122

2223

24+
CAPACITY_TYPE_MAP = {
25+
"ONDEMAND": "on-demand",
26+
"SPOT": "spot",
27+
"CAPACITY_BLOCK": "capacity-block",
28+
}
29+
30+
2331
class CriticalError(Exception):
2432
"""Critical error for the script."""
2533

@@ -32,7 +40,7 @@ class ConfigurationFieldNotFoundError(Exception):
3240
pass
3341

3442

35-
def generate_fleet_config_file(output_file, input_file):
43+
def generate_fleet_config_file(output_file: str, input_file: str):
3644
"""
3745
Generate configuration file used by Fleet Manager in node daemon package.
3846
@@ -64,21 +72,15 @@ def generate_fleet_config_file(output_file, input_file):
6472
}
6573
}
6674
"""
67-
capacity_type_map = {
68-
"ONDEMAND": "on-demand",
69-
"SPOT": "spot",
70-
"CAPACITY_BLOCK": "capacity-block",
71-
}
72-
7375
cluster_config = _load_cluster_config(input_file)
74-
queue, compute_resource = None, None
76+
queue_name, compute_resource_name = None, None
7577
try:
7678
fleet_config = {}
7779
for queue_config in cluster_config["Scheduling"]["SlurmQueues"]:
78-
queue = queue_config["Name"]
80+
queue_name = queue_config["Name"]
7981

80-
# Retrieve capacity info from the queue, if there
81-
queue_capacity_type = capacity_type_map.get(queue_config.get("CapacityType", "ONDEMAND"))
82+
# Retrieve capacity info from the queue, if present
83+
queue_capacity_type = CAPACITY_TYPE_MAP.get(queue_config.get("CapacityType", "ONDEMAND"))
8284
queue_allocation_strategy = queue_config.get("AllocationStrategy")
8385
queue_capacity_reservation_target = queue_config.get("CapacityReservationTarget", {})
8486
queue_capacity_reservation = (
@@ -87,66 +89,25 @@ def generate_fleet_config_file(output_file, input_file):
8789
else None
8890
)
8991

90-
fleet_config[queue] = {}
92+
fleet_config[queue_name] = {}
9193

9294
for compute_resource_config in queue_config["ComputeResources"]:
93-
compute_resource = compute_resource_config["Name"]
94-
95-
# Override capacity info from the compute resource.
96-
# CapacityReservationTarget can be specified on both queue and compute resource level.
97-
# CapacityType and AllocationStrategy are not yet supported at compute resource level from the CLI,
98-
# but this code is ready to use them.
99-
capacity_type = capacity_type_map.get(compute_resource_config.get("CapacityType"), queue_capacity_type)
100-
allocation_strategy = compute_resource_config.get("AllocationStrategy", queue_allocation_strategy)
101-
capacity_reservation_target = compute_resource_config.get("CapacityReservationTarget", {})
102-
capacity_reservation = (
103-
capacity_reservation_target.get("CapacityReservationId", queue_capacity_reservation)
104-
if capacity_reservation_target
105-
else queue_capacity_reservation
95+
compute_resource_name, config_for_fleet = _generate_compute_resource_fleet_config(
96+
compute_resource_config=compute_resource_config,
97+
queue_name=queue_name,
98+
queue_allocation_strategy=queue_allocation_strategy,
99+
queue_capacity_reservation=queue_capacity_reservation,
100+
queue_capacity_type=queue_capacity_type,
101+
queue_subnets=queue_config["Networking"]["SubnetIds"],
106102
)
107-
108-
config_for_fleet = {"CapacityType": capacity_type}
109-
if capacity_reservation:
110-
config_for_fleet.update({"CapacityReservationId": capacity_reservation})
111-
112-
if compute_resource_config.get("Instances"):
113-
# multiple instance types, create-fleet api
114-
config_for_fleet.update(
115-
{
116-
"Api": "create-fleet",
117-
"Instances": copy.deepcopy(compute_resource_config["Instances"]),
118-
"Networking": {"SubnetIds": queue_config["Networking"]["SubnetIds"]},
119-
}
120-
)
121-
if allocation_strategy:
122-
config_for_fleet.update({"AllocationStrategy": allocation_strategy})
123-
if capacity_type == "spot" and compute_resource_config["SpotPrice"]:
124-
config_for_fleet.update({"MaxPrice": compute_resource_config["SpotPrice"]})
125-
126-
elif compute_resource_config.get("InstanceType"):
127-
# single instance type, run-instances api
128-
config_for_fleet.update(
129-
{
130-
"Api": "run-instances",
131-
"Instances": [{"InstanceType": compute_resource_config["InstanceType"]}],
132-
}
133-
)
134-
135-
else:
136-
raise ConfigurationFieldNotFoundError(
137-
"Instances or InstanceType field not found "
138-
f"in queue: {queue}, compute resource: {compute_resource} configuration"
139-
)
140-
141-
fleet_config[queue][compute_resource] = config_for_fleet
103+
fleet_config[queue_name][compute_resource_name] = config_for_fleet
142104

143105
except (KeyError, AttributeError) as e:
144106
if isinstance(e, KeyError):
145107
message = f"Unable to find key {e} in the configuration file."
146108
else:
147109
message = f"Error parsing configuration file. {e}. {traceback.format_exc()}."
148-
message += f" Queue: {queue}" if queue else ""
149-
message += f" Compute resource: {compute_resource}" if compute_resource else ""
110+
message += f" Queue: {queue_name}" if queue_name else ""
150111
log.error(message)
151112
raise CriticalError(message)
152113

@@ -157,6 +118,79 @@ def generate_fleet_config_file(output_file, input_file):
157118
log.info("Finished.")
158119

159120

121+
def _generate_compute_resource_fleet_config(
122+
compute_resource_config: dict,
123+
queue_name: str,
124+
queue_allocation_strategy: str,
125+
queue_capacity_reservation: str,
126+
queue_capacity_type: str,
127+
queue_subnets: List,
128+
):
129+
"""
130+
Generate the compute resource config to add to fleet-config.json, overriding values from the queue.
131+
132+
CapacityReservationTarget can be specified on both queue and compute resource level.
133+
CapacityType and AllocationStrategy are not yet supported at compute resource level from the CLI,
134+
but this code is ready to use them.
135+
136+
Returns compute_resource name and fleet-config section for the given compute resource.
137+
"""
138+
compute_resource_name = compute_resource_config["Name"]
139+
140+
try:
141+
capacity_type = CAPACITY_TYPE_MAP.get(compute_resource_config.get("CapacityType"), queue_capacity_type)
142+
config_for_fleet = {"CapacityType": capacity_type}
143+
144+
capacity_reservation_target = compute_resource_config.get("CapacityReservationTarget", {})
145+
capacity_reservation = (
146+
capacity_reservation_target.get("CapacityReservationId", queue_capacity_reservation)
147+
if capacity_reservation_target
148+
else queue_capacity_reservation
149+
)
150+
if capacity_reservation:
151+
config_for_fleet.update({"CapacityReservationId": capacity_reservation})
152+
153+
if compute_resource_config.get("Instances"):
154+
# multiple instance types, create-fleet api
155+
config_for_fleet.update(
156+
{
157+
"Api": "create-fleet",
158+
"Instances": copy.deepcopy(compute_resource_config["Instances"]),
159+
"Networking": {"SubnetIds": queue_subnets},
160+
}
161+
)
162+
allocation_strategy = compute_resource_config.get("AllocationStrategy", queue_allocation_strategy)
163+
if allocation_strategy:
164+
config_for_fleet.update({"AllocationStrategy": allocation_strategy})
165+
if capacity_type == "spot" and compute_resource_config["SpotPrice"]:
166+
config_for_fleet.update({"MaxPrice": compute_resource_config["SpotPrice"]})
167+
168+
elif compute_resource_config.get("InstanceType"):
169+
# single instance type, run-instances api
170+
config_for_fleet.update(
171+
{
172+
"Api": "run-instances",
173+
"Instances": [{"InstanceType": compute_resource_config["InstanceType"]}],
174+
}
175+
)
176+
177+
else:
178+
raise ConfigurationFieldNotFoundError(
179+
"Instances or InstanceType field not found "
180+
f"in queue: {queue_name}, compute resource: {compute_resource_name} configuration"
181+
)
182+
except (KeyError, AttributeError) as e:
183+
if isinstance(e, KeyError):
184+
message = f"Unable to find key {e} in the configuration file."
185+
else:
186+
message = f"Error parsing configuration file. {e}. {traceback.format_exc()}."
187+
message += f" Queue: {queue_name}, Compute resource: {compute_resource_name}"
188+
log.error(message)
189+
raise CriticalError(message)
190+
191+
return compute_resource_name, config_for_fleet
192+
193+
160194
def _load_cluster_config(input_file_path):
161195
"""Load cluster config file."""
162196
with open(input_file_path, encoding="utf-8") as input_file:

test/unit/slurm/test_fleet_config_generator.py

Lines changed: 19 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,22 @@
4444
}
4545
},
4646
CriticalError,
47+
"Unable to find key 'Networking' in the configuration file. Queue: q1",
48+
),
49+
(
50+
{
51+
"Scheduling": {
52+
"SlurmQueues": [
53+
{
54+
"Name": "q1",
55+
"CapacityType": "SPOT",
56+
"ComputeResources": [{"Instances": []}],
57+
"Networking": {"SubnetIds": ["123"]},
58+
}
59+
]
60+
}
61+
},
62+
CriticalError,
4763
"Unable to find key 'Name' in the configuration file. Queue: q1",
4864
),
4965
(
@@ -137,7 +153,7 @@
137153
}
138154
},
139155
CriticalError,
140-
"Unable to find key 'SpotPrice' in the configuration file. Queue: q1 Compute resource: cr1",
156+
"Unable to find key 'SpotPrice' in the configuration file. Queue: q1, Compute resource: cr1",
141157
),
142158
(
143159
{
@@ -172,7 +188,7 @@
172188
}
173189
},
174190
CriticalError,
175-
"Unable to find key 'Networking' in the configuration file. Queue: q1 Compute resource: cr1",
191+
"Unable to find key 'Networking' in the configuration file. Queue: q1",
176192
),
177193
(
178194
{
@@ -190,7 +206,7 @@
190206
}
191207
},
192208
CriticalError,
193-
"Unable to find key 'SubnetIds' in the configuration file. Queue: q1 Compute resource: cr1",
209+
"Unable to find key 'SubnetIds' in the configuration file. Queue: q1",
194210
),
195211
(
196212
{

0 commit comments

Comments
 (0)