Skip to content

Commit b57387c

Browse files
authored
feat: add support for gpu_sharing_config on nodepool (terraform-google-modules#1874)
1 parent c51c446 commit b57387c

File tree

16 files changed

+317
-149
lines changed

16 files changed

+317
-149
lines changed

README.md

Lines changed: 23 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -54,24 +54,29 @@ module "gke" {
5454
5555
node_pools = [
5656
{
57-
name = "default-node-pool"
58-
machine_type = "e2-medium"
59-
node_locations = "us-central1-b,us-central1-c"
60-
min_count = 1
61-
max_count = 100
62-
local_ssd_count = 0
63-
spot = false
64-
disk_size_gb = 100
65-
disk_type = "pd-standard"
66-
image_type = "COS_CONTAINERD"
67-
enable_gcfs = false
68-
enable_gvnic = false
69-
logging_variant = "DEFAULT"
70-
auto_repair = true
71-
auto_upgrade = true
72-
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
73-
preemptible = false
74-
initial_node_count = 80
57+
name = "default-node-pool"
58+
machine_type = "e2-medium"
59+
node_locations = "us-central1-b,us-central1-c"
60+
min_count = 1
61+
max_count = 100
62+
local_ssd_count = 0
63+
spot = false
64+
disk_size_gb = 100
65+
disk_type = "pd-standard"
66+
image_type = "COS_CONTAINERD"
67+
enable_gcfs = false
68+
enable_gvnic = false
69+
logging_variant = "DEFAULT"
70+
auto_repair = true
71+
auto_upgrade = true
72+
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
73+
preemptible = false
74+
initial_node_count = 80
75+
accelerator_count = 1
76+
accelerator_type = "nvidia-l4"
77+
gpu_driver_version = "LATEST"
78+
gpu_sharing_strategy = "TIME_SHARING"
79+
max_shared_clients_per_gpu = 2
7580
},
7681
]
7782

autogen/main/README.md

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -96,27 +96,32 @@ module "gke" {
9696
{% if autopilot_cluster != true %}
9797
node_pools = [
9898
{
99-
name = "default-node-pool"
100-
machine_type = "e2-medium"
101-
node_locations = "us-central1-b,us-central1-c"
102-
min_count = 1
103-
max_count = 100
104-
local_ssd_count = 0
105-
spot = false
99+
name = "default-node-pool"
100+
machine_type = "e2-medium"
101+
node_locations = "us-central1-b,us-central1-c"
102+
min_count = 1
103+
max_count = 100
104+
local_ssd_count = 0
105+
spot = false
106106
{% if beta_cluster %}
107-
local_ssd_ephemeral_count = 0
107+
local_ssd_ephemeral_count = 0
108108
{% endif %}
109-
disk_size_gb = 100
110-
disk_type = "pd-standard"
111-
image_type = "COS_CONTAINERD"
112-
enable_gcfs = false
113-
enable_gvnic = false
114-
logging_variant = "DEFAULT"
115-
auto_repair = true
116-
auto_upgrade = true
117-
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
118-
preemptible = false
119-
initial_node_count = 80
109+
disk_size_gb = 100
110+
disk_type = "pd-standard"
111+
image_type = "COS_CONTAINERD"
112+
enable_gcfs = false
113+
enable_gvnic = false
114+
logging_variant = "DEFAULT"
115+
auto_repair = true
116+
auto_upgrade = true
117+
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
118+
preemptible = false
119+
initial_node_count = 80
120+
accelerator_count = 1
121+
accelerator_type = "nvidia-l4"
122+
gpu_driver_version = "LATEST"
123+
gpu_sharing_strategy = "TIME_SHARING"
124+
max_shared_clients_per_gpu = 2
120125
},
121126
]
122127

autogen/main/cluster.tf.tmpl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,8 @@ locals {
643643
"accelerator_type",
644644
"gpu_partition_size",
645645
"gpu_driver_version",
646+
"gpu_sharing_strategy",
647+
"max_shared_clients_per_gpu",
646648
"enable_secure_boot",
647649
"enable_integrity_monitoring",
648650
"local_ssd_count",
@@ -927,6 +929,14 @@ resource "google_container_node_pool" "windows_pools" {
927929
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
928930
}
929931
}
932+
933+
dynamic "gpu_sharing_config" {
934+
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
935+
content {
936+
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
937+
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
938+
}
939+
}
930940
}
931941
}
932942

cluster.tf

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,14 @@ resource "google_container_node_pool" "pools" {
644644
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
645645
}
646646
}
647+
648+
dynamic "gpu_sharing_config" {
649+
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
650+
content {
651+
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
652+
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
653+
}
654+
}
647655
}
648656
}
649657

@@ -882,6 +890,14 @@ resource "google_container_node_pool" "windows_pools" {
882890
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
883891
}
884892
}
893+
894+
dynamic "gpu_sharing_config" {
895+
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
896+
content {
897+
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
898+
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
899+
}
900+
}
885901
}
886902
}
887903

modules/beta-private-cluster-update-variant/README.md

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -84,25 +84,30 @@ module "gke" {
8484
8585
node_pools = [
8686
{
87-
name = "default-node-pool"
88-
machine_type = "e2-medium"
89-
node_locations = "us-central1-b,us-central1-c"
90-
min_count = 1
91-
max_count = 100
92-
local_ssd_count = 0
93-
spot = false
94-
local_ssd_ephemeral_count = 0
95-
disk_size_gb = 100
96-
disk_type = "pd-standard"
97-
image_type = "COS_CONTAINERD"
98-
enable_gcfs = false
99-
enable_gvnic = false
100-
logging_variant = "DEFAULT"
101-
auto_repair = true
102-
auto_upgrade = true
103-
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
104-
preemptible = false
105-
initial_node_count = 80
87+
name = "default-node-pool"
88+
machine_type = "e2-medium"
89+
node_locations = "us-central1-b,us-central1-c"
90+
min_count = 1
91+
max_count = 100
92+
local_ssd_count = 0
93+
spot = false
94+
local_ssd_ephemeral_count = 0
95+
disk_size_gb = 100
96+
disk_type = "pd-standard"
97+
image_type = "COS_CONTAINERD"
98+
enable_gcfs = false
99+
enable_gvnic = false
100+
logging_variant = "DEFAULT"
101+
auto_repair = true
102+
auto_upgrade = true
103+
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
104+
preemptible = false
105+
initial_node_count = 80
106+
accelerator_count = 1
107+
accelerator_type = "nvidia-l4"
108+
gpu_driver_version = "LATEST"
109+
gpu_sharing_strategy = "TIME_SHARING"
110+
max_shared_clients_per_gpu = 2
106111
},
107112
]
108113

modules/beta-private-cluster-update-variant/cluster.tf

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,8 @@ locals {
552552
"accelerator_type",
553553
"gpu_partition_size",
554554
"gpu_driver_version",
555+
"gpu_sharing_strategy",
556+
"max_shared_clients_per_gpu",
555557
"enable_secure_boot",
556558
"enable_integrity_monitoring",
557559
"local_ssd_count",
@@ -811,6 +813,14 @@ resource "google_container_node_pool" "pools" {
811813
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
812814
}
813815
}
816+
817+
dynamic "gpu_sharing_config" {
818+
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
819+
content {
820+
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
821+
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
822+
}
823+
}
814824
}
815825
}
816826

@@ -1075,6 +1085,14 @@ resource "google_container_node_pool" "windows_pools" {
10751085
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
10761086
}
10771087
}
1088+
1089+
dynamic "gpu_sharing_config" {
1090+
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
1091+
content {
1092+
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
1093+
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
1094+
}
1095+
}
10781096
}
10791097
}
10801098

modules/beta-private-cluster/README.md

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -62,25 +62,30 @@ module "gke" {
6262
6363
node_pools = [
6464
{
65-
name = "default-node-pool"
66-
machine_type = "e2-medium"
67-
node_locations = "us-central1-b,us-central1-c"
68-
min_count = 1
69-
max_count = 100
70-
local_ssd_count = 0
71-
spot = false
72-
local_ssd_ephemeral_count = 0
73-
disk_size_gb = 100
74-
disk_type = "pd-standard"
75-
image_type = "COS_CONTAINERD"
76-
enable_gcfs = false
77-
enable_gvnic = false
78-
logging_variant = "DEFAULT"
79-
auto_repair = true
80-
auto_upgrade = true
81-
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
82-
preemptible = false
83-
initial_node_count = 80
65+
name = "default-node-pool"
66+
machine_type = "e2-medium"
67+
node_locations = "us-central1-b,us-central1-c"
68+
min_count = 1
69+
max_count = 100
70+
local_ssd_count = 0
71+
spot = false
72+
local_ssd_ephemeral_count = 0
73+
disk_size_gb = 100
74+
disk_type = "pd-standard"
75+
image_type = "COS_CONTAINERD"
76+
enable_gcfs = false
77+
enable_gvnic = false
78+
logging_variant = "DEFAULT"
79+
auto_repair = true
80+
auto_upgrade = true
81+
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
82+
preemptible = false
83+
initial_node_count = 80
84+
accelerator_count = 1
85+
accelerator_type = "nvidia-l4"
86+
gpu_driver_version = "LATEST"
87+
gpu_sharing_strategy = "TIME_SHARING"
88+
max_shared_clients_per_gpu = 2
8489
},
8590
]
8691

modules/beta-private-cluster/cluster.tf

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -735,6 +735,14 @@ resource "google_container_node_pool" "pools" {
735735
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
736736
}
737737
}
738+
739+
dynamic "gpu_sharing_config" {
740+
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
741+
content {
742+
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
743+
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
744+
}
745+
}
738746
}
739747
}
740748

@@ -998,6 +1006,14 @@ resource "google_container_node_pool" "windows_pools" {
9981006
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
9991007
}
10001008
}
1009+
1010+
dynamic "gpu_sharing_config" {
1011+
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
1012+
content {
1013+
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
1014+
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
1015+
}
1016+
}
10011017
}
10021018
}
10031019

modules/beta-public-cluster-update-variant/README.md

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -78,25 +78,30 @@ module "gke" {
7878
7979
node_pools = [
8080
{
81-
name = "default-node-pool"
82-
machine_type = "e2-medium"
83-
node_locations = "us-central1-b,us-central1-c"
84-
min_count = 1
85-
max_count = 100
86-
local_ssd_count = 0
87-
spot = false
88-
local_ssd_ephemeral_count = 0
89-
disk_size_gb = 100
90-
disk_type = "pd-standard"
91-
image_type = "COS_CONTAINERD"
92-
enable_gcfs = false
93-
enable_gvnic = false
94-
logging_variant = "DEFAULT"
95-
auto_repair = true
96-
auto_upgrade = true
97-
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
98-
preemptible = false
99-
initial_node_count = 80
81+
name = "default-node-pool"
82+
machine_type = "e2-medium"
83+
node_locations = "us-central1-b,us-central1-c"
84+
min_count = 1
85+
max_count = 100
86+
local_ssd_count = 0
87+
spot = false
88+
local_ssd_ephemeral_count = 0
89+
disk_size_gb = 100
90+
disk_type = "pd-standard"
91+
image_type = "COS_CONTAINERD"
92+
enable_gcfs = false
93+
enable_gvnic = false
94+
logging_variant = "DEFAULT"
95+
auto_repair = true
96+
auto_upgrade = true
97+
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
98+
preemptible = false
99+
initial_node_count = 80
100+
accelerator_count = 1
101+
accelerator_type = "nvidia-l4"
102+
gpu_driver_version = "LATEST"
103+
gpu_sharing_strategy = "TIME_SHARING"
104+
max_shared_clients_per_gpu = 2
100105
},
101106
]
102107

0 commit comments

Comments
 (0)