Skip to content

feat: add support for gpu_sharing_config on nodepool #1874

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add node pool re-creation at value change
  • Loading branch information
jimgus committed May 23, 2024
commit c760869597d22e738564c85361955600d0831a9c
41 changes: 23 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,24 +54,29 @@ module "gke" {

node_pools = [
{
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
accelerator_count = 1
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
gpu_sharing_strategy = "TIME_SHARING"
max_shared_clients_per_gpu = 2
},
]

Expand Down
2 changes: 2 additions & 0 deletions autogen/main/cluster.tf.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,8 @@ locals {
"accelerator_type",
"gpu_partition_size",
"gpu_driver_version",
"gpu_sharing_strategy",
"max_shared_clients_per_gpu",
"enable_secure_boot",
"enable_integrity_monitoring",
"local_ssd_count",
Expand Down
43 changes: 24 additions & 19 deletions modules/beta-private-cluster-update-variant/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,25 +84,30 @@ module "gke" {

node_pools = [
{
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
accelerator_count = 1
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
gpu_sharing_strategy = "TIME_SHARING"
max_shared_clients_per_gpu = 2
},
]

Expand Down
2 changes: 2 additions & 0 deletions modules/beta-private-cluster-update-variant/cluster.tf
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,8 @@ locals {
"accelerator_type",
"gpu_partition_size",
"gpu_driver_version",
"gpu_sharing_strategy",
"max_shared_clients_per_gpu",
"enable_secure_boot",
"enable_integrity_monitoring",
"local_ssd_count",
Expand Down
43 changes: 24 additions & 19 deletions modules/beta-private-cluster/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,25 +62,30 @@ module "gke" {

node_pools = [
{
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
accelerator_count = 1
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
gpu_sharing_strategy = "TIME_SHARING"
max_shared_clients_per_gpu = 2
},
]

Expand Down
43 changes: 24 additions & 19 deletions modules/beta-public-cluster-update-variant/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,25 +78,30 @@ module "gke" {

node_pools = [
{
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
accelerator_count = 1
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
gpu_sharing_strategy = "TIME_SHARING"
max_shared_clients_per_gpu = 2
},
]

Expand Down
2 changes: 2 additions & 0 deletions modules/beta-public-cluster-update-variant/cluster.tf
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,8 @@ locals {
"accelerator_type",
"gpu_partition_size",
"gpu_driver_version",
"gpu_sharing_strategy",
"max_shared_clients_per_gpu",
"enable_secure_boot",
"enable_integrity_monitoring",
"local_ssd_count",
Expand Down
43 changes: 24 additions & 19 deletions modules/beta-public-cluster/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,25 +56,30 @@ module "gke" {

node_pools = [
{
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
accelerator_count = 1
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
gpu_sharing_strategy = "TIME_SHARING"
max_shared_clients_per_gpu = 2
},
]

Expand Down
41 changes: 23 additions & 18 deletions modules/private-cluster-update-variant/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,24 +82,29 @@ module "gke" {

node_pools = [
{
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
accelerator_count = 1
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
gpu_sharing_strategy = "TIME_SHARING"
max_shared_clients_per_gpu = 2
},
]

Expand Down
2 changes: 2 additions & 0 deletions modules/private-cluster-update-variant/cluster.tf
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,8 @@ locals {
"accelerator_type",
"gpu_partition_size",
"gpu_driver_version",
"gpu_sharing_strategy",
"max_shared_clients_per_gpu",
"enable_secure_boot",
"enable_integrity_monitoring",
"local_ssd_count",
Expand Down
41 changes: 23 additions & 18 deletions modules/private-cluster/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,24 +60,29 @@ module "gke" {

node_pools = [
{
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
accelerator_count = 1
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
gpu_sharing_strategy = "TIME_SHARING"
max_shared_clients_per_gpu = 2
},
]

Expand Down