Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit a647004

Browse files
authored Feb 21, 2024
Merge branch 'develop' into wip/mgiacomo/390/disable-unused-services
2 parents 2e6b867 + 48144e2 commit a647004

File tree

9 files changed

+30
-8
lines changed

9 files changed

+30
-8
lines changed
 

‎CHANGELOG.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ This file is used to list changes made in each version of the AWS ParallelCluste
3535
**BUG FIXES**
3636
- Fix issue making job fail when submitted as active directory user from login nodes.
3737
The issue was caused by an incomplete configuration of the integration with the external Active Directory on the head node.
38-
This fix comes with a breaking change: now cluster creation/update would fail if the integration with the Active Directory does not work.
3938

4039
3.8.0
4140
------

‎cookbooks/aws-parallelcluster-entrypoints/recipes/finalize.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
end
2222

2323
include_recipe "aws-parallelcluster-platform::finalize"
24-
include_recipe "aws-parallelcluster-environment::finalize"
2524

2625
include_recipe 'aws-parallelcluster-slurm::finalize' if node['cluster']['scheduler'] == 'slurm'
26+
27+
include_recipe "aws-parallelcluster-environment::finalize"

‎cookbooks/aws-parallelcluster-environment/recipes/finalize/finalize_directory_service.rb

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,13 @@
2222
read_only_user = domain_service_read_only_user_name(node['cluster']['directory_service']['domain_read_only_user'])
2323

2424
execute 'Fetch user data from remote directory service' do
25-
# The switch-user (sudo -u) is necessary to trigger the fetching of AD data
25+
# The switch-user (sudo -u) is necessary to trigger the fetching of AD data.
26+
# Failures are ignored because we experimentally verified that a MsAD backend
27+
# may take a long time to become available.
28+
# So, we prefer to execute this step in best-effort mode.
29+
# Once failures are reintroduced, we should consider 30 retries with a 10-second delay.
2630
command "sudo -u #{default_user} getent passwd #{read_only_user}"
2731
user 'root'
28-
retries 10 # Retries are just a safe guard in case the node is still fetching data from the AD
29-
retry_delay 3
32+
ignore_failure true
3033
end
3134
end

‎cookbooks/aws-parallelcluster-environment/spec/unit/recipes/finalize_directory_service_spec.rb

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,7 @@
3838
is_expected.to run_execute('Fetch user data from remote directory service').with(
3939
command: "sudo -u #{cluster_user} getent passwd #{domain_read_only_user}",
4040
user: 'root',
41-
retries: 10,
42-
retry_delay: 3
41+
ignore_failure: true
4342
)
4443
end
4544
else

‎cookbooks/aws-parallelcluster-environment/templates/cfn_bootstrap/cfn-hup-runner.sh.erb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@ set -ex
1010

1111
while true; do
1212
<%= @cfn_bootstrap_virtualenv_path %>/bin/cfn-hup --no-daemon --verbose
13-
sleep 120
13+
sleep 60
1414
done

‎cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
return unless nvidia_driver_enabled?
2424
return if on_docker?
2525

26+
# Share nvidia driver version with InSpec tests
27+
node.default['cluster']['nvidia']['driver_version'] = _nvidia_driver_version
28+
node_attributes "Save Nvidia driver version for Inspec tests"
29+
2630
remote_file tmp_nvidia_run do
2731
source nvidia_driver_url
2832
mode '0755'

‎cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ def self.configure(chef_run)
212212
'centos7' => 'el7',
213213
'rhel8' => 'el8',
214214
'rocky8' => 'el8',
215+
'rhel9' => 'el9',
216+
'rocky9' => 'el9',
215217
'ubuntu20.04' => 'Ubuntu20_04',
216218
'ubuntu22.04' => 'Ubuntu22_04',
217219
}

‎cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,10 @@ def self.setup(chef_run, nvidia_driver_version: nil)
187187
end
188188
cached(:node) { chef_run.node }
189189

190+
it 'dumps nodes attribues' do
191+
is_expected.to write_node_attributes('Save Nvidia driver version for Inspec tests')
192+
end
193+
190194
it 'sets up nvidia_driver' do
191195
is_expected.to setup_nvidia_driver('setup')
192196
end

‎util/upload-cookbook.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ _help() {
2424
--profile <aws-profile> AWS profile name to use for the upload
2525
(optional, default is AWS_PROFILE env variable or "default")
2626
--region <aws-region> Region to use for AWSCli commands (optional, default is "us-east-1")
27+
--scope <string> Disambiguation string used in the S3 path to avoid collisions (default is empty)
2728
-h, --help Print this help message
2829
EOF
2930
}
@@ -40,6 +41,8 @@ main() {
4041
--profile=*) _profile="${1#*=}";;
4142
--region) _region="$2"; shift;;
4243
--region=*) _region="${1#*=}";;
44+
--scope) _scope="$2"; shift;;
45+
--scope=*) _scope="${1#*=}";;
4346
-h|--help|help) _help; exit 0;;
4447
*) _help; echo "[error] Unrecognized option '$1'"; exit 1;;
4548
esac
@@ -66,6 +69,10 @@ main() {
6669
_info "--region parameter not specified, using 'us-east-1'"
6770
_region="us-east-1"
6871
fi
72+
if [ -z "${_scope}" ]; then
73+
_info "--scope parameter not specified, no scope will be used"
74+
_scope=""
75+
fi
6976

7077
# check bucket or create it
7178
aws ${_profile} s3api head-bucket --bucket "${_bucket}" --region "${_region}"
@@ -94,6 +101,9 @@ main() {
94101

95102
# upload packages
96103
_key_path="parallelcluster/${_version}/cookbooks"
104+
if [ -n "${_scope}" ]; then
105+
_key_path="${_key_path}/${_scope}"
106+
fi
97107
aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-cookbook-${_version}.tgz s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}.tgz || _error_exit 'Failed to push cookbook to S3'
98108
aws ${_profile} --region "${_region}" s3 cp aws-parallelcluster-cookbook-${_version}.md5 s3://${_bucket}/${_key_path}/aws-parallelcluster-cookbook-${_version}.md5 || _error_exit 'Failed to push cookbook md5 to S3'
99109
aws ${_profile} --region "${_region}" s3api head-object --bucket ${_bucket} --key ${_key_path}/aws-parallelcluster-cookbook-${_version}.tgz --output text --query LastModified > aws-parallelcluster-cookbook-${_version}.tgz.date || _error_exit 'Failed to fetch LastModified date'

0 commit comments

Comments
 (0)
Failed to load comments.