
Commit 5004513

Merge branch 'develop' into wip/pcluster3

Signed-off-by: Enrico Usai <[email protected]>

# Conflicts:
#	CHANGELOG.md
#	amis/build_ami.sh
#	amis/packer_alinux2.json
#	amis/packer_centos7.json
#	amis/packer_centos8.json
#	amis/packer_ubuntu1804.json
#	amis/packer_ubuntu2004.json
#	amis/packer_variables.json
#	attributes/conditions.rb
#	attributes/default.rb
#	files/default/configure-pat.sh
#	libraries/helpers.rb
#	metadata.rb
#	recipes/cluster_admin_user_install.rb
#	recipes/fsx_mount.rb
#	recipes/sge_install.rb
#	recipes/slurm_install.rb
#	recipes/tests.rb
#	templates/default/compute_ready.erb
#	templates/default/slurm/slurm.conf.erb

2 parents: 29cf9db + 22c7e21

File tree: 7 files changed, +115 / -46 lines

.github/workflows/codeql-analysis.yml

Lines changed: 30 additions & 0 deletions
```diff
@@ -0,0 +1,30 @@
+name: "CodeQL"
+
+on:
+  push:
+  pull_request:
+  schedule:
+    - cron: '0 10 * * 2'
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ 'python' ]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v1
+        with:
+          languages: ${{ matrix.language }}
+          queries: +security-and-quality
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v1
```

CHANGELOG.md

Lines changed: 18 additions & 4 deletions
```diff
@@ -12,19 +12,33 @@ This file is used to list changes made in each version of the aws-parallelcluste
 - Use tags prefix `parallelcluster:`.
 - Run Slurm command `scontrol` with sudo because clustermgtd is run as cluster admin user (not root).
 - Implement `computemgtd` self-termination via `shutdown` command instead of calling TerminateInstances.
+- Implement scaling protection mechanism with Slurm scheduler: compute fleet is automatically set to 'PROTECTED' state
+  in case recurrent failures are encountered when provisioning nodes.
 
-2.x.x
+2.11.0
 -----
 
 **ENHANCEMENTS**
-- SGE: make `qstat` command in nodewatcher more robust in case a custom DHCP option set is configured.
+- SGE: always use shortname as hostname filter with `qstat`. This will make nodewatcher more robust when using custom DHCP option, where the full hostname seen by `SGE` might differ from the hostname returned from EC2 metadata(local-hostname).
 - Transition from IMDSv1 to IMDSv2.
-- Implement scaling protection mechanism with Slurm scheduler: compute fleet is automatically set to 'PROTECTED' state
-  in case recurrent failures are encountered when provisioning nodes.
+- Have `computemgtd` reuse last available daemon configuration when the new one cannot be loaded.
+- Use methods with timeouts to read NFS shared files, which will prevent `computemgtd` from hanging when NFS filesystems are not available.
 
 **BUG FIXES**
 - Fix a bug that caused `clustermgtd` to not immediately replace instances with failed status check that are in replacement process.
 
+2.10.4
+-----
+
+**CHANGES**
+- There were no notable changes for this version.
+
+2.10.3
+-----
+
+**CHANGES**
+- There were no notable changes for this version.
+
 2.10.2
 -----
 
```

src/slurm_plugin/common.py

Lines changed: 22 additions & 10 deletions
```diff
@@ -15,13 +15,14 @@
 import logging
 from datetime import datetime
 
-from common.utils import time_is_up
+from common.utils import check_command_output, time_is_up
 
 logger = logging.getLogger(__name__)
 
 # timestamp used by clustermgtd and computemgtd should be in default ISO format
 # YYYY-MM-DDTHH:MM:SS.ffffff+HH:MM[:SS[.ffffff]]
 TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f%z"
+DEFAULT_COMMAND_TIMEOUT = 30
 
 
 def log_exception(
@@ -72,16 +73,27 @@ def retrieve_instance_type_mapping(file_path):
         raise
 
 
-def _get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path):
+def get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path):
     """Get clustermgtd's last heartbeat."""
-    with open(clustermgtd_heartbeat_file_path, "r") as timestamp_file:
-        # Note: heartbeat must be written with datetime.strftime to convert localized datetime into str
-        # datetime.strptime will not work with str(datetime)
-        # Example timestamp written to heartbeat file: 2020-07-30 19:34:02.613338+00:00
-        return datetime.strptime(timestamp_file.read().strip(), TIMESTAMP_FORMAT)
+    # Use subprocess based method to read shared file to prevent hanging when NFS is down
+    # Do not copy to local. Different users need to access the file, but file should be writable by root only
+    # Only use last line of output to avoid taking unexpected output in stdout
+    heartbeat = (
+        check_command_output(
+            f"cat {clustermgtd_heartbeat_file_path}",
+            timeout=DEFAULT_COMMAND_TIMEOUT,
+            shell=True,  # nosec
+        )
+        .splitlines()[-1]
+        .strip()
+    )
+    # Note: heartbeat must be written with datetime.strftime to convert localized datetime into str
+    # datetime.strptime will not work with str(datetime)
+    # Example timestamp written to heartbeat file: 2020-07-30 19:34:02.613338+00:00
+    return datetime.strptime(heartbeat, TIMESTAMP_FORMAT)
 
 
-def _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout):
+def expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout):
     """Test if clustermgtd heartbeat is expired."""
     if time_is_up(last_heartbeat, current_time, clustermgtd_timeout):
         logger.error(
@@ -96,9 +108,9 @@ def _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout):
 
 def is_clustermgtd_heartbeat_valid(current_time, clustermgtd_timeout, clustermgtd_heartbeat_file_path):
     try:
-        last_heartbeat = _get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path)
+        last_heartbeat = get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path)
         logger.info("Latest heartbeat from clustermgtd: %s", last_heartbeat)
-        return not _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout)
+        return not expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout)
     except Exception as e:
         logger.error("Unable to retrieve clustermgtd heartbeat with exception: %s", e)
         return False
```
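Why go through a subprocess to read a file at all? A plain `open()`/`read()` on a hard NFS mount can block inside the kernel indefinitely when the filesystem is unavailable, and no Python-level timeout can interrupt it; a child process doing the read can instead be abandoned after a timeout. Below is a minimal sketch of the idea. Note that `check_command_output` lives in `common.utils` and its implementation is not part of this diff, so `read_shared_file` is a hypothetical stand-in, not the real helper:

```python
import subprocess
from datetime import datetime, timezone

TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f%z"
DEFAULT_COMMAND_TIMEOUT = 30


def read_shared_file(path, timeout=DEFAULT_COMMAND_TIMEOUT):
    """Hypothetical stand-in for check_command_output: read a file via a child process.

    If the NFS mount hangs, subprocess.run raises TimeoutExpired after `timeout`
    seconds instead of blocking the calling daemon forever.
    """
    result = subprocess.run(
        ["cat", path],  # argument-list form; the real helper uses shell=True with a nosec marker
        capture_output=True,
        text=True,
        timeout=timeout,
        check=True,
    )
    return result.stdout


def parse_heartbeat(stdout):
    """Keep only the last stdout line before parsing, as get_clustermgtd_heartbeat does."""
    return datetime.strptime(stdout.splitlines()[-1].strip(), TIMESTAMP_FORMAT)


# Round trip: heartbeats are written with strftime so strptime can parse them back.
stamp = datetime.now(tz=timezone.utc).strftime(TIMESTAMP_FORMAT)
print(parse_heartbeat(f"unexpected banner output\n{stamp}"))
```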

src/slurm_plugin/computemgtd.py

Lines changed: 34 additions & 14 deletions
```diff
@@ -23,7 +23,12 @@
 from common.time_utils import seconds
 from common.utils import get_metadata, run_command, sleep_remaining_loop_time
 from retrying import retry
-from slurm_plugin.common import is_clustermgtd_heartbeat_valid, log_exception
+from slurm_plugin.common import (
+    DEFAULT_COMMAND_TIMEOUT,
+    expired_clustermgtd_heartbeat,
+    get_clustermgtd_heartbeat,
+    log_exception,
+)
 from slurm_plugin.slurm_resources import CONFIG_FILE_DIR
 
 LOOP_TIME = 60
@@ -54,14 +59,20 @@ def __repr__(self):
         attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()])
         return "{class_name}({attrs})".format(class_name=self.__class__.__name__, attrs=attrs)
 
-    @log_exception(log, "reading computemgtd config", catch_exception=IOError, raise_on_error=True)
+    @log_exception(log, "reading computemgtd config", catch_exception=Exception, raise_on_error=True)
     def _get_config(self, config_file_path):
         """Get computemgtd configuration."""
         log.info("Reading %s", config_file_path)
         config = ConfigParser()
         try:
-            config.read_file(open(config_file_path, "r"))
-        except IOError:
+            # Use subprocess based method to copy shared file to local to prevent hanging when NFS is down
+            run_command(
+                f"cat {config_file_path} > {CONFIG_FILE_DIR}/.computemgtd_config.local",
+                timeout=DEFAULT_COMMAND_TIMEOUT,
+                shell=True,  # nosec
+            )
+            config.read_file(open(f"{CONFIG_FILE_DIR}/.computemgtd_config.local", "r"))
+        except Exception:
             log.error(f"Cannot read computemgtd configuration file: {config_file_path}")
             raise
 
@@ -100,11 +111,10 @@ def _get_config(self, config_file_path):
     def _read_nodename_from_file(nodename_file_path):
         """Read self nodename from a file."""
         try:
-            log.info("Reading self nodename from %s", nodename_file_path)
             with open(nodename_file_path, "r") as nodename_file:
                 nodename = nodename_file.read()
                 return nodename
-        except IOError as e:
+        except Exception as e:
             log.error("Unable to read self nodename from %s with exception: %s\n", nodename_file_path, e)
             raise
 
@@ -169,22 +179,32 @@ def _run_computemgtd():
     # Initial default heartbeat time as computemgtd startup time
     last_heartbeat = datetime.now(tz=timezone.utc)
     log.info("Initializing clustermgtd heartbeat to be computemgtd startup time: %s", last_heartbeat)
-    computemgtd_config = None
-    reload_config_counter = 0
+    computemgtd_config = _load_daemon_config()
+    reload_config_counter = RELOAD_CONFIG_ITERATIONS
     while True:
         # Get current time
         current_time = datetime.now(tz=timezone.utc)
 
-        if not computemgtd_config or reload_config_counter <= 0:
-            computemgtd_config = _load_daemon_config()
-            reload_config_counter = RELOAD_CONFIG_ITERATIONS
+        if reload_config_counter <= 0:
+            try:
+                computemgtd_config = _load_daemon_config()
+                reload_config_counter = RELOAD_CONFIG_ITERATIONS
+            except Exception as e:
+                log.warning("Unable to reload daemon config, using previous one.\nException: %s", e)
         else:
             reload_config_counter -= 1
 
         # Check heartbeat
-        if not is_clustermgtd_heartbeat_valid(
-            current_time, computemgtd_config.clustermgtd_timeout, computemgtd_config.clustermgtd_heartbeat_file_path
-        ):
+        try:
+            last_heartbeat = get_clustermgtd_heartbeat(computemgtd_config.clustermgtd_heartbeat_file_path)
+            log.info("Latest heartbeat from clustermgtd: %s", last_heartbeat)
+        except Exception as e:
+            log.warning(
+                "Unable to retrieve clustermgtd heartbeat. Using last known heartbeat: %s with exception: %s",
+                last_heartbeat,
+                e,
+            )
+        if expired_clustermgtd_heartbeat(last_heartbeat, current_time, computemgtd_config.clustermgtd_timeout):
             if computemgtd_config.disable_computemgtd_actions:
                 log.info("All computemgtd actions currently disabled")
             elif _is_self_node_down(computemgtd_config.nodename):
```
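Two behavior changes in the `_run_computemgtd` hunk are worth spelling out: the config is now loaded once before the loop, so a broken config still fails fast at startup, and a failed periodic reload logs a warning and keeps the previous config instead of crashing the daemon. Below is a minimal standalone sketch of this keep-last-known-good pattern, using hypothetical names (`load_config`, `do_work`, `RELOAD_EVERY`) rather than the daemon's own:

```python
import logging

log = logging.getLogger(__name__)
RELOAD_EVERY = 10  # hypothetical stand-in for RELOAD_CONFIG_ITERATIONS


def run_loop(load_config, do_work):
    config = load_config()  # fail fast at startup: there is no previous config to fall back on
    reload_counter = RELOAD_EVERY
    while True:
        if reload_counter <= 0:
            try:
                config = load_config()
                reload_counter = RELOAD_EVERY  # reset only on success...
            except Exception as e:
                # ...so a failed reload is retried on the very next iteration
                log.warning("Unable to reload config, using previous one: %s", e)
        else:
            reload_counter -= 1
        do_work(config)
```

Because the counter is reset only inside the `try`, a failed reload is retried on the next iteration rather than after another full reload period; the diff above behaves the same way.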

tests/slurm_plugin/test_clustermgtd.py

Lines changed: 2 additions & 13 deletions
```diff
@@ -1183,12 +1183,6 @@ def test_manage_cluster(
                 "PrivateDnsName": "hostname",
                 "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc),
             },
-            {
-                "InstanceId": "i-6",
-                "PrivateIpAddress": "ip-6",
-                "PrivateDnsName": "hostname",
-                "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc),
-            },
             # Return an orphaned instance
             {
                 "InstanceId": "i-999",
@@ -1251,21 +1245,16 @@
             },
             generate_error=False,
         ),
-        # _maintain_nodes: _handle_powering_down_nodes
-        MockedBoto3Request(
-            method="terminate_instances",
-            response={},
-            expected_params={"InstanceIds": ["i-6"]},
-            generate_error=False,
-        ),
         # _maintain_nodes/delete_instances: terminate dynamic down nodes
+        # dynamic down nodes are handled with suspend script, and its boto3 call should not be reflected here
        MockedBoto3Request(
            method="terminate_instances",
            response={},
            expected_params={"InstanceIds": ["i-2"]},
            generate_error=False,
        ),
        # _maintain_nodes/delete_instances: terminate static down nodes
+        # dynamic down nodes are handled with suspend script, and its boto3 call should not be reflected here
        MockedBoto3Request(
            method="terminate_instances",
            response={},
```

tests/slurm_plugin/test_common.py

Lines changed: 6 additions & 4 deletions
```diff
@@ -11,12 +11,11 @@
 
 
 from datetime import datetime, timedelta, timezone
-from unittest.mock import mock_open
 
 import pytest
 from assertpy import assert_that
 from common.utils import time_is_up
-from slurm_plugin.common import TIMESTAMP_FORMAT, _get_clustermgtd_heartbeat
+from slurm_plugin.common import TIMESTAMP_FORMAT, get_clustermgtd_heartbeat
 
 
 @pytest.mark.parametrize(
@@ -72,5 +71,8 @@ def test_time_is_up(initial_time, current_time, grace_time, expected_result):
     ],
 )
 def test_get_clustermgtd_heartbeat(time, expected_parsed_time, mocker):
-    mocker.patch("slurm_plugin.common.open", mock_open(read_data=time.strftime(TIMESTAMP_FORMAT)))
-    assert_that(_get_clustermgtd_heartbeat("some file path")).is_equal_to(expected_parsed_time)
+    mocker.patch(
+        "slurm_plugin.common.check_command_output",
+        return_value=f"some_random_stdout\n{time.strftime(TIMESTAMP_FORMAT)}",
+    )
+    assert_that(get_clustermgtd_heartbeat("some file path")).is_equal_to(expected_parsed_time)
```
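The mocked `check_command_output` return value deliberately prepends a noise line to the timestamp, exercising the `.splitlines()[-1]` guard in `get_clustermgtd_heartbeat`. A quick illustration of what that guard does (note `%z` accepts the colon-separated UTC offset on Python 3.7+):

```python
from datetime import datetime

TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f%z"

# stdout as produced by the mock above: a noise line, then the heartbeat line
stdout = "some_random_stdout\n2020-07-30 19:34:02.613338+00:00"
heartbeat = datetime.strptime(stdout.splitlines()[-1].strip(), TIMESTAMP_FORMAT)
print(heartbeat)  # 2020-07-30 19:34:02.613338+00:00
```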

tests/slurm_plugin/test_computemgtd.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -62,7 +62,9 @@
 )
 def test_computemgtd_config(config_file, expected_attributes, test_datadir, mocker):
     mocker.patch("slurm_plugin.computemgtd.ComputemgtdConfig._read_nodename_from_file", return_value="some_nodename")
-    compute_config = ComputemgtdConfig(test_datadir / config_file)
+    mocker.patch("slurm_plugin.computemgtd.run_command")
+    mocker.patch("slurm_plugin.computemgtd.open", return_value=open(test_datadir / config_file, "r"))
+    compute_config = ComputemgtdConfig("mocked_config_path")
     for key in expected_attributes:
         assert_that(compute_config.__dict__.get(key)).is_equal_to(expected_attributes.get(key))
 
```
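A note on the `mocker.patch("slurm_plugin.computemgtd.open", ...)` line above: since Python 3.5, `mock.patch` can patch a builtin referenced inside a module without `create=True`, shadowing it for that module only; here it redirects the daemon's read of the local config copy back to the test fixture. A tiny self-contained sketch of the same trick, patching `open` in `__main__` (all names here are illustrative):

```python
from unittest import mock


def read_config(path):
    # looks up the global name `open`, which mock.patch can shadow per module
    with open(path, "r") as f:
        return f.read()


with mock.patch("__main__.open", mock.mock_open(read_data="fixture contents")):
    print(read_config("ignored_path"))  # -> fixture contents
```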
