
Commit e779eab: Update e2e pipeline (zalando#1202)

Authored by sdudoladov (Sergey Dudoladov) and a co-author

* clean up after test_multi_namespace test
* see the PR description for complete list of changes

Co-authored-by: Sergey Dudoladov <[email protected]>

1 parent: b379db2

File tree: 4 files changed (+70, -42 lines)

- e2e/README.md
- e2e/tests/k8s_api.py
- e2e/tests/test_e2e.py
- pkg/cluster/cluster.go

e2e/README.md

Lines changed: 28 additions & 2 deletions
@@ -56,12 +56,24 @@ NOCLEANUP=True ./run.sh main tests.test_e2e.EndToEndTestCase.test_lazy_spilo_upg
 
 ## Inspecting Kind
 
-If you want to inspect Kind/Kubernetes cluster, use the following script to exec into the K8s setup and then use `kubectl`
+If you want to inspect the Kind/Kubernetes cluster, switch the `kubeconfig` file and context
+```bash
+# save the old config in case you have it
+export KUBECONFIG_SAVED=$KUBECONFIG
+
+# use the one created by e2e tests
+export KUBECONFIG=/tmp/kind-config-postgres-operator-e2e-tests
+
+# this kubeconfig defines a single context
+kubectl config use-context kind-postgres-operator-e2e-tests
+```
+
+or use the following script to exec into the K8s setup and then use `kubectl`
 
 ```bash
 ./exec_into_env.sh
 
-# use kube ctl
+# use kubectl
 kubectl get pods
 
 # watch relevant objects
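For reference, the same kubeconfig and context can be used from Python through the `kubernetes` client that `k8s_api.py` already imports. A minimal sketch, assuming the path and context name shown in the snippet above (the actual test harness may simply rely on `$KUBECONFIG`):

```python
from kubernetes import client, config

# Illustrative only: point the client at the kubeconfig written by the e2e run.
config.load_kube_config(
    config_file="/tmp/kind-config-postgres-operator-e2e-tests",
    context="kind-postgres-operator-e2e-tests",
)

core_v1 = client.CoreV1Api()
for pod in core_v1.list_namespaced_pod("default", label_selector="name=postgres-operator").items:
    print(pod.metadata.name, pod.status.phase)
```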
@@ -71,6 +83,14 @@ kubectl get pods
 ./scripts/get_logs.sh
 ```
 
+If you want to inspect the state of the `kind` cluster manually with a single command, add a `context` flag
+```bash
+kubectl get pods --context kind-kind
+```
+or set the context for a few commands at once
+
+
+
 ## Cleaning up Kind
 
 To cleanup kind and start fresh
@@ -79,6 +99,12 @@ To cleanup kind and start fresh
 e2e/run.sh cleanup
 ```
 
+That also helps in case you see the
+```
+ERROR: no nodes found for cluster "postgres-operator-e2e-tests"
+```
+error that happens when the `kind` cluster was deleted manually but its configuration file was not.
+
 ## Covered use cases
 
 The current tests are all bundled in [`test_e2e.py`](tests/test_e2e.py):

e2e/tests/k8s_api.py

Lines changed: 12 additions & 12 deletions
@@ -11,9 +11,11 @@
 from kubernetes import client, config
 from kubernetes.client.rest import ApiException
 
+
 def to_selector(labels):
     return ",".join(["=".join(l) for l in labels.items()])
 
+
 class K8sApi:
 
     def __init__(self):
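As a quick usage note (not part of the diff), `to_selector` flattens a label dict into the comma-separated selector string the list calls below expect:

```python
# dict insertion order is preserved in Python 3.7+, so the selector is deterministic
to_selector({"application": "spilo", "cluster-name": "acid-minimal-cluster"})
# -> 'application=spilo,cluster-name=acid-minimal-cluster'
```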
@@ -181,10 +183,10 @@ def count_deployments_with_label(self, labels, namespace='default'):
     def count_pdbs_with_label(self, labels, namespace='default'):
         return len(self.api.policy_v1_beta1.list_namespaced_pod_disruption_budget(
             namespace, label_selector=labels).items)
-
+
     def count_running_pods(self, labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
         pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items
-        return len(list(filter(lambda x: x.status.phase=='Running', pods)))
+        return len(list(filter(lambda x: x.status.phase == 'Running', pods)))
 
     def wait_for_pod_failover(self, failover_targets, labels, namespace='default'):
         pod_phase = 'Failing over'
@@ -210,9 +212,9 @@ def wait_for_logical_backup_job_deletion(self):
     def wait_for_logical_backup_job_creation(self):
         self.wait_for_logical_backup_job(expected_num_of_jobs=1)
 
-    def delete_operator_pod(self, step="Delete operator deplyment"):
-        operator_pod = self.api.core_v1.list_namespaced_pod('default', label_selector="name=postgres-operator").items[0].metadata.name
-        self.api.apps_v1.patch_namespaced_deployment("postgres-operator","default", {"spec":{"template":{"metadata":{"annotations":{"step":"{}-{}".format(step, time.time())}}}}})
+    def delete_operator_pod(self, step="Delete operator pod"):
+        # patching the pod template in the deployment restarts the operator pod
+        self.api.apps_v1.patch_namespaced_deployment("postgres-operator","default", {"spec":{"template":{"metadata":{"annotations":{"step":"{}-{}".format(step, datetime.fromtimestamp(time.time()))}}}}})
         self.wait_for_operator_pod_start()
 
     def update_config(self, config_map_patch, step="Updating operator deployment"):
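The rewritten `delete_operator_pod` no longer looks up the pod by name: bumping an annotation on the Deployment's pod template is enough to make the Deployment controller roll out a fresh operator pod. A self-contained sketch of the same pattern, assuming a kubeconfig-initialized client (names mirror the method above; this is not the commit's code):

```python
import time
from datetime import datetime
from kubernetes import client, config

config.load_kube_config()  # inside the cluster, config.load_incluster_config() would be used instead
apps_v1 = client.AppsV1Api()

def restart_operator_deployment(step="Delete operator pod"):
    # Changing a pod-template annotation changes the template itself,
    # so the Deployment performs a rolling update and replaces the operator pod.
    body = {"spec": {"template": {"metadata": {"annotations": {
        "step": "{}-{}".format(step, datetime.fromtimestamp(time.time()))
    }}}}}
    apps_v1.patch_namespaced_deployment("postgres-operator", "default", body)

restart_operator_deployment()
```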
@@ -241,7 +243,7 @@ def get_patroni_state(self, pod):
 
     def get_operator_state(self):
        pod = self.get_operator_pod()
-        if pod == None:
+        if pod is None:
             return None
         pod = pod.metadata.name
 
@@ -251,7 +253,6 @@ def get_operator_state(self):
 
         return json.loads(r.stdout.decode())
 
-
     def get_patroni_running_members(self, pod="acid-minimal-cluster-0"):
         result = self.get_patroni_state(pod)
         return list(filter(lambda x: "State" in x and x["State"] == "running", result))
@@ -260,9 +261,9 @@ def get_deployment_replica_count(self, name="acid-minimal-cluster-pooler", names
         try:
             deployment = self.api.apps_v1.read_namespaced_deployment(name, namespace)
             return deployment.spec.replicas
-        except ApiException as e:
+        except ApiException:
             return None
-
+
     def get_statefulset_image(self, label_selector="application=spilo,cluster-name=acid-minimal-cluster", namespace='default'):
         ssets = self.api.apps_v1.list_namespaced_stateful_set(namespace, label_selector=label_selector, limit=1)
         if len(ssets.items) == 0:
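`get_deployment_replica_count` now swallows every `ApiException` and returns `None`. If a missing deployment ever needs to be distinguished from other API failures, the status code on the exception can be inspected; a hedged variant (illustration only, not what the helper above does):

```python
from kubernetes.client.rest import ApiException

def get_replicas_or_none(apps_v1, name, namespace="default"):
    try:
        return apps_v1.read_namespaced_deployment(name, namespace).spec.replicas
    except ApiException as e:
        if e.status == 404:  # deployment does not exist (yet)
            return None
        raise                # surface unexpected API errors
```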
@@ -463,7 +464,6 @@ def wait_for_logical_backup_job_creation(self):
         self.wait_for_logical_backup_job(expected_num_of_jobs=1)
 
     def delete_operator_pod(self, step="Delete operator deplyment"):
-        operator_pod = self.api.core_v1.list_namespaced_pod('default', label_selector="name=postgres-operator").items[0].metadata.name
         self.api.apps_v1.patch_namespaced_deployment("postgres-operator","default", {"spec":{"template":{"metadata":{"annotations":{"step":"{}-{}".format(step, time.time())}}}}})
         self.wait_for_operator_pod_start()
 
@@ -521,7 +521,7 @@ def __init__(self, labels="name=postgres-operator", namespace="default"):
 class K8sPostgres(K8sBase):
     def __init__(self, labels="cluster-name=acid-minimal-cluster", namespace="default"):
         super().__init__(labels, namespace)
-
+
     def get_pg_nodes(self):
         master_pod_node = ''
         replica_pod_nodes = []
@@ -532,4 +532,4 @@ def get_pg_nodes(self):
             elif pod.metadata.labels.get('spilo-role') == 'replica':
                 replica_pod_nodes.append(pod.spec.node_name)
 
-        return master_pod_node, replica_pod_nodes
+        return master_pod_node, replica_pod_nodes
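For context, `get_pg_nodes` returns the node hosting the master pod together with the nodes of the replicas. A hypothetical usage (the import assumes `K8sPostgres` is importable the same way `K8s` is in the tests):

```python
from tests.k8s_api import K8sPostgres  # hypothetical import, mirrors `from tests.k8s_api import K8s`

k8s_pg = K8sPostgres()  # defaults: labels="cluster-name=acid-minimal-cluster", namespace="default"
master_node, replica_nodes = k8s_pg.get_pg_nodes()
print("master on:", master_node, "replicas on:", replica_nodes)
```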

e2e/tests/test_e2e.py

Lines changed: 28 additions & 26 deletions
@@ -2,15 +2,14 @@
 import unittest
 import time
 import timeout_decorator
-import subprocess
-import warnings
 import os
 import yaml
 
 from datetime import datetime
-from kubernetes import client, config
+from kubernetes import client
 
 from tests.k8s_api import K8s
+from kubernetes.client.rest import ApiException
 
 SPILO_CURRENT = "registry.opensource.zalan.do/acid/spilo-12:1.6-p5"
 SPILO_LAZY = "registry.opensource.zalan.do/acid/spilo-cdp-12:1.6-p114"
@@ -89,17 +88,17 @@ def setUpClass(cls):
         # remove existing local storage class and create hostpath class
         try:
             k8s.api.storage_v1_api.delete_storage_class("standard")
-        except:
-            print("Storage class has already been remove")
+        except ApiException as e:
+            print("Failed to delete the 'standard' storage class: {0}".format(e))
 
         # operator deploys pod service account there on start up
         # needed for test_multi_namespace_support()
-        cls.namespace = "test"
+        cls.test_namespace = "test"
         try:
-            v1_namespace = client.V1Namespace(metadata=client.V1ObjectMeta(name=cls.namespace))
+            v1_namespace = client.V1Namespace(metadata=client.V1ObjectMeta(name=cls.test_namespace))
             k8s.api.core_v1.create_namespace(v1_namespace)
-        except:
-            print("Namespace already present")
+        except ApiException as e:
+            print("Failed to create the '{0}' namespace: {1}".format(cls.test_namespace, e))
 
         # submit the most recent operator image built on the Docker host
         with open("manifests/postgres-operator.yaml", 'r+') as f:
@@ -135,10 +134,8 @@ def setUpClass(cls):
 
         # make sure we start a new operator on every new run,
         # this tackles the problem when kind is reused
-        # and the Docker image is infact changed (dirty one)
+        # and the Docker image is in fact changed (dirty one)
 
-        # patch resync period, this can catch some problems with hanging e2e tests
-        # k8s.update_config({"data": {"resync_period":"30s"}},step="TestSuite setup")
         k8s.update_config({}, step="TestSuite Startup")
 
         actual_operator_image = k8s.api.core_v1.list_namespaced_pod(
@@ -170,9 +167,6 @@ def test_enable_disable_connection_pooler(self):
             'connection-pooler': 'acid-minimal-cluster-pooler',
         })
 
-        pod_selector = to_selector(pod_labels)
-        service_selector = to_selector(service_labels)
-
         # enable connection pooler
         k8s.api.custom_objects_api.patch_namespaced_custom_object(
             'acid.zalan.do', 'v1', 'default',
@@ -347,7 +341,7 @@ def test_infrastructure_roles(self):
             },
         }
         k8s.update_config(patch_infrastructure_roles)
-        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0":"idle"}, "Operator does not get in sync")
+        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
         try:
             # check that new roles are represented in the config by requesting the
@@ -604,17 +598,25 @@ def test_multi_namespace_support(self):
 
         with open("manifests/complete-postgres-manifest.yaml", 'r+') as f:
             pg_manifest = yaml.safe_load(f)
-            pg_manifest["metadata"]["namespace"] = self.namespace
+            pg_manifest["metadata"]["namespace"] = self.test_namespace
             yaml.dump(pg_manifest, f, Dumper=yaml.Dumper)
 
         try:
             k8s.create_with_kubectl("manifests/complete-postgres-manifest.yaml")
-            k8s.wait_for_pod_start("spilo-role=master", self.namespace)
-            self.assert_master_is_unique(self.namespace, "acid-test-cluster")
+            k8s.wait_for_pod_start("spilo-role=master", self.test_namespace)
+            self.assert_master_is_unique(self.test_namespace, "acid-test-cluster")
 
         except timeout_decorator.TimeoutError:
             print('Operator log: {}'.format(k8s.get_operator_log()))
             raise
+        finally:
+            # delete the new cluster so that k8s_api.get_operator_state works correctly in subsequent tests
+            # ideally we should delete the 'test' namespace here, but the pods inside the namespace
+            # get stuck in the Terminating state, making the test time out
+            k8s.api.custom_objects_api.delete_namespaced_custom_object(
+                "acid.zalan.do", "v1", self.test_namespace, "postgresqls", "acid-test-cluster")
+            time.sleep(5)
+
 
     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
     def test_zz_node_readiness_label(self):
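The new `finally` block deletes `acid-test-cluster` and then sleeps for a fixed 5 seconds. If the fixed sleep ever proves flaky, one alternative is to poll until the custom object is actually gone; a sketch, assuming the same `custom_objects_api` client (not part of the commit):

```python
import time
from kubernetes.client.rest import ApiException

def wait_for_postgresql_deletion(custom_objects_api, namespace, name, timeout=60):
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            custom_objects_api.get_namespaced_custom_object(
                "acid.zalan.do", "v1", namespace, "postgresqls", name)
        except ApiException as e:
            if e.status == 404:  # custom resource is gone
                return
            raise
        time.sleep(2)
    raise TimeoutError("postgresql {}/{} still present".format(namespace, name))
```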
@@ -746,12 +748,12 @@ def test_statefulset_annotation_propagation(self):
         }
         k8s.api.custom_objects_api.patch_namespaced_custom_object(
             "acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_crd_annotations)
-
+
         annotations = {
             "deployment-time": "2020-04-30 12:00:00",
             "downscaler/downtime_replicas": "0",
         }
-
+
         self.eventuallyTrue(lambda: k8s.check_statefulset_annotations(cluster_label, annotations), "Annotations missing")
 
 
@@ -823,14 +825,14 @@ def test_zzzz_cluster_deletion(self):
             }
         }
         k8s.update_config(patch_delete_annotations)
-        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0":"idle"}, "Operator does not get in sync")
+        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
         try:
             # this delete attempt should be omitted because of missing annotations
             k8s.api.custom_objects_api.delete_namespaced_custom_object(
                 "acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster")
             time.sleep(5)
-            self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0":"idle"}, "Operator does not get in sync")
+            self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
             # check that pods and services are still there
             k8s.wait_for_running_pods(cluster_label, 2)
@@ -841,7 +843,7 @@
 
             # wait a little before proceeding
             time.sleep(10)
-            self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0":"idle"}, "Operator does not get in sync")
+            self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
             # add annotations to manifest
             delete_date = datetime.today().strftime('%Y-%m-%d')
@@ -855,7 +857,7 @@
             }
             k8s.api.custom_objects_api.patch_namespaced_custom_object(
                 "acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_patch_delete_annotations)
-            self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0":"idle"}, "Operator does not get in sync")
+            self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
             # wait a little before proceeding
             time.sleep(20)
@@ -882,7 +884,7 @@
             print('Operator log: {}'.format(k8s.get_operator_log()))
             raise
 
-        #reset configmap
+        # reset configmap
         patch_delete_annotations = {
             "data": {
                 "delete_annotation_date_key": "",

pkg/cluster/cluster.go

Lines changed: 2 additions & 2 deletions
@@ -626,14 +626,14 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error {
 		}
 	}()
 
-	if oldSpec.Spec.PostgresqlParam.PgVersion >= newSpec.Spec.PostgresqlParam.PgVersion {
+	if oldSpec.Spec.PostgresqlParam.PgVersion > newSpec.Spec.PostgresqlParam.PgVersion {
 		c.logger.Warningf("postgresql version change(%q -> %q) has no effect",
 			oldSpec.Spec.PostgresqlParam.PgVersion, newSpec.Spec.PostgresqlParam.PgVersion)
 		c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "PostgreSQL", "postgresql version change(%q -> %q) has no effect",
 			oldSpec.Spec.PostgresqlParam.PgVersion, newSpec.Spec.PostgresqlParam.PgVersion)
 		// we need that hack to generate statefulset with the old version
 		newSpec.Spec.PostgresqlParam.PgVersion = oldSpec.Spec.PostgresqlParam.PgVersion
-	} else {
+	} else if oldSpec.Spec.PostgresqlParam.PgVersion < newSpec.Spec.PostgresqlParam.PgVersion {
 		c.logger.Infof("postgresql version increased (%q -> %q), major version upgrade can be done manually after StatefulSet Sync",
 			oldSpec.Spec.PostgresqlParam.PgVersion, newSpec.Spec.PostgresqlParam.PgVersion)
 	}
