Commit 5710105

Fix flakiness in the E2E test e2e_multi_cluster_replica_set_scale_up (#231)
# Summary

The E2E test `e2e_multi_cluster_replica_set_scale_up` has been flaky, and @lucian-tosa suggested that we fix it. It was failing while waiting for the statefulsets (STSs) of a multi-cluster MongoDB deployment to reach the correct number of ready replicas. The problem was that sometimes, after the `MongoDBMultiCluster (mdbmc)` resource reached the Running phase (which implies that all STSs are ready), some of the STSs dropped back into a not-ready state. When the test saw the `mdbmc` resource Running, it asserted that each STS had the correct number of ready replicas, but because of that ready-to-not-ready transition the assertion sometimes ran against an STS that momentarily reported fewer ready replicas, and the test failed.

An STS transitions from ready back to not ready because the pod it manages does the same. Looking into it further, we found that this happens because the pod's readiness probe occasionally fails momentarily: the pod becomes ready, is marked not ready when the probe fails, and then eventually becomes ready again. This is documented in much more detail in the document [here](https://jira.mongodb.org/browse/CLOUDP-329231).

The ideal fix would be to figure out why the readiness probe fails and address that. This PR instead applies a workaround: the tests now wait for the STSs to reach the correct number of ready replicas instead of asserting it once (a sketch of the polling pattern is shown below).

Jira ticket: https://jira.mongodb.org/browse/CLOUDP-329422

## Proof of Work

Ran the test `e2e_multi_cluster_replica_set_scale_up` manually and locally to make sure it passes consistently. I am not able to reproduce the flakiness now.

## Checklist

- [x] Have you linked a jira ticket and/or is the ticket in the title?
- [x] Have you checked whether your jira ticket required DOCSP changes?
- [x] Have you checked for release_note changes?
1 parent 16e1d78 commit 5710105
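For context, the workaround in the diff below relies on `kubetester.wait_until` polling a predicate until it returns `True` or the timeout expires. Below is a minimal sketch of the same pattern, pulled out into a hypothetical `wait_for_ready_replicas` helper that is not part of this change; it assumes, as the diff does, that `read_statefulsets` accepts a list of member-cluster clients and returns a mapping keyed by cluster name.

```python
import kubetester


def wait_for_ready_replicas(mongodb_multi, cluster_client, expected: int, cluster_label: str) -> None:
    """Hypothetical helper: poll until the STS in one member cluster reports the expected ready replicas."""

    def fn() -> bool:
        # Re-read the statefulset on every poll so a momentary not-ready blip
        # (a transient readiness-probe failure) does not fail the test outright.
        statefulsets = mongodb_multi.read_statefulsets([cluster_client])
        return statefulsets[cluster_client.cluster_name].status.ready_replicas == expected

    kubetester.wait_until(
        fn,
        timeout=60,
        message=f"Verifying sts has correct number of replicas in {cluster_label}",
    )
```

With a helper like this, each test could call it once per member cluster; the diff below keeps the equivalent inline `def fn()` blocks instead.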

File tree

1 file changed: +53 -22 lines changed


docker/mongodb-kubernetes-tests/tests/multicluster/multi_cluster_replica_set_scale_up.py

Lines changed: 53 additions & 22 deletions
@@ -1,6 +1,7 @@
 from typing import List

 import kubernetes
+import kubetester
 import pytest
 from kubetester.automation_config_tester import AutomationConfigTester
 from kubetester.certs_mongodb_multi import create_multi_cluster_mongodb_tls_certs
@@ -80,18 +81,30 @@ def test_statefulsets_have_been_created_correctly(
     mongodb_multi: MongoDBMulti,
     member_cluster_clients: List[MultiClusterClient],
 ):
-    statefulsets = mongodb_multi.read_statefulsets(member_cluster_clients)
-    cluster_one_client = member_cluster_clients[0]
-    cluster_one_sts = statefulsets[cluster_one_client.cluster_name]
-    assert cluster_one_sts.status.ready_replicas == 1
+    # Even though the previous test already verified that the MongoDBMultiCluster resource is in the Running phase
+    # (which implies that all STSs are ready), asserting the expected number of ready replicas for each STS right away
+    # makes the test flaky because of the issue described in https://jira.mongodb.org/browse/CLOUDP-329231. That's why
+    # we wait for each STS to reach the expected number of replicas. Revert this once the ticket above is properly fixed.
+    def fn():
+        cluster_one_client = member_cluster_clients[0]
+        cluster_one_statefulsets = mongodb_multi.read_statefulsets([cluster_one_client])
+        return cluster_one_statefulsets[cluster_one_client.cluster_name].status.ready_replicas == 1

-    cluster_two_client = member_cluster_clients[1]
-    cluster_two_sts = statefulsets[cluster_two_client.cluster_name]
-    assert cluster_two_sts.status.ready_replicas == 1
+    kubetester.wait_until(fn, timeout=60, message="Verifying sts has correct number of replicas in cluster one")

-    cluster_three_client = member_cluster_clients[2]
-    cluster_three_sts = statefulsets[cluster_three_client.cluster_name]
-    assert cluster_three_sts.status.ready_replicas == 1
+    def fn():
+        cluster_two_client = member_cluster_clients[1]
+        cluster_two_statefulsets = mongodb_multi.read_statefulsets([cluster_two_client])
+        return cluster_two_statefulsets[cluster_two_client.cluster_name].status.ready_replicas == 1
+
+    kubetester.wait_until(fn, timeout=60, message="Verifying sts has correct number of replicas in cluster two")
+
+    def fn():
+        cluster_three_client = member_cluster_clients[2]
+        cluster_three_statefulsets = mongodb_multi.read_statefulsets([cluster_three_client])
+        return cluster_three_statefulsets[cluster_three_client.cluster_name].status.ready_replicas == 1
+
+    kubetester.wait_until(fn, timeout=60, message="Verifying sts has correct number of replicas in cluster three")


 @pytest.mark.e2e_multi_cluster_replica_set_scale_up
@@ -116,18 +129,36 @@ def test_statefulsets_have_been_scaled_up_correctly(
     mongodb_multi: MongoDBMulti,
     member_cluster_clients: List[MultiClusterClient],
 ):
-    statefulsets = mongodb_multi.read_statefulsets(member_cluster_clients)
-    cluster_one_client = member_cluster_clients[0]
-    cluster_one_sts = statefulsets[cluster_one_client.cluster_name]
-    assert cluster_one_sts.status.ready_replicas == 2
-
-    cluster_two_client = member_cluster_clients[1]
-    cluster_two_sts = statefulsets[cluster_two_client.cluster_name]
-    assert cluster_two_sts.status.ready_replicas == 1
-
-    cluster_three_client = member_cluster_clients[2]
-    cluster_three_sts = statefulsets[cluster_three_client.cluster_name]
-    assert cluster_three_sts.status.ready_replicas == 2
+    # Even though the previous test already verified that the MongoDBMultiCluster resource is in the Running phase
+    # (which implies that all STSs are ready), asserting the expected number of ready replicas for each STS right away
+    # makes the test flaky because of the issue described in https://jira.mongodb.org/browse/CLOUDP-329231. That's why
+    # we wait for each STS to reach the expected number of replicas. Revert this once the ticket above is properly fixed.
+    def fn():
+        cluster_one_client = member_cluster_clients[0]
+        cluster_one_statefulsets = mongodb_multi.read_statefulsets([cluster_one_client])
+        return cluster_one_statefulsets[cluster_one_client.cluster_name].status.ready_replicas == 2
+
+    kubetester.wait_until(
+        fn, timeout=60, message="Verifying sts has correct number of replicas after scale up in cluster one"
+    )
+
+    def fn():
+        cluster_two_client = member_cluster_clients[1]
+        cluster_two_statefulsets = mongodb_multi.read_statefulsets([cluster_two_client])
+        return cluster_two_statefulsets[cluster_two_client.cluster_name].status.ready_replicas == 1
+
+    kubetester.wait_until(
+        fn, timeout=60, message="Verifying sts has correct number of replicas after scale up in cluster two"
+    )
+
+    def fn():
+        cluster_three_client = member_cluster_clients[2]
+        cluster_three_statefulsets = mongodb_multi.read_statefulsets([cluster_three_client])
+        return cluster_three_statefulsets[cluster_three_client.cluster_name].status.ready_replicas == 2
+
+    kubetester.wait_until(
+        fn, timeout=60, message="Verifying sts has correct number of replicas after scale up in cluster three"
+    )


 @pytest.mark.e2e_multi_cluster_replica_set_scale_up
