Skip to content

Commit b91b69c

Browse files
BugFix: Switchover (during a Node drain) fails randomly in synchronous mode (zalando#1984)
* Use getSwitchoverCandidate instead of masterCandidate when trying to migrate the master pod to a replica. Ref: zalando#1983
* Remove unused masterCandidate (replaced by getSwitchoverCandidate). Ref: zalando#1983
1 parent b2642fa commit b91b69c

File tree

1 file changed

+10
-36
lines changed

1 file changed

+10
-36
lines changed

pkg/cluster/pod.go

Lines changed: 10 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package cluster
33
import (
44
"context"
55
"fmt"
6-
"math/rand"
76
"sort"
87
"strconv"
98
"time"
@@ -212,42 +211,12 @@ func (c *Cluster) movePodFromEndOfLifeNode(pod *v1.Pod) (*v1.Pod, error) {
212211
return newPod, nil
213212
}
214213

215-
func (c *Cluster) masterCandidate(oldNodeName string) (*v1.Pod, error) {
216-
217-
// Wait until at least one replica pod will come up
218-
if err := c.waitForAnyReplicaLabelReady(); err != nil {
219-
c.logger.Warningf("could not find at least one ready replica: %v", err)
220-
}
221-
222-
replicas, err := c.getRolePods(Replica)
223-
if err != nil {
224-
return nil, fmt.Errorf("could not get replica pods: %v", err)
225-
}
226-
227-
if len(replicas) == 0 {
228-
c.logger.Warningf("no available master candidates, migration will cause longer downtime of Postgres cluster")
229-
return nil, nil
230-
}
231-
232-
for i, pod := range replicas {
233-
// look for replicas running on live nodes. Ignore errors when querying the nodes.
234-
if pod.Spec.NodeName != oldNodeName {
235-
eol, err := c.podIsEndOfLife(&pod)
236-
if err == nil && !eol {
237-
return &replicas[i], nil
238-
}
239-
}
240-
}
241-
c.logger.Warningf("no available master candidates on live nodes")
242-
return &replicas[rand.Intn(len(replicas))], nil
243-
}
244-
245214
// MigrateMasterPod migrates master pod via failover to a replica
246215
func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
247216
var (
248-
masterCandidatePod *v1.Pod
249-
err error
250-
eol bool
217+
masterCandidateName spec.NamespacedName
218+
err error
219+
eol bool
251220
)
252221

253222
oldMaster, err := c.KubeClient.Pods(podName.Namespace).Get(context.TODO(), podName.Name, metav1.GetOptions{})
@@ -283,13 +252,19 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
283252
}
284253
// We may not have a cached statefulset if the initial cluster sync has aborted, revert to the spec in that case.
285254
if *c.Statefulset.Spec.Replicas > 1 {
286-
if masterCandidatePod, err = c.masterCandidate(oldMaster.Spec.NodeName); err != nil {
255+
if masterCandidateName, err = c.getSwitchoverCandidate(oldMaster); err != nil {
287256
return fmt.Errorf("could not find suitable replica pod as candidate for failover: %v", err)
288257
}
289258
} else {
290259
c.logger.Warningf("migrating single pod cluster %q, this will cause downtime of the Postgres cluster until pod is back", c.clusterName())
291260
}
292261

262+
masterCandidatePod, err := c.KubeClient.Pods(masterCandidateName.Namespace).Get(context.TODO(), masterCandidateName.Name, metav1.GetOptions{})
263+
264+
if err != nil {
265+
return fmt.Errorf("could not get master candidate pod: %v", err)
266+
}
267+
293268
// there are two cases for each postgres cluster that has its master pod on the node to migrate from:
294269
// - the cluster has some replicas - migrate one of those if necessary and failover to it
295270
// - there are no replicas - just terminate the master and wait until it respawns
@@ -306,7 +281,6 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
306281
return fmt.Errorf("could not move pod: %v", err)
307282
}
308283

309-
masterCandidateName := util.NameFromMeta(masterCandidatePod.ObjectMeta)
310284
err = retryutil.Retry(1*time.Minute, 5*time.Minute,
311285
func() (bool, error) {
312286
err := c.Switchover(oldMaster, masterCandidateName)

0 commit comments

Comments
 (0)