Skip to content

Commit 1105228

Browse files
authored
in sync mode select only syncStandby as switchover candidate (zalando#2278)
* in sync mode select only syncStandby as switchover candidate * do not exit retry with err * unit test: use error from reading byte stream twice
1 parent 0ac5f58 commit 1105228

File tree

3 files changed

+43
-18
lines changed

3 files changed

+43
-18
lines changed

pkg/cluster/pod.go

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -469,39 +469,51 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e
469469
func() (bool, error) {
470470
var err error
471471
members, err = c.patroni.GetClusterMembers(master)
472-
473472
if err != nil {
474473
return false, err
475474
}
475+
476+
// look for SyncStandby candidates (which also implies pod is in running state)
477+
for _, member := range members {
478+
if PostgresRole(member.Role) == SyncStandby {
479+
syncCandidates = append(syncCandidates, member)
480+
}
481+
}
482+
483+
// if synchronous mode is enabled and no SyncStandby was found
484+
// return false for retry - cannot failover with no sync candidate
485+
if c.Spec.Patroni.SynchronousMode && len(syncCandidates) == 0 {
486+
c.logger.Warnf("no sync standby found - retrying fetching cluster members")
487+
return false, nil
488+
}
489+
476490
return true, nil
477491
},
478492
)
479493
if err != nil {
480494
return spec.NamespacedName{}, fmt.Errorf("failed to get Patroni cluster members: %s", err)
481495
}
482496

483-
for _, member := range members {
484-
if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && member.State == "running" {
485-
candidates = append(candidates, member)
486-
if PostgresRole(member.Role) == SyncStandby {
487-
syncCandidates = append(syncCandidates, member)
488-
}
489-
}
490-
}
491-
492497
// pick candidate with lowest lag
493-
// if sync_standby replicas were found assume synchronous_mode is enabled and ignore other candidates list
494498
if len(syncCandidates) > 0 {
495499
sort.Slice(syncCandidates, func(i, j int) bool {
496500
return syncCandidates[i].Lag < syncCandidates[j].Lag
497501
})
498502
return spec.NamespacedName{Namespace: master.Namespace, Name: syncCandidates[0].Name}, nil
499-
}
500-
if len(candidates) > 0 {
501-
sort.Slice(candidates, func(i, j int) bool {
502-
return candidates[i].Lag < candidates[j].Lag
503-
})
504-
return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil
503+
} else {
504+
// in asynchronous mode find running replicas
505+
for _, member := range members {
506+
if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && member.State == "running" {
507+
candidates = append(candidates, member)
508+
}
509+
}
510+
511+
if len(candidates) > 0 {
512+
sort.Slice(candidates, func(i, j int) bool {
513+
return candidates[i].Lag < candidates[j].Lag
514+
})
515+
return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil
516+
}
505517
}
506518

507519
return spec.NamespacedName{}, fmt.Errorf("no switchover candidate found")

pkg/cluster/pod_test.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,30 +36,42 @@ func TestGetSwitchoverCandidate(t *testing.T) {
3636
tests := []struct {
3737
subtest string
3838
clusterJson string
39+
syncModeEnabled bool
3940
expectedCandidate spec.NamespacedName
4041
expectedError error
4142
}{
4243
{
4344
subtest: "choose sync_standby over replica",
4445
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "sync_standby", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 0}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 0}]}`,
46+
syncModeEnabled: true,
4547
expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"},
4648
expectedError: nil,
4749
},
50+
{
51+
subtest: "no running sync_standby available",
52+
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 0}]}`,
53+
syncModeEnabled: true,
54+
expectedCandidate: spec.NamespacedName{},
55+
expectedError: fmt.Errorf("failed to get Patroni cluster members: unexpected end of JSON input"),
56+
},
4857
{
4958
subtest: "choose replica with lowest lag",
5059
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 2}]}`,
60+
syncModeEnabled: false,
5161
expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-2"},
5262
expectedError: nil,
5363
},
5464
{
5565
subtest: "choose first replica when lag is equal evrywhere",
5666
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 5}]}`,
67+
syncModeEnabled: false,
5768
expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"},
5869
expectedError: nil,
5970
},
6071
{
6172
subtest: "no running replica available",
6273
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 2}, {"name": "acid-test-cluster-1", "role": "replica", "state": "starting", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 2}]}`,
74+
syncModeEnabled: false,
6375
expectedCandidate: spec.NamespacedName{},
6476
expectedError: fmt.Errorf("no switchover candidate found"),
6577
},
@@ -81,6 +93,7 @@ func TestGetSwitchoverCandidate(t *testing.T) {
8193
cluster.patroni = p
8294
mockMasterPod := newMockPod("192.168.100.1")
8395
mockMasterPod.Namespace = namespace
96+
cluster.Spec.Patroni.SynchronousMode = tt.syncModeEnabled
8497

8598
candidate, err := cluster.getSwitchoverCandidate(mockMasterPod)
8699
if err != nil && err.Error() != tt.expectedError.Error() {

pkg/cluster/sync.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,7 @@ func (c *Cluster) syncPatroniConfig(pods []v1.Pod, requiredPatroniConfig acidv1.
458458
// get Postgres config, compare with manifest and update via Patroni PATCH endpoint if it differs
459459
for i, pod := range pods {
460460
podName := util.NameFromMeta(pods[i].ObjectMeta)
461-
effectivePatroniConfig, effectivePgParameters, err = c.patroni.GetConfig(&pod)
461+
effectivePatroniConfig, effectivePgParameters, err = c.getPatroniConfig(&pod)
462462
if err != nil {
463463
errors = append(errors, fmt.Sprintf("could not get Postgres config from pod %s: %v", podName, err))
464464
continue

0 commit comments

Comments
 (0)