Skip to content

Commit 268a86a

Browse files
authored
removing inner goroutine in cluster.Switchover (zalando#1876)
* remove the inner goroutine in cluster.Switchover and resolve the race between processPodEvent and unregisterPodSubscriber
* unlock the mutex only after handling the event, now with a non-blocking default case
1 parent c6f2c68 commit 268a86a

File tree

5 files changed

+36
-44
lines changed

5 files changed

+36
-44
lines changed

pkg/cluster/cluster.go

Lines changed: 15 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,12 +1032,20 @@ func (c *Cluster) processPodEvent(obj interface{}) error {
10321032
return fmt.Errorf("could not cast to PodEvent")
10331033
}
10341034

1035+
// can only take lock when (un)registerPodSubscriber is finished
10351036
c.podSubscribersMu.RLock()
10361037
subscriber, ok := c.podSubscribers[spec.NamespacedName(event.PodName)]
1037-
c.podSubscribersMu.RUnlock()
10381038
if ok {
1039-
subscriber <- event
1039+
select {
1040+
case subscriber <- event:
1041+
default:
1042+
// ending up here when there is no receiver on the channel (i.e. waitForPodLabel finished)
1043+
// avoids blocking channel: https://gobyexample.com/non-blocking-channel-operations
1044+
}
10401045
}
1046+
// hold lock for the time of processing the event to avoid race condition
1047+
// with unregisterPodSubscriber closing the channel (see #1876)
1048+
c.podSubscribersMu.RUnlock()
10411049

10421050
return nil
10431051
}
@@ -1501,49 +1509,23 @@ func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) e
15011509
var err error
15021510
c.logger.Debugf("switching over from %q to %q", curMaster.Name, candidate)
15031511
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switching over from %q to %q", curMaster.Name, candidate)
1504-
1505-
var wg sync.WaitGroup
1506-
1507-
podLabelErr := make(chan error)
15081512
stopCh := make(chan struct{})
1509-
1510-
wg.Add(1)
1511-
1512-
go func() {
1513-
defer wg.Done()
1514-
ch := c.registerPodSubscriber(candidate)
1515-
defer c.unregisterPodSubscriber(candidate)
1516-
1517-
role := Master
1518-
1519-
select {
1520-
case <-stopCh:
1521-
case podLabelErr <- func() (err2 error) {
1522-
_, err2 = c.waitForPodLabel(ch, stopCh, &role)
1523-
return
1524-
}():
1525-
}
1526-
}()
1513+
ch := c.registerPodSubscriber(candidate)
1514+
defer c.unregisterPodSubscriber(candidate)
1515+
defer close(stopCh)
15271516

15281517
if err = c.patroni.Switchover(curMaster, candidate.Name); err == nil {
15291518
c.logger.Debugf("successfully switched over from %q to %q", curMaster.Name, candidate)
15301519
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Successfully switched over from %q to %q", curMaster.Name, candidate)
1531-
if err = <-podLabelErr; err != nil {
1520+
_, err = c.waitForPodLabel(ch, stopCh, nil)
1521+
if err != nil {
15321522
err = fmt.Errorf("could not get master pod label: %v", err)
15331523
}
15341524
} else {
15351525
err = fmt.Errorf("could not switch over from %q to %q: %v", curMaster.Name, candidate, err)
15361526
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switchover from %q to %q FAILED: %v", curMaster.Name, candidate, err)
15371527
}
15381528

1539-
// signal the role label waiting goroutine to close the shop and go home
1540-
close(stopCh)
1541-
// wait until the goroutine terminates, since unregisterPodSubscriber
1542-
// must be called before the outer return; otherwise we risk subscribing to the same pod twice.
1543-
wg.Wait()
1544-
// close the label waiting channel no sooner than the waiting goroutine terminates.
1545-
close(podLabelErr)
1546-
15471529
return err
15481530
}
15491531

pkg/cluster/pod.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ func (c *Cluster) markRollingUpdateFlagForPod(pod *v1.Pod, msg string) error {
6767
return fmt.Errorf("could not form patch for pod's rolling update flag: %v", err)
6868
}
6969

70-
err = retryutil.Retry(c.OpConfig.PatroniAPICheckInterval, c.OpConfig.PatroniAPICheckTimeout,
70+
err = retryutil.Retry(1*time.Second, 5*time.Second,
7171
func() (bool, error) {
7272
_, err2 := c.KubeClient.Pods(pod.Namespace).Patch(
7373
context.TODO(),
@@ -151,12 +151,13 @@ func (c *Cluster) unregisterPodSubscriber(podName spec.NamespacedName) {
151151
c.podSubscribersMu.Lock()
152152
defer c.podSubscribersMu.Unlock()
153153

154-
if _, ok := c.podSubscribers[podName]; !ok {
154+
ch, ok := c.podSubscribers[podName]
155+
if !ok {
155156
panic("subscriber for pod '" + podName.String() + "' is not found")
156157
}
157158

158-
close(c.podSubscribers[podName])
159159
delete(c.podSubscribers, podName)
160+
close(ch)
160161
}
161162

162163
func (c *Cluster) registerPodSubscriber(podName spec.NamespacedName) chan PodEvent {
@@ -399,11 +400,12 @@ func (c *Cluster) getPatroniMemberData(pod *v1.Pod) (patroni.MemberData, error)
399400
}
400401

401402
func (c *Cluster) recreatePod(podName spec.NamespacedName) (*v1.Pod, error) {
403+
stopCh := make(chan struct{})
402404
ch := c.registerPodSubscriber(podName)
403405
defer c.unregisterPodSubscriber(podName)
404-
stopChan := make(chan struct{})
406+
defer close(stopCh)
405407

406-
err := retryutil.Retry(c.OpConfig.PatroniAPICheckInterval, c.OpConfig.PatroniAPICheckTimeout,
408+
err := retryutil.Retry(1*time.Second, 5*time.Second,
407409
func() (bool, error) {
408410
err2 := c.KubeClient.Pods(podName.Namespace).Delete(
409411
context.TODO(),
@@ -421,7 +423,7 @@ func (c *Cluster) recreatePod(podName spec.NamespacedName) (*v1.Pod, error) {
421423
if err := c.waitForPodDeletion(ch); err != nil {
422424
return nil, err
423425
}
424-
pod, err := c.waitForPodLabel(ch, stopChan, nil)
426+
pod, err := c.waitForPodLabel(ch, stopCh, nil)
425427
if err != nil {
426428
return nil, err
427429
}
@@ -446,7 +448,7 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp
446448
continue
447449
}
448450

449-
podName := util.NameFromMeta(pod.ObjectMeta)
451+
podName := util.NameFromMeta(pods[i].ObjectMeta)
450452
newPod, err := c.recreatePod(podName)
451453
if err != nil {
452454
return fmt.Errorf("could not recreate replica pod %q: %v", util.NameFromMeta(pod.ObjectMeta), err)

pkg/cluster/util.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ func (c *Cluster) annotationsSet(annotations map[string]string) map[string]strin
316316
return nil
317317
}
318318

319-
func (c *Cluster) waitForPodLabel(podEvents chan PodEvent, stopChan chan struct{}, role *PostgresRole) (*v1.Pod, error) {
319+
func (c *Cluster) waitForPodLabel(podEvents chan PodEvent, stopCh chan struct{}, role *PostgresRole) (*v1.Pod, error) {
320320
timeout := time.After(c.OpConfig.PodLabelWaitTimeout)
321321
for {
322322
select {
@@ -332,7 +332,7 @@ func (c *Cluster) waitForPodLabel(podEvents chan PodEvent, stopChan chan struct{
332332
}
333333
case <-timeout:
334334
return nil, fmt.Errorf("pod label wait timeout")
335-
case <-stopChan:
335+
case <-stopCh:
336336
return nil, fmt.Errorf("pod label wait cancelled")
337337
}
338338
}

pkg/controller/controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -451,7 +451,7 @@ func (c *Controller) Run(stopCh <-chan struct{}, wg *sync.WaitGroup) {
451451
panic("could not acquire initial list of clusters")
452452
}
453453

454-
wg.Add(5)
454+
wg.Add(5 + util.Bool2Int(c.opConfig.EnablePostgresTeamCRD))
455455
go c.runPodInformer(stopCh, wg)
456456
go c.runPostgresqlInformer(stopCh, wg)
457457
go c.clusterResync(stopCh, wg)

pkg/util/util.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -324,12 +324,20 @@ func testNil(values ...*int32) bool {
324324
return false
325325
}
326326

327-
// Convert int to IntOrString type
327+
// ToIntStr converts int to IntOrString type
328328
func ToIntStr(val int) *intstr.IntOrString {
329329
b := intstr.FromInt(val)
330330
return &b
331331
}
332332

333+
// Bool2Int converts a bool to an int: 1 if flag is true, 0 otherwise
334+
func Bool2Int(flag bool) int {
335+
if flag {
336+
return 1
337+
}
338+
return 0
339+
}
340+
333341
// Get int from IntOrString and return max int if string
334342
func IntFromIntStr(intOrStr intstr.IntOrString) int {
335343
if intOrStr.Type == 1 {

0 commit comments

Comments
 (0)