Commit 0181a1b

Introduce a repair scan to fix failing clusters (zalando#304)
A repair is a sync scan that acts only on those clusters whose last add, update or sync operation has failed. It is supposed to kick in more frequently than the sync scan. The sync scan remains useful for fixing the consequences of external actions unknown to the operator (e.g. someone deleting a postgres-related service by mistake). The repair scan is controlled by the new repair_period parameter in the operator configuration. It has to run at least 2 times more frequently than the sync scan to have any effect, since a normal sync scan updates both the last-synced and last-repaired attributes of the controller (repair is just a sync underneath). A repair scan could be queued for a cluster that is already being synced if the sync period exceeds the interval between repairs; in that case the repair event is discarded once the corresponding worker finds out that the cluster is not failing anymore. Reviewed by @zerg-junior
1 parent 1a0e535 commit 0181a1b

File tree: 11 files changed (+97, -19 lines)
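
For orientation before the per-file diffs: the scheduling decision added to pkg/controller/postgresql.go boils down to comparing the time since the last sync and the time since the last repair against resync_period and repair_period. The standalone sketch below condenses that decision; the helper name decideScan and the free-standing framing are illustrative only, the authoritative version lives on the Controller in the clusterListFunc/queueEvents diff further down.

package main

import (
	"fmt"
	"time"
)

// decideScan mirrors the decision introduced in clusterListFunc: a full sync
// wins if resyncPeriod has elapsed; otherwise a repair runs if repairPeriod
// has elapsed; otherwise nothing is queued. (Illustrative sketch, not the
// operator's actual API.)
func decideScan(sinceSync, sinceRepair, resyncPeriod, repairPeriod time.Duration) string {
	switch {
	case sinceSync >= resyncPeriod:
		return "SYNC" // queued for every cluster with a valid manifest
	case sinceRepair >= repairPeriod:
		return "REPAIR" // queued only for clusters whose last operation failed
	default:
		return "" // too early for either scan
	}
}

func main() {
	// Defaults from this commit: resync_period=30m, repair_period=5m.
	fmt.Println(decideScan(6*time.Minute, 6*time.Minute, 30*time.Minute, 5*time.Minute))  // REPAIR
	fmt.Println(decideScan(31*time.Minute, 2*time.Minute, 30*time.Minute, 5*time.Minute)) // SYNC
	fmt.Println(decideScan(2*time.Minute, 2*time.Minute, 30*time.Minute, 5*time.Minute))  // (nothing)
}

With the shipped defaults, failing clusters are retried roughly every five minutes while healthy clusters are only fully synced every thirty.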

docs/administrator.md

Lines changed: 9 additions & 0 deletions
@@ -199,3 +199,12 @@ cluster manifest. In the case any of these variables are omitted from the
 manifest, the operator configmap's settings `enable_master_load_balancer` and
 `enable_replica_load_balancer` apply. Note that the operator settings affect
 all Postgresql services running in a namespace watched by the operator.
+
+## Running periodic 'autorepair' scans of Kubernetes objects
+
+The Postgres operator periodically scans all Kubernetes objects belonging to
+each cluster and repairs all discrepancies between them and the definitions
+generated from the current cluster manifest. There are two types of scans: a
+`sync scan`, running every `resync_period` seconds for every cluster, and the
+`repair scan`, coming every `repair_period` only for those clusters that didn't
+report success as a result of the last operation applied to them.

docs/reference/operator_parameters.md

Lines changed: 4 additions & 1 deletion
@@ -80,7 +80,10 @@ Those are top-level keys, containing both leaf keys and groups.
   are applied. The default is `-1`.
 
 * **resync_period**
-  period between consecutive sync requests. The default is `5m`.
+  period between consecutive sync requests. The default is `30m`.
+
+* **repair_period**
+  period between consecutive repair requests. The default is `5m`.
 
 ## Postgres users
 

manifests/postgresql-operator-default-configuration.yaml

Lines changed: 3 additions & 1 deletion
@@ -8,7 +8,9 @@ configuration:
   workers: 4
   min_instances: -1
   max_instances: -1
-  resync_period: 5m
+  resync_period: 30m
+  repair_period: 5m
+
   #sidecar_docker_images:
   #  example: "exampleimage:exampletag"
   users:

pkg/cluster/cluster.go

Lines changed: 7 additions & 0 deletions
@@ -630,6 +630,13 @@ func (c *Cluster) Delete() {
     }
 }
 
+func (c *Cluster) NeedsRepair() (bool, spec.PostgresStatus) {
+    c.specMu.RLock()
+    defer c.specMu.RUnlock()
+    return !c.Status.Success(), c.Status
+
+}
+
 // ReceivePodEvent is called back by the controller in order to add the cluster's pod event to the queue.
 func (c *Cluster) ReceivePodEvent(event spec.PodEvent) {
     if err := c.podEventsQueue.Add(event); err != nil {

pkg/controller/controller.go

Lines changed: 3 additions & 2 deletions
@@ -48,8 +48,9 @@ type Controller struct {
     nodesInformer cache.SharedIndexInformer
     podCh         chan spec.PodEvent
 
-    clusterEventQueues  []*cache.FIFO // [workerID]Queue
-    lastClusterSyncTime int64
+    clusterEventQueues    []*cache.FIFO // [workerID]Queue
+    lastClusterSyncTime   int64
+    lastClusterRepairTime int64
 
     workerLogs map[uint32]ringlog.RingLogger
 

pkg/controller/operator_config.go

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *config.OperatorConfigur
     result.MinInstances = fromCRD.MinInstances
     result.MaxInstances = fromCRD.MaxInstances
     result.ResyncPeriod = time.Duration(fromCRD.ResyncPeriod)
+    result.RepairPeriod = time.Duration(fromCRD.RepairPeriod)
     result.Sidecars = fromCRD.Sidecars
 
     result.SuperUsername = fromCRD.PostgresUsersConfiguration.SuperUsername

pkg/controller/postgresql.go

Lines changed: 56 additions & 14 deletions
@@ -42,9 +42,14 @@ func (c *Controller) clusterResync(stopCh <-chan struct{}, wg *sync.WaitGroup) {
 
 // TODO: make a separate function to be called from InitSharedInformers
 // clusterListFunc obtains a list of all PostgreSQL clusters and runs sync when necessary
+// NB: as this function is called directly by the informer, it needs to avoid acquiring locks
+// on individual cluster structures. Therefore, it acts on the manifests obtained from Kubernetes
+// and not on the internal state of the clusters.
 func (c *Controller) clusterListFunc(options metav1.ListOptions) (runtime.Object, error) {
-    var list spec.PostgresqlList
-    var activeClustersCnt, failedClustersCnt int
+    var (
+        list  spec.PostgresqlList
+        event spec.EventType
+    )
 
     req := c.KubeClient.CRDREST.
         Get().
@@ -61,19 +66,41 @@ func (c *Controller) clusterListFunc(options metav1.ListOptions) (runtime.Object
         c.logger.Warningf("could not unmarshal list of clusters: %v", err)
     }
 
-    timeFromPreviousSync := time.Now().Unix() - atomic.LoadInt64(&c.lastClusterSyncTime)
-    if timeFromPreviousSync < int64(c.opConfig.ResyncPeriod.Seconds()) {
-        c.logger.Infof("not running SYNC, previous sync happened %d seconds ago", timeFromPreviousSync)
-        return &list, err
+    currentTime := time.Now().Unix()
+    timeFromPreviousSync := currentTime - atomic.LoadInt64(&c.lastClusterSyncTime)
+    timeFromPreviousRepair := currentTime - atomic.LoadInt64(&c.lastClusterRepairTime)
+    if timeFromPreviousSync >= int64(c.opConfig.ResyncPeriod.Seconds()) {
+        event = spec.EventSync
+    } else if timeFromPreviousRepair >= int64(c.opConfig.RepairPeriod.Seconds()) {
+        event = spec.EventRepair
     }
+    if event != "" {
+        c.queueEvents(&list, event)
+    } else {
+        c.logger.Infof("not enough time passed since the last sync (%s seconds) or repair (%s seconds)",
+            timeFromPreviousSync, timeFromPreviousRepair)
+    }
+    return &list, err
+}
 
+// queueEvents queues a sync or repair event for every cluster with a valid manifest
+func (c *Controller) queueEvents(list *spec.PostgresqlList, event spec.EventType) {
+    var activeClustersCnt, failedClustersCnt, clustersToRepair int
     for i, pg := range list.Items {
         if pg.Error != nil {
             failedClustersCnt++
             continue
         }
-        c.queueClusterEvent(nil, &list.Items[i], spec.EventSync)
         activeClustersCnt++
+        // check if that cluster needs repair
+        if event == spec.EventRepair {
+            if pg.Status.Success() {
+                continue
+            } else {
+                clustersToRepair++
+            }
+        }
+        c.queueClusterEvent(nil, &list.Items[i], event)
     }
     if len(list.Items) > 0 {
         if failedClustersCnt > 0 && activeClustersCnt == 0 {
@@ -83,13 +110,18 @@ func (c *Controller) clusterListFunc(options metav1.ListOptions) (runtime.Object
         } else {
             c.logger.Infof("there are %d clusters running and %d are in the failed state", activeClustersCnt, failedClustersCnt)
         }
+        if clustersToRepair > 0 {
+            c.logger.Infof("%d clusters are scheduled for a repair scan", clustersToRepair)
+        }
     } else {
         c.logger.Infof("no clusters running")
     }
-
-    atomic.StoreInt64(&c.lastClusterSyncTime, time.Now().Unix())
-
-    return &list, err
+    if event == spec.EventRepair || event == spec.EventSync {
+        atomic.StoreInt64(&c.lastClusterRepairTime, time.Now().Unix())
+        if event == spec.EventSync {
+            atomic.StoreInt64(&c.lastClusterSyncTime, time.Now().Unix())
+        }
+    }
 }
 
 type crdDecoder struct {
@@ -155,7 +187,7 @@ func (c *Controller) processEvent(event spec.ClusterEvent) {
 
     lg := c.logger.WithField("worker", event.WorkerID)
 
-    if event.EventType == spec.EventAdd || event.EventType == spec.EventSync {
+    if event.EventType == spec.EventAdd || event.EventType == spec.EventSync || event.EventType == spec.EventRepair {
         clusterName = util.NameFromMeta(event.NewSpec.ObjectMeta)
     } else {
         clusterName = util.NameFromMeta(event.OldSpec.ObjectMeta)
@@ -171,6 +203,16 @@ func (c *Controller) processEvent(event spec.ClusterEvent) {
 
     defer c.curWorkerCluster.Store(event.WorkerID, nil)
 
+    if event.EventType == spec.EventRepair {
+        runRepair, lastOperationStatus := cl.NeedsRepair()
+        if !runRepair {
+            lg.Debugf("Observed cluster status %s, repair is not required", lastOperationStatus)
+            return
+        }
+        lg.Debugf("Observed cluster status %s, running sync scan to repair the cluster", lastOperationStatus)
+        event.EventType = spec.EventSync
+    }
+
     if event.EventType == spec.EventAdd || event.EventType == spec.EventUpdate || event.EventType == spec.EventSync {
         // handle deprecated parameters by possibly assigning their values to the new ones.
         if event.OldSpec != nil {
@@ -406,8 +448,8 @@ func (c *Controller) queueClusterEvent(informerOldSpec, informerNewSpec *spec.Po
     if eventType != spec.EventDelete {
         return
     }
-
-    for _, evType := range []spec.EventType{spec.EventAdd, spec.EventSync, spec.EventUpdate} {
+    // A delete event discards all prior requests for that cluster.
+    for _, evType := range []spec.EventType{spec.EventAdd, spec.EventSync, spec.EventUpdate, spec.EventRepair} {
         obj, exists, err := c.clusterEventQueues[workerID].GetByKey(queueClusterKey(evType, uid))
         if err != nil {
             lg.Warningf("could not get event from the queue: %v", err)
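
The timestamp handling at the end of queueEvents is what the commit message's "at least 2 times more frequent" requirement refers to: a sync stamps both lastClusterSyncTime and lastClusterRepairTime, while a repair stamps only the repair timestamp, so a repair_period equal to or longer than resync_period would never become due. Below is a minimal standalone simulation of that bookkeeping under the default 30m/5m settings; the one-minute tick standing in for the informer's relist is an assumption for illustration, not a value taken from the operator.

package main

import (
	"fmt"
	"time"
)

// Minimal timeline sketch of the timestamps maintained by queueEvents:
// a sync stamps both timers, a repair stamps only the repair timer.
func main() {
	resyncPeriod := 30 * time.Minute
	repairPeriod := 5 * time.Minute

	var lastSync, lastRepair time.Duration // time of the last sync/repair, relative to start

	for t := time.Duration(0); t <= 60*time.Minute; t += time.Minute {
		switch {
		case t-lastSync >= resyncPeriod:
			lastSync, lastRepair = t, t // a sync counts as a repair as well
			fmt.Printf("%v: SYNC scan for every cluster\n", t)
		case t-lastRepair >= repairPeriod:
			lastRepair = t
			fmt.Printf("%v: REPAIR scan for failing clusters only\n", t)
		}
	}
}

Running this prints a repair roughly every five minutes between full syncs at the thirty-minute marks, which is the behaviour the documentation section above describes.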

pkg/spec/postgresql.go

Lines changed: 10 additions & 0 deletions
@@ -335,3 +335,13 @@ func (pl *PostgresqlList) UnmarshalJSON(data []byte) error {
 
     return nil
 }
+
+func (status PostgresStatus) Success() bool {
+    return status != ClusterStatusAddFailed &&
+        status != ClusterStatusUpdateFailed &&
+        status != ClusterStatusSyncFailed
+}
+
+func (status PostgresStatus) String() string {
+    return string(status)
+}
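
Success() is the predicate that NeedsRepair() negates: only the three explicit failure statuses mark a cluster as needing repair, and any other status, including an empty initial one, causes a queued repair event to be dropped by the worker. The self-contained restatement below can be run in isolation to check that behaviour; only the constant names come from pkg/spec, the concrete status strings are assumptions.

package main

import "fmt"

// Standalone restatement of the PostgresStatus helpers added in this commit.
// The string values assigned to the constants are assumptions for this sketch.
type PostgresStatus string

const (
	ClusterStatusAddFailed    PostgresStatus = "CreateFailed"
	ClusterStatusUpdateFailed PostgresStatus = "UpdateFailed"
	ClusterStatusSyncFailed   PostgresStatus = "SyncFailed"
)

func (status PostgresStatus) Success() bool {
	return status != ClusterStatusAddFailed &&
		status != ClusterStatusUpdateFailed &&
		status != ClusterStatusSyncFailed
}

func main() {
	// Only the failure statuses report !Success(); everything else, including
	// an empty initial status, is treated as healthy, so a queued repair event
	// for such a cluster is simply discarded by the worker.
	for _, s := range []PostgresStatus{"Running", "", ClusterStatusSyncFailed} {
		fmt.Printf("%q -> success=%v\n", string(s), s.Success())
	}
}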

pkg/spec/types.go

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ const (
     EventUpdate EventType = "UPDATE"
     EventDelete EventType = "DELETE"
     EventSync   EventType = "SYNC"
+    EventRepair EventType = "REPAIR"
 
     fileWithNamespace = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
 )

pkg/util/config/config.go

Lines changed: 2 additions & 1 deletion
@@ -14,7 +14,8 @@ import (
 type CRD struct {
     ReadyWaitInterval time.Duration `name:"ready_wait_interval" default:"4s"`
     ReadyWaitTimeout  time.Duration `name:"ready_wait_timeout" default:"30s"`
-    ResyncPeriod      time.Duration `name:"resync_period" default:"5m"`
+    ResyncPeriod      time.Duration `name:"resync_period" default:"30m"`
+    RepairPeriod      time.Duration `name:"repair_period" default:"5m"`
 }
 
 // Resources describes kubernetes resource specific configuration parameters
