Skip to content

Commit 26a07e2

Browse files
zmbergfurykerry
authored and committed
fix cloneSet controller block caused by scale expectation leakage
Signed-off-by: liheng.zms <[email protected]>
1 parent fa139cb commit 26a07e2

File tree

2 files changed

+48
-6
lines changed

2 files changed

+48
-6
lines changed

pkg/controller/cloneset/cloneset_controller.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ import (
4141
"github.com/openkruise/kruise/pkg/util/refmanager"
4242
"github.com/openkruise/kruise/pkg/util/volumeclaimtemplate"
4343

44+
"github.com/prometheus/client_golang/prometheus"
4445
apps "k8s.io/api/apps/v1"
4546
v1 "k8s.io/api/core/v1"
4647
"k8s.io/apimachinery/pkg/api/errors"
@@ -59,13 +60,16 @@ import (
5960
"sigs.k8s.io/controller-runtime/pkg/event"
6061
"sigs.k8s.io/controller-runtime/pkg/handler"
6162
"sigs.k8s.io/controller-runtime/pkg/manager"
63+
"sigs.k8s.io/controller-runtime/pkg/metrics"
6264
"sigs.k8s.io/controller-runtime/pkg/predicate"
6365
"sigs.k8s.io/controller-runtime/pkg/reconcile"
6466
"sigs.k8s.io/controller-runtime/pkg/source"
6567
)
6668

6769
func init() {
6870
flag.IntVar(&concurrentReconciles, "cloneset-workers", concurrentReconciles, "Max concurrent workers for CloneSet controller.")
71+
// Register the CloneSet scale-expectation leakage counter with the controller-runtime metrics registry.
72+
metrics.Registry.MustRegister(CloneSetScaleExpectationLeakageMetrics)
6973
}
7074

7175
var (
@@ -75,6 +79,16 @@ var (
7579
minimumReplicasToPreDownloadImage int32 = 3
7680
)
7781

82+
var (
83+
CloneSetScaleExpectationLeakageMetrics = prometheus.NewCounterVec(
84+
prometheus.CounterOpts{
85+
Name: "cloneset_scale_expectation_leakage",
86+
Help: "CloneSet Scale Expectation Leakage Metrics",
87+
// Label values: the CloneSet's namespace and name.
88+
}, []string{"namespace", "name"},
89+
)
90+
)
91+
7892
// Add creates a new CloneSet Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller
7993
// and Start it when the Manager is Started.
8094
func Add(mgr manager.Manager) error {
@@ -229,7 +243,31 @@ func (r *ReconcileCloneSet) doReconcile(request reconcile.Request) (res reconcil
229243
// If scaling expectations have not satisfied yet, just skip this reconcile.
230244
if scaleSatisfied, unsatisfiedDuration, scaleDirtyPods := clonesetutils.ScaleExpectations.SatisfiedExpectations(request.String()); !scaleSatisfied {
231245
if unsatisfiedDuration >= expectations.ExpectationTimeout {
246+
// In some extreme scenarios, if the Pod is created and then quickly deleted, there may be event loss.
247+
// Therefore, a timeout mechanism is needed to ensure that CloneSets can continue to work.
232248
klog.InfoS("Expectation unsatisfied overtime", "cloneSet", request, "scaleDirtyPods", scaleDirtyPods, "overTime", unsatisfiedDuration)
249+
CloneSetScaleExpectationLeakageMetrics.WithLabelValues(request.Namespace, request.Name).Add(1)
250+
// TODO: check the existence of resource in apiserver using client-go directly
251+
/*for _, pods := range scaleDirtyPods {
252+
for _, name := range pods {
253+
_, err = kubeClient.GetGenericClient().KubeClient.CoreV1().Pods(request.Namespace).Get(context.TODO(), name, metav1.GetOptions{})
254+
if err == nil {
255+
klog.Warningf("CloneSet(%s/%s) ScaleExpectations leakage, but Pod(%s) already exist", request.Namespace, request.Name, name)
256+
return reconcile.Result{RequeueAfter: 30 * time.Second}, nil
257+
} else if !errors.IsNotFound(err) {
258+
klog.ErrorS(err, "Failed to get Pod", "cloneSet", request, "pod", name)
259+
return reconcile.Result{RequeueAfter: 3 * time.Second}, nil
260+
}
261+
}
262+
}
263+
klog.InfoS("CloneSet ScaleExpectation DirtyPods no longer exists, and delete ScaleExpectation", "cloneSet", request)*/
264+
if utilfeature.DefaultFeatureGate.Enabled(features.ForceDeleteTimeoutExpectationFeatureGate) {
265+
klog.InfoS("Expectation unsatisfied overtime, and force delete the timeout Expectation", "cloneSet", request, "scaleDirtyPods", scaleDirtyPods, "overTime", unsatisfiedDuration)
266+
clonesetutils.ScaleExpectations.DeleteExpectations(request.String())
267+
// After force-deleting the timed-out scale expectation, requeue explicitly:
268+
// otherwise, if no subsequent Pod or CloneSet event arrives, the CloneSet would not be reconciled again.
269+
return reconcile.Result{RequeueAfter: 10 * time.Second}, nil
270+
}
233271
return reconcile.Result{}, nil
234272
}
235273
klog.V(4).InfoS("Not satisfied scale", "cloneSet", request, "scaleDirtyPods", scaleDirtyPods)

pkg/features/kruise_features.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ const (
131131

132132
// Enables policies auto resizing PVCs created by a StatefulSet when user expands volumeClaimTemplates.
133133
StatefulSetAutoResizePVCGate featuregate.Feature = "StatefulSetAutoResizePVCGate"
134+
135+
// ForceDeleteTimeoutExpectationFeatureGate enables force-deleting expectations that have timed out, for example the CloneSet ScaleExpectation.
136+
ForceDeleteTimeoutExpectationFeatureGate featuregate.Feature = "ForceDeleteTimeoutExpectationGate" // explicitly typed, like the other gates in this const block
134137
)
135138

136139
var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
@@ -160,12 +163,13 @@ var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
160163
ResourceDistributionGate: {Default: false, PreRelease: featuregate.Alpha},
161164
DeletionProtectionForCRDCascadingGate: {Default: false, PreRelease: featuregate.Alpha},
162165

163-
EnhancedLivenessProbeGate: {Default: false, PreRelease: featuregate.Alpha},
164-
RecreatePodWhenChangeVCTInCloneSetGate: {Default: false, PreRelease: featuregate.Alpha},
165-
StatefulSetStartOrdinal: {Default: false, PreRelease: featuregate.Alpha},
166-
PodIndexLabel: {Default: true, PreRelease: featuregate.Beta},
167-
EnableExternalCerts: {Default: false, PreRelease: featuregate.Alpha},
168-
StatefulSetAutoResizePVCGate: {Default: false, PreRelease: featuregate.Alpha},
166+
EnhancedLivenessProbeGate: {Default: false, PreRelease: featuregate.Alpha},
167+
RecreatePodWhenChangeVCTInCloneSetGate: {Default: false, PreRelease: featuregate.Alpha},
168+
StatefulSetStartOrdinal: {Default: false, PreRelease: featuregate.Alpha},
169+
PodIndexLabel: {Default: true, PreRelease: featuregate.Beta},
170+
EnableExternalCerts: {Default: false, PreRelease: featuregate.Alpha},
171+
StatefulSetAutoResizePVCGate: {Default: false, PreRelease: featuregate.Alpha},
172+
ForceDeleteTimeoutExpectationFeatureGate: {Default: false, PreRelease: featuregate.Alpha},
169173
}
170174

171175
func init() {

0 commit comments

Comments
 (0)