@@ -20,23 +20,24 @@ import (
 	"context"
 	"fmt"
 	"net"
-	"sync"
+	"time"
 
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/klog/v2"
 	netutils "k8s.io/utils/net"
 
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/types"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
-	"k8s.io/apimachinery/pkg/util/sets"
 	informers "k8s.io/client-go/informers/core/v1"
 	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/kubernetes/scheme"
 	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
 	corelisters "k8s.io/client-go/listers/core/v1"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/client-go/tools/record"
+	"k8s.io/client-go/util/workqueue"
 	nodeutil "k8s.io/component-helpers/node/util"
 	"k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cidrset"
 	controllerutil "k8s.io/kubernetes/pkg/controller/util/node"
@@ -52,14 +53,12 @@ type rangeAllocator struct {
 	nodeLister corelisters.NodeLister
 	// nodesSynced returns true if the node shared informer has been synced at least once.
 	nodesSynced cache.InformerSynced
-	// Channel that is used to pass updating Nodes and their reserved CIDRs to the background
-	// This increases a throughput of CIDR assignment by not blocking on long operations.
-	nodeCIDRUpdateChannel chan nodeReservedCIDRs
-	broadcaster           record.EventBroadcaster
-	recorder              record.EventRecorder
-	// Keep a set of nodes that are currently being processed to avoid races in CIDR allocation
-	lock              sync.Mutex
-	nodesInProcessing sets.String
+	broadcaster record.EventBroadcaster
+	recorder    record.EventRecorder
+
+	// queue is where incoming work is placed to de-dup and to allow "easy"
+	// rate limited requeues on errors
+	queue workqueue.RateLimitingInterface
 }
 
 // NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDRs for node (one from each of clusterCIDRs)
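The new `queue` field is the only coordination point left in the struct: a rate-limiting workqueue de-duplicates keys that are added while already pending and supports backoff-based requeues, which is what replaces the old update channel and the mutex-protected `nodesInProcessing` set. A minimal standalone sketch of those two properties, assuming only the client-go workqueue package (the "demo" name and "node-a" key are illustrative, not part of this change):

package main

import (
	"fmt"

	"k8s.io/client-go/util/workqueue"
)

func main() {
	// Same constructor the allocator uses: per-item exponential backoff
	// combined with an overall rate limiter.
	q := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "demo")

	// Adding the same key twice before a worker picks it up coalesces it.
	q.Add("node-a")
	q.Add("node-a")
	fmt.Println(q.Len()) // prints 1

	key, _ := q.Get()
	// Simulate a transient failure: requeue with backoff instead of Add.
	q.AddRateLimited(key)
	q.Done(key)

	// On success a worker would call Forget to reset the per-item backoff.
	q.Forget(key)
	q.ShutDown()
}

Because a given key is never handed to two workers at once, concurrent workers cannot race on the same Node, which is why the explicit lock and set can be dropped.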
@@ -89,15 +88,14 @@ func NewCIDRRangeAllocator(ctx context.Context, client clientset.Interface, node
 	}
 
 	ra := &rangeAllocator{
-		client:                client,
-		clusterCIDRs:          allocatorParams.ClusterCIDRs,
-		cidrSets:              cidrSets,
-		nodeLister:            nodeInformer.Lister(),
-		nodesSynced:           nodeInformer.Informer().HasSynced,
-		nodeCIDRUpdateChannel: make(chan nodeReservedCIDRs, cidrUpdateQueueSize),
-		broadcaster:           eventBroadcaster,
-		recorder:              recorder,
-		nodesInProcessing:     sets.NewString(),
+		client:       client,
+		clusterCIDRs: allocatorParams.ClusterCIDRs,
+		cidrSets:     cidrSets,
+		nodeLister:   nodeInformer.Lister(),
+		nodesSynced:  nodeInformer.Informer().HasSynced,
+		broadcaster:  eventBroadcaster,
+		recorder:     recorder,
+		queue:        workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cidrallocator_node"),
 	}
 
 	if allocatorParams.ServiceCIDR != nil {
@@ -130,37 +128,33 @@ func NewCIDRRangeAllocator(ctx context.Context, client clientset.Interface, node
 	}
 
 	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
-		AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error {
-			return ra.AllocateOrOccupyCIDR(logger, node)
-		}),
-		UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
-			// If the PodCIDRs list is not empty we either:
-			// - already processed a Node that already had CIDRs after NC restarted
-			//   (cidr is marked as used),
-			// - already processed a Node successfully and allocated CIDRs for it
-			//   (cidr is marked as used),
-			// - already processed a Node but we did saw a "timeout" response and
-			//   request eventually got through in this case we haven't released
-			//   the allocated CIDRs (cidr is still marked as used).
-			// There's a possible error here:
-			// - NC sees a new Node and assigns CIDRs X,Y.. to it,
-			// - Update Node call fails with a timeout,
-			// - Node is updated by some other component, NC sees an update and
-			//   assigns CIDRs A,B.. to the Node,
-			// - Both CIDR X,Y.. and CIDR A,B.. are marked as used in the local cache,
-			//   even though Node sees only CIDR A,B..
-			// The problem here is that in in-memory cache we see CIDR X,Y.. as marked,
-			// which prevents it from being assigned to any new node. The cluster
-			// state is correct.
-			// Restart of NC fixes the issue.
-			if len(newNode.Spec.PodCIDRs) == 0 {
-				return ra.AllocateOrOccupyCIDR(logger, newNode)
+		AddFunc: func(obj interface{}) {
+			key, err := cache.MetaNamespaceKeyFunc(obj)
+			if err == nil {
+				ra.queue.Add(key)
 			}
-			return nil
-		}),
-		DeleteFunc: controllerutil.CreateDeleteNodeHandler(logger, func(node *v1.Node) error {
-			return ra.ReleaseCIDR(logger, node)
-		}),
+		},
+		UpdateFunc: func(old, new interface{}) {
+			key, err := cache.MetaNamespaceKeyFunc(new)
+			if err == nil {
+				ra.queue.Add(key)
+			}
+		},
+		DeleteFunc: func(obj interface{}) {
+			// The informer cache no longer has the object, and since Node doesn't have a finalizer,
+			// we don't see the Update with DeletionTimestamp != 0.
+			// TODO: instead of executing the operation directly in the handler, build a small cache with key node.Name
+			// and value PodCIDRs, and use ReleaseCIDR in the reconcile loop so we can retry on `ReleaseCIDR` failures.
+			if err := ra.ReleaseCIDR(logger, obj.(*v1.Node)); err != nil {
+				utilruntime.HandleError(fmt.Errorf("error while processing CIDR Release: %w", err))
+			}
+			// IndexerInformer uses a delta nodeQueue, therefore for deletes we have to use this
+			// key function.
+			key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
+			if err == nil {
+				ra.queue.Add(key)
+			}
+		},
 	})
 
 	return ra, nil
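The handlers above only translate informer events into queue keys; the allocation work itself moves to the sync loop. A small sketch of what the two key functions used here return for a Node, assuming only standard client-go helpers (the "node-a" object is made up for illustration):

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/cache"
)

func main() {
	node := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-a"}}

	// Nodes are cluster-scoped, so the key is simply "node-a" (no namespace part).
	key, err := cache.MetaNamespaceKeyFunc(node)
	fmt.Println(key, err)

	// On delete the handler may receive a tombstone rather than the object itself;
	// DeletionHandlingMetaNamespaceKeyFunc unwraps it before computing the key.
	tombstone := cache.DeletedFinalStateUnknown{Key: "node-a", Obj: node}
	key, err = cache.DeletionHandlingMetaNamespaceKeyFunc(tombstone)
	fmt.Println(key, err)
}

The resulting key is just the node name, which is what `syncNode` later passes straight to the node lister.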
@@ -176,6 +170,8 @@ func (r *rangeAllocator) Run(ctx context.Context) {
 	r.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: r.client.CoreV1().Events("")})
 	defer r.broadcaster.Shutdown()
 
+	defer r.queue.ShutDown()
+
 	logger.Info("Starting range CIDR allocator")
 	defer logger.Info("Shutting down range CIDR allocator")
 
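Deferring `r.queue.ShutDown()` in `Run` is what lets the workers exit cleanly: once the queue is shut down, `Get` reports shutdown, `processNextNodeWorkItem` returns false, and `runWorker` stops. A rough, self-contained sketch of that lifecycle using the same primitives (the "demo" queue and toy worker are illustrative only):

package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/util/workqueue"
)

func main() {
	q := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "demo")
	ctx, cancel := context.WithCancel(context.Background())

	worker := func(ctx context.Context) {
		for {
			item, shutdown := q.Get()
			if shutdown {
				return // queue was shut down; stop processing
			}
			fmt.Println("processed", item)
			q.Done(item)
		}
	}

	// Same start pattern as Run: restart the worker every second
	// if it returns while the context is still alive.
	go wait.UntilWithContext(ctx, worker, time.Second)

	q.Add("node-a")
	time.Sleep(100 * time.Millisecond)

	// Mirrors Run's deferred cleanup: stop the queue, then the context.
	q.ShutDown()
	cancel()
}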
@@ -184,50 +180,100 @@ func (r *rangeAllocator) Run(ctx context.Context) {
 	}
 
 	for i := 0; i < cidrUpdateWorkers; i++ {
-		go r.worker(ctx)
+		go wait.UntilWithContext(ctx, r.runWorker, time.Second)
 	}
 
 	<-ctx.Done()
 }
 
-func (r *rangeAllocator) worker(ctx context.Context) {
-	logger := klog.FromContext(ctx)
-	for {
-		select {
-		case workItem, ok := <-r.nodeCIDRUpdateChannel:
-			if !ok {
-				logger.Info("Channel nodeCIDRUpdateChannel was unexpectedly closed")
-				return
-			}
-			if err := r.updateCIDRsAllocation(logger, workItem); err != nil {
-				// Requeue the failed node for update again.
-				r.nodeCIDRUpdateChannel <- workItem
-			}
-		case <-ctx.Done():
-			return
-		}
+// runWorker is a long-running function that will continually call the
+// processNextWorkItem function in order to read and process a message on the
+// queue.
+func (r *rangeAllocator) runWorker(ctx context.Context) {
+	for r.processNextNodeWorkItem(ctx) {
 	}
 }
 
-func (r *rangeAllocator) insertNodeToProcessing(nodeName string) bool {
-	r.lock.Lock()
-	defer r.lock.Unlock()
-	if r.nodesInProcessing.Has(nodeName) {
+// processNextWorkItem will read a single work item off the queue and
+// attempt to process it, by calling the syncHandler.
+func (r *rangeAllocator) processNextNodeWorkItem(ctx context.Context) bool {
+	obj, shutdown := r.queue.Get()
+	if shutdown {
 		return false
 	}
-	r.nodesInProcessing.Insert(nodeName)
+
+	// We wrap this block in a func so we can defer r.queue.Done.
+	err := func(logger klog.Logger, obj interface{}) error {
+		// We call Done here so the workNodeQueue knows we have finished
+		// processing this item. We also must remember to call Forget if we
+		// do not want this work item being re-queued. For example, we do
+		// not call Forget if a transient error occurs, instead the item is
+		// put back on the queue and attempted again after a back-off
+		// period.
+		defer r.queue.Done(obj)
+		var key string
+		var ok bool
+		// We expect strings to come off the workNodeQueue. These are of the
+		// form namespace/name. We do this as the delayed nature of the
+		// workNodeQueue means the items in the informer cache may actually be
+		// more up to date than when the item was initially put onto the
+		// workNodeQueue.
+		if key, ok = obj.(string); !ok {
+			// As the item in the workNodeQueue is actually invalid, we call
+			// Forget here else we'd go into a loop of attempting to
+			// process a work item that is invalid.
+			r.queue.Forget(obj)
+			utilruntime.HandleError(fmt.Errorf("expected string in workNodeQueue but got %#v", obj))
+			return nil
+		}
+		// Run the syncHandler, passing it the key of the
+		// Node resource to be synced.
+		if err := r.syncNode(logger, key); err != nil {
+			// Put the item back on the queue to handle any transient errors.
+			r.queue.AddRateLimited(key)
+			return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error())
+		}
+		// Finally, if no error occurs we Forget this item so it does not
+		// get queued again until another change happens.
+		r.queue.Forget(obj)
+		logger.Info("Successfully synced", "key", key)
+		return nil
+	}(klog.FromContext(ctx), obj)
+
+	if err != nil {
+		utilruntime.HandleError(err)
+		return true
+	}
+
 	return true
 }
 
-func (r *rangeAllocator) removeNodeFromProcessing(nodeName string) {
-	r.lock.Lock()
-	defer r.lock.Unlock()
-	r.nodesInProcessing.Delete(nodeName)
+func (r *rangeAllocator) syncNode(logger klog.Logger, key string) error {
+	startTime := time.Now()
+	defer func() {
+		logger.V(4).Info("Finished syncing Node request", "node", key, "elapsed", time.Since(startTime))
+	}()
+
+	node, err := r.nodeLister.Get(key)
+	if apierrors.IsNotFound(err) {
+		logger.V(3).Info("node has been deleted", "node", key)
+		// TODO: obtain the node object information to call ReleaseCIDR from here
+		// and retry if there is an error.
+		return nil
+	}
+	if err != nil {
+		return err
+	}
+	// Check the DeletionTimestamp to determine if object is under deletion.
+	if !node.DeletionTimestamp.IsZero() {
+		logger.V(3).Info("node is being deleted", "node", key)
+		return r.ReleaseCIDR(logger, node)
+	}
+	return r.AllocateOrOccupyCIDR(logger, node)
 }
 
 // marks node.PodCIDRs[...] as used in allocator's tracked cidrSet
 func (r *rangeAllocator) occupyCIDRs(node *v1.Node) error {
-	defer r.removeNodeFromProcessing(node.Name)
 	if len(node.Spec.PodCIDRs) == 0 {
 		return nil
 	}
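The Forget/AddRateLimited pairing described in the comments above is what gives failing Nodes exponential backoff instead of the old immediate re-send on the channel. A sketch of that retry behaviour in isolation, with an invented `maxRetries` cap and an always-failing handler purely for illustration (the allocator itself keeps retrying without a cap):

package main

import (
	"errors"
	"fmt"

	"k8s.io/client-go/util/workqueue"
)

const maxRetries = 3 // illustrative cap, not something the allocator enforces

func main() {
	q := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "demo")

	// A sync handler that always fails, to exercise the requeue path.
	syncOnce := func(key string) error { return errors.New("transient error") }

	q.Add("node-a")
	for i := 0; i < maxRetries+1; i++ {
		item, shutdown := q.Get()
		if shutdown {
			return
		}
		key := item.(string)
		switch err := syncOnce(key); {
		case err == nil:
			// Success: drop the per-item backoff history.
			q.Forget(item)
		case q.NumRequeues(item) < maxRetries:
			// Transient failure: requeue with exponential backoff.
			fmt.Println("requeue", key, "attempt", q.NumRequeues(item)+1)
			q.AddRateLimited(item)
		default:
			// Too many failures: give up on this key.
			fmt.Println("dropping", key)
			q.Forget(item)
		}
		q.Done(item)
	}
}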
@@ -257,34 +303,25 @@ func (r *rangeAllocator) AllocateOrOccupyCIDR(logger klog.Logger, node *v1.Node)
 	if node == nil {
 		return nil
 	}
-	if !r.insertNodeToProcessing(node.Name) {
-		logger.V(2).Info("Node is already in a process of CIDR assignment", "node", klog.KObj(node))
-		return nil
-	}
 
 	if len(node.Spec.PodCIDRs) > 0 {
 		return r.occupyCIDRs(node)
 	}
-	// allocate and queue the assignment
-	allocated := nodeReservedCIDRs{
-		nodeName:       node.Name,
-		allocatedCIDRs: make([]*net.IPNet, len(r.cidrSets)),
-	}
+
+	allocatedCIDRs := make([]*net.IPNet, len(r.cidrSets))
 
 	for idx := range r.cidrSets {
 		podCIDR, err := r.cidrSets[idx].AllocateNext()
 		if err != nil {
-			r.removeNodeFromProcessing(node.Name)
 			controllerutil.RecordNodeStatusChange(logger, r.recorder, node, "CIDRNotAvailable")
 			return fmt.Errorf("failed to allocate cidr from cluster cidr at idx:%v: %v", idx, err)
 		}
-		allocated.allocatedCIDRs[idx] = podCIDR
+		allocatedCIDRs[idx] = podCIDR
 	}
 
 	//queue the assignment
-	logger.V(4).Info("Putting node with CIDR into the work queue", "node", klog.KObj(node), "CIDRs", allocated.allocatedCIDRs)
-	r.nodeCIDRUpdateChannel <- allocated
-	return nil
+	logger.V(4).Info("Putting node with CIDR into the work queue", "node", klog.KObj(node), "CIDRs", allocatedCIDRs)
+	return r.updateCIDRsAllocation(logger, node.Name, allocatedCIDRs)
 }
 
 // ReleaseCIDR marks node.podCIDRs[...] as unused in our tracked cidrSets
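With the channel gone, `AllocateOrOccupyCIDR` calls `updateCIDRsAllocation` synchronously, so a CIDR that cannot be written back to the API server is released (or retried through the queue) instead of being parked on a channel. For context, a rough sketch of the underlying `cidrset` calls, assuming the package's `NewCIDRSet`/`AllocateNext`/`Release` API as it is used in this file (the 10.0.0.0/16 cluster CIDR and /24 node mask are illustrative values):

package main

import (
	"fmt"
	"net"

	"k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cidrset"
)

func main() {
	// Carve a /16 cluster CIDR into /24 pod CIDRs, one per node.
	_, clusterCIDR, _ := net.ParseCIDR("10.0.0.0/16")
	set, err := cidrset.NewCIDRSet(clusterCIDR, 24)
	if err != nil {
		panic(err)
	}

	// AllocateNext hands out the next free pod CIDR; AllocateOrOccupyCIDR
	// calls this once per configured cluster CIDR.
	podCIDR, err := set.AllocateNext()
	fmt.Println(podCIDR, err)

	// If the node update ultimately fails, the CIDR goes back into the pool.
	if err := set.Release(podCIDR); err != nil {
		fmt.Println("release failed:", err)
	}
}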
@@ -336,21 +373,20 @@ func (r *rangeAllocator) filterOutServiceRange(logger klog.Logger, serviceCIDR *
 }
 
 // updateCIDRsAllocation assigns CIDR to Node and sends an update to the API server.
-func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeReservedCIDRs) error {
+func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, nodeName string, allocatedCIDRs []*net.IPNet) error {
 	var err error
 	var node *v1.Node
-	defer r.removeNodeFromProcessing(data.nodeName)
-	cidrsString := ipnetToStringList(data.allocatedCIDRs)
-	node, err = r.nodeLister.Get(data.nodeName)
+	cidrsString := ipnetToStringList(allocatedCIDRs)
+	node, err = r.nodeLister.Get(nodeName)
 	if err != nil {
-		logger.Error(err, "Failed while getting node for updating Node.Spec.PodCIDRs", "node", klog.KRef("", data.nodeName))
+		logger.Error(err, "Failed while getting node for updating Node.Spec.PodCIDRs", "node", klog.KRef("", nodeName))
 		return err
 	}
 
 	// if cidr list matches the proposed.
 	// then we possibly updated this node
 	// and just failed to ack the success.
-	if len(node.Spec.PodCIDRs) == len(data.allocatedCIDRs) {
+	if len(node.Spec.PodCIDRs) == len(allocatedCIDRs) {
 		match := true
 		for idx, cidr := range cidrsString {
 			if node.Spec.PodCIDRs[idx] != cidr {
@@ -359,15 +395,15 @@ func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeRese
 			}
 		}
 		if match {
-			logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one", "node", klog.KObj(node), "CIDRs", data.allocatedCIDRs)
+			logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one", "node", klog.KObj(node), "CIDRs", allocatedCIDRs)
 			return nil
 		}
 	}
 
 	// node has cidrs, release the reserved
 	if len(node.Spec.PodCIDRs) != 0 {
 		logger.Error(nil, "Node already has a CIDR allocated. Releasing the new one", "node", klog.KObj(node), "podCIDRs", node.Spec.PodCIDRs)
-		for idx, cidr := range data.allocatedCIDRs {
+		for idx, cidr := range allocatedCIDRs {
 			if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil {
 				logger.Error(releaseErr, "Error when releasing CIDR", "index", idx, "CIDR", cidr)
 			}
@@ -390,7 +426,7 @@ func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeRese
 		// NodeController restart will return all falsely allocated CIDRs to the pool.
 		if !apierrors.IsServerTimeout(err) {
 			logger.Error(err, "CIDR assignment for node failed. Releasing allocated CIDR", "node", klog.KObj(node))
-			for idx, cidr := range data.allocatedCIDRs {
+			for idx, cidr := range allocatedCIDRs {
 				if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil {
 					logger.Error(releaseErr, "Error releasing allocated CIDR for node", "node", klog.KObj(node))
 				}