@@ -24,6 +24,7 @@ import (
 	"time"
 
 	"github.com/milvus-io/milvus/internal/log"
+	"github.com/milvus-io/milvus/internal/proto/commonpb"
 	"github.com/milvus-io/milvus/internal/proto/datapb"
 	"github.com/milvus-io/milvus/internal/util/tsoutil"
 	"go.uber.org/zap"
@@ -32,22 +33,19 @@ import (
 // TODO: this limit should be determined by the DataNode's resources; for now we use a fixed value for simplicity
 // TODO: compaction should be split into different priorities: small compactions help to merge segments, large compactions help to handle the deltas and expiration of large segments
 const (
-	maxParallelCompactionTaskNum      = 100
-	compactionTimeout                 = 10 * time.Second
-	compactionExpirationCheckInterval = 60 * time.Second
+	maxParallelCompactionTaskNum = 100
+	rpcCompactionTimeout         = 10 * time.Second
 )
 
 type compactionPlanContext interface {
 	start()
 	stop()
 	// execCompactionPlan starts to execute the plan and returns immediately
 	execCompactionPlan(signal *compactionSignal, plan *datapb.CompactionPlan) error
-	// completeCompaction record the result of a compaction
-	completeCompaction(result *datapb.CompactionResult) error
 	// getCompaction returns the compaction task; if planID does not exist, returns nil
 	getCompaction(planID int64) *compactionTask
-	// expireCompaction set the compaction state to expired
-	expireCompaction(ts Timestamp) error
+	// updateCompaction sets the compaction state to completed, timeout or failed
+	updateCompaction(ts Timestamp) error
 	// isFull returns true if the task pool is full
 	isFull() bool
 	// get compaction tasks by signal id
@@ -59,6 +57,7 @@ type compactionTaskState int8
 const (
 	executing compactionTaskState = iota + 1
 	completed
+	failed // the DataNode no longer reports the plan
 	timeout
 )
 
@@ -102,23 +101,26 @@ type compactionPlanHandler struct {
 	wg         sync.WaitGroup
 	flushCh    chan UniqueID
 	segRefer   *SegmentReferenceManager
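+	// parallelCh holds one buffered channel per DataNode, used as a counting
+	// semaphore that caps the number of compaction tasks running on that node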
+	parallelCh map[int64]chan struct{}
 }
 
 func newCompactionPlanHandler(sessions *SessionManager, cm *ChannelManager, meta *meta,
 	allocator allocator, flush chan UniqueID, segRefer *SegmentReferenceManager) *compactionPlanHandler {
 	return &compactionPlanHandler{
-		plans:     make(map[int64]*compactionTask),
-		chManager: cm,
-		meta:      meta,
-		sessions:  sessions,
-		allocator: allocator,
-		flushCh:   flush,
-		segRefer:  segRefer,
+		plans:      make(map[int64]*compactionTask),
+		chManager:  cm,
+		meta:       meta,
+		sessions:   sessions,
+		allocator:  allocator,
+		flushCh:    flush,
+		segRefer:   segRefer,
+		parallelCh: make(map[int64]chan struct{}),
 	}
 }
 
 func (c *compactionPlanHandler) start() {
-	ticker := time.NewTicker(compactionExpirationCheckInterval)
+	interval := time.Duration(Params.DataCoordCfg.CompactionCheckIntervalInSeconds) * time.Second
+	ticker := time.NewTicker(interval)
 	c.quit = make(chan struct{})
 	c.wg.Add(1)
 
@@ -139,7 +141,7 @@ func (c *compactionPlanHandler) start() {
 					continue
 				}
 				cancel()
-				_ = c.expireCompaction(ts)
+				_ = c.updateCompaction(ts)
 			}
 		}
 	}()
@@ -162,17 +164,40 @@ func (c *compactionPlanHandler) execCompactionPlan(signal *compactionSignal, pla
 
 	c.setSegmentsCompacting(plan, true)
 
-	// FIXME: check response of compaction call and restore segment state if failed
-	c.sessions.Compaction(nodeID, plan)
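+	// run the plan asynchronously: acquiring a compaction slot on the
+	// target DataNode may block for a while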
+	go func() {
+		log.Debug("acquire queue", zap.Int64("nodeID", nodeID), zap.Int64("planID", plan.GetPlanID()))
+		c.acquireQueue(nodeID)
 
-	task := &compactionTask{
-		triggerInfo: signal,
-		plan:        plan,
-		state:       executing,
-		dataNodeID:  nodeID,
-	}
-	c.plans[plan.PlanID] = task
-	c.executingTaskNum++
+		ts, err := c.allocator.allocTimestamp(context.TODO())
+		if err != nil {
+			log.Warn("Alloc start time for CompactionPlan failed", zap.Int64("planID", plan.GetPlanID()))
+			// restore the segment state and give the slot back, or both would leak
+			c.setSegmentsCompacting(plan, false)
+			c.releaseQueue(nodeID)
+			return
+		}
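+		// the recorded start time lets updateCompaction enforce the plan's timeout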
+		plan.StartTime = ts
+
+		c.mu.Lock()
+		task := &compactionTask{
+			triggerInfo: signal,
+			plan:        plan,
+			state:       executing,
+			dataNodeID:  nodeID,
+		}
+		c.plans[plan.PlanID] = task
+		c.executingTaskNum++
+		c.mu.Unlock()
+
+		err = c.sessions.Compaction(nodeID, plan)
+		if err != nil {
+			log.Warn("compaction plan rejected by DataNode", zap.Int64("nodeID", nodeID), zap.Int64("planID", plan.GetPlanID()))
+			c.mu.Lock()
+			delete(c.plans, plan.PlanID)
+			c.executingTaskNum--
+			c.mu.Unlock()
+			// restore the segment state and give the slot back, or both would leak
+			c.setSegmentsCompacting(plan, false)
+			c.releaseQueue(nodeID)
+			return
+		}
+
+		log.Debug("start compaction", zap.Int64("nodeID", nodeID), zap.Int64("planID", plan.GetPlanID()))
+	}()
 	return nil
 }
 
@@ -182,11 +207,9 @@ func (c *compactionPlanHandler) setSegmentsCompacting(plan *datapb.CompactionPla
 	}
 }
 
-// completeCompaction record the result of a compaction
+// completeCompaction records the result of a compaction task
+// not thread-safe; the caller must hold c.mu
 func (c *compactionPlanHandler) completeCompaction(result *datapb.CompactionResult) error {
-	c.mu.Lock()
-	defer c.mu.Unlock()
-
 	planID := result.PlanID
 	if _, ok := c.plans[planID]; !ok {
 		return fmt.Errorf("plan %d is not found", planID)
@@ -219,6 +242,8 @@ func (c *compactionPlanHandler) completeCompaction(result *datapb.CompactionResu
 	}
 	// TODO: when to clean task list
 
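+	// return the parallel slot held on this plan's DataNode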
+	nodeID := c.plans[planID].dataNodeID
+	c.releaseQueue(nodeID)
 	return nil
 }
 
@@ -241,21 +266,35 @@ func (c *compactionPlanHandler) getCompaction(planID int64) *compactionTask {
 }
 
-// expireCompaction set the compaction state to expired
-func (c *compactionPlanHandler) expireCompaction(ts Timestamp) error {
+// updateCompaction queries the DataNodes for plan states and transitions
+// tasks to completed, timeout or failed accordingly
+func (c *compactionPlanHandler) updateCompaction(ts Timestamp) error {
+	planStates := c.sessions.GetCompactionState()
+
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
 	tasks := c.getExecutingCompactions()
 	for _, task := range tasks {
-		if !c.isTimeout(ts, task.plan.GetStartTime(), task.plan.GetTimeoutInSeconds()) {
-			continue
+		stateResult, ok := planStates[task.plan.PlanID]
+		state := stateResult.GetState()
+		planID := task.plan.PlanID
+
+		// check whether the DataNode still reports the state of this plan
+		if ok {
+			// the plan is still executing and has not timed out: nothing to do
+			if state == commonpb.CompactionState_Executing && !c.isTimeout(ts, task.plan.GetStartTime(), task.plan.GetTimeoutInSeconds()) {
+				continue
+			}
+			if state == commonpb.CompactionState_Completed {
+				c.completeCompaction(stateResult.GetResult())
+				continue
+			}
+			c.plans[planID] = c.plans[planID].shadowClone(setState(timeout))
+		} else {
+			// the DataNode no longer knows the plan: mark the task failed
+			c.plans[planID] = c.plans[planID].shadowClone(setState(failed))
 		}
-
 		c.setSegmentsCompacting(task.plan, false)
-
-		planID := task.plan.PlanID
-		c.plans[planID] = c.plans[planID].shadowClone(setState(timeout))
 		c.executingTaskNum--
+		c.releaseQueue(task.dataNodeID)
 	}
 
 	return nil
@@ -267,6 +306,29 @@ func (c *compactionPlanHandler) isTimeout(now Timestamp, start Timestamp, timeou
 	return int32(ts.Sub(startTime).Seconds()) >= timeout
 }
 
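+// acquireQueue reserves one compaction slot on the given DataNode; the
+// buffered channel works as a counting semaphore sized by calculateParallel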
+func (c *compactionPlanHandler) acquireQueue(nodeID int64) {
+	c.mu.Lock()
+	ch, ok := c.parallelCh[nodeID]
+	if !ok {
+		ch = make(chan struct{}, calculateParallel())
+		c.parallelCh[nodeID] = ch
+	}
+	c.mu.Unlock()
+
+	// blocks until the node has a free slot
+	ch <- struct{}{}
+}
+
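+// releaseQueue gives back one compaction slot on the given DataNode, if held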
+func (c *compactionPlanHandler) releaseQueue(nodeID int64) {
+	log.Debug("try to release queue", zap.Int64("nodeID", nodeID))
+	ch, ok := c.parallelCh[nodeID]
+	if !ok {
+		return
+	}
+	<-ch
+}
+
 // isFull returns true if the task pool is full
 func (c *compactionPlanHandler) isFull() bool {
 	c.mu.RLock()
@@ -285,13 +347,17 @@ func (c *compactionPlanHandler) getExecutingCompactions() []*compactionTask {
 	return tasks
 }
 
-// get compaction tasks by signal id
+// get compaction tasks by signal id; if signalID == 0 return all tasks
 func (c *compactionPlanHandler) getCompactionTasksBySignalID(signalID int64) []*compactionTask {
 	c.mu.RLock()
 	defer c.mu.RUnlock()
 
 	var tasks []*compactionTask
 	for _, t := range c.plans {
+		if signalID == 0 {
+			tasks = append(tasks, t)
+			continue
+		}
 		if t.triggerInfo.id != signalID {
 			continue
 		}
@@ -313,3 +379,13 @@ func setResult(result *datapb.CompactionResult) compactionTaskOpt {
 		task.result = result
 	}
 }
+
+// calculateParallel returns the number of compaction slots per DataNode.
+// TODO: derive this from the DataNode's resources, e.g. 0.5*min(8, NumCPU/2);
+// use a fixed value for now.
+func calculateParallel() int {
+	return 2
+}