Commit 62b5b96
BUG/MINOR: sched: properly account for the CPU time of dying tasks
When task profiling is enabled, the scheduler can measure and report the cumulated time spent in each task and their respective latencies. But this was wrong for tasks with few wakeups as well as for self-waking ones, because the call date needed to measure how long the task takes to process is retrieved from the task itself (->wake_date was turned into the call date), and we could face two conditions:

  - a new wakeup while the task is executing would reset the ->wake_date
    field before returning, causing abnormally low values to be reported;
    that was likely the case for task_run_applet for self-waking applets;

  - when the task dies, NULL is returned and the call date could not be
    retrieved, so that CPU time was not accounted for at all. This was
    particularly visible with process_stream(), which is usually called
    only twice per request and whose reported time was systematically
    halved.

The cleanest solution here is to keep in mind that the scheduler already uses quite a bit of local context in th_ctx, and to place the intermediary values there so that they cannot vanish. The wake_date has to be reset immediately once read, and only its copy is used along the function. Note that this must be done for both tasks and tasklets, and that until recently tasklets were also able to report wrong values due to their sole dependency on TH_FL_TASK_PROFILING between tests.

One nice benefit for future improvements is that such information will now be available from the task without having to be stored in the task itself anymore.

Since the tasklet part was computed with wrapping 32-bit arithmetic while the task one used 64-bit, the values have now been consistently moved to 32-bit, as that is already largely sufficient (4s spent in a task is more than twice what the watchdog would tolerate). Some further cleanups might be necessary, but the patch aims to stay minimal.
Task profiling output after 1 million HTTP requests previously looked like this:

  Tasks activity:
    function                      calls   cpu_tot   cpu_avg   lat_tot   lat_avg
    h1_io_cb                    2012338    4.850s   2.410us    12.91s   6.417us
    process_stream              2000136    9.594s   4.796us    34.26s   17.13us
    sc_conn_io_cb               2000135    1.973s   986.0ns    30.24s   15.12us
    h1_timeout_task                 137         -         -   2.649ms   19.34us
    accept_queue_process             49   152.3us   3.107us   321.7yr   6.564yr
    main+0x146430                     7   5.250us   750.0ns   25.92us   3.702us
    srv_cleanup_idle_conns            1   559.0ns   559.0ns   918.0ns   918.0ns
    task_run_applet                   1         -         -   2.162us   2.162us

Now it looks like this:

  Tasks activity:
    function                      calls   cpu_tot   cpu_avg   lat_tot   lat_avg
    h1_io_cb                    2014194    4.794s   2.380us    13.75s   6.826us
    process_stream              2000151    20.01s   10.00us    36.04s   18.02us
    sc_conn_io_cb               2000148    2.167s   1.083us    32.27s   16.13us
    h1_timeout_task                 198   54.24us   273.0ns   3.487ms   17.61us
    accept_queue_process             52   158.3us   3.044us   409.9us   7.882us
    main+0x1466e0                    18   16.77us   931.0ns   63.98us   3.554us
    srv_cleanup_toremove_conns        8   282.1us   35.26us   546.8us   68.35us
    srv_cleanup_idle_conns            3   149.2us   49.73us   8.131us   2.710us
    task_run_applet                   3   268.1us   89.38us   11.61us   3.871us

Note the two-fold difference on process_stream().

This feature is essentially used for debugging, so it has extremely limited impact. However, it is used quite a bit more in bug reports, and it would be desirable that at least 2.6 gets this fix backported. It depends on at least these two previous patches, which will then also have to be backported:

  MINOR: task: permanently enable latency measurement on tasklets
  CLEANUP: task: rename ->call_date to ->wake_date
Parent: 04e50b3

3 files changed: +40, -34 lines

include/haproxy/task-t.h

Lines changed: 1 addition & 1 deletion
@@ -112,7 +112,7 @@ struct task {
 	int expire;          /* next expiration date for this task, in ticks */
 	short nice;          /* task prio from -1024 to +1024 */
 	short tid;           /* TID where it's allowed to run, <0 if anywhere */
-	uint64_t wake_date;  /* date of the last task wakeup */
+	uint32_t wake_date;  /* date of the last task wakeup */
 	uint64_t lat_time;   /* total latency time experienced */
 	uint64_t cpu_time;   /* total CPU time consumed */
 };

include/haproxy/tinfo-t.h

Lines changed: 3 additions & 0 deletions
@@ -129,6 +129,9 @@ struct thread_ctx {
 	uint flags;                 /* thread flags, TH_FL_*, atomic! */
 	/* 32-bit hole here */

+	uint32_t sched_wake_date;   /* current task/tasklet's wake date or 0 */
+	uint32_t sched_call_date;   /* current task/tasklet's call date (valid if sched_wake_date > 0) */
+
 	uint64_t prev_cpu_time;     /* previous per thread CPU time */
 	uint64_t prev_mono_time;    /* previous system wide monotonic time */

src/task.c

Lines changed: 36 additions & 33 deletions
@@ -564,19 +564,19 @@ unsigned int run_tasks_from_lists(unsigned int budgets[])
 		_HA_ATOMIC_DEC(&th_ctx->rq_total);

 		if (t->state & TASK_F_TASKLET) {
-			uint64_t before = 0;
-
 			LIST_DEL_INIT(&((struct tasklet *)t)->list);
 			__ha_barrier_store();

-			if (unlikely(_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_TASK_PROFILING)) {
-				profile_entry = sched_activity_entry(sched_activity, t->process);
-				before = now_mono_time();
+			th_ctx->sched_wake_date = ((struct tasklet *)t)->wake_date;
+			if (th_ctx->sched_wake_date) {
+				uint32_t now_ns = now_mono_time();
+				uint32_t lat = now_ns - th_ctx->sched_wake_date;

-				if (((struct tasklet *)t)->wake_date) {
-					HA_ATOMIC_ADD(&profile_entry->lat_time, (uint32_t)(before - ((struct tasklet *)t)->wake_date));
-					((struct tasklet *)t)->wake_date = 0;
-				}
+				((struct tasklet *)t)->wake_date = 0;
+				th_ctx->sched_call_date = now_ns;
+				profile_entry = sched_activity_entry(sched_activity, t->process);
+				HA_ATOMIC_ADD(&profile_entry->lat_time, lat);
+				HA_ATOMIC_INC(&profile_entry->calls);
 			}

 			state = _HA_ATOMIC_FETCH_AND(&t->state, TASK_PERSISTENT);
@@ -593,10 +593,8 @@ unsigned int run_tasks_from_lists(unsigned int budgets[])
 				continue;
 			}

-			if (unlikely(_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_TASK_PROFILING)) {
-				HA_ATOMIC_INC(&profile_entry->calls);
-				HA_ATOMIC_ADD(&profile_entry->cpu_time, now_mono_time() - before);
-			}
+			if (th_ctx->sched_wake_date)
+				HA_ATOMIC_ADD(&profile_entry->cpu_time, (uint32_t)(now_mono_time() - th_ctx->sched_call_date));

 			done++;
 			th_ctx->current = NULL;
@@ -607,6 +605,21 @@ unsigned int run_tasks_from_lists(unsigned int budgets[])
 		LIST_DEL_INIT(&((struct tasklet *)t)->list);
 		__ha_barrier_store();

+		th_ctx->sched_wake_date = t->wake_date;
+		if (unlikely(t->wake_date)) {
+			uint32_t now_ns = now_mono_time();
+			uint32_t lat = now_ns - t->wake_date;
+
+			t->lat_time += lat;
+			t->wake_date = 0;
+			th_ctx->sched_call_date = now_ns;
+			profile_entry = sched_activity_entry(sched_activity, t->process);
+			HA_ATOMIC_ADD(&profile_entry->lat_time, lat);
+			HA_ATOMIC_INC(&profile_entry->calls);
+		}
+
+		__ha_barrier_store();
+
 		/* We must be the exclusive owner of the TASK_RUNNING bit, and
 		 * have to be careful that the task is not being manipulated on
 		 * another thread finding it expired in wake_expired_tasks().
@@ -627,18 +640,6 @@ unsigned int run_tasks_from_lists(unsigned int budgets[])
 		/* OK then this is a regular task */

 		_HA_ATOMIC_DEC(&ha_thread_ctx[tid].tasks_in_list);
-		if (unlikely(t->wake_date)) {
-			uint64_t now_ns = now_mono_time();
-			uint64_t lat = now_ns - t->wake_date;
-
-			t->lat_time += lat;
-			t->wake_date = now_ns;
-			profile_entry = sched_activity_entry(sched_activity, t->process);
-			HA_ATOMIC_ADD(&profile_entry->lat_time, lat);
-			HA_ATOMIC_INC(&profile_entry->calls);
-		}
-
-		__ha_barrier_store();

 		/* Note for below: if TASK_KILLED arrived before we've read the state, we
 		 * directly free the task. Otherwise it will be seen after processing and
@@ -661,18 +662,20 @@ unsigned int run_tasks_from_lists(unsigned int budgets[])
 		}
 		th_ctx->current = NULL;
 		__ha_barrier_store();
+
+		/* stats are only registered for non-zero wake dates */
+		if (unlikely(th_ctx->sched_wake_date)) {
+			uint32_t cpu = (uint32_t)now_mono_time() - th_ctx->sched_call_date;
+
+			if (t)
+				t->cpu_time += cpu;
+			HA_ATOMIC_ADD(&profile_entry->cpu_time, cpu);
+		}
+
 		/* If there is a pending state we have to wake up the task
 		 * immediately, else we defer it into wait queue
 		 */
 		if (t != NULL) {
-			if (unlikely(t->wake_date)) {
-				uint64_t cpu = now_mono_time() - t->wake_date;
-
-				t->cpu_time += cpu;
-				t->wake_date = 0;
-				HA_ATOMIC_ADD(&profile_entry->cpu_time, cpu);
-			}
-
 			state = _HA_ATOMIC_LOAD(&t->state);
 			if (unlikely(state & TASK_KILLED)) {
 				task_unlink_wq(t);