4141 * idle state 2, the third bin spans from the target residency of idle state 2
4242 * up to, but not including, the target residency of idle state 3 and so on.
4343 * The last bin spans from the target residency of the deepest idle state
44- * supplied by the driver to the scheduler tick period length or to infinity if
45- * the tick period length is less than the target residency of that state. In
46- * the latter case, the governor also counts events with the measured idle
47- * duration between the tick period length and the target residency of the
48- * deepest idle state.
44+ * supplied by the driver to infinity.
4945 *
5046 * Two metrics called "hits" and "intercepts" are associated with each bin.
5147 * They are updated every time before selecting an idle state for the given CPU
6056 * into by the sleep length (these events are also referred to as "intercepts"
6157 * below).
6258 *
59+ * The governor also counts "intercepts" with the measured idle duration below
60+ * the tick period length and uses this information when deciding whether or not
61+ * to stop the scheduler tick.
62+ *
6363 * In order to select an idle state for a CPU, the governor takes the following
6464 * steps (modulo the possible latency constraint that must be taken into account
6565 * too):
105105
106106#include "gov.h"
107107
108+ /*
109+ * Idle state exit latency threshold used for deciding whether or not to check
110+ * the time till the closest expected timer event.
111+ */
112+ #define LATENCY_THRESHOLD_NS (RESIDENCY_THRESHOLD_NS / 2)
113+
108114/*
109115 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
110116 * is used for decreasing metrics on a regular basis.
@@ -124,18 +130,20 @@ struct teo_bin {
124130
125131/**
126132 * struct teo_cpu - CPU data used by the TEO cpuidle governor.
127- * @time_span_ns: Time between idle state selection and post-wakeup update.
128133 * @sleep_length_ns: Time till the closest timer event (at the selection time).
129134 * @state_bins: Idle state data bins for this CPU.
130135 * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
131- * @tick_hits: Number of "hits" after TICK_NSEC.
136+ * @tick_intercepts: "Intercepts" before TICK_NSEC.
137+ * @short_idles: Wakeups after short idle periods.
138+ * @artificial_wakeup: Set if the wakeup has been triggered by a safety net.
132139 */
133140struct teo_cpu {
134- s64 time_span_ns ;
135141 s64 sleep_length_ns ;
136142 struct teo_bin state_bins [CPUIDLE_STATE_MAX ];
137143 unsigned int total ;
138- unsigned int tick_hits ;
144+ unsigned int tick_intercepts ;
145+ unsigned int short_idles ;
146+ bool artificial_wakeup ;
139147};
140148
141149static DEFINE_PER_CPU (struct teo_cpu , teo_cpus ) ;
@@ -152,38 +160,34 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
152160 s64 target_residency_ns ;
153161 u64 measured_ns ;
154162
155- if (cpu_data -> time_span_ns >= cpu_data -> sleep_length_ns ) {
163+ cpu_data -> short_idles -= cpu_data -> short_idles >> DECAY_SHIFT ;
164+
165+ if (cpu_data -> artificial_wakeup ) {
156166 /*
157- * One of the safety nets has triggered or the wakeup was close
158- * enough to the closest timer event expected at the idle state
159- * selection time to be discarded.
167+ * If one of the safety nets has triggered, assume that this
168+ * might have been a long sleep.
160169 */
161170 measured_ns = U64_MAX ;
162171 } else {
163172 u64 lat_ns = drv -> states [dev -> last_state_idx ].exit_latency_ns ;
164173
165- /*
166- * The computations below are to determine whether or not the
167- * (saved) time till the next timer event and the measured idle
168- * duration fall into the same "bin", so use last_residency_ns
169- * for that instead of time_span_ns which includes the cpuidle
170- * overhead.
171- */
172174 measured_ns = dev -> last_residency_ns ;
173175 /*
174176 * The delay between the wakeup and the first instruction
175177 * executed by the CPU is not likely to be worst-case every
176178 * time, so take 1/2 of the exit latency as a very rough
177179 * approximation of the average of it.
178180 */
179- if (measured_ns >= lat_ns )
181+ if (measured_ns >= lat_ns ) {
180182 measured_ns -= lat_ns / 2 ;
181- else
183+ if (measured_ns < RESIDENCY_THRESHOLD_NS )
184+ cpu_data -> short_idles += PULSE ;
185+ } else {
182186 measured_ns /= 2 ;
187+ cpu_data -> short_idles += PULSE ;
188+ }
183189 }
184190
185- cpu_data -> total = 0 ;
186-
187191 /*
188192 * Decay the "hits" and "intercepts" metrics for all of the bins and
189193 * find the bins that the sleep length and the measured idle duration
@@ -195,8 +199,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
195199 bin -> hits -= bin -> hits >> DECAY_SHIFT ;
196200 bin -> intercepts -= bin -> intercepts >> DECAY_SHIFT ;
197201
198- cpu_data -> total += bin -> hits + bin -> intercepts ;
199-
200202 target_residency_ns = drv -> states [i ].target_residency_ns ;
201203
202204 if (target_residency_ns <= cpu_data -> sleep_length_ns ) {
@@ -206,38 +208,22 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
206208 }
207209 }
208210
209- /*
210- * If the deepest state's target residency is below the tick length,
211- * make a record of it to help teo_select() decide whether or not
212- * to stop the tick. This effectively adds an extra hits-only bin
213- * beyond the last state-related one.
214- */
215- if (target_residency_ns < TICK_NSEC ) {
216- cpu_data -> tick_hits -= cpu_data -> tick_hits >> DECAY_SHIFT ;
217-
218- cpu_data -> total += cpu_data -> tick_hits ;
219-
220- if (TICK_NSEC <= cpu_data -> sleep_length_ns ) {
221- idx_timer = drv -> state_count ;
222- if (TICK_NSEC <= measured_ns ) {
223- cpu_data -> tick_hits += PULSE ;
224- goto end ;
225- }
226- }
227- }
228-
211+ cpu_data -> tick_intercepts -= cpu_data -> tick_intercepts >> DECAY_SHIFT ;
229212 /*
230213 * If the measured idle duration falls into the same bin as the sleep
231214 * length, this is a "hit", so update the "hits" metric for that bin.
232215 * Otherwise, update the "intercepts" metric for the bin fallen into by
233216 * the measured idle duration.
234217 */
235- if (idx_timer == idx_duration )
218+ if (idx_timer == idx_duration ) {
236219 cpu_data -> state_bins [idx_timer ].hits += PULSE ;
237- else
220+ } else {
238221 cpu_data -> state_bins [idx_duration ].intercepts += PULSE ;
222+ if (TICK_NSEC <= measured_ns )
223+ cpu_data -> tick_intercepts += PULSE ;
224+ }
239225
240- end :
226+ cpu_data -> total -= cpu_data -> total >> DECAY_SHIFT ;
241227 cpu_data -> total += PULSE ;
242228}
243229
@@ -285,14 +271,12 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
285271 struct teo_cpu * cpu_data = per_cpu_ptr (& teo_cpus , dev -> cpu );
286272 s64 latency_req = cpuidle_governor_latency_req (dev -> cpu );
287273 ktime_t delta_tick = TICK_NSEC / 2 ;
288- unsigned int tick_intercept_sum = 0 ;
289274 unsigned int idx_intercept_sum = 0 ;
290275 unsigned int intercept_sum = 0 ;
291276 unsigned int idx_hit_sum = 0 ;
292277 unsigned int hit_sum = 0 ;
293278 int constraint_idx = 0 ;
294279 int idx0 = 0 , idx = -1 ;
295- int prev_intercept_idx ;
296280 s64 duration_ns ;
297281 int i ;
298282
@@ -301,10 +285,14 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
301285 dev -> last_state_idx = -1 ;
302286 }
303287
304- cpu_data -> time_span_ns = local_clock ();
305288 /*
306- * Set the expected sleep length to infinity in case of an early
307- * return.
289+ * Set the sleep length to infinity in case the invocation of
290+ * tick_nohz_get_sleep_length() below is skipped, in which case it won't
291+ * be known whether or not the subsequent wakeup is caused by a timer.
292+ * It is generally fine to count the wakeup as an intercept then, except
293+ * for the cases when the CPU is mostly woken up by timers and there may
294+ * be opportunities to ask for a deeper idle state when no imminent
295+ * timers are scheduled which may be missed.
308296 */
309297 cpu_data -> sleep_length_ns = KTIME_MAX ;
310298
@@ -360,17 +348,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
360348 goto end ;
361349 }
362350
363- tick_intercept_sum = intercept_sum +
364- cpu_data -> state_bins [drv -> state_count - 1 ].intercepts ;
365-
366351 /*
367352 * If the sum of the intercepts metric for all of the idle states
368353 * shallower than the current candidate one (idx) is greater than the
369354 * sum of the intercepts and hits metrics for the candidate state and
370- * all of the deeper states a shallower idle state is likely to be a
355+ * all of the deeper states, a shallower idle state is likely to be a
371356 * better choice.
372357 */
373- prev_intercept_idx = idx ;
374358 if (2 * idx_intercept_sum > cpu_data -> total - idx_hit_sum ) {
375359 int first_suitable_idx = idx ;
376360
@@ -396,41 +380,38 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
396380 * first enabled state that is deep enough.
397381 */
398382 if (teo_state_ok (i , drv ) &&
399- !dev -> states_usage [i ].disable )
383+ !dev -> states_usage [i ].disable ) {
400384 idx = i ;
401- else
402- idx = first_suitable_idx ;
403-
385+ break ;
386+ }
387+ idx = first_suitable_idx ;
404388 break ;
405389 }
406390
407391 if (dev -> states_usage [i ].disable )
408392 continue ;
409393
410- if (! teo_state_ok (i , drv )) {
394+ if (teo_state_ok (i , drv )) {
411395 /*
412- * The current state is too shallow, but if an
413- * alternative candidate state has been found,
414- * it may still turn out to be a better choice.
396+ * The current state is deep enough, but still
397+ * there may be a better one.
415398 */
416- if (first_suitable_idx != idx )
417- continue ;
418-
419- break ;
399+ first_suitable_idx = i ;
400+ continue ;
420401 }
421402
422- first_suitable_idx = i ;
403+ /*
404+ * The current state is too shallow, so if no suitable
405+ * states other than the initial candidate have been
406+ * found, give up (the remaining states to check are
407+ * shallower still), but otherwise the first suitable
408+ * state other than the initial candidate may turn out
409+ * to be preferable.
410+ */
411+ if (first_suitable_idx == idx )
412+ break ;
423413 }
424414 }
425- if (!idx && prev_intercept_idx ) {
426- /*
427- * We have to query the sleep length here otherwise we don't
428- * know after wakeup if our guess was correct.
429- */
430- duration_ns = tick_nohz_get_sleep_length (& delta_tick );
431- cpu_data -> sleep_length_ns = duration_ns ;
432- goto out_tick ;
433- }
434415
435416 /*
436417 * If there is a latency constraint, it may be necessary to select an
@@ -440,24 +421,39 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
440421 idx = constraint_idx ;
441422
442423 /*
443- * Skip the timers check if state 0 is the current candidate one,
444- * because an immediate non-timer wakeup is expected in that case.
424+ * If either the candidate state is state 0 or its target residency is
425+ * low enough, there is basically nothing more to do, but if the sleep
426+ * length is not updated, the subsequent wakeup will be counted as an
427+ * "intercept" which may be problematic in the cases when timer wakeups
428+ * are dominant. Namely, it may effectively prevent deeper idle states
429+ * from being selected at one point even if no imminent timers are
430+ * scheduled.
431+ *
432+ * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one
433+ * CPU are unlikely (user space has a default 50 us slack value for
434+ * hrtimers and there are relatively few timers with a lower deadline
435+ * value in the kernel), and even if they did happen, the potential
436+ * benefit from using a deep idle state in that case would be
437+ * questionable anyway for latency reasons. Thus if the measured idle
438+ * duration falls into that range in the majority of cases, assume
439+ * non-timer wakeups to be dominant and skip updating the sleep length
440+ * to reduce latency.
441+ *
442+ * Also, if the latency constraint is sufficiently low, it will force
443+ * shallow idle states regardless of the wakeup type, so the sleep
444+ * length need not be known in that case.
445445 */
446- if (!idx )
447- goto out_tick ;
448-
449- /*
450- * If state 0 is a polling one, check if the target residency of
451- * the current candidate state is low enough and skip the timers
452- * check in that case too.
453- */
454- if ((drv -> states [0 ].flags & CPUIDLE_FLAG_POLLING ) &&
455- drv -> states [idx ].target_residency_ns < RESIDENCY_THRESHOLD_NS )
446+ if ((!idx || drv -> states [idx ].target_residency_ns < RESIDENCY_THRESHOLD_NS ) &&
447+ (2 * cpu_data -> short_idles >= cpu_data -> total ||
448+ latency_req < LATENCY_THRESHOLD_NS ))
456449 goto out_tick ;
457450
458451 duration_ns = tick_nohz_get_sleep_length (& delta_tick );
459452 cpu_data -> sleep_length_ns = duration_ns ;
460453
454+ if (!idx )
455+ goto out_tick ;
456+
461457 /*
462458 * If the closest expected timer is before the target residency of the
463459 * candidate state, a shallower one needs to be found.
@@ -474,7 +470,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
474470 * total wakeup events, do not stop the tick.
475471 */
476472 if (drv -> states [idx ].target_residency_ns < TICK_NSEC &&
477- tick_intercept_sum > cpu_data -> total / 2 + cpu_data -> total / 8 )
473+ cpu_data -> tick_intercepts > cpu_data -> total / 2 + cpu_data -> total / 8 )
478474 duration_ns = TICK_NSEC / 2 ;
479475
480476end :
@@ -511,17 +507,16 @@ static void teo_reflect(struct cpuidle_device *dev, int state)
511507 struct teo_cpu * cpu_data = per_cpu_ptr (& teo_cpus , dev -> cpu );
512508
513509 dev -> last_state_idx = state ;
514- /*
515- * If the wakeup was not "natural", but triggered by one of the safety
516- * nets, assume that the CPU might have been idle for the entire sleep
517- * length time.
518- */
519510 if (dev -> poll_time_limit ||
520511 (tick_nohz_idle_got_tick () && cpu_data -> sleep_length_ns > TICK_NSEC )) {
512+ /*
513+ * The wakeup was not "genuine", but triggered by one of the
514+ * safety nets.
515+ */
521516 dev -> poll_time_limit = false;
522- cpu_data -> time_span_ns = cpu_data -> sleep_length_ns ;
517+ cpu_data -> artificial_wakeup = true ;
523518 } else {
524- cpu_data -> time_span_ns = local_clock () - cpu_data -> time_span_ns ;
519+ cpu_data -> artificial_wakeup = false ;
525520 }
526521}
527522
0 commit comments