
[RT,2/2] tick: Fix timer storm since introduction of timersd

Message ID 20220405010752.1347437-2-frederic@kernel.org
State New
Series [RT,1/2] rcutorture: Also force sched priority to timersd on boosting test

Commit Message

Frederic Weisbecker April 5, 2022, 1:07 a.m. UTC
If timers are pending while the tick is reprogrammed in nohz mode, the
next expiry is not armed to fire now; it is instead delayed by one jiffy,
so as not to raise an inextinguishable timer storm in the following
scenario:

1) An IRQ fires and queues a timer
2) ksoftirqd is woken up
3) IRQ tail: the tick is reprogrammed to fire now
4) IRQ exit
5) TIMER interrupt
6) goto 3)

...all of that repeating until ksoftirqd finally gets to run.
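
The logic that normally breaks this loop lives in tick_nohz_next_event():
when a timer softirq is already pending, the next tick is kept one jiffy
away instead of being armed to fire immediately. A simplified sketch of
the relevant branch (illustrative only, not the verbatim kernel code):

	if (local_timer_softirq_pending()) {
		/*
		 * A timer softirq is pending: keep the next tick one
		 * jiffy away so the softirq handler gets a chance to
		 * run, instead of rearming the tick to fire right now
		 * (which would loop back to step 3 above).
		 */
		next_tick = basemono + TICK_NSEC;
	} else {
		/* Otherwise take the next expiry from the timer wheel. */
		next_tick = get_next_timer_interrupt(basejiff, basemono);
	}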

Unfortunately, we have been checking the wrong softirq vector bitmask since
the timersd kthread was split from ksoftirqd. Timers now have their own
vector state field that must be checked separately. As a result, the
old timer storm is back. It shows up early during boot as extremely long
initcalls:

	[  333.004807] initcall dquot_init+0x0/0x111 returned 0 after 323822879 usecs

and the cause is uncovered by the relevant trace events, which show just
10 microseconds between ticks (~100,000 Hz):

	swapper/-1         1dn.h111 60818582us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415486608
	swapper/-1         1dn.h111 60818592us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415496082
	swapper/-1         1dn.h111 60818601us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415505550
	swapper/-1         1dn.h111 60818611us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415515013
	swapper/-1         1dn.h111 60818620us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415524483
	swapper/-1         1dn.h111 60818630us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415533949
	swapper/-1         1dn.h111 60818639us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415543426
	swapper/-1         1dn.h111 60818649us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415553061
	swapper/-1         1dn.h111 60818658us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415562511
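
The check went stale because, since the timersd split on PREEMPT_RT,
pending timer vectors are recorded in a dedicated per-CPU word and
served by the timersd kthread, so local_softirq_pending() never reports
them. A rough sketch of that arrangement (illustrative only, with
locking omitted; not the exact PREEMPT_RT code):

	/*
	 * PREEMPT_RT: timer softirqs are tracked apart from the regular
	 * softirq pending mask and handled by a per-CPU timersd kthread.
	 */
	DEFINE_PER_CPU(struct task_struct *, timersd);
	DEFINE_PER_CPU(unsigned long, pending_timer_softirq);

	void raise_timer_softirq(void)
	{
		struct task_struct *tsk = __this_cpu_read(timersd);

		/* Record the vector in the dedicated per-CPU word... */
		__this_cpu_or(pending_timer_softirq, BIT(TIMER_SOFTIRQ));

		/*
		 * ...and wake timersd to service it. The regular softirq
		 * pending mask is never touched, so any check based on
		 * local_softirq_pending() misses this pending timer.
		 */
		if (tsk)
			wake_up_process(tsk);
	}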

Fix this by checking the right timer vector state from the nohz code.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h | 12 ++++++++++++
 kernel/softirq.c          |  7 +------
 kernel/time/tick-sched.c  |  2 +-
 3 files changed, 14 insertions(+), 7 deletions(-)

Comments

Alison Chaiken April 5, 2022, 4:21 p.m. UTC | #1
On Mon, Apr 4, 2022 at 9:33 PM Frederic Weisbecker <frederic@kernel.org> wrote:
>
> If timers are pending while the tick is reprogrammed in nohz mode, the
> next expiry is not armed to fire now; it is instead delayed by one jiffy,
> so as not to raise an inextinguishable timer storm in the following
> scenario:
>
> 1) An IRQ fires and queues a timer
> 2) ksoftirqd is woken up
> 3) IRQ tail: the tick is reprogrammed to fire now
> 4) IRQ exit
> 5) TIMER interrupt
> 6) goto 3)
>
> ...all of that repeating until ksoftirqd finally gets to run.
>
> Unfortunately, we have been checking the wrong softirq vector bitmask since
> the timersd kthread was split from ksoftirqd. Timers now have their own
> vector state field that must be checked separately.

With kernel 5.15 and the timersd patch applied, we've observed that
x86_64 cores tend to enter deeper C-states even when there are pending
hrtimers. Presumably the failure to check the right bits could also
explain that observation and, accordingly, the patch might fix it?

> As a result, the
> old timer storm is back. It shows up early during boot as extremely long
> initcalls:
>
>         [  333.004807] initcall dquot_init+0x0/0x111 returned 0 after 323822879 usecs
>
> and the cause is uncovered by the relevant trace events, which show just
> 10 microseconds between ticks (~100,000 Hz):
>
>         swapper/-1         1dn.h111 60818582us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415486608
>         swapper/-1         1dn.h111 60818592us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415496082
>         swapper/-1         1dn.h111 60818601us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415505550
>         swapper/-1         1dn.h111 60818611us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415515013
>         swapper/-1         1dn.h111 60818620us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415524483
>         swapper/-1         1dn.h111 60818630us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415533949
>         swapper/-1         1dn.h111 60818639us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415543426
>         swapper/-1         1dn.h111 60818649us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415553061
>         swapper/-1         1dn.h111 60818658us : hrtimer_expire_entry: hrtimer=00000000e0ef0f6b function=tick_sched_timer now=60415562511
>
> Fix this by checking the right timer vector state from the nohz code.
>
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> ---
>  include/linux/interrupt.h | 12 ++++++++++++
>  kernel/softirq.c          |  7 +------
>  kernel/time/tick-sched.c  |  2 +-
>  3 files changed, 14 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index e4b8a04e67ce..da248458f4d9 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -607,9 +607,16 @@ extern void raise_softirq(unsigned int nr);
>
>  #ifdef CONFIG_PREEMPT_RT
>  DECLARE_PER_CPU(struct task_struct *, timersd);
> +DECLARE_PER_CPU(unsigned long, pending_timer_softirq);
> +
>  extern void raise_timer_softirq(void);
>  extern void raise_hrtimer_softirq(void);
>
> +static inline unsigned int local_pending_timers(void)
> +{
> +        return __this_cpu_read(pending_timer_softirq);
> +}
> +
>  #else
>  static inline void raise_timer_softirq(void)
>  {
> @@ -620,6 +627,11 @@ static inline void raise_hrtimer_softirq(void)
>  {
>         raise_softirq_irqoff(HRTIMER_SOFTIRQ);
>  }
> +
> +static inline unsigned int local_pending_timers(void)
> +{
> +        return local_softirq_pending();
> +}
>  #endif
>
>  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 89eb45614af6..c0aef5f760e5 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -625,12 +625,7 @@ static inline void tick_irq_exit(void)
>  }
>
>  DEFINE_PER_CPU(struct task_struct *, timersd);
> -static DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
> -
> -static unsigned int local_pending_timers(void)
> -{
> -        return __this_cpu_read(pending_timer_softirq);
> -}
> +DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
>
>  static void wake_timersd(void)
>  {
> diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
> index 17a283ce2b20..7c359f029b97 100644
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -763,7 +763,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
>
>  static inline bool local_timer_softirq_pending(void)
>  {
> -       return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
> +       return local_pending_timers() & BIT(TIMER_SOFTIRQ);
>  }
>
>  static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
> --
> 2.25.1
>

Thanks,
Alison Chaiken
achaiken@aurora.tech
Aurora Innovation
Frederic Weisbecker April 15, 2022, 10:28 a.m. UTC | #2
On Tue, Apr 05, 2022 at 09:21:16AM -0700, Alison Chaiken wrote:
> On Mon, Apr 4, 2022 at 9:33 PM Frederic Weisbecker <frederic@kernel.org> wrote:
> >
> > If timers are pending while the tick is reprogrammed in nohz mode, the
> > next expiry is not armed to fire now; it is instead delayed by one jiffy,
> > so as not to raise an inextinguishable timer storm in the following
> > scenario:
> >
> > 1) An IRQ fires and queues a timer
> > 2) ksoftirqd is woken up
> > 3) IRQ tail: the tick is reprogrammed to fire now
> > 4) IRQ exit
> > 5) TIMER interrupt
> > 6) goto 3)
> >
> > ...all of that repeating until ksoftirqd finally gets to run.
> >
> > Unfortunately, we have been checking the wrong softirq vector bitmask since
> > the timersd kthread was split from ksoftirqd. Timers now have their own
> > vector state field that must be checked separately.
> 
> With kernel 5.15 and the timersd patch applied, we've observed that
> x86_64 cores tend to enter deeper C-states even when there are pending
> hrtimers. Presumably the failure to check the right bits could also
> explain that observation and, accordingly, the patch might fix it?

Well, this issue rather adds unnecessary ticks, so I wouldn't expect
deeper C-states as a consequence. But who knows...

Thanks.
Sebastian Andrzej Siewior May 11, 2022, 11:54 a.m. UTC | #3
On 2022-04-05 03:07:52 [+0200], Frederic Weisbecker wrote:
> If timers are pending while the tick is reprogrammed in nohz mode, the
> next expiry is not armed to fire now; it is instead delayed by one jiffy,
> so as not to raise an inextinguishable timer storm in the following
> scenario:

The two-patch series has been applied.

Sebastian

Patch

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index e4b8a04e67ce..da248458f4d9 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -607,9 +607,16 @@  extern void raise_softirq(unsigned int nr);
 
 #ifdef CONFIG_PREEMPT_RT
 DECLARE_PER_CPU(struct task_struct *, timersd);
+DECLARE_PER_CPU(unsigned long, pending_timer_softirq);
+
 extern void raise_timer_softirq(void);
 extern void raise_hrtimer_softirq(void);
 
+static inline unsigned int local_pending_timers(void)
+{
+        return __this_cpu_read(pending_timer_softirq);
+}
+
 #else
 static inline void raise_timer_softirq(void)
 {
@@ -620,6 +627,11 @@  static inline void raise_hrtimer_softirq(void)
 {
 	raise_softirq_irqoff(HRTIMER_SOFTIRQ);
 }
+
+static inline unsigned int local_pending_timers(void)
+{
+        return local_softirq_pending();
+}
 #endif
 
 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 89eb45614af6..c0aef5f760e5 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -625,12 +625,7 @@  static inline void tick_irq_exit(void)
 }
 
 DEFINE_PER_CPU(struct task_struct *, timersd);
-static DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
-
-static unsigned int local_pending_timers(void)
-{
-        return __this_cpu_read(pending_timer_softirq);
-}
+DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
 
 static void wake_timersd(void)
 {
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 17a283ce2b20..7c359f029b97 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -763,7 +763,7 @@  static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 
 static inline bool local_timer_softirq_pending(void)
 {
-	return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
+	return local_pending_timers() & BIT(TIMER_SOFTIRQ);
 }
 
 static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)