cpufreq: intel_pstate: Fix for HWP interrupt before driver is ready

Message ID 20210904053703.581297-1-srinivas.pandruvada@linux.intel.com
State New
Series cpufreq: intel_pstate: Fix for HWP interrupt before driver is ready

Commit Message

Srinivas Pandruvada Sept. 4, 2021, 5:37 a.m. UTC
On the Lenovo X1 gen9 laptop, HWP interrupts arrive before the driver is
ready to handle them on that CPU. At that point the driver has not even
allocated memory for the per-CPU data structure, nor started the interrupt
enable process on that CPU, so the interrupt handler observes a NULL
pointer when it tries to schedule work.

This interrupt was probably meant for SMM, but since it is redirected to
the OS by the _OSC call, the OS receives it while not yet ready to handle
it. That redirection of the interrupt to the OS was done a while back to
solve an SMM crash on the Yoga 260 for the HWP interrupt.

To solve this, the HWP interrupt handler should ignore such requests when
the driver is not ready. This requires some flag to wait until the driver
has set up a workqueue to handle the interrupt on a CPU. We can't simply
test cpudata for NULL and skip processing, as it may be non-NULL while the
data structure is not yet in a consistent state.

So create a cpumask in which a CPU's bit is set once the interrupt has
been set up on it. If it is not set up, simply clear the interrupt status
and return. Since a similar issue can happen during S3 resume, clear the
bit during offline.

Since the interrupt may arrive before HWP is enabled, use the safe MSR
read/write variants for the HWP interrupt, as before this change.

Fixes: d0e936adbd22 ("cpufreq: intel_pstate: Process HWP Guaranteed change notification")
Reported-and-tested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 drivers/cpufreq/intel_pstate.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

Comments

Rafael J. Wysocki Sept. 6, 2021, 4:17 p.m. UTC | #1
On Sat, Sep 4, 2021 at 7:37 AM Srinivas Pandruvada
<srinivas.pandruvada@linux.intel.com> wrote:
>
> [...]
>
> -       rdmsrl(MSR_HWP_STATUS, value);
> +       rdmsrl_safe(MSR_HWP_STATUS, &value);
>         if (!(value & 0x01))
>                 return;
>
> +       if (!cpumask_test_cpu(this_cpu, &hwp_intr_enable_mask)) {
> +               wrmsrl_safe(MSR_HWP_STATUS, 0);
> +               return;
> +       }

Without additional locking, there is a race between this and
intel_pstate_disable_hwp_interrupt().

1. notify_hwp_interrupt() checks hwp_intr_enable_mask and the target
CPU is in there, so it will go for scheduling the delayed work.
2. intel_pstate_disable_hwp_interrupt() runs between the check and the
cpudata load below.
3. hwp_notify_work is scheduled on the CPU that isn't there in the
mask any more.

I think that this should be avoided?
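
Spelled out as a timeline (a sketch, using the names from the patch):

/*
 * CPU A: notify_hwp_interrupt() (ISR)   CPU B: intel_pstate_disable_hwp_interrupt()
 *
 * cpumask_test_cpu(A, mask) == true
 *                                       cpumask_test_and_clear_cpu(A, mask)
 *                                       wrmsrl_on_cpu(A, MSR_HWP_INTERRUPT, 0x00)
 *                                       cancel_delayed_work_sync(&cpudata->hwp_notify_work)
 * cpudata = all_cpu_data[A];
 * schedule_delayed_work_on(A, ...);
 *
 * The work ends up queued again after the cancel, for a CPU that is no
 * longer in hwp_intr_enable_mask.
 */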

> +
>         cpudata = all_cpu_data[this_cpu];
>         schedule_delayed_work_on(this_cpu, &cpudata->hwp_notify_work, msecs_to_jiffies(10));
>  }
>
> [...]
Jens Axboe Sept. 6, 2021, 4:43 p.m. UTC | #2
On 9/6/21 10:17 AM, Rafael J. Wysocki wrote:
> On Sat, Sep 4, 2021 at 7:37 AM Srinivas Pandruvada
> <srinivas.pandruvada@linux.intel.com> wrote:
>> [...]
>
> Without additional locking, there is a race between this and
> intel_pstate_disable_hwp_interrupt().
>
> 1. notify_hwp_interrupt() checks hwp_intr_enable_mask and the target
> CPU is in there, so it will go for scheduling the delayed work.
> 2. intel_pstate_disable_hwp_interrupt() runs between the check and the
> cpudata load below.
> 3. hwp_notify_work is scheduled on the CPU that isn't there in the
> mask any more.

I noticed that too, not clear to me how much of an issue that is in
practice. But there's definitely a race there.

Can we just revert d0e936adbd22 for now so this can get solved in due
time? I doubt I'm the only one that got bit by this, but I might be the
only one that bothered to actually figure out what caused my laptop to
crash with a black screen on boot.

-- 
Jens Axboe
Srinivas Pandruvada Sept. 6, 2021, 4:53 p.m. UTC | #3
On Mon, 2021-09-06 at 18:17 +0200, Rafael J. Wysocki wrote:
> On Sat, Sep 4, 2021 at 7:37 AM Srinivas Pandruvada
> <srinivas.pandruvada@linux.intel.com> wrote:
> > [...]
>
> Without additional locking, there is a race between this and
> intel_pstate_disable_hwp_interrupt().
>

Let's check.

> 1. notify_hwp_interrupt() checks hwp_intr_enable_mask and the target
> CPU is in there, so it will go for scheduling the delayed work.



> 2. intel_pstate_disable_hwp_interrupt() runs between the check and
> the cpudata load below.


You mean it didn't call schedule_delayed_work_on() yet. We are running in
the top half of the interrupt handler here, which will not be preempted on
this CPU. intel_pstate_disable_hwp_interrupt() runs in task context,
which can't preempt the ISR.



> 3. hwp_notify_work is scheduled on the CPU that isn't there in the
> mask any more.


Not sure how the top half can be preempted; it can only be preempted by a
higher-priority interrupt, and the offline callbacks do not run as ISRs.

If intel_pstate_disable_hwp_interrupt() starts first and the interrupt
then arrives, the ISR will start running; if the CPU has already been
removed from the mask, the ISR will not do anything. If the interrupt
arrives before that, the ISR will run to completion.

Thanks,
Srinivas


>
> I think that this should be avoided?
>
> > [...]
Srinivas Pandruvada Sept. 6, 2021, 4:54 p.m. UTC | #4
On Mon, 2021-09-06 at 10:43 -0600, Jens Axboe wrote:
> On 9/6/21 10:17 AM, Rafael J. Wysocki wrote:
> > On Sat, Sep 4, 2021 at 7:37 AM Srinivas Pandruvada
> > <srinivas.pandruvada@linux.intel.com> wrote:
> > > [...]
> >
> > Without additional locking, there is a race between this and
> > intel_pstate_disable_hwp_interrupt().
> > [...]
>
> I noticed that too, not clear to me how much of an issue that is in
> practice. But there's definitely a race there.

Glad to see how this is possible from code running in ISR context.

Thanks,
Srinivas

>
> Can we just revert d0e936adbd22 for now so this can get solved in due
> time? I doubt I'm the only one that got bit by this, but I might be the
> only one that bothered to actually figure out what caused my laptop to
> crash with a black screen on boot.
>
Rafael J. Wysocki Sept. 6, 2021, 4:58 p.m. UTC | #5
On Mon, Sep 6, 2021 at 6:55 PM Srinivas Pandruvada
<srinivas.pandruvada@linux.intel.com> wrote:
>
> On Mon, 2021-09-06 at 10:43 -0600, Jens Axboe wrote:
> > On 9/6/21 10:17 AM, Rafael J. Wysocki wrote:
> > > [...]
> > >
> > > Without additional locking, there is a race between this and
> > > intel_pstate_disable_hwp_interrupt().
> > > [...]
> >
> > I noticed that too, not clear to me how much of an issue that is in
> > practice. But there's definitely a race there.
> Glad to see how this is possible from code running in ISR context.


intel_pstate_disable_hwp_interrupt() may very well run on a different
CPU in parallel with the interrupt handler running on this CPU.  Or is
this not possible for some reason?
Srinivas Pandruvada Sept. 6, 2021, 5:23 p.m. UTC | #6
On Mon, 2021-09-06 at 18:58 +0200, Rafael J. Wysocki wrote:
> On Mon, Sep 6, 2021 at 6:55 PM Srinivas Pandruvada
> <srinivas.pandruvada@linux.intel.com> wrote:
> > [...]
> > Glad to see how this is possible from code running in ISR context.
>
> intel_pstate_disable_hwp_interrupt() may very well run on a different
> CPU in parallel with the interrupt handler running on this CPU.  Or is
> this not possible for some reason?

I see the offline callback is called from the cpufreq core's hotplug
online/offline callback, so this should run the call on the target CPU.
From the documentation:
"The states CPUHP_AP_OFFLINE … CPUHP_AP_ONLINE are invoked just after
the CPU has been brought up. The interrupts are off and the scheduler
is not yet active on this CPU. Starting with CPUHP_AP_OFFLINE the
callbacks are invoked on the target CPU."

The only other place it is called is from the subsys remove callback. Not
sure how you can remove the cpufreq subsys on the fly.

Let's say it is possible:
While running the ISR on a local CPU, how can someone pull the CPU before
its completion? If the CPU is going away after that, the workqueue is
unbound, so it will run on some other CPU; if that happens it will call
cpufreq update policy, which will be harmless.

From the comments around unbind_workers(int cpu) in kernel/workqueue.c:
"This is solved by allowing the pools to be disassociated from the CPU
running as an unbound one and allowing it to be reattached later if the
cpu comes back online."


Thanks,
Srinivas
Rafael J. Wysocki Sept. 6, 2021, 5:54 p.m. UTC | #7
On Mon, Sep 6, 2021 at 7:23 PM Srinivas Pandruvada
<srinivas.pandruvada@linux.intel.com> wrote:
>
> [...]
>
> The only other place it is called is from the subsys remove callback. Not
> sure how you can remove the cpufreq subsys on the fly.

cpufreq_unregister_driver() causes this to happen.

> Let's say it is possible:
> While running the ISR on a local CPU, how can someone pull the CPU before
> its completion? If the CPU is going away after that, the workqueue is
> unbound, so it will run on some other CPU; if that happens it will call
> cpufreq update policy, which will be harmless.


Well, it looks to me like if you are super-unlucky, the ISR may run on
the local CPU in parallel with intel_pstate_update_status() running on
a remote one and so dereferencing cpudata from it is generally unsafe.
In theory.  In practice it is unlikely to become problematic for
timing reasons AFAICS.

Anyway, I would consider using RCU here to stay on the safe side.
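
A minimal sketch of what that could look like (an illustration only, using
the names from the patch; not a tested implementation, and not what ended
up upstream):

void notify_hwp_interrupt(void)
{
	unsigned int this_cpu = smp_processor_id();
	struct cpudata *cpudata;
	u64 value;

	if (!hwp_active || !boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
		return;

	rdmsrl_safe(MSR_HWP_STATUS, &value);
	if (!(value & 0x01))
		return;

	rcu_read_lock();	/* cover both the mask check and the cpudata use */
	if (!cpumask_test_cpu(this_cpu, &hwp_intr_enable_mask)) {
		wrmsrl_safe(MSR_HWP_STATUS, 0);
		rcu_read_unlock();
		return;
	}
	cpudata = all_cpu_data[this_cpu];
	schedule_delayed_work_on(this_cpu, &cpudata->hwp_notify_work,
				 msecs_to_jiffies(10));
	rcu_read_unlock();
}

static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata)
{
	if (cpumask_test_and_clear_cpu(cpudata->cpu, &hwp_intr_enable_mask)) {
		synchronize_rcu();	/* wait for handlers that saw the bit set */
		wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
		cancel_delayed_work_sync(&cpudata->hwp_notify_work);
	}
}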
Srinivas Pandruvada Sept. 6, 2021, 6:14 p.m. UTC | #8
On Mon, 2021-09-06 at 19:54 +0200, Rafael J. Wysocki wrote:
> On Mon, Sep 6, 2021 at 7:23 PM Srinivas Pandruvada
> <srinivas.pandruvada@linux.intel.com> wrote:
> > [...]
>
> Well, it looks to me like if you are super-unlucky, the ISR may run on
> the local CPU in parallel with intel_pstate_update_status() running on
> a remote one and so dereferencing cpudata from it is generally unsafe.
> In theory.  In practice it is unlikely to become problematic for
> timing reasons AFAICS.

We are handling offline for other thermal interrupt sources from the same
interrupt in therm-throt.c, where we do something similar in the offline
path (by TGLX). If cpufreq offline can cause such an issue of changing
CPUs, I can call intel_pstate_disable_hwp_interrupt() via an override from
https://elixir.bootlin.com/linux/latest/C/ident/thermal_throttle_offline
after masking the APIC interrupt.

Thanks,
Srinivas

> Anyway, I would consider using RCU here to stay on the safe side.
Rafael J. Wysocki Sept. 6, 2021, 6:25 p.m. UTC | #9
On Mon, Sep 6, 2021 at 8:14 PM Srinivas Pandruvada
<srinivas.pandruvada@linux.intel.com> wrote:
>
> On Mon, 2021-09-06 at 19:54 +0200, Rafael J. Wysocki wrote:
> > [...]
>
> We are handling offline for other thermal interrupt sources from the same
> interrupt in therm-throt.c, where we do something similar in the offline
> path (by TGLX). If cpufreq offline can cause such an issue of changing
> CPUs,


This is not cpufreq offline, but intel_pstate_update_status() which
may be triggered via sysfs.  And again, the theoretically problematic
thing is dereferencing cpudata (which may be cleared by a remote CPU)
from the interrupt handler without protection.

> I can call intel_pstate_disable_hwp_interrupt() via an override from
> https://elixir.bootlin.com/linux/latest/C/ident/thermal_throttle_offline
> after masking the APIC interrupt.


But why would using RCU be harder than this?

Also please note that on RT kernels interrupt handlers are run in threads.
Srinivas Pandruvada Sept. 6, 2021, 7:56 p.m. UTC | #10
On Mon, 2021-09-06 at 20:25 +0200, Rafael J. Wysocki wrote:
> On Mon, Sep 6, 2021 at 8:14 PM Srinivas Pandruvada
> <srinivas.pandruvada@linux.intel.com> wrote:
> >
> > On Mon, 2021-09-06 at 19:54 +0200, Rafael J. Wysocki wrote:
> > > [...]
> >
> > We are handling offline for other thermal interrupt sources from the
> > same interrupt in therm-throt.c, where we do something similar in the
> > offline path (by TGLX). If cpufreq offline can cause such an issue of
> > changing CPUs,
>
> This is not cpufreq offline, but intel_pstate_update_status() which
> may be triggered via sysfs.  And again, the theoretically problematic
> thing is dereferencing cpudata (which may be cleared by a remote CPU)
> from the interrupt handler without protection.


This will be a problem.

>
> > I can call intel_pstate_disable_hwp_interrupt() via an override from
> > https://elixir.bootlin.com/linux/latest/C/ident/thermal_throttle_offline
> > after masking the APIC interrupt.
>
> But why would using RCU be harder than this?

I think this will require all_cpu_data and cpu_data to be RCU
protected. This needs to be well tested.

I think it is better to revert the patch for the next release.

Thanks,
Srinivas

>
> Also please note that on RT kernels interrupt handlers are run in
> threads.
Rafael J. Wysocki Sept. 7, 2021, 1:41 p.m. UTC | #11
On Mon, Sep 6, 2021 at 9:57 PM Srinivas Pandruvada
<srinivas.pandruvada@linux.intel.com> wrote:
>
> [...]
>
> > But why would using RCU be harder than this?
> I think this will require all_cpu_data and cpu_data to be RCU
> protected. This needs to be well tested.
>
> I think it is better to revert the patch for the next release.


Done, thanks!

Patch

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index b4ffe6c8a0d0..5ac86bfa1080 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -298,6 +298,8 @@ static bool hwp_boost __read_mostly;
 
 static struct cpufreq_driver *intel_pstate_driver __read_mostly;
 
+static cpumask_t hwp_intr_enable_mask;
+
 #ifdef CONFIG_ACPI
 static bool acpi_ppc;
 #endif
@@ -1067,11 +1069,15 @@ static void intel_pstate_hwp_set(unsigned int cpu)
 	wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
 }
 
+static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata);
+
 static void intel_pstate_hwp_offline(struct cpudata *cpu)
 {
 	u64 value = READ_ONCE(cpu->hwp_req_cached);
 	int min_perf;
 
+	intel_pstate_disable_hwp_interrupt(cpu);
+
 	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
 		/*
 		 * In case the EPP has been set to "performance" by the
@@ -1645,20 +1651,35 @@ void notify_hwp_interrupt(void)
 	if (!hwp_active || !boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
 		return;
 
-	rdmsrl(MSR_HWP_STATUS, value);
+	rdmsrl_safe(MSR_HWP_STATUS, &value);
 	if (!(value & 0x01))
 		return;
 
+	if (!cpumask_test_cpu(this_cpu, &hwp_intr_enable_mask)) {
+		wrmsrl_safe(MSR_HWP_STATUS, 0);
+		return;
+	}
+
 	cpudata = all_cpu_data[this_cpu];
 	schedule_delayed_work_on(this_cpu, &cpudata->hwp_notify_work, msecs_to_jiffies(10));
 }
 
+static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata)
+{
+
+	if (cpumask_test_and_clear_cpu(cpudata->cpu, &hwp_intr_enable_mask)) {
+		wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
+		cancel_delayed_work_sync(&cpudata->hwp_notify_work);
+	}
+}
+
 static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata)
 {
 	/* Enable HWP notification interrupt for guaranteed performance change */
 	if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) {
 		INIT_DELAYED_WORK(&cpudata->hwp_notify_work, intel_pstate_notify_work);
 		wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x01);
+		cpumask_set_cpu(cpudata->cpu, &hwp_intr_enable_mask);
 	}
 }
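
For reference, notify_hwp_interrupt() as it would read with this patch
applied (reconstructed from the hunks above; the local variable
declarations are assumed from the pre-patch function):

void notify_hwp_interrupt(void)
{
	unsigned int this_cpu = smp_processor_id();
	struct cpudata *cpudata;
	u64 value;

	if (!hwp_active || !boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
		return;

	/* Safe read: the interrupt may fire before HWP is enabled. */
	rdmsrl_safe(MSR_HWP_STATUS, &value);
	if (!(value & 0x01))
		return;

	/* Driver not ready on this CPU: ack the interrupt and bail out. */
	if (!cpumask_test_cpu(this_cpu, &hwp_intr_enable_mask)) {
		wrmsrl_safe(MSR_HWP_STATUS, 0);
		return;
	}

	cpudata = all_cpu_data[this_cpu];
	schedule_delayed_work_on(this_cpu, &cpudata->hwp_notify_work,
				 msecs_to_jiffies(10));
}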