diff mbox series

[1/5] thermal/drivers/mediatek/lvts: Disable monitor mode during suspend

Message ID 20241125-mt8192-lvts-filtered-suspend-fix-v1-1-42e3c0528c6c@collabora.com
State Superseded
Headers show
Series [1/5] thermal/drivers/mediatek/lvts: Disable monitor mode during suspend | expand

Commit Message

Nícolas F. R. A. Prado Nov. 25, 2024, 9:20 p.m. UTC
When configured in filtered mode, the LVTS thermal controller will
monitor the temperature from the sensors and trigger an interrupt once a
thermal threshold is crossed.

Currently this is true even during suspend and resume. The problem with
that is that when enabling the internal clock of the LVTS controller in
lvts_ctrl_set_enable() during resume, the temperature reading can glitch
and appear much higher than the real one, resulting in a spurious
interrupt getting generated.

Disable the temperature monitoring and give some time for the signals to
stabilize during suspend in order to prevent such spurious interrupts.

Cc: stable@vger.kernel.org
Reported-by: Hsin-Te Yuan <yuanhsinte@chromium.org>
Closes: https://lore.kernel.org/all/20241108-lvts-v1-1-eee339c6ca20@chromium.org/
Fixes: 8137bb90600d ("thermal/drivers/mediatek/lvts_thermal: Add suspend and resume")
Signed-off-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
---
 drivers/thermal/mediatek/lvts_thermal.c | 36 +++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

Comments

Hsin-Te Yuan Nov. 26, 2024, 8 a.m. UTC | #1
On Tue, Nov 26, 2024 at 5:21 AM Nícolas F. R. A. Prado
<nfraprado@collabora.com> wrote:
>
> When configured in filtered mode, the LVTS thermal controller will
> monitor the temperature from the sensors and trigger an interrupt once a
> thermal threshold is crossed.
>
> Currently this is true even during suspend and resume. The problem with
> that is that when enabling the internal clock of the LVTS controller in
> lvts_ctrl_set_enable() during resume, the temperature reading can glitch
> and appear much higher than the real one, resulting in a spurious
> interrupt getting generated.
>
This sounds weird to me. On my end, the symptom is that the device
sometimes cannot suspend.
To be more precise, `echo mem > /sys/power/state` returns almost
immediately. I think the irq is more
likely to be triggered during suspension.


> Disable the temperature monitoring and give some time for the signals to
> stabilize during suspend in order to prevent such spurious interrupts.
>
> Cc: stable@vger.kernel.org
> Reported-by: Hsin-Te Yuan <yuanhsinte@chromium.org>
> Closes: https://lore.kernel.org/all/20241108-lvts-v1-1-eee339c6ca20@chromium.org/
> Fixes: 8137bb90600d ("thermal/drivers/mediatek/lvts_thermal: Add suspend and resume")
> Signed-off-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
> ---
>  drivers/thermal/mediatek/lvts_thermal.c | 36 +++++++++++++++++++++++++++++++--
>  1 file changed, 34 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c
> index 1997e91bb3be94a3059db619238aa5787edc7675..a92ff2325c40704adc537af6995b34f93c3b0650 100644
> --- a/drivers/thermal/mediatek/lvts_thermal.c
> +++ b/drivers/thermal/mediatek/lvts_thermal.c
> @@ -860,6 +860,32 @@ static int lvts_ctrl_init(struct device *dev, struct lvts_domain *lvts_td,
>         return 0;
>  }
>
> +static void lvts_ctrl_monitor_enable(struct device *dev, struct lvts_ctrl *lvts_ctrl, bool enable)
> +{
> +       /*
> +        * Bitmaps to enable each sensor on filtered mode in the MONCTL0
> +        * register.
> +        */
> +       u32 sensor_filt_bitmap[] = { BIT(0), BIT(1), BIT(2), BIT(3) };
> +       u32 sensor_map = 0;
> +       int i;
> +
> +       if (lvts_ctrl->mode != LVTS_MSR_FILTERED_MODE)
> +               return;
> +
> +       if (enable) {
> +               lvts_for_each_valid_sensor(i, lvts_ctrl)
> +                       sensor_map |= sensor_filt_bitmap[i];
> +       }
> +
> +       /*
> +        * Bits:
> +        *      9: Single point access flow
> +        *    0-3: Enable sensing point 0-3
> +        */
> +       writel(sensor_map | BIT(9), LVTS_MONCTL0(lvts_ctrl->base));
> +}
> +
>  /*
>   * At this point the configuration register is the only place in the
>   * driver where we write multiple values. Per hardware constraint,
> @@ -1381,8 +1407,11 @@ static int lvts_suspend(struct device *dev)
>
>         lvts_td = dev_get_drvdata(dev);
>
> -       for (i = 0; i < lvts_td->num_lvts_ctrl; i++)
> +       for (i = 0; i < lvts_td->num_lvts_ctrl; i++) {
> +               lvts_ctrl_monitor_enable(dev, &lvts_td->lvts_ctrl[i], false);
> +               usleep_range(100, 200);
>                 lvts_ctrl_set_enable(&lvts_td->lvts_ctrl[i], false);
> +       }
>
>         clk_disable_unprepare(lvts_td->clk);
>
> @@ -1400,8 +1429,11 @@ static int lvts_resume(struct device *dev)
>         if (ret)
>                 return ret;
>
> -       for (i = 0; i < lvts_td->num_lvts_ctrl; i++)
> +       for (i = 0; i < lvts_td->num_lvts_ctrl; i++) {
>                 lvts_ctrl_set_enable(&lvts_td->lvts_ctrl[i], true);
> +               usleep_range(100, 200);
> +               lvts_ctrl_monitor_enable(dev, &lvts_td->lvts_ctrl[i], true);
> +       }
>
>         return 0;
>  }
>
> --
> 2.47.0
>
AngeloGioacchino Del Regno Nov. 26, 2024, 9:43 a.m. UTC | #2
Il 25/11/24 22:20, Nícolas F. R. A. Prado ha scritto:
> When configured in filtered mode, the LVTS thermal controller will
> monitor the temperature from the sensors and trigger an interrupt once a
> thermal threshold is crossed.
> 
> Currently this is true even during suspend and resume. The problem with
> that is that when enabling the internal clock of the LVTS controller in
> lvts_ctrl_set_enable() during resume, the temperature reading can glitch
> and appear much higher than the real one, resulting in a spurious
> interrupt getting generated.
> 
> Disable the temperature monitoring and give some time for the signals to
> stabilize during suspend in order to prevent such spurious interrupts.
> 
> Cc: stable@vger.kernel.org
> Reported-by: Hsin-Te Yuan <yuanhsinte@chromium.org>
> Closes: https://lore.kernel.org/all/20241108-lvts-v1-1-eee339c6ca20@chromium.org/
> Fixes: 8137bb90600d ("thermal/drivers/mediatek/lvts_thermal: Add suspend and resume")
> Signed-off-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
> ---
>   drivers/thermal/mediatek/lvts_thermal.c | 36 +++++++++++++++++++++++++++++++--
>   1 file changed, 34 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c
> index 1997e91bb3be94a3059db619238aa5787edc7675..a92ff2325c40704adc537af6995b34f93c3b0650 100644
> --- a/drivers/thermal/mediatek/lvts_thermal.c
> +++ b/drivers/thermal/mediatek/lvts_thermal.c
> @@ -860,6 +860,32 @@ static int lvts_ctrl_init(struct device *dev, struct lvts_domain *lvts_td,
>   	return 0;
>   }
>   
> +static void lvts_ctrl_monitor_enable(struct device *dev, struct lvts_ctrl *lvts_ctrl, bool enable)
> +{
> +	/*
> +	 * Bitmaps to enable each sensor on filtered mode in the MONCTL0
> +	 * register.
> +	 */
> +	u32 sensor_filt_bitmap[] = { BIT(0), BIT(1), BIT(2), BIT(3) };
> +	u32 sensor_map = 0;
> +	int i;
> +
> +	if (lvts_ctrl->mode != LVTS_MSR_FILTERED_MODE)
> +		return;
> +

That's easier and shorter:

static void lvts_ctrl_monitor_enable( .... )
{
	/* Bitmap to enable each sensor on filtered mode in the MONCTL0 register */
	const u32 sensor_map = GENMASK(3, 0);

	if (lvts_ctrl->mode != LVTS_MSR_FILTERED_MODE)
		return;

	/* Bits 0-3: Sensing points - Bit 9: Single point access flow */
	if (enable)
		writel(sensor_map | BIT(9), LVTS_MONCTL0(lvts_ctrl->base));
	else
		writel(BIT(9), LVTS_MONCTL0 ....
}


Cheers,
Angelo

> +	if (enable) {
> +		lvts_for_each_valid_sensor(i, lvts_ctrl)
> +			sensor_map |= sensor_filt_bitmap[i];
> +	}
> +
> +	/*
> +	 * Bits:
> +	 *      9: Single point access flow
> +	 *    0-3: Enable sensing point 0-3
> +	 */
> +	writel(sensor_map | BIT(9), LVTS_MONCTL0(lvts_ctrl->base));
> +}
> +
>   /*
>    * At this point the configuration register is the only place in the
>    * driver where we write multiple values. Per hardware constraint,
> @@ -1381,8 +1407,11 @@ static int lvts_suspend(struct device *dev)
>   
>   	lvts_td = dev_get_drvdata(dev);
>   
> -	for (i = 0; i < lvts_td->num_lvts_ctrl; i++)
> +	for (i = 0; i < lvts_td->num_lvts_ctrl; i++) {
> +		lvts_ctrl_monitor_enable(dev, &lvts_td->lvts_ctrl[i], false);
> +		usleep_range(100, 200);
>   		lvts_ctrl_set_enable(&lvts_td->lvts_ctrl[i], false);
> +	}
>   
>   	clk_disable_unprepare(lvts_td->clk);
>   
> @@ -1400,8 +1429,11 @@ static int lvts_resume(struct device *dev)
>   	if (ret)
>   		return ret;
>   
> -	for (i = 0; i < lvts_td->num_lvts_ctrl; i++)
> +	for (i = 0; i < lvts_td->num_lvts_ctrl; i++) {
>   		lvts_ctrl_set_enable(&lvts_td->lvts_ctrl[i], true);
> +		usleep_range(100, 200);
> +		lvts_ctrl_monitor_enable(dev, &lvts_td->lvts_ctrl[i], true);
> +	}
>   
>   	return 0;
>   }
>
Nícolas F. R. A. Prado Nov. 26, 2024, 1:37 p.m. UTC | #3
On Tue, Nov 26, 2024 at 04:00:42PM +0800, Hsin-Te Yuan wrote:
> On Tue, Nov 26, 2024 at 5:21 AM Nícolas F. R. A. Prado
> <nfraprado@collabora.com> wrote:
> >
> > When configured in filtered mode, the LVTS thermal controller will
> > monitor the temperature from the sensors and trigger an interrupt once a
> > thermal threshold is crossed.
> >
> > Currently this is true even during suspend and resume. The problem with
> > that is that when enabling the internal clock of the LVTS controller in
> > lvts_ctrl_set_enable() during resume, the temperature reading can glitch
> > and appear much higher than the real one, resulting in a spurious
> > interrupt getting generated.
> >
> This sounds weird to me. On my end, the symptom is that the device
> sometimes cannot suspend.
> To be more precise, `echo mem > /sys/power/state` returns almost
> immediately. I think the irq is more
> likely to be triggered during suspension.

Hi Hsin-Te,

please also check the first paragraph of the cover letter, and patch 2, that
should clarify it. But anyway, I can explain it here too:

The issue you observed is caused by two things combined:
* When returning from resume with filtered mode enabled, the sensor temperature
  reading can glitch, appearing much higher. (fixed by this patch)
* Since the Stage 3 threshold is enabled and configured to take the maximum
  reading from the sensors, it will be triggered by that glitch and bring the
  system into a state where it can no longer suspend, it will just resume right
  away. (fixed by patch 2)

So currently, every so often, during resume both these things will happen, and
any future suspend will resume right away. That's why this was never observed by
me when testing a single suspend/resume. It only breaks on resume, and only
affects future suspends, so you need to test multiple suspend/resumes on the
same run to observe this issue.

And also since both things are needed to cause this issue, if you apply only
patch 1 or only patch 2, it will already fix the issue.

Hope this clarifies it.

Thanks,
Nícolas
AngeloGioacchino Del Regno Nov. 26, 2024, 2:38 p.m. UTC | #4
Il 26/11/24 14:19, Nícolas F. R. A. Prado ha scritto:
> On Tue, Nov 26, 2024 at 10:43:55AM +0100, AngeloGioacchino Del Regno wrote:
>> Il 25/11/24 22:20, Nícolas F. R. A. Prado ha scritto:
>>> When configured in filtered mode, the LVTS thermal controller will
>>> monitor the temperature from the sensors and trigger an interrupt once a
>>> thermal threshold is crossed.
>>>
>>> Currently this is true even during suspend and resume. The problem with
>>> that is that when enabling the internal clock of the LVTS controller in
>>> lvts_ctrl_set_enable() during resume, the temperature reading can glitch
>>> and appear much higher than the real one, resulting in a spurious
>>> interrupt getting generated.
>>>
>>> Disable the temperature monitoring and give some time for the signals to
>>> stabilize during suspend in order to prevent such spurious interrupts.
>>>
>>> Cc: stable@vger.kernel.org
>>> Reported-by: Hsin-Te Yuan <yuanhsinte@chromium.org>
>>> Closes: https://lore.kernel.org/all/20241108-lvts-v1-1-eee339c6ca20@chromium.org/
>>> Fixes: 8137bb90600d ("thermal/drivers/mediatek/lvts_thermal: Add suspend and resume")
>>> Signed-off-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
>>> ---
>>>    drivers/thermal/mediatek/lvts_thermal.c | 36 +++++++++++++++++++++++++++++++--
>>>    1 file changed, 34 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c
>>> index 1997e91bb3be94a3059db619238aa5787edc7675..a92ff2325c40704adc537af6995b34f93c3b0650 100644
>>> --- a/drivers/thermal/mediatek/lvts_thermal.c
>>> +++ b/drivers/thermal/mediatek/lvts_thermal.c
>>> @@ -860,6 +860,32 @@ static int lvts_ctrl_init(struct device *dev, struct lvts_domain *lvts_td,
>>>    	return 0;
>>>    }
>>> +static void lvts_ctrl_monitor_enable(struct device *dev, struct lvts_ctrl *lvts_ctrl, bool enable)
>>> +{
>>> +	/*
>>> +	 * Bitmaps to enable each sensor on filtered mode in the MONCTL0
>>> +	 * register.
>>> +	 */
>>> +	u32 sensor_filt_bitmap[] = { BIT(0), BIT(1), BIT(2), BIT(3) };
>>> +	u32 sensor_map = 0;
>>> +	int i;
>>> +
>>> +	if (lvts_ctrl->mode != LVTS_MSR_FILTERED_MODE)
>>> +		return;
>>> +
>>
>> That's easier and shorter:
>>
>> static void lvts_ctrl_monitor_enable( .... )
>> {
>> 	/* Bitmap to enable each sensor on filtered mode in the MONCTL0 register */
>> 	const u32 sensor_map = GENMASK(3, 0);
>>
>> 	if (lvts_ctrl->mode != LVTS_MSR_FILTERED_MODE)
>> 		return;
>>
>> 	/* Bits 0-3: Sensing points - Bit 9: Single point access flow */
>> 	if (enable)
>> 		writel(sensor_map | BIT(9), LVTS_MONCTL0(lvts_ctrl->base));
> 
> Wait, no, here you're enabling all the sensors in the controller. We only want
> to enable ones that are valid, otherwise we might get garbage data and irqs from
> sensors that aren't actually there. That's why I use the
> lvts_for_each_valid_sensor() helper in this patch.
> 

Whoa, my brain actually missed the lvts_for_each_valid_sensor()!

Okay no, then you're right - sorry for the bad example! In that case, though, I
still have one more comment.

You can constify sensor_filt_bitmap, and since the values never go higher than
BIT(3), you should also be able to spare some memory by turning that into a u8:

	const u8 sensor_filt_bitmap[] = { BIT(0), BIT(1), BIT(2), BIT(3) };

...and then I assume that there's no way valid sensors could ever read from an
index that is more than 4 (so, I assume that there's no way the loop tries to
read out of the array upper boundary).

In which case - after at least constifying the sensor_filt_bitmap array, for v2
feel free to add my

Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>

...and sorry again for the initial miss :-)

Cheers,
Angelo
Hsin-Te Yuan Nov. 27, 2024, 7:27 a.m. UTC | #5
On Tue, Nov 26, 2024 at 9:37 PM Nícolas F. R. A. Prado
<nfraprado@collabora.com> wrote:
>
> On Tue, Nov 26, 2024 at 04:00:42PM +0800, Hsin-Te Yuan wrote:
> > On Tue, Nov 26, 2024 at 5:21 AM Nícolas F. R. A. Prado
> > <nfraprado@collabora.com> wrote:
> > >
> > > When configured in filtered mode, the LVTS thermal controller will
> > > monitor the temperature from the sensors and trigger an interrupt once a
> > > thermal threshold is crossed.
> > >
> > > Currently this is true even during suspend and resume. The problem with
> > > that is that when enabling the internal clock of the LVTS controller in
> > > lvts_ctrl_set_enable() during resume, the temperature reading can glitch
> > > and appear much higher than the real one, resulting in a spurious
> > > interrupt getting generated.
> > >
> > This sounds weird to me. On my end, the symptom is that the device
> > sometimes cannot suspend.
> > To be more precise, `echo mem > /sys/power/state` returns almost
> > immediately. I think the irq is more
> > likely to be triggered during suspension.
>
> Hi Hsin-Te,
>
> please also check the first paragraph of the cover letter, and patch 2, that
> should clarify it. But anyway, I can explain it here too:
>
> The issue you observed is caused by two things combined:
> * When returning from resume with filtered mode enabled, the sensor temperature
>   reading can glitch, appearing much higher. (fixed by this patch)
> * Since the Stage 3 threshold is enabled and configured to take the maximum
>   reading from the sensors, it will be triggered by that glitch and bring the
>   system into a state where it can no longer suspend, it will just resume right
>   away. (fixed by patch 2)
>
> So currently, every so often, during resume both these things will happen, and
> any future suspend will resume right away. That's why this was never observed by
> me when testing a single suspend/resume. It only breaks on resume, and only
> affects future suspends, so you need to test multiple suspend/resumes on the
> same run to observe this issue.
>
> And also since both things are needed to cause this issue, if you apply only
> patch 1 or only patch 2, it will already fix the issue.
>
> Hope this clarifies it.
>
> Thanks,
> Nícolas

Thanks for the explanation!

Regards,
Hsin-Te
diff mbox series

Patch

diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c
index 1997e91bb3be94a3059db619238aa5787edc7675..a92ff2325c40704adc537af6995b34f93c3b0650 100644
--- a/drivers/thermal/mediatek/lvts_thermal.c
+++ b/drivers/thermal/mediatek/lvts_thermal.c
@@ -860,6 +860,32 @@  static int lvts_ctrl_init(struct device *dev, struct lvts_domain *lvts_td,
 	return 0;
 }
 
+static void lvts_ctrl_monitor_enable(struct device *dev, struct lvts_ctrl *lvts_ctrl, bool enable)
+{
+	/*
+	 * Bitmaps to enable each sensor on filtered mode in the MONCTL0
+	 * register.
+	 */
+	u32 sensor_filt_bitmap[] = { BIT(0), BIT(1), BIT(2), BIT(3) };
+	u32 sensor_map = 0;
+	int i;
+
+	if (lvts_ctrl->mode != LVTS_MSR_FILTERED_MODE)
+		return;
+
+	if (enable) {
+		lvts_for_each_valid_sensor(i, lvts_ctrl)
+			sensor_map |= sensor_filt_bitmap[i];
+	}
+
+	/*
+	 * Bits:
+	 *      9: Single point access flow
+	 *    0-3: Enable sensing point 0-3
+	 */
+	writel(sensor_map | BIT(9), LVTS_MONCTL0(lvts_ctrl->base));
+}
+
 /*
  * At this point the configuration register is the only place in the
  * driver where we write multiple values. Per hardware constraint,
@@ -1381,8 +1407,11 @@  static int lvts_suspend(struct device *dev)
 
 	lvts_td = dev_get_drvdata(dev);
 
-	for (i = 0; i < lvts_td->num_lvts_ctrl; i++)
+	for (i = 0; i < lvts_td->num_lvts_ctrl; i++) {
+		lvts_ctrl_monitor_enable(dev, &lvts_td->lvts_ctrl[i], false);
+		usleep_range(100, 200);
 		lvts_ctrl_set_enable(&lvts_td->lvts_ctrl[i], false);
+	}
 
 	clk_disable_unprepare(lvts_td->clk);
 
@@ -1400,8 +1429,11 @@  static int lvts_resume(struct device *dev)
 	if (ret)
 		return ret;
 
-	for (i = 0; i < lvts_td->num_lvts_ctrl; i++)
+	for (i = 0; i < lvts_td->num_lvts_ctrl; i++) {
 		lvts_ctrl_set_enable(&lvts_td->lvts_ctrl[i], true);
+		usleep_range(100, 200);
+		lvts_ctrl_monitor_enable(dev, &lvts_td->lvts_ctrl[i], true);
+	}
 
 	return 0;
 }