diff mbox series

[v1,3/4] thermal: gov_bang_bang: Add .manage() callback

Message ID 8419356.T7Z3S40VBb@rjwysocki.net
State New
Headers show
Series thermal: gov_bang_bang: Prevent cooling devices from getting stuck in the "on" state | expand

Commit Message

Rafael J. Wysocki Aug. 13, 2024, 2:27 p.m. UTC
From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

After recent changes, the Bang-bang governor may not adjust the
initial configuration of cooling devices to the actual situation.

Namely, if a cooling device bound to a certain trip point starts in
the "on" state and the thermal zone temperature is below the threshold
of that trip point, the trip point may never be crossed on the way up
in which case the state of the cooling device will never be adjusted
because the thermal core will never invoke the governor's
.trip_crossed() callback.  [Note that there is no issue if the zone
temperature is at the trip threshold or above it to start with because
.trip_crossed() will be invoked then to indicate the start of thermal
mitigation for the given trip.]

To address this, add a .manage() callback to the Bang-bang governor
and use it to ensure that all of the thermal instances managed by the
governor have been initialized properly and the states of all of the
cooling devices involved have been adjusted to the current zone
temperature as appropriate.

Fixes: 530c932bdf75 ("thermal: gov_bang_bang: Use .trip_crossed() instead of .throttle()")
Link: https://lore.kernel.org/linux-pm/1bfbbae5-42b0-4c7d-9544-e98855715294@piie.net/
Cc: 6.10+ <stable@vger.kernel.org> # 6.10+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/gov_bang_bang.c |   30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

Comments

Peter Kästle Aug. 13, 2024, 9:04 p.m. UTC | #1
On 13.08.24 16:27, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> After recent changes, the Bang-bang governor may not adjust the
> initial configuration of cooling devices to the actual situation.
> 
> Namely, if a cooling device bound to a certain trip point starts in
> the "on" state and the thermal zone temperature is below the threshold
> of that trip point, the trip point may never be crossed on the way up
> in which case the state of the cooling device will never be adjusted
> because the thermal core will never invoke the governor's
> .trip_crossed() callback.  [Note that there is no issue if the zone
> temperature is at the trip threshold or above it to start with because
> .trip_crossed() will be invoked then to indicate the start of thermal
> mitigation for the given trip.]
> 
> To address this, add a .manage() callback to the Bang-bang governor
> and use it to ensure that all of the thermal instances managed by the
> governor have been initialized properly and the states of all of the
> cooling devices involved have been adjusted to the current zone
> temperature as appropriate.
> 
> Fixes: 530c932bdf75 ("thermal: gov_bang_bang: Use .trip_crossed() instead of .throttle()")
> Link: https://lore.kernel.org/linux-pm/1bfbbae5-42b0-4c7d-9544-e98855715294@piie.net/
> Cc: 6.10+ <stable@vger.kernel.org> # 6.10+
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Acked-by: Peter Kästle <peter@piie.net>

> ---
>   drivers/thermal/gov_bang_bang.c |   30 ++++++++++++++++++++++++++++++
>   1 file changed, 30 insertions(+)
> 
> Index: linux-pm/drivers/thermal/gov_bang_bang.c
> ===================================================================
> --- linux-pm.orig/drivers/thermal/gov_bang_bang.c
> +++ linux-pm/drivers/thermal/gov_bang_bang.c
> @@ -26,6 +26,7 @@ static void bang_bang_set_instance_targe
>   	 * when the trip is crossed on the way down.
>   	 */
>   	instance->target = target;
> +	instance->initialized = true;
>   
>   	dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target);
>   
> @@ -80,8 +81,37 @@ static void bang_bang_control(struct the
>   	}
>   }
>   
> +static void bang_bang_manage(struct thermal_zone_device *tz)
> +{
> +	const struct thermal_trip_desc *td;
> +	struct thermal_instance *instance;
> +
> +	for_each_trip_desc(tz, td) {
> +		const struct thermal_trip *trip = &td->trip;
> +
> +		if (tz->temperature >= td->threshold ||
> +		    trip->temperature == THERMAL_TEMP_INVALID ||
> +		    trip->type == THERMAL_TRIP_CRITICAL ||
> +		    trip->type == THERMAL_TRIP_HOT)
> +			continue;
> +
> +		/*
> +		 * If the initial cooling device state is "on", but the zone
> +		 * temperature is not above the trip point, the core will not
> +		 * call bang_bang_control() until the zone temperature reaches
> +		 * the trip point temperature which may be never.  In those
> +		 * cases, set the initial state of the cooling device to 0.
> +		 */
> +		list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
> +			if (!instance->initialized && instance->trip == trip)
> +				bang_bang_set_instance_target(instance, 0);
> +		}
> +	}
> +}
> +
>   static struct thermal_governor thermal_gov_bang_bang = {
>   	.name		= "bang_bang",
>   	.trip_crossed	= bang_bang_control,
> +	.manage		= bang_bang_manage,
>   };
>   THERMAL_GOVERNOR_DECLARE(thermal_gov_bang_bang);
> 
> 
>
Peter Kästle Aug. 13, 2024, 9:07 p.m. UTC | #2
On 13.08.24 16:27, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> After recent changes, the Bang-bang governor may not adjust the
> initial configuration of cooling devices to the actual situation.
> 
> Namely, if a cooling device bound to a certain trip point starts in
> the "on" state and the thermal zone temperature is below the threshold
> of that trip point, the trip point may never be crossed on the way up
> in which case the state of the cooling device will never be adjusted
> because the thermal core will never invoke the governor's
> .trip_crossed() callback.  [Note that there is no issue if the zone
> temperature is at the trip threshold or above it to start with because
> .trip_crossed() will be invoked then to indicate the start of thermal
> mitigation for the given trip.]
> 
> To address this, add a .manage() callback to the Bang-bang governor
> and use it to ensure that all of the thermal instances managed by the
> governor have been initialized properly and the states of all of the
> cooling devices involved have been adjusted to the current zone
> temperature as appropriate.
> 
> Fixes: 530c932bdf75 ("thermal: gov_bang_bang: Use .trip_crossed() instead of .throttle()")
> Link: https://lore.kernel.org/linux-pm/1bfbbae5-42b0-4c7d-9544-e98855715294@piie.net/
> Cc: 6.10+ <stable@vger.kernel.org> # 6.10+
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

oops, previously sent from wrong email address, here again from correct one...
Acked-by: Peter Kästle <peter@piie.net>

> ---
>   drivers/thermal/gov_bang_bang.c |   30 ++++++++++++++++++++++++++++++
>   1 file changed, 30 insertions(+)
> 
> Index: linux-pm/drivers/thermal/gov_bang_bang.c
> ===================================================================
> --- linux-pm.orig/drivers/thermal/gov_bang_bang.c
> +++ linux-pm/drivers/thermal/gov_bang_bang.c
> @@ -26,6 +26,7 @@ static void bang_bang_set_instance_targe
>   	 * when the trip is crossed on the way down.
>   	 */
>   	instance->target = target;
> +	instance->initialized = true;
>   
>   	dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target);
>   
> @@ -80,8 +81,37 @@ static void bang_bang_control(struct the
>   	}
>   }
>   
> +static void bang_bang_manage(struct thermal_zone_device *tz)
> +{
> +	const struct thermal_trip_desc *td;
> +	struct thermal_instance *instance;
> +
> +	for_each_trip_desc(tz, td) {
> +		const struct thermal_trip *trip = &td->trip;
> +
> +		if (tz->temperature >= td->threshold ||
> +		    trip->temperature == THERMAL_TEMP_INVALID ||
> +		    trip->type == THERMAL_TRIP_CRITICAL ||
> +		    trip->type == THERMAL_TRIP_HOT)
> +			continue;
> +
> +		/*
> +		 * If the initial cooling device state is "on", but the zone
> +		 * temperature is not above the trip point, the core will not
> +		 * call bang_bang_control() until the zone temperature reaches
> +		 * the trip point temperature which may be never.  In those
> +		 * cases, set the initial state of the cooling device to 0.
> +		 */
> +		list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
> +			if (!instance->initialized && instance->trip == trip)
> +				bang_bang_set_instance_target(instance, 0);
> +		}
> +	}
> +}
> +
>   static struct thermal_governor thermal_gov_bang_bang = {
>   	.name		= "bang_bang",
>   	.trip_crossed	= bang_bang_control,
> +	.manage		= bang_bang_manage,
>   };
>   THERMAL_GOVERNOR_DECLARE(thermal_gov_bang_bang);
> 
> 
>
Rafael J. Wysocki Aug. 14, 2024, 5:18 p.m. UTC | #3
On Wed, Aug 14, 2024 at 12:50 AM Peter Kästle <peter@piie.net> wrote:
>
> On 13.08.24 16:27, Rafael J. Wysocki wrote:
> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> >
> > After recent changes, the Bang-bang governor may not adjust the
> > initial configuration of cooling devices to the actual situation.
> >
> > Namely, if a cooling device bound to a certain trip point starts in
> > the "on" state and the thermal zone temperature is below the threshold
> > of that trip point, the trip point may never be crossed on the way up
> > in which case the state of the cooling device will never be adjusted
> > because the thermal core will never invoke the governor's
> > .trip_crossed() callback.  [Note that there is no issue if the zone
> > temperature is at the trip threshold or above it to start with because
> > .trip_crossed() will be invoked then to indicate the start of thermal
> > mitigation for the given trip.]
> >
> > To address this, add a .manage() callback to the Bang-bang governor
> > and use it to ensure that all of the thermal instances managed by the
> > governor have been initialized properly and the states of all of the
> > cooling devices involved have been adjusted to the current zone
> > temperature as appropriate.
> >
> > Fixes: 530c932bdf75 ("thermal: gov_bang_bang: Use .trip_crossed() instead of .throttle()")
> > Link: https://lore.kernel.org/linux-pm/1bfbbae5-42b0-4c7d-9544-e98855715294@piie.net/
> > Cc: 6.10+ <stable@vger.kernel.org> # 6.10+
> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>
> oops, previously sent from wrong email address, here again from correct one...
> Acked-by: Peter Kästle <peter@piie.net>

No worries and thanks for the ACKs!

I gather that they mean that the changes work for you.

> > ---
> >   drivers/thermal/gov_bang_bang.c |   30 ++++++++++++++++++++++++++++++
> >   1 file changed, 30 insertions(+)
> >
> > Index: linux-pm/drivers/thermal/gov_bang_bang.c
> > ===================================================================
> > --- linux-pm.orig/drivers/thermal/gov_bang_bang.c
> > +++ linux-pm/drivers/thermal/gov_bang_bang.c
> > @@ -26,6 +26,7 @@ static void bang_bang_set_instance_targe
> >        * when the trip is crossed on the way down.
> >        */
> >       instance->target = target;
> > +     instance->initialized = true;
> >
> >       dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target);
> >
> > @@ -80,8 +81,37 @@ static void bang_bang_control(struct the
> >       }
> >   }
> >
> > +static void bang_bang_manage(struct thermal_zone_device *tz)
> > +{
> > +     const struct thermal_trip_desc *td;
> > +     struct thermal_instance *instance;
> > +
> > +     for_each_trip_desc(tz, td) {
> > +             const struct thermal_trip *trip = &td->trip;
> > +
> > +             if (tz->temperature >= td->threshold ||
> > +                 trip->temperature == THERMAL_TEMP_INVALID ||
> > +                 trip->type == THERMAL_TRIP_CRITICAL ||
> > +                 trip->type == THERMAL_TRIP_HOT)
> > +                     continue;
> > +
> > +             /*
> > +              * If the initial cooling device state is "on", but the zone
> > +              * temperature is not above the trip point, the core will not
> > +              * call bang_bang_control() until the zone temperature reaches
> > +              * the trip point temperature which may be never.  In those
> > +              * cases, set the initial state of the cooling device to 0.
> > +              */
> > +             list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
> > +                     if (!instance->initialized && instance->trip == trip)
> > +                             bang_bang_set_instance_target(instance, 0);
> > +             }
> > +     }
> > +}
> > +
> >   static struct thermal_governor thermal_gov_bang_bang = {
> >       .name           = "bang_bang",
> >       .trip_crossed   = bang_bang_control,
> > +     .manage         = bang_bang_manage,
> >   };
> >   THERMAL_GOVERNOR_DECLARE(thermal_gov_bang_bang);
> >
> >
> >
>
Peter Kästle Aug. 14, 2024, 8:55 p.m. UTC | #4
Hi Rafael,

On 14.08.24 19:18, Rafael J. Wysocki wrote:
> On Wed, Aug 14, 2024 at 12:50 AM Peter Kästle <peter@piie.net> wrote:
>>
>> On 13.08.24 16:27, Rafael J. Wysocki wrote:
>>> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>>>
>>> After recent changes, the Bang-bang governor may not adjust the
>>> initial configuration of cooling devices to the actual situation.
>>>
>>> Namely, if a cooling device bound to a certain trip point starts in
>>> the "on" state and the thermal zone temperature is below the threshold
>>> of that trip point, the trip point may never be crossed on the way up
>>> in which case the state of the cooling device will never be adjusted
>>> because the thermal core will never invoke the governor's
>>> .trip_crossed() callback.  [Note that there is no issue if the zone
>>> temperature is at the trip threshold or above it to start with because
>>> .trip_crossed() will be invoked then to indicate the start of thermal
>>> mitigation for the given trip.]
>>>
>>> To address this, add a .manage() callback to the Bang-bang governor
>>> and use it to ensure that all of the thermal instances managed by the
>>> governor have been initialized properly and the states of all of the
>>> cooling devices involved have been adjusted to the current zone
>>> temperature as appropriate.
>>>
>>> Fixes: 530c932bdf75 ("thermal: gov_bang_bang: Use .trip_crossed() instead of .throttle()")
>>> Link: https://lore.kernel.org/linux-pm/1bfbbae5-42b0-4c7d-9544-e98855715294@piie.net/
>>> Cc: 6.10+ <stable@vger.kernel.org> # 6.10+
>>> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>>
>> oops, previously sent from wrong email address, here again from correct one...
>> Acked-by: Peter Kästle <peter@piie.net>
> 
> No worries and thanks for the ACKs!
> 
> I gather that they mean that the changes work for you.

Yes, successfully tested all my use cases and reviewed the code.  Thanks.
Still working on the acerhdf cleanup, but this will be another patch series.

> 
>>> ---
>>>    drivers/thermal/gov_bang_bang.c |   30 ++++++++++++++++++++++++++++++
>>>    1 file changed, 30 insertions(+)
>>>
>>> Index: linux-pm/drivers/thermal/gov_bang_bang.c
>>> ===================================================================
>>> --- linux-pm.orig/drivers/thermal/gov_bang_bang.c
>>> +++ linux-pm/drivers/thermal/gov_bang_bang.c
>>> @@ -26,6 +26,7 @@ static void bang_bang_set_instance_targe
>>>         * when the trip is crossed on the way down.
>>>         */
>>>        instance->target = target;
>>> +     instance->initialized = true;
>>>
>>>        dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target);
>>>
>>> @@ -80,8 +81,37 @@ static void bang_bang_control(struct the
>>>        }
>>>    }
>>>
>>> +static void bang_bang_manage(struct thermal_zone_device *tz)
>>> +{
>>> +     const struct thermal_trip_desc *td;
>>> +     struct thermal_instance *instance;
>>> +
>>> +     for_each_trip_desc(tz, td) {
>>> +             const struct thermal_trip *trip = &td->trip;
>>> +
>>> +             if (tz->temperature >= td->threshold ||
>>> +                 trip->temperature == THERMAL_TEMP_INVALID ||
>>> +                 trip->type == THERMAL_TRIP_CRITICAL ||
>>> +                 trip->type == THERMAL_TRIP_HOT)
>>> +                     continue;
>>> +
>>> +             /*
>>> +              * If the initial cooling device state is "on", but the zone
>>> +              * temperature is not above the trip point, the core will not
>>> +              * call bang_bang_control() until the zone temperature reaches
>>> +              * the trip point temperature which may be never.  In those
>>> +              * cases, set the initial state of the cooling device to 0.
>>> +              */
>>> +             list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
>>> +                     if (!instance->initialized && instance->trip == trip)
>>> +                             bang_bang_set_instance_target(instance, 0);
>>> +             }
>>> +     }
>>> +}
>>> +
>>>    static struct thermal_governor thermal_gov_bang_bang = {
>>>        .name           = "bang_bang",
>>>        .trip_crossed   = bang_bang_control,
>>> +     .manage         = bang_bang_manage,
>>>    };
>>>    THERMAL_GOVERNOR_DECLARE(thermal_gov_bang_bang);
>>>
>>>
>>>
>>
Zhang, Rui Aug. 16, 2024, 3 a.m. UTC | #5
On Tue, 2024-08-13 at 16:27 +0200, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> After recent changes, the Bang-bang governor may not adjust the
> initial configuration of cooling devices to the actual situation.
> 
> Namely, if a cooling device bound to a certain trip point starts in
> the "on" state and the thermal zone temperature is below the
> threshold
> of that trip point, the trip point may never be crossed on the way up
> in which case the state of the cooling device will never be adjusted
> because the thermal core will never invoke the governor's
> .trip_crossed() callback.  [Note that there is no issue if the zone
> temperature is at the trip threshold or above it to start with
> because
> .trip_crossed() will be invoked then to indicate the start of thermal
> mitigation for the given trip.]
> 
> To address this, add a .manage() callback to the Bang-bang governor
> and use it to ensure that all of the thermal instances managed by the
> governor have been initialized properly and the states of all of the
> cooling devices involved have been adjusted to the current zone
> temperature as appropriate.
> 
> Fixes: 530c932bdf75 ("thermal: gov_bang_bang: Use .trip_crossed()
> instead of .throttle()")
> Link:
> https://lore.kernel.org/linux-pm/1bfbbae5-42b0-4c7d-9544-e98855715294@piie.net/
> Cc: 6.10+ <stable@vger.kernel.org> # 6.10+
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Reviewed-by: Zhang Rui <rui.zhang@intel.com>

thanks,
rui

> ---
>  drivers/thermal/gov_bang_bang.c |   30
> ++++++++++++++++++++++++++++++
>  1 file changed, 30 insertions(+)
> 
> Index: linux-pm/drivers/thermal/gov_bang_bang.c
> ===================================================================
> --- linux-pm.orig/drivers/thermal/gov_bang_bang.c
> +++ linux-pm/drivers/thermal/gov_bang_bang.c
> @@ -26,6 +26,7 @@ static void bang_bang_set_instance_targe
>          * when the trip is crossed on the way down.
>          */
>         instance->target = target;
> +       instance->initialized = true;
>  
>         dev_dbg(&instance->cdev->device, "target=%ld\n", instance-
> >target);
>  
> @@ -80,8 +81,37 @@ static void bang_bang_control(struct the
>         }
>  }
>  
> +static void bang_bang_manage(struct thermal_zone_device *tz)
> +{
> +       const struct thermal_trip_desc *td;
> +       struct thermal_instance *instance;
> +
> +       for_each_trip_desc(tz, td) {
> +               const struct thermal_trip *trip = &td->trip;
> +
> +               if (tz->temperature >= td->threshold ||
> +                   trip->temperature == THERMAL_TEMP_INVALID ||
> +                   trip->type == THERMAL_TRIP_CRITICAL ||
> +                   trip->type == THERMAL_TRIP_HOT)
> +                       continue;
> +
> +               /*
> +                * If the initial cooling device state is "on", but
> the zone
> +                * temperature is not above the trip point, the core
> will not
> +                * call bang_bang_control() until the zone
> temperature reaches
> +                * the trip point temperature which may be never.  In
> those
> +                * cases, set the initial state of the cooling device
> to 0.
> +                */
> +               list_for_each_entry(instance, &tz->thermal_instances,
> tz_node) {
> +                       if (!instance->initialized && instance->trip
> == trip)
> +                               bang_bang_set_instance_target(instanc
> e, 0);
> +               }
> +       }
> +}
> +
>  static struct thermal_governor thermal_gov_bang_bang = {
>         .name           = "bang_bang",
>         .trip_crossed   = bang_bang_control,
> +       .manage         = bang_bang_manage,
>  };
>  THERMAL_GOVERNOR_DECLARE(thermal_gov_bang_bang);
> 
> 
>
diff mbox series

Patch

Index: linux-pm/drivers/thermal/gov_bang_bang.c
===================================================================
--- linux-pm.orig/drivers/thermal/gov_bang_bang.c
+++ linux-pm/drivers/thermal/gov_bang_bang.c
@@ -26,6 +26,7 @@  static void bang_bang_set_instance_targe
 	 * when the trip is crossed on the way down.
 	 */
 	instance->target = target;
+	instance->initialized = true;
 
 	dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target);
 
@@ -80,8 +81,37 @@  static void bang_bang_control(struct the
 	}
 }
 
+static void bang_bang_manage(struct thermal_zone_device *tz)
+{
+	const struct thermal_trip_desc *td;
+	struct thermal_instance *instance;
+
+	for_each_trip_desc(tz, td) {
+		const struct thermal_trip *trip = &td->trip;
+
+		if (tz->temperature >= td->threshold ||
+		    trip->temperature == THERMAL_TEMP_INVALID ||
+		    trip->type == THERMAL_TRIP_CRITICAL ||
+		    trip->type == THERMAL_TRIP_HOT)
+			continue;
+
+		/*
+		 * If the initial cooling device state is "on", but the zone
+		 * temperature is not above the trip point, the core will not
+		 * call bang_bang_control() until the zone temperature reaches
+		 * the trip point temperature which may be never.  In those
+		 * cases, set the initial state of the cooling device to 0.
+		 */
+		list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+			if (!instance->initialized && instance->trip == trip)
+				bang_bang_set_instance_target(instance, 0);
+		}
+	}
+}
+
 static struct thermal_governor thermal_gov_bang_bang = {
 	.name		= "bang_bang",
 	.trip_crossed	= bang_bang_control,
+	.manage		= bang_bang_manage,
 };
 THERMAL_GOVERNOR_DECLARE(thermal_gov_bang_bang);