Message ID | 20231212142730.998913-5-vincent.guittot@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | Rework system pressure interface to the scheduler | expand |
On 12/12/23 14:27, Vincent Guittot wrote: > Now that cpufreq provides a pressure value to the scheduler, rename > arch_update_thermal_pressure into hw pressure to reflect that it returns > a pressure applied by HW with a high frequency and which needs filtering. I would elaborte this meaning 'filtering' here. Something like: '... high frequency and which needs filtering to smooth the singal and get an average value. That reflects available capacity of the CPU in longer period' > This pressure is not always related to thermal mitigation but can also be > generated by max current limitation as an example. > > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org> > --- > arch/arm/include/asm/topology.h | 6 ++--- > arch/arm64/include/asm/topology.h | 6 ++--- > drivers/base/arch_topology.c | 26 +++++++++---------- > drivers/cpufreq/qcom-cpufreq-hw.c | 4 +-- > include/linux/arch_topology.h | 8 +++--- > include/linux/sched/topology.h | 8 +++--- > .../{thermal_pressure.h => hw_pressure.h} | 14 +++++----- > include/trace/events/sched.h | 2 +- > init/Kconfig | 12 ++++----- > kernel/sched/core.c | 8 +++--- > kernel/sched/fair.c | 12 ++++----- > kernel/sched/pelt.c | 18 ++++++------- > kernel/sched/pelt.h | 16 ++++++------ > kernel/sched/sched.h | 4 +-- > 14 files changed, 72 insertions(+), 72 deletions(-) > rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%) > > diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h > index 853c4f81ba4a..e175e8596b5d 100644 > --- a/arch/arm/include/asm/topology.h > +++ b/arch/arm/include/asm/topology.h > @@ -22,9 +22,9 @@ > /* Enable topology flag updates */ > #define arch_update_cpu_topology topology_update_cpu_topology > > -/* Replace task scheduler's default thermal pressure API */ > -#define arch_scale_thermal_pressure topology_get_thermal_pressure > -#define arch_update_thermal_pressure topology_update_thermal_pressure > +/* Replace task scheduler's default hw pressure API */ > +#define arch_scale_hw_pressure topology_get_hw_pressure > +#define arch_update_hw_pressure topology_update_hw_pressure > > #else > > diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h > index a323b109b9c4..a427650bdfba 100644 > --- a/arch/arm64/include/asm/topology.h > +++ b/arch/arm64/include/asm/topology.h > @@ -35,9 +35,9 @@ void update_freq_counters_refs(void); > /* Enable topology flag updates */ > #define arch_update_cpu_topology topology_update_cpu_topology > > -/* Replace task scheduler's default thermal pressure API */ > -#define arch_scale_thermal_pressure topology_get_thermal_pressure > -#define arch_update_thermal_pressure topology_update_thermal_pressure > +/* Replace task scheduler's default hw pressure API */ s/hw/HW/ ? > +#define arch_scale_hw_pressure topology_get_hw_pressure > +#define arch_update_hw_pressure topology_update_hw_pressure > > #include <asm-generic/topology.h> > > diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c > index 0906114963ff..3d8dc9d5c3ad 100644 > --- a/drivers/base/arch_topology.c > +++ b/drivers/base/arch_topology.c > @@ -22,7 +22,7 @@ > #include <linux/units.h> > > #define CREATE_TRACE_POINTS > -#include <trace/events/thermal_pressure.h> > +#include <trace/events/hw_pressure.h> > > static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); > static struct cpumask scale_freq_counters_mask; > @@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) > per_cpu(cpu_scale, cpu) = capacity; > } > > -DEFINE_PER_CPU(unsigned long, thermal_pressure); > +DEFINE_PER_CPU(unsigned long, hw_pressure); > > /** > - * topology_update_thermal_pressure() - Update thermal pressure for CPUs > + * topology_update_hw_pressure() - Update hw pressure for CPUs same here: HW? > * @cpus : The related CPUs for which capacity has been reduced > * @capped_freq : The maximum allowed frequency that CPUs can run at > * > - * Update the value of thermal pressure for all @cpus in the mask. The > + * Update the value of hw pressure for all @cpus in the mask. The HW? > * cpumask should include all (online+offline) affected CPUs, to avoid > * operating on stale data when hot-plug is used for some CPUs. The > * @capped_freq reflects the currently allowed max CPUs frequency due to > - * thermal capping. It might be also a boost frequency value, which is bigger > + * hw capping. It might be also a boost frequency value, which is bigger HW? > * than the internal 'capacity_freq_ref' max frequency. In such case the > * pressure value should simply be removed, since this is an indication that > - * there is no thermal throttling. The @capped_freq must be provided in kHz. > + * there is no hw throttling. The @capped_freq must be provided in kHz. HW? > */ > -void topology_update_thermal_pressure(const struct cpumask *cpus, > +void topology_update_hw_pressure(const struct cpumask *cpus, > unsigned long capped_freq) > { > - unsigned long max_capacity, capacity, th_pressure; > + unsigned long max_capacity, capacity, hw_pressure; > u32 max_freq; > int cpu; > > @@ -189,21 +189,21 @@ void topology_update_thermal_pressure(const struct cpumask *cpus, > > /* > * Handle properly the boost frequencies, which should simply clean > - * the thermal pressure value. > + * the hw pressure value. HW? > */ > if (max_freq <= capped_freq) > capacity = max_capacity; > else > capacity = mult_frac(max_capacity, capped_freq, max_freq); > > - th_pressure = max_capacity - capacity; > + hw_pressure = max_capacity - capacity; > > - trace_thermal_pressure_update(cpu, th_pressure); > + trace_hw_pressure_update(cpu, hw_pressure); > > for_each_cpu(cpu, cpus) > - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); > + WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure); > } > -EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); > +EXPORT_SYMBOL_GPL(topology_update_hw_pressure); > > static ssize_t cpu_capacity_show(struct device *dev, > struct device_attribute *attr, > diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c > index 70b0f21968a0..a619202ba81d 100644 > --- a/drivers/cpufreq/qcom-cpufreq-hw.c > +++ b/drivers/cpufreq/qcom-cpufreq-hw.c > @@ -347,8 +347,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data) > > throttled_freq = freq_hz / HZ_PER_KHZ; > > - /* Update thermal pressure (the boost frequencies are accepted) */ > - arch_update_thermal_pressure(policy->related_cpus, throttled_freq); > + /* Update hw pressure (the boost frequencies are accepted) */ HW? > + arch_update_hw_pressure(policy->related_cpus, throttled_freq); > [snip] > > if (housekeeping_cpu(cpu, HK_TYPE_TICK)) > @@ -5669,8 +5669,8 @@ void scheduler_tick(void) > rq_lock(rq, &rf); > > update_rq_clock(rq); > - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); > - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); > + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); > + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); We switch to task clock here, could you tell me why? Don't we have to maintain the boot command parameter for the shift? > curr->sched_class->task_tick(rq, curr, 0); > if (sched_feat(LATENCY_WARN)) > resched_latency = cpu_resched_latency(rq); > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 11d3be829302..07050955d6e0 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -4936,7 +4936,7 @@ static inline unsigned long get_actual_cpu_capacity(int cpu) > { > unsigned long capacity = arch_scale_cpu_capacity(cpu); > > - capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); > + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); > > return capacity; > } > @@ -4968,7 +4968,7 @@ static inline int util_fits_cpu(unsigned long util, > * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it > * should fit a little cpu even if there's some pressure. > * > - * Only exception is for thermal pressure since it has a direct impact > + * Only exception is for hw or cpufreq pressure since it has a direct impact HW? > * on available OPP of the system. > * > * We honour it for uclamp_min only as a drop in performance level > @@ -9224,7 +9224,7 @@ static inline bool others_have_blocked(struct rq *rq) > if (READ_ONCE(rq->avg_dl.util_avg)) > return true; > > - if (thermal_load_avg(rq)) > + if (hw_load_avg(rq)) > return true; > > #ifdef CONFIG_HAVE_SCHED_AVG_IRQ > @@ -9256,7 +9256,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) > { > const struct sched_class *curr_class; > u64 now = rq_clock_pelt(rq); > - unsigned long thermal_pressure; > + unsigned long hw_pressure; > bool decayed; > > /* > @@ -9265,11 +9265,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done) > */ > curr_class = rq->curr->sched_class; > > - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); > + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); > > decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | > update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | > - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | > + update_hw_load_avg(now, rq, hw_pressure) | Here also the rq_clock_thermal() is not used anymore. Shouldn't we remove the rq_clock_thermal() if not used anymore (I cannot se this in the patch)? > update_irq_load_avg(rq, 0); > > if (others_have_blocked(rq)) [snip] > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index e58a54bda77d..7eaf186d071e 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -1078,8 +1078,8 @@ struct rq { > #ifdef CONFIG_HAVE_SCHED_AVG_IRQ > struct sched_avg avg_irq; > #endif > -#ifdef CONFIG_SCHED_THERMAL_PRESSURE > - struct sched_avg avg_thermal; > +#ifdef CONFIG_SCHED_HW_PRESSURE > + struct sched_avg avg_hw; > #endif > u64 idle_stamp; > u64 avg_idle; I don't see that rq_clock_thermal() removal in this header. Maybe I miss some patch? BTW, I like the name 'HW pressure' for this information/signal.
On Thu, 14 Dec 2023 at 09:30, Lukasz Luba <lukasz.luba@arm.com> wrote: > > > On 12/12/23 14:27, Vincent Guittot wrote: > > Now that cpufreq provides a pressure value to the scheduler, rename > > arch_update_thermal_pressure into hw pressure to reflect that it returns > > a pressure applied by HW with a high frequency and which needs filtering. > > I would elaborte this meaning 'filtering' here. Something like: > '... high frequency and which needs filtering to smooth the singal and > get an average value. That reflects available capacity of the CPU in > longer period' Ok I will update the commit message to provide more details > > > This pressure is not always related to thermal mitigation but can also be > > generated by max current limitation as an example. > > > > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org> > > --- > > arch/arm/include/asm/topology.h | 6 ++--- > > arch/arm64/include/asm/topology.h | 6 ++--- > > drivers/base/arch_topology.c | 26 +++++++++---------- > > drivers/cpufreq/qcom-cpufreq-hw.c | 4 +-- > > include/linux/arch_topology.h | 8 +++--- > > include/linux/sched/topology.h | 8 +++--- > > .../{thermal_pressure.h => hw_pressure.h} | 14 +++++----- > > include/trace/events/sched.h | 2 +- > > init/Kconfig | 12 ++++----- > > kernel/sched/core.c | 8 +++--- > > kernel/sched/fair.c | 12 ++++----- > > kernel/sched/pelt.c | 18 ++++++------- > > kernel/sched/pelt.h | 16 ++++++------ > > kernel/sched/sched.h | 4 +-- > > 14 files changed, 72 insertions(+), 72 deletions(-) > > rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%) > > > > diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h > > index 853c4f81ba4a..e175e8596b5d 100644 > > --- a/arch/arm/include/asm/topology.h > > +++ b/arch/arm/include/asm/topology.h > > @@ -22,9 +22,9 @@ > > /* Enable topology flag updates */ > > #define arch_update_cpu_topology topology_update_cpu_topology > > > > -/* Replace task scheduler's default thermal pressure API */ > > -#define arch_scale_thermal_pressure topology_get_thermal_pressure > > -#define arch_update_thermal_pressure topology_update_thermal_pressure > > +/* Replace task scheduler's default hw pressure API */ > > +#define arch_scale_hw_pressure topology_get_hw_pressure > > +#define arch_update_hw_pressure topology_update_hw_pressure > > > > #else > > > > diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h > > index a323b109b9c4..a427650bdfba 100644 > > --- a/arch/arm64/include/asm/topology.h > > +++ b/arch/arm64/include/asm/topology.h > > @@ -35,9 +35,9 @@ void update_freq_counters_refs(void); > > /* Enable topology flag updates */ > > #define arch_update_cpu_topology topology_update_cpu_topology > > > > -/* Replace task scheduler's default thermal pressure API */ > > -#define arch_scale_thermal_pressure topology_get_thermal_pressure > > -#define arch_update_thermal_pressure topology_update_thermal_pressure > > +/* Replace task scheduler's default hw pressure API */ > > s/hw/HW/ ? > > > +#define arch_scale_hw_pressure topology_get_hw_pressure > > +#define arch_update_hw_pressure topology_update_hw_pressure > > > > #include <asm-generic/topology.h> > > > > diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c > > index 0906114963ff..3d8dc9d5c3ad 100644 > > --- a/drivers/base/arch_topology.c > > +++ b/drivers/base/arch_topology.c > > @@ -22,7 +22,7 @@ > > #include <linux/units.h> > > > > #define CREATE_TRACE_POINTS > > -#include <trace/events/thermal_pressure.h> > > +#include <trace/events/hw_pressure.h> > > > > static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); > > static struct cpumask scale_freq_counters_mask; > > @@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) > > per_cpu(cpu_scale, cpu) = capacity; > > } > > > > -DEFINE_PER_CPU(unsigned long, thermal_pressure); > > +DEFINE_PER_CPU(unsigned long, hw_pressure); > > > > /** > > - * topology_update_thermal_pressure() - Update thermal pressure for CPUs > > + * topology_update_hw_pressure() - Update hw pressure for CPUs > > same here: HW? > > > * @cpus : The related CPUs for which capacity has been reduced > > * @capped_freq : The maximum allowed frequency that CPUs can run at > > * > > - * Update the value of thermal pressure for all @cpus in the mask. The > > + * Update the value of hw pressure for all @cpus in the mask. The > > HW? > > > * cpumask should include all (online+offline) affected CPUs, to avoid > > * operating on stale data when hot-plug is used for some CPUs. The > > * @capped_freq reflects the currently allowed max CPUs frequency due to > > - * thermal capping. It might be also a boost frequency value, which is bigger > > + * hw capping. It might be also a boost frequency value, which is bigger > > HW? > > > * than the internal 'capacity_freq_ref' max frequency. In such case the > > * pressure value should simply be removed, since this is an indication that > > - * there is no thermal throttling. The @capped_freq must be provided in kHz. > > + * there is no hw throttling. The @capped_freq must be provided in kHz. > > HW? > > > */ > > -void topology_update_thermal_pressure(const struct cpumask *cpus, > > +void topology_update_hw_pressure(const struct cpumask *cpus, > > unsigned long capped_freq) > > { > > - unsigned long max_capacity, capacity, th_pressure; > > + unsigned long max_capacity, capacity, hw_pressure; > > u32 max_freq; > > int cpu; > > > > @@ -189,21 +189,21 @@ void topology_update_thermal_pressure(const struct cpumask *cpus, > > > > /* > > * Handle properly the boost frequencies, which should simply clean > > - * the thermal pressure value. > > + * the hw pressure value. > > HW? > > > */ > > if (max_freq <= capped_freq) > > capacity = max_capacity; > > else > > capacity = mult_frac(max_capacity, capped_freq, max_freq); > > > > - th_pressure = max_capacity - capacity; > > + hw_pressure = max_capacity - capacity; > > > > - trace_thermal_pressure_update(cpu, th_pressure); > > + trace_hw_pressure_update(cpu, hw_pressure); > > > > for_each_cpu(cpu, cpus) > > - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); > > + WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure); > > } > > -EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); > > +EXPORT_SYMBOL_GPL(topology_update_hw_pressure); > > > > static ssize_t cpu_capacity_show(struct device *dev, > > struct device_attribute *attr, > > diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c > > index 70b0f21968a0..a619202ba81d 100644 > > --- a/drivers/cpufreq/qcom-cpufreq-hw.c > > +++ b/drivers/cpufreq/qcom-cpufreq-hw.c > > @@ -347,8 +347,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data) > > > > throttled_freq = freq_hz / HZ_PER_KHZ; > > > > - /* Update thermal pressure (the boost frequencies are accepted) */ > > - arch_update_thermal_pressure(policy->related_cpus, throttled_freq); > > + /* Update hw pressure (the boost frequencies are accepted) */ > > HW? > > > + arch_update_hw_pressure(policy->related_cpus, throttled_freq); > > > > [snip] > > > > > if (housekeeping_cpu(cpu, HK_TYPE_TICK)) > > @@ -5669,8 +5669,8 @@ void scheduler_tick(void) > > rq_lock(rq, &rf); > > > > update_rq_clock(rq); > > - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); > > - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); > > + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); > > + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); > > We switch to task clock here, could you tell me why? > Don't we have to maintain the boot command parameter for the shift? This should have been part of the patch5 that I finally removed. IMO, the additional time shift with rq_clock_thermal is no more needed now that we have 2 separates signals > > > curr->sched_class->task_tick(rq, curr, 0); > > if (sched_feat(LATENCY_WARN)) > > resched_latency = cpu_resched_latency(rq); > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > > index 11d3be829302..07050955d6e0 100644 > > --- a/kernel/sched/fair.c > > +++ b/kernel/sched/fair.c > > @@ -4936,7 +4936,7 @@ static inline unsigned long get_actual_cpu_capacity(int cpu) > > { > > unsigned long capacity = arch_scale_cpu_capacity(cpu); > > > > - capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); > > + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); > > > > return capacity; > > } > > @@ -4968,7 +4968,7 @@ static inline int util_fits_cpu(unsigned long util, > > * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it > > * should fit a little cpu even if there's some pressure. > > * > > - * Only exception is for thermal pressure since it has a direct impact > > + * Only exception is for hw or cpufreq pressure since it has a direct impact > > HW? > > > * on available OPP of the system. > > * > > * We honour it for uclamp_min only as a drop in performance level > > @@ -9224,7 +9224,7 @@ static inline bool others_have_blocked(struct rq *rq) > > if (READ_ONCE(rq->avg_dl.util_avg)) > > return true; > > > > - if (thermal_load_avg(rq)) > > + if (hw_load_avg(rq)) > > return true; > > > > #ifdef CONFIG_HAVE_SCHED_AVG_IRQ > > @@ -9256,7 +9256,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) > > { > > const struct sched_class *curr_class; > > u64 now = rq_clock_pelt(rq); > > - unsigned long thermal_pressure; > > + unsigned long hw_pressure; > > bool decayed; > > > > /* > > @@ -9265,11 +9265,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done) > > */ > > curr_class = rq->curr->sched_class; > > > > - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); > > + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); > > > > decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | > > update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | > > - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | > > + update_hw_load_avg(now, rq, hw_pressure) | > > Here also the rq_clock_thermal() is not used anymore. Shouldn't we > remove the rq_clock_thermal() if not used anymore (I cannot se this in > the patch)? > > > update_irq_load_avg(rq, 0); > > > > if (others_have_blocked(rq)) > > [snip] > > > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > > index e58a54bda77d..7eaf186d071e 100644 > > --- a/kernel/sched/sched.h > > +++ b/kernel/sched/sched.h > > @@ -1078,8 +1078,8 @@ struct rq { > > #ifdef CONFIG_HAVE_SCHED_AVG_IRQ > > struct sched_avg avg_irq; > > #endif > > -#ifdef CONFIG_SCHED_THERMAL_PRESSURE > > - struct sched_avg avg_thermal; > > +#ifdef CONFIG_SCHED_HW_PRESSURE > > + struct sched_avg avg_hw; > > #endif > > u64 idle_stamp; > > u64 avg_idle; > > I don't see that rq_clock_thermal() removal in this header. > Maybe I miss some patch? > > BTW, I like the name 'HW pressure' for this information/signal.
On 12/14/23 08:36, Vincent Guittot wrote: > On Thu, 14 Dec 2023 at 09:30, Lukasz Luba <lukasz.luba@arm.com> wrote: >> >> >> On 12/12/23 14:27, Vincent Guittot wrote: [snip] >>> update_rq_clock(rq); >>> - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); >>> - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); >>> + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); >>> + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); >> >> We switch to task clock here, could you tell me why? >> Don't we have to maintain the boot command parameter for the shift? > > This should have been part of the patch5 that I finally removed. IMO, > the additional time shift with rq_clock_thermal is no more needed now > that we have 2 separates signals > I didn't like the left-shift which causes the signal to converge slowly. I rather wanted right-shift to converge (react faster), so you have my vote for this change. Also, I agree that with the two-signal approach this shift trick can go away now. I just worry about the dropped boot parameter. So, are going to send that patach5 which removes the 'sched_thermal_decay_shift' and documentation bit?
On Thu, 14 Dec 2023 at 09:53, Lukasz Luba <lukasz.luba@arm.com> wrote: > > > > On 12/14/23 08:36, Vincent Guittot wrote: > > On Thu, 14 Dec 2023 at 09:30, Lukasz Luba <lukasz.luba@arm.com> wrote: > >> > >> > >> On 12/12/23 14:27, Vincent Guittot wrote: > > [snip] > > >>> update_rq_clock(rq); > >>> - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); > >>> - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); > >>> + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); > >>> + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); > >> > >> We switch to task clock here, could you tell me why? > >> Don't we have to maintain the boot command parameter for the shift? > > > > This should have been part of the patch5 that I finally removed. IMO, > > the additional time shift with rq_clock_thermal is no more needed now > > that we have 2 separates signals > > > > I didn't like the left-shift which causes the signal to converge slowly. > I rather wanted right-shift to converge (react faster), so you have my > vote for this change. Also, I agree that with the two-signal approach > this shift trick can go away now. I just worry about the dropped boot > parameter. > > So, are going to send that patach5 which removes the > 'sched_thermal_decay_shift' and documentation bit? Yes, i will add it back for the next version
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 853c4f81ba4a..e175e8596b5d 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -22,9 +22,9 @@ /* Enable topology flag updates */ #define arch_update_cpu_topology topology_update_cpu_topology -/* Replace task scheduler's default thermal pressure API */ -#define arch_scale_thermal_pressure topology_get_thermal_pressure -#define arch_update_thermal_pressure topology_update_thermal_pressure +/* Replace task scheduler's default hw pressure API */ +#define arch_scale_hw_pressure topology_get_hw_pressure +#define arch_update_hw_pressure topology_update_hw_pressure #else diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index a323b109b9c4..a427650bdfba 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -35,9 +35,9 @@ void update_freq_counters_refs(void); /* Enable topology flag updates */ #define arch_update_cpu_topology topology_update_cpu_topology -/* Replace task scheduler's default thermal pressure API */ -#define arch_scale_thermal_pressure topology_get_thermal_pressure -#define arch_update_thermal_pressure topology_update_thermal_pressure +/* Replace task scheduler's default hw pressure API */ +#define arch_scale_hw_pressure topology_get_hw_pressure +#define arch_update_hw_pressure topology_update_hw_pressure #include <asm-generic/topology.h> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 0906114963ff..3d8dc9d5c3ad 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -22,7 +22,7 @@ #include <linux/units.h> #define CREATE_TRACE_POINTS -#include <trace/events/thermal_pressure.h> +#include <trace/events/hw_pressure.h> static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; @@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) per_cpu(cpu_scale, cpu) = capacity; } -DEFINE_PER_CPU(unsigned long, thermal_pressure); +DEFINE_PER_CPU(unsigned long, hw_pressure); /** - * topology_update_thermal_pressure() - Update thermal pressure for CPUs + * topology_update_hw_pressure() - Update hw pressure for CPUs * @cpus : The related CPUs for which capacity has been reduced * @capped_freq : The maximum allowed frequency that CPUs can run at * - * Update the value of thermal pressure for all @cpus in the mask. The + * Update the value of hw pressure for all @cpus in the mask. The * cpumask should include all (online+offline) affected CPUs, to avoid * operating on stale data when hot-plug is used for some CPUs. The * @capped_freq reflects the currently allowed max CPUs frequency due to - * thermal capping. It might be also a boost frequency value, which is bigger + * hw capping. It might be also a boost frequency value, which is bigger * than the internal 'capacity_freq_ref' max frequency. In such case the * pressure value should simply be removed, since this is an indication that - * there is no thermal throttling. The @capped_freq must be provided in kHz. + * there is no hw throttling. The @capped_freq must be provided in kHz. */ -void topology_update_thermal_pressure(const struct cpumask *cpus, +void topology_update_hw_pressure(const struct cpumask *cpus, unsigned long capped_freq) { - unsigned long max_capacity, capacity, th_pressure; + unsigned long max_capacity, capacity, hw_pressure; u32 max_freq; int cpu; @@ -189,21 +189,21 @@ void topology_update_thermal_pressure(const struct cpumask *cpus, /* * Handle properly the boost frequencies, which should simply clean - * the thermal pressure value. + * the hw pressure value. */ if (max_freq <= capped_freq) capacity = max_capacity; else capacity = mult_frac(max_capacity, capped_freq, max_freq); - th_pressure = max_capacity - capacity; + hw_pressure = max_capacity - capacity; - trace_thermal_pressure_update(cpu, th_pressure); + trace_hw_pressure_update(cpu, hw_pressure); for_each_cpu(cpu, cpus) - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); + WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure); } -EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); +EXPORT_SYMBOL_GPL(topology_update_hw_pressure); static ssize_t cpu_capacity_show(struct device *dev, struct device_attribute *attr, diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 70b0f21968a0..a619202ba81d 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -347,8 +347,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data) throttled_freq = freq_hz / HZ_PER_KHZ; - /* Update thermal pressure (the boost frequencies are accepted) */ - arch_update_thermal_pressure(policy->related_cpus, throttled_freq); + /* Update hw pressure (the boost frequencies are accepted) */ + arch_update_hw_pressure(policy->related_cpus, throttled_freq); /* * In the unlikely case policy is unregistered do not enable diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index a63d61ca55af..b721f360d759 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -60,14 +60,14 @@ void topology_scale_freq_tick(void); void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus); void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus); -DECLARE_PER_CPU(unsigned long, thermal_pressure); +DECLARE_PER_CPU(unsigned long, hw_pressure); -static inline unsigned long topology_get_thermal_pressure(int cpu) +static inline unsigned long topology_get_hw_pressure(int cpu) { - return per_cpu(thermal_pressure, cpu); + return per_cpu(hw_pressure, cpu); } -void topology_update_thermal_pressure(const struct cpumask *cpus, +void topology_update_hw_pressure(const struct cpumask *cpus, unsigned long capped_freq); struct cpu_topology { diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a6e04b4a21d7..e3b2cf7de018 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -264,17 +264,17 @@ unsigned long arch_scale_cpu_capacity(int cpu) } #endif -#ifndef arch_scale_thermal_pressure +#ifndef arch_scale_hw_pressure static __always_inline -unsigned long arch_scale_thermal_pressure(int cpu) +unsigned long arch_scale_hw_pressure(int cpu) { return 0; } #endif -#ifndef arch_update_thermal_pressure +#ifndef arch_update_hw_pressure static __always_inline -void arch_update_thermal_pressure(const struct cpumask *cpus, +void arch_update_hw_pressure(const struct cpumask *cpus, unsigned long capped_frequency) { } #endif diff --git a/include/trace/events/thermal_pressure.h b/include/trace/events/hw_pressure.h similarity index 55% rename from include/trace/events/thermal_pressure.h rename to include/trace/events/hw_pressure.h index b68680201360..b9cd68854128 100644 --- a/include/trace/events/thermal_pressure.h +++ b/include/trace/events/hw_pressure.h @@ -1,27 +1,27 @@ /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM -#define TRACE_SYSTEM thermal_pressure +#define TRACE_SYSTEM hw_pressure #if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_THERMAL_PRESSURE_H #include <linux/tracepoint.h> -TRACE_EVENT(thermal_pressure_update, - TP_PROTO(int cpu, unsigned long thermal_pressure), - TP_ARGS(cpu, thermal_pressure), +TRACE_EVENT(hw_pressure_update, + TP_PROTO(int cpu, unsigned long hw_pressure), + TP_ARGS(cpu, hw_pressure), TP_STRUCT__entry( - __field(unsigned long, thermal_pressure) + __field(unsigned long, hw_pressure) __field(int, cpu) ), TP_fast_assign( - __entry->thermal_pressure = thermal_pressure; + __entry->hw_pressure = hw_pressure; __entry->cpu = cpu; ), - TP_printk("cpu=%d thermal_pressure=%lu", __entry->cpu, __entry->thermal_pressure) + TP_printk("cpu=%d hw_pressure=%lu", __entry->cpu, __entry->hw_pressure) ); #endif /* _TRACE_THERMAL_PRESSURE_H */ diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index dbb01b4b7451..d115d64c4011 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -752,7 +752,7 @@ DECLARE_TRACE(pelt_dl_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); -DECLARE_TRACE(pelt_thermal_tp, +DECLARE_TRACE(pelt_hw_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); diff --git a/init/Kconfig b/init/Kconfig index 9ffb103fc927..37ceeb67e01c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -538,24 +538,24 @@ config HAVE_SCHED_AVG_IRQ depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING depends on SMP -config SCHED_THERMAL_PRESSURE +config SCHED_HW_PRESSURE bool default y if ARM && ARM_CPU_TOPOLOGY default y if ARM64 depends on SMP depends on CPU_FREQ_THERMAL help - Select this option to enable thermal pressure accounting in the - scheduler. Thermal pressure is the value conveyed to the scheduler + Select this option to enable HW pressure accounting in the + scheduler. HW pressure is the value conveyed to the scheduler that reflects the reduction in CPU compute capacity resulted from - thermal throttling. Thermal throttling occurs when the performance of - a CPU is capped due to high operating temperatures. + HW throttling. HW throttling occurs when the performance of + a CPU is capped due to high operating temperatures as an example. If selected, the scheduler will be able to balance tasks accordingly, i.e. put less load on throttled CPUs than on non/less throttled ones. This requires the architecture to implement - arch_update_thermal_pressure() and arch_scale_thermal_pressure(). + arch_update_hw_pressure() and arch_scale_thermal_pressure(). config BSD_PROCESS_ACCT bool "BSD Process Accounting" diff --git a/kernel/sched/core.c b/kernel/sched/core.c index db4be4921e7f..c68e47bfd5ae 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -107,7 +107,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); -EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); @@ -5658,7 +5658,7 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; - unsigned long thermal_pressure; + unsigned long hw_pressure; u64 resched_latency; if (housekeeping_cpu(cpu, HK_TYPE_TICK)) @@ -5669,8 +5669,8 @@ void scheduler_tick(void) rq_lock(rq, &rf); update_rq_clock(rq); - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); curr->sched_class->task_tick(rq, curr, 0); if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11d3be829302..07050955d6e0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4936,7 +4936,7 @@ static inline unsigned long get_actual_cpu_capacity(int cpu) { unsigned long capacity = arch_scale_cpu_capacity(cpu); - capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); return capacity; } @@ -4968,7 +4968,7 @@ static inline int util_fits_cpu(unsigned long util, * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it * should fit a little cpu even if there's some pressure. * - * Only exception is for thermal pressure since it has a direct impact + * Only exception is for hw or cpufreq pressure since it has a direct impact * on available OPP of the system. * * We honour it for uclamp_min only as a drop in performance level @@ -9224,7 +9224,7 @@ static inline bool others_have_blocked(struct rq *rq) if (READ_ONCE(rq->avg_dl.util_avg)) return true; - if (thermal_load_avg(rq)) + if (hw_load_avg(rq)) return true; #ifdef CONFIG_HAVE_SCHED_AVG_IRQ @@ -9256,7 +9256,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) { const struct sched_class *curr_class; u64 now = rq_clock_pelt(rq); - unsigned long thermal_pressure; + unsigned long hw_pressure; bool decayed; /* @@ -9265,11 +9265,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done) */ curr_class = rq->curr->sched_class; - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | + update_hw_load_avg(now, rq, hw_pressure) | update_irq_load_avg(rq, 0); if (others_have_blocked(rq)) diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 63b6cf898220..cd599a5185c4 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } -#ifdef CONFIG_SCHED_THERMAL_PRESSURE +#ifdef CONFIG_SCHED_HW_PRESSURE /* - * thermal: + * hardware: * * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked * * util_avg and runnable_load_avg are not supported and meaningless. * * Unlike rt/dl utilization tracking that track time spent by a cpu - * running a rt/dl task through util_avg, the average thermal pressure is - * tracked through load_avg. This is because thermal pressure signal is + * running a rt/dl task through util_avg, the average hw pressure is + * tracked through load_avg. This is because hw pressure signal is * time weighted "delta" capacity unlike util_avg which is binary. * "delta capacity" = actual capacity - - * capped capacity a cpu due to a thermal event. + * capped capacity a cpu due to a hw event. */ -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { - if (___update_load_sum(now, &rq->avg_thermal, + if (___update_load_sum(now, &rq->avg_hw, capacity, capacity, capacity)) { - ___update_load_avg(&rq->avg_thermal, 1); - trace_pelt_thermal_tp(rq); + ___update_load_avg(&rq->avg_hw, 1); + trace_pelt_hw_tp(rq); return 1; } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 3a0e0dc28721..e3ab44c4a2ef 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -7,21 +7,21 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -#ifdef CONFIG_SCHED_THERMAL_PRESSURE -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); +#ifdef CONFIG_SCHED_HW_PRESSURE +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity); -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { - return READ_ONCE(rq->avg_thermal.load_avg); + return READ_ONCE(rq->avg_hw.load_avg); } #else static inline int -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { return 0; } -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { return 0; } @@ -202,12 +202,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) } static inline int -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { return 0; } -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e58a54bda77d..7eaf186d071e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1078,8 +1078,8 @@ struct rq { #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif -#ifdef CONFIG_SCHED_THERMAL_PRESSURE - struct sched_avg avg_thermal; +#ifdef CONFIG_SCHED_HW_PRESSURE + struct sched_avg avg_hw; #endif u64 idle_stamp; u64 avg_idle;
Now that cpufreq provides a pressure value to the scheduler, rename arch_update_thermal_pressure into hw pressure to reflect that it returns a pressure applied by HW with a high frequency and which needs filtering. This pressure is not always related to thermal mitigation but can also be generated by max current limitation as an example. Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org> --- arch/arm/include/asm/topology.h | 6 ++--- arch/arm64/include/asm/topology.h | 6 ++--- drivers/base/arch_topology.c | 26 +++++++++---------- drivers/cpufreq/qcom-cpufreq-hw.c | 4 +-- include/linux/arch_topology.h | 8 +++--- include/linux/sched/topology.h | 8 +++--- .../{thermal_pressure.h => hw_pressure.h} | 14 +++++----- include/trace/events/sched.h | 2 +- init/Kconfig | 12 ++++----- kernel/sched/core.c | 8 +++--- kernel/sched/fair.c | 12 ++++----- kernel/sched/pelt.c | 18 ++++++------- kernel/sched/pelt.h | 16 ++++++------ kernel/sched/sched.h | 4 +-- 14 files changed, 72 insertions(+), 72 deletions(-) rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)