diff mbox series

[v2,2/7] cpufreq/sched: Move cpufreq-specific EAS checks to cpufreq

Message ID 2317800.iZASKD2KPV@rjwysocki.net
State New
Headers show
Series [v2,1/7] cpufreq/sched: schedutil: Add helper for governor checks | expand

Commit Message

Rafael J. Wysocki May 6, 2025, 8:37 p.m. UTC
From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Doing cpufreq-specific EAS checks that require accessing policy
internals directly from sched_is_eas_possible() is a bit unfortunate,
so introduce cpufreq_ready_for_eas() in cpufreq, move those checks
into that new function and make sched_is_eas_possible() call it.

While at it, address a possible race between the EAS governor check
and governor change by doing the former under the policy rwsem.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Tested-by: Christian Loehle <christian.loehle@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---

v1 -> v2:
   * Add missing newline characters in two places (Christian).
   * Pick up tags.

---
 drivers/cpufreq/cpufreq.c |   32 ++++++++++++++++++++++++++++++++
 include/linux/cpufreq.h   |    2 ++
 kernel/sched/topology.c   |   25 +++++--------------------
 3 files changed, 39 insertions(+), 20 deletions(-)

Comments

Marek Szyprowski May 9, 2025, 11:49 p.m. UTC | #1
On 06.05.2025 22:37, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>
> Doing cpufreq-specific EAS checks that require accessing policy
> internals directly from sched_is_eas_possible() is a bit unfortunate,
> so introduce cpufreq_ready_for_eas() in cpufreq, move those checks
> into that new function and make sched_is_eas_possible() call it.
>
> While at it, address a possible race between the EAS governor check
> and governor change by doing the former under the policy rwsem.
>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> Reviewed-by: Christian Loehle <christian.loehle@arm.com>
> Tested-by: Christian Loehle <christian.loehle@arm.com>
> Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>

In my tests I've noticed that this patch, merged as commit 4854649b1fb4 
("cpufreq/sched: Move cpufreq-specific EAS checks to cpufreq"), causes a 
regression on ARM64 Amlogic Meson SoC based OdroidN2 board. The board 
finally locks up. Reverting $subject on top of next-20250509 fixes this 
issue. Here is the lockdep warning observed before the lockup:

======================================================
WARNING: possible circular locking dependency detected
6.15.0-rc5-next-20250509-dirty #10335 Tainted: G         C
cpufreq: cpufreq_policy_online: CPU2: Running at unlisted initial 
frequency: 999999 kHz, changing to: 1000000 kHz
------------------------------------------------------
kworker/3:1/79 is trying to acquire lock:
ffff00000494b380 (&policy->rwsem){++++}-{4:4}, at: 
cpufreq_ready_for_eas+0x60/0xbc

but task is already holding lock:
ffff8000832887a0 (sched_domains_mutex){+.+.}-{4:4}, at: 
partition_sched_domains+0x54/0x938

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (sched_domains_mutex){+.+.}-{4:4}:
        __mutex_lock+0xa8/0x598
        mutex_lock_nested+0x24/0x30
        partition_sched_domains+0x54/0x938
        rebuild_sched_domains_locked+0x2d4/0x900
        rebuild_sched_domains+0x2c/0x48
        rebuild_sched_domains_energy+0x3c/0x58
        rebuild_sd_workfn+0x10/0x1c
        process_one_work+0x208/0x604
        worker_thread+0x244/0x388
        kthread+0x150/0x228
        ret_from_fork+0x10/0x20

-> #1 (cpuset_mutex){+.+.}-{4:4}:
        __mutex_lock+0xa8/0x598
        mutex_lock_nested+0x24/0x30
        cpuset_lock+0x1c/0x28
        __sched_setscheduler+0x31c/0x830
        sched_setattr_nocheck+0x18/0x24
        sugov_init+0x1b4/0x388
        cpufreq_init_governor.part.0+0x58/0xd4
        cpufreq_set_policy+0x2c8/0x3ec
        cpufreq_online+0x520/0xb20
        cpufreq_add_dev+0x80/0x98
        subsys_interface_register+0xfc/0x118
        cpufreq_register_driver+0x150/0x238
        dt_cpufreq_probe+0x148/0x488
        platform_probe+0x68/0xdc
        really_probe+0xbc/0x298
        __driver_probe_device+0x78/0x12c
        driver_probe_device+0xdc/0x164
        __device_attach_driver+0xb8/0x138
        bus_for_each_drv+0x80/0xdc
        __device_attach+0xa8/0x1b0
        device_initial_probe+0x14/0x20
        bus_probe_device+0xb0/0xb4
        deferred_probe_work_func+0x8c/0xc8
        process_one_work+0x208/0x604
        worker_thread+0x244/0x388
        kthread+0x150/0x228
        ret_from_fork+0x10/0x20

-> #0 (&policy->rwsem){++++}-{4:4}:
        __lock_acquire+0x1408/0x2254
        lock_acquire+0x1c8/0x354
        down_read+0x60/0x180
        cpufreq_ready_for_eas+0x60/0xbc
        sched_is_eas_possible+0x144/0x170
        partition_sched_domains+0x504/0x938
        rebuild_sched_domains_locked+0x2d4/0x900
        rebuild_sched_domains+0x2c/0x48
        rebuild_sched_domains_energy+0x3c/0x58
        rebuild_sd_workfn+0x10/0x1c
        process_one_work+0x208/0x604
        worker_thread+0x244/0x388
        kthread+0x150/0x228
        ret_from_fork+0x10/0x20

other info that might help us debug this:

Chain exists of:
   &policy->rwsem --> cpuset_mutex --> sched_domains_mutex

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(sched_domains_mutex);
                                lock(cpuset_mutex);
                                lock(sched_domains_mutex);
   rlock(&policy->rwsem);

  *** DEADLOCK ***

6 locks held by kworker/3:1/79:
  #0: ffff00000000a748 ((wq_completion)events){+.+.}-{0:0}, at: 
process_one_work+0x18c/0x604
  #1: ffff800084a83dd0 (rebuild_sd_work){+.+.}-{0:0}, at: 
process_one_work+0x1b4/0x604
  #2: ffff800083288908 (sched_energy_mutex){+.+.}-{4:4}, at: 
rebuild_sched_domains_energy+0x30/0x58
  #3: ffff80008327c2d8 (cpu_hotplug_lock){++++}-{0:0}, at: 
cpus_read_lock+0x10/0x1c
  #4: ffff80008331b098 (cpuset_mutex){+.+.}-{4:4}, at: 
rebuild_sched_domains+0x28/0x48
  #5: ffff8000832887a0 (sched_domains_mutex){+.+.}-{4:4}, at: 
partition_sched_domains+0x54/0x938

stack backtrace:
CPU: 3 UID: 0 PID: 79 Comm: kworker/3:1 Tainted: G C          
6.15.0-rc5-next-20250509-dirty #10335 PREEMPT
Tainted: [C]=CRAP
Hardware name: Hardkernel ODROID-N2 (DT)
Workqueue: events rebuild_sd_workfn
Call trace:
  show_stack+0x18/0x24 (C)
  dump_stack_lvl+0x90/0xd0
  dump_stack+0x18/0x24
  print_circular_bug+0x298/0x37c
  check_noncircular+0x15c/0x170
  __lock_acquire+0x1408/0x2254
  lock_acquire+0x1c8/0x354
  down_read+0x60/0x180
  cpufreq_ready_for_eas+0x60/0xbc
  sched_is_eas_possible+0x144/0x170
  partition_sched_domains+0x504/0x938
  rebuild_sched_domains_locked+0x2d4/0x900
  rebuild_sched_domains+0x2c/0x48
  rebuild_sched_domains_energy+0x3c/0x58
  rebuild_sd_workfn+0x10/0x1c
  process_one_work+0x208/0x604
  worker_thread+0x244/0x388
  kthread+0x150/0x228
  ret_from_fork+0x10/0x20


> ---
>
> v1 -> v2:
>     * Add missing newline characters in two places (Christian).
>     * Pick up tags.
>
> ---
>   drivers/cpufreq/cpufreq.c |   32 ++++++++++++++++++++++++++++++++
>   include/linux/cpufreq.h   |    2 ++
>   kernel/sched/topology.c   |   25 +++++--------------------
>   3 files changed, 39 insertions(+), 20 deletions(-)
>
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -3056,6 +3056,38 @@
>   
>   	return 0;
>   }
> +
> +static bool cpufreq_policy_is_good_for_eas(unsigned int cpu)
> +{
> +	struct cpufreq_policy *policy __free(put_cpufreq_policy);
> +
> +	policy = cpufreq_cpu_get(cpu);
> +	if (!policy) {
> +		pr_debug("cpufreq policy not set for CPU: %d\n", cpu);
> +		return false;
> +	}
> +
> +	guard(cpufreq_policy_read)(policy);
> +
> +	return sugov_is_governor(policy);
> +}
> +
> +bool cpufreq_ready_for_eas(const struct cpumask *cpu_mask)
> +{
> +	unsigned int cpu;
> +
> +	/* Do not attempt EAS if schedutil is not being used. */
> +	for_each_cpu(cpu, cpu_mask) {
> +		if (!cpufreq_policy_is_good_for_eas(cpu)) {
> +			pr_debug("rd %*pbl: schedutil is mandatory for EAS\n",
> +				 cpumask_pr_args(cpu_mask));
> +			return false;
> +		}
> +	}
> +
> +	return true;
> +}
> +
>   module_param(off, int, 0444);
>   module_param_string(default_governor, default_governor, CPUFREQ_NAME_LEN, 0444);
>   core_initcall(cpufreq_core_init);
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -1237,6 +1237,8 @@
>   		struct cpufreq_frequency_table *table,
>   		unsigned int transition_latency);
>   
> +bool cpufreq_ready_for_eas(const struct cpumask *cpu_mask);
> +
>   static inline void cpufreq_register_em_with_opp(struct cpufreq_policy *policy)
>   {
>   	dev_pm_opp_of_register_em(get_cpu_device(policy->cpu),
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -212,8 +212,6 @@
>   static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
>   {
>   	bool any_asym_capacity = false;
> -	struct cpufreq_policy *policy;
> -	bool policy_is_ready;
>   	int i;
>   
>   	/* EAS is enabled for asymmetric CPU capacity topologies. */
> @@ -248,25 +246,12 @@
>   		return false;
>   	}
>   
> -	/* Do not attempt EAS if schedutil is not being used. */
> -	for_each_cpu(i, cpu_mask) {
> -		policy = cpufreq_cpu_get(i);
> -		if (!policy) {
> -			if (sched_debug()) {
> -				pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
> -					cpumask_pr_args(cpu_mask), i);
> -			}
> -			return false;
> -		}
> -		policy_is_ready = sugov_is_governor(policy);
> -		cpufreq_cpu_put(policy);
> -		if (!policy_is_ready) {
> -			if (sched_debug()) {
> -				pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
> -					cpumask_pr_args(cpu_mask));
> -			}
> -			return false;
> +	if (!cpufreq_ready_for_eas(cpu_mask)) {
> +		if (sched_debug()) {
> +			pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n",
> +				cpumask_pr_args(cpu_mask));
>   		}
> +		return false;
>   	}
>   
>   	return true;
>
>
>
>
Best regards
Marek Szyprowski May 12, 2025, 6:48 a.m. UTC | #2
On 10.05.2025 13:31, Rafael J. Wysocki wrote:
> On Sat, May 10, 2025 at 1:49 AM Marek Szyprowski
> <m.szyprowski@samsung.com> wrote:
>> On 06.05.2025 22:37, Rafael J. Wysocki wrote:
>>> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>>>
>>> Doing cpufreq-specific EAS checks that require accessing policy
>>> internals directly from sched_is_eas_possible() is a bit unfortunate,
>>> so introduce cpufreq_ready_for_eas() in cpufreq, move those checks
>>> into that new function and make sched_is_eas_possible() call it.
>>>
>>> While at it, address a possible race between the EAS governor check
>>> and governor change by doing the former under the policy rwsem.
>>>
>>> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>>> Reviewed-by: Christian Loehle <christian.loehle@arm.com>
>>> Tested-by: Christian Loehle <christian.loehle@arm.com>
>>> Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
>> In my tests I've noticed that this patch, merged as commit 4854649b1fb4
>> ("cpufreq/sched: Move cpufreq-specific EAS checks to cpufreq"), causes a
>> regression on ARM64 Amlogic Meson SoC based OdroidN2 board. The board
>> finally locks up. Reverting $subject on top of next-20250509 fixes this
>> issue. Here is the lockdep warning observed before the lockup:
> Thanks for the report!
>
>> ======================================================
>> WARNING: possible circular locking dependency detected
>> 6.15.0-rc5-next-20250509-dirty #10335 Tainted: G         C
>> cpufreq: cpufreq_policy_online: CPU2: Running at unlisted initial
>> frequency: 999999 kHz, changing to: 1000000 kHz
>> ------------------------------------------------------
>> kworker/3:1/79 is trying to acquire lock:
>> ffff00000494b380 (&policy->rwsem){++++}-{4:4}, at:
>> cpufreq_ready_for_eas+0x60/0xbc
>>
>> but task is already holding lock:
>> ffff8000832887a0 (sched_domains_mutex){+.+.}-{4:4}, at:
>> partition_sched_domains+0x54/0x938
>>
>> which lock already depends on the new lock.
>>
>> the existing dependency chain (in reverse order) is:
>>
>> -> #2 (sched_domains_mutex){+.+.}-{4:4}:
>>          __mutex_lock+0xa8/0x598
>>          mutex_lock_nested+0x24/0x30
>>          partition_sched_domains+0x54/0x938
>>          rebuild_sched_domains_locked+0x2d4/0x900
>>          rebuild_sched_domains+0x2c/0x48
>>          rebuild_sched_domains_energy+0x3c/0x58
>>          rebuild_sd_workfn+0x10/0x1c
>>          process_one_work+0x208/0x604
>>          worker_thread+0x244/0x388
>>          kthread+0x150/0x228
>>          ret_from_fork+0x10/0x20
>>
>> -> #1 (cpuset_mutex){+.+.}-{4:4}:
>>          __mutex_lock+0xa8/0x598
>>          mutex_lock_nested+0x24/0x30
>>          cpuset_lock+0x1c/0x28
>>          __sched_setscheduler+0x31c/0x830
>>          sched_setattr_nocheck+0x18/0x24
>>          sugov_init+0x1b4/0x388
>>          cpufreq_init_governor.part.0+0x58/0xd4
>>          cpufreq_set_policy+0x2c8/0x3ec
>>          cpufreq_online+0x520/0xb20
>>          cpufreq_add_dev+0x80/0x98
>>          subsys_interface_register+0xfc/0x118
>>          cpufreq_register_driver+0x150/0x238
>>          dt_cpufreq_probe+0x148/0x488
>>          platform_probe+0x68/0xdc
>>          really_probe+0xbc/0x298
>>          __driver_probe_device+0x78/0x12c
>>          driver_probe_device+0xdc/0x164
>>          __device_attach_driver+0xb8/0x138
>>          bus_for_each_drv+0x80/0xdc
>>          __device_attach+0xa8/0x1b0
>>          device_initial_probe+0x14/0x20
>>          bus_probe_device+0xb0/0xb4
>>          deferred_probe_work_func+0x8c/0xc8
>>          process_one_work+0x208/0x604
>>          worker_thread+0x244/0x388
>>          kthread+0x150/0x228
>>          ret_from_fork+0x10/0x20
>>
>> -> #0 (&policy->rwsem){++++}-{4:4}:
>>          __lock_acquire+0x1408/0x2254
>>          lock_acquire+0x1c8/0x354
>>          down_read+0x60/0x180
>>          cpufreq_ready_for_eas+0x60/0xbc
>>          sched_is_eas_possible+0x144/0x170
>>          partition_sched_domains+0x504/0x938
>>          rebuild_sched_domains_locked+0x2d4/0x900
>>          rebuild_sched_domains+0x2c/0x48
>>          rebuild_sched_domains_energy+0x3c/0x58
>>          rebuild_sd_workfn+0x10/0x1c
>>          process_one_work+0x208/0x604
>>          worker_thread+0x244/0x388
>>          kthread+0x150/0x228
>>          ret_from_fork+0x10/0x20
>>
>> other info that might help us debug this:
>>
>> Chain exists of:
>>     &policy->rwsem --> cpuset_mutex --> sched_domains_mutex
>>
>>    Possible unsafe locking scenario:
>>
>>          CPU0                    CPU1
>>          ----                    ----
>>     lock(sched_domains_mutex);
>>                                  lock(cpuset_mutex);
>>                                  lock(sched_domains_mutex);
>>     rlock(&policy->rwsem);
>>
>>    *** DEADLOCK ***
> Well, it turns out that trying to acquire policy->rwsem under
> sched_domains_mutex is a bad idea.  It was added to
> cpufreq_policy_is_good_for_eas() to address a theoretical race, so it
> can be dropped safely.  A theoretical race is better than a real
> deadlock.
>
> Please test the attached patch.

This fixed the observed issue. Thanks!

Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>


Best regards
Rafael J. Wysocki May 12, 2025, 5:47 p.m. UTC | #3
On Mon, May 12, 2025 at 7:24 PM Dietmar Eggemann
<dietmar.eggemann@arm.com> wrote:
>
> On 12/05/2025 14:53, Rafael J. Wysocki wrote:
> > On Mon, May 12, 2025 at 8:48 AM Marek Szyprowski
> > <m.szyprowski@samsung.com> wrote:
> >>
> >> On 10.05.2025 13:31, Rafael J. Wysocki wrote:
> >>> On Sat, May 10, 2025 at 1:49 AM Marek Szyprowski
> >>> <m.szyprowski@samsung.com> wrote:
> >>>> On 06.05.2025 22:37, Rafael J. Wysocki wrote:
> >>>>> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>
> [...]
>
> >>>>    *** DEADLOCK ***
> >>> Well, it turns out that trying to acquire policy->rwsem under
> >>> sched_domains_mutex is a bad idea.  It was added to
> >>> cpufreq_policy_is_good_for_eas() to address a theoretical race, so it
> >>> can be dropped safely.  A theoretical race is better than a real
> >>> deadlock.
> >>>
> >>> Please test the attached patch.
> >>
> >> This fixed the observed issue. Thanks!
> >>
> >> Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
> >> Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
> >
> > Thanks for the confirmation!
>
> See this on my Hikey 960 as well. I was wondering why Christian L. and I
> didn't catch this with RFT v1.0 (*) on i7-13700K.
>
> But it looks like that
>
> https://web.git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/log/?h=experimental/intel_pstate/eas-take2-extended
>
> mentioned in the patch-header of (*) didn't have this line in its
>
> commit 9ad047cade6b ("cpufreq/sched: Move cpufreq-specific EAS checks to
> cpufreq")

Yeah, I made this change later.  My bad.
diff mbox series

Patch

--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -3056,6 +3056,38 @@ 
 
 	return 0;
 }
+
+static bool cpufreq_policy_is_good_for_eas(unsigned int cpu)
+{
+	struct cpufreq_policy *policy __free(put_cpufreq_policy);
+
+	policy = cpufreq_cpu_get(cpu);
+	if (!policy) {
+		pr_debug("cpufreq policy not set for CPU: %d\n", cpu);
+		return false;
+	}
+
+	guard(cpufreq_policy_read)(policy);
+
+	return sugov_is_governor(policy);
+}
+
+bool cpufreq_ready_for_eas(const struct cpumask *cpu_mask)
+{
+	unsigned int cpu;
+
+	/* Do not attempt EAS if schedutil is not being used. */
+	for_each_cpu(cpu, cpu_mask) {
+		if (!cpufreq_policy_is_good_for_eas(cpu)) {
+			pr_debug("rd %*pbl: schedutil is mandatory for EAS\n",
+				 cpumask_pr_args(cpu_mask));
+			return false;
+		}
+	}
+
+	return true;
+}
+
 module_param(off, int, 0444);
 module_param_string(default_governor, default_governor, CPUFREQ_NAME_LEN, 0444);
 core_initcall(cpufreq_core_init);
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1237,6 +1237,8 @@ 
 		struct cpufreq_frequency_table *table,
 		unsigned int transition_latency);
 
+bool cpufreq_ready_for_eas(const struct cpumask *cpu_mask);
+
 static inline void cpufreq_register_em_with_opp(struct cpufreq_policy *policy)
 {
 	dev_pm_opp_of_register_em(get_cpu_device(policy->cpu),
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -212,8 +212,6 @@ 
 static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
 {
 	bool any_asym_capacity = false;
-	struct cpufreq_policy *policy;
-	bool policy_is_ready;
 	int i;
 
 	/* EAS is enabled for asymmetric CPU capacity topologies. */
@@ -248,25 +246,12 @@ 
 		return false;
 	}
 
-	/* Do not attempt EAS if schedutil is not being used. */
-	for_each_cpu(i, cpu_mask) {
-		policy = cpufreq_cpu_get(i);
-		if (!policy) {
-			if (sched_debug()) {
-				pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
-					cpumask_pr_args(cpu_mask), i);
-			}
-			return false;
-		}
-		policy_is_ready = sugov_is_governor(policy);
-		cpufreq_cpu_put(policy);
-		if (!policy_is_ready) {
-			if (sched_debug()) {
-				pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
-					cpumask_pr_args(cpu_mask));
-			}
-			return false;
+	if (!cpufreq_ready_for_eas(cpu_mask)) {
+		if (sched_debug()) {
+			pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n",
+				cpumask_pr_args(cpu_mask));
 		}
+		return false;
 	}
 
 	return true;