diff mbox

[2/2] cpufreq: serialize calls to __cpufreq_governor()

Message ID 5225E205.7080008@linux.vnet.ibm.com
State New
Headers show

Commit Message

Srivatsa S. Bhat Sept. 3, 2013, 1:20 p.m. UTC
On 09/01/2013 10:56 AM, Viresh Kumar wrote:
> We can't take a big lock around __cpufreq_governor() as this causes recursive
> locking for some cases. But calls to this routine must be serialized for every
> policy.
> 
> Lets introduce another variable which would guarantee serialization here.
> 
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
>  drivers/cpufreq/cpufreq.c | 7 ++++++-
>  include/linux/cpufreq.h   | 1 +
>  2 files changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index f320a20..4d5723db 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -1692,13 +1692,15 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
>  						policy->cpu, event);
> 
>  	mutex_lock(&cpufreq_governor_lock);
> -	if ((policy->governor_enabled && (event == CPUFREQ_GOV_START)) ||
> +	if (policy->governor_busy ||
> +		(policy->governor_enabled && (event == CPUFREQ_GOV_START)) ||
>  		(!policy->governor_enabled && ((event == CPUFREQ_GOV_LIMITS) ||
>  					       (event == CPUFREQ_GOV_STOP)))) {
>  		mutex_unlock(&cpufreq_governor_lock);
>  		return -EBUSY;
>  	}
> 
> +	policy->governor_busy = true;
>  	if (event == CPUFREQ_GOV_STOP)
>  		policy->governor_enabled = false;
>  	else if (event == CPUFREQ_GOV_START)
> @@ -1727,6 +1729,9 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
>  			((event == CPUFREQ_GOV_POLICY_EXIT) && !ret))
>  		module_put(policy->governor->owner);
> 
> +	mutex_lock(&cpufreq_governor_lock);
> +	policy->governor_busy = false;
> +	mutex_unlock(&cpufreq_governor_lock);
>  	return ret;
>  }
> 

This doesn't solve the problem completely: it prevents the store_*() task
from continuing *only* when it concurrently executes the __cpufreq_governor()
function along with the CPU offline task. But if the two calls don't overlap,
we will still have the possibility where the store_*() task tries to acquire
the timer mutex after the CPU offline task has just finished destroying it.

So, IMHO, the right fix is to synchronize with CPU hotplug. That way, the
store_*() thread will wait until the entire CPU offline operation is completed.
After that, if it continues, it will get -EBUSY, due to patch [1], since
policy->governor_enabled will be set to false.

[1]. https://patchwork.kernel.org/patch/2852463/

So here is the (completely untested) fix that I propose, as a replacement to
this patch [2/2]:




Regards,
Srivatsa S. Bhat

Comments

Rafael J. Wysocki Sept. 3, 2013, 7:40 p.m. UTC | #1
On Tuesday, September 03, 2013 06:50:05 PM Srivatsa S. Bhat wrote:
> On 09/01/2013 10:56 AM, Viresh Kumar wrote:
> > We can't take a big lock around __cpufreq_governor() as this causes recursive
> > locking for some cases. But calls to this routine must be serialized for every
> > policy.
> > 
> > Lets introduce another variable which would guarantee serialization here.
> > 
> > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> > ---
> >  drivers/cpufreq/cpufreq.c | 7 ++++++-
> >  include/linux/cpufreq.h   | 1 +
> >  2 files changed, 7 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> > index f320a20..4d5723db 100644
> > --- a/drivers/cpufreq/cpufreq.c
> > +++ b/drivers/cpufreq/cpufreq.c
> > @@ -1692,13 +1692,15 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
> >  						policy->cpu, event);
> > 
> >  	mutex_lock(&cpufreq_governor_lock);
> > -	if ((policy->governor_enabled && (event == CPUFREQ_GOV_START)) ||
> > +	if (policy->governor_busy ||
> > +		(policy->governor_enabled && (event == CPUFREQ_GOV_START)) ||
> >  		(!policy->governor_enabled && ((event == CPUFREQ_GOV_LIMITS) ||
> >  					       (event == CPUFREQ_GOV_STOP)))) {
> >  		mutex_unlock(&cpufreq_governor_lock);
> >  		return -EBUSY;
> >  	}
> > 
> > +	policy->governor_busy = true;
> >  	if (event == CPUFREQ_GOV_STOP)
> >  		policy->governor_enabled = false;
> >  	else if (event == CPUFREQ_GOV_START)
> > @@ -1727,6 +1729,9 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
> >  			((event == CPUFREQ_GOV_POLICY_EXIT) && !ret))
> >  		module_put(policy->governor->owner);
> > 
> > +	mutex_lock(&cpufreq_governor_lock);
> > +	policy->governor_busy = false;
> > +	mutex_unlock(&cpufreq_governor_lock);
> >  	return ret;
> >  }
> > 
> 
> This doesn't solve the problem completely: it prevents the store_*() task
> from continuing *only* when it concurrently executes the __cpufreq_governor()
> function along with the CPU offline task. But if the two calls don't overlap,
> we will still have the possibility where the store_*() task tries to acquire
> the timer mutex after the CPU offline task has just finished destroying it.

Yeah, I overlooked that.

> So, IMHO, the right fix is to synchronize with CPU hotplug. That way, the
> store_*() thread will wait until the entire CPU offline operation is completed.
> After that, if it continues, it will get -EBUSY, due to patch [1], since
> policy->governor_enabled will be set to false.
> 
> [1]. https://patchwork.kernel.org/patch/2852463/
> 
> So here is the (completely untested) fix that I propose, as a replacement to
> this patch [2/2]:

That looks reasonable to me.

Viresh?

> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 5c75e31..71c4fb2 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -440,17 +440,24 @@ static ssize_t store_##file_name					\
>  	unsigned int ret;						\
>  	struct cpufreq_policy new_policy;				\
>  									\
> +	get_online_cpus();						\
>  	ret = cpufreq_get_policy(&new_policy, policy->cpu);		\
> -	if (ret)							\
> -		return -EINVAL;						\
> +	if (ret) {							\
> +		ret = -EINVAL;						\
> +		goto out;						\
> +	}								\
>  									\
>  	ret = sscanf(buf, "%u", &new_policy.object);			\
> -	if (ret != 1)							\
> -		return -EINVAL;						\
> +	if (ret != 1) {							\
> +		ret = -EINVAL;						\
> +		goto out;						\
> +	}								\
>  									\
>  	ret = __cpufreq_set_policy(policy, &new_policy);		\
>  	policy->user_policy.object = policy->object;			\
>  									\
> +out:									\
> +	put_online_cpus();						\
>  	return ret ? ret : count;					\
>  }
>  
> @@ -494,17 +501,22 @@ static ssize_t store_scaling_governor(struct cpufreq_policy *policy,
>  	char	str_governor[16];
>  	struct cpufreq_policy new_policy;
>  
> +	get_online_cpus();
>  	ret = cpufreq_get_policy(&new_policy, policy->cpu);
>  	if (ret)
> -		return ret;
> +		goto out;
>  
>  	ret = sscanf(buf, "%15s", str_governor);
> -	if (ret != 1)
> -		return -EINVAL;
> +	if (ret != 1) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
>  
>  	if (cpufreq_parse_governor(str_governor, &new_policy.policy,
> -						&new_policy.governor))
> -		return -EINVAL;
> +						&new_policy.governor)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
>  
>  	/*
>  	 * Do not use cpufreq_set_policy here or the user_policy.max
> @@ -515,6 +527,9 @@ static ssize_t store_scaling_governor(struct cpufreq_policy *policy,
>  	policy->user_policy.policy = policy->policy;
>  	policy->user_policy.governor = policy->governor;
>  
> +out:
> +	put_online_cpus();
> +
>  	if (ret)
>  		return ret;
>  	else
> 
> 
> Regards,
> Srivatsa S. Bhat
>
Viresh Kumar Sept. 10, 2013, 6:52 a.m. UTC | #2
Back finally and I see lots of mails over cpufreq stuff.. :)

On 3 September 2013 18:50, Srivatsa S. Bhat
<srivatsa.bhat@linux.vnet.ibm.com> wrote:
> This doesn't solve the problem completely: it prevents the store_*() task
> from continuing *only* when it concurrently executes the __cpufreq_governor()
> function along with the CPU offline task. But if the two calls don't overlap,
> we will still have the possibility where the store_*() task tries to acquire
> the timer mutex after the CPU offline task has just finished destroying it.

How exactly? My brain is still on vacations :)

Anyway, this was one of the problem that I tried to solve with my patch.
But there can be other race conditions where things can go wrong and so
that patch may still be useful.

Call to __cpufreq_governor() must be serialized I believe.
Srivatsa S. Bhat Sept. 10, 2013, 8:47 a.m. UTC | #3
On 09/10/2013 12:22 PM, Viresh Kumar wrote:
> Back finally and I see lots of mails over cpufreq stuff.. :)
> 
> On 3 September 2013 18:50, Srivatsa S. Bhat
> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>> This doesn't solve the problem completely: it prevents the store_*() task
>> from continuing *only* when it concurrently executes the __cpufreq_governor()
>> function along with the CPU offline task. But if the two calls don't overlap,
>> we will still have the possibility where the store_*() task tries to acquire
>> the timer mutex after the CPU offline task has just finished destroying it.
> 
> How exactly? My brain is still on vacations :)
>

The race window is not limited to __cpufreq_governor() alone. It just starts there,
but doesn't end there; it extends beyond that point. That's the main problem. And
that's why serializing __cpufreq_governor() won't completely solve the issue.

CPU_DOWN:

__cpufreq_governor(STOP)  ----+
  ...                         |
  ...                         | RACE
  ...                         | WINDOW
  ...                         |
sysfs teardown            ----+
 
In the STOP call, we destroy the timer mutex and set cur_policy = NULL. So the
incorrect access to timer mutex can occur from that point until we finally unlink
or teardown the sysfs files and prevent userspace from writing to those files.
Thus, this window involves stuff other than the call to __cpufreq_governor() as
well. So serializing __cpufreq_governor() alone is not enough.

> Anyway, this was one of the problem that I tried to solve with my patch.
> But there can be other race conditions where things can go wrong and so
> that patch may still be useful.
>

I agree, but see below (its the "may be useful" part that I'm not convinced
about).
 
> Call to __cpufreq_governor() must be serialized I believe.
> 

Sure, its a good idea to serialize __cpufreq_governor(). However, we need
specific justification for that. Just a hunch that something will go wrong is
not enough, IMHO. We have to *prove* that something is indeed wrong, and only
then add appropriate synchronization to fix it.

Besides, even the commit message appeared a bit odd. It mentions something about
problems with recursive calls. What cases are those? When do we end up recursively
calling __cpufreq_governor()?  And hence, why can't we use plain locks and must
instead use yet another quick-and-dirty variable to protect that function?
[Using variables like that is bad from a lockdep perspective as well (apart from
other things) : lockdep can't catch any resulting locking issues.]

Other related questions that are worth answering would be: why should we add the
synchronization *inside* __cpufreq_governor()? Why not add it at its call-sites?

IOW, I'm all in for fixing problems, just that I'd be comfortable with the
changes only when we completely understand what problem we are fixing and why
that patch is the right fix for that problem.

Regards,
Srivatsa S. Bhat
Viresh Kumar Sept. 13, 2013, 11:44 a.m. UTC | #4
On 4 September 2013 01:10, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Tuesday, September 03, 2013 06:50:05 PM Srivatsa S. Bhat wrote:

>> This doesn't solve the problem completely: it prevents the store_*() task
>> from continuing *only* when it concurrently executes the __cpufreq_governor()
>> function along with the CPU offline task. But if the two calls don't overlap,
>> we will still have the possibility where the store_*() task tries to acquire
>> the timer mutex after the CPU offline task has just finished destroying it.
>
> Yeah, I overlooked that.

As a background, I had a IRC chat with Srivatsa on this mail.. (I have marked
this unread as there were other important topics to close)..

And he had some other code in mind and these synchronization problems
aren't there with my patch at all (as per him too)..

Rafael, probably both me and Srivatsa are missing something that you
understood, can you please share what problem you see here with my patch?

And yes, even with Srivatsa's patchset I found a problem:
Two threads, one changing governor from ondemand->conservative and other
one changing min/max freq..

First one will try to STOP governor and other one will try to change
limits of gov.
Suppose 2nd one gets to ->governor() and after this first one stops
the governor.
Now the first one tries to access lock and crashes..

On IRC, Srivatsa agreed about the problem..

So, probably the first thing to do is to get this patch back, i.e.
revert of Srivatsa's
patch:

commit 56d07db274b7b15ca38b60ea4a762d40de093000
Author: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Date:   Sat Sep 7 01:23:55 2013 +0530

    cpufreq: Remove temporary fix for race between CPU hotplug and sysfs-writes


Srivatsa also asked if we can get a big lock around call to ->governor()
which would create recursive locks on a call to EXIT event (as we have seen it
earlier..)..

But he believes he can pull it off and will try this later.. So, for
now we can revert
the above patch :)

--
viresh
Rafael J. Wysocki Sept. 13, 2013, 11:48 p.m. UTC | #5
On Friday, September 13, 2013 05:14:26 PM Viresh Kumar wrote:
> On 4 September 2013 01:10, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > On Tuesday, September 03, 2013 06:50:05 PM Srivatsa S. Bhat wrote:
> 
> >> This doesn't solve the problem completely: it prevents the store_*() task
> >> from continuing *only* when it concurrently executes the __cpufreq_governor()
> >> function along with the CPU offline task. But if the two calls don't overlap,
> >> we will still have the possibility where the store_*() task tries to acquire
> >> the timer mutex after the CPU offline task has just finished destroying it.
> >
> > Yeah, I overlooked that.
> 
> As a background, I had a IRC chat with Srivatsa on this mail.. (I have marked
> this unread as there were other important topics to close)..
> 
> And he had some other code in mind and these synchronization problems
> aren't there with my patch at all (as per him too)..
> 
> Rafael, probably both me and Srivatsa are missing something that you
> understood, can you please share what problem you see here with my patch?
> 
> And yes, even with Srivatsa's patchset I found a problem:
> Two threads, one changing governor from ondemand->conservative and other
> one changing min/max freq..
> 
> First one will try to STOP governor and other one will try to change
> limits of gov.
> Suppose 2nd one gets to ->governor() and after this first one stops
> the governor.
> Now the first one tries to access lock and crashes..
> 
> On IRC, Srivatsa agreed about the problem..
> 
> So, probably the first thing to do is to get this patch back, i.e.
> revert of Srivatsa's
> patch:
> 
> commit 56d07db274b7b15ca38b60ea4a762d40de093000
> Author: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
> Date:   Sat Sep 7 01:23:55 2013 +0530
> 
>     cpufreq: Remove temporary fix for race between CPU hotplug and sysfs-writes
> 
> 
> Srivatsa also asked if we can get a big lock around call to ->governor()
> which would create recursive locks on a call to EXIT event (as we have seen it
> earlier..)..
> 
> But he believes he can pull it off and will try this later.. So, for
> now we can revert
> the above patch :)

OK, I'll think about that, but not today and probably not within the next
few days, because I'm heading to New Orleans in several hours and really
have other stuff to take care of now.  Sorry about that.

Thanks,
Rafael
Viresh Kumar Sept. 14, 2013, 4:29 a.m. UTC | #6
On 14 September 2013 05:18, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> OK, I'll think about that, but not today and probably not within the next
> few days, because I'm heading to New Orleans in several hours and really
> have other stuff to take care of now.  Sorry about that.

No issues..
Srivatsa S. Bhat Sept. 17, 2013, 11:49 a.m. UTC | #7
On 09/13/2013 05:14 PM, Viresh Kumar wrote:
> On 4 September 2013 01:10, Rafael J. Wysocki <rjw@sisk.pl> wrote:
>> On Tuesday, September 03, 2013 06:50:05 PM Srivatsa S. Bhat wrote:
> 
>>> This doesn't solve the problem completely: it prevents the store_*() task
>>> from continuing *only* when it concurrently executes the __cpufreq_governor()
>>> function along with the CPU offline task. But if the two calls don't overlap,
>>> we will still have the possibility where the store_*() task tries to acquire
>>> the timer mutex after the CPU offline task has just finished destroying it.
>>
>> Yeah, I overlooked that.
> 
> As a background, I had a IRC chat with Srivatsa on this mail.. (I have marked
> this unread as there were other important topics to close)..
> 
> And he had some other code in mind and these synchronization problems
> aren't there with my patch at all (as per him too)..
> 
> Rafael, probably both me and Srivatsa are missing something that you
> understood, can you please share what problem you see here with my patch?
> 
> And yes, even with Srivatsa's patchset I found a problem:
> Two threads, one changing governor from ondemand->conservative and other
> one changing min/max freq..
> 
> First one will try to STOP governor and other one will try to change
> limits of gov.
> Suppose 2nd one gets to ->governor() and after this first one stops
> the governor.
> Now the first one tries to access lock and crashes..
> 
> On IRC, Srivatsa agreed about the problem..
> 

Actually, we had another round of discussions and decided that there is no
problem here. The reason is, the top-level store() routine takes the rwsem of
that policy for write; so only one store() can go at a time. So the race between
changing the governor and changing the min/max freq won't happen because only
one of them can execute at a given time. In fact, by extension, *any* two store()s
in cpufreq shouldn't race with one another.

So the conclusion is that the mainline code is fine in this aspect. Nothing
needs to be reverted.

That said, there are still certain low-priority/less-visible synchronization
issues to be fixed in cpufreq, and we'll take them up later, once the dust is
settled.

Regards,
Srivatsa S. Bhat
diff mbox

Patch

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 5c75e31..71c4fb2 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -440,17 +440,24 @@  static ssize_t store_##file_name					\
 	unsigned int ret;						\
 	struct cpufreq_policy new_policy;				\
 									\
+	get_online_cpus();						\
 	ret = cpufreq_get_policy(&new_policy, policy->cpu);		\
-	if (ret)							\
-		return -EINVAL;						\
+	if (ret) {							\
+		ret = -EINVAL;						\
+		goto out;						\
+	}								\
 									\
 	ret = sscanf(buf, "%u", &new_policy.object);			\
-	if (ret != 1)							\
-		return -EINVAL;						\
+	if (ret != 1) {							\
+		ret = -EINVAL;						\
+		goto out;						\
+	}								\
 									\
 	ret = __cpufreq_set_policy(policy, &new_policy);		\
 	policy->user_policy.object = policy->object;			\
 									\
+out:									\
+	put_online_cpus();						\
 	return ret ? ret : count;					\
 }
 
@@ -494,17 +501,22 @@  static ssize_t store_scaling_governor(struct cpufreq_policy *policy,
 	char	str_governor[16];
 	struct cpufreq_policy new_policy;
 
+	get_online_cpus();
 	ret = cpufreq_get_policy(&new_policy, policy->cpu);
 	if (ret)
-		return ret;
+		goto out;
 
 	ret = sscanf(buf, "%15s", str_governor);
-	if (ret != 1)
-		return -EINVAL;
+	if (ret != 1) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	if (cpufreq_parse_governor(str_governor, &new_policy.policy,
-						&new_policy.governor))
-		return -EINVAL;
+						&new_policy.governor)) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	/*
 	 * Do not use cpufreq_set_policy here or the user_policy.max
@@ -515,6 +527,9 @@  static ssize_t store_scaling_governor(struct cpufreq_policy *policy,
 	policy->user_policy.policy = policy->policy;
 	policy->user_policy.governor = policy->governor;
 
+out:
+	put_online_cpus();
+
 	if (ret)
 		return ret;
 	else