diff mbox

[v2,2/2] sched/fair: leverage the idle state info when choosing the "idlest" cpu

Message ID 1409844730-12273-3-git-send-email-nicolas.pitre@linaro.org
State Accepted
Commit 83a0a96a5f26d974580fd7251043ff70c8f1823d
Headers show

Commit Message

Nicolas Pitre Sept. 4, 2014, 3:32 p.m. UTC
The code in find_idlest_cpu() looks for the CPU with the smallest load.
However, if multiple CPUs are idle, the first idle CPU is selected
irrespective of the depth of its idle state.

Among the idle CPUs we should pick the one with with the shallowest idle
state, or the latest to have gone idle if all idle CPUs are in the same
state.  The later applies even when cpuidle is configured out.

This patch doesn't cover the following issues:

- The idle exit latency of a CPU might be larger than the time needed
  to migrate the waking task to an already running CPU with sufficient
  capacity, and therefore performance would benefit from task packing
  in such case (in most cases task packing is about power saving).

- Some idle states have a non negligible and non abortable entry latency
  which needs to run to completion before the exit latency can start.
  A concurrent patch series is making this info available to the cpuidle
  core.  Once available, the entry latency with the idle timestamp could
  determine when the exit latency may be effective.

Those issues will be handled in due course.  In the mean time, what
is implemented here should improve things already compared to the current
state of affairs.

Based on an initial patch from Daniel Lezcano.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 kernel/sched/fair.c | 43 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 7 deletions(-)

Comments

Daniel Lezcano Sept. 5, 2014, 7:52 a.m. UTC | #1
On 09/04/2014 05:32 PM, Nicolas Pitre wrote:
> The code in find_idlest_cpu() looks for the CPU with the smallest load.
> However, if multiple CPUs are idle, the first idle CPU is selected
> irrespective of the depth of its idle state.
>
> Among the idle CPUs we should pick the one with with the shallowest idle
> state, or the latest to have gone idle if all idle CPUs are in the same
> state.  The later applies even when cpuidle is configured out.
>
> This patch doesn't cover the following issues:
>
> - The idle exit latency of a CPU might be larger than the time needed
>    to migrate the waking task to an already running CPU with sufficient
>    capacity, and therefore performance would benefit from task packing
>    in such case (in most cases task packing is about power saving).
>
> - Some idle states have a non negligible and non abortable entry latency
>    which needs to run to completion before the exit latency can start.
>    A concurrent patch series is making this info available to the cpuidle
>    core.  Once available, the entry latency with the idle timestamp could
>    determine when the exit latency may be effective.
>
> Those issues will be handled in due course.  In the mean time, what
> is implemented here should improve things already compared to the current
> state of affairs.
>
> Based on an initial patch from Daniel Lezcano.
>
> Signed-off-by: Nicolas Pitre <nico@linaro.org>

Sounds good for me.

Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>

> ---
>   kernel/sched/fair.c | 43 ++++++++++++++++++++++++++++++++++++-------
>   1 file changed, 36 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bfa3c86d0d..416329e1a6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -23,6 +23,7 @@
>   #include <linux/latencytop.h>
>   #include <linux/sched.h>
>   #include <linux/cpumask.h>
> +#include <linux/cpuidle.h>
>   #include <linux/slab.h>
>   #include <linux/profile.h>
>   #include <linux/interrupt.h>
> @@ -4428,20 +4429,48 @@ static int
>   find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
>   {
>   	unsigned long load, min_load = ULONG_MAX;
> -	int idlest = -1;
> +	unsigned int min_exit_latency = UINT_MAX;
> +	u64 latest_idle_timestamp = 0;
> +	int least_loaded_cpu = this_cpu;
> +	int shallowest_idle_cpu = -1;
>   	int i;
>
>   	/* Traverse only the allowed CPUs */
>   	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
> -		load = weighted_cpuload(i);
> -
> -		if (load < min_load || (load == min_load && i == this_cpu)) {
> -			min_load = load;
> -			idlest = i;
> +		if (idle_cpu(i)) {
> +			struct rq *rq = cpu_rq(i);
> +			struct cpuidle_state *idle = idle_get_state(rq);
> +			if (idle && idle->exit_latency < min_exit_latency) {
> +				/*
> +				 * We give priority to a CPU whose idle state
> +				 * has the smallest exit latency irrespective
> +				 * of any idle timestamp.
> +				 */
> +				min_exit_latency = idle->exit_latency;
> +				latest_idle_timestamp = rq->idle_stamp;
> +				shallowest_idle_cpu = i;
> +			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
> +				   rq->idle_stamp > latest_idle_timestamp) {
> +				/*
> +				 * If equal or no active idle state, then
> +				 * the most recently idled CPU might have
> +				 * a warmer cache.
> +				 */
> +				latest_idle_timestamp = rq->idle_stamp;
> +				shallowest_idle_cpu = i;
> +			}
> +			cpuidle_put_state(rq);
> +		} else {
> +			load = weighted_cpuload(i);
> +			if (load < min_load ||
> +			    (load == min_load && i == this_cpu)) {
> +				min_load = load;
> +				least_loaded_cpu = i;
> +			}
>   		}
>   	}
>
> -	return idlest;
> +	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
>   }
>
>   /*
>
Peter Zijlstra Sept. 19, 2014, 12:05 a.m. UTC | #2
On Thu, Sep 04, 2014 at 11:32:10AM -0400, Nicolas Pitre wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bfa3c86d0d..416329e1a6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -23,6 +23,7 @@
>  #include <linux/latencytop.h>
>  #include <linux/sched.h>
>  #include <linux/cpumask.h>
> +#include <linux/cpuidle.h>
>  #include <linux/slab.h>
>  #include <linux/profile.h>
>  #include <linux/interrupt.h>
> @@ -4428,20 +4429,48 @@ static int
>  find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
>  {
>  	unsigned long load, min_load = ULONG_MAX;
> -	int idlest = -1;
> +	unsigned int min_exit_latency = UINT_MAX;
> +	u64 latest_idle_timestamp = 0;
> +	int least_loaded_cpu = this_cpu;
> +	int shallowest_idle_cpu = -1;
>  	int i;
>  
>  	/* Traverse only the allowed CPUs */
>  	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
> -		load = weighted_cpuload(i);
> -
> -		if (load < min_load || (load == min_load && i == this_cpu)) {
> -			min_load = load;
> -			idlest = i;
> +		if (idle_cpu(i)) {
> +			struct rq *rq = cpu_rq(i);
> +			struct cpuidle_state *idle = idle_get_state(rq);
> +			if (idle && idle->exit_latency < min_exit_latency) {
> +				/*
> +				 * We give priority to a CPU whose idle state
> +				 * has the smallest exit latency irrespective
> +				 * of any idle timestamp.
> +				 */
> +				min_exit_latency = idle->exit_latency;
> +				latest_idle_timestamp = rq->idle_stamp;
> +				shallowest_idle_cpu = i;
> +			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
> +				   rq->idle_stamp > latest_idle_timestamp) {
> +				/*
> +				 * If equal or no active idle state, then
> +				 * the most recently idled CPU might have
> +				 * a warmer cache.
> +				 */
> +				latest_idle_timestamp = rq->idle_stamp;
> +				shallowest_idle_cpu = i;
> +			}
> +			cpuidle_put_state(rq);

Right, matching the other changes, I killed that line. The rest looks
ok.

> +		} else {
> +			load = weighted_cpuload(i);
> +			if (load < min_load ||
> +			    (load == min_load && i == this_cpu)) {
> +				min_load = load;
> +				least_loaded_cpu = i;
> +			}
>  		}
>  	}
>  
> -	return idlest;
> +	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
>  }

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Yao Dongdong Sept. 19, 2014, 4:49 a.m. UTC | #3
On 2014/9/4 23:32, Nicolas Pitre wrote:
> The code in find_idlest_cpu() looks for the CPU with the smallest load.
> However, if multiple CPUs are idle, the first idle CPU is selected
> irrespective of the depth of its idle state.
>
> Among the idle CPUs we should pick the one with with the shallowest idle
> state, or the latest to have gone idle if all idle CPUs are in the same
> state.  The later applies even when cpuidle is configured out.
>
> This patch doesn't cover the following issues:
>
> - The idle exit latency of a CPU might be larger than the time needed
>   to migrate the waking task to an already running CPU with sufficient
>   capacity, and therefore performance would benefit from task packing
>   in such case (in most cases task packing is about power saving).
>
> - Some idle states have a non negligible and non abortable entry latency
>   which needs to run to completion before the exit latency can start.
>   A concurrent patch series is making this info available to the cpuidle
>   core.  Once available, the entry latency with the idle timestamp could
>   determine when the exit latency may be effective.
>
> Those issues will be handled in due course.  In the mean time, what
> is implemented here should improve things already compared to the current
> state of affairs.
>
> Based on an initial patch from Daniel Lezcano.
>
> Signed-off-by: Nicolas Pitre <nico@linaro.org>
> ---
>  kernel/sched/fair.c | 43 ++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 36 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bfa3c86d0d..416329e1a6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -23,6 +23,7 @@
>  #include <linux/latencytop.h>
>  #include <linux/sched.h>
>  #include <linux/cpumask.h>
> +#include <linux/cpuidle.h>
>  #include <linux/slab.h>
>  #include <linux/profile.h>
>  #include <linux/interrupt.h>
> @@ -4428,20 +4429,48 @@ static int
>  find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
>  {
>  	unsigned long load, min_load = ULONG_MAX;
> -	int idlest = -1;
> +	unsigned int min_exit_latency = UINT_MAX;
> +	u64 latest_idle_timestamp = 0;
> +	int least_loaded_cpu = this_cpu;
> +	int shallowest_idle_cpu = -1;
>  	int i;
>  
>  	/* Traverse only the allowed CPUs */
>  	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
> -		load = weighted_cpuload(i);
> -
> -		if (load < min_load || (load == min_load && i == this_cpu)) {
> -			min_load = load;
> -			idlest = i;
> +		if (idle_cpu(i)) {
> +			struct rq *rq = cpu_rq(i);
> +			struct cpuidle_state *idle = idle_get_state(rq);
> +			if (idle && idle->exit_latency < min_exit_latency) {
> +				/*
> +				 * We give priority to a CPU whose idle state
> +				 * has the smallest exit latency irrespective
> +				 * of any idle timestamp.
> +				 */
> +				min_exit_latency = idle->exit_latency;
> +				latest_idle_timestamp = rq->idle_stamp;
> +				shallowest_idle_cpu = i;
> +			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
> +				   rq->idle_stamp > latest_idle_timestamp) {
> +				/*
> +				 * If equal or no active idle state, then
> +				 * the most recently idled CPU might have
> +				 * a warmer cache.
> +				 */
> +				latest_idle_timestamp = rq->idle_stamp;
> +				shallowest_idle_cpu = i;
> +			}
> +			cpuidle_put_state(rq);
> +		} else {
I think we needn't test no idle cpus after find an idle cpu.
And what about this?
                                     } else if (shallowest_idle_cpu == -1) {

> +			load = weighted_cpuload(i);
> +			if (load < min_load ||
> +			    (load == min_load && i == this_cpu)) {
> +				min_load = load;
> +				least_loaded_cpu = i;
> +			}
>  		}
>  	}
>  
> -	return idlest;
> +	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
>  }
>  
>  /*

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rik van Riel Sept. 30, 2014, 9:58 p.m. UTC | #4
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 09/04/2014 11:32 AM, Nicolas Pitre wrote:
> The code in find_idlest_cpu() looks for the CPU with the smallest
> load. However, if multiple CPUs are idle, the first idle CPU is
> selected irrespective of the depth of its idle state.
> 
> Among the idle CPUs we should pick the one with with the shallowest
> idle state, or the latest to have gone idle if all idle CPUs are in
> the same state.  The later applies even when cpuidle is configured
> out.
> 
> This patch doesn't cover the following issues:

The main thing it does not cover is already running tasks that
get woken up again, since select_idle_sibling() covers everything
except for newly forked and newly executed tasks.

I am looking at adding similar logic to select_idle_sibling()

- -- 
All rights reversed
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1

iQEcBAEBAgAGBQJUKyd9AAoJEM553pKExN6DXZgH/1R26XfYv2FzYV9IGbty3vVx
1kMPozdb7jRAUR8+WkUgs7ntkbqau2hC/nTjDsSsiLQwXdjaDdqvnbt8Y6srI1es
/Z+IRaIPGx24D7D6nB5sLgsAq6DsANUdtFK3TsED8+07LbiY71o64YQ3X1IEVyRO
FKBcDw9+DBPGVySKIdMm0h2txdnQ3Jy2lM3nKV8tBFheRuOhU4Rv/fumEYAUYvDV
J9y91RhKOeEJYmaYL6oQYtZgBqhDoJmh/0DjOrK6H71oZYiNWeIUxtieNXaNQp7B
Rd8khOVFLsf/qZK6qjmgnfO9Mm5ij/PvrALOBZt8O9KAD3/v3kXfWIm9tO1NDZU=
=kTdn
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Nicolas Pitre Sept. 30, 2014, 11:15 p.m. UTC | #5
On Tue, 30 Sep 2014, Rik van Riel wrote:

> On 09/04/2014 11:32 AM, Nicolas Pitre wrote:
> > The code in find_idlest_cpu() looks for the CPU with the smallest
> > load. However, if multiple CPUs are idle, the first idle CPU is
> > selected irrespective of the depth of its idle state.
> > 
> > Among the idle CPUs we should pick the one with with the shallowest
> > idle state, or the latest to have gone idle if all idle CPUs are in
> > the same state.  The later applies even when cpuidle is configured
> > out.
> > 
> > This patch doesn't cover the following issues:
> 
> The main thing it does not cover is already running tasks that
> get woken up again, since select_idle_sibling() covers everything
> except for newly forked and newly executed tasks.

True. Now that you bring this up, I remember that Peter mentioned it as 
well.

> I am looking at adding similar logic to select_idle_sibling()

OK thanks.


Nicolas
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Galbraith Oct. 3, 2014, 6:04 a.m. UTC | #6
On Thu, 2014-10-02 at 13:15 -0400, Rik van Riel wrote:

> This patch is ugly. I have not bothered cleaning it up, because it
> causes a regression with hackbench. Apparently for hackbench (and
> potentially other sync wakeups), locality is more important than
> idleness.
> 
> We may need to add a third clause before the search, something
> along the lines of, to ensure target gets selected if neither
> target or i are idle and the wakeup is synchronous...
> 
>     if (sync_wakeup && cpu_of(target)->nr_running == 1)
> 	return target;

I recommend you forget that trusting sync hint ever sprang to mind, it
is often a big fat lie.

-Mike

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Galbraith Oct. 3, 2014, 6:23 a.m. UTC | #7
On Thu, 2014-10-02 at 13:15 -0400, Rik van Riel wrote:

> Subject: sched,idle: teach select_idle_sibling about idle states
> 
> Change select_idle_sibling to take cpu idle exit latency into
> account.  First preference is to select the cpu with the lowest
> exit latency from a completely idle sched_group inside the CPU;
> if that is not available, we pick the CPU with the lowest exit
> latency in any sched_group.
> 
> This increases the total search time of select_idle_sibling,
> we may want to look into propagating load info up the sched_group
> tree in some way. That information would also be useful to prevent
> the wake_affine logic from causing a load imbalance between
> sched_groups.

A generic boo hiss aimed in the general direction of all of this let's
go look at every possibility on every wakeup stuff.  Less is more.

-Mike

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Oct. 3, 2014, 7:50 a.m. UTC | #8
On Fri, Oct 03, 2014 at 08:23:04AM +0200, Mike Galbraith wrote:
> On Thu, 2014-10-02 at 13:15 -0400, Rik van Riel wrote:
> 
> > Subject: sched,idle: teach select_idle_sibling about idle states
> > 
> > Change select_idle_sibling to take cpu idle exit latency into
> > account.  First preference is to select the cpu with the lowest
> > exit latency from a completely idle sched_group inside the CPU;
> > if that is not available, we pick the CPU with the lowest exit
> > latency in any sched_group.
> > 
> > This increases the total search time of select_idle_sibling,
> > we may want to look into propagating load info up the sched_group
> > tree in some way. That information would also be useful to prevent
> > the wake_affine logic from causing a load imbalance between
> > sched_groups.
> 
> A generic boo hiss aimed in the general direction of all of this let's
> go look at every possibility on every wakeup stuff.  Less is more.

I hear you, can you see actual slowdown with the patch? While the worst
case doesn't change, it does make the average case equal to the worst
case iteration -- where we previously would average out at inspecting
half the CPUs before finding an idle one, we'd now always inspect all of
them in order to compare all idle ones on their properties.

Also, with the latest generation of Haswell Xeons having 18 cores (36
threads) this is one massively painful loop for sure.

I'm just not sure what to do about it.. I suppose we can artificially
split it into smaller groups, but I bet that'll hurt some, but if we can
show it gains more we might still be able to do it. The only real
problem is actual numbers/workloads (isn't it always) :/

One thing I suppose we could try is keeping a 'busy' flag at the
llc domain which is set when all CPUs are busy (we'll clear it from
new_idle) that way we can avoid the entire iteration if we know its
pointless.

Hmm...
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Galbraith Oct. 3, 2014, 1:05 p.m. UTC | #9
On Fri, 2014-10-03 at 09:50 +0200, Peter Zijlstra wrote: 
> On Fri, Oct 03, 2014 at 08:23:04AM +0200, Mike Galbraith wrote:

> > A generic boo hiss aimed in the general direction of all of this let's
> > go look at every possibility on every wakeup stuff.  Less is more.
> 
> I hear you, can you see actual slowdown with the patch? While the worst
> case doesn't change, it does make the average case equal to the worst
> case iteration -- where we previously would average out at inspecting
> half the CPUs before finding an idle one, we'd now always inspect all of
> them in order to compare all idle ones on their properties.
> 
> Also, with the latest generation of Haswell Xeons having 18 cores (36
> threads) this is one massively painful loop for sure.

Yeah, the things are getting too damn big.  I didn't try the patch and
measure anything, my gut instantly said "nope, not worth it".
  
> I'm just not sure what to do about it.. I suppose we can artificially
> split it into smaller groups, but I bet that'll hurt some, but if we can
> show it gains more we might still be able to do it. The only real
> problem is actual numbers/workloads (isn't it always) :/
> 
> One thing I suppose we could try is keeping a 'busy' flag at the
> llc domain which is set when all CPUs are busy (we'll clear it from
> new_idle) that way we can avoid the entire iteration if we know its
> pointless.

On one of those huge packages, heck, even on a 8 core that could save a
substantial number of busy box cycles.

-Mike

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rik van Riel Oct. 3, 2014, 2:28 p.m. UTC | #10
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 10/03/2014 03:50 AM, Peter Zijlstra wrote:
> On Fri, Oct 03, 2014 at 08:23:04AM +0200, Mike Galbraith wrote:
>> On Thu, 2014-10-02 at 13:15 -0400, Rik van Riel wrote:
>> 
>>> Subject: sched,idle: teach select_idle_sibling about idle
>>> states
>>> 
>>> Change select_idle_sibling to take cpu idle exit latency into 
>>> account.  First preference is to select the cpu with the
>>> lowest exit latency from a completely idle sched_group inside
>>> the CPU; if that is not available, we pick the CPU with the
>>> lowest exit latency in any sched_group.
>>> 
>>> This increases the total search time of select_idle_sibling, we
>>> may want to look into propagating load info up the sched_group 
>>> tree in some way. That information would also be useful to
>>> prevent the wake_affine logic from causing a load imbalance
>>> between sched_groups.
>> 
>> A generic boo hiss aimed in the general direction of all of this
>> let's go look at every possibility on every wakeup stuff.  Less
>> is more.
> 
> I hear you, can you see actual slowdown with the patch? While the
> worst case doesn't change, it does make the average case equal to
> the worst case iteration -- where we previously would average out
> at inspecting half the CPUs before finding an idle one, we'd now
> always inspect all of them in order to compare all idle ones on
> their properties.
> 
> Also, with the latest generation of Haswell Xeons having 18 cores
> (36 threads) this is one massively painful loop for sure.

We have 3 different goals when selecting a runqueue for a task:
1) locality: get the task running close to where it has stuff cached
2) work preserving: get the task running ASAP, and preferably on a
   fully idle core
3) idle state latency: place the task on a CPU that can start running
   it ASAP

We may also consider the interplay of the above 3 to have an impact on
4) power use: pack tasks on some CPUs so other CPUs can go into deeper
   idle states

The current implementation is a "compromise" between (1) and (2),
with a strong preference for (2), falling back to (1) if no fully
idle core is found.

My ugly hack isn't any better, trading off (1) in order to be better
at (2) and (3). Whether it even affects (4) remains to be seen.

I know my patch is probably unacceptable, but I do think it is important
that we talk about the problem, and hopefully agree on exactly what the
problem is that we want to solve.

One big question in my mind is, when is locality more important, and
when is work preserving more important?  Do we have an answer to that
question?

The current code has the potential to be quite painful on systems with
a large number of cores per chip, so we will have to change things
anyway...

- -- 
All rights reversed
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1

iQEcBAEBAgAGBQJULrKaAAoJEM553pKExN6DVk4H/0d3vVXEezyIUgONluPwKwJC
6QFlaYkglMvfPM85aVLzj4JSQwGmgttXOZBcKvPxk76TbPEgee3lHsstqb0hmWKA
gJdNsR3q/56uUZz4nKTFZqHTXQ6JeXWhppCtd6dibfugo4gI6duvfNsugtOdggm7
1xfUamU6wNAa8VYl3XlHaAaXG4xApVgiNuAC/zRog4ckhfB/Rl2X+4A5Ki7F3eBa
6Gz1DvABd9UYXWvzmHZvB0B+cwSMUpApj5PlPIeo+ZceMCfw7vN20gdZdg/2trsn
weAQsc6ENGaadd5xPj3vsE5QS9oXUw14QM/RH74xy5A7iNyd5JToDRz67aKONiA=
=ZlKb
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Oct. 3, 2014, 2:46 p.m. UTC | #11
On Fri, Oct 03, 2014 at 10:28:42AM -0400, Rik van Riel wrote:
> We have 3 different goals when selecting a runqueue for a task:
> 1) locality: get the task running close to where it has stuff cached
> 2) work preserving: get the task running ASAP, and preferably on a
>    fully idle core
> 3) idle state latency: place the task on a CPU that can start running
>    it ASAP

3 can also be considered part of power aware, seeing how it will try and
let CPUs reach their deep idle potential.

> We may also consider the interplay of the above 3 to have an impact on
> 4) power use: pack tasks on some CPUs so other CPUs can go into deeper
>    idle states
> 
> The current implementation is a "compromise" between (1) and (2),
> with a strong preference for (2), falling back to (1) if no fully
> idle core is found.
> 
> My ugly hack isn't any better, trading off (1) in order to be better
> at (2) and (3). Whether it even affects (4) remains to be seen.
> 
> I know my patch is probably unacceptable, but I do think it is important
> that we talk about the problem, and hopefully agree on exactly what the
> problem is that we want to solve.

Yeah, we've been through this several times, it basically boils down to
the amount of fail vs win on 'various' workloads. The endless problem is
of course that the fail vs win ratio is entirely workload dependent and
as ever there is no comprehensive set.

The last time this came up was when Mike tried to do his cache buddy
idea, which basically reduced things to only looking at 2 cpus. That
make some things fly and some things tank.

> One big question in my mind is, when is locality more important, and
> when is work preserving more important?  Do we have an answer to that
> question?

Typically 2) is important when there's lots of short running tasks
around, any queueing typically destroys throughput in that case.

> The current code has the potential to be quite painful on systems with
> a large number of cores per chip, so we will have to change things
> anyway...

What I said.. so far we've failed at coming up with anything sane
though, so far we've found that 2 cpus is too small a slice to look at
and we're fairly sure 18/36 is too large :-)
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rik van Riel Oct. 3, 2014, 3:37 p.m. UTC | #12
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 10/03/2014 10:46 AM, Peter Zijlstra wrote:
> On Fri, Oct 03, 2014 at 10:28:42AM -0400, Rik van Riel wrote:

>> The current code has the potential to be quite painful on systems
>> with a large number of cores per chip, so we will have to change
>> things anyway...
> 
> What I said.. so far we've failed at coming up with anything sane 
> though, so far we've found that 2 cpus is too small a slice to look
> at and we're fairly sure 18/36 is too large :-)

Some more brainstorming points...

1) We should probably (lazily/batched?) propagate load information
   up the sched_group tree.  This will be useful for wake_affine,
   load_balancing, find_idlest_cpu, and select_idle_sibling

2) With both find_idlest_cpu and select_idle_sibling walking down
   the tree from the LLC level, they could probably share code

3) Counting both blocked and runnable load may give better long
   term stability of loads, resulting in a reduction in work
   preserving behaviour, but an improvement in locality - this
   could be worthwhile, but it is hard to say in advance

4) We can be pretty sure that CPU makers are not going to stop
   at a mere 18 cores. We need to subdivide things below the LLC
   level, turning select_idle_sibling and find_idlest_cpu into
   a tree walk.

   This means whatever selection criteria are used by these need
   to be propagated up the sched_group tree. This, in turn, means
   we probably need to restrict ourselves to things that do not get
   changed/updated too often.

Am I overlooking anything?

- -- 
All rights reversed
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1

iQEcBAEBAgAGBQJULsK7AAoJEM553pKExN6DtBEIAIJWwDPXfrIN6D4yH+/sY7Xg
cDRVDS978OW8GMx3/IqOD90PIvx/l/pttIHHkAcMfDv2Lv8QhiGJEX+OMQg9ETPq
bA31A5t3V3Wlnfc/0xeIMrebc2P3Wfe5s2DApiYPQbDzh47BimDJyeC/9XSqKyvk
CuOZR02t4/20axGwZhl8hk7vGTJhlJWPuh5RUHWjRi2shoHJM90nfZh144GDO3S7
EfiNlC9ZT9z9MYUL6FvCGA7yF+fwzIPE4ppU/KeoDVHsav2OKadV+MjsTQ/IHti2
p0Heu80jEmWW3/zv9zeMpa8jv6Xg8kNsaW709ZSBAzphen5g9sch170A0SdZCiU=
=gUXr
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicolas Pitre Oct. 3, 2014, 6:52 p.m. UTC | #13
On Fri, 3 Oct 2014, Rik van Riel wrote:

> We have 3 different goals when selecting a runqueue for a task:
> 1) locality: get the task running close to where it has stuff cached
> 2) work preserving: get the task running ASAP, and preferably on a
>    fully idle core
> 3) idle state latency: place the task on a CPU that can start running
>    it ASAP
> 
> We may also consider the interplay of the above 3 to have an impact on
> 4) power use: pack tasks on some CPUs so other CPUs can go into deeper
>    idle states

In my mind the actual choice is between (1) and (2).  Once you decided 
on (2) then obviously you should imply (3) all the time. And by having 
(3) then (4) should be a natural side effect by not selecting idle CPUs 
randomly.

By selecting (1) you already have (4).

The deficient part right now is (3) as a consequence of (2).  Fixing 
(3) should not have to affect (1).

> The current implementation is a "compromise" between (1) and (2),
> with a strong preference for (2), falling back to (1) if no fully
> idle core is found.
> 
> My ugly hack isn't any better, trading off (1) in order to be better
> at (2) and (3). Whether it even affects (4) remains to be seen.

(4) is greatly influenced by (3) on mobile platforms, especially those 
with a cluster topology.  This might not be as significant on server 
type systems, although performance should benefit as well from the 
smaller wake-up latency.

On a mobile system losing 10% performance to save 20% on power usage 
might be an excellent compromise.  Maybe not so on a server system where 
performance is everything.


Nicolas
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Oct. 9, 2014, 4:04 p.m. UTC | #14
On Fri, Oct 03, 2014 at 11:37:31AM -0400, Rik van Riel wrote:
> Some more brainstorming points...
> 
> 1) We should probably (lazily/batched?) propagate load information
>    up the sched_group tree.  This will be useful for wake_affine,
>    load_balancing, find_idlest_cpu, and select_idle_sibling
> 
> 2) With both find_idlest_cpu and select_idle_sibling walking down
>    the tree from the LLC level, they could probably share code
> 
> 3) Counting both blocked and runnable load may give better long
>    term stability of loads, resulting in a reduction in work
>    preserving behaviour, but an improvement in locality - this
>    could be worthwhile, but it is hard to say in advance
> 
> 4) We can be pretty sure that CPU makers are not going to stop
>    at a mere 18 cores. We need to subdivide things below the LLC
>    level, turning select_idle_sibling and find_idlest_cpu into
>    a tree walk.
> 
>    This means whatever selection criteria are used by these need
>    to be propagated up the sched_group tree. This, in turn, means
>    we probably need to restrict ourselves to things that do not get
>    changed/updated too often.
> 
> Am I overlooking anything?

Well, we can certainly try something like that; but your last point
seems like a contradition; seeing how _the_ important point for
select_idle_sibling() is the actual idle state, and that per definition
is something that can change/update often.

But yes, the only viable option is some artificial breakup of the
topology and we can indeed try and bridge the gap with some caching.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86d0d..416329e1a6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@ 
 #include <linux/latencytop.h>
 #include <linux/sched.h>
 #include <linux/cpumask.h>
+#include <linux/cpuidle.h>
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
@@ -4428,20 +4429,48 @@  static int
 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
 	unsigned long load, min_load = ULONG_MAX;
-	int idlest = -1;
+	unsigned int min_exit_latency = UINT_MAX;
+	u64 latest_idle_timestamp = 0;
+	int least_loaded_cpu = this_cpu;
+	int shallowest_idle_cpu = -1;
 	int i;
 
 	/* Traverse only the allowed CPUs */
 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
-		load = weighted_cpuload(i);
-
-		if (load < min_load || (load == min_load && i == this_cpu)) {
-			min_load = load;
-			idlest = i;
+		if (idle_cpu(i)) {
+			struct rq *rq = cpu_rq(i);
+			struct cpuidle_state *idle = idle_get_state(rq);
+			if (idle && idle->exit_latency < min_exit_latency) {
+				/*
+				 * We give priority to a CPU whose idle state
+				 * has the smallest exit latency irrespective
+				 * of any idle timestamp.
+				 */
+				min_exit_latency = idle->exit_latency;
+				latest_idle_timestamp = rq->idle_stamp;
+				shallowest_idle_cpu = i;
+			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
+				   rq->idle_stamp > latest_idle_timestamp) {
+				/*
+				 * If equal or no active idle state, then
+				 * the most recently idled CPU might have
+				 * a warmer cache.
+				 */
+				latest_idle_timestamp = rq->idle_stamp;
+				shallowest_idle_cpu = i;
+			}
+			cpuidle_put_state(rq);
+		} else {
+			load = weighted_cpuload(i);
+			if (load < min_load ||
+			    (load == min_load && i == this_cpu)) {
+				min_load = load;
+				least_loaded_cpu = i;
+			}
 		}
 	}
 
-	return idlest;
+	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
 /*