[v4,5/5] sched/fair: Track peak per-entity utilization

Message ID 1472640739-8778-6-git-send-email-morten.rasmussen@arm.com
State New

Commit Message

Morten Rasmussen Aug. 31, 2016, 10:52 a.m. UTC
When PELT (per-entity load tracking) utilization is used to place tasks
at wake-up, the decayed utilization (due to sleep) leads to
under-estimation of the task's true utilization. This can mean putting
the task on a cpu with less available capacity than is actually needed.
The issue can be mitigated by using 'peak' utilization instead of the
decayed utilization for placement decisions, e.g. at task wake-up.

The 'peak' utilization metric, util_peak, tracks util_avg while the task
is running and retains its previous value while the task is
blocked/waiting on the rq. It is instantly updated to track util_avg
again as soon as the task is running again.
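
For illustration, a minimal standalone C sketch (userspace, not kernel
code) of how util_peak behaves while a task is asleep; the 1 ms step,
the y^32 = 1/2 decay factor and the numbers are only illustrative
approximations of the PELT behaviour:

#include <stdio.h>
#include <math.h>

int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* per-ms decay factor */
	double util_avg = 800.0;	/* utilization when the task went to sleep */
	double util_peak = util_avg;
	const int running = 0;		/* the task is blocked/sleeping */
	int ms;

	for (ms = 1; ms <= 64; ms++) {
		util_avg *= y;		/* util_avg decays while the task sleeps */

		/* The tracking rule added to __update_load_avg() below: */
		if (running || util_avg > util_peak)
			util_peak = util_avg;

		if (ms % 16 == 0)
			printf("asleep %2d ms: util_avg=%5.1f util_peak=%5.1f\n",
			       ms, util_avg, util_peak);
	}
	return 0;
}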

cc: Ingo Molnar <mingo@redhat.com>
cc: Peter Zijlstra <peterz@infradead.org>

Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>

---
 include/linux/sched.h |  2 +-
 kernel/sched/fair.c   | 23 ++++++++++++++---------
 2 files changed, 15 insertions(+), 10 deletions(-)

-- 
1.9.1

Comments

Patrick Bellasi Sept. 1, 2016, 11:51 a.m. UTC | #1
On 31-Aug 11:52, Morten Rasmussen wrote:
> When using PELT (per-entity load tracking) utilization to place tasks at
> wake-up using the decayed utilization (due to sleep) leads to
> under-estimation of true utilization of the task. This could mean
> putting the task on a cpu with less available capacity than is actually
> needed. This issue can be mitigated by using 'peak' utilization instead
> of the decayed utilization for placement decisions, e.g. at task
> wake-up.
> 
> The 'peak' utilization metric, util_peak, tracks util_avg when the task
> is running and retains its previous value while the task is
> blocked/waiting on the rq. It is instantly updated to track util_avg
> again as soon as the task running again.
> 
> cc: Ingo Molnar <mingo@redhat.com>
> cc: Peter Zijlstra <peterz@infradead.org>
> 
> Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
> ---
>  include/linux/sched.h |  2 +-
>  kernel/sched/fair.c   | 23 ++++++++++++++---------
>  2 files changed, 15 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d75024053e9b..fff4e4b6e654 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1282,7 +1282,7 @@ struct load_weight {
>  struct sched_avg {
>  	u64 last_update_time, load_sum;
>  	u32 util_sum, period_contrib;
> -	unsigned long load_avg, util_avg;
> +	unsigned long load_avg, util_avg, util_peak;

By adding util_peak here (in sched_avg) we implicitly define a new
signal for CFS RQs as well, but in the rest of this patch it seems we
use it only for tasks?

Overall this seems to be a filtered signal on top of PELT, but just for
tasks. Perhaps something similar could be useful for CPU utilization as
well...

>  };
>  
>  #ifdef CONFIG_SCHEDSTATS
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 68d8b40c546b..27534e36555b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -692,6 +692,7 @@ void init_entity_runnable_average(struct sched_entity *se)
>  	 * At this point, util_avg won't be used in select_task_rq_fair anyway
>  	 */
>  	sa->util_avg = 0;
> +	sa->util_peak = 0;

For consistency with other sched_avg's signals, perhaps we should report
the value of util_peak from:

   kernel/sched/debug.c::{print_cfs_group_stats,proc_sched_show_task}

>  	sa->util_sum = 0;
>  	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
>  }
> @@ -743,6 +744,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
>  		} else {
>  			sa->util_avg = cap;
>  		}
> +		sa->util_peak = sa->util_avg;
>  		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
>  	}
>  
> @@ -2804,6 +2806,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
>  		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
>  	}
>  
> +	if (running || sa->util_avg > sa->util_peak)
> +		sa->util_peak = sa->util_avg;

Do we really need to update this new signal so often?

It seems that we use it only at wakeup time; is it not enough
to cache the util_avg value in dequeue_task_fair() in case of a
DEQUEUE_SLEEP?
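
Something along these lines is what I have in mind (hypothetical sketch
only, not compile-tested):

static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	/* ... existing dequeue path ... */

	/* Snapshot the last-running utilization only when going to sleep. */
	if (flags & DEQUEUE_SLEEP)
		p->se.avg.util_peak = p->se.avg.util_avg;
}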

> +
>  	return decayed;
>  }
>  
> @@ -5184,7 +5189,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
>  	return 1;
>  }
>  
> -static inline int task_util(struct task_struct *p);
> +static inline int task_util_peak(struct task_struct *p);
>  static int cpu_util_wake(int cpu, struct task_struct *p);
>  
>  static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
> @@ -5267,14 +5272,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>  	/*
>  	 * The cross-over point between using spare capacity or least load
>  	 * is too conservative for high utilization tasks on partially
> -	 * utilized systems if we require spare_capacity > task_util(p),
> +	 * utilized systems if we require spare_capacity > task_util_peak(p),
>  	 * so we allow for some task stuffing by using
> -	 * spare_capacity > task_util(p)/2.
> +	 * spare_capacity > task_util_peak(p)/2.
>  	 */
> -	if (this_spare > task_util(p) / 2 &&
> +	if (this_spare > task_util_peak(p) / 2 &&
>  	    imbalance*this_spare > 100*most_spare)
>  		return NULL;
> -	else if (most_spare > task_util(p) / 2)
> +	else if (most_spare > task_util_peak(p) / 2)
>  		return most_spare_sg;
>  
>  	if (!idlest || 100*this_load < imbalance*min_load)
> @@ -5432,9 +5437,9 @@ static int cpu_util(int cpu)
>  	return (util >= capacity) ? capacity : util;
>  }
>  
> -static inline int task_util(struct task_struct *p)
> +static inline int task_util_peak(struct task_struct *p)
>  {
> -	return p->se.avg.util_avg;
> +	return p->se.avg.util_peak;
>  }
>  
>  /*
> @@ -5450,7 +5455,7 @@ static int cpu_util_wake(int cpu, struct task_struct *p)
>  		return cpu_util(cpu);
>  
>  	capacity = capacity_orig_of(cpu);
> -	util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
> +	util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util_peak(p), 0);
>  
>  	return (util >= capacity) ? capacity : util;
>  }
> @@ -5476,7 +5481,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
>  	/* Bring task utilization in sync with prev_cpu */
>  	sync_entity_load_avg(&p->se);
>  
> -	return min_cap * 1024 < task_util(p) * capacity_margin;
> +	return min_cap * 1024 < task_util_peak(p) * capacity_margin;
>  }
>  
>  /*
> -- 
> 1.9.1
> 

-- 
#include <best/regards.h>

Patrick Bellasi
Morten Rasmussen Sept. 1, 2016, 12:43 p.m. UTC | #2
On Thu, Sep 01, 2016 at 12:51:36PM +0100, Patrick Bellasi wrote:
> On 31-Aug 11:52, Morten Rasmussen wrote:

> > When using PELT (per-entity load tracking) utilization to place tasks at
> > wake-up using the decayed utilization (due to sleep) leads to
> > under-estimation of true utilization of the task. This could mean
> > putting the task on a cpu with less available capacity than is actually
> > needed. This issue can be mitigated by using 'peak' utilization instead
> > of the decayed utilization for placement decisions, e.g. at task
> > wake-up.
> > 
> > The 'peak' utilization metric, util_peak, tracks util_avg when the task
> > is running and retains its previous value while the task is
> > blocked/waiting on the rq. It is instantly updated to track util_avg
> > again as soon as the task running again.
> > 
> > cc: Ingo Molnar <mingo@redhat.com>
> > cc: Peter Zijlstra <peterz@infradead.org>
> > 
> > Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
> > ---
> >  include/linux/sched.h |  2 +-
> >  kernel/sched/fair.c   | 23 ++++++++++++++---------
> >  2 files changed, 15 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d75024053e9b..fff4e4b6e654 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1282,7 +1282,7 @@ struct load_weight {
> >  struct sched_avg {
> >  	u64 last_update_time, load_sum;
> >  	u32 util_sum, period_contrib;
> > -	unsigned long load_avg, util_avg;
> > +	unsigned long load_avg, util_avg, util_peak;
> 
> By adding util_peak here (in sched_avg) we implicitly define a new
> signal for CFS RQs as well, but in the rest of this patch it seems we
> use it only for tasks?
> 
> Overall this seems to be a filtered signal on top of PELT but just for
> tasks. Perhaps something similar can be useful for CPUs utilization as
> well...

Yes, sched_avg is used by both sched_entity and cfs_rq, and the current
code is structured to exploit this, so I didn't want to change that. At
the moment we only need util_peak to cache the non-decayed utilization
for wake-up placement, as patch 1 changes the task utilization used in
select_task_rq_fair() from non-decayed to up-to-date. That means we
would consistently under-estimate the task utilization in the remaining
patches of this v4 patch set if we used the up-to-date value.

It is currently not used anywhere else. It might be later on. I'm not
sure how it should be defined for cfs_rqs to make any sense.

> 
> >  };
> >  
> >  #ifdef CONFIG_SCHEDSTATS
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 68d8b40c546b..27534e36555b 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -692,6 +692,7 @@ void init_entity_runnable_average(struct sched_entity *se)
> >  	 * At this point, util_avg won't be used in select_task_rq_fair anyway
> >  	 */
> >  	sa->util_avg = 0;
> > +	sa->util_peak = 0;
> 
> For consistency with other sched_avg's signals, perhaps we should report
> the value of util_peak from:
> 
>    kernel/sched/debug.c::{print_cfs_group_stats,proc_sched_show_task}

I forgot to add the debug bits. I'm not sure how useful it is though. It
is just an outdated snapshot, but if the others are there, this one
should be there too.
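
Something like the following next to the existing se.avg entries in
proc_sched_show_task() would presumably do (sketch only):

	P(se.avg.util_avg);
	P(se.avg.util_peak);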

> 
> >  	sa->util_sum = 0;
> >  	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
> >  }
> > @@ -743,6 +744,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
> >  		} else {
> >  			sa->util_avg = cap;
> >  		}
> > +		sa->util_peak = sa->util_avg;
> >  		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
> >  	}
> >  
> > @@ -2804,6 +2806,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> >  		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
> >  	}
> >  
> > +	if (running || sa->util_avg > sa->util_peak)
> > +		sa->util_peak = sa->util_avg;
> 
> Do we really need to update this new signal so often?
> 
> It seems that we use it only at wakeup time, is it not enough
> to cache the util_avg value in dequeue_task_fair() in case of a
> DEQUEUE_SLEEP?

I think we could hook into the dequeue path and do it there instead. It
does cause an additional comparison and potentially the write of one
additional variable, but I thought it was a very simple and
easy-to-understand implementation.

This implementation has the side-effect that task_util_peak() does not
maintain its previous peak value until the task sleeps again. But
currently we don't use it in between, so why not.

I will give it a try.

Morten

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d75024053e9b..fff4e4b6e654 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1282,7 +1282,7 @@  struct load_weight {
 struct sched_avg {
 	u64 last_update_time, load_sum;
 	u32 util_sum, period_contrib;
-	unsigned long load_avg, util_avg;
+	unsigned long load_avg, util_avg, util_peak;
 };
 
 #ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 68d8b40c546b..27534e36555b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -692,6 +692,7 @@  void init_entity_runnable_average(struct sched_entity *se)
 	 * At this point, util_avg won't be used in select_task_rq_fair anyway
 	 */
 	sa->util_avg = 0;
+	sa->util_peak = 0;
 	sa->util_sum = 0;
 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
@@ -743,6 +744,7 @@  void post_init_entity_util_avg(struct sched_entity *se)
 		} else {
 			sa->util_avg = cap;
 		}
+		sa->util_peak = sa->util_avg;
 		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 	}
 
@@ -2804,6 +2806,9 @@  __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 	}
 
+	if (running || sa->util_avg > sa->util_peak)
+		sa->util_peak = sa->util_avg;
+
 	return decayed;
 }
 
@@ -5184,7 +5189,7 @@  static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return 1;
 }
 
-static inline int task_util(struct task_struct *p);
+static inline int task_util_peak(struct task_struct *p);
 static int cpu_util_wake(int cpu, struct task_struct *p);
 
 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -5267,14 +5272,14 @@  find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 	/*
 	 * The cross-over point between using spare capacity or least load
 	 * is too conservative for high utilization tasks on partially
-	 * utilized systems if we require spare_capacity > task_util(p),
+	 * utilized systems if we require spare_capacity > task_util_peak(p),
 	 * so we allow for some task stuffing by using
-	 * spare_capacity > task_util(p)/2.
+	 * spare_capacity > task_util_peak(p)/2.
 	 */
-	if (this_spare > task_util(p) / 2 &&
+	if (this_spare > task_util_peak(p) / 2 &&
 	    imbalance*this_spare > 100*most_spare)
 		return NULL;
-	else if (most_spare > task_util(p) / 2)
+	else if (most_spare > task_util_peak(p) / 2)
 		return most_spare_sg;
 
 	if (!idlest || 100*this_load < imbalance*min_load)
@@ -5432,9 +5437,9 @@  static int cpu_util(int cpu)
 	return (util >= capacity) ? capacity : util;
 }
 
-static inline int task_util(struct task_struct *p)
+static inline int task_util_peak(struct task_struct *p)
 {
-	return p->se.avg.util_avg;
+	return p->se.avg.util_peak;
 }
 
 /*
@@ -5450,7 +5455,7 @@  static int cpu_util_wake(int cpu, struct task_struct *p)
 		return cpu_util(cpu);
 
 	capacity = capacity_orig_of(cpu);
-	util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+	util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util_peak(p), 0);
 
 	return (util >= capacity) ? capacity : util;
 }
@@ -5476,7 +5481,7 @@  static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 	/* Bring task utilization in sync with prev_cpu */
 	sync_entity_load_avg(&p->se);
 
-	return min_cap * 1024 < task_util(p) * capacity_margin;
+	return min_cap * 1024 < task_util_peak(p) * capacity_margin;
 }
 
 /*