Message ID: 1411403047-32010-2-git-send-email-morten.rasmussen@arm.com
State: New
On 22 September 2014 18:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote: > From: Dietmar Eggemann <dietmar.eggemann@arm.com> > > The per-entity load-tracking currently neither accounts for frequency > changes due to frequency scaling (cpufreq) nor for micro-architectural > differences between cpus (ARM big.LITTLE). Comparing tracked loads > between different cpus might therefore be quite misleading. > > This patch introduces a scale-invariance scaling factor to the > load-tracking computation that can be used to compensate for compute > capacity variations. The scaling factor is to be provided by the > architecture through an arch specific function. It may be as simple as: > > current_freq(cpu) * SCHED_CAPACITY_SCALE / max_freq(cpu) > > If the architecture has more sophisticated ways of tracking compute > capacity, it can do so in its implementation. By default, no scaling is > applied. > > The patch is loosely based on a patch by Chris Redpath > <Chris.Redpath@arm.com>. > > cc: Paul Turner <pjt@google.com> > cc: Ben Segall <bsegall@google.com> > > Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com> > Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com> > --- > kernel/sched/fair.c | 32 ++++++++++++++++++++++++++------ > 1 file changed, 26 insertions(+), 6 deletions(-) > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 2a1e6ac..52abb3e 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -2267,6 +2267,8 @@ static u32 __compute_runnable_contrib(u64 n) > return contrib + runnable_avg_yN_sum[n]; > } > > +unsigned long arch_scale_load_capacity(int cpu); Why haven't you used arch_scale_freq_capacity which has a similar purpose in scaling the CPU capacity except the additional sched_domain pointer argument ? > + > /* > * We can represent the historical contribution to runnable average as the > * coefficients of a geometric series. To do this we sub-divide our runnable > @@ -2295,13 +2297,14 @@ static u32 __compute_runnable_contrib(u64 n) > * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) > * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] > */ > -static __always_inline int __update_entity_runnable_avg(u64 now, > +static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, > struct sched_avg *sa, > int runnable) > { > u64 delta, periods; > u32 runnable_contrib; > int delta_w, decayed = 0; > + u32 scale_cap = arch_scale_load_capacity(cpu); > > delta = now - sa->last_runnable_update; > /* > @@ -2334,8 +2337,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, > * period and accrue it. 
> */ > delta_w = 1024 - delta_w; > + > if (runnable) > - sa->runnable_avg_sum += delta_w; > + sa->runnable_avg_sum += (delta_w * scale_cap) > + >> SCHED_CAPACITY_SHIFT; > sa->runnable_avg_period += delta_w; > > delta -= delta_w; > @@ -2351,14 +2356,17 @@ static __always_inline int __update_entity_runnable_avg(u64 now, > > /* Efficiently calculate \sum (1..n_period) 1024*y^i */ > runnable_contrib = __compute_runnable_contrib(periods); > + > if (runnable) > - sa->runnable_avg_sum += runnable_contrib; > + sa->runnable_avg_sum += (runnable_contrib * scale_cap) > + >> SCHED_CAPACITY_SHIFT; > sa->runnable_avg_period += runnable_contrib; > } > > /* Remainder of delta accrued against u_0` */ > if (runnable) > - sa->runnable_avg_sum += delta; > + sa->runnable_avg_sum += (delta * scale_cap) > + >> SCHED_CAPACITY_SHIFT; If we take the example of an always running task, its runnable_avg_sum should stay at the LOAD_AVG_MAX value whatever the frequency of the CPU on which it runs. But your change links the max value of runnable_avg_sum with the current frequency of the CPU so an always running task will have a load contribution of 25% your proposed scaling is fine with usage_avg_sum which reflects the effective running time on the CPU but the runnable_avg_sum should be able to reach LOAD_AVG_MAX whatever the current frequency is Regards, Vincent > sa->runnable_avg_period += delta; > > return decayed; > @@ -2464,7 +2472,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) > > static inline void update_rq_runnable_avg(struct rq *rq, int runnable) > { > - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); > + __update_entity_runnable_avg(rq_clock_task(rq), rq->cpu, &rq->avg, > + runnable); > __update_tg_runnable_avg(&rq->avg, &rq->cfs); > } > #else /* CONFIG_FAIR_GROUP_SCHED */ > @@ -2518,6 +2527,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, > { > struct cfs_rq *cfs_rq = cfs_rq_of(se); > long contrib_delta; > + int cpu = rq_of(cfs_rq)->cpu; > u64 now; > > /* > @@ -2529,7 +2539,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, > else > now = cfs_rq_clock_task(group_cfs_rq(se)); > > - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) > + if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq)) > return; > > contrib_delta = __update_entity_load_avg_contrib(se); > @@ -5719,6 +5729,16 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) > return default_scale_cpu_capacity(sd, cpu); > } > > +static unsigned long default_scale_load_capacity(int cpu) > +{ > + return SCHED_CAPACITY_SCALE; > +} > + > +unsigned long __weak arch_scale_load_capacity(int cpu) > +{ > + return default_scale_load_capacity(cpu); > +} > + > static unsigned long scale_rt_capacity(int cpu) > { > struct rq *rq = cpu_rq(cpu); > -- > 1.7.9.5 > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
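To make the behaviour being debated here concrete, below is a small standalone C sketch (userspace, not kernel code) of the scaled accumulation in the hunk above: a task that is runnable for every full period on a cpu whose scaling factor is 256/1024 saturates at roughly a quarter of LOAD_AVG_MAX instead of reaching LOAD_AVG_MAX, which is the kind of capping Vincent is objecting to. The decay constant 1002/1024 only approximates the kernel's y (where y^32 = 0.5), so the numbers are close but not exact.

/*
 * Standalone sketch (not kernel code) of the scaled accumulation above: a
 * task runnable for every full 1024 us period, with each contribution scaled
 * by scale_cap as in the patch.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define LOAD_AVG_MAX		47742

static unsigned long decay(unsigned long sum)
{
	return (sum * 1002) >> 10;	/* ~y = 0.5^(1/32) */
}

int main(void)
{
	unsigned long scale_cap = 256;	/* cpu at 25% of max capacity */
	unsigned long sum = 0;
	int i;

	/* task is runnable for every full period */
	for (i = 0; i < 1000; i++)
		sum = decay(sum) + ((1024 * scale_cap) >> SCHED_CAPACITY_SHIFT);

	/* converges to roughly scale_cap/1024 * LOAD_AVG_MAX, i.e. ~25% here */
	printf("runnable_avg_sum saturates at ~%lu (LOAD_AVG_MAX = %d)\n",
	       sum, LOAD_AVG_MAX);
	return 0;
}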
On Thu, Sep 25, 2014 at 02:48:47PM +0100, Vincent Guittot wrote: > On 22 September 2014 18:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote: > > From: Dietmar Eggemann <dietmar.eggemann@arm.com> > > > > The per-entity load-tracking currently neither accounts for frequency > > changes due to frequency scaling (cpufreq) nor for micro-architectural > > differences between cpus (ARM big.LITTLE). Comparing tracked loads > > between different cpus might therefore be quite misleading. > > > > This patch introduces a scale-invariance scaling factor to the > > load-tracking computation that can be used to compensate for compute > > capacity variations. The scaling factor is to be provided by the > > architecture through an arch specific function. It may be as simple as: > > > > current_freq(cpu) * SCHED_CAPACITY_SCALE / max_freq(cpu) > > > > If the architecture has more sophisticated ways of tracking compute > > capacity, it can do so in its implementation. By default, no scaling is > > applied. > > > > The patch is loosely based on a patch by Chris Redpath > > <Chris.Redpath@arm.com>. > > > > cc: Paul Turner <pjt@google.com> > > cc: Ben Segall <bsegall@google.com> > > > > Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com> > > Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com> > > --- > > kernel/sched/fair.c | 32 ++++++++++++++++++++++++++------ > > 1 file changed, 26 insertions(+), 6 deletions(-) > > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > > index 2a1e6ac..52abb3e 100644 > > --- a/kernel/sched/fair.c > > +++ b/kernel/sched/fair.c > > @@ -2267,6 +2267,8 @@ static u32 __compute_runnable_contrib(u64 n) > > return contrib + runnable_avg_yN_sum[n]; > > } > > > > +unsigned long arch_scale_load_capacity(int cpu); > > Why haven't you used arch_scale_freq_capacity which has a similar > purpose in scaling the CPU capacity except the additional sched_domain > pointer argument ? To be honest I'm not happy with introducing another arch-function either and I'm happy to change that. It wasn't really clear to me which functions that would remain after your cpu_capacity rework patches, so I added this one. Now that we have most of the patches for capacity scaling and scale-invariant load-tracking on the table I think we have a better chance of figuring out which ones are needed and exactly how they are supposed to work. arch_scale_load_capacity() compensates for both frequency scaling and micro-architectural differences, while arch_scale_freq_capacity() only for frequency. As long as we can use arch_scale_cpu_capacity() to provide the micro-architecture scaling we can just do the scaling in two operations rather than one similar to how it is done for capacity in update_cpu_capacity(). I can fix that in the next version. It will cost an extra function call and multiplication though. To make sure that runnable_avg_{sum, period} are still bounded by LOAD_AVG_MAX, arch_scale_{cpu,freq}_capacity() must both return a factor in the range 0..SCHED_CAPACITY_SCALE. > > > + > > /* > > * We can represent the historical contribution to runnable average as the > > * coefficients of a geometric series. To do this we sub-divide our runnable > > @@ -2295,13 +2297,14 @@ static u32 __compute_runnable_contrib(u64 n) > > * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) > > * = u_0 + u_1*y + u_2*y^2 + ... 
[re-labeling u_i --> u_{i+1}] > > */ > > -static __always_inline int __update_entity_runnable_avg(u64 now, > > +static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, > > struct sched_avg *sa, > > int runnable) > > { > > u64 delta, periods; > > u32 runnable_contrib; > > int delta_w, decayed = 0; > > + u32 scale_cap = arch_scale_load_capacity(cpu); > > > > delta = now - sa->last_runnable_update; > > /* > > @@ -2334,8 +2337,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, > > * period and accrue it. > > */ > > delta_w = 1024 - delta_w; > > + > > if (runnable) > > - sa->runnable_avg_sum += delta_w; > > + sa->runnable_avg_sum += (delta_w * scale_cap) > > + >> SCHED_CAPACITY_SHIFT; > > sa->runnable_avg_period += delta_w; > > > > delta -= delta_w; > > @@ -2351,14 +2356,17 @@ static __always_inline int __update_entity_runnable_avg(u64 now, > > > > /* Efficiently calculate \sum (1..n_period) 1024*y^i */ > > runnable_contrib = __compute_runnable_contrib(periods); > > + > > if (runnable) > > - sa->runnable_avg_sum += runnable_contrib; > > + sa->runnable_avg_sum += (runnable_contrib * scale_cap) > > + >> SCHED_CAPACITY_SHIFT; > > sa->runnable_avg_period += runnable_contrib; > > } > > > > /* Remainder of delta accrued against u_0` */ > > if (runnable) > > - sa->runnable_avg_sum += delta; > > + sa->runnable_avg_sum += (delta * scale_cap) > > + >> SCHED_CAPACITY_SHIFT; > > If we take the example of an always running task, its runnable_avg_sum > should stay at the LOAD_AVG_MAX value whatever the frequency of the > CPU on which it runs. But your change links the max value of > runnable_avg_sum with the current frequency of the CPU so an always > running task will have a load contribution of 25% > your proposed scaling is fine with usage_avg_sum which reflects the > effective running time on the CPU but the runnable_avg_sum should be > able to reach LOAD_AVG_MAX whatever the current frequency is I don't think it makes sense to scale one metric and not the other. You will end up with two very different (potentially opposite) views of the cpu load/utilization situation in many scenarios. As I see it, scale-invariance and load-balancing with scale-invariance present can be done in two ways: 1. Leave runnable_avg_sum unscaled and scale running_avg_sum. se->avg.load_avg_contrib will remain unscaled and so will cfs_rq->runnable_load_avg, cfs_rq->blocked_load_avg, and weighted_cpuload(). Essentially all the existing load-balancing code will continue to use unscaled load. When we want to improve cpu utilization and energy-awareness we will have to bypass most of this code as it is likely to lead us on the wrong direction since it has a potentially wrong view of the cpu load due to the lack of scale-invariance. 2. Scale both runnable_avg_sum and running_avg_sum. All existing load metrics including weighted_cpuload() are scaled and thus more accurate. The difference between se->avg.load_avg_contrib and se->avg.usage_avg_contrib is the priority scaling and whether or not runqueue waiting time is counted. se->avg.load_avg_contrib can only reach se->load.weight when running on the fastest cpu at the highest frequency, but it is now scale-invariant so we have much better idea about how much load we are pulling when load-balancing two cpus running at different frequencies. The load-balance code-path still has to be audited to see if anything blows up due to the scaling. I haven't finished doing that yet. This patch set doesn't include patches to address such issues (yet). 
IMHO, by scaling runnable_avg_sum we can more easily make the existing load-balancing code do the right thing. For both options we have to go through the existing load-balancing code to either change it to use the scale-invariant metric (running_avg_sum) when appropriate or to fix bits that don't work properly with a scale-invariant runnable_avg_sum and reuse the existing code. I think the latter is less intrusive, but I might be wrong. Opinions? Morten
On 25 September 2014 19:23, Morten Rasmussen <morten.rasmussen@arm.com> wrote: [snip] >> > /* Remainder of delta accrued against u_0` */ >> > if (runnable) >> > - sa->runnable_avg_sum += delta; >> > + sa->runnable_avg_sum += (delta * scale_cap) >> > + >> SCHED_CAPACITY_SHIFT; >> >> If we take the example of an always running task, its runnable_avg_sum >> should stay at the LOAD_AVG_MAX value whatever the frequency of the >> CPU on which it runs. But your change links the max value of >> runnable_avg_sum with the current frequency of the CPU so an always >> running task will have a load contribution of 25% >> your proposed scaling is fine with usage_avg_sum which reflects the >> effective running time on the CPU but the runnable_avg_sum should be >> able to reach LOAD_AVG_MAX whatever the current frequency is > > I don't think it makes sense to scale one metric and not the other. You > will end up with two very different (potentially opposite) views of the you have missed my point, i fully agree that scaling in-variance is a good enhancement but IIUC your patchset doesn't solve the whole problem. Let me try to explain with examples : - A task with a load of 10% on a CPU at max frequency will keep a load of 10% if the frequency of the CPU is divided by 2 which is fine - But an always running task with a load of 100% on a CPU at max frequency will have a load of 50% if the frequency of the CPU is divided by 2 which is not what we want; the load of such task should stay at 100% - if we have 2 identical always running tasks on CPUs with different frequency, their load will be different So your patchset adds scaling invariance for small tasks but add some scaling variances for heavy tasks Regards, Vincent > cpu load/utilization situation in many scenarios. As I see it, > scale-invariance and load-balancing with scale-invariance present can be > done in two ways: > > 1. Leave runnable_avg_sum unscaled and scale running_avg_sum. > se->avg.load_avg_contrib will remain unscaled and so will > cfs_rq->runnable_load_avg, cfs_rq->blocked_load_avg, and > weighted_cpuload(). Essentially all the existing load-balancing code > will continue to use unscaled load. When we want to improve cpu > utilization and energy-awareness we will have to bypass most of this > code as it is likely to lead us on the wrong direction since it has a > potentially wrong view of the cpu load due to the lack of > scale-invariance. > > 2. Scale both runnable_avg_sum and running_avg_sum. All existing load > metrics including weighted_cpuload() are scaled and thus more accurate. > The difference between se->avg.load_avg_contrib and > se->avg.usage_avg_contrib is the priority scaling and whether or not > runqueue waiting time is counted. se->avg.load_avg_contrib can only > reach se->load.weight when running on the fastest cpu at the highest > frequency, but it is now scale-invariant so we have much better idea > about how much load we are pulling when load-balancing two cpus running > at different frequencies. The load-balance code-path still has to be > audited to see if anything blows up due to the scaling. I haven't > finished doing that yet. This patch set doesn't include patches to > address such issues (yet). IMHO, by scaling runnable_avg_sum we can more > easily make the existing load-balancing code do the right thing. 
On Fri, Sep 26, 2014 at 08:36:53AM +0100, Vincent Guittot wrote: > On 25 September 2014 19:23, Morten Rasmussen <morten.rasmussen@arm.com> wrote: > > [snip] > > >> > /* Remainder of delta accrued against u_0` */ > >> > if (runnable) > >> > - sa->runnable_avg_sum += delta; > >> > + sa->runnable_avg_sum += (delta * scale_cap) > >> > + >> SCHED_CAPACITY_SHIFT; > >> > >> If we take the example of an always running task, its runnable_avg_sum > >> should stay at the LOAD_AVG_MAX value whatever the frequency of the > >> CPU on which it runs. But your change links the max value of > >> runnable_avg_sum with the current frequency of the CPU so an always > >> running task will have a load contribution of 25% > >> your proposed scaling is fine with usage_avg_sum which reflects the > >> effective running time on the CPU but the runnable_avg_sum should be > >> able to reach LOAD_AVG_MAX whatever the current frequency is > > > > I don't think it makes sense to scale one metric and not the other. You > > will end up with two very different (potentially opposite) views of the > > you have missed my point, i fully agree that scaling in-variance is a > good enhancement but IIUC your patchset doesn't solve the whole > problem. > > Let me try to explain with examples : > - A task with a load of 10% on a CPU at max frequency will keep a load > of 10% if the frequency of the CPU is divided by 2 which is fine Yes. > - But an always running task with a load of 100% on a CPU at max > frequency will have a load of 50% if the frequency of the CPU is > divided by 2 which is not what we want; the load of such task should > stay at 100% I think that is fine too and that is intentional. We can't say anything about the load/utilization of an always running no matter what cpu and at what frequency it is running. As soon as the tracked load/utilization indicates always running, we don't know how much load/utilization it will cause on a faster cpu. However, if it is 99.9% we are fine (well, we do probably want some bigger margin). As I see it, always running tasks must be treated specially. We can easily figure out which tasks are always running by comparing the scale load divided by se->load.weight to the current compute capacity on the cpu it is running on. If they are equal (or close), the task is always running. If we migrate it to a different cpu we should take into account that its load might increase if it gets more cycles to spend. You could even do something like: unsigned long migration_load(sched_entity *se) { if (se->avg.load_avg_contrib >= current_capacity(cpu_of(se)) * se->load.weight) return se->load.weight; return se->avg.load_avg_contrib; } for use when moving tasks between cpus when the source cpu is fully loaded at its current capacity. The task load is actually 100% relative to the current compute capacity of the task cpu, but not compared to the fastest cpu in the system. As I said in my previous reply, this isn't covered yet by this patch set. It is of course necessary to go through the load-balancing conditions to see where/if modifications are needed to do the right thing for scale-invariant load. > - if we have 2 identical always running tasks on CPUs with different > frequency, their load will be different Yes, in terms of absolute load and it is only the case for always running tasks. However, they would both have a load equal to the cpu capacity divided by se->avg.load_avg_contrib, so we can easily identify them. 
> So your patchset adds scaling invariance for small tasks but add some > scaling variances for heavy tasks For always running tasks, yes, but I don't see how we can avoid treating them specially anyway as we don't know anything about their true load. That doesn't change by changing how we scale their load. Better suggestions are of course welcome :) Morten
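Spelling out the normalisation Morten describes ("scaled load divided by se->load.weight" compared against the current compute capacity), the migration_load() idea might look roughly like the sketch below. This is illustrative only and not part of the posted series; current_capacity() is the hypothetical helper from the mail, assumed to return the cpu's current frequency-scaled capacity in the range 0..SCHED_CAPACITY_SCALE, and the cpu argument stands in for the cpu the task is queued on.

/*
 * Illustrative sketch, not posted code: return the weight a task could pull
 * on an unthrottled cpu if it looks "always running" on its current one.
 * current_capacity() is a hypothetical helper (see the mail above).
 */
static unsigned long migration_load(struct sched_entity *se, int cpu)
{
	unsigned long cap = current_capacity(cpu);

	/*
	 * With scale-invariant tracking, load_avg_contrib/load.weight can at
	 * most reach cap/SCHED_CAPACITY_SCALE on this cpu. If it is at (or
	 * near) that bound, the task is effectively always running and might
	 * consume up to its full weight on a faster cpu or higher frequency.
	 */
	if (se->avg.load_avg_contrib << SCHED_CAPACITY_SHIFT >=
	    cap * se->load.weight)
		return se->load.weight;

	return se->avg.load_avg_contrib;
}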
On Thu, Sep 25, 2014 at 06:23:43PM +0100, Morten Rasmussen wrote: > > Why haven't you used arch_scale_freq_capacity which has a similar > > purpose in scaling the CPU capacity except the additional sched_domain > > pointer argument ? > > To be honest I'm not happy with introducing another arch-function > either and I'm happy to change that. It wasn't really clear to me which > functions that would remain after your cpu_capacity rework patches, so I > added this one. Now that we have most of the patches for capacity > scaling and scale-invariant load-tracking on the table I think we have a > better chance of figuring out which ones are needed and exactly how they > are supposed to work. > > arch_scale_load_capacity() compensates for both frequency scaling and > micro-architectural differences, while arch_scale_freq_capacity() only > for frequency. As long as we can use arch_scale_cpu_capacity() to > provide the micro-architecture scaling we can just do the scaling in two > operations rather than one similar to how it is done for capacity in > update_cpu_capacity(). I can fix that in the next version. It will cost > an extra function call and multiplication though. > > To make sure that runnable_avg_{sum, period} are still bounded by > LOAD_AVG_MAX, arch_scale_{cpu,freq}_capacity() must both return a factor > in the range 0..SCHED_CAPACITY_SCALE. I would certainly like some words in the Changelog on how and that the math is still free of overflows. Clearly you've thought about it, so please feel free to elucidate the rest of us :-) > > If we take the example of an always running task, its runnable_avg_sum > > should stay at the LOAD_AVG_MAX value whatever the frequency of the > > CPU on which it runs. But your change links the max value of > > runnable_avg_sum with the current frequency of the CPU so an always > > running task will have a load contribution of 25% > > your proposed scaling is fine with usage_avg_sum which reflects the > > effective running time on the CPU but the runnable_avg_sum should be > > able to reach LOAD_AVG_MAX whatever the current frequency is > > I don't think it makes sense to scale one metric and not the other. You > will end up with two very different (potentially opposite) views of the > cpu load/utilization situation in many scenarios. As I see it, > scale-invariance and load-balancing with scale-invariance present can be > done in two ways: > > 1. Leave runnable_avg_sum unscaled and scale running_avg_sum. > se->avg.load_avg_contrib will remain unscaled and so will > cfs_rq->runnable_load_avg, cfs_rq->blocked_load_avg, and > weighted_cpuload(). Essentially all the existing load-balancing code > will continue to use unscaled load. When we want to improve cpu > utilization and energy-awareness we will have to bypass most of this > code as it is likely to lead us on the wrong direction since it has a > potentially wrong view of the cpu load due to the lack of > scale-invariance. > > 2. Scale both runnable_avg_sum and running_avg_sum. All existing load > metrics including weighted_cpuload() are scaled and thus more accurate. > The difference between se->avg.load_avg_contrib and > se->avg.usage_avg_contrib is the priority scaling and whether or not > runqueue waiting time is counted. se->avg.load_avg_contrib can only > reach se->load.weight when running on the fastest cpu at the highest > frequency, but it is now scale-invariant so we have much better idea > about how much load we are pulling when load-balancing two cpus running > at different frequencies. 
The load-balance code-path still has to be > audited to see if anything blows up due to the scaling. I haven't > finished doing that yet. This patch set doesn't include patches to > address such issues (yet). IMHO, by scaling runnable_avg_sum we can more > easily make the existing load-balancing code do the right thing. > > For both options we have to go through the existing load-balancing code > to either change it to use the scale-invariant metric (running_avg_sum) > when appropriate or to fix bits that don't work properly with a > scale-invariant runnable_avg_sum and reuse the existing code. I think > the latter is less intrusive, but I might be wrong. > > Opinions? /me votes #2, I think the example in the reply is a false one, an always running task will/should ramp up the cpufreq and get us at full speed (and yes I'm aware of the case where you're memory bound and raising the cpu freq isn't going to actually improve performance, but I'm not sure we want to get/be that smart, esp. at this stage). -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Hi Morten, Sorry for late jumping in. The problem seems to be self-evident. But for the implementation to be equally attractive it needs to account for every freq change for every task, or anything less than that makes it less attractive. But this should be very hard. Intel Architecture has limitation to capture all the freq changes in software and also the intel_pstate should have no notification. For every task, this makes the updating of the entire queue in load tracking more needed, so once again, ping maintainers for the rewrite patches, :) Thanks, Yuyang On Mon, Sep 22, 2014 at 05:24:01PM +0100, Morten Rasmussen wrote: > From: Dietmar Eggemann <dietmar.eggemann@arm.com> > > The per-entity load-tracking currently neither accounts for frequency > changes due to frequency scaling (cpufreq) nor for micro-architectural > differences between cpus (ARM big.LITTLE). Comparing tracked loads > between different cpus might therefore be quite misleading. > > This patch introduces a scale-invariance scaling factor to the > load-tracking computation that can be used to compensate for compute > capacity variations. The scaling factor is to be provided by the > architecture through an arch specific function. It may be as simple as: > > current_freq(cpu) * SCHED_CAPACITY_SCALE / max_freq(cpu) > > If the architecture has more sophisticated ways of tracking compute > capacity, it can do so in its implementation. By default, no scaling is > applied. > > The patch is loosely based on a patch by Chris Redpath > <Chris.Redpath@arm.com>. > > cc: Paul Turner <pjt@google.com> > cc: Ben Segall <bsegall@google.com> > > Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com> > Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com> -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
On Thu, Oct 02, 2014 at 09:34:28PM +0100, Peter Zijlstra wrote: > On Thu, Sep 25, 2014 at 06:23:43PM +0100, Morten Rasmussen wrote: > > > > Why haven't you used arch_scale_freq_capacity which has a similar > > > purpose in scaling the CPU capacity except the additional sched_domain > > > pointer argument ? > > > > To be honest I'm not happy with introducing another arch-function > > either and I'm happy to change that. It wasn't really clear to me which > > functions that would remain after your cpu_capacity rework patches, so I > > added this one. Now that we have most of the patches for capacity > > scaling and scale-invariant load-tracking on the table I think we have a > > better chance of figuring out which ones are needed and exactly how they > > are supposed to work. > > > > arch_scale_load_capacity() compensates for both frequency scaling and > > micro-architectural differences, while arch_scale_freq_capacity() only > > for frequency. As long as we can use arch_scale_cpu_capacity() to > > provide the micro-architecture scaling we can just do the scaling in two > > operations rather than one similar to how it is done for capacity in > > update_cpu_capacity(). I can fix that in the next version. It will cost > > an extra function call and multiplication though. > > > > To make sure that runnable_avg_{sum, period} are still bounded by > > LOAD_AVG_MAX, arch_scale_{cpu,freq}_capacity() must both return a factor > > in the range 0..SCHED_CAPACITY_SCALE. > > I would certainly like some words in the Changelog on how and that the > math is still free of overflows. Clearly you've thought about it, so > please feel free to elucidate the rest of us :-) Sure. The easiest way to avoid introducing overflows is to ensure that we always scale by a factor >= 1.0. That should be true as long as arch_scale_{cpu,freq}_capacity() never returns anything greater than SCHED_CAPACITY_SCALE (= 1024 = 1.0). If we take big.LITTLE is an example, the max cpu capacity of a big cpu would be 1024 and since we multiply the scaling factors (as in update_cpu_capacity()) the max frequency scaling capacity factor would be 1024. The result is a 1.0 (1.0 * 1.0) scaling factor when a task is running on a big cpu at the highest frequency. At 50% frequency, the scaling factor is 0.5 (1.0 * 0.5). For a little cpu arch_scale_cpu_capacity() would return something less than 1024, 512 for example. The max frequency scaling capacity factor is 1024. A task running on a little cpu at max frequency would have its load scaled by 0.5 (0.5 * 1.0). At 50% frequency, it would be 0.25 (0.5 * 0.5). However, as said earlier (below), we have to go through the load-balance code to ensure that it doesn't blow up when cpu capacities get small (huge.TINY), but the load-tracking code itself should be fine I think. > > > > If we take the example of an always running task, its runnable_avg_sum > > > should stay at the LOAD_AVG_MAX value whatever the frequency of the > > > CPU on which it runs. But your change links the max value of > > > runnable_avg_sum with the current frequency of the CPU so an always > > > running task will have a load contribution of 25% > > > your proposed scaling is fine with usage_avg_sum which reflects the > > > effective running time on the CPU but the runnable_avg_sum should be > > > able to reach LOAD_AVG_MAX whatever the current frequency is > > > > I don't think it makes sense to scale one metric and not the other. 
You > > will end up with two very different (potentially opposite) views of the > > cpu load/utilization situation in many scenarios. As I see it, > > scale-invariance and load-balancing with scale-invariance present can be > > done in two ways: > > > > 1. Leave runnable_avg_sum unscaled and scale running_avg_sum. > > se->avg.load_avg_contrib will remain unscaled and so will > > cfs_rq->runnable_load_avg, cfs_rq->blocked_load_avg, and > > weighted_cpuload(). Essentially all the existing load-balancing code > > will continue to use unscaled load. When we want to improve cpu > > utilization and energy-awareness we will have to bypass most of this > > code as it is likely to lead us on the wrong direction since it has a > > potentially wrong view of the cpu load due to the lack of > > scale-invariance. > > > > 2. Scale both runnable_avg_sum and running_avg_sum. All existing load > > metrics including weighted_cpuload() are scaled and thus more accurate. > > The difference between se->avg.load_avg_contrib and > > se->avg.usage_avg_contrib is the priority scaling and whether or not > > runqueue waiting time is counted. se->avg.load_avg_contrib can only > > reach se->load.weight when running on the fastest cpu at the highest > > frequency, but it is now scale-invariant so we have much better idea > > about how much load we are pulling when load-balancing two cpus running > > at different frequencies. The load-balance code-path still has to be > > audited to see if anything blows up due to the scaling. I haven't > > finished doing that yet. This patch set doesn't include patches to > > address such issues (yet). IMHO, by scaling runnable_avg_sum we can more > > easily make the existing load-balancing code do the right thing. > > > > For both options we have to go through the existing load-balancing code > > to either change it to use the scale-invariant metric (running_avg_sum) > > when appropriate or to fix bits that don't work properly with a > > scale-invariant runnable_avg_sum and reuse the existing code. I think > > the latter is less intrusive, but I might be wrong. > > > > Opinions? > > /me votes #2, I think the example in the reply is a false one, an always > running task will/should ramp up the cpufreq and get us at full speed > (and yes I'm aware of the case where you're memory bound and raising the > cpu freq isn't going to actually improve performance, but I'm not sure > we want to get/be that smart, esp. at this stage). Okay, and agreed that memory bound task smarts are out of scope for the time being. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
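As a rough sketch of the "two operations rather than one" scaling described above, assuming both arch functions return factors in the range 0..SCHED_CAPACITY_SCALE and tolerate a NULL sched_domain pointer (this is not code from the series):

/*
 * Sketch only: combine the uarch factor and the frequency factor, each
 * bounded by SCHED_CAPACITY_SCALE, so the product scaled back down is also
 * bounded by SCHED_CAPACITY_SCALE and the accumulated sums stay below
 * LOAD_AVG_MAX.
 */
static u32 scale_contrib(u32 contrib, int cpu)
{
	unsigned long cpu_cap = arch_scale_cpu_capacity(NULL, cpu);	/* uarch: e.g. 1024 big, 512 little */
	unsigned long freq_cap = arch_scale_freq_capacity(NULL, cpu);	/* freq: e.g. 512 at 50% of fmax */

	contrib = (contrib * cpu_cap) >> SCHED_CAPACITY_SHIFT;
	contrib = (contrib * freq_cap) >> SCHED_CAPACITY_SHIFT;

	/* little cpu at 50% frequency: contrib * 0.5 * 0.5 = 0.25 * contrib */
	return contrib;
}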
On 8 October 2014 13:00, Morten Rasmussen <morten.rasmussen@arm.com> wrote: > On Thu, Oct 02, 2014 at 09:34:28PM +0100, Peter Zijlstra wrote: >> On Thu, Sep 25, 2014 at 06:23:43PM +0100, Morten Rasmussen wrote: >> >> > > Why haven't you used arch_scale_freq_capacity which has a similar >> > > purpose in scaling the CPU capacity except the additional sched_domain >> > > pointer argument ? >> > >> > To be honest I'm not happy with introducing another arch-function >> > either and I'm happy to change that. It wasn't really clear to me which >> > functions that would remain after your cpu_capacity rework patches, so I >> > added this one. Now that we have most of the patches for capacity >> > scaling and scale-invariant load-tracking on the table I think we have a >> > better chance of figuring out which ones are needed and exactly how they >> > are supposed to work. >> > >> > arch_scale_load_capacity() compensates for both frequency scaling and >> > micro-architectural differences, while arch_scale_freq_capacity() only >> > for frequency. As long as we can use arch_scale_cpu_capacity() to >> > provide the micro-architecture scaling we can just do the scaling in two >> > operations rather than one similar to how it is done for capacity in >> > update_cpu_capacity(). I can fix that in the next version. It will cost >> > an extra function call and multiplication though. >> > >> > To make sure that runnable_avg_{sum, period} are still bounded by >> > LOAD_AVG_MAX, arch_scale_{cpu,freq}_capacity() must both return a factor >> > in the range 0..SCHED_CAPACITY_SCALE. >> >> I would certainly like some words in the Changelog on how and that the >> math is still free of overflows. Clearly you've thought about it, so >> please feel free to elucidate the rest of us :-) > > Sure. The easiest way to avoid introducing overflows is to ensure that > we always scale by a factor >= 1.0. That should be true as long as > arch_scale_{cpu,freq}_capacity() never returns anything greater than > SCHED_CAPACITY_SCALE (= 1024 = 1.0). the current ARM arch_scale_cpu is in the range [1536..0] which is free of overflow AFAICT > > If we take big.LITTLE is an example, the max cpu capacity of a big cpu > would be 1024 and since we multiply the scaling factors (as in > update_cpu_capacity()) the max frequency scaling capacity factor would > be 1024. The result is a 1.0 (1.0 * 1.0) scaling factor when a task is > running on a big cpu at the highest frequency. At 50% frequency, the > scaling factor is 0.5 (1.0 * 0.5). > > For a little cpu arch_scale_cpu_capacity() would return something less > than 1024, 512 for example. The max frequency scaling capacity factor is > 1024. A task running on a little cpu at max frequency would have its > load scaled by 0.5 (0.5 * 1.0). At 50% frequency, it would be 0.25 (0.5 > * 0.5). > > However, as said earlier (below), we have to go through the load-balance > code to ensure that it doesn't blow up when cpu capacities get small > (huge.TINY), but the load-tracking code itself should be fine I think. > >> >> > > If we take the example of an always running task, its runnable_avg_sum >> > > should stay at the LOAD_AVG_MAX value whatever the frequency of the >> > > CPU on which it runs. 
But your change links the max value of >> > > runnable_avg_sum with the current frequency of the CPU so an always >> > > running task will have a load contribution of 25% >> > > your proposed scaling is fine with usage_avg_sum which reflects the >> > > effective running time on the CPU but the runnable_avg_sum should be >> > > able to reach LOAD_AVG_MAX whatever the current frequency is >> > >> > I don't think it makes sense to scale one metric and not the other. You >> > will end up with two very different (potentially opposite) views of the >> > cpu load/utilization situation in many scenarios. As I see it, >> > scale-invariance and load-balancing with scale-invariance present can be >> > done in two ways: >> > >> > 1. Leave runnable_avg_sum unscaled and scale running_avg_sum. >> > se->avg.load_avg_contrib will remain unscaled and so will >> > cfs_rq->runnable_load_avg, cfs_rq->blocked_load_avg, and >> > weighted_cpuload(). Essentially all the existing load-balancing code >> > will continue to use unscaled load. When we want to improve cpu >> > utilization and energy-awareness we will have to bypass most of this >> > code as it is likely to lead us on the wrong direction since it has a >> > potentially wrong view of the cpu load due to the lack of >> > scale-invariance. >> > >> > 2. Scale both runnable_avg_sum and running_avg_sum. All existing load >> > metrics including weighted_cpuload() are scaled and thus more accurate. >> > The difference between se->avg.load_avg_contrib and >> > se->avg.usage_avg_contrib is the priority scaling and whether or not >> > runqueue waiting time is counted. se->avg.load_avg_contrib can only >> > reach se->load.weight when running on the fastest cpu at the highest >> > frequency, but it is now scale-invariant so we have much better idea >> > about how much load we are pulling when load-balancing two cpus running >> > at different frequencies. The load-balance code-path still has to be >> > audited to see if anything blows up due to the scaling. I haven't >> > finished doing that yet. This patch set doesn't include patches to >> > address such issues (yet). IMHO, by scaling runnable_avg_sum we can more >> > easily make the existing load-balancing code do the right thing. >> > >> > For both options we have to go through the existing load-balancing code >> > to either change it to use the scale-invariant metric (running_avg_sum) >> > when appropriate or to fix bits that don't work properly with a >> > scale-invariant runnable_avg_sum and reuse the existing code. I think >> > the latter is less intrusive, but I might be wrong. >> > >> > Opinions? >> >> /me votes #2, I think the example in the reply is a false one, an always >> running task will/should ramp up the cpufreq and get us at full speed >> (and yes I'm aware of the case where you're memory bound and raising the >> cpu freq isn't going to actually improve performance, but I'm not sure >> we want to get/be that smart, esp. at this stage). > > Okay, and agreed that memory bound task smarts are out of scope for the > time being. 
On 2 October 2014 22:34, Peter Zijlstra <peterz@infradead.org> wrote: > On Thu, Sep 25, 2014 at 06:23:43PM +0100, Morten Rasmussen wrote: > >> > Why haven't you used arch_scale_freq_capacity which has a similar >> > purpose in scaling the CPU capacity except the additional sched_domain >> > pointer argument ? >> >> To be honest I'm not happy with introducing another arch-function >> either and I'm happy to change that. It wasn't really clear to me which >> functions that would remain after your cpu_capacity rework patches, so I >> added this one. Now that we have most of the patches for capacity >> scaling and scale-invariant load-tracking on the table I think we have a >> better chance of figuring out which ones are needed and exactly how they >> are supposed to work. >> >> arch_scale_load_capacity() compensates for both frequency scaling and >> micro-architectural differences, while arch_scale_freq_capacity() only >> for frequency. As long as we can use arch_scale_cpu_capacity() to >> provide the micro-architecture scaling we can just do the scaling in two >> operations rather than one similar to how it is done for capacity in >> update_cpu_capacity(). I can fix that in the next version. It will cost >> an extra function call and multiplication though. >> >> To make sure that runnable_avg_{sum, period} are still bounded by >> LOAD_AVG_MAX, arch_scale_{cpu,freq}_capacity() must both return a factor >> in the range 0..SCHED_CAPACITY_SCALE. > > I would certainly like some words in the Changelog on how and that the > math is still free of overflows. Clearly you've thought about it, so > please feel free to elucidate the rest of us :-) > >> > If we take the example of an always running task, its runnable_avg_sum >> > should stay at the LOAD_AVG_MAX value whatever the frequency of the >> > CPU on which it runs. But your change links the max value of >> > runnable_avg_sum with the current frequency of the CPU so an always >> > running task will have a load contribution of 25% >> > your proposed scaling is fine with usage_avg_sum which reflects the >> > effective running time on the CPU but the runnable_avg_sum should be >> > able to reach LOAD_AVG_MAX whatever the current frequency is >> >> I don't think it makes sense to scale one metric and not the other. You >> will end up with two very different (potentially opposite) views of the >> cpu load/utilization situation in many scenarios. As I see it, >> scale-invariance and load-balancing with scale-invariance present can be >> done in two ways: >> >> 1. Leave runnable_avg_sum unscaled and scale running_avg_sum. >> se->avg.load_avg_contrib will remain unscaled and so will >> cfs_rq->runnable_load_avg, cfs_rq->blocked_load_avg, and >> weighted_cpuload(). Essentially all the existing load-balancing code >> will continue to use unscaled load. When we want to improve cpu >> utilization and energy-awareness we will have to bypass most of this >> code as it is likely to lead us on the wrong direction since it has a >> potentially wrong view of the cpu load due to the lack of >> scale-invariance. >> >> 2. Scale both runnable_avg_sum and running_avg_sum. All existing load >> metrics including weighted_cpuload() are scaled and thus more accurate. >> The difference between se->avg.load_avg_contrib and >> se->avg.usage_avg_contrib is the priority scaling and whether or not >> runqueue waiting time is counted. 
se->avg.load_avg_contrib can only >> reach se->load.weight when running on the fastest cpu at the highest >> frequency, but it is now scale-invariant so we have much better idea >> about how much load we are pulling when load-balancing two cpus running >> at different frequencies. The load-balance code-path still has to be >> audited to see if anything blows up due to the scaling. I haven't >> finished doing that yet. This patch set doesn't include patches to >> address such issues (yet). IMHO, by scaling runnable_avg_sum we can more >> easily make the existing load-balancing code do the right thing. >> >> For both options we have to go through the existing load-balancing code >> to either change it to use the scale-invariant metric (running_avg_sum) >> when appropriate or to fix bits that don't work properly with a >> scale-invariant runnable_avg_sum and reuse the existing code. I think >> the latter is less intrusive, but I might be wrong. >> >> Opinions? > > /me votes #2, I think the example in the reply is a false one, an always > running task will/should ramp up the cpufreq and get us at full speed I have in mind some system where the max achievable freq of a core depends of how many cores are running simultaneously because of some HW constraint like max current. In this case, the CPU might not reach max frequency even with an always running task. Then, beside frequency scaling, their is the uarch invariance that is introduced by patch 4 that will generate similar behavior of the load. > (and yes I'm aware of the case where you're memory bound and raising the > cpu freq isn't going to actually improve performance, but I'm not sure > we want to get/be that smart, esp. at this stage). -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Hi Yuyang, On 08/10/14 01:50, Yuyang Du wrote: > Hi Morten, > > Sorry for late jumping in. > > The problem seems to be self-evident. But for the implementation to be > equally attractive it needs to account for every freq change for every task, > or anything less than that makes it less attractive. > > But this should be very hard. Intel Architecture has limitation to capture all > the freq changes in software and also the intel_pstate should have no > notification. We encountered this missing notification for current frequency with Intel systems (e.g. i5-3320M) using the intel_pstate driver while testing this patch-set. The arch_scale_set_curr_freq call in __cpufreq_notify_transition [[PATCH 2/7] cpufreq: Architecture specific callback for frequency changes] will not work on such a system. In our internal testing, we placed arch_scale_set_curr_freq(cpu->cpu, sample->freq) into intel_pstate_timer_func [intel_pstate.c] to get the current frequency for a cpu. The arch_scale_set_max_freq call in cpufreq_set_policy [drivers/cpufreq/cpufreq.c] still works although the driver exposes the max turbo pstate and not the max pstate. That's an additional problem because we don't want to use turbo states for frequency scaling. > > For every task, this makes the updating of the entire queue in load tracking > more needed, so once again, ping maintainers for the rewrite patches, :) > > Thanks, > Yuyang > [...] -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
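For illustration, the workaround Dietmar describes might look roughly like the sketch below. It is not a posted patch: the intel_pstate internals (struct cpudata, struct sample, the helper calls) are assumed from the mail and from that era of the driver rather than verified against a specific kernel version, and arch_scale_set_curr_freq() is the callback introduced in patch 2/7 of this series.

/*
 * Rough sketch of hooking the per-cpu frequency update into intel_pstate's
 * sampling timer so the scheduler sees the frequency the driver has just
 * measured. Names and structure are assumptions, not verified code.
 */
static void intel_pstate_timer_func(unsigned long __data)
{
	struct cpudata *cpu = (struct cpudata *) __data;
	struct sample *sample;

	intel_pstate_sample(cpu);
	sample = &cpu->sample;

	/* report the freshly sampled frequency of this cpu to the scheduler */
	arch_scale_set_curr_freq(cpu->cpu, sample->freq);

	intel_pstate_adjust_busy_pstate(cpu);
	intel_pstate_set_sample_time(cpu);
}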
On Wed, Oct 08, 2014 at 12:21:45PM +0100, Vincent Guittot wrote: > On 8 October 2014 13:00, Morten Rasmussen <morten.rasmussen@arm.com> wrote: > > On Thu, Oct 02, 2014 at 09:34:28PM +0100, Peter Zijlstra wrote: > >> On Thu, Sep 25, 2014 at 06:23:43PM +0100, Morten Rasmussen wrote: > >> > >> > > Why haven't you used arch_scale_freq_capacity which has a similar > >> > > purpose in scaling the CPU capacity except the additional sched_domain > >> > > pointer argument ? > >> > > >> > To be honest I'm not happy with introducing another arch-function > >> > either and I'm happy to change that. It wasn't really clear to me which > >> > functions that would remain after your cpu_capacity rework patches, so I > >> > added this one. Now that we have most of the patches for capacity > >> > scaling and scale-invariant load-tracking on the table I think we have a > >> > better chance of figuring out which ones are needed and exactly how they > >> > are supposed to work. > >> > > >> > arch_scale_load_capacity() compensates for both frequency scaling and > >> > micro-architectural differences, while arch_scale_freq_capacity() only > >> > for frequency. As long as we can use arch_scale_cpu_capacity() to > >> > provide the micro-architecture scaling we can just do the scaling in two > >> > operations rather than one similar to how it is done for capacity in > >> > update_cpu_capacity(). I can fix that in the next version. It will cost > >> > an extra function call and multiplication though. > >> > > >> > To make sure that runnable_avg_{sum, period} are still bounded by > >> > LOAD_AVG_MAX, arch_scale_{cpu,freq}_capacity() must both return a factor > >> > in the range 0..SCHED_CAPACITY_SCALE. > >> > >> I would certainly like some words in the Changelog on how and that the > >> math is still free of overflows. Clearly you've thought about it, so > >> please feel free to elucidate the rest of us :-) > > > > Sure. The easiest way to avoid introducing overflows is to ensure that > > we always scale by a factor >= 1.0. That should be true as long as > > arch_scale_{cpu,freq}_capacity() never returns anything greater than > > SCHED_CAPACITY_SCALE (= 1024 = 1.0). > > the current ARM arch_scale_cpu is in the range [1536..0] which is free > of overflow AFAICT If I'm not mistaken, that will cause an overflow in __update_task_entity_contrib(): static inline void __update_task_entity_contrib(struct sched_entity *se) { u32 contrib; /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); contrib /= (se->avg.avg_period + 1); se->avg.load_avg_contrib = scale_load(contrib); } With arch_scale_cpu_capacity() > 1024 se->avg.runnable_avg_sum is no longer bounded by LOAD_AVG_MAX = 47742. scale_load_down(se->load.weight) == se->load.weight =< 88761. 47742 * 88761 = 4237627662 (2^32 = 4294967296) To avoid overflow se->avg.runnable_avg_sum must be less than 2^32/88761 = 48388, which means that arch_scale_cpu_capacity() must be in the range 0..48388*1024/47742 = 0..1037. I also think it is easier to have a fixed defined max scaling factor, but that might just be me. Regarding the ARM arch_scale_cpu_capacity() implementation, I think that can be changed to fit the 0..1024 range easily. Currently, it will only report a non-default (1024) capacity for big.LITTLE systems and actually enabling it (requires a certain property to be set in device tree) leads to broken load-balancing decisions. We have discussed that several times in the past. 
I wouldn't recommend enabling it until the load-balance code can deal with big.LITTLE compute capacities correctly. This is also why it isn't implemented by ARM64.
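The overflow bound above can be reproduced with a few lines of standalone C (userspace, just the arithmetic; 88761 is the nice -20 entry of prio_to_weight[]):

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX	47742
#define MAX_WEIGHT	88761	/* prio_to_weight[0], nice -20 */

int main(void)
{
	/* largest runnable_avg_sum whose product with MAX_WEIGHT fits in 32 bits */
	uint64_t max_sum = (UINT64_C(1) << 32) / MAX_WEIGHT;		/* 48388 */
	/* largest cpu scaling factor that keeps runnable_avg_sum below that */
	uint64_t max_scale = max_sum * 1024 / LOAD_AVG_MAX;		/* 1037 */

	printf("47742 * 88761 = %llu (2^32 = %llu)\n",
	       (unsigned long long)LOAD_AVG_MAX * MAX_WEIGHT,
	       (unsigned long long)(UINT64_C(1) << 32));
	printf("max runnable_avg_sum without overflow: %llu\n",
	       (unsigned long long)max_sum);
	printf("max arch_scale_cpu_capacity(): %llu\n",
	       (unsigned long long)max_scale);
	return 0;
}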
On Wed, Oct 08, 2014 at 12:38:40PM +0100, Vincent Guittot wrote: > On 2 October 2014 22:34, Peter Zijlstra <peterz@infradead.org> wrote: > > On Thu, Sep 25, 2014 at 06:23:43PM +0100, Morten Rasmussen wrote: > > > >> > Why haven't you used arch_scale_freq_capacity which has a similar > >> > purpose in scaling the CPU capacity except the additional sched_domain > >> > pointer argument ? > >> > >> To be honest I'm not happy with introducing another arch-function > >> either and I'm happy to change that. It wasn't really clear to me which > >> functions that would remain after your cpu_capacity rework patches, so I > >> added this one. Now that we have most of the patches for capacity > >> scaling and scale-invariant load-tracking on the table I think we have a > >> better chance of figuring out which ones are needed and exactly how they > >> are supposed to work. > >> > >> arch_scale_load_capacity() compensates for both frequency scaling and > >> micro-architectural differences, while arch_scale_freq_capacity() only > >> for frequency. As long as we can use arch_scale_cpu_capacity() to > >> provide the micro-architecture scaling we can just do the scaling in two > >> operations rather than one similar to how it is done for capacity in > >> update_cpu_capacity(). I can fix that in the next version. It will cost > >> an extra function call and multiplication though. > >> > >> To make sure that runnable_avg_{sum, period} are still bounded by > >> LOAD_AVG_MAX, arch_scale_{cpu,freq}_capacity() must both return a factor > >> in the range 0..SCHED_CAPACITY_SCALE. > > > > I would certainly like some words in the Changelog on how and that the > > math is still free of overflows. Clearly you've thought about it, so > > please feel free to elucidate the rest of us :-) > > > >> > If we take the example of an always running task, its runnable_avg_sum > >> > should stay at the LOAD_AVG_MAX value whatever the frequency of the > >> > CPU on which it runs. But your change links the max value of > >> > runnable_avg_sum with the current frequency of the CPU so an always > >> > running task will have a load contribution of 25% > >> > your proposed scaling is fine with usage_avg_sum which reflects the > >> > effective running time on the CPU but the runnable_avg_sum should be > >> > able to reach LOAD_AVG_MAX whatever the current frequency is > >> > >> I don't think it makes sense to scale one metric and not the other. You > >> will end up with two very different (potentially opposite) views of the > >> cpu load/utilization situation in many scenarios. As I see it, > >> scale-invariance and load-balancing with scale-invariance present can be > >> done in two ways: > >> > >> 1. Leave runnable_avg_sum unscaled and scale running_avg_sum. > >> se->avg.load_avg_contrib will remain unscaled and so will > >> cfs_rq->runnable_load_avg, cfs_rq->blocked_load_avg, and > >> weighted_cpuload(). Essentially all the existing load-balancing code > >> will continue to use unscaled load. When we want to improve cpu > >> utilization and energy-awareness we will have to bypass most of this > >> code as it is likely to lead us on the wrong direction since it has a > >> potentially wrong view of the cpu load due to the lack of > >> scale-invariance. > >> > >> 2. Scale both runnable_avg_sum and running_avg_sum. All existing load > >> metrics including weighted_cpuload() are scaled and thus more accurate. 
> >> The difference between se->avg.load_avg_contrib and > >> se->avg.usage_avg_contrib is the priority scaling and whether or not > >> runqueue waiting time is counted. se->avg.load_avg_contrib can only > >> reach se->load.weight when running on the fastest cpu at the highest > >> frequency, but it is now scale-invariant so we have much better idea > >> about how much load we are pulling when load-balancing two cpus running > >> at different frequencies. The load-balance code-path still has to be > >> audited to see if anything blows up due to the scaling. I haven't > >> finished doing that yet. This patch set doesn't include patches to > >> address such issues (yet). IMHO, by scaling runnable_avg_sum we can more > >> easily make the existing load-balancing code do the right thing. > >> > >> For both options we have to go through the existing load-balancing code > >> to either change it to use the scale-invariant metric (running_avg_sum) > >> when appropriate or to fix bits that don't work properly with a > >> scale-invariant runnable_avg_sum and reuse the existing code. I think > >> the latter is less intrusive, but I might be wrong. > >> > >> Opinions? > > > > /me votes #2, I think the example in the reply is a false one, an always > > running task will/should ramp up the cpufreq and get us at full speed > > I have in mind some system where the max achievable freq of a core > depends of how many cores are running simultaneously because of some > HW constraint like max current. In this case, the CPU might not reach > max frequency even with an always running task. If we compare scale-invariant task load to the current frequency scaled compute capacity of the cpu when making load-balancing decisions as I described in my other reply that shouldn't be a problem. > Then, beside frequency scaling, their is the uarch invariance that is > introduced by patch 4 that will generate similar behavior of the load. I don't quite follow. When we make task load frequency and uarch invariant, we must scale compute capacity accordingly. So compute capacity is bigger for big cores and smaller for little cores. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
On 8 October 2014 15:53, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
> On Wed, Oct 08, 2014 at 12:21:45PM +0100, Vincent Guittot wrote:
>> On 8 October 2014 13:00, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
>> >
>> > Sure. The easiest way to avoid introducing overflows is to ensure that we always scale by a factor <= 1.0. That should be true as long as arch_scale_{cpu,freq}_capacity() never returns anything greater than SCHED_CAPACITY_SCALE (= 1024 = 1.0).
>>
>> the current ARM arch_scale_cpu is in the range [0..1536], which is free of overflow AFAICT
>
> If I'm not mistaken, that will cause an overflow in __update_task_entity_contrib():
>
> static inline void __update_task_entity_contrib(struct sched_entity *se)
> {
>         u32 contrib;
>         /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
>         contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
>         contrib /= (se->avg.avg_period + 1);
>         se->avg.load_avg_contrib = scale_load(contrib);
> }
>
> With arch_scale_cpu_capacity() > 1024, se->avg.runnable_avg_sum is no longer bounded by LOAD_AVG_MAX = 47742. scale_load_down(se->load.weight) == se->load.weight <= 88761.
>
> 47742 * 88761 = 4237627662 (2^32 = 4294967296)
>
> To avoid overflow, se->avg.runnable_avg_sum must be less than 2^32/88761 = 48388, which means that arch_scale_cpu_capacity() must be in the range 0..48388*1024/47742 = 0..1037.
>
> I also think it is easier to have a fixed defined max scaling factor, but that might just be me.

OK, the overflow comes with adding uarch invariance into the runnable load average.

>
> Regarding the ARM arch_scale_cpu_capacity() implementation, I think that can be changed to fit the 0..1024 range easily. Currently, it will only report a non-default (1024) capacity for big.LITTLE systems, and actually enabling it (it requires a certain property to be set in the device tree) leads to broken load-balancing decisions. We have discussed that several times

Only the one-task-per-CPU case is broken, but on the other hand it better handles the overload case where we have more tasks than CPUs, and other mid-range use cases, by putting more tasks on the big cluster.

> in the past. I wouldn't recommend enabling it until the load-balance code can deal with big.LITTLE compute capacities correctly. This is also why it isn't implemented by ARM64.
On Wed, Oct 08, 2014 at 03:08:04PM +0100, Vincent Guittot wrote:
> On 8 October 2014 15:53, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
> > On Wed, Oct 08, 2014 at 12:21:45PM +0100, Vincent Guittot wrote:
> >> On 8 October 2014 13:00, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
> >> >
> >> > Sure. The easiest way to avoid introducing overflows is to ensure that we always scale by a factor <= 1.0. That should be true as long as arch_scale_{cpu,freq}_capacity() never returns anything greater than SCHED_CAPACITY_SCALE (= 1024 = 1.0).
> >>
> >> the current ARM arch_scale_cpu is in the range [0..1536], which is free of overflow AFAICT
> >
> > If I'm not mistaken, that will cause an overflow in __update_task_entity_contrib():
> >
> > static inline void __update_task_entity_contrib(struct sched_entity *se)
> > {
> >         u32 contrib;
> >         /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
> >         contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
> >         contrib /= (se->avg.avg_period + 1);
> >         se->avg.load_avg_contrib = scale_load(contrib);
> > }
> >
> > With arch_scale_cpu_capacity() > 1024, se->avg.runnable_avg_sum is no longer bounded by LOAD_AVG_MAX = 47742. scale_load_down(se->load.weight) == se->load.weight <= 88761.
> >
> > 47742 * 88761 = 4237627662 (2^32 = 4294967296)
> >
> > To avoid overflow, se->avg.runnable_avg_sum must be less than 2^32/88761 = 48388, which means that arch_scale_cpu_capacity() must be in the range 0..48388*1024/47742 = 0..1037.
> >
> > I also think it is easier to have a fixed defined max scaling factor, but that might just be me.
>
> OK, the overflow comes with adding uarch invariance into the runnable load average.
>
> >
> > Regarding the ARM arch_scale_cpu_capacity() implementation, I think that can be changed to fit the 0..1024 range easily. Currently, it will only report a non-default (1024) capacity for big.LITTLE systems, and actually enabling it (it requires a certain property to be set in the device tree) leads to broken load-balancing decisions. We have discussed that several times
>
> Only the one-task-per-CPU case is broken, but on the other hand it better handles the overload case where we have more tasks than CPUs, and other mid-range use cases, by putting more tasks on the big cluster.

Yes, agreed. My point was just to say that it shouldn't cause a lot of harm to change the range of arch_scale_cpu_capacity() for ARM. We need to fix things anyway.
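[Editor's note: the bounds quoted in the exchange above can be reproduced with a short stand-alone program. This is illustration only; the constants are the ones from the mails, and 1536 is the upper end of the ARM arch_scale_cpu range mentioned above.]

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t load_avg_max = 47742;	/* LOAD_AVG_MAX */
	const uint64_t max_weight = 88761;	/* scale_load_down() weight of a nice -20 task */

	/* with the sum bounded by LOAD_AVG_MAX, the u32 product just fits */
	printf("47742 * 88761 = %llu (2^32 = %llu)\n",
	       (unsigned long long)(load_avg_max * max_weight),
	       (unsigned long long)(1ULL << 32));

	/* largest runnable_avg_sum that keeps the product within a u32 */
	uint64_t sum_limit = UINT32_MAX / max_weight;
	printf("runnable_avg_sum limit: %llu -> capacity factor limit: %llu\n",
	       (unsigned long long)sum_limit,
	       (unsigned long long)(sum_limit * 1024 / load_avg_max));

	/* a capacity factor of 1536 lets the sum grow past that limit */
	uint64_t sum_1536 = load_avg_max * 1536 / 1024;
	printf("capacity 1536: sum up to %llu, product %llu -> %s\n",
	       (unsigned long long)sum_1536,
	       (unsigned long long)(sum_1536 * max_weight),
	       sum_1536 * max_weight > UINT32_MAX ? "overflows u32" : "fits");
	return 0;
}

[It prints the 48388 and 1037 limits quoted above, and shows that a 1536 capacity factor pushes the intermediate product in __update_task_entity_contrib() past 2^32.]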
On Wed, Oct 08, 2014 at 01:38:40PM +0200, Vincent Guittot wrote:
> I have in mind some systems where the max achievable freq of a core depends on how many cores are running simultaneously because of some HW constraint like max current. In this case, the CPU might not reach max frequency even with an always running task.
> Then, besides frequency scaling, there is the uarch invariance introduced by patch 4 that will generate similar behaviour of the load.

This is a 'common' issue; x86 and powerpc also suffer from it. It can be the result of either thermal or power thresholds.
On Wed, Oct 08, 2014 at 08:50:07AM +0800, Yuyang Du wrote:
> Hi Morten,
>
> Sorry for jumping in late.
>
> The problem seems to be self-evident. But for the implementation to be equally attractive, it needs to account for every freq change for every task; anything less than that makes it less attractive.

I'm not entirely sure that is indeed required.

> But this should be very hard. The Intel architecture has limitations on capturing all the freq changes in software, and intel_pstate also provides no notification.

So the current Intel architecture takes P-states as hints and can then mostly lower the actual frequency, right? In that case, still accounting at the higher frequency is not a problem; the hardware conserves more power than we know, but that's the right kind of error to have :-)

For fully automatic hardware, that's basically hardware without DVFS capability, so we should not bother at all and simply disable this scaling.

> For every task, this makes updating the entire queue in load tracking even more needed, so once again, ping maintainers for the rewrite patches :)

Could you remind me what your latest version is? Please reply to those patches with a new email so that I can more conveniently locate it.
On Wed, Oct 08, 2014 at 01:54:58PM +0100, Dietmar Eggemann wrote:
> > But this should be very hard. The Intel architecture has limitations on capturing all the freq changes in software, and intel_pstate also provides no notification.
>
> We encountered this missing notification of the current frequency on Intel systems (e.g. i5-3320M) using the intel_pstate driver while testing this patch-set. The arch_scale_set_curr_freq call in __cpufreq_notify_transition [[PATCH 2/7] cpufreq: Architecture specific callback for frequency changes] will not work on such a system.
>
> In our internal testing, we placed arch_scale_set_curr_freq(cpu->cpu, sample->freq) into intel_pstate_timer_func [intel_pstate.c] to get the current frequency for a cpu.
>
> The arch_scale_set_max_freq call in cpufreq_set_policy [drivers/cpufreq/cpufreq.c] still works, although the driver exposes the max turbo pstate and not the max pstate. That's an additional problem, because we don't want to use turbo states for frequency scaling.

Right, so when we pull the policy part into the scheduler, intel_pstate will revert to being just another driver without such logic, and things should just work. But yes, currently it also implements policy; that needs to go away.
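[Editor's note: for illustration, the internal test hack Dietmar describes might look roughly like the sketch below. Only the arch_scale_set_curr_freq(cpu->cpu, sample->freq) call is taken from the mail; the wrapping helper, its name, and the assumed struct cpudata layout are hypothetical.]

/*
 * Hypothetical sketch, not from the posted patches: feed the frequency just
 * sampled by intel_pstate to the scheduler's scale-invariance code, since
 * __cpufreq_notify_transition() is never invoked for this driver.
 */
static void intel_pstate_report_curr_freq(struct cpudata *cpu)
{
	struct sample *sample = &cpu->sample;	/* assumed field layout */

	arch_scale_set_curr_freq(cpu->cpu, sample->freq);
}

[Such a helper would have to run from intel_pstate_timer_func() after each sample, which is where Dietmar says the call was placed in their internal testing.]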
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2a1e6ac..52abb3e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2267,6 +2267,8 @@ static u32 __compute_runnable_contrib(u64 n)
 	return contrib + runnable_avg_yN_sum[n];
 }
 
+unsigned long arch_scale_load_capacity(int cpu);
+
 /*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series.  To do this we sub-divide our runnable
@@ -2295,13 +2297,14 @@ static u32 __compute_runnable_contrib(u64 n)
  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
-static __always_inline int __update_entity_runnable_avg(u64 now,
+static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
 							struct sched_avg *sa,
 							int runnable)
 {
 	u64 delta, periods;
 	u32 runnable_contrib;
 	int delta_w, decayed = 0;
+	u32 scale_cap = arch_scale_load_capacity(cpu);
 
 	delta = now - sa->last_runnable_update;
 	/*
@@ -2334,8 +2337,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 		 * period and accrue it.
 		 */
 		delta_w = 1024 - delta_w;
+
 		if (runnable)
-			sa->runnable_avg_sum += delta_w;
+			sa->runnable_avg_sum += (delta_w * scale_cap)
+						>> SCHED_CAPACITY_SHIFT;
 		sa->runnable_avg_period += delta_w;
 
 		delta -= delta_w;
@@ -2351,14 +2356,17 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 
 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
 		runnable_contrib = __compute_runnable_contrib(periods);
+
 		if (runnable)
-			sa->runnable_avg_sum += runnable_contrib;
+			sa->runnable_avg_sum += (runnable_contrib * scale_cap)
+						>> SCHED_CAPACITY_SHIFT;
 		sa->runnable_avg_period += runnable_contrib;
 	}
 
 	/* Remainder of delta accrued against u_0` */
 	if (runnable)
-		sa->runnable_avg_sum += delta;
+		sa->runnable_avg_sum += (delta * scale_cap)
+					>> SCHED_CAPACITY_SHIFT;
 	sa->runnable_avg_period += delta;
 
 	return decayed;
@@ -2464,7 +2472,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
-	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
+	__update_entity_runnable_avg(rq_clock_task(rq), rq->cpu, &rq->avg,
+				     runnable);
 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
@@ -2518,6 +2527,7 @@ static inline void update_entity_load_avg(struct sched_entity *se,
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	long contrib_delta;
+	int cpu = rq_of(cfs_rq)->cpu;
 	u64 now;
 
 	/*
@@ -2529,7 +2539,7 @@ static inline void update_entity_load_avg(struct sched_entity *se,
 	else
 		now = cfs_rq_clock_task(group_cfs_rq(se));
 
-	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+	if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq))
 		return;
 
 	contrib_delta = __update_entity_load_avg_contrib(se);
@@ -5719,6 +5729,16 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 	return default_scale_cpu_capacity(sd, cpu);
 }
 
+static unsigned long default_scale_load_capacity(int cpu)
+{
+	return SCHED_CAPACITY_SCALE;
+}
+
+unsigned long __weak arch_scale_load_capacity(int cpu)
+{
+	return default_scale_load_capacity(cpu);
+}
+
 static unsigned long scale_rt_capacity(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
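[Editor's note: as a usage illustration, an architecture wanting something other than the default could override the weak hook added above. The sketch below is purely illustrative and not part of the posted patches: cpu_curr_freq() and cpu_max_freq() are hypothetical placeholders for whatever frequency bookkeeping the architecture already has, and the result is clamped to SCHED_CAPACITY_SCALE so that runnable_avg_sum stays bounded by LOAD_AVG_MAX, per the overflow discussion in the thread.]

/*
 * Hypothetical architecture-side override of arch_scale_load_capacity().
 * Returns a factor in the range 0..SCHED_CAPACITY_SCALE derived from the
 * cpu's current and maximum frequency (helper names are placeholders).
 */
unsigned long arch_scale_load_capacity(int cpu)
{
	unsigned long curr = cpu_curr_freq(cpu);	/* hypothetical helper */
	unsigned long max = cpu_max_freq(cpu);		/* hypothetical helper */
	unsigned long cap;

	if (!max)
		return SCHED_CAPACITY_SCALE;

	cap = (curr * SCHED_CAPACITY_SCALE) / max;

	/* never scale up, so the tracked sums cannot overflow */
	if (cap > SCHED_CAPACITY_SCALE)
		cap = SCHED_CAPACITY_SCALE;

	return cap;
}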