Message ID | 1466615004-3503-3-git-send-email-morten.rasmussen@arm.com |
---|---|
State | Superseded |
On Wed, Jun 22, 2016 at 02:04:11PM -0400, Rik van Riel wrote:
> On Wed, 2016-06-22 at 18:03 +0100, Morten Rasmussen wrote:
> > In commit ac66f5477239 ("sched/numa: Introduce migrate_swap()")
> > select_task_rq() got a 'cpu' argument to enable overriding of prev_cpu
> > in special cases (NUMA task swapping). However, the
> > select_task_rq_fair() helper functions: wake_affine() and
> > select_idle_sibling(), still use task_cpu(p) directly to work out
> > prev_cpu which leads to inconsistencies.
> >
> > This patch passes prev_cpu (potentially overridden by NUMA code) into
> > the helper functions to ensure prev_cpu is indeed the same cpu
> > everywhere in the wakeup path.
> >
> > cc: Ingo Molnar <mingo@redhat.com>
> > cc: Peter Zijlstra <peterz@infradead.org>
> > cc: Rik van Riel <riel@redhat.com>
> >
> > Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
> > ---
> >  kernel/sched/fair.c | 24 +++++++++++++-----------
> >  1 file changed, 13 insertions(+), 11 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index c6dd8bab010c..eec8e29104f9 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -656,7 +656,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >  }
> >
> >  #ifdef CONFIG_SMP
> > -static int select_idle_sibling(struct task_struct *p, int cpu);
> > +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
> >  static unsigned long task_h_load(struct task_struct *p);
> >
> >  /*
> > @@ -1483,7 +1483,8 @@ static void task_numa_compare(struct task_numa_env *env,
> >  	 * Call select_idle_sibling to maybe find a better one.
> >  	 */
> >  	if (!cur)
> > -		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
> > +		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
> > +						   env->dst_cpu);
>
> It is worth remembering that "prev" will only
> ever be returned by select_idle_sibling() if
> it is part of the same NUMA node as target.
>
> That means this patch does not change behaviour
> of the NUMA balancing code, since that always
> migrates between nodes.
>
> Now let's look at try_to_wake_up(). It will pass
> p->wake_cpu as the argument for "prev_cpu", which
> again appears to be the same CPU number as that used
> by the current code.

IIUC, p->wake_cpu != task_cpu(p) if task_numa_migrate() decided to call
migrate_swap() on the task while it was sleeping, intending it to swap
places with a task on a different NUMA node when it wakes up.

Using p->wake_cpu in select_idle_sibling() as "prev_cpu" when called
through try_to_wake_up()->select_task_rq() should only make a
difference if the target cpu happens to share cache with it and it is
idle:

	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
		return prev;

The selection of the target cpu for select_idle_sibling() is also
slightly affected, as wake_affine() currently compares task_cpu(p) and
smp_processor_id(), and then picks p->wake_cpu or smp_processor_id()
depending on the outcome. With this patch wake_affine() uses
p->wake_cpu instead of task_cpu(p), so we actually compare the
candidates we choose between.

I think that would lead to some minor changes in behaviour in a few
corner cases, but I mainly wrote the patch as I thought it was very
confusing that we could have different "prev_cpu"s in different parts
of the select_task_rq_fair() code path.

> I have no objection to your patch, but must be
> overlooking something, since I cannot find a change
> in behaviour that your patch would create.
Thanks for confirming that it shouldn't change anything for NUMA load balancing. That is what I hope for :-)
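
Just to spell out what I mean by the wake_cpu/task_cpu(p) divergence, here
is a condensed sketch of the two places involved. This is not verbatim
kernel code, only the bits relevant here as I read the migrate_swap() and
try_to_wake_up() paths:

	/*
	 * migrate_swap() on a task that is currently asleep: only wake_cpu
	 * is updated, so task_cpu(p) keeps pointing at the cpu the task
	 * last ran on until it actually wakes up.
	 */
	static void __migrate_swap_task(struct task_struct *p, int cpu)
	{
		if (task_on_rq_queued(p)) {
			/* runnable task: really moved to the destination rq */
		} else {
			p->wake_cpu = cpu;
		}
	}

	/*
	 * try_to_wake_up(), condensed: wake_cpu (not task_cpu(p)) is what
	 * select_task_rq() and hence select_task_rq_fair() get to see.
	 */
	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);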
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c6dd8bab010c..eec8e29104f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -656,7 +656,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
 
 /*
@@ -1483,7 +1483,8 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * Call select_idle_sibling to maybe find a better one.
 	 */
 	if (!cur)
-		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+						   env->dst_cpu);
 
 assign:
 	task_numa_assign(env, cur, imp);
@@ -4985,18 +4986,18 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+		       int prev_cpu, int sync)
 {
 	s64 this_load, load;
 	s64 this_eff_load, prev_eff_load;
-	int idx, this_cpu, prev_cpu;
+	int idx, this_cpu;
 	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
 
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
-	prev_cpu  = task_cpu(p);
 	load	  = source_load(prev_cpu, idx);
 	this_load = target_load(this_cpu, idx);
 
@@ -5161,11 +5162,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 /*
  * Try and locate an idle CPU in the sched_domain.
  */
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
 	struct sched_domain *sd;
 	struct sched_group *sg;
-	int i = task_cpu(p);
 
 	if (idle_cpu(target))
 		return target;
@@ -5173,8 +5173,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	/*
 	 * If the prevous cpu is cache affine and idle, don't be stupid.
 	 */
-	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-		return i;
+	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+		return prev;
 
 	/*
 	 * Otherwise, iterate the domains and find an eligible idle cpu.
@@ -5195,6 +5195,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	for_each_lower_domain(sd) {
 		sg = sd->groups;
 		do {
+			int i;
+
 			if (!cpumask_intersects(sched_group_cpus(sg),
 						tsk_cpus_allowed(p)))
 				goto next;
@@ -5303,13 +5305,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	if (affine_sd) {
 		sd = NULL; /* Prefer wake_affine over balance flags */
-		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+		if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
 			new_cpu = cpu;
 	}
 
 	if (!sd) {
 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
-			new_cpu = select_idle_sibling(p, new_cpu);
+			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
 	} else while (sd) {
 		struct sched_group *group;
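
To make the corner case discussed above concrete, here is a hypothetical
illustration (cpu numbers and topology are made up) of the one check in
select_idle_sibling() whose outcome can change at wakeup:

	/*
	 * Hypothetical example: the task last ran on cpu 2 on node 0, but
	 * migrate_swap() set p->wake_cpu = 10 on node 1 while it slept, and
	 * wake_affine() picked target = 11 on node 1.
	 *
	 *   before the patch: i = task_cpu(p) = 2; cpus_share_cache(2, 11)
	 *                     is false, so we fall through to the domain scan
	 *   after the patch:  prev = p->wake_cpu = 10; if cpu 10 is idle and
	 *                     shares cache with cpu 11 we return it directly
	 */
	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
		return prev;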
In commit ac66f5477239 ("sched/numa: Introduce migrate_swap()")
select_task_rq() got a 'cpu' argument to enable overriding of prev_cpu
in special cases (NUMA task swapping). However, the
select_task_rq_fair() helper functions: wake_affine() and
select_idle_sibling(), still use task_cpu(p) directly to work out
prev_cpu which leads to inconsistencies.

This patch passes prev_cpu (potentially overridden by NUMA code) into
the helper functions to ensure prev_cpu is indeed the same cpu
everywhere in the wakeup path.

cc: Ingo Molnar <mingo@redhat.com>
cc: Peter Zijlstra <peterz@infradead.org>
cc: Rik van Riel <riel@redhat.com>

Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
---
 kernel/sched/fair.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

-- 
1.9.1