Message ID | 1549469662-13614-2-git-send-email-vincent.guittot@linaro.org |
---|---|
State | Accepted |
Commit | 31bc6aeaab1d1de8959b67edbed5c7a4b3cdbe7c |
Series | sched/fair: Fix O(nr_cgroups) in load balance path |
Argh head hurts!!

On Wed, Feb 06, 2019 at 05:14:21PM +0100, Vincent Guittot wrote:
> @@ -4438,6 +4450,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
>                  /* adjust cfs_rq_clock_task() */
>                  cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
>                                               cfs_rq->throttled_clock_task;
> +
> +                /* Add cfs_rq with already running entity in the list */
> +                if (cfs_rq->nr_running >= 1)
> +                        list_add_leaf_cfs_rq(cfs_rq);
>          }
>
>          return 0;

Do we want the below to go with the above change?

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 38d4669aa2ef..0bd80a891025 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4536,6 +4536,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
         /* update hierarchical throttle state */
         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
 
+        assert_list_leaf_cfs_rq(rq);
+
         if (!cfs_rq->load.weight)
                 return;
On Fri, 8 Feb 2019 at 16:40, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Argh head hurts!!
>
> On Wed, Feb 06, 2019 at 05:14:21PM +0100, Vincent Guittot wrote:
> > @@ -4438,6 +4450,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> >                  /* adjust cfs_rq_clock_task() */
> >                  cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
> >                                               cfs_rq->throttled_clock_task;
> > +
> > +                /* Add cfs_rq with already running entity in the list */
> > +                if (cfs_rq->nr_running >= 1)
> > +                        list_add_leaf_cfs_rq(cfs_rq);
> >          }
> >
> >          return 0;
>
> Do we want the below to go with the above change?
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 38d4669aa2ef..0bd80a891025 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4536,6 +4536,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>          /* update hierarchical throttle state */
>          walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
>
> +        assert_list_leaf_cfs_rq(rq);
> +

Good point but this should go after the for_each_sched_entity() loop

>          if (!cfs_rq->load.weight)
>                  return;
>
On Fri, Feb 08, 2019 at 04:44:53PM +0100, Vincent Guittot wrote:
> On Fri, 8 Feb 2019 at 16:40, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Argh head hurts!!
> >
> > On Wed, Feb 06, 2019 at 05:14:21PM +0100, Vincent Guittot wrote:
> > > @@ -4438,6 +4450,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> > >                  /* adjust cfs_rq_clock_task() */
> > >                  cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
> > >                                               cfs_rq->throttled_clock_task;
> > > +
> > > +                /* Add cfs_rq with already running entity in the list */
> > > +                if (cfs_rq->nr_running >= 1)
> > > +                        list_add_leaf_cfs_rq(cfs_rq);
> > >          }
> > >
> > >          return 0;
> >
> > Do we want the below to go with the above change?
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 38d4669aa2ef..0bd80a891025 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4536,6 +4536,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> >          /* update hierarchical throttle state */
> >          walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
> >
> > +        assert_list_leaf_cfs_rq(rq);
> > +
>
> Good point but this should go after the for_each_sched_entity() loop

Indeed, but that loop does enqueue and can throttle again; should that
not also get that additional list_add_leaf_cfs_rq() loop we added to
enqueue_task_fair() to finish the add?
On Wed, Feb 06, 2019 at 05:14:21PM +0100, Vincent Guittot wrote:
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -346,6 +346,18 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
>  static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
>  {
>          if (cfs_rq->on_list) {
> +                struct rq *rq = rq_of(cfs_rq);
> +
> +                /*
> +                 * With cfs_rq being unthrottled/throttled during an enqueue,
> +                 * it can happen the tmp_alone_branch points the a leaf that
> +                 * we finally want to del. In this case, tmp_alone_branch moves
> +                 * to the prev element but it will point to rq->leaf_cfs_rq_list
> +                 * at the end of the enqueue.
> +                 */
> +                if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
> +                        rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
> +
>                  list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
>                  cfs_rq->on_list = 0;
>          }

So that is:

  enqueue_task_fair()
    enqueue_entity()
      list_add_leaf_cfs_rq()
      check_enqueue_throttle()
        throttle_cfs_rq()
          walk_tg_tree_from()
            tg_throttle_down()
              list_del_leaf_cfs_rq()

Which can try and remove a cfs_rq which we just added.

And because the list is in bottom-up order, and the deletion is a
downward operation, we must go back (prev) in the list.

So far so good, I suppose.

> @@ -4449,8 +4465,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
>          struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>
>          /* group is entering throttled state, stop time */
> -        if (!cfs_rq->throttle_count)
> +        if (!cfs_rq->throttle_count) {
>                  cfs_rq->throttled_clock_task = rq_clock_task(rq);
> +                list_del_leaf_cfs_rq(cfs_rq);
> +        }
>          cfs_rq->throttle_count++;
>
>          return 0;
On Fri, 8 Feb 2019 at 17:30, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Feb 08, 2019 at 04:44:53PM +0100, Vincent Guittot wrote:
> > On Fri, 8 Feb 2019 at 16:40, Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > Argh head hurts!!
> > >
> > > On Wed, Feb 06, 2019 at 05:14:21PM +0100, Vincent Guittot wrote:
> > > > @@ -4438,6 +4450,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> > > >                  /* adjust cfs_rq_clock_task() */
> > > >                  cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
> > > >                                               cfs_rq->throttled_clock_task;
> > > > +
> > > > +                /* Add cfs_rq with already running entity in the list */
> > > > +                if (cfs_rq->nr_running >= 1)
> > > > +                        list_add_leaf_cfs_rq(cfs_rq);
> > > >          }
> > > >
> > > >          return 0;
> > >
> > > Do we want the below to go with the above change?
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 38d4669aa2ef..0bd80a891025 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4536,6 +4536,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> > >          /* update hierarchical throttle state */
> > >          walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
> > >
> > > +        assert_list_leaf_cfs_rq(rq);
> > > +
> >
> > Good point but this should go after the for_each_sched_entity() loop
>
> Indeed, but that loop does enqueue and can throttle again; should that
> not also get that additional list_add_leaf_cfs_rq() loop we added to
> enqueue_task_fair() to finish the add?

Initially, I added this additional loop but finally removed it because
I didn't hit it during my tests. IIRC, we are protected by
throttle_count in such a case, which is not the case when we enqueue a
task.
On Fri, 8 Feb 2019 at 17:31, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Feb 06, 2019 at 05:14:21PM +0100, Vincent Guittot wrote:
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -346,6 +346,18 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> >  static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
> >  {
> >          if (cfs_rq->on_list) {
> > +                struct rq *rq = rq_of(cfs_rq);
> > +
> > +                /*
> > +                 * With cfs_rq being unthrottled/throttled during an enqueue,
> > +                 * it can happen the tmp_alone_branch points the a leaf that
> > +                 * we finally want to del. In this case, tmp_alone_branch moves
> > +                 * to the prev element but it will point to rq->leaf_cfs_rq_list
> > +                 * at the end of the enqueue.
> > +                 */
> > +                if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
> > +                        rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
> > +
> >                  list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
> >                  cfs_rq->on_list = 0;
> >          }
>
> So that is:
>
>   enqueue_task_fair()
>     enqueue_entity()
>       list_add_leaf_cfs_rq()
>       check_enqueue_throttle()
>         throttle_cfs_rq()
>           walk_tg_tree_from()
>             tg_throttle_down()
>               list_del_leaf_cfs_rq()
>
> Which can try and remove a cfs_rq which we just added.
>
> And because the list is in bottom-up order, and the deletion is a
> downward operation, we must go back (prev) in the list.

Yes, exactly.

>
> So far so good, I suppose.
>
> > @@ -4449,8 +4465,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
> >          struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> >
> >          /* group is entering throttled state, stop time */
> > -        if (!cfs_rq->throttle_count)
> > +        if (!cfs_rq->throttle_count) {
> >                  cfs_rq->throttled_clock_task = rq_clock_task(rq);
> > +                list_del_leaf_cfs_rq(cfs_rq);
> > +        }
> >          cfs_rq->throttle_count++;
> >
> >          return 0;
> >
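To make the race above concrete, here is a minimal, self-contained userspace sketch of the list manipulation being discussed. The struct node, list_add_before() and list_del_node() helpers are simplified stand-ins for the kernel's list_head machinery and for list_add_leaf_cfs_rq()/list_del_leaf_cfs_rq(); they are not the real implementation, they only model why the deletion path has to rewind the tmp_alone_branch cursor to the previous element when it points at the node being removed.

#include <stdio.h>

struct node {
        struct node *prev, *next;
        const char *name;
};

/* Stand-ins for rq->leaf_cfs_rq_list and rq->tmp_alone_branch. */
static struct node head = { &head, &head, "leaf_cfs_rq_list head" };
static struct node *tmp_alone_branch = &head;

/* Insert @n just before @pos in the circular list. */
static void list_add_before(struct node *n, struct node *pos)
{
        n->prev = pos->prev;
        n->next = pos;
        pos->prev->next = n;
        pos->prev = n;
}

/* Unlink @n, mirroring the fix: never leave the cursor on a removed node. */
static void list_del_node(struct node *n)
{
        if (tmp_alone_branch == n)
                tmp_alone_branch = n->prev;
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

int main(void)
{
        struct node child = { NULL, NULL, "child cfs_rq" };

        /*
         * enqueue_entity() -> list_add_leaf_cfs_rq(): the child lands on a
         * branch whose parent is not on the list yet, so the cursor is left
         * pointing at it until the parent gets added.
         */
        list_add_before(&child, tmp_alone_branch);
        tmp_alone_branch = &child;

        /*
         * check_enqueue_throttle() -> throttle_cfs_rq() -> tg_throttle_down()
         * -> list_del_leaf_cfs_rq(): the very node we just added goes away.
         * Without the rewind in list_del_node(), the cursor would dangle.
         */
        list_del_node(&child);

        printf("cursor now points at: %s\n", tmp_alone_branch->name);
        return 0;
}

Running it prints the list head, i.e. the cursor falls back to the previous element, which in this toy case is already rq->leaf_cfs_rq_list, so the interrupted enqueue can still finish cleanly.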
On Fri, Feb 08, 2019 at 05:47:53PM +0100, Vincent Guittot wrote:
> On Fri, 8 Feb 2019 at 17:30, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Fri, Feb 08, 2019 at 04:44:53PM +0100, Vincent Guittot wrote:
> > > On Fri, 8 Feb 2019 at 16:40, Peter Zijlstra <peterz@infradead.org> wrote:
> > > Good point but this should go after the for_each_sched_entity() loop
> >
> > Indeed, but that loop does enqueue and can throttle again; should that
> > not also get that additional list_add_leaf_cfs_rq() loop we added to
> > enqueue_task_fair() to finish the add?
>
> Initially, I added this additional loop but finally removed it because
> I didn't hit it during my tests. IIRC, we are protected by
> throttle_count in such a case, which is not the case when we enqueue a
> task.

Fair enough; and the to-be-added assert will notify us if we got that
wrong :-)

I'll add the assert, no need to re-send. Thanks!
On Fri, 8 Feb 2019 at 17:51, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Feb 08, 2019 at 05:47:53PM +0100, Vincent Guittot wrote:
> > On Fri, 8 Feb 2019 at 17:30, Peter Zijlstra <peterz@infradead.org> wrote:
> > > On Fri, Feb 08, 2019 at 04:44:53PM +0100, Vincent Guittot wrote:
> > > > On Fri, 8 Feb 2019 at 16:40, Peter Zijlstra <peterz@infradead.org> wrote:
> > > > Good point but this should go after the for_each_sched_entity() loop
> > >
> > > Indeed, but that loop does enqueue and can throttle again; should that
> > > not also get that additional list_add_leaf_cfs_rq() loop we added to
> > > enqueue_task_fair() to finish the add?
> >
> > Initially, I added this additional loop but finally removed it because
> > I didn't hit it during my tests. IIRC, we are protected by
> > throttle_count in such a case, which is not the case when we enqueue a
> > task.
>
> Fair enough; and the to-be-added assert will notify us if we got that
> wrong :-)
>
> I'll add the assert, no need to re-send.

Thanks

> Thanks!
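For reference, the assertion agreed on above is a sanity check that every path building the leaf list leaves rq->tmp_alone_branch folded back onto rq->leaf_cfs_rq_list; in mainline the helper essentially warns when that is not the case. The sketch below models that invariant with simplified stand-in types and a plain assert(); it is not the kernel definition, only an illustration of what the check means and where the thread agreed it should sit.

#include <assert.h>
#include <stdio.h>

struct list_head {
        struct list_head *prev, *next;
};

/* Simplified stand-in for struct rq; only the two relevant fields. */
struct rq {
        struct list_head leaf_cfs_rq_list;
        struct list_head *tmp_alone_branch;
};

/*
 * Model of the check: an enqueue or unthrottle must not return while a
 * partially built branch is still dangling from tmp_alone_branch.
 */
static void assert_list_leaf_cfs_rq(struct rq *rq)
{
        assert(rq->tmp_alone_branch == &rq->leaf_cfs_rq_list);
}

int main(void)
{
        struct rq rq = {
                .leaf_cfs_rq_list = { &rq.leaf_cfs_rq_list, &rq.leaf_cfs_rq_list },
                .tmp_alone_branch = &rq.leaf_cfs_rq_list,
        };

        /*
         * Per the thread: in unthrottle_cfs_rq() the check goes after the
         * for_each_sched_entity() enqueue loop, once any branch started by
         * tg_unthrottle_up() has been folded back into the list.
         */
        assert_list_leaf_cfs_rq(&rq);
        printf("leaf list is properly closed\n");
        return 0;
}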
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffd1ae7..badf8173 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -346,6 +346,18 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
         if (cfs_rq->on_list) {
+                struct rq *rq = rq_of(cfs_rq);
+
+                /*
+                 * With cfs_rq being unthrottled/throttled during an enqueue,
+                 * it can happen the tmp_alone_branch points the a leaf that
+                 * we finally want to del. In this case, tmp_alone_branch moves
+                 * to the prev element but it will point to rq->leaf_cfs_rq_list
+                 * at the end of the enqueue.
+                 */
+                if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
+                        rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
+
                 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
                 cfs_rq->on_list = 0;
         }
@@ -4438,6 +4450,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
                 /* adjust cfs_rq_clock_task() */
                 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                              cfs_rq->throttled_clock_task;
+
+                /* Add cfs_rq with already running entity in the list */
+                if (cfs_rq->nr_running >= 1)
+                        list_add_leaf_cfs_rq(cfs_rq);
         }
 
         return 0;
@@ -4449,8 +4465,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
         /* group is entering throttled state, stop time */
-        if (!cfs_rq->throttle_count)
+        if (!cfs_rq->throttle_count) {
                 cfs_rq->throttled_clock_task = rq_clock_task(rq);
+                list_del_leaf_cfs_rq(cfs_rq);
+        }
         cfs_rq->throttle_count++;
 
         return 0;
@@ -7699,10 +7717,6 @@ static void update_blocked_averages(int cpu)
         for_each_leaf_cfs_rq(rq, cfs_rq) {
                 struct sched_entity *se;
 
-                /* throttled entities do not contribute to load */
-                if (throttled_hierarchy(cfs_rq))
-                        continue;
-
                 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
                         update_tg_load_avg(cfs_rq, 0);
Removing a cfs_rq from rq->leaf_cfs_rq_list can break the parent/child
ordering of the list when it will be added back. In order to remove an
empty and fully decayed cfs_rq, we must remove its children too, so they
will be added back in the right order next time.

With a normal decay of PELT, a parent will be empty and fully decayed if
all of its children are empty and fully decayed too. In such a case, we
just have to ensure that the whole branch will be added back when a new
task is enqueued. This is the default behavior since:

  commit f6783319737f ("sched/fair: Fix insertion in rq->leaf_cfs_rq_list")

In case of throttling, the PELT of a throttled cfs_rq will not be
updated whereas the parent's will. This breaks the assumption made above
unless we also remove the children of a cfs_rq that becomes throttled.
They will then be added back when the cfs_rq is unthrottled and a
sched_entity is enqueued.

As throttled cfs_rqs are now removed from the list, we can remove the
associated test in update_blocked_averages().

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

-- 
2.7.4
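As a rough illustration of the ordering argument in the changelog, here is a small, self-contained sketch. The toy_cfs_rq type, the load numbers and one_update_pass() are made up for the example (the real code is update_blocked_averages() and the PELT machinery); it only shows that a single walk of the list can propagate a child's blocked load through its parent towards the root only when every child sits ahead of its parent, which is the ordering that re-adding whole branches preserves.

#include <stdio.h>

struct toy_cfs_rq {
        const char *name;
        struct toy_cfs_rq *parent;
        long load;                      /* stand-in for a PELT contribution */
};

/*
 * Stand-in for one pass over the leaf list: fold each entry's load into
 * its parent, visiting entries in list order.
 */
static void one_update_pass(struct toy_cfs_rq **list, int n)
{
        for (int i = 0; i < n; i++) {
                if (list[i]->parent)
                        list[i]->parent->load += list[i]->load;
        }
}

int main(void)
{
        struct toy_cfs_rq root   = { "root",   NULL,    0 };
        struct toy_cfs_rq parent = { "parent", &root,   0 };
        struct toy_cfs_rq child  = { "child",  &parent, 64 };

        /* Correct bottom-up order: every child ahead of its parent. */
        struct toy_cfs_rq *ordered[] = { &child, &parent, &root };
        one_update_pass(ordered, 3);
        printf("ordered:    root sees load %ld after one pass\n", root.load);

        /* Order that could result from re-adding a lone cfs_rq out of place. */
        root.load = 0;
        parent.load = 0;
        child.load = 64;
        struct toy_cfs_rq *misordered[] = { &parent, &root, &child };
        one_update_pass(misordered, 3);
        printf("misordered: root sees load %ld after one pass\n", root.load);
        return 0;
}

With the correct order the root sees the child's contribution within one pass; with the broken order it sees none, which is the kind of staleness the patch avoids by taking whole branches off the list on throttle and putting whole branches back on unthrottle.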