bfq: Fix use-after-free with cgroups

Message ID 20211201133439.3309-1-jack@suse.cz
State New
Series bfq: Fix use-after-free with cgroups

Commit Message

Jan Kara Dec. 1, 2021, 1:34 p.m. UTC
BFQ started crashing with 5.15-based kernels like:

BUG: KASAN: use-after-free in rb_erase (lib/rbtree.c:262 lib/rbtree.c:445)
Read of size 8 at addr ffff888008193098 by task bash/1472

CPU: 0 PID: 1472 Comm: bash Tainted: G            E     5.15.2-0.
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1
Call Trace:
rb_erase (lib/rbtree.c:262 lib/rbtree.c:445)
bfq_idle_extract (block/bfq-wf2q.c:356)
bfq_put_idle_entity (block/bfq-wf2q.c:660)
bfq_bfqq_served (block/bfq-wf2q.c:833)
bfq_dispatch_request (block/bfq-iosched.c:4870 block/bfq-iosched.
__blk_mq_do_dispatch_sched (block/blk-mq-sched.c:150)
__blk_mq_sched_dispatch_requests (block/blk-mq-sched.c:215 block/
blk_mq_sched_dispatch_requests (block/blk-mq-sched.c:360)
blk_mq_sched_insert_requests (include/linux/percpu-refcount.h:174
blk_mq_flush_plug_list (include/linux/list.h:282 block/blk-mq.c:1
blk_flush_plug_list (block/blk-core.c:1722)
blk_finish_plug (block/blk-core.c:1745 block/blk-core.c:1739)
read_pages (include/linux/list.h:282 mm/readahead.c:152)
page_cache_ra_unbounded (mm/readahead.c:212 (discriminator 2))
filemap_fault (mm/filemap.c:2982 mm/filemap.c:3074)
__do_fault (mm/memory.c:3858)
__handle_mm_fault (mm/memory.c:4182 mm/memory.c:4310 mm/memory.c:
handle_mm_fault (mm/memory.c:4801)

After some analysis we found that the culprit is a task being
reparented from cgroup G to the root cgroup, after which G is offlined.
A bfq_queue in the task's IO context still points to G as its parent,
so when the task submits more IO, G gets inserted into the service
trees again. Once the task exits and the bfq_queue is destroyed, the
last reference to G is dropped as well and G is freed, but it is still
linked from the service trees, causing use-after-free issues some time
later.

Fix the problem by tracking all bfq_queues that point to a particular
cgroup as their parent and reparenting them when the cgroup goes
offline (see the sketch after the diffstat below).

CC: stable@vger.kernel.org
Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
Tested-by: Fabian Vogt <fvogt@suse.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 block/bfq-cgroup.c  | 100 ++++++--------------------------------------
 block/bfq-iosched.c |  54 ++++++++++++------------
 block/bfq-iosched.h |   6 +++
 3 files changed, 47 insertions(+), 113 deletions(-)
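
[Editor's note: for illustration, the tracking pattern the fix relies on
can be reduced to a few lines of plain C. This is a minimal userspace
sketch, not BFQ code: "child" and "parent" are hypothetical stand-ins
for bfq_queue and bfq_group, and a plain singly-linked list stands in
for the hlist the patch actually uses.]

#include <stdio.h>

struct parent;

struct child {
	struct parent *parent;	/* analogous to bfqq's pointer to its bfq_group */
	struct child *next;	/* analogous to bfqq->children_node */
};

struct parent {
	struct child *children;	/* analogous to bfqg->children */
};

/* Attach a child to a parent, recording it on the parent's list (the
 * patch does the equivalent with hlist_add_head() in bfq_init_bfqq()
 * and bfq_bfqq_move()). */
static void attach(struct child *c, struct parent *p)
{
	c->parent = p;
	c->next = p->children;
	p->children = c;
}

/* Before the parent goes away, re-point every child at the root so
 * none is left holding a dangling parent pointer (the equivalent of
 * the patch's bfq_reparent_children()). */
static void reparent_children(struct parent *dying, struct parent *root)
{
	struct child *c;

	while ((c = dying->children)) {
		dying->children = c->next;
		attach(c, root);
	}
}

int main(void)
{
	struct parent root = { 0 }, g = { 0 };
	struct child q = { 0 };

	attach(&q, &g);
	reparent_children(&g, &root);	/* g could now be freed safely */
	printf("q's parent is %s\n", q.parent == &root ? "root" : "g");
	return 0;
}

Without the reparenting step, freeing g would leave q->parent dangling,
which is the shape of the use-after-free the patch fixes.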

Comments

Michal Koutný Dec. 7, 2021, 7:08 p.m. UTC | #1
On Wed, Dec 01, 2021 at 02:34:39PM +0100, Jan Kara <jack@suse.cz> wrote:
> After some analysis we found that the culprit is a task being
> reparented from cgroup G to the root cgroup, after which G is
> offlined.

Just sharing my interpretation for context -- I saw this was a system
using the unified cgroup hierarchy (io_cgrp_subsys_on_dfl_key was
enabled), so what was observed could also have been the io controller
being disabled at a given level -- that would manifest similarly: the
task is migrated to the parent and the former blkcg is offlined.


> +static void bfq_reparent_children(struct bfq_data *bfqd, struct bfq_group *bfqg)
> [...]
> -	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
> [...]
> +	hlist_for_each_entry_safe(bfqq, next, &bfqg->children, children_node)
> +		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);

Here I assume root_group is (representing) the global blkcg root, so
this reparenting skips all ancestors between the removed leaf and the
root. IIUC the associated io_context would then be treated as if it
were running in the root blkcg.
(Admittedly, this isn't a change introduced by this patch, but it may
cause some surprises if the given process keeps running after the
operation.)

Reparenting to the immediate ancestor should be safe, as the cgroup
core ensures children are offlined before their parents. Would that
make sense to you?
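
A sketch of what that could look like, adapted from the patch's
bfq_reparent_children() -- assuming a bfqg_parent() helper that
resolves the parent bfq_group through the blkcg hierarchy (bfq-cgroup.c
has a static helper of that name), so this is a variant to discuss, not
tested code:

static void bfq_reparent_children(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq;
	struct hlist_node *next;
	struct bfq_group *parent = bfqg_parent(bfqg);

	/* At the top of the hierarchy, fall back to the root group. */
	if (!parent)
		parent = bfqd->root_group;

	hlist_for_each_entry_safe(bfqq, next, &bfqg->children, children_node)
		bfq_bfqq_move(bfqd, bfqq, parent);
}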


> @@ -897,38 +844,17 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
> [...]
> -		 * It may happen that some queues are still active
> -		 * (busy) upon group destruction (if the corresponding
> -		 * processes have been forced to terminate). We move
> -		 * all the leaf entities corresponding to these queues
> -		 * to the root_group.

This comment is removed, but it seems to me it assumed the reparented
entities are only transitional remnants of terminated tasks; they may
instead be processes migrated upwards, with a long (IO-active) life
ahead of them.

Patch

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 24a5c5329bcd..519e6291e98e 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -666,6 +666,7 @@  void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
 	else if (entity->on_st_or_in_serv)
 		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+	hlist_del(&bfqq->children_node);
 	bfqg_and_blkg_put(bfqq_group(bfqq));
 
 	if (entity->parent &&
@@ -678,6 +679,7 @@  void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	entity->sched_data = &bfqg->sched_data;
 	/* pin down bfqg and its associated blkg  */
 	bfqg_and_blkg_get(bfqg);
+	hlist_add_head(&bfqq->children_node, &bfqg->children);
 
 	if (bfq_bfqq_busy(bfqq)) {
 		if (unlikely(!bfqd->nonrot_with_queueing))
@@ -810,68 +812,13 @@  void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 	rcu_read_unlock();
 }
 
-/**
- * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
- * @st: the service tree being flushed.
- */
-static void bfq_flush_idle_tree(struct bfq_service_tree *st)
-{
-	struct bfq_entity *entity = st->first_idle;
-
-	for (; entity ; entity = st->first_idle)
-		__bfq_deactivate_entity(entity, false);
-}
-
-/**
- * bfq_reparent_leaf_entity - move leaf entity to the root_group.
- * @bfqd: the device data structure with the root group.
- * @entity: the entity to move, if entity is a leaf; or the parent entity
- *	    of an active leaf entity to move, if entity is not a leaf.
- */
-static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
-				     struct bfq_entity *entity,
-				     int ioprio_class)
+static void bfq_reparent_children(struct bfq_data *bfqd, struct bfq_group *bfqg)
 {
 	struct bfq_queue *bfqq;
-	struct bfq_entity *child_entity = entity;
-
-	while (child_entity->my_sched_data) { /* leaf not reached yet */
-		struct bfq_sched_data *child_sd = child_entity->my_sched_data;
-		struct bfq_service_tree *child_st = child_sd->service_tree +
-			ioprio_class;
-		struct rb_root *child_active = &child_st->active;
-
-		child_entity = bfq_entity_of(rb_first(child_active));
-
-		if (!child_entity)
-			child_entity = child_sd->in_service_entity;
-	}
-
-	bfqq = bfq_entity_to_bfqq(child_entity);
-	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
-}
-
-/**
- * bfq_reparent_active_queues - move to the root group all active queues.
- * @bfqd: the device data structure with the root group.
- * @bfqg: the group to move from.
- * @st: the service tree to start the search from.
- */
-static void bfq_reparent_active_queues(struct bfq_data *bfqd,
-				       struct bfq_group *bfqg,
-				       struct bfq_service_tree *st,
-				       int ioprio_class)
-{
-	struct rb_root *active = &st->active;
-	struct bfq_entity *entity;
-
-	while ((entity = bfq_entity_of(rb_first(active))))
-		bfq_reparent_leaf_entity(bfqd, entity, ioprio_class);
+	struct hlist_node *next;
 
-	if (bfqg->sched_data.in_service_entity)
-		bfq_reparent_leaf_entity(bfqd,
-					 bfqg->sched_data.in_service_entity,
-					 ioprio_class);
+	hlist_for_each_entry_safe(bfqq, next, &bfqg->children, children_node)
+		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
 }
 
 /**
@@ -897,38 +844,17 @@  static void bfq_pd_offline(struct blkg_policy_data *pd)
 		goto put_async_queues;
 
 	/*
-	 * Empty all service_trees belonging to this group before
-	 * deactivating the group itself.
+	 * Reparent all bfqqs under this bfq group. This will also empty all
+	 * service_trees belonging to this group before deactivating the group
+	 * itself.
 	 */
+	bfq_reparent_children(bfqd, bfqg);
+
 	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
 		st = bfqg->sched_data.service_tree + i;
 
-		/*
-		 * It may happen that some queues are still active
-		 * (busy) upon group destruction (if the corresponding
-		 * processes have been forced to terminate). We move
-		 * all the leaf entities corresponding to these queues
-		 * to the root_group.
-		 * Also, it may happen that the group has an entity
-		 * in service, which is disconnected from the active
-		 * tree: it must be moved, too.
-		 * There is no need to put the sync queues, as the
-		 * scheduler has taken no reference.
-		 */
-		bfq_reparent_active_queues(bfqd, bfqg, st, i);
-
-		/*
-		 * The idle tree may still contain bfq_queues
-		 * belonging to exited task because they never
-		 * migrated to a different cgroup from the one being
-		 * destroyed now. In addition, even
-		 * bfq_reparent_active_queues() may happen to add some
-		 * entities to the idle tree. It happens if, in some
-		 * of the calls to bfq_bfqq_move() performed by
-		 * bfq_reparent_active_queues(), the queue to move is
-		 * empty and gets expired.
-		 */
-		bfq_flush_idle_tree(st);
+		WARN_ON_ONCE(!RB_EMPTY_ROOT(&st->active));
+		WARN_ON_ONCE(!RB_EMPTY_ROOT(&st->idle));
 	}
 
 	__bfq_deactivate_entity(entity, false);
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index fec18118dc30..8f33f6b91d05 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5163,6 +5163,7 @@  void bfq_put_queue(struct bfq_queue *bfqq)
 	if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
 		bfqq->bfqd->last_completed_rq_bfqq = NULL;
 
+	hlist_del(&bfqq->children_node);
 	kmem_cache_free(bfq_pool, bfqq);
 	bfqg_and_blkg_put(bfqg);
 }
@@ -5337,8 +5338,9 @@  static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
 		bfq_set_next_ioprio_data(bfqq, bic);
 }
 
-static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_group *bfqg,
+			  struct bfq_queue *bfqq, struct bfq_io_cq *bic,
+			  pid_t pid, int is_sync)
 {
 	u64 now_ns = ktime_get_ns();
 
@@ -5347,6 +5349,7 @@  static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	INIT_HLIST_NODE(&bfqq->burst_list_node);
 	INIT_HLIST_NODE(&bfqq->woken_list_node);
 	INIT_HLIST_HEAD(&bfqq->woken_list);
+	hlist_add_head(&bfqq->children_node, &bfqg->children);
 
 	bfqq->ref = 0;
 	bfqq->bfqd = bfqd;
@@ -5600,8 +5603,7 @@  static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 				     bfqd->queue->node);
 
 	if (bfqq) {
-		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
-			      is_sync);
+		bfq_init_bfqq(bfqd, bfqg, bfqq, bic, current->pid, is_sync);
 		bfq_init_entity(&bfqq->entity, bfqg);
 		bfq_log_bfqq(bfqd, bfqq, "allocated");
 	} else {
@@ -6908,6 +6910,7 @@  static void bfq_exit_queue(struct elevator_queue *e)
 
 	hrtimer_cancel(&bfqd->idle_slice_timer);
 
+	hlist_del(&bfqd->oom_bfqq.children_node);
 	/* release oom-queue reference to root group */
 	bfqg_and_blkg_put(bfqd->root_group);
 
@@ -6959,28 +6962,6 @@  static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	q->elevator = eq;
 	spin_unlock_irq(&q->queue_lock);
 
-	/*
-	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
-	 * Grab a permanent reference to it, so that the normal code flow
-	 * will not attempt to free it.
-	 */
-	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
-	bfqd->oom_bfqq.ref++;
-	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
-	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
-	bfqd->oom_bfqq.entity.new_weight =
-		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
-
-	/* oom_bfqq does not participate to bursts */
-	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
-
-	/*
-	 * Trigger weight initialization, according to ioprio, at the
-	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
-	 * class won't be changed any more.
-	 */
-	bfqd->oom_bfqq.entity.prio_changed = 1;
-
 	bfqd->queue = q;
 
 	INIT_LIST_HEAD(&bfqd->dispatch);
@@ -7059,6 +7040,27 @@  static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 		goto out_free;
 	bfq_init_root_group(bfqd->root_group, bfqd);
 	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+	/*
+	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+	 * Grab a permanent reference to it, so that the normal code flow
+	 * will not attempt to free it.
+	 */
+	bfq_init_bfqq(bfqd, bfqd->root_group, &bfqd->oom_bfqq, NULL, 1, 0);
+	bfqd->oom_bfqq.ref++;
+	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
+	bfqd->oom_bfqq.entity.new_weight =
+		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+
+	/* oom_bfqq does not participate to bursts */
+	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
+
+	/*
+	 * Trigger weight initialization, according to ioprio, at the
+	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
+	 * class won't be changed any more.
+	 */
+	bfqd->oom_bfqq.entity.prio_changed = 1;
 
 	wbt_disable_default(q);
 	return 0;
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index a73488eec8a4..a1984959d6be 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -292,6 +292,9 @@  struct bfq_queue {
 
 	/* node for active/idle bfqq list inside parent bfqd */
 	struct list_head bfqq_list;
+	/* Member of parent's bfqg children list */
+	struct hlist_node children_node;
+
 
 	/* associated @bfq_ttime struct */
 	struct bfq_ttime ttime;
@@ -929,6 +932,9 @@  struct bfq_group {
 	struct bfq_entity entity;
 	struct bfq_sched_data sched_data;
 
+	/* bfq_queues under this entity */
+	struct hlist_head children;
+
 	void *bfqd;
 
 	struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS];