@@ -366,6 +366,31 @@ enum bfq_device_speed {
* details).
* @busy_queues: number of bfq_queues containing requests (including the
* queue in service, even if it is idling).
+ * @busy_in_flight_queues: number of @bfq_queues containing pending or
+ * in-flight requests, plus the @bfq_queue in
+ * service, even if idle but waiting for the
+ * possible arrival of its next sync request. This
+ * field is updated only if the device is rotational,
+ * but used only if the device is also NCQ-capable.
+ * The reason why the field is updated also for non-
+ * NCQ-capable rotational devices is that the value
+ * of @hw_tag may be set later than the first time(s)
+ * busy_in_flight_queues needs to be incremented.
+ * Taking this possibility into account, so as to
+ * avoid unbalanced increments/decrements, would
+ * imply more overhead than simply updating
+ * busy_in_flight_queues regardless of the value of
+ * @hw_tag.
+ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues
+ * (that is, seeky queues that expired
+ * for budget timeout at least once)
+ * containing pending or in-flight
+ * requests, including the in-service
+ * @bfq_queue if constantly seeky. This
+ * field is updated only if the device
+ * is rotational, but used only if the
+ * device is also NCQ-capable (see the
+ * comments to @busy_in_flight_queues).
* @wr_busy_queues: number of weight-raised busy @bfq_queues.
* @queued: number of queued requests.
* @rq_in_driver: number of requests dispatched and waiting for completion.
@@ -449,6 +474,8 @@ struct bfq_data {
struct rb_root group_weights_tree;
int busy_queues;
+ int busy_in_flight_queues;
+ int const_seeky_busy_in_flight_queues;
int wr_busy_queues;
int queued;
int rq_in_driver;
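
A very similar increment/decrement pattern recurs in the hunks below: bfq_add_bfqq_busy()/bfq_del_bfqq_busy() and bfq_completed_request() adjust both counters, while bfq_bfqq_expire() and bfq_rq_enqueued() adjust only the constantly-seeky one when a queue gains or loses that flag. The following standalone sketch only illustrates the core invariant those hunks maintain: the counters are touched only on rotational devices, and the constantly-seeky counter moves together with the busy one whenever the entering or leaving queue is marked constantly seeky. The toy types and the helper bfq_update_in_flight_counters() are stand-ins for illustration; they are not part of this patch or of the kernel.

/*
 * Illustration only: simplified stand-in types, not the kernel's
 * struct bfq_data / struct bfq_queue.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_bfqd {
	bool rotational;			/* !blk_queue_nonrot(q) */
	int busy_in_flight_queues;
	int const_seeky_busy_in_flight_queues;
};

struct toy_bfqq {
	bool constantly_seeky;			/* bfq_bfqq_constantly_seeky() */
};

/* delta is +1 when a queue becomes busy, -1 when it stops being busy. */
static void bfq_update_in_flight_counters(struct toy_bfqd *bfqd,
					  const struct toy_bfqq *bfqq,
					  int delta)
{
	if (!bfqd->rotational)	/* counters are updated only on rotational devices */
		return;
	bfqd->busy_in_flight_queues += delta;
	if (bfqq->constantly_seeky)
		bfqd->const_seeky_busy_in_flight_queues += delta;
}

int main(void)
{
	struct toy_bfqd bfqd = { .rotational = true };
	struct toy_bfqq seeky = { .constantly_seeky = true };
	struct toy_bfqq seq = { .constantly_seeky = false };

	bfq_update_in_flight_counters(&bfqd, &seeky, +1);
	bfq_update_in_flight_counters(&bfqd, &seq, +1);
	printf("busy=%d const_seeky=%d\n", bfqd.busy_in_flight_queues,
	       bfqd.const_seeky_busy_in_flight_queues);	/* busy=2 const_seeky=1 */

	bfq_update_in_flight_counters(&bfqd, &seeky, -1);
	printf("busy=%d const_seeky=%d\n", bfqd.busy_in_flight_queues,
	       bfqd.const_seeky_busy_in_flight_queues);	/* busy=1 const_seeky=0 */
	return 0;
}
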
@@ -38,7 +38,9 @@
* Even better for latency, BFQ explicitly privileges the I/O of two
* classes of time-sensitive applications: interactive and soft
* real-time. This feature enables BFQ to provide applications in
- * these classes with a very low latency.
+ * these classes with a very low latency. Finally, BFQ also features
+ * additional heuristics for preserving both a low latency and a high
+ * throughput on NCQ-capable, rotational or flash-based devices.
*
* With respect to the version of BFQ presented in [1], and in the
* papers cited therein, this implementation adds a hierarchical
@@ -1278,9 +1280,15 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqd->busy_queues--;
- if (!bfqq->dispatched)
+ if (!bfqq->dispatched) {
bfq_weights_tree_remove(bfqd, &bfqq->entity,
&bfqd->queue_weights_tree);
+ if (!blk_queue_nonrot(bfqd->queue)) {
+ bfqd->busy_in_flight_queues--;
+ if (bfq_bfqq_constantly_seeky(bfqq))
+ bfqd->const_seeky_busy_in_flight_queues--;
+ }
+ }
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues--;
@@ -1303,9 +1311,16 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_mark_bfqq_busy(bfqq);
bfqd->busy_queues++;
- if (!bfqq->dispatched && bfqq->wr_coeff == 1)
- bfq_weights_tree_add(bfqd, &bfqq->entity,
- &bfqd->queue_weights_tree);
+ if (!bfqq->dispatched) {
+ if (bfqq->wr_coeff == 1)
+ bfq_weights_tree_add(bfqd, &bfqq->entity,
+ &bfqd->queue_weights_tree);
+ if (!blk_queue_nonrot(bfqd->queue)) {
+ bfqd->busy_in_flight_queues++;
+ if (bfq_bfqq_constantly_seeky(bfqq))
+ bfqd->const_seeky_busy_in_flight_queues++;
+ }
+ }
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues++;
}
@@ -4277,8 +4292,12 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
bfqq->service_from_backlogged += bfqq->entity.service;
- if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT)
+ if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
+ !bfq_bfqq_constantly_seeky(bfqq)) {
bfq_mark_bfqq_constantly_seeky(bfqq);
+ if (!blk_queue_nonrot(bfqd->queue))
+ bfqd->const_seeky_busy_in_flight_queues++;
+ }
if (reason == BFQ_BFQQ_TOO_IDLE &&
bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
@@ -4402,26 +4421,23 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
{
struct bfq_data *bfqd = bfqq->bfqd;
bool idling_boosts_thr, idling_boosts_thr_without_issues,
+ all_queues_seeky, on_hdd_and_not_all_queues_seeky,
+ idling_needed_for_service_guarantees,
asymmetric_scenario;
/*
* The next variable takes into account the cases where idling
* boosts the throughput.
*
- * The value of the variable is computed considering that
- * idling is usually beneficial for the throughput if:
+ * The value of the variable is computed considering, first, that
+ * idling is virtually always beneficial for the throughput if:
* (a) the device is not NCQ-capable, or
* (b) regardless of the presence of NCQ, the device is rotational
- * and the request pattern for bfqq is I/O-bound (possible
- * throughput losses caused by granting idling to seeky queues
- * are mitigated by the fact that, in all scenarios where
- * boosting throughput is the best thing to do, i.e., in all
- * symmetric scenarios, only a minimal idle time is allowed to
- * seeky queues).
+ * and the request pattern for bfqq is I/O-bound and sequential.
*
* Secondly, and in contrast to the above item (b), idling an
* NCQ-capable flash-based device would not boost the
- * throughput even with intense I/O; rather it would lower
+ * throughput even with sequential I/O; rather it would lower
* the throughput in proportion to how fast the device
* is. Accordingly, the next variable is true if any of the
* above conditions (a) and (b) is true, and, in particular,
@@ -4429,7 +4445,8 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
* device.
*/
idling_boosts_thr = !bfqd->hw_tag ||
- (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq));
+ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
+ bfq_bfqq_idle_window(bfqq));
/*
* The value of the next variable,
@@ -4470,36 +4487,87 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
bfqd->wr_busy_queues == 0;
/*
- * There is then a case where idling must be performed not for
- * throughput concerns, but to preserve service guarantees. To
- * introduce it, we can note that allowing the drive to
- * enqueue more than one request at a time, and hence
- * delegating de facto final scheduling decisions to the
- * drive's internal scheduler, causes loss of control on the
- * actual request service order. In particular, the critical
- * situation is when requests from different processes happens
- * to be present, at the same time, in the internal queue(s)
- * of the drive. In such a situation, the drive, by deciding
- * the service order of the internally-queued requests, does
- * determine also the actual throughput distribution among
- * these processes. But the drive typically has no notion or
- * concern about per-process throughput distribution, and
- * makes its decisions only on a per-request basis. Therefore,
- * the service distribution enforced by the drive's internal
- * scheduler is likely to coincide with the desired
- * device-throughput distribution only in a completely
- * symmetric scenario where: (i) each of these processes must
- * get the same throughput as the others; (ii) all these
- * processes have the same I/O pattern (either sequential or
- * random). In fact, in such a scenario, the drive will tend
- * to treat the requests of each of these processes in about
- * the same way as the requests of the others, and thus to
- * provide each of these processes with about the same
- * throughput (which is exactly the desired throughput
- * distribution). In contrast, in any asymmetric scenario,
- * device idling is certainly needed to guarantee that bfqq
- * receives its assigned fraction of the device throughput
- * (see [1] for details).
+ * There are then two cases where idling must be performed not
+ * for throughput concerns, but to preserve service
+ * guarantees. In the description of these cases, we say, for
+ * short, that a queue is sequential/random if the process
+ * associated to the queue issues sequential/random requests
+ * (in the second case the queue may be tagged as seeky or
+ * even constantly_seeky).
+ *
+ * To introduce the first case, we note that, since
+ * bfq_bfqq_idle_window(bfqq) is false if the device is
+ * NCQ-capable and bfqq is random (see
+ * bfq_update_idle_window()), then, from the above two
+ * assignments it follows that
+ * idling_boosts_thr_without_issues is false if the device is
+ * NCQ-capable and bfqq is random. Therefore, for this case,
+ * device idling would never be allowed if we used just
+ * idling_boosts_thr_without_issues to decide whether to allow
+ * it. And, beneficially, this would imply that throughput
+ * would always be boosted also with random I/O on NCQ-capable
+ * HDDs.
+ *
+ * But we must be careful on this point, to avoid an unfair
+ * treatment for bfqq. In fact, because of the same above
+ * assignments, idling_boosts_thr_without_issues is, on the
+ * other hand, true if 1) the device is an HDD and bfqq is
+ * sequential, and 2) there are no busy weight-raised
+ * queues. As a consequence, if we used just
+ * idling_boosts_thr_without_issues to decide whether to idle
+ * the device, then with an HDD we might easily bump into a
+ * scenario where queues that are sequential and I/O-bound
+ * would enjoy idling, whereas random queues would not. The
+ * latter might then get a low share of the device throughput,
+ * simply because the former would get many requests served
+ * after being set as in service, while the latter would not.
+ *
+ * To address this issue, we start by setting to true a
+ * sentinel variable, on_hdd_and_not_all_queues_seeky, if the
+ * device is rotational and not all queues with pending or
+ * in-flight requests are constantly seeky (i.e., there are
+ * active sequential queues, and bfqq might then be mistreated
+ * if it does not enjoy idling because it is random).
+ */
+ all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) &&
+ bfqd->busy_in_flight_queues ==
+ bfqd->const_seeky_busy_in_flight_queues;
+
+ on_hdd_and_not_all_queues_seeky =
+ !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky;
+
+ /*
+ * To introduce the second case where idling needs to be
+ * performed to preserve service guarantees, we can note that
+ * allowing the drive to enqueue more than one request at a
+ * time, and hence delegating de facto final scheduling
+ * decisions to the drive's internal scheduler, causes loss of
+ * control on the actual request service order. In particular,
+ * the critical situation is when requests from different
+ * processes happen to be present, at the same time, in the
+ * internal queue(s) of the drive. In such a situation, the
+ * drive, by deciding the service order of the
+ * internally-queued requests, does determine also the actual
+ * throughput distribution among these processes. But the
+ * drive typically has no notion or concern about per-process
+ * throughput distribution, and makes its decisions only on a
+ * per-request basis. Therefore, the service distribution
+ * enforced by the drive's internal scheduler is likely to
+ * coincide with the desired device-throughput distribution
+ * only in a completely symmetric scenario where:
+ * (i) each of these processes must get the same throughput as
+ * the others;
+ * (ii) all these processes have the same I/O pattern
+ * (either sequential or random).
+ * In fact, in such a scenario, the drive will tend to treat
+ * the requests of each of these processes in about the same
+ * way as the requests of the others, and thus to provide
+ * each of these processes with about the same throughput
+ * (which is exactly the desired throughput distribution). In
+ * contrast, in any asymmetric scenario, device idling is
+ * certainly needed to guarantee that bfqq receives its
+ * assigned fraction of the device throughput (see [1] for
+ * details).
*
* We address this issue by controlling, actually, only the
* symmetry sub-condition (i), i.e., provided that
@@ -4509,9 +4577,10 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
* is allowed, and the device tends to be prevented from
* queueing many requests, possibly of several processes. The
* reason for not controlling also sub-condition (ii) is that,
- * first, in the case of an HDD, idling is always performed
- * for I/O-bound processes (see the comments on
- * idling_boosts_thr above). Secondly, in the case of a
+ * first, in the case of an HDD, the asymmetry in terms of
+ * types of I/O patterns is already taken into account in the
+ * above sentinel variable
+ * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a
* flash-based device, we prefer however to privilege
* throughput (and idling lowers throughput for this type of
* devices), for the following reasons:
@@ -4551,6 +4620,13 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
!bfq_symmetric_scenario(bfqd);
/*
+ * Combining the two cases above, we can now establish when
+ * idling is actually needed to preserve service guarantees.
+ */
+ idling_needed_for_service_guarantees =
+ (on_hdd_and_not_all_queues_seeky || asymmetric_scenario);
+
+ /*
* We have now all the components we need to compute the return
* value of the function, which is true only if both the following
* conditions hold:
@@ -4559,7 +4635,8 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
* is necessary to preserve service guarantees.
*/
return bfq_bfqq_sync(bfqq) &&
- (idling_boosts_thr_without_issues || asymmetric_scenario);
+ (idling_boosts_thr_without_issues ||
+ idling_needed_for_service_guarantees);
}
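
As a compact recap of the decision built up across the hunks above, the sketch below restates the return-value logic of bfq_bfqq_may_idle() as plain boolean arithmetic. The parameters stand in for the kernel-side predicates and counters, and asymmetric_scenario is taken as an input rather than recomputed; this is an illustration under those assumptions, not the kernel function itself.

/*
 * Standalone restatement of the return-value logic above; booleans and
 * counters stand in for the kernel-side predicates.
 */
#include <stdbool.h>
#include <stdio.h>

static bool may_idle_sketch(bool sync,
			    bool idling_boosts_thr_without_issues,
			    bool rotational,
			    bool bfqq_constantly_seeky,
			    int busy_in_flight_queues,
			    int const_seeky_busy_in_flight_queues,
			    bool asymmetric_scenario)
{
	/* first case: every busy queue with in-flight requests, including
	 * the in-service one, is constantly seeky */
	bool all_queues_seeky = bfqq_constantly_seeky &&
		busy_in_flight_queues == const_seeky_busy_in_flight_queues;

	bool on_hdd_and_not_all_queues_seeky =
		rotational && !all_queues_seeky;

	/* idling preserves service guarantees in either case */
	bool idling_needed_for_service_guarantees =
		on_hdd_and_not_all_queues_seeky || asymmetric_scenario;

	return sync && (idling_boosts_thr_without_issues ||
			idling_needed_for_service_guarantees);
}

int main(void)
{
	/* HDD, random in-service queue, but only two of the three busy
	 * in-flight queues are constantly seeky: idling is granted so the
	 * random queue is not penalized relative to the sequential one. */
	printf("%d\n", may_idle_sketch(true, false, true, true, 3, 2, false));
	return 0;
}
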
/*
@@ -5316,8 +5393,11 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_update_io_thinktime(bfqd, bic);
bfq_update_io_seektime(bfqd, bfqq, rq);
- if (!BFQQ_SEEKY(bfqq))
+ if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {
bfq_clear_bfqq_constantly_seeky(bfqq);
+ if (!blk_queue_nonrot(bfqd->queue))
+ bfqd->const_seeky_busy_in_flight_queues--;
+ }
if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
!BFQQ_SEEKY(bfqq))
bfq_update_idle_window(bfqd, bfqq, bic);
@@ -5477,9 +5557,15 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
rq_io_start_time_ns(rq), rq->cmd_flags);
#endif
- if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq))
+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
bfq_weights_tree_remove(bfqd, &bfqq->entity,
&bfqd->queue_weights_tree);
+ if (!blk_queue_nonrot(bfqd->queue)) {
+ bfqd->busy_in_flight_queues--;
+ if (bfq_bfqq_constantly_seeky(bfqq))
+ bfqd->const_seeky_busy_in_flight_queues--;
+ }
+ }
if (sync) {
bfqd->sync_flight--;
@@ -5926,6 +6012,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
* video.
*/
bfqd->wr_busy_queues = 0;
+ bfqd->busy_in_flight_queues = 0;
+ bfqd->const_seeky_busy_in_flight_queues = 0;
/*
* Begin by assuming, optimistically, that the device peak rate is
@@ -6302,7 +6390,7 @@ static int __init bfq_init(void)
if (ret)
goto err_pol_unreg;
- pr_info("BFQ I/O-scheduler: v6");
+ pr_info("BFQ I/O-scheduler: v7r3");
return 0;