[RFC,21/22] block, bfq: boost the throughput with random I/O on NCQ-capable HDDs

Message ID 1454364778-25179-22-git-send-email-paolo.valente@linaro.org
State New

Commit Message

Paolo Valente Feb. 1, 2016, 10:12 p.m. UTC
This patch is basically the counterpart, for NCQ-capable rotational
devices, of the previous patch. Whereas the previous patch disables
device idling on flash-based devices for any workload, this patch
disables idling on rotational devices only for random I/O. More
precisely, idling is disabled only for constantly-seeky queues, as
these are the only queues for which disabling idling boosts the
throughput on NCQ-capable rotational devices.
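
For reference, a queue gets flagged as constantly seeky upon
expiration. The following lines, condensed from the bfq_bfqq_expire()
hunk below, show both the marking logic and the bookkeeping that this
patch adds for rotational devices:

	if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
	    !bfq_bfqq_constantly_seeky(bfqq)) {
		bfq_mark_bfqq_constantly_seeky(bfqq);
		if (!blk_queue_nonrot(bfqd->queue))
			bfqd->const_seeky_busy_in_flight_queues++;
	}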

To avoid breaking service guarantees, idling is disabled for
NCQ-enabled rotational devices and constantly-seeky queues only when
the same symmetry conditions considered in the previous patches hold,
plus one additional condition. The latter stems from the fact that
this patch disables idling only for constantly-seeky queues: if
idling were disabled for a constantly-seeky queue while some other,
non-constantly-seeky queue had pending requests, then the second
queue would get more requests served, once set in service, than the
first. This differentiated treatment would cause a deviation from the
desired throughput distribution, i.e., from the distribution
corresponding to the weights assigned to processes and groups of
processes. For this reason, the additional condition for disabling
idling for a constantly-seeky queue is that all queues with pending
or in-flight requests be constantly seeky.
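
In terms of the code below, this additional condition boils down to
comparing two counters introduced by this patch, and the final
decision combines the result with the symmetry check of the previous
patches (condensed from the bfq_bfqq_may_idle() hunk below):

	all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) &&
			   bfqd->busy_in_flight_queues ==
			   bfqd->const_seeky_busy_in_flight_queues;

	on_hdd_and_not_all_queues_seeky =
		!blk_queue_nonrot(bfqd->queue) && !all_queues_seeky;

	idling_needed_for_service_guarantees =
		(on_hdd_and_not_all_queues_seeky || asymmetric_scenario);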

Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>

---
 block/bfq.h         |  27 ++++++++
 block/cfq-iosched.c | 194 ++++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 168 insertions(+), 53 deletions(-)

-- 
1.9.1

Patch

diff --git a/block/bfq.h b/block/bfq.h
index 63ebba0..b808242 100644
--- a/block/bfq.h
+++ b/block/bfq.h
@@ -366,6 +366,31 @@  enum bfq_device_speed {
  *                     details).
  * @busy_queues: number of bfq_queues containing requests (including the
  *		 queue in service, even if it is idling).
+ * @busy_in_flight_queues: number of @bfq_queues containing pending or
+ *                         in-flight requests, plus the @bfq_queue in
+ *                         service, even if idle but waiting for the
+ *                         possible arrival of its next sync request. This
+ *                         field is updated only if the device is rotational,
+ *                         but used only if the device is also NCQ-capable.
+ *                         The field is updated also for non-NCQ-capable
+ *                         rotational devices because @hw_tag may get set
+ *                         only after busy_in_flight_queues has already
+ *                         needed to be incremented one or more times.
+ *                         Handling this possibility explicitly, so as to
+ *                         avoid unbalanced increments and decrements,
+ *                         would imply more overhead than just updating
+ *                         busy_in_flight_queues regardless of the value
+ *                         of @hw_tag.
+ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues
+ *                                     (that is, seeky queues that expired
+ *                                     for budget timeout at least once)
+ *                                     containing pending or in-flight
+ *                                     requests, including the in-service
+ *                                     @bfq_queue if constantly seeky. This
+ *                                     field is updated only if the device
+ *                                     is rotational, but used only if the
+ *                                     device is also NCQ-capable (see the
+ *                                     comments to @busy_in_flight_queues).
  * @wr_busy_queues: number of weight-raised busy @bfq_queues.
  * @queued: number of queued requests.
  * @rq_in_driver: number of requests dispatched and waiting for completion.
@@ -449,6 +474,8 @@  struct bfq_data {
 	struct rb_root group_weights_tree;
 
 	int busy_queues;
+	int busy_in_flight_queues;
+	int const_seeky_busy_in_flight_queues;
 	int wr_busy_queues;
 	int queued;
 	int rq_in_driver;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index b8ee88c..28ffa97 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -38,7 +38,9 @@ 
  * Even better for latency, BFQ explicitly privileges the I/O of two
  * classes of time-sensitive applications: interactive and soft
  * real-time. This feature enables BFQ to provide applications in
- * these classes with a very low latency.
+ * these classes with a very low latency. Finally, BFQ also features
+ * additional heuristics for preserving both a low latency and a high
+ * throughput on NCQ-capable, rotational or flash-based devices.
  *
  * With respect to the version of BFQ presented in [1], and in the
  * papers cited therein, this implementation adds a hierarchical
@@ -1278,9 +1280,15 @@  static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	bfqd->busy_queues--;
 
-	if (!bfqq->dispatched)
+	if (!bfqq->dispatched) {
 		bfq_weights_tree_remove(bfqd, &bfqq->entity,
 					&bfqd->queue_weights_tree);
+		if (!blk_queue_nonrot(bfqd->queue)) {
+			bfqd->busy_in_flight_queues--;
+			if (bfq_bfqq_constantly_seeky(bfqq))
+				bfqd->const_seeky_busy_in_flight_queues--;
+		}
+	}
 	if (bfqq->wr_coeff > 1)
 		bfqd->wr_busy_queues--;
 
@@ -1303,9 +1311,16 @@  static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	bfq_mark_bfqq_busy(bfqq);
 	bfqd->busy_queues++;
 
-	if (!bfqq->dispatched && bfqq->wr_coeff == 1)
-		bfq_weights_tree_add(bfqd, &bfqq->entity,
-				     &bfqd->queue_weights_tree);
+	if (!bfqq->dispatched) {
+		if (bfqq->wr_coeff == 1)
+			bfq_weights_tree_add(bfqd, &bfqq->entity,
+					     &bfqd->queue_weights_tree);
+		if (!blk_queue_nonrot(bfqd->queue)) {
+			bfqd->busy_in_flight_queues++;
+			if (bfq_bfqq_constantly_seeky(bfqq))
+				bfqd->const_seeky_busy_in_flight_queues++;
+		}
+	}
 	if (bfqq->wr_coeff > 1)
 		bfqd->wr_busy_queues++;
 }
@@ -4277,8 +4292,12 @@  static void bfq_bfqq_expire(struct bfq_data *bfqd,
 
 	bfqq->service_from_backlogged += bfqq->entity.service;
 
-	if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT)
+	if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
+	    !bfq_bfqq_constantly_seeky(bfqq)) {
 		bfq_mark_bfqq_constantly_seeky(bfqq);
+		if (!blk_queue_nonrot(bfqd->queue))
+			bfqd->const_seeky_busy_in_flight_queues++;
+	}
 
 	if (reason == BFQ_BFQQ_TOO_IDLE &&
 	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
@@ -4402,26 +4421,23 @@  static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 {
 	struct bfq_data *bfqd = bfqq->bfqd;
 	bool idling_boosts_thr, idling_boosts_thr_without_issues,
+		all_queues_seeky, on_hdd_and_not_all_queues_seeky,
+		idling_needed_for_service_guarantees,
 		asymmetric_scenario;
 
 	/*
 	 * The next variable takes into account the cases where idling
 	 * boosts the throughput.
 	 *
-	 * The value of the variable is computed considering that
-	 * idling is usually beneficial for the throughput if:
+	 * The value of the variable is computed considering, first, that
+	 * idling is virtually always beneficial for the throughput if:
 	 * (a) the device is not NCQ-capable, or
 	 * (b) regardless of the presence of NCQ, the device is rotational
-	 *     and the request pattern for bfqq is I/O-bound (possible
-	 *     throughput losses caused by granting idling to seeky queues
-	 *     are mitigated by the fact that, in all scenarios where
-	 *     boosting throughput is the best thing to do, i.e., in all
-	 *     symmetric scenarios, only a minimal idle time is allowed to
-	 *     seeky queues).
+	 *     and the request pattern for bfqq is I/O-bound and sequential.
 	 *
 	 * Secondly, and in contrast to the above item (b), idling an
 	 * NCQ-capable flash-based device would not boost the
-	 * throughput even with intense I/O; rather it would lower
+	 * throughput even with sequential I/O; rather it would lower
 	 * the throughput in proportion to how fast the device
 	 * is. Accordingly, the next variable is true if any of the
 	 * above conditions (a) and (b) is true, and, in particular,
@@ -4429,7 +4445,8 @@  static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 * device.
 	 */
 	idling_boosts_thr = !bfqd->hw_tag ||
-		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq));
+		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
+		 bfq_bfqq_idle_window(bfqq));
 
 	/*
 	 * The value of the next variable,
@@ -4470,36 +4487,87 @@  static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 		bfqd->wr_busy_queues == 0;
 
 	/*
-	 * There is then a case where idling must be performed not for
-	 * throughput concerns, but to preserve service guarantees. To
-	 * introduce it, we can note that allowing the drive to
-	 * enqueue more than one request at a time, and hence
-	 * delegating de facto final scheduling decisions to the
-	 * drive's internal scheduler, causes loss of control on the
-	 * actual request service order. In particular, the critical
-	 * situation is when requests from different processes happens
-	 * to be present, at the same time, in the internal queue(s)
-	 * of the drive. In such a situation, the drive, by deciding
-	 * the service order of the internally-queued requests, does
-	 * determine also the actual throughput distribution among
-	 * these processes. But the drive typically has no notion or
-	 * concern about per-process throughput distribution, and
-	 * makes its decisions only on a per-request basis. Therefore,
-	 * the service distribution enforced by the drive's internal
-	 * scheduler is likely to coincide with the desired
-	 * device-throughput distribution only in a completely
-	 * symmetric scenario where: (i) each of these processes must
-	 * get the same throughput as the others; (ii) all these
-	 * processes have the same I/O pattern (either sequential or
-	 * random).  In fact, in such a scenario, the drive will tend
-	 * to treat the requests of each of these processes in about
-	 * the same way as the requests of the others, and thus to
-	 * provide each of these processes with about the same
-	 * throughput (which is exactly the desired throughput
-	 * distribution). In contrast, in any asymmetric scenario,
-	 * device idling is certainly needed to guarantee that bfqq
-	 * receives its assigned fraction of the device throughput
-	 * (see [1] for details).
+	 * There are then two cases where idling must be performed not
+	 * for throughput concerns, but to preserve service
+	 * guarantees. In the description of these cases, we say, for
+	 * short, that a queue is sequential/random if the process
+	 * associated to the queue issues sequential/random requests
+	 * (in the second case the queue may be tagged as seeky or
+	 * even constantly_seeky).
+	 *
+	 * To introduce the first case, we note that, since
+	 * bfq_bfqq_idle_window(bfqq) is false if the device is
+	 * NCQ-capable and bfqq is random (see
+	 * bfq_update_idle_window()), then, from the above two
+	 * assignments it follows that
+	 * idling_boosts_thr_without_issues is false if the device is
+	 * NCQ-capable and bfqq is random. Therefore, for this case,
+	 * device idling would never be allowed if we used just
+	 * idling_boosts_thr_without_issues to decide whether to allow
+	 * it. And, beneficially, this would imply that throughput
+	 * would always be boosted also with random I/O on NCQ-capable
+	 * HDDs.
+	 *
+	 * But we must be careful on this point, to avoid an unfair
+	 * treatment for bfqq. In fact, because of the same above
+	 * assignments, idling_boosts_thr_without_issues is, on the
+	 * other hand, true if 1) the device is an HDD and bfqq is
+	 * sequential, and 2) there are no busy weight-raised
+	 * queues. As a consequence, if we used just
+	 * idling_boosts_thr_without_issues to decide whether to idle
+	 * the device, then with an HDD we might easily bump into a
+	 * scenario where queues that are sequential and I/O-bound
+	 * would enjoy idling, whereas random queues would not. The
+	 * latter might then get a low share of the device throughput,
+	 * simply because the former would get many requests served
+	 * after being set as in service, while the latter would not.
+	 *
+	 * To address this issue, we start by setting to true a
+	 * sentinel variable, on_hdd_and_not_all_queues_seeky, if the
+	 * device is rotational and not all queues with pending or
+	 * in-flight requests are constantly seeky (i.e., there are
+	 * active sequential queues, and bfqq might then be mistreated
+	 * if it does not enjoy idling because it is random).
+	 */
+	all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) &&
+			   bfqd->busy_in_flight_queues ==
+			   bfqd->const_seeky_busy_in_flight_queues;
+
+	on_hdd_and_not_all_queues_seeky =
+		!blk_queue_nonrot(bfqd->queue) && !all_queues_seeky;
+
+	/*
+	 * To introduce the second case where idling needs to be
+	 * performed to preserve service guarantees, we can note that
+	 * allowing the drive to enqueue more than one request at a
+	 * time, and hence delegating de facto final scheduling
+	 * decisions to the drive's internal scheduler, causes loss of
+	 * control on the actual request service order. In particular,
+	 * the critical situation is when requests from different
+	 * processes happen to be present, at the same time, in the
+	 * internal queue(s) of the drive. In such a situation, the
+	 * drive, by deciding the service order of the
+	 * internally-queued requests, does determine also the actual
+	 * throughput distribution among these processes. But the
+	 * drive typically has no notion or concern about per-process
+	 * throughput distribution, and makes its decisions only on a
+	 * per-request basis. Therefore, the service distribution
+	 * enforced by the drive's internal scheduler is likely to
+	 * coincide with the desired device-throughput distribution
+	 * only in a completely symmetric scenario where:
+	 * (i)  each of these processes must get the same throughput as
+	 *      the others;
+	 * (ii) all these processes have the same I/O pattern
+	 *	(either sequential or random).
+	 * In fact, in such a scenario, the drive will tend to treat
+	 * the requests of each of these processes in about the same
+	 * way as the requests of the others, and thus to provide
+	 * each of these processes with about the same throughput
+	 * (which is exactly the desired throughput distribution). In
+	 * contrast, in any asymmetric scenario, device idling is
+	 * certainly needed to guarantee that bfqq receives its
+	 * assigned fraction of the device throughput (see [1] for
+	 * details).
 	 *
 	 * We address this issue by controlling, actually, only the
 	 * symmetry sub-condition (i), i.e., provided that
@@ -4509,9 +4577,10 @@  static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 * is allowed, and the device tends to be prevented from
 	 * queueing many requests, possibly of several processes. The
 	 * reason for not controlling also sub-condition (ii) is that,
-	 * first, in the case of an HDD, idling is always performed
-	 * for I/O-bound processes (see the comments on
-	 * idling_boosts_thr above). Secondly, in the case of a
+	 * first, in the case of an HDD, the asymmetry in terms of
+	 * types of I/O patterns is already taken into account in the
+	 * above sentinel variable
+	 * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a
 	 * flash-based device, we prefer however to privilege
 	 * throughput (and idling lowers throughput for this type of
 	 * devices), for the following reasons:
@@ -4551,6 +4620,13 @@  static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 		!bfq_symmetric_scenario(bfqd);
 
 	/*
+	 * Combining the two cases above, we can now establish when
+	 * idling is actually needed to preserve service guarantees.
+	 */
+	idling_needed_for_service_guarantees =
+		(on_hdd_and_not_all_queues_seeky || asymmetric_scenario);
+
+	/*
 	 * We have now all the components we need to compute the return
 	 * value of the function, which is true only if both the following
 	 * conditions hold:
@@ -4559,7 +4635,8 @@  static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 *    is necessary to preserve service guarantees.
 	 */
 	return bfq_bfqq_sync(bfqq) &&
-		(idling_boosts_thr_without_issues || asymmetric_scenario);
+		(idling_boosts_thr_without_issues ||
+		 idling_needed_for_service_guarantees);
 }
 
 /*
@@ -5316,8 +5393,11 @@  static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	bfq_update_io_thinktime(bfqd, bic);
 	bfq_update_io_seektime(bfqd, bfqq, rq);
-	if (!BFQQ_SEEKY(bfqq))
+	if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {
 		bfq_clear_bfqq_constantly_seeky(bfqq);
+		if (!blk_queue_nonrot(bfqd->queue))
+			bfqd->const_seeky_busy_in_flight_queues--;
+	}
 	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
 	    !BFQQ_SEEKY(bfqq))
 		bfq_update_idle_window(bfqd, bfqq, bic);
@@ -5477,9 +5557,15 @@  static void bfq_completed_request(struct request_queue *q, struct request *rq)
 				     rq_io_start_time_ns(rq), rq->cmd_flags);
 #endif
 
-	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq))
+	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
 		bfq_weights_tree_remove(bfqd, &bfqq->entity,
 					&bfqd->queue_weights_tree);
+		if (!blk_queue_nonrot(bfqd->queue)) {
+			bfqd->busy_in_flight_queues--;
+			if (bfq_bfqq_constantly_seeky(bfqq))
+				bfqd->const_seeky_busy_in_flight_queues--;
+		}
+	}
 
 	if (sync) {
 		bfqd->sync_flight--;
@@ -5926,6 +6012,8 @@  static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 					      * video.
 					      */
 	bfqd->wr_busy_queues = 0;
+	bfqd->busy_in_flight_queues = 0;
+	bfqd->const_seeky_busy_in_flight_queues = 0;
 
 	/*
 	 * Begin by assuming, optimistically, that the device peak rate is
@@ -6302,7 +6390,7 @@  static int __init bfq_init(void)
 	if (ret)
 		goto err_pol_unreg;
 
-	pr_info("BFQ I/O-scheduler: v6");
+	pr_info("BFQ I/O-scheduler: v7r3");
 
 	return 0;