@@ -28,8 +28,8 @@ config IOSCHED_CFQ
The CFQ I/O scheduler, now internally replaced by BFQ, tries
to distribute bandwidth among all processes according to
their weights, regardless of the device parameters and with
- any workload. It also tries to guarantee a low latency to
- interactive applications.
+ any workload. It also tries to guarantee a low latency to
+ interactive and soft real-time applications.
This is the default I/O scheduler.
@@ -35,9 +35,10 @@
* guarantee a low latency to non-I/O bound processes (the latter
* often belong to time-sensitive applications).
*
- * Even better for latency, BFQ explicitly privileges the I/O of
- * interactive applications, thereby providing these applications with
- * a very low latency.
+ * Even better for latency, BFQ explicitly privileges the I/O of two
+ * classes of time-sensitive applications: interactive and soft
+ * real-time. This feature enables BFQ to provide applications in
+ * these classes with a very low latency.
*
* With respect to the version of BFQ presented in [1], and in the
* papers cited therein, this implementation adds a hierarchical
@@ -289,6 +290,14 @@ struct bfq_queue {
/* current maximum weight-raising time for this queue */
unsigned long wr_cur_max_time;
/*
+ * Minimum time instant such that, only if a new request is
+ * enqueued after this time instant in an idle @bfq_queue with
+ * no outstanding requests, then the task associated with the
+ * queue it is deemed as soft real-time (see the comments on
+ * the function bfq_bfqq_softrt_next_start())
+ */
+ unsigned long soft_rt_next_start;
+ /*
* Start time of the current weight-raising period if
* the @bfq-queue is being weight-raised, otherwise
* finish time of the last weight-raising period.
@@ -296,6 +305,16 @@ struct bfq_queue {
unsigned long last_wr_start_finish;
/* factor by which the weight of this queue is multiplied */
unsigned int wr_coeff;
+ /*
+ * Time of the last transition of the @bfq_queue from idle to
+ * backlogged.
+ */
+ unsigned long last_idle_bklogged;
+ /*
+ * Cumulative service received from the @bfq_queue since the
+ * last transition from idle to backlogged.
+ */
+ unsigned long service_from_backlogged;
};
/**
@@ -451,6 +470,9 @@ struct bfq_data {
unsigned int bfq_wr_coeff;
/* maximum duration of a weight-raising period (jiffies) */
unsigned int bfq_wr_max_time;
+
+ /* Maximum weight-raising duration for soft real-time processes */
+ unsigned int bfq_wr_rt_max_time;
/*
* Minimum idle period after which weight-raising may be
* reactivated for a queue (in jiffies).
@@ -462,6 +484,9 @@ struct bfq_data {
* queue (in jiffies).
*/
unsigned long bfq_wr_min_inter_arr_async;
+
+ /* Max service-rate for a soft real-time queue, in sectors/sec */
+ unsigned int bfq_wr_max_softrt_rate;
/*
* Cached value of the product R*T, used for computing the
* maximum duration of weight raising automatically.
@@ -490,6 +515,10 @@ enum bfqq_state_flags {
* having consumed at most 2/10 of
* its budget
*/
+ BFQ_BFQQ_FLAG_softrt_update, /*
+ * may need softrt-next-start
+ * update
+ */
};
#define BFQ_BFQQ_FNS(name) \
@@ -514,6 +543,7 @@ BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(softrt_update);
#undef BFQ_BFQQ_FNS
/* Logging facilities. */
@@ -3510,13 +3540,17 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
unsigned int old_wr_coeff,
bool wr_or_deserves_wr,
- bool interactive)
+ bool interactive,
+ bool soft_rt)
{
if (old_wr_coeff == 1 && wr_or_deserves_wr) {
/* start a weight-raising period */
bfqq->wr_coeff = bfqd->bfq_wr_coeff;
- /* update wr duration */
- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ if (interactive) /* update wr duration */
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ else
+ bfqq->wr_cur_max_time =
+ bfqd->bfq_wr_rt_max_time;
/*
* If needed, further reduce budget to make sure it is
@@ -3531,8 +3565,61 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
bfqq->entity.budget,
2 * bfq_min_budget(bfqd));
} else if (old_wr_coeff > 1) {
- /* update wr duration */
- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ if (interactive) /* update wr duration */
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ else if (time_before(
+ bfqq->last_wr_start_finish +
+ bfqq->wr_cur_max_time,
+ jiffies +
+ bfqd->bfq_wr_rt_max_time) &&
+ soft_rt) {
+ /*
+ * The remaining weight-raising time is lower
+ * than bfqd->bfq_wr_rt_max_time, which means
+ * that the application is enjoying weight
+ * raising either because deemed soft-rt in
+ * the near past, or because deemed interactive
+ * a long ago.
+ * In both cases, resetting now the current
+ * remaining weight-raising time for the
+ * application to the weight-raising duration
+ * for soft rt applications would not cause any
+ * latency increase for the application (as the
+ * new duration would be higher than the
+ * remaining time).
+ *
+ * In addition, the application is now meeting
+ * the requirements for being deemed soft rt.
+ * In the end we can correctly and safely
+ * (re)charge the weight-raising duration for
+ * the application with the weight-raising
+ * duration for soft rt applications.
+ *
+ * In particular, doing this recharge now, i.e.,
+ * before the weight-raising period for the
+ * application finishes, reduces the probability
+ * of the following negative scenario:
+ * 1) the weight of a soft rt application is
+ * raised at startup (as for any newly
+ * created application),
+ * 2) since the application is not interactive,
+ * at a certain time weight-raising is
+ * stopped for the application,
+ * 3) at that time the application happens to
+ * still have pending requests, and hence
+ * is destined to not have a chance to be
+ * deemed soft rt before these requests are
+ * completed (see the comments to the
+ * function bfq_bfqq_softrt_next_start()
+ * for details on soft rt detection),
+ * 4) these pending requests experience a high
+ * latency because the application is not
+ * weight-raised while they are pending.
+ */
+ bfqq->last_wr_start_finish = jiffies;
+ bfqq->wr_cur_max_time =
+ bfqd->bfq_wr_rt_max_time;
+ }
}
}
@@ -3551,7 +3638,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
struct request *rq,
bool *interactive)
{
- bool wr_or_deserves_wr, bfqq_wants_to_preempt,
+ bool soft_rt, wr_or_deserves_wr, bfqq_wants_to_preempt,
idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
/*
* See the comments on
@@ -3568,12 +3655,14 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
/*
* bfqq deserves to be weight-raised if:
* - it is sync,
- * - it has been idle for enough time.
+ * - it has been idle for enough time or is soft real-time.
*/
+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+ time_is_before_jiffies(bfqq->soft_rt_next_start);
*interactive = idle_for_long_time;
wr_or_deserves_wr = bfqd->low_latency &&
(bfqq->wr_coeff > 1 ||
- (bfq_bfqq_sync(bfqq) && *interactive));
+ (bfq_bfqq_sync(bfqq) && (*interactive || soft_rt)));
/*
* Using the last flag, update budget and check whether bfqq
@@ -3598,12 +3687,17 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
old_wr_coeff,
wr_or_deserves_wr,
- *interactive);
+ *interactive,
+ soft_rt);
if (old_wr_coeff != bfqq->wr_coeff)
bfqq->entity.prio_changed = 1;
}
+ bfqq->last_idle_bklogged = jiffies;
+ bfqq->service_from_backlogged = 0;
+ bfq_clear_bfqq_softrt_update(bfqq);
+
bfq_add_bfqq_busy(bfqd, bfqq);
/*
@@ -3680,6 +3774,12 @@ static void bfq_add_request(struct request *rq)
* period must start or restart (this case is considered
* separately because it is not detected by the above
* conditions, if bfqq is already weight-raised)
+ *
+ * last_wr_start_finish has to be updated also if bfqq is soft
+ * real-time, because the weight-raising period is constantly
+ * restarted on idle-to-busy transitions for these queues, but
+ * this is already done in bfq_bfqq_handle_idle_busy_switch if
+ * needed.
*/
if (bfqd->low_latency &&
(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
@@ -3920,11 +4020,17 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,
static void bfq_set_budget_timeout(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
+ unsigned int timeout_coeff;
+
+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
+ timeout_coeff = 1;
+ else
+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
+
bfqd->last_budget_start = ktime_get();
bfqq->budget_timeout = jiffies +
- bfqd->bfq_timeout *
- (bfqq->entity.weight / bfqq->entity.orig_weight);
+ bfqd->bfq_timeout * timeout_coeff;
}
static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
@@ -3937,6 +4043,37 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
+ if (bfqq->wr_coeff > 1 &&
+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+ time_is_before_jiffies(bfqq->budget_timeout)) {
+ /*
+ * For soft real-time queues, move the start
+ * of the weight-raising period forward by the
+ * time the queue has not received any
+ * service. Otherwise, a relatively long
+ * service delay is likely to cause the
+ * weight-raising period of the queue to end,
+ * because of the short duration of the
+ * weight-raising period of a soft real-time
+ * queue. It is worth noting that this move
+ * is not so dangerous for the other queues,
+ * because soft real-time queues are not
+ * greedy.
+ *
+ * To not add a further variable, we use the
+ * overloaded field budget_timeout to
+ * determine for how long the queue has not
+ * received service, i.e., how much time has
+ * elapsed since the queue expired. However,
+ * this is a little imprecise, because
+ * budget_timeout is set to jiffies if bfqq
+ * not only expires, but also remains with no
+ * request.
+ */
+ bfqq->last_wr_start_finish += jiffies -
+ bfqq->budget_timeout;
+ }
+
bfq_set_budget_timeout(bfqd, bfqq);
bfq_log_bfqq(bfqd, bfqq,
"set_in_service_queue, cur-budget = %d",
@@ -4319,6 +4456,13 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
*/
if (delta_usecs > 20000) {
bool fully_sequential = bfqq->seek_history == 0;
+ /*
+ * Soft real-time queues are not good candidates for
+ * evaluating bw, as they are likely to be slow even
+ * if sequential.
+ */
+ bool non_soft_rt = bfqq->wr_coeff == 1 ||
+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time;
bool consumed_large_budget =
reason == BFQ_BFQQ_BUDGET_EXHAUSTED &&
bfqq->entity.budget >= bfqd->bfq_max_budget * 2 / 3;
@@ -4327,7 +4471,7 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
consumed_large_budget;
if (bw > bfqd->peak_rate ||
- (bfq_bfqq_sync(bfqq) && fully_sequential &&
+ (bfq_bfqq_sync(bfqq) && fully_sequential && non_soft_rt &&
served_for_long_time)) {
/*
* To smooth oscillations use a low-pass filter with
@@ -4388,6 +4532,76 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
}
/*
+ * To be deemed as soft real-time, an application must meet two
+ * requirements. First, the application must not require an average
+ * bandwidth higher than the approximate bandwidth required to playback or
+ * record a compressed high-definition video.
+ * The next function is invoked on the completion of the last request of a
+ * batch, to compute the next-start time instant, soft_rt_next_start, such
+ * that, if the next request of the application does not arrive before
+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
+ *
+ * The second requirement is that the request pattern of the application is
+ * isochronous, i.e., that, after issuing a request or a batch of requests,
+ * the application stops issuing new requests until all its pending requests
+ * have been completed. After that, the application may issue a new batch,
+ * and so on.
+ * For this reason the next function is invoked to compute
+ * soft_rt_next_start only for applications that meet this requirement,
+ * whereas soft_rt_next_start is set to infinity for applications that do
+ * not.
+ *
+ * Unfortunately, even a greedy application may happen to behave in an
+ * isochronous way if the CPU load is high. In fact, the application may
+ * stop issuing requests while the CPUs are busy serving other processes,
+ * then restart, then stop again for a while, and so on. In addition, if
+ * the disk achieves a low enough throughput with the request pattern
+ * issued by the application (e.g., because the request pattern is random
+ * and/or the device is slow), then the application may meet the above
+ * bandwidth requirement too. To prevent such a greedy application to be
+ * deemed as soft real-time, a further rule is used in the computation of
+ * soft_rt_next_start: soft_rt_next_start must be higher than the current
+ * time plus the maximum time for which the arrival of a request is waited
+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
+ * This filters out greedy applications, as the latter issue instead their
+ * next request as soon as possible after the last one has been completed
+ * (in contrast, when a batch of requests is completed, a soft real-time
+ * application spends some time processing data).
+ *
+ * Unfortunately, the last filter may easily generate false positives if
+ * only bfqd->bfq_slice_idle is used as a reference time interval and one
+ * or both the following cases occur:
+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
+ * HZ=100.
+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
+ * for a while, then suddenly 'jump' by several units to recover the lost
+ * increments. This seems to happen, e.g., inside virtual machines.
+ * To address this issue, we do not use as a reference time interval just
+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
+ * particular we add the minimum number of jiffies for which the filter
+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual
+ * machines.
+ */
+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ return max(bfqq->last_idle_bklogged +
+ HZ * bfqq->service_from_backlogged /
+ bfqd->bfq_wr_max_softrt_rate,
+ jiffies + bfqq->bfqd->bfq_slice_idle + 4);
+}
+
+/*
+ * Return the farthest future time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_greatest_from_now(void)
+{
+ return jiffies + MAX_JIFFY_OFFSET;
+}
+
+/*
* Return the farthest past time instant according to jiffies
* macros.
*/
@@ -4438,6 +4652,17 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason, &delta);
/*
+ * Increase service_from_backlogged before next statement,
+ * because the possible next invocation of
+ * bfq_bfqq_charge_time would likely inflate
+ * entity->service. In contrast, service_from_backlogged must
+ * contain real service, to enable the soft real-time
+ * heuristic to correctly compute the bandwidth consumed by
+ * bfqq.
+ */
+ bfqq->service_from_backlogged += entity->service;
+
+ /*
* As above explained, charge slow (typically seeky) and
* timed-out queues with the time and not the service
* received, to favor sequential workloads.
@@ -4465,6 +4690,48 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
if (bfqd->low_latency && bfqq->wr_coeff == 1)
bfqq->last_wr_start_finish = jiffies;
+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
+ RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ /*
+ * If we get here, and there are no outstanding
+ * requests, then the request pattern is isochronous
+ * (see the comments on the function
+ * bfq_bfqq_softrt_next_start()). Thus we can compute
+ * soft_rt_next_start. If, instead, the queue still
+ * has outstanding requests, then we have to wait for
+ * the completion of all the outstanding requests to
+ * discover whether the request pattern is actually
+ * isochronous.
+ */
+ if (bfqq->dispatched == 0)
+ bfqq->soft_rt_next_start =
+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
+ else {
+ /*
+ * The application is still waiting for the
+ * completion of one or more requests:
+ * prevent it from possibly being incorrectly
+ * deemed as soft real-time by setting its
+ * soft_rt_next_start to infinity. In fact,
+ * without this assignment, the application
+ * would be incorrectly deemed as soft
+ * real-time if:
+ * 1) it issued a new request before the
+ * completion of all its in-flight
+ * requests, and
+ * 2) at that time, its soft_rt_next_start
+ * happened to be in the past.
+ */
+ bfqq->soft_rt_next_start =
+ bfq_greatest_from_now();
+ /*
+ * Schedule an update of soft_rt_next_start to when
+ * the task may be discovered to be isochronous.
+ */
+ bfq_mark_bfqq_softrt_update(bfqq);
+ }
+ }
+
bfq_log_bfqq(bfqd, bfqq,
"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
@@ -5092,6 +5359,12 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq->wr_coeff = 1;
bfqq->last_wr_start_finish = bfq_smallest_from_now();
+ /*
+ * Set to the value for which bfqq will not be deemed as
+ * soft rt when it becomes backlogged.
+ */
+ bfqq->soft_rt_next_start = bfq_greatest_from_now();
+
/* first request is almost certainly seeky */
bfqq->seek_history = 1;
}
@@ -5381,6 +5654,20 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
RQ_BIC(rq)->ttime.last_end_request = jiffies;
/*
+ * If we are waiting to discover whether the request pattern
+ * of the task associated with the queue is actually
+ * isochronous, and both requisites for this condition to hold
+ * are now satisfied, then compute soft_rt_next_start (see the
+ * comments on the function bfq_bfqq_softrt_next_start()). We
+ * schedule this delayed check when bfqq expires, if it still
+ * has in-flight requests.
+ */
+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
+ RB_EMPTY_ROOT(&bfqq->sort_list))
+ bfqq->soft_rt_next_start =
+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
+
+ /*
* If this is the in-service queue, check if it needs to be expired,
* or if we want to idle in case it has no pending requests.
*/
@@ -5728,9 +6015,16 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
* Trade-off between responsiveness and fairness.
*/
bfqd->bfq_wr_coeff = 30;
+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
bfqd->bfq_wr_max_time = 0;
bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
+ bfqd->bfq_wr_max_softrt_rate = 7000; /*
+ * Approximate rate required
+ * to playback or record a
+ * high-definition compressed
+ * video.
+ */
/*
* Begin by assuming, optimistically, that the device is a
@@ -5852,9 +6146,11 @@ SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
1);
+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -5884,10 +6180,14 @@ STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
+ 1);
STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
INT_MAX, 1);
STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
&bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
+ INT_MAX, 0);
#undef STORE_FUNCTION
static ssize_t bfq_fake_lat_show(struct elevator_queue *e, char *page)
@@ -6013,8 +6313,10 @@ static struct elv_fs_entry bfq_attrs[] = {
BFQ_ATTR(low_latency),
BFQ_ATTR(wr_coeff),
BFQ_ATTR(wr_max_time),
+ BFQ_ATTR(wr_rt_max_time),
BFQ_ATTR(wr_min_idle_time),
BFQ_ATTR(wr_min_inter_arr_async),
+ BFQ_ATTR(wr_max_softrt_rate),
BFQ_ATTR(weights),
BFQ_FAKE_LAT_ATTR(target_latency),
__ATTR_NULL
@@ -6118,7 +6420,7 @@ static int __init bfq_init(void)
if (ret)
goto err_pol_unreg;
- pr_info("BFQ I/O-scheduler: v1");
+ pr_info("BFQ I/O-scheduler: v2");
return 0;