@@ -240,6 +240,23 @@ config SCSI_SCAN_ASYNC
Note that this setting also affects whether resuming from
system suspend will be performed asynchronously.
+config SCSI_ERROR_UEVENT
+ bool "Enable SCSI error uevent reporting"
+ depends on SCSI
+ default n
+ help
+ If enabled, the SCSI mid-layer will emit kobject uevents when
+ SCSI commands fail or are retried by the error handler. These
+ events provide details about the error, including the command
+ result (host byte), sense key (SK), additional sense code (ASC),
+ additional sense code qualifier (ASCQ), and whether the command
+ is being retried (SDEV_ERROR_RETRY=1) or finally failing because
+ of error in completion path (SDEV_ERROR_RETRY=0).
+
+ Events are filtered for certain conditions (e.g., DID_BAD_TARGET,
+ DID_IMM_RETRY) and are also ratelimited per device to prevent
+ excessive noise.
+
config SCSI_PROTO_TEST
tristate "scsi_proto.h unit tests" if !KUNIT_ALL_TESTS
depends on SCSI && KUNIT
@@ -2227,6 +2227,9 @@ void scsi_eh_flush_done_q(struct list_head *done_q)
scmd_printk(KERN_INFO, scmd,
"%s: flush retry cmd\n",
current->comm));
+#ifdef CONFIG_SCSI_ERROR_UEVENT
+ scsi_emit_error(scmd);
+#endif
scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
blk_mq_kick_requeue_list(sdev->request_queue);
} else {
@@ -2595,3 +2598,66 @@ bool scsi_get_sense_info_fld(const u8 *sense_buffer, int sb_len,
}
}
EXPORT_SYMBOL(scsi_get_sense_info_fld);
+
+/**
+ * scsi_emit_error - Emit an error event.
+ *
+ * May be called from scsi_softirq_done(). Cannot sleep.
+ *
+ * @cmd: the scsi command
+ */
+void scsi_emit_error(struct scsi_cmnd *cmd)
+{
+ struct scsi_sense_hdr sshdr;
+ u8 result, sk, asc, ascq;
+ int sense_valid;
+ int retry;
+
+ if (unlikely(cmd->result)) {
+ result = host_byte(cmd->result);
+ if (result == DID_BAD_TARGET ||
+ result == DID_IMM_RETRY)
+ /*
+ * Do not report an error upstream, the situation is
+ * not stable. Will report once the IO really fails.
+ */
+ return;
+ sk = 0;
+ asc = 0;
+ ascq = 0;
+
+ if (result == DID_OK) {
+ sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
+ if (!sense_valid) {
+ /*
+ * With libata, this happens when the error
+ * handler is called but the error causes are
+ * not identified yet.
+ */
+ return;
+ }
+
+ sk = sshdr.sense_key;
+ asc = sshdr.asc;
+ ascq = sshdr.ascq;
+
+ /*
+ * asc == 0 && ascq == 0x1D means "ATA pass through
+ * information available"; this is not an error, but
+ * rather the driver returning some data.
+ */
+ if (sk == NO_SENSE ||
+ (sk == RECOVERED_ERROR &&
+ asc == 0x0 &&
+ ascq == 0x1D)) {
+ return;
+ }
+ }
+
+ retry = (!scsi_noretry_cmd(cmd) &&
+ cmd->retries > 0 &&
+ cmd->retries <= cmd->allowed);
+ sdev_evt_send_error(cmd->device, GFP_ATOMIC,
+ retry, result, sk, asc, ascq);
+ }
+}
@@ -1029,6 +1029,9 @@ static int scsi_io_completion_nz_result(struct scsi_cmnd *cmd, int result,
result = 0;
*blk_statp = BLK_STS_OK;
}
+#ifdef CONFIG_SCSI_ERROR_UEVENT
+ scsi_emit_error(cmd);
+#endif
return result;
}
@@ -1544,6 +1547,9 @@ static void scsi_complete(struct request *rq)
scsi_finish_command(cmd);
break;
case NEEDS_RETRY:
+#ifdef CONFIG_SCSI_ERROR_UEVENT
+ scsi_emit_error(cmd);
+#endif
scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
break;
case ADD_TO_MLQUEUE:
@@ -2559,43 +2565,77 @@ EXPORT_SYMBOL(scsi_device_set_state);
*/
static void scsi_evt_emit(struct scsi_device *sdev, struct scsi_event *evt)
{
- int idx = 0;
- char *envp[3];
+ struct kobj_uevent_env *env;
+
+ env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
+ if (!env)
+ return;
switch (evt->evt_type) {
case SDEV_EVT_MEDIA_CHANGE:
- envp[idx++] = "SDEV_MEDIA_CHANGE=1";
+ if (add_uevent_var(env, "SDEV_MEDIA_CHANGE=1"))
+ goto exit;
break;
case SDEV_EVT_INQUIRY_CHANGE_REPORTED:
scsi_rescan_device(sdev);
- envp[idx++] = "SDEV_UA=INQUIRY_DATA_HAS_CHANGED";
+ if (add_uevent_var(env, "SDEV_UA=INQUIRY_DATA_HAS_CHANGED"))
+ goto exit;
break;
case SDEV_EVT_CAPACITY_CHANGE_REPORTED:
- envp[idx++] = "SDEV_UA=CAPACITY_DATA_HAS_CHANGED";
+ if (add_uevent_var(env, "SDEV_UA=CAPACITY_DATA_HAS_CHANGED"))
+ goto exit;
break;
case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED:
- envp[idx++] = "SDEV_UA=THIN_PROVISIONING_SOFT_THRESHOLD_REACHED";
+ if (add_uevent_var(env,
+ "SDEV_UA=THIN_PROVISIONING_SOFT_THRESHOLD_REACHED"))
+ goto exit;
break;
case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED:
- envp[idx++] = "SDEV_UA=MODE_PARAMETERS_CHANGED";
+ if (add_uevent_var(env, "SDEV_UA=MODE_PARAMETERS_CHANGED"))
+ goto exit;
break;
case SDEV_EVT_LUN_CHANGE_REPORTED:
- envp[idx++] = "SDEV_UA=REPORTED_LUNS_DATA_HAS_CHANGED";
+ if (add_uevent_var(env,
+ "SDEV_UA=REPORTED_LUNS_DATA_HAS_CHANGED"))
+ goto exit;
break;
case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED:
- envp[idx++] = "SDEV_UA=ASYMMETRIC_ACCESS_STATE_CHANGED";
+ if (add_uevent_var(env,
+ "SDEV_UA=ASYMMETRIC_ACCESS_STATE_CHANGED"))
+ goto exit;
+ break;
+ case SDEV_EVT_ERROR:
+ if (add_uevent_var(env, "SDEV_ERROR=1"))
+ goto exit;
+ if (add_uevent_var(env, "SDEV_ERROR_RETRY=%u",
+ evt->error_evt.retry))
+ goto exit;
+ if (add_uevent_var(env, "SDEV_ERROR_RESULT=%u",
+ evt->error_evt.result))
+ goto exit;
+ if (add_uevent_var(env, "SDEV_ERROR_SK=%u",
+ evt->error_evt.sk))
+ goto exit;
+ if (add_uevent_var(env, "SDEV_ERROR_ASC=%u",
+ evt->error_evt.asc))
+ goto exit;
+ if (add_uevent_var(env, "SDEV_ERROR_ASCQ=%u",
+ evt->error_evt.ascq))
+ goto exit;
break;
case SDEV_EVT_POWER_ON_RESET_OCCURRED:
- envp[idx++] = "SDEV_UA=POWER_ON_RESET_OCCURRED";
+ if (add_uevent_var(env, "SDEV_UA=POWER_ON_RESET_OCCURRED"))
+ goto exit;
break;
default:
/* do nothing */
break;
}
- envp[idx++] = NULL;
+ kobject_uevent_env(&sdev->sdev_gendev.kobj, KOBJ_CHANGE, env->envp);
- kobject_uevent_env(&sdev->sdev_gendev.kobj, KOBJ_CHANGE, envp);
+exit:
+ kfree(env);
}
/**
@@ -2693,6 +2733,7 @@ struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type,
case SDEV_EVT_LUN_CHANGE_REPORTED:
case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED:
case SDEV_EVT_POWER_ON_RESET_OCCURRED:
+ case SDEV_EVT_ERROR:
default:
/* do nothing */
break;
@@ -2724,6 +2765,41 @@ void sdev_evt_send_simple(struct scsi_device *sdev,
}
EXPORT_SYMBOL_GPL(sdev_evt_send_simple);
+/**
+ * sdev_evt_send_error - send error event to uevent thread
+ * @sdev: scsi_device event occurred on
+ * @gfpflags: GFP flags for allocation
+ * @retry: if non-zero, command failed, will retry, otherwise final attempt
+ * @result: host byte of result
+ * @sk: sense key
+ * @asc: additional sense code
+ * @ascq: additional sense code qualifier
+ *
+ * Assert scsi device error event asynchronously.
+ */
+void sdev_evt_send_error(struct scsi_device *sdev, gfp_t gfpflags,
+ u8 retry, u8 result, u8 sk, u8 asc, u8 ascq)
+{
+ struct scsi_event *evt;
+
+ evt = sdev_evt_alloc(SDEV_EVT_ERROR, gfpflags);
+ if (!evt) {
+ sdev_printk(KERN_ERR, sdev, "error event eaten due to OOM: retry=%u result=%u sk=%u asc=%u ascq=%u\n",
+ retry, result, sk, asc, ascq);
+ return;
+ }
+
+ evt->error_evt.retry = retry;
+ evt->error_evt.result = result;
+ evt->error_evt.sk = sk;
+ evt->error_evt.asc = asc;
+ evt->error_evt.ascq = ascq;
+
+ if (___ratelimit(&sdev->error_ratelimit, "SCSI error"))
+ sdev_evt_send(sdev, evt);
+}
+EXPORT_SYMBOL_GPL(sdev_evt_send_error);
+
/**
* scsi_device_quiesce - Block all commands except power management.
* @sdev: scsi device to quiesce.
@@ -101,6 +101,7 @@ int scsi_eh_get_sense(struct list_head *work_q,
struct list_head *done_q);
bool scsi_noretry_cmd(struct scsi_cmnd *scmd);
void scsi_eh_done(struct scsi_cmnd *scmd);
+extern void scsi_emit_error(struct scsi_cmnd *scmd);
/* scsi_lib.c */
extern void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd);
@@ -310,6 +310,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
mutex_init(&sdev->inquiry_mutex);
INIT_WORK(&sdev->event_work, scsi_evt_thread);
INIT_WORK(&sdev->requeue_work, scsi_requeue_run_queue);
+ ratelimit_state_init(&sdev->error_ratelimit, 5 * HZ, 10);
sdev->sdev_gendev.parent = get_device(&starget->dev);
sdev->sdev_target = starget;
@@ -363,6 +364,9 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
scsi_change_queue_depth(sdev, depth);
+ /* All devices support error events */
+ set_bit(SDEV_EVT_ERROR, sdev->supported_events);
+
scsi_sysfs_device_initialize(sdev);
if (shost->hostt->sdev_init) {
@@ -1025,6 +1025,7 @@ DECLARE_EVT(capacity_change_reported, CAPACITY_CHANGE_REPORTED)
DECLARE_EVT(soft_threshold_reached, SOFT_THRESHOLD_REACHED_REPORTED)
DECLARE_EVT(mode_parameter_change_reported, MODE_PARAMETER_CHANGE_REPORTED)
DECLARE_EVT(lun_change_reported, LUN_CHANGE_REPORTED)
+DECLARE_EVT(error, ERROR)
static ssize_t
sdev_store_queue_depth(struct device *dev, struct device_attribute *attr,
@@ -1345,6 +1346,7 @@ static struct attribute *scsi_sdev_attrs[] = {
REF_EVT(soft_threshold_reached),
REF_EVT(mode_parameter_change_reported),
REF_EVT(lun_change_reported),
+ REF_EVT(error),
NULL
};
@@ -64,7 +64,8 @@ enum scsi_scan_mode {
};
enum scsi_device_event {
- SDEV_EVT_MEDIA_CHANGE = 1, /* media has changed */
+ SDEV_EVT_MEDIA_CHANGE = 0, /* media has changed */
+ SDEV_EVT_ERROR = 1, /* command failed */
SDEV_EVT_INQUIRY_CHANGE_REPORTED, /* 3F 03 UA reported */
SDEV_EVT_CAPACITY_CHANGE_REPORTED, /* 2A 09 UA reported */
SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED, /* 38 07 UA reported */
@@ -79,6 +80,19 @@ enum scsi_device_event {
SDEV_EVT_MAXBITS = SDEV_EVT_LAST + 1
};
+struct scsi_error_event {
+ /* A retry event */
+ u8 retry;
+ /* Host byte */
+ u8 result;
+ /* Sense key */
+ u8 sk;
+ /* Additional sense code */
+ u8 asc;
+ /* Additional sense code qualifier */
+ u8 ascq;
+};
+
struct scsi_event {
enum scsi_device_event evt_type;
struct list_head node;
@@ -86,6 +100,9 @@ struct scsi_event {
/* put union of data structures, for non-simple event types,
* here
*/
+ union {
+ struct scsi_error_event error_evt;
+ };
};
/**
@@ -269,6 +286,7 @@ struct scsi_device {
sdev_dev;
struct work_struct requeue_work;
+ struct ratelimit_state error_ratelimit;
struct scsi_device_handler *handler;
void *handler_data;
@@ -474,6 +492,8 @@ extern struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type,
extern void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt);
extern void sdev_evt_send_simple(struct scsi_device *sdev,
enum scsi_device_event evt_type, gfp_t gfpflags);
+extern void sdev_evt_send_error(struct scsi_device *sdev, gfp_t gfpflags,
+ u8 retry, u8 result, u8 sk, u8 asc, u8 ascq);
extern int scsi_device_quiesce(struct scsi_device *sdev);
extern void scsi_device_resume(struct scsi_device *sdev);
extern void scsi_target_quiesce(struct scsi_target *);
Adds a new function scsi_emit_error(), called when a command is placed back on the command queue for retry by the error handler, or when a command completes. The scsi_emit_error() function uses the kobject_uevent_env() mechanism to emit a KOBJ_CHANGE event with details about the SCSI error. The event has the following key/value pairs set in the environment: - SDEV_ERROR: Always set to 1, to distinguish disk errors from media change events, which have SDEV_MEDIA_CHANGE=1 - SDEV_ERROR_RETRY: 0 if this is an error in the completion path in scsi_io_completion(), 1 if the command is going to be placed back on the queue - SDEV_ERROR_RESULT: Host byte of result code - SDEV_ERROR_SK: Sense key - SDEV_ERROR_ASC: Additional sense code - SDEV_ERROR_ASCQ: Additional sense code qualifier Error events are filtered under specific conditions: - DID_BAD_TARGET: Avoids uevent storms if a removed device is repeatedly accessed and is not responding. - DID_IMM_RETRY: Avoids reporting temporary transport errors where the command is immediately retried. This is a temporary error that should not be forwarded to userspace. scsi_emit_error() can be invoked from vairous atomic contexts, where sleeping is not permitted, so GFP_ATOMIC is used to ensure allocations can safely occur in these contexts. A per-device ratelimiting mechanism is added to prevent flooding userspace during persistent error conditions. The ratelimit is checked before scheduling the event work. Signed-off-by: Salomon Dushimirimana <salomondush@google.com> --- drivers/scsi/Kconfig | 17 +++++++ drivers/scsi/scsi_error.c | 66 ++++++++++++++++++++++++ drivers/scsi/scsi_lib.c | 100 ++++++++++++++++++++++++++++++++----- drivers/scsi/scsi_priv.h | 1 + drivers/scsi/scsi_scan.c | 4 ++ drivers/scsi/scsi_sysfs.c | 2 + include/scsi/scsi_device.h | 22 +++++++- 7 files changed, 199 insertions(+), 13 deletions(-)