diff mbox series

scsi: Add SCSI error events, sent as kobject uevents by mid-layer

Message ID 20250422181729.2792081-1-salomondush@google.com
State New
Headers show
Series scsi: Add SCSI error events, sent as kobject uevents by mid-layer | expand

Commit Message

Salomon Dushimirimana April 22, 2025, 6:17 p.m. UTC
Adds a new function scsi_emit_error(), called when a command is placed
back on the command queue for retry by the error handler, or when a
command completes.

The scsi_emit_error() function uses the kobject_uevent_env() mechanism
to emit a KOBJ_CHANGE event with details about the SCSI error.

The event has the following key/value pairs set in the environment:
- SDEV_ERROR: Always set to 1, to distinguish disk errors
  from media change events, which have SDEV_MEDIA_CHANGE=1
- SDEV_ERROR_RETRY: 0 if this is an error in the completion
  path in scsi_io_completion(), 1 if the command is going to be
  placed back on the queue
- SDEV_ERROR_RESULT: Host byte of result code
- SDEV_ERROR_SK: Sense key
- SDEV_ERROR_ASC: Additional sense code
- SDEV_ERROR_ASCQ: Additional sense code qualifier

Error events are filtered under specific conditions:
- DID_BAD_TARGET: Avoids uevent storms if a removed device is repeatedly
  accessed and is not responding.
- DID_IMM_RETRY: Avoids reporting temporary transport errors where the
  command is immediately retried. This is a temporary error that should
  not be forwarded to userspace.

scsi_emit_error() can be invoked from vairous atomic contexts, where
sleeping is not permitted, so GFP_ATOMIC is used to ensure allocations
can safely occur in these contexts.

A per-device ratelimiting mechanism is added to prevent flooding
userspace during persistent error conditions. The ratelimit is checked
before scheduling the event work.

Signed-off-by: Salomon Dushimirimana <salomondush@google.com>
---
 drivers/scsi/Kconfig       |  17 +++++++
 drivers/scsi/scsi_error.c  |  66 ++++++++++++++++++++++++
 drivers/scsi/scsi_lib.c    | 100 ++++++++++++++++++++++++++++++++-----
 drivers/scsi/scsi_priv.h   |   1 +
 drivers/scsi/scsi_scan.c   |   4 ++
 drivers/scsi/scsi_sysfs.c  |   2 +
 include/scsi/scsi_device.h |  22 +++++++-
 7 files changed, 199 insertions(+), 13 deletions(-)

Comments

Salomon Dushimirimana May 13, 2025, 7 p.m. UTC | #1
Hi James and Martin

I wanted to follow up on this patch! It's a decently sized patch, so
it might take some time, but I'd love to hear your thoughts and
address any feedback!!

Thank you,
Salomon Dushimirimana

Salomon Dushimirimana


On Tue, Apr 22, 2025 at 11:17 AM Salomon Dushimirimana
<salomondush@google.com> wrote:
>
> Adds a new function scsi_emit_error(), called when a command is placed
> back on the command queue for retry by the error handler, or when a
> command completes.
>
> The scsi_emit_error() function uses the kobject_uevent_env() mechanism
> to emit a KOBJ_CHANGE event with details about the SCSI error.
>
> The event has the following key/value pairs set in the environment:
> - SDEV_ERROR: Always set to 1, to distinguish disk errors
>   from media change events, which have SDEV_MEDIA_CHANGE=1
> - SDEV_ERROR_RETRY: 0 if this is an error in the completion
>   path in scsi_io_completion(), 1 if the command is going to be
>   placed back on the queue
> - SDEV_ERROR_RESULT: Host byte of result code
> - SDEV_ERROR_SK: Sense key
> - SDEV_ERROR_ASC: Additional sense code
> - SDEV_ERROR_ASCQ: Additional sense code qualifier
>
> Error events are filtered under specific conditions:
> - DID_BAD_TARGET: Avoids uevent storms if a removed device is repeatedly
>   accessed and is not responding.
> - DID_IMM_RETRY: Avoids reporting temporary transport errors where the
>   command is immediately retried. This is a temporary error that should
>   not be forwarded to userspace.
>
> scsi_emit_error() can be invoked from vairous atomic contexts, where
> sleeping is not permitted, so GFP_ATOMIC is used to ensure allocations
> can safely occur in these contexts.
>
> A per-device ratelimiting mechanism is added to prevent flooding
> userspace during persistent error conditions. The ratelimit is checked
> before scheduling the event work.
>
> Signed-off-by: Salomon Dushimirimana <salomondush@google.com>
> ---
>  drivers/scsi/Kconfig       |  17 +++++++
>  drivers/scsi/scsi_error.c  |  66 ++++++++++++++++++++++++
>  drivers/scsi/scsi_lib.c    | 100 ++++++++++++++++++++++++++++++++-----
>  drivers/scsi/scsi_priv.h   |   1 +
>  drivers/scsi/scsi_scan.c   |   4 ++
>  drivers/scsi/scsi_sysfs.c  |   2 +
>  include/scsi/scsi_device.h |  22 +++++++-
>  7 files changed, 199 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
> index 5a3c670aec27d..36a156ad6afd2 100644
> --- a/drivers/scsi/Kconfig
> +++ b/drivers/scsi/Kconfig
> @@ -240,6 +240,23 @@ config SCSI_SCAN_ASYNC
>           Note that this setting also affects whether resuming from
>           system suspend will be performed asynchronously.
>
> +config SCSI_ERROR_UEVENT
> +       bool "Enable SCSI error uevent reporting"
> +       depends on SCSI
> +       default n
> +       help
> +         If enabled, the SCSI mid-layer will emit kobject uevents when
> +         SCSI commands fail or are retried by the error handler. These
> +         events provide details about the error, including the command
> +         result (host byte), sense key (SK), additional sense code (ASC),
> +         additional sense code qualifier (ASCQ), and whether the command
> +         is being retried (SDEV_ERROR_RETRY=1) or finally failing because
> +         of error in completion path (SDEV_ERROR_RETRY=0).
> +
> +         Events are filtered for certain conditions (e.g., DID_BAD_TARGET,
> +         DID_IMM_RETRY) and are also ratelimited per device to prevent
> +         excessive noise.
> +
>  config SCSI_PROTO_TEST
>         tristate "scsi_proto.h unit tests" if !KUNIT_ALL_TESTS
>         depends on SCSI && KUNIT
> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
> index 376b8897ab90a..327a012f328ff 100644
> --- a/drivers/scsi/scsi_error.c
> +++ b/drivers/scsi/scsi_error.c
> @@ -2227,6 +2227,9 @@ void scsi_eh_flush_done_q(struct list_head *done_q)
>                                 scmd_printk(KERN_INFO, scmd,
>                                              "%s: flush retry cmd\n",
>                                              current->comm));
> +#ifdef CONFIG_SCSI_ERROR_UEVENT
> +                               scsi_emit_error(scmd);
> +#endif
>                                 scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
>                                 blk_mq_kick_requeue_list(sdev->request_queue);
>                 } else {
> @@ -2595,3 +2598,66 @@ bool scsi_get_sense_info_fld(const u8 *sense_buffer, int sb_len,
>         }
>  }
>  EXPORT_SYMBOL(scsi_get_sense_info_fld);
> +
> +/**
> + * scsi_emit_error - Emit an error event.
> + *
> + * May be called from scsi_softirq_done(). Cannot sleep.
> + *
> + * @cmd: the scsi command
> + */
> +void scsi_emit_error(struct scsi_cmnd *cmd)
> +{
> +       struct scsi_sense_hdr sshdr;
> +       u8 result, sk, asc, ascq;
> +       int sense_valid;
> +       int retry;
> +
> +       if (unlikely(cmd->result)) {
> +               result = host_byte(cmd->result);
> +               if (result == DID_BAD_TARGET ||
> +                   result == DID_IMM_RETRY)
> +                       /*
> +                        * Do not report an error upstream, the situation is
> +                        * not stable. Will report once the IO really fails.
> +                        */
> +                       return;
> +               sk = 0;
> +               asc = 0;
> +               ascq = 0;
> +
> +               if (result == DID_OK) {
> +                       sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
> +                       if (!sense_valid) {
> +                               /*
> +                                * With libata, this happens when the error
> +                                * handler is called but the error causes are
> +                                * not identified yet.
> +                                */
> +                               return;
> +                       }
> +
> +                       sk = sshdr.sense_key;
> +                       asc = sshdr.asc;
> +                       ascq = sshdr.ascq;
> +
> +                       /*
> +                        * asc == 0 && ascq == 0x1D means "ATA pass through
> +                        * information available"; this is not an error, but
> +                        * rather the driver returning some data.
> +                        */
> +                       if (sk == NO_SENSE ||
> +                           (sk == RECOVERED_ERROR &&
> +                            asc == 0x0 &&
> +                            ascq == 0x1D)) {
> +                               return;
> +                       }
> +               }
> +
> +               retry = (!scsi_noretry_cmd(cmd) &&
> +                        cmd->retries > 0 &&
> +                        cmd->retries <= cmd->allowed);
> +               sdev_evt_send_error(cmd->device, GFP_ATOMIC,
> +                                   retry, result, sk, asc, ascq);
> +       }
> +}
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 0d29470e86b0b..2a2fae00e9f1c 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1029,6 +1029,9 @@ static int scsi_io_completion_nz_result(struct scsi_cmnd *cmd, int result,
>                 result = 0;
>                 *blk_statp = BLK_STS_OK;
>         }
> +#ifdef CONFIG_SCSI_ERROR_UEVENT
> +       scsi_emit_error(cmd);
> +#endif
>         return result;
>  }
>
> @@ -1544,6 +1547,9 @@ static void scsi_complete(struct request *rq)
>                 scsi_finish_command(cmd);
>                 break;
>         case NEEDS_RETRY:
> +#ifdef CONFIG_SCSI_ERROR_UEVENT
> +               scsi_emit_error(cmd);
> +#endif
>                 scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
>                 break;
>         case ADD_TO_MLQUEUE:
> @@ -2559,43 +2565,77 @@ EXPORT_SYMBOL(scsi_device_set_state);
>   */
>  static void scsi_evt_emit(struct scsi_device *sdev, struct scsi_event *evt)
>  {
> -       int idx = 0;
> -       char *envp[3];
> +       struct kobj_uevent_env *env;
> +
> +       env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
> +       if (!env)
> +               return;
>
>         switch (evt->evt_type) {
>         case SDEV_EVT_MEDIA_CHANGE:
> -               envp[idx++] = "SDEV_MEDIA_CHANGE=1";
> +               if (add_uevent_var(env, "SDEV_MEDIA_CHANGE=1"))
> +                       goto exit;
>                 break;
>         case SDEV_EVT_INQUIRY_CHANGE_REPORTED:
>                 scsi_rescan_device(sdev);
> -               envp[idx++] = "SDEV_UA=INQUIRY_DATA_HAS_CHANGED";
> +               if (add_uevent_var(env, "SDEV_UA=INQUIRY_DATA_HAS_CHANGED"))
> +                       goto exit;
>                 break;
>         case SDEV_EVT_CAPACITY_CHANGE_REPORTED:
> -               envp[idx++] = "SDEV_UA=CAPACITY_DATA_HAS_CHANGED";
> +               if (add_uevent_var(env, "SDEV_UA=CAPACITY_DATA_HAS_CHANGED"))
> +                       goto exit;
>                 break;
>         case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED:
> -              envp[idx++] = "SDEV_UA=THIN_PROVISIONING_SOFT_THRESHOLD_REACHED";
> +               if (add_uevent_var(env,
> +                       "SDEV_UA=THIN_PROVISIONING_SOFT_THRESHOLD_REACHED"))
> +                       goto exit;
>                 break;
>         case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED:
> -               envp[idx++] = "SDEV_UA=MODE_PARAMETERS_CHANGED";
> +               if (add_uevent_var(env, "SDEV_UA=MODE_PARAMETERS_CHANGED"))
> +                       goto exit;
>                 break;
>         case SDEV_EVT_LUN_CHANGE_REPORTED:
> -               envp[idx++] = "SDEV_UA=REPORTED_LUNS_DATA_HAS_CHANGED";
> +               if (add_uevent_var(env,
> +                       "SDEV_UA=REPORTED_LUNS_DATA_HAS_CHANGED"))
> +                       goto exit;
>                 break;
>         case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED:
> -               envp[idx++] = "SDEV_UA=ASYMMETRIC_ACCESS_STATE_CHANGED";
> +               if (add_uevent_var(env,
> +                       "SDEV_UA=ASYMMETRIC_ACCESS_STATE_CHANGED"))
> +                       goto exit;
> +               break;
> +       case SDEV_EVT_ERROR:
> +               if (add_uevent_var(env, "SDEV_ERROR=1"))
> +                       goto exit;
> +               if (add_uevent_var(env, "SDEV_ERROR_RETRY=%u",
> +                                       evt->error_evt.retry))
> +                       goto exit;
> +               if (add_uevent_var(env, "SDEV_ERROR_RESULT=%u",
> +                                       evt->error_evt.result))
> +                       goto exit;
> +               if (add_uevent_var(env, "SDEV_ERROR_SK=%u",
> +                                       evt->error_evt.sk))
> +                       goto exit;
> +               if (add_uevent_var(env, "SDEV_ERROR_ASC=%u",
> +                                       evt->error_evt.asc))
> +                       goto exit;
> +               if (add_uevent_var(env, "SDEV_ERROR_ASCQ=%u",
> +                                       evt->error_evt.ascq))
> +                       goto exit;
>                 break;
>         case SDEV_EVT_POWER_ON_RESET_OCCURRED:
> -               envp[idx++] = "SDEV_UA=POWER_ON_RESET_OCCURRED";
> +               if (add_uevent_var(env, "SDEV_UA=POWER_ON_RESET_OCCURRED"))
> +                       goto exit;
>                 break;
>         default:
>                 /* do nothing */
>                 break;
>         }
>
> -       envp[idx++] = NULL;
> +       kobject_uevent_env(&sdev->sdev_gendev.kobj, KOBJ_CHANGE, env->envp);
>
> -       kobject_uevent_env(&sdev->sdev_gendev.kobj, KOBJ_CHANGE, envp);
> +exit:
> +       kfree(env);
>  }
>
>  /**
> @@ -2693,6 +2733,7 @@ struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type,
>         case SDEV_EVT_LUN_CHANGE_REPORTED:
>         case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED:
>         case SDEV_EVT_POWER_ON_RESET_OCCURRED:
> +       case SDEV_EVT_ERROR:
>         default:
>                 /* do nothing */
>                 break;
> @@ -2724,6 +2765,41 @@ void sdev_evt_send_simple(struct scsi_device *sdev,
>  }
>  EXPORT_SYMBOL_GPL(sdev_evt_send_simple);
>
> +/**
> + *     sdev_evt_send_error - send error event to uevent thread
> + *     @sdev: scsi_device event occurred on
> + *     @gfpflags: GFP flags for allocation
> + *     @retry: if non-zero, command failed, will retry, otherwise final attempt
> + *     @result: host byte of result
> + *     @sk: sense key
> + *     @asc: additional sense code
> + *     @ascq: additional sense code qualifier
> + *
> + *     Assert scsi device error event asynchronously.
> + */
> +void sdev_evt_send_error(struct scsi_device *sdev, gfp_t gfpflags,
> +                        u8 retry, u8 result, u8 sk, u8 asc, u8 ascq)
> +{
> +       struct scsi_event *evt;
> +
> +       evt = sdev_evt_alloc(SDEV_EVT_ERROR, gfpflags);
> +       if (!evt) {
> +               sdev_printk(KERN_ERR, sdev, "error event eaten due to OOM: retry=%u result=%u sk=%u asc=%u ascq=%u\n",
> +                           retry, result, sk, asc, ascq);
> +               return;
> +       }
> +
> +       evt->error_evt.retry = retry;
> +       evt->error_evt.result = result;
> +       evt->error_evt.sk = sk;
> +       evt->error_evt.asc = asc;
> +       evt->error_evt.ascq = ascq;
> +
> +       if (___ratelimit(&sdev->error_ratelimit, "SCSI error"))
> +               sdev_evt_send(sdev, evt);
> +}
> +EXPORT_SYMBOL_GPL(sdev_evt_send_error);
> +
>  /**
>   *     scsi_device_quiesce - Block all commands except power management.
>   *     @sdev:  scsi device to quiesce.
> diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
> index 9fc397a9ce7a4..8519b563e2feb 100644
> --- a/drivers/scsi/scsi_priv.h
> +++ b/drivers/scsi/scsi_priv.h
> @@ -101,6 +101,7 @@ int scsi_eh_get_sense(struct list_head *work_q,
>                       struct list_head *done_q);
>  bool scsi_noretry_cmd(struct scsi_cmnd *scmd);
>  void scsi_eh_done(struct scsi_cmnd *scmd);
> +extern void scsi_emit_error(struct scsi_cmnd *scmd);
>
>  /* scsi_lib.c */
>  extern void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd);
> diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
> index 4833b8fe251b8..5c311dfc501c3 100644
> --- a/drivers/scsi/scsi_scan.c
> +++ b/drivers/scsi/scsi_scan.c
> @@ -310,6 +310,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
>         mutex_init(&sdev->inquiry_mutex);
>         INIT_WORK(&sdev->event_work, scsi_evt_thread);
>         INIT_WORK(&sdev->requeue_work, scsi_requeue_run_queue);
> +       ratelimit_state_init(&sdev->error_ratelimit, 5 * HZ, 10);
>
>         sdev->sdev_gendev.parent = get_device(&starget->dev);
>         sdev->sdev_target = starget;
> @@ -363,6 +364,9 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
>
>         scsi_change_queue_depth(sdev, depth);
>
> +       /* All devices support error events */
> +       set_bit(SDEV_EVT_ERROR, sdev->supported_events);
> +
>         scsi_sysfs_device_initialize(sdev);
>
>         if (shost->hostt->sdev_init) {
> diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
> index d772258e29ad2..20a537490e9f2 100644
> --- a/drivers/scsi/scsi_sysfs.c
> +++ b/drivers/scsi/scsi_sysfs.c
> @@ -1025,6 +1025,7 @@ DECLARE_EVT(capacity_change_reported, CAPACITY_CHANGE_REPORTED)
>  DECLARE_EVT(soft_threshold_reached, SOFT_THRESHOLD_REACHED_REPORTED)
>  DECLARE_EVT(mode_parameter_change_reported, MODE_PARAMETER_CHANGE_REPORTED)
>  DECLARE_EVT(lun_change_reported, LUN_CHANGE_REPORTED)
> +DECLARE_EVT(error, ERROR)
>
>  static ssize_t
>  sdev_store_queue_depth(struct device *dev, struct device_attribute *attr,
> @@ -1345,6 +1346,7 @@ static struct attribute *scsi_sdev_attrs[] = {
>         REF_EVT(soft_threshold_reached),
>         REF_EVT(mode_parameter_change_reported),
>         REF_EVT(lun_change_reported),
> +       REF_EVT(error),
>         NULL
>  };
>
> diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
> index 68dd49947d041..5485d3b5853e2 100644
> --- a/include/scsi/scsi_device.h
> +++ b/include/scsi/scsi_device.h
> @@ -64,7 +64,8 @@ enum scsi_scan_mode {
>  };
>
>  enum scsi_device_event {
> -       SDEV_EVT_MEDIA_CHANGE   = 1,    /* media has changed */
> +       SDEV_EVT_MEDIA_CHANGE   = 0,    /* media has changed */
> +       SDEV_EVT_ERROR          = 1,    /* command failed */
>         SDEV_EVT_INQUIRY_CHANGE_REPORTED,               /* 3F 03  UA reported */
>         SDEV_EVT_CAPACITY_CHANGE_REPORTED,              /* 2A 09  UA reported */
>         SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED,       /* 38 07  UA reported */
> @@ -79,6 +80,19 @@ enum scsi_device_event {
>         SDEV_EVT_MAXBITS        = SDEV_EVT_LAST + 1
>  };
>
> +struct scsi_error_event {
> +       /* A retry event */
> +       u8 retry;
> +       /* Host byte */
> +       u8 result;
> +       /* Sense key */
> +       u8 sk;
> +       /* Additional sense code */
> +       u8 asc;
> +       /* Additional sense code qualifier */
> +       u8 ascq;
> +};
> +
>  struct scsi_event {
>         enum scsi_device_event  evt_type;
>         struct list_head        node;
> @@ -86,6 +100,9 @@ struct scsi_event {
>         /* put union of data structures, for non-simple event types,
>          * here
>          */
> +       union {
> +               struct scsi_error_event error_evt;
> +       };
>  };
>
>  /**
> @@ -269,6 +286,7 @@ struct scsi_device {
>                                 sdev_dev;
>
>         struct work_struct      requeue_work;
> +       struct ratelimit_state  error_ratelimit;
>
>         struct scsi_device_handler *handler;
>         void                    *handler_data;
> @@ -474,6 +492,8 @@ extern struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type,
>  extern void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt);
>  extern void sdev_evt_send_simple(struct scsi_device *sdev,
>                           enum scsi_device_event evt_type, gfp_t gfpflags);
> +extern void sdev_evt_send_error(struct scsi_device *sdev, gfp_t gfpflags,
> +       u8 retry, u8 result, u8 sk, u8 asc, u8 ascq);
>  extern int scsi_device_quiesce(struct scsi_device *sdev);
>  extern void scsi_device_resume(struct scsi_device *sdev);
>  extern void scsi_target_quiesce(struct scsi_target *);
> --
> 2.49.0.805.g082f7c87e0-goog
>
Bart Van Assche May 13, 2025, 7:39 p.m. UTC | #2
On 4/22/25 11:17 AM, Salomon Dushimirimana wrote:
> A per-device ratelimiting mechanism is added to prevent flooding
> userspace during persistent error conditions. The ratelimit is checked
> before scheduling the event work.

Isn't the above an argument to use ftrace instead of uevents to
send this information to user space?

Thanks,

Bart.
James Bottomley May 13, 2025, 8:27 p.m. UTC | #3
On Tue, 2025-05-13 at 12:00 -0700, Salomon Dushimirimana wrote:
> Hi James and Martin
> 
> I wanted to follow up on this patch! It's a decently sized patch, so
> it might take some time, but I'd love to hear your thoughts and
> address any feedback!!

I think the first fundamental question should be why is this a uevent?
It looks like what you're obtaining is really tracing information on
the retry and we could simply add it as another tracepoint in the
existing blktrace infrastructure SCSI already has.

Regards,

James
Salomon Dushimirimana May 15, 2025, 8:03 p.m. UTC | #4
Hi,

I agree with the recommended use of ftrace or blktrace for tracing.
However, our primary goal for using uevents was not merely for
collecting trace information. We are using uevents as a notification
mechanism for userspace workflows to determine repair workflows (swap
/ remove a failing device).

We are open to any feedback on other notification recommendations for
such use cases.




Salomon Dushimirimana

On Tue, May 13, 2025 at 1:27 PM James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
>
> On Tue, 2025-05-13 at 12:00 -0700, Salomon Dushimirimana wrote:
> > Hi James and Martin
> >
> > I wanted to follow up on this patch! It's a decently sized patch, so
> > it might take some time, but I'd love to hear your thoughts and
> > address any feedback!!
>
> I think the first fundamental question should be why is this a uevent?
> It looks like what you're obtaining is really tracing information on
> the retry and we could simply add it as another tracepoint in the
> existing blktrace infrastructure SCSI already has.
>
> Regards,
>
> James
>
James Bottomley May 15, 2025, 8:11 p.m. UTC | #5
On Thu, 2025-05-15 at 13:03 -0700, Salomon Dushimirimana wrote:
> Hi,
> 
> I agree with the recommended use of ftrace or blktrace for tracing.

Great; what made me think of tracing is that your event emits for every
error or retry which seemed like quite an overhead.  Conditioning it on
a config parameter really isn't useful to distributions, so using the
tracepoint system would solve both the quantity and the activation
problem.

> However, our primary goal for using uevents was not merely for
> collecting trace information. We are using uevents as a notification
> mechanism for userspace workflows to determine repair workflows (swap
> / remove a failing device).

If you're collecting stats for predictive failure, how is this proposed
active mechanism more effective than the passive one of simply using
the existing SMART monitor tools?

Regards,

James
diff mbox series

Patch

diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 5a3c670aec27d..36a156ad6afd2 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -240,6 +240,23 @@  config SCSI_SCAN_ASYNC
 	  Note that this setting also affects whether resuming from
 	  system suspend will be performed asynchronously.
 
+config SCSI_ERROR_UEVENT
+	bool "Enable SCSI error uevent reporting"
+	depends on SCSI
+	default n
+	help
+	  If enabled, the SCSI mid-layer will emit kobject uevents when
+	  SCSI commands fail or are retried by the error handler. These
+	  events provide details about the error, including the command
+	  result (host byte), sense key (SK), additional sense code (ASC),
+	  additional sense code qualifier (ASCQ), and whether the command
+	  is being retried (SDEV_ERROR_RETRY=1) or finally failing because
+	  of error in completion path (SDEV_ERROR_RETRY=0).
+
+	  Events are filtered for certain conditions (e.g., DID_BAD_TARGET,
+	  DID_IMM_RETRY) and are also ratelimited per device to prevent
+	  excessive noise.
+
 config SCSI_PROTO_TEST
 	tristate "scsi_proto.h unit tests" if !KUNIT_ALL_TESTS
 	depends on SCSI && KUNIT
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 376b8897ab90a..327a012f328ff 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -2227,6 +2227,9 @@  void scsi_eh_flush_done_q(struct list_head *done_q)
 				scmd_printk(KERN_INFO, scmd,
 					     "%s: flush retry cmd\n",
 					     current->comm));
+#ifdef CONFIG_SCSI_ERROR_UEVENT
+				scsi_emit_error(scmd);
+#endif
 				scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
 				blk_mq_kick_requeue_list(sdev->request_queue);
 		} else {
@@ -2595,3 +2598,66 @@  bool scsi_get_sense_info_fld(const u8 *sense_buffer, int sb_len,
 	}
 }
 EXPORT_SYMBOL(scsi_get_sense_info_fld);
+
+/**
+ * scsi_emit_error - Emit an error event.
+ *
+ * May be called from scsi_softirq_done(). Cannot sleep.
+ *
+ * @cmd: the scsi command
+ */
+void scsi_emit_error(struct scsi_cmnd *cmd)
+{
+	struct scsi_sense_hdr sshdr;
+	u8 result, sk, asc, ascq;
+	int sense_valid;
+	int retry;
+
+	if (unlikely(cmd->result)) {
+		result = host_byte(cmd->result);
+		if (result == DID_BAD_TARGET ||
+		    result == DID_IMM_RETRY)
+			/*
+			 * Do not report an error upstream, the situation is
+			 * not stable. Will report once the IO really fails.
+			 */
+			return;
+		sk = 0;
+		asc = 0;
+		ascq = 0;
+
+		if (result == DID_OK) {
+			sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
+			if (!sense_valid) {
+				/*
+				 * With libata, this happens when the error
+				 * handler is called but the error causes are
+				 * not identified yet.
+				 */
+				return;
+			}
+
+			sk = sshdr.sense_key;
+			asc = sshdr.asc;
+			ascq = sshdr.ascq;
+
+			/*
+			 * asc == 0 && ascq == 0x1D means "ATA pass through
+			 * information available"; this is not an error, but
+			 * rather the driver returning some data.
+			 */
+			if (sk == NO_SENSE ||
+			    (sk == RECOVERED_ERROR &&
+			     asc == 0x0 &&
+			     ascq == 0x1D)) {
+				return;
+			}
+		}
+
+		retry = (!scsi_noretry_cmd(cmd) &&
+			 cmd->retries > 0 &&
+			 cmd->retries <= cmd->allowed);
+		sdev_evt_send_error(cmd->device, GFP_ATOMIC,
+				    retry, result, sk, asc, ascq);
+	}
+}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 0d29470e86b0b..2a2fae00e9f1c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1029,6 +1029,9 @@  static int scsi_io_completion_nz_result(struct scsi_cmnd *cmd, int result,
 		result = 0;
 		*blk_statp = BLK_STS_OK;
 	}
+#ifdef CONFIG_SCSI_ERROR_UEVENT
+	scsi_emit_error(cmd);
+#endif
 	return result;
 }
 
@@ -1544,6 +1547,9 @@  static void scsi_complete(struct request *rq)
 		scsi_finish_command(cmd);
 		break;
 	case NEEDS_RETRY:
+#ifdef CONFIG_SCSI_ERROR_UEVENT
+		scsi_emit_error(cmd);
+#endif
 		scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
 		break;
 	case ADD_TO_MLQUEUE:
@@ -2559,43 +2565,77 @@  EXPORT_SYMBOL(scsi_device_set_state);
  */
 static void scsi_evt_emit(struct scsi_device *sdev, struct scsi_event *evt)
 {
-	int idx = 0;
-	char *envp[3];
+	struct kobj_uevent_env *env;
+
+	env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
+	if (!env)
+		return;
 
 	switch (evt->evt_type) {
 	case SDEV_EVT_MEDIA_CHANGE:
-		envp[idx++] = "SDEV_MEDIA_CHANGE=1";
+		if (add_uevent_var(env, "SDEV_MEDIA_CHANGE=1"))
+			goto exit;
 		break;
 	case SDEV_EVT_INQUIRY_CHANGE_REPORTED:
 		scsi_rescan_device(sdev);
-		envp[idx++] = "SDEV_UA=INQUIRY_DATA_HAS_CHANGED";
+		if (add_uevent_var(env,	"SDEV_UA=INQUIRY_DATA_HAS_CHANGED"))
+			goto exit;
 		break;
 	case SDEV_EVT_CAPACITY_CHANGE_REPORTED:
-		envp[idx++] = "SDEV_UA=CAPACITY_DATA_HAS_CHANGED";
+		if (add_uevent_var(env,	"SDEV_UA=CAPACITY_DATA_HAS_CHANGED"))
+			goto exit;
 		break;
 	case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED:
-	       envp[idx++] = "SDEV_UA=THIN_PROVISIONING_SOFT_THRESHOLD_REACHED";
+		if (add_uevent_var(env,
+			"SDEV_UA=THIN_PROVISIONING_SOFT_THRESHOLD_REACHED"))
+			goto exit;
 		break;
 	case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED:
-		envp[idx++] = "SDEV_UA=MODE_PARAMETERS_CHANGED";
+		if (add_uevent_var(env, "SDEV_UA=MODE_PARAMETERS_CHANGED"))
+			goto exit;
 		break;
 	case SDEV_EVT_LUN_CHANGE_REPORTED:
-		envp[idx++] = "SDEV_UA=REPORTED_LUNS_DATA_HAS_CHANGED";
+		if (add_uevent_var(env,
+			"SDEV_UA=REPORTED_LUNS_DATA_HAS_CHANGED"))
+			goto exit;
 		break;
 	case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED:
-		envp[idx++] = "SDEV_UA=ASYMMETRIC_ACCESS_STATE_CHANGED";
+		if (add_uevent_var(env,
+			"SDEV_UA=ASYMMETRIC_ACCESS_STATE_CHANGED"))
+			goto exit;
+		break;
+	case SDEV_EVT_ERROR:
+		if (add_uevent_var(env, "SDEV_ERROR=1"))
+			goto exit;
+		if (add_uevent_var(env, "SDEV_ERROR_RETRY=%u",
+					evt->error_evt.retry))
+			goto exit;
+		if (add_uevent_var(env, "SDEV_ERROR_RESULT=%u",
+					evt->error_evt.result))
+			goto exit;
+		if (add_uevent_var(env, "SDEV_ERROR_SK=%u",
+					evt->error_evt.sk))
+			goto exit;
+		if (add_uevent_var(env, "SDEV_ERROR_ASC=%u",
+					evt->error_evt.asc))
+			goto exit;
+		if (add_uevent_var(env, "SDEV_ERROR_ASCQ=%u",
+					evt->error_evt.ascq))
+			goto exit;
 		break;
 	case SDEV_EVT_POWER_ON_RESET_OCCURRED:
-		envp[idx++] = "SDEV_UA=POWER_ON_RESET_OCCURRED";
+		if (add_uevent_var(env, "SDEV_UA=POWER_ON_RESET_OCCURRED"))
+			goto exit;
 		break;
 	default:
 		/* do nothing */
 		break;
 	}
 
-	envp[idx++] = NULL;
+	kobject_uevent_env(&sdev->sdev_gendev.kobj, KOBJ_CHANGE, env->envp);
 
-	kobject_uevent_env(&sdev->sdev_gendev.kobj, KOBJ_CHANGE, envp);
+exit:
+	kfree(env);
 }
 
 /**
@@ -2693,6 +2733,7 @@  struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type,
 	case SDEV_EVT_LUN_CHANGE_REPORTED:
 	case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED:
 	case SDEV_EVT_POWER_ON_RESET_OCCURRED:
+	case SDEV_EVT_ERROR:
 	default:
 		/* do nothing */
 		break;
@@ -2724,6 +2765,41 @@  void sdev_evt_send_simple(struct scsi_device *sdev,
 }
 EXPORT_SYMBOL_GPL(sdev_evt_send_simple);
 
+/**
+ *	sdev_evt_send_error - send error event to uevent thread
+ *	@sdev: scsi_device event occurred on
+ *	@gfpflags: GFP flags for allocation
+ *	@retry: if non-zero, command failed, will retry, otherwise final attempt
+ *	@result: host byte of result
+ *	@sk: sense key
+ *	@asc: additional sense code
+ *	@ascq: additional sense code qualifier
+ *
+ *	Assert scsi device error event asynchronously.
+ */
+void sdev_evt_send_error(struct scsi_device *sdev, gfp_t gfpflags,
+			 u8 retry, u8 result, u8 sk, u8 asc, u8 ascq)
+{
+	struct scsi_event *evt;
+
+	evt = sdev_evt_alloc(SDEV_EVT_ERROR, gfpflags);
+	if (!evt) {
+		sdev_printk(KERN_ERR, sdev, "error event eaten due to OOM: retry=%u result=%u sk=%u asc=%u ascq=%u\n",
+			    retry, result, sk, asc, ascq);
+		return;
+	}
+
+	evt->error_evt.retry = retry;
+	evt->error_evt.result = result;
+	evt->error_evt.sk = sk;
+	evt->error_evt.asc = asc;
+	evt->error_evt.ascq = ascq;
+
+	if (___ratelimit(&sdev->error_ratelimit, "SCSI error"))
+		sdev_evt_send(sdev, evt);
+}
+EXPORT_SYMBOL_GPL(sdev_evt_send_error);
+
 /**
  *	scsi_device_quiesce - Block all commands except power management.
  *	@sdev:	scsi device to quiesce.
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 9fc397a9ce7a4..8519b563e2feb 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -101,6 +101,7 @@  int scsi_eh_get_sense(struct list_head *work_q,
 		      struct list_head *done_q);
 bool scsi_noretry_cmd(struct scsi_cmnd *scmd);
 void scsi_eh_done(struct scsi_cmnd *scmd);
+extern void scsi_emit_error(struct scsi_cmnd *scmd);
 
 /* scsi_lib.c */
 extern void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd);
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 4833b8fe251b8..5c311dfc501c3 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -310,6 +310,7 @@  static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 	mutex_init(&sdev->inquiry_mutex);
 	INIT_WORK(&sdev->event_work, scsi_evt_thread);
 	INIT_WORK(&sdev->requeue_work, scsi_requeue_run_queue);
+	ratelimit_state_init(&sdev->error_ratelimit, 5 * HZ, 10);
 
 	sdev->sdev_gendev.parent = get_device(&starget->dev);
 	sdev->sdev_target = starget;
@@ -363,6 +364,9 @@  static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 
 	scsi_change_queue_depth(sdev, depth);
 
+	/* All devices support error events */
+	set_bit(SDEV_EVT_ERROR, sdev->supported_events);
+
 	scsi_sysfs_device_initialize(sdev);
 
 	if (shost->hostt->sdev_init) {
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index d772258e29ad2..20a537490e9f2 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -1025,6 +1025,7 @@  DECLARE_EVT(capacity_change_reported, CAPACITY_CHANGE_REPORTED)
 DECLARE_EVT(soft_threshold_reached, SOFT_THRESHOLD_REACHED_REPORTED)
 DECLARE_EVT(mode_parameter_change_reported, MODE_PARAMETER_CHANGE_REPORTED)
 DECLARE_EVT(lun_change_reported, LUN_CHANGE_REPORTED)
+DECLARE_EVT(error, ERROR)
 
 static ssize_t
 sdev_store_queue_depth(struct device *dev, struct device_attribute *attr,
@@ -1345,6 +1346,7 @@  static struct attribute *scsi_sdev_attrs[] = {
 	REF_EVT(soft_threshold_reached),
 	REF_EVT(mode_parameter_change_reported),
 	REF_EVT(lun_change_reported),
+	REF_EVT(error),
 	NULL
 };
 
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 68dd49947d041..5485d3b5853e2 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -64,7 +64,8 @@  enum scsi_scan_mode {
 };
 
 enum scsi_device_event {
-	SDEV_EVT_MEDIA_CHANGE	= 1,	/* media has changed */
+	SDEV_EVT_MEDIA_CHANGE	= 0,	/* media has changed */
+	SDEV_EVT_ERROR		= 1,	/* command failed */
 	SDEV_EVT_INQUIRY_CHANGE_REPORTED,		/* 3F 03  UA reported */
 	SDEV_EVT_CAPACITY_CHANGE_REPORTED,		/* 2A 09  UA reported */
 	SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED,	/* 38 07  UA reported */
@@ -79,6 +80,19 @@  enum scsi_device_event {
 	SDEV_EVT_MAXBITS	= SDEV_EVT_LAST + 1
 };
 
+struct scsi_error_event {
+	/* A retry event */
+	u8 retry;
+	/* Host byte */
+	u8 result;
+	/* Sense key */
+	u8 sk;
+	/* Additional sense code */
+	u8 asc;
+	/* Additional sense code qualifier */
+	u8 ascq;
+};
+
 struct scsi_event {
 	enum scsi_device_event	evt_type;
 	struct list_head	node;
@@ -86,6 +100,9 @@  struct scsi_event {
 	/* put union of data structures, for non-simple event types,
 	 * here
 	 */
+	union {
+		struct scsi_error_event error_evt;
+	};
 };
 
 /**
@@ -269,6 +286,7 @@  struct scsi_device {
 				sdev_dev;
 
 	struct work_struct	requeue_work;
+	struct ratelimit_state	error_ratelimit;
 
 	struct scsi_device_handler *handler;
 	void			*handler_data;
@@ -474,6 +492,8 @@  extern struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type,
 extern void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt);
 extern void sdev_evt_send_simple(struct scsi_device *sdev,
 			  enum scsi_device_event evt_type, gfp_t gfpflags);
+extern void sdev_evt_send_error(struct scsi_device *sdev, gfp_t gfpflags,
+	u8 retry, u8 result, u8 sk, u8 asc, u8 ascq);
 extern int scsi_device_quiesce(struct scsi_device *sdev);
 extern void scsi_device_resume(struct scsi_device *sdev);
 extern void scsi_target_quiesce(struct scsi_target *);