diff mbox series

ufs: core: fix bus timeout in ufshcd_wl_resume flow

Message ID 20240813134729.284583-1-zhanghui31@xiaomi.com
State New
Headers show
Series ufs: core: fix bus timeout in ufshcd_wl_resume flow | expand

Commit Message

章辉 Aug. 13, 2024, 1:47 p.m. UTC
From: zhanghui <zhanghui31@xiaomi.com>

If the SSU CMD completion flow in UFS resume and the CMD timeout flow occur
simultaneously, the timestamp attribute command will be sent to the device
during the UFS resume flow, while the UFS controller performs a reset in
the timeout error handling flow.

In this scenario, a bus timeout will occur because the UFS resume flow
will attempt to access the UFS host controller registers while the UFS
controller is in a reset state or power cycle.

Call trace:
  arm64_serror_panic+0x6c/0x94
  do_serror+0xbc/0xc8
  el1h_64_error_handler+0x34/0x48
  el1h_64_error+0x68/0x6c
  _raw_spin_unlock+0x18/0x44
  ufshcd_send_command+0x1c0/0x380
  ufshcd_exec_dev_cmd+0x21c/0x29c
  ufshcd_set_timestamp_attr+0x78/0xdc
  __ufshcd_wl_resume+0x228/0x48c
  ufshcd_wl_resume+0x5c/0x1b4
  scsi_bus_resume+0x58/0xa0
  dpm_run_callback+0x6c/0x23c
  __device_resume+0x298/0x46c
  async_resume+0x24/0x3c
  async_run_entry_fn+0x44/0x150
  process_scheduled_works+0x254/0x4f4
  worker_thread+0x244/0x334
  kthread+0x124/0x1f0
  ret_from_fork+0x10/0x20

This patch fixes the issue by adding a check of the UFS controller
register states before sending the device command.

Signed-off-by: zhanghui <zhanghui31@xiaomi.com>
---
 drivers/ufs/core/ufshcd.c | 14 ++++++++++++++
 include/ufs/ufshcd.h      | 13 +++++++++++++
 2 files changed, 27 insertions(+)

Comments

Bart Van Assche Aug. 14, 2024, 1:41 a.m. UTC | #1
On 8/13/24 6:47 AM, ZhangHui wrote:
> diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
> index 5e3c67e96956..e5e3e0277d43 100644
> --- a/drivers/ufs/core/ufshcd.c
> +++ b/drivers/ufs/core/ufshcd.c
> @@ -3291,6 +3291,8 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
>   	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
>   	int err;
>   
> +	if (hba->ufshcd_reg_state == UFSHCD_REG_RESET)
> +		return -EBUSY;
>   	/* Protects use of hba->reserved_slot. */
>   	lockdep_assert_held(&hba->dev_cmd.lock);

Does this change make ufshcd_exec_dev_cmd() unpredictable - it succeeds
if the controller is in the normal state and fails if error recovery
is ongoing? If so, which code paths does this affect and/or break?

Additionally, I think the above check is racy. hba->ufshcd_reg_state may
change after the above code checked it and before ufshcd_exec_dev_cmd()
has finished. Wouldn't it be better to make code that shouldn't be
executed while the error handler is ongoing wait until error handling
has finished?

Thanks,

Bart.
Peter Wang (王信友) Aug. 14, 2024, 6:39 a.m. UTC | #2
On Tue, 2024-08-13 at 21:47 +0800, ZhangHui wrote:
> From: zhanghui <zhanghui31@xiaomi.com>
> 
> If the SSU CMD completion flow in UFS resume and the CMD timeout flow
> occur
> simultaneously, the timestamp attribute command will be sent to the
> device
> 

Hi Zhanghui,

If the timeout command is SSU?
In resume flow, if SSU command timeout, ufshcd_eh_host_reset_handler
invoke ufshcd_link_recovery only, not schedule eh work?


Thanks.
Peter

> during the UFS resume flow, while the UFS controller performs a reset
> in
> the timeout error handling flow.
> 
> In this scenario, a bus timeout will occur because the UFS resume
> flow
> will attempt to access the UFS host controller registers while the
> UFS
> controller is in a reset state or power cycle.
> 
> Call trace:
>   arm64_serror_panic+0x6c/0x94
>   do_serror+0xbc/0xc8
>   el1h_64_error_handler+0x34/0x48
>   el1h_64_error+0x68/0x6c
>   _raw_spin_unlock+0x18/0x44
>   ufshcd_send_command+0x1c0/0x380
>   ufshcd_exec_dev_cmd+0x21c/0x29c
>   ufshcd_set_timestamp_attr+0x78/0xdc
>   __ufshcd_wl_resume+0x228/0x48c
>   ufshcd_wl_resume+0x5c/0x1b4
>   scsi_bus_resume+0x58/0xa0
>   dpm_run_callback+0x6c/0x23c
>   __device_resume+0x298/0x46c
>   async_resume+0x24/0x3c
>   async_run_entry_fn+0x44/0x150
>   process_scheduled_works+0x254/0x4f4
>   worker_thread+0x244/0x334
>   kthread+0x124/0x1f0
>   ret_from_fork+0x10/0x20
> 
> This patch fixes the issue by adding a check of the UFS controller
> register states before sending the device command.
> 
> Signed-off-by: zhanghui <zhanghui31@xiaomi.com>
> ---
>  drivers/ufs/core/ufshcd.c | 14 ++++++++++++++
>  include/ufs/ufshcd.h      | 13 +++++++++++++
>  2 files changed, 27 insertions(+)
> 
> diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
> index 5e3c67e96956..e5e3e0277d43 100644
> --- a/drivers/ufs/core/ufshcd.c
> +++ b/drivers/ufs/core/ufshcd.c
> @@ -3291,6 +3291,8 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba
> *hba,
>  	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
>  	int err;
>  
> +	if (hba->ufshcd_reg_state == UFSHCD_REG_RESET)
> +		return -EBUSY;
>  	/* Protects use of hba->reserved_slot. */
>  	lockdep_assert_held(&hba->dev_cmd.lock);
>  
> @@ -4857,6 +4859,7 @@ static int ufshcd_hba_execute_hce(struct
> ufs_hba *hba)
>  int ufshcd_hba_enable(struct ufs_hba *hba)
>  {
>  	int ret;
> +	unsigned long flags;
>  
>  	if (hba->quirks & UFSHCI_QUIRK_BROKEN_HCE) {
>  		ufshcd_set_link_off(hba);
> @@ -4881,6 +4884,13 @@ int ufshcd_hba_enable(struct ufs_hba *hba)
>  		ret = ufshcd_hba_execute_hce(hba);
>  	}
>  
> +	spin_lock_irqsave(hba->host->host_lock, flags);
> +	if (ret)
> +		hba->ufshcd_reg_state = UFSHCD_REG_RESET;
> +	else
> +		hba->ufshcd_reg_state = UFSHCD_REG_OPERATIONAL;
> +	spin_unlock_irqrestore(hba->host->host_lock, flags);
> +
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(ufshcd_hba_enable);
> @@ -7693,7 +7703,11 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>  static int ufshcd_host_reset_and_restore(struct ufs_hba *hba)
>  {
>  	int err;
> +	unsigned long flags;
>  
> +	spin_lock_irqsave(hba->host->host_lock, flags);
> +	hba->ufshcd_reg_state = UFSHCD_REG_RESET;
> +	spin_unlock_irqrestore(hba->host->host_lock, flags);
>  	/*
>  	 * Stop the host controller and complete the requests
>  	 * cleared by h/w
> diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
> index cac0cdb9a916..e5af4c2114ce 100644
> --- a/include/ufs/ufshcd.h
> +++ b/include/ufs/ufshcd.h
> @@ -523,6 +523,18 @@ enum ufshcd_state {
>  	UFSHCD_STATE_ERROR,
>  };
>  
> +/**
> + * enum ufshcd_state - UFS host controller state
> + * @UFSHCD_REG_OPERATIONAL: UFS host controller is enabled, the host
> controller
> + * register can be accessed.
> + * @UFSHCD_REG_RESET: UFS host controller registers will be reset,
> can't access
> + * any ufs host controller register.
> + */
> +enum ufshcd_reg_state {
> +	UFSHCD_REG_OPERATIONAL,
> +	UFSHCD_REG_RESET,
> +};
> +
>  enum ufshcd_quirks {
>  	/* Interrupt aggregation support is broken */
>  	UFSHCD_QUIRK_BROKEN_INTR_AGGR			= 1 << 0,
> @@ -1020,6 +1032,7 @@ struct ufs_hba {
>  	struct completion *uic_async_done;
>  
>  	enum ufshcd_state ufshcd_state;
> +	enum ufshcd_reg_state ufshcd_reg_state;
>  	u32 eh_flags;
>  	u32 intr_mask;
>  	u16 ee_ctrl_mask;
章辉 Aug. 15, 2024, 1:07 p.m. UTC | #3
On 2024/8/14 9:41, Bart Van Assche wrote:
> On 8/13/24 6:47 AM, ZhangHui wrote:
>> diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
>> index 5e3c67e96956..e5e3e0277d43 100644
>> --- a/drivers/ufs/core/ufshcd.c
>> +++ b/drivers/ufs/core/ufshcd.c
>> @@ -3291,6 +3291,8 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba 
>> *hba,
>>       struct ufshcd_lrb *lrbp = &hba->lrb[tag];
>>       int err;
>>
>> +     if (hba->ufshcd_reg_state == UFSHCD_REG_RESET)
>> +             return -EBUSY;
>>       /* Protects use of hba->reserved_slot. */
>>       lockdep_assert_held(&hba->dev_cmd.lock);
>
> Does this change make ufshcd_exec_dev_cmd() unpredictable - it succeeds
> if the controller is in the normal state and fails if error recovery
> is ongoing? If so, which code paths does this affect and/or break?
>
> Additionally, I think the above check is racy. hba->ufshcd_reg_state may
> change after the above code checked it and before ufshcd_exec_dev_cmd()
> has finished. Wouldn't it be better to make code that shouldn't be
> executed while the error handler is ongoing wait until error handling
> has finished?
>
> Thanks,
>
> Bart.
>
hi Bart,

1. If the host needs to send a dev command, the HBA must be enabled.
We have set the ufshcd_reg_state to operational in the ufshcd_hba_enable,
so it is not unpredictable.

2. That's a good question, but I think it makes sense to block dev cmd while
ufs is doing a reset.


Thanks
Zhanghui
章辉 Aug. 15, 2024, 1:14 p.m. UTC | #4
On 2024/8/14 14:39, Peter Wang (王信友) wrote:
> On Tue, 2024-08-13 at 21:47 +0800, ZhangHui wrote:
>> From: zhanghui <zhanghui31@xiaomi.com>
>>
>> If the SSU CMD completion flow in UFS resume and the CMD timeout flow
>> occur
>> simultaneously, the timestamp attribute command will be sent to the
>> device
>>
> Hi Zhanghui,
>
> If the timeout command is SSU?
> In resume flow, if SSU command timeout, ufshcd_eh_host_reset_handler
> invoke ufshcd_link_recovery only, not schedule eh work?
>
>
> Thanks.
> Peter

hi Peter G,

     ufshcd_link_recovery calls ufshcd_host_reset_and_restore to power 
cycle

     and reset the host controller.


Thanks

Zhanghui
diff mbox series

Patch

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 5e3c67e96956..e5e3e0277d43 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -3291,6 +3291,8 @@  static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
 	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
 	int err;
 
+	if (hba->ufshcd_reg_state == UFSHCD_REG_RESET)
+		return -EBUSY;
 	/* Protects use of hba->reserved_slot. */
 	lockdep_assert_held(&hba->dev_cmd.lock);
 
@@ -4857,6 +4859,7 @@  static int ufshcd_hba_execute_hce(struct ufs_hba *hba)
 int ufshcd_hba_enable(struct ufs_hba *hba)
 {
 	int ret;
+	unsigned long flags;
 
 	if (hba->quirks & UFSHCI_QUIRK_BROKEN_HCE) {
 		ufshcd_set_link_off(hba);
@@ -4881,6 +4884,13 @@  int ufshcd_hba_enable(struct ufs_hba *hba)
 		ret = ufshcd_hba_execute_hce(hba);
 	}
 
+	spin_lock_irqsave(hba->host->host_lock, flags);
+	if (ret)
+		hba->ufshcd_reg_state = UFSHCD_REG_RESET;
+	else
+		hba->ufshcd_reg_state = UFSHCD_REG_OPERATIONAL;
+	spin_unlock_irqrestore(hba->host->host_lock, flags);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(ufshcd_hba_enable);
@@ -7693,7 +7703,11 @@  static int ufshcd_abort(struct scsi_cmnd *cmd)
 static int ufshcd_host_reset_and_restore(struct ufs_hba *hba)
 {
 	int err;
+	unsigned long flags;
 
+	spin_lock_irqsave(hba->host->host_lock, flags);
+	hba->ufshcd_reg_state = UFSHCD_REG_RESET;
+	spin_unlock_irqrestore(hba->host->host_lock, flags);
 	/*
 	 * Stop the host controller and complete the requests
 	 * cleared by h/w
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index cac0cdb9a916..e5af4c2114ce 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -523,6 +523,18 @@  enum ufshcd_state {
 	UFSHCD_STATE_ERROR,
 };
 
+/**
+ * enum ufshcd_state - UFS host controller state
+ * @UFSHCD_REG_OPERATIONAL: UFS host controller is enabled, the host controller
+ * register can be accessed.
+ * @UFSHCD_REG_RESET: UFS host controller registers will be reset, can't access
+ * any ufs host controller register.
+ */
+enum ufshcd_reg_state {
+	UFSHCD_REG_OPERATIONAL,
+	UFSHCD_REG_RESET,
+};
+
 enum ufshcd_quirks {
 	/* Interrupt aggregation support is broken */
 	UFSHCD_QUIRK_BROKEN_INTR_AGGR			= 1 << 0,
@@ -1020,6 +1032,7 @@  struct ufs_hba {
 	struct completion *uic_async_done;
 
 	enum ufshcd_state ufshcd_state;
+	enum ufshcd_reg_state ufshcd_reg_state;
 	u32 eh_flags;
 	u32 intr_mask;
 	u16 ee_ctrl_mask;