mbox series

[v3,00/17] libicsi and qedi TMF fixes

Message ID 20210416020440.259271-1-michael.christie@oracle.com
Headers show
Series libicsi and qedi TMF fixes | expand

Message

Mike Christie April 16, 2021, 2:04 a.m. UTC
The following patches were made over Martin's 5.12 scsi-fixes branch
which has this fix: 73603e995a37 ("scsi: iscsi: Fix iSCSI cls conn state")
that has conflicts with a patch in this set.

The patches fix TMF bugs in the qedi driver, libiscsi EH issues that are
common to all offload drivers like qedi and some fixes for cases where
userspace is not doing an unbind target nl cmd and we are doing TMFs
during session termination.

V3:
- Fix u16 initialization and test.
- Fix bool return value use.
- Added patches for cases where EH is running then userspace terminates
the connection without removing the target first.
- Made patch that stops IO during ep_disconnect more driver friendly
by handling if the ep is bound or not.

V2:
- Dropped patch that reverted the in-kernel conn failure handling. I
posted a full patchset to fix that separately.
- Modfied the tmf vs cmd queueing patch. The RFC patch only supported
qedi and offload drivers. iscsi_tcp/cxgbgi do not need it.
- Added patch for another issue found where cmds can still be queued to the
driver while it does ep_disconnect.

Comments

Lee Duncan April 17, 2021, 5:22 p.m. UTC | #1
On 4/15/21 7:04 PM, Mike Christie wrote:
> If we are handling a sg io reset there is a small window where cmds can

> sneak into iscsi_queuecommand even though libiscsi has sent a TMF to the

> driver. This does seems to not be a problem for normal drivers because they

               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
"doesn't seem to be a problem"?

> know exactly what was sent to the FW. For libiscsi this is a problem

> because it knows what it has sent to the driver but not what the driver

> sent to the FW. When we go to cleanup the running tasks, libiscsi might

> cleanup the sneaky cmd but the driver/FQ may not have seen the sneaky cmd


FO?

> and it's left running in FW.

> 

> This has libiscsi just stop queueing cmds when it sends the TMF down to the

> driver. Both then know they see the same set of cmds.

> 

> Signed-off-by: Mike Christie <michael.christie@oracle.com>

> ---

>  drivers/scsi/libiscsi.c | 104 +++++++++++++++++++++++++++-------------

>  1 file changed, 72 insertions(+), 32 deletions(-)

> 

> diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c

> index 4834219497ee..aa5ceaffc697 100644

> --- a/drivers/scsi/libiscsi.c

> +++ b/drivers/scsi/libiscsi.c

> @@ -1669,29 +1669,11 @@ enum {

>  	FAILURE_SESSION_NOT_READY,

>  };

>  

> -int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)

> +static bool iscsi_eh_running(struct iscsi_conn *conn, struct scsi_cmnd *sc,

> +			     int *reason)

>  {

> -	struct iscsi_cls_session *cls_session;

> -	struct iscsi_host *ihost;

> -	int reason = 0;

> -	struct iscsi_session *session;

> -	struct iscsi_conn *conn;

> -	struct iscsi_task *task = NULL;

> -

> -	sc->result = 0;

> -	sc->SCp.ptr = NULL;

> -

> -	ihost = shost_priv(host);

> -

> -	cls_session = starget_to_session(scsi_target(sc->device));

> -	session = cls_session->dd_data;

> -	spin_lock_bh(&session->frwd_lock);

> -

> -	reason = iscsi_session_chkready(cls_session);

> -	if (reason) {

> -		sc->result = reason;

> -		goto fault;

> -	}

> +	struct iscsi_session *session = conn->session;

> +	struct iscsi_tm *tmf;

>  

>  	if (session->state != ISCSI_STATE_LOGGED_IN) {

>  		/*

> @@ -1707,31 +1689,92 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)

>  			 * state is bad, allowing completion to happen

>  			 */

>  			if (unlikely(system_state != SYSTEM_RUNNING)) {

> -				reason = FAILURE_SESSION_FAILED;

> +				*reason = FAILURE_SESSION_FAILED;

>  				sc->result = DID_NO_CONNECT << 16;

>  				break;

>  			}

>  			fallthrough;

>  		case ISCSI_STATE_IN_RECOVERY:

> -			reason = FAILURE_SESSION_IN_RECOVERY;

> +			*reason = FAILURE_SESSION_IN_RECOVERY;

>  			sc->result = DID_IMM_RETRY << 16;

>  			break;

>  		case ISCSI_STATE_LOGGING_OUT:

> -			reason = FAILURE_SESSION_LOGGING_OUT;

> +			*reason = FAILURE_SESSION_LOGGING_OUT;

>  			sc->result = DID_IMM_RETRY << 16;

>  			break;

>  		case ISCSI_STATE_RECOVERY_FAILED:

> -			reason = FAILURE_SESSION_RECOVERY_TIMEOUT;

> +			*reason = FAILURE_SESSION_RECOVERY_TIMEOUT;

>  			sc->result = DID_TRANSPORT_FAILFAST << 16;

>  			break;

>  		case ISCSI_STATE_TERMINATE:

> -			reason = FAILURE_SESSION_TERMINATE;

> +			*reason = FAILURE_SESSION_TERMINATE;

>  			sc->result = DID_NO_CONNECT << 16;

>  			break;

>  		default:

> -			reason = FAILURE_SESSION_FREED;

> +			*reason = FAILURE_SESSION_FREED;

>  			sc->result = DID_NO_CONNECT << 16;

>  		}

> +		goto eh_running;

> +	}

> +

> +	if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) {

> +		*reason = FAILURE_SESSION_IN_RECOVERY;

> +		sc->result = DID_REQUEUE << 16;

> +		goto eh_running;

> +	}

> +

> +	/*

> +	 * For sg resets make sure libiscsi, the drivers and their fw see the

> +	 * same cmds. Once we get a TMF that can affect multiple cmds stop

> +	 * queueing.

> +	 */

> +	if (conn->tmf_state != TMF_INITIAL) {

> +		tmf = &conn->tmhdr;

> +

> +		switch (ISCSI_TM_FUNC_VALUE(tmf)) {

> +		case ISCSI_TM_FUNC_LOGICAL_UNIT_RESET:

> +			if (sc->device->lun != scsilun_to_int(&tmf->lun))

> +				break;

> +

> +			ISCSI_DBG_EH(conn->session,

> +				     "Requeue cmd sent during LU RESET processing.\n");

> +			sc->result = DID_REQUEUE << 16;

> +			goto eh_running;

> +		case ISCSI_TM_FUNC_TARGET_WARM_RESET:

> +			ISCSI_DBG_EH(conn->session,

> +				     "Requeue cmd sent during TARGET RESET processing.\n");

> +			sc->result = DID_REQUEUE << 16;

> +			goto eh_running;

> +		}


Is it common to have no default case? I thought the compiler disliked
that. If the compiler and convention are fine with this, I'm fine with
it too.

> +	}

> +

> +	return false;

> +

> +eh_running:

> +	return true;

> +}

> +

> +int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)

> +{

> +	struct iscsi_cls_session *cls_session;

> +	struct iscsi_host *ihost;

> +	int reason = 0;

> +	struct iscsi_session *session;

> +	struct iscsi_conn *conn;

> +	struct iscsi_task *task = NULL;

> +

> +	sc->result = 0;

> +	sc->SCp.ptr = NULL;

> +

> +	ihost = shost_priv(host);

> +

> +	cls_session = starget_to_session(scsi_target(sc->device));

> +	session = cls_session->dd_data;

> +	spin_lock_bh(&session->frwd_lock);

> +

> +	reason = iscsi_session_chkready(cls_session);

> +	if (reason) {

> +		sc->result = reason;

>  		goto fault;

>  	}

>  

> @@ -1742,11 +1785,8 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)

>  		goto fault;

>  	}

>  

> -	if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) {

> -		reason = FAILURE_SESSION_IN_RECOVERY;

> -		sc->result = DID_REQUEUE << 16;

> +	if (iscsi_eh_running(conn, sc, &reason))

>  		goto fault;

> -	}

>  

>  	if (iscsi_check_cmdsn_window_closed(conn)) {

>  		reason = FAILURE_WINDOW_CLOSED;

>
Mike Christie April 17, 2021, 5:26 p.m. UTC | #2
On 4/17/21 12:22 PM, Lee Duncan wrote:
> On 4/15/21 7:04 PM, Mike Christie wrote:

>> If we are handling a sg io reset there is a small window where cmds can

>> sneak into iscsi_queuecommand even though libiscsi has sent a TMF to the

>> driver. This does seems to not be a problem for normal drivers because they

>                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

> "doesn't seem to be a problem"?


Yeah.

> 

>> know exactly what was sent to the FW. For libiscsi this is a problem

>> because it knows what it has sent to the driver but not what the driver

>> sent to the FW. When we go to cleanup the running tasks, libiscsi might

>> cleanup the sneaky cmd but the driver/FQ may not have seen the sneaky cmd

> 

> FO?

> 


FW.

>> and it's left running in FW.

>>

>> This has libiscsi just stop queueing cmds when it sends the TMF down to the

>> driver. Both then know they see the same set of cmds.

>>

>> Signed-off-by: Mike Christie <michael.christie@oracle.com>

>> ---

>>  drivers/scsi/libiscsi.c | 104 +++++++++++++++++++++++++++-------------

>>  1 file changed, 72 insertions(+), 32 deletions(-)

>>

>> diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c

>> index 4834219497ee..aa5ceaffc697 100644

>> --- a/drivers/scsi/libiscsi.c

>> +++ b/drivers/scsi/libiscsi.c

>> @@ -1669,29 +1669,11 @@ enum {

>>  	FAILURE_SESSION_NOT_READY,

>>  };

>>  

>> -int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)

>> +static bool iscsi_eh_running(struct iscsi_conn *conn, struct scsi_cmnd *sc,

>> +			     int *reason)

>>  {

>> -	struct iscsi_cls_session *cls_session;

>> -	struct iscsi_host *ihost;

>> -	int reason = 0;

>> -	struct iscsi_session *session;

>> -	struct iscsi_conn *conn;

>> -	struct iscsi_task *task = NULL;

>> -

>> -	sc->result = 0;

>> -	sc->SCp.ptr = NULL;

>> -

>> -	ihost = shost_priv(host);

>> -

>> -	cls_session = starget_to_session(scsi_target(sc->device));

>> -	session = cls_session->dd_data;

>> -	spin_lock_bh(&session->frwd_lock);

>> -

>> -	reason = iscsi_session_chkready(cls_session);

>> -	if (reason) {

>> -		sc->result = reason;

>> -		goto fault;

>> -	}

>> +	struct iscsi_session *session = conn->session;

>> +	struct iscsi_tm *tmf;

>>  

>>  	if (session->state != ISCSI_STATE_LOGGED_IN) {

>>  		/*

>> @@ -1707,31 +1689,92 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)

>>  			 * state is bad, allowing completion to happen

>>  			 */

>>  			if (unlikely(system_state != SYSTEM_RUNNING)) {

>> -				reason = FAILURE_SESSION_FAILED;

>> +				*reason = FAILURE_SESSION_FAILED;

>>  				sc->result = DID_NO_CONNECT << 16;

>>  				break;

>>  			}

>>  			fallthrough;

>>  		case ISCSI_STATE_IN_RECOVERY:

>> -			reason = FAILURE_SESSION_IN_RECOVERY;

>> +			*reason = FAILURE_SESSION_IN_RECOVERY;

>>  			sc->result = DID_IMM_RETRY << 16;

>>  			break;

>>  		case ISCSI_STATE_LOGGING_OUT:

>> -			reason = FAILURE_SESSION_LOGGING_OUT;

>> +			*reason = FAILURE_SESSION_LOGGING_OUT;

>>  			sc->result = DID_IMM_RETRY << 16;

>>  			break;

>>  		case ISCSI_STATE_RECOVERY_FAILED:

>> -			reason = FAILURE_SESSION_RECOVERY_TIMEOUT;

>> +			*reason = FAILURE_SESSION_RECOVERY_TIMEOUT;

>>  			sc->result = DID_TRANSPORT_FAILFAST << 16;

>>  			break;

>>  		case ISCSI_STATE_TERMINATE:

>> -			reason = FAILURE_SESSION_TERMINATE;

>> +			*reason = FAILURE_SESSION_TERMINATE;

>>  			sc->result = DID_NO_CONNECT << 16;

>>  			break;

>>  		default:

>> -			reason = FAILURE_SESSION_FREED;

>> +			*reason = FAILURE_SESSION_FREED;

>>  			sc->result = DID_NO_CONNECT << 16;

>>  		}

>> +		goto eh_running;

>> +	}

>> +

>> +	if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) {

>> +		*reason = FAILURE_SESSION_IN_RECOVERY;

>> +		sc->result = DID_REQUEUE << 16;

>> +		goto eh_running;

>> +	}

>> +

>> +	/*

>> +	 * For sg resets make sure libiscsi, the drivers and their fw see the

>> +	 * same cmds. Once we get a TMF that can affect multiple cmds stop

>> +	 * queueing.

>> +	 */

>> +	if (conn->tmf_state != TMF_INITIAL) {

>> +		tmf = &conn->tmhdr;

>> +

>> +		switch (ISCSI_TM_FUNC_VALUE(tmf)) {

>> +		case ISCSI_TM_FUNC_LOGICAL_UNIT_RESET:

>> +			if (sc->device->lun != scsilun_to_int(&tmf->lun))

>> +				break;

>> +

>> +			ISCSI_DBG_EH(conn->session,

>> +				     "Requeue cmd sent during LU RESET processing.\n");

>> +			sc->result = DID_REQUEUE << 16;

>> +			goto eh_running;

>> +		case ISCSI_TM_FUNC_TARGET_WARM_RESET:

>> +			ISCSI_DBG_EH(conn->session,

>> +				     "Requeue cmd sent during TARGET RESET processing.\n");

>> +			sc->result = DID_REQUEUE << 16;

>> +			goto eh_running;

>> +		}

> 

> Is it common to have no default case? I thought the compiler disliked

> that. If the compiler and convention are fine with this, I'm fine with

> it too.

> 


There is no compiler warnings or errors.