Message ID | 20210618094911.20377-1-njavali@marvell.com |
---|---|
State | New |
Headers | show |
Series | qla2xxx: add heartbeat check | expand |
On 6/18/21 4:49 AM, Nilesh Javali wrote: > From: Quinn Tran <qutran@marvell.com> > > Use 'no-op' mailbox command to check and see if FW is still responsive. > > Signed-off-by: Quinn Tran <qutran@marvell.com> > Signed-off-by: Nilesh Javali <njavali@marvell.com> > --- > drivers/scsi/qla2xxx/qla_def.h | 4 ++ > drivers/scsi/qla2xxx/qla_gbl.h | 1 + > drivers/scsi/qla2xxx/qla_init.c | 6 ++- > drivers/scsi/qla2xxx/qla_iocb.c | 4 ++ > drivers/scsi/qla2xxx/qla_isr.c | 4 ++ > drivers/scsi/qla2xxx/qla_mbx.c | 27 +++++++++++++ > drivers/scsi/qla2xxx/qla_nvme.c | 4 ++ > drivers/scsi/qla2xxx/qla_os.c | 68 +++++++++++++++++++++++++++++++++ > 8 files changed, 117 insertions(+), 1 deletion(-) > > diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h > index def4d99f80e9..2f67ec1df3e6 100644 > --- a/drivers/scsi/qla2xxx/qla_def.h > +++ b/drivers/scsi/qla2xxx/qla_def.h > @@ -3660,6 +3660,8 @@ struct qla_qpair { > struct qla_tgt_counters tgt_counters; > uint16_t cpuid; > struct qla_fw_resources fwres ____cacheline_aligned; > + u32 cmd_cnt; > + u32 cmd_completion_cnt; > }; > > /* Place holder for FW buffer parameters */ > @@ -4616,6 +4618,7 @@ struct qla_hw_data { > > struct qla_hw_data_stat stat; > pci_error_state_t pci_error_state; > + u64 prev_cmd_cnt; > }; > > struct active_regions { > @@ -4743,6 +4746,7 @@ typedef struct scsi_qla_host { > #define SET_ZIO_THRESHOLD_NEEDED 32 > #define ISP_ABORT_TO_ROM 33 > #define VPORT_DELETE 34 > +#define HEARTBEAT_CHK 38 > > #define PROCESS_PUREX_IOCB 63 > > diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h > index fae5cae6f0a8..70b7cda0a25a 100644 > --- a/drivers/scsi/qla2xxx/qla_gbl.h > +++ b/drivers/scsi/qla2xxx/qla_gbl.h > @@ -551,6 +551,7 @@ extern int qla2xxx_read_remote_register(scsi_qla_host_t *, uint32_t, > uint32_t *); > extern int qla2xxx_write_remote_register(scsi_qla_host_t *, uint32_t, > uint32_t); > +void qla_no_op_mb(struct scsi_qla_host *vha); > > /* > * Global Function 
Prototypes in qla_isr.c source file. > diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c > index eb825318e3f5..f8f471157109 100644 > --- a/drivers/scsi/qla2xxx/qla_init.c > +++ b/drivers/scsi/qla2xxx/qla_init.c > @@ -6870,10 +6870,14 @@ qla2x00_abort_isp_cleanup(scsi_qla_host_t *vha) > ha->flags.fw_init_done = 0; > ha->chip_reset++; > ha->base_qpair->chip_reset = ha->chip_reset; > + ha->base_qpair->cmd_cnt = ha->base_qpair->cmd_completion_cnt = 0; > for (i = 0; i < ha->max_qpairs; i++) { > - if (ha->queue_pair_map[i]) > + if (ha->queue_pair_map[i]) { > ha->queue_pair_map[i]->chip_reset = > ha->base_qpair->chip_reset; > + ha->queue_pair_map[i]->cmd_cnt = > + ha->queue_pair_map[i]->cmd_completion_cnt = 0; > + } > } > > /* purge MBox commands */ > diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c > index 38b5bdde2405..d0ee843f6b04 100644 > --- a/drivers/scsi/qla2xxx/qla_iocb.c > +++ b/drivers/scsi/qla2xxx/qla_iocb.c > @@ -1710,6 +1710,7 @@ qla24xx_start_scsi(srb_t *sp) > } else > req->ring_ptr++; > > + sp->qpair->cmd_cnt++; > sp->flags |= SRB_DMA_VALID; > > /* Set chip new ring index. */ > @@ -1912,6 +1913,7 @@ qla24xx_dif_start_scsi(srb_t *sp) > } else > req->ring_ptr++; > > + sp->qpair->cmd_cnt++; > /* Set chip new ring index. */ > wrt_reg_dword(req->req_q_in, req->ring_index); > > @@ -2068,6 +2070,7 @@ qla2xxx_start_scsi_mq(srb_t *sp) > } else > req->ring_ptr++; > > + sp->qpair->cmd_cnt++; > sp->flags |= SRB_DMA_VALID; > > /* Set chip new ring index. */ > @@ -2284,6 +2287,7 @@ qla2xxx_dif_start_scsi_mq(srb_t *sp) > } else > req->ring_ptr++; > > + sp->qpair->cmd_cnt++; > /* Set chip new ring index. 
*/ > wrt_reg_dword(req->req_q_in, req->ring_index); > > diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c > index 6e8f737a4af3..8a8e355f4a89 100644 > --- a/drivers/scsi/qla2xxx/qla_isr.c > +++ b/drivers/scsi/qla2xxx/qla_isr.c > @@ -2322,6 +2322,8 @@ static void qla24xx_nvme_iocb_entry(scsi_qla_host_t *vha, struct req_que *req, > > if (unlikely(iocb->u.nvme.aen_op)) > atomic_dec(&sp->vha->hw->nvme_active_aen_cnt); > + else > + sp->qpair->cmd_completion_cnt++; > > if (unlikely(comp_status != CS_COMPLETE)) > logit = 1; > @@ -2976,6 +2978,8 @@ qla2x00_status_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, void *pkt) > return; > } > > + sp->qpair->cmd_completion_cnt++; > + > /* Fast path completion. */ > if (comp_status == CS_COMPLETE && scsi_status == 0) { > qla2x00_process_completed_request(vha, req, handle); > diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c > index 0bcd8afdc0ff..9f3ad8aa649c 100644 > --- a/drivers/scsi/qla2xxx/qla_mbx.c > +++ b/drivers/scsi/qla2xxx/qla_mbx.c > @@ -6939,3 +6939,30 @@ ql26xx_led_config(scsi_qla_host_t *vha, uint16_t options, uint16_t *led) > > return rval; > } > + > +/** > + * qla_no_op_mb(): This MB is used to check if FW is still alive and > + * able to generate an interrupt. 
Otherwise, a timeout will trigger > + * FW dump + reset > + * @vha: host adapter pointer > + * Return: None > + */ > +void qla_no_op_mb(struct scsi_qla_host *vha) > +{ > + mbx_cmd_t mc; > + mbx_cmd_t *mcp = &mc; > + int rval; > + > + memset(&mc, 0, sizeof(mc)); > + mcp->mb[0] = 0; // noop cmd= 0 > + mcp->out_mb = MBX_0; > + mcp->in_mb = MBX_0; > + mcp->tov = 5; > + mcp->flags = 0; > + rval = qla2x00_mailbox_command(vha, mcp); > + > + if (rval) { > + ql_dbg(ql_dbg_async, vha, 0x7071, > + "Failed %s %x\n", __func__, rval); > + } > +} > diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c > index e119f8b24e33..3e5c70a1d969 100644 > --- a/drivers/scsi/qla2xxx/qla_nvme.c > +++ b/drivers/scsi/qla2xxx/qla_nvme.c > @@ -536,6 +536,10 @@ static inline int qla2x00_start_nvme_mq(srb_t *sp) > req->ring_ptr++; > } > > + /* ignore nvme async cmd due to long timeout */ > + if (!nvme->u.nvme.aen_op) > + sp->qpair->cmd_cnt++; > + > /* Set chip new ring index. */ > wrt_reg_dword(req->req_q_in, req->ring_index); > > diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c > index 4eab564ea6a0..aa8581e07156 100644 > --- a/drivers/scsi/qla2xxx/qla_os.c > +++ b/drivers/scsi/qla2xxx/qla_os.c > @@ -6969,6 +6969,17 @@ qla2x00_do_dpc(void *data) > qla2x00_lip_reset(base_vha); > } > > + if (test_bit(HEARTBEAT_CHK, &base_vha->dpc_flags)) { > + /* > + * if there is a mb in progress then that's > + * enough of a check to see if fw is still ticking. > + */ > + if (!ha->flags.mbox_busy && base_vha->flags.init_done) > + qla_no_op_mb(base_vha); > + > + clear_bit(HEARTBEAT_CHK, &base_vha->dpc_flags); > + } > + > ha->dpc_active = 0; > end_loop: > set_current_state(TASK_INTERRUPTIBLE); > @@ -7025,6 +7036,61 @@ qla2x00_rst_aen(scsi_qla_host_t *vha) > } > } > > +static bool qla_do_hb(struct scsi_qla_host *vha) > +{ Small nit: I would have liked the name above to look similar to qla_heart_beat() for ease of reading, i.e., qla_do_heart_beat()? 
> + u64 cmd_cnt, prev_cmd_cnt; > + bool do_hb = false; > + struct qla_hw_data *ha = vha->hw; > + int i; > + > + /* if cmds are still pending down in fw, then do hb */ > + if (ha->base_qpair->cmd_cnt != ha->base_qpair->cmd_completion_cnt) { > + do_hb = true; > + goto skip; > + } > + > + for (i = 0; i < ha->max_qpairs; i++) { > + if (ha->queue_pair_map[i] && > + ha->queue_pair_map[i]->cmd_cnt != > + ha->queue_pair_map[i]->cmd_completion_cnt) { > + do_hb = true; > + break; > + } > + } > + > +skip: > + prev_cmd_cnt = ha->prev_cmd_cnt; > + cmd_cnt = ha->base_qpair->cmd_cnt; > + for (i = 0; i < ha->max_qpairs; i++) { > + if (ha->queue_pair_map[i]) > + cmd_cnt += ha->queue_pair_map[i]->cmd_cnt; > + } > + ha->prev_cmd_cnt = cmd_cnt; > + > + if (!do_hb && ((cmd_cnt - prev_cmd_cnt) > 50)) > + /* > + * IOs are completing before periodic hb check. > + * IOs seems to be running, do hb for sanity check. > + */ > + do_hb = true; > + > + return do_hb; > +} > + > +static void qla_heart_beat(struct scsi_qla_host *vha) > +{ > + if (vha->vp_idx) > + return; > + > + if (vha->hw->flags.eeh_busy || qla2x00_chip_is_down(vha)) > + return; > + > + if (qla_do_hb(vha)) { > + set_bit(HEARTBEAT_CHK, &vha->dpc_flags); > + qla2xxx_wake_dpc(vha); > + } > +} > + > /************************************************************************** > * qla2x00_timer > * > @@ -7243,6 +7309,8 @@ qla2x00_timer(struct timer_list *t) > qla2xxx_wake_dpc(vha); > } > > + qla_heart_beat(vha); > + > qla2x00_restart_timer(vha, WATCH_INTERVAL); > } > > Other than that small nit, the patch itself is good. Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index def4d99f80e9..2f67ec1df3e6 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -3660,6 +3660,8 @@ struct qla_qpair { struct qla_tgt_counters tgt_counters; uint16_t cpuid; struct qla_fw_resources fwres ____cacheline_aligned; + u32 cmd_cnt; + u32 cmd_completion_cnt; }; /* Place holder for FW buffer parameters */ @@ -4616,6 +4618,7 @@ struct qla_hw_data { struct qla_hw_data_stat stat; pci_error_state_t pci_error_state; + u64 prev_cmd_cnt; }; struct active_regions { @@ -4743,6 +4746,7 @@ typedef struct scsi_qla_host { #define SET_ZIO_THRESHOLD_NEEDED 32 #define ISP_ABORT_TO_ROM 33 #define VPORT_DELETE 34 +#define HEARTBEAT_CHK 38 #define PROCESS_PUREX_IOCB 63 diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h index fae5cae6f0a8..70b7cda0a25a 100644 --- a/drivers/scsi/qla2xxx/qla_gbl.h +++ b/drivers/scsi/qla2xxx/qla_gbl.h @@ -551,6 +551,7 @@ extern int qla2xxx_read_remote_register(scsi_qla_host_t *, uint32_t, uint32_t *); extern int qla2xxx_write_remote_register(scsi_qla_host_t *, uint32_t, uint32_t); +void qla_no_op_mb(struct scsi_qla_host *vha); /* * Global Function Prototypes in qla_isr.c source file. 
diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index eb825318e3f5..f8f471157109 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -6870,10 +6870,14 @@ qla2x00_abort_isp_cleanup(scsi_qla_host_t *vha) ha->flags.fw_init_done = 0; ha->chip_reset++; ha->base_qpair->chip_reset = ha->chip_reset; + ha->base_qpair->cmd_cnt = ha->base_qpair->cmd_completion_cnt = 0; for (i = 0; i < ha->max_qpairs; i++) { - if (ha->queue_pair_map[i]) + if (ha->queue_pair_map[i]) { ha->queue_pair_map[i]->chip_reset = ha->base_qpair->chip_reset; + ha->queue_pair_map[i]->cmd_cnt = + ha->queue_pair_map[i]->cmd_completion_cnt = 0; + } } /* purge MBox commands */ diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c index 38b5bdde2405..d0ee843f6b04 100644 --- a/drivers/scsi/qla2xxx/qla_iocb.c +++ b/drivers/scsi/qla2xxx/qla_iocb.c @@ -1710,6 +1710,7 @@ qla24xx_start_scsi(srb_t *sp) } else req->ring_ptr++; + sp->qpair->cmd_cnt++; sp->flags |= SRB_DMA_VALID; /* Set chip new ring index. */ @@ -1912,6 +1913,7 @@ qla24xx_dif_start_scsi(srb_t *sp) } else req->ring_ptr++; + sp->qpair->cmd_cnt++; /* Set chip new ring index. */ wrt_reg_dword(req->req_q_in, req->ring_index); @@ -2068,6 +2070,7 @@ qla2xxx_start_scsi_mq(srb_t *sp) } else req->ring_ptr++; + sp->qpair->cmd_cnt++; sp->flags |= SRB_DMA_VALID; /* Set chip new ring index. */ @@ -2284,6 +2287,7 @@ qla2xxx_dif_start_scsi_mq(srb_t *sp) } else req->ring_ptr++; + sp->qpair->cmd_cnt++; /* Set chip new ring index. 
*/ wrt_reg_dword(req->req_q_in, req->ring_index); diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index 6e8f737a4af3..8a8e355f4a89 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -2322,6 +2322,8 @@ static void qla24xx_nvme_iocb_entry(scsi_qla_host_t *vha, struct req_que *req, if (unlikely(iocb->u.nvme.aen_op)) atomic_dec(&sp->vha->hw->nvme_active_aen_cnt); + else + sp->qpair->cmd_completion_cnt++; if (unlikely(comp_status != CS_COMPLETE)) logit = 1; @@ -2976,6 +2978,8 @@ qla2x00_status_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, void *pkt) return; } + sp->qpair->cmd_completion_cnt++; + /* Fast path completion. */ if (comp_status == CS_COMPLETE && scsi_status == 0) { qla2x00_process_completed_request(vha, req, handle); diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 0bcd8afdc0ff..9f3ad8aa649c 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -6939,3 +6939,30 @@ ql26xx_led_config(scsi_qla_host_t *vha, uint16_t options, uint16_t *led) return rval; } + +/** + * qla_no_op_mb(): This MB is used to check if FW is still alive and + * able to generate an interrupt. 
Otherwise, a timeout will trigger + * FW dump + reset + * @vha: host adapter pointer + * Return: None + */ +void qla_no_op_mb(struct scsi_qla_host *vha) +{ + mbx_cmd_t mc; + mbx_cmd_t *mcp = &mc; + int rval; + + memset(&mc, 0, sizeof(mc)); + mcp->mb[0] = 0; // noop cmd= 0 + mcp->out_mb = MBX_0; + mcp->in_mb = MBX_0; + mcp->tov = 5; + mcp->flags = 0; + rval = qla2x00_mailbox_command(vha, mcp); + + if (rval) { + ql_dbg(ql_dbg_async, vha, 0x7071, + "Failed %s %x\n", __func__, rval); + } +} diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index e119f8b24e33..3e5c70a1d969 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -536,6 +536,10 @@ static inline int qla2x00_start_nvme_mq(srb_t *sp) req->ring_ptr++; } + /* ignore nvme async cmd due to long timeout */ + if (!nvme->u.nvme.aen_op) + sp->qpair->cmd_cnt++; + /* Set chip new ring index. */ wrt_reg_dword(req->req_q_in, req->ring_index); diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 4eab564ea6a0..aa8581e07156 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -6969,6 +6969,17 @@ qla2x00_do_dpc(void *data) qla2x00_lip_reset(base_vha); } + if (test_bit(HEARTBEAT_CHK, &base_vha->dpc_flags)) { + /* + * if there is a mb in progress then that's + * enough of a check to see if fw is still ticking. 
+ */ + if (!ha->flags.mbox_busy && base_vha->flags.init_done) + qla_no_op_mb(base_vha); + + clear_bit(HEARTBEAT_CHK, &base_vha->dpc_flags); + } + ha->dpc_active = 0; end_loop: set_current_state(TASK_INTERRUPTIBLE); @@ -7025,6 +7036,61 @@ qla2x00_rst_aen(scsi_qla_host_t *vha) } } +static bool qla_do_hb(struct scsi_qla_host *vha) +{ + u64 cmd_cnt, prev_cmd_cnt; + bool do_hb = false; + struct qla_hw_data *ha = vha->hw; + int i; + + /* if cmds are still pending down in fw, then do hb */ + if (ha->base_qpair->cmd_cnt != ha->base_qpair->cmd_completion_cnt) { + do_hb = true; + goto skip; + } + + for (i = 0; i < ha->max_qpairs; i++) { + if (ha->queue_pair_map[i] && + ha->queue_pair_map[i]->cmd_cnt != + ha->queue_pair_map[i]->cmd_completion_cnt) { + do_hb = true; + break; + } + } + +skip: + prev_cmd_cnt = ha->prev_cmd_cnt; + cmd_cnt = ha->base_qpair->cmd_cnt; + for (i = 0; i < ha->max_qpairs; i++) { + if (ha->queue_pair_map[i]) + cmd_cnt += ha->queue_pair_map[i]->cmd_cnt; + } + ha->prev_cmd_cnt = cmd_cnt; + + if (!do_hb && ((cmd_cnt - prev_cmd_cnt) > 50)) + /* + * IOs are completing before periodic hb check. + * IOs seems to be running, do hb for sanity check. + */ + do_hb = true; + + return do_hb; +} + +static void qla_heart_beat(struct scsi_qla_host *vha) +{ + if (vha->vp_idx) + return; + + if (vha->hw->flags.eeh_busy || qla2x00_chip_is_down(vha)) + return; + + if (qla_do_hb(vha)) { + set_bit(HEARTBEAT_CHK, &vha->dpc_flags); + qla2xxx_wake_dpc(vha); + } +} + /************************************************************************** * qla2x00_timer * @@ -7243,6 +7309,8 @@ qla2x00_timer(struct timer_list *t) qla2xxx_wake_dpc(vha); } + qla_heart_beat(vha); + qla2x00_restart_timer(vha, WATCH_INTERVAL); }