Message ID | 20211224070712.17905-6-njavali@marvell.com |
---|---|
State | Superseded |
Headers | show |
Series | qla2xxx misc bug fixes and features | expand |
> On Dec 23, 2021, at 11:07 PM, Nilesh Javali <njavali@marvell.com> wrote: > > From: Quinn Tran <qutran@marvell.com> > > Fix premature hw access after pci error. > After a recoverable PCI error has been detected and recovered, the qla driver > needs to check whether the error condition still persists and/or wait until > the OS gives the resume signal. > > Sep 8 22:26:03 localhost kernel: WARNING: CPU: 9 PID: 124606 at qla_tmpl.c:440 > qla27xx_fwdt_entry_t266+0x55/0x60 [qla2xxx] > Sep 8 22:26:03 localhost kernel: RIP: 0010:qla27xx_fwdt_entry_t266+0x55/0x60 > [qla2xxx] > Sep 8 22:26:03 localhost kernel: Call Trace: > Sep 8 22:26:03 localhost kernel: ? qla27xx_walk_template+0xb1/0x1b0 [qla2xxx] > Sep 8 22:26:03 localhost kernel: ? qla27xx_execute_fwdt_template+0x12a/0x160 > [qla2xxx] > Sep 8 22:26:03 localhost kernel: ? qla27xx_fwdump+0xa0/0x1c0 [qla2xxx] > Sep 8 22:26:03 localhost kernel: ? qla2xxx_pci_mmio_enabled+0xfb/0x120 > [qla2xxx] > Sep 8 22:26:03 localhost kernel: ? report_mmio_enabled+0x44/0x80 > Sep 8 22:26:03 localhost kernel: ? report_slot_reset+0x80/0x80 > Sep 8 22:26:03 localhost kernel: ? pci_walk_bus+0x70/0x90 > Sep 8 22:26:03 localhost kernel: ? aer_dev_correctable_show+0xc0/0xc0 > Sep 8 22:26:03 localhost kernel: ? pcie_do_recovery+0x1bb/0x240 > Sep 8 22:26:03 localhost kernel: ? aer_recover_work_func+0xaa/0xd0 > Sep 8 22:26:03 localhost kernel: ? process_one_work+0x1a7/0x360 > .. > Sep 8 22:26:03 localhost kernel: qla2xxx [0000:42:00.2]-8041:22: detected PCI > disconnect. > Sep 8 22:26:03 localhost kernel: qla2xxx [0000:42:00.2]-107ff:22: > qla27xx_fwdt_entry_t262: dump ram MB failed. Area 5h start 198013h end 198013h > Sep 8 22:26:03 localhost kernel: qla2xxx [0000:42:00.2]-107ff:22: Unable to > capture FW dump > Sep 8 22:26:03 localhost kernel: qla2xxx [0000:42:00.2]-1015:22: cmd=0x0, > waited 5221 msecs > Sep 8 22:26:03 localhost kernel: qla2xxx [0000:42:00.2]-680d:22: mmio > enabled returning. 
> Sep 8 22:26:03 localhost kernel: qla2xxx [0000:42:00.2]-d04c:22: MBX > Command timeout for cmd 0, iocontrol=ffffffff jiffies=10140f2e5 > mb[0-3]=[0xffff 0xffff 0xffff 0xffff] > > Cc: stable@vger.kernel.org > Signed-off-by: Quinn Tran <qutran@marvell.com> > Signed-off-by: Nilesh Javali <njavali@marvell.com> > --- > drivers/scsi/qla2xxx/qla_os.c | 10 +++++++++- > drivers/scsi/qla2xxx/qla_tmpl.c | 9 +++++++-- > 2 files changed, 16 insertions(+), 3 deletions(-) > > diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c > index 0a7b00d165c7..c4b4b4496399 100644 > --- a/drivers/scsi/qla2xxx/qla_os.c > +++ b/drivers/scsi/qla2xxx/qla_os.c > @@ -7639,7 +7639,7 @@ qla2xxx_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) > > switch (state) { > case pci_channel_io_normal: > - ha->flags.eeh_busy = 0; > + qla_pci_set_eeh_busy(vha); > if (ql2xmqsupport || ql2xnvmeenable) { > set_bit(QPAIR_ONLINE_CHECK_NEEDED, &vha->dpc_flags); > qla2xxx_wake_dpc(vha); > @@ -7680,9 +7680,16 @@ qla2xxx_pci_mmio_enabled(struct pci_dev *pdev) > "mmio enabled\n"); > > ha->pci_error_state = QLA_PCI_MMIO_ENABLED; > + > if (IS_QLA82XX(ha)) > return PCI_ERS_RESULT_RECOVERED; > > + if (qla2x00_isp_reg_stat(ha)) { > + ql_log(ql_log_info, base_vha, 0x803f, > + "During mmio enabled, PCI/Register disconnect still detected.\n"); > + goto out; > + } > + > spin_lock_irqsave(&ha->hardware_lock, flags); > if (IS_QLA2100(ha) || IS_QLA2200(ha)){ > stat = rd_reg_word(&reg->hccr); > @@ -7704,6 +7711,7 @@ qla2xxx_pci_mmio_enabled(struct pci_dev *pdev) > "RISC paused -- mmio_enabled, Dumping firmware.\n"); > qla2xxx_dump_fw(base_vha); > } > +out: > /* set PCI_ERS_RESULT_NEED_RESET to trigger call to qla2xxx_pci_slot_reset */ > ql_dbg(ql_dbg_aer, base_vha, 0x600d, > "mmio enabled returning.\n"); > diff --git a/drivers/scsi/qla2xxx/qla_tmpl.c b/drivers/scsi/qla2xxx/qla_tmpl.c > index 26c13a953b97..b0a74b036cf4 100644 > --- a/drivers/scsi/qla2xxx/qla_tmpl.c > +++ 
b/drivers/scsi/qla2xxx/qla_tmpl.c > @@ -435,8 +435,13 @@ qla27xx_fwdt_entry_t266(struct scsi_qla_host *vha, > { > ql_dbg(ql_dbg_misc, vha, 0xd20a, > "%s: reset risc [%lx]\n", __func__, *len); > - if (buf) > - WARN_ON_ONCE(qla24xx_soft_reset(vha->hw) != QLA_SUCCESS); > + if (buf) { > + if (qla24xx_soft_reset(vha->hw) != QLA_SUCCESS) { > + ql_dbg(ql_dbg_async, vha, 0x5001, > + "%s: unable to soft reset\n", __func__); > + return INVALID_ENTRY; > + } > + } > > return qla27xx_next_entry(ent); > } > -- > 2.23.1 > Looks Good. Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com> -- Himanshu Madhani Oracle Linux Engineering
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 0a7b00d165c7..c4b4b4496399 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -7639,7 +7639,7 @@ qla2xxx_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) switch (state) { case pci_channel_io_normal: - ha->flags.eeh_busy = 0; + qla_pci_set_eeh_busy(vha); if (ql2xmqsupport || ql2xnvmeenable) { set_bit(QPAIR_ONLINE_CHECK_NEEDED, &vha->dpc_flags); qla2xxx_wake_dpc(vha); @@ -7680,9 +7680,16 @@ qla2xxx_pci_mmio_enabled(struct pci_dev *pdev) "mmio enabled\n"); ha->pci_error_state = QLA_PCI_MMIO_ENABLED; + if (IS_QLA82XX(ha)) return PCI_ERS_RESULT_RECOVERED; + if (qla2x00_isp_reg_stat(ha)) { + ql_log(ql_log_info, base_vha, 0x803f, + "During mmio enabled, PCI/Register disconnect still detected.\n"); + goto out; + } + spin_lock_irqsave(&ha->hardware_lock, flags); if (IS_QLA2100(ha) || IS_QLA2200(ha)){ stat = rd_reg_word(&reg->hccr); @@ -7704,6 +7711,7 @@ qla2xxx_pci_mmio_enabled(struct pci_dev *pdev) "RISC paused -- mmio_enabled, Dumping firmware.\n"); qla2xxx_dump_fw(base_vha); } +out: /* set PCI_ERS_RESULT_NEED_RESET to trigger call to qla2xxx_pci_slot_reset */ ql_dbg(ql_dbg_aer, base_vha, 0x600d, "mmio enabled returning.\n"); diff --git a/drivers/scsi/qla2xxx/qla_tmpl.c b/drivers/scsi/qla2xxx/qla_tmpl.c index 26c13a953b97..b0a74b036cf4 100644 --- a/drivers/scsi/qla2xxx/qla_tmpl.c +++ b/drivers/scsi/qla2xxx/qla_tmpl.c @@ -435,8 +435,13 @@ qla27xx_fwdt_entry_t266(struct scsi_qla_host *vha, { ql_dbg(ql_dbg_misc, vha, 0xd20a, "%s: reset risc [%lx]\n", __func__, *len); - if (buf) - WARN_ON_ONCE(qla24xx_soft_reset(vha->hw) != QLA_SUCCESS); + if (buf) { + if (qla24xx_soft_reset(vha->hw) != QLA_SUCCESS) { + ql_dbg(ql_dbg_async, vha, 0x5001, + "%s: unable to soft reset\n", __func__); + return INVALID_ENTRY; + } + } return qla27xx_next_entry(ent); }