@@ -1595,6 +1595,12 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
struct device *dev = &hdev->pdev->dev;
u32 status;
+ if (!test_bit(HCLGE_STATE_SERVICE_INITED, &hdev->state)) {
+ dev_err(dev,
+ "Can't recover - RAS error reported during dev init\n");
+ return PCI_ERS_RESULT_NONE;
+ }
+
status = hclge_read_dev(&hdev->hw, HCLGE_RAS_PF_OTHER_INT_STS_REG);
if (status & HCLGE_RAS_REG_NFE_MASK ||
@@ -1631,6 +1637,21 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
return PCI_ERS_RESULT_RECOVERED;
}
+static int hclge_clear_hw_msix_error(struct hclge_dev *hdev,
+ struct hclge_desc *desc, bool is_mpf,
+ u32 bd_num)
+{
+ if (is_mpf)
+ desc[0].opcode =
+ cpu_to_le16(HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT);
+ else
+ desc[0].opcode = cpu_to_le16(HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT);
+
+ desc[0].flag = cpu_to_le16(HCLGE_CMD_FLAG_NO_INTR | HCLGE_CMD_FLAG_IN);
+
+ return hclge_cmd_send(&hdev->hw, &desc[0], bd_num);
+}
+
/* hclge_query_8bd_info: query information about over_8bd_nfe_err
* @hdev: pointer to struct hclge_dev
* @vf_id: Index of the virtual function with error
@@ -1706,8 +1727,8 @@ static void hclge_handle_over_8bd_err(struct hclge_dev *hdev,
}
}
-int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
- unsigned long *reset_requests)
+static int hclge_handle_all_hw_msix_error(struct hclge_dev *hdev,
+ unsigned long *reset_requests)
{
struct hclge_mac_tnl_stats mac_tnl_stats;
struct device *dev = &hdev->pdev->dev;
@@ -1764,8 +1785,7 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
status, reset_requests);
/* clear all main PF MSIx errors */
- hclge_cmd_reuse_desc(&desc[0], false);
- ret = hclge_cmd_send(&hdev->hw, &desc[0], mpf_bd_num);
+ ret = hclge_clear_hw_msix_error(hdev, desc, true, mpf_bd_num);
if (ret) {
dev_err(dev, "clear all mpf msix int cmd failed (%d)\n",
ret);
@@ -1811,11 +1831,10 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
hclge_handle_over_8bd_err(hdev, reset_requests);
/* clear all PF MSIx errors */
- hclge_cmd_reuse_desc(&desc[0], false);
- ret = hclge_cmd_send(&hdev->hw, &desc[0], pf_bd_num);
+ ret = hclge_clear_hw_msix_error(hdev, desc, false, pf_bd_num);
if (ret) {
- dev_err(dev, "clear all pf msix int cmd failed (%d)\n",
- ret);
+ dev_err(dev, "clear all pf msix int cmd failed (%d)\n", ret);
+ goto msi_error;
}
/* query and clear mac tnl interruptions */
@@ -1847,3 +1866,79 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
out:
return ret;
}
+
+int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
+ unsigned long *reset_requests)
+{
+ struct device *dev = &hdev->pdev->dev;
+
+ if (!test_bit(HCLGE_STATE_SERVICE_INITED, &hdev->state)) {
+ dev_err(dev,
+ "Can't handle - MSIx error reported during dev init\n");
+ return 0;
+ }
+
+ return hclge_handle_all_hw_msix_error(hdev, reset_requests);
+}
+
+void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev)
+{
+#define HCLGE_DESC_NO_DATA_LEN 8
+
+ struct hclge_dev *hdev = ae_dev->priv;
+ struct device *dev = &hdev->pdev->dev;
+ u32 mpf_bd_num, pf_bd_num, bd_num;
+ struct hclge_desc desc_bd;
+ struct hclge_desc *desc;
+ u32 status;
+ int ret;
+
+ ae_dev->hw_err_reset_req = 0;
+ status = hclge_read_dev(&hdev->hw, HCLGE_RAS_PF_OTHER_INT_STS_REG);
+
+ /* query the number of bds for the MSIx int status */
+ hclge_cmd_setup_basic_desc(&desc_bd, HCLGE_QUERY_MSIX_INT_STS_BD_NUM,
+ true);
+ ret = hclge_cmd_send(&hdev->hw, &desc_bd, 1);
+ if (ret) {
+ dev_err(dev, "fail(%d) to query msix int status bd num\n",
+ ret);
+ return;
+ }
+
+ mpf_bd_num = le32_to_cpu(desc_bd.data[0]);
+ pf_bd_num = le32_to_cpu(desc_bd.data[1]);
+ bd_num = max_t(u32, mpf_bd_num, pf_bd_num);
+
+ desc = kcalloc(bd_num, sizeof(struct hclge_desc), GFP_KERNEL);
+ if (!desc)
+ return;
+
+ /* Clear HNS hw errors reported through msix */
+ memset(&desc[0].data[0], 0xFF, mpf_bd_num * sizeof(struct hclge_desc) -
+ HCLGE_DESC_NO_DATA_LEN);
+ ret = hclge_clear_hw_msix_error(hdev, desc, true, mpf_bd_num);
+ if (ret) {
+ dev_err(dev, "fail(%d) to clear mpf msix int during init\n",
+ ret);
+ goto msi_error;
+ }
+
+ memset(&desc[0].data[0], 0xFF, pf_bd_num * sizeof(struct hclge_desc) -
+ HCLGE_DESC_NO_DATA_LEN);
+ ret = hclge_clear_hw_msix_error(hdev, desc, false, pf_bd_num);
+ if (ret) {
+ dev_err(dev, "fail(%d) to clear pf msix int during init\n",
+ ret);
+ goto msi_error;
+ }
+
+ /* Handle Non-fatal HNS RAS errors */
+ if (status & HCLGE_RAS_REG_NFE_MASK) {
+ dev_warn(dev, "HNS hw error(RAS) identified during init\n");
+ hclge_handle_all_ras_errors(hdev);
+ }
+
+msi_error:
+ kfree(desc);
+}
@@ -123,6 +123,7 @@ struct hclge_hw_error {
int hclge_config_mac_tnl_int(struct hclge_dev *hdev, bool en);
int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state);
int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en);
+void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev);
pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev);
int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
unsigned long *reset_requests);
@@ -8611,6 +8611,9 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
hclge_clear_all_event_cause(hdev);
+ /* Log and clear the hw errors those already occurred */
+ hclge_handle_all_hns_hw_errors(ae_dev);
+
/* Enable MISC vector(vector0) */
hclge_enable_vector(&hdev->misc_vector, true);