@@ -1741,6 +1741,7 @@ int mlx4_en_start_port(struct net_device
 				mlx4_en_deactivate_cq(priv, cq);
 				goto tx_err;
 			}
+			clear_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &tx_ring->state);
 			if (t != TX_XDP) {
 				tx_ring->tx_queue = netdev_get_tx_queue(dev, i);
 				tx_ring->recycle_ring = NULL;
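
The hunk above is what re-arms error reporting: mlx4_en_start_port() clears the per-ring RECOVERING bit as each TX ring is activated, so once the scheduled restart has brought the port back up, the next error CQE on that ring is reported and handled again. Below is a minimal userspace sketch of that latch/re-arm cycle, with C11 atomics standing in for the kernel's test_and_set_bit()/clear_bit(); ring_state and the helper names are illustrative, not driver identifiers.

#include <stdatomic.h>
#include <stdio.h>

/* Bit index into the state word, mirroring MLX4_EN_TX_RING_STATE_RECOVERING. */
#define RING_RECOVERING 0

static atomic_ulong ring_state;	/* stands in for mlx4_en_tx_ring::state */

/* Rough equivalent of test_and_set_bit(): set the bit, return its old value. */
static int latch_recovering(void)
{
	return (atomic_fetch_or(&ring_state, 1UL << RING_RECOVERING)
		>> RING_RECOVERING) & 1;
}

/* Rough equivalent of the clear_bit() done per ring in mlx4_en_start_port(). */
static void rearm_on_port_start(void)
{
	atomic_fetch_and(&ring_state, ~(1UL << RING_RECOVERING));
}

int main(void)
{
	printf("error #1 reported: %s\n", latch_recovering() ? "no" : "yes");
	printf("error #2 reported: %s\n", latch_recovering() ? "no" : "yes");
	rearm_on_port_start();	/* simulated port restart */
	printf("error #3 reported: %s\n", latch_recovering() ? "no" : "yes");
	return 0;
}

The first error latches the bit, the second is suppressed, and an error after the simulated restart is reported again.
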
@@ -385,6 +385,35 @@ int mlx4_en_free_tx_buf(struct net_devic
 	return cnt;
 }
 
+static void mlx4_en_handle_err_cqe(struct mlx4_en_priv *priv, struct mlx4_err_cqe *err_cqe,
+				   u16 cqe_index, struct mlx4_en_tx_ring *ring)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_info *tx_info;
+	struct mlx4_en_tx_desc *tx_desc;
+	u16 wqe_index;
+	int desc_size;
+
+	en_err(priv, "CQE error - cqn 0x%x, ci 0x%x, vendor syndrome: 0x%x syndrome: 0x%x\n",
+	       ring->sp_cqn, cqe_index, err_cqe->vendor_err_syndrome, err_cqe->syndrome);
+	print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, err_cqe, sizeof(*err_cqe),
+		       false);
+
+	wqe_index = be16_to_cpu(err_cqe->wqe_index) & ring->size_mask;
+	tx_info = &ring->tx_info[wqe_index];
+	desc_size = tx_info->nr_txbb << LOG_TXBB_SIZE;
+	en_err(priv, "Related WQE - qpn 0x%x, wqe index 0x%x, wqe size 0x%x\n", ring->qpn,
+	       wqe_index, desc_size);
+	tx_desc = ring->buf + (wqe_index << LOG_TXBB_SIZE);
+	print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, tx_desc, desc_size, false);
+
+	if (test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state))
+		return;
+
+	en_err(priv, "Scheduling port restart\n");
+	queue_work(mdev->workqueue, &priv->restart_task);
+}
+
 bool mlx4_en_process_tx_cq(struct net_device *dev,
 			   struct mlx4_en_cq *cq, int napi_budget)
 {
@@ -431,13 +460,10 @@ bool mlx4_en_process_tx_cq(struct net_de
 		dma_rmb();
 
 		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
-			     MLX4_CQE_OPCODE_ERROR)) {
-			struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe;
-
-			en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n",
-			       cqe_err->vendor_err_syndrome,
-			       cqe_err->syndrome);
-		}
+			     MLX4_CQE_OPCODE_ERROR))
+			if (!test_and_set_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &ring->state))
+				mlx4_en_handle_err_cqe(priv, (struct mlx4_err_cqe *)cqe, index,
+						       ring);
 
 		/* Skip over last polled CQE */
 		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;
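
With this change, only the first error CQE on a ring produces the verbose dump: the test_and_set_bit() guard lets one CQE per ring reach mlx4_en_handle_err_cqe(), and the handler's device-level MLX4_EN_STATE_FLAG_RESTARTING bit ensures the restart work is queued only once even if several rings fail at about the same time. A sketch of this two-level latch, under the same userspace assumptions as above (the struct names and test_and_set_bit0() are stand-ins, not driver code):

#include <stdatomic.h>
#include <stdio.h>

struct tx_ring { atomic_ulong state; };	/* bit 0 ~ MLX4_EN_TX_RING_STATE_RECOVERING */
struct en_priv { atomic_ulong state; };	/* bit 0 ~ MLX4_EN_STATE_FLAG_RESTARTING */

/* Rough test_and_set_bit() for bit 0: returns the bit's previous value. */
static int test_and_set_bit0(atomic_ulong *word)
{
	return atomic_fetch_or(word, 1UL) & 1UL;
}

static void handle_err_cqe(struct en_priv *priv, int ring_id)
{
	printf("ring %d: dump error CQE and related WQE\n", ring_id);
	if (test_and_set_bit0(&priv->state))
		return;	/* a restart is already pending */
	printf("scheduling port restart\n");	/* queue_work() in the driver */
}

int main(void)
{
	struct en_priv priv = { 0 };
	struct tx_ring ring[2] = { { 0 }, { 0 } };

	/* Two error CQEs per ring: each ring dumps once and the restart
	 * is scheduled once for the whole device. */
	for (int pass = 0; pass < 2; pass++)
		for (int i = 0; i < 2; i++)
			if (!test_and_set_bit0(&ring[i].state))
				handle_err_cqe(&priv, i);
	return 0;
}

Feeding two error CQEs to each of two rings yields one dump per ring but a single "scheduling port restart" line.
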
@@ -271,6 +271,10 @@ struct mlx4_en_page_cache {
 	} buf[MLX4_EN_CACHE_SIZE];
 };
 
+enum {
+	MLX4_EN_TX_RING_STATE_RECOVERING,
+};
+
 struct mlx4_en_priv;
 
 struct mlx4_en_tx_ring {
@@ -317,6 +321,7 @@ struct mlx4_en_tx_ring {
 	 * Only queue_stopped might be used if BQL is not properly working.
 	 */
 	unsigned long queue_stopped;
+	unsigned long state;
 	struct mlx4_hwq_resources sp_wqres;
 	struct mlx4_qp sp_qp;
 	struct mlx4_qp_context sp_context;
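
A note on the new field: state is an unsigned long because set_bit(), clear_bit() and test_and_set_bit() operate on unsigned long bitmaps, and the anonymous enum supplies bit indexes rather than masks. A small standalone illustration of that convention (the second enumerator is hypothetical, shown only to indicate how further ring-state flags would stack):

#include <stdio.h>

/* Enumerators are bit indexes into an unsigned long state word, as
 * consumed by set_bit()/clear_bit()/test_and_set_bit(). */
enum {
	TX_RING_STATE_RECOVERING,	/* = 0, like MLX4_EN_TX_RING_STATE_RECOVERING */
	TX_RING_STATE_NEXT_FLAG,	/* = 1, hypothetical future flag */
};

int main(void)
{
	unsigned long state = 0;

	/* What set_bit() does, minus the atomicity. */
	state |= 1UL << TX_RING_STATE_RECOVERING;
	printf("state = %#lx, recovering = %lu\n",
	       state, (state >> TX_RING_STATE_RECOVERING) & 1);
	return 0;
}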