[net-next,15/16] gve: DQO: Add TX path

Message ID	20210624180632.3659809-16-bcf@google.com
State	New
Headers	show Return-Path: <netdev-owner@kernel.org> Date: Thu, 24 Jun 2021 11:06:31 -0700 In-Reply-To: <20210624180632.3659809-1-bcf@google.com> Message-Id: <20210624180632.3659809-16-bcf@google.com> Mime-Version: 1.0 References: <20210624180632.3659809-1-bcf@google.com> Subject: [PATCH net-next 15/16] gve: DQO: Add TX path From: Bailey Forrest <bcf@google.com> To: Bailey Forrest <bcf@google.com>, "David S . Miller" <davem@davemloft.net> Cc: netdev@vger.kernel.org, Willem de Bruijn <willemb@google.com>, Catherine Sullivan <csully@google.com> Content-Type: text/plain; charset="UTF-8" Precedence: bulk
Series	[net-next,01/16] gve: Update GVE documentation to describe DQO \| expand [net-next,01/16] gve: Update GVE documentation to describe DQO [net-next,03/16] gve: gve_rx_copy: Move padding to an argument [net-next,05/16] gve: Introduce a new model for device options [net-next,07/16] gve: adminq: DQO specific device descriptor logic [net-next,09/16] gve: Add dqo descriptors [net-next,11/16] gve: Update adminq commands to support DQO queues [net-next,13/16] gve: DQO: Add ring allocation and initialization [net-next,15/16] gve: DQO: Add TX path

diff --git a/drivers/net/ethernet/google/gve/gve_dqo.h b/drivers/net/ethernet/google/gve/gve_dqo.h index 3b300223ea15..836042364124 100644 --- a/drivers/net/ethernet/google/gve/gve_dqo.h +++ b/drivers/net/ethernet/google/gve/gve_dqo.h @@ -19,6 +19,18 @@ #define GVE_TX_IRQ_RATELIMIT_US_DQO 50 #define GVE_RX_IRQ_RATELIMIT_US_DQO 20 +/* Timeout in seconds to wait for a reinjection completion after receiving + * its corresponding miss completion. + */ +#define GVE_REINJECT_COMPL_TIMEOUT 1 + +/* Timeout in seconds to deallocate the completion tag for a packet that was + * prematurely freed for not receiving a valid completion. This should be large + * enough to rule out the possibility of receiving the corresponding valid + * completion after this interval. + */ +#define GVE_DEALLOCATE_COMPL_TIMEOUT 60 + netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev); bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean); int gve_rx_poll_dqo(struct gve_notify_block *block, int budget); diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c index bde8f90ac8bd..a4906b9df540 100644 --- a/drivers/net/ethernet/google/gve/gve_tx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c @@ -12,6 +12,67 @@ #include <linux/slab.h> #include <linux/skbuff.h> +/* Returns true if a gve_tx_pending_packet_dqo object is available. */ +static bool gve_has_pending_packet(struct gve_tx_ring *tx) +{ + /* Check TX path's list. */ + if (tx->dqo_tx.free_pending_packets != -1) + return true; + + /* Check completion handler's list. */ + if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1) + return true; + + return false; +} + +static struct gve_tx_pending_packet_dqo * +gve_alloc_pending_packet(struct gve_tx_ring *tx) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + s16 index; + + index = tx->dqo_tx.free_pending_packets; + + /* No pending_packets available, try to steal the list from the + * completion handler. + */ + if (unlikely(index == -1)) { + tx->dqo_tx.free_pending_packets = + atomic_xchg(&tx->dqo_compl.free_pending_packets, -1); + index = tx->dqo_tx.free_pending_packets; + + if (unlikely(index == -1)) + return NULL; + } + + pending_packet = &tx->dqo.pending_packets[index]; + + /* Remove pending_packet from free list */ + tx->dqo_tx.free_pending_packets = pending_packet->next; + pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; + + return pending_packet; +} + +static void +gve_free_pending_packet(struct gve_tx_ring *tx, + struct gve_tx_pending_packet_dqo *pending_packet) +{ + s16 index = pending_packet - tx->dqo.pending_packets; + + pending_packet->state = GVE_PACKET_STATE_UNALLOCATED; + while (true) { + s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets); + + pending_packet->next = old_head; + if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets, + old_head, index) == old_head) { + break; + } + } +} + /* gve_tx_free_desc - Cleans up all pending tx requests and buffers. */ static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx) @@ -199,18 +260,772 @@ void gve_tx_free_rings_dqo(struct gve_priv *priv) } } +/* Returns the number of slots available in the ring */ +static inline u32 num_avail_tx_slots(const struct gve_tx_ring *tx) +{ + u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask; + + return tx->mask - num_used; +} + +/* Stops the queue if available descriptors is less than 'count'. + * Return: 0 if stop is not required. + */ +static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count) +{ + if (likely(gve_has_pending_packet(tx) && + num_avail_tx_slots(tx) >= count)) + return 0; + + /* Update cached TX head pointer */ + tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head); + + if (likely(gve_has_pending_packet(tx) && + num_avail_tx_slots(tx) >= count)) + return 0; + + /* No space, so stop the queue */ + tx->stop_queue++; + netif_tx_stop_queue(tx->netdev_txq); + + /* Sync with restarting queue in `gve_tx_poll_dqo()` */ + mb(); + + /* After stopping queue, check if we can transmit again in order to + * avoid TOCTOU bug. + */ + tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head); + + if (likely(!gve_has_pending_packet(tx) || + num_avail_tx_slots(tx) < count)) + return -EBUSY; + + netif_tx_start_queue(tx->netdev_txq); + tx->wake_queue++; + return 0; +} + +static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb, + struct gve_tx_metadata_dqo *metadata) +{ + memset(metadata, 0, sizeof(*metadata)); + metadata->version = GVE_TX_METADATA_VERSION_DQO; + + if (skb->l4_hash) { + u16 path_hash = skb->hash ^ (skb->hash >> 16); + + path_hash &= (1 << 15) - 1; + if (unlikely(path_hash == 0)) + path_hash = ~path_hash; + + metadata->path_hash = path_hash; + } +} + +static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx, + struct sk_buff *skb, u32 len, u64 addr, + s16 compl_tag, bool eop, bool is_gso) +{ + const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL; + + while (len > 0) { + struct gve_tx_pkt_desc_dqo *desc = + &tx->dqo.tx_ring[*desc_idx].pkt; + u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO); + bool cur_eop = eop && cur_len == len; + + *desc = (struct gve_tx_pkt_desc_dqo){ + .buf_addr = cpu_to_le64(addr), + .dtype = GVE_TX_PKT_DESC_DTYPE_DQO, + .end_of_packet = cur_eop, + .checksum_offload_enable = checksum_offload_en, + .compl_tag = cpu_to_le16(compl_tag), + .buf_size = cur_len, + }; + + addr += cur_len; + len -= cur_len; + *desc_idx = (*desc_idx + 1) & tx->mask; + } +} + +/* Validates and prepares `skb` for TSO. + * + * Returns header length, or < 0 if invalid. + */ +static int gve_prep_tso(struct sk_buff *skb) +{ + struct tcphdr *tcp; + int header_len; + u32 paylen; + int err; + + /* Note: HW requires MSS (gso_size) to be <= 9728 and the total length + * of the TSO to be <= 262143. + * + * However, we don't validate these because: + * - Hypervisor enforces a limit of 9K MTU + * - Kernel will not produce a TSO larger than 64k + */ + + if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO)) + return -1; + + /* Needed because we will modify header. */ + err = skb_cow_head(skb, 0); + if (err < 0) + return err; + + tcp = tcp_hdr(skb); + + /* Remove payload length from checksum. */ + paylen = skb->len - skb_transport_offset(skb); + + switch (skb_shinfo(skb)->gso_type) { + case SKB_GSO_TCPV4: + case SKB_GSO_TCPV6: + csum_replace_by_diff(&tcp->check, + (__force __wsum)htonl(paylen)); + + /* Compute length of segmentation header. */ + header_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + break; + default: + return -EINVAL; + } + + if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO)) + return -EINVAL; + + return header_len; +} + +static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, + const struct sk_buff *skb, + const struct gve_tx_metadata_dqo *metadata, + int header_len) +{ + *desc = (struct gve_tx_tso_context_desc_dqo){ + .header_len = header_len, + .cmd_dtype = { + .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, + .tso = 1, + }, + .flex0 = metadata->bytes[0], + .flex5 = metadata->bytes[5], + .flex6 = metadata->bytes[6], + .flex7 = metadata->bytes[7], + .flex8 = metadata->bytes[8], + .flex9 = metadata->bytes[9], + .flex10 = metadata->bytes[10], + .flex11 = metadata->bytes[11], + }; + desc->tso_total_len = skb->len - header_len; + desc->mss = skb_shinfo(skb)->gso_size; +} + +static void +gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, + const struct gve_tx_metadata_dqo *metadata) +{ + *desc = (struct gve_tx_general_context_desc_dqo){ + .flex0 = metadata->bytes[0], + .flex1 = metadata->bytes[1], + .flex2 = metadata->bytes[2], + .flex3 = metadata->bytes[3], + .flex4 = metadata->bytes[4], + .flex5 = metadata->bytes[5], + .flex6 = metadata->bytes[6], + .flex7 = metadata->bytes[7], + .flex8 = metadata->bytes[8], + .flex9 = metadata->bytes[9], + .flex10 = metadata->bytes[10], + .flex11 = metadata->bytes[11], + .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, + }; +} + +/* Returns 0 on success, or < 0 on error. + * + * Before this function is called, the caller must ensure + * gve_has_pending_packet(tx) returns true. + */ +static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx, + struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const bool is_gso = skb_is_gso(skb); + u32 desc_idx = tx->dqo_tx.tail; + + struct gve_tx_pending_packet_dqo *pending_packet; + struct gve_tx_metadata_dqo metadata; + s16 completion_tag; + int i; + + pending_packet = gve_alloc_pending_packet(tx); + pending_packet->skb = skb; + pending_packet->num_bufs = 0; + completion_tag = pending_packet - tx->dqo.pending_packets; + + gve_extract_tx_metadata_dqo(skb, &metadata); + if (is_gso) { + int header_len = gve_prep_tso(skb); + + if (unlikely(header_len < 0)) + goto err; + + gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx, + skb, &metadata, header_len); + desc_idx = (desc_idx + 1) & tx->mask; + } + + gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx, + &metadata); + desc_idx = (desc_idx + 1) & tx->mask; + + /* Note: HW requires that the size of a non-TSO packet be within the + * range of [17, 9728]. + * + * We don't double check because + * - We limited `netdev->min_mtu` to ETH_MIN_MTU. + * - Hypervisor won't allow MTU larger than 9216. + */ + + /* Map the linear portion of skb */ + { + struct gve_tx_dma_buf *buf = + &pending_packet->bufs[pending_packet->num_bufs]; + u32 len = skb_headlen(skb); + dma_addr_t addr; + + addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dev, addr))) + goto err; + + dma_unmap_len_set(buf, len, len); + dma_unmap_addr_set(buf, dma, addr); + ++pending_packet->num_bufs; + + gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr, + completion_tag, + /*eop=*/shinfo->nr_frags == 0, is_gso); + } + + for (i = 0; i < shinfo->nr_frags; i++) { + struct gve_tx_dma_buf *buf = + &pending_packet->bufs[pending_packet->num_bufs]; + const skb_frag_t *frag = &shinfo->frags[i]; + bool is_eop = i == (shinfo->nr_frags - 1); + u32 len = skb_frag_size(frag); + dma_addr_t addr; + + addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dev, addr))) + goto err; + + dma_unmap_len_set(buf, len, len); + dma_unmap_addr_set(buf, dma, addr); + ++pending_packet->num_bufs; + + gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr, + completion_tag, is_eop, is_gso); + } + + /* Commit the changes to our state */ + tx->dqo_tx.tail = desc_idx; + + /* Request a descriptor completion on the last descriptor of the + * packet if we are allowed to by the HW enforced interval. + */ + { + u32 last_desc_idx = (desc_idx - 1) & tx->mask; + u32 last_report_event_interval = + (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask; + + if (unlikely(last_report_event_interval >= + GVE_TX_MIN_RE_INTERVAL)) { + tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true; + tx->dqo_tx.last_re_idx = last_desc_idx; + } + } + + return 0; + +err: + for (i = 0; i < pending_packet->num_bufs; i++) { + struct gve_tx_dma_buf *buf = &pending_packet->bufs[i]; + + if (i == 0) { + dma_unmap_single(tx->dev, dma_unmap_addr(buf, dma), + dma_unmap_len(buf, len), + DMA_TO_DEVICE); + } else { + dma_unmap_page(tx->dev, dma_unmap_addr(buf, dma), + dma_unmap_len(buf, len), DMA_TO_DEVICE); + } + } + + pending_packet->skb = NULL; + pending_packet->num_bufs = 0; + gve_free_pending_packet(tx, pending_packet); + + return -1; +} + +static int gve_num_descs_per_buf(size_t size) +{ + return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO); +} + +static int gve_num_buffer_descs_needed(const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + int num_descs; + int i; + + num_descs = gve_num_descs_per_buf(skb_headlen(skb)); + + for (i = 0; i < shinfo->nr_frags; i++) { + unsigned int frag_size = skb_frag_size(&shinfo->frags[i]); + + num_descs += gve_num_descs_per_buf(frag_size); + } + + return num_descs; +} + +/* Returns true if HW is capable of sending TSO represented by `skb`. + * + * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers. + * - The header is counted as one buffer for every single segment. + * - A buffer which is split between two segments is counted for both. + * - If a buffer contains both header and payload, it is counted as two buffers. + */ +static bool gve_can_send_tso(const struct sk_buff *skb) +{ + const int header_len = skb_checksum_start_offset(skb) + tcp_hdrlen(skb); + const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1; + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const int gso_size = shinfo->gso_size; + int cur_seg_num_bufs; + int cur_seg_size; + int i; + + cur_seg_size = skb_headlen(skb) - header_len; + cur_seg_num_bufs = cur_seg_size > 0; + + for (i = 0; i < shinfo->nr_frags; i++) { + if (cur_seg_size >= gso_size) { + cur_seg_size %= gso_size; + cur_seg_num_bufs = cur_seg_size > 0; + } + + if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg)) + return false; + + cur_seg_size += skb_frag_size(&shinfo->frags[i]); + } + + return true; +} + +/* Attempt to transmit specified SKB. + * + * Returns 0 if the SKB was transmitted or dropped. + * Returns -1 if there is not currently enough space to transmit the SKB. + */ +static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx, + struct sk_buff *skb) +{ + int num_buffer_descs; + int total_num_descs; + + if (skb_is_gso(skb)) { + /* If TSO doesn't meet HW requirements, attempt to linearize the + * packet. + */ + if (unlikely(!gve_can_send_tso(skb) && + skb_linearize(skb) < 0)) { + net_err_ratelimited("%s: Failed to transmit TSO packet\n", + priv->dev->name); + goto drop; + } + + num_buffer_descs = gve_num_buffer_descs_needed(skb); + } else { + num_buffer_descs = gve_num_buffer_descs_needed(skb); + + if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) { + if (unlikely(skb_linearize(skb) < 0)) + goto drop; + + num_buffer_descs = 1; + } + } + + /* Metadata + (optional TSO) + data descriptors. */ + total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs; + if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs + + GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) { + return -1; + } + + if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0)) + goto drop; + + netdev_tx_sent_queue(tx->netdev_txq, skb->len); + skb_tx_timestamp(skb); + return 0; + +drop: + tx->dropped_pkt++; + dev_kfree_skb_any(skb); + return 0; +} + +/* Transmit a given skb and ring the doorbell. */ netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev) { + struct gve_priv *priv = netdev_priv(dev); + struct gve_tx_ring *tx; + + tx = &priv->tx[skb_get_queue_mapping(skb)]; + if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) { + /* We need to ring the txq doorbell -- we have stopped the Tx + * queue for want of resources, but prior calls to gve_tx() + * may have added descriptors without ringing the doorbell. + */ + gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); + return NETDEV_TX_BUSY; + } + + if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more()) + return NETDEV_TX_OK; + + gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); return NETDEV_TX_OK; } +static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list, + struct gve_tx_pending_packet_dqo *pending_packet) +{ + s16 old_tail, index; + + index = pending_packet - tx->dqo.pending_packets; + old_tail = list->tail; + list->tail = index; + if (old_tail == -1) + list->head = index; + else + tx->dqo.pending_packets[old_tail].next = index; + + pending_packet->next = -1; + pending_packet->prev = old_tail; +} + +static void remove_from_list(struct gve_tx_ring *tx, + struct gve_index_list *list, + struct gve_tx_pending_packet_dqo *pending_packet) +{ + s16 index, prev_index, next_index; + + index = pending_packet - tx->dqo.pending_packets; + prev_index = pending_packet->prev; + next_index = pending_packet->next; + + if (prev_index == -1) { + /* Node is head */ + list->head = next_index; + } else { + tx->dqo.pending_packets[prev_index].next = next_index; + } + if (next_index == -1) { + /* Node is tail */ + list->tail = prev_index; + } else { + tx->dqo.pending_packets[next_index].prev = prev_index; + } +} + +static void gve_unmap_packet(struct device *dev, + struct gve_tx_pending_packet_dqo *pending_packet) +{ + struct gve_tx_dma_buf *buf; + int i; + + /* SKB linear portion is guaranteed to be mapped */ + buf = &pending_packet->bufs[0]; + dma_unmap_single(dev, dma_unmap_addr(buf, dma), + dma_unmap_len(buf, len), DMA_TO_DEVICE); + for (i = 1; i < pending_packet->num_bufs; i++) { + buf = &pending_packet->bufs[i]; + dma_unmap_page(dev, dma_unmap_addr(buf, dma), + dma_unmap_len(buf, len), DMA_TO_DEVICE); + } + pending_packet->num_bufs = 0; +} + +/* Completion types and expected behavior: + * No Miss compl + Packet compl = Packet completed normally. + * Miss compl + Re-inject compl = Packet completed normally. + * No Miss compl + Re-inject compl = Skipped i.e. packet not completed. + * Miss compl + Packet compl = Skipped i.e. packet not completed. + */ +static void gve_handle_packet_completion(struct gve_priv *priv, + struct gve_tx_ring *tx, bool is_napi, + u16 compl_tag, u64 *bytes, u64 *pkts, + bool is_reinjection) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + + if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { + net_err_ratelimited("%s: Invalid TX completion tag: %d\n", + priv->dev->name, (int)compl_tag); + return; + } + + pending_packet = &tx->dqo.pending_packets[compl_tag]; + + if (unlikely(is_reinjection)) { + if (unlikely(pending_packet->state == + GVE_PACKET_STATE_TIMED_OUT_COMPL)) { + net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n", + priv->dev->name, (int)compl_tag); + /* Packet was already completed as a result of timeout, + * so just remove from list and free pending packet. + */ + remove_from_list(tx, + &tx->dqo_compl.timed_out_completions, + pending_packet); + gve_free_pending_packet(tx, pending_packet); + return; + } + if (unlikely(pending_packet->state != + GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) { + /* No outstanding miss completion but packet allocated + * implies packet receives a re-injection completion + * without a a prior miss completion. Return without + * completing the packet. + */ + net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n", + priv->dev->name, (int)compl_tag); + return; + } + remove_from_list(tx, &tx->dqo_compl.miss_completions, + pending_packet); + } else { + /* Packet is allocated but not a pending data completion. */ + if (unlikely(pending_packet->state != + GVE_PACKET_STATE_PENDING_DATA_COMPL)) { + net_err_ratelimited("%s: No pending data completion: %d\n", + priv->dev->name, (int)compl_tag); + return; + } + } + gve_unmap_packet(tx->dev, pending_packet); + + *bytes += pending_packet->skb->len; + (*pkts)++; + napi_consume_skb(pending_packet->skb, is_napi); + pending_packet->skb = NULL; + gve_free_pending_packet(tx, pending_packet); +} + +static void gve_handle_miss_completion(struct gve_priv *priv, + struct gve_tx_ring *tx, u16 compl_tag, + u64 *bytes, u64 *pkts) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + + if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { + net_err_ratelimited("%s: Invalid TX completion tag: %d\n", + priv->dev->name, (int)compl_tag); + return; + } + + pending_packet = &tx->dqo.pending_packets[compl_tag]; + if (unlikely(pending_packet->state != + GVE_PACKET_STATE_PENDING_DATA_COMPL)) { + net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n", + priv->dev->name, (int)pending_packet->state, + (int)compl_tag); + return; + } + + pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL; + /* jiffies can wraparound but time comparisons can handle overflows. */ + pending_packet->timeout_jiffies = + jiffies + + msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT * + MSEC_PER_SEC); + add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet); + + *bytes += pending_packet->skb->len; + (*pkts)++; +} + +static void remove_miss_completions(struct gve_priv *priv, + struct gve_tx_ring *tx) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + s16 next_index; + + next_index = tx->dqo_compl.miss_completions.head; + while (next_index != -1) { + pending_packet = &tx->dqo.pending_packets[next_index]; + next_index = pending_packet->next; + /* Break early because packets should timeout in order. */ + if (time_is_after_jiffies(pending_packet->timeout_jiffies)) + break; + + remove_from_list(tx, &tx->dqo_compl.miss_completions, + pending_packet); + /* Unmap buffers and free skb but do not unallocate packet i.e. + * the completion tag is not freed to ensure that the driver + * can take appropriate action if a corresponding valid + * completion is received later. + */ + gve_unmap_packet(tx->dev, pending_packet); + /* This indicates the packet was dropped. */ + dev_kfree_skb_any(pending_packet->skb); + pending_packet->skb = NULL; + tx->dropped_pkt++; + net_err_ratelimited("%s: No reinjection completion was received for: %ld.\n", + priv->dev->name, + (pending_packet - tx->dqo.pending_packets)); + + pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL; + pending_packet->timeout_jiffies = + jiffies + + msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT * + MSEC_PER_SEC); + /* Maintain pending packet in another list so the packet can be + * unallocated at a later time. + */ + add_to_list(tx, &tx->dqo_compl.timed_out_completions, + pending_packet); + } +} + +static void remove_timed_out_completions(struct gve_priv *priv, + struct gve_tx_ring *tx) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + s16 next_index; + + next_index = tx->dqo_compl.timed_out_completions.head; + while (next_index != -1) { + pending_packet = &tx->dqo.pending_packets[next_index]; + next_index = pending_packet->next; + /* Break early because packets should timeout in order. */ + if (time_is_after_jiffies(pending_packet->timeout_jiffies)) + break; + + remove_from_list(tx, &tx->dqo_compl.timed_out_completions, + pending_packet); + gve_free_pending_packet(tx, pending_packet); + } +} + int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, struct napi_struct *napi) { - return 0; + u64 reinject_compl_bytes = 0; + u64 reinject_compl_pkts = 0; + int num_descs_cleaned = 0; + u64 miss_compl_bytes = 0; + u64 miss_compl_pkts = 0; + u64 pkt_compl_bytes = 0; + u64 pkt_compl_pkts = 0; + + /* Limit in order to avoid blocking for too long */ + while (!napi || pkt_compl_pkts < napi->weight) { + struct gve_tx_compl_desc *compl_desc = + &tx->dqo.compl_ring[tx->dqo_compl.head]; + u16 type; + + if (compl_desc->generation == tx->dqo_compl.cur_gen_bit) + break; + + /* Prefetch the next descriptor. */ + prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) & + tx->dqo.complq_mask]); + + /* Do not read data until we own the descriptor */ + dma_rmb(); + type = compl_desc->type; + + if (type == GVE_COMPL_TYPE_DQO_DESC) { + /* This is the last descriptor fetched by HW plus one */ + u16 tx_head = le16_to_cpu(compl_desc->tx_head); + + atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head); + } else if (type == GVE_COMPL_TYPE_DQO_PKT) { + u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); + + gve_handle_packet_completion(priv, tx, !!napi, + compl_tag, + &pkt_compl_bytes, + &pkt_compl_pkts, + /*is_reinjection=*/false); + } else if (type == GVE_COMPL_TYPE_DQO_MISS) { + u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); + + gve_handle_miss_completion(priv, tx, compl_tag, + &miss_compl_bytes, + &miss_compl_pkts); + } else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) { + u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); + + gve_handle_packet_completion(priv, tx, !!napi, + compl_tag, + &reinject_compl_bytes, + &reinject_compl_pkts, + /*is_reinjection=*/true); + } + + tx->dqo_compl.head = + (tx->dqo_compl.head + 1) & tx->dqo.complq_mask; + /* Flip the generation bit when we wrap around */ + tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0; + num_descs_cleaned++; + } + + netdev_tx_completed_queue(tx->netdev_txq, + pkt_compl_pkts + miss_compl_pkts, + pkt_compl_bytes + miss_compl_bytes); + + remove_miss_completions(priv, tx); + remove_timed_out_completions(priv, tx); + + u64_stats_update_begin(&tx->statss); + tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes; + tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts; + u64_stats_update_end(&tx->statss); + return num_descs_cleaned; } bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean) { - return false; + struct gve_tx_compl_desc *compl_desc; + struct gve_tx_ring *tx = block->tx; + struct gve_priv *priv = block->priv; + + if (do_clean) { + int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx, + &block->napi); + + /* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */ + mb(); + + if (netif_tx_queue_stopped(tx->netdev_txq) && + num_descs_cleaned > 0) { + tx->wake_queue++; + netif_tx_wake_queue(tx->netdev_txq); + } + } + + /* Return true if we still have work. */ + compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head]; + return compl_desc->generation != tx->dqo_compl.cur_gen_bit; }

[net-next,15/16] gve: DQO: Add TX path

Commit Message

Patch