Message ID | 20210331200857.3274425-10-olteanv@gmail.com |
---|---|
State | New |
Headers | show |
Series | XDP for NXP ENETC | expand |
Vladimir Oltean <olteanv@gmail.com> writes: > From: Vladimir Oltean <vladimir.oltean@nxp.com> > > The driver implementation of the XDP_REDIRECT action reuses parts from > XDP_TX, most notably the enetc_xdp_tx function which transmits an array > of TX software BDs. Only this time, the buffers don't have DMA mappings, > we need to create them. > > When a BPF program reaches the XDP_REDIRECT verdict for a frame, we can > employ the same buffer reuse strategy as for the normal processing path > and for XDP_PASS: we can flip to the other page half and seed that to > the RX ring. > > Note that scatter/gather support is there, but disabled due to lack of > multi-buffer support in XDP (which is added by this series): > https://patchwork.kernel.org/project/netdevbpf/cover/cover.1616179034.git.lorenzo@kernel.org/ > > Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> > --- > drivers/net/ethernet/freescale/enetc/enetc.c | 212 +++++++++++++++++- > drivers/net/ethernet/freescale/enetc/enetc.h | 11 +- > .../ethernet/freescale/enetc/enetc_ethtool.c | 6 + > .../net/ethernet/freescale/enetc/enetc_pf.c | 1 + > 4 files changed, 218 insertions(+), 12 deletions(-) > > diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c > index ba5313a5d7a4..57049ae97201 100644 > --- a/drivers/net/ethernet/freescale/enetc/enetc.c > +++ b/drivers/net/ethernet/freescale/enetc/enetc.c > @@ -8,6 +8,23 @@ > #include <linux/vmalloc.h> > #include <net/pkt_sched.h> > > +static struct sk_buff *enetc_tx_swbd_get_skb(struct enetc_tx_swbd *tx_swbd) > +{ > + if (tx_swbd->is_xdp_tx || tx_swbd->is_xdp_redirect) > + return NULL; > + > + return tx_swbd->skb; > +} > + > +static struct xdp_frame * > +enetc_tx_swbd_get_xdp_frame(struct enetc_tx_swbd *tx_swbd) > +{ > + if (tx_swbd->is_xdp_redirect) > + return tx_swbd->xdp_frame; > + > + return NULL; > +} > + > static void enetc_unmap_tx_buff(struct enetc_bdr *tx_ring, > struct enetc_tx_swbd *tx_swbd) > { > @@ -25,14 
+42,20 @@ static void enetc_unmap_tx_buff(struct enetc_bdr *tx_ring, > tx_swbd->dma = 0; > } > > -static void enetc_free_tx_skb(struct enetc_bdr *tx_ring, > - struct enetc_tx_swbd *tx_swbd) > +static void enetc_free_tx_frame(struct enetc_bdr *tx_ring, > + struct enetc_tx_swbd *tx_swbd) > { > + struct xdp_frame *xdp_frame = enetc_tx_swbd_get_xdp_frame(tx_swbd); > + struct sk_buff *skb = enetc_tx_swbd_get_skb(tx_swbd); > + > if (tx_swbd->dma) > enetc_unmap_tx_buff(tx_ring, tx_swbd); > > - if (tx_swbd->skb) { > - dev_kfree_skb_any(tx_swbd->skb); > + if (xdp_frame) { > + xdp_return_frame(tx_swbd->xdp_frame); > + tx_swbd->xdp_frame = NULL; > + } else if (skb) { > + dev_kfree_skb_any(skb); > tx_swbd->skb = NULL; > } > } > @@ -183,7 +206,7 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb, > > do { > tx_swbd = &tx_ring->tx_swbd[i]; > - enetc_free_tx_skb(tx_ring, tx_swbd); > + enetc_free_tx_frame(tx_ring, tx_swbd); > if (i == 0) > i = tx_ring->bd_count; > i--; > @@ -381,6 +404,9 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) > do_tstamp = false; > > while (bds_to_clean && tx_frm_cnt < ENETC_DEFAULT_TX_WORK) { > + struct xdp_frame *xdp_frame = enetc_tx_swbd_get_xdp_frame(tx_swbd); > + struct sk_buff *skb = enetc_tx_swbd_get_skb(tx_swbd); > + > if (unlikely(tx_swbd->check_wb)) { > struct enetc_ndev_priv *priv = netdev_priv(ndev); > union enetc_tx_bd *txbd; > @@ -400,12 +426,15 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) > else if (likely(tx_swbd->dma)) > enetc_unmap_tx_buff(tx_ring, tx_swbd); > > - if (tx_swbd->skb) { > + if (xdp_frame) { > + xdp_return_frame(xdp_frame); > + tx_swbd->xdp_frame = NULL; > + } else if (skb) { > if (unlikely(do_tstamp)) { > - enetc_tstamp_tx(tx_swbd->skb, tstamp); > + enetc_tstamp_tx(skb, tstamp); > do_tstamp = false; > } > - napi_consume_skb(tx_swbd->skb, napi_budget); > + napi_consume_skb(skb, napi_budget); > tx_swbd->skb = NULL; > } > > @@ -827,6 
+856,109 @@ static bool enetc_xdp_tx(struct enetc_bdr *tx_ring, > return true; > } > > +static int enetc_xdp_frame_to_xdp_tx_swbd(struct enetc_bdr *tx_ring, > + struct enetc_tx_swbd *xdp_tx_arr, > + struct xdp_frame *xdp_frame) > +{ > + struct enetc_tx_swbd *xdp_tx_swbd = &xdp_tx_arr[0]; > + struct skb_shared_info *shinfo; > + void *data = xdp_frame->data; > + int len = xdp_frame->len; > + skb_frag_t *frag; > + dma_addr_t dma; > + unsigned int f; > + int n = 0; > + > + dma = dma_map_single(tx_ring->dev, data, len, DMA_TO_DEVICE); > + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) { > + netdev_err(tx_ring->ndev, "DMA map error\n"); > + return -1; > + } > + > + xdp_tx_swbd->dma = dma; > + xdp_tx_swbd->dir = DMA_TO_DEVICE; > + xdp_tx_swbd->len = len; > + xdp_tx_swbd->is_xdp_redirect = true; > + xdp_tx_swbd->is_eof = false; > + xdp_tx_swbd->xdp_frame = NULL; > + > + n++; > + xdp_tx_swbd = &xdp_tx_arr[n]; > + > + shinfo = xdp_get_shared_info_from_frame(xdp_frame); > + > + for (f = 0, frag = &shinfo->frags[0]; f < shinfo->nr_frags; > + f++, frag++) { > + data = skb_frag_address(frag); > + len = skb_frag_size(frag); > + > + dma = dma_map_single(tx_ring->dev, data, len, DMA_TO_DEVICE); > + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) { > + /* Undo the DMA mapping for all fragments */ > + while (n-- >= 0) > + enetc_unmap_tx_buff(tx_ring, &xdp_tx_arr[n]); > + > + netdev_err(tx_ring->ndev, "DMA map error\n"); > + return -1; > + } > + > + xdp_tx_swbd->dma = dma; > + xdp_tx_swbd->dir = DMA_TO_DEVICE; > + xdp_tx_swbd->len = len; > + xdp_tx_swbd->is_xdp_redirect = true; > + xdp_tx_swbd->is_eof = false; > + xdp_tx_swbd->xdp_frame = NULL; > + > + n++; > + xdp_tx_swbd = &xdp_tx_arr[n]; > + } > + > + xdp_tx_arr[n - 1].is_eof = true; > + xdp_tx_arr[n - 1].xdp_frame = xdp_frame; > + > + return n; > +} > + > +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, > + struct xdp_frame **frames, u32 flags) > +{ > + struct enetc_tx_swbd 
xdp_redirect_arr[ENETC_MAX_SKB_FRAGS] = {0}; > + struct enetc_ndev_priv *priv = netdev_priv(ndev); > + struct enetc_bdr *tx_ring; > + int xdp_tx_bd_cnt, i, k; > + int xdp_tx_frm_cnt = 0; > + > + tx_ring = priv->tx_ring[smp_processor_id()]; What mechanism guarantees that this won't overflow the array? :) -Toke
On Thu, Apr 01, 2021 at 01:26:02PM +0200, Toke Høiland-Jørgensen wrote: > > +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, > > + struct xdp_frame **frames, u32 flags) > > +{ > > + struct enetc_tx_swbd xdp_redirect_arr[ENETC_MAX_SKB_FRAGS] = {0}; > > + struct enetc_ndev_priv *priv = netdev_priv(ndev); > > + struct enetc_bdr *tx_ring; > > + int xdp_tx_bd_cnt, i, k; > > + int xdp_tx_frm_cnt = 0; > > + > > + tx_ring = priv->tx_ring[smp_processor_id()]; > > What mechanism guarantees that this won't overflow the array? :) Which array, the array of TX rings? You mean that it's possible to receive a TC_SETUP_QDISC_MQPRIO or TC_SETUP_QDISC_TAPRIO with num_tc == 1, and we have 2 CPUs? Well, yeah, I don't know what's the proper way to deal with that. Ideas?
Vladimir Oltean <olteanv@gmail.com> writes: > On Thu, Apr 01, 2021 at 01:26:02PM +0200, Toke Høiland-Jørgensen wrote: >> > +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, >> > + struct xdp_frame **frames, u32 flags) >> > +{ >> > + struct enetc_tx_swbd xdp_redirect_arr[ENETC_MAX_SKB_FRAGS] = {0}; >> > + struct enetc_ndev_priv *priv = netdev_priv(ndev); >> > + struct enetc_bdr *tx_ring; >> > + int xdp_tx_bd_cnt, i, k; >> > + int xdp_tx_frm_cnt = 0; >> > + >> > + tx_ring = priv->tx_ring[smp_processor_id()]; >> >> What mechanism guarantees that this won't overflow the array? :) > > Which array, the array of TX rings? Yes. > You mean that it's possible to receive a TC_SETUP_QDISC_MQPRIO or > TC_SETUP_QDISC_TAPRIO with num_tc == 1, and we have 2 CPUs? Not just that, this ndo can be called on arbitrary CPUs after a redirect. The code just calls through from the XDP receive path so which CPU it ends up on depends on the RSS+IRQ config of the other device, which may not even be the same driver; i.e., you have no control over that... :) > Well, yeah, I don't know what's the proper way to deal with that. Ideas? Well the obvious one is just: tx_ring = priv->tx_ring[smp_processor_id() % num_ring_ids]; and then some kind of locking to deal with multiple CPUs accessing the same TX ring... -Toke
On Thu, Apr 01, 2021 at 01:39:05PM +0200, Toke Høiland-Jørgensen wrote: > Vladimir Oltean <olteanv@gmail.com> writes: > > > On Thu, Apr 01, 2021 at 01:26:02PM +0200, Toke Høiland-Jørgensen wrote: > >> > +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, > >> > + struct xdp_frame **frames, u32 flags) > >> > +{ > >> > + struct enetc_tx_swbd xdp_redirect_arr[ENETC_MAX_SKB_FRAGS] = {0}; > >> > + struct enetc_ndev_priv *priv = netdev_priv(ndev); > >> > + struct enetc_bdr *tx_ring; > >> > + int xdp_tx_bd_cnt, i, k; > >> > + int xdp_tx_frm_cnt = 0; > >> > + > >> > + tx_ring = priv->tx_ring[smp_processor_id()]; > >> > >> What mechanism guarantees that this won't overflow the array? :) > > > > Which array, the array of TX rings? > > Yes. > The problem isn't even accessing an out-of-bounds element in the TX ring array. As it turns out, I had a relatively superficial understanding of how things are organized, but let me try to explain. The number of TX rings is a configurable resource (between PFs and VFs) and we read the capability at probe time: enetc_get_si_caps: val = enetc_rd(hw, ENETC_SICAPR0); si->num_rx_rings = (val >> 16) & 0xff; si->num_tx_rings = val & 0xff; enetc_init_si_rings_params: priv->num_tx_rings = si->num_tx_rings; In any case, the TX array is declared as: struct enetc_ndev_priv { struct enetc_bdr *tx_ring[16]; struct enetc_bdr *rx_ring[16]; }; because that's the maximum hardware capability. The priv->tx_ring array is populated in: enetc_alloc_msix: /* # of tx rings per int vector */ v_tx_rings = priv->num_tx_rings / priv->bdr_int_num; for (i = 0; i < priv->bdr_int_num; i++) { for (j = 0; j < v_tx_rings; j++) { if (priv->bdr_int_num == ENETC_MAX_BDR_INT) idx = 2 * j + i; /* 2 CPUs */ else idx = j + i * v_tx_rings; /* default */ priv->tx_ring[idx] = bdr; } } priv->bdr_int_num is set to "num_online_cpus()". On LS1028A, it can be either 1 or 2 (and the ENETC_MAX_BDR_INT macro is equal to 2). 
Otherwise said, the convoluted logic above does the following: - It affines an MSI interrupt vector per CPU - It affines an RX ring per MSI vector, hence per CPU - It balances the fixed number of TX rings (say 8) among the available MSI vectors, hence CPUs (say 2). It does this by iterating with i through the RX MSI interrupt vectors, and with j through the number of TX rings per MSI vector. This logic maps: - the even TX rings to CPU 0 and the odd TX rings to CPU 1, if 2 CPUs are used - all TX rings to CPU 0, if 1 CPU is used This is done because we have this logic in enetc_poll: for (i = 0; i < v->count_tx_rings; i++) if (!enetc_clean_tx_ring(&v->tx_ring[i], budget)) complete = false; for processing the TX completions of a given group of TX rings in the RX MSI interrupt handler of a certain CPU. Otherwise said, priv->tx_ring[i] is always BD ring i, and that mapping never changes. All 8 TX rings are enabled and available for use. What I knew about tc-taprio and tc-mqprio is that they only enqueue to TX queues [0, num_tc-1] because of this, as it turns out: enetc_xmit: tx_ring = priv->tx_ring[skb->queue_mapping]; where skb->queue_mapping is given by: err = netif_set_real_num_tx_queues(ndev, priv->num_tx_rings); and by this, respectively, from the mqprio code path: netif_set_real_num_tx_queues(ndev, num_tc); As for why XDP works, and priv->tx_ring[smp_processor_id()] is: - TX ring 0 for CPU 0 and TX ring 1 for CPU 1, if 2 CPUs are used - TX ring 0, if 1 CPU is used The TX completions in the first case are handled by: - CPU 0 for TX ring 0 (because it is even) and CPU 1 for TX ring 1 (because it is odd), if 2 CPUs are used, due to the mapping I talked about earlier - CPU 0 if only 1 CPU is used > > You mean that it's possible to receive a TC_SETUP_QDISC_MQPRIO or > > TC_SETUP_QDISC_TAPRIO with num_tc == 1, and we have 2 CPUs? > > Not just that, this ndo can be called on arbitrary CPUs after a > redirect. 
The code just calls through from the XDP receive path so which > CPU it ends up on depends on the RSS+IRQ config of the other device, > which may not even be the same driver; i.e., you have no control over > that... :) > What do you mean by "arbitrary" CPU? You can't plug CPUs in, it's a dual core system... Why does the source ifindex matter at all? I'm using the TX ring affined to the CPU that ndo_xdp_xmit is currently running on. > > Well, yeah, I don't know what's the proper way to deal with that. Ideas? > > Well the obvious one is just: > > tx_ring = priv->tx_ring[smp_processor_id() % num_ring_ids]; > > and then some kind of locking to deal with multiple CPUs accessing the > same TX ring... By multiple CPUs accessing the same TX ring, you mean locking between ndo_xdp_xmit and ndo_start_xmit? Can that even happen if the hardware architecture is to have at least as many TX rings as CPUs? Because otherwise, I see that ndo_xdp_xmit is only called from xdp_do_flush, which is in softirq context, which to my very rudimentary knowledge runs with bottom halves, thus preemption, disabled? So I don't think it's possible for ndo_xdp_xmit and ndo_start_xmit, or even two ndo_xdp_xmit instances, to access the same TX ring? Sorry, I'm sure these are trivial questions, but I would like to really understand what I need to change and why :D
Vladimir Oltean <olteanv@gmail.com> writes: > On Thu, Apr 01, 2021 at 01:39:05PM +0200, Toke Høiland-Jørgensen wrote: >> Vladimir Oltean <olteanv@gmail.com> writes: >> >> > On Thu, Apr 01, 2021 at 01:26:02PM +0200, Toke Høiland-Jørgensen wrote: >> >> > +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, >> >> > + struct xdp_frame **frames, u32 flags) >> >> > +{ >> >> > + struct enetc_tx_swbd xdp_redirect_arr[ENETC_MAX_SKB_FRAGS] = {0}; >> >> > + struct enetc_ndev_priv *priv = netdev_priv(ndev); >> >> > + struct enetc_bdr *tx_ring; >> >> > + int xdp_tx_bd_cnt, i, k; >> >> > + int xdp_tx_frm_cnt = 0; >> >> > + >> >> > + tx_ring = priv->tx_ring[smp_processor_id()]; >> >> >> >> What mechanism guarantees that this won't overflow the array? :) >> > >> > Which array, the array of TX rings? >> >> Yes. >> > > The problem isn't even accessing an out-of-bounds element in the TX ring array. > > As it turns out, I had a relatively superficial understanding of how > things are organized, but let me try to explain. > > The number of TX rings is a configurable resource (between PFs and VFs) > and we read the capability at probe time: > > enetc_get_si_caps: > val = enetc_rd(hw, ENETC_SICAPR0); > si->num_rx_rings = (val >> 16) & 0xff; > si->num_tx_rings = val & 0xff; > > enetc_init_si_rings_params: > priv->num_tx_rings = si->num_tx_rings; > > In any case, the TX array is declared as: > > struct enetc_ndev_priv { > struct enetc_bdr *tx_ring[16]; > struct enetc_bdr *rx_ring[16]; > }; > > because that's the maximum hardware capability. 
> > The priv->tx_ring array is populated in: > > enetc_alloc_msix: > /* # of tx rings per int vector */ > v_tx_rings = priv->num_tx_rings / priv->bdr_int_num; > > for (i = 0; i < priv->bdr_int_num; i++) { > for (j = 0; j < v_tx_rings; j++) { > if (priv->bdr_int_num == ENETC_MAX_BDR_INT) > idx = 2 * j + i; /* 2 CPUs */ > else > idx = j + i * v_tx_rings; /* default */ > > priv->tx_ring[idx] = bdr; > } > } > > priv->bdr_int_num is set to "num_online_cpus()". > On LS1028A, it can be either 1 or 2 (and the ENETC_MAX_BDR_INT macro is > equal to 2). > > Otherwise said, the convoluted logic above does the following: > - It affines an MSI interrupt vector per CPU > - It affines an RX ring per MSI vector, hence per CPU > - It balances the fixed number of TX rings (say 8) among the available > MSI vectors, hence CPUs (say 2). It does this by iterating with i > through the RX MSI interrupt vectors, and with j through the number of > TX rings per MSI vector. > > This logic maps: > - the even TX rings to CPU 0 and the odd TX rings to CPU 1, if 2 CPUs > are used > - all TX rings to CPU 0, if 1 CPU is used > > This is done because we have this logic in enetc_poll: > > for (i = 0; i < v->count_tx_rings; i++) > if (!enetc_clean_tx_ring(&v->tx_ring[i], budget)) > complete = false; > > for processing the TX completions of a given group of TX rings in the RX > MSI interrupt handler of a certain CPU. > > Otherwise said, priv->tx_ring[i] is always BD ring i, and that mapping > never changes. All 8 TX rings are enabled and available for use. 
> > What I knew about tc-taprio and tc-mqprio is that they only enqueue to > TX queues [0, num_tc-1] because of this, as it turns out: > > enetc_xmit: > tx_ring = priv->tx_ring[skb->queue_mapping]; > > where skb->queue_mapping is given by: > err = netif_set_real_num_tx_queues(ndev, priv->num_tx_rings); > and by this, respectively, from the mqprio code path: > netif_set_real_num_tx_queues(ndev, num_tc); > > As for why XDP works, and priv->tx_ring[smp_processor_id()] is: > - TX ring 0 for CPU 0 and TX ring 1 for CPU 1, if 2 CPUs are used > - TX ring 0, if 1 CPU is used > > The TX completions in the first case are handled by: > - CPU 0 for TX ring 0 (because it is even) and CPU 1 for TX ring 1 > (because it is odd), if 2 CPUs are used, due to the mapping I talked > about earlier > - CPU 0 if only 1 CPU is used Right - thank you for the details! So what are the constraints on the configuration. Specifically, given two netdevs on the same device, is it possible that the system can ever end up in a situation where one device has two *RXQs* configured, and the other only one *TXQ*. Because then you could get a redirect from RXQ 1 on one device, which would also end up trying to transmit on TXQ 1 on the other device; and that would break if that other device only has TXQ 0 configured... Same thing if a single device has 2 RXQs but only one TXQ (it can redirect to itself). >> > You mean that it's possible to receive a TC_SETUP_QDISC_MQPRIO or >> > TC_SETUP_QDISC_TAPRIO with num_tc == 1, and we have 2 CPUs? >> >> Not just that, this ndo can be called on arbitrary CPUs after a >> redirect. The code just calls through from the XDP receive path so which >> CPU it ends up on depends on the RSS+IRQ config of the other device, >> which may not even be the same driver; i.e., you have no control over >> that... :) >> > > What do you mean by "arbitrary" CPU? You can't plug CPUs in, it's a dual > core system... Why does the source ifindex matter at all? 
I'm using the > TX ring affined to the CPU that ndo_xdp_xmit is currently running on. See, this is why I asked 'what mechanism ensures'. Because if that mechanism is 'this driver is only ever used on a system with fewer CPUs than TXQs', then that's of course fine :) But there are drivers that do basically the same thing as what you've done here, *without* having such an assurance, and just looking at that function it's not obvious that there's an out-of-band reason why it's safe. And I literally just came from looking at such a case when I replied to your initial patch... >> > Well, yeah, I don't know what's the proper way to deal with that. Ideas? >> >> Well the obvious one is just: >> >> tx_ring = priv->tx_ring[smp_processor_id() % num_ring_ids]; >> >> and then some kind of locking to deal with multiple CPUs accessing the >> same TX ring... > > By multiple CPUs accessing the same TX ring, you mean locking between > ndo_xdp_xmit and ndo_start_xmit? Can that even happen if the hardware > architecture is to have at least as many TX rings as CPUs? > > Because otherwise, I see that ndo_xdp_xmit is only called from > xdp_do_flush, which is in softirq context, which to my very rudimentary > knowledge run with bottom halves, thus preemption, disabled? So I don't > think it's possible for ndo_xdp_xmit and ndo_xmit, or even two > ndo_xdp_xmit instances, to access the same TX ring? Yup, I think you're right about that. The "we always have more TXQs than CPUs" condition was the bit I was missing (and of course you're *sure* that this would never change sometime in the future, right? ;)). > Sorry, I'm sure these are trivial questions, but I would like to really > understand what I need to change and why :D Given the above I think the only potentially breaking thing is the #RXQ > #TXQ case I outlined. And maybe a comment documenting why indexing the tx_ring array by smp_processor_id() is safe would be nice? :) -Toke
On Thu, Apr 01, 2021 at 08:01:42PM +0200, Toke Høiland-Jørgensen wrote: > Vladimir Oltean <olteanv@gmail.com> writes: > > > On Thu, Apr 01, 2021 at 01:39:05PM +0200, Toke Høiland-Jørgensen wrote: > >> Vladimir Oltean <olteanv@gmail.com> writes: > >> > >> > On Thu, Apr 01, 2021 at 01:26:02PM +0200, Toke Høiland-Jørgensen wrote: > >> >> > +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, > >> >> > + struct xdp_frame **frames, u32 flags) > >> >> > +{ > >> >> > + struct enetc_tx_swbd xdp_redirect_arr[ENETC_MAX_SKB_FRAGS] = {0}; > >> >> > + struct enetc_ndev_priv *priv = netdev_priv(ndev); > >> >> > + struct enetc_bdr *tx_ring; > >> >> > + int xdp_tx_bd_cnt, i, k; > >> >> > + int xdp_tx_frm_cnt = 0; > >> >> > + > >> >> > + tx_ring = priv->tx_ring[smp_processor_id()]; > >> >> > >> >> What mechanism guarantees that this won't overflow the array? :) > >> > > >> > Which array, the array of TX rings? > >> > >> Yes. > >> > > > > The problem isn't even accessing an out-of-bounds element in the TX ring array. > > > > As it turns out, I had a relatively superficial understanding of how > > things are organized, but let me try to explain. > > > > The number of TX rings is a configurable resource (between PFs and VFs) > > and we read the capability at probe time: > > > > enetc_get_si_caps: > > val = enetc_rd(hw, ENETC_SICAPR0); > > si->num_rx_rings = (val >> 16) & 0xff; > > si->num_tx_rings = val & 0xff; > > > > enetc_init_si_rings_params: > > priv->num_tx_rings = si->num_tx_rings; > > > > In any case, the TX array is declared as: > > > > struct enetc_ndev_priv { > > struct enetc_bdr *tx_ring[16]; > > struct enetc_bdr *rx_ring[16]; > > }; > > > > because that's the maximum hardware capability. 
> > > > The priv->tx_ring array is populated in: > > > > enetc_alloc_msix: > > /* # of tx rings per int vector */ > > v_tx_rings = priv->num_tx_rings / priv->bdr_int_num; > > > > for (i = 0; i < priv->bdr_int_num; i++) { > > for (j = 0; j < v_tx_rings; j++) { > > if (priv->bdr_int_num == ENETC_MAX_BDR_INT) > > idx = 2 * j + i; /* 2 CPUs */ > > else > > idx = j + i * v_tx_rings; /* default */ > > > > priv->tx_ring[idx] = bdr; > > } > > } > > > > priv->bdr_int_num is set to "num_online_cpus()". > > On LS1028A, it can be either 1 or 2 (and the ENETC_MAX_BDR_INT macro is > > equal to 2). > > > > Otherwise said, the convoluted logic above does the following: > > - It affines an MSI interrupt vector per CPU > > - It affines an RX ring per MSI vector, hence per CPU > > - It balances the fixed number of TX rings (say 8) among the available > > MSI vectors, hence CPUs (say 2). It does this by iterating with i > > through the RX MSI interrupt vectors, and with j through the number of > > TX rings per MSI vector. > > > > This logic maps: > > - the even TX rings to CPU 0 and the odd TX rings to CPU 1, if 2 CPUs > > are used > > - all TX rings to CPU 0, if 1 CPU is used > > > > This is done because we have this logic in enetc_poll: > > > > for (i = 0; i < v->count_tx_rings; i++) > > if (!enetc_clean_tx_ring(&v->tx_ring[i], budget)) > > complete = false; > > > > for processing the TX completions of a given group of TX rings in the RX > > MSI interrupt handler of a certain CPU. > > > > Otherwise said, priv->tx_ring[i] is always BD ring i, and that mapping > > never changes. All 8 TX rings are enabled and available for use. 
> > > > What I knew about tc-taprio and tc-mqprio is that they only enqueue to > > TX queues [0, num_tc-1] because of this, as it turns out: > > > > enetc_xmit: > > tx_ring = priv->tx_ring[skb->queue_mapping]; > > > > where skb->queue_mapping is given by: > > err = netif_set_real_num_tx_queues(ndev, priv->num_tx_rings); > > and by this, respectively, from the mqprio code path: > > netif_set_real_num_tx_queues(ndev, num_tc); > > > > As for why XDP works, and priv->tx_ring[smp_processor_id()] is: > > - TX ring 0 for CPU 0 and TX ring 1 for CPU 1, if 2 CPUs are used > > - TX ring 0, if 1 CPU is used > > > > The TX completions in the first case are handled by: > > - CPU 0 for TX ring 0 (because it is even) and CPU 1 for TX ring 1 > > (because it is odd), if 2 CPUs are used, due to the mapping I talked > > about earlier > > - CPU 0 if only 1 CPU is used > > Right - thank you for the details! So what are the constraints on the > configuration. Specifically, given two netdevs on the same device, is it > possible that the system can ever end up in a situation where one device > has two *RXQs* configured, and the other only one *TXQ*. Because then > you could get a redirect from RXQ 1 on one device, which would also end > up trying to transmit on TXQ 1 on the other device; and that would break > if that other device only has TXQ 0 configured... Same thing if a single > device has 2 RXQs but only one TXQ (it can redirect to itself). I discover more and more of the driver as I talk to you, I like it :D So I said that there is a maximum number of RX and TX rings splittable between the PF and its VFs, but I wasn't exactly sure where that configuration is done. I found it now. enetc_port_si_configure: (SI == station interface) - read Port capability register 0 (PCAPR0) to determine how many RX rings and TX rings the hardware has for this port (PFs + VFs) in total. 
- assign num_rings = min(TX rings, RX rings) - try to assign 8 TX rings and 8 RX rings to the PF - if this fails, just assign ${num_rings} TX rings and ${num_rings} RX rings to the PF - split the remaining RX and TX rings to the number of configured VFs (example: if there are 16 RX rings and 16 TX rings for a port with 2 VFs, the driver assigns 8RX/8TX rings for the PF, and 4RX/4TX rings for each VF). - if we couldn't assign 8RX/8TX rings for the PF in the previous step, we don't assign any ring to the VF So yeah, we have an equal number of RX and TX rings. The driver, however, only uses 2 RX rings _actively_: one per CPU. The other 6, I don't know, I guess I can use them for AF_XDP (I haven't looked very closely at that yet), at the moment they're pretty much unused, even if reserved and not given to VFs. > >> > You mean that it's possible to receive a TC_SETUP_QDISC_MQPRIO or > >> > TC_SETUP_QDISC_TAPRIO with num_tc == 1, and we have 2 CPUs? > >> > >> Not just that, this ndo can be called on arbitrary CPUs after a > >> redirect. The code just calls through from the XDP receive path so which > >> CPU it ends up on depends on the RSS+IRQ config of the other device, > >> which may not even be the same driver; i.e., you have no control over > >> that... :) > >> > > > > What do you mean by "arbitrary" CPU? You can't plug CPUs in, it's a dual > > core system... Why does the source ifindex matter at all? I'm using the > > TX ring affined to the CPU that ndo_xdp_xmit is currently running on. > > See, this is why I asked 'what mechanism ensures'. Because if that > mechanism is 'this driver is only ever used on a system with fewer CPUs > than TXQs', then that's of course fine :) > > But there are drivers that do basically the same thing as what you've > done here, *without* having such an assurance, and just looking at that > function it's not obvious that there's an out-of-band reason why it's > safe. 
And I literally just came from looking at such a case when I > replied to your initial patch... Maybe you were confused seeing that this is a PCI device, thinking it's a plug-in card or something, therefore we don't get to choose the number of CPUs that the host has. In hindsight, I don't know why you didn't ask about this, it is pretty strange when you think about it. It is actually more like a platform device with a PCI front-end - we found this loophole in the PCI standard where you can create a "root complex/integrated endpoint" which is basically an ECAM where the config space contains PFs corresponding to some platform devices in the SoC (in our case, all 4 Ethernet ports have their own PF, the switch has its own PF, same thing for the MDIO controller and the 1588 timer). Their register map is exposed as a number of BARs which use Enhanced Allocation, so the generic PCI ECAM driver doesn't need to create any translation windows for these addresses, it just uses what's in there, which, surprise, is the actual base address of the peripheral in the SoC's memory space. We do that because we gain a lot of cool stuff by appearing as PCI devices to system software, like for example multiple interfaces on top of a 'shared MAC' are simply mapped over SR-IOV. So it just 'smells' like PCI, but they're regular memory-mapped devices, there is no PCI transaction layer or physical layer. At the moment the LS1028A is the only SoC running Linux that integrates the ENETC block, so we fully control the environment. > >> > Well, yeah, I don't know what's the proper way to deal with that. Ideas? > >> > >> Well the obvious one is just: > >> > >> tx_ring = priv->tx_ring[smp_processor_id() % num_ring_ids]; > >> > >> and then some kind of locking to deal with multiple CPUs accessing the > >> same TX ring... > > > > By multiple CPUs accessing the same TX ring, you mean locking between > > ndo_xdp_xmit and ndo_start_xmit? 
Can that even happen if the hardware > > architecture is to have at least as many TX rings as CPUs? > > > > Because otherwise, I see that ndo_xdp_xmit is only called from > > xdp_do_flush, which is in softirq context, which to my very rudimentary > > knowledge run with bottom halves, thus preemption, disabled? So I don't > > think it's possible for ndo_xdp_xmit and ndo_xmit, or even two > > ndo_xdp_xmit instances, to access the same TX ring? > > Yup, I think you're right about that. The "we always have more TXQs than > CPUs" condition was the bit I was missing (and of course you're *sure* > that this would never change sometime in the future, right? ;)). I'm pretty sure, yeah, we build the SoCs and one of the requirements we have is that every ENETC PF has enough TX rings in order for every CPU to have its own one. That helps a lot with avoiding contention and simplifying the driver. Maybe I'll use this opportunity to talk again to the hardware design guys and make sure that the next SoCs with Linux follow the same pattern as LS1028A, although I see no reason why not. > > Sorry, I'm sure these are trivial questions, but I would like to really > > understand what I need to change and why :D > > Given the above I think the only potentially breaking thing is the > #RXQ > #TXQ case I outlined. And maybe a comment documenting why indexing > the tx_ring array by smp_processor_id() is safe would be nice? :) Sure, which part exactly do you think would explain it best? Should I add a reference to enetc_port_si_configure?
On Thu, Apr 01, 2021 at 10:38:21PM +0300, Vladimir Oltean wrote: > On Thu, Apr 01, 2021 at 08:01:42PM +0200, Toke Høiland-Jørgensen wrote: > > Vladimir Oltean <olteanv@gmail.com> writes: > > > > > On Thu, Apr 01, 2021 at 01:39:05PM +0200, Toke Høiland-Jørgensen wrote: > > >> Vladimir Oltean <olteanv@gmail.com> writes: > > >> > > >> > On Thu, Apr 01, 2021 at 01:26:02PM +0200, Toke Høiland-Jørgensen wrote: > > >> >> > +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, > > >> >> > + struct xdp_frame **frames, u32 flags) > > >> >> > +{ > > >> >> > + struct enetc_tx_swbd xdp_redirect_arr[ENETC_MAX_SKB_FRAGS] = {0}; > > >> >> > + struct enetc_ndev_priv *priv = netdev_priv(ndev); > > >> >> > + struct enetc_bdr *tx_ring; > > >> >> > + int xdp_tx_bd_cnt, i, k; > > >> >> > + int xdp_tx_frm_cnt = 0; > > >> >> > + > > >> >> > + tx_ring = priv->tx_ring[smp_processor_id()]; > > >> >> > > >> >> What mechanism guarantees that this won't overflow the array? :) > > >> > > > >> > Which array, the array of TX rings? > > >> > > >> Yes. > > >> > > > > > > The problem isn't even accessing an out-of-bounds element in the TX ring array. > > > > > > As it turns out, I had a relatively superficial understanding of how > > > things are organized, but let me try to explain. > > > > > > The number of TX rings is a configurable resource (between PFs and VFs) > > > and we read the capability at probe time: > > > > > > enetc_get_si_caps: > > > val = enetc_rd(hw, ENETC_SICAPR0); > > > si->num_rx_rings = (val >> 16) & 0xff; > > > si->num_tx_rings = val & 0xff; > > > > > > enetc_init_si_rings_params: > > > priv->num_tx_rings = si->num_tx_rings; > > > > > > In any case, the TX array is declared as: > > > > > > struct enetc_ndev_priv { > > > struct enetc_bdr *tx_ring[16]; > > > struct enetc_bdr *rx_ring[16]; > > > }; > > > > > > because that's the maximum hardware capability. 
> > > > > > The priv->tx_ring array is populated in: > > > > > > enetc_alloc_msix: > > > /* # of tx rings per int vector */ > > > v_tx_rings = priv->num_tx_rings / priv->bdr_int_num; > > > > > > for (i = 0; i < priv->bdr_int_num; i++) { > > > for (j = 0; j < v_tx_rings; j++) { > > > if (priv->bdr_int_num == ENETC_MAX_BDR_INT) > > > idx = 2 * j + i; /* 2 CPUs */ > > > else > > > idx = j + i * v_tx_rings; /* default */ > > > > > > priv->tx_ring[idx] = bdr; > > > } > > > } > > > > > > priv->bdr_int_num is set to "num_online_cpus()". > > > On LS1028A, it can be either 1 or 2 (and the ENETC_MAX_BDR_INT macro is > > > equal to 2). > > > > > > Otherwise said, the convoluted logic above does the following: > > > - It affines an MSI interrupt vector per CPU > > > - It affines an RX ring per MSI vector, hence per CPU > > > - It balances the fixed number of TX rings (say 8) among the available > > > MSI vectors, hence CPUs (say 2). It does this by iterating with i > > > through the RX MSI interrupt vectors, and with j through the number of > > > TX rings per MSI vector. > > > > > > This logic maps: > > > - the even TX rings to CPU 0 and the odd TX rings to CPU 1, if 2 CPUs > > > are used > > > - all TX rings to CPU 0, if 1 CPU is used > > > > > > This is done because we have this logic in enetc_poll: > > > > > > for (i = 0; i < v->count_tx_rings; i++) > > > if (!enetc_clean_tx_ring(&v->tx_ring[i], budget)) > > > complete = false; > > > > > > for processing the TX completions of a given group of TX rings in the RX > > > MSI interrupt handler of a certain CPU. > > > > > > Otherwise said, priv->tx_ring[i] is always BD ring i, and that mapping > > > never changes. All 8 TX rings are enabled and available for use. 
> > > > > > What I knew about tc-taprio and tc-mqprio is that they only enqueue to > > > TX queues [0, num_tc-1] because of this, as it turns out: > > > > > > enetc_xmit: > > > tx_ring = priv->tx_ring[skb->queue_mapping]; > > > > > > where skb->queue_mapping is given by: > > > err = netif_set_real_num_tx_queues(ndev, priv->num_tx_rings); > > > and by this, respectively, from the mqprio code path: > > > netif_set_real_num_tx_queues(ndev, num_tc); > > > > > > As for why XDP works, and priv->tx_ring[smp_processor_id()] is: > > > - TX ring 0 for CPU 0 and TX ring 1 for CPU 1, if 2 CPUs are used > > > - TX ring 0, if 1 CPU is used > > > > > > The TX completions in the first case are handled by: > > > - CPU 0 for TX ring 0 (because it is even) and CPU 1 for TX ring 1 > > > (because it is odd), if 2 CPUs are used, due to the mapping I talked > > > about earlier > > > - CPU 0 if only 1 CPU is used > > > > Right - thank you for the details! So what are the constraints on the > > configuration. Specifically, given two netdevs on the same device, is it > > possible that the system can ever end up in a situation where one device > > has two *RXQs* configured, and the other only one *TXQ*. Because then > > you could get a redirect from RXQ 1 on one device, which would also end > > up trying to transmit on TXQ 1 on the other device; and that would break > > if that other device only has TXQ 0 configured... Same thing if a single > > device has 2 RXQs but only one TXQ (it can redirect to itself). > > I discover more and more of the driver as I talk to you, I like it :D > > So I said that there is a maximum number of RX and TX rings splittable > between the PF and its VFs, but I wasn't exactly sure where that > configuration is done. I found it now. > > enetc_port_si_configure: (SI == station interface) > - read Port capability register 0 (PCAPR0) to determine how many > RX rings and TX rings the hardware has for this port (PFs + VFs) > in total. 
> - assign num_rings = min(TX rings, RX rings) > - try to assign 8 TX rings and 8 RX rings to the PF > - if this fails, just assign ${num_rings} TX rings and > ${num_rings} RX rings to the PF > - split the remaining RX and TX rings to the number of > configured VFs (example: if there are 16 RX rings and 16 TX > rings for a port with 2 VFs, the driver assigns 8RX/8TX rings > for the PF, and 4RX/4TX rings for each VF). > - if we couldn't assign 8RX/8TX rings for the PF in the > previous step, we don't assign any ring to the VF > > So yeah, we have an equal number of RX and TX rings. The driver, > however, only uses 2 RX rings _actively_: one per CPU. The other 6, I > don't know, I guess I can use them for AF_XDP (I haven't looked very > closely at that yet), at the moment they're pretty much unused, even if > reserved and not given to VFs. > > > >> > You mean that it's possible to receive a TC_SETUP_QDISC_MQPRIO or > > >> > TC_SETUP_QDISC_TAPRIO with num_tc == 1, and we have 2 CPUs? > > >> > > >> Not just that, this ndo can be called on arbitrary CPUs after a > > >> redirect. The code just calls through from the XDP receive path so which > > >> CPU it ends up on depends on the RSS+IRQ config of the other device, > > >> which may not even be the same driver; i.e., you have no control over > > >> that... :) > > >> > > > > > > What do you mean by "arbitrary" CPU? You can't plug CPUs in, it's a dual > > > core system... Why does the source ifindex matter at all? I'm using the > > > TX ring affined to the CPU that ndo_xdp_xmit is currently running on. > > > > See, this is why I asked 'what mechanism ensures'. 
Because if that > > mechanism is 'this driver is only ever used on a system with fewer CPUs > > than TXQs', then that's of course fine :) > > > > But there are drivers that do basically the same thing as what you've > > done here, *without* having such an assurance, and just looking at that > > function it's not obvious that there's an out-of-band reason why it's > > safe. And I literally just came from looking at such a case when I > > replied to your initial patch... > > Maybe you were confused seeing that this is a PCI device, thinking it's > a plug-in card or something, therefore we don't get to choose the number > of CPUs that the host has. In hindsight, I don't know why you didn't ask > about this, it is pretty strange when you think about it. > > It is actually more like a platform device with a PCI front-end - we > found this loophole in the PCI standard where you can create a "root > complex/integrated endpoint" which is basically an ECAM where the config > space contains PFs corresponding to some platform devices in the SoC (in > our case, all 4 Ethernet ports have their own PF, the switch has its own > PF, same thing for the MDIO controller and the 1588 timer). Their > register map is exposed as a number of BARs which use Enhanced > Allocation, so the generic PCI ECAM driver doesn't need to create any > translation windows for these addresses, it just uses what's in there, > which, surprise, is the actual base address of the peripheral in the > SoC's memory space. > > We do that because we gain a lot of cool stuff by appearing as PCI > devices to system software, like for example multiple interfaces on top > of a 'shared MAC' are simply mapped over SR-IOV. > > So it just 'smells' like PCI, but they're regular memory-mapped devices, > there is no PCI transaction layer or physical layer. At the moment the > LS1028A is the only SoC running Linux that integrates the ENETC block, > so we fully control the environment. 
> > > >> > Well, yeah, I don't know what's the proper way to deal with that. Ideas? > > >> > > >> Well the obvious one is just: > > >> > > >> tx_ring = priv->tx_ring[smp_processor_id() % num_ring_ids]; > > >> > > >> and then some kind of locking to deal with multiple CPUs accessing the > > >> same TX ring... > > > > > > By multiple CPUs accessing the same TX ring, you mean locking between > > > ndo_xdp_xmit and ndo_start_xmit? Can that even happen if the hardware > > > architecture is to have at least as many TX rings as CPUs? > > > > > > Because otherwise, I see that ndo_xdp_xmit is only called from > > > xdp_do_flush, which is in softirq context, which to my very rudimentary > > > knowledge run with bottom halves, thus preemption, disabled? So I don't > > > think it's possible for ndo_xdp_xmit and ndo_xmit, or even two > > > ndo_xdp_xmit instances, to access the same TX ring? > > > > Yup, I think you're right about that. The "we always have more TXQs than > > CPUs" condition was the bit I was missing (and of course you're *sure* > > that this would never change sometime in the future, right? ;)). > > I'm pretty sure, yeah, we build the SoCs and one of the requirements we > have is that every ENETC PF has enough TX rings in order for every CPU > to have its own one. That helps a lot with avoiding contention and > simplifying the driver. Maybe I'll use this opportunity to talk again to > the hardware design guys and make sure that the next SoCs with Linux > follow the same pattern as LS1028A, although I see no reason why not. > > > > Sorry, I'm sure these are trivial questions, but I would like to really > > > understand what I need to change and why :D > > > > Given the above I think the only potentially breaking thing is the > > #RXQ > #TXQ case I outlined. And maybe a comment documenting why indexing > > the tx_ring array by smp_processor_id() is safe would be nice? :) > > Sure, which part exactly do you think would explain it best? 
Should I > add a reference to enetc_port_si_configure? After discussing a bit more with Claudiu, I think we do have a problem, and it has to do with concurrent ndo_xdp_xmit on one CPU and ndo_start_xmit on another CPU. See, even if we have 8 TX rings, they are not really affined to any CPU. Instead, when we call netif_set_real_num_tx_queues, we allow netdev_pick_tx to hash among the TX queues of the same priority. There are three consequences: - Traffic with the same hash will be sent to the same TX queue, thus avoiding reordering for packets belonging to the same stream. - Traffic with different hashes is distributed to different TX queues. - If we have two CPUs sending traffic with the same hash, they will serialize on the TX lock of the same netdev queue. The last one is a problem because our XDP_REDIRECT tries to associate one TX ring with one CPU, and, as explained above, that TX ring might already be used by our ndo_start_xmit on another CPU, selected by netdev_pick_tx. The first idea was to implement ndo_select_queue for the network stack, and select the TX ring based on smp_processor_id(). But we know that this will break the first two effects of netdev_pick_tx, which are very much desirable. For example, if we have a user space process sending a TCP stream, and the scheduler migrates that process from one CPU to another, then the ndo_select_queue output for that TCP stream will change, and we will have TX reordering for packets belonging to the same stream. Not at all ideal. Another idea is to just crop some TX queues from the network stack, and basically call netif_set_real_num_tx_queues(6), leaving one TX ring per CPU dedicated to XDP. This will work just fine for normal qdiscs, except that with mqprio/taprio we have a problem. Our TX rings have a configurable strict priority for the hardware egress scheduler. 
When we don't have mqprio/taprio, all TX rings have the same priority of 0 (therefore it is safe to allow hashing to select one at random), but when we have mqprio or taprio, we enjoy the benefit of configuring the priority of each TX ring using the "map" option. The problem, of course, is that if we crop 2 TX rings out of what the network stack sees, then we are no longer able to configure their queue-to-traffic-class mapping through mqprio/taprio, so we cannot change their prioritization relative to the network stack queues. In a way, this seems to be in line with the XDP design because that bypasses anything that has to do with qdiscs, but we don't really like that. We also have some other qdisc-based offloads such as Credit Based Shaper, and we would very much like to be able to set bandwidth profiles for the XDP rings, for AVB/TSN use cases. Finally there is the option of taking the network stack's TX lock in our ndo_xdp_xmit, but frankly I would leave that option as a last resort. Maybe we could make this less expensive by bulk-enqueuing into a temporary array of buffer descriptors, and only taking the xmit lock when flushing that array (since that is the only portion that strictly needs to be protected against concurrency). But the problem with this approach is that if you have a temporary array, it becomes a lot more difficult and error-prone to not take more frames than you can enqueue. For example, the TX ring might have only 20 free entries, and you filled your BD array with 32 frames, and you told the caller of ndo_xdp_xmit that you processed all those frames. Now when push comes to shove and you actually need to enqueue them, you end up in the position that you must drop them yourself. This seems to be very much against the design principle of commit fdc13979f91e ("bpf, devmap: Move drop error path to devmap for XDP_REDIRECT") whose desire is to let XDP handle the dropping of excess TX frames. What do you think?
Vladimir Oltean <olteanv@gmail.com> writes: > On Thu, Apr 01, 2021 at 10:38:21PM +0300, Vladimir Oltean wrote: >> On Thu, Apr 01, 2021 at 08:01:42PM +0200, Toke Høiland-Jørgensen wrote: >> > Vladimir Oltean <olteanv@gmail.com> writes: >> > >> > > On Thu, Apr 01, 2021 at 01:39:05PM +0200, Toke Høiland-Jørgensen wrote: >> > >> Vladimir Oltean <olteanv@gmail.com> writes: >> > >> >> > >> > On Thu, Apr 01, 2021 at 01:26:02PM +0200, Toke Høiland-Jørgensen wrote: >> > >> >> > +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, >> > >> >> > + struct xdp_frame **frames, u32 flags) >> > >> >> > +{ >> > >> >> > + struct enetc_tx_swbd xdp_redirect_arr[ENETC_MAX_SKB_FRAGS] = {0}; >> > >> >> > + struct enetc_ndev_priv *priv = netdev_priv(ndev); >> > >> >> > + struct enetc_bdr *tx_ring; >> > >> >> > + int xdp_tx_bd_cnt, i, k; >> > >> >> > + int xdp_tx_frm_cnt = 0; >> > >> >> > + >> > >> >> > + tx_ring = priv->tx_ring[smp_processor_id()]; >> > >> >> >> > >> >> What mechanism guarantees that this won't overflow the array? :) >> > >> > >> > >> > Which array, the array of TX rings? >> > >> >> > >> Yes. >> > >> >> > > >> > > The problem isn't even accessing an out-of-bounds element in the TX ring array. >> > > >> > > As it turns out, I had a relatively superficial understanding of how >> > > things are organized, but let me try to explain. >> > > >> > > The number of TX rings is a configurable resource (between PFs and VFs) >> > > and we read the capability at probe time: >> > > >> > > enetc_get_si_caps: >> > > val = enetc_rd(hw, ENETC_SICAPR0); >> > > si->num_rx_rings = (val >> 16) & 0xff; >> > > si->num_tx_rings = val & 0xff; >> > > >> > > enetc_init_si_rings_params: >> > > priv->num_tx_rings = si->num_tx_rings; >> > > >> > > In any case, the TX array is declared as: >> > > >> > > struct enetc_ndev_priv { >> > > struct enetc_bdr *tx_ring[16]; >> > > struct enetc_bdr *rx_ring[16]; >> > > }; >> > > >> > > because that's the maximum hardware capability. 
>> > > >> > > The priv->tx_ring array is populated in: >> > > >> > > enetc_alloc_msix: >> > > /* # of tx rings per int vector */ >> > > v_tx_rings = priv->num_tx_rings / priv->bdr_int_num; >> > > >> > > for (i = 0; i < priv->bdr_int_num; i++) { >> > > for (j = 0; j < v_tx_rings; j++) { >> > > if (priv->bdr_int_num == ENETC_MAX_BDR_INT) >> > > idx = 2 * j + i; /* 2 CPUs */ >> > > else >> > > idx = j + i * v_tx_rings; /* default */ >> > > >> > > priv->tx_ring[idx] = bdr; >> > > } >> > > } >> > > >> > > priv->bdr_int_num is set to "num_online_cpus()". >> > > On LS1028A, it can be either 1 or 2 (and the ENETC_MAX_BDR_INT macro is >> > > equal to 2). >> > > >> > > Otherwise said, the convoluted logic above does the following: >> > > - It affines an MSI interrupt vector per CPU >> > > - It affines an RX ring per MSI vector, hence per CPU >> > > - It balances the fixed number of TX rings (say 8) among the available >> > > MSI vectors, hence CPUs (say 2). It does this by iterating with i >> > > through the RX MSI interrupt vectors, and with j through the number of >> > > TX rings per MSI vector. >> > > >> > > This logic maps: >> > > - the even TX rings to CPU 0 and the odd TX rings to CPU 1, if 2 CPUs >> > > are used >> > > - all TX rings to CPU 0, if 1 CPU is used >> > > >> > > This is done because we have this logic in enetc_poll: >> > > >> > > for (i = 0; i < v->count_tx_rings; i++) >> > > if (!enetc_clean_tx_ring(&v->tx_ring[i], budget)) >> > > complete = false; >> > > >> > > for processing the TX completions of a given group of TX rings in the RX >> > > MSI interrupt handler of a certain CPU. >> > > >> > > Otherwise said, priv->tx_ring[i] is always BD ring i, and that mapping >> > > never changes. All 8 TX rings are enabled and available for use. 
>> > > >> > > What I knew about tc-taprio and tc-mqprio is that they only enqueue to >> > > TX queues [0, num_tc-1] because of this, as it turns out: >> > > >> > > enetc_xmit: >> > > tx_ring = priv->tx_ring[skb->queue_mapping]; >> > > >> > > where skb->queue_mapping is given by: >> > > err = netif_set_real_num_tx_queues(ndev, priv->num_tx_rings); >> > > and by this, respectively, from the mqprio code path: >> > > netif_set_real_num_tx_queues(ndev, num_tc); >> > > >> > > As for why XDP works, and priv->tx_ring[smp_processor_id()] is: >> > > - TX ring 0 for CPU 0 and TX ring 1 for CPU 1, if 2 CPUs are used >> > > - TX ring 0, if 1 CPU is used >> > > >> > > The TX completions in the first case are handled by: >> > > - CPU 0 for TX ring 0 (because it is even) and CPU 1 for TX ring 1 >> > > (because it is odd), if 2 CPUs are used, due to the mapping I talked >> > > about earlier >> > > - CPU 0 if only 1 CPU is used >> > >> > Right - thank you for the details! So what are the constraints on the >> > configuration. Specifically, given two netdevs on the same device, is it >> > possible that the system can ever end up in a situation where one device >> > has two *RXQs* configured, and the other only one *TXQ*. Because then >> > you could get a redirect from RXQ 1 on one device, which would also end >> > up trying to transmit on TXQ 1 on the other device; and that would break >> > if that other device only has TXQ 0 configured... Same thing if a single >> > device has 2 RXQs but only one TXQ (it can redirect to itself). >> >> I discover more and more of the driver as I talk to you, I like it :D >> >> So I said that there is a maximum number of RX and TX rings splittable >> between the PF and its VFs, but I wasn't exactly sure where that >> configuration is done. I found it now. 
>> >> enetc_port_si_configure: (SI == station interface) >> - read Port capability register 0 (PCAPR0) to determine how many >> RX rings and TX rings the hardware has for this port (PFs + VFs) >> in total. >> - assign num_rings = min(TX rings, RX rings) >> - try to assign 8 TX rings and 8 RX rings to the PF >> - if this fails, just assign ${num_rings} TX rings and >> ${num_rings} RX rings to the PF >> - split the remaining RX and TX rings to the number of >> configured VFs (example: if there are 16 RX rings and 16 TX >> rings for a port with 2 VFs, the driver assigns 8RX/8TX rings >> for the PF, and 4RX/4TX rings for each VF). >> - if we couldn't assign 8RX/8TX rings for the PF in the >> previous step, we don't assign any ring to the VF >> >> So yeah, we have an equal number of RX and TX rings. The driver, >> however, only uses 2 RX rings _actively_: one per CPU. The other 6, I >> don't know, I guess I can use them for AF_XDP (I haven't looked very >> closely at that yet), at the moment they're pretty much unused, even if >> reserved and not given to VFs. >> >> > >> > You mean that it's possible to receive a TC_SETUP_QDISC_MQPRIO or >> > >> > TC_SETUP_QDISC_TAPRIO with num_tc == 1, and we have 2 CPUs? >> > >> >> > >> Not just that, this ndo can be called on arbitrary CPUs after a >> > >> redirect. The code just calls through from the XDP receive path so which >> > >> CPU it ends up on depends on the RSS+IRQ config of the other device, >> > >> which may not even be the same driver; i.e., you have no control over >> > >> that... :) >> > >> >> > > >> > > What do you mean by "arbitrary" CPU? You can't plug CPUs in, it's a dual >> > > core system... Why does the source ifindex matter at all? I'm using the >> > > TX ring affined to the CPU that ndo_xdp_xmit is currently running on. >> > >> > See, this is why I asked 'what mechanism ensures'. 
Because if that >> > mechanism is 'this driver is only ever used on a system with fewer CPUs >> > than TXQs', then that's of course fine :) >> > >> > But there are drivers that do basically the same thing as what you've >> > done here, *without* having such an assurance, and just looking at that >> > function it's not obvious that there's an out-of-band reason why it's >> > safe. And I literally just came from looking at such a case when I >> > replied to your initial patch... >> >> Maybe you were confused seeing that this is a PCI device, thinking it's >> a plug-in card or something, therefore we don't get to choose the number >> of CPUs that the host has. In hindsight, I don't know why you didn't ask >> about this, it is pretty strange when you think about it. >> >> It is actually more like a platform device with a PCI front-end - we >> found this loophole in the PCI standard where you can create a "root >> complex/integrated endpoint" which is basically an ECAM where the config >> space contains PFs corresponding to some platform devices in the SoC (in >> our case, all 4 Ethernet ports have their own PF, the switch has its own >> PF, same thing for the MDIO controller and the 1588 timer). Their >> register map is exposed as a number of BARs which use Enhanced >> Allocation, so the generic PCI ECAM driver doesn't need to create any >> translation windows for these addresses, it just uses what's in there, >> which, surprise, is the actual base address of the peripheral in the >> SoC's memory space. >> >> We do that because we gain a lot of cool stuff by appearing as PCI >> devices to system software, like for example multiple interfaces on top >> of a 'shared MAC' are simply mapped over SR-IOV. >> >> So it just 'smells' like PCI, but they're regular memory-mapped devices, >> there is no PCI transaction layer or physical layer. At the moment the >> LS1028A is the only SoC running Linux that integrates the ENETC block, >> so we fully control the environment. 
>> >> > >> > Well, yeah, I don't know what's the proper way to deal with that. Ideas? >> > >> >> > >> Well the obvious one is just: >> > >> >> > >> tx_ring = priv->tx_ring[smp_processor_id() % num_ring_ids]; >> > >> >> > >> and then some kind of locking to deal with multiple CPUs accessing the >> > >> same TX ring... >> > > >> > > By multiple CPUs accessing the same TX ring, you mean locking between >> > > ndo_xdp_xmit and ndo_start_xmit? Can that even happen if the hardware >> > > architecture is to have at least as many TX rings as CPUs? >> > > >> > > Because otherwise, I see that ndo_xdp_xmit is only called from >> > > xdp_do_flush, which is in softirq context, which to my very rudimentary >> > > knowledge run with bottom halves, thus preemption, disabled? So I don't >> > > think it's possible for ndo_xdp_xmit and ndo_xmit, or even two >> > > ndo_xdp_xmit instances, to access the same TX ring? >> > >> > Yup, I think you're right about that. The "we always have more TXQs than >> > CPUs" condition was the bit I was missing (and of course you're *sure* >> > that this would never change sometime in the future, right? ;)). >> >> I'm pretty sure, yeah, we build the SoCs and one of the requirements we >> have is that every ENETC PF has enough TX rings in order for every CPU >> to have its own one. That helps a lot with avoiding contention and >> simplifying the driver. Maybe I'll use this opportunity to talk again to >> the hardware design guys and make sure that the next SoCs with Linux >> follow the same pattern as LS1028A, although I see no reason why not. >> >> > > Sorry, I'm sure these are trivial questions, but I would like to really >> > > understand what I need to change and why :D >> > >> > Given the above I think the only potentially breaking thing is the >> > #RXQ > #TXQ case I outlined. And maybe a comment documenting why indexing >> > the tx_ring array by smp_processor_id() is safe would be nice? 
:) >> >> Sure, which part exactly do you think would explain it best? Should I >> add a reference to enetc_port_si_configure? > > After discussing a bit more with Claudiu, I think we do have a problem, > and it has to do with concurrent ndo_xdp_xmit on one CPU and ndo_start_xmit > on another CPU. > > See, even if we have 8 TX rings, they are not really affined to any CPU. > Instead, when we call netif_set_real_num_tx_queues, we allow netdev_pick_tx > to hash amongs the TX queues of the same priority. There are three consequences: > - Traffic with the same hash will be sent to the same TX queue, thus > avoiding reordering for packets belonging to the same stream. > - Traffic with different hashes are distributed to different TX queues. > - If we have two CPUs sending traffic with the same hash, they will > serialize on the TX lock of the same netdev queue. > > The last one is a problem because our XDP_REDIRECT tries to associate > one TX ring with one CPU, and, as explained above, that TX ring might > already be used by our ndo_start_xmit on another CPU, selected by > netdev_pick_tx. > > The first idea was to implement ndo_select_queue for the network stack, > and select the TX ring based on smp_processor_id(). But we know that > this will break the first two effects of netdev_pick_tx, which are very > much desirable. For example, if we have a user space process sending a > TCP stream, and the scheduler migrates that process from one CPU to > another, then the ndo_select_queue output for that TCP stream will > change, and we will have TX reordering for packets belonging to the same > stream. Not at all ideal. > > Another idea is to just crop some TX queues from the network stack, and > basically call netif_set_real_num_tx_queues(6), leaving one TX ring per > CPU dedicated to XDP. This will work just fine for normal qdiscs, except > that with mqprio/taprio we have a problem. Our TX rings have a configurable > strict priority for the hardware egress scheduler. 
When we don't have > mqprio/taprio, all TX rings have the same priority of 0 (therefore it is > safe to allow hashing to select one at random), but when we have mqprio > or taprio, we enjoy the benefit of configuring the priority of each TX > ring using the "map" option. The problem, of course, is that if we crop > 2 TX rings out of what the network stack sees, then we are no longer > able to configure their queue-to-traffic-class mapping through > mqprio/taprio, so we cannot change their prioritization relative to the > network stack queues. In a way, this seems to be in line with the XDP > design because that bypasses anything that has to do with qdiscs, but we > don't really like that. We also have some other qdisc-based offloads > such as Credit Based Shaper, and we would very much like to be able to > set bandwidth profiles for the XDP rings, for AVB/TSN use cases. You'd not be the first driver to solve this by just carving out a couple of TX rings for XDP :) And while I get the desire for being able to configure these things for XDP as well, I'm not sure that the qdisc interface is the right one to use for that. There was a general TXQ allocation idea that unfortunately stalled out, but there is also ongoing work on XDP+TSN - I'm hoping Jesper can chime in with the details... > Finally there is the option of taking the network stack's TX lock in our > ndo_xdp_xmit, but frankly I would leave that option as a last resort. > Maybe we could make this less expensive by bulk-enqueuing into a > temporary array of buffer descriptors, and only taking the xmit lock > when flushing that array (since that is the only portion that strictly > needs to be protected against concurrency). But the problem with this > approach is that if you have a temporary array, it becomes a lot more > difficult and error-prone to not take more frames than you can enqueue. 
> For example, the TX ring might have only 20 free entries, and you filled > your BD array with 32 frames, and you told the caller of ndo_xdp_xmit > that you processed all those frames. Now when push comes to shove and > you actually need to enqueue them, you end up in the position that you > must drop them yourself. This seems to be very much against the design > principle of commit fdc13979f91e ("bpf, devmap: Move drop error path to > devmap for XDP_REDIRECT") whose desire is to let XDP handle the dropping > of excess TX frames. Note that there's already bulking in XDP_REDIRECT: after an XDP program returns XDP_REDIRECT, the packets will actually be put on a bulk queue (see bq_enqueue() in devmap.c), and that will be flushed to the TX driver at the end of the (RX) NAPI cycle. So taking a lock in ndo_xdp_xmit() may not be quite as much overhead as you think it is - so maybe it would be worth benchmarking before ruling this out entirely? :) -Toke
On Sat, Apr 03, 2021 at 01:07:29PM +0200, Toke Høiland-Jørgensen wrote: > Vladimir Oltean <olteanv@gmail.com> writes: > > > On Thu, Apr 01, 2021 at 10:38:21PM +0300, Vladimir Oltean wrote: > >> On Thu, Apr 01, 2021 at 08:01:42PM +0200, Toke Høiland-Jørgensen wrote: > >> > Vladimir Oltean <olteanv@gmail.com> writes: > >> > > >> > > On Thu, Apr 01, 2021 at 01:39:05PM +0200, Toke Høiland-Jørgensen wrote: > >> > >> Vladimir Oltean <olteanv@gmail.com> writes: > >> > >> > >> > >> > On Thu, Apr 01, 2021 at 01:26:02PM +0200, Toke Høiland-Jørgensen wrote: > >> > >> >> > +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, > >> > >> >> > + struct xdp_frame **frames, u32 flags) > >> > >> >> > +{ > >> > >> >> > + struct enetc_tx_swbd xdp_redirect_arr[ENETC_MAX_SKB_FRAGS] = {0}; > >> > >> >> > + struct enetc_ndev_priv *priv = netdev_priv(ndev); > >> > >> >> > + struct enetc_bdr *tx_ring; > >> > >> >> > + int xdp_tx_bd_cnt, i, k; > >> > >> >> > + int xdp_tx_frm_cnt = 0; > >> > >> >> > + > >> > >> >> > + tx_ring = priv->tx_ring[smp_processor_id()]; > >> > >> >> > >> > >> >> What mechanism guarantees that this won't overflow the array? :) > >> > >> > > >> > >> > Which array, the array of TX rings? > >> > >> > >> > >> Yes. > >> > >> > >> > > > >> > > The problem isn't even accessing an out-of-bounds element in the TX ring array. > >> > > > >> > > As it turns out, I had a relatively superficial understanding of how > >> > > things are organized, but let me try to explain. 
> >> > > > >> > > The number of TX rings is a configurable resource (between PFs and VFs) > >> > > and we read the capability at probe time: > >> > > > >> > > enetc_get_si_caps: > >> > > val = enetc_rd(hw, ENETC_SICAPR0); > >> > > si->num_rx_rings = (val >> 16) & 0xff; > >> > > si->num_tx_rings = val & 0xff; > >> > > > >> > > enetc_init_si_rings_params: > >> > > priv->num_tx_rings = si->num_tx_rings; > >> > > > >> > > In any case, the TX array is declared as: > >> > > > >> > > struct enetc_ndev_priv { > >> > > struct enetc_bdr *tx_ring[16]; > >> > > struct enetc_bdr *rx_ring[16]; > >> > > }; > >> > > > >> > > because that's the maximum hardware capability. > >> > > > >> > > The priv->tx_ring array is populated in: > >> > > > >> > > enetc_alloc_msix: > >> > > /* # of tx rings per int vector */ > >> > > v_tx_rings = priv->num_tx_rings / priv->bdr_int_num; > >> > > > >> > > for (i = 0; i < priv->bdr_int_num; i++) { > >> > > for (j = 0; j < v_tx_rings; j++) { > >> > > if (priv->bdr_int_num == ENETC_MAX_BDR_INT) > >> > > idx = 2 * j + i; /* 2 CPUs */ > >> > > else > >> > > idx = j + i * v_tx_rings; /* default */ > >> > > > >> > > priv->tx_ring[idx] = bdr; > >> > > } > >> > > } > >> > > > >> > > priv->bdr_int_num is set to "num_online_cpus()". > >> > > On LS1028A, it can be either 1 or 2 (and the ENETC_MAX_BDR_INT macro is > >> > > equal to 2). > >> > > > >> > > Otherwise said, the convoluted logic above does the following: > >> > > - It affines an MSI interrupt vector per CPU > >> > > - It affines an RX ring per MSI vector, hence per CPU > >> > > - It balances the fixed number of TX rings (say 8) among the available > >> > > MSI vectors, hence CPUs (say 2). It does this by iterating with i > >> > > through the RX MSI interrupt vectors, and with j through the number of > >> > > TX rings per MSI vector. 
> >> > > > >> > > This logic maps: > >> > > - the even TX rings to CPU 0 and the odd TX rings to CPU 1, if 2 CPUs > >> > > are used > >> > > - all TX rings to CPU 0, if 1 CPU is used > >> > > > >> > > This is done because we have this logic in enetc_poll: > >> > > > >> > > for (i = 0; i < v->count_tx_rings; i++) > >> > > if (!enetc_clean_tx_ring(&v->tx_ring[i], budget)) > >> > > complete = false; > >> > > > >> > > for processing the TX completions of a given group of TX rings in the RX > >> > > MSI interrupt handler of a certain CPU. > >> > > > >> > > Otherwise said, priv->tx_ring[i] is always BD ring i, and that mapping > >> > > never changes. All 8 TX rings are enabled and available for use. > >> > > > >> > > What I knew about tc-taprio and tc-mqprio is that they only enqueue to > >> > > TX queues [0, num_tc-1] because of this, as it turns out: > >> > > > >> > > enetc_xmit: > >> > > tx_ring = priv->tx_ring[skb->queue_mapping]; > >> > > > >> > > where skb->queue_mapping is given by: > >> > > err = netif_set_real_num_tx_queues(ndev, priv->num_tx_rings); > >> > > and by this, respectively, from the mqprio code path: > >> > > netif_set_real_num_tx_queues(ndev, num_tc); > >> > > > >> > > As for why XDP works, and priv->tx_ring[smp_processor_id()] is: > >> > > - TX ring 0 for CPU 0 and TX ring 1 for CPU 1, if 2 CPUs are used > >> > > - TX ring 0, if 1 CPU is used > >> > > > >> > > The TX completions in the first case are handled by: > >> > > - CPU 0 for TX ring 0 (because it is even) and CPU 1 for TX ring 1 > >> > > (because it is odd), if 2 CPUs are used, due to the mapping I talked > >> > > about earlier > >> > > - CPU 0 if only 1 CPU is used > >> > > >> > Right - thank you for the details! So what are the constraints on the > >> > configuration. Specifically, given two netdevs on the same device, is it > >> > possible that the system can ever end up in a situation where one device > >> > has two *RXQs* configured, and the other only one *TXQ*. 
Because then > >> > you could get a redirect from RXQ 1 on one device, which would also end > >> > up trying to transmit on TXQ 1 on the other device; and that would break > >> > if that other device only has TXQ 0 configured... Same thing if a single > >> > device has 2 RXQs but only one TXQ (it can redirect to itself). > >> > >> I discover more and more of the driver as I talk to you, I like it :D > >> > >> So I said that there is a maximum number of RX and TX rings splittable > >> between the PF and its VFs, but I wasn't exactly sure where that > >> configuration is done. I found it now. > >> > >> enetc_port_si_configure: (SI == station interface) > >> - read Port capability register 0 (PCAPR0) to determine how many > >> RX rings and TX rings the hardware has for this port (PFs + VFs) > >> in total. > >> - assign num_rings = min(TX rings, RX rings) > >> - try to assign 8 TX rings and 8 RX rings to the PF > >> - if this fails, just assign ${num_rings} TX rings and > >> ${num_rings} RX rings to the PF > >> - split the remaining RX and TX rings to the number of > >> configured VFs (example: if there are 16 RX rings and 16 TX > >> rings for a port with 2 VFs, the driver assigns 8RX/8TX rings > >> for the PF, and 4RX/4TX rings for each VF). > >> - if we couldn't assign 8RX/8TX rings for the PF in the > >> previous step, we don't assign any ring to the VF > >> > >> So yeah, we have an equal number of RX and TX rings. The driver, > >> however, only uses 2 RX rings _actively_: one per CPU. The other 6, I > >> don't know, I guess I can use them for AF_XDP (I haven't looked very > >> closely at that yet), at the moment they're pretty much unused, even if > >> reserved and not given to VFs. > >> > >> > >> > You mean that it's possible to receive a TC_SETUP_QDISC_MQPRIO or > >> > >> > TC_SETUP_QDISC_TAPRIO with num_tc == 1, and we have 2 CPUs? > >> > >> > >> > >> Not just that, this ndo can be called on arbitrary CPUs after a > >> > >> redirect. 
The code just calls through from the XDP receive path so which > >> > >> CPU it ends up on depends on the RSS+IRQ config of the other device, > >> > >> which may not even be the same driver; i.e., you have no control over > >> > >> that... :) > >> > >> > >> > > > >> > > What do you mean by "arbitrary" CPU? You can't plug CPUs in, it's a dual > >> > > core system... Why does the source ifindex matter at all? I'm using the > >> > > TX ring affined to the CPU that ndo_xdp_xmit is currently running on. > >> > > >> > See, this is why I asked 'what mechanism ensures'. Because if that > >> > mechanism is 'this driver is only ever used on a system with fewer CPUs > >> > than TXQs', then that's of course fine :) > >> > > >> > But there are drivers that do basically the same thing as what you've > >> > done here, *without* having such an assurance, and just looking at that > >> > function it's not obvious that there's an out-of-band reason why it's > >> > safe. And I literally just came from looking at such a case when I > >> > replied to your initial patch... > >> > >> Maybe you were confused seeing that this is a PCI device, thinking it's > >> a plug-in card or something, therefore we don't get to choose the number > >> of CPUs that the host has. In hindsight, I don't know why you didn't ask > >> about this, it is pretty strange when you think about it. > >> > >> It is actually more like a platform device with a PCI front-end - we > >> found this loophole in the PCI standard where you can create a "root > >> complex/integrated endpoint" which is basically an ECAM where the config > >> space contains PFs corresponding to some platform devices in the SoC (in > >> our case, all 4 Ethernet ports have their own PF, the switch has its own > >> PF, same thing for the MDIO controller and the 1588 timer). 
Their > >> register map is exposed as a number of BARs which use Enhanced > >> Allocation, so the generic PCI ECAM driver doesn't need to create any > >> translation windows for these addresses, it just uses what's in there, > >> which, surprise, is the actual base address of the peripheral in the > >> SoC's memory space. > >> > >> We do that because we gain a lot of cool stuff by appearing as PCI > >> devices to system software, like for example multiple interfaces on top > >> of a 'shared MAC' are simply mapped over SR-IOV. > >> > >> So it just 'smells' like PCI, but they're regular memory-mapped devices, > >> there is no PCI transaction layer or physical layer. At the moment the > >> LS1028A is the only SoC running Linux that integrates the ENETC block, > >> so we fully control the environment. > >> > >> > >> > Well, yeah, I don't know what's the proper way to deal with that. Ideas? > >> > >> > >> > >> Well the obvious one is just: > >> > >> > >> > >> tx_ring = priv->tx_ring[smp_processor_id() % num_ring_ids]; > >> > >> > >> > >> and then some kind of locking to deal with multiple CPUs accessing the > >> > >> same TX ring... > >> > > > >> > > By multiple CPUs accessing the same TX ring, you mean locking between > >> > > ndo_xdp_xmit and ndo_start_xmit? Can that even happen if the hardware > >> > > architecture is to have at least as many TX rings as CPUs? > >> > > > >> > > Because otherwise, I see that ndo_xdp_xmit is only called from > >> > > xdp_do_flush, which is in softirq context, which to my very rudimentary > >> > > knowledge run with bottom halves, thus preemption, disabled? So I don't > >> > > think it's possible for ndo_xdp_xmit and ndo_xmit, or even two > >> > > ndo_xdp_xmit instances, to access the same TX ring? > >> > > >> > Yup, I think you're right about that. The "we always have more TXQs than > >> > CPUs" condition was the bit I was missing (and of course you're *sure* > >> > that this would never change sometime in the future, right? ;)). 
> >> I'm pretty sure, yeah, we build the SoCs and one of the requirements we > >> have is that every ENETC PF has enough TX rings in order for every CPU > >> to have its own one. That helps a lot with avoiding contention and > >> simplifying the driver. Maybe I'll use this opportunity to talk again to > >> the hardware design guys and make sure that the next SoCs with Linux > >> follow the same pattern as LS1028A, although I see no reason why not. > >> > >> > > Sorry, I'm sure these are trivial questions, but I would like to really > >> > > understand what I need to change and why :D > >> > > >> > Given the above I think the only potentially breaking thing is the > >> > #RXQ > #TXQ case I outlined. And maybe a comment documenting why indexing > >> > the tx_ring array by smp_processor_id() is safe would be nice? :) > >> > >> Sure, which part exactly do you think would explain it best? Should I > >> add a reference to enetc_port_si_configure? > > > > After discussing a bit more with Claudiu, I think we do have a problem, > > and it has to do with concurrent ndo_xdp_xmit on one CPU and ndo_start_xmit > > on another CPU. > > > > See, even if we have 8 TX rings, they are not really affined to any CPU. > > Instead, when we call netif_set_real_num_tx_queues, we allow netdev_pick_tx > > to hash among the TX queues of the same priority. There are three consequences: > > - Traffic with the same hash will be sent to the same TX queue, thus > > avoiding reordering for packets belonging to the same stream. > > - Traffic with different hashes are distributed to different TX queues. > > - If we have two CPUs sending traffic with the same hash, they will > > serialize on the TX lock of the same netdev queue. > > > > The last one is a problem because our XDP_REDIRECT tries to associate > > one TX ring with one CPU, and, as explained above, that TX ring might > > already be used by our ndo_start_xmit on another CPU, selected by > > netdev_pick_tx. 
> > > > The first idea was to implement ndo_select_queue for the network stack, > > and select the TX ring based on smp_processor_id(). But we know that > > this will break the first two effects of netdev_pick_tx, which are very > > much desirable. For example, if we have a user space process sending a > > TCP stream, and the scheduler migrates that process from one CPU to > > another, then the ndo_select_queue output for that TCP stream will > > change, and we will have TX reordering for packets belonging to the same > > stream. Not at all ideal. > > > > Another idea is to just crop some TX queues from the network stack, and > > basically call netif_set_real_num_tx_queues(6), leaving one TX ring per > > CPU dedicated to XDP. This will work just fine for normal qdiscs, except > > that with mqprio/taprio we have a problem. Our TX rings have a configurable > > strict priority for the hardware egress scheduler. When we don't have > > mqprio/taprio, all TX rings have the same priority of 0 (therefore it is > > safe to allow hashing to select one at random), but when we have mqprio > > or taprio, we enjoy the benefit of configuring the priority of each TX > > ring using the "map" option. The problem, of course, is that if we crop > > 2 TX rings out of what the network stack sees, then we are no longer > > able to configure their queue-to-traffic-class mapping through > > mqprio/taprio, so we cannot change their prioritization relative to the > > network stack queues. In a way, this seems to be in line with the XDP > > design because that bypasses anything that has to do with qdiscs, but we > > don't really like that. We also have some other qdisc-based offloads > > such as Credit Based Shaper, and we would very much like to be able to > > set bandwidth profiles for the XDP rings, for AVB/TSN use cases. 
> > You'd not be the first driver to solve this by just carving out a couple > of TX rings for XDP :) > > And while I get the desire for being able to configure these things for > XDP as well, I'm not sure that the qdisc interface is the right one to > use for that. There was a general TXQ allocation idea that unfortunately > stalled out, but there is also ongoing work on XDP+TSN - I'm hoping > Jesper can chime in with the details... See, the reason why I don't like this answer is because when we tried to upstream our genetlink-based TSN configuration: https://patchwork.ozlabs.org/project/netdev/patch/1545968945-7290-1-git-send-email-Po.Liu@nxp.com/ we were told that it's a QoS feature and QoS belongs to the qdisc layer. I get the impression that XDP is largely incompatible with QoS by design, which sounds to me like a bit of a foot gun. For example, we have some customers interested in building an AVB application stack on top of AF_XDP, and for the endpoints (talker/listener) they really need to be able to configure bandwidth profiles for Stream Reservation classes A and B on the AF_XDP rings. To us, tc is mostly just a configuration interface for hardware features, the deal was that this is fine as long as they have a software counterpart with identical semantics. I think I understand the basic problem in that a software shaper would be bypassed by XDP, and therefore, the bandwidth profile would not be observed properly by the AVB talker if we were to rely on that. So that sounds indeed like we shouldn't even attempt to manage any TX queues on which XDP traffic is possible with tc, unless we're willing to pass XDP_REDIRECT through the qdisc layer (which I'm not suggesting is a good idea). But with the hardware offload that wouldn't be the case, so it's almost as if what would work for us would be to have some 'dummy' TX queues for XDP manageable by tc qdiscs where we could attach our offloadable filters and shapers and policers. 
I just don't want them to be completely invisible as far as tc is concerned. Managing which TX queues go to XDP, and not letting the driver choose that, would be even nicer. > > Finally there is the option of taking the network stack's TX lock in our > > ndo_xdp_xmit, but frankly I would leave that option as a last resort. > > Maybe we could make this less expensive by bulk-enqueuing into a > > temporary array of buffer descriptors, and only taking the xmit lock > > when flushing that array (since that is the only portion that strictly > > needs to be protected against concurrency). But the problem with this > > approach is that if you have a temporary array, it becomes a lot more > > difficult and error-prone to not take more frames than you can enqueue. > > For example, the TX ring might have only 20 free entries, and you filled > > your BD array with 32 frames, and you told the caller of ndo_xdp_xmit > > that you processed all those frames. Now when push comes to shove and > > you actually need to enqueue them, you end up in the position that you > > must drop them yourself. This seems to be very much against the design > > principle of commit fdc13979f91e ("bpf, devmap: Move drop error path to > > devmap for XDP_REDIRECT") whose desire is to let XDP handle the dropping > > of excess TX frames. > > Note that there's already bulking in XDP_REDIRECT: after an XDP program > returns XDP_REDIRECT, the packets will actually be put on a bulk queue > (see bq_enqueue() in devmap.c), and that will be flushed to the TX > driver at the end of the (RX) NAPI cycle. So taking a lock in > ndo_xdp_xmit() may not be quite as much overhead as you think it is - > so maybe it would be worth benchmarking before ruling this out entirely? :) If shared TX queues do turn out to be the best alternative - which I'm not convinced it is - then I'll benchmark it, sure.
Vladimir Oltean <olteanv@gmail.com> writes: > On Sat, Apr 03, 2021 at 01:07:29PM +0200, Toke Høiland-Jørgensen wrote: >> Vladimir Oltean <olteanv@gmail.com> writes: >> >> > On Thu, Apr 01, 2021 at 10:38:21PM +0300, Vladimir Oltean wrote: >> >> On Thu, Apr 01, 2021 at 08:01:42PM +0200, Toke Høiland-Jørgensen wrote: >> >> > Vladimir Oltean <olteanv@gmail.com> writes: >> >> > >> >> > > On Thu, Apr 01, 2021 at 01:39:05PM +0200, Toke Høiland-Jørgensen wrote: >> >> > >> Vladimir Oltean <olteanv@gmail.com> writes: >> >> > >> >> >> > >> > On Thu, Apr 01, 2021 at 01:26:02PM +0200, Toke Høiland-Jørgensen wrote: >> >> > >> >> > +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, >> >> > >> >> > + struct xdp_frame **frames, u32 flags) >> >> > >> >> > +{ >> >> > >> >> > + struct enetc_tx_swbd xdp_redirect_arr[ENETC_MAX_SKB_FRAGS] = {0}; >> >> > >> >> > + struct enetc_ndev_priv *priv = netdev_priv(ndev); >> >> > >> >> > + struct enetc_bdr *tx_ring; >> >> > >> >> > + int xdp_tx_bd_cnt, i, k; >> >> > >> >> > + int xdp_tx_frm_cnt = 0; >> >> > >> >> > + >> >> > >> >> > + tx_ring = priv->tx_ring[smp_processor_id()]; >> >> > >> >> >> >> > >> >> What mechanism guarantees that this won't overflow the array? :) >> >> > >> > >> >> > >> > Which array, the array of TX rings? >> >> > >> >> >> > >> Yes. >> >> > >> >> >> > > >> >> > > The problem isn't even accessing an out-of-bounds element in the TX ring array. >> >> > > >> >> > > As it turns out, I had a relatively superficial understanding of how >> >> > > things are organized, but let me try to explain. 
>> >> > > >> >> > > The number of TX rings is a configurable resource (between PFs and VFs) >> >> > > and we read the capability at probe time: >> >> > > >> >> > > enetc_get_si_caps: >> >> > > val = enetc_rd(hw, ENETC_SICAPR0); >> >> > > si->num_rx_rings = (val >> 16) & 0xff; >> >> > > si->num_tx_rings = val & 0xff; >> >> > > >> >> > > enetc_init_si_rings_params: >> >> > > priv->num_tx_rings = si->num_tx_rings; >> >> > > >> >> > > In any case, the TX array is declared as: >> >> > > >> >> > > struct enetc_ndev_priv { >> >> > > struct enetc_bdr *tx_ring[16]; >> >> > > struct enetc_bdr *rx_ring[16]; >> >> > > }; >> >> > > >> >> > > because that's the maximum hardware capability. >> >> > > >> >> > > The priv->tx_ring array is populated in: >> >> > > >> >> > > enetc_alloc_msix: >> >> > > /* # of tx rings per int vector */ >> >> > > v_tx_rings = priv->num_tx_rings / priv->bdr_int_num; >> >> > > >> >> > > for (i = 0; i < priv->bdr_int_num; i++) { >> >> > > for (j = 0; j < v_tx_rings; j++) { >> >> > > if (priv->bdr_int_num == ENETC_MAX_BDR_INT) >> >> > > idx = 2 * j + i; /* 2 CPUs */ >> >> > > else >> >> > > idx = j + i * v_tx_rings; /* default */ >> >> > > >> >> > > priv->tx_ring[idx] = bdr; >> >> > > } >> >> > > } >> >> > > >> >> > > priv->bdr_int_num is set to "num_online_cpus()". >> >> > > On LS1028A, it can be either 1 or 2 (and the ENETC_MAX_BDR_INT macro is >> >> > > equal to 2). >> >> > > >> >> > > Otherwise said, the convoluted logic above does the following: >> >> > > - It affines an MSI interrupt vector per CPU >> >> > > - It affines an RX ring per MSI vector, hence per CPU >> >> > > - It balances the fixed number of TX rings (say 8) among the available >> >> > > MSI vectors, hence CPUs (say 2). It does this by iterating with i >> >> > > through the RX MSI interrupt vectors, and with j through the number of >> >> > > TX rings per MSI vector. 
>> >> > > >> >> > > This logic maps: >> >> > > - the even TX rings to CPU 0 and the odd TX rings to CPU 1, if 2 CPUs >> >> > > are used >> >> > > - all TX rings to CPU 0, if 1 CPU is used >> >> > > >> >> > > This is done because we have this logic in enetc_poll: >> >> > > >> >> > > for (i = 0; i < v->count_tx_rings; i++) >> >> > > if (!enetc_clean_tx_ring(&v->tx_ring[i], budget)) >> >> > > complete = false; >> >> > > >> >> > > for processing the TX completions of a given group of TX rings in the RX >> >> > > MSI interrupt handler of a certain CPU. >> >> > > >> >> > > Otherwise said, priv->tx_ring[i] is always BD ring i, and that mapping >> >> > > never changes. All 8 TX rings are enabled and available for use. >> >> > > >> >> > > What I knew about tc-taprio and tc-mqprio is that they only enqueue to >> >> > > TX queues [0, num_tc-1] because of this, as it turns out: >> >> > > >> >> > > enetc_xmit: >> >> > > tx_ring = priv->tx_ring[skb->queue_mapping]; >> >> > > >> >> > > where skb->queue_mapping is given by: >> >> > > err = netif_set_real_num_tx_queues(ndev, priv->num_tx_rings); >> >> > > and by this, respectively, from the mqprio code path: >> >> > > netif_set_real_num_tx_queues(ndev, num_tc); >> >> > > >> >> > > As for why XDP works, and priv->tx_ring[smp_processor_id()] is: >> >> > > - TX ring 0 for CPU 0 and TX ring 1 for CPU 1, if 2 CPUs are used >> >> > > - TX ring 0, if 1 CPU is used >> >> > > >> >> > > The TX completions in the first case are handled by: >> >> > > - CPU 0 for TX ring 0 (because it is even) and CPU 1 for TX ring 1 >> >> > > (because it is odd), if 2 CPUs are used, due to the mapping I talked >> >> > > about earlier >> >> > > - CPU 0 if only 1 CPU is used >> >> > >> >> > Right - thank you for the details! So what are the constraints on the >> >> > configuration. 
Specifically, given two netdevs on the same device, is it >> >> > possible that the system can ever end up in a situation where one device >> >> > has two *RXQs* configured, and the other only one *TXQ*. Because then >> >> > you could get a redirect from RXQ 1 on one device, which would also end >> >> > up trying to transmit on TXQ 1 on the other device; and that would break >> >> > if that other device only has TXQ 0 configured... Same thing if a single >> >> > device has 2 RXQs but only one TXQ (it can redirect to itself). >> >> >> >> I discover more and more of the driver as I talk to you, I like it :D >> >> >> >> So I said that there is a maximum number of RX and TX rings splittable >> >> between the PF and its VFs, but I wasn't exactly sure where that >> >> configuration is done. I found it now. >> >> >> >> enetc_port_si_configure: (SI == station interface) >> >> - read Port capability register 0 (PCAPR0) to determine how many >> >> RX rings and TX rings the hardware has for this port (PFs + VFs) >> >> in total. >> >> - assign num_rings = min(TX rings, RX rings) >> >> - try to assign 8 TX rings and 8 RX rings to the PF >> >> - if this fails, just assign ${num_rings} TX rings and >> >> ${num_rings} RX rings to the PF >> >> - split the remaining RX and TX rings to the number of >> >> configured VFs (example: if there are 16 RX rings and 16 TX >> >> rings for a port with 2 VFs, the driver assigns 8RX/8TX rings >> >> for the PF, and 4RX/4TX rings for each VF). >> >> - if we couldn't assign 8RX/8TX rings for the PF in the >> >> previous step, we don't assign any ring to the VF >> >> >> >> So yeah, we have an equal number of RX and TX rings. The driver, >> >> however, only uses 2 RX rings _actively_: one per CPU. The other 6, I >> >> don't know, I guess I can use them for AF_XDP (I haven't looked very >> >> closely at that yet), at the moment they're pretty much unused, even if >> >> reserved and not given to VFs. 
>> >> >> >> > >> > You mean that it's possible to receive a TC_SETUP_QDISC_MQPRIO or >> >> > >> > TC_SETUP_QDISC_TAPRIO with num_tc == 1, and we have 2 CPUs? >> >> > >> >> >> > >> Not just that, this ndo can be called on arbitrary CPUs after a >> >> > >> redirect. The code just calls through from the XDP receive path so which >> >> > >> CPU it ends up on depends on the RSS+IRQ config of the other device, >> >> > >> which may not even be the same driver; i.e., you have no control over >> >> > >> that... :) >> >> > >> >> >> > > >> >> > > What do you mean by "arbitrary" CPU? You can't plug CPUs in, it's a dual >> >> > > core system... Why does the source ifindex matter at all? I'm using the >> >> > > TX ring affined to the CPU that ndo_xdp_xmit is currently running on. >> >> > >> >> > See, this is why I asked 'what mechanism ensures'. Because if that >> >> > mechanism is 'this driver is only ever used on a system with fewer CPUs >> >> > than TXQs', then that's of course fine :) >> >> > >> >> > But there are drivers that do basically the same thing as what you've >> >> > done here, *without* having such an assurance, and just looking at that >> >> > function it's not obvious that there's an out-of-band reason why it's >> >> > safe. And I literally just came from looking at such a case when I >> >> > replied to your initial patch... >> >> >> >> Maybe you were confused seeing that this is a PCI device, thinking it's >> >> a plug-in card or something, therefore we don't get to choose the number >> >> of CPUs that the host has. In hindsight, I don't know why you didn't ask >> >> about this, it is pretty strange when you think about it. 
>> >> >> >> It is actually more like a platform device with a PCI front-end - we >> >> found this loophole in the PCI standard where you can create a "root >> >> complex/integrated endpoint" which is basically an ECAM where the config >> >> space contains PFs corresponding to some platform devices in the SoC (in >> >> our case, all 4 Ethernet ports have their own PF, the switch has its own >> >> PF, same thing for the MDIO controller and the 1588 timer). Their >> >> register map is exposed as a number of BARs which use Enhanced >> >> Allocation, so the generic PCI ECAM driver doesn't need to create any >> >> translation windows for these addresses, it just uses what's in there, >> >> which, surprise, is the actual base address of the peripheral in the >> >> SoC's memory space. >> >> >> >> We do that because we gain a lot of cool stuff by appearing as PCI >> >> devices to system software, like for example multiple interfaces on top >> >> of a 'shared MAC' are simply mapped over SR-IOV. >> >> >> >> So it just 'smells' like PCI, but they're regular memory-mapped devices, >> >> there is no PCI transaction layer or physical layer. At the moment the >> >> LS1028A is the only SoC running Linux that integrates the ENETC block, >> >> so we fully control the environment. >> >> >> >> > >> > Well, yeah, I don't know what's the proper way to deal with that. Ideas? >> >> > >> >> >> > >> Well the obvious one is just: >> >> > >> >> >> > >> tx_ring = priv->tx_ring[smp_processor_id() % num_ring_ids]; >> >> > >> >> >> > >> and then some kind of locking to deal with multiple CPUs accessing the >> >> > >> same TX ring... >> >> > > >> >> > > By multiple CPUs accessing the same TX ring, you mean locking between >> >> > > ndo_xdp_xmit and ndo_start_xmit? Can that even happen if the hardware >> >> > > architecture is to have at least as many TX rings as CPUs? 
>> >> > > >> >> > > Because otherwise, I see that ndo_xdp_xmit is only called from >> >> > > xdp_do_flush, which is in softirq context, which to my very rudimentary >> >> > > knowledge run with bottom halves, thus preemption, disabled? So I don't >> >> > > think it's possible for ndo_xdp_xmit and ndo_xmit, or even two >> >> > > ndo_xdp_xmit instances, to access the same TX ring? >> >> > >> >> > Yup, I think you're right about that. The "we always have more TXQs than >> >> > CPUs" condition was the bit I was missing (and of course you're *sure* >> >> > that this would never change sometime in the future, right? ;)). >> >> >> >> I'm pretty sure, yeah, we build the SoCs and one of the requirements we >> >> have is that every ENETC PF has enough TX rings in order for every CPU >> >> to have its own one. That helps a lot with avoiding contention and >> >> simplifying the driver. Maybe I'll use this opportunity to talk again to >> >> the hardware design guys and make sure that the next SoCs with Linux >> >> follow the same pattern as LS1028A, although I see no reason why not. >> >> >> >> > > Sorry, I'm sure these are trivial questions, but I would like to really >> >> > > understand what I need to change and why :D >> >> > >> >> > Given the above I think the only potentially breaking thing is the >> >> > #RXQ > #TXQ case I outlined. And maybe a comment documenting why indexing >> >> > the tx_ring array by smp_processor_id() is safe would be nice? :) >> >> >> >> Sure, which part exactly do you think would explain it best? Should I >> >> add a reference to enetc_port_si_configure? >> > >> > After discussing a bit more with Claudiu, I think we do have a problem, >> > and it has to do with concurrent ndo_xdp_xmit on one CPU and ndo_start_xmit >> > on another CPU. >> > >> > See, even if we have 8 TX rings, they are not really affined to any CPU. 
>> > Instead, when we call netif_set_real_num_tx_queues, we allow netdev_pick_tx >> > to hash among the TX queues of the same priority. There are three consequences: >> > - Traffic with the same hash will be sent to the same TX queue, thus >> > avoiding reordering for packets belonging to the same stream. >> > - Traffic with different hashes are distributed to different TX queues. >> > - If we have two CPUs sending traffic with the same hash, they will >> > serialize on the TX lock of the same netdev queue. >> > >> > The last one is a problem because our XDP_REDIRECT tries to associate >> > one TX ring with one CPU, and, as explained above, that TX ring might >> > already be used by our ndo_start_xmit on another CPU, selected by >> > netdev_pick_tx. >> > >> > The first idea was to implement ndo_select_queue for the network stack, >> > and select the TX ring based on smp_processor_id(). But we know that >> > this will break the first two effects of netdev_pick_tx, which are very >> > much desirable. For example, if we have a user space process sending a >> > TCP stream, and the scheduler migrates that process from one CPU to >> > another, then the ndo_select_queue output for that TCP stream will >> > change, and we will have TX reordering for packets belonging to the same >> > stream. Not at all ideal. >> > >> > Another idea is to just crop some TX queues from the network stack, and >> > basically call netif_set_real_num_tx_queues(6), leaving one TX ring per >> > CPU dedicated to XDP. This will work just fine for normal qdiscs, except >> > that with mqprio/taprio we have a problem. Our TX rings have a configurable >> > strict priority for the hardware egress scheduler. When we don't have >> > mqprio/taprio, all TX rings have the same priority of 0 (therefore it is >> > safe to allow hashing to select one at random), but when we have mqprio >> > or taprio, we enjoy the benefit of configuring the priority of each TX >> > ring using the "map" option. 
The problem, of course, is that if we crop >> > 2 TX rings out of what the network stack sees, then we are no longer >> > able to configure their queue-to-traffic-class mapping through >> > mqprio/taprio, so we cannot change their prioritization relative to the >> > network stack queues. In a way, this seems to be in line with the XDP >> > design because that bypasses anything that has to do with qdiscs, but we >> > don't really like that. We also have some other qdisc-based offloads >> > such as Credit Based Shaper, and we would very much like to be able to >> > set bandwidth profiles for the XDP rings, for AVB/TSN use cases. >> >> You'd not be the first driver to solve this by just carving out a couple >> of TX rings for XDP :) >> >> And while I get the desire for being able to configure these things for >> XDP as well, I'm not sure that the qdisc interface is the right one to >> use for that. There was a general TXQ allocation idea that unfortunately >> stalled out, but there is also ongoing work on XDP+TSN - I'm hoping >> Jesper can chime in with the details... > > See, the reason why I don't like this answer is because when we tried to > upstream our genetlink-based TSN configuration: > https://patchwork.ozlabs.org/project/netdev/patch/1545968945-7290-1-git-send-email-Po.Liu@nxp.com/ > we were told that it's a QoS feature and QoS belongs to the qdisc layer. > > I get the impression that XDP is largely incompatible with QoS by design, > which sounds to me like a bit of a foot gun. For example, we have some > customers interested in building an AVB application stack on top of AF_XDP, > and for the endpoints (talker/listener) they really need to be able to > configure bandwidth profiles for Stream Reservation classes A and B on > the AF_XDP rings. > > To us, tc is mostly just a configuration interface for hardware features, > the deal was that this is fine as long as they have a software counterpart > with identical semantics. 
I think I understand the basic problem in that > a software shaper would be bypassed by XDP, and therefore, the bandwidth > profile would not be observed properly by the AVB talker if we were to > rely on that. So that sounds indeed like we shouldn't even attempt to > manage any TX queues on which XDP traffic is possible with tc, unless > we're willing to pass XDP_REDIRECT through the qdisc layer (which I'm > not suggesting is a good idea). But with the hardware offload that > wouldn't be the case, so it's almost as if what would work for us would > be to have some 'dummy' TX queues for XDP manageable by tc qdiscs where > we could attach our offloadable filters and shapers and policers. I just > don't want them to be completely invisible as far as tc is concerned. > Managing which TX queues go to XDP, and not letting the driver choose > that, would be even nicer. I'm not objecting to being able to configure the hardware queues that will be used for XDP, I'm just saying that doing so via TC is not a very good interface for it... Rather, we need an interface for configuring hardware queues that can be used by *both* XDP and TC. And yeah, the lack of queueing and bandwidth management is a major footgun in XDP, which we do want to fix (also for regular XDP_REDIRECT, not just AF_XDP). -Toke
diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index ba5313a5d7a4..57049ae97201 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -8,6 +8,23 @@ #include <linux/vmalloc.h> #include <net/pkt_sched.h> +static struct sk_buff *enetc_tx_swbd_get_skb(struct enetc_tx_swbd *tx_swbd) +{ + if (tx_swbd->is_xdp_tx || tx_swbd->is_xdp_redirect) + return NULL; + + return tx_swbd->skb; +} + +static struct xdp_frame * +enetc_tx_swbd_get_xdp_frame(struct enetc_tx_swbd *tx_swbd) +{ + if (tx_swbd->is_xdp_redirect) + return tx_swbd->xdp_frame; + + return NULL; +} + static void enetc_unmap_tx_buff(struct enetc_bdr *tx_ring, struct enetc_tx_swbd *tx_swbd) { @@ -25,14 +42,20 @@ static void enetc_unmap_tx_buff(struct enetc_bdr *tx_ring, tx_swbd->dma = 0; } -static void enetc_free_tx_skb(struct enetc_bdr *tx_ring, - struct enetc_tx_swbd *tx_swbd) +static void enetc_free_tx_frame(struct enetc_bdr *tx_ring, + struct enetc_tx_swbd *tx_swbd) { + struct xdp_frame *xdp_frame = enetc_tx_swbd_get_xdp_frame(tx_swbd); + struct sk_buff *skb = enetc_tx_swbd_get_skb(tx_swbd); + if (tx_swbd->dma) enetc_unmap_tx_buff(tx_ring, tx_swbd); - if (tx_swbd->skb) { - dev_kfree_skb_any(tx_swbd->skb); + if (xdp_frame) { + xdp_return_frame(tx_swbd->xdp_frame); + tx_swbd->xdp_frame = NULL; + } else if (skb) { + dev_kfree_skb_any(skb); tx_swbd->skb = NULL; } } @@ -183,7 +206,7 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb, do { tx_swbd = &tx_ring->tx_swbd[i]; - enetc_free_tx_skb(tx_ring, tx_swbd); + enetc_free_tx_frame(tx_ring, tx_swbd); if (i == 0) i = tx_ring->bd_count; i--; @@ -381,6 +404,9 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) do_tstamp = false; while (bds_to_clean && tx_frm_cnt < ENETC_DEFAULT_TX_WORK) { + struct xdp_frame *xdp_frame = enetc_tx_swbd_get_xdp_frame(tx_swbd); + struct sk_buff *skb = 
enetc_tx_swbd_get_skb(tx_swbd); + if (unlikely(tx_swbd->check_wb)) { struct enetc_ndev_priv *priv = netdev_priv(ndev); union enetc_tx_bd *txbd; @@ -400,12 +426,15 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) else if (likely(tx_swbd->dma)) enetc_unmap_tx_buff(tx_ring, tx_swbd); - if (tx_swbd->skb) { + if (xdp_frame) { + xdp_return_frame(xdp_frame); + tx_swbd->xdp_frame = NULL; + } else if (skb) { if (unlikely(do_tstamp)) { - enetc_tstamp_tx(tx_swbd->skb, tstamp); + enetc_tstamp_tx(skb, tstamp); do_tstamp = false; } - napi_consume_skb(tx_swbd->skb, napi_budget); + napi_consume_skb(skb, napi_budget); tx_swbd->skb = NULL; } @@ -827,6 +856,109 @@ static bool enetc_xdp_tx(struct enetc_bdr *tx_ring, return true; } +static int enetc_xdp_frame_to_xdp_tx_swbd(struct enetc_bdr *tx_ring, + struct enetc_tx_swbd *xdp_tx_arr, + struct xdp_frame *xdp_frame) +{ + struct enetc_tx_swbd *xdp_tx_swbd = &xdp_tx_arr[0]; + struct skb_shared_info *shinfo; + void *data = xdp_frame->data; + int len = xdp_frame->len; + skb_frag_t *frag; + dma_addr_t dma; + unsigned int f; + int n = 0; + + dma = dma_map_single(tx_ring->dev, data, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) { + netdev_err(tx_ring->ndev, "DMA map error\n"); + return -1; + } + + xdp_tx_swbd->dma = dma; + xdp_tx_swbd->dir = DMA_TO_DEVICE; + xdp_tx_swbd->len = len; + xdp_tx_swbd->is_xdp_redirect = true; + xdp_tx_swbd->is_eof = false; + xdp_tx_swbd->xdp_frame = NULL; + + n++; + xdp_tx_swbd = &xdp_tx_arr[n]; + + shinfo = xdp_get_shared_info_from_frame(xdp_frame); + + for (f = 0, frag = &shinfo->frags[0]; f < shinfo->nr_frags; + f++, frag++) { + data = skb_frag_address(frag); + len = skb_frag_size(frag); + + dma = dma_map_single(tx_ring->dev, data, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) { + /* Undo the DMA mapping for all fragments */ + while (n-- >= 0) + enetc_unmap_tx_buff(tx_ring, &xdp_tx_arr[n]); + + netdev_err(tx_ring->ndev, "DMA 
map error\n"); + return -1; + } + + xdp_tx_swbd->dma = dma; + xdp_tx_swbd->dir = DMA_TO_DEVICE; + xdp_tx_swbd->len = len; + xdp_tx_swbd->is_xdp_redirect = true; + xdp_tx_swbd->is_eof = false; + xdp_tx_swbd->xdp_frame = NULL; + + n++; + xdp_tx_swbd = &xdp_tx_arr[n]; + } + + xdp_tx_arr[n - 1].is_eof = true; + xdp_tx_arr[n - 1].xdp_frame = xdp_frame; + + return n; +} + +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, + struct xdp_frame **frames, u32 flags) +{ + struct enetc_tx_swbd xdp_redirect_arr[ENETC_MAX_SKB_FRAGS] = {0}; + struct enetc_ndev_priv *priv = netdev_priv(ndev); + struct enetc_bdr *tx_ring; + int xdp_tx_bd_cnt, i, k; + int xdp_tx_frm_cnt = 0; + + tx_ring = priv->tx_ring[smp_processor_id()]; + + prefetchw(ENETC_TXBD(*tx_ring, tx_ring->next_to_use)); + + for (k = 0; k < num_frames; k++) { + xdp_tx_bd_cnt = enetc_xdp_frame_to_xdp_tx_swbd(tx_ring, + xdp_redirect_arr, + frames[k]); + if (unlikely(xdp_tx_bd_cnt < 0)) + break; + + if (unlikely(!enetc_xdp_tx(tx_ring, xdp_redirect_arr, + xdp_tx_bd_cnt))) { + for (i = 0; i < xdp_tx_bd_cnt; i++) + enetc_unmap_tx_buff(tx_ring, + &xdp_redirect_arr[i]); + tx_ring->stats.xdp_tx_drops++; + break; + } + + xdp_tx_frm_cnt++; + } + + if (unlikely((flags & XDP_XMIT_FLUSH) || k != xdp_tx_frm_cnt)) + enetc_update_tx_ring_tail(tx_ring); + + tx_ring->stats.xdp_tx += xdp_tx_frm_cnt; + + return xdp_tx_frm_cnt; +} + static void enetc_map_rx_buff_to_xdp(struct enetc_bdr *rx_ring, int i, struct xdp_buff *xdp_buff, u16 size) { @@ -948,14 +1080,31 @@ static void enetc_xdp_drop(struct enetc_bdr *rx_ring, int rx_ring_first, rx_ring->stats.xdp_drops++; } +static void enetc_xdp_free(struct enetc_bdr *rx_ring, int rx_ring_first, + int rx_ring_last) +{ + while (rx_ring_first != rx_ring_last) { + struct enetc_rx_swbd *rx_swbd = &rx_ring->rx_swbd[rx_ring_first]; + + if (rx_swbd->page) { + dma_unmap_page(rx_ring->dev, rx_swbd->dma, PAGE_SIZE, + rx_swbd->dir); + __free_page(rx_swbd->page); + rx_swbd->page = NULL; + } + 
enetc_bdr_idx_inc(rx_ring, &rx_ring_first); + } + rx_ring->stats.xdp_redirect_failures++; +} + static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, struct napi_struct *napi, int work_limit, struct bpf_prog *prog) { + int xdp_tx_bd_cnt, xdp_tx_frm_cnt = 0, xdp_redirect_frm_cnt = 0; struct enetc_tx_swbd xdp_tx_arr[ENETC_MAX_SKB_FRAGS] = {0}; struct enetc_ndev_priv *priv = netdev_priv(rx_ring->ndev); struct enetc_bdr *tx_ring = priv->tx_ring[rx_ring->index]; - int xdp_tx_bd_cnt, xdp_tx_frm_cnt = 0; int rx_frm_cnt = 0, rx_byte_cnt = 0; int cleaned_cnt, i; u32 xdp_act; @@ -969,6 +1118,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, int orig_i, orig_cleaned_cnt; struct xdp_buff xdp_buff; struct sk_buff *skb; + int tmp_orig_i, err; u32 bd_status; rxbd = enetc_rxbd(rx_ring, i); @@ -1026,6 +1176,43 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, rx_ring->xdp.xdp_tx_in_flight += xdp_tx_bd_cnt; xdp_tx_frm_cnt++; } + break; + case XDP_REDIRECT: + /* xdp_return_frame does not support S/G in the sense + * that it leaks the fragments (__xdp_return should not + * call page_frag_free only for the initial buffer). + * Until XDP_REDIRECT gains support for S/G let's keep + * the code structure in place, but dead. We drop the + * S/G frames ourselves to avoid memory leaks which + * would otherwise leave the kernel OOM. 
+ */ + if (unlikely(cleaned_cnt - orig_cleaned_cnt != 1)) { + enetc_xdp_drop(rx_ring, orig_i, i); + rx_ring->stats.xdp_redirect_sg++; + break; + } + + tmp_orig_i = orig_i; + + while (orig_i != i) { + enetc_put_rx_buff(rx_ring, + &rx_ring->rx_swbd[orig_i]); + enetc_bdr_idx_inc(rx_ring, &orig_i); + } + + err = xdp_do_redirect(rx_ring->ndev, &xdp_buff, prog); + if (unlikely(err)) { + enetc_xdp_free(rx_ring, tmp_orig_i, i); + } else { + xdp_redirect_frm_cnt++; + rx_ring->stats.xdp_redirect++; + } + + if (unlikely(xdp_redirect_frm_cnt > ENETC_DEFAULT_TX_WORK)) { + xdp_do_flush_map(); + xdp_redirect_frm_cnt = 0; + } + break; default: bpf_warn_invalid_xdp_action(xdp_act); @@ -1039,6 +1226,9 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, rx_ring->stats.packets += rx_frm_cnt; rx_ring->stats.bytes += rx_byte_cnt; + if (xdp_redirect_frm_cnt) + xdp_do_flush_map(); + if (xdp_tx_frm_cnt) enetc_update_tx_ring_tail(tx_ring); @@ -1173,7 +1363,7 @@ static void enetc_free_txbdr(struct enetc_bdr *txr) int size, i; for (i = 0; i < txr->bd_count; i++) - enetc_free_tx_skb(txr, &txr->tx_swbd[i]); + enetc_free_tx_frame(txr, &txr->tx_swbd[i]); size = txr->bd_count * sizeof(union enetc_tx_bd); @@ -1290,7 +1480,7 @@ static void enetc_free_tx_ring(struct enetc_bdr *tx_ring) for (i = 0; i < tx_ring->bd_count; i++) { struct enetc_tx_swbd *tx_swbd = &tx_ring->tx_swbd[i]; - enetc_free_tx_skb(tx_ring, tx_swbd); + enetc_free_tx_frame(tx_ring, tx_swbd); } tx_ring->next_to_clean = 0; diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index d0619fcbbe97..05474f46b0d9 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -19,7 +19,10 @@ (ETH_FCS_LEN + ETH_HLEN + VLAN_HLEN)) struct enetc_tx_swbd { - struct sk_buff *skb; + union { + struct sk_buff *skb; + struct xdp_frame *xdp_frame; + }; dma_addr_t dma; struct page *page; /* valid only if is_xdp_tx */ u16 page_offset; /* valid 
only if is_xdp_tx */ @@ -30,6 +33,7 @@ struct enetc_tx_swbd { u8 do_tstamp:1; u8 is_eof:1; u8 is_xdp_tx:1; + u8 is_xdp_redirect:1; }; #define ENETC_RX_MAXFRM_SIZE ENETC_MAC_MAXFRM_SIZE @@ -61,6 +65,9 @@ struct enetc_ring_stats { unsigned int xdp_drops; unsigned int xdp_tx; unsigned int xdp_tx_drops; + unsigned int xdp_redirect; + unsigned int xdp_redirect_failures; + unsigned int xdp_redirect_sg; unsigned int recycles; unsigned int recycle_failures; }; @@ -354,6 +361,8 @@ int enetc_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd); int enetc_setup_tc(struct net_device *ndev, enum tc_setup_type type, void *type_data); int enetc_setup_bpf(struct net_device *dev, struct netdev_bpf *xdp); +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, + struct xdp_frame **frames, u32 flags); /* ethtool */ void enetc_set_ethtool_ops(struct net_device *ndev); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index 37821a8b225e..7cc81b453bd7 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -195,6 +195,9 @@ static const char rx_ring_stats[][ETH_GSTRING_LEN] = { "Rx ring %2d XDP drops", "Rx ring %2d recycles", "Rx ring %2d recycle failures", + "Rx ring %2d redirects", + "Rx ring %2d redirect failures", + "Rx ring %2d redirect S/G", }; static const char tx_ring_stats[][ETH_GSTRING_LEN] = { @@ -284,6 +287,9 @@ static void enetc_get_ethtool_stats(struct net_device *ndev, data[o++] = priv->rx_ring[i]->stats.xdp_drops; data[o++] = priv->rx_ring[i]->stats.recycles; data[o++] = priv->rx_ring[i]->stats.recycle_failures; + data[o++] = priv->rx_ring[i]->stats.xdp_redirect; + data[o++] = priv->rx_ring[i]->stats.xdp_redirect_failures; + data[o++] = priv->rx_ring[i]->stats.xdp_redirect_sg; } if (!enetc_si_is_pf(priv->si)) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c 
index 0484dbe13422..f61fedf462e5 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -708,6 +708,7 @@ static const struct net_device_ops enetc_ndev_ops = { .ndo_do_ioctl = enetc_ioctl, .ndo_setup_tc = enetc_setup_tc, .ndo_bpf = enetc_setup_bpf, + .ndo_xdp_xmit = enetc_xdp_xmit, }; static void enetc_pf_netdev_setup(struct enetc_si *si, struct net_device *ndev,