Message ID | 20210125124516.3098129-2-liuhangbin@gmail.com |
---|---|
State | Superseded |
Headers | show |
Series | xdp: add a new helper for dev map multicast support | expand |
Hangbin Liu wrote: > From: Jesper Dangaard Brouer <brouer@redhat.com> > > This changes the devmap XDP program support to run the program when the > bulk queue is flushed instead of before the frame is enqueued. This has > a couple of benefits: > > - It "sorts" the packets by destination devmap entry, and then runs the > same BPF program on all the packets in sequence. This ensures that we > keep the XDP program and destination device properties hot in I-cache. > > - It makes the multicast implementation simpler because it can just > enqueue packets using bq_enqueue() without having to deal with the > devmap program at all. > > The drawback is that if the devmap program drops the packet, the enqueue > step is redundant. However, arguably this is mostly visible in a > micro-benchmark, and with more mixed traffic the I-cache benefit should > win out. The performance impact of just this patch is as follows: > > The bq_xmit_all's logic is also refactored and error label is removed. > When bq_xmit_all() is called from bq_enqueue(), another packet will > always be enqueued immediately after, so clearing dev_rx, xdp_prog and > flush_node in bq_xmit_all() is redundant. Let's move the clear to > __dev_flush(), and only check them once in bq_enqueue() since they are > all modified together. > > By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd: > ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64 > > There are about +/- 0.1M deviation for native testing, the performance > improved for the base-case, but some drop back with xdp devmap prog attached. > > Version | Test | Generic | Native | Native + 2nd xdp_prog > 5.10 rc6 | xdp_redirect_map i40e->i40e | 2.0M | 9.1M | 8.0M > 5.10 rc6 | xdp_redirect_map i40e->veth | 1.7M | 11.0M | 9.7M > 5.10 rc6 + patch | xdp_redirect_map i40e->i40e | 2.0M | 9.5M | 7.5M > 5.10 rc6 + patch | xdp_redirect_map i40e->veth | 1.7M | 11.6M | 9.1M > [...] > +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, > + struct xdp_frame **frames, int n, > + struct net_device *dev) > +{ > + struct xdp_txq_info txq = { .dev = dev }; > + struct xdp_buff xdp; > + int i, nframes = 0; > + > + for (i = 0; i < n; i++) { > + struct xdp_frame *xdpf = frames[i]; > + u32 act; > + int err; > + > + xdp_convert_frame_to_buff(xdpf, &xdp); > + xdp.txq = &txq; > + > + act = bpf_prog_run_xdp(xdp_prog, &xdp); > + switch (act) { > + case XDP_PASS: > + err = xdp_update_frame_from_buff(&xdp, xdpf); > + if (unlikely(err < 0)) > + xdp_return_frame_rx_napi(xdpf); > + else > + frames[nframes++] = xdpf; > + break; > + default: > + bpf_warn_invalid_xdp_action(act); > + fallthrough; > + case XDP_ABORTED: > + trace_xdp_exception(dev, xdp_prog, act); > + fallthrough; > + case XDP_DROP: > + xdp_return_frame_rx_napi(xdpf); > + break; > + } > + } > + return nframes; /* sent frames count */ > +} > + > static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) > { > struct net_device *dev = bq->dev; > - int sent = 0, drops = 0, err = 0; > + unsigned int cnt = bq->count; > + int drops = 0, err = 0; > + int to_send = cnt; > + int sent = cnt; > int i; > > - if (unlikely(!bq->count)) > + if (unlikely(!cnt)) > return; > > - for (i = 0; i < bq->count; i++) { > + for (i = 0; i < cnt; i++) { > struct xdp_frame *xdpf = bq->q[i]; > > prefetch(xdpf); > } > > - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); > + if (bq->xdp_prog) { > + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); > + if (!to_send) { > + sent = 0; > + goto out; > + } > + drops = cnt - to_send; > + } I might be missing something about how *bq works here. What happens when dev_map_bpf_prog_run returns to_send < cnt? So I read this as it will send [0, to_send] and [to_send, cnt] will be dropped? How do we know the bpf prog would have dropped the set, [to_send+1, cnt]? > + > + sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags); > if (sent < 0) { > err = sent; > sent = 0; > - goto error; > + > + /* If ndo_xdp_xmit fails with an errno, no frames have been > + * xmit'ed and it's our responsibility to them free all. > + */ > + for (i = 0; i < cnt - drops; i++) { > + struct xdp_frame *xdpf = bq->q[i]; > + > + xdp_return_frame_rx_napi(xdpf); > + } > } > - drops = bq->count - sent; > out: > + drops = cnt - sent; > bq->count = 0; > > trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); I gather the remaining [to_send+1, cnt] packets are in fact dropped because we set bq->count=0 and trace_xdp_devmap_xmit seems to think they are dropped. Also update the comment in trace_xdp_devmap_xmit, /* Catch API error of drv ndo_xdp_xmit sent more than count */ so that it reads to account for devmap progs as well? > - bq->dev_rx = NULL; > - __list_del_clearprev(&bq->flush_node); > return; > -error: > - /* If ndo_xdp_xmit fails with an errno, no frames have been > - * xmit'ed and it's our responsibility to them free all. > - */ > - for (i = 0; i < bq->count; i++) { > - struct xdp_frame *xdpf = bq->q[i]; > - > - xdp_return_frame_rx_napi(xdpf); > - drops++; > - } > - goto out; > } [...] > -static struct xdp_buff *dev_map_run_prog(struct net_device *dev, > - struct xdp_buff *xdp, > - struct bpf_prog *xdp_prog) > -{ > - struct xdp_txq_info txq = { .dev = dev }; > - u32 act; > - > - xdp_set_data_meta_invalid(xdp); > - xdp->txq = &txq; > - > - act = bpf_prog_run_xdp(xdp_prog, xdp); > - switch (act) { > - case XDP_PASS: > - return xdp; > - case XDP_DROP: > - break; > - default: > - bpf_warn_invalid_xdp_action(act); > - fallthrough; > - case XDP_ABORTED: > - trace_xdp_exception(dev, xdp_prog, act); > - break; > - } > - > - xdp_return_buff(xdp); > - return NULL; > -} > - > int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, > struct net_device *dev_rx) > { > - return __xdp_enqueue(dev, xdp, dev_rx); > + return __xdp_enqueue(dev, xdp, dev_rx, NULL); > } > > int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > @@ -489,12 +516,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > { > struct net_device *dev = dst->dev; > > - if (dst->xdp_prog) { > - xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); > - if (!xdp) > - return 0; So here it looks like dev_map_run_prog will not drop extra packets, but will see every single packet. Are we changing the semantics subtle here? This looks like a problem to me. We should not drop packets in the new case unless bpf program tells us to. > - } > - return __xdp_enqueue(dev, xdp, dev_rx); > + return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); > } Did I miss something? If not maybe a comment in the commit message would help or in the code itself. Thanks, John
John Fastabend <john.fastabend@gmail.com> writes: > Hangbin Liu wrote: >> From: Jesper Dangaard Brouer <brouer@redhat.com> >> >> This changes the devmap XDP program support to run the program when the >> bulk queue is flushed instead of before the frame is enqueued. This has >> a couple of benefits: >> >> - It "sorts" the packets by destination devmap entry, and then runs the >> same BPF program on all the packets in sequence. This ensures that we >> keep the XDP program and destination device properties hot in I-cache. >> >> - It makes the multicast implementation simpler because it can just >> enqueue packets using bq_enqueue() without having to deal with the >> devmap program at all. >> >> The drawback is that if the devmap program drops the packet, the enqueue >> step is redundant. However, arguably this is mostly visible in a >> micro-benchmark, and with more mixed traffic the I-cache benefit should >> win out. The performance impact of just this patch is as follows: >> >> The bq_xmit_all's logic is also refactored and error label is removed. >> When bq_xmit_all() is called from bq_enqueue(), another packet will >> always be enqueued immediately after, so clearing dev_rx, xdp_prog and >> flush_node in bq_xmit_all() is redundant. Let's move the clear to >> __dev_flush(), and only check them once in bq_enqueue() since they are >> all modified together. >> >> By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd: >> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64 >> >> There are about +/- 0.1M deviation for native testing, the performance >> improved for the base-case, but some drop back with xdp devmap prog attached. >> >> Version | Test | Generic | Native | Native + 2nd xdp_prog >> 5.10 rc6 | xdp_redirect_map i40e->i40e | 2.0M | 9.1M | 8.0M >> 5.10 rc6 | xdp_redirect_map i40e->veth | 1.7M | 11.0M | 9.7M >> 5.10 rc6 + patch | xdp_redirect_map i40e->i40e | 2.0M | 9.5M | 7.5M >> 5.10 rc6 + patch | xdp_redirect_map i40e->veth | 1.7M | 11.6M | 9.1M >> > > [...] > >> +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, >> + struct xdp_frame **frames, int n, >> + struct net_device *dev) >> +{ >> + struct xdp_txq_info txq = { .dev = dev }; >> + struct xdp_buff xdp; >> + int i, nframes = 0; >> + >> + for (i = 0; i < n; i++) { >> + struct xdp_frame *xdpf = frames[i]; >> + u32 act; >> + int err; >> + >> + xdp_convert_frame_to_buff(xdpf, &xdp); >> + xdp.txq = &txq; >> + >> + act = bpf_prog_run_xdp(xdp_prog, &xdp); >> + switch (act) { >> + case XDP_PASS: >> + err = xdp_update_frame_from_buff(&xdp, xdpf); >> + if (unlikely(err < 0)) >> + xdp_return_frame_rx_napi(xdpf); >> + else >> + frames[nframes++] = xdpf; >> + break; >> + default: >> + bpf_warn_invalid_xdp_action(act); >> + fallthrough; >> + case XDP_ABORTED: >> + trace_xdp_exception(dev, xdp_prog, act); >> + fallthrough; >> + case XDP_DROP: >> + xdp_return_frame_rx_napi(xdpf); >> + break; >> + } >> + } >> + return nframes; /* sent frames count */ >> +} >> + >> static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) >> { >> struct net_device *dev = bq->dev; >> - int sent = 0, drops = 0, err = 0; >> + unsigned int cnt = bq->count; >> + int drops = 0, err = 0; >> + int to_send = cnt; >> + int sent = cnt; >> int i; >> >> - if (unlikely(!bq->count)) >> + if (unlikely(!cnt)) >> return; >> >> - for (i = 0; i < bq->count; i++) { >> + for (i = 0; i < cnt; i++) { >> struct xdp_frame *xdpf = bq->q[i]; >> >> prefetch(xdpf); >> } >> >> - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); >> + if (bq->xdp_prog) { >> + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); >> + if (!to_send) { >> + sent = 0; >> + goto out; >> + } >> + drops = cnt - to_send; >> + } > > I might be missing something about how *bq works here. What happens when > dev_map_bpf_prog_run returns to_send < cnt? > > So I read this as it will send [0, to_send] and [to_send, cnt] will be > dropped? How do we know the bpf prog would have dropped the set, > [to_send+1, cnt]? Because dev_map_bpf_prog_run() compacts the array: + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (unlikely(err < 0)) + xdp_return_frame_rx_napi(xdpf); + else + frames[nframes++] = xdpf; + break; [...] >> int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, >> @@ -489,12 +516,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, >> { >> struct net_device *dev = dst->dev; >> >> - if (dst->xdp_prog) { >> - xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); >> - if (!xdp) >> - return 0; > > So here it looks like dev_map_run_prog will not drop extra > packets, but will see every single packet. > > Are we changing the semantics subtle here? This looks like > a problem to me. We should not drop packets in the new case > unless bpf program tells us to. It's not a change in semantics (see above), but I'll grant you that it's subtle :) -Toke
On Wed, Jan 27, 2021 at 10:41:44AM +0100, Toke Høiland-Jørgensen wrote: > John Fastabend <john.fastabend@gmail.com> writes: > > > Hangbin Liu wrote: > >> From: Jesper Dangaard Brouer <brouer@redhat.com> > >> > >> This changes the devmap XDP program support to run the program when the > >> bulk queue is flushed instead of before the frame is enqueued. This has > >> a couple of benefits: > >> > >> - It "sorts" the packets by destination devmap entry, and then runs the > >> same BPF program on all the packets in sequence. This ensures that we > >> keep the XDP program and destination device properties hot in I-cache. > >> > >> - It makes the multicast implementation simpler because it can just > >> enqueue packets using bq_enqueue() without having to deal with the > >> devmap program at all. > >> > >> The drawback is that if the devmap program drops the packet, the enqueue > >> step is redundant. However, arguably this is mostly visible in a > >> micro-benchmark, and with more mixed traffic the I-cache benefit should > >> win out. The performance impact of just this patch is as follows: > >> > >> The bq_xmit_all's logic is also refactored and error label is removed. > >> When bq_xmit_all() is called from bq_enqueue(), another packet will > >> always be enqueued immediately after, so clearing dev_rx, xdp_prog and > >> flush_node in bq_xmit_all() is redundant. Let's move the clear to > >> __dev_flush(), and only check them once in bq_enqueue() since they are > >> all modified together. > >> > >> By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd: > >> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64 > >> > >> There are about +/- 0.1M deviation for native testing, the performance > >> improved for the base-case, but some drop back with xdp devmap prog attached. > >> > >> Version | Test | Generic | Native | Native + 2nd xdp_prog > >> 5.10 rc6 | xdp_redirect_map i40e->i40e | 2.0M | 9.1M | 8.0M > >> 5.10 rc6 | xdp_redirect_map i40e->veth | 1.7M | 11.0M | 9.7M > >> 5.10 rc6 + patch | xdp_redirect_map i40e->i40e | 2.0M | 9.5M | 7.5M > >> 5.10 rc6 + patch | xdp_redirect_map i40e->veth | 1.7M | 11.6M | 9.1M > >> > > > > [...] > > > >> +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, > >> + struct xdp_frame **frames, int n, > >> + struct net_device *dev) > >> +{ > >> + struct xdp_txq_info txq = { .dev = dev }; > >> + struct xdp_buff xdp; > >> + int i, nframes = 0; > >> + > >> + for (i = 0; i < n; i++) { > >> + struct xdp_frame *xdpf = frames[i]; > >> + u32 act; > >> + int err; > >> + > >> + xdp_convert_frame_to_buff(xdpf, &xdp); > >> + xdp.txq = &txq; > >> + > >> + act = bpf_prog_run_xdp(xdp_prog, &xdp); > >> + switch (act) { > >> + case XDP_PASS: > >> + err = xdp_update_frame_from_buff(&xdp, xdpf); > >> + if (unlikely(err < 0)) > >> + xdp_return_frame_rx_napi(xdpf); > >> + else > >> + frames[nframes++] = xdpf; > >> + break; > >> + default: > >> + bpf_warn_invalid_xdp_action(act); > >> + fallthrough; > >> + case XDP_ABORTED: > >> + trace_xdp_exception(dev, xdp_prog, act); > >> + fallthrough; > >> + case XDP_DROP: > >> + xdp_return_frame_rx_napi(xdpf); > >> + break; > >> + } > >> + } > >> + return nframes; /* sent frames count */ > >> +} > >> + > >> static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) > >> { > >> struct net_device *dev = bq->dev; > >> - int sent = 0, drops = 0, err = 0; > >> + unsigned int cnt = bq->count; > >> + int drops = 0, err = 0; > >> + int to_send = cnt; > >> + int sent = cnt; > >> int i; > >> > >> - if (unlikely(!bq->count)) > >> + if (unlikely(!cnt)) > >> return; > >> > >> - for (i = 0; i < bq->count; i++) { > >> + for (i = 0; i < cnt; i++) { > >> struct xdp_frame *xdpf = bq->q[i]; > >> > >> prefetch(xdpf); > >> } > >> > >> - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); > >> + if (bq->xdp_prog) { > >> + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); > >> + if (!to_send) { > >> + sent = 0; > >> + goto out; > >> + } > >> + drops = cnt - to_send; > >> + } > > > > I might be missing something about how *bq works here. What happens when > > dev_map_bpf_prog_run returns to_send < cnt? > > > > So I read this as it will send [0, to_send] and [to_send, cnt] will be > > dropped? How do we know the bpf prog would have dropped the set, > > [to_send+1, cnt]? You know that via recalculation of 'drops' value after you returned from dev_map_bpf_prog_run() which later on is provided onto trace_xdp_devmap_xmit. > > Because dev_map_bpf_prog_run() compacts the array: > > + case XDP_PASS: > + err = xdp_update_frame_from_buff(&xdp, xdpf); > + if (unlikely(err < 0)) > + xdp_return_frame_rx_napi(xdpf); > + else > + frames[nframes++] = xdpf; > + break; To expand this a little, 'frames' array is reused and 'nframes' above is the value that is returned and we store it onto 'to_send' variable. > > [...] > > >> int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > >> @@ -489,12 +516,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > >> { > >> struct net_device *dev = dst->dev; > >> > >> - if (dst->xdp_prog) { > >> - xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); > >> - if (!xdp) > >> - return 0; > > > > So here it looks like dev_map_run_prog will not drop extra > > packets, but will see every single packet. > > > > Are we changing the semantics subtle here? This looks like > > a problem to me. We should not drop packets in the new case > > unless bpf program tells us to. > > It's not a change in semantics (see above), but I'll grant you that it's > subtle :) dev map xdp prog still sees all of the frames. Maybe you were referring to a fact that for XDP_PASS action you might fail with xdp->xdpf conversion? I'm wondering if we could actually do a further optimization and avoid xdpf/xdp juggling. What if xdp_dev_bulk_queue would be storing the xdp_buffs instead of xdp_frames ? Then you hit bq_xmit_all and if prog is present it doesn't have to do that dance like we have right now. After that you walk through xdp_buff array and do the conversion so that xdp_frame array will be passed do ndo_xdp_xmit. I had a bad sleep so maybe I'm talking nonsense over here, will take another look in the evening though :) > > -Toke >
On Wed, 27 Jan 2021 13:20:50 +0100 Maciej Fijalkowski <maciej.fijalkowski@intel.com> wrote: > On Wed, Jan 27, 2021 at 10:41:44AM +0100, Toke Høiland-Jørgensen wrote: > > John Fastabend <john.fastabend@gmail.com> writes: > > > > > Hangbin Liu wrote: > > >> From: Jesper Dangaard Brouer <brouer@redhat.com> > > >> > > >> This changes the devmap XDP program support to run the program when the > > >> bulk queue is flushed instead of before the frame is enqueued. This has > > >> a couple of benefits: > > >> > > >> - It "sorts" the packets by destination devmap entry, and then runs the > > >> same BPF program on all the packets in sequence. This ensures that we > > >> keep the XDP program and destination device properties hot in I-cache. > > >> > > >> - It makes the multicast implementation simpler because it can just > > >> enqueue packets using bq_enqueue() without having to deal with the > > >> devmap program at all. > > >> > > >> The drawback is that if the devmap program drops the packet, the enqueue > > >> step is redundant. However, arguably this is mostly visible in a > > >> micro-benchmark, and with more mixed traffic the I-cache benefit should > > >> win out. The performance impact of just this patch is as follows: > > >> > > >> The bq_xmit_all's logic is also refactored and error label is removed. > > >> When bq_xmit_all() is called from bq_enqueue(), another packet will > > >> always be enqueued immediately after, so clearing dev_rx, xdp_prog and > > >> flush_node in bq_xmit_all() is redundant. Let's move the clear to > > >> __dev_flush(), and only check them once in bq_enqueue() since they are > > >> all modified together. > > >> > > >> By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd: > > >> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64 > > >> > > >> There are about +/- 0.1M deviation for native testing, the performance > > >> improved for the base-case, but some drop back with xdp devmap prog attached. > > >> > > >> Version | Test | Generic | Native | Native + 2nd xdp_prog > > >> 5.10 rc6 | xdp_redirect_map i40e->i40e | 2.0M | 9.1M | 8.0M > > >> 5.10 rc6 | xdp_redirect_map i40e->veth | 1.7M | 11.0M | 9.7M > > >> 5.10 rc6 + patch | xdp_redirect_map i40e->i40e | 2.0M | 9.5M | 7.5M > > >> 5.10 rc6 + patch | xdp_redirect_map i40e->veth | 1.7M | 11.6M | 9.1M > > >> > > > > > > [...] > > > > > >> +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, > > >> + struct xdp_frame **frames, int n, > > >> + struct net_device *dev) > > >> +{ > > >> + struct xdp_txq_info txq = { .dev = dev }; > > >> + struct xdp_buff xdp; > > >> + int i, nframes = 0; > > >> + > > >> + for (i = 0; i < n; i++) { > > >> + struct xdp_frame *xdpf = frames[i]; > > >> + u32 act; > > >> + int err; > > >> + > > >> + xdp_convert_frame_to_buff(xdpf, &xdp); > > >> + xdp.txq = &txq; > > >> + > > >> + act = bpf_prog_run_xdp(xdp_prog, &xdp); > > >> + switch (act) { > > >> + case XDP_PASS: > > >> + err = xdp_update_frame_from_buff(&xdp, xdpf); > > >> + if (unlikely(err < 0)) > > >> + xdp_return_frame_rx_napi(xdpf); > > >> + else > > >> + frames[nframes++] = xdpf; > > >> + break; > > >> + default: > > >> + bpf_warn_invalid_xdp_action(act); > > >> + fallthrough; > > >> + case XDP_ABORTED: > > >> + trace_xdp_exception(dev, xdp_prog, act); > > >> + fallthrough; > > >> + case XDP_DROP: > > >> + xdp_return_frame_rx_napi(xdpf); > > >> + break; > > >> + } > > >> + } > > >> + return nframes; /* sent frames count */ > > >> +} > > >> + > > >> static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) > > >> { > > >> struct net_device *dev = bq->dev; > > >> - int sent = 0, drops = 0, err = 0; > > >> + unsigned int cnt = bq->count; > > >> + int drops = 0, err = 0; > > >> + int to_send = cnt; > > >> + int sent = cnt; > > >> int i; > > >> > > >> - if (unlikely(!bq->count)) > > >> + if (unlikely(!cnt)) > > >> return; > > >> > > >> - for (i = 0; i < bq->count; i++) { > > >> + for (i = 0; i < cnt; i++) { > > >> struct xdp_frame *xdpf = bq->q[i]; > > >> > > >> prefetch(xdpf); > > >> } > > >> > > >> - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); > > >> + if (bq->xdp_prog) { > > >> + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); > > >> + if (!to_send) { > > >> + sent = 0; > > >> + goto out; > > >> + } > > >> + drops = cnt - to_send; > > >> + } > > > > > > I might be missing something about how *bq works here. What happens when > > > dev_map_bpf_prog_run returns to_send < cnt? > > > > > > So I read this as it will send [0, to_send] and [to_send, cnt] will be > > > dropped? How do we know the bpf prog would have dropped the set, > > > [to_send+1, cnt]? > > You know that via recalculation of 'drops' value after you returned from > dev_map_bpf_prog_run() which later on is provided onto trace_xdp_devmap_xmit. > > > > > Because dev_map_bpf_prog_run() compacts the array: > > > > + case XDP_PASS: > > + err = xdp_update_frame_from_buff(&xdp, xdpf); > > + if (unlikely(err < 0)) > > + xdp_return_frame_rx_napi(xdpf); > > + else > > + frames[nframes++] = xdpf; > > + break; > > To expand this a little, 'frames' array is reused and 'nframes' above is > the value that is returned and we store it onto 'to_send' variable. > > > > > [...] > > > > >> int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > > >> @@ -489,12 +516,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > > >> { > > >> struct net_device *dev = dst->dev; > > >> > > >> - if (dst->xdp_prog) { > > >> - xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); > > >> - if (!xdp) > > >> - return 0; > > > > > > So here it looks like dev_map_run_prog will not drop extra > > > packets, but will see every single packet. > > > > > > Are we changing the semantics subtle here? This looks like > > > a problem to me. We should not drop packets in the new case > > > unless bpf program tells us to. > > > > It's not a change in semantics (see above), but I'll grant you that it's > > subtle :) > > dev map xdp prog still sees all of the frames. > > Maybe you were referring to a fact that for XDP_PASS action you might fail > with xdp->xdpf conversion? > > I'm wondering if we could actually do a further optimization and avoid > xdpf/xdp juggling. > > What if xdp_dev_bulk_queue would be storing the xdp_buffs instead of > xdp_frames ? Not possible. Remember that struct xdp_buff is "allocated" on the call stack. Thus, you cannot store a pointer to the xdp_buffs in xdp_dev_bulk_queue. The xdp_frame also avoids allocation, via using memory placed in top of data-frame. Thus, you can store a pointer to the xdp_frame, as it is actually backed by real memory. See[1] slide-11 ("Fundamental structs") > Then you hit bq_xmit_all and if prog is present it doesn't have to do that > dance like we have right now. After that you walk through xdp_buff array > and do the conversion so that xdp_frame array will be passed do > ndo_xdp_xmit. If you want to performance optimize this, I suggest that we detect if we need to call xdp_update_frame_from_buff(&xdp, xdpf) after the 2nd XDP-prog ran. In many case the BPF-prog don't move head/tail/metadata, so that call becomes unnecessary. > I had a bad sleep so maybe I'm talking nonsense over here, will take > another look in the evening though :) :) [1] https://people.netfilter.org/hawk/presentations/KernelRecipes2019/xdp-netstack-concert.pdf -- Best regards, Jesper Dangaard Brouer MSc.CS, Principal Kernel Engineer at Red Hat LinkedIn: http://www.linkedin.com/in/brouer
Jesper Dangaard Brouer wrote: > On Wed, 27 Jan 2021 13:20:50 +0100 > Maciej Fijalkowski <maciej.fijalkowski@intel.com> wrote: > > > On Wed, Jan 27, 2021 at 10:41:44AM +0100, Toke Høiland-Jørgensen wrote: > > > John Fastabend <john.fastabend@gmail.com> writes: > > > > > > > Hangbin Liu wrote: > > > >> From: Jesper Dangaard Brouer <brouer@redhat.com> > > > >> > > > >> This changes the devmap XDP program support to run the program when the > > > >> bulk queue is flushed instead of before the frame is enqueued. This has > > > >> a couple of benefits: > > > >> > > > >> - It "sorts" the packets by destination devmap entry, and then runs the > > > >> same BPF program on all the packets in sequence. This ensures that we > > > >> keep the XDP program and destination device properties hot in I-cache. > > > >> > > > >> - It makes the multicast implementation simpler because it can just > > > >> enqueue packets using bq_enqueue() without having to deal with the > > > >> devmap program at all. > > > >> > > > >> The drawback is that if the devmap program drops the packet, the enqueue > > > >> step is redundant. However, arguably this is mostly visible in a > > > >> micro-benchmark, and with more mixed traffic the I-cache benefit should > > > >> win out. The performance impact of just this patch is as follows: > > > >> > > > >> The bq_xmit_all's logic is also refactored and error label is removed. > > > >> When bq_xmit_all() is called from bq_enqueue(), another packet will > > > >> always be enqueued immediately after, so clearing dev_rx, xdp_prog and > > > >> flush_node in bq_xmit_all() is redundant. Let's move the clear to > > > >> __dev_flush(), and only check them once in bq_enqueue() since they are > > > >> all modified together. > > > >> > > > >> By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd: > > > >> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64 > > > >> > > > >> There are about +/- 0.1M deviation for native testing, the performance > > > >> improved for the base-case, but some drop back with xdp devmap prog attached. > > > >> > > > >> Version | Test | Generic | Native | Native + 2nd xdp_prog > > > >> 5.10 rc6 | xdp_redirect_map i40e->i40e | 2.0M | 9.1M | 8.0M > > > >> 5.10 rc6 | xdp_redirect_map i40e->veth | 1.7M | 11.0M | 9.7M > > > >> 5.10 rc6 + patch | xdp_redirect_map i40e->i40e | 2.0M | 9.5M | 7.5M > > > >> 5.10 rc6 + patch | xdp_redirect_map i40e->veth | 1.7M | 11.6M | 9.1M > > > >> > > > > > > > > [...] Acked-by: John Fastabend <john.fastabend@gmail.com> > > > >> static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) > > > >> { > > > >> struct net_device *dev = bq->dev; > > > >> - int sent = 0, drops = 0, err = 0; > > > >> + unsigned int cnt = bq->count; > > > >> + int drops = 0, err = 0; > > > >> + int to_send = cnt; > > > >> + int sent = cnt; > > > >> int i; > > > >> > > > >> - if (unlikely(!bq->count)) > > > >> + if (unlikely(!cnt)) > > > >> return; > > > >> > > > >> - for (i = 0; i < bq->count; i++) { > > > >> + for (i = 0; i < cnt; i++) { > > > >> struct xdp_frame *xdpf = bq->q[i]; > > > >> > > > >> prefetch(xdpf); > > > >> } > > > >> > > > >> - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); > > > >> + if (bq->xdp_prog) { > > > >> + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); > > > >> + if (!to_send) { > > > >> + sent = 0; > > > >> + goto out; > > > >> + } > > > >> + drops = cnt - to_send; > > > >> + } > > > > > > > > I might be missing something about how *bq works here. What happens when > > > > dev_map_bpf_prog_run returns to_send < cnt? > > > > > > > > So I read this as it will send [0, to_send] and [to_send, cnt] will be > > > > dropped? How do we know the bpf prog would have dropped the set, > > > > [to_send+1, cnt]? > > > > You know that via recalculation of 'drops' value after you returned from > > dev_map_bpf_prog_run() which later on is provided onto trace_xdp_devmap_xmit. > > > > > > > > Because dev_map_bpf_prog_run() compacts the array: > > > > > > + case XDP_PASS: > > > + err = xdp_update_frame_from_buff(&xdp, xdpf); > > > + if (unlikely(err < 0)) > > > + xdp_return_frame_rx_napi(xdpf); > > > + else > > > + frames[nframes++] = xdpf; > > > + break; > > > > To expand this a little, 'frames' array is reused and 'nframes' above is > > the value that is returned and we store it onto 'to_send' variable. > > In the morning with coffee looks good to me. Thanks Toke, Jesper.
On Mon, 25 Jan 2021 20:45:11 +0800 Hangbin Liu <liuhangbin@gmail.com> wrote: > By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd: > ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64 > > There are about +/- 0.1M deviation for native testing, the performance > improved for the base-case, but some drop back with xdp devmap prog attached. > > Version | Test | Generic | Native | Native + 2nd xdp_prog > 5.10 rc6 | xdp_redirect_map i40e->i40e | 2.0M | 9.1M | 8.0M > 5.10 rc6 | xdp_redirect_map i40e->veth | 1.7M | 11.0M | 9.7M > 5.10 rc6 + patch | xdp_redirect_map i40e->i40e | 2.0M | 9.5M | 7.5M > 5.10 rc6 + patch | xdp_redirect_map i40e->veth | 1.7M | 11.6M | 9.1M I want to highlight the improvement 9.1M -> 9.5M. This is the native (40e->i40e) case where the isn't any "2nd xdp_prog". This means that when we introduced the "2nd xdp_prog", we lost a little performance without noticing (death-by-a-1000-paper-cuts), for the baseline case where this feature is not used/activated. This patch regains that performance for our baseline. That in itself make this patch worth it. -- Best regards, Jesper Dangaard Brouer MSc.CS, Principal Kernel Engineer at Red Hat LinkedIn: http://www.linkedin.com/in/brouer
John Fastabend <john.fastabend@gmail.com> writes: > Jesper Dangaard Brouer wrote: >> On Wed, 27 Jan 2021 13:20:50 +0100 >> Maciej Fijalkowski <maciej.fijalkowski@intel.com> wrote: >> >> > On Wed, Jan 27, 2021 at 10:41:44AM +0100, Toke Høiland-Jørgensen wrote: >> > > John Fastabend <john.fastabend@gmail.com> writes: >> > > >> > > > Hangbin Liu wrote: >> > > >> From: Jesper Dangaard Brouer <brouer@redhat.com> >> > > >> >> > > >> This changes the devmap XDP program support to run the program when the >> > > >> bulk queue is flushed instead of before the frame is enqueued. This has >> > > >> a couple of benefits: >> > > >> >> > > >> - It "sorts" the packets by destination devmap entry, and then runs the >> > > >> same BPF program on all the packets in sequence. This ensures that we >> > > >> keep the XDP program and destination device properties hot in I-cache. >> > > >> >> > > >> - It makes the multicast implementation simpler because it can just >> > > >> enqueue packets using bq_enqueue() without having to deal with the >> > > >> devmap program at all. >> > > >> >> > > >> The drawback is that if the devmap program drops the packet, the enqueue >> > > >> step is redundant. However, arguably this is mostly visible in a >> > > >> micro-benchmark, and with more mixed traffic the I-cache benefit should >> > > >> win out. The performance impact of just this patch is as follows: >> > > >> >> > > >> The bq_xmit_all's logic is also refactored and error label is removed. >> > > >> When bq_xmit_all() is called from bq_enqueue(), another packet will >> > > >> always be enqueued immediately after, so clearing dev_rx, xdp_prog and >> > > >> flush_node in bq_xmit_all() is redundant. Let's move the clear to >> > > >> __dev_flush(), and only check them once in bq_enqueue() since they are >> > > >> all modified together. >> > > >> >> > > >> By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd: >> > > >> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64 >> > > >> >> > > >> There are about +/- 0.1M deviation for native testing, the performance >> > > >> improved for the base-case, but some drop back with xdp devmap prog attached. >> > > >> >> > > >> Version | Test | Generic | Native | Native + 2nd xdp_prog >> > > >> 5.10 rc6 | xdp_redirect_map i40e->i40e | 2.0M | 9.1M | 8.0M >> > > >> 5.10 rc6 | xdp_redirect_map i40e->veth | 1.7M | 11.0M | 9.7M >> > > >> 5.10 rc6 + patch | xdp_redirect_map i40e->i40e | 2.0M | 9.5M | 7.5M >> > > >> 5.10 rc6 + patch | xdp_redirect_map i40e->veth | 1.7M | 11.6M | 9.1M >> > > >> >> > > > >> > > > [...] > > Acked-by: John Fastabend <john.fastabend@gmail.com> > >> > > >> static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) >> > > >> { >> > > >> struct net_device *dev = bq->dev; >> > > >> - int sent = 0, drops = 0, err = 0; >> > > >> + unsigned int cnt = bq->count; >> > > >> + int drops = 0, err = 0; >> > > >> + int to_send = cnt; >> > > >> + int sent = cnt; >> > > >> int i; >> > > >> >> > > >> - if (unlikely(!bq->count)) >> > > >> + if (unlikely(!cnt)) >> > > >> return; >> > > >> >> > > >> - for (i = 0; i < bq->count; i++) { >> > > >> + for (i = 0; i < cnt; i++) { >> > > >> struct xdp_frame *xdpf = bq->q[i]; >> > > >> >> > > >> prefetch(xdpf); >> > > >> } >> > > >> >> > > >> - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); >> > > >> + if (bq->xdp_prog) { >> > > >> + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); >> > > >> + if (!to_send) { >> > > >> + sent = 0; >> > > >> + goto out; >> > > >> + } >> > > >> + drops = cnt - to_send; >> > > >> + } >> > > > >> > > > I might be missing something about how *bq works here. What happens when >> > > > dev_map_bpf_prog_run returns to_send < cnt? >> > > > >> > > > So I read this as it will send [0, to_send] and [to_send, cnt] will be >> > > > dropped? How do we know the bpf prog would have dropped the set, >> > > > [to_send+1, cnt]? >> > >> > You know that via recalculation of 'drops' value after you returned from >> > dev_map_bpf_prog_run() which later on is provided onto trace_xdp_devmap_xmit. >> > >> > > >> > > Because dev_map_bpf_prog_run() compacts the array: >> > > >> > > + case XDP_PASS: >> > > + err = xdp_update_frame_from_buff(&xdp, xdpf); >> > > + if (unlikely(err < 0)) >> > > + xdp_return_frame_rx_napi(xdpf); >> > > + else >> > > + frames[nframes++] = xdpf; >> > > + break; >> > >> > To expand this a little, 'frames' array is reused and 'nframes' above is >> > the value that is returned and we store it onto 'to_send' variable. >> > > > In the morning with coffee looks good to me. Thanks Toke, Jesper. Haha, yeah, coffee does tend to help, doesn't it? You're welcome :) -Toke
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f6e9c68afdd4..bf8b6b5c9cab 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -57,6 +57,7 @@ struct xdp_dev_bulk_queue { struct list_head flush_node; struct net_device *dev; struct net_device *dev_rx; + struct bpf_prog *xdp_prog; unsigned int count; }; @@ -327,46 +328,92 @@ bool dev_map_can_have_prog(struct bpf_map *map) return false; } +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, + struct xdp_frame **frames, int n, + struct net_device *dev) +{ + struct xdp_txq_info txq = { .dev = dev }; + struct xdp_buff xdp; + int i, nframes = 0; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + u32 act; + int err; + + xdp_convert_frame_to_buff(xdpf, &xdp); + xdp.txq = &txq; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + switch (act) { + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (unlikely(err < 0)) + xdp_return_frame_rx_napi(xdpf); + else + frames[nframes++] = xdpf; + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + fallthrough; + case XDP_DROP: + xdp_return_frame_rx_napi(xdpf); + break; + } + } + return nframes; /* sent frames count */ +} + static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; - int sent = 0, drops = 0, err = 0; + unsigned int cnt = bq->count; + int drops = 0, err = 0; + int to_send = cnt; + int sent = cnt; int i; - if (unlikely(!bq->count)) + if (unlikely(!cnt)) return; - for (i = 0; i < bq->count; i++) { + for (i = 0; i < cnt; i++) { struct xdp_frame *xdpf = bq->q[i]; prefetch(xdpf); } - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); + if (bq->xdp_prog) { + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); + if (!to_send) { + sent = 0; + goto out; + } + drops = cnt - to_send; + } + + sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags); if (sent < 0) { err = sent; sent = 0; - goto error; + + /* If ndo_xdp_xmit fails with an errno, no frames have been + * xmit'ed and it's our responsibility to them free all. + */ + for (i = 0; i < cnt - drops; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + xdp_return_frame_rx_napi(xdpf); + } } - drops = bq->count - sent; out: + drops = cnt - sent; bq->count = 0; trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); - bq->dev_rx = NULL; - __list_del_clearprev(&bq->flush_node); return; -error: - /* If ndo_xdp_xmit fails with an errno, no frames have been - * xmit'ed and it's our responsibility to them free all. - */ - for (i = 0; i < bq->count; i++) { - struct xdp_frame *xdpf = bq->q[i]; - - xdp_return_frame_rx_napi(xdpf); - drops++; - } - goto out; } /* __dev_flush is called from xdp_do_flush() which _must_ be signaled @@ -384,8 +431,12 @@ void __dev_flush(void) struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; - list_for_each_entry_safe(bq, tmp, flush_list, flush_node) + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { bq_xmit_all(bq, XDP_XMIT_FLUSH); + bq->dev_rx = NULL; + bq->xdp_prog = NULL; + __list_del_clearprev(&bq->flush_node); + } } /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or @@ -408,7 +459,7 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) * Thus, safe percpu variable access. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, - struct net_device *dev_rx) + struct net_device *dev_rx, struct bpf_prog *xdp_prog) { struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); @@ -419,18 +470,22 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed * from net_device drivers NAPI func end. + * + * Do the same with xdp_prog and flush_list since these fields + * are only ever modified together. */ - if (!bq->dev_rx) + if (!bq->dev_rx) { bq->dev_rx = dev_rx; + bq->xdp_prog = xdp_prog; + list_add(&bq->flush_node, flush_list); + } bq->q[bq->count++] = xdpf; - - if (!bq->flush_node.prev) - list_add(&bq->flush_node, flush_list); } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, - struct net_device *dev_rx) + struct net_device *dev_rx, + struct bpf_prog *xdp_prog) { struct xdp_frame *xdpf; int err; @@ -446,42 +501,14 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, if (unlikely(!xdpf)) return -EOVERFLOW; - bq_enqueue(dev, xdpf, dev_rx); + bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } -static struct xdp_buff *dev_map_run_prog(struct net_device *dev, - struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) -{ - struct xdp_txq_info txq = { .dev = dev }; - u32 act; - - xdp_set_data_meta_invalid(xdp); - xdp->txq = &txq; - - act = bpf_prog_run_xdp(xdp_prog, xdp); - switch (act) { - case XDP_PASS: - return xdp; - case XDP_DROP: - break; - default: - bpf_warn_invalid_xdp_action(act); - fallthrough; - case XDP_ABORTED: - trace_xdp_exception(dev, xdp_prog, act); - break; - } - - xdp_return_buff(xdp); - return NULL; -} - int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { - return __xdp_enqueue(dev, xdp, dev_rx); + return __xdp_enqueue(dev, xdp, dev_rx, NULL); } int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, @@ -489,12 +516,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; - if (dst->xdp_prog) { - xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); - if (!xdp) - return 0; - } - return __xdp_enqueue(dev, xdp, dev_rx); + return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,