Message ID | cover.1601414174.git.daniel@iogearbox.net |
---|---|
Headers | show |
Series | Various BPF helper improvements | expand |
On Tue, Sep 29, 2020 at 11:23:03PM +0200, Daniel Borkmann wrote: [ ... ] > --- > include/linux/skbuff.h | 5 + > include/uapi/linux/bpf.h | 14 ++ > net/core/filter.c | 273 +++++++++++++++++++++++++++++++-- > tools/include/uapi/linux/bpf.h | 14 ++ > 4 files changed, 293 insertions(+), 13 deletions(-) > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 04a18e01b362..3d0cf3722bb4 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -2548,6 +2548,11 @@ static inline int skb_mac_header_was_set(const struct sk_buff *skb) > return skb->mac_header != (typeof(skb->mac_header))~0U; > } > > +static inline void skb_unset_mac_header(struct sk_buff *skb) > +{ > + skb->mac_header = (typeof(skb->mac_header))~0U; > +} > + > static inline void skb_reset_mac_header(struct sk_buff *skb) > { > skb->mac_header = skb->data - skb->head; > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 6116a7f54c8f..1f17c6752deb 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -3652,6 +3652,19 @@ union bpf_attr { > * associated socket instead of the current process. > * Return > * The id is returned or 0 in case the id could not be retrieved. > + * > + * long bpf_redirect_neigh(u32 ifindex, u64 flags) > + * Description > + * Redirect the packet to another net device of index *ifindex* > + * and fill in L2 addresses from neighboring subsystem. This helper > + * is somewhat similar to **bpf_redirect**\ (), except that it > + * fills in e.g. MAC addresses based on the L3 information from > + * the packet. This helper is supported for IPv4 and IPv6 protocols. > + * The *flags* argument is reserved and must be 0. The helper is > + * currently only supported for tc BPF program types. > + * Return > + * The helper returns **TC_ACT_REDIRECT** on success or > + * **TC_ACT_SHOT** on error. 
> */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -3806,6 +3819,7 @@ union bpf_attr { > FN(snprintf_btf), \ > FN(seq_printf_btf), \ > FN(skb_cgroup_classid), \ > + FN(redirect_neigh), \ > /* */ > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > diff --git a/net/core/filter.c b/net/core/filter.c > index a0776e48dcc9..14b1534f6b46 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -2163,6 +2163,222 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, > return __bpf_redirect_no_mac(skb, dev, flags); > } > > +#if IS_ENABLED(CONFIG_IPV6) > +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > +{ > + struct dst_entry *dst = skb_dst(skb); > + struct net_device *dev = dst->dev; > + u32 hh_len = LL_RESERVED_SPACE(dev); > + const struct in6_addr *nexthop; > + struct neighbour *neigh; > + > + if (dev_xmit_recursion()) > + goto out_rec; > + > + skb->dev = dev; > + skb->tstamp = 0; > + > + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { > + struct sk_buff *skb2; > + > + skb2 = skb_realloc_headroom(skb, hh_len); > + if (!skb2) { > + kfree_skb(skb); > + return -ENOMEM; > + } > + if (skb->sk) > + skb_set_owner_w(skb2, skb->sk); > + consume_skb(skb); > + skb = skb2; > + } > + > + rcu_read_lock_bh(); > + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), > + &ipv6_hdr(skb)->daddr); > + neigh = ip_neigh_gw6(dev, nexthop); > + if (likely(!IS_ERR(neigh))) { > + int ret; > + > + sock_confirm_neigh(skb, neigh); > + dev_xmit_recursion_inc(); > + ret = neigh_output(neigh, skb, false); > + dev_xmit_recursion_dec(); > + rcu_read_unlock_bh(); > + return ret; > + } > + rcu_read_unlock_bh(); > + IP6_INC_STATS(dev_net(dst->dev), > + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); > +out_drop: > + kfree_skb(skb); > + return -EINVAL; > +out_rec: > + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); > + goto out_drop; nit. 
maybe log this at the earlier "if (dev_xmit_recursion)" and then directly goto out_drop. > +} > + [ ... ] > +/* Internal, non-exposed redirect flags. */ > +enum { > + BPF_F_NEIGH = (1ULL << 1), > +}; It will be useful to ensure the future "flags" of BPF_FUNC_redirect will not overlap with this. Maybe a BUILD_BUG_ON? Others LGTM. Acked-by: Martin KaFai Lau <kafai@fb.com> > > int skb_do_redirect(struct sk_buff *skb) > { > struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > struct net_device *dev; > + u32 flags = ri->flags; > > dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); > ri->tgt_index = 0; > @@ -2231,7 +2440,22 @@ int skb_do_redirect(struct sk_buff *skb) > return -EINVAL; > } > > - return __bpf_redirect(skb, dev, ri->flags); > + return flags & BPF_F_NEIGH ? > + __bpf_redirect_neigh(skb, dev) : > + __bpf_redirect(skb, dev, flags); > +}
On 9/30/20 8:48 AM, Martin KaFai Lau wrote: > On Tue, Sep 29, 2020 at 11:23:03PM +0200, Daniel Borkmann wrote: [...] > >> +/* Internal, non-exposed redirect flags. */ >> +enum { >> + BPF_F_NEIGH = (1ULL << 1), >> +}; > It will be useful to ensure the future "flags" of BPF_FUNC_redirect > will not overlap with this. May be a BUILD_BUG_ON? I was thinking about this as well, but didn't go for it since typically this would mean that we need to add a mask of all flags for redirect helper in uapi right next to where we define BPF_F_INGRESS such that people don't forget to update the mask whenever they extend the flags there in order for the BUILD_BUG_ON() assertion to be actually effective (see also RTAX_FEATURE_MASK vs DST_FEATURE_MASK). If the mask sits in a different location, then developers might forget to update, it might slip through review (since not included in diff) and the build failure doesn't trigger. So far we have avoided to extend bpf uapi in such way. That was basically my rationale, another option could be to just add a comment in the enum right underneath BPF_F_INGRESS that the (1ULL << 1) bit is currently kernel-internal. > Others LGTM. > > Acked-by: Martin KaFai Lau <kafai@fb.com> Thanks!