Message ID | 160319106221.15822.2629789706666194966.stgit@toke.dk |
---|---|
State | New |
Headers | show |
Series | bpf: Rework bpf_redirect_neigh() to allow supplying nexthop from caller | expand |
On 10/20/20 12:51 PM, Toke Høiland-Jørgensen wrote: > From: Toke Høiland-Jørgensen <toke@redhat.com> [...] > BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) > @@ -2455,8 +2487,8 @@ int skb_do_redirect(struct sk_buff *skb) > return -EAGAIN; > } > return flags & BPF_F_NEIGH ? > - __bpf_redirect_neigh(skb, dev) : > - __bpf_redirect(skb, dev, flags); > + __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) : > + __bpf_redirect(skb, dev, flags); > out_drop: > kfree_skb(skb); > return -EINVAL; > @@ -2504,16 +2536,25 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { > .arg2_type = ARG_ANYTHING, > }; > > -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) > +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, > + int, plen, u64, flags) > { > struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > > - if (unlikely(flags)) > + if (unlikely((plen && plen < sizeof(*params)) || flags)) > + return TC_ACT_SHOT; > + > + if (unlikely(plen && (params->unused[0] || params->unused[1] || > + params->unused[2]))) small nit: maybe fold this into the prior check that already tests non-zero plen if (unlikely((plen && (plen < sizeof(*params) || (params->unused[0] | params->unused[1] | params->unused[2]))) || flags)) return TC_ACT_SHOT; > return TC_ACT_SHOT; > > - ri->flags = BPF_F_NEIGH; > + ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); > ri->tgt_index = ifindex; > > + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); > + if (plen) > + memcpy(&ri->nh, params, sizeof(ri->nh)); > + > return TC_ACT_REDIRECT; > } >
On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote: > diff --git a/include/linux/filter.h b/include/linux/filter.h > index 20fc24c9779a..ba9de7188cd0 100644 > --- a/include/linux/filter.h > +++ b/include/linux/filter.h > @@ -607,12 +607,21 @@ struct bpf_skb_data_end { > void *data_end; > }; > > +struct bpf_nh_params { > + u8 nh_family; > + union { > + __u32 ipv4_nh; > + struct in6_addr ipv6_nh; > + }; > +}; > @@ -4906,6 +4910,18 @@ struct bpf_fib_lookup { > __u8 dmac[6]; /* ETH_ALEN */ > }; > > +struct bpf_redir_neigh { > + /* network family for lookup (AF_INET, AF_INET6) */ > + __u8 nh_family; > + /* avoid hole in struct - must be set to 0 */ > + __u8 unused[3]; > + /* network address of nexthop; skips fib lookup to find gateway */ > + union { > + __be32 ipv4_nh; > + __u32 ipv6_nh[4]; /* in6_addr; network order */ > + }; > +}; Isn't this backward? The hole could be named in the internal structure. This is a bit of a gray area, but if you name this hole in uAPI and programs start referring to it you will never be able to reuse it. So you may as well not require it to be zeroed..
On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote: > +struct bpf_nh_params { > + u8 nh_family; > + union { > + __u32 ipv4_nh; > + struct in6_addr ipv6_nh; > + }; > +}; Folks, not directly related to this set, but there's a SRv6 patch going around which adds ifindex, otherwise nh can't be link local. I wonder if we want to consider this use case from the start (or the close approximation of start in this case ;)).
Jakub Kicinski <kuba@kernel.org> writes: > On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote: >> +struct bpf_nh_params { >> + u8 nh_family; >> + union { >> + __u32 ipv4_nh; >> + struct in6_addr ipv6_nh; >> + }; >> +}; > > Folks, not directly related to this set, but there's a SRv6 patch going > around which adds ifindex, otherwise nh can't be link local. > > I wonder if we want to consider this use case from the start (or the > close approximation of start in this case ;)). The ifindex is there, it's just in the function call signature instead of the struct... Or did you mean something different? -Toke
Jakub Kicinski <kuba@kernel.org> writes: > On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote: >> diff --git a/include/linux/filter.h b/include/linux/filter.h >> index 20fc24c9779a..ba9de7188cd0 100644 >> --- a/include/linux/filter.h >> +++ b/include/linux/filter.h >> @@ -607,12 +607,21 @@ struct bpf_skb_data_end { >> void *data_end; >> }; >> >> +struct bpf_nh_params { >> + u8 nh_family; >> + union { >> + __u32 ipv4_nh; >> + struct in6_addr ipv6_nh; >> + }; >> +}; > >> @@ -4906,6 +4910,18 @@ struct bpf_fib_lookup { >> __u8 dmac[6]; /* ETH_ALEN */ >> }; >> >> +struct bpf_redir_neigh { >> + /* network family for lookup (AF_INET, AF_INET6) */ >> + __u8 nh_family; >> + /* avoid hole in struct - must be set to 0 */ >> + __u8 unused[3]; >> + /* network address of nexthop; skips fib lookup to find gateway */ >> + union { >> + __be32 ipv4_nh; >> + __u32 ipv6_nh[4]; /* in6_addr; network order */ >> + }; >> +}; > > Isn't this backward? The hole could be named in the internal structure. > This is a bit of a gray area, but if you name this hole in uAPI and > programs start referring to it you will never be able to reuse it. > So you may as well not require it to be zeroed.. Hmm, yeah, suppose you're right. Doesn't the verifier prevent any part of the memory from being unitialised anyway? I seem to recall having run into verifier complaints when I didn't initialise struct on the stack... -Toke
Daniel Borkmann <daniel@iogearbox.net> writes: > On 10/20/20 12:51 PM, Toke Høiland-Jørgensen wrote: >> From: Toke Høiland-Jørgensen <toke@redhat.com> > [...] >> BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) >> @@ -2455,8 +2487,8 @@ int skb_do_redirect(struct sk_buff *skb) >> return -EAGAIN; >> } >> return flags & BPF_F_NEIGH ? >> - __bpf_redirect_neigh(skb, dev) : >> - __bpf_redirect(skb, dev, flags); >> + __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) : >> + __bpf_redirect(skb, dev, flags); >> out_drop: >> kfree_skb(skb); >> return -EINVAL; >> @@ -2504,16 +2536,25 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { >> .arg2_type = ARG_ANYTHING, >> }; >> >> -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) >> +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, >> + int, plen, u64, flags) >> { >> struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); >> >> - if (unlikely(flags)) >> + if (unlikely((plen && plen < sizeof(*params)) || flags)) >> + return TC_ACT_SHOT; >> + >> + if (unlikely(plen && (params->unused[0] || params->unused[1] || >> + params->unused[2]))) > > small nit: maybe fold this into the prior check that already tests non-zero plen > > if (unlikely((plen && (plen < sizeof(*params) || > (params->unused[0] | params->unused[1] | > params->unused[2]))) || flags)) > return TC_ACT_SHOT; Well that was my first thought as well, but I thought it was uglier. Isn't the compiler smart enough to make those two equivalent? Anyway, given Jakub's comment, I guess this is moot anyway, as we should just get rid of the member, no? -Toke
On 10/20/20 10:30 AM, Jakub Kicinski wrote: > On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote: >> diff --git a/include/linux/filter.h b/include/linux/filter.h >> index 20fc24c9779a..ba9de7188cd0 100644 >> --- a/include/linux/filter.h >> +++ b/include/linux/filter.h >> @@ -607,12 +607,21 @@ struct bpf_skb_data_end { >> void *data_end; >> }; >> >> +struct bpf_nh_params { >> + u8 nh_family; >> + union { >> + __u32 ipv4_nh; >> + struct in6_addr ipv6_nh; >> + }; >> +}; > >> @@ -4906,6 +4910,18 @@ struct bpf_fib_lookup { >> __u8 dmac[6]; /* ETH_ALEN */ >> }; >> >> +struct bpf_redir_neigh { >> + /* network family for lookup (AF_INET, AF_INET6) */ >> + __u8 nh_family; >> + /* avoid hole in struct - must be set to 0 */ >> + __u8 unused[3]; >> + /* network address of nexthop; skips fib lookup to find gateway */ >> + union { >> + __be32 ipv4_nh; >> + __u32 ipv6_nh[4]; /* in6_addr; network order */ >> + }; >> +}; > > Isn't this backward? The hole could be named in the internal structure. > This is a bit of a gray area, but if you name this hole in uAPI and > programs start referring to it you will never be able to reuse it. > So you may as well not require it to be zeroed.. > for uapi naming the holes, stating they are unused and requiring a 0 value allows them to be used later if an api change needs to.
On 10/20/20 12:03 PM, Toke Høiland-Jørgensen wrote: > Jakub Kicinski <kuba@kernel.org> writes: > >> On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote: >>> +struct bpf_nh_params { >>> + u8 nh_family; >>> + union { >>> + __u32 ipv4_nh; >>> + struct in6_addr ipv6_nh; >>> + }; >>> +}; >> >> Folks, not directly related to this set, but there's a SRv6 patch going >> around which adds ifindex, otherwise nh can't be link local. >> >> I wonder if we want to consider this use case from the start (or the >> close approximation of start in this case ;)). > > The ifindex is there, it's just in the function call signature instead > of the struct... Or did you mean something different? > ifindex as the first argument qualifies the device for the address.
On Tue, 20 Oct 2020 12:14:16 -0600 David Ahern wrote: > On 10/20/20 12:03 PM, Toke Høiland-Jørgensen wrote: > > Jakub Kicinski <kuba@kernel.org> writes: > >> On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote: > >>> +struct bpf_nh_params { > >>> + u8 nh_family; > >>> + union { > >>> + __u32 ipv4_nh; > >>> + struct in6_addr ipv6_nh; > >>> + }; > >>> +}; > >> > >> Folks, not directly related to this set, but there's a SRv6 patch going > >> around which adds ifindex, otherwise nh can't be link local. > >> > >> I wonder if we want to consider this use case from the start (or the > >> close approximation of start in this case ;)). > > > > The ifindex is there, it's just in the function call signature instead > > of the struct... Or did you mean something different? > > ifindex as the first argument qualifies the device for the address. Ah, I should have read closer. Seeing there is a plen I assumed all args would naturally be in the structure, but I'm guessing the case where params are NULL will be quite common. Don't mind me.
On Tue, 20 Oct 2020 12:12:32 -0600 David Ahern wrote: > On 10/20/20 10:30 AM, Jakub Kicinski wrote: > > On Tue, 20 Oct 2020 12:51:02 +0200 Toke Høiland-Jørgensen wrote: > >> diff --git a/include/linux/filter.h b/include/linux/filter.h > >> index 20fc24c9779a..ba9de7188cd0 100644 > >> --- a/include/linux/filter.h > >> +++ b/include/linux/filter.h > >> @@ -607,12 +607,21 @@ struct bpf_skb_data_end { > >> void *data_end; > >> }; > >> > >> +struct bpf_nh_params { > >> + u8 nh_family; > >> + union { > >> + __u32 ipv4_nh; > >> + struct in6_addr ipv6_nh; > >> + }; > >> +}; > > > >> @@ -4906,6 +4910,18 @@ struct bpf_fib_lookup { > >> __u8 dmac[6]; /* ETH_ALEN */ > >> }; > >> > >> +struct bpf_redir_neigh { > >> + /* network family for lookup (AF_INET, AF_INET6) */ > >> + __u8 nh_family; > >> + /* avoid hole in struct - must be set to 0 */ > >> + __u8 unused[3]; > >> + /* network address of nexthop; skips fib lookup to find gateway */ > >> + union { > >> + __be32 ipv4_nh; > >> + __u32 ipv6_nh[4]; /* in6_addr; network order */ > >> + }; > >> +}; > > > > Isn't this backward? The hole could be named in the internal structure. > > This is a bit of a gray area, but if you name this hole in uAPI and > > programs start referring to it you will never be able to reuse it. > > So you may as well not require it to be zeroed.. > > for uapi naming the holes, stating they are unused and requiring a 0 > value allows them to be used later if an api change needs to. I'm not sure what you're saying, if the field is referenced it can't be removed. But we could use a union, so I guess it's not a deal breaker.
On Tue, 20 Oct 2020 20:08:18 +0200 Toke Høiland-Jørgensen wrote: > > Isn't this backward? The hole could be named in the internal structure. > > This is a bit of a gray area, but if you name this hole in uAPI and > > programs start referring to it you will never be able to reuse it. > > So you may as well not require it to be zeroed.. > > Hmm, yeah, suppose you're right. Doesn't the verifier prevent any part > of the memory from being unitialised anyway? I seem to recall having run > into verifier complaints when I didn't initialise struct on the stack... Good point, in which case we have a convenient way to zero the hole after nh_family but no convenient way to zero the empty address space for IPv4 :) (even though that one only needs to be zeroed for the verifier)
On 10/20/20 9:01 PM, Jakub Kicinski wrote: > On Tue, 20 Oct 2020 20:08:18 +0200 Toke Høiland-Jørgensen wrote: >>> Isn't this backward? The hole could be named in the internal structure. >>> This is a bit of a gray area, but if you name this hole in uAPI and >>> programs start referring to it you will never be able to reuse it. >>> So you may as well not require it to be zeroed.. >> >> Hmm, yeah, suppose you're right. Doesn't the verifier prevent any part >> of the memory from being unitialised anyway? I seem to recall having run >> into verifier complaints when I didn't initialise struct on the stack... > > Good point, in which case we have a convenient way to zero the hole > after nh_family but no convenient way to zero the empty address space > for IPv4 :) (even though that one only needs to be zeroed for the > verifier) Technically, it's uninitialized, so zero or any other garbage from BPF stack's previous use of the program. We could use couple of __u8 :8 after nh_family to have an unnamed placeholder (like in __bpf_md_ptr()), or we might as well just switch to __u32 nh_family and avoid the hole that way (also gets rid of the extra check) ... given we have the liberty to extend later anyway if ever needed.
diff --git a/include/linux/filter.h b/include/linux/filter.h index 20fc24c9779a..ba9de7188cd0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -607,12 +607,21 @@ struct bpf_skb_data_end { void *data_end; }; +struct bpf_nh_params { + u8 nh_family; + union { + __u32 ipv4_nh; + struct in6_addr ipv6_nh; + }; +}; + struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; struct bpf_map *map; u32 kern_flags; + struct bpf_nh_params nh; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bf5a99d803e4..9668cde9d684 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3677,15 +3677,19 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) * Description * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it * populates L2 addresses as well, meaning, internally, the helper - * performs a FIB lookup based on the skb's networking header to - * get the address of the next hop and then relies on the neighbor - * lookup for the L2 address of the nexthop. + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. * * The *flags* argument is reserved and must be 0. The helper is * currently only supported for tc BPF program types, and enabled @@ -4906,6 +4910,18 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u8 nh_family; + /* avoid hole in struct - must be set to 0 */ + __u8 unused[3]; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ diff --git a/net/core/filter.c b/net/core/filter.c index c5e2a1c5fd8d..fa09b4f141ae 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2165,12 +2165,12 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, } #if IS_ENABLED(CONFIG_IPV6) -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) { - struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; u32 hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *nexthop; + struct dst_entry *dst = NULL; struct neighbour *neigh; if (dev_xmit_recursion()) { @@ -2196,8 +2196,13 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), - &ipv6_hdr(skb)->daddr); + if (!nh) { + dst = skb_dst(skb); + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + &ipv6_hdr(skb)->daddr); + } else { + nexthop = &nh->ipv6_nh; + } neigh = ip_neigh_gw6(dev, nexthop); if (likely(!IS_ERR(neigh))) { int ret; @@ -2210,36 +2215,43 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) return ret; } rcu_read_unlock_bh(); - IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + if (dst) + IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; } -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; - struct dst_entry *dst; - struct flowi6 fl6 = { - .flowi6_flags = FLOWI_FLAG_ANYSRC, - .flowi6_mark = skb->mark, - .flowlabel = ip6_flowinfo(ip6h), - .flowi6_oif = dev->ifindex, - .flowi6_proto = ip6h->nexthdr, - .daddr = ip6h->daddr, - .saddr = ip6h->saddr, - }; - dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); - if (IS_ERR(dst)) - goto out_drop; + if (!nh) { + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowi6_mark = skb->mark, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_oif = dev->ifindex, + .flowi6_proto = ip6h->nexthdr, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + }; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + goto out_drop; - skb_dst_set(skb, dst); + skb_dst_set(skb, dst); + } else if (nh->nh_family != AF_INET6) { + goto out_drop; + } - err = bpf_out_neigh_v6(net, skb); + err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev->stats.tx_errors++; else @@ -2252,7 +2264,8 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) return ret; } #else -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; @@ -2260,11 +2273,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) #endif /* CONFIG_IPV6 */ #if IS_ENABLED(CONFIG_INET) -static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst->dev; u32 hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; @@ -2292,7 +2303,21 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) } rcu_read_lock_bh(); - neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + if (!nh) { + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = container_of(dst, struct rtable, dst); + + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + } else if (nh->nh_family == AF_INET6) { + neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); + is_v6gw = true; + } else if (nh->nh_family == AF_INET) { + neigh = ip_neigh_gw4(dev, nh->ipv4_nh); + } else { + rcu_read_unlock_bh(); + goto out_drop; + } + if (likely(!IS_ERR(neigh))) { int ret; @@ -2309,33 +2334,37 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) return -ENETDOWN; } -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { const struct iphdr *ip4h = ip_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; - struct rtable *rt; - struct flowi4 fl4 = { - .flowi4_flags = FLOWI_FLAG_ANYSRC, - .flowi4_mark = skb->mark, - .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_oif = dev->ifindex, - .flowi4_proto = ip4h->protocol, - .daddr = ip4h->daddr, - .saddr = ip4h->saddr, - }; - rt = ip_route_output_flow(net, &fl4, NULL); - if (IS_ERR(rt)) - goto out_drop; - if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { - ip_rt_put(rt); - goto out_drop; - } + if (!nh) { + struct flowi4 fl4 = { + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_oif = dev->ifindex, + .flowi4_proto = ip4h->protocol, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + struct rtable *rt; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto out_drop; + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto out_drop; + } - skb_dst_set(skb, &rt->dst); + skb_dst_set(skb, &rt->dst); + } - err = bpf_out_neigh_v4(net, skb); + err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev->stats.tx_errors++; else @@ -2348,14 +2377,16 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) return ret; } #else -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_INET */ -static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { struct ethhdr *ethh = eth_hdr(skb); @@ -2370,9 +2401,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) skb_reset_network_header(skb); if (skb->protocol == htons(ETH_P_IP)) - return __bpf_redirect_neigh_v4(skb, dev); + return __bpf_redirect_neigh_v4(skb, dev, nh); else if (skb->protocol == htons(ETH_P_IPV6)) - return __bpf_redirect_neigh_v6(skb, dev); + return __bpf_redirect_neigh_v6(skb, dev, nh); out: kfree_skb(skb); return -ENOTSUPP; @@ -2382,7 +2413,8 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) enum { BPF_F_NEIGH = (1ULL << 1), BPF_F_PEER = (1ULL << 2), -#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) + BPF_F_NEXTHOP = (1ULL << 3), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2455,8 +2487,8 @@ int skb_do_redirect(struct sk_buff *skb) return -EAGAIN; } return flags & BPF_F_NEIGH ? - __bpf_redirect_neigh(skb, dev) : - __bpf_redirect(skb, dev, flags); + __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) : + __bpf_redirect(skb, dev, flags); out_drop: kfree_skb(skb); return -EINVAL; @@ -2504,16 +2536,25 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, + int, plen, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - if (unlikely(flags)) + if (unlikely((plen && plen < sizeof(*params)) || flags)) + return TC_ACT_SHOT; + + if (unlikely(plen && (params->unused[0] || params->unused[1] || + params->unused[2]))) return TC_ACT_SHOT; - ri->flags = BPF_F_NEIGH; + ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); ri->tgt_index = ifindex; + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); + if (plen) + memcpy(&ri->nh, params, sizeof(ri->nh)); + return TC_ACT_REDIRECT; } @@ -2522,7 +2563,9 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 7d86fdd190be..6769caae142f 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -453,6 +453,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', 'struct bpf_sk_lookup', 'struct bpf_sock', 'struct bpf_sock_addr', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bf5a99d803e4..9668cde9d684 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3677,15 +3677,19 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) * Description * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it * populates L2 addresses as well, meaning, internally, the helper - * performs a FIB lookup based on the skb's networking header to - * get the address of the next hop and then relies on the neighbor - * lookup for the L2 address of the nexthop. + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. * * The *flags* argument is reserved and must be 0. The helper is * currently only supported for tc BPF program types, and enabled @@ -4906,6 +4910,18 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u8 nh_family; + /* avoid hole in struct - must be set to 0 */ + __u8 unused[3]; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */