Message ID | 160277680746.157904.8726318184090980429.stgit@toke.dk |
---|---|
Headers | show |
Series | bpf: Rework bpf_redirect_neigh() to allow supplying nexthop from caller | expand |
On 10/15/20 9:46 AM, Toke Høiland-Jørgensen wrote: > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index bf5a99d803e4..980cc1363be8 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -3677,15 +3677,19 @@ union bpf_attr { > * Return > * The id is returned or 0 in case the id could not be retrieved. > * > - * long bpf_redirect_neigh(u32 ifindex, u64 flags) > + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) why not fold ifindex into params? with params and plen this should be extensible later if needed. A couple of nits below that caught me eye. > * Description > * Redirect the packet to another net device of index *ifindex* > * and fill in L2 addresses from neighboring subsystem. This helper > * is somewhat similar to **bpf_redirect**\ (), except that it > * populates L2 addresses as well, meaning, internally, the helper > - * performs a FIB lookup based on the skb's networking header to > - * get the address of the next hop and then relies on the neighbor > - * lookup for the L2 address of the nexthop. > + * relies on the neighbor lookup for the L2 address of the nexthop. > + * > + * The helper will perform a FIB lookup based on the skb's > + * networking header to get the address of the next hop, unless > + * this is supplied by the caller in the *params* argument. The > + * *plen* argument indicates the len of *params* and should be set > + * to 0 if *params* is NULL. > * > * The *flags* argument is reserved and must be 0. The helper is > * currently only supported for tc BPF program types, and enabled > @@ -4906,6 +4910,17 @@ struct bpf_fib_lookup { > __u8 dmac[6]; /* ETH_ALEN */ > }; > > +struct bpf_redir_neigh { > + /* network family for lookup (AF_INET, AF_INET6) > + */ second line for the comment is not needed. > + __u8 nh_family; > + /* network address of nexthop; skips fib lookup to find gateway */ > + union { > + __be32 ipv4_nh; > + __u32 ipv6_nh[4]; /* in6_addr; network order */ > + }; > +}; > + > enum bpf_task_fd_type { > BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ > BPF_FD_TYPE_TRACEPOINT, /* tp name */ > diff --git a/net/core/filter.c b/net/core/filter.c > index c5e2a1c5fd8d..d073031a3a61 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -2165,12 +2165,11 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, > } > > #if IS_ENABLED(CONFIG_IPV6) > -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, > + struct net_device *dev, const struct in6_addr *nexthop) > { > - struct dst_entry *dst = skb_dst(skb); > - struct net_device *dev = dst->dev; > u32 hh_len = LL_RESERVED_SPACE(dev); > - const struct in6_addr *nexthop; > + struct dst_entry *dst = NULL; > struct neighbour *neigh; > > if (dev_xmit_recursion()) { > @@ -2196,8 +2195,11 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > } > > rcu_read_lock_bh(); > - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), > - &ipv6_hdr(skb)->daddr); > + if (!nexthop) { > + dst = skb_dst(skb); > + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), > + &ipv6_hdr(skb)->daddr); > + } > neigh = ip_neigh_gw6(dev, nexthop); > if (likely(!IS_ERR(neigh))) { > int ret; > @@ -2210,36 +2212,46 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > return ret; > } > rcu_read_unlock_bh(); > - IP6_INC_STATS(dev_net(dst->dev), > - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); > + if (dst) > + IP6_INC_STATS(dev_net(dst->dev), > + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); > out_drop: > kfree_skb(skb); > return -ENETDOWN; > } > > -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) > +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, > + struct bpf_nh_params *nh) > { > const struct ipv6hdr *ip6h = ipv6_hdr(skb); > + struct in6_addr *nexthop = NULL; > struct net *net = dev_net(dev); > int err, ret = NET_XMIT_DROP; > - struct dst_entry *dst; > - struct flowi6 fl6 = { > - .flowi6_flags = FLOWI_FLAG_ANYSRC, > - .flowi6_mark = skb->mark, > - .flowlabel = ip6_flowinfo(ip6h), > - .flowi6_oif = dev->ifindex, > - .flowi6_proto = ip6h->nexthdr, > - .daddr = ip6h->daddr, > - .saddr = ip6h->saddr, > - }; > > - dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); > - if (IS_ERR(dst)) > - goto out_drop; > + if (!nh->nh_family) { > + struct dst_entry *dst; reverse xmas tree ordering > + struct flowi6 fl6 = { > + .flowi6_flags = FLOWI_FLAG_ANYSRC, > + .flowi6_mark = skb->mark, > + .flowlabel = ip6_flowinfo(ip6h), > + .flowi6_oif = dev->ifindex, > + .flowi6_proto = ip6h->nexthdr, > + .daddr = ip6h->daddr, > + .saddr = ip6h->saddr, > + }; > + > + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); > + if (IS_ERR(dst)) > + goto out_drop; > > - skb_dst_set(skb, dst); > + skb_dst_set(skb, dst); > + } else if (nh->nh_family == AF_INET6) { > + nexthop = &nh->ipv6_nh; > + } else { > + goto out_drop; > + } > > - err = bpf_out_neigh_v6(net, skb); > + err = bpf_out_neigh_v6(net, skb, dev, nexthop); > if (unlikely(net_xmit_eval(err))) > dev->stats.tx_errors++; > else
David Ahern <dsahern@gmail.com> writes: > On 10/15/20 9:46 AM, Toke Høiland-Jørgensen wrote: >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index bf5a99d803e4..980cc1363be8 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -3677,15 +3677,19 @@ union bpf_attr { >> * Return >> * The id is returned or 0 in case the id could not be retrieved. >> * >> - * long bpf_redirect_neigh(u32 ifindex, u64 flags) >> + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) > > why not fold ifindex into params? with params and plen this should be > extensible later if needed. Figured this way would make it easier to run *without* the params (like in the existing examples). But don't feel strongly about it, let's see what Daniel thinks. > A couple of nits below that caught me eye. Thanks, will fix; the kernel bot also found a sparse warning, so I guess I need to respin anyway (but waiting for Daniel's comments and/or instructions on what tree to properly submit this to). -Toke
On 10/15/20 9:34 PM, Toke Høiland-Jørgensen wrote: > David Ahern <dsahern@gmail.com> writes: >> On 10/15/20 9:46 AM, Toke Høiland-Jørgensen wrote: >>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >>> index bf5a99d803e4..980cc1363be8 100644 >>> --- a/include/uapi/linux/bpf.h >>> +++ b/include/uapi/linux/bpf.h >>> @@ -3677,15 +3677,19 @@ union bpf_attr { >>> * Return >>> * The id is returned or 0 in case the id could not be retrieved. >>> * >>> - * long bpf_redirect_neigh(u32 ifindex, u64 flags) >>> + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) >> >> why not fold ifindex into params? with params and plen this should be >> extensible later if needed. > > Figured this way would make it easier to run *without* the params (like > in the existing examples). But don't feel strongly about it, let's see > what Daniel thinks. My preference is what Toke has here, this simplifies use by just being able to call bpf_redirect_neigh(ifindex, NULL, 0, 0) when just single external facing device is used. >> A couple of nits below that caught me eye. > > Thanks, will fix; the kernel bot also found a sparse warning, so I guess > I need to respin anyway (but waiting for Daniel's comments and/or > instructions on what tree to properly submit this to). Given API change, lets do bpf. (Will review the rest later today.)
Daniel Borkmann <daniel@iogearbox.net> writes: > On 10/15/20 9:34 PM, Toke Høiland-Jørgensen wrote: >> David Ahern <dsahern@gmail.com> writes: >>> On 10/15/20 9:46 AM, Toke Høiland-Jørgensen wrote: >>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >>>> index bf5a99d803e4..980cc1363be8 100644 >>>> --- a/include/uapi/linux/bpf.h >>>> +++ b/include/uapi/linux/bpf.h >>>> @@ -3677,15 +3677,19 @@ union bpf_attr { >>>> * Return >>>> * The id is returned or 0 in case the id could not be retrieved. >>>> * >>>> - * long bpf_redirect_neigh(u32 ifindex, u64 flags) >>>> + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) >>> >>> why not fold ifindex into params? with params and plen this should be >>> extensible later if needed. >> >> Figured this way would make it easier to run *without* the params (like >> in the existing examples). But don't feel strongly about it, let's see >> what Daniel thinks. > > My preference is what Toke has here, this simplifies use by just being able to > call bpf_redirect_neigh(ifindex, NULL, 0, 0) when just single external facing > device is used. > >>> A couple of nits below that caught me eye. >> >> Thanks, will fix; the kernel bot also found a sparse warning, so I guess >> I need to respin anyway (but waiting for Daniel's comments and/or >> instructions on what tree to properly submit this to). > > Given API change, lets do bpf. (Will review the rest later today.) Right, ACK. I'll wait for your review, then resubmit against the bpf tree :) -Toke
Daniel Borkmann <daniel@iogearbox.net> writes: > On 10/15/20 5:46 PM, Toke Høiland-Jørgensen wrote: >> From: Toke Høiland-Jørgensen <toke@redhat.com> >> >> Based on the discussion in [0], update the bpf_redirect_neigh() helper to >> accept an optional parameter specifying the nexthop information. This makes >> it possible to combine bpf_fib_lookup() and bpf_redirect_neigh() without >> incurring a duplicate FIB lookup - since the FIB lookup helper will return >> the nexthop information even if no neighbour is present, this can simply be >> passed on to bpf_redirect_neigh() if bpf_fib_lookup() returns >> BPF_FIB_LKUP_RET_NO_NEIGH. >> >> [0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/ >> >> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com> > > Overall looks good from what I can tell, just small nits below on top of > David's feedback: > > [...] >> -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) >> +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, >> + struct bpf_nh_params *nh) >> { >> const struct iphdr *ip4h = ip_hdr(skb); >> struct net *net = dev_net(dev); >> int err, ret = NET_XMIT_DROP; >> - struct rtable *rt; >> - struct flowi4 fl4 = { >> - .flowi4_flags = FLOWI_FLAG_ANYSRC, >> - .flowi4_mark = skb->mark, >> - .flowi4_tos = RT_TOS(ip4h->tos), >> - .flowi4_oif = dev->ifindex, >> - .flowi4_proto = ip4h->protocol, >> - .daddr = ip4h->daddr, >> - .saddr = ip4h->saddr, >> - }; >> >> - rt = ip_route_output_flow(net, &fl4, NULL); >> - if (IS_ERR(rt)) >> - goto out_drop; >> - if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { >> - ip_rt_put(rt); >> - goto out_drop; >> - } >> + if (!nh->nh_family) { >> + struct rtable *rt; >> + struct flowi4 fl4 = { >> + .flowi4_flags = FLOWI_FLAG_ANYSRC, >> + .flowi4_mark = skb->mark, >> + .flowi4_tos = RT_TOS(ip4h->tos), >> + .flowi4_oif = dev->ifindex, >> + .flowi4_proto = ip4h->protocol, >> + .daddr = ip4h->daddr, >> + .saddr = ip4h->saddr, >> + }; >> + >> + rt = ip_route_output_flow(net, &fl4, NULL); >> + if (IS_ERR(rt)) >> + goto out_drop; >> + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { >> + ip_rt_put(rt); >> + goto out_drop; >> + } >> >> - skb_dst_set(skb, &rt->dst); >> + skb_dst_set(skb, &rt->dst); >> + nh = NULL; >> + } >> >> - err = bpf_out_neigh_v4(net, skb); >> + err = bpf_out_neigh_v4(net, skb, dev, nh); >> if (unlikely(net_xmit_eval(err))) >> dev->stats.tx_errors++; >> else >> @@ -2355,7 +2383,8 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) >> } >> #endif /* CONFIG_INET */ >> >> -static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) >> +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, >> + struct bpf_nh_params *nh) >> { >> struct ethhdr *ethh = eth_hdr(skb); >> >> @@ -2370,9 +2399,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) >> skb_reset_network_header(skb); >> >> if (skb->protocol == htons(ETH_P_IP)) >> - return __bpf_redirect_neigh_v4(skb, dev); >> + return __bpf_redirect_neigh_v4(skb, dev, nh); >> else if (skb->protocol == htons(ETH_P_IPV6)) >> - return __bpf_redirect_neigh_v6(skb, dev); >> + return __bpf_redirect_neigh_v6(skb, dev, nh); >> out: >> kfree_skb(skb); >> return -ENOTSUPP; >> @@ -2455,8 +2484,8 @@ int skb_do_redirect(struct sk_buff *skb) >> return -EAGAIN; >> } >> return flags & BPF_F_NEIGH ? >> - __bpf_redirect_neigh(skb, dev) : >> - __bpf_redirect(skb, dev, flags); >> + __bpf_redirect_neigh(skb, dev, &ri->nh) : >> + __bpf_redirect(skb, dev, flags); >> out_drop: >> kfree_skb(skb); >> return -EINVAL; >> @@ -2504,16 +2533,23 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { >> .arg2_type = ARG_ANYTHING, >> }; >> >> -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) >> +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, >> + int, plen, u64, flags) >> { >> struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); >> >> - if (unlikely(flags)) >> + if (unlikely((plen && plen < sizeof(*params)) || flags)) >> return TC_ACT_SHOT; >> >> ri->flags = BPF_F_NEIGH; >> ri->tgt_index = ifindex; >> >> + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); >> + if (plen) >> + memcpy(&ri->nh, params, sizeof(ri->nh)); >> + else >> + ri->nh.nh_family = 0; /* clear previous value */ > > I'd probably just add an internal flag and do ... > > ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); > > ... instead of above clearing, and skb_do_redirect() then becomes: > > __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) > > ... which would then also avoid this !nh->nh_family check where you later on > set nh = NULL to pass it onwards. Ah yes, excellent idea! Will fix :) -Toke
On 10/15/20 5:46 PM, Toke Høiland-Jørgensen wrote: [...] > +struct bpf_redir_neigh { > + /* network family for lookup (AF_INET, AF_INET6) > + */ > + __u8 nh_family; > + /* network address of nexthop; skips fib lookup to find gateway */ > + union { > + __be32 ipv4_nh; > + __u32 ipv6_nh[4]; /* in6_addr; network order */ > + }; > +}; > + > enum bpf_task_fd_type { > BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ > BPF_FD_TYPE_TRACEPOINT, /* tp name */ > diff --git a/net/core/filter.c b/net/core/filter.c > index c5e2a1c5fd8d..d073031a3a61 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -2165,12 +2165,11 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, > } > > #if IS_ENABLED(CONFIG_IPV6) > -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, > + struct net_device *dev, const struct in6_addr *nexthop) > { > - struct dst_entry *dst = skb_dst(skb); > - struct net_device *dev = dst->dev; > u32 hh_len = LL_RESERVED_SPACE(dev); > - const struct in6_addr *nexthop; > + struct dst_entry *dst = NULL; > struct neighbour *neigh; > > if (dev_xmit_recursion()) { > @@ -2196,8 +2195,11 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > } > > rcu_read_lock_bh(); > - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), > - &ipv6_hdr(skb)->daddr); > + if (!nexthop) { > + dst = skb_dst(skb); > + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), > + &ipv6_hdr(skb)->daddr); > + } > neigh = ip_neigh_gw6(dev, nexthop); > if (likely(!IS_ERR(neigh))) { > int ret; > @@ -2210,36 +2212,46 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > return ret; > } > rcu_read_unlock_bh(); > - IP6_INC_STATS(dev_net(dst->dev), > - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); > + if (dst) > + IP6_INC_STATS(dev_net(dst->dev), > + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); > out_drop: > kfree_skb(skb); > return -ENETDOWN; > } > > -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) > +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, > + struct bpf_nh_params *nh) > { > const struct ipv6hdr *ip6h = ipv6_hdr(skb); > + struct in6_addr *nexthop = NULL; > struct net *net = dev_net(dev); > int err, ret = NET_XMIT_DROP; > - struct dst_entry *dst; > - struct flowi6 fl6 = { > - .flowi6_flags = FLOWI_FLAG_ANYSRC, > - .flowi6_mark = skb->mark, > - .flowlabel = ip6_flowinfo(ip6h), > - .flowi6_oif = dev->ifindex, > - .flowi6_proto = ip6h->nexthdr, > - .daddr = ip6h->daddr, > - .saddr = ip6h->saddr, > - }; > > - dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); > - if (IS_ERR(dst)) > - goto out_drop; > + if (!nh->nh_family) { > + struct dst_entry *dst; > + struct flowi6 fl6 = { > + .flowi6_flags = FLOWI_FLAG_ANYSRC, > + .flowi6_mark = skb->mark, > + .flowlabel = ip6_flowinfo(ip6h), > + .flowi6_oif = dev->ifindex, > + .flowi6_proto = ip6h->nexthdr, > + .daddr = ip6h->daddr, > + .saddr = ip6h->saddr, nit: Would be good for readability to keep the previous whitespace alignment intact. > + }; > + > + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); > + if (IS_ERR(dst)) > + goto out_drop; > > - skb_dst_set(skb, dst); > + skb_dst_set(skb, dst); > + } else if (nh->nh_family == AF_INET6) { > + nexthop = &nh->ipv6_nh; > + } else { > + goto out_drop; > + } > > - err = bpf_out_neigh_v6(net, skb); > + err = bpf_out_neigh_v6(net, skb, dev, nexthop); I'd probably model the bpf_out_neigh_v{4,6}() as close as possible similar to each other in terms of args we pass etc. In the v6 case you pass the nexthop in6_addr directly whereas v4 passes bpf_nh_params, I'd probably also stick to the latter for v6 to keep it symmetric. > if (unlikely(net_xmit_eval(err))) > dev->stats.tx_errors++; > else > @@ -2260,11 +2272,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) > #endif /* CONFIG_IPV6 */ > > #if IS_ENABLED(CONFIG_INET) > -static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) > +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, > + struct net_device *dev, struct bpf_nh_params *nh) > { > - struct dst_entry *dst = skb_dst(skb); > - struct rtable *rt = container_of(dst, struct rtable, dst); > - struct net_device *dev = dst->dev; > u32 hh_len = LL_RESERVED_SPACE(dev); > struct neighbour *neigh; > bool is_v6gw = false; > @@ -2292,7 +2302,20 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) > } > > rcu_read_lock_bh(); > - neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); > + if (!nh) { > + struct dst_entry *dst = skb_dst(skb); > + struct rtable *rt = container_of(dst, struct rtable, dst); > + > + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); > + } else if (nh->nh_family == AF_INET6) { > + neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); > + is_v6gw = true; > + } else if (nh->nh_family == AF_INET) { > + neigh = ip_neigh_gw4(dev, nh->ipv4_nh); > + } else { > + goto out_drop; > + } > + > if (likely(!IS_ERR(neigh))) { > int ret; >