[RFC,bpf-next,0/2] bpf: Rework bpf_redirect_neigh() to allow supplying nexthop from caller

Message ID	160277680746.157904.8726318184090980429.stgit@toke.dk
Headers	show Return-Path: <SRS0=ru8x=DW=vger.kernel.org=netdev-owner@kernel.org> Subject: [PATCH RFC bpf-next 0/2] bpf: Rework bpf_redirect_neigh() to allow supplying nexthop from caller From: =?utf-8?q?Toke_H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com> To: Daniel Borkmann <daniel@iogearbox.net> Cc: David Ahern <dsahern@kernel.org>, netdev@vger.kernel.org, bpf@vger.kernel.org Date: Thu, 15 Oct 2020 17:46:47 +0200 Message-ID: <160277680746.157904.8726318184090980429.stgit@toke.dk> User-Agent: StGit/0.23 MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 8bit Precedence: bulk
Series	bpf: Rework bpf_redirect_neigh() to allow supplying nexthop from caller \| expand [RFC,bpf-next,0/2] bpf: Rework bpf_redirect_neigh() to allow supplying nexthop from caller [RFC,bpf-next,1/2] bpf_redirect_neigh: Support supplying the nexthop as a helper parameter [RFC,bpf-next,2/2] selftests: Update test_tc_neigh to use the modified bpf_redirect_neigh()

Toke Høiland-Jørgensen Oct. 15, 2020, 3:46 p.m. UTC

Based on previous discussion[0], we determined that it would be beneficial to
rework bpf_redirect_neigh() so the caller can supply the nexthop information
(e.g., from a previous call to bpf_fib_lookup()). This way, the two helpers can
be combined without incurring a second FIB lookup to find the nexthop, and
bpf_fib_lookup() becomes usable even if no nexthop entry currently exists.

This patch (and accompanying selftest update) accomplishes this by way of an
optional paramter to bpf_redirect_neigh(). This is an API change, and so should
really be merged into the bpf tree to be part of the 5.10 cycle; however, since
bpf-next has not yet been merged into bpf, I'm sending this as an RFC against
bpf-next for discussion, and will repost against bpf once that merge happens
(Daniel, unless you have a better way of doing this, of course).

-Toke

[0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/

---

Toke Høiland-Jørgensen (2):
      bpf_redirect_neigh: Support supplying the nexthop as a helper parameter
      selftests: Update test_tc_neigh to use the modified bpf_redirect_neigh()


 .../selftests/bpf/progs/test_tc_neigh.c       | 83 ++++++++++++++++---
 .../testing/selftests/bpf/test_tc_redirect.sh |  8 +-
 2 files changed, 78 insertions(+), 13 deletions(-)

David Ahern Oct. 15, 2020, 4:27 p.m. UTC | #1

On 10/15/20 9:46 AM, Toke Høiland-Jørgensen wrote:
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index bf5a99d803e4..980cc1363be8 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3677,15 +3677,19 @@ union bpf_attr {
>   * 	Return
>   * 		The id is returned or 0 in case the id could not be retrieved.
>   *
> - * long bpf_redirect_neigh(u32 ifindex, u64 flags)
> + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)

why not fold ifindex into params? with params and plen this should be
extensible later if needed.

A couple of nits below that caught me eye.


>   * 	Description
>   * 		Redirect the packet to another net device of index *ifindex*
>   * 		and fill in L2 addresses from neighboring subsystem. This helper
>   * 		is somewhat similar to **bpf_redirect**\ (), except that it
>   * 		populates L2 addresses as well, meaning, internally, the helper
> - * 		performs a FIB lookup based on the skb's networking header to
> - * 		get the address of the next hop and then relies on the neighbor
> - * 		lookup for the L2 address of the nexthop.
> + * 		relies on the neighbor lookup for the L2 address of the nexthop.
> + *
> + * 		The helper will perform a FIB lookup based on the skb's
> + * 		networking header to get the address of the next hop, unless
> + * 		this is supplied by the caller in the *params* argument. The
> + * 		*plen* argument indicates the len of *params* and should be set
> + * 		to 0 if *params* is NULL.
>   *
>   * 		The *flags* argument is reserved and must be 0. The helper is
>   * 		currently only supported for tc BPF program types, and enabled
> @@ -4906,6 +4910,17 @@ struct bpf_fib_lookup {
>  	__u8	dmac[6];     /* ETH_ALEN */
>  };
>  
> +struct bpf_redir_neigh {
> +	/* network family for lookup (AF_INET, AF_INET6)
> +	 */

second line for the comment is not needed.

> +	__u8	nh_family;
> +	/* network address of nexthop; skips fib lookup to find gateway */
> +	union {
> +		__be32		ipv4_nh;
> +		__u32		ipv6_nh[4];  /* in6_addr; network order */
> +	};
> +};
> +
>  enum bpf_task_fd_type {
>  	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
>  	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index c5e2a1c5fd8d..d073031a3a61 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2165,12 +2165,11 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
>  }
>  
>  #if IS_ENABLED(CONFIG_IPV6)
> -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
> +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
> +			    struct net_device *dev, const struct in6_addr *nexthop)
>  {
> -	struct dst_entry *dst = skb_dst(skb);
> -	struct net_device *dev = dst->dev;
>  	u32 hh_len = LL_RESERVED_SPACE(dev);
> -	const struct in6_addr *nexthop;
> +	struct dst_entry *dst = NULL;
>  	struct neighbour *neigh;
>  
>  	if (dev_xmit_recursion()) {
> @@ -2196,8 +2195,11 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
>  	}
>  
>  	rcu_read_lock_bh();
> -	nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
> -			      &ipv6_hdr(skb)->daddr);
> +	if (!nexthop) {
> +		dst = skb_dst(skb);
> +		nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
> +				      &ipv6_hdr(skb)->daddr);
> +	}
>  	neigh = ip_neigh_gw6(dev, nexthop);
>  	if (likely(!IS_ERR(neigh))) {
>  		int ret;
> @@ -2210,36 +2212,46 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
>  		return ret;
>  	}
>  	rcu_read_unlock_bh();
> -	IP6_INC_STATS(dev_net(dst->dev),
> -		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
> +	if (dst)
> +		IP6_INC_STATS(dev_net(dst->dev),
> +			      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
>  out_drop:
>  	kfree_skb(skb);
>  	return -ENETDOWN;
>  }
>  
> -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
> +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
> +				   struct bpf_nh_params *nh)
>  {
>  	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
> +	struct in6_addr *nexthop = NULL;
>  	struct net *net = dev_net(dev);
>  	int err, ret = NET_XMIT_DROP;
> -	struct dst_entry *dst;
> -	struct flowi6 fl6 = {
> -		.flowi6_flags	= FLOWI_FLAG_ANYSRC,
> -		.flowi6_mark	= skb->mark,
> -		.flowlabel	= ip6_flowinfo(ip6h),
> -		.flowi6_oif	= dev->ifindex,
> -		.flowi6_proto	= ip6h->nexthdr,
> -		.daddr		= ip6h->daddr,
> -		.saddr		= ip6h->saddr,
> -	};
>  
> -	dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
> -	if (IS_ERR(dst))
> -		goto out_drop;
> +	if (!nh->nh_family) {
> +		struct dst_entry *dst;

reverse xmas tree ordering

> +		struct flowi6 fl6 = {
> +			.flowi6_flags = FLOWI_FLAG_ANYSRC,
> +			.flowi6_mark = skb->mark,
> +			.flowlabel = ip6_flowinfo(ip6h),
> +			.flowi6_oif = dev->ifindex,
> +			.flowi6_proto = ip6h->nexthdr,
> +			.daddr = ip6h->daddr,
> +			.saddr = ip6h->saddr,
> +		};
> +
> +		dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
> +		if (IS_ERR(dst))
> +			goto out_drop;
>  
> -	skb_dst_set(skb, dst);
> +		skb_dst_set(skb, dst);
> +	} else if (nh->nh_family == AF_INET6) {
> +		nexthop = &nh->ipv6_nh;
> +	} else {
> +		goto out_drop;
> +	}
>  
> -	err = bpf_out_neigh_v6(net, skb);
> +	err = bpf_out_neigh_v6(net, skb, dev, nexthop);
>  	if (unlikely(net_xmit_eval(err)))
>  		dev->stats.tx_errors++;
>  	else

Toke Høiland-Jørgensen Oct. 15, 2020, 7:34 p.m. UTC | #2

David Ahern <dsahern@gmail.com> writes:

> On 10/15/20 9:46 AM, Toke Høiland-Jørgensen wrote:
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index bf5a99d803e4..980cc1363be8 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -3677,15 +3677,19 @@ union bpf_attr {
>>   * 	Return
>>   * 		The id is returned or 0 in case the id could not be retrieved.
>>   *
>> - * long bpf_redirect_neigh(u32 ifindex, u64 flags)
>> + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)
>
> why not fold ifindex into params? with params and plen this should be
> extensible later if needed.

Figured this way would make it easier to run *without* the params (like
in the existing examples). But don't feel strongly about it, let's see
what Daniel thinks.

> A couple of nits below that caught me eye.

Thanks, will fix; the kernel bot also found a sparse warning, so I guess
I need to respin anyway (but waiting for Daniel's comments and/or
instructions on what tree to properly submit this to).

-Toke

Daniel Borkmann Oct. 19, 2020, 1:09 p.m. UTC | #3

On 10/15/20 9:34 PM, Toke Høiland-Jørgensen wrote:
> David Ahern <dsahern@gmail.com> writes:

>> On 10/15/20 9:46 AM, Toke Høiland-Jørgensen wrote:

>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

>>> index bf5a99d803e4..980cc1363be8 100644

>>> --- a/include/uapi/linux/bpf.h

>>> +++ b/include/uapi/linux/bpf.h

>>> @@ -3677,15 +3677,19 @@ union bpf_attr {

>>>    * 	Return

>>>    * 		The id is returned or 0 in case the id could not be retrieved.

>>>    *

>>> - * long bpf_redirect_neigh(u32 ifindex, u64 flags)

>>> + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)

>>

>> why not fold ifindex into params? with params and plen this should be

>> extensible later if needed.

> 

> Figured this way would make it easier to run *without* the params (like

> in the existing examples). But don't feel strongly about it, let's see

> what Daniel thinks.


My preference is what Toke has here, this simplifies use by just being able to
call bpf_redirect_neigh(ifindex, NULL, 0, 0) when just single external facing
device is used.

>> A couple of nits below that caught me eye.

> 

> Thanks, will fix; the kernel bot also found a sparse warning, so I guess

> I need to respin anyway (but waiting for Daniel's comments and/or

> instructions on what tree to properly submit this to).


Given API change, lets do bpf. (Will review the rest later today.)

Toke Høiland-Jørgensen Oct. 19, 2020, 1:28 p.m. UTC | #4

Daniel Borkmann <daniel@iogearbox.net> writes:

> On 10/15/20 9:34 PM, Toke Høiland-Jørgensen wrote:

>> David Ahern <dsahern@gmail.com> writes:

>>> On 10/15/20 9:46 AM, Toke Høiland-Jørgensen wrote:

>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

>>>> index bf5a99d803e4..980cc1363be8 100644

>>>> --- a/include/uapi/linux/bpf.h

>>>> +++ b/include/uapi/linux/bpf.h

>>>> @@ -3677,15 +3677,19 @@ union bpf_attr {

>>>>    * 	Return

>>>>    * 		The id is returned or 0 in case the id could not be retrieved.

>>>>    *

>>>> - * long bpf_redirect_neigh(u32 ifindex, u64 flags)

>>>> + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)

>>>

>>> why not fold ifindex into params? with params and plen this should be

>>> extensible later if needed.

>> 

>> Figured this way would make it easier to run *without* the params (like

>> in the existing examples). But don't feel strongly about it, let's see

>> what Daniel thinks.

>

> My preference is what Toke has here, this simplifies use by just being able to

> call bpf_redirect_neigh(ifindex, NULL, 0, 0) when just single external facing

> device is used.

>

>>> A couple of nits below that caught me eye.

>> 

>> Thanks, will fix; the kernel bot also found a sparse warning, so I guess

>> I need to respin anyway (but waiting for Daniel's comments and/or

>> instructions on what tree to properly submit this to).

>

> Given API change, lets do bpf. (Will review the rest later today.)


Right, ACK. I'll wait for your review, then resubmit against the bpf
tree :)

-Toke

Toke Høiland-Jørgensen Oct. 19, 2020, 2:56 p.m. UTC | #5

Daniel Borkmann <daniel@iogearbox.net> writes:

> On 10/15/20 5:46 PM, Toke Høiland-Jørgensen wrote:
>> From: Toke Høiland-Jørgensen <toke@redhat.com>
>> 
>> Based on the discussion in [0], update the bpf_redirect_neigh() helper to
>> accept an optional parameter specifying the nexthop information. This makes
>> it possible to combine bpf_fib_lookup() and bpf_redirect_neigh() without
>> incurring a duplicate FIB lookup - since the FIB lookup helper will return
>> the nexthop information even if no neighbour is present, this can simply be
>> passed on to bpf_redirect_neigh() if bpf_fib_lookup() returns
>> BPF_FIB_LKUP_RET_NO_NEIGH.
>> 
>> [0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/
>> 
>> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
>
> Overall looks good from what I can tell, just small nits below on top of
> David's feedback:
>
> [...]
>> -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev)
>> +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
>> +				   struct bpf_nh_params *nh)
>>   {
>>   	const struct iphdr *ip4h = ip_hdr(skb);
>>   	struct net *net = dev_net(dev);
>>   	int err, ret = NET_XMIT_DROP;
>> -	struct rtable *rt;
>> -	struct flowi4 fl4 = {
>> -		.flowi4_flags	= FLOWI_FLAG_ANYSRC,
>> -		.flowi4_mark	= skb->mark,
>> -		.flowi4_tos	= RT_TOS(ip4h->tos),
>> -		.flowi4_oif	= dev->ifindex,
>> -		.flowi4_proto	= ip4h->protocol,
>> -		.daddr		= ip4h->daddr,
>> -		.saddr		= ip4h->saddr,
>> -	};
>>   
>> -	rt = ip_route_output_flow(net, &fl4, NULL);
>> -	if (IS_ERR(rt))
>> -		goto out_drop;
>> -	if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
>> -		ip_rt_put(rt);
>> -		goto out_drop;
>> -	}
>> +	if (!nh->nh_family) {
>> +		struct rtable *rt;
>> +		struct flowi4 fl4 = {
>> +			.flowi4_flags = FLOWI_FLAG_ANYSRC,
>> +			.flowi4_mark = skb->mark,
>> +			.flowi4_tos = RT_TOS(ip4h->tos),
>> +			.flowi4_oif = dev->ifindex,
>> +			.flowi4_proto = ip4h->protocol,
>> +			.daddr = ip4h->daddr,
>> +			.saddr = ip4h->saddr,
>> +		};
>> +
>> +		rt = ip_route_output_flow(net, &fl4, NULL);
>> +		if (IS_ERR(rt))
>> +			goto out_drop;
>> +		if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
>> +			ip_rt_put(rt);
>> +			goto out_drop;
>> +		}
>>   
>> -	skb_dst_set(skb, &rt->dst);
>> +		skb_dst_set(skb, &rt->dst);
>> +		nh = NULL;
>> +	}
>>   
>> -	err = bpf_out_neigh_v4(net, skb);
>> +	err = bpf_out_neigh_v4(net, skb, dev, nh);
>>   	if (unlikely(net_xmit_eval(err)))
>>   		dev->stats.tx_errors++;
>>   	else
>> @@ -2355,7 +2383,8 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev)
>>   }
>>   #endif /* CONFIG_INET */
>>   
>> -static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
>> +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
>> +				struct bpf_nh_params *nh)
>>   {
>>   	struct ethhdr *ethh = eth_hdr(skb);
>>   
>> @@ -2370,9 +2399,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
>>   	skb_reset_network_header(skb);
>>   
>>   	if (skb->protocol == htons(ETH_P_IP))
>> -		return __bpf_redirect_neigh_v4(skb, dev);
>> +		return __bpf_redirect_neigh_v4(skb, dev, nh);
>>   	else if (skb->protocol == htons(ETH_P_IPV6))
>> -		return __bpf_redirect_neigh_v6(skb, dev);
>> +		return __bpf_redirect_neigh_v6(skb, dev, nh);
>>   out:
>>   	kfree_skb(skb);
>>   	return -ENOTSUPP;
>> @@ -2455,8 +2484,8 @@ int skb_do_redirect(struct sk_buff *skb)
>>   		return -EAGAIN;
>>   	}
>>   	return flags & BPF_F_NEIGH ?
>> -	       __bpf_redirect_neigh(skb, dev) :
>> -	       __bpf_redirect(skb, dev, flags);
>> +		__bpf_redirect_neigh(skb, dev, &ri->nh) :
>> +		__bpf_redirect(skb, dev, flags);
>>   out_drop:
>>   	kfree_skb(skb);
>>   	return -EINVAL;
>> @@ -2504,16 +2533,23 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = {
>>   	.arg2_type      = ARG_ANYTHING,
>>   };
>>   
>> -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
>> +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
>> +	   int, plen, u64, flags)
>>   {
>>   	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>>   
>> -	if (unlikely(flags))
>> +	if (unlikely((plen && plen < sizeof(*params)) || flags))
>>   		return TC_ACT_SHOT;
>>   
>>   	ri->flags = BPF_F_NEIGH;
>>   	ri->tgt_index = ifindex;
>>   
>> +	BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
>> +	if (plen)
>> +		memcpy(&ri->nh, params, sizeof(ri->nh));
>> +	else
>> +		ri->nh.nh_family = 0; /* clear previous value */
>
> I'd probably just add an internal flag and do ...
>
>    ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
>
> ... instead of above clearing, and skb_do_redirect() then becomes:
>
>    __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL)
>
> ... which would then also avoid this !nh->nh_family check where you later on
> set nh = NULL to pass it onwards.

Ah yes, excellent idea! Will fix :)

-Toke

Daniel Borkmann Oct. 19, 2020, 3:01 p.m. UTC | #6

On 10/15/20 5:46 PM, Toke Høiland-Jørgensen wrote:
[...]
> +struct bpf_redir_neigh {
> +	/* network family for lookup (AF_INET, AF_INET6)
> +	 */
> +	__u8	nh_family;
> +	/* network address of nexthop; skips fib lookup to find gateway */
> +	union {
> +		__be32		ipv4_nh;
> +		__u32		ipv6_nh[4];  /* in6_addr; network order */
> +	};
> +};
> +
>   enum bpf_task_fd_type {
>   	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
>   	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index c5e2a1c5fd8d..d073031a3a61 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2165,12 +2165,11 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
>   }
>   
>   #if IS_ENABLED(CONFIG_IPV6)
> -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
> +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
> +			    struct net_device *dev, const struct in6_addr *nexthop)
>   {
> -	struct dst_entry *dst = skb_dst(skb);
> -	struct net_device *dev = dst->dev;
>   	u32 hh_len = LL_RESERVED_SPACE(dev);
> -	const struct in6_addr *nexthop;
> +	struct dst_entry *dst = NULL;
>   	struct neighbour *neigh;
>   
>   	if (dev_xmit_recursion()) {
> @@ -2196,8 +2195,11 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
>   	}
>   
>   	rcu_read_lock_bh();
> -	nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
> -			      &ipv6_hdr(skb)->daddr);
> +	if (!nexthop) {
> +		dst = skb_dst(skb);
> +		nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
> +				      &ipv6_hdr(skb)->daddr);
> +	}
>   	neigh = ip_neigh_gw6(dev, nexthop);
>   	if (likely(!IS_ERR(neigh))) {
>   		int ret;
> @@ -2210,36 +2212,46 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
>   		return ret;
>   	}
>   	rcu_read_unlock_bh();
> -	IP6_INC_STATS(dev_net(dst->dev),
> -		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
> +	if (dst)
> +		IP6_INC_STATS(dev_net(dst->dev),
> +			      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
>   out_drop:
>   	kfree_skb(skb);
>   	return -ENETDOWN;
>   }
>   
> -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
> +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
> +				   struct bpf_nh_params *nh)
>   {
>   	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
> +	struct in6_addr *nexthop = NULL;
>   	struct net *net = dev_net(dev);
>   	int err, ret = NET_XMIT_DROP;
> -	struct dst_entry *dst;
> -	struct flowi6 fl6 = {
> -		.flowi6_flags	= FLOWI_FLAG_ANYSRC,
> -		.flowi6_mark	= skb->mark,
> -		.flowlabel	= ip6_flowinfo(ip6h),
> -		.flowi6_oif	= dev->ifindex,
> -		.flowi6_proto	= ip6h->nexthdr,
> -		.daddr		= ip6h->daddr,
> -		.saddr		= ip6h->saddr,
> -	};
>   
> -	dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
> -	if (IS_ERR(dst))
> -		goto out_drop;
> +	if (!nh->nh_family) {
> +		struct dst_entry *dst;
> +		struct flowi6 fl6 = {
> +			.flowi6_flags = FLOWI_FLAG_ANYSRC,
> +			.flowi6_mark = skb->mark,
> +			.flowlabel = ip6_flowinfo(ip6h),
> +			.flowi6_oif = dev->ifindex,
> +			.flowi6_proto = ip6h->nexthdr,
> +			.daddr = ip6h->daddr,
> +			.saddr = ip6h->saddr,

nit: Would be good for readability to keep the previous whitespace alignment intact.

> +		};
> +
> +		dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
> +		if (IS_ERR(dst))
> +			goto out_drop;
>   
> -	skb_dst_set(skb, dst);
> +		skb_dst_set(skb, dst);
> +	} else if (nh->nh_family == AF_INET6) {
> +		nexthop = &nh->ipv6_nh;
> +	} else {
> +		goto out_drop;
> +	}
>   
> -	err = bpf_out_neigh_v6(net, skb);
> +	err = bpf_out_neigh_v6(net, skb, dev, nexthop);

I'd probably model the bpf_out_neigh_v{4,6}() as close as possible similar to each other in terms
of args we pass etc. In the v6 case you pass the nexthop in6_addr directly whereas v4 passes
bpf_nh_params, I'd probably also stick to the latter for v6 to keep it symmetric.

>   	if (unlikely(net_xmit_eval(err)))
>   		dev->stats.tx_errors++;
>   	else
> @@ -2260,11 +2272,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
>   #endif /* CONFIG_IPV6 */
>   
>   #if IS_ENABLED(CONFIG_INET)
> -static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
> +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
> +			    struct net_device *dev, struct bpf_nh_params *nh)
>   {
> -	struct dst_entry *dst = skb_dst(skb);
> -	struct rtable *rt = container_of(dst, struct rtable, dst);
> -	struct net_device *dev = dst->dev;
>   	u32 hh_len = LL_RESERVED_SPACE(dev);
>   	struct neighbour *neigh;
>   	bool is_v6gw = false;
> @@ -2292,7 +2302,20 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
>   	}
>   
>   	rcu_read_lock_bh();
> -	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
> +	if (!nh) {
> +		struct dst_entry *dst = skb_dst(skb);
> +		struct rtable *rt = container_of(dst, struct rtable, dst);
> +
> +		neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
> +	} else if (nh->nh_family == AF_INET6) {
> +		neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
> +		is_v6gw = true;
> +	} else if (nh->nh_family == AF_INET) {
> +		neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
> +	} else {
> +		goto out_drop;
> +	}
> +
>   	if (likely(!IS_ERR(neigh))) {
>   		int ret;
>

[RFC,bpf-next,0/2] bpf: Rework bpf_redirect_neigh() to allow supplying nexthop from caller

Message

Comments