diff mbox series

[v5,bpf-next,13/14] bpf: add new frame_length field to the XDP ctx

Message ID 0547d6f752e325f56a8e5f6466b50e81ff29d65f.1607349924.git.lorenzo@kernel.org
State New
Headers show
Series mvneta: introduce XDP multi-buffer support | expand

Commit Message

Lorenzo Bianconi Dec. 7, 2020, 4:32 p.m. UTC
From: Eelco Chaudron <echaudro@redhat.com>

This patch adds a new field to the XDP context called frame_length,
which will hold the full length of the packet, including fragments
if existing.

eBPF programs can determine if fragments are present using something
like:

  if (ctx->data_end - ctx->data < ctx->frame_length) {
    /* Fragments exist. */
  }

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 include/net/xdp.h              | 22 +++++++++
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/verifier.c          |  2 +-
 net/core/filter.c              | 83 ++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 5 files changed, 108 insertions(+), 1 deletion(-)

Comments

Maciej Fijalkowski Dec. 8, 2020, 10:17 p.m. UTC | #1
On Mon, Dec 07, 2020 at 05:32:42PM +0100, Lorenzo Bianconi wrote:
> From: Eelco Chaudron <echaudro@redhat.com>

> 

> This patch adds a new field to the XDP context called frame_length,

> which will hold the full length of the packet, including fragments

> if existing.


The approach you took for ctx access conversion is barely described :/

> 

> eBPF programs can determine if fragments are present using something

> like:

> 

>   if (ctx->data_end - ctx->data < ctx->frame_length) {

>     /* Fragements exists. /*

>   }

> 

> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>

> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>

> ---

>  include/net/xdp.h              | 22 +++++++++

>  include/uapi/linux/bpf.h       |  1 +

>  kernel/bpf/verifier.c          |  2 +-

>  net/core/filter.c              | 83 ++++++++++++++++++++++++++++++++++

>  tools/include/uapi/linux/bpf.h |  1 +

>  5 files changed, 108 insertions(+), 1 deletion(-)

> 

> diff --git a/include/net/xdp.h b/include/net/xdp.h

> index 09078ab6644c..e54d733c90ed 100644

> --- a/include/net/xdp.h

> +++ b/include/net/xdp.h

> @@ -73,8 +73,30 @@ struct xdp_buff {

>  	void *data_hard_start;

>  	struct xdp_rxq_info *rxq;

>  	struct xdp_txq_info *txq;

> +	/* If any of the bitfield lengths for frame_sz or mb below change,

> +	 * make sure the defines here are also updated!

> +	 */

> +#ifdef __BIG_ENDIAN_BITFIELD

> +#define MB_SHIFT	  0

> +#define MB_MASK		  0x00000001

> +#define FRAME_SZ_SHIFT	  1

> +#define FRAME_SZ_MASK	  0xfffffffe

> +#else

> +#define MB_SHIFT	  31

> +#define MB_MASK		  0x80000000

> +#define FRAME_SZ_SHIFT	  0

> +#define FRAME_SZ_MASK	  0x7fffffff

> +#endif

> +#define FRAME_SZ_OFFSET() offsetof(struct xdp_buff, __u32_bit_fields_offset)

> +#define MB_OFFSET()	  offsetof(struct xdp_buff, __u32_bit_fields_offset)

> +	/* private: */

> +	u32 __u32_bit_fields_offset[0];


Why? I don't get that. Please explain.
Also, looking at all the need for masking/shifting, I wonder if it would
be better to have u32 frame_sz and u8 mb...

> +	/* public: */

>  	u32 frame_sz:31; /* frame size to deduce data_hard_end/reserved tailroom*/

>  	u32 mb:1; /* xdp non-linear buffer */

> +

> +	/* Temporary registers to make conditional access/stores possible. */

> +	u64 tmp_reg[2];


IMHO this kills the bitfield approach we have for vars above.

>  };

>  

>  /* Reserve memory area at end-of data area.

> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

> index 30b477a26482..62c50ab28ea9 100644

> --- a/include/uapi/linux/bpf.h

> +++ b/include/uapi/linux/bpf.h

> @@ -4380,6 +4380,7 @@ struct xdp_md {

>  	__u32 rx_queue_index;  /* rxq->queue_index  */

>  

>  	__u32 egress_ifindex;  /* txq->dev->ifindex */

> +	__u32 frame_length;

>  };

>  

>  /* DEVMAP map-value layout

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c

> index 93def76cf32b..c50caea29fa2 100644

> --- a/kernel/bpf/verifier.c

> +++ b/kernel/bpf/verifier.c

> @@ -10526,7 +10526,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)

>  	const struct bpf_verifier_ops *ops = env->ops;

>  	int i, cnt, size, ctx_field_size, delta = 0;

>  	const int insn_cnt = env->prog->len;

> -	struct bpf_insn insn_buf[16], *insn;

> +	struct bpf_insn insn_buf[32], *insn;

>  	u32 target_size, size_default, off;

>  	struct bpf_prog *new_prog;

>  	enum bpf_access_type type;

> diff --git a/net/core/filter.c b/net/core/filter.c

> index 4c4882d4d92c..278640db9e0a 100644

> --- a/net/core/filter.c

> +++ b/net/core/filter.c

> @@ -8908,6 +8908,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,

>  				  struct bpf_insn *insn_buf,

>  				  struct bpf_prog *prog, u32 *target_size)

>  {

> +	int ctx_reg, dst_reg, scratch_reg;

>  	struct bpf_insn *insn = insn_buf;

>  

>  	switch (si->off) {

> @@ -8954,6 +8955,88 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,

>  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,

>  				      offsetof(struct net_device, ifindex));

>  		break;

> +	case offsetof(struct xdp_md, frame_length):

> +		/* Need tmp storage for src_reg in case src_reg == dst_reg,

> +		 * and a scratch reg */

> +		scratch_reg = BPF_REG_9;

> +		dst_reg = si->dst_reg;

> +

> +		if (dst_reg == scratch_reg)

> +			scratch_reg--;

> +

> +		ctx_reg = (si->src_reg == si->dst_reg) ? scratch_reg - 1 : si->src_reg;

> +		while (dst_reg == ctx_reg || scratch_reg == ctx_reg)

> +			ctx_reg--;

> +

> +		/* Save scratch registers */

> +		if (ctx_reg != si->src_reg) {

> +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, ctx_reg,

> +					      offsetof(struct xdp_buff,

> +						       tmp_reg[1]));

> +

> +			*insn++ = BPF_MOV64_REG(ctx_reg, si->src_reg);

> +		}

> +

> +		*insn++ = BPF_STX_MEM(BPF_DW, ctx_reg, scratch_reg,

> +				      offsetof(struct xdp_buff, tmp_reg[0]));


Why don't you push regs to stack, use it and then pop it back? That way I
suppose you could avoid polluting xdp_buff with tmp_reg[2].

> +

> +		/* What does this code do?

> +		 *   dst_reg = 0

> +		 *

> +		 *   if (!ctx_reg->mb)

> +		 *      goto no_mb:

> +		 *

> +		 *   dst_reg = (struct xdp_shared_info *)xdp_data_hard_end(xdp)

> +		 *   dst_reg = dst_reg->data_length

> +		 *

> +		 * NOTE: xdp_data_hard_end() is xdp->hard_start +

> +		 *       xdp->frame_sz - sizeof(shared_info)

> +		 *

> +		 * no_mb:

> +		 *   dst_reg += ctx_reg->data_end - ctx_reg->data

> +		 */

> +		*insn++ = BPF_MOV64_IMM(dst_reg, 0);

> +

> +		*insn++ = BPF_LDX_MEM(BPF_W, scratch_reg, ctx_reg, MB_OFFSET());

> +		*insn++ = BPF_ALU32_IMM(BPF_AND, scratch_reg, MB_MASK);

> +		*insn++ = BPF_JMP_IMM(BPF_JEQ, scratch_reg, 0, 7); /*goto no_mb; */

> +

> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff,

> +						       data_hard_start),

> +				      dst_reg, ctx_reg,

> +				      offsetof(struct xdp_buff, data_hard_start));

> +		*insn++ = BPF_LDX_MEM(BPF_W, scratch_reg, ctx_reg,

> +				      FRAME_SZ_OFFSET());

> +		*insn++ = BPF_ALU32_IMM(BPF_AND, scratch_reg, FRAME_SZ_MASK);

> +		*insn++ = BPF_ALU32_IMM(BPF_RSH, scratch_reg, FRAME_SZ_SHIFT);

> +		*insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, scratch_reg);

> +		*insn++ = BPF_ALU64_IMM(BPF_SUB, dst_reg,

> +					SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));

> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_shared_info,

> +						       data_length),

> +				      dst_reg, dst_reg,

> +				      offsetof(struct xdp_shared_info,

> +					       data_length));

> +

> +		/* no_mb: */

> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),

> +				      scratch_reg, ctx_reg,

> +				      offsetof(struct xdp_buff, data_end));

> +		*insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, scratch_reg);

> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),

> +				      scratch_reg, ctx_reg,

> +				      offsetof(struct xdp_buff, data));

> +		*insn++ = BPF_ALU64_REG(BPF_SUB, dst_reg, scratch_reg);

> +

> +		/* Restore scratch registers */

> +		*insn++ = BPF_LDX_MEM(BPF_DW, scratch_reg, ctx_reg,

> +				      offsetof(struct xdp_buff, tmp_reg[0]));

> +

> +		if (ctx_reg != si->src_reg)

> +			*insn++ = BPF_LDX_MEM(BPF_DW, ctx_reg, ctx_reg,

> +					      offsetof(struct xdp_buff,

> +						       tmp_reg[1]));

> +		break;

>  	}

>  

>  	return insn - insn_buf;

> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h

> index 30b477a26482..62c50ab28ea9 100644

> --- a/tools/include/uapi/linux/bpf.h

> +++ b/tools/include/uapi/linux/bpf.h

> @@ -4380,6 +4380,7 @@ struct xdp_md {

>  	__u32 rx_queue_index;  /* rxq->queue_index  */

>  

>  	__u32 egress_ifindex;  /* txq->dev->ifindex */

> +	__u32 frame_length;

>  };

>  

>  /* DEVMAP map-value layout

> -- 

> 2.28.0

>
Eelco Chaudron Dec. 9, 2020, 10:35 a.m. UTC | #2
On 8 Dec 2020, at 23:17, Maciej Fijalkowski wrote:

> On Mon, Dec 07, 2020 at 05:32:42PM +0100, Lorenzo Bianconi wrote:

>> From: Eelco Chaudron <echaudro@redhat.com>

>>

>> This patch adds a new field to the XDP context called frame_length,

>> which will hold the full length of the packet, including fragments

>> if existing.

>

> The approach you took for ctx access conversion is barely described :/


You are right, I should have added some details on why I have chosen to 
take this approach. The reason is, to avoid a dedicated entry in the 
xdp_frame structure and maintaining it in the various eBPF helpers.

I'll update the commit message in the next revision to include this.

>>

>> eBPF programs can determine if fragments are present using something

>> like:

>>

>>   if (ctx->data_end - ctx->data < ctx->frame_length) {

>>     /* Fragements exists. /*

>>   }

>>

>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>

>> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>

>> ---

>>  include/net/xdp.h              | 22 +++++++++

>>  include/uapi/linux/bpf.h       |  1 +

>>  kernel/bpf/verifier.c          |  2 +-

>>  net/core/filter.c              | 83 

>> ++++++++++++++++++++++++++++++++++

>>  tools/include/uapi/linux/bpf.h |  1 +

>>  5 files changed, 108 insertions(+), 1 deletion(-)

>>

>> diff --git a/include/net/xdp.h b/include/net/xdp.h

>> index 09078ab6644c..e54d733c90ed 100644

>> --- a/include/net/xdp.h

>> +++ b/include/net/xdp.h

>> @@ -73,8 +73,30 @@ struct xdp_buff {

>>  	void *data_hard_start;

>>  	struct xdp_rxq_info *rxq;

>>  	struct xdp_txq_info *txq;

>> +	/* If any of the bitfield lengths for frame_sz or mb below change,

>> +	 * make sure the defines here are also updated!

>> +	 */

>> +#ifdef __BIG_ENDIAN_BITFIELD

>> +#define MB_SHIFT	  0

>> +#define MB_MASK		  0x00000001

>> +#define FRAME_SZ_SHIFT	  1

>> +#define FRAME_SZ_MASK	  0xfffffffe

>> +#else

>> +#define MB_SHIFT	  31

>> +#define MB_MASK		  0x80000000

>> +#define FRAME_SZ_SHIFT	  0

>> +#define FRAME_SZ_MASK	  0x7fffffff

>> +#endif

>> +#define FRAME_SZ_OFFSET() offsetof(struct xdp_buff, 

>> __u32_bit_fields_offset)

>> +#define MB_OFFSET()	  offsetof(struct xdp_buff, 

>> __u32_bit_fields_offset)

>> +	/* private: */

>> +	u32 __u32_bit_fields_offset[0];

>

> Why? I don't get that. Please explain.


I was trying to find an easy way to extract the data/fields, maybe using 
BTF but had no luck.
So I resorted back to an existing approach in sk_buff, see 
https://elixir.bootlin.com/linux/v5.10-rc7/source/include/linux/skbuff.h#L780

> Also, looking at all the need for masking/shifting, I wonder if it 

> would

> be better to have u32 frame_sz and u8 mb...


Yes, I agree having u32 would be way better, even for u32 for the mb 
field. I’ve seen other code converting flags to u32 for easy access in 
the eBPF context structures.

I see there are some comments in general on the bit definitions for 
mb, but I’ll try to convince them to use u32 for both in the next 
revision, as I think for the xdp_buff structure size is not a real 
problem ;)

>> +	/* public: */

>>  	u32 frame_sz:31; /* frame size to deduce data_hard_end/reserved 

>> tailroom*/

>>  	u32 mb:1; /* xdp non-linear buffer */

>> +

>> +	/* Temporary registers to make conditional access/stores possible. 

>> */

>> +	u64 tmp_reg[2];

>

> IMHO this kills the bitfield approach we have for vars above.


See above…

>>  };

>>

>>  /* Reserve memory area at end-of data area.

>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

>> index 30b477a26482..62c50ab28ea9 100644

>> --- a/include/uapi/linux/bpf.h

>> +++ b/include/uapi/linux/bpf.h

>> @@ -4380,6 +4380,7 @@ struct xdp_md {

>>  	__u32 rx_queue_index;  /* rxq->queue_index  */

>>

>>  	__u32 egress_ifindex;  /* txq->dev->ifindex */

>> +	__u32 frame_length;

>>  };

>>

>>  /* DEVMAP map-value layout

>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c

>> index 93def76cf32b..c50caea29fa2 100644

>> --- a/kernel/bpf/verifier.c

>> +++ b/kernel/bpf/verifier.c

>> @@ -10526,7 +10526,7 @@ static int convert_ctx_accesses(struct 

>> bpf_verifier_env *env)

>>  	const struct bpf_verifier_ops *ops = env->ops;

>>  	int i, cnt, size, ctx_field_size, delta = 0;

>>  	const int insn_cnt = env->prog->len;

>> -	struct bpf_insn insn_buf[16], *insn;

>> +	struct bpf_insn insn_buf[32], *insn;

>>  	u32 target_size, size_default, off;

>>  	struct bpf_prog *new_prog;

>>  	enum bpf_access_type type;

>> diff --git a/net/core/filter.c b/net/core/filter.c

>> index 4c4882d4d92c..278640db9e0a 100644

>> --- a/net/core/filter.c

>> +++ b/net/core/filter.c

>> @@ -8908,6 +8908,7 @@ static u32 xdp_convert_ctx_access(enum 

>> bpf_access_type type,

>>  				  struct bpf_insn *insn_buf,

>>  				  struct bpf_prog *prog, u32 *target_size)

>>  {

>> +	int ctx_reg, dst_reg, scratch_reg;

>>  	struct bpf_insn *insn = insn_buf;

>>

>>  	switch (si->off) {

>> @@ -8954,6 +8955,88 @@ static u32 xdp_convert_ctx_access(enum 

>> bpf_access_type type,

>>  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,

>>  				      offsetof(struct net_device, ifindex));

>>  		break;

>> +	case offsetof(struct xdp_md, frame_length):

>> +		/* Need tmp storage for src_reg in case src_reg == dst_reg,

>> +		 * and a scratch reg */

>> +		scratch_reg = BPF_REG_9;

>> +		dst_reg = si->dst_reg;

>> +

>> +		if (dst_reg == scratch_reg)

>> +			scratch_reg--;

>> +

>> +		ctx_reg = (si->src_reg == si->dst_reg) ? scratch_reg - 1 : 

>> si->src_reg;

>> +		while (dst_reg == ctx_reg || scratch_reg == ctx_reg)

>> +			ctx_reg--;

>> +

>> +		/* Save scratch registers */

>> +		if (ctx_reg != si->src_reg) {

>> +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, ctx_reg,

>> +					      offsetof(struct xdp_buff,

>> +						       tmp_reg[1]));

>> +

>> +			*insn++ = BPF_MOV64_REG(ctx_reg, si->src_reg);

>> +		}

>> +

>> +		*insn++ = BPF_STX_MEM(BPF_DW, ctx_reg, scratch_reg,

>> +				      offsetof(struct xdp_buff, tmp_reg[0]));

>

> Why don't you push regs to stack, use it and then pop it back? That 

> way I

> suppose you could avoid polluting xdp_buff with tmp_reg[2].


There is no “real” stack in eBPF, only a read-only frame pointer, 
and as we are replacing a single instruction, we have no info on what we 
can use as scratch space.

>> +

>> +		/* What does this code do?

>> +		 *   dst_reg = 0

>> +		 *

>> +		 *   if (!ctx_reg->mb)

>> +		 *      goto no_mb:

>> +		 *

>> +		 *   dst_reg = (struct xdp_shared_info *)xdp_data_hard_end(xdp)

>> +		 *   dst_reg = dst_reg->data_length

>> +		 *

>> +		 * NOTE: xdp_data_hard_end() is xdp->hard_start +

>> +		 *       xdp->frame_sz - sizeof(shared_info)

>> +		 *

>> +		 * no_mb:

>> +		 *   dst_reg += ctx_reg->data_end - ctx_reg->data

>> +		 */

>> +		*insn++ = BPF_MOV64_IMM(dst_reg, 0);

>> +

>> +		*insn++ = BPF_LDX_MEM(BPF_W, scratch_reg, ctx_reg, MB_OFFSET());

>> +		*insn++ = BPF_ALU32_IMM(BPF_AND, scratch_reg, MB_MASK);

>> +		*insn++ = BPF_JMP_IMM(BPF_JEQ, scratch_reg, 0, 7); /*goto no_mb; 

>> */

>> +

>> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff,

>> +						       data_hard_start),

>> +				      dst_reg, ctx_reg,

>> +				      offsetof(struct xdp_buff, data_hard_start));

>> +		*insn++ = BPF_LDX_MEM(BPF_W, scratch_reg, ctx_reg,

>> +				      FRAME_SZ_OFFSET());

>> +		*insn++ = BPF_ALU32_IMM(BPF_AND, scratch_reg, FRAME_SZ_MASK);

>> +		*insn++ = BPF_ALU32_IMM(BPF_RSH, scratch_reg, FRAME_SZ_SHIFT);

>> +		*insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, scratch_reg);

>> +		*insn++ = BPF_ALU64_IMM(BPF_SUB, dst_reg,

>> +					SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));

>> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_shared_info,

>> +						       data_length),

>> +				      dst_reg, dst_reg,

>> +				      offsetof(struct xdp_shared_info,

>> +					       data_length));

>> +

>> +		/* no_mb: */

>> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),

>> +				      scratch_reg, ctx_reg,

>> +				      offsetof(struct xdp_buff, data_end));

>> +		*insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, scratch_reg);

>> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),

>> +				      scratch_reg, ctx_reg,

>> +				      offsetof(struct xdp_buff, data));

>> +		*insn++ = BPF_ALU64_REG(BPF_SUB, dst_reg, scratch_reg);

>> +

>> +		/* Restore scratch registers */

>> +		*insn++ = BPF_LDX_MEM(BPF_DW, scratch_reg, ctx_reg,

>> +				      offsetof(struct xdp_buff, tmp_reg[0]));

>> +

>> +		if (ctx_reg != si->src_reg)

>> +			*insn++ = BPF_LDX_MEM(BPF_DW, ctx_reg, ctx_reg,

>> +					      offsetof(struct xdp_buff,

>> +						       tmp_reg[1]));

>> +		break;

>>  	}

>>

>>  	return insn - insn_buf;

>> diff --git a/tools/include/uapi/linux/bpf.h 

>> b/tools/include/uapi/linux/bpf.h

>> index 30b477a26482..62c50ab28ea9 100644

>> --- a/tools/include/uapi/linux/bpf.h

>> +++ b/tools/include/uapi/linux/bpf.h

>> @@ -4380,6 +4380,7 @@ struct xdp_md {

>>  	__u32 rx_queue_index;  /* rxq->queue_index  */

>>

>>  	__u32 egress_ifindex;  /* txq->dev->ifindex */

>> +	__u32 frame_length;

>>  };

>>

>>  /* DEVMAP map-value layout

>> -- 

>> 2.28.0

>>
Maciej Fijalkowski Dec. 9, 2020, 11:10 a.m. UTC | #3
On Wed, Dec 09, 2020 at 11:35:13AM +0100, Eelco Chaudron wrote:
> 

> 

> On 8 Dec 2020, at 23:17, Maciej Fijalkowski wrote:

> 

> > On Mon, Dec 07, 2020 at 05:32:42PM +0100, Lorenzo Bianconi wrote:

> > > From: Eelco Chaudron <echaudro@redhat.com>

> > > 

> > > This patch adds a new field to the XDP context called frame_length,

> > > which will hold the full length of the packet, including fragments

> > > if existing.

> > 

> > The approach you took for ctx access conversion is barely described :/

> 

> You are right, I should have added some details on why I have chosen to take

> this approach. The reason is, to avoid a dedicated entry in the xdp_frame

> structure and maintaining it in the various eBPF helpers.

> 

> I'll update the commit message in the next revision to include this.

> 

> > > 

> > > eBPF programs can determine if fragments are present using something

> > > like:

> > > 

> > >   if (ctx->data_end - ctx->data < ctx->frame_length) {

> > >     /* Fragements exists. /*

> > >   }

> > > 

> > > Signed-off-by: Eelco Chaudron <echaudro@redhat.com>

> > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>

> > > ---

> > >  include/net/xdp.h              | 22 +++++++++

> > >  include/uapi/linux/bpf.h       |  1 +

> > >  kernel/bpf/verifier.c          |  2 +-

> > >  net/core/filter.c              | 83

> > > ++++++++++++++++++++++++++++++++++

> > >  tools/include/uapi/linux/bpf.h |  1 +

> > >  5 files changed, 108 insertions(+), 1 deletion(-)

> > > 

> > > diff --git a/include/net/xdp.h b/include/net/xdp.h

> > > index 09078ab6644c..e54d733c90ed 100644

> > > --- a/include/net/xdp.h

> > > +++ b/include/net/xdp.h

> > > @@ -73,8 +73,30 @@ struct xdp_buff {

> > >  	void *data_hard_start;

> > >  	struct xdp_rxq_info *rxq;

> > >  	struct xdp_txq_info *txq;

> > > +	/* If any of the bitfield lengths for frame_sz or mb below change,

> > > +	 * make sure the defines here are also updated!

> > > +	 */

> > > +#ifdef __BIG_ENDIAN_BITFIELD

> > > +#define MB_SHIFT	  0

> > > +#define MB_MASK		  0x00000001

> > > +#define FRAME_SZ_SHIFT	  1

> > > +#define FRAME_SZ_MASK	  0xfffffffe

> > > +#else

> > > +#define MB_SHIFT	  31

> > > +#define MB_MASK		  0x80000000

> > > +#define FRAME_SZ_SHIFT	  0

> > > +#define FRAME_SZ_MASK	  0x7fffffff

> > > +#endif

> > > +#define FRAME_SZ_OFFSET() offsetof(struct xdp_buff,

> > > __u32_bit_fields_offset)

> > > +#define MB_OFFSET()	  offsetof(struct xdp_buff,

> > > __u32_bit_fields_offset)

> > > +	/* private: */

> > > +	u32 __u32_bit_fields_offset[0];

> > 

> > Why? I don't get that. Please explain.

> 

> I was trying to find an easy way to extract the data/fields, maybe using BTF

> but had no luck.

> So I resorted back to an existing approach in sk_buff, see

> https://elixir.bootlin.com/linux/v5.10-rc7/source/include/linux/skbuff.h#L780

> 

> > Also, looking at all the need for masking/shifting, I wonder if it would

> > be better to have u32 frame_sz and u8 mb...

> 

> Yes, I agree having u32 would be way better, even for u32 for the mb field.

> I’ve seen other code converting flags to u32 for easy access in the eBPF

> context structures.

> 

> I’ll see there are some comments in general on the bit definitions for mb,

> but I’ll try to convince them to use u32 for both in the next revision, as I

> think for the xdp_buff structure size is not a real problem ;)


Generally people were really strict on xdp_buff extensions as we didn't
want to end up with another skb-like monster. I think Jesper somewhere
said that one cacheline is max for that. With your tmp_reg[2] you exceed
that from what I see, but I might be short on coffee.

> 

> > > +	/* public: */

> > >  	u32 frame_sz:31; /* frame size to deduce data_hard_end/reserved

> > > tailroom*/

> > >  	u32 mb:1; /* xdp non-linear buffer */

> > > +

> > > +	/* Temporary registers to make conditional access/stores possible.

> > > */

> > > +	u64 tmp_reg[2];

> > 

> > IMHO this kills the bitfield approach we have for vars above.

> 

> See above…

> 

> > >  };

> > > 

> > >  /* Reserve memory area at end-of data area.

> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

> > > index 30b477a26482..62c50ab28ea9 100644

> > > --- a/include/uapi/linux/bpf.h

> > > +++ b/include/uapi/linux/bpf.h

> > > @@ -4380,6 +4380,7 @@ struct xdp_md {

> > >  	__u32 rx_queue_index;  /* rxq->queue_index  */

> > > 

> > >  	__u32 egress_ifindex;  /* txq->dev->ifindex */

> > > +	__u32 frame_length;

> > >  };

> > > 

> > >  /* DEVMAP map-value layout

> > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c

> > > index 93def76cf32b..c50caea29fa2 100644

> > > --- a/kernel/bpf/verifier.c

> > > +++ b/kernel/bpf/verifier.c

> > > @@ -10526,7 +10526,7 @@ static int convert_ctx_accesses(struct

> > > bpf_verifier_env *env)

> > >  	const struct bpf_verifier_ops *ops = env->ops;

> > >  	int i, cnt, size, ctx_field_size, delta = 0;

> > >  	const int insn_cnt = env->prog->len;

> > > -	struct bpf_insn insn_buf[16], *insn;

> > > +	struct bpf_insn insn_buf[32], *insn;

> > >  	u32 target_size, size_default, off;

> > >  	struct bpf_prog *new_prog;

> > >  	enum bpf_access_type type;

> > > diff --git a/net/core/filter.c b/net/core/filter.c

> > > index 4c4882d4d92c..278640db9e0a 100644

> > > --- a/net/core/filter.c

> > > +++ b/net/core/filter.c

> > > @@ -8908,6 +8908,7 @@ static u32 xdp_convert_ctx_access(enum

> > > bpf_access_type type,

> > >  				  struct bpf_insn *insn_buf,

> > >  				  struct bpf_prog *prog, u32 *target_size)

> > >  {

> > > +	int ctx_reg, dst_reg, scratch_reg;

> > >  	struct bpf_insn *insn = insn_buf;

> > > 

> > >  	switch (si->off) {

> > > @@ -8954,6 +8955,88 @@ static u32 xdp_convert_ctx_access(enum

> > > bpf_access_type type,

> > >  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,

> > >  				      offsetof(struct net_device, ifindex));

> > >  		break;

> > > +	case offsetof(struct xdp_md, frame_length):

> > > +		/* Need tmp storage for src_reg in case src_reg == dst_reg,

> > > +		 * and a scratch reg */

> > > +		scratch_reg = BPF_REG_9;

> > > +		dst_reg = si->dst_reg;

> > > +

> > > +		if (dst_reg == scratch_reg)

> > > +			scratch_reg--;

> > > +

> > > +		ctx_reg = (si->src_reg == si->dst_reg) ? scratch_reg - 1 :

> > > si->src_reg;

> > > +		while (dst_reg == ctx_reg || scratch_reg == ctx_reg)

> > > +			ctx_reg--;

> > > +

> > > +		/* Save scratch registers */

> > > +		if (ctx_reg != si->src_reg) {

> > > +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, ctx_reg,

> > > +					      offsetof(struct xdp_buff,

> > > +						       tmp_reg[1]));

> > > +

> > > +			*insn++ = BPF_MOV64_REG(ctx_reg, si->src_reg);

> > > +		}

> > > +

> > > +		*insn++ = BPF_STX_MEM(BPF_DW, ctx_reg, scratch_reg,

> > > +				      offsetof(struct xdp_buff, tmp_reg[0]));

> > 

> > Why don't you push regs to stack, use it and then pop it back? That way

> > I

> > suppose you could avoid polluting xdp_buff with tmp_reg[2].

> 

> There is no “real” stack in eBPF, only a read-only frame pointer, and as we

> are replacing a single instruction, we have no info on what we can use as

> scratch space.


Uhm, what? You use R10 for stack operations. Verifier tracks the stack
depth used by programs and then it is passed down to JIT so that native
asm will create a properly sized stack frame.

From the top of my head I would let know xdp_convert_ctx_access of a
current stack depth and use it for R10 stores, so your scratch space would
be R10 + (stack depth + 8), R10 + (stack_depth + 16).

Problem with that would be the fact that convert_ctx_accesses() happens to
be called after the check_max_stack_depth(), so probably stack_depth of a
prog that has frame_length accesses would have to be adjusted earlier.

> 

> > > +

> > > +		/* What does this code do?

> > > +		 *   dst_reg = 0

> > > +		 *

> > > +		 *   if (!ctx_reg->mb)

> > > +		 *      goto no_mb:

> > > +		 *

> > > +		 *   dst_reg = (struct xdp_shared_info *)xdp_data_hard_end(xdp)

> > > +		 *   dst_reg = dst_reg->data_length

> > > +		 *

> > > +		 * NOTE: xdp_data_hard_end() is xdp->hard_start +

> > > +		 *       xdp->frame_sz - sizeof(shared_info)

> > > +		 *

> > > +		 * no_mb:

> > > +		 *   dst_reg += ctx_reg->data_end - ctx_reg->data

> > > +		 */

> > > +		*insn++ = BPF_MOV64_IMM(dst_reg, 0);

> > > +

> > > +		*insn++ = BPF_LDX_MEM(BPF_W, scratch_reg, ctx_reg, MB_OFFSET());

> > > +		*insn++ = BPF_ALU32_IMM(BPF_AND, scratch_reg, MB_MASK);

> > > +		*insn++ = BPF_JMP_IMM(BPF_JEQ, scratch_reg, 0, 7); /*goto no_mb;

> > > */

> > > +

> > > +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff,

> > > +						       data_hard_start),

> > > +				      dst_reg, ctx_reg,

> > > +				      offsetof(struct xdp_buff, data_hard_start));

> > > +		*insn++ = BPF_LDX_MEM(BPF_W, scratch_reg, ctx_reg,

> > > +				      FRAME_SZ_OFFSET());

> > > +		*insn++ = BPF_ALU32_IMM(BPF_AND, scratch_reg, FRAME_SZ_MASK);

> > > +		*insn++ = BPF_ALU32_IMM(BPF_RSH, scratch_reg, FRAME_SZ_SHIFT);

> > > +		*insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, scratch_reg);

> > > +		*insn++ = BPF_ALU64_IMM(BPF_SUB, dst_reg,

> > > +					SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));

> > > +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_shared_info,

> > > +						       data_length),

> > > +				      dst_reg, dst_reg,

> > > +				      offsetof(struct xdp_shared_info,

> > > +					       data_length));

> > > +

> > > +		/* no_mb: */

> > > +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),

> > > +				      scratch_reg, ctx_reg,

> > > +				      offsetof(struct xdp_buff, data_end));

> > > +		*insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, scratch_reg);

> > > +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),

> > > +				      scratch_reg, ctx_reg,

> > > +				      offsetof(struct xdp_buff, data));

> > > +		*insn++ = BPF_ALU64_REG(BPF_SUB, dst_reg, scratch_reg);

> > > +

> > > +		/* Restore scratch registers */

> > > +		*insn++ = BPF_LDX_MEM(BPF_DW, scratch_reg, ctx_reg,

> > > +				      offsetof(struct xdp_buff, tmp_reg[0]));

> > > +

> > > +		if (ctx_reg != si->src_reg)

> > > +			*insn++ = BPF_LDX_MEM(BPF_DW, ctx_reg, ctx_reg,

> > > +					      offsetof(struct xdp_buff,

> > > +						       tmp_reg[1]));

> > > +		break;

> > >  	}

> > > 

> > >  	return insn - insn_buf;

> > > diff --git a/tools/include/uapi/linux/bpf.h

> > > b/tools/include/uapi/linux/bpf.h

> > > index 30b477a26482..62c50ab28ea9 100644

> > > --- a/tools/include/uapi/linux/bpf.h

> > > +++ b/tools/include/uapi/linux/bpf.h

> > > @@ -4380,6 +4380,7 @@ struct xdp_md {

> > >  	__u32 rx_queue_index;  /* rxq->queue_index  */

> > > 

> > >  	__u32 egress_ifindex;  /* txq->dev->ifindex */

> > > +	__u32 frame_length;

> > >  };

> > > 

> > >  /* DEVMAP map-value layout

> > > -- 

> > > 2.28.0

> > > 

>
Eelco Chaudron Dec. 9, 2020, 12:07 p.m. UTC | #4
On 9 Dec 2020, at 12:10, Maciej Fijalkowski wrote:

> On Wed, Dec 09, 2020 at 11:35:13AM +0100, Eelco Chaudron wrote:

>>

>>

>> On 8 Dec 2020, at 23:17, Maciej Fijalkowski wrote:

>>

>>> On Mon, Dec 07, 2020 at 05:32:42PM +0100, Lorenzo Bianconi wrote:

>>>> From: Eelco Chaudron <echaudro@redhat.com>

>>>>

>>>> This patch adds a new field to the XDP context called frame_length,

>>>> which will hold the full length of the packet, including fragments

>>>> if existing.

>>>

>>> The approach you took for ctx access conversion is barely described 

>>> :/

>>

>> You are right, I should have added some details on why I have chosen 

>> to take

>> this approach. The reason is, to avoid a dedicated entry in the 

>> xdp_frame

>> structure and maintaining it in the various eBPF helpers.

>>

>> I'll update the commit message in the next revision to include this.

>>

>>>>

>>>> eBPF programs can determine if fragments are present using 

>>>> something

>>>> like:

>>>>

>>>>   if (ctx->data_end - ctx->data < ctx->frame_length) {

>>>>     /* Fragements exists. /*

>>>>   }

>>>>

>>>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>

>>>> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>

>>>> ---

>>>>  include/net/xdp.h              | 22 +++++++++

>>>>  include/uapi/linux/bpf.h       |  1 +

>>>>  kernel/bpf/verifier.c          |  2 +-

>>>>  net/core/filter.c              | 83

>>>> ++++++++++++++++++++++++++++++++++

>>>>  tools/include/uapi/linux/bpf.h |  1 +

>>>>  5 files changed, 108 insertions(+), 1 deletion(-)

>>>>

>>>> diff --git a/include/net/xdp.h b/include/net/xdp.h

>>>> index 09078ab6644c..e54d733c90ed 100644

>>>> --- a/include/net/xdp.h

>>>> +++ b/include/net/xdp.h

>>>> @@ -73,8 +73,30 @@ struct xdp_buff {

>>>>  	void *data_hard_start;

>>>>  	struct xdp_rxq_info *rxq;

>>>>  	struct xdp_txq_info *txq;

>>>> +	/* If any of the bitfield lengths for frame_sz or mb below 

>>>> change,

>>>> +	 * make sure the defines here are also updated!

>>>> +	 */

>>>> +#ifdef __BIG_ENDIAN_BITFIELD

>>>> +#define MB_SHIFT	  0

>>>> +#define MB_MASK		  0x00000001

>>>> +#define FRAME_SZ_SHIFT	  1

>>>> +#define FRAME_SZ_MASK	  0xfffffffe

>>>> +#else

>>>> +#define MB_SHIFT	  31

>>>> +#define MB_MASK		  0x80000000

>>>> +#define FRAME_SZ_SHIFT	  0

>>>> +#define FRAME_SZ_MASK	  0x7fffffff

>>>> +#endif

>>>> +#define FRAME_SZ_OFFSET() offsetof(struct xdp_buff,

>>>> __u32_bit_fields_offset)

>>>> +#define MB_OFFSET()	  offsetof(struct xdp_buff,

>>>> __u32_bit_fields_offset)

>>>> +	/* private: */

>>>> +	u32 __u32_bit_fields_offset[0];

>>>

>>> Why? I don't get that. Please explain.

>>

>> I was trying to find an easy way to extract the data/fields, maybe 

>> using BTF

>> but had no luck.

>> So I resorted back to an existing approach in sk_buff, see

>> https://elixir.bootlin.com/linux/v5.10-rc7/source/include/linux/skbuff.h#L780

>>

>>> Also, looking at all the need for masking/shifting, I wonder if it 

>>> would

>>> be better to have u32 frame_sz and u8 mb...

>>

>> Yes, I agree having u32 would be way better, even for u32 for the mb 

>> field.

>> I’ve seen other code converting flags to u32 for easy access in the 

>> eBPF

>> context structures.

>>

>> I’ll see there are some comments in general on the bit definitions 

>> for mb,

>> but I’ll try to convince them to use u32 for both in the next 

>> revision, as I

>> think for the xdp_buff structure size is not a real problem ;)

>

> Generally people were really strict on xdp_buff extensions as we 

> didn't

> want to end up with another skb-like monster. I think Jesper somewhere

> said that one cacheline is max for that. With your tmp_reg[2] you 

> exceed

> that from what I see, but I might be short on coffee.


Guess you are right! I got confused with xdp_md, guess I did not have 
enough coffee when I replied :)

The common use case will not hit the second cache line (if src reg != 
dst reg), but it might happen.

>>

>>>> +	/* public: */

>>>>  	u32 frame_sz:31; /* frame size to deduce data_hard_end/reserved

>>>> tailroom*/

>>>>  	u32 mb:1; /* xdp non-linear buffer */

>>>> +

>>>> +	/* Temporary registers to make conditional access/stores 

>>>> possible.

>>>> */

>>>> +	u64 tmp_reg[2];

>>>

>>> IMHO this kills the bitfield approach we have for vars above.

>>

>> See above…

>>

>>>>  };

>>>>

>>>>  /* Reserve memory area at end-of data area.

>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

>>>> index 30b477a26482..62c50ab28ea9 100644

>>>> --- a/include/uapi/linux/bpf.h

>>>> +++ b/include/uapi/linux/bpf.h

>>>> @@ -4380,6 +4380,7 @@ struct xdp_md {

>>>>  	__u32 rx_queue_index;  /* rxq->queue_index  */

>>>>

>>>>  	__u32 egress_ifindex;  /* txq->dev->ifindex */

>>>> +	__u32 frame_length;

>>>>  };

>>>>

>>>>  /* DEVMAP map-value layout

>>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c

>>>> index 93def76cf32b..c50caea29fa2 100644

>>>> --- a/kernel/bpf/verifier.c

>>>> +++ b/kernel/bpf/verifier.c

>>>> @@ -10526,7 +10526,7 @@ static int convert_ctx_accesses(struct

>>>> bpf_verifier_env *env)

>>>>  	const struct bpf_verifier_ops *ops = env->ops;

>>>>  	int i, cnt, size, ctx_field_size, delta = 0;

>>>>  	const int insn_cnt = env->prog->len;

>>>> -	struct bpf_insn insn_buf[16], *insn;

>>>> +	struct bpf_insn insn_buf[32], *insn;

>>>>  	u32 target_size, size_default, off;

>>>>  	struct bpf_prog *new_prog;

>>>>  	enum bpf_access_type type;

>>>> diff --git a/net/core/filter.c b/net/core/filter.c

>>>> index 4c4882d4d92c..278640db9e0a 100644

>>>> --- a/net/core/filter.c

>>>> +++ b/net/core/filter.c

>>>> @@ -8908,6 +8908,7 @@ static u32 xdp_convert_ctx_access(enum

>>>> bpf_access_type type,

>>>>  				  struct bpf_insn *insn_buf,

>>>>  				  struct bpf_prog *prog, u32 *target_size)

>>>>  {

>>>> +	int ctx_reg, dst_reg, scratch_reg;

>>>>  	struct bpf_insn *insn = insn_buf;

>>>>

>>>>  	switch (si->off) {

>>>> @@ -8954,6 +8955,88 @@ static u32 xdp_convert_ctx_access(enum

>>>> bpf_access_type type,

>>>>  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,

>>>>  				      offsetof(struct net_device, ifindex));

>>>>  		break;

>>>> +	case offsetof(struct xdp_md, frame_length):

>>>> +		/* Need tmp storage for src_reg in case src_reg == dst_reg,

>>>> +		 * and a scratch reg */

>>>> +		scratch_reg = BPF_REG_9;

>>>> +		dst_reg = si->dst_reg;

>>>> +

>>>> +		if (dst_reg == scratch_reg)

>>>> +			scratch_reg--;

>>>> +

>>>> +		ctx_reg = (si->src_reg == si->dst_reg) ? scratch_reg - 1 :

>>>> si->src_reg;

>>>> +		while (dst_reg == ctx_reg || scratch_reg == ctx_reg)

>>>> +			ctx_reg--;

>>>> +

>>>> +		/* Save scratch registers */

>>>> +		if (ctx_reg != si->src_reg) {

>>>> +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, ctx_reg,

>>>> +					      offsetof(struct xdp_buff,

>>>> +						       tmp_reg[1]));

>>>> +

>>>> +			*insn++ = BPF_MOV64_REG(ctx_reg, si->src_reg);

>>>> +		}

>>>> +

>>>> +		*insn++ = BPF_STX_MEM(BPF_DW, ctx_reg, scratch_reg,

>>>> +				      offsetof(struct xdp_buff, tmp_reg[0]));

>>>

>>> Why don't you push regs to stack, use it and then pop it back? That 

>>> way

>>> I

>>> suppose you could avoid polluting xdp_buff with tmp_reg[2].

>>

>> There is no “real” stack in eBPF, only a read-only frame pointer, 

>> and as we

>> are replacing a single instruction, we have no info on what we can 

>> use as

>> scratch space.

>

> Uhm, what? You use R10 for stack operations. Verifier tracks the stack

> depth used by programs and then it is passed down to JIT so that 

> native

> asm will create a properly sized stack frame.

>

> From the top of my head I would let know xdp_convert_ctx_access of a

> current stack depth and use it for R10 stores, so your scratch space 

> would

> be R10 + (stack depth + 8), R10 + (stack_depth + 16).


Other instances do exactly the same, i.e. put some scratch registers in 
the underlying data structure, so I reused this approach. From the 
current information in the callback, I was not able to determine the 
current stack_depth. With "real" stack above, I meant having a pop/push 
like instruction.

I do not know the verifier code well enough, but are you suggesting I 
can get the current stack_depth from the verifier in the 
xdp_convert_ctx_access() callback? If so any pointers?

> Problem with that would be the fact that convert_ctx_accesses() 

> happens to

> be called after the check_max_stack_depth(), so probably stack_depth 

> of a

> prog that has frame_length accesses would have to be adjusted earlier.


Ack, need to learn more on the verifier part…

>>

>>>> +

>>>> +		/* What does this code do?

>>>> +		 *   dst_reg = 0

>>>> +		 *

>>>> +		 *   if (!ctx_reg->mb)

>>>> +		 *      goto no_mb:

>>>> +		 *

>>>> +		 *   dst_reg = (struct xdp_shared_info *)xdp_data_hard_end(xdp)

>>>> +		 *   dst_reg = dst_reg->data_length

>>>> +		 *

>>>> +		 * NOTE: xdp_data_hard_end() is xdp->hard_start +

>>>> +		 *       xdp->frame_sz - sizeof(shared_info)

>>>> +		 *

>>>> +		 * no_mb:

>>>> +		 *   dst_reg += ctx_reg->data_end - ctx_reg->data

>>>> +		 */

>>>> +		*insn++ = BPF_MOV64_IMM(dst_reg, 0);

>>>> +

>>>> +		*insn++ = BPF_LDX_MEM(BPF_W, scratch_reg, ctx_reg, MB_OFFSET());

>>>> +		*insn++ = BPF_ALU32_IMM(BPF_AND, scratch_reg, MB_MASK);

>>>> +		*insn++ = BPF_JMP_IMM(BPF_JEQ, scratch_reg, 0, 7); /*goto no_mb;

>>>> */

>>>> +

>>>> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff,

>>>> +						       data_hard_start),

>>>> +				      dst_reg, ctx_reg,

>>>> +				      offsetof(struct xdp_buff, data_hard_start));

>>>> +		*insn++ = BPF_LDX_MEM(BPF_W, scratch_reg, ctx_reg,

>>>> +				      FRAME_SZ_OFFSET());

>>>> +		*insn++ = BPF_ALU32_IMM(BPF_AND, scratch_reg, FRAME_SZ_MASK);

>>>> +		*insn++ = BPF_ALU32_IMM(BPF_RSH, scratch_reg, FRAME_SZ_SHIFT);

>>>> +		*insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, scratch_reg);

>>>> +		*insn++ = BPF_ALU64_IMM(BPF_SUB, dst_reg,

>>>> +					SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));

>>>> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_shared_info,

>>>> +						       data_length),

>>>> +				      dst_reg, dst_reg,

>>>> +				      offsetof(struct xdp_shared_info,

>>>> +					       data_length));

>>>> +

>>>> +		/* no_mb: */

>>>> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, 

>>>> data_end),

>>>> +				      scratch_reg, ctx_reg,

>>>> +				      offsetof(struct xdp_buff, data_end));

>>>> +		*insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, scratch_reg);

>>>> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),

>>>> +				      scratch_reg, ctx_reg,

>>>> +				      offsetof(struct xdp_buff, data));

>>>> +		*insn++ = BPF_ALU64_REG(BPF_SUB, dst_reg, scratch_reg);

>>>> +

>>>> +		/* Restore scratch registers */

>>>> +		*insn++ = BPF_LDX_MEM(BPF_DW, scratch_reg, ctx_reg,

>>>> +				      offsetof(struct xdp_buff, tmp_reg[0]));

>>>> +

>>>> +		if (ctx_reg != si->src_reg)

>>>> +			*insn++ = BPF_LDX_MEM(BPF_DW, ctx_reg, ctx_reg,

>>>> +					      offsetof(struct xdp_buff,

>>>> +						       tmp_reg[1]));

>>>> +		break;

>>>>  	}

>>>>

>>>>  	return insn - insn_buf;

>>>> diff --git a/tools/include/uapi/linux/bpf.h

>>>> b/tools/include/uapi/linux/bpf.h

>>>> index 30b477a26482..62c50ab28ea9 100644

>>>> --- a/tools/include/uapi/linux/bpf.h

>>>> +++ b/tools/include/uapi/linux/bpf.h

>>>> @@ -4380,6 +4380,7 @@ struct xdp_md {

>>>>  	__u32 rx_queue_index;  /* rxq->queue_index  */

>>>>

>>>>  	__u32 egress_ifindex;  /* txq->dev->ifindex */

>>>> +	__u32 frame_length;

>>>>  };

>>>>

>>>>  /* DEVMAP map-value layout

>>>> -- 

>>>> 2.28.0

>>>>

>>
Eelco Chaudron Dec. 15, 2020, 1:28 p.m. UTC | #5
On 9 Dec 2020, at 13:07, Eelco Chaudron wrote:

> On 9 Dec 2020, at 12:10, Maciej Fijalkowski wrote:


<SNIP>

>>>>> +

>>>>> +		ctx_reg = (si->src_reg == si->dst_reg) ? scratch_reg - 1 :

>>>>> si->src_reg;

>>>>> +		while (dst_reg == ctx_reg || scratch_reg == ctx_reg)

>>>>> +			ctx_reg--;

>>>>> +

>>>>> +		/* Save scratch registers */

>>>>> +		if (ctx_reg != si->src_reg) {

>>>>> +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, ctx_reg,

>>>>> +					      offsetof(struct xdp_buff,

>>>>> +						       tmp_reg[1]));

>>>>> +

>>>>> +			*insn++ = BPF_MOV64_REG(ctx_reg, si->src_reg);

>>>>> +		}

>>>>> +

>>>>> +		*insn++ = BPF_STX_MEM(BPF_DW, ctx_reg, scratch_reg,

>>>>> +				      offsetof(struct xdp_buff, tmp_reg[0]));

>>>>

>>>> Why don't you push regs to stack, use it and then pop it back? That 

>>>> way

>>>> I

>>>> suppose you could avoid polluting xdp_buff with tmp_reg[2].

>>>

>>> There is no “real” stack in eBPF, only a read-only frame 

>>> pointer, and as we

>>> are replacing a single instruction, we have no info on what we can 

>>> use as

>>> scratch space.

>>

>> Uhm, what? You use R10 for stack operations. Verifier tracks the 

>> stack

>> depth used by programs and then it is passed down to JIT so that 

>> native

>> asm will create a properly sized stack frame.

>>

>> From the top of my head I would let know xdp_convert_ctx_access of a

>> current stack depth and use it for R10 stores, so your scratch space 

>> would

>> be R10 + (stack depth + 8), R10 + (stack_depth + 16).

>

> Other instances do exactly the same, i.e. put some scratch registers 

> in the underlying data structure, so I reused this approach. From the 

> current information in the callback, I was not able to determine the 

> current stack_depth. With "real" stack above, I meant having a 

> pop/push like instruction.

>

> I do not know the verifier code well enough, but are you suggesting I 

> can get the current stack_depth from the verifier in the 

> xdp_convert_ctx_access() callback? If so any pointers?


Maciej any feedback on the above, i.e. getting the stack_depth in 
xdp_convert_ctx_access()?

>> Problem with that would be the fact that convert_ctx_accesses() 

>> happens to

>> be called after the check_max_stack_depth(), so probably stack_depth 

>> of a

>> prog that has frame_length accesses would have to be adjusted 

>> earlier.

>

> Ack, need to learn more on the verifier part…


<SNIP>
Maciej Fijalkowski Dec. 15, 2020, 6:06 p.m. UTC | #6
On Tue, Dec 15, 2020 at 02:28:39PM +0100, Eelco Chaudron wrote:
> 

> 

> On 9 Dec 2020, at 13:07, Eelco Chaudron wrote:

> 

> > On 9 Dec 2020, at 12:10, Maciej Fijalkowski wrote:

> 

> <SNIP>

> 

> > > > > > +

> > > > > > +		ctx_reg = (si->src_reg == si->dst_reg) ? scratch_reg - 1 :

> > > > > > si->src_reg;

> > > > > > +		while (dst_reg == ctx_reg || scratch_reg == ctx_reg)

> > > > > > +			ctx_reg--;

> > > > > > +

> > > > > > +		/* Save scratch registers */

> > > > > > +		if (ctx_reg != si->src_reg) {

> > > > > > +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, ctx_reg,

> > > > > > +					      offsetof(struct xdp_buff,

> > > > > > +						       tmp_reg[1]));

> > > > > > +

> > > > > > +			*insn++ = BPF_MOV64_REG(ctx_reg, si->src_reg);

> > > > > > +		}

> > > > > > +

> > > > > > +		*insn++ = BPF_STX_MEM(BPF_DW, ctx_reg, scratch_reg,

> > > > > > +				      offsetof(struct xdp_buff, tmp_reg[0]));

> > > > > 

> > > > > Why don't you push regs to stack, use it and then pop it

> > > > > back? That way

> > > > > I

> > > > > suppose you could avoid polluting xdp_buff with tmp_reg[2].

> > > > 

> > > > There is no “real” stack in eBPF, only a read-only frame

> > > > pointer, and as we

> > > > are replacing a single instruction, we have no info on what we

> > > > can use as

> > > > scratch space.

> > > 

> > > Uhm, what? You use R10 for stack operations. Verifier tracks the

> > > stack

> > > depth used by programs and then it is passed down to JIT so that

> > > native

> > > asm will create a properly sized stack frame.

> > > 

> > > From the top of my head I would let know xdp_convert_ctx_access of a

> > > current stack depth and use it for R10 stores, so your scratch space

> > > would

> > > be R10 + (stack depth + 8), R10 + (stack_depth + 16).

> > 

> > Other instances do exactly the same, i.e. put some scratch registers in

> > the underlying data structure, so I reused this approach. From the

> > current information in the callback, I was not able to determine the

> > current stack_depth. With "real" stack above, I meant having a pop/push

> > like instruction.

> > 

> > I do not know the verifier code well enough, but are you suggesting I

> > can get the current stack_depth from the verifier in the

> > xdp_convert_ctx_access() callback? If so any pointers?

> 

> Maciej any feedback on the above, i.e. getting the stack_depth in

> xdp_convert_ctx_access()?


Sorry. I'll try to get my head around it. If i recall correctly stack
depth is tracked per subprogram whereas convert_ctx_accesses is iterating
through *all* insns (so a prog that is not chunked onto subprogs), but
maybe we could dig up the subprog based on insn idx.

But at first, you mentioned that you took the approach from other
instances, can you point me to them?

I'd also like to hear from Daniel/Alexei/John and others their thoughts.

> 

> > > Problem with that would be the fact that convert_ctx_accesses()

> > > happens to

> > > be called after the check_max_stack_depth(), so probably stack_depth

> > > of a

> > > prog that has frame_length accesses would have to be adjusted

> > > earlier.

> > 

> > Ack, need to learn more on the verifier part…

> 

> <SNIP>

>
Eelco Chaudron Dec. 16, 2020, 2:08 p.m. UTC | #7
On 15 Dec 2020, at 19:06, Maciej Fijalkowski wrote:

> On Tue, Dec 15, 2020 at 02:28:39PM +0100, Eelco Chaudron wrote:

>>

>>

>> On 9 Dec 2020, at 13:07, Eelco Chaudron wrote:

>>

>>> On 9 Dec 2020, at 12:10, Maciej Fijalkowski wrote:

>>

>> <SNIP>

>>

>>>>>>> +

>>>>>>> +		ctx_reg = (si->src_reg == si->dst_reg) ? scratch_reg - 1 :

>>>>>>> si->src_reg;

>>>>>>> +		while (dst_reg == ctx_reg || scratch_reg == ctx_reg)

>>>>>>> +			ctx_reg--;

>>>>>>> +

>>>>>>> +		/* Save scratch registers */

>>>>>>> +		if (ctx_reg != si->src_reg) {

>>>>>>> +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, ctx_reg,

>>>>>>> +					      offsetof(struct xdp_buff,

>>>>>>> +						       tmp_reg[1]));

>>>>>>> +

>>>>>>> +			*insn++ = BPF_MOV64_REG(ctx_reg, si->src_reg);

>>>>>>> +		}

>>>>>>> +

>>>>>>> +		*insn++ = BPF_STX_MEM(BPF_DW, ctx_reg, scratch_reg,

>>>>>>> +				      offsetof(struct xdp_buff, tmp_reg[0]));

>>>>>>

>>>>>> Why don't you push regs to stack, use it and then pop it

>>>>>> back? That way

>>>>>> I

>>>>>> suppose you could avoid polluting xdp_buff with tmp_reg[2].

>>>>>

>>>>> There is no “real” stack in eBPF, only a read-only frame

>>>>> pointer, and as we

>>>>> are replacing a single instruction, we have no info on what we

>>>>> can use as

>>>>> scratch space.

>>>>

>>>> Uhm, what? You use R10 for stack operations. Verifier tracks the

>>>> stack

>>>> depth used by programs and then it is passed down to JIT so that

>>>> native

>>>> asm will create a properly sized stack frame.

>>>>

>>>> From the top of my head I would let know xdp_convert_ctx_access of a

>>>> current stack depth and use it for R10 stores, so your scratch space

>>>> would

>>>> be R10 + (stack depth + 8), R10 + (stack_depth + 16).

>>>

>>> Other instances do exactly the same, i.e. put some scratch registers in

>>> the underlying data structure, so I reused this approach. From the

>>> current information in the callback, I was not able to determine the

>>> current stack_depth. With "real" stack above, I meant having a pop/push

>>> like instruction.

>>>

>>> I do not know the verifier code well enough, but are you suggesting I

>>> can get the current stack_depth from the verifier in the

>>> xdp_convert_ctx_access() callback? If so any pointers?

>>

>> Maciej any feedback on the above, i.e. getting the stack_depth in

>> xdp_convert_ctx_access()?

>

> Sorry. I'll try to get my head around it. If i recall correctly stack

> depth is tracked per subprogram whereas convert_ctx_accesses is iterating

> through *all* insns (so a prog that is not chunked onto subprogs), but

> maybe we could dig up the subprog based on insn idx.

>

> But at first, you mentioned that you took the approach from other

> instances, can you point me to them?


Quick search found the following two (sure there is one more with two regs):

https://elixir.bootlin.com/linux/v5.10.1/source/kernel/bpf/cgroup.c#L1718
https://elixir.bootlin.com/linux/v5.10.1/source/net/core/filter.c#L8977

> I'd also like to hear from Daniel/Alexei/John and others their thoughts.


Please keep me in the loop…

>>

>>>> Problem with that would be the fact that convert_ctx_accesses()

>>>> happens to

>>>> be called after the check_max_stack_depth(), so probably stack_depth

>>>> of a

>>>> prog that has frame_length accesses would have to be adjusted

>>>> earlier.

>>>

>>> Ack, need to learn more on the verifier part…

>>

>> <SNIP>

>>
Eelco Chaudron Jan. 15, 2021, 4:36 p.m. UTC | #8
On 16 Dec 2020, at 15:08, Eelco Chaudron wrote:

> On 15 Dec 2020, at 19:06, Maciej Fijalkowski wrote:

>

>> On Tue, Dec 15, 2020 at 02:28:39PM +0100, Eelco Chaudron wrote:

>>>

>>>

>>> On 9 Dec 2020, at 13:07, Eelco Chaudron wrote:

>>>

>>>> On 9 Dec 2020, at 12:10, Maciej Fijalkowski wrote:

>>>

>>> <SNIP>

>>>

>>>>>>>> +

>>>>>>>> +		ctx_reg = (si->src_reg == si->dst_reg) ? scratch_reg - 1 :

>>>>>>>> si->src_reg;

>>>>>>>> +		while (dst_reg == ctx_reg || scratch_reg == ctx_reg)

>>>>>>>> +			ctx_reg--;

>>>>>>>> +

>>>>>>>> +		/* Save scratch registers */

>>>>>>>> +		if (ctx_reg != si->src_reg) {

>>>>>>>> +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, ctx_reg,

>>>>>>>> +					      offsetof(struct xdp_buff,

>>>>>>>> +						       tmp_reg[1]));

>>>>>>>> +

>>>>>>>> +			*insn++ = BPF_MOV64_REG(ctx_reg, si->src_reg);

>>>>>>>> +		}

>>>>>>>> +

>>>>>>>> +		*insn++ = BPF_STX_MEM(BPF_DW, ctx_reg, scratch_reg,

>>>>>>>> +				      offsetof(struct xdp_buff, tmp_reg[0]));

>>>>>>>

>>>>>>> Why don't you push regs to stack, use it and then pop it

>>>>>>> back? That way

>>>>>>> I

>>>>>>> suppose you could avoid polluting xdp_buff with tmp_reg[2].

>>>>>>

>>>>>> There is no “real” stack in eBPF, only a read-only frame

>>>>>> pointer, and as we

>>>>>> are replacing a single instruction, we have no info on what we

>>>>>> can use as

>>>>>> scratch space.

>>>>>

>>>>> Uhm, what? You use R10 for stack operations. Verifier tracks the

>>>>> stack

>>>>> depth used by programs and then it is passed down to JIT so that

>>>>> native

>>>>> asm will create a properly sized stack frame.

>>>>>

>>>>> From the top of my head I would let know xdp_convert_ctx_access of 

>>>>> a

>>>>> current stack depth and use it for R10 stores, so your scratch 

>>>>> space

>>>>> would

>>>>> be R10 + (stack depth + 8), R10 + (stack_depth + 16).

>>>>

>>>> Other instances do exactly the same, i.e. put some scratch 

>>>> registers in

>>>> the underlying data structure, so I reused this approach. From the

>>>> current information in the callback, I was not able to determine 

>>>> the

>>>> current stack_depth. With "real" stack above, I meant having a 

>>>> pop/push

>>>> like instruction.

>>>>

>>>> I do not know the verifier code well enough, but are you suggesting 

>>>> I

>>>> can get the current stack_depth from the verifier in the

>>>> xdp_convert_ctx_access() callback? If so any pointers?

>>>

>>> Maciej any feedback on the above, i.e. getting the stack_depth in

>>> xdp_convert_ctx_access()?

>>

>> Sorry. I'll try to get my head around it. If i recall correctly stack

>> depth is tracked per subprogram whereas convert_ctx_accesses is 

>> iterating

>> through *all* insns (so a prog that is not chunked onto subprogs), 

>> but

>> maybe we could dig up the subprog based on insn idx.

>>

>> But at first, you mentioned that you took the approach from other

>> instances, can you point me to them?

>

> Quick search found the following two (sure there is one more with two 

> regs):

>

> https://elixir.bootlin.com/linux/v5.10.1/source/kernel/bpf/cgroup.c#L1718

> https://elixir.bootlin.com/linux/v5.10.1/source/net/core/filter.c#L8977

>

>> I'd also like to hear from Daniel/Alexei/John and others their 

>> thoughts.

>

> Please keep me in the loop…


Any thoughts/update on the above so I can move this patchset forward?
Maciej Fijalkowski Jan. 18, 2021, 4:48 p.m. UTC | #9
On Fri, Jan 15, 2021 at 05:36:23PM +0100, Eelco Chaudron wrote:
> 

> 

> On 16 Dec 2020, at 15:08, Eelco Chaudron wrote:

> 

> > On 15 Dec 2020, at 19:06, Maciej Fijalkowski wrote:

> > 

> > > On Tue, Dec 15, 2020 at 02:28:39PM +0100, Eelco Chaudron wrote:

> > > > 

> > > > 

> > > > On 9 Dec 2020, at 13:07, Eelco Chaudron wrote:

> > > > 

> > > > > On 9 Dec 2020, at 12:10, Maciej Fijalkowski wrote:

> > > > 

> > > > <SNIP>

> > > > 

> > > > > > > > > +

> > > > > > > > > +		ctx_reg = (si->src_reg == si->dst_reg) ? scratch_reg - 1 :

> > > > > > > > > si->src_reg;

> > > > > > > > > +		while (dst_reg == ctx_reg || scratch_reg == ctx_reg)

> > > > > > > > > +			ctx_reg--;

> > > > > > > > > +

> > > > > > > > > +		/* Save scratch registers */

> > > > > > > > > +		if (ctx_reg != si->src_reg) {

> > > > > > > > > +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, ctx_reg,

> > > > > > > > > +					      offsetof(struct xdp_buff,

> > > > > > > > > +						       tmp_reg[1]));

> > > > > > > > > +

> > > > > > > > > +			*insn++ = BPF_MOV64_REG(ctx_reg, si->src_reg);

> > > > > > > > > +		}

> > > > > > > > > +

> > > > > > > > > +		*insn++ = BPF_STX_MEM(BPF_DW, ctx_reg, scratch_reg,

> > > > > > > > > +				      offsetof(struct xdp_buff, tmp_reg[0]));

> > > > > > > > 

> > > > > > > > Why don't you push regs to stack, use it and then pop it

> > > > > > > > back? That way

> > > > > > > > I

> > > > > > > > suppose you could avoid polluting xdp_buff with tmp_reg[2].

> > > > > > > 

> > > > > > > There is no “real” stack in eBPF, only a read-only frame

> > > > > > > pointer, and as we

> > > > > > > are replacing a single instruction, we have no info on what we

> > > > > > > can use as

> > > > > > > scratch space.

> > > > > > 

> > > > > > Uhm, what? You use R10 for stack operations. Verifier tracks the

> > > > > > stack

> > > > > > depth used by programs and then it is passed down to JIT so that

> > > > > > native

> > > > > > asm will create a properly sized stack frame.

> > > > > > 

> > > > > > From the top of my head I would let know

> > > > > > xdp_convert_ctx_access of a

> > > > > > current stack depth and use it for R10 stores, so your

> > > > > > scratch space

> > > > > > would

> > > > > > be R10 + (stack depth + 8), R10 + (stack_depth + 16).

> > > > > 

> > > > > Other instances do exactly the same, i.e. put some scratch

> > > > > registers in

> > > > > the underlying data structure, so I reused this approach. From the

> > > > > current information in the callback, I was not able to

> > > > > determine the

> > > > > current stack_depth. With "real" stack above, I meant having

> > > > > a pop/push

> > > > > like instruction.

> > > > > 

> > > > > I do not know the verifier code well enough, but are you

> > > > > suggesting I

> > > > > can get the current stack_depth from the verifier in the

> > > > > xdp_convert_ctx_access() callback? If so any pointers?

> > > > 

> > > > Maciej any feedback on the above, i.e. getting the stack_depth in

> > > > xdp_convert_ctx_access()?

> > > 

> > > Sorry. I'll try to get my head around it. If i recall correctly stack

> > > depth is tracked per subprogram whereas convert_ctx_accesses is

> > > iterating

> > > through *all* insns (so a prog that is not chunked onto subprogs),

> > > but

> > > maybe we could dig up the subprog based on insn idx.

> > > 

> > > But at first, you mentioned that you took the approach from other

> > > instances, can you point me to them?

> > 

> > Quick search found the following two (sure there is one more with two

> > regs):

> > 

> > https://elixir.bootlin.com/linux/v5.10.1/source/kernel/bpf/cgroup.c#L1718

> > https://elixir.bootlin.com/linux/v5.10.1/source/net/core/filter.c#L8977

> > 

> > > I'd also like to hear from Daniel/Alexei/John and others their

> > > thoughts.

> > 

> > Please keep me in the loop…

> 

> Any thoughts/update on the above so I can move this patchset forward?


Cc: John, Jesper, Bjorn

I didn't spend time thinking about it, but I still am against xdp_buff
extension for the purpose that code within this patch has.

Daniel/Alexei/John/Jesper/Bjorn,

any objections for not having the scratch registers but rather use the
stack and update the stack depth to calculate the frame length?

This seems not trivial so I really would like to have an input from better
BPF developers than me :)

> 

>
Eelco Chaudron Jan. 20, 2021, 1:20 p.m. UTC | #10
On 18 Jan 2021, at 17:48, Maciej Fijalkowski wrote:

> On Fri, Jan 15, 2021 at 05:36:23PM +0100, Eelco Chaudron wrote:

>>

>>

>> On 16 Dec 2020, at 15:08, Eelco Chaudron wrote:

>>

>>> On 15 Dec 2020, at 19:06, Maciej Fijalkowski wrote:

>>>

>>>> On Tue, Dec 15, 2020 at 02:28:39PM +0100, Eelco Chaudron wrote:

>>>>>

>>>>>

>>>>> On 9 Dec 2020, at 13:07, Eelco Chaudron wrote:

>>>>>

>>>>>> On 9 Dec 2020, at 12:10, Maciej Fijalkowski wrote:

>>>>>

>>>>> <SNIP>

>>>>>

>>>>>>>>>> +

>>>>>>>>>> +		ctx_reg = (si->src_reg == si->dst_reg) ? scratch_reg - 1 :

>>>>>>>>>> si->src_reg;

>>>>>>>>>> +		while (dst_reg == ctx_reg || scratch_reg == ctx_reg)

>>>>>>>>>> +			ctx_reg--;

>>>>>>>>>> +

>>>>>>>>>> +		/* Save scratch registers */

>>>>>>>>>> +		if (ctx_reg != si->src_reg) {

>>>>>>>>>> +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, ctx_reg,

>>>>>>>>>> +					      offsetof(struct xdp_buff,

>>>>>>>>>> +						       tmp_reg[1]));

>>>>>>>>>> +

>>>>>>>>>> +			*insn++ = BPF_MOV64_REG(ctx_reg, si->src_reg);

>>>>>>>>>> +		}

>>>>>>>>>> +

>>>>>>>>>> +		*insn++ = BPF_STX_MEM(BPF_DW, ctx_reg, scratch_reg,

>>>>>>>>>> +				      offsetof(struct xdp_buff, tmp_reg[0]));

>>>>>>>>>

>>>>>>>>> Why don't you push regs to stack, use it and then pop it

>>>>>>>>> back? That way

>>>>>>>>> I

>>>>>>>>> suppose you could avoid polluting xdp_buff with tmp_reg[2].

>>>>>>>>

>>>>>>>> There is no “real” stack in eBPF, only a read-only frame

>>>>>>>> pointer, and as we

>>>>>>>> are replacing a single instruction, we have no info on what we

>>>>>>>> can use as

>>>>>>>> scratch space.

>>>>>>>

>>>>>>> Uhm, what? You use R10 for stack operations. Verifier tracks the

>>>>>>> stack

>>>>>>> depth used by programs and then it is passed down to JIT so that

>>>>>>> native

>>>>>>> asm will create a properly sized stack frame.

>>>>>>>

>>>>>>> From the top of my head I would let know

>>>>>>> xdp_convert_ctx_access of a

>>>>>>> current stack depth and use it for R10 stores, so your

>>>>>>> scratch space

>>>>>>> would

>>>>>>> be R10 + (stack depth + 8), R10 + (stack_depth + 16).

>>>>>>

>>>>>> Other instances do exactly the same, i.e. put some scratch

>>>>>> registers in

>>>>>> the underlying data structure, so I reused this approach. From 

>>>>>> the

>>>>>> current information in the callback, I was not able to

>>>>>> determine the

>>>>>> current stack_depth. With "real" stack above, I meant having

>>>>>> a pop/push

>>>>>> like instruction.

>>>>>>

>>>>>> I do not know the verifier code well enough, but are you

>>>>>> suggesting I

>>>>>> can get the current stack_depth from the verifier in the

>>>>>> xdp_convert_ctx_access() callback? If so any pointers?

>>>>>

>>>>> Maciej any feedback on the above, i.e. getting the stack_depth in

>>>>> xdp_convert_ctx_access()?

>>>>

>>>> Sorry. I'll try to get my head around it. If i recall correctly 

>>>> stack

>>>> depth is tracked per subprogram whereas convert_ctx_accesses is

>>>> iterating

>>>> through *all* insns (so a prog that is not chunked onto subprogs),

>>>> but

>>>> maybe we could dig up the subprog based on insn idx.

>>>>

>>>> But at first, you mentioned that you took the approach from other

>>>> instances, can you point me to them?

>>>

>>> Quick search found the following two (sure there is one more with 

>>> two

>>> regs):

>>>

>>> https://elixir.bootlin.com/linux/v5.10.1/source/kernel/bpf/cgroup.c#L1718

>>> https://elixir.bootlin.com/linux/v5.10.1/source/net/core/filter.c#L8977

>>>

>>>> I'd also like to hear from Daniel/Alexei/John and others their

>>>> thoughts.

>>>

>>> Please keep me in the loop…

>>

>> Any thoughts/update on the above so I can move this patchset forward?

>

> Cc: John, Jesper, Bjorn

>

> I didn't spend time thinking about it, but I still am against xdp_buff

> extension for the purpose that code within this patch has.


Yes I agree, if we can not find an easy way to store the scratch 
registers on the stack, I’ll rework this patch to just store the total 
frame length in xdp_buff, as it will be less and still fit in one cache 
line.

> Daniel/Alexei/John/Jesper/Bjorn,

>

> any objections for not having the scratch registers but rather use the

> stack and update the stack depth to calculate the frame length?

>

> This seems not trivial so I really would like to have an input from 

> better

> BPF developers than me :)
Eelco Chaudron Feb. 1, 2021, 4 p.m. UTC | #11
On 20 Jan 2021, at 14:20, Eelco Chaudron wrote:

> On 18 Jan 2021, at 17:48, Maciej Fijalkowski wrote:

>

>> On Fri, Jan 15, 2021 at 05:36:23PM +0100, Eelco Chaudron wrote:

>>>

>>>

>>> On 16 Dec 2020, at 15:08, Eelco Chaudron wrote:

>>>

>>>> On 15 Dec 2020, at 19:06, Maciej Fijalkowski wrote:

>>>>

>>>>> On Tue, Dec 15, 2020 at 02:28:39PM +0100, Eelco Chaudron wrote:

>>>>>>

>>>>>>

>>>>>> On 9 Dec 2020, at 13:07, Eelco Chaudron wrote:

>>>>>>

>>>>>>> On 9 Dec 2020, at 12:10, Maciej Fijalkowski wrote:

>>>>>>

>>>>>> <SNIP>

>>>>>>

>>>>>>>>>>> +

>>>>>>>>>>> +		ctx_reg = (si->src_reg == si->dst_reg) ? scratch_reg - 1 

>>>>>>>>>>> :

>>>>>>>>>>> si->src_reg;

>>>>>>>>>>> +		while (dst_reg == ctx_reg || scratch_reg == ctx_reg)

>>>>>>>>>>> +			ctx_reg--;

>>>>>>>>>>> +

>>>>>>>>>>> +		/* Save scratch registers */

>>>>>>>>>>> +		if (ctx_reg != si->src_reg) {

>>>>>>>>>>> +			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, ctx_reg,

>>>>>>>>>>> +					      offsetof(struct xdp_buff,

>>>>>>>>>>> +						       tmp_reg[1]));

>>>>>>>>>>> +

>>>>>>>>>>> +			*insn++ = BPF_MOV64_REG(ctx_reg, si->src_reg);

>>>>>>>>>>> +		}

>>>>>>>>>>> +

>>>>>>>>>>> +		*insn++ = BPF_STX_MEM(BPF_DW, ctx_reg, scratch_reg,

>>>>>>>>>>> +				      offsetof(struct xdp_buff, tmp_reg[0]));

>>>>>>>>>>

>>>>>>>>>> Why don't you push regs to stack, use it and then pop it

>>>>>>>>>> back? That way

>>>>>>>>>> I

>>>>>>>>>> suppose you could avoid polluting xdp_buff with tmp_reg[2].

>>>>>>>>>

>>>>>>>>> There is no “real” stack in eBPF, only a read-only frame

>>>>>>>>> pointer, and as we

>>>>>>>>> are replacing a single instruction, we have no info on what we

>>>>>>>>> can use as

>>>>>>>>> scratch space.

>>>>>>>>

>>>>>>>> Uhm, what? You use R10 for stack operations. Verifier tracks 

>>>>>>>> the

>>>>>>>> stack

>>>>>>>> depth used by programs and then it is passed down to JIT so 

>>>>>>>> that

>>>>>>>> native

>>>>>>>> asm will create a properly sized stack frame.

>>>>>>>>

>>>>>>>> From the top of my head I would let know

>>>>>>>> xdp_convert_ctx_access of a

>>>>>>>> current stack depth and use it for R10 stores, so your

>>>>>>>> scratch space

>>>>>>>> would

>>>>>>>> be R10 + (stack depth + 8), R10 + (stack_depth + 16).

>>>>>>>

>>>>>>> Other instances do exactly the same, i.e. put some scratch

>>>>>>> registers in

>>>>>>> the underlying data structure, so I reused this approach. From 

>>>>>>> the

>>>>>>> current information in the callback, I was not able to

>>>>>>> determine the

>>>>>>> current stack_depth. With "real" stack above, I meant having

>>>>>>> a pop/push

>>>>>>> like instruction.

>>>>>>>

>>>>>>> I do not know the verifier code well enough, but are you

>>>>>>> suggesting I

>>>>>>> can get the current stack_depth from the verifier in the

>>>>>>> xdp_convert_ctx_access() callback? If so any pointers?

>>>>>>

>>>>>> Maciej any feedback on the above, i.e. getting the stack_depth in

>>>>>> xdp_convert_ctx_access()?

>>>>>

>>>>> Sorry. I'll try to get my head around it. If i recall correctly 

>>>>> stack

>>>>> depth is tracked per subprogram whereas convert_ctx_accesses is

>>>>> iterating

>>>>> through *all* insns (so a prog that is not chunked onto subprogs),

>>>>> but

>>>>> maybe we could dig up the subprog based on insn idx.

>>>>>

>>>>> But at first, you mentioned that you took the approach from other

>>>>> instances, can you point me to them?

>>>>

>>>> Quick search found the following two (sure there is one more with 

>>>> two

>>>> regs):

>>>>

>>>> https://elixir.bootlin.com/linux/v5.10.1/source/kernel/bpf/cgroup.c#L1718

>>>> https://elixir.bootlin.com/linux/v5.10.1/source/net/core/filter.c#L8977

>>>>

>>>>> I'd also like to hear from Daniel/Alexei/John and others their

>>>>> thoughts.

>>>>

>>>> Please keep me in the loop…

>>>

>>> Any thoughts/update on the above so I can move this patchset 

>>> forward?

>>

>> Cc: John, Jesper, Bjorn

>>

>> I didn't spend time thinking about it, but I still am against 

>> xdp_buff

>> extension for the purpose that code within this patch has.

>

> Yes I agree, if we can not find an easy way to store the scratch 

> registers on the stack, I’ll rework this patch to just store the 

> total frame length in xdp_buff, as it will be less and still fit in 

> one cache line.

>

>> Daniel/Alexei/John/Jesper/Bjorn,


Daniel/Alexei, any input on how to easily allocate two scratch registers 
on the stack from a function like xdp_convert_ctx_access() through the 
verifier state? See above for some more details.

If you are not the right persons, who might be the verifier guru to ask?

>> any objections for not having the scratch registers but rather use 

>> the

>> stack and update the stack depth to calculate the frame length?

>>

>> This seems not trivial so I really would like to have an input from 

>> better

>> BPF developers than me :)
diff mbox series

Patch

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 09078ab6644c..e54d733c90ed 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -73,8 +73,30 @@  struct xdp_buff {
 	void *data_hard_start;
 	struct xdp_rxq_info *rxq;
 	struct xdp_txq_info *txq;
+	/* If any of the bitfield lengths for frame_sz or mb below change,
+	 * make sure the defines here are also updated!
+	 */
+#ifdef __BIG_ENDIAN_BITFIELD
+#define MB_SHIFT	  0
+#define MB_MASK		  0x00000001
+#define FRAME_SZ_SHIFT	  1
+#define FRAME_SZ_MASK	  0xfffffffe
+#else
+#define MB_SHIFT	  31
+#define MB_MASK		  0x80000000
+#define FRAME_SZ_SHIFT	  0
+#define FRAME_SZ_MASK	  0x7fffffff
+#endif
+#define FRAME_SZ_OFFSET() offsetof(struct xdp_buff, __u32_bit_fields_offset)
+#define MB_OFFSET()	  offsetof(struct xdp_buff, __u32_bit_fields_offset)
+	/* private: */
+	u32 __u32_bit_fields_offset[0];
+	/* public: */
 	u32 frame_sz:31; /* frame size to deduce data_hard_end/reserved tailroom*/
 	u32 mb:1; /* xdp non-linear buffer */
+
+	/* Temporary registers to make conditional access/stores possible. */
+	u64 tmp_reg[2];
 };
 
 /* Reserve memory area at end-of data area.
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 30b477a26482..62c50ab28ea9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4380,6 +4380,7 @@  struct xdp_md {
 	__u32 rx_queue_index;  /* rxq->queue_index  */
 
 	__u32 egress_ifindex;  /* txq->dev->ifindex */
+	__u32 frame_length;
 };
 
 /* DEVMAP map-value layout
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 93def76cf32b..c50caea29fa2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10526,7 +10526,7 @@  static int convert_ctx_accesses(struct bpf_verifier_env *env)
 	const struct bpf_verifier_ops *ops = env->ops;
 	int i, cnt, size, ctx_field_size, delta = 0;
 	const int insn_cnt = env->prog->len;
-	struct bpf_insn insn_buf[16], *insn;
+	struct bpf_insn insn_buf[32], *insn;
 	u32 target_size, size_default, off;
 	struct bpf_prog *new_prog;
 	enum bpf_access_type type;
diff --git a/net/core/filter.c b/net/core/filter.c
index 4c4882d4d92c..278640db9e0a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8908,6 +8908,7 @@  static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				  struct bpf_insn *insn_buf,
 				  struct bpf_prog *prog, u32 *target_size)
 {
+	int ctx_reg, dst_reg, scratch_reg;
 	struct bpf_insn *insn = insn_buf;
 
 	switch (si->off) {
@@ -8954,6 +8955,88 @@  static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct net_device, ifindex));
 		break;
+	case offsetof(struct xdp_md, frame_length):
+		/* Need tmp storage for src_reg in case src_reg == dst_reg,
+		 * and a scratch reg */
+		scratch_reg = BPF_REG_9;
+		dst_reg = si->dst_reg;
+
+		if (dst_reg == scratch_reg)
+			scratch_reg--;
+
+		ctx_reg = (si->src_reg == si->dst_reg) ? scratch_reg - 1 : si->src_reg;
+		while (dst_reg == ctx_reg || scratch_reg == ctx_reg)
+			ctx_reg--;
+
+		/* Save scratch registers */
+		if (ctx_reg != si->src_reg) {
+			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, ctx_reg,
+					      offsetof(struct xdp_buff,
+						       tmp_reg[1]));
+
+			*insn++ = BPF_MOV64_REG(ctx_reg, si->src_reg);
+		}
+
+		*insn++ = BPF_STX_MEM(BPF_DW, ctx_reg, scratch_reg,
+				      offsetof(struct xdp_buff, tmp_reg[0]));
+
+		/* What does this code do?
+		 *   dst_reg = 0
+		 *
+		 *   if (!ctx_reg->mb)
+		 *      goto no_mb:
+		 *
+		 *   dst_reg = (struct xdp_shared_info *)xdp_data_hard_end(xdp)
+		 *   dst_reg = dst_reg->data_length
+		 *
+		 * NOTE: xdp_data_hard_end() is xdp->hard_start +
+		 *       xdp->frame_sz - sizeof(shared_info)
+		 *
+		 * no_mb:
+		 *   dst_reg += ctx_reg->data_end - ctx_reg->data
+		 */
+		*insn++ = BPF_MOV64_IMM(dst_reg, 0);
+
+		*insn++ = BPF_LDX_MEM(BPF_W, scratch_reg, ctx_reg, MB_OFFSET());
+		*insn++ = BPF_ALU32_IMM(BPF_AND, scratch_reg, MB_MASK);
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, scratch_reg, 0, 7); /* goto no_mb; */
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff,
+						       data_hard_start),
+				      dst_reg, ctx_reg,
+				      offsetof(struct xdp_buff, data_hard_start));
+		*insn++ = BPF_LDX_MEM(BPF_W, scratch_reg, ctx_reg,
+				      FRAME_SZ_OFFSET());
+		*insn++ = BPF_ALU32_IMM(BPF_AND, scratch_reg, FRAME_SZ_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, scratch_reg, FRAME_SZ_SHIFT);
+		*insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, scratch_reg);
+		*insn++ = BPF_ALU64_IMM(BPF_SUB, dst_reg,
+					SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_shared_info,
+						       data_length),
+				      dst_reg, dst_reg,
+				      offsetof(struct xdp_shared_info,
+					       data_length));
+
+		/* no_mb: */
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
+				      scratch_reg, ctx_reg,
+				      offsetof(struct xdp_buff, data_end));
+		*insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, scratch_reg);
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
+				      scratch_reg, ctx_reg,
+				      offsetof(struct xdp_buff, data));
+		*insn++ = BPF_ALU64_REG(BPF_SUB, dst_reg, scratch_reg);
+
+		/* Restore scratch registers */
+		*insn++ = BPF_LDX_MEM(BPF_DW, scratch_reg, ctx_reg,
+				      offsetof(struct xdp_buff, tmp_reg[0]));
+
+		if (ctx_reg != si->src_reg)
+			*insn++ = BPF_LDX_MEM(BPF_DW, ctx_reg, ctx_reg,
+					      offsetof(struct xdp_buff,
+						       tmp_reg[1]));
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 30b477a26482..62c50ab28ea9 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4380,6 +4380,7 @@  struct xdp_md {
 	__u32 rx_queue_index;  /* rxq->queue_index  */
 
 	__u32 egress_ifindex;  /* txq->dev->ifindex */
+	__u32 frame_length;
 };
 
 /* DEVMAP map-value layout