diff mbox series

[bpf-next,v2] arm64, bpf: Add 12-argument support for bpf trampoline

Message ID 20240705125336.46820-1-puranjay@kernel.org
State New
Headers show
Series [bpf-next,v2] arm64, bpf: Add 12-argument support for bpf trampoline | expand

Commit Message

Puranjay Mohan July 5, 2024, 12:53 p.m. UTC
The arm64 bpf JIT currently supports attaching the trampoline to
functions with <= 8 arguments. This is because up to 8 arguments can be
passed in registers r0-r7. If there are more than 8 arguments then the
9th and later arguments are passed on the stack, with SP pointing to the
first stacked argument. See aapcs64[1] for more details.

If the 8th argument is a structure of size > 8B, then it is passed fully
on stack and r7 is not used for passing any argument. If there is a 9th
argument, it will be passed on the stack, even though r7 is available.

Add the support of storing and restoring arguments passed on the stack
to the arm64 bpf trampoline. This will allow attaching the trampoline to
functions that take up to 12 arguments.

[1] https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#parameter-passing

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
Changes in V1 -> V2:
V1: https://lore.kernel.org/all/20240704173227.130491-1-puranjay@kernel.org/
- Fixed the argument handling for composite types (structs)
---
 arch/arm64/net/bpf_jit_comp.c                | 139 ++++++++++++++-----
 tools/testing/selftests/bpf/DENYLIST.aarch64 |   3 -
 2 files changed, 107 insertions(+), 35 deletions(-)

Comments

Xu Kuohai July 6, 2024, 9:28 a.m. UTC | #1
On 7/5/2024 8:53 PM, Puranjay Mohan wrote:
> The arm64 bpf JIT currently supports attaching the trampoline to
> functions with <= 8 arguments. This is because up to 8 arguments can be
> passed in registers r0-r7. If there are more than 8 arguments then the
> 9th and later arguments are passed on the stack, with SP pointing to the
> first stacked argument. See aapcs64[1] for more details.
> 
> If the 8th argument is a structure of size > 8B, then it is passed fully
> on stack and r7 is not used for passing any argument. If there is a 9th
> argument, it will be passed on the stack, even though r7 is available.
> 
> Add the support of storing and restoring arguments passed on the stack
> to the arm64 bpf trampoline. This will allow attaching the trampoline to
> functions that take up to 12 arguments.
> 
> [1] https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#parameter-passing
> 
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
> Changes in V1 -> V2:
> V1: https://lore.kernel.org/all/20240704173227.130491-1-puranjay@kernel.org/
> - Fixed the argument handling for composite types (structs)
> ---
>   arch/arm64/net/bpf_jit_comp.c                | 139 ++++++++++++++-----
>   tools/testing/selftests/bpf/DENYLIST.aarch64 |   3 -
>   2 files changed, 107 insertions(+), 35 deletions(-)
> 
> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> index 751331f5ba90..063bf5e11fc6 100644
> --- a/arch/arm64/net/bpf_jit_comp.c
> +++ b/arch/arm64/net/bpf_jit_comp.c
> @@ -30,6 +30,8 @@
>   #define TMP_REG_3 (MAX_BPF_JIT_REG + 3)
>   #define FP_BOTTOM (MAX_BPF_JIT_REG + 4)
>   #define ARENA_VM_START (MAX_BPF_JIT_REG + 5)
> +/* Up to eight function arguments are passed in registers r0-r7 */
> +#define ARM64_MAX_REG_ARGS 8
>   
>   #define check_imm(bits, imm) do {				\
>   	if ((((imm) > 0) && ((imm) >> (bits))) ||		\
> @@ -2001,26 +2003,51 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
>   	}
>   }
>   
> -static void save_args(struct jit_ctx *ctx, int args_off, int nregs)
> +static void save_args(struct jit_ctx *ctx, int args_off, int orig_sp_off,
> +		      int nargs, int nreg_args)
>   {
> +	const u8 tmp = bpf2a64[TMP_REG_1];
> +	int arg_pos;
>   	int i;
>   
> -	for (i = 0; i < nregs; i++) {
> -		emit(A64_STR64I(i, A64_SP, args_off), ctx);
> +	for (i = 0; i < nargs; i++) {
> +		if (i < nreg_args) {
> +			emit(A64_STR64I(i, A64_SP, args_off), ctx);
> +		} else {
> +			arg_pos = orig_sp_off + (i - nreg_args) * 8;
> +			emit(A64_LDR64I(tmp, A64_SP, arg_pos), ctx);
> +			emit(A64_STR64I(tmp, A64_SP, args_off), ctx);
> +		}
>   		args_off += 8;
>   	}
>   }
>   
> -static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
> +static void restore_args(struct jit_ctx *ctx, int args_off, int nreg_args)
>   {
>   	int i;
>   
> -	for (i = 0; i < nregs; i++) {
> +	for (i = 0; i < nreg_args; i++) {
>   		emit(A64_LDR64I(i, A64_SP, args_off), ctx);
>   		args_off += 8;
>   	}
>   }
>   
> +static void restore_stack_args(struct jit_ctx *ctx, int args_off, int stk_arg_off,
> +			       int nargs, int nreg_args)
> +{
> +	const u8 tmp = bpf2a64[TMP_REG_1];
> +	int arg_pos;
> +	int i;
> +
> +	for (i = nreg_args; i < nargs; i++) {
> +		arg_pos = args_off + i * 8;
> +		emit(A64_LDR64I(tmp, A64_SP, arg_pos), ctx);
> +		emit(A64_STR64I(tmp, A64_SP, stk_arg_off), ctx);
> +
> +		stk_arg_off += 8;
> +	}
> +}
> +
>   /* Based on the x86's implementation of arch_prepare_bpf_trampoline().
>    *
>    * bpf prog and function entry before bpf trampoline hooked:
> @@ -2034,15 +2061,17 @@ static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
>    */
>   static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>   			      struct bpf_tramp_links *tlinks, void *func_addr,
> -			      int nregs, u32 flags)
> +			      int nargs, int nreg_args, u32 flags)
>   {
>   	int i;
>   	int stack_size;
> +	int stk_arg_off;
> +	int orig_sp_off;
>   	int retaddr_off;
>   	int regs_off;
>   	int retval_off;
>   	int args_off;
> -	int nregs_off;
> +	int nargs_off;
>   	int ip_off;
>   	int run_ctx_off;
>   	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
> @@ -2052,6 +2081,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>   	__le32 **branches = NULL;
>   
>   	/* trampoline stack layout:
> +	 * SP + orig_sp_off [ first stack arg   ] if nargs > 8
>   	 *                  [ parent ip         ]
>   	 *                  [ FP                ]
>   	 * SP + retaddr_off [ self ip           ]
> @@ -2069,14 +2099,24 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>   	 *                  [ ...               ]
>   	 * SP + args_off    [ arg reg 1         ]
>   	 *
> -	 * SP + nregs_off   [ arg regs count    ]
> +	 * SP + nargs_off   [ arg count         ]
>   	 *
>   	 * SP + ip_off      [ traced function   ] BPF_TRAMP_F_IP_ARG flag
>   	 *
>   	 * SP + run_ctx_off [ bpf_tramp_run_ctx ]
> +	 *
> +	 *		    [ stack_argN	]
> +	 *		    [ ...		]
> +	 * SP + stk_arg_off [ stack_arg1	] BPF_TRAMP_F_CALL_ORIG
>   	 */
>   
>   	stack_size = 0;
> +	stk_arg_off = stack_size;
> +	if ((flags & BPF_TRAMP_F_CALL_ORIG) && (nargs - nreg_args > 0)) {
> +		/* room for saving arguments passed on stack */
> +		stack_size += (nargs - nreg_args) * 8;
> +	}
> +
>   	run_ctx_off = stack_size;
>   	/* room for bpf_tramp_run_ctx */
>   	stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8);
> @@ -2086,13 +2126,13 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>   	if (flags & BPF_TRAMP_F_IP_ARG)
>   		stack_size += 8;
>   
> -	nregs_off = stack_size;
> +	nargs_off = stack_size;
>   	/* room for args count */
>   	stack_size += 8;
>   
>   	args_off = stack_size;
>   	/* room for args */
> -	stack_size += nregs * 8;
> +	stack_size += nargs * 8;
>   
>   	/* room for return value */
>   	retval_off = stack_size;
> @@ -2110,6 +2150,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>   	/* return address locates above FP */
>   	retaddr_off = stack_size + 8;
>   
> +	/* original SP position
> +	 * stack_size + parent function frame + patched function frame
> +	 */
> +	orig_sp_off = stack_size + 32;
> +
>   	/* bpf trampoline may be invoked by 3 instruction types:
>   	 * 1. bl, attached to bpf prog or kernel function via short jump
>   	 * 2. br, attached to bpf prog or kernel function via long jump
> @@ -2135,12 +2180,12 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>   		emit(A64_STR64I(A64_R(10), A64_SP, ip_off), ctx);
>   	}
>   
> -	/* save arg regs count*/
> -	emit(A64_MOVZ(1, A64_R(10), nregs, 0), ctx);
> -	emit(A64_STR64I(A64_R(10), A64_SP, nregs_off), ctx);
> +	/* save argument count */
> +	emit(A64_MOVZ(1, A64_R(10), nargs, 0), ctx);
> +	emit(A64_STR64I(A64_R(10), A64_SP, nargs_off), ctx);
>   
> -	/* save arg regs */
> -	save_args(ctx, args_off, nregs);
> +	/* save arguments passed in regs and on the stack */
> +	save_args(ctx, args_off, orig_sp_off, nargs, nreg_args);
>   
>   	/* save callee saved registers */
>   	emit(A64_STR64I(A64_R(19), A64_SP, regs_off), ctx);
> @@ -2167,7 +2212,10 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>   	}
>   
>   	if (flags & BPF_TRAMP_F_CALL_ORIG) {
> -		restore_args(ctx, args_off, nregs);
> +		/* restore arguments that were passed in registers */
> +		restore_args(ctx, args_off, nreg_args);
> +		/* restore arguments that were passed on the stack */
> +		restore_stack_args(ctx, args_off, stk_arg_off, nargs, nreg_args);
>   		/* call original func */
>   		emit(A64_LDR64I(A64_R(10), A64_SP, retaddr_off), ctx);
>   		emit(A64_ADR(A64_LR, AARCH64_INSN_SIZE * 2), ctx);
> @@ -2196,7 +2244,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>   	}
>   
>   	if (flags & BPF_TRAMP_F_RESTORE_REGS)
> -		restore_args(ctx, args_off, nregs);
> +		restore_args(ctx, args_off, nreg_args);
>   
>   	/* restore callee saved register x19 and x20 */
>   	emit(A64_LDR64I(A64_R(19), A64_SP, regs_off), ctx);
> @@ -2228,19 +2276,42 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
>   	return ctx->idx;
>   }
>   
> -static int btf_func_model_nregs(const struct btf_func_model *m)
> +static int btf_func_model_nargs(const struct btf_func_model *m)
>   {
> -	int nregs = m->nr_args;
> +	int nargs = m->nr_args;
>   	int i;
>   
> -	/* extra registers needed for struct argument */
> +	/* extra registers or stack slots needed for struct argument */
>   	for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
>   		/* The arg_size is at most 16 bytes, enforced by the verifier. */
>   		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
> -			nregs += (m->arg_size[i] + 7) / 8 - 1;
> +			nargs += (m->arg_size[i] + 7) / 8 - 1;
>   	}
>   
> -	return nregs;
> +	return nargs;
> +}
> +
> +/* get the count of the regs that are used to pass arguments */
> +static int btf_func_model_nreg_args(const struct btf_func_model *m)
> +{
> +	int nargs = m->nr_args;
> +	int nreg_args = 0;
> +	int i;
> +
> +	for (i = 0; i < nargs; i++) {
> +		/* The arg_size is at most 16 bytes, enforced by the verifier. */
> +		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
> +			/* struct members are all in the registers or all
> +			 * on the stack.
> +			 */
> +			if (nreg_args + ((m->arg_size[i] + 7) / 8 - 1) > 7)
> +				break;
> +			nreg_args += (m->arg_size[i] + 7) / 8 - 1;
> +		}
> +		nreg_args++;
> +	}
> +
> +	return (nreg_args > ARM64_MAX_REG_ARGS ? ARM64_MAX_REG_ARGS : nreg_args);
>   }
>   
>   int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
> @@ -2251,14 +2322,16 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
>   		.idx = 0,
>   	};
>   	struct bpf_tramp_image im;
> -	int nregs, ret;
> +	int nargs, nreg_args, ret;
>   
> -	nregs = btf_func_model_nregs(m);
> -	/* the first 8 registers are used for arguments */
> -	if (nregs > 8)
> +	nargs = btf_func_model_nargs(m);
> +	if (nargs > MAX_BPF_FUNC_ARGS)
>   		return -ENOTSUPP;
>   
> -	ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, nregs, flags);
> +	nreg_args = btf_func_model_nreg_args(m);
> +
> +	ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, nargs, nreg_args,
> +				 flags);
>   	if (ret < 0)
>   		return ret;
>   
> @@ -2285,7 +2358,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
>   				u32 flags, struct bpf_tramp_links *tlinks,
>   				void *func_addr)
>   {
> -	int ret, nregs;
> +	int ret, nargs, nreg_args;
>   	void *image, *tmp;
>   	u32 size = ro_image_end - ro_image;
>   
> @@ -2302,13 +2375,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
>   		.idx = 0,
>   	};
>   
> -	nregs = btf_func_model_nregs(m);
> -	/* the first 8 registers are used for arguments */
> -	if (nregs > 8)
> +	nargs = btf_func_model_nargs(m);
> +	if (nargs > MAX_BPF_FUNC_ARGS)
>   		return -ENOTSUPP;
>   
> +	nreg_args = btf_func_model_nreg_args(m);
> +
>   	jit_fill_hole(image, (unsigned int)(ro_image_end - ro_image));
> -	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
> +	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nargs, nreg_args,
> +				 flags);
>   
>   	if (ret > 0 && validate_code(&ctx) < 0) {
>   		ret = -EINVAL;
> diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64
> index 3c7c3e79aa93..e865451e90d2 100644
> --- a/tools/testing/selftests/bpf/DENYLIST.aarch64
> +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64
> @@ -4,9 +4,6 @@ fexit_sleep                                      # The test never returns. The r
>   kprobe_multi_bench_attach                        # needs CONFIG_FPROBE
>   kprobe_multi_test                                # needs CONFIG_FPROBE
>   module_attach                                    # prog 'kprobe_multi': failed to auto-attach: -95
> -fentry_test/fentry_many_args                     # fentry_many_args:FAIL:fentry_many_args_attach unexpected error: -524
> -fexit_test/fexit_many_args                       # fexit_many_args:FAIL:fexit_many_args_attach unexpected error: -524
> -tracing_struct/struct_many_args                  # struct_many_args:FAIL:tracing_struct_many_args__attach unexpected error: -524
>   fill_link_info/kprobe_multi_link_info            # bpf_program__attach_kprobe_multi_opts unexpected error: -95
>   fill_link_info/kretprobe_multi_link_info         # bpf_program__attach_kprobe_multi_opts unexpected error: -95
>   fill_link_info/kprobe_multi_invalid_ubuff        # bpf_program__attach_kprobe_multi_opts unexpected error: -95

It looks like this patch, similar to [1], also does not handle
parameter alignment properly [2].

For example:

int func(__int128 a, __int128 b, __int128 c, int64_t d, __int128 e, int64_t f, __int128 g)
{
}

parameter a~d are passed through register x0~x6, while parameter
e~g are passed through stack. Since __int128 is 16-byte aligned,
parameter e, f, and g should be placed at sp + 0, +16, and +32
respectively, with 8 bytes **padding** between f and g.


And the compiler's alignment or packed attribute may make things
worse, causing parameters to be placed on the stack at positions
that are not naturally aligned.

[1] https://lore.kernel.org/bpf/20230917150752.69612-1-xukuohai@huaweicloud.com/
[2] https://lore.kernel.org/bpf/CABRcYmLtk8aQEzoUFw+j5Rdd-MXf-q+i7RHXZtu-skjRz11ZDw@mail.gmail.com/
Xu Kuohai July 8, 2024, 9:33 a.m. UTC | #2
On 7/8/2024 5:00 PM, Puranjay Mohan wrote:
> Xu Kuohai <xukuohai@huaweicloud.com> writes:
> 
>> On 7/5/2024 8:53 PM, Puranjay Mohan wrote:
>>> The arm64 bpf JIT currently supports attaching the trampoline to
>>> functions with <= 8 arguments. This is because up to 8 arguments can be
>>> passed in registers r0-r7. If there are more than 8 arguments then the
>>> 9th and later arguments are passed on the stack, with SP pointing to the
>>> first stacked argument. See aapcs64[1] for more details.
>>>
>>> If the 8th argument is a structure of size > 8B, then it is passed fully
>>> on stack and r7 is not used for passing any argument. If there is a 9th
>>> argument, it will be passed on the stack, even though r7 is available.
>>>
>>> Add the support of storing and restoring arguments passed on the stack
>>> to the arm64 bpf trampoline. This will allow attaching the trampoline to
>>> functions that take up to 12 arguments.
>>>
>>> [1] https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#parameter-passing
>>>
>>> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
>>> ---
>>> Changes in V1 -> V2:
>>> V1: https://lore.kernel.org/all/20240704173227.130491-1-puranjay@kernel.org/
>>> - Fixed the argument handling for composite types (structs)
>>> ---
>>>    arch/arm64/net/bpf_jit_comp.c                | 139 ++++++++++++++-----
>>>    tools/testing/selftests/bpf/DENYLIST.aarch64 |   3 -
>>>    2 files changed, 107 insertions(+), 35 deletions(-)
>>>
> 
> [SNIP]
> 
>>>    fill_link_info/kprobe_multi_invalid_ubuff        # bpf_program__attach_kprobe_multi_opts unexpected error: -95
>>
>> It looks like this patch, similar to [1], also does not handle
>> parameter alignment properly [2].
>>
>> For example:
>>
>> int func(__int128 a, __int128 b, __int128 c, int64_t d, __int128 e, int64_t f, __int128 g)
>> {
>> }
>>
>> parameter a~d are passed through register x0~x6, while parameter
>> e~g are passed through stack. Since __int128 is 16-byte aligned,
>> parameter e, f, and g should be placed at sp + 0, +16, and +32
>> respectively, with 8 bytes **padding** between f and g.
>>
>>
>> And the compiler's alignment or packed attribute may make things
>> worse, causing parameters to be placed on the stack at positions
>> that are not naturally aligned.
> 
> Hi Xu,
> Thanks for explaining this. I was not aware that you have already sent a
> patch[1] to add this support to arm64.
> 
> So, I see that it will be non-trivial to calculate padding for each
> argument passed on the stack. If you are not planning to work on this
> then I can try to implement it.
>

Sure, go ahead and do it.

> Alsoi, do we currently have a selftest that checks for this edge case? if
> not I can try to add that too.
>

Not yet, feel free to add it. Thanks.

> Thanks,
> Puranjay
> 
>> [1] https://lore.kernel.org/bpf/20230917150752.69612-1-xukuohai@huaweicloud.com/
>> [2] https://lore.kernel.org/bpf/CABRcYmLtk8aQEzoUFw+j5Rdd-MXf-q+i7RHXZtu-skjRz11ZDw@mail.gmail.com/
diff mbox series

Patch

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 751331f5ba90..063bf5e11fc6 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -30,6 +30,8 @@ 
 #define TMP_REG_3 (MAX_BPF_JIT_REG + 3)
 #define FP_BOTTOM (MAX_BPF_JIT_REG + 4)
 #define ARENA_VM_START (MAX_BPF_JIT_REG + 5)
+/* Up to eight function arguments are passed in registers r0-r7 */
+#define ARM64_MAX_REG_ARGS 8
 
 #define check_imm(bits, imm) do {				\
 	if ((((imm) > 0) && ((imm) >> (bits))) ||		\
@@ -2001,26 +2003,51 @@  static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
 	}
 }
 
-static void save_args(struct jit_ctx *ctx, int args_off, int nregs)
+static void save_args(struct jit_ctx *ctx, int args_off, int orig_sp_off,
+		      int nargs, int nreg_args)
 {
+	const u8 tmp = bpf2a64[TMP_REG_1];
+	int arg_pos;
 	int i;
 
-	for (i = 0; i < nregs; i++) {
-		emit(A64_STR64I(i, A64_SP, args_off), ctx);
+	for (i = 0; i < nargs; i++) {
+		if (i < nreg_args) {
+			emit(A64_STR64I(i, A64_SP, args_off), ctx);
+		} else {
+			arg_pos = orig_sp_off + (i - nreg_args) * 8;
+			emit(A64_LDR64I(tmp, A64_SP, arg_pos), ctx);
+			emit(A64_STR64I(tmp, A64_SP, args_off), ctx);
+		}
 		args_off += 8;
 	}
 }
 
-static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
+static void restore_args(struct jit_ctx *ctx, int args_off, int nreg_args)
 {
 	int i;
 
-	for (i = 0; i < nregs; i++) {
+	for (i = 0; i < nreg_args; i++) {
 		emit(A64_LDR64I(i, A64_SP, args_off), ctx);
 		args_off += 8;
 	}
 }
 
+static void restore_stack_args(struct jit_ctx *ctx, int args_off, int stk_arg_off,
+			       int nargs, int nreg_args)
+{
+	const u8 tmp = bpf2a64[TMP_REG_1];
+	int arg_pos;
+	int i;
+
+	for (i = nreg_args; i < nargs; i++) {
+		arg_pos = args_off + i * 8;
+		emit(A64_LDR64I(tmp, A64_SP, arg_pos), ctx);
+		emit(A64_STR64I(tmp, A64_SP, stk_arg_off), ctx);
+
+		stk_arg_off += 8;
+	}
+}
+
 /* Based on the x86's implementation of arch_prepare_bpf_trampoline().
  *
  * bpf prog and function entry before bpf trampoline hooked:
@@ -2034,15 +2061,17 @@  static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
  */
 static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 			      struct bpf_tramp_links *tlinks, void *func_addr,
-			      int nregs, u32 flags)
+			      int nargs, int nreg_args, u32 flags)
 {
 	int i;
 	int stack_size;
+	int stk_arg_off;
+	int orig_sp_off;
 	int retaddr_off;
 	int regs_off;
 	int retval_off;
 	int args_off;
-	int nregs_off;
+	int nargs_off;
 	int ip_off;
 	int run_ctx_off;
 	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
@@ -2052,6 +2081,7 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	__le32 **branches = NULL;
 
 	/* trampoline stack layout:
+	 * SP + orig_sp_off [ first stack arg   ] if nargs > 8
 	 *                  [ parent ip         ]
 	 *                  [ FP                ]
 	 * SP + retaddr_off [ self ip           ]
@@ -2069,14 +2099,24 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	 *                  [ ...               ]
 	 * SP + args_off    [ arg reg 1         ]
 	 *
-	 * SP + nregs_off   [ arg regs count    ]
+	 * SP + nargs_off   [ arg count         ]
 	 *
 	 * SP + ip_off      [ traced function   ] BPF_TRAMP_F_IP_ARG flag
 	 *
 	 * SP + run_ctx_off [ bpf_tramp_run_ctx ]
+	 *
+	 *		    [ stack_argN	]
+	 *		    [ ...		]
+	 * SP + stk_arg_off [ stack_arg1	] BPF_TRAMP_F_CALL_ORIG
 	 */
 
 	stack_size = 0;
+	stk_arg_off = stack_size;
+	if ((flags & BPF_TRAMP_F_CALL_ORIG) && (nargs - nreg_args > 0)) {
+		/* room for saving arguments passed on stack */
+		stack_size += (nargs - nreg_args) * 8;
+	}
+
 	run_ctx_off = stack_size;
 	/* room for bpf_tramp_run_ctx */
 	stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8);
@@ -2086,13 +2126,13 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	if (flags & BPF_TRAMP_F_IP_ARG)
 		stack_size += 8;
 
-	nregs_off = stack_size;
+	nargs_off = stack_size;
 	/* room for args count */
 	stack_size += 8;
 
 	args_off = stack_size;
 	/* room for args */
-	stack_size += nregs * 8;
+	stack_size += nargs * 8;
 
 	/* room for return value */
 	retval_off = stack_size;
@@ -2110,6 +2150,11 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	/* return address locates above FP */
 	retaddr_off = stack_size + 8;
 
+	/* original SP position
+	 * stack_size + parent function frame + patched function frame
+	 */
+	orig_sp_off = stack_size + 32;
+
 	/* bpf trampoline may be invoked by 3 instruction types:
 	 * 1. bl, attached to bpf prog or kernel function via short jump
 	 * 2. br, attached to bpf prog or kernel function via long jump
@@ -2135,12 +2180,12 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 		emit(A64_STR64I(A64_R(10), A64_SP, ip_off), ctx);
 	}
 
-	/* save arg regs count*/
-	emit(A64_MOVZ(1, A64_R(10), nregs, 0), ctx);
-	emit(A64_STR64I(A64_R(10), A64_SP, nregs_off), ctx);
+	/* save argument count */
+	emit(A64_MOVZ(1, A64_R(10), nargs, 0), ctx);
+	emit(A64_STR64I(A64_R(10), A64_SP, nargs_off), ctx);
 
-	/* save arg regs */
-	save_args(ctx, args_off, nregs);
+	/* save arguments passed in regs and on the stack */
+	save_args(ctx, args_off, orig_sp_off, nargs, nreg_args);
 
 	/* save callee saved registers */
 	emit(A64_STR64I(A64_R(19), A64_SP, regs_off), ctx);
@@ -2167,7 +2212,10 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	}
 
 	if (flags & BPF_TRAMP_F_CALL_ORIG) {
-		restore_args(ctx, args_off, nregs);
+		/* restore arguments that were passed in registers */
+		restore_args(ctx, args_off, nreg_args);
+		/* restore arguments that were passed on the stack */
+		restore_stack_args(ctx, args_off, stk_arg_off, nargs, nreg_args);
 		/* call original func */
 		emit(A64_LDR64I(A64_R(10), A64_SP, retaddr_off), ctx);
 		emit(A64_ADR(A64_LR, AARCH64_INSN_SIZE * 2), ctx);
@@ -2196,7 +2244,7 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	}
 
 	if (flags & BPF_TRAMP_F_RESTORE_REGS)
-		restore_args(ctx, args_off, nregs);
+		restore_args(ctx, args_off, nreg_args);
 
 	/* restore callee saved register x19 and x20 */
 	emit(A64_LDR64I(A64_R(19), A64_SP, regs_off), ctx);
@@ -2228,19 +2276,42 @@  static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	return ctx->idx;
 }
 
-static int btf_func_model_nregs(const struct btf_func_model *m)
+static int btf_func_model_nargs(const struct btf_func_model *m)
 {
-	int nregs = m->nr_args;
+	int nargs = m->nr_args;
 	int i;
 
-	/* extra registers needed for struct argument */
+	/* extra registers or stack slots needed for struct argument */
 	for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
 		/* The arg_size is at most 16 bytes, enforced by the verifier. */
 		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
-			nregs += (m->arg_size[i] + 7) / 8 - 1;
+			nargs += (m->arg_size[i] + 7) / 8 - 1;
 	}
 
-	return nregs;
+	return nargs;
+}
+
+/* get the count of the regs that are used to pass arguments */
+static int btf_func_model_nreg_args(const struct btf_func_model *m)
+{
+	int nargs = m->nr_args;
+	int nreg_args = 0;
+	int i;
+
+	for (i = 0; i < nargs; i++) {
+		/* The arg_size is at most 16 bytes, enforced by the verifier. */
+		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
+			/* struct members are all in the registers or all
+			 * on the stack.
+			 */
+			if (nreg_args + ((m->arg_size[i] + 7) / 8 - 1) > 7)
+				break;
+			nreg_args += (m->arg_size[i] + 7) / 8 - 1;
+		}
+		nreg_args++;
+	}
+
+	return (nreg_args > ARM64_MAX_REG_ARGS ? ARM64_MAX_REG_ARGS : nreg_args);
 }
 
 int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
@@ -2251,14 +2322,16 @@  int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 		.idx = 0,
 	};
 	struct bpf_tramp_image im;
-	int nregs, ret;
+	int nargs, nreg_args, ret;
 
-	nregs = btf_func_model_nregs(m);
-	/* the first 8 registers are used for arguments */
-	if (nregs > 8)
+	nargs = btf_func_model_nargs(m);
+	if (nargs > MAX_BPF_FUNC_ARGS)
 		return -ENOTSUPP;
 
-	ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, nregs, flags);
+	nreg_args = btf_func_model_nreg_args(m);
+
+	ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, nargs, nreg_args,
+				 flags);
 	if (ret < 0)
 		return ret;
 
@@ -2285,7 +2358,7 @@  int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
 				u32 flags, struct bpf_tramp_links *tlinks,
 				void *func_addr)
 {
-	int ret, nregs;
+	int ret, nargs, nreg_args;
 	void *image, *tmp;
 	u32 size = ro_image_end - ro_image;
 
@@ -2302,13 +2375,15 @@  int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
 		.idx = 0,
 	};
 
-	nregs = btf_func_model_nregs(m);
-	/* the first 8 registers are used for arguments */
-	if (nregs > 8)
+	nargs = btf_func_model_nargs(m);
+	if (nargs > MAX_BPF_FUNC_ARGS)
 		return -ENOTSUPP;
 
+	nreg_args = btf_func_model_nreg_args(m);
+
 	jit_fill_hole(image, (unsigned int)(ro_image_end - ro_image));
-	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
+	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nargs, nreg_args,
+				 flags);
 
 	if (ret > 0 && validate_code(&ctx) < 0) {
 		ret = -EINVAL;
diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64
index 3c7c3e79aa93..e865451e90d2 100644
--- a/tools/testing/selftests/bpf/DENYLIST.aarch64
+++ b/tools/testing/selftests/bpf/DENYLIST.aarch64
@@ -4,9 +4,6 @@  fexit_sleep                                      # The test never returns. The r
 kprobe_multi_bench_attach                        # needs CONFIG_FPROBE
 kprobe_multi_test                                # needs CONFIG_FPROBE
 module_attach                                    # prog 'kprobe_multi': failed to auto-attach: -95
-fentry_test/fentry_many_args                     # fentry_many_args:FAIL:fentry_many_args_attach unexpected error: -524
-fexit_test/fexit_many_args                       # fexit_many_args:FAIL:fexit_many_args_attach unexpected error: -524
-tracing_struct/struct_many_args                  # struct_many_args:FAIL:tracing_struct_many_args__attach unexpected error: -524
 fill_link_info/kprobe_multi_link_info            # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 fill_link_info/kretprobe_multi_link_info         # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 fill_link_info/kprobe_multi_invalid_ubuff        # bpf_program__attach_kprobe_multi_opts unexpected error: -95