Message ID | 20230905210621.1711859-1-puranjay12@gmail.com |
---|---|
Headers | show |
Series | arm32, bpf: add support for cpuv4 insns | expand |
On Tue, Sep 05, 2023 at 09:06:15PM +0000, Puranjay Mohan wrote: > The cpuv4 added the support of an instruction that is similar to load > but also sign-extends the result after the load. > > BPF_MEMSX | <size> | BPF_LDX means dst = *(signed size *) (src + offset) > here <size> can be one of BPF_B, BPF_H, BPF_W. > > ARM32 has instructions to load a byte or a half word with sign > extension into a 32bit register. As the JIT uses two 32 bit registers > to simulate a 64-bit BPF register, an extra instruction is emitted to > sign-extend the result up to the second register. > > Signed-off-by: Puranjay Mohan <puranjay12@gmail.com> > --- > arch/arm/net/bpf_jit_32.c | 69 ++++++++++++++++++++++++++++++++++++++- > arch/arm/net/bpf_jit_32.h | 2 ++ > 2 files changed, 70 insertions(+), 1 deletion(-) > > diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c > index b26579da770e..f7c162479cf2 100644 > --- a/arch/arm/net/bpf_jit_32.c > +++ b/arch/arm/net/bpf_jit_32.c > @@ -333,6 +333,9 @@ static u32 arm_bpf_ldst_imm8(u32 op, u8 rt, u8 rn, s16 imm8) > #define ARM_LDRD_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRD_I, rt, rn, off) > #define ARM_LDRH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRH_I, rt, rn, off) > > +#define ARM_LDRSH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRSH_I, rt, rn, off) > +#define ARM_LDRSB_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRSB_I, rt, rn, off) > + > #define ARM_STR_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_STR_I, rt, rn, off) > #define ARM_STRB_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_STRB_I, rt, rn, off) > #define ARM_STRD_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_STRD_I, rt, rn, off) > @@ -1026,6 +1029,24 @@ static bool is_ldst_imm(s16 off, const u8 size) > return -off_max <= off && off <= off_max; > } > > +static bool is_ldst_imm8(s16 off, const u8 size) > +{ > + s16 off_max = 0; > + > + switch (size) { > + case BPF_B: > + off_max = 0xff; > + break; > + case BPF_W: > + off_max = 0xfff; > + break; > + case BPF_H: > + off_max = 
0xff; > + break; > + } > + return -off_max <= off && off <= off_max; > +} > + > /* *(size *)(dst + off) = src */ > static inline void emit_str_r(const s8 dst, const s8 src[], > s16 off, struct jit_ctx *ctx, const u8 sz){ > @@ -1105,6 +1126,45 @@ static inline void emit_ldx_r(const s8 dst[], const s8 src, > arm_bpf_put_reg64(dst, rd, ctx); > } > > +/* dst = *(signed size*)(src + off) */ > +static inline void emit_ldsx_r(const s8 dst[], const s8 src, > + s16 off, struct jit_ctx *ctx, const u8 sz){ > + const s8 *tmp = bpf2a32[TMP_REG_1]; > + const s8 *rd = is_stacked(dst_lo) ? tmp : dst; > + s8 rm = src; > + > + if (!is_ldst_imm8(off, sz)) { > + emit_a32_mov_i(tmp[0], off, ctx); > + emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); Hmm. This looks inefficient when "off" is able to fit in an immediate. Please try: int add_off; if (!is_ldst_imm8(off, sz)) { add_off = imm8m(off); if (add_off > 0) { emit(ARM_ADD_I(tmp[0], src, add_off), ctx); rm = tmp[0]; } else { emit_a32_mov_i(tmp[0], off, ctx); emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); rm = tmp[0]; } off = 0; > + } else if (rd[1] == rm) { > + emit(ARM_MOV_R(tmp[0], rm), ctx); > + rm = tmp[0]; Why do you need this? rd and rm can be the same for LDRS[BH]. > + } > + switch (sz) { > + case BPF_B: > + /* Load a Byte with sign extension*/ > + emit(ARM_LDRSB_I(rd[1], rm, off), ctx); > + /* Carry the sign extension to upper 32 bits */ > + emit(ARM_ASR_I(rd[0], rd[1], 31), ctx); > + break; > + case BPF_H: > + /* Load a HalfWord with sign extension*/ > + emit(ARM_LDRSH_I(rd[1], rm, off), ctx); > + /* Carry the sign extension to upper 32 bits */ > + emit(ARM_ASR_I(rd[0], rd[1], 31), ctx); > + break; > + case BPF_W: > + /* Load a Word*/ > + emit(ARM_LDR_I(rd[1], rm, off), ctx); > + /* Carry the sign extension to upper 32 bits */ > + emit(ARM_ASR_I(rd[0], rd[1], 31), ctx); The last instruction extending to the upper 32 bits is the same in each of these cases, so is there any reason not to do it outside the switch statement?
On Tue, Sep 05, 2023 at 09:06:18PM +0000, Puranjay Mohan wrote: > The cpuv4 added a new BPF_SDIV instruction that does signed division. > The encoding is similar to BPF_DIV but BPF_SDIV sets offset=1. > > ARM32 already supports 32-bit BPF_DIV which can be easily extended to > support BPF_SDIV as ARM32 has the SDIV instruction. When the CPU is not > ARM-v7, we implement that SDIV/SMOD with the function call similar to > the implementation of DIV/MOD. > > Signed-off-by: Puranjay Mohan <puranjay12@gmail.com> > --- > arch/arm/net/bpf_jit_32.c | 26 ++++++++++++++++++++------ > arch/arm/net/bpf_jit_32.h | 2 ++ > 2 files changed, 22 insertions(+), 6 deletions(-) > > diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c > index 09496203f13e..f580ecf75710 100644 > --- a/arch/arm/net/bpf_jit_32.c > +++ b/arch/arm/net/bpf_jit_32.c > @@ -228,6 +228,16 @@ static u32 jit_mod32(u32 dividend, u32 divisor) > return dividend % divisor; > } > > +static s32 jit_sdiv32(s32 dividend, s32 divisor) > +{ > + return dividend / divisor; > +} > + > +static s32 jit_smod32(s32 dividend, s32 divisor) > +{ > + return dividend % divisor; > +} > + > static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx) > { > inst |= (cond << 28); > @@ -477,7 +487,7 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) > return to - from - 2; > } > > -static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) > +static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op, u8 sign) > { > const int exclude_mask = BIT(ARM_R0) | BIT(ARM_R1); > const s8 *tmp = bpf2a32[TMP_REG_1]; > @@ -485,9 +495,10 @@ static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) > #if __LINUX_ARM_ARCH__ == 7 > if (elf_hwcap & HWCAP_IDIVA) { > if (op == BPF_DIV) > - emit(ARM_UDIV(rd, rm, rn), ctx); > + sign ? emit(ARM_SDIV(rd, rm, rn), ctx) : emit(ARM_UDIV(rd, rm, rn), ctx); Oh no, let's not go using the ternary operator like that. 
If we want to use the ternary operator, then: emit(sign ? ARM_SDIV(rd, rm, rn) : ARM_UDIV(rd, rm, rn), ctx); would be _much_ better, since what is actually conditional is the value passed to emit(). If we want to avoid the ternary operator altogether, then obviously if() emit() else emit(), but I'd prefer my suggestion above. > /* Call appropriate function */ > - emit_mov_i(ARM_IP, op == BPF_DIV ? > - (u32)jit_udiv32 : (u32)jit_mod32, ctx); > + if (sign) > + emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_sdiv32 : (u32)jit_smod32, ctx); > + else > + emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_udiv32 : (u32)jit_mod32, ctx); u32 dst; if (sign) { if (op == BPF_DIV) dst = (u32)jit_sdiv32; else dst = (u32)jit_smod32; } else { if (op == BPF_DIV) dst = (u32)jit_udiv32; else dst = (u32)jit_mod32; } emit_mov_i(ARM_IP, dst, ctx); > emit_blx_r(ARM_IP, ctx); > > /* Restore caller-saved registers from stack */
On Tue, Sep 05 2023, Russell King (Oracle) wrote: > On Tue, Sep 05, 2023 at 09:06:19PM +0000, Puranjay Mohan wrote: >> +cont: >> + >> + /* Call appropriate function */ >> + if (sign) >> + emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_sdiv64 : (u32)jit_smod64, ctx); >> + else >> + emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_udiv64 : (u32)jit_mod64, ctx); > > Same comment as the previous patch here. Will fix both in next version. > >> + >> + emit_blx_r(ARM_IP, ctx); >> + >> + /* Save return value */ >> + if (rd[1] != ARM_R0) { >> + emit(ARM_MOV_R(rd[0], ARM_R1), ctx); >> + emit(ARM_MOV_R(rd[1], ARM_R0), ctx); >> + } >> + >> + /* Recover {R1, R0} from stack if it is not Rd */ >> + if (rd[1] != ARM_R0) >> + emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); >> + else >> + emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); >> + >> + /* Recover {R3, R2} from stack if it is not Rd */ >> + if (rd[1] != ARM_R2) >> + emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); >> + else >> + emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); > > if (rd[1] != ARM_R0) { > emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); > emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); > } else if (rd[1] != ARM_R2) { > emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); > emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); > } else { > emit(ARM_ADD_I(ARM_SP, ARM_SP, 16), ctx); > } > > Hmm? Actually, there can also be a situation where rd[1] != ARM_R0 && rd[1] != ARM_R2, so should I do it like: if (rd[1] != ARM_R0 && rd[1] != ARM_R2) { emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); } else if (rd[1] != ARM_R0) { emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); } else if (rd[1] != ARM_R2) { emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); } else { emit(ARM_ADD_I(ARM_SP, ARM_SP, 16), ctx); } Thanks, Puranjay
On Tue, Sep 05 2023, Russell King (Oracle) wrote: [...] >> +/* dst = *(signed size*)(src + off) */ >> +static inline void emit_ldsx_r(const s8 dst[], const s8 src, >> + s16 off, struct jit_ctx *ctx, const u8 sz){ >> + const s8 *tmp = bpf2a32[TMP_REG_1]; >> + const s8 *rd = is_stacked(dst_lo) ? tmp : dst; >> + s8 rm = src; >> + >> + if (!is_ldst_imm8(off, sz)) { >> + emit_a32_mov_i(tmp[0], off, ctx); >> + emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); > > Hmm. This looks inefficient when "off" is able to fit in an immediate. > Please try: > > int add_off; > > if (!is_ldst_imm8(off, sz)) { > add_off = imm8m(off); > if (add_off > 0) { > emit(ARM_ADD_I(tmp[0], src, add_off), ctx); > rm = tmp[0]; > } else { > emit_a32_mov_i(tmp[0], off, ctx); > emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); > rm = tmp[0]; > } > off = 0; >> + } else if (rd[1] == rm) { >> + emit(ARM_MOV_R(tmp[0], rm), ctx); >> + rm = tmp[0]; > > Why do you need this? rd and rm can be the same for LDRS[BH]. I agree that this is not required, will remove in the next version. Will also use the suggested optimization for immediate. >> + } >> + switch (sz) { >> + case BPF_B: >> + /* Load a Byte with sign extension*/ >> + emit(ARM_LDRSB_I(rd[1], rm, off), ctx); >> + /* Carry the sign extension to upper 32 bits */ >> + emit(ARM_ASR_I(rd[0], rd[1], 31), ctx); >> + break; >> + case BPF_H: >> + /* Load a HalfWord with sign extension*/ >> + emit(ARM_LDRSH_I(rd[1], rm, off), ctx); >> + /* Carry the sign extension to upper 32 bits */ >> + emit(ARM_ASR_I(rd[0], rd[1], 31), ctx); >> + break; >> + case BPF_W: >> + /* Load a Word*/ >> + emit(ARM_LDR_I(rd[1], rm, off), ctx); >> + /* Carry the sign extension to upper 32 bits */ >> + emit(ARM_ASR_I(rd[0], rd[1], 31), ctx); > > The last instruction extending to the upper 32 bits is the same in each > of these cases, so is there any reason not to do it outside the switch > statement? Will move it outside in the next version. Thanks, Puranjay
On Wed, Sep 06, 2023 at 09:29:19AM +0000, Puranjay Mohan wrote: > On Tue, Sep 05 2023, Russell King (Oracle) wrote: > > > On Tue, Sep 05, 2023 at 09:06:19PM +0000, Puranjay Mohan wrote: > Actually, there can also be a situation where rd[1] != ARM_R0 && rd[1] != ARM_R2, > so should I do it like: > > if (rd[1] != ARM_R0 && rd[1] != ARM_R2) { > emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); > emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); > } else if (rd[1] != ARM_R0) { > emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); > emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); > } else if (rd[1] != ARM_R2) { > emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); > emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); > } else { > emit(ARM_ADD_I(ARM_SP, ARM_SP, 16), ctx); > } Are you sure all four states are possible?
On Wed, Sep 06 2023, Russell King (Oracle) wrote: > On Wed, Sep 06, 2023 at 09:29:19AM +0000, Puranjay Mohan wrote: >> On Tue, Sep 05 2023, Russell King (Oracle) wrote: >> >> > On Tue, Sep 05, 2023 at 09:06:19PM +0000, Puranjay Mohan wrote: >> Actually, there can also be a situation where rd[1] != ARM_R0 && rd[1] != ARM_R2, >> so should I do it like: >> >> if (rd[1] != ARM_R0 && rd[1] != ARM_R2) { >> emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); >> emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); >> } else if (rd[1] != ARM_R0) { >> emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); >> emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); >> } else if (rd[1] != ARM_R2) { >> emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); >> emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); >> } else { >> emit(ARM_ADD_I(ARM_SP, ARM_SP, 16), ctx); >> } > > Are you sure all four states are possible? ohh! I just realized that the last else will never run. rd[1] can never be equal to both ARM_R0 and ARM_R2. Will fix it in V3 as I already sent out the V2. I need to learn to leave patches on the list for few days before re-spinning. Thanks, Puranjay
On Wed, Sep 06, 2023 at 07:19:50PM +0000, Puranjay Mohan wrote: > On Wed, Sep 06 2023, Russell King (Oracle) wrote: > > > On Wed, Sep 06, 2023 at 09:29:19AM +0000, Puranjay Mohan wrote: > >> On Tue, Sep 05 2023, Russell King (Oracle) wrote: > >> > >> > On Tue, Sep 05, 2023 at 09:06:19PM +0000, Puranjay Mohan wrote: > >> Actually, there can also be a situation where rd[1] != ARM_R0 && rd[1] != ARM_R2, > >> so should I do it like: > >> > >> if (rd[1] != ARM_R0 && rd[1] != ARM_R2) { > >> emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); > >> emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); > >> } else if (rd[1] != ARM_R0) { > >> emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx); > >> emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); > >> } else if (rd[1] != ARM_R2) { > >> emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx); > >> emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx); > >> } else { > >> emit(ARM_ADD_I(ARM_SP, ARM_SP, 16), ctx); > >> } > > > > Are you sure all four states are possible? > > ohh! > > I just realized that the last else will never run. > rd[1] can never be equal to both ARM_R0 and ARM_R2. > Will fix it in V3 as I already sent out the V2. > > I need to learn to leave patches on the list for few days before re-spinning. The last comment on that is you can pop r0-r3 in one go, rather than using two instructions.