
[v2,3/8] ARM: add macro to perform far branches (b/bl)

Message ID 1426248452-4773-4-git-send-email-ard.biesheuvel@linaro.org
State New

Commit Message

Ard Biesheuvel March 13, 2015, 12:07 p.m. UTC
These macros execute PC-relative branches, but with a larger
reach than the 24 bits that are available in the b and bl opcodes.

Acked-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm/include/asm/assembler.h | 83 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
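
As an illustration of the intended use of the macros added below (the
symbol names here are made up for the example, not part of this patch):

     b_far   __example_entry, r4    @ PC-relative jump, r4 as scratch
     bl_far  __example_func         @ PC-relative call, clobbers ip and lr
     b_abs   __example_exit, r4     @ absolute jump via r4
     bl_abs  __example_func, ne     @ conditional absolute call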

Comments

Ard Biesheuvel March 17, 2015, 8:35 p.m. UTC | #1
On 13 March 2015 at 17:40, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> On Fri, Mar 13, 2015 at 01:07:27PM +0100, Ard Biesheuvel wrote:
>> +     .macro  bl_abs, target, c=
>> +#if defined(CONFIG_CPU_32v7) || defined(CONFIG_CPU_32v7M)
>> +     movw\c  lr, #:lower16:\target
>> +     movt\c  lr, #:upper16:\target
>> +     blx\c   lr
>
> So I've looked this up, and it's valid, which is surprising because BLX
> itself writes to LR - the read from LR must happen before BLX itself
> writes to LR.  Thankfully, because of the pipelining, this is probably
> guaranteed.
>

I hadn't given it another thought, to be honest, as arithmetic
instructions can also use the same register as input and output.
But I suppose branch instructions don't go through all the ordinary
pipeline stages.

> I wonder whether there will be any errata on this... maybe on non-ARM
> CPUs?  It'll be interesting to find out what happens once we merge
> this... :)
>
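
For reference, the v7 expansion under discussion is only three
instructions; the comments below spell out the ordering that makes
'blx lr' valid, assuming the architected behaviour where the branch
target is read from lr before blx writes the return address to it
(the macro's \target is written as 'target' here):

     movw    lr, #:lower16:target    @ lr = low 16 bits of target
     movt    lr, #:upper16:target    @ lr = full 32-bit address of target
     blx     lr                      @ reads lr (branch target) first,
                                     @ then writes lr = return address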
Ard Biesheuvel March 18, 2015, 10:07 a.m. UTC | #2
On 13 March 2015 at 13:07, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> These macros execute PC-relative branches, but with a larger
> reach than the 24 bits that are available in the b and bl opcodes.
>
> Acked-by: Nicolas Pitre <nico@linaro.org>
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  arch/arm/include/asm/assembler.h | 83 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 83 insertions(+)
>
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> index f67fd3afebdf..2e7f55194782 100644
> --- a/arch/arm/include/asm/assembler.h
> +++ b/arch/arm/include/asm/assembler.h
> @@ -88,6 +88,17 @@
>  #endif
>
>  /*
> + * The program counter is always ahead of the address of the currently
> + * executing instruction by PC_BIAS bytes, whose value differs depending
> + * on the execution mode.
> + */
> +#ifdef CONFIG_THUMB2_KERNEL
> +#define PC_BIAS                4
> +#else
> +#define PC_BIAS                8
> +#endif
> +
> +/*
>   * Enable and disable interrupts
>   */
>  #if __LINUX_ARM_ARCH__ >= 6
> @@ -108,6 +119,78 @@
>         .endm
>  #endif
>
> +       /*
> +        * Macros to emit relative conditional branches that may exceed the
> +        * range of the 24-bit immediate of the ordinary b/bl instructions.
> +        * NOTE: this doesn't work with locally defined symbols, as they
> +        * lack the ARM/Thumb annotation (even if they are annotated as
> +        * functions)
> +        */
> +       .macro  b_far, target, tmpreg, c=
> +#if defined(CONFIG_CPU_32v7) || defined(CONFIG_CPU_32v7M)
> +       movw\c  \tmpreg, #:lower16:(\target - (8888f + PC_BIAS))
> +       movt\c  \tmpreg, #:upper16:(\target - (8888f + PC_BIAS))
> +8888:  add\c   pc, pc, \tmpreg
> +#else
> +       ldr\c   \tmpreg, 8889f
> +8888:  add\c   pc, pc, \tmpreg
> +       .ifnb   \c
> +       b       8890f
> +       .endif
> +8889:  .long   \target - (8888b + PC_BIAS)
> +8890:
> +#endif
> +       .endm

Actually, I have found something better:

add\c \tmpreg, pc, #:pc_g0_nc:\target - PC_BIAS
add\c \tmpreg, \tmpreg, #:pc_g1_nc:\target - PC_BIAS + 4
add\c pc, \tmpreg, #:pc_g2:\target - PC_BIAS + 8

This uses PC-relative group relocations to split the offset into
12-bit chunks and poke them into the add instructions.
This way, we don't need the literal at all.

Note that add with pc as destination is ARM-only, so we should
probably retain the v7 movw/movt regardless.
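
Putting the two together, b_far could then become something like this
(an untested sketch based on the snippet above, not the posted patch):

     .macro  b_far, target, tmpreg, c=
#if defined(CONFIG_CPU_32v7) || defined(CONFIG_CPU_32v7M)
     movw\c  \tmpreg, #:lower16:(\target - (8888f + PC_BIAS))
     movt\c  \tmpreg, #:upper16:(\target - (8888f + PC_BIAS))
8888:        add\c   pc, pc, \tmpreg
#else
     @ ARM-only encoding: add with pc as the destination
     add\c   \tmpreg, pc, #:pc_g0_nc:\target - PC_BIAS
     add\c   \tmpreg, \tmpreg, #:pc_g1_nc:\target - PC_BIAS + 4
     add\c   pc, \tmpreg, #:pc_g2:\target - PC_BIAS + 8
#endif
     .endm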


> +
> +       .macro  bl_far, target, c=
> +#if defined(CONFIG_CPU_32v7) || defined(CONFIG_CPU_32v7M)
> +       movw\c  ip, #:lower16:(\target - (8887f + PC_BIAS))
> +       movt\c  ip, #:upper16:(\target - (8887f + PC_BIAS))
> +8887:  add\c   ip, ip, pc
> +       blx\c   ip
> +#else
> +       adr\c   lr, 8887f
> +       b_far   \target, ip, \c
> +8887:
> +#endif
> +       .endm
> +
> +       /*
> +        * Macros to emit absolute conditional branches: these are preferred
> +        * over the far variants above because they use fewer instructions
> +        * and/or use implicit literals that the assembler can group together
> +        * to optimize cache utilization. However, they can only be used to
> +        * call functions at their link time address, which rules out early boot
> +        * code that executes with the MMU off.
> +        * The v7 variant uses a movw/movt pair to prevent potential D-cache
> +        * stalls on the literal, so using these macros is preferred over using
> +        * 'ldr pc, =XXX' directly (unless no scratch register is available)
> +        * NOTE: this doesn't work with locally defined symbols, as they
> +        * lack the ARM/Thumb annotation (even if they are annotated as
> +        * functions)
> +        */
> +       .macro  b_abs, target, tmpreg, c=
> +#if defined(CONFIG_CPU_32v7) || defined(CONFIG_CPU_32v7M)
> +       movw\c  \tmpreg, #:lower16:\target
> +       movt\c  \tmpreg, #:upper16:\target
> +       bx\c    \tmpreg
> +#else
> +       ldr\c   pc, =\target
> +#endif
> +       .endm
> +
> +       .macro  bl_abs, target, c=
> +#if defined(CONFIG_CPU_32v7) || defined(CONFIG_CPU_32v7M)
> +       movw\c  lr, #:lower16:\target
> +       movt\c  lr, #:upper16:\target
> +       blx\c   lr
> +#else
> +       adr\c   lr, BSYM(8886f)
> +       ldr\c   pc, =\target
> +8886:
> +#endif
> +       .endm
> +
>         .macro asm_trace_hardirqs_off
>  #if defined(CONFIG_TRACE_IRQFLAGS)
>         stmdb   sp!, {r0-r3, ip, lr}
> --
> 1.8.3.2
>
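
To make the PC_BIAS arithmetic concrete, here is the offset
calculation behind the non-v7 b_far path (a worked example, assuming
ARM mode, where PC_BIAS is 8):

     @ Let A be the address of the 'add pc, pc, \tmpreg' at label 8888.
     @ Reading pc there yields A + 8 (ARM mode reads two instructions
     @ ahead), so the literal emitted by '.long \target - (8888b + PC_BIAS)'
     @ holds \target - (A + 8), and the add computes
     @     (A + 8) + (\target - (A + 8)) = \target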

Patch

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index f67fd3afebdf..2e7f55194782 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -88,6 +88,17 @@ 
 #endif
 
 /*
+ * The program counter is always ahead of the address of the currently
+ * executing instruction by PC_BIAS bytes, whose value differs depending
+ * on the execution mode.
+ */
+#ifdef CONFIG_THUMB2_KERNEL
+#define PC_BIAS		4
+#else
+#define PC_BIAS		8
+#endif
+
+/*
  * Enable and disable interrupts
  */
 #if __LINUX_ARM_ARCH__ >= 6
@@ -108,6 +119,78 @@ 
 	.endm
 #endif
 
+	/*
+	 * Macros to emit relative conditional branches that may exceed the
+	 * range of the 24-bit immediate of the ordinary b/bl instructions.
+	 * NOTE: this doesn't work with locally defined symbols, as they
+	 * lack the ARM/Thumb annotation (even if they are annotated as
+	 * functions)
+	 */
+	.macro  b_far, target, tmpreg, c=
+#if defined(CONFIG_CPU_32v7) || defined(CONFIG_CPU_32v7M)
+	movw\c	\tmpreg, #:lower16:(\target - (8888f + PC_BIAS))
+	movt\c	\tmpreg, #:upper16:(\target - (8888f + PC_BIAS))
+8888:	add\c	pc, pc, \tmpreg
+#else
+	ldr\c	\tmpreg, 8889f
+8888:	add\c	pc, pc, \tmpreg
+	.ifnb	\c
+	b	8890f
+	.endif
+8889:	.long	\target - (8888b + PC_BIAS)
+8890:
+#endif
+	.endm
+
+	.macro	bl_far, target, c=
+#if defined(CONFIG_CPU_32v7) || defined(CONFIG_CPU_32v7M)
+	movw\c	ip, #:lower16:(\target - (8887f + PC_BIAS))
+	movt\c	ip, #:upper16:(\target - (8887f + PC_BIAS))
+8887:	add\c	ip, ip, pc
+	blx\c	ip
+#else
+	adr\c	lr, 8887f
+	b_far	\target, ip, \c
+8887:
+#endif
+	.endm
+
+	/*
+	 * Macros to emit absolute conditional branches: these are preferred
+	 * over the far variants above because they use fewer instructions
+	 * and/or use implicit literals that the assembler can group together
+	 * to optimize cache utilization. However, they can only be used to
+	 * call functions at their link time address, which rules out early boot
+	 * code that executes with the MMU off.
+	 * The v7 variant uses a movw/movt pair to prevent potential D-cache
+	 * stalls on the literal, so using these macros is preferred over using
+	 * 'ldr pc, =XXX' directly (unless no scratch register is available)
+	 * NOTE: this doesn't work with locally defined symbols, as they
+	 * lack the ARM/Thumb annotation (even if they are annotated as
+	 * functions)
+	 */
+	.macro	b_abs, target, tmpreg, c=
+#if defined(CONFIG_CPU_32v7) || defined(CONFIG_CPU_32v7M)
+	movw\c	\tmpreg, #:lower16:\target
+	movt\c	\tmpreg, #:upper16:\target
+	bx\c	\tmpreg
+#else
+	ldr\c	pc, =\target
+#endif
+	.endm
+
+	.macro	bl_abs, target, c=
+#if defined(CONFIG_CPU_32v7) || defined(CONFIG_CPU_32v7M)
+	movw\c	lr, #:lower16:\target
+	movt\c	lr, #:upper16:\target
+	blx\c	lr
+#else
+	adr\c	lr, BSYM(8886f)
+	ldr\c	pc, =\target
+8886:
+#endif
+	.endm
+
 	.macro asm_trace_hardirqs_off
 #if defined(CONFIG_TRACE_IRQFLAGS)
 	stmdb   sp!, {r0-r3, ip, lr}