diff mbox

ARM: decompressor: ensure I-side picks up relocated code

Message ID 1414688066-11043-1-git-send-email-will.deacon@arm.com
State Accepted
Commit 238962ac71910d6c20162ea5230685fead1836a4
Headers show

Commit Message

Will Deacon Oct. 30, 2014, 4:54 p.m. UTC
To speed up decompression, the decompressor sets up a flat, cacheable
mapping of memory. However, when there is insufficient space to hold
the page tables for this mapping, we don't bother to enable the caches
and subsequently skip all the cache maintenance hooks.

Skipping the cache maintenance before jumping to the relocated code
allows the processor to predict the branch and populate the I-cache
with stale data before the relocation loop has completed (since a
bootloader may have SCTLR.I set, which permits normal, cacheable
instruction fetches regardless of SCTLR.M).

This patch moves the cache maintenance check into the maintenance
routines themselves, allowing the v6/v7 versions to invalidate the
I-cache regardless of the MMU state.

Cc: Julien Grall <julien.grall@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---

This fixes boot on my TC2 w/ multi_v7_defconfig. Julien was also
reporting decompressor failures with a Xen payload, so hopefully this
helps him too. If so, this is probably a candidate for stable.

 arch/arm/boot/compressed/head.S | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

Comments

Florian Fainelli Oct. 30, 2014, 7:28 p.m. UTC | #1
On 10/30/2014 09:54 AM, Will Deacon wrote:
> To speed up decompression, the decompressor sets up a flat, cacheable
> mapping of memory. However, when there is insufficient space to hold
> the page tables for this mapping, we don't bother to enable the caches
> and subsequently skip all the cache maintenance hooks.
> 
> Skipping the cache maintenance before jumping to the relocated code
> allows the processor to predict the branch and populate the I-cache
> with stale data before the relocation loop has completed (since a
> bootloader may have SCTLR.I set, which permits normal, cacheable
> instruction fetches regardless of SCTLR.M).
> 
> This patch moves the cache maintenance check into the maintenance
> routines themselves, allowing the v6/v7 versions to invalidate the
> I-cache regardless of the MMU state.
> 
> Cc: Julien Grall <julien.grall@linaro.org>
> Signed-off-by: Will Deacon <will.deacon@arm.com>

I guess we could credit Marc as well for reporting and providing early
patches addressing this?

http://comments.gmane.org/gmane.linux.ports.arm.kernel/347950

> ---
> 
> This fixes boot on my TC2 w/ multi_v7_defconfig. Julien was also
> reporting decompressor failures with a Xen payload, so hopefully this
> helps him too. If so, this is probably a candidate for stable.
> 
>  arch/arm/boot/compressed/head.S | 20 ++++++++++++++++----
>  1 file changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
> index 413fd94b5301..68be9017593d 100644
> --- a/arch/arm/boot/compressed/head.S
> +++ b/arch/arm/boot/compressed/head.S
> @@ -397,8 +397,7 @@ dtb_check_done:
>  		add	sp, sp, r6
>  #endif
>  
> -		tst	r4, #1
> -		bleq	cache_clean_flush
> +		bl	cache_clean_flush
>  
>  		adr	r0, BSYM(restart)
>  		add	r0, r0, r6
> @@ -1047,6 +1046,8 @@ cache_clean_flush:
>  		b	call_cache_fn
>  
>  __armv4_mpu_cache_flush:
> +		tst	r4, #1
> +		movne	pc, lr
>  		mov	r2, #1
>  		mov	r3, #0
>  		mcr	p15, 0, ip, c7, c6, 0	@ invalidate D cache
> @@ -1064,6 +1065,8 @@ __armv4_mpu_cache_flush:
>  		mov	pc, lr
>  		
>  __fa526_cache_flush:
> +		tst	r4, #1
> +		movne	pc, lr
>  		mov	r1, #0
>  		mcr	p15, 0, r1, c7, c14, 0	@ clean and invalidate D cache
>  		mcr	p15, 0, r1, c7, c5, 0	@ flush I cache
> @@ -1072,13 +1075,16 @@ __fa526_cache_flush:
>  
>  __armv6_mmu_cache_flush:
>  		mov	r1, #0
> -		mcr	p15, 0, r1, c7, c14, 0	@ clean+invalidate D
> +		tst	r4, #1
> +		mcreq	p15, 0, r1, c7, c14, 0	@ clean+invalidate D
>  		mcr	p15, 0, r1, c7, c5, 0	@ invalidate I+BTB
> -		mcr	p15, 0, r1, c7, c15, 0	@ clean+invalidate unified
> +		mcreq	p15, 0, r1, c7, c15, 0	@ clean+invalidate unified
>  		mcr	p15, 0, r1, c7, c10, 4	@ drain WB
>  		mov	pc, lr
>  
>  __armv7_mmu_cache_flush:
> +		tst	r4, #1
> +		bne	iflush
>  		mrc	p15, 0, r10, c0, c1, 5	@ read ID_MMFR1
>  		tst	r10, #0xf << 16		@ hierarchical cache (ARMv7)
>  		mov	r10, #0
> @@ -1139,6 +1145,8 @@ iflush:
>  		mov	pc, lr
>  
>  __armv5tej_mmu_cache_flush:
> +		tst	r4, #1
> +		movne	pc, lr
>  1:		mrc	p15, 0, r15, c7, c14, 3	@ test,clean,invalidate D cache
>  		bne	1b
>  		mcr	p15, 0, r0, c7, c5, 0	@ flush I cache
> @@ -1146,6 +1154,8 @@ __armv5tej_mmu_cache_flush:
>  		mov	pc, lr
>  
>  __armv4_mmu_cache_flush:
> +		tst	r4, #1
> +		movne	pc, lr
>  		mov	r2, #64*1024		@ default: 32K dcache size (*2)
>  		mov	r11, #32		@ default: 32 byte line size
>  		mrc	p15, 0, r3, c0, c0, 1	@ read cache type
> @@ -1179,6 +1189,8 @@ no_cache_id:
>  
>  __armv3_mmu_cache_flush:
>  __armv3_mpu_cache_flush:
> +		tst	r4, #1
> +		movne	pc, lr
>  		mov	r1, #0
>  		mcr	p15, 0, r1, c7, c0, 0	@ invalidate whole cache v3
>  		mov	pc, lr
>
Julien Grall Oct. 30, 2014, 11:09 p.m. UTC | #2
Hi Will,

On 30/10/2014 16:54, Will Deacon wrote:
> To speed up decompression, the decompressor sets up a flat, cacheable
> mapping of memory. However, when there is insufficient space to hold
> the page tables for this mapping, we don't bother to enable the caches
> and subsequently skip all the cache maintenance hooks.
>
> Skipping the cache maintenance before jumping to the relocated code
> allows the processor to predict the branch and populate the I-cache
> with stale data before the relocation loop has completed (since a
> bootloader may have SCTLR.I set, which permits normal, cacheable
> instruction fetches regardless of SCTLR.M).
>
> This patch moves the cache maintenance check into the maintenance
> routines themselves, allowing the v6/v7 versions to invalidate the
> I-cache regardless of the MMU state.
>
> Cc: Julien Grall <julien.grall@linaro.org>
> Signed-off-by: Will Deacon <will.deacon@arm.com>
> ---
>
> This fixes boot on my TC2 w/ multi_v7_defconfig. Julien was also
> reporting decompressor failures with a Xen payload, so hopefully this
> helps him too. If so, this is probably a candidate for stable.

With this patch, I'm able to boot a multi_v7_defconfig guest on Xen.
It might also fix some issue that Christoffer Dall saw on KVM.

Tested-by: Julien Grall <julien.grall@linaro.org>

Regards,
Russell King - ARM Linux Oct. 30, 2014, 11:37 p.m. UTC | #3
On Thu, Oct 30, 2014 at 12:28:14PM -0700, Florian Fainelli wrote:
> On 10/30/2014 09:54 AM, Will Deacon wrote:
> > To speed up decompression, the decompressor sets up a flat, cacheable
> > mapping of memory. However, when there is insufficient space to hold
> > the page tables for this mapping, we don't bother to enable the caches
> > and subsequently skip all the cache maintenance hooks.
> > 
> > Skipping the cache maintenance before jumping to the relocated code
> > allows the processor to predict the branch and populate the I-cache
> > with stale data before the relocation loop has completed (since a
> > bootloader may have SCTLR.I set, which permits normal, cacheable
> > instruction fetches regardless of SCTLR.M).
> > 
> > This patch moves the cache maintenance check into the maintenance
> > routines themselves, allowing the v6/v7 versions to invalidate the
> > I-cache regardless of the MMU state.
> > 
> > Cc: Julien Grall <julien.grall@linaro.org>
> > Signed-off-by: Will Deacon <will.deacon@arm.com>
> 
> I guess we could credit Marc as well for reporting and providing early
> patches addressing this?

If we're going to start doing that, then it should also have:

Suggested-by: Russell King <rmk+kernel@arm.linux.org.uk>

since Will and I discussed it earlier today and I suggested moving
the tst check into the cache_clean_flush methods as a way to solve this
issue.
Will Deacon Oct. 31, 2014, 10:04 a.m. UTC | #4
On Thu, Oct 30, 2014 at 07:28:14PM +0000, Florian Fainelli wrote:
> On 10/30/2014 09:54 AM, Will Deacon wrote:
> > To speed up decompression, the decompressor sets up a flat, cacheable
> > mapping of memory. However, when there is insufficient space to hold
> > the page tables for this mapping, we don't bother to enable the caches
> > and subsequently skip all the cache maintenance hooks.
> > 
> > Skipping the cache maintenance before jumping to the relocated code
> > allows the processor to predict the branch and populate the I-cache
> > with stale data before the relocation loop has completed (since a
> > bootloader may have SCTLR.I set, which permits normal, cacheable
> > instruction fetches regardless of SCTLR.M).
> > 
> > This patch moves the cache maintenance check into the maintenance
> > routines themselves, allowing the v6/v7 versions to invalidate the
> > I-cache regardless of the MMU state.
> > 
> > Cc: Julien Grall <julien.grall@linaro.org>
> > Signed-off-by: Will Deacon <will.deacon@arm.com>
> 
> I guess we could credit Marc as well for reporting and providing early
> patches addressing this?
> 
> http://comments.gmane.org/gmane.linux.ports.arm.kernel/347950

Thanks, I hadn't noticed that thread. Not sure what tag I'd add though --
Marc could ack it I suppose?

Maybe I can collect the maximal set of tags.

Will
diff mbox

Patch

diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 413fd94b5301..68be9017593d 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -397,8 +397,7 @@  dtb_check_done:
 		add	sp, sp, r6
 #endif
 
-		tst	r4, #1
-		bleq	cache_clean_flush
+		bl	cache_clean_flush
 
 		adr	r0, BSYM(restart)
 		add	r0, r0, r6
@@ -1047,6 +1046,8 @@  cache_clean_flush:
 		b	call_cache_fn
 
 __armv4_mpu_cache_flush:
+		tst	r4, #1
+		movne	pc, lr
 		mov	r2, #1
 		mov	r3, #0
 		mcr	p15, 0, ip, c7, c6, 0	@ invalidate D cache
@@ -1064,6 +1065,8 @@  __armv4_mpu_cache_flush:
 		mov	pc, lr
 		
 __fa526_cache_flush:
+		tst	r4, #1
+		movne	pc, lr
 		mov	r1, #0
 		mcr	p15, 0, r1, c7, c14, 0	@ clean and invalidate D cache
 		mcr	p15, 0, r1, c7, c5, 0	@ flush I cache
@@ -1072,13 +1075,16 @@  __fa526_cache_flush:
 
 __armv6_mmu_cache_flush:
 		mov	r1, #0
-		mcr	p15, 0, r1, c7, c14, 0	@ clean+invalidate D
+		tst	r4, #1
+		mcreq	p15, 0, r1, c7, c14, 0	@ clean+invalidate D
 		mcr	p15, 0, r1, c7, c5, 0	@ invalidate I+BTB
-		mcr	p15, 0, r1, c7, c15, 0	@ clean+invalidate unified
+		mcreq	p15, 0, r1, c7, c15, 0	@ clean+invalidate unified
 		mcr	p15, 0, r1, c7, c10, 4	@ drain WB
 		mov	pc, lr
 
 __armv7_mmu_cache_flush:
+		tst	r4, #1
+		bne	iflush
 		mrc	p15, 0, r10, c0, c1, 5	@ read ID_MMFR1
 		tst	r10, #0xf << 16		@ hierarchical cache (ARMv7)
 		mov	r10, #0
@@ -1139,6 +1145,8 @@  iflush:
 		mov	pc, lr
 
 __armv5tej_mmu_cache_flush:
+		tst	r4, #1
+		movne	pc, lr
 1:		mrc	p15, 0, r15, c7, c14, 3	@ test,clean,invalidate D cache
 		bne	1b
 		mcr	p15, 0, r0, c7, c5, 0	@ flush I cache
@@ -1146,6 +1154,8 @@  __armv5tej_mmu_cache_flush:
 		mov	pc, lr
 
 __armv4_mmu_cache_flush:
+		tst	r4, #1
+		movne	pc, lr
 		mov	r2, #64*1024		@ default: 32K dcache size (*2)
 		mov	r11, #32		@ default: 32 byte line size
 		mrc	p15, 0, r3, c0, c0, 1	@ read cache type
@@ -1179,6 +1189,8 @@  no_cache_id:
 
 __armv3_mmu_cache_flush:
 __armv3_mpu_cache_flush:
+		tst	r4, #1
+		movne	pc, lr
 		mov	r1, #0
 		mcr	p15, 0, r1, c7, c0, 0	@ invalidate whole cache v3
 		mov	pc, lr