diff mbox

[2/2] arm64: percpu: rewrite ll/sc loops in assembly

Message ID 1476874784-16214-2-git-send-email-will.deacon@arm.com
State Accepted
Commit 1e6e57d9b34a9075d5f9e2048ea7b09756590d11
Headers show

Commit Message

Will Deacon Oct. 19, 2016, 10:59 a.m. UTC
Writing the outer loop of an LL/SC sequence using do {...} while
constructs potentially allows the compiler to hoist memory accesses
between the STXR and the branch back to the LDXR. On CPUs that do not
guarantee forward progress of LL/SC loops when faced with memory
accesses to the same ERG (Exclusives Reservation Granule, up to 2k)
between the failed STXR and the branch back, we may end up livelocking.

This patch avoids this issue in our percpu atomics by rewriting the
outer loop as part of the LL/SC inline assembly block.

Signed-off-by: Will Deacon <will.deacon@arm.com>

---
 arch/arm64/include/asm/percpu.h | 120 +++++++++++++++++++---------------------
 1 file changed, 56 insertions(+), 64 deletions(-)

-- 
2.1.4


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

Comments

Mark Rutland Oct. 19, 2016, 2:20 p.m. UTC | #1
On Wed, Oct 19, 2016 at 11:59:44AM +0100, Will Deacon wrote:
> Writing the outer loop of an LL/SC sequence using do {...} while

> constructs potentially allows the compiler to hoist memory accesses

> between the STXR and the branch back to the LDXR. On CPUs that do not

> guarantee forward progress of LL/SC loops when faced with memory

> accesses to the same ERG (up to 2k) between the failed STXR and the

> branch back, we may end up livelocking.

> 

> This patch avoids this issue in our percpu atomics by rewriting the

> outer loop as part of the LL/SC inline assembly block.

> 

> Signed-off-by: Will Deacon <will.deacon@arm.com>


The new templates look correct to me, and appear to have been duplicated
correctly for each different size of access. My machines boot happily
with this applied, so FWIW:

Reviewed-by: Mark Rutland <mark.rutland@arm.com>

Tested-by: Mark Rutland <mark.rutland@arm.com>


I take it this (and the previous patch) will be Cc'd to stable?

As an aside, I have a local patch which gets rid of the duplication
here; I'll rebase that and send it out once this is in.

Thanks,
Mark.

> ---

>  arch/arm64/include/asm/percpu.h | 120 +++++++++++++++++++---------------------

>  1 file changed, 56 insertions(+), 64 deletions(-)

> 

> diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h

> index 2fee2f59288c..5394c8405e66 100644

> --- a/arch/arm64/include/asm/percpu.h

> +++ b/arch/arm64/include/asm/percpu.h

> @@ -44,48 +44,44 @@ static inline unsigned long __percpu_##op(void *ptr,			\

>  									\

>  	switch (size) {							\

>  	case 1:								\

> -		do {							\

> -			asm ("//__per_cpu_" #op "_1\n"			\

> -			"ldxrb	  %w[ret], %[ptr]\n"			\

> +		asm ("//__per_cpu_" #op "_1\n"				\

> +		"1:	ldxrb	  %w[ret], %[ptr]\n"			\

>  			#asm_op " %w[ret], %w[ret], %w[val]\n"		\

> -			"stxrb	  %w[loop], %w[ret], %[ptr]\n"		\

> -			: [loop] "=&r" (loop), [ret] "=&r" (ret),	\

> -			  [ptr] "+Q"(*(u8 *)ptr)			\

> -			: [val] "Ir" (val));				\

> -		} while (loop);						\

> +		"	stxrb	  %w[loop], %w[ret], %[ptr]\n"		\

> +		"	cbnz	  %w[loop], 1b"				\

> +		: [loop] "=&r" (loop), [ret] "=&r" (ret),		\

> +		  [ptr] "+Q"(*(u8 *)ptr)				\

> +		: [val] "Ir" (val));					\

>  		break;							\

>  	case 2:								\

> -		do {							\

> -			asm ("//__per_cpu_" #op "_2\n"			\

> -			"ldxrh	  %w[ret], %[ptr]\n"			\

> +		asm ("//__per_cpu_" #op "_2\n"				\

> +		"1:	ldxrh	  %w[ret], %[ptr]\n"			\

>  			#asm_op " %w[ret], %w[ret], %w[val]\n"		\

> -			"stxrh	  %w[loop], %w[ret], %[ptr]\n"		\

> -			: [loop] "=&r" (loop), [ret] "=&r" (ret),	\

> -			  [ptr]  "+Q"(*(u16 *)ptr)			\

> -			: [val] "Ir" (val));				\

> -		} while (loop);						\

> +		"	stxrh	  %w[loop], %w[ret], %[ptr]\n"		\

> +		"	cbnz	  %w[loop], 1b"				\

> +		: [loop] "=&r" (loop), [ret] "=&r" (ret),		\

> +		  [ptr]  "+Q"(*(u16 *)ptr)				\

> +		: [val] "Ir" (val));					\

>  		break;							\

>  	case 4:								\

> -		do {							\

> -			asm ("//__per_cpu_" #op "_4\n"			\

> -			"ldxr	  %w[ret], %[ptr]\n"			\

> +		asm ("//__per_cpu_" #op "_4\n"				\

> +		"1:	ldxr	  %w[ret], %[ptr]\n"			\

>  			#asm_op " %w[ret], %w[ret], %w[val]\n"		\

> -			"stxr	  %w[loop], %w[ret], %[ptr]\n"		\

> -			: [loop] "=&r" (loop), [ret] "=&r" (ret),	\

> -			  [ptr] "+Q"(*(u32 *)ptr)			\

> -			: [val] "Ir" (val));				\

> -		} while (loop);						\

> +		"	stxr	  %w[loop], %w[ret], %[ptr]\n"		\

> +		"	cbnz	  %w[loop], 1b"				\

> +		: [loop] "=&r" (loop), [ret] "=&r" (ret),		\

> +		  [ptr] "+Q"(*(u32 *)ptr)				\

> +		: [val] "Ir" (val));					\

>  		break;							\

>  	case 8:								\

> -		do {							\

> -			asm ("//__per_cpu_" #op "_8\n"			\

> -			"ldxr	  %[ret], %[ptr]\n"			\

> +		asm ("//__per_cpu_" #op "_8\n"				\

> +		"1:	ldxr	  %[ret], %[ptr]\n"			\

>  			#asm_op " %[ret], %[ret], %[val]\n"		\

> -			"stxr	  %w[loop], %[ret], %[ptr]\n"		\

> -			: [loop] "=&r" (loop), [ret] "=&r" (ret),	\

> -			  [ptr] "+Q"(*(u64 *)ptr)			\

> -			: [val] "Ir" (val));				\

> -		} while (loop);						\

> +		"	stxr	  %w[loop], %[ret], %[ptr]\n"		\

> +		"	cbnz	  %w[loop], 1b"				\

> +		: [loop] "=&r" (loop), [ret] "=&r" (ret),		\

> +		  [ptr] "+Q"(*(u64 *)ptr)				\

> +		: [val] "Ir" (val));					\

>  		break;							\

>  	default:							\

>  		BUILD_BUG();						\

> @@ -150,44 +146,40 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,

>  

>  	switch (size) {

>  	case 1:

> -		do {

> -			asm ("//__percpu_xchg_1\n"

> -			"ldxrb %w[ret], %[ptr]\n"

> -			"stxrb %w[loop], %w[val], %[ptr]\n"

> -			: [loop] "=&r"(loop), [ret] "=&r"(ret),

> -			  [ptr] "+Q"(*(u8 *)ptr)

> -			: [val] "r" (val));

> -		} while (loop);

> +		asm ("//__percpu_xchg_1\n"

> +		"1:	ldxrb	%w[ret], %[ptr]\n"

> +		"	stxrb	%w[loop], %w[val], %[ptr]\n"

> +		"	cbnz	%w[loop], 1b"

> +		: [loop] "=&r"(loop), [ret] "=&r"(ret),

> +		  [ptr] "+Q"(*(u8 *)ptr)

> +		: [val] "r" (val));

>  		break;

>  	case 2:

> -		do {

> -			asm ("//__percpu_xchg_2\n"

> -			"ldxrh %w[ret], %[ptr]\n"

> -			"stxrh %w[loop], %w[val], %[ptr]\n"

> -			: [loop] "=&r"(loop), [ret] "=&r"(ret),

> -			  [ptr] "+Q"(*(u16 *)ptr)

> -			: [val] "r" (val));

> -		} while (loop);

> +		asm ("//__percpu_xchg_2\n"

> +		"1:	ldxrh	%w[ret], %[ptr]\n"

> +		"	stxrh	%w[loop], %w[val], %[ptr]\n"

> +		"	cbnz	%w[loop], 1b"

> +		: [loop] "=&r"(loop), [ret] "=&r"(ret),

> +		  [ptr] "+Q"(*(u16 *)ptr)

> +		: [val] "r" (val));

>  		break;

>  	case 4:

> -		do {

> -			asm ("//__percpu_xchg_4\n"

> -			"ldxr %w[ret], %[ptr]\n"

> -			"stxr %w[loop], %w[val], %[ptr]\n"

> -			: [loop] "=&r"(loop), [ret] "=&r"(ret),

> -			  [ptr] "+Q"(*(u32 *)ptr)

> -			: [val] "r" (val));

> -		} while (loop);

> +		asm ("//__percpu_xchg_4\n"

> +		"1:	ldxr	%w[ret], %[ptr]\n"

> +		"	stxr	%w[loop], %w[val], %[ptr]\n"

> +		"	cbnz	%w[loop], 1b"

> +		: [loop] "=&r"(loop), [ret] "=&r"(ret),

> +		  [ptr] "+Q"(*(u32 *)ptr)

> +		: [val] "r" (val));

>  		break;

>  	case 8:

> -		do {

> -			asm ("//__percpu_xchg_8\n"

> -			"ldxr %[ret], %[ptr]\n"

> -			"stxr %w[loop], %[val], %[ptr]\n"

> -			: [loop] "=&r"(loop), [ret] "=&r"(ret),

> -			  [ptr] "+Q"(*(u64 *)ptr)

> -			: [val] "r" (val));

> -		} while (loop);

> +		asm ("//__percpu_xchg_8\n"

> +		"1:	ldxr	%[ret], %[ptr]\n"

> +		"	stxr	%w[loop], %[val], %[ptr]\n"

> +		"	cbnz	%w[loop], 1b"

> +		: [loop] "=&r"(loop), [ret] "=&r"(ret),

> +		  [ptr] "+Q"(*(u64 *)ptr)

> +		: [val] "r" (val));

>  		break;

>  	default:

>  		BUILD_BUG();

> -- 

> 2.1.4

> 

> 

> _______________________________________________

> linux-arm-kernel mailing list

> linux-arm-kernel@lists.infradead.org

> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

> 


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
diff mbox

Patch

diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 2fee2f59288c..5394c8405e66 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -44,48 +44,44 @@  static inline unsigned long __percpu_##op(void *ptr,			\
 									\
 	switch (size) {							\
 	case 1:								\
-		do {							\
-			asm ("//__per_cpu_" #op "_1\n"			\
-			"ldxrb	  %w[ret], %[ptr]\n"			\
+		asm ("//__per_cpu_" #op "_1\n"				\
+		"1:	ldxrb	  %w[ret], %[ptr]\n"			\
 			#asm_op " %w[ret], %w[ret], %w[val]\n"		\
-			"stxrb	  %w[loop], %w[ret], %[ptr]\n"		\
-			: [loop] "=&r" (loop), [ret] "=&r" (ret),	\
-			  [ptr] "+Q"(*(u8 *)ptr)			\
-			: [val] "Ir" (val));				\
-		} while (loop);						\
+		"	stxrb	  %w[loop], %w[ret], %[ptr]\n"		\
+		"	cbnz	  %w[loop], 1b"				\
+		: [loop] "=&r" (loop), [ret] "=&r" (ret),		\
+		  [ptr] "+Q"(*(u8 *)ptr)				\
+		: [val] "Ir" (val));					\
 		break;							\
 	case 2:								\
-		do {							\
-			asm ("//__per_cpu_" #op "_2\n"			\
-			"ldxrh	  %w[ret], %[ptr]\n"			\
+		asm ("//__per_cpu_" #op "_2\n"				\
+		"1:	ldxrh	  %w[ret], %[ptr]\n"			\
 			#asm_op " %w[ret], %w[ret], %w[val]\n"		\
-			"stxrh	  %w[loop], %w[ret], %[ptr]\n"		\
-			: [loop] "=&r" (loop), [ret] "=&r" (ret),	\
-			  [ptr]  "+Q"(*(u16 *)ptr)			\
-			: [val] "Ir" (val));				\
-		} while (loop);						\
+		"	stxrh	  %w[loop], %w[ret], %[ptr]\n"		\
+		"	cbnz	  %w[loop], 1b"				\
+		: [loop] "=&r" (loop), [ret] "=&r" (ret),		\
+		  [ptr]  "+Q"(*(u16 *)ptr)				\
+		: [val] "Ir" (val));					\
 		break;							\
 	case 4:								\
-		do {							\
-			asm ("//__per_cpu_" #op "_4\n"			\
-			"ldxr	  %w[ret], %[ptr]\n"			\
+		asm ("//__per_cpu_" #op "_4\n"				\
+		"1:	ldxr	  %w[ret], %[ptr]\n"			\
 			#asm_op " %w[ret], %w[ret], %w[val]\n"		\
-			"stxr	  %w[loop], %w[ret], %[ptr]\n"		\
-			: [loop] "=&r" (loop), [ret] "=&r" (ret),	\
-			  [ptr] "+Q"(*(u32 *)ptr)			\
-			: [val] "Ir" (val));				\
-		} while (loop);						\
+		"	stxr	  %w[loop], %w[ret], %[ptr]\n"		\
+		"	cbnz	  %w[loop], 1b"				\
+		: [loop] "=&r" (loop), [ret] "=&r" (ret),		\
+		  [ptr] "+Q"(*(u32 *)ptr)				\
+		: [val] "Ir" (val));					\
 		break;							\
 	case 8:								\
-		do {							\
-			asm ("//__per_cpu_" #op "_8\n"			\
-			"ldxr	  %[ret], %[ptr]\n"			\
+		asm ("//__per_cpu_" #op "_8\n"				\
+		"1:	ldxr	  %[ret], %[ptr]\n"			\
 			#asm_op " %[ret], %[ret], %[val]\n"		\
-			"stxr	  %w[loop], %[ret], %[ptr]\n"		\
-			: [loop] "=&r" (loop), [ret] "=&r" (ret),	\
-			  [ptr] "+Q"(*(u64 *)ptr)			\
-			: [val] "Ir" (val));				\
-		} while (loop);						\
+		"	stxr	  %w[loop], %[ret], %[ptr]\n"		\
+		"	cbnz	  %w[loop], 1b"				\
+		: [loop] "=&r" (loop), [ret] "=&r" (ret),		\
+		  [ptr] "+Q"(*(u64 *)ptr)				\
+		: [val] "Ir" (val));					\
 		break;							\
 	default:							\
 		BUILD_BUG();						\
@@ -150,44 +146,40 @@  static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
 
 	switch (size) {
 	case 1:
-		do {
-			asm ("//__percpu_xchg_1\n"
-			"ldxrb %w[ret], %[ptr]\n"
-			"stxrb %w[loop], %w[val], %[ptr]\n"
-			: [loop] "=&r"(loop), [ret] "=&r"(ret),
-			  [ptr] "+Q"(*(u8 *)ptr)
-			: [val] "r" (val));
-		} while (loop);
+		asm ("//__percpu_xchg_1\n"
+		"1:	ldxrb	%w[ret], %[ptr]\n"
+		"	stxrb	%w[loop], %w[val], %[ptr]\n"
+		"	cbnz	%w[loop], 1b"
+		: [loop] "=&r"(loop), [ret] "=&r"(ret),
+		  [ptr] "+Q"(*(u8 *)ptr)
+		: [val] "r" (val));
 		break;
 	case 2:
-		do {
-			asm ("//__percpu_xchg_2\n"
-			"ldxrh %w[ret], %[ptr]\n"
-			"stxrh %w[loop], %w[val], %[ptr]\n"
-			: [loop] "=&r"(loop), [ret] "=&r"(ret),
-			  [ptr] "+Q"(*(u16 *)ptr)
-			: [val] "r" (val));
-		} while (loop);
+		asm ("//__percpu_xchg_2\n"
+		"1:	ldxrh	%w[ret], %[ptr]\n"
+		"	stxrh	%w[loop], %w[val], %[ptr]\n"
+		"	cbnz	%w[loop], 1b"
+		: [loop] "=&r"(loop), [ret] "=&r"(ret),
+		  [ptr] "+Q"(*(u16 *)ptr)
+		: [val] "r" (val));
 		break;
 	case 4:
-		do {
-			asm ("//__percpu_xchg_4\n"
-			"ldxr %w[ret], %[ptr]\n"
-			"stxr %w[loop], %w[val], %[ptr]\n"
-			: [loop] "=&r"(loop), [ret] "=&r"(ret),
-			  [ptr] "+Q"(*(u32 *)ptr)
-			: [val] "r" (val));
-		} while (loop);
+		asm ("//__percpu_xchg_4\n"
+		"1:	ldxr	%w[ret], %[ptr]\n"
+		"	stxr	%w[loop], %w[val], %[ptr]\n"
+		"	cbnz	%w[loop], 1b"
+		: [loop] "=&r"(loop), [ret] "=&r"(ret),
+		  [ptr] "+Q"(*(u32 *)ptr)
+		: [val] "r" (val));
 		break;
 	case 8:
-		do {
-			asm ("//__percpu_xchg_8\n"
-			"ldxr %[ret], %[ptr]\n"
-			"stxr %w[loop], %[val], %[ptr]\n"
-			: [loop] "=&r"(loop), [ret] "=&r"(ret),
-			  [ptr] "+Q"(*(u64 *)ptr)
-			: [val] "r" (val));
-		} while (loop);
+		asm ("//__percpu_xchg_8\n"
+		"1:	ldxr	%[ret], %[ptr]\n"
+		"	stxr	%w[loop], %[val], %[ptr]\n"
+		"	cbnz	%w[loop], 1b"
+		: [loop] "=&r"(loop), [ret] "=&r"(ret),
+		  [ptr] "+Q"(*(u64 *)ptr)
+		: [val] "r" (val));
 		break;
 	default:
 		BUILD_BUG();