diff mbox

[RFC] mips: Fix arch_spin_unlock()

Message ID 20160209114242.GE22874@arm.com
State New
Headers show

Commit Message

Will Deacon Feb. 9, 2016, 11:42 a.m. UTC
On Tue, Feb 09, 2016 at 12:23:58PM +0100, Ingo Molnar wrote:
> * Will Deacon <will.deacon@arm.com> wrote:

> > On Wed, Feb 03, 2016 at 01:32:10PM +0000, Will Deacon wrote:

> > > On Wed, Feb 03, 2016 at 09:33:39AM +0100, Ingo Molnar wrote:

> > > > In fact I'd suggest to test this via a quick runtime hack like this in rcupdate.h:

> > > > 

> > > > 	extern int panic_timeout;

> > > > 

> > > > 	...

> > > > 

> > > > 	if (panic_timeout)

> > > > 		smp_load_acquire(p);

> > > > 	else

> > > > 		typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p);

> > > > 

> > > > (or so)

> > > 

> > > So the problem with this is that a LOAD <ctrl> LOAD sequence isn't an

> > > ordering hazard on ARM, so you're potentially at the mercy of the branch

> > > predictor as to whether you get an acquire. That's not to say it won't

> > > be discarded as soon as the conditional is resolved, but it could

> > > screw up the benchmarking.

> > > 

> > > I'd be better off doing some runtime patching, but that's not something

> > > I can knock up in a couple of minutes (so I'll add it to my list).

> > 

> > ... so I actually got that up and running, believe it or not. Filthy stuff.

> 

> Wow!

> 

> I tried to implement the simpler solution by hacking rcupdate.h, but got drowned 

> in nasty circular header file dependencies and gave up...


I hacked linux/compiler.h, but got similar problems and ended up copying
what I need under a new namespace (posh way of saying I added _WILL to
everything until it built).

> If you are not overly embarrassed by posting hacky patches, mind posting your 

> solution?


Ok, since you asked! I'm thoroughly ashamed of the hacks, but maybe it
will keep those spammy Microsoft recruiters at bay ;)

Note that the "trigger" is changing the loglevel, but you need to do this
by echoing to /proc/sysrq-trigger, otherwise the whole thing gets invoked
in irq context which ends badly. It's also heavily arm64-specific.

> > The good news is that you're right, and I'm now seeing ~1% difference between 

> > the runs with ~0.3% noise for either of them. I still think that's significant, 

> > but it's a lot more reassuring than 4%.

> 

> hm, so for such marginal effects I think we could improve the testing method a 

> bit: we could improve 'perf bench sched messaging' to allow 'steady state 

> testing': to not exit+restart all the processes between test iterations, but to 

> continuously measure and print out current performance figures.

> 

> I.e. every 10 seconds it could print a decaying running average of current 

> throughput.

> 

> That way you could patch/unpatch the instructions without having to restart the 

> tasks. If you still see an effect (in the numbers reported every 10 seconds), then 

> that's a guaranteed result.

> 

> [ We have such functionality in 'perf bench numa' (the --show-convergence option), 

>   for similar reasons, to allow runtime monitoring and tweaking of kernel 

>   parameters. ]


That sounds handy.

Will

--->8
diff mbox

Patch

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 8f271b83f910..1a1e353983be 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -30,8 +30,8 @@ 
 #define ARM64_HAS_LSE_ATOMICS			5
 #define ARM64_WORKAROUND_CAVIUM_23154		6
 #define ARM64_WORKAROUND_834220			7
-
-#define ARM64_NCAPS				8
+#define ARM64_RCU_USES_ACQUIRE			8
+#define ARM64_NCAPS				9
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index d2ee1b21a10d..edbf188a8541 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -143,6 +143,66 @@  static int __apply_alternatives_multi_stop(void *unused)
 	return 0;
 }
 
+static void __apply_alternatives_rcu(void *alt_region)
+{
+	struct alt_instr *alt;
+	struct alt_region *region = alt_region;
+	u32 *origptr, *replptr;
+
+	for (alt = region->begin; alt < region->end; alt++) {
+		u32 insn, orig_insn;
+		int nr_inst;
+
+		if (alt->cpufeature != ARM64_RCU_USES_ACQUIRE)
+			continue;
+
+		BUG_ON(alt->alt_len != alt->orig_len);
+
+		origptr = ALT_ORIG_PTR(alt);
+		replptr = ALT_REPL_PTR(alt);
+		nr_inst = alt->alt_len / sizeof(insn);
+
+		BUG_ON(nr_inst != 1);
+
+		insn = le32_to_cpu(*replptr);
+		orig_insn = le32_to_cpu(*origptr);
+		*(origptr) = cpu_to_le32(insn);
+		*replptr = cpu_to_le32(orig_insn);
+
+		flush_icache_range((uintptr_t)origptr, (uintptr_t)(origptr + 1));
+		pr_info_ratelimited("%p: 0x%x => 0x%x\n", origptr, orig_insn, insn);
+	}
+}
+
+static int will_patched;
+
+static int __apply_alternatives_rcu_multi_stop(void *unused)
+{
+	struct alt_region region = {
+		.begin	= __alt_instructions,
+		.end	= __alt_instructions_end,
+	};
+
+	/* We always have a CPU 0 at this point (__init) */
+	if (smp_processor_id()) {
+		while (!READ_ONCE(will_patched))
+			cpu_relax();
+		isb();
+	} else {
+		__apply_alternatives_rcu(&region);
+		/* Barriers provided by the cache flushing */
+		WRITE_ONCE(will_patched, 1);
+	}
+
+	return 0;
+}
+
+void apply_alternatives_rcu(void)
+{
+	will_patched = 0;
+	stop_machine(__apply_alternatives_rcu_multi_stop, NULL, cpu_online_mask);
+}
+
 void __init apply_alternatives_all(void)
 {
 	/* better not try code patching on a live SMP system */
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index e3928f578891..98c132c3b018 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -141,7 +141,10 @@  SECTIONS
 
 	PERCPU_SECTION(L1_CACHE_BYTES)
 
-	. = ALIGN(4);
+	__init_end = .;
+	. = ALIGN(PAGE_SIZE);
+
+
 	.altinstructions : {
 		__alt_instructions = .;
 		*(.altinstructions)
@@ -151,8 +154,7 @@  SECTIONS
 		*(.altinstr_replacement)
 	}
 
-	. = ALIGN(PAGE_SIZE);
-	__init_end = .;
+	. = ALIGN(4);
 
 	_data = .;
 	_sdata = .;
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index e5139402e7f8..3eb7193fcf88 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -80,6 +80,7 @@  static int __init sysrq_always_enabled_setup(char *str)
 
 __setup("sysrq_always_enabled", sysrq_always_enabled_setup);
 
+extern void apply_alternatives_rcu(void);
 
 static void sysrq_handle_loglevel(int key)
 {
@@ -89,6 +90,7 @@  static void sysrq_handle_loglevel(int key)
 	console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
 	pr_info("Loglevel set to %d\n", i);
 	console_loglevel = i;
+	apply_alternatives_rcu();
 }
 static struct sysrq_key_op sysrq_loglevel_op = {
 	.handler	= sysrq_handle_loglevel,
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 00b042c49ccd..75e29d61fedd 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -218,6 +218,61 @@  void __read_once_size(const volatile void *p, void *res, int size)
 	__READ_ONCE_SIZE;
 }
 
+extern void panic(const char *fmt, ...);
+
+#define __stringify_1_will(x...)	#x
+#define __stringify_will(x...)	__stringify_1_will(x)
+
+#define ALTINSTR_ENTRY_WILL(feature)						      \
+	" .word 661b - .\n"				/* label           */ \
+	" .word 663f - .\n"				/* new instruction */ \
+	" .hword " __stringify_will(feature) "\n"		/* feature bit     */ \
+	" .byte 662b-661b\n"				/* source len      */ \
+	" .byte 664f-663f\n"				/* replacement len */
+
+#define __ALTERNATIVE_CFG_WILL(oldinstr, newinstr, feature, cfg_enabled)	\
+	".if "__stringify_will(cfg_enabled)" == 1\n"				\
+	"661:\n\t"							\
+	oldinstr "\n"							\
+	"662:\n"							\
+	".pushsection .altinstructions,\"a\"\n"				\
+	ALTINSTR_ENTRY_WILL(feature)						\
+	".popsection\n"							\
+	".pushsection .altinstr_replacement, \"a\"\n"			\
+	"663:\n\t"							\
+	newinstr "\n"							\
+	"664:\n\t"							\
+	".popsection\n\t"						\
+	".org	. - (664b-663b) + (662b-661b)\n\t"			\
+	".org	. - (662b-661b) + (664b-663b)\n"			\
+	".endif\n"
+
+#define _ALTERNATIVE_CFG_WILL(oldinstr, newinstr, feature, cfg, ...)	\
+	__ALTERNATIVE_CFG_WILL(oldinstr, newinstr, feature, 1)
+
+#define ALTERNATIVE_WILL(oldinstr, newinstr, ...)   \
+	_ALTERNATIVE_CFG_WILL(oldinstr, newinstr, __VA_ARGS__, 1)
+
+#define __READ_ONCE_SIZE_WILL						\
+({									\
+	__u64 tmp;							\
+									\
+	switch (size) {							\
+	case 8: asm volatile(						\
+		ALTERNATIVE_WILL("ldr %0, %1", "ldar %0, %1", 8)	\
+		: "=r" (tmp) : "Q" (*(volatile __u64 *)p));		\
+		*(__u64 *)res = tmp; break;				\
+	default:							\
+		panic("that's no pointer, yo");				\
+	}								\
+})
+
+static __always_inline
+void __read_once_size_will(const volatile void *p, void *res, int size)
+{
+	__READ_ONCE_SIZE_WILL;
+}
+
 #ifdef CONFIG_KASAN
 /*
  * This function is not 'inline' because __no_sanitize_address confilcts
@@ -537,10 +592,17 @@  static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * object's lifetime is managed by something other than RCU.  That
  * "something other" might be reference counting or simple immortality.
  */
+
+#define READ_ONCE_WILL(x)						\
+({									\
+	union { typeof(x) __val; char __c[1]; } __u;			\
+	__read_once_size_will(&(x), __u.__c, sizeof(x));		\
+	__u.__val;							\
+})
+
 #define lockless_dereference(p) \
 ({ \
-	typeof(p) _________p1 = READ_ONCE(p); \
-	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
+	typeof(p) _________p1 = READ_ONCE_WILL(p); \
 	(_________p1); \
 })