Message ID | 1502130965-18710-15-git-send-email-mark.rutland@arm.com |
---|---|
State | Superseded |
Series | arm64: VMAP_STACK support |
Just some minor comments on this (after taking ages to realise you were
using tpidr_el0 as a temporary rather than tpidr_el1 and getting totally
confused!).

On Mon, Aug 07, 2017 at 07:36:05PM +0100, Mark Rutland wrote:
> This patch adds stack overflow detection to arm64, usable when vmap'd stacks
> are in use.
>
> Overflow is detected in a small preamble executed for each exception entry,
> which checks whether there is enough space on the current stack for the
> general purpose registers to be saved. If there is not enough space, the
> overflow handler is invoked on a per-cpu overflow stack. This approach
> preserves the original exception information in ESR_EL1 (and where
> appropriate, FAR_EL1).
>
> Task and IRQ stacks are aligned to double their size, enabling overflow to
> be detected with a single bit test. For example, a 16K stack is aligned to
> 32K, ensuring that bit 14 of the SP must be zero. On an overflow (or
> underflow), this bit is flipped. Thus, overflow (of less than the size of
> the stack) can be detected by testing whether this bit is set.
>
> The overflow check is performed before any attempt is made to access the
> stack, avoiding recursive faults (and the loss of exception information
> these would entail). As logical operations cannot be performed on the SP
> directly, the SP is temporarily swapped with a general purpose register
> using arithmetic operations to enable the test to be performed.

[...]

> diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
> index c5cd2c5..1a025b7 100644
> --- a/arch/arm64/include/asm/memory.h
> +++ b/arch/arm64/include/asm/memory.h
> @@ -133,6 +133,8 @@
>
>  #define IRQ_STACK_SIZE		THREAD_SIZE
>
> +#define OVERFLOW_STACK_SIZE	SZ_4K
> +
>  /*
>   * Alignment of kernel segments (e.g. .text, .data).
>   */
> diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
> index 92ddb6d..ee19563 100644
> --- a/arch/arm64/include/asm/stacktrace.h
> +++ b/arch/arm64/include/asm/stacktrace.h
> @@ -57,6 +57,22 @@ static inline bool on_task_stack(struct task_struct *tsk, unsigned long sp)
>  	return (low <= sp && sp < high);
>  }
>
> +#ifdef CONFIG_VMAP_STACK
> +DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack);
> +
> +#define OVERFLOW_STACK_PTR() ((unsigned long)this_cpu_ptr(overflow_stack) + OVERFLOW_STACK_SIZE)
> +
> +static inline bool on_overflow_stack(unsigned long sp)
> +{
> +	unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack);

Can you use raw_cpu_ptr here, like you do for the irq stack?

> +	unsigned long high = low + OVERFLOW_STACK_SIZE;
> +
> +	return (low <= sp && sp < high);
> +}
> +#else
> +static inline bool on_overflow_stack(unsigned long sp) { return false; }
> +#endif
> +
>  /*
>   * We can only safely access per-cpu stacks from current in a non-preemptible
>   * context.
> @@ -69,6 +85,8 @@ static inline bool on_accessible_stack(struct task_struct *tsk, unsigned long sp
>  		return false;
>  	if (on_irq_stack(sp))
>  		return true;
> +	if (on_overflow_stack(sp))
> +		return true;

I find the "return false" clause in this function makes it fiddly to read
because it's really predicating all following conditionals on current &&
!preemptible, but I haven't got any better ideas :(

>  	return false;
>  }
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index e5aa866..44a27c3 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -72,6 +72,37 @@
>  	.macro kernel_ventry	label
>  	.align 7
>  	sub	sp, sp, #S_FRAME_SIZE
> +#ifdef CONFIG_VMAP_STACK
> +	add	sp, sp, x0		// sp' = sp + x0
> +	sub	x0, sp, x0		// x0' = sp' - x0 = (sp + x0) - x0 = sp
> +	tbnz	x0, #THREAD_SHIFT, 0f
> +	sub	x0, sp, x0		// sp' - x0' = (sp + x0) - sp = x0
> +	sub	sp, sp, x0		// sp' - x0 = (sp + x0) - x0 = sp
> +	b	\label
> +
> +	/* Stash the original SP value in tpidr_el0 */
> +0:	msr	tpidr_el0, x0

The comment here is a bit confusing, since the sp has already been
decremented for the frame, as mentioned in a later comment.

> +
> +	/* Recover the original x0 value and stash it in tpidrro_el0 */
> +	sub	x0, sp, x0
> +	msr	tpidrro_el0, x0
> +
> +	/* Switch to the overflow stack */
> +	adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0
> +
> +	/*
> +	 * Check whether we were already on the overflow stack. This may happen
> +	 * after panic() re-enables interrupts.
> +	 */
> +	mrs	x0, tpidr_el0			// sp of interrupted context
> +	sub	x0, sp, x0			// delta with top of overflow stack
> +	tst	x0, #~(OVERFLOW_STACK_SIZE - 1)	// within range?
> +	b.ne	__bad_stack			// no? -> bad stack pointer
> +
> +	/* We were already on the overflow stack. Restore sp/x0 and carry on. */
> +	sub	sp, sp, x0
> +	mrs	x0, tpidrro_el0
> +#endif
>  	b	\label
>  	.endm
>
> @@ -348,6 +379,34 @@ ENTRY(vectors)
>  #endif
>  END(vectors)
>
> +#ifdef CONFIG_VMAP_STACK
> +	/*
> +	 * We detected an overflow in kernel_ventry, which switched to the
> +	 * overflow stack. Stash the exception regs, and head to our overflow
> +	 * handler.
> +	 */
> +__bad_stack:
> +	/* Restore the original x0 value */
> +	mrs	x0, tpidrro_el0
> +
> +	/*
> +	 * Store the original GPRs to the new stack. The orginial SP (minus

original

> +	 * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry.
> +	 */
> +	sub	sp, sp, #S_FRAME_SIZE
> +	kernel_entry 1
> +	mrs	x0, tpidr_el0
> +	add	x0, x0, #S_FRAME_SIZE
> +	str	x0, [sp, #S_SP]
> +
> +	/* Stash the regs for handle_bad_stack */
> +	mov	x0, sp
> +
> +	/* Time to die */
> +	bl	handle_bad_stack
> +	ASM_BUG()

Why not just a b without the ASM_BUG?

Will
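The "single bit test" referred to in the commit message follows directly from the double-size alignment. A minimal C sketch of the idea (purely illustrative; sp_overflowed is a made-up helper, not part of the patch):

        /*
         * Illustrative sketch only: with 16K stacks (THREAD_SHIFT == 14)
         * aligned to 32K, every address inside the stack has bit 14 clear,
         * so an SP that has strayed off either end of the stack by less
         * than THREAD_SIZE has bit 14 set.
         */
        #define THREAD_SHIFT    14
        #define THREAD_SIZE     (1UL << THREAD_SHIFT)

        static inline int sp_overflowed(unsigned long sp)
        {
                /* bit THREAD_SHIFT is clear for any SP still on the stack */
                return (sp & (1UL << THREAD_SHIFT)) != 0;
        }

This is exactly the test the tbnz in kernel_ventry performs, once the decremented SP has been recovered into x0.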
On Mon, Aug 14, 2017 at 04:32:53PM +0100, Will Deacon wrote:
> Just some minor comments on this (after taking ages to realise you were
> using tpidr_el0 as a temporary rather than tpidr_el1 and getting totally
> confused!).
>
> On Mon, Aug 07, 2017 at 07:36:05PM +0100, Mark Rutland wrote:
> > +static inline bool on_overflow_stack(unsigned long sp)
> > +{
> > +	unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack);
>
> Can you use raw_cpu_ptr here, like you do for the irq stack?

Sure; done.

> > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> > index e5aa866..44a27c3 100644
> > --- a/arch/arm64/kernel/entry.S
> > +++ b/arch/arm64/kernel/entry.S
> > @@ -72,6 +72,37 @@
> >  	.macro kernel_ventry	label
> >  	.align 7
> >  	sub	sp, sp, #S_FRAME_SIZE
> > +#ifdef CONFIG_VMAP_STACK
> > +	add	sp, sp, x0		// sp' = sp + x0
> > +	sub	x0, sp, x0		// x0' = sp' - x0 = (sp + x0) - x0 = sp
> > +	tbnz	x0, #THREAD_SHIFT, 0f
> > +	sub	x0, sp, x0		// sp' - x0' = (sp + x0) - sp = x0
> > +	sub	sp, sp, x0		// sp' - x0 = (sp + x0) - x0 = sp
> > +	b	\label
> > +
> > +	/* Stash the original SP value in tpidr_el0 */
> > +0:	msr	tpidr_el0, x0
>
> The comment here is a bit confusing, since the sp has already been
> decremented for the frame, as mentioned in a later comment.

True. I've updated the comment to say:

	/*
	 * Stash the SP (minus S_FRAME_SIZE) in tpidr_el0. We can recover the
	 * original SP value later if we need it.
	 */

[...]

> > +	 * Store the original GPRs to the new stack. The orginial SP (minus
>
> original

Took me a moment to spot the second instance. Fixed now.

[...]

> > +	/* Time to die */
> > +	bl	handle_bad_stack
> > +	ASM_BUG()
>
> Why not just a b without the ASM_BUG?

We need the BL to ensure that the LR is valid for unwinding. That's
necessary for the backtrace to identify the exception regs based on the
LR falling into .entry.text.

The ASM_BUG() ensures that the LR value definitely falls in .entry.text,
and makes the backtrace resolve the symbol correctly regardless of what's
next.

I didn't add a comment for the other cases, so I hadn't bothered here. I'm
happy to add those, so long as we're consistent.

Thanks,
Mark.
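The unwinding argument above can be pictured with a small sketch: the backtrace code can treat a saved return address that falls inside .entry.text as marking a set of exception registers, which is why a plain b (leaving a stale LR) would not do. The helper below is illustrative only (lr_in_entry_text is a made-up name); __entry_text_start/__entry_text_end are the usual linker symbols bounding .entry.text:

        #include <linux/types.h>
        #include <asm/sections.h>       /* __entry_text_start, __entry_text_end */

        /* Sketch: does a saved LR point into .entry.text? */
        static bool lr_in_entry_text(unsigned long lr)
        {
                return lr >= (unsigned long)__entry_text_start &&
                       lr <  (unsigned long)__entry_text_end;
        }

With the bl, the LR left by the call to handle_bad_stack satisfies this check; ASM_BUG() simply guarantees the address after the bl still lies within .entry.text and resolves to a sensible symbol.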
On Mon, Aug 07, 2017 at 07:36:05PM +0100, Mark Rutland wrote:
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index e5aa866..44a27c3 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -72,6 +72,37 @@
>  	.macro kernel_ventry	label
>  	.align 7
>  	sub	sp, sp, #S_FRAME_SIZE
> +#ifdef CONFIG_VMAP_STACK
> +	add	sp, sp, x0		// sp' = sp + x0
> +	sub	x0, sp, x0		// x0' = sp' - x0 = (sp + x0) - x0 = sp
> +	tbnz	x0, #THREAD_SHIFT, 0f
> +	sub	x0, sp, x0		// sp' - x0' = (sp + x0) - sp = x0
> +	sub	sp, sp, x0		// sp' - x0 = (sp + x0) - x0 = sp
> +	b	\label

Maybe a small comment before this hunk just to tell the user that it's
trying to test a bit in SP without corrupting a gpr. It's obvious once
you read it but not when you see it for the first time ;).

> +
> +	/* Stash the original SP value in tpidr_el0 */
> +0:	msr	tpidr_el0, x0

And a comment here that on this path we no longer care about the user
tpidr_el0 as we are never returning there.

Otherwise I'm fine with the series (I'm not a fan of the complexity it
adds but I don't have a better suggestion).

-- 
Catalin
On Tue, Aug 15, 2017 at 12:10:32PM +0100, Catalin Marinas wrote:
> On Mon, Aug 07, 2017 at 07:36:05PM +0100, Mark Rutland wrote:
> > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> > index e5aa866..44a27c3 100644
> > --- a/arch/arm64/kernel/entry.S
> > +++ b/arch/arm64/kernel/entry.S
> > @@ -72,6 +72,37 @@
> >  	.macro kernel_ventry	label
> >  	.align 7
> >  	sub	sp, sp, #S_FRAME_SIZE
> > +#ifdef CONFIG_VMAP_STACK
> > +	add	sp, sp, x0		// sp' = sp + x0
> > +	sub	x0, sp, x0		// x0' = sp' - x0 = (sp + x0) - x0 = sp
> > +	tbnz	x0, #THREAD_SHIFT, 0f
> > +	sub	x0, sp, x0		// sp' - x0' = (sp + x0) - sp = x0
> > +	sub	sp, sp, x0		// sp' - x0 = (sp + x0) - x0 = sp
> > +	b	\label
>
> Maybe a small comment before this hunk just to tell the user that it's
> trying to test a bit in SP without corrupting a gpr. It's obvious once
> you read it but not when you see it for the first time ;).
>
> > +
> > +	/* Stash the original SP value in tpidr_el0 */
> > +0:	msr	tpidr_el0, x0
>
> And a comment here that on this path we no longer care about the user
> tpidr_el0 as we are never returning there.

Ok. I've updated comments in both cases.

> Otherwise I'm fine with the series (I'm not a fan of the complexity it
> adds but I don't have a better suggestion).

Thanks!

I'll send out a v2 shortly with the changes you requested.

Thanks,
Mark.
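The arithmetic Catalin is asking to have documented swaps SP and x0 using only add/sub (SP cannot be used as the operand of tbnz or the logical instructions), tests the bit, and then swaps the two values back. Rendered as plain C on ordinary variables, as an illustration only rather than the patch itself:

        /*
         * Illustration of the kernel_ventry add/sub trick: after the first
         * two steps x0 holds the original sp, which can be bit-tested; the
         * last two steps restore both values without a scratch register.
         */
        void swap_test_restore(unsigned long *sp, unsigned long *x0)
        {
                *sp = *sp + *x0;        /* sp' = sp + x0                   */
                *x0 = *sp - *x0;        /* x0' = sp' - x0 = original sp    */

                /* ...bit THREAD_SHIFT of *x0 (the original sp) is tested... */

                *x0 = *sp - *x0;        /* sp' - original sp = original x0 */
                *sp = *sp - *x0;        /* sp' - original x0 = original sp */
        }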
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index c5cd2c5..1a025b7 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -133,6 +133,8 @@
 
 #define IRQ_STACK_SIZE		THREAD_SIZE
 
+#define OVERFLOW_STACK_SIZE	SZ_4K
+
 /*
  * Alignment of kernel segments (e.g. .text, .data).
  */
diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index 92ddb6d..ee19563 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -57,6 +57,22 @@ static inline bool on_task_stack(struct task_struct *tsk, unsigned long sp)
 	return (low <= sp && sp < high);
 }
 
+#ifdef CONFIG_VMAP_STACK
+DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack);
+
+#define OVERFLOW_STACK_PTR() ((unsigned long)this_cpu_ptr(overflow_stack) + OVERFLOW_STACK_SIZE)
+
+static inline bool on_overflow_stack(unsigned long sp)
+{
+	unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack);
+	unsigned long high = low + OVERFLOW_STACK_SIZE;
+
+	return (low <= sp && sp < high);
+}
+#else
+static inline bool on_overflow_stack(unsigned long sp) { return false; }
+#endif
+
 /*
  * We can only safely access per-cpu stacks from current in a non-preemptible
  * context.
@@ -69,6 +85,8 @@ static inline bool on_accessible_stack(struct task_struct *tsk, unsigned long sp
 		return false;
 	if (on_irq_stack(sp))
 		return true;
+	if (on_overflow_stack(sp))
+		return true;
 
 	return false;
 }
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index e5aa866..44a27c3 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -72,6 +72,37 @@
 	.macro kernel_ventry	label
 	.align 7
 	sub	sp, sp, #S_FRAME_SIZE
+#ifdef CONFIG_VMAP_STACK
+	add	sp, sp, x0			// sp' = sp + x0
+	sub	x0, sp, x0			// x0' = sp' - x0 = (sp + x0) - x0 = sp
+	tbnz	x0, #THREAD_SHIFT, 0f
+	sub	x0, sp, x0			// sp' - x0' = (sp + x0) - sp = x0
+	sub	sp, sp, x0			// sp' - x0 = (sp + x0) - x0 = sp
+	b	\label
+
+	/* Stash the original SP value in tpidr_el0 */
+0:	msr	tpidr_el0, x0
+
+	/* Recover the original x0 value and stash it in tpidrro_el0 */
+	sub	x0, sp, x0
+	msr	tpidrro_el0, x0
+
+	/* Switch to the overflow stack */
+	adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0
+
+	/*
+	 * Check whether we were already on the overflow stack. This may happen
+	 * after panic() re-enables interrupts.
+	 */
+	mrs	x0, tpidr_el0			// sp of interrupted context
+	sub	x0, sp, x0			// delta with top of overflow stack
+	tst	x0, #~(OVERFLOW_STACK_SIZE - 1)	// within range?
+	b.ne	__bad_stack			// no? -> bad stack pointer
+
+	/* We were already on the overflow stack. Restore sp/x0 and carry on. */
+	sub	sp, sp, x0
+	mrs	x0, tpidrro_el0
+#endif
 	b	\label
 	.endm
 
@@ -348,6 +379,34 @@ ENTRY(vectors)
 #endif
 END(vectors)
 
+#ifdef CONFIG_VMAP_STACK
+	/*
+	 * We detected an overflow in kernel_ventry, which switched to the
+	 * overflow stack. Stash the exception regs, and head to our overflow
+	 * handler.
+	 */
+__bad_stack:
+	/* Restore the original x0 value */
+	mrs	x0, tpidrro_el0
+
+	/*
+	 * Store the original GPRs to the new stack. The orginial SP (minus
+	 * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry.
+	 */
+	sub	sp, sp, #S_FRAME_SIZE
+	kernel_entry 1
+	mrs	x0, tpidr_el0
+	add	x0, x0, #S_FRAME_SIZE
+	str	x0, [sp, #S_SP]
+
+	/* Stash the regs for handle_bad_stack */
+	mov	x0, sp
+
+	/* Time to die */
+	bl	handle_bad_stack
+	ASM_BUG()
+#endif /* CONFIG_VMAP_STACK */
+
 /*
  * Invalid mode handlers
  */
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index d01c598..2c80a11 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -32,6 +32,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sizes.h>
 #include <linux/syscalls.h>
 #include <linux/mm_types.h>
 
@@ -41,6 +42,7 @@
 #include <asm/esr.h>
 #include <asm/insn.h>
 #include <asm/traps.h>
+#include <asm/smp.h>
 #include <asm/stack_pointer.h>
 #include <asm/stacktrace.h>
 #include <asm/exception.h>
@@ -666,6 +668,43 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
 	force_sig_info(info.si_signo, &info, current);
 }
 
+#ifdef CONFIG_VMAP_STACK
+
+DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack)
+	__aligned(16);
+
+asmlinkage void handle_bad_stack(struct pt_regs *regs)
+{
+	unsigned long tsk_stk = (unsigned long)current->stack;
+	unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr);
+	unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack);
+	unsigned int esr = read_sysreg(esr_el1);
+	unsigned long far = read_sysreg(far_el1);
+
+	console_verbose();
+	pr_emerg("Insufficient stack space to handle exception!");
+
+	__show_regs(regs);
+
+	pr_emerg("Task stack:     [0x%016lx..0x%016lx]\n",
+		 tsk_stk, tsk_stk + THREAD_SIZE);
+	pr_emerg("IRQ stack:      [0x%016lx..0x%016lx]\n",
+		 irq_stk, irq_stk + THREAD_SIZE);
+	pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n",
+		 ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE);
+
+	pr_emerg("ESR: 0x%08x -- %s\n", esr, esr_get_class_string(esr));
+	pr_emerg("FAR: 0x%016lx\n", far);
+
+	/*
+	 * We use nmi_panic to limit the potential for recusive overflows, and
+	 * to get a better stack trace.
+	 */
+	nmi_panic(NULL, "kernel stack overflow");
+	cpu_park_loop();
+}
+#endif
+
 void __pte_error(const char *file, int line, unsigned long val)
 {
 	pr_err("%s:%d: bad pte %016lx.\n", file, line, val);
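For context, the kind of failure this machinery is meant to catch is unbounded recursion or an oversized stack frame running into the vmap'd stack's guard page. A hypothetical way to exercise the path (illustrative only; exhaust_stack is not part of the series):

        /*
         * Hypothetical test only: recurse until the guard page below the
         * vmap'd stack is hit; kernel_ventry should then switch to the
         * per-cpu overflow stack and handle_bad_stack() should print the
         * report above before panicking.
         */
        static noinline int exhaust_stack(int depth)
        {
                volatile char pad[256];

                pad[0] = (char)depth;
                return exhaust_stack(depth + 1) + pad[0];
        }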