diff mbox series

[RFT,v7,9/9] selftests/clone3: Test shadow stack support

Message ID 20240731-clone3-shadow-stack-v7-9-a9532eebfb1d@kernel.org
State Superseded
Headers show
Series [RFT,v7,1/9] Documentation: userspace-api: Add shadow stack API documentation | expand

Commit Message

Mark Brown July 31, 2024, 12:14 p.m. UTC
Add basic test coverage for specifying the shadow stack for a newly
created thread via clone3(), including coverage of the newly extended
argument structure.  We check that a user specified shadow stack can be
provided, and that invalid combinations of parameters are rejected.

In order to facilitate testing on systems without userspace shadow stack
support we manually enable shadow stacks on startup, this is architecture
specific due to the use of an arch_prctl() on x86. Due to interactions with
potential userspace locking of features we actually detect support for
shadow stacks on the running system by attempting to allocate a shadow
stack page during initialisation using map_shadow_stack(), warning if this
succeeds when the enable failed.

In order to allow testing of user configured shadow stacks on
architectures with that feature we need to ensure that we do not return
from the function where the clone3() syscall is called in the child
process, doing so would trigger a shadow stack underflow.  To do this we
use inline assembly rather than the standard syscall wrapper to call
clone3().  In order to avoid surprises we also use a syscall rather than
the libc exit() function., this should be overly cautious.

Signed-off-by: Mark Brown <broonie@kernel.org>
---
 tools/testing/selftests/clone3/clone3.c           | 134 +++++++++++++++++++++-
 tools/testing/selftests/clone3/clone3_selftests.h |  38 ++++++
 2 files changed, 171 insertions(+), 1 deletion(-)

Comments

Kees Cook Aug. 6, 2024, 3:54 a.m. UTC | #1
On Wed, Jul 31, 2024 at 01:14:15PM +0100, Mark Brown wrote:
> Add basic test coverage for specifying the shadow stack for a newly
> created thread via clone3(), including coverage of the newly extended
> argument structure.  We check that a user specified shadow stack can be
> provided, and that invalid combinations of parameters are rejected.
> 
> In order to facilitate testing on systems without userspace shadow stack
> support we manually enable shadow stacks on startup, this is architecture
> specific due to the use of an arch_prctl() on x86. Due to interactions with
> potential userspace locking of features we actually detect support for
> shadow stacks on the running system by attempting to allocate a shadow
> stack page during initialisation using map_shadow_stack(), warning if this
> succeeds when the enable failed.
> 
> In order to allow testing of user configured shadow stacks on
> architectures with that feature we need to ensure that we do not return
> from the function where the clone3() syscall is called in the child
> process, doing so would trigger a shadow stack underflow.  To do this we
> use inline assembly rather than the standard syscall wrapper to call
> clone3().  In order to avoid surprises we also use a syscall rather than
> the libc exit() function., this should be overly cautious.
> 
> Signed-off-by: Mark Brown <broonie@kernel.org>
> ---
>  tools/testing/selftests/clone3/clone3.c           | 134 +++++++++++++++++++++-
>  tools/testing/selftests/clone3/clone3_selftests.h |  38 ++++++
>  2 files changed, 171 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c
> index 26221661e9ae..81c2e8648e8b 100644
> --- a/tools/testing/selftests/clone3/clone3.c
> +++ b/tools/testing/selftests/clone3/clone3.c
> @@ -3,6 +3,7 @@
>  /* Based on Christian Brauner's clone3() example */
>  
>  #define _GNU_SOURCE
> +#include <asm/mman.h>
>  #include <errno.h>
>  #include <inttypes.h>
>  #include <linux/types.h>
> @@ -11,6 +12,7 @@
>  #include <stdint.h>
>  #include <stdio.h>
>  #include <stdlib.h>
> +#include <sys/mman.h>
>  #include <sys/syscall.h>
>  #include <sys/types.h>
>  #include <sys/un.h>
> @@ -19,8 +21,12 @@
>  #include <sched.h>
>  
>  #include "../kselftest.h"
> +#include "../ksft_shstk.h"
>  #include "clone3_selftests.h"
>  
> +static bool shadow_stack_supported;
> +static size_t max_supported_args_size;
> +
>  enum test_mode {
>  	CLONE3_ARGS_NO_TEST,
>  	CLONE3_ARGS_ALL_0,
> @@ -28,6 +34,10 @@ enum test_mode {
>  	CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG,
>  	CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG,
>  	CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG,
> +	CLONE3_ARGS_SHADOW_STACK,
> +	CLONE3_ARGS_SHADOW_STACK_NO_SIZE,
> +	CLONE3_ARGS_SHADOW_STACK_NO_POINTER,
> +	CLONE3_ARGS_SHADOW_STACK_NO_TOKEN,
>  };
>  
>  typedef bool (*filter_function)(void);
> @@ -44,6 +54,44 @@ struct test {
>  	filter_function filter;
>  };
>  
> +
> +/*
> + * We check for shadow stack support by attempting to use
> + * map_shadow_stack() since features may have been locked by the
> + * dynamic linker resulting in spurious errors when we attempt to
> + * enable on startup.  We warn if the enable failed.
> + */
> +static void test_shadow_stack_supported(void)
> +{
> +	long ret;
> +
> +	ret = syscall(__NR_map_shadow_stack, 0, getpagesize(), 0);
> +	if (ret == -1) {
> +		ksft_print_msg("map_shadow_stack() not supported\n");
> +	} else if ((void *)ret == MAP_FAILED) {
> +		ksft_print_msg("Failed to map shadow stack\n");
> +	} else {
> +		ksft_print_msg("Shadow stack supportd\n");

typo: supportd -> supported

> +		shadow_stack_supported = true;
> +
> +		if (!shadow_stack_enabled)
> +			ksft_print_msg("Mapped but did not enable shadow stack\n");
> +	}
> +}

On my CET system, this reports:

  ...
  # clone3() syscall supported
  # Shadow stack supportd
  # Running test 'simple clone3()'
  ...

(happily doesn't print "Mapped but did not enable ...").

> +
> +static unsigned long long get_shadow_stack_page(unsigned long flags)
> +{
> +	unsigned long long page;
> +
> +	page = syscall(__NR_map_shadow_stack, 0, getpagesize(), flags);
> +	if ((void *)page == MAP_FAILED) {
> +		ksft_print_msg("map_shadow_stack() failed: %d\n", errno);
> +		return 0;
> +	}
> +
> +	return page;
> +}
> +
>  static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
>  {
>  	struct __clone_args args = {
> @@ -89,6 +137,21 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
>  	case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG:
>  		args.exit_signal = 0x00000000000000f0ULL;
>  		break;
> +	case CLONE3_ARGS_SHADOW_STACK:
> +		/* We need to specify a normal stack too to avoid corruption */
> +		args.shadow_stack = get_shadow_stack_page(SHADOW_STACK_SET_TOKEN);
> +		args.shadow_stack_size = getpagesize();
> +		break;

  # Running test 'Shadow stack on system with shadow stack'
  # [5496] Trying clone3() with flags 0 (size 0)
  # I am the parent (5496). My child's pid is 5505
  # Child exited with signal 11
  # [5496] clone3() with flags says: 11 expected 0
  # [5496] Result (11) is different than expected (0)
  not ok 20 Shadow stack on system with shadow stack

The child segfaults immediately, it seems?

> +	case CLONE3_ARGS_SHADOW_STACK_NO_POINTER:
> +		args.shadow_stack_size = getpagesize();
> +		break;

  # Running test 'Shadow stack with no pointer'
  # [5496] Trying clone3() with flags 0 (size 0)
  # Invalid argument - Failed to create new process
  # [5496] clone3() with flags says: -22 expected -22
  ok 21 Shadow stack with no pointer

This seems like it misses the failure and reports ok

> +	case CLONE3_ARGS_SHADOW_STACK_NO_SIZE:
> +		args.shadow_stack = get_shadow_stack_page(SHADOW_STACK_SET_TOKEN);
> +		break;

  # Running test 'Shadow stack with no size'
  # [5496] Trying clone3() with flags 0 (size 0)
  # Invalid argument - Failed to create new process
  # [5496] clone3() with flags says: -22 expected -22
  ok 22 Shadow stack with no size

Same?

> +	case CLONE3_ARGS_SHADOW_STACK_NO_TOKEN:
> +		args.shadow_stack = get_shadow_stack_page(0);
> +		args.shadow_stack_size = getpagesize();
> +		break;

This actually segfaults the parent:

  # Running test 'Shadow stack with no token'
  # [5496] Trying clone3() with flags 0x100 (size 0)
  # I am the parent (5496). My child's pid is 5507
  Segmentation fault

Let me know what would be most helpful to dig into more...
Mark Brown Aug. 6, 2024, 3:10 p.m. UTC | #2
On Mon, Aug 05, 2024 at 08:54:54PM -0700, Kees Cook wrote:
> On Wed, Jul 31, 2024 at 01:14:15PM +0100, Mark Brown wrote:

> > +	case CLONE3_ARGS_SHADOW_STACK:
> > +		/* We need to specify a normal stack too to avoid corruption */
> > +		args.shadow_stack = get_shadow_stack_page(SHADOW_STACK_SET_TOKEN);
> > +		args.shadow_stack_size = getpagesize();
> > +		break;

>   # Running test 'Shadow stack on system with shadow stack'
>   # [5496] Trying clone3() with flags 0 (size 0)
>   # I am the parent (5496). My child's pid is 5505
>   # Child exited with signal 11
>   # [5496] clone3() with flags says: 11 expected 0
>   # [5496] Result (11) is different than expected (0)
>   not ok 20 Shadow stack on system with shadow stack

> The child segfaults immediately, it seems?

That's what I'd expect if we either didn't manage to create the shadow
stack token in the page we mapped or we messed up in
arch_shstk_post_fork() somehow, probably getting the wrong pointer for
the token or not mapping things correctly.  I'll have done something
silly, it'll doubtless be very obvious and embarrassing when I see it
but I'm not seeing it right now...

> > +	case CLONE3_ARGS_SHADOW_STACK_NO_POINTER:
> > +		args.shadow_stack_size = getpagesize();
> > +		break;

>   # Running test 'Shadow stack with no pointer'
>   # [5496] Trying clone3() with flags 0 (size 0)
>   # Invalid argument - Failed to create new process
>   # [5496] clone3() with flags says: -22 expected -22
>   ok 21 Shadow stack with no pointer

> This seems like it misses the failure and reports ok

No, this is testing to make sure we get the failure - if we have
arguments that can't possibly be valid then we should reject them with
an error code during validation prior to trying to create the new
thread.  The "expected -22" in the output says it's looking for an
error.  Same for the other similar expected error code.

> > +	case CLONE3_ARGS_SHADOW_STACK_NO_TOKEN:
> > +		args.shadow_stack = get_shadow_stack_page(0);
> > +		args.shadow_stack_size = getpagesize();
> > +		break;

> This actually segfaults the parent:

>   # Running test 'Shadow stack with no token'
>   # [5496] Trying clone3() with flags 0x100 (size 0)
>   # I am the parent (5496). My child's pid is 5507
>   Segmentation fault

Oh dear.  We possibly manage to corrupt the parent's shadow stack
somehow?  I don't think I managed to do that in my arm64 testing.  This
should also be something going wrong in arch_shstk_post_fork().

> Let me know what would be most helpful to dig into more...

It'll almost certianly be something in arch_shstk_post_fork(), that's
the bit I couldn't test.  Just making that always return success should
avoid the first fault, the second ought to not crash but will report a
fail as we should be rejecting the shadow stack when we try to consume
the token.
Mark Brown Aug. 6, 2024, 8:10 p.m. UTC | #3
On Mon, Aug 05, 2024 at 08:54:54PM -0700, Kees Cook wrote:

>   # Running test 'Shadow stack on system with shadow stack'
>   # [5496] Trying clone3() with flags 0 (size 0)
>   # I am the parent (5496). My child's pid is 5505
>   # Child exited with signal 11
>   # [5496] clone3() with flags says: 11 expected 0
>   # [5496] Result (11) is different than expected (0)
>   not ok 20 Shadow stack on system with shadow stack

> The child segfaults immediately, it seems?

Does this help:

diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 1755fa21e6fb..27acbdf44c5f 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -198,13 +198,14 @@ int arch_shstk_post_fork(struct task_struct *t, struct kernel_clone_args *args)
 	 * the token 64-bit.
 	 */
 	struct mm_struct *mm;
-	unsigned long addr;
+	unsigned long addr, ssp;
 	u64 expected;
 	u64 val;
-	int ret = -EINVAL;;
+	int ret = -EINVAL;
 
-	addr = args->shadow_stack + args->shadow_stack_size - sizeof(u64);
-	expected = (addr - SS_FRAME_SIZE) | BIT(0);
+	ssp = args->shadow_stack + args->shadow_stack_size;
+	addr = ssp - SS_FRAME_SIZE;
+	expected = ssp | BIT(0);
 
 	mm = get_task_mm(t);
 	if (!mm)
Kees Cook Aug. 6, 2024, 9:43 p.m. UTC | #4
On Tue, Aug 06, 2024 at 09:10:28PM +0100, Mark Brown wrote:
> On Mon, Aug 05, 2024 at 08:54:54PM -0700, Kees Cook wrote:
> 
> >   # Running test 'Shadow stack on system with shadow stack'
> >   # [5496] Trying clone3() with flags 0 (size 0)
> >   # I am the parent (5496). My child's pid is 5505
> >   # Child exited with signal 11
> >   # [5496] clone3() with flags says: 11 expected 0
> >   # [5496] Result (11) is different than expected (0)
> >   not ok 20 Shadow stack on system with shadow stack
> 
> > The child segfaults immediately, it seems?
> 
> Does this help:
> 
> diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
> index 1755fa21e6fb..27acbdf44c5f 100644
> --- a/arch/x86/kernel/shstk.c
> +++ b/arch/x86/kernel/shstk.c
> @@ -198,13 +198,14 @@ int arch_shstk_post_fork(struct task_struct *t, struct kernel_clone_args *args)
>  	 * the token 64-bit.
>  	 */
>  	struct mm_struct *mm;
> -	unsigned long addr;
> +	unsigned long addr, ssp;
>  	u64 expected;
>  	u64 val;
> -	int ret = -EINVAL;;
> +	int ret = -EINVAL;
>  
> -	addr = args->shadow_stack + args->shadow_stack_size - sizeof(u64);
> -	expected = (addr - SS_FRAME_SIZE) | BIT(0);
> +	ssp = args->shadow_stack + args->shadow_stack_size;
> +	addr = ssp - SS_FRAME_SIZE;
> +	expected = ssp | BIT(0);
>  
>  	mm = get_task_mm(t);
>  	if (!mm)

Yes indeed! This passes now.

"Shadow stack with no token" still crashes the parent. It seems to
crash in waitpid(). Under gdb it hangs instead, showing it's in glibc's
__GI___wait4(). Ah, it's crashing at c3 (ret), so shadow stack problem,
I imagine.

Does waitpid() need to be open-coded like the clone3() call too?
Mark Brown Aug. 6, 2024, 10:21 p.m. UTC | #5
On Tue, Aug 06, 2024 at 10:57:39PM +0100, Mark Brown wrote:
> On Tue, Aug 06, 2024 at 02:43:22PM -0700, Kees Cook wrote:

> > "Shadow stack with no token" still crashes the parent. It seems to
> > crash in waitpid(). Under gdb it hangs instead, showing it's in glibc's
> > __GI___wait4(). Ah, it's crashing at c3 (ret), so shadow stack problem,
> > I imagine.

> Yes, likely.  They are delivered as a SEGV with SEGV_CPERR.

> > Does waitpid() need to be open-coded like the clone3() call too?

> I wouldn't have expected so, it should just be a function call and
> definitely didn't do anything funky on arm64.  It seems more likely that
> we've managed to corrupt the stack or shadow stack - most likely the new
> thread is still using the original shadow stack rather than the new one
> and so corrupts it.  Again not immediately seeing where.  I'll have
> another look tomorrow if nobody has any bright ideas before then...

...or possibly we're delivering the signal that's generated when we fail
to validate the child's shadow stack token to the parent rather than the
child.  That logic (in shstk_post_fork()) should be shared with arm64
though so it ought to have been failing for me too.  Failure to validate
the token should look to the parent like the child immediately taking a
shadow stack fault.
Kees Cook Aug. 7, 2024, 5:08 a.m. UTC | #6
On Tue, Aug 06, 2024 at 04:10:02PM +0100, Mark Brown wrote:
> On Mon, Aug 05, 2024 at 08:54:54PM -0700, Kees Cook wrote:
> > This actually segfaults the parent:
> 
> >   # Running test 'Shadow stack with no token'
> >   # [5496] Trying clone3() with flags 0x100 (size 0)
> >   # I am the parent (5496). My child's pid is 5507
> >   Segmentation fault
> 
> Oh dear.  We possibly manage to corrupt the parent's shadow stack
> somehow?  I don't think I managed to do that in my arm64 testing.  This
> should also be something going wrong in arch_shstk_post_fork().
> 
> > Let me know what would be most helpful to dig into more...
> 
> It'll almost certianly be something in arch_shstk_post_fork(), that's
> the bit I couldn't test.  Just making that always return success should
> avoid the first fault, the second ought to not crash but will report a
> fail as we should be rejecting the shadow stack when we try to consume
> the token.

It took me a while to figure out where a thread switches shstk (even
without this series):

kernel_clone, copy_process, copy_thread, fpu_clone, update_fpu_shstk
(and shstk_alloc_thread_stack is called just before update_fpu_shstk).

I don't understand the token consumption in arch_shstk_post_fork(). This
wasn't needed before with the fixed-size new shstk, why is it needed
now?

Anyway, my attempt to trace the shstk changes for the test:

write(1, "TAP version 13\n", 15)        = 15
write(1, "1..2\n", 5)                   = 5
clone3({flags=0, exit_signal=18446744073709551615, stack=NULL, stack_size=0}, 104) = -1 EINVAL (Invalid argument)
write(1, "# clone3() syscall supported\n", 29) = 29
map_shadow_stack(NULL, 4096, 0)         = 125837480497152
write(1, "# Shadow stack supportd\n", 24) = 24
write(1, "# Running test 'Shadow stack wit"..., 44) = 44
getpid()                                = 4943
write(1, "# [4943] Trying clone3() with fl"..., 51) = 51
map_shadow_stack(NULL, 4096, 0)         = 125837480488960
clone3({flags=CLONE_VM, exit_signal=SIGCHLD, stack=NULL, stack_size=0, /* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"} => {/* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"}, 104) = 4944
getpid()                                = 4943
write(1, "# I am the parent (4943). My chi"..., 49strace: Process 4944 attached
) = 49
[pid  4944] --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_CPERR, si_addr=NULL} ---
[pid  4943] wait4(-1,  <unfinished ...>
[pid  4944] +++ killed by SIGSEGV (core dumped) +++
<... wait4 resumed>[{WIFSIGNALED(s) && WTERMSIG(s) == SIGSEGV && WCOREDUMP(s)}], __WALL, NULL) = 4944
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_DUMPED, si_pid=4944, si_uid=0, si_status=SIGSEGV, si_utime=0, si_stime=0} ---
--- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x7272d21fffe8} ---
+++ killed by SIGSEGV (core dumped) +++

[  569.153288] shstk_setup: clone3[4943] ssp:7272d2200000
[  569.153998] process: copy_thread: clone3[4943] new_ssp:7272d2530000
[  569.154002] update_fpu_shstk: clone3[4943] ssp:7272d2530000
[  569.154008] shstk_post_fork: clone3[4944]
[  569.154011] shstk_post_fork: clone3[4944] sending SIGSEGV post fork

I don't see an update_fpu_shstk for 4944? Should I with this test?

And the parent dies with SEGV_MAPERR??

I'll keep looking in the morning ...
Mark Brown Aug. 7, 2024, 12:39 p.m. UTC | #7
On Tue, Aug 06, 2024 at 10:08:44PM -0700, Kees Cook wrote:
> On Tue, Aug 06, 2024 at 04:10:02PM +0100, Mark Brown wrote:

> > >   # Running test 'Shadow stack with no token'

> It took me a while to figure out where a thread switches shstk (even
> without this series):

> kernel_clone, copy_process, copy_thread, fpu_clone, update_fpu_shstk
> (and shstk_alloc_thread_stack is called just before update_fpu_shstk).

> I don't understand the token consumption in arch_shstk_post_fork(). This
> wasn't needed before with the fixed-size new shstk, why is it needed
> now?

Concerns were raised on earlier rounds of review that since instead of
allocating the shadow stack as part of creating the new thread we are
using a previously allocated shadow stack someone could use this as part
of an exploit.  You could just jump on top of any existing shadow stack
and cause writes to it.

> Anyway, my attempt to trace the shstk changes for the test:

> write(1, "TAP version 13\n", 15)        = 15
> write(1, "1..2\n", 5)                   = 5
> clone3({flags=0, exit_signal=18446744073709551615, stack=NULL, stack_size=0}, 104) = -1 EINVAL (Invalid argument)
> write(1, "# clone3() syscall supported\n", 29) = 29
> map_shadow_stack(NULL, 4096, 0)         = 125837480497152
> write(1, "# Shadow stack supportd\n", 24) = 24
> write(1, "# Running test 'Shadow stack wit"..., 44) = 44
> getpid()                                = 4943
> write(1, "# [4943] Trying clone3() with fl"..., 51) = 51
> map_shadow_stack(NULL, 4096, 0)         = 125837480488960
> clone3({flags=CLONE_VM, exit_signal=SIGCHLD, stack=NULL, stack_size=0, /* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"} => {/* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"}, 104) = 4944
> getpid()                                = 4943
> write(1, "# I am the parent (4943). My chi"..., 49strace: Process 4944 attached
> ) = 49
> [pid  4944] --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_CPERR, si_addr=NULL} ---
> [pid  4943] wait4(-1,  <unfinished ...>
> [pid  4944] +++ killed by SIGSEGV (core dumped) +++

So we created the thread, then before we get to the wait4() in the
parent we start delivering a SEGV_CPERR to the child.  The flow for the
child is as expected.

> <... wait4 resumed>[{WIFSIGNALED(s) && WTERMSIG(s) == SIGSEGV && WCOREDUMP(s)}], __WALL, NULL) = 4944
> --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_DUMPED, si_pid=4944, si_uid=0, si_status=SIGSEGV, si_utime=0, si_stime=0} ---
> --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x7272d21fffe8} ---
> +++ killed by SIGSEGV (core dumped) +++

Then the parent gets an ordinary segfault, not a shadow stack specific
one, like some memory got deallocated underneath it or a pointer got
corrupted.

> [  569.153288] shstk_setup: clone3[4943] ssp:7272d2200000
> [  569.153998] process: copy_thread: clone3[4943] new_ssp:7272d2530000
> [  569.154002] update_fpu_shstk: clone3[4943] ssp:7272d2530000
> [  569.154008] shstk_post_fork: clone3[4944]
> [  569.154011] shstk_post_fork: clone3[4944] sending SIGSEGV post fork

> I don't see an update_fpu_shstk for 4944? Should I with this test?

I'd only expect to see one update, my understanding is that that update
is for the child but happening in the context of the parent as the hild
is not yet started.

Does this help:

diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 27acbdf44c5f..d7005974aff5 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -258,6 +258,8 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
 	if (args->shadow_stack) {
 		addr = args->shadow_stack;
 		size = args->shadow_stack_size;
+		shstk->base = 0;
+		shstk->size = 0;
 	} else {
 		/*
 		 * For CLONE_VFORK the child will share the parents
Kees Cook Aug. 7, 2024, 6:23 p.m. UTC | #8
On Wed, Aug 07, 2024 at 01:39:27PM +0100, Mark Brown wrote:
> On Tue, Aug 06, 2024 at 10:08:44PM -0700, Kees Cook wrote:
> > On Tue, Aug 06, 2024 at 04:10:02PM +0100, Mark Brown wrote:
> 
> > > >   # Running test 'Shadow stack with no token'
> 
> > It took me a while to figure out where a thread switches shstk (even
> > without this series):
> 
> > kernel_clone, copy_process, copy_thread, fpu_clone, update_fpu_shstk
> > (and shstk_alloc_thread_stack is called just before update_fpu_shstk).
> 
> > I don't understand the token consumption in arch_shstk_post_fork(). This
> > wasn't needed before with the fixed-size new shstk, why is it needed
> > now?
> 
> Concerns were raised on earlier rounds of review that since instead of
> allocating the shadow stack as part of creating the new thread we are
> using a previously allocated shadow stack someone could use this as part
> of an exploit.  You could just jump on top of any existing shadow stack
> and cause writes to it.
> 
> > Anyway, my attempt to trace the shstk changes for the test:
> 
> > write(1, "TAP version 13\n", 15)        = 15
> > write(1, "1..2\n", 5)                   = 5
> > clone3({flags=0, exit_signal=18446744073709551615, stack=NULL, stack_size=0}, 104) = -1 EINVAL (Invalid argument)
> > write(1, "# clone3() syscall supported\n", 29) = 29
> > map_shadow_stack(NULL, 4096, 0)         = 125837480497152
> > write(1, "# Shadow stack supportd\n", 24) = 24
> > write(1, "# Running test 'Shadow stack wit"..., 44) = 44
> > getpid()                                = 4943
> > write(1, "# [4943] Trying clone3() with fl"..., 51) = 51
> > map_shadow_stack(NULL, 4096, 0)         = 125837480488960
> > clone3({flags=CLONE_VM, exit_signal=SIGCHLD, stack=NULL, stack_size=0, /* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"} => {/* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"}, 104) = 4944
> > getpid()                                = 4943
> > write(1, "# I am the parent (4943). My chi"..., 49strace: Process 4944 attached
> > ) = 49
> > [pid  4944] --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_CPERR, si_addr=NULL} ---
> > [pid  4943] wait4(-1,  <unfinished ...>
> > [pid  4944] +++ killed by SIGSEGV (core dumped) +++
> 
> So we created the thread, then before we get to the wait4() in the
> parent we start delivering a SEGV_CPERR to the child.  The flow for the
> child is as expected.
> 
> > <... wait4 resumed>[{WIFSIGNALED(s) && WTERMSIG(s) == SIGSEGV && WCOREDUMP(s)}], __WALL, NULL) = 4944
> > --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_DUMPED, si_pid=4944, si_uid=0, si_status=SIGSEGV, si_utime=0, si_stime=0} ---
> > --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x7272d21fffe8} ---
> > +++ killed by SIGSEGV (core dumped) +++
> 
> Then the parent gets an ordinary segfault, not a shadow stack specific
> one, like some memory got deallocated underneath it or a pointer got
> corrupted.
> 
> > [  569.153288] shstk_setup: clone3[4943] ssp:7272d2200000
> > [  569.153998] process: copy_thread: clone3[4943] new_ssp:7272d2530000
> > [  569.154002] update_fpu_shstk: clone3[4943] ssp:7272d2530000
> > [  569.154008] shstk_post_fork: clone3[4944]
> > [  569.154011] shstk_post_fork: clone3[4944] sending SIGSEGV post fork
> 
> > I don't see an update_fpu_shstk for 4944? Should I with this test?
> 
> I'd only expect to see one update, my understanding is that that update
> is for the child but happening in the context of the parent as the hild
> is not yet started.

What's weird here that I don't understand is that the parent is 4943, so
this report makes sense:

> > [  569.153288] shstk_setup: clone3[4943] ssp:7272d2200000

The child is 4944, yet I see:

> > [  569.153998] process: copy_thread: clone3[4943] new_ssp:7272d2530000
> > [  569.154002] update_fpu_shstk: clone3[4943] ssp:7272d2530000

These map to my logging:

copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
	...
	new_ssp = shstk_alloc_thread_stack(p, args);
	pr_err("%s: %s[%d] new_ssp:%lx\n", __func__, p->comm, task_pid_nr(p), new_ssp);

and

update_fpu_shstk(struct task_struct *dst, unsigned long ssp)
	...
        xstate->user_ssp = (u64)ssp;
	pr_err("%s: %s[%d] ssp:%lx\n", __func__, dst->comm, task_pid_nr(dst), ssp);

The child should be "p" (and "dst") here -- stuff is being copied from
current to p, but p is reporting itself as 4943 here? (Oh, this is
reporting pid, not tid... I bet that's what I've got wrong.)

> Does this help:
> 
> diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
> index 27acbdf44c5f..d7005974aff5 100644
> --- a/arch/x86/kernel/shstk.c
> +++ b/arch/x86/kernel/shstk.c
> @@ -258,6 +258,8 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
>  	if (args->shadow_stack) {
>  		addr = args->shadow_stack;
>  		size = args->shadow_stack_size;
> +		shstk->base = 0;
> +		shstk->size = 0;
>  	} else {
>  		/*
>  		 * For CLONE_VFORK the child will share the parents

I'll fix my reporting and give this patch a try too. Thanks!

-Kees
Kees Cook Aug. 7, 2024, 7:23 p.m. UTC | #9
On Wed, Aug 07, 2024 at 01:39:27PM +0100, Mark Brown wrote:
> On Tue, Aug 06, 2024 at 10:08:44PM -0700, Kees Cook wrote:
> > On Tue, Aug 06, 2024 at 04:10:02PM +0100, Mark Brown wrote:
> 
> > > >   # Running test 'Shadow stack with no token'
> 
> > It took me a while to figure out where a thread switches shstk (even
> > without this series):
> 
> > kernel_clone, copy_process, copy_thread, fpu_clone, update_fpu_shstk
> > (and shstk_alloc_thread_stack is called just before update_fpu_shstk).
> 
> > I don't understand the token consumption in arch_shstk_post_fork(). This
> > wasn't needed before with the fixed-size new shstk, why is it needed
> > now?
> 
> Concerns were raised on earlier rounds of review that since instead of
> allocating the shadow stack as part of creating the new thread we are
> using a previously allocated shadow stack someone could use this as part
> of an exploit.  You could just jump on top of any existing shadow stack
> and cause writes to it.
> 
> > Anyway, my attempt to trace the shstk changes for the test:
> 
> > write(1, "TAP version 13\n", 15)        = 15
> > write(1, "1..2\n", 5)                   = 5
> > clone3({flags=0, exit_signal=18446744073709551615, stack=NULL, stack_size=0}, 104) = -1 EINVAL (Invalid argument)
> > write(1, "# clone3() syscall supported\n", 29) = 29
> > map_shadow_stack(NULL, 4096, 0)         = 125837480497152
> > write(1, "# Shadow stack supportd\n", 24) = 24
> > write(1, "# Running test 'Shadow stack wit"..., 44) = 44
> > getpid()                                = 4943
> > write(1, "# [4943] Trying clone3() with fl"..., 51) = 51
> > map_shadow_stack(NULL, 4096, 0)         = 125837480488960
> > clone3({flags=CLONE_VM, exit_signal=SIGCHLD, stack=NULL, stack_size=0, /* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"} => {/* bytes 88..103 */ "\x00\xf0\x52\xd2\x72\x72\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00"}, 104) = 4944
> > getpid()                                = 4943
> > write(1, "# I am the parent (4943). My chi"..., 49strace: Process 4944 attached
> > ) = 49
> > [pid  4944] --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_CPERR, si_addr=NULL} ---
> > [pid  4943] wait4(-1,  <unfinished ...>
> > [pid  4944] +++ killed by SIGSEGV (core dumped) +++
> 
> So we created the thread, then before we get to the wait4() in the
> parent we start delivering a SEGV_CPERR to the child.  The flow for the
> child is as expected.
> 
> > <... wait4 resumed>[{WIFSIGNALED(s) && WTERMSIG(s) == SIGSEGV && WCOREDUMP(s)}], __WALL, NULL) = 4944
> > --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_DUMPED, si_pid=4944, si_uid=0, si_status=SIGSEGV, si_utime=0, si_stime=0} ---
> > --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x7272d21fffe8} ---
> > +++ killed by SIGSEGV (core dumped) +++
> 
> Then the parent gets an ordinary segfault, not a shadow stack specific
> one, like some memory got deallocated underneath it or a pointer got
> corrupted.
> 
> > [  569.153288] shstk_setup: clone3[4943] ssp:7272d2200000
> > [  569.153998] process: copy_thread: clone3[4943] new_ssp:7272d2530000
> > [  569.154002] update_fpu_shstk: clone3[4943] ssp:7272d2530000
> > [  569.154008] shstk_post_fork: clone3[4944]
> > [  569.154011] shstk_post_fork: clone3[4944] sending SIGSEGV post fork
> 
> > I don't see an update_fpu_shstk for 4944? Should I with this test?
> 
> I'd only expect to see one update, my understanding is that that update
> is for the child but happening in the context of the parent as the hild
> is not yet started.
> 
> Does this help:
> 
> diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
> index 27acbdf44c5f..d7005974aff5 100644
> --- a/arch/x86/kernel/shstk.c
> +++ b/arch/x86/kernel/shstk.c
> @@ -258,6 +258,8 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
>  	if (args->shadow_stack) {
>  		addr = args->shadow_stack;
>  		size = args->shadow_stack_size;
> +		shstk->base = 0;
> +		shstk->size = 0;
>  	} else {
>  		/*
>  		 * For CLONE_VFORK the child will share the parents

Yup, that fixes it!

  # Totals: pass:23 fail:0 xfail:0 xpass:0 skip:1 error:0

(The skip is "Shadow stack on system without shadow stack")
Mark Brown Aug. 7, 2024, 10:03 p.m. UTC | #10
On Wed, Aug 07, 2024 at 12:23:01PM -0700, Kees Cook wrote:
> On Wed, Aug 07, 2024 at 01:39:27PM +0100, Mark Brown wrote:

> >  		size = args->shadow_stack_size;
> > +		shstk->base = 0;
> > +		shstk->size = 0;

> Yup, that fixes it!

>   # Totals: pass:23 fail:0 xfail:0 xpass:0 skip:1 error:0

> (The skip is "Shadow stack on system without shadow stack")

Excellent, thanks!  It's amazing how many dumb mistakes you can find if
you actually try running the code :/ .
Kees Cook Aug. 7, 2024, 11:22 p.m. UTC | #11
On Wed, Aug 07, 2024 at 11:03:24PM +0100, Mark Brown wrote:
> On Wed, Aug 07, 2024 at 12:23:01PM -0700, Kees Cook wrote:
> > On Wed, Aug 07, 2024 at 01:39:27PM +0100, Mark Brown wrote:
> 
> > >  		size = args->shadow_stack_size;
> > > +		shstk->base = 0;
> > > +		shstk->size = 0;
> 
> > Yup, that fixes it!
> 
> >   # Totals: pass:23 fail:0 xfail:0 xpass:0 skip:1 error:0
> 
> > (The skip is "Shadow stack on system without shadow stack")
> 
> Excellent, thanks!  It's amazing how many dumb mistakes you can find if
> you actually try running the code :/ .

Heh, well, it's tricky work writing it without reference hardware. :) I
just wish there was CET emulation in QEmu...
diff mbox series

Patch

diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c
index 26221661e9ae..81c2e8648e8b 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -3,6 +3,7 @@ 
 /* Based on Christian Brauner's clone3() example */
 
 #define _GNU_SOURCE
+#include <asm/mman.h>
 #include <errno.h>
 #include <inttypes.h>
 #include <linux/types.h>
@@ -11,6 +12,7 @@ 
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <sys/mman.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
 #include <sys/un.h>
@@ -19,8 +21,12 @@ 
 #include <sched.h>
 
 #include "../kselftest.h"
+#include "../ksft_shstk.h"
 #include "clone3_selftests.h"
 
+static bool shadow_stack_supported;
+static size_t max_supported_args_size;
+
 enum test_mode {
 	CLONE3_ARGS_NO_TEST,
 	CLONE3_ARGS_ALL_0,
@@ -28,6 +34,10 @@  enum test_mode {
 	CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG,
 	CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG,
 	CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG,
+	CLONE3_ARGS_SHADOW_STACK,
+	CLONE3_ARGS_SHADOW_STACK_NO_SIZE,
+	CLONE3_ARGS_SHADOW_STACK_NO_POINTER,
+	CLONE3_ARGS_SHADOW_STACK_NO_TOKEN,
 };
 
 typedef bool (*filter_function)(void);
@@ -44,6 +54,44 @@  struct test {
 	filter_function filter;
 };
 
+
+/*
+ * We check for shadow stack support by attempting to use
+ * map_shadow_stack() since features may have been locked by the
+ * dynamic linker resulting in spurious errors when we attempt to
+ * enable on startup.  We warn if the enable failed.
+ */
+static void test_shadow_stack_supported(void)
+{
+	long ret;
+
+	ret = syscall(__NR_map_shadow_stack, 0, getpagesize(), 0);
+	if (ret == -1) {
+		ksft_print_msg("map_shadow_stack() not supported\n");
+	} else if ((void *)ret == MAP_FAILED) {
+		ksft_print_msg("Failed to map shadow stack\n");
+	} else {
+		ksft_print_msg("Shadow stack supportd\n");
+		shadow_stack_supported = true;
+
+		if (!shadow_stack_enabled)
+			ksft_print_msg("Mapped but did not enable shadow stack\n");
+	}
+}
+
+static unsigned long long get_shadow_stack_page(unsigned long flags)
+{
+	unsigned long long page;
+
+	page = syscall(__NR_map_shadow_stack, 0, getpagesize(), flags);
+	if ((void *)page == MAP_FAILED) {
+		ksft_print_msg("map_shadow_stack() failed: %d\n", errno);
+		return 0;
+	}
+
+	return page;
+}
+
 static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
 {
 	struct __clone_args args = {
@@ -89,6 +137,21 @@  static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
 	case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG:
 		args.exit_signal = 0x00000000000000f0ULL;
 		break;
+	case CLONE3_ARGS_SHADOW_STACK:
+		/* We need to specify a normal stack too to avoid corruption */
+		args.shadow_stack = get_shadow_stack_page(SHADOW_STACK_SET_TOKEN);
+		args.shadow_stack_size = getpagesize();
+		break;
+	case CLONE3_ARGS_SHADOW_STACK_NO_POINTER:
+		args.shadow_stack_size = getpagesize();
+		break;
+	case CLONE3_ARGS_SHADOW_STACK_NO_SIZE:
+		args.shadow_stack = get_shadow_stack_page(SHADOW_STACK_SET_TOKEN);
+		break;
+	case CLONE3_ARGS_SHADOW_STACK_NO_TOKEN:
+		args.shadow_stack = get_shadow_stack_page(0);
+		args.shadow_stack_size = getpagesize();
+		break;
 	}
 
 	memcpy(&args_ext.args, &args, sizeof(struct __clone_args));
@@ -102,7 +165,12 @@  static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
 
 	if (pid == 0) {
 		ksft_print_msg("I am the child, my PID is %d\n", getpid());
-		_exit(EXIT_SUCCESS);
+		/*
+		 * Use a raw syscall to ensure we don't get issues
+		 * with manually specified shadow stack and exit handlers.
+		 */
+		syscall(__NR_exit, EXIT_SUCCESS);
+		ksft_print_msg("CHILD FAILED TO EXIT PID is %d\n", getpid());
 	}
 
 	ksft_print_msg("I am the parent (%d). My child's pid is %d\n",
@@ -191,6 +259,26 @@  static bool no_timenamespace(void)
 	return true;
 }
 
+static bool have_shadow_stack(void)
+{
+	if (shadow_stack_supported) {
+		ksft_print_msg("Shadow stack supported\n");
+		return true;
+	}
+
+	return false;
+}
+
+static bool no_shadow_stack(void)
+{
+	if (!shadow_stack_supported) {
+		ksft_print_msg("Shadow stack not supported\n");
+		return true;
+	}
+
+	return false;
+}
+
 static size_t page_size_plus_8(void)
 {
 	return getpagesize() + 8;
@@ -334,6 +422,47 @@  static const struct test tests[] = {
 		.expected = -EINVAL,
 		.test_mode = CLONE3_ARGS_NO_TEST,
 	},
+	{
+		.name = "Shadow stack on system with shadow stack",
+		.size = 0,
+		.expected = 0,
+		.e2big_valid = true,
+		.test_mode = CLONE3_ARGS_SHADOW_STACK,
+		.filter = no_shadow_stack,
+	},
+	{
+		.name = "Shadow stack with no pointer",
+		.size = 0,
+		.expected = -EINVAL,
+		.e2big_valid = true,
+		.test_mode = CLONE3_ARGS_SHADOW_STACK_NO_POINTER,
+	},
+	{
+		.name = "Shadow stack with no size",
+		.size = 0,
+		.expected = -EINVAL,
+		.e2big_valid = true,
+		.test_mode = CLONE3_ARGS_SHADOW_STACK_NO_SIZE,
+		.filter = no_shadow_stack,
+	},
+	{
+		.name = "Shadow stack with no token",
+		.flags = CLONE_VM,
+		.size = 0,
+		.expected = SIGSEGV,
+		.e2big_valid = true,
+		.test_mode = CLONE3_ARGS_SHADOW_STACK_NO_TOKEN,
+		.filter = no_shadow_stack,
+	},
+	{
+		.name = "Shadow stack on system without shadow stack",
+		.flags = CLONE_VM,
+		.size = 0,
+		.expected = -EINVAL,
+		.e2big_valid = true,
+		.test_mode = CLONE3_ARGS_SHADOW_STACK,
+		.filter = have_shadow_stack,
+	},
 };
 
 int main(int argc, char *argv[])
@@ -341,9 +470,12 @@  int main(int argc, char *argv[])
 	size_t size;
 	int i;
 
+	enable_shadow_stack();
+
 	ksft_print_header();
 	ksft_set_plan(ARRAY_SIZE(tests));
 	test_clone3_supported();
+	test_shadow_stack_supported();
 
 	for (i = 0; i < ARRAY_SIZE(tests); i++)
 		test_clone3(&tests[i]);
diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h
index 39b5dcba663c..38d82934668a 100644
--- a/tools/testing/selftests/clone3/clone3_selftests.h
+++ b/tools/testing/selftests/clone3/clone3_selftests.h
@@ -31,12 +31,50 @@  struct __clone_args {
 	__aligned_u64 set_tid;
 	__aligned_u64 set_tid_size;
 	__aligned_u64 cgroup;
+#ifndef CLONE_ARGS_SIZE_VER2
+#define CLONE_ARGS_SIZE_VER2 88	/* sizeof third published struct */
+#endif
+	__aligned_u64 shadow_stack;
+	__aligned_u64 shadow_stack_size;
+#ifndef CLONE_ARGS_SIZE_VER3
+#define CLONE_ARGS_SIZE_VER3 104 /* sizeof fourth published struct */
+#endif
 };
 
+/*
+ * For architectures with shadow stack support we need to be
+ * absolutely sure that the clone3() syscall will be inline and not a
+ * function call so we open code.
+ */
+#ifdef __x86_64__
+static pid_t __always_inline sys_clone3(struct __clone_args *args, size_t size)
+{
+	long ret;
+	register long _num  __asm__ ("rax") = __NR_clone3;
+	register long _args __asm__ ("rdi") = (long)(args);
+	register long _size __asm__ ("rsi") = (long)(size);
+
+	__asm__ volatile (
+		"syscall\n"
+		: "=a"(ret)
+		: "r"(_args), "r"(_size),
+		  "0"(_num)
+		: "rcx", "r11", "memory", "cc"
+	);
+
+	if (ret < 0) {
+		errno = -ret;
+		return -1;
+	}
+
+	return ret;
+}
+#else
 static pid_t sys_clone3(struct __clone_args *args, size_t size)
 {
 	return syscall(__NR_clone3, args, size);
 }
+#endif
 
 static inline void test_clone3_supported(void)
 {