Message ID | 20230101162910.710293-8-Jason@zx2c4.com |
---|---|
State | Superseded |
Headers | show |
Series | implement getrandom() in vDSO | expand |
Le 01/01/2023 à 17:29, Jason A. Donenfeld a écrit : > Hook up the generic vDSO implementation to the x86 vDSO data page. Since > the existing vDSO infrastructure is heavily based on the timekeeping > functionality, which works over arrays of bases, a new macro is > introduced for vvars that are not arrays. > > The vDSO function requires a ChaCha20 implementation that does not write > to the stack, yet can still do an entire ChaCha20 permutation, so > provide this using SSE2, since this is userland code that must work on > all x86-64 processors. There's a simple test for this code as well. As far as I understand the test is not dependent on the architecture, can it be a separate patch ? Also, as the chacha implementation is standalone and can be tested by the above mentionned simple test, can it be a separate patch as well ? Then the last patch only has the glue to wire-up getrandom VDSO to the architecture, and can be used as a reference for other architectures. > > Reviewed-by: Samuel Neves <sneves@dei.uc.pt> # for vgetrandom-chacha.S > Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> > --- > arch/x86/Kconfig | 1 + > arch/x86/entry/vdso/Makefile | 3 +- > arch/x86/entry/vdso/vdso.lds.S | 2 + > arch/x86/entry/vdso/vgetrandom-chacha.S | 178 ++++++++++++++++++ > arch/x86/entry/vdso/vgetrandom.c | 17 ++ > arch/x86/include/asm/vdso/getrandom.h | 55 ++++++ > arch/x86/include/asm/vdso/vsyscall.h | 2 + > arch/x86/include/asm/vvar.h | 16 ++ > tools/testing/selftests/vDSO/.gitignore | 1 + > tools/testing/selftests/vDSO/Makefile | 9 + > .../testing/selftests/vDSO/vdso_test_chacha.c | 43 +++++ > 11 files changed, 326 insertions(+), 1 deletion(-) > create mode 100644 arch/x86/entry/vdso/vgetrandom-chacha.S > create mode 100644 arch/x86/entry/vdso/vgetrandom.c > create mode 100644 arch/x86/include/asm/vdso/getrandom.h > create mode 100644 tools/testing/selftests/vDSO/vdso_test_chacha.c > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index 3604074a878b..ed689d831362 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -272,6 +272,7 @@ config X86 > select HAVE_UNSTABLE_SCHED_CLOCK > select HAVE_USER_RETURN_NOTIFIER > select HAVE_GENERIC_VDSO > + select VDSO_GETRANDOM if X86_64 > select HOTPLUG_SMT if SMP > select IRQ_FORCED_THREADING > select NEED_PER_CPU_EMBED_FIRST_CHUNK > diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile > index 838613ac15b8..3979bb4a61ae 100644 > --- a/arch/x86/entry/vdso/Makefile > +++ b/arch/x86/entry/vdso/Makefile > @@ -27,7 +27,7 @@ VDSO32-$(CONFIG_X86_32) := y > VDSO32-$(CONFIG_IA32_EMULATION) := y > > # files to link into the vdso > -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o > +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o > vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o > vobjs32-y += vdso32/vclock_gettime.o > vobjs-$(CONFIG_X86_SGX) += vsgx.o > @@ -105,6 +105,7 @@ CFLAGS_REMOVE_vclock_gettime.o = -pg > CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg > CFLAGS_REMOVE_vgetcpu.o = -pg > CFLAGS_REMOVE_vsgx.o = -pg > +CFLAGS_REMOVE_vgetrandom.o = -pg > > # > # X32 processes use x32 vDSO to access 64bit kernel data. > diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S > index e8c60ae7a7c8..0bab5f4af6d1 100644 > --- a/arch/x86/entry/vdso/vdso.lds.S > +++ b/arch/x86/entry/vdso/vdso.lds.S > @@ -30,6 +30,8 @@ VERSION { > #ifdef CONFIG_X86_SGX > __vdso_sgx_enter_enclave; > #endif > + getrandom; > + __vdso_getrandom; > local: *; > }; > } > diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S > new file mode 100644 > index 000000000000..d79e2bd97598 > --- /dev/null > +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S > @@ -0,0 +1,178 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. > + */ > + > +#include <linux/linkage.h> > +#include <asm/frame.h> > + > +.section .rodata, "a" > +.align 16 > +CONSTANTS: .octa 0x6b20657479622d323320646e61707865 > +.text > + > +/* > + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number > + * of blocks of output with a nonce of 0, taking an input key and 8-byte > + * counter. Importantly does not spill to the stack. Its arguments are: > + * > + * rdi: output bytes > + * rsi: 32-byte key input > + * rdx: 8-byte counter input/output Why a 8-byte counter ? According to RFC 7539, chacha20 takes: The inputs to ChaCha20 are: o A 256-bit key, treated as a concatenation of eight 32-bit little- endian integers. o A 96-bit nonce, treated as a concatenation of three 32-bit little- endian integers. o A 32-bit block count parameter, treated as a 32-bit little-endian integer. Are you mixing up the upper part of the counter with the nonce ? In that case you can't say you use a 0 nonce, can you ? > + * rcx: number of 64-byte blocks to write to output > + */ > +SYM_FUNC_START(__arch_chacha20_blocks_nostack) > + > +.set output, %rdi > +.set key, %rsi > +.set counter, %rdx > +.set nblocks, %rcx > +.set i, %al > +/* xmm registers are *not* callee-save. */ > +.set state0, %xmm0 > +.set state1, %xmm1 > +.set state2, %xmm2 > +.set state3, %xmm3 > +.set copy0, %xmm4 > +.set copy1, %xmm5 > +.set copy2, %xmm6 > +.set copy3, %xmm7 > +.set temp, %xmm8 > +.set one, %xmm9 > + > + /* copy0 = "expand 32-byte k" */ > + movaps CONSTANTS(%rip),copy0 > + /* copy1,copy2 = key */ > + movups 0x00(key),copy1 > + movups 0x10(key),copy2 > + /* copy3 = counter || zero nonce */ > + movq 0x00(counter),copy3 > + /* one = 1 || 0 */ > + movq $1,%rax > + movq %rax,one > + > +.Lblock: > + /* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */ > + movdqa copy0,state0 > + movdqa copy1,state1 > + movdqa copy2,state2 > + movdqa copy3,state3 > + > + movb $10,i > +.Lpermute: > + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ > + paddd state1,state0 > + pxor state0,state3 > + movdqa state3,temp > + pslld $16,temp > + psrld $16,state3 > + por temp,state3 There is a lot of similarities between all the blocks, can you use GAS macros to avoid repetitions ? > + > + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ > + paddd state3,state2 > + pxor state2,state1 > + movdqa state1,temp > + pslld $12,temp > + psrld $20,state1 > + por temp,state1 > + > + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ > + paddd state1,state0 > + pxor state0,state3 > + movdqa state3,temp > + pslld $8,temp > + psrld $24,state3 > + por temp,state3 > + > + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ > + paddd state3,state2 > + pxor state2,state1 > + movdqa state1,temp > + pslld $7,temp > + psrld $25,state1 > + por temp,state1 > + > + /* state1[0,1,2,3] = state1[1,2,3,0] */ > + pshufd $0x39,state1,state1 > + /* state2[0,1,2,3] = state2[2,3,0,1] */ > + pshufd $0x4e,state2,state2 > + /* state3[0,1,2,3] = state3[3,0,1,2] */ > + pshufd $0x93,state3,state3 > + > + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ > + paddd state1,state0 > + pxor state0,state3 > + movdqa state3,temp > + pslld $16,temp > + psrld $16,state3 > + por temp,state3 > + > + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ > + paddd state3,state2 > + pxor state2,state1 > + movdqa state1,temp > + pslld $12,temp > + psrld $20,state1 > + por temp,state1 > + > + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ > + paddd state1,state0 > + pxor state0,state3 > + movdqa state3,temp > + pslld $8,temp > + psrld $24,state3 > + por temp,state3 > + > + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ > + paddd state3,state2 > + pxor state2,state1 > + movdqa state1,temp > + pslld $7,temp > + psrld $25,state1 > + por temp,state1 > + > + /* state1[0,1,2,3] = state1[3,0,1,2] */ > + pshufd $0x93,state1,state1 > + /* state2[0,1,2,3] = state2[2,3,0,1] */ > + pshufd $0x4e,state2,state2 > + /* state3[0,1,2,3] = state3[1,2,3,0] */ > + pshufd $0x39,state3,state3 > + > + decb i > + jnz .Lpermute > + > + /* output0 = state0 + copy0 */ > + paddd copy0,state0 > + movups state0,0x00(output) > + /* output1 = state1 + copy1 */ > + paddd copy1,state1 > + movups state1,0x10(output) > + /* output2 = state2 + copy2 */ > + paddd copy2,state2 > + movups state2,0x20(output) > + /* output3 = state3 + copy3 */ > + paddd copy3,state3 > + movups state3,0x30(output) > + > + /* ++copy3.counter */ > + paddq one,copy3 > + > + /* output += 64, --nblocks */ > + addq $64,output > + decq nblocks > + jnz .Lblock > + > + /* counter = copy3.counter */ > + movq copy3,0x00(counter) > + > + /* Zero out the potentially sensitive regs, in case nothing uses these again. */ > + pxor state0,state0 > + pxor state1,state1 > + pxor state2,state2 > + pxor state3,state3 > + pxor copy1,copy1 > + pxor copy2,copy2 > + pxor temp,temp > + > + ret > +SYM_FUNC_END(__arch_chacha20_blocks_nostack) > diff --git a/arch/x86/entry/vdso/vgetrandom.c b/arch/x86/entry/vdso/vgetrandom.c > new file mode 100644 > index 000000000000..6045ded5da90 > --- /dev/null > +++ b/arch/x86/entry/vdso/vgetrandom.c > @@ -0,0 +1,17 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. > + */ > +#include <linux/types.h> > + > +#include "../../../../lib/vdso/getrandom.c" > + > +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state); > + > +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state) > +{ > + return __cvdso_getrandom(buffer, len, flags, state); > +} > + > +ssize_t getrandom(void *, size_t, unsigned int, void *) > + __attribute__((weak, alias("__vdso_getrandom"))); > diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h > new file mode 100644 > index 000000000000..46f99d735ae6 > --- /dev/null > +++ b/arch/x86/include/asm/vdso/getrandom.h > @@ -0,0 +1,55 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. > + */ > +#ifndef __ASM_VDSO_GETRANDOM_H > +#define __ASM_VDSO_GETRANDOM_H > + > +#ifndef __ASSEMBLY__ > + > +#include <asm/unistd.h> > +#include <asm/vvar.h> > + > +/** > + * getrandom_syscall - Invoke the getrandom() syscall. > + * @buffer: Destination buffer to fill with random bytes. > + * @len: Size of @buffer in bytes. > + * @flags: Zero or more GRND_* flags. > + * Returns the number of random bytes written to @buffer, or a negative value indicating an error. > + */ > +static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags) > +{ > + long ret; > + > + asm ("syscall" : "=a" (ret) : > + "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) : > + "rcx", "r11", "memory"); > + > + return ret; > +} > + > +#define __vdso_rng_data (VVAR(_vdso_rng_data)) > + > +static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) > +{ > + if (__vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) > + return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data); > + return &__vdso_rng_data; > +} > + > +/** > + * __arch_chacha20_blocks_nostack - Generate ChaCha20 stream without using the stack. > + * @dst_bytes: Destination buffer to hold @nblocks * 64 bytes of output. > + * @key: 32-byte input key. > + * @counter: 8-byte counter, read on input and updated on return. > + * @nblocks: Number of blocks to generate. > + * > + * Generates a given positive number of blocks of ChaCha20 output with nonce=0, and does not write > + * to any stack or memory outside of the parameters passed to it, in order to mitigate stack data > + * leaking into forked child processes. > + */ > +extern void __arch_chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks); > + > +#endif /* !__ASSEMBLY__ */ > + > +#endif /* __ASM_VDSO_GETRANDOM_H */ > diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h > index be199a9b2676..71c56586a22f 100644 > --- a/arch/x86/include/asm/vdso/vsyscall.h > +++ b/arch/x86/include/asm/vdso/vsyscall.h > @@ -11,6 +11,8 @@ > #include <asm/vvar.h> > > DEFINE_VVAR(struct vdso_data, _vdso_data); > +DEFINE_VVAR_SINGLE(struct vdso_rng_data, _vdso_rng_data); > + > /* > * Update the vDSO data page to keep in sync with kernel timekeeping. > */ > diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h > index 183e98e49ab9..9d9af37f7cab 100644 > --- a/arch/x86/include/asm/vvar.h > +++ b/arch/x86/include/asm/vvar.h > @@ -26,6 +26,8 @@ > */ > #define DECLARE_VVAR(offset, type, name) \ > EMIT_VVAR(name, offset) > +#define DECLARE_VVAR_SINGLE(offset, type, name) \ > + EMIT_VVAR(name, offset) > > #else > > @@ -37,6 +39,10 @@ extern char __vvar_page; > extern type timens_ ## name[CS_BASES] \ > __attribute__((visibility("hidden"))); \ > > +#define DECLARE_VVAR_SINGLE(offset, type, name) \ > + extern type vvar_ ## name \ > + __attribute__((visibility("hidden"))); \ > + > #define VVAR(name) (vvar_ ## name) > #define TIMENS(name) (timens_ ## name) > > @@ -44,12 +50,22 @@ extern char __vvar_page; > type name[CS_BASES] \ > __attribute__((section(".vvar_" #name), aligned(16))) __visible > > +#define DEFINE_VVAR_SINGLE(type, name) \ > + type name \ > + __attribute__((section(".vvar_" #name), aligned(16))) __visible > + > #endif > > /* DECLARE_VVAR(offset, type, name) */ > > DECLARE_VVAR(128, struct vdso_data, _vdso_data) > > +#if !defined(_SINGLE_DATA) > +#define _SINGLE_DATA > +DECLARE_VVAR_SINGLE(640, struct vdso_rng_data, _vdso_rng_data) > +#endif > + > #undef DECLARE_VVAR > +#undef DECLARE_VVAR_SINGLE > > #endif > diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore > index 7dbfdec53f3d..30d5c8f0e5c7 100644 > --- a/tools/testing/selftests/vDSO/.gitignore > +++ b/tools/testing/selftests/vDSO/.gitignore > @@ -7,3 +7,4 @@ vdso_test_gettimeofday > vdso_test_getcpu > vdso_standalone_test_x86 > vdso_test_getrandom > +vdso_test_chacha > diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile > index a33b4d200a32..54a015afe60c 100644 > --- a/tools/testing/selftests/vDSO/Makefile > +++ b/tools/testing/selftests/vDSO/Makefile > @@ -3,6 +3,7 @@ include ../lib.mk > > uname_M := $(shell uname -m 2>/dev/null || echo not) > ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) > +SODIUM := $(shell pkg-config --libs libsodium 2>/dev/null) > > TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday $(OUTPUT)/vdso_test_getcpu > TEST_GEN_PROGS += $(OUTPUT)/vdso_test_abi > @@ -12,9 +13,15 @@ TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86 > endif > TEST_GEN_PROGS += $(OUTPUT)/vdso_test_correctness > TEST_GEN_PROGS += $(OUTPUT)/vdso_test_getrandom > +ifeq ($(uname_M),x86_64) > +ifneq ($(SODIUM),) > +TEST_GEN_PROGS += $(OUTPUT)/vdso_test_chacha > +endif > +endif > > CFLAGS := -std=gnu99 > CFLAGS_vdso_standalone_test_x86 := -nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector > +CFLAGS_vdso_test_chacha := $(SODIUM) -idirafter $(top_srcdir)/include -idirafter $(top_srcdir)/arch/$(ARCH)/include -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 -Wa,--noexecstack > LDFLAGS_vdso_test_correctness := -ldl > ifeq ($(CONFIG_X86_32),y) > LDLIBS += -lgcc_s > @@ -35,3 +42,5 @@ $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c > -o $@ \ > $(LDFLAGS_vdso_test_correctness) > $(OUTPUT)/vdso_test_getrandom: parse_vdso.c > +$(OUTPUT)/vdso_test_chacha: CFLAGS += $(CFLAGS_vdso_test_chacha) > +$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/arch/$(ARCH)/entry/vdso/vgetrandom-chacha.S > diff --git a/tools/testing/selftests/vDSO/vdso_test_chacha.c b/tools/testing/selftests/vDSO/vdso_test_chacha.c > new file mode 100644 > index 000000000000..bce7a7752b11 > --- /dev/null > +++ b/tools/testing/selftests/vDSO/vdso_test_chacha.c > @@ -0,0 +1,43 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. > + */ > + > +#include <sodium/crypto_stream_chacha20.h> Is that standard ? Every distribution has sodium ? > +#include <sys/random.h> > +#include <string.h> > +#include <stdint.h> > +#include "../kselftest.h" > + > +extern void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, const uint8_t *key, uint32_t *counter, size_t nblocks); > + > +int main(int argc, char *argv[]) > +{ > + enum { TRIALS = 1000, BLOCKS = 128, BLOCK_SIZE = 64 }; > + static const uint8_t nonce[8] = { 0 }; > + uint32_t counter[2]; > + uint8_t key[32]; > + uint8_t output1[BLOCK_SIZE * BLOCKS], output2[BLOCK_SIZE * BLOCKS]; > + > + ksft_print_header(); > + ksft_set_plan(1); > + > + for (unsigned int trial = 0; trial < TRIALS; ++trial) { > + if (getrandom(key, sizeof(key), 0) != sizeof(key)) { > + printf("getrandom() failed!\n"); > + return KSFT_SKIP; > + } > + crypto_stream_chacha20(output1, sizeof(output1), nonce, key); > + for (unsigned int split = 0; split < BLOCKS; ++split) { > + memset(output2, 'X', sizeof(output2)); > + memset(counter, 0, sizeof(counter)); > + if (split) > + __arch_chacha20_blocks_nostack(output2, key, counter, split); > + __arch_chacha20_blocks_nostack(output2 + split * BLOCK_SIZE, key, counter, BLOCKS - split); > + if (memcmp(output1, output2, sizeof(output1))) > + return KSFT_FAIL; > + } > + } > + ksft_test_result_pass("chacha: PASS\n"); > + return KSFT_PASS; > +} Christophe
Hi Christophe, Thanks for the review. On Thu, Jan 12, 2023 at 6:27 PM Christophe Leroy <christophe.leroy@csgroup.eu> wrote: > Le 01/01/2023 à 17:29, Jason A. Donenfeld a écrit : > > Hook up the generic vDSO implementation to the x86 vDSO data page. Since > > the existing vDSO infrastructure is heavily based on the timekeeping > > functionality, which works over arrays of bases, a new macro is > > introduced for vvars that are not arrays. > > > > The vDSO function requires a ChaCha20 implementation that does not write > > to the stack, yet can still do an entire ChaCha20 permutation, so > > provide this using SSE2, since this is userland code that must work on > > all x86-64 processors. There's a simple test for this code as well. > > As far as I understand the test is not dependent on the architecture, > can it be a separate patch ? The test is dependent on architectures for which there's a vDSO implementation. I could move it to a patch before or after this one, though, if you think it'd be better to keep this commit as a template commit for other architectures. > Also, as the chacha implementation is standalone and can be tested by > the above mentionned simple test, can it be a separate patch as well ? No, that actually needs to be part of this one, as it's part and parcel of the x86 implementation. > Then the last patch only has the glue to wire-up getrandom VDSO to the > architecture, and can be used as a reference for other architectures. This is part of the required glue, so no, it belongs in this one. > > + * rdx: 8-byte counter input/output > > Why a 8-byte counter ? According to RFC 7539, chacha20 takes: > Are you mixing up the upper part of the counter with the nonce ? In that > case you can't say you use a 0 nonce, can you ? No, I'm not mixing anything up. This is the same algorithm that random.c uses. And wireguard, for that matter. 8 byte nonce, 8 byte block counter. In this case, the nonce is 0. > > +#include <sodium/crypto_stream_chacha20.h> > > Is that standard ? Every distribution has sodium ? As far as I can tell, yes. Jason
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3604074a878b..ed689d831362 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -272,6 +272,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select HAVE_GENERIC_VDSO + select VDSO_GETRANDOM if X86_64 select HOTPLUG_SMT if SMP select IRQ_FORCED_THREADING select NEED_PER_CPU_EMBED_FIRST_CHUNK diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 838613ac15b8..3979bb4a61ae 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -27,7 +27,7 @@ VDSO32-$(CONFIG_X86_32) := y VDSO32-$(CONFIG_IA32_EMULATION) := y # files to link into the vdso -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o vobjs32-y += vdso32/vclock_gettime.o vobjs-$(CONFIG_X86_SGX) += vsgx.o @@ -105,6 +105,7 @@ CFLAGS_REMOVE_vclock_gettime.o = -pg CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg CFLAGS_REMOVE_vgetcpu.o = -pg CFLAGS_REMOVE_vsgx.o = -pg +CFLAGS_REMOVE_vgetrandom.o = -pg # # X32 processes use x32 vDSO to access 64bit kernel data. diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index e8c60ae7a7c8..0bab5f4af6d1 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -30,6 +30,8 @@ VERSION { #ifdef CONFIG_X86_SGX __vdso_sgx_enter_enclave; #endif + getrandom; + __vdso_getrandom; local: *; }; } diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..d79e2bd97598 --- /dev/null +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +.section .rodata, "a" +.align 16 +CONSTANTS: .octa 0x6b20657479622d323320646e61707865 +.text + +/* + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number + * of blocks of output with a nonce of 0, taking an input key and 8-byte + * counter. Importantly does not spill to the stack. Its arguments are: + * + * rdi: output bytes + * rsi: 32-byte key input + * rdx: 8-byte counter input/output + * rcx: number of 64-byte blocks to write to output + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + +.set output, %rdi +.set key, %rsi +.set counter, %rdx +.set nblocks, %rcx +.set i, %al +/* xmm registers are *not* callee-save. */ +.set state0, %xmm0 +.set state1, %xmm1 +.set state2, %xmm2 +.set state3, %xmm3 +.set copy0, %xmm4 +.set copy1, %xmm5 +.set copy2, %xmm6 +.set copy3, %xmm7 +.set temp, %xmm8 +.set one, %xmm9 + + /* copy0 = "expand 32-byte k" */ + movaps CONSTANTS(%rip),copy0 + /* copy1,copy2 = key */ + movups 0x00(key),copy1 + movups 0x10(key),copy2 + /* copy3 = counter || zero nonce */ + movq 0x00(counter),copy3 + /* one = 1 || 0 */ + movq $1,%rax + movq %rax,one + +.Lblock: + /* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */ + movdqa copy0,state0 + movdqa copy1,state1 + movdqa copy2,state2 + movdqa copy3,state3 + + movb $10,i +.Lpermute: + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $16,temp + psrld $16,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $12,temp + psrld $20,state1 + por temp,state1 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $8,temp + psrld $24,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $7,temp + psrld $25,state1 + por temp,state1 + + /* state1[0,1,2,3] = state1[1,2,3,0] */ + pshufd $0x39,state1,state1 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + pshufd $0x4e,state2,state2 + /* state3[0,1,2,3] = state3[3,0,1,2] */ + pshufd $0x93,state3,state3 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $16,temp + psrld $16,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $12,temp + psrld $20,state1 + por temp,state1 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $8,temp + psrld $24,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $7,temp + psrld $25,state1 + por temp,state1 + + /* state1[0,1,2,3] = state1[3,0,1,2] */ + pshufd $0x93,state1,state1 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + pshufd $0x4e,state2,state2 + /* state3[0,1,2,3] = state3[1,2,3,0] */ + pshufd $0x39,state3,state3 + + decb i + jnz .Lpermute + + /* output0 = state0 + copy0 */ + paddd copy0,state0 + movups state0,0x00(output) + /* output1 = state1 + copy1 */ + paddd copy1,state1 + movups state1,0x10(output) + /* output2 = state2 + copy2 */ + paddd copy2,state2 + movups state2,0x20(output) + /* output3 = state3 + copy3 */ + paddd copy3,state3 + movups state3,0x30(output) + + /* ++copy3.counter */ + paddq one,copy3 + + /* output += 64, --nblocks */ + addq $64,output + decq nblocks + jnz .Lblock + + /* counter = copy3.counter */ + movq copy3,0x00(counter) + + /* Zero out the potentially sensitive regs, in case nothing uses these again. */ + pxor state0,state0 + pxor state1,state1 + pxor state2,state2 + pxor state3,state3 + pxor copy1,copy1 + pxor copy2,copy2 + pxor temp,temp + + ret +SYM_FUNC_END(__arch_chacha20_blocks_nostack) diff --git a/arch/x86/entry/vdso/vgetrandom.c b/arch/x86/entry/vdso/vgetrandom.c new file mode 100644 index 000000000000..6045ded5da90 --- /dev/null +++ b/arch/x86/entry/vdso/vgetrandom.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ +#include <linux/types.h> + +#include "../../../../lib/vdso/getrandom.c" + +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state); + +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *state) +{ + return __cvdso_getrandom(buffer, len, flags, state); +} + +ssize_t getrandom(void *, size_t, unsigned int, void *) + __attribute__((weak, alias("__vdso_getrandom"))); diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h new file mode 100644 index 000000000000..46f99d735ae6 --- /dev/null +++ b/arch/x86/include/asm/vdso/getrandom.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ +#ifndef __ASM_VDSO_GETRANDOM_H +#define __ASM_VDSO_GETRANDOM_H + +#ifndef __ASSEMBLY__ + +#include <asm/unistd.h> +#include <asm/vvar.h> + +/** + * getrandom_syscall - Invoke the getrandom() syscall. + * @buffer: Destination buffer to fill with random bytes. + * @len: Size of @buffer in bytes. + * @flags: Zero or more GRND_* flags. + * Returns the number of random bytes written to @buffer, or a negative value indicating an error. + */ +static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags) +{ + long ret; + + asm ("syscall" : "=a" (ret) : + "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) : + "rcx", "r11", "memory"); + + return ret; +} + +#define __vdso_rng_data (VVAR(_vdso_rng_data)) + +static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) +{ + if (__vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) + return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data); + return &__vdso_rng_data; +} + +/** + * __arch_chacha20_blocks_nostack - Generate ChaCha20 stream without using the stack. + * @dst_bytes: Destination buffer to hold @nblocks * 64 bytes of output. + * @key: 32-byte input key. + * @counter: 8-byte counter, read on input and updated on return. + * @nblocks: Number of blocks to generate. + * + * Generates a given positive number of blocks of ChaCha20 output with nonce=0, and does not write + * to any stack or memory outside of the parameters passed to it, in order to mitigate stack data + * leaking into forked child processes. + */ +extern void __arch_chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks); + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h index be199a9b2676..71c56586a22f 100644 --- a/arch/x86/include/asm/vdso/vsyscall.h +++ b/arch/x86/include/asm/vdso/vsyscall.h @@ -11,6 +11,8 @@ #include <asm/vvar.h> DEFINE_VVAR(struct vdso_data, _vdso_data); +DEFINE_VVAR_SINGLE(struct vdso_rng_data, _vdso_rng_data); + /* * Update the vDSO data page to keep in sync with kernel timekeeping. */ diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 183e98e49ab9..9d9af37f7cab 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -26,6 +26,8 @@ */ #define DECLARE_VVAR(offset, type, name) \ EMIT_VVAR(name, offset) +#define DECLARE_VVAR_SINGLE(offset, type, name) \ + EMIT_VVAR(name, offset) #else @@ -37,6 +39,10 @@ extern char __vvar_page; extern type timens_ ## name[CS_BASES] \ __attribute__((visibility("hidden"))); \ +#define DECLARE_VVAR_SINGLE(offset, type, name) \ + extern type vvar_ ## name \ + __attribute__((visibility("hidden"))); \ + #define VVAR(name) (vvar_ ## name) #define TIMENS(name) (timens_ ## name) @@ -44,12 +50,22 @@ extern char __vvar_page; type name[CS_BASES] \ __attribute__((section(".vvar_" #name), aligned(16))) __visible +#define DEFINE_VVAR_SINGLE(type, name) \ + type name \ + __attribute__((section(".vvar_" #name), aligned(16))) __visible + #endif /* DECLARE_VVAR(offset, type, name) */ DECLARE_VVAR(128, struct vdso_data, _vdso_data) +#if !defined(_SINGLE_DATA) +#define _SINGLE_DATA +DECLARE_VVAR_SINGLE(640, struct vdso_rng_data, _vdso_rng_data) +#endif + #undef DECLARE_VVAR +#undef DECLARE_VVAR_SINGLE #endif diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore index 7dbfdec53f3d..30d5c8f0e5c7 100644 --- a/tools/testing/selftests/vDSO/.gitignore +++ b/tools/testing/selftests/vDSO/.gitignore @@ -7,3 +7,4 @@ vdso_test_gettimeofday vdso_test_getcpu vdso_standalone_test_x86 vdso_test_getrandom +vdso_test_chacha diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index a33b4d200a32..54a015afe60c 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -3,6 +3,7 @@ include ../lib.mk uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) +SODIUM := $(shell pkg-config --libs libsodium 2>/dev/null) TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday $(OUTPUT)/vdso_test_getcpu TEST_GEN_PROGS += $(OUTPUT)/vdso_test_abi @@ -12,9 +13,15 @@ TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86 endif TEST_GEN_PROGS += $(OUTPUT)/vdso_test_correctness TEST_GEN_PROGS += $(OUTPUT)/vdso_test_getrandom +ifeq ($(uname_M),x86_64) +ifneq ($(SODIUM),) +TEST_GEN_PROGS += $(OUTPUT)/vdso_test_chacha +endif +endif CFLAGS := -std=gnu99 CFLAGS_vdso_standalone_test_x86 := -nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector +CFLAGS_vdso_test_chacha := $(SODIUM) -idirafter $(top_srcdir)/include -idirafter $(top_srcdir)/arch/$(ARCH)/include -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 -Wa,--noexecstack LDFLAGS_vdso_test_correctness := -ldl ifeq ($(CONFIG_X86_32),y) LDLIBS += -lgcc_s @@ -35,3 +42,5 @@ $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c -o $@ \ $(LDFLAGS_vdso_test_correctness) $(OUTPUT)/vdso_test_getrandom: parse_vdso.c +$(OUTPUT)/vdso_test_chacha: CFLAGS += $(CFLAGS_vdso_test_chacha) +$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/arch/$(ARCH)/entry/vdso/vgetrandom-chacha.S diff --git a/tools/testing/selftests/vDSO/vdso_test_chacha.c b/tools/testing/selftests/vDSO/vdso_test_chacha.c new file mode 100644 index 000000000000..bce7a7752b11 --- /dev/null +++ b/tools/testing/selftests/vDSO/vdso_test_chacha.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <sodium/crypto_stream_chacha20.h> +#include <sys/random.h> +#include <string.h> +#include <stdint.h> +#include "../kselftest.h" + +extern void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, const uint8_t *key, uint32_t *counter, size_t nblocks); + +int main(int argc, char *argv[]) +{ + enum { TRIALS = 1000, BLOCKS = 128, BLOCK_SIZE = 64 }; + static const uint8_t nonce[8] = { 0 }; + uint32_t counter[2]; + uint8_t key[32]; + uint8_t output1[BLOCK_SIZE * BLOCKS], output2[BLOCK_SIZE * BLOCKS]; + + ksft_print_header(); + ksft_set_plan(1); + + for (unsigned int trial = 0; trial < TRIALS; ++trial) { + if (getrandom(key, sizeof(key), 0) != sizeof(key)) { + printf("getrandom() failed!\n"); + return KSFT_SKIP; + } + crypto_stream_chacha20(output1, sizeof(output1), nonce, key); + for (unsigned int split = 0; split < BLOCKS; ++split) { + memset(output2, 'X', sizeof(output2)); + memset(counter, 0, sizeof(counter)); + if (split) + __arch_chacha20_blocks_nostack(output2, key, counter, split); + __arch_chacha20_blocks_nostack(output2 + split * BLOCK_SIZE, key, counter, BLOCKS - split); + if (memcmp(output1, output2, sizeof(output1))) + return KSFT_FAIL; + } + } + ksft_test_result_pass("chacha: PASS\n"); + return KSFT_PASS; +}