Message ID: 20240925150059.3955569-30-ardb+git@google.com
Series: x86: Rely on toolchain for relocatable code
On Wed, Sep 25, 2024 at 8:02 AM Ard Biesheuvel <ardb+git@google.com> wrote: > > From: Ard Biesheuvel <ardb@kernel.org> > > Specify the guard symbol for the stack cookie explicitly, rather than > positioning it exactly 40 bytes into the per-CPU area. Doing so removes > the need for the per-CPU region to be absolute rather than relative to > the placement of the per-CPU template region in the kernel image, and > this allows the special handling for absolute per-CPU symbols to be > removed entirely. > > This is a worthwhile cleanup in itself, but it is also a prerequisite > for PIE codegen and PIE linking, which can replace our bespoke and > rather clunky runtime relocation handling. > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org> > --- > arch/x86/Makefile | 4 ++++ > arch/x86/include/asm/init.h | 2 +- > arch/x86/include/asm/processor.h | 11 +++-------- > arch/x86/include/asm/stackprotector.h | 4 ---- > tools/perf/util/annotate.c | 4 ++-- > 5 files changed, 10 insertions(+), 15 deletions(-) > > diff --git a/arch/x86/Makefile b/arch/x86/Makefile > index 6b3fe6e2aadd..b78b7623a4a9 100644 > --- a/arch/x86/Makefile > +++ b/arch/x86/Makefile > @@ -193,6 +193,10 @@ else > KBUILD_RUSTFLAGS += -Cno-redzone=y > KBUILD_RUSTFLAGS += -Ccode-model=kernel > > + ifeq ($(CONFIG_STACKPROTECTOR),y) > + KBUILD_CFLAGS += -mstack-protector-guard-symbol=fixed_percpu_data > + endif > + > # Don't emit relaxable GOTPCREL relocations > KBUILD_AFLAGS_KERNEL += -Wa,-mrelax-relocations=no > KBUILD_CFLAGS_KERNEL += -Wa,-mrelax-relocations=no > diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h > index 14d72727d7ee..3ed0e8ec973f 100644 > --- a/arch/x86/include/asm/init.h > +++ b/arch/x86/include/asm/init.h > @@ -2,7 +2,7 @@ > #ifndef _ASM_X86_INIT_H > #define _ASM_X86_INIT_H > > -#define __head __section(".head.text") > +#define __head __section(".head.text") __no_stack_protector > > struct x86_mapping_info { > void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ > diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h > index 4a686f0e5dbf..56bc36116814 100644 > --- a/arch/x86/include/asm/processor.h > +++ b/arch/x86/include/asm/processor.h > @@ -402,14 +402,9 @@ struct irq_stack { > #ifdef CONFIG_X86_64 > struct fixed_percpu_data { > /* > - * GCC hardcodes the stack canary as %gs:40. Since the > - * irq_stack is the object at %gs:0, we reserve the bottom > - * 48 bytes of the irq stack for the canary. > - * > - * Once we are willing to require -mstack-protector-guard-symbol= > - * support for x86_64 stackprotector, we can get rid of this. > + * Since the irq_stack is the object at %gs:0, the bottom 8 bytes of > + * the irq stack are reserved for the canary. 
> */ > - char gs_base[40]; > unsigned long stack_canary; > }; > > @@ -418,7 +413,7 @@ DECLARE_INIT_PER_CPU(fixed_percpu_data); > > static inline unsigned long cpu_kernelmode_gs_base(int cpu) > { > - return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); > + return (unsigned long)&per_cpu(fixed_percpu_data, cpu); > } > > extern asmlinkage void entry_SYSCALL32_ignore(void); > diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h > index 00473a650f51..d1dcd22a0a4c 100644 > --- a/arch/x86/include/asm/stackprotector.h > +++ b/arch/x86/include/asm/stackprotector.h > @@ -51,10 +51,6 @@ static __always_inline void boot_init_stack_canary(void) > { > unsigned long canary = get_random_canary(); > > -#ifdef CONFIG_X86_64 > - BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); > -#endif > - > current->stack_canary = canary; > #ifdef CONFIG_X86_64 > this_cpu_write(fixed_percpu_data.stack_canary, canary); > diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c > index 37ce43c4eb8f..7ecfedf5edb9 100644 > --- a/tools/perf/util/annotate.c > +++ b/tools/perf/util/annotate.c > @@ -2485,10 +2485,10 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl) > > static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc) > { > - /* On x86_64, %gs:40 is used for stack canary */ > + /* On x86_64, %gs:0 is used for stack canary */ > if (arch__is(arch, "x86")) { > if (loc->segment == INSN_SEG_X86_GS && loc->imm && > - loc->offset == 40) > + loc->offset == 0) As a new perf tool can run on old kernels we may need to have this be something like: (loc->offset == 40 /* pre v6.xx kernels */ || loc->offset == 0 /* v6.xx and later */ ) We could make this dependent on the kernel by processing the os_release string: https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/env.h#n55 but that could well be more trouble than it is worth. Thanks, Ian > return true; > } > > -- > 2.46.0.792.g87dc391469-goog >
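Ian's suggestion amounts to accepting both the old and the new canary location in the detection helper. A minimal sketch of what that could look like, based on the function quoted above (the comments about which kernels use which offset are placeholders, since the release that first carries this change is not known here):

```c
static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc)
{
	/*
	 * On x86_64, the stack canary was %gs:40 before the switch to an
	 * explicit guard symbol and %gs:0 afterwards.  Accept both so a
	 * new perf tool keeps working on older kernels.
	 */
	if (arch__is(arch, "x86")) {
		if (loc->segment == INSN_SEG_X86_GS && loc->imm &&
		    (loc->offset == 40 ||	/* pre-change kernels */
		     loc->offset == 0))		/* with this series applied */
			return true;
	}

	return false;
}
```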
On Wed, 25 Sept 2024 at 17:54, Ian Rogers <irogers@google.com> wrote: > > On Wed, Sep 25, 2024 at 8:02 AM Ard Biesheuvel <ardb+git@google.com> wrote: > > > > From: Ard Biesheuvel <ardb@kernel.org> > > > > Specify the guard symbol for the stack cookie explicitly, rather than > > positioning it exactly 40 bytes into the per-CPU area. Doing so removes > > the need for the per-CPU region to be absolute rather than relative to > > the placement of the per-CPU template region in the kernel image, and > > this allows the special handling for absolute per-CPU symbols to be > > removed entirely. > > > > This is a worthwhile cleanup in itself, but it is also a prerequisite > > for PIE codegen and PIE linking, which can replace our bespoke and > > rather clunky runtime relocation handling. > > > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org> > > --- > > arch/x86/Makefile | 4 ++++ > > arch/x86/include/asm/init.h | 2 +- > > arch/x86/include/asm/processor.h | 11 +++-------- > > arch/x86/include/asm/stackprotector.h | 4 ---- > > tools/perf/util/annotate.c | 4 ++-- > > 5 files changed, 10 insertions(+), 15 deletions(-) > > ... > > diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c > > index 37ce43c4eb8f..7ecfedf5edb9 100644 > > --- a/tools/perf/util/annotate.c > > +++ b/tools/perf/util/annotate.c > > @@ -2485,10 +2485,10 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl) > > > > static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc) > > { > > - /* On x86_64, %gs:40 is used for stack canary */ > > + /* On x86_64, %gs:0 is used for stack canary */ > > if (arch__is(arch, "x86")) { > > if (loc->segment == INSN_SEG_X86_GS && loc->imm && > > - loc->offset == 40) > > + loc->offset == 0) > > As a new perf tool can run on old kernels we may need to have this be > something like: > (loc->offset == 40 /* pre v6.xx kernels */ || loc->offset == 0 /* > v6.xx and later */ ) > > We could make this dependent on the kernel by processing the os_release string: > https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/env.h#n55 > but that could well be more trouble than it is worth. > Yeah. I also wonder what the purpose of this feature is. At the end of this series, the stack cookie will no longer be at a fixed offset of %GS anyway, and so perf will not be able to identify it in the same manner. So it is probably better to just leave this in place, as the %gs:0 case will not exist in the field (assuming that the series lands all at once). Any idea why this deviates from other architectures? Is x86_64 the only arch that needs to identify stack canary accesses in perf? We could rename the symbol to something identifiable, and do it across all architectures, if this really serves a need (and assuming that perf has insight into the symbol table).
On Wed, Sep 25, 2024 at 10:43 AM Ard Biesheuvel <ardb@kernel.org> wrote: > > On Wed, 25 Sept 2024 at 17:54, Ian Rogers <irogers@google.com> wrote: > > > > On Wed, Sep 25, 2024 at 8:02 AM Ard Biesheuvel <ardb+git@google.com> wrote: > > > > > > From: Ard Biesheuvel <ardb@kernel.org> > > > > > > Specify the guard symbol for the stack cookie explicitly, rather than > > > positioning it exactly 40 bytes into the per-CPU area. Doing so removes > > > the need for the per-CPU region to be absolute rather than relative to > > > the placement of the per-CPU template region in the kernel image, and > > > this allows the special handling for absolute per-CPU symbols to be > > > removed entirely. > > > > > > This is a worthwhile cleanup in itself, but it is also a prerequisite > > > for PIE codegen and PIE linking, which can replace our bespoke and > > > rather clunky runtime relocation handling. > > > > > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org> > > > --- > > > arch/x86/Makefile | 4 ++++ > > > arch/x86/include/asm/init.h | 2 +- > > > arch/x86/include/asm/processor.h | 11 +++-------- > > > arch/x86/include/asm/stackprotector.h | 4 ---- > > > tools/perf/util/annotate.c | 4 ++-- > > > 5 files changed, 10 insertions(+), 15 deletions(-) > > > > ... > > > diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c > > > index 37ce43c4eb8f..7ecfedf5edb9 100644 > > > --- a/tools/perf/util/annotate.c > > > +++ b/tools/perf/util/annotate.c > > > @@ -2485,10 +2485,10 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl) > > > > > > static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc) > > > { > > > - /* On x86_64, %gs:40 is used for stack canary */ > > > + /* On x86_64, %gs:0 is used for stack canary */ > > > if (arch__is(arch, "x86")) { > > > if (loc->segment == INSN_SEG_X86_GS && loc->imm && > > > - loc->offset == 40) > > > + loc->offset == 0) > > > > As a new perf tool can run on old kernels we may need to have this be > > something like: > > (loc->offset == 40 /* pre v6.xx kernels */ || loc->offset == 0 /* > > v6.xx and later */ ) > > > > We could make this dependent on the kernel by processing the os_release string: > > https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/env.h#n55 > > but that could well be more trouble than it is worth. > > > > Yeah. I also wonder what the purpose of this feature is. At the end of > this series, the stack cookie will no longer be at a fixed offset of > %GS anyway, and so perf will not be able to identify it in the same > manner. So it is probably better to just leave this in place, as the > %gs:0 case will not exist in the field (assuming that the series lands > all at once). > > Any idea why this deviates from other architectures? Is x86_64 the > only arch that needs to identify stack canary accesses in perf? We > could rename the symbol to something identifiable, and do it across > all architectures, if this really serves a need (and assuming that > perf has insight into the symbol table). This is relatively new work coming from Namhyung for data type profiling and I believe is pretty much just x86 at the moment - although the ever awesome IBM made contributions for PowerPC. The data type profiling is trying to classify memory accesses which is why it cares about the stack canary instruction, the particular encoding shouldn't matter. Thanks, Ian
On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote: > > From: Ard Biesheuvel <ardb@kernel.org> > > Specify the guard symbol for the stack cookie explicitly, rather than > positioning it exactly 40 bytes into the per-CPU area. Doing so removes > the need for the per-CPU region to be absolute rather than relative to > the placement of the per-CPU template region in the kernel image, and > this allows the special handling for absolute per-CPU symbols to be > removed entirely. > > This is a worthwhile cleanup in itself, but it is also a prerequisite > for PIE codegen and PIE linking, which can replace our bespoke and > rather clunky runtime relocation handling. I would like to point out a series that converted the stack protector guard symbol to a normal percpu variable [1], so there was no need to assume anything about the location of the guard symbol. [1] "[PATCH v4 00/16] x86-64: Stack protector and percpu improvements" https://lore.kernel.org/lkml/20240322165233.71698-1-brgerst@gmail.com/ Uros. > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org> > --- > arch/x86/Makefile | 4 ++++ > arch/x86/include/asm/init.h | 2 +- > arch/x86/include/asm/processor.h | 11 +++-------- > arch/x86/include/asm/stackprotector.h | 4 ---- > tools/perf/util/annotate.c | 4 ++-- > 5 files changed, 10 insertions(+), 15 deletions(-) > > diff --git a/arch/x86/Makefile b/arch/x86/Makefile > index 6b3fe6e2aadd..b78b7623a4a9 100644 > --- a/arch/x86/Makefile > +++ b/arch/x86/Makefile > @@ -193,6 +193,10 @@ else > KBUILD_RUSTFLAGS += -Cno-redzone=y > KBUILD_RUSTFLAGS += -Ccode-model=kernel > > + ifeq ($(CONFIG_STACKPROTECTOR),y) > + KBUILD_CFLAGS += -mstack-protector-guard-symbol=fixed_percpu_data > + endif > + > # Don't emit relaxable GOTPCREL relocations > KBUILD_AFLAGS_KERNEL += -Wa,-mrelax-relocations=no > KBUILD_CFLAGS_KERNEL += -Wa,-mrelax-relocations=no > diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h > index 14d72727d7ee..3ed0e8ec973f 100644 > --- a/arch/x86/include/asm/init.h > +++ b/arch/x86/include/asm/init.h > @@ -2,7 +2,7 @@ > #ifndef _ASM_X86_INIT_H > #define _ASM_X86_INIT_H > > -#define __head __section(".head.text") > +#define __head __section(".head.text") __no_stack_protector > > struct x86_mapping_info { > void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ > diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h > index 4a686f0e5dbf..56bc36116814 100644 > --- a/arch/x86/include/asm/processor.h > +++ b/arch/x86/include/asm/processor.h > @@ -402,14 +402,9 @@ struct irq_stack { > #ifdef CONFIG_X86_64 > struct fixed_percpu_data { > /* > - * GCC hardcodes the stack canary as %gs:40. Since the > - * irq_stack is the object at %gs:0, we reserve the bottom > - * 48 bytes of the irq stack for the canary. > - * > - * Once we are willing to require -mstack-protector-guard-symbol= > - * support for x86_64 stackprotector, we can get rid of this. > + * Since the irq_stack is the object at %gs:0, the bottom 8 bytes of > + * the irq stack are reserved for the canary. 
> */ > - char gs_base[40]; > unsigned long stack_canary; > }; > > @@ -418,7 +413,7 @@ DECLARE_INIT_PER_CPU(fixed_percpu_data); > > static inline unsigned long cpu_kernelmode_gs_base(int cpu) > { > - return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); > + return (unsigned long)&per_cpu(fixed_percpu_data, cpu); > } > > extern asmlinkage void entry_SYSCALL32_ignore(void); > diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h > index 00473a650f51..d1dcd22a0a4c 100644 > --- a/arch/x86/include/asm/stackprotector.h > +++ b/arch/x86/include/asm/stackprotector.h > @@ -51,10 +51,6 @@ static __always_inline void boot_init_stack_canary(void) > { > unsigned long canary = get_random_canary(); > > -#ifdef CONFIG_X86_64 > - BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); > -#endif > - > current->stack_canary = canary; > #ifdef CONFIG_X86_64 > this_cpu_write(fixed_percpu_data.stack_canary, canary); > diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c > index 37ce43c4eb8f..7ecfedf5edb9 100644 > --- a/tools/perf/util/annotate.c > +++ b/tools/perf/util/annotate.c > @@ -2485,10 +2485,10 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl) > > static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc) > { > - /* On x86_64, %gs:40 is used for stack canary */ > + /* On x86_64, %gs:0 is used for stack canary */ > if (arch__is(arch, "x86")) { > if (loc->segment == INSN_SEG_X86_GS && loc->imm && > - loc->offset == 40) > + loc->offset == 0) > return true; > } > > -- > 2.46.0.792.g87dc391469-goog >
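For reference, the direction taken by the series Uros points to is to make the guard an ordinary per-CPU variable, as 32-bit x86 and most other architectures already do. A rough sketch of that arrangement (illustrative only, not the exact code from that series):

```c
/*
 * The canary becomes a regular per-CPU variable instead of a field at a
 * hard-coded offset inside struct fixed_percpu_data, so nothing needs to
 * be placed at %gs:40 anymore.
 */
DECLARE_PER_CPU(unsigned long, __stack_chk_guard);

static __always_inline void boot_init_stack_canary(void)
{
	unsigned long canary = get_random_canary();

	current->stack_canary = canary;
	this_cpu_write(__stack_chk_guard, canary);
}
```

The compiler is then pointed at the variable with the guard-symbol option instead of relying on a fixed offset, which is the same end state this patch is after.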
On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote: > > From: Ard Biesheuvel <ardb@kernel.org> > > Build the kernel as a Position Independent Executable (PIE). This > results in more efficient relocation processing for the virtual > displacement of the kernel (for KASLR). More importantly, it instructs > the linker to generate what is actually needed (a program that can be > moved around in memory before execution), which is better than having to > rely on the linker to create a position dependent binary that happens to > tolerate being moved around after poking it in exactly the right manner. > > Note that this means that all codegen should be compatible with PIE, > including Rust objects, so this needs to switch to the small code model > with the PIE relocation model as well. I think that related to this work is the patch series [1] that introduces the changes necessary to build the kernel as Position Independent Executable (PIE) on x86_64 [1]. There are some more places that need to be adapted for PIE. The patch series also introduces objtool functionality to add validation for x86 PIE. [1] "[PATCH RFC 00/43] x86/pie: Make kernel image's virtual address flexible" https://lore.kernel.org/lkml/cover.1682673542.git.houwenlong.hwl@antgroup.com/ Uros. > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org> > --- > arch/x86/Kconfig | 2 +- > arch/x86/Makefile | 11 +++++++---- > arch/x86/boot/compressed/misc.c | 2 ++ > arch/x86/kernel/vmlinux.lds.S | 5 +++++ > drivers/firmware/efi/libstub/x86-stub.c | 2 ++ > 5 files changed, 17 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index 54cb1f14218b..dbb4d284b0e1 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -2187,7 +2187,7 @@ config RANDOMIZE_BASE > # Relocation on x86 needs some additional build support > config X86_NEED_RELOCS > def_bool y > - depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE) > + depends on X86_32 && RELOCATABLE > > config PHYSICAL_ALIGN > hex "Alignment value to which kernel should be aligned" > diff --git a/arch/x86/Makefile b/arch/x86/Makefile > index 83d20f402535..c1dcff444bc8 100644 > --- a/arch/x86/Makefile > +++ b/arch/x86/Makefile > @@ -206,9 +206,8 @@ else > PIE_CFLAGS-$(CONFIG_SMP) += -mstack-protector-guard-reg=gs > endif > > - # Don't emit relaxable GOTPCREL relocations > - KBUILD_AFLAGS_KERNEL += -Wa,-mrelax-relocations=no > - KBUILD_CFLAGS_KERNEL += -Wa,-mrelax-relocations=no $(PIE_CFLAGS-y) > + KBUILD_CFLAGS_KERNEL += $(PIE_CFLAGS-y) > + KBUILD_RUSTFLAGS_KERNEL += -Ccode-model=small -Crelocation-model=pie > endif > > # > @@ -264,12 +263,16 @@ else > LDFLAGS_vmlinux := > endif > > +ifdef CONFIG_X86_64 > +ldflags-pie-$(CONFIG_LD_IS_LLD) := --apply-dynamic-relocs > +ldflags-pie-$(CONFIG_LD_IS_BFD) := -z call-nop=suffix-nop > +LDFLAGS_vmlinux += --pie -z text $(ldflags-pie-y) > + > # > # The 64-bit kernel must be aligned to 2MB. Pass -z max-page-size=0x200000 to > # the linker to force 2MB page size regardless of the default page size used > # by the linker. 
> # > -ifdef CONFIG_X86_64 > LDFLAGS_vmlinux += -z max-page-size=0x200000 > endif > > diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c > index 89f01375cdb7..79e3ffe16f61 100644 > --- a/arch/x86/boot/compressed/misc.c > +++ b/arch/x86/boot/compressed/misc.c > @@ -495,6 +495,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) > error("Destination virtual address changed when not relocatable"); > #endif > > + boot_params_ptr->kaslr_va_shift = virt_addr - LOAD_PHYSICAL_ADDR; > + > debug_putstr("\nDecompressing Linux... "); > > if (init_unaccepted_memory()) { > diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S > index f7e832c2ac61..d172e6e8eaaf 100644 > --- a/arch/x86/kernel/vmlinux.lds.S > +++ b/arch/x86/kernel/vmlinux.lds.S > @@ -459,6 +459,11 @@ xen_elfnote_phys32_entry_offset = > > DISCARDS > > + /DISCARD/ : { > + *(.dynsym .gnu.hash .hash .dynamic .dynstr) > + *(.interp .dynbss .eh_frame .sframe) > + } > + > /* > * Make sure that the .got.plt is either completely empty or it > * contains only the lazy dispatch entries. > diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c > index f8e465da344d..5c03954924fe 100644 > --- a/drivers/firmware/efi/libstub/x86-stub.c > +++ b/drivers/firmware/efi/libstub/x86-stub.c > @@ -912,6 +912,8 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry) > if (status != EFI_SUCCESS) > return status; > > + boot_params_ptr->kaslr_va_shift = virt_addr - LOAD_PHYSICAL_ADDR; > + > entry = decompress_kernel((void *)addr, virt_addr, error); > if (entry == ULONG_MAX) { > efi_free(alloc_size, addr); > -- > 2.46.0.792.g87dc391469-goog >
On Wed, 25 Sept 2024 at 20:54, Uros Bizjak <ubizjak@gmail.com> wrote: > > On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote: > > > > From: Ard Biesheuvel <ardb@kernel.org> > > > > Build the kernel as a Position Independent Executable (PIE). This > > results in more efficient relocation processing for the virtual > > displacement of the kernel (for KASLR). More importantly, it instructs > > the linker to generate what is actually needed (a program that can be > > moved around in memory before execution), which is better than having to > > rely on the linker to create a position dependent binary that happens to > > tolerate being moved around after poking it in exactly the right manner. > > > > Note that this means that all codegen should be compatible with PIE, > > including Rust objects, so this needs to switch to the small code model > > with the PIE relocation model as well. > > I think that related to this work is the patch series [1] that > introduces the changes necessary to build the kernel as Position > Independent Executable (PIE) on x86_64 [1]. There are some more places > that need to be adapted for PIE. The patch series also introduces > objtool functionality to add validation for x86 PIE. > > [1] "[PATCH RFC 00/43] x86/pie: Make kernel image's virtual address flexible" > https://lore.kernel.org/lkml/cover.1682673542.git.houwenlong.hwl@antgroup.com/ > Hi Uros, I am aware of that discussion, as I took part in it as well. I don't think any of those changes are actually needed now - did you notice anything in particular that is missing?
On Wed, Sep 25, 2024 at 9:14 PM Ard Biesheuvel <ardb@kernel.org> wrote: > > On Wed, 25 Sept 2024 at 20:54, Uros Bizjak <ubizjak@gmail.com> wrote: > > > > On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote: > > > > > > From: Ard Biesheuvel <ardb@kernel.org> > > > > > > Build the kernel as a Position Independent Executable (PIE). This > > > results in more efficient relocation processing for the virtual > > > displacement of the kernel (for KASLR). More importantly, it instructs > > > the linker to generate what is actually needed (a program that can be > > > moved around in memory before execution), which is better than having to > > > rely on the linker to create a position dependent binary that happens to > > > tolerate being moved around after poking it in exactly the right manner. > > > > > > Note that this means that all codegen should be compatible with PIE, > > > including Rust objects, so this needs to switch to the small code model > > > with the PIE relocation model as well. > > > > I think that related to this work is the patch series [1] that > > introduces the changes necessary to build the kernel as Position > > Independent Executable (PIE) on x86_64 [1]. There are some more places > > that need to be adapted for PIE. The patch series also introduces > > objtool functionality to add validation for x86 PIE. > > > > [1] "[PATCH RFC 00/43] x86/pie: Make kernel image's virtual address flexible" > > https://lore.kernel.org/lkml/cover.1682673542.git.houwenlong.hwl@antgroup.com/ > > > > Hi Uros, > > I am aware of that discussion, as I took part in it as well. > > I don't think any of those changes are actually needed now - did you > notice anything in particular that is missing? Some time ago I went through the kernel sources and proposed several patches that changed all trivial occurrences of non-RIP addresses to RIP ones. The work was partially based on the mentioned patch series, and I remember, I left some of them out [e.g. 1], because they required a temporary variable. Also, there was discussion about ftrace [2], where no solution was found. Looking through your series, I didn't find some of the non-RIP -> RIP changes proposed by the original series (especially the ftrace part), and noticed that there is no objtool validator proposed to ensure that all generated code is indeed PIE compatible. Speaking of non-RIP -> RIP changes that require a temporary - would it be beneficial to make a macro that would use the RIP form only when #ifdef CONFIG_X86_PIE? That would avoid code size increase when PIE is not needed. [1] https://lore.kernel.org/lkml/a0b69f3fac1834c05f960b916cc6eb0004cdffbf.1682673543.git.houwenlong.hwl@antgroup.com/ [2] https://lore.kernel.org/lkml/20230428094454.0f2f5049@gandalf.local.home/ [3] https://lore.kernel.org/lkml/226af8c63c5bfa361763dd041a997ee84fe926cf.1682673543.git.houwenlong.hwl@antgroup.com/ Thanks and best regards, Uros.
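The macro Uros has in mind could be shaped roughly like the following. Everything here is hypothetical, including the CONFIG_X86_PIE symbol and the macro name; it only illustrates using the RIP-relative form (which costs an extra instruction or register) when building PIE, and keeping the shorter absolute form otherwise:

```c
#include <linux/stringify.h>

/*
 * Hypothetical helper for inline assembly: load the address of a kernel
 * symbol into %rax.  The PIE variant needs a RIP-relative lea; the
 * non-PIE variant can use a sign-extended 32-bit immediate because the
 * kernel image lives in the top 2 GiB of the address space.
 */
#ifdef CONFIG_X86_PIE
#define LOAD_SYM_ADDR(sym)	"leaq " __stringify(sym) "(%%rip), %%rax\n\t"
#else
#define LOAD_SYM_ADDR(sym)	"movq $" __stringify(sym) ", %%rax\n\t"
#endif
```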
On Wed, 25 Sept 2024 at 21:39, Uros Bizjak <ubizjak@gmail.com> wrote: > > On Wed, Sep 25, 2024 at 9:14 PM Ard Biesheuvel <ardb@kernel.org> wrote: > > > > On Wed, 25 Sept 2024 at 20:54, Uros Bizjak <ubizjak@gmail.com> wrote: > > > > > > On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote: > > > > > > > > From: Ard Biesheuvel <ardb@kernel.org> > > > > > > > > Build the kernel as a Position Independent Executable (PIE). This > > > > results in more efficient relocation processing for the virtual > > > > displacement of the kernel (for KASLR). More importantly, it instructs > > > > the linker to generate what is actually needed (a program that can be > > > > moved around in memory before execution), which is better than having to > > > > rely on the linker to create a position dependent binary that happens to > > > > tolerate being moved around after poking it in exactly the right manner. > > > > > > > > Note that this means that all codegen should be compatible with PIE, > > > > including Rust objects, so this needs to switch to the small code model > > > > with the PIE relocation model as well. > > > > > > I think that related to this work is the patch series [1] that > > > introduces the changes necessary to build the kernel as Position > > > Independent Executable (PIE) on x86_64 [1]. There are some more places > > > that need to be adapted for PIE. The patch series also introduces > > > objtool functionality to add validation for x86 PIE. > > > > > > [1] "[PATCH RFC 00/43] x86/pie: Make kernel image's virtual address flexible" > > > https://lore.kernel.org/lkml/cover.1682673542.git.houwenlong.hwl@antgroup.com/ > > > > > > > Hi Uros, > > > > I am aware of that discussion, as I took part in it as well. > > > > I don't think any of those changes are actually needed now - did you > > notice anything in particular that is missing? > > Some time ago I went through the kernel sources and proposed several > patches that changed all trivial occurrences of non-RIP addresses to > RIP ones. The work was partially based on the mentioned patch series, > and I remember, I left some of them out [e.g. 1], because they > required a temporary variable. I have a similar patch in my series, but the DEBUG_ENTRY code just uses pushf 1f@GOTPCREL(%rip) so no temporaries are needed. > Also, there was discussion about ftrace > [2], where no solution was found. > When linking with -z call-nop=suffix-nop, the __fentry__ call via the GOT will be relaxed by the linker into a 5 byte call followed by a 1 byte NOP, so I don't think we need to do anything special here. It might mean we currently lose -mnop-mcount until we find a solution for that in the compiler. In case you remember, I contributed and you merged a GCC patch that makes the __fentry__ emission logic honour -fdirect-access-external-data which should help here. This landed in GCC 14. > Looking through your series, I didn't find some of the non-RIP -> RIP > changes proposed by the original series (especially the ftrace part), > and noticed that there is no objtool validator proposed to ensure that > all generated code is indeed PIE compatible. > What would be the point of that? The linker will complain and throw an error if the code cannot be converted into a PIE executable, so I don't think we need objtool's help for that. > Speaking of non-RIP -> RIP changes that require a temporary - would it > be beneficial to make a macro that would use the RIP form only when > #ifdef CONFIG_X86_PIE? 
That would avoid code size increase when PIE is > not needed. > This series does not make the PIE support configurable. Do you think the code size increase is a concern if all GOT based symbol references are elided, e.g, via -fdirect-access-external-data?
On Wed, Sep 25, 2024 at 10:01 PM Ard Biesheuvel <ardb@kernel.org> wrote: > > On Wed, 25 Sept 2024 at 21:39, Uros Bizjak <ubizjak@gmail.com> wrote: > > > > On Wed, Sep 25, 2024 at 9:14 PM Ard Biesheuvel <ardb@kernel.org> wrote: > > > > > > On Wed, 25 Sept 2024 at 20:54, Uros Bizjak <ubizjak@gmail.com> wrote: > > > > > > > > On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote: > > > > > > > > > > From: Ard Biesheuvel <ardb@kernel.org> > > > > > > > > > > Build the kernel as a Position Independent Executable (PIE). This > > > > > results in more efficient relocation processing for the virtual > > > > > displacement of the kernel (for KASLR). More importantly, it instructs > > > > > the linker to generate what is actually needed (a program that can be > > > > > moved around in memory before execution), which is better than having to > > > > > rely on the linker to create a position dependent binary that happens to > > > > > tolerate being moved around after poking it in exactly the right manner. > > > > > > > > > > Note that this means that all codegen should be compatible with PIE, > > > > > including Rust objects, so this needs to switch to the small code model > > > > > with the PIE relocation model as well. > > > > > > > > I think that related to this work is the patch series [1] that > > > > introduces the changes necessary to build the kernel as Position > > > > Independent Executable (PIE) on x86_64 [1]. There are some more places > > > > that need to be adapted for PIE. The patch series also introduces > > > > objtool functionality to add validation for x86 PIE. > > > > > > > > [1] "[PATCH RFC 00/43] x86/pie: Make kernel image's virtual address flexible" > > > > https://lore.kernel.org/lkml/cover.1682673542.git.houwenlong.hwl@antgroup.com/ > > > > > > > > > > Hi Uros, > > > > > > I am aware of that discussion, as I took part in it as well. > > > > > > I don't think any of those changes are actually needed now - did you > > > notice anything in particular that is missing? > > > > Some time ago I went through the kernel sources and proposed several > > patches that changed all trivial occurrences of non-RIP addresses to > > RIP ones. The work was partially based on the mentioned patch series, > > and I remember, I left some of them out [e.g. 1], because they > > required a temporary variable. > > I have a similar patch in my series, but the DEBUG_ENTRY code just uses > > pushf 1f@GOTPCREL(%rip) > > so no temporaries are needed. > > > Also, there was discussion about ftrace > > [2], where no solution was found. > > > > When linking with -z call-nop=suffix-nop, the __fentry__ call via the > GOT will be relaxed by the linker into a 5 byte call followed by a 1 > byte NOP, so I don't think we need to do anything special here. It > might mean we currently lose -mnop-mcount until we find a solution for > that in the compiler. In case you remember, I contributed and you > merged a GCC patch that makes the __fentry__ emission logic honour > -fdirect-access-external-data which should help here. This landed in > GCC 14. > > > Looking through your series, I didn't find some of the non-RIP -> RIP > > changes proposed by the original series (especially the ftrace part), > > and noticed that there is no objtool validator proposed to ensure that > > all generated code is indeed PIE compatible. > > > > What would be the point of that? 
The linker will complain and throw an > error if the code cannot be converted into a PIE executable, so I > don't think we need objtool's help for that. Indeed. > > Speaking of non-RIP -> RIP changes that require a temporary - would it > > be beneficial to make a macro that would use the RIP form only when > > #ifdef CONFIG_X86_PIE? That would avoid code size increase when PIE is > > not needed. > > > > This series does not make the PIE support configurable. Do you think > the code size increase is a concern if all GOT based symbol references > are elided, e.g, via -fdirect-access-external-data? I was looking at the code size measurement of the original patch series (perhaps these are not relevant with your series) and I think 2.2% - 2.4% code size increase can be problematic. Can you perhaps provide new code size increase measurements with your patches applied? Thanks and BR, Uros.
On 25/09/2024 17:01, Ard Biesheuvel wrote: > From: Ard Biesheuvel <ardb@kernel.org> > > Build the kernel as a Position Independent Executable (PIE). This > results in more efficient relocation processing for the virtual > displacement of the kernel (for KASLR). More importantly, it instructs > the linker to generate what is actually needed (a program that can be > moved around in memory before execution), which is better than having to > rely on the linker to create a position dependent binary that happens to > tolerate being moved around after poking it in exactly the right manner. > > Note that this means that all codegen should be compatible with PIE, > including Rust objects, so this needs to switch to the small code model > with the PIE relocation model as well. > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org> > --- > arch/x86/Kconfig | 2 +- > arch/x86/Makefile | 11 +++++++---- > arch/x86/boot/compressed/misc.c | 2 ++ > arch/x86/kernel/vmlinux.lds.S | 5 +++++ > drivers/firmware/efi/libstub/x86-stub.c | 2 ++ > 5 files changed, 17 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index 54cb1f14218b..dbb4d284b0e1 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -2187,7 +2187,7 @@ config RANDOMIZE_BASE > # Relocation on x86 needs some additional build support > config X86_NEED_RELOCS > def_bool y > - depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE) > + depends on X86_32 && RELOCATABLE > > config PHYSICAL_ALIGN > hex "Alignment value to which kernel should be aligned" > diff --git a/arch/x86/Makefile b/arch/x86/Makefile > index 83d20f402535..c1dcff444bc8 100644 > --- a/arch/x86/Makefile > +++ b/arch/x86/Makefile > @@ -206,9 +206,8 @@ else > PIE_CFLAGS-$(CONFIG_SMP) += -mstack-protector-guard-reg=gs > endif > > - # Don't emit relaxable GOTPCREL relocations > - KBUILD_AFLAGS_KERNEL += -Wa,-mrelax-relocations=no > - KBUILD_CFLAGS_KERNEL += -Wa,-mrelax-relocations=no $(PIE_CFLAGS-y) > + KBUILD_CFLAGS_KERNEL += $(PIE_CFLAGS-y) > + KBUILD_RUSTFLAGS_KERNEL += -Ccode-model=small -Crelocation-model=pie > endif > > # > @@ -264,12 +263,16 @@ else > LDFLAGS_vmlinux := > endif > > +ifdef CONFIG_X86_64 > +ldflags-pie-$(CONFIG_LD_IS_LLD) := --apply-dynamic-relocs > +ldflags-pie-$(CONFIG_LD_IS_BFD) := -z call-nop=suffix-nop > +LDFLAGS_vmlinux += --pie -z text $(ldflags-pie-y) > + > # > # The 64-bit kernel must be aligned to 2MB. Pass -z max-page-size=0x200000 to > # the linker to force 2MB page size regardless of the default page size used > # by the linker. > # > -ifdef CONFIG_X86_64 > LDFLAGS_vmlinux += -z max-page-size=0x200000 > endif > > diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c > index 89f01375cdb7..79e3ffe16f61 100644 > --- a/arch/x86/boot/compressed/misc.c > +++ b/arch/x86/boot/compressed/misc.c > @@ -495,6 +495,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) > error("Destination virtual address changed when not relocatable"); > #endif > > + boot_params_ptr->kaslr_va_shift = virt_addr - LOAD_PHYSICAL_ADDR; > + > debug_putstr("\nDecompressing Linux... 
"); > > if (init_unaccepted_memory()) { > diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S > index f7e832c2ac61..d172e6e8eaaf 100644 > --- a/arch/x86/kernel/vmlinux.lds.S > +++ b/arch/x86/kernel/vmlinux.lds.S > @@ -459,6 +459,11 @@ xen_elfnote_phys32_entry_offset = > > DISCARDS > > + /DISCARD/ : { > + *(.dynsym .gnu.hash .hash .dynamic .dynstr) > + *(.interp .dynbss .eh_frame .sframe) > + } > + > /* > * Make sure that the .got.plt is either completely empty or it > * contains only the lazy dispatch entries. > diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c > index f8e465da344d..5c03954924fe 100644 > --- a/drivers/firmware/efi/libstub/x86-stub.c > +++ b/drivers/firmware/efi/libstub/x86-stub.c > @@ -912,6 +912,8 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry) > if (status != EFI_SUCCESS) > return status; > > + boot_params_ptr->kaslr_va_shift = virt_addr - LOAD_PHYSICAL_ADDR; > + > entry = decompress_kernel((void *)addr, virt_addr, error); > if (entry == ULONG_MAX) { > efi_free(alloc_size, addr); This patch causes a build failure here (on 64-bit): LD .tmp_vmlinux2 NM .tmp_vmlinux2.syms KSYMS .tmp_vmlinux2.kallsyms.S AS .tmp_vmlinux2.kallsyms.o LD vmlinux BTFIDS vmlinux WARN: resolve_btfids: unresolved symbol bpf_lsm_key_free FAILED elf_update(WRITE): invalid section entry size make[5]: *** [scripts/Makefile.vmlinux:34: vmlinux] Error 255 make[5]: *** Deleting file 'vmlinux' make[4]: *** [Makefile:1153: vmlinux] Error 2 make[3]: *** [debian/rules:74: build-arch] Error 2 dpkg-buildpackage: error: make -f debian/rules binary subprocess returned exit status 2 make[2]: *** [scripts/Makefile.package:121: bindeb-pkg] Error 2 make[1]: *** [/home/opc/linux-mainline-worktree2/Makefile:1544: bindeb-pkg] Error 2 make: *** [Makefile:224: __sub-make] Error 2 The parent commit builds fine. With V=1: + ldflags='-m elf_x86_64 -z noexecstack --pie -z text -z call-nop=suffix-nop -z max-page-size=0x200000 --build-id=sha1 --orphan-handling=warn --script=./arch/x86/kernel/vmlinux.lds -Map=vmlinux.map' + ld -m elf_x86_64 -z noexecstack --pie -z text -z call-nop=suffix-nop -z max-page-size=0x200000 --build-id=sha1 --orphan-handling=warn --script=./arch/x86/kernel/vmlinux.lds -Map=vmlinux.map -o vmlinux --whole-archive vmlinux.a .vmlinux.export.o init/version-timestamp.o --no-whole-archive --start-group --end-group .tmp_vmlinux2.kallsyms.o .tmp_vmlinux1.btf.o + is_enabled CONFIG_DEBUG_INFO_BTF + grep -q '^CONFIG_DEBUG_INFO_BTF=y' include/config/auto.conf + info BTFIDS vmlinux + printf ' %-7s %s\n' BTFIDS vmlinux BTFIDS vmlinux + ./tools/bpf/resolve_btfids/resolve_btfids vmlinux WARN: resolve_btfids: unresolved symbol bpf_lsm_key_free FAILED elf_update(WRITE): invalid section entry size I can send the full config off-list if necessary, but looks like it might be enough to set CONFIG_DEBUG_INFO_BTF=y. Vegard
Hi Ard, On 2024-09-25 11:01, Ard Biesheuvel wrote: > From: Ard Biesheuvel <ardb@kernel.org> > > The .head.text section contains code that may execute from a different > address than it was linked at. This is fragile, given that the x86 ABI > can refer to global symbols via absolute or relative references, and the > toolchain assumes that these are interchangeable, which they are not in > this particular case. > > In the case of the PVH code, there are some additional complications: > - the absolute references are in 32-bit code, which get emitted with > R_X86_64_32 relocations, and these are not permitted in PIE code; > - the code in question is not actually relocatable: it can only run > correctly from the physical load address specified in the ELF note. > > So rewrite the code to only rely on relative symbol references: these > are always 32-bits wide, even in 64-bit code, and are resolved by the > linker at build time. > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Juergen queued up my patches to make the PVH entry point position independent (5 commits): https://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git/log/?h=linux-next My commit that corresponds to this patch of yours is: https://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git/commit/?h=linux-next&id=1db29f99edb056d8445876292f53a63459142309 (There are more changes to handle adjusting the page tables.) Regards, Jason
On 2024-09-25 11:01, Ard Biesheuvel wrote: > From: Ard Biesheuvel <ardb@kernel.org> > > Calling C code via a different mapping than it was linked at is > problematic, because the compiler assumes that RIP-relative and absolute > symbol references are interchangeable. GCC in particular may use > RIP-relative per-CPU variable references even when not using -fpic. > > So call xen_prepare_pvh() via its kernel virtual mapping on x86_64, so > that those RIP-relative references produce the correct values. This > matches the pre-existing behavior for i386, which also invokes > xen_prepare_pvh() via the kernel virtual mapping before invoking > startup_32 with paging disabled again. > > Fixes: 7243b93345f7 ("xen/pvh: Bootstrap PVH guest") > Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Tested-by: Jason Andryuk <jason.andryuk@amd.com> Reviewed-by: Jason Andryuk <jason.andryuk@amd.com> I found that before this change xen_prepare_pvh() would call through some pv_ops function pointers into the kernel virtual mapping. Regards, Jason
On Wed, 25 Sept 2024 at 23:11, Jason Andryuk <jason.andryuk@amd.com> wrote: > > Hi Ard, > > On 2024-09-25 11:01, Ard Biesheuvel wrote: > > From: Ard Biesheuvel <ardb@kernel.org> > > > > The .head.text section contains code that may execute from a different > > address than it was linked at. This is fragile, given that the x86 ABI > > can refer to global symbols via absolute or relative references, and the > > toolchain assumes that these are interchangeable, which they are not in > > this particular case. > > > > In the case of the PVH code, there are some additional complications: > > - the absolute references are in 32-bit code, which get emitted with > > R_X86_64_32 relocations, and these are not permitted in PIE code; > > - the code in question is not actually relocatable: it can only run > > correctly from the physical load address specified in the ELF note. > > > > So rewrite the code to only rely on relative symbol references: these > > are always 32-bits wide, even in 64-bit code, and are resolved by the > > linker at build time. > > > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org> > > Juergen queued up my patches to make the PVH entry point position > independent (5 commits): > https://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git/log/?h=linux-next > > My commit that corresponds to this patch of yours is: > https://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git/commit/?h=linux-next&id=1db29f99edb056d8445876292f53a63459142309 > > (There are more changes to handle adjusting the page tables.) > Thanks for the head's up. Those changes look quite similar, so I guess I should just rebase my stuff onto the xen tree. The only thing that I would like to keep from my version is + lea (gdt - pvh_start_xen)(%ebp), %eax + add %eax, 2(%eax) + lgdt (%eax) and - .word gdt_end - gdt_start - .long _pa(gdt_start) + .word gdt_end - gdt_start - 1 + .long gdt_start - gdt The first line is a bugfix, btw, so perhaps I should send that out separately. But my series relies on all 32-bit absolute symbol references being removed, since the linker rejects those when running in PIE mode, and so the second line is needed to get rid of the _pa() there.
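The "- 1" Ard refers to comes from the layout of the pseudo-descriptor that lgdt consumes: its limit field holds the size of the table minus one. A C rendering of the 32-bit descriptor, for illustration only:

```c
#include <stdint.h>

/* The 6-byte operand of lgdt in 32-bit code. */
struct gdt_ptr32 {
	uint16_t limit;	/* size of the GDT in bytes, minus 1 */
	uint32_t base;	/* linear address of the first descriptor */
} __attribute__((packed));
```

That is why `.word gdt_end - gdt_start - 1` is the correct encoding, and why the base can be stored as an offset from a nearby label and fixed up at runtime (the `add %eax, 2(%eax)` above) instead of using an absolute `_pa()` reference.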
On 2024-09-25 17:50, Ard Biesheuvel wrote: > On Wed, 25 Sept 2024 at 23:11, Jason Andryuk <jason.andryuk@amd.com> wrote: >> >> Hi Ard, >> >> On 2024-09-25 11:01, Ard Biesheuvel wrote: >>> From: Ard Biesheuvel <ardb@kernel.org> >>> >>> The .head.text section contains code that may execute from a different >>> address than it was linked at. This is fragile, given that the x86 ABI >>> can refer to global symbols via absolute or relative references, and the >>> toolchain assumes that these are interchangeable, which they are not in >>> this particular case. >>> >>> In the case of the PVH code, there are some additional complications: >>> - the absolute references are in 32-bit code, which get emitted with >>> R_X86_64_32 relocations, and these are not permitted in PIE code; >>> - the code in question is not actually relocatable: it can only run >>> correctly from the physical load address specified in the ELF note. >>> >>> So rewrite the code to only rely on relative symbol references: these >>> are always 32-bits wide, even in 64-bit code, and are resolved by the >>> linker at build time. >>> >>> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> >> >> Juergen queued up my patches to make the PVH entry point position >> independent (5 commits): >> https://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git/log/?h=linux-next >> >> My commit that corresponds to this patch of yours is: >> https://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git/commit/?h=linux-next&id=1db29f99edb056d8445876292f53a63459142309 >> >> (There are more changes to handle adjusting the page tables.) >> > > Thanks for the head's up. Those changes look quite similar, so I guess > I should just rebase my stuff onto the xen tree. > > The only thing that I would like to keep from my version is > > + lea (gdt - pvh_start_xen)(%ebp), %eax If you rebase on top of the xen tree, using rva() would match the rest of the code: lea rva(gdt)(%ebp), %eax > + add %eax, 2(%eax) > + lgdt (%eax) > > and > > - .word gdt_end - gdt_start > - .long _pa(gdt_start) > + .word gdt_end - gdt_start - 1 > + .long gdt_start - gdt > > The first line is a bugfix, btw, so perhaps I should send that out > separately. But my series relies on all 32-bit absolute symbol > references being removed, since the linker rejects those when running > in PIE mode, and so the second line is needed to get rid of the _pa() > there. Sounds good. Regards, Jason
On Wed, 25 Sept 2024 at 22:25, Vegard Nossum <vegard.nossum@oracle.com> wrote: > > > On 25/09/2024 17:01, Ard Biesheuvel wrote: > > From: Ard Biesheuvel <ardb@kernel.org> > > > > Build the kernel as a Position Independent Executable (PIE). This > > results in more efficient relocation processing for the virtual > > displacement of the kernel (for KASLR). More importantly, it instructs > > the linker to generate what is actually needed (a program that can be > > moved around in memory before execution), which is better than having to > > rely on the linker to create a position dependent binary that happens to > > tolerate being moved around after poking it in exactly the right manner. > > > > Note that this means that all codegen should be compatible with PIE, > > including Rust objects, so this needs to switch to the small code model > > with the PIE relocation model as well. > > > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org> > > --- > > arch/x86/Kconfig | 2 +- > > arch/x86/Makefile | 11 +++++++---- > > arch/x86/boot/compressed/misc.c | 2 ++ > > arch/x86/kernel/vmlinux.lds.S | 5 +++++ > > drivers/firmware/efi/libstub/x86-stub.c | 2 ++ > > 5 files changed, 17 insertions(+), 5 deletions(-) > > ... > > This patch causes a build failure here (on 64-bit): > > LD .tmp_vmlinux2 > NM .tmp_vmlinux2.syms > KSYMS .tmp_vmlinux2.kallsyms.S > AS .tmp_vmlinux2.kallsyms.o > LD vmlinux > BTFIDS vmlinux > WARN: resolve_btfids: unresolved symbol bpf_lsm_key_free > FAILED elf_update(WRITE): invalid section entry size > make[5]: *** [scripts/Makefile.vmlinux:34: vmlinux] Error 255 > make[5]: *** Deleting file 'vmlinux' > make[4]: *** [Makefile:1153: vmlinux] Error 2 > make[3]: *** [debian/rules:74: build-arch] Error 2 > dpkg-buildpackage: error: make -f debian/rules binary subprocess > returned exit status 2 > make[2]: *** [scripts/Makefile.package:121: bindeb-pkg] Error 2 > make[1]: *** [/home/opc/linux-mainline-worktree2/Makefile:1544: > bindeb-pkg] Error 2 > make: *** [Makefile:224: __sub-make] Error 2 > > The parent commit builds fine. With V=1: > > + ldflags='-m elf_x86_64 -z noexecstack --pie -z text -z > call-nop=suffix-nop -z max-page-size=0x200000 --build-id=sha1 > --orphan-handling=warn --script=./arch/x86/kernel/vmlinux.lds > -Map=vmlinux.map' > + ld -m elf_x86_64 -z noexecstack --pie -z text -z call-nop=suffix-nop > -z max-page-size=0x200000 --build-id=sha1 --orphan-handling=warn > --script=./arch/x86/kernel/vmlinux.lds -Map=vmlinux.map -o vmlinux > --whole-archive vmlinux.a .vmlinux.export.o init/version-timestamp.o > --no-whole-archive --start-group --end-group .tmp_vmlinux2.kallsyms.o > .tmp_vmlinux1.btf.o > + is_enabled CONFIG_DEBUG_INFO_BTF > + grep -q '^CONFIG_DEBUG_INFO_BTF=y' include/config/auto.conf > + info BTFIDS vmlinux > + printf ' %-7s %s\n' BTFIDS vmlinux > BTFIDS vmlinux > + ./tools/bpf/resolve_btfids/resolve_btfids vmlinux > WARN: resolve_btfids: unresolved symbol bpf_lsm_key_free > FAILED elf_update(WRITE): invalid section entry size > > I can send the full config off-list if necessary, but looks like it > might be enough to set CONFIG_DEBUG_INFO_BTF=y. > Thanks for the report. Turns out that adding the GOT to .rodata bumps the section's sh_entsize to 8, and libelf complains if the section size is not a multiple of the entry size. I'll include a fix in the next revision.
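The failure mode Ard describes maps to an invariant that libelf enforces when writing the file back out: a section whose sh_entsize is non-zero must have a size that is a multiple of that entry size, which .rodata no longer does once the 8-byte-entry GOT is folded into it. A small sketch of that check (illustrative, not resolve_btfids code):

```c
#include <gelf.h>

/*
 * Returns non-zero if the section satisfies the constraint that
 * elf_update(ELF_C_WRITE) checks: sh_size must be a multiple of
 * sh_entsize whenever sh_entsize is non-zero.
 */
static int entsize_ok(Elf_Scn *scn)
{
	GElf_Shdr shdr;

	if (gelf_getshdr(scn, &shdr) == NULL)
		return 0;

	return shdr.sh_entsize == 0 ||
	       (shdr.sh_size % shdr.sh_entsize) == 0;
}
```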
On Wed, Sep 25, 2024 at 2:33 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote: > > > > From: Ard Biesheuvel <ardb@kernel.org> > > > > Specify the guard symbol for the stack cookie explicitly, rather than > > positioning it exactly 40 bytes into the per-CPU area. Doing so removes > > the need for the per-CPU region to be absolute rather than relative to > > the placement of the per-CPU template region in the kernel image, and > > this allows the special handling for absolute per-CPU symbols to be > > removed entirely. > > > > This is a worthwhile cleanup in itself, but it is also a prerequisite > > for PIE codegen and PIE linking, which can replace our bespoke and > > rather clunky runtime relocation handling. > > I would like to point out a series that converted the stack protector > guard symbol to a normal percpu variable [1], so there was no need to > assume anything about the location of the guard symbol. > > [1] "[PATCH v4 00/16] x86-64: Stack protector and percpu improvements" > https://lore.kernel.org/lkml/20240322165233.71698-1-brgerst@gmail.com/ > > Uros. I plan on resubmitting that series sometime after the 6.12 merge window closes. As I recall from the last version, it was decided to wait until after the next LTS release to raise the minimum GCC version to 8.1 and avoid the need to be compatible with the old stack protector layout. Brian Gerst
On Sat, 28 Sept 2024 at 15:41, Brian Gerst <brgerst@gmail.com> wrote: > > On Wed, Sep 25, 2024 at 2:33 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote: > > > > > > From: Ard Biesheuvel <ardb@kernel.org> > > > > > > Specify the guard symbol for the stack cookie explicitly, rather than > > > positioning it exactly 40 bytes into the per-CPU area. Doing so removes > > > the need for the per-CPU region to be absolute rather than relative to > > > the placement of the per-CPU template region in the kernel image, and > > > this allows the special handling for absolute per-CPU symbols to be > > > removed entirely. > > > > > > This is a worthwhile cleanup in itself, but it is also a prerequisite > > > for PIE codegen and PIE linking, which can replace our bespoke and > > > rather clunky runtime relocation handling. > > > > I would like to point out a series that converted the stack protector > > guard symbol to a normal percpu variable [1], so there was no need to > > assume anything about the location of the guard symbol. > > > > [1] "[PATCH v4 00/16] x86-64: Stack protector and percpu improvements" > > https://lore.kernel.org/lkml/20240322165233.71698-1-brgerst@gmail.com/ > > > > Uros. > > I plan on resubmitting that series sometime after the 6.12 merge > window closes. As I recall from the last version, it was decided to > wait until after the next LTS release to raise the minimum GCC version > to 8.1 and avoid the need to be compatible with the old stack > protector layout. > Hi Brian, I'd be more than happy to compare notes on that - I wasn't aware of your intentions here, or I would have reached out before sending this RFC. There are two things that you would need to address for Clang support to work correctly: - the workaround I cc'ed you on the other day [0], - a workaround for the module loader so it tolerates the GOTPCRELX relocations that Clang emits [1] [0] https://lore.kernel.org/all/20241002092534.3163838-2-ardb+git@google.com/ [1] https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/commit/?id=a18121aabbdd
On Fri, Oct 4, 2024 at 9:15 AM Ard Biesheuvel <ardb@kernel.org> wrote: > > On Sat, 28 Sept 2024 at 15:41, Brian Gerst <brgerst@gmail.com> wrote: > > > > On Wed, Sep 25, 2024 at 2:33 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > > > On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote: > > > > > > > > From: Ard Biesheuvel <ardb@kernel.org> > > > > > > > > Specify the guard symbol for the stack cookie explicitly, rather than > > > > positioning it exactly 40 bytes into the per-CPU area. Doing so removes > > > > the need for the per-CPU region to be absolute rather than relative to > > > > the placement of the per-CPU template region in the kernel image, and > > > > this allows the special handling for absolute per-CPU symbols to be > > > > removed entirely. > > > > > > > > This is a worthwhile cleanup in itself, but it is also a prerequisite > > > > for PIE codegen and PIE linking, which can replace our bespoke and > > > > rather clunky runtime relocation handling. > > > > > > I would like to point out a series that converted the stack protector > > > guard symbol to a normal percpu variable [1], so there was no need to > > > assume anything about the location of the guard symbol. > > > > > > [1] "[PATCH v4 00/16] x86-64: Stack protector and percpu improvements" > > > https://lore.kernel.org/lkml/20240322165233.71698-1-brgerst@gmail.com/ > > > > > > Uros. > > > > I plan on resubmitting that series sometime after the 6.12 merge > > window closes. As I recall from the last version, it was decided to > > wait until after the next LTS release to raise the minimum GCC version > > to 8.1 and avoid the need to be compatible with the old stack > > protector layout. > > > > Hi Brian, > > I'd be more than happy to compare notes on that - I wasn't aware of > your intentions here, or I would have reached out before sending this > RFC. > > There are two things that you would need to address for Clang support > to work correctly: > - the workaround I cc'ed you on the other day [0], > - a workaround for the module loader so it tolerates the GOTPCRELX > relocations that Clang emits [1] > > > > [0] https://lore.kernel.org/all/20241002092534.3163838-2-ardb+git@google.com/ > [1] https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/commit/?id=a18121aabbdd The first patch should be applied independently as a bug fix, since it already affects the 32-bit build with clang. I don't have an environment with an older clang compiler to test the second patch, but I'll assume it will be necessary. I did run into an issue with the GOTPCRELX relocations before [1], but I thought it was just an objtool issue and didn't do more testing to know if modules were broken or not. Brian Gerst [1] https://lore.kernel.org/all/20231026160100.195099-6-brgerst@gmail.com/
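For context on the module loader workaround mentioned above: R_X86_64_REX_GOTPCRELX marks a GOT load that may be relaxed into a direct reference, which is what a loader has to do if it does not want to materialize a GOT for modules. A simplified sketch of the mov-to-lea case, assuming the psABI-defined relaxation (this is not the actual kernel code):

```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/*
 * Relax "movq sym@GOTPCREL(%rip), %reg" (REX.W + 0x8b) into
 * "leaq sym(%rip), %reg" (REX.W + 0x8d), so no GOT entry is needed.
 * 'place' points at the 32-bit displacement the relocation applies to;
 * the opcode byte sits two bytes earlier.
 */
static bool relax_rex_gotpcrelx(uint8_t *place, uint64_t sym, uint64_t loc)
{
	int32_t pcrel;

	if (place[-2] != 0x8b)		/* only the mov form is handled here */
		return false;

	place[-2] = 0x8d;			/* mov -> lea */
	pcrel = (int32_t)(sym - (loc + 4));	/* plain PC32 to the symbol */
	memcpy(place, &pcrel, sizeof(pcrel));
	return true;
}
```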
From: Ard Biesheuvel <ardb@kernel.org> The x86_64 port has a number of historical quirks that result in a reliance on toolchain features that are either poorly specified or basically implementation details of the toolchain: - the 'kernel' C model implemented by the compiler is intended for position dependent code residing in the 'negative' 2 GiB of the virtual address space, but is used to create a position independent executable (for virtual KASLR); - the 'kernel' C model has other properties that are not written down anywhere, and may therefore deviate between compilers and versions, which now includes the Rust compilers too (e.g., use %gs not %fs for per-CPU references); - the relocation format used to perform the PIE relocation at boot is complicated and non-standard, as it deals with 3 types of displacements, including 32-bit negative displacements for RIP-relative per-CPU references that are not subject to relocation fixups (as they are places in a separate, disjoint address space); - the relocation table is generated from static relocation metadata taken from the ELF input objects into the linker, and these describe the input not the output - relaxations or other linker tweaks may result in a mismatch between the two, and GNU ld and LLD display different behavior here; - this disjoint per-CPU address space requires elaborate hacks in the linker script and the startup code; - some of the startup code executes from a 1:1 mapping of memory, where RIP-relative references are mandatory, whereas RIP-relative per-CPU variable references can only work correctly from the kernel virtual mapping (as they need to wrap around from the negative 2 GiB space into the 0x0 based per-CPU region); The reason for this odd situation wrt per-CPU variable addressing is the fact that we rely on the user-space TLS arrangement for per-task stack cookies, and this was implemented using a fixed offset of 40 bytes from %GS. If we bump the minimum GCC version to 8.1, we can switch to symbol based stack cookie references, allowing the same arrangement to be adopted as on other architectures, i.e., where the CPU register carries the per-CPU offset, and UP or boot-time per-CPU references point into the per-CPU load area directly (using an offset of 0x0). With that out of the way, we can untangle this whole thing, and replace the bespoke tooling and relocation formats with ordinary, linker generated relocation tables, using the RELR format that reduces the memory footprint of the relocation table by 20x. The compilers can efficiently generate position independent code these days, without unnecessary indirections via the Global Object Table (GOT) except for a handful of special cases (see the KVM patch for an example where a GOT-based indirection is the best choice for pushing the absolute address of a symbol onto the stack in a position independent manner when there are no free GPRs) It also brings us much closer to the ordinary PIE relocation model used for most of user space, which is therefore much better supported and less likely to create problems as we increase the range of compilers and linkers that need to be supported. Tested on GCC 8 - 14 and Clang 15 - 17, using EFI and bare metal boot using a variety of entry points (decompressor, EFI stub, XenPV, PVH) Cc: x86@kernel.org Cc: "H. 
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Uros Bizjak <ubizjak@gmail.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Kees Cook <kees@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Keith Packard <keithp@keithp.com>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: linux-doc@vger.kernel.org
Cc: linux-pm@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: xen-devel@lists.xenproject.org
Cc: linux-efi@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-sparse@vger.kernel.org
Cc: linux-kbuild@vger.kernel.org
Cc: linux-perf-users@vger.kernel.org
Cc: rust-for-linux@vger.kernel.org
Cc: llvm@lists.linux.dev

Ard Biesheuvel (28):
  x86/pvh: Call C code via the kernel virtual mapping
  Documentation: Bump minimum GCC version to 8.1
  x86/tools: Use mmap() to simplify relocs host tool
  x86/boot: Permit GOTPCREL relocations for x86_64 builds
  x86: Define the stack protector guard symbol explicitly
  x86/percpu: Get rid of absolute per-CPU variable placement
  scripts/kallsyms: Avoid 0x0 as the relative base
  scripts/kallsyms: Remove support for absolute per-CPU variables
  x86/tools: Remove special relocation handling for per-CPU variables
  x86/xen: Avoid relocatable quantities in Xen ELF notes
  x86/pvh: Avoid absolute symbol references in .head.text
  x86/pm-trace: Use RIP-relative accesses for .tracedata
  x86/kvm: Use RIP-relative addressing
  x86/rethook: Use RIP-relative reference for return address
  x86/sync_core: Use RIP-relative addressing
  x86/entry_64: Use RIP-relative addressing
  x86/hibernate: Prefer RIP-relative accesses
  x86/boot/64: Determine VA/PA offset before entering C code
  x86/boot/64: Avoid intentional absolute symbol references in .head.text
  x64/acpi: Use PIC-compatible references in wakeup_64.S
  x86/head: Use PIC-compatible symbol references in startup code
  asm-generic: Treat PIC .data.rel.ro sections as .rodata
  tools/objtool: Mark generated sections as writable
  tools/objtool: Treat indirect ftrace calls as direct calls
  x86: Use PIE codegen for the core kernel
  x86/boot: Implement support for ELF RELA/RELR relocations
  x86/kernel: Switch to PIE linking for the core kernel
  x86/tools: Drop x86_64 support from 'relocs' tool

 Documentation/admin-guide/README.rst | 2 +-
 Documentation/arch/x86/zero-page.rst | 3 +-
 Documentation/process/changes.rst | 2 +-
 arch/x86/Kconfig | 3 +-
 arch/x86/Makefile | 22 +-
 arch/x86/boot/Makefile | 1 +
 arch/x86/boot/compressed/Makefile | 2 +-
 arch/x86/boot/compressed/misc.c | 16 +-
 arch/x86/entry/calling.h | 9 +-
 arch/x86/entry/entry_64.S | 12 +-
 arch/x86/entry/vdso/Makefile | 1 +
 arch/x86/include/asm/desc.h | 1 -
 arch/x86/include/asm/init.h | 2 +-
 arch/x86/include/asm/percpu.h | 22 -
 arch/x86/include/asm/pm-trace.h | 4 +-
 arch/x86/include/asm/processor.h | 14 +-
 arch/x86/include/asm/setup.h | 3 +-
 arch/x86/include/asm/stackprotector.h | 4 -
 arch/x86/include/asm/sync_core.h | 3 +-
 arch/x86/include/uapi/asm/bootparam.h | 2 +-
 arch/x86/kernel/acpi/wakeup_64.S | 11 +-
 arch/x86/kernel/head64.c | 76 +++-
 arch/x86/kernel/head_64.S | 40 +-
 arch/x86/kernel/irq_64.c | 1 -
 arch/x86/kernel/kvm.c | 8 +-
 arch/x86/kernel/relocate_kernel_64.S | 6 +-
 arch/x86/kernel/rethook.c | 3 +-
 arch/x86/kernel/setup_percpu.c | 9 +-
 arch/x86/kernel/vmlinux.lds.S | 75 ++--
 arch/x86/platform/pvh/head.S | 57 ++-
 arch/x86/power/hibernate_asm_64.S | 4 +-
 arch/x86/realmode/rm/Makefile | 1 +
 arch/x86/tools/Makefile | 2 +-
 arch/x86/tools/relocs.c | 425 +++----------------
 arch/x86/tools/relocs.h | 11 +-
 arch/x86/tools/relocs_64.c | 18 -
 arch/x86/tools/relocs_common.c | 11 +-
 arch/x86/xen/xen-head.S | 16 +-
 drivers/base/power/trace.c | 6 +-
 drivers/firmware/efi/libstub/x86-stub.c | 2 +
 include/asm-generic/vmlinux.lds.h | 10 +-
 include/linux/compiler.h | 2 +-
 init/Kconfig | 5 -
 kernel/kallsyms.c | 12 +-
 scripts/kallsyms.c | 53 +--
 scripts/link-vmlinux.sh | 4 -
 tools/objtool/check.c | 43 +-
 tools/objtool/elf.c | 2 +-
 tools/objtool/include/objtool/special.h | 2 +-
 tools/perf/util/annotate.c | 4 +-
 50 files changed, 380 insertions(+), 667 deletions(-)
 delete mode 100644 arch/x86/tools/relocs_64.c
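As a footnote to the cover letter's remark that startup code running from the
1:1 mapping must use RIP-relative references: the distinction comes down to how
a symbol's address is taken. The snippet below is purely illustrative and not
taken from the series; the symbol name is made up, and it only mirrors the
kernel's current non-PIE codegen if built without PIE (e.g. gcc -fno-pie -no-pie).

/* Illustrative only: 'early_data' stands in for any kernel symbol. */
unsigned long early_data;

/*
 * RIP-relative: the address is computed from the current instruction
 * pointer, so it is correct wherever the code happens to be mapped,
 * including the early 1:1 (physical) mapping.
 */
static unsigned long addr_rip_relative(void)
{
	unsigned long addr;

	asm ("leaq early_data(%%rip), %0" : "=r" (addr));
	return addr;
}

/*
 * Absolute: a sign-extended 32-bit immediate holding the link-time
 * virtual address, roughly what the kernel code model emits.  It only
 * becomes valid once the kernel virtual mapping is active and any KASLR
 * relocation fixups have been applied.
 */
static unsigned long addr_absolute(void)
{
	unsigned long addr;

	asm ("movq $early_data, %0" : "=r" (addr));
	return addr;
}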