diff mbox series

Do not clear BSS region in x86 stub

Message ID DS7PR19MB5709B2A263E769B461091B0D8BA32@DS7PR19MB5709.namprd19.prod.outlook.com
State Superseded
Headers show
Series Do not clear BSS region in x86 stub | expand

Commit Message

Shao, Marshall July 17, 2024, 3:16 p.m. UTC
Clearing the BSS region may cause the UEFI firmware to malfunction
during boot.

When booting the kernel from an older firmware version that has TPM
enabled and the MemoryOverwriteRequestControl bit set to 1, the 
firmware's boot service might encounter an exception if it attempts 
to initialize the BSS region within the x86 stub.

To circumvent the firmware exception, it is advisable to enlarge the 
BOOT_STACK_SIZE and to perform the initialization of static variables 
prior to the decompression of the bzImage.

Signed-off-by: Marshall Shao <marshall.shao@dell.com>
---
 arch/x86/boot/compressed/misc.c         | 8 +++-----
 arch/x86/include/asm/boot.h             | 2 +-
 drivers/firmware/efi/libstub/x86-stub.c | 5 -----
 3 files changed, 4 insertions(+), 11 deletions(-)

Comments

Shao, Marshall July 20, 2024, 3:40 p.m. UTC | #1
Hi Ard,

Thanks for your reply. Here is the problem I encountered, and the problem
still exists in the mainline:

Firmware: UEFI 2.4,
EFI loader: systemd-boot

When I update my kernel from LTS v6.1.x to v6.6.x, the kernel fails to
 boot when it jumps from the handover protocol. And it fails only on an
old Firmware that MemoryOverwriteRequestControl(MOR) bit is set to 1.

According to the TCG Reset Attack Mitigation Spec, a Memory Clear Method
will be loaded in the memory on boot, and this causes the EFI loader
and OS memory address to shift. Another important factor is that my 
BSS region and boot_idt entries are not as clean as those in QEMU and 
other platforms. 

This means that clearing BSS in the handover function can cause the 
firmware's IDT/GDT corruption, as you mentioned here commit <7b8439b0369b> 
(x86/efistub: Drop redundant clearing of BSS). I noticed that most of 
variables in BSS will be initialized before accessing, and only a few are 
not, that's why I removed memset part in handover entry point.

In terms of the BOOT_STACK_SIZE, I found out that the boot_stack region is
used in this case (at the beginning of _bss), and 0x4000 is not sufficient 
to cover this issue since the data in boot_heap will be overwritten during
decompression. Prior to image decompression in EFI stub, it was ok.

00000000008fc000 g       .bss   0000000000000000 .hidden _bss              
00000000008fc000 l     O .bss   0000000000004000 boot_stack                
0000000000900000 g     O .bss   0000000000000004 .hidden spurious_nmi_count
0000000000900000 l     O .bss   0000000000000000 boot_stack_end            
0000000000900010 g     O .bss   0000000000000018 .hidden pio_ops           
0000000000900028 g     O .bss   0000000000000008 .hidden boot_params_ptr   
0000000000900040 l     O .bss   0000000000001000 scratch.0                 
0000000000901040 l     O .bss   0000000000010000 boot_heap        

My thought was that increasing the size of boot_stack should be harmless.
As for the memset part, there could be an alternative way, althought it 
looks a bit ugly.

memset(_bss+0x10000, 0, _ebss - _bss - 0x10000)

However, I've updated the diff, if you think this is more like a 
workaround, please take this thread as a bug report. Thanks!

Regards,

Marshall Shao


Signed-off-by: Marshall Shao <marshall.shao@dell.com>
---
 arch/x86/boot/compressed/misc.c         | 4 ++--
 arch/x86/include/asm/boot.h             | 2 +-
 drivers/firmware/efi/libstub/x86-5lvl.c | 2 +-
 drivers/firmware/efi/libstub/x86-stub.c | 2 --
 include/linux/decompress/mm.h           | 2 +-
 5 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 944454306ef4..49b68f57dd18 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -50,8 +50,8 @@ struct boot_params *boot_params_ptr;
 
 struct port_io_ops pio_ops;
 
-memptr free_mem_ptr;
-memptr free_mem_end_ptr;
+memptr free_mem_ptr __section(".data");
+memptr free_mem_end_ptr  __section(".data");
 int spurious_nmi_count;
 
 static char *vidmem;
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 3e5b111e619d..312bc87ab027 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -33,7 +33,7 @@
 #endif
 
 #ifdef CONFIG_X86_64
-# define BOOT_STACK_SIZE	0x4000
+# define BOOT_STACK_SIZE	0x10000
 
 /*
  * Used by decompressor's startup_32() to allocate page tables for identity
diff --git a/drivers/firmware/efi/libstub/x86-5lvl.c b/drivers/firmware/efi/libstub/x86-5lvl.c
index 77359e802181..bebae4fdfb93 100644
--- a/drivers/firmware/efi/libstub/x86-5lvl.c
+++ b/drivers/firmware/efi/libstub/x86-5lvl.c
@@ -10,7 +10,7 @@
 
 bool efi_no5lvl;
 
-static void (*la57_toggle)(void *cr3);
+static void (*la57_toggle)(void *cr3) __section(".data");
 
 static const struct desc_struct gdt[] = {
 	[GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index 078055b054e3..ffc62af50669 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -21,7 +21,6 @@
 #include "efistub.h"
 #include "x86-stub.h"
 
-extern char _bss[], _ebss[];
 
 const efi_system_table_t *efi_system_table;
 const efi_dxe_services_table_t *efi_dxe_table;
@@ -1059,7 +1058,6 @@ void __noreturn efi_stub_entry(efi_handle_t handle,
 void efi_handover_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg,
 			struct boot_params *boot_params)
 {
-	memset(_bss, 0, _ebss - _bss);
 	efi_stub_entry(handle, sys_table_arg, boot_params);
 }
 
diff --git a/include/linux/decompress/mm.h b/include/linux/decompress/mm.h
index ac862422df15..62c04691c898 100644
--- a/include/linux/decompress/mm.h
+++ b/include/linux/decompress/mm.h
@@ -36,7 +36,7 @@
 /* A trivial malloc implementation, adapted from
  *  malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
  */
-STATIC_RW_DATA unsigned long malloc_ptr;
+STATIC_RW_DATA unsigned long malloc_ptr  __section(".data");
 STATIC_RW_DATA int malloc_count;
 
 MALLOC_VISIBLE void *malloc(int size)
Ard Biesheuvel July 21, 2024, 1:26 p.m. UTC | #2
On Sat, 20 Jul 2024 at 17:40, Shao, Marshall <Marshall.Shao@dell.com> wrote:
>
> Hi Ard,
>
> Thanks for your reply. Here is the problem I encountered, and the problem
> still exists in the mainline:
>
> Firmware: UEFI 2.4,
> EFI loader: systemd-boot
>
> When I update my kernel from LTS v6.1.x to v6.6.x, the kernel fails to
>  boot when it jumps from the handover protocol. And it fails only on an
> old Firmware that MemoryOverwriteRequestControl(MOR) bit is set to 1.
>
> According to the TCG Reset Attack Mitigation Spec, a Memory Clear Method
> will be loaded in the memory on boot, and this causes the EFI loader
> and OS memory address to shift. Another important factor is that my
> BSS region and boot_idt entries are not as clean as those in QEMU and
> other platforms.
>
> This means that clearing BSS in the handover function can cause the
> firmware's IDT/GDT corruption, as you mentioned here commit <7b8439b0369b>

This commit ID does not exist, I take it you mean
ebf5a79acf9a2970e93d30a9e97b08913ef15711

> (x86/efistub: Drop redundant clearing of BSS).

systemd-boot does not use the handover entrypoint, it uses the native
PE entrypoint, which no longer clears BSS with the commit above
applied.

> I noticed that most of
> variables in BSS will be initialized before accessing, and only a few are
> not, that's why I removed memset part in handover entry point.
>

It would be better not to rely on special semantics that deviate from
standard C. That said, it would also be better not to rely on this
funky EFI handover protocol in the first place, which is only
implemented by downstream, distro versions of GRUB. Given that GRUB
now supports the native EFI entrypoint properly, the handover protocol
is essentially deprecated.

This does not mean, of course, that we should stop supporting it. But
removing the memset() there may break things in a way that only
becomes apparent once the changes trickle down to systems running
those older GRUBs.

> In terms of the BOOT_STACK_SIZE, I found out that the boot_stack region is
> used in this case (at the beginning of _bss), and 0x4000 is not sufficient
> to cover this issue since the data in boot_heap will be overwritten during
> decompression.

EFI boot uses the EFI firmware stack, and never enters the
decompressor, where the address of boot_stack_end is loaded into
ESP/RSP.

This means that increasing the boot stack size does not prevent an
overrun, but is affecting the image layout in a different way,
avoiding your boot issue.

It would be nice to get to the bottom of this: can you double check
that the PE image metadata is accurate? llvm-readelf -a can be used to
dump the PE header of a bzImage.

> Prior to image decompression in EFI stub, it was ok.
>
> 00000000008fc000 g       .bss   0000000000000000 .hidden _bss
> 00000000008fc000 l     O .bss   0000000000004000 boot_stack
> 0000000000900000 g     O .bss   0000000000000004 .hidden spurious_nmi_count
> 0000000000900000 l     O .bss   0000000000000000 boot_stack_end
> 0000000000900010 g     O .bss   0000000000000018 .hidden pio_ops
> 0000000000900028 g     O .bss   0000000000000008 .hidden boot_params_ptr
> 0000000000900040 l     O .bss   0000000000001000 scratch.0
> 0000000000901040 l     O .bss   0000000000010000 boot_heap
>
> My thought was that increasing the size of boot_stack should be harmless.

I'm not bothered by the change itself, I just want to make sure that
the fix is guaranteed to address the problem rather than paper over
it.

> As for the memset part, there could be an alternative way, althought it
> looks a bit ugly.
>
> memset(_bss+0x10000, 0, _ebss - _bss - 0x10000)
>

So now you are applying the memset only to part of BSS, right? How
does this help?


> However, I've updated the diff, if you think this is more like a
> workaround, please take this thread as a bug report. Thanks!
>
> Regards,
>
> Marshall Shao
>
>
> Signed-off-by: Marshall Shao <marshall.shao@dell.com>
> ---
>  arch/x86/boot/compressed/misc.c         | 4 ++--
>  arch/x86/include/asm/boot.h             | 2 +-
>  drivers/firmware/efi/libstub/x86-5lvl.c | 2 +-
>  drivers/firmware/efi/libstub/x86-stub.c | 2 --
>  include/linux/decompress/mm.h           | 2 +-
>  5 files changed, 5 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
> index 944454306ef4..49b68f57dd18 100644
> --- a/arch/x86/boot/compressed/misc.c
> +++ b/arch/x86/boot/compressed/misc.c
> @@ -50,8 +50,8 @@ struct boot_params *boot_params_ptr;
>
>  struct port_io_ops pio_ops;
>
> -memptr free_mem_ptr;
> -memptr free_mem_end_ptr;
> +memptr free_mem_ptr __section(".data");
> +memptr free_mem_end_ptr  __section(".data");
>  int spurious_nmi_count;
>
>  static char *vidmem;
> diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
> index 3e5b111e619d..312bc87ab027 100644
> --- a/arch/x86/include/asm/boot.h
> +++ b/arch/x86/include/asm/boot.h
> @@ -33,7 +33,7 @@
>  #endif
>
>  #ifdef CONFIG_X86_64
> -# define BOOT_STACK_SIZE       0x4000
> +# define BOOT_STACK_SIZE       0x10000
>
>  /*
>   * Used by decompressor's startup_32() to allocate page tables for identity
> diff --git a/drivers/firmware/efi/libstub/x86-5lvl.c b/drivers/firmware/efi/libstub/x86-5lvl.c
> index 77359e802181..bebae4fdfb93 100644
> --- a/drivers/firmware/efi/libstub/x86-5lvl.c
> +++ b/drivers/firmware/efi/libstub/x86-5lvl.c
> @@ -10,7 +10,7 @@
>
>  bool efi_no5lvl;
>
> -static void (*la57_toggle)(void *cr3);
> +static void (*la57_toggle)(void *cr3) __section(".data");
>
>  static const struct desc_struct gdt[] = {
>         [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
> diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
> index 078055b054e3..ffc62af50669 100644
> --- a/drivers/firmware/efi/libstub/x86-stub.c
> +++ b/drivers/firmware/efi/libstub/x86-stub.c
> @@ -21,7 +21,6 @@
>  #include "efistub.h"
>  #include "x86-stub.h"
>
> -extern char _bss[], _ebss[];
>
>  const efi_system_table_t *efi_system_table;
>  const efi_dxe_services_table_t *efi_dxe_table;
> @@ -1059,7 +1058,6 @@ void __noreturn efi_stub_entry(efi_handle_t handle,
>  void efi_handover_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg,
>                         struct boot_params *boot_params)
>  {
> -       memset(_bss, 0, _ebss - _bss);
>         efi_stub_entry(handle, sys_table_arg, boot_params);
>  }
>
> diff --git a/include/linux/decompress/mm.h b/include/linux/decompress/mm.h
> index ac862422df15..62c04691c898 100644
> --- a/include/linux/decompress/mm.h
> +++ b/include/linux/decompress/mm.h
> @@ -36,7 +36,7 @@
>  /* A trivial malloc implementation, adapted from
>   *  malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
>   */
> -STATIC_RW_DATA unsigned long malloc_ptr;
> +STATIC_RW_DATA unsigned long malloc_ptr  __section(".data");
>  STATIC_RW_DATA int malloc_count;
>
>  MALLOC_VISIBLE void *malloc(int size)
> --
> 2.34.1
>
Shao, Marshall July 22, 2024, 11:45 a.m. UTC | #3
Hi Ard,

> Given that GRUB now supports the native EFI entrypoint properly, 
> the handover protocol is essentially deprecated.

In my case, the systemd-boot jumped into the EFI stub code via
handover protocol, this may not be an orthodox way to boot the kernel
but it performs well on the others, I have tested on at least 
6 firmware.

I understand that the handover protocol is going to be deprecated.
However, as of now, I can't guarantee which EFI loader will be 
used to load my bzImage. Although it’s not very common, booting 
from the handover protocol with uncleaned BSS memory is possible.

>> memset(_bss+0x10000, 0, _ebss - _bss - 0x10000)

> So now you are applying the memset only to part of BSS, right? How
> does this help?

This part doesn't work without increasing the BOOT_STACK_SIZE.

And following content is my PE metadata, many thanks!


File: bzImage
Format: COFF-x86-64
Arch: x86_64
AddressSize: 64bit
ImageFileHeader {
  Machine: IMAGE_FILE_MACHINE_AMD64 (0x8664)
  SectionCount: 3
  TimeDateStamp: 1970-01-01 00:00:00 (0x0)
  PointerToSymbolTable: 0x0
  SymbolCount: 0
  StringTableSize: 0
  OptionalHeaderSize: 160
  Characteristics [ (0x206)
    IMAGE_FILE_DEBUG_STRIPPED (0x200)
    IMAGE_FILE_EXECUTABLE_IMAGE (0x2)
    IMAGE_FILE_LINE_NUMS_STRIPPED (0x4)
  ]
}
ImageOptionalHeader {
  Magic: 0x20B
  MajorLinkerVersion: 2
  MinorLinkerVersion: 20
  SizeOfCode: 9416704
  SizeOfInitializedData: 241664
  SizeOfUninitializedData: 0
  AddressOfEntryPoint: 0x8F8990
  BaseOfCode: 0x4000
  ImageBase: 0x0
  SectionAlignment: 4096
  FileAlignment: 512
  MajorOperatingSystemVersion: 0
  MinorOperatingSystemVersion: 0
  MajorImageVersion: 3
  MinorImageVersion: 0
  MajorSubsystemVersion: 0
  MinorSubsystemVersion: 0
  SizeOfImage: 9674752
  SizeOfHeaders: 4096
  Subsystem: IMAGE_SUBSYSTEM_EFI_APPLICATION (0xA)
  Characteristics [ (0x100)
    IMAGE_DLL_CHARACTERISTICS_NX_COMPAT (0x100)
  ]
  SizeOfStackReserve: 0
  SizeOfStackCommit: 0
  SizeOfHeapReserve: 0
  SizeOfHeapCommit: 0
  NumberOfRvaAndSize: 6
  DataDirectory {
    ExportTableRVA: 0x0
    ExportTableSize: 0x0
    ImportTableRVA: 0x0
    ImportTableSize: 0x0
    ResourceTableRVA: 0x0
    ResourceTableSize: 0x0
    ExceptionTableRVA: 0x0
    ExceptionTableSize: 0x0
    CertificateTableRVA: 0x0
    CertificateTableSize: 0x0
    BaseRelocationTableRVA: 0x0
    BaseRelocationTableSize: 0x0
  }
}
DOSHeader {
  Magic: MZ
  UsedBytesInTheLastPage: 0
  FileSizeInPages: 0
  NumberOfRelocationItems: 0
  HeaderSizeInParagraphs: 0
  MinimumExtraParagraphs: 0
  MaximumExtraParagraphs: 0
  InitialRelativeSS: 0
  InitialSP: 0
  Checksum: 0
  InitialIP: 0
  InitialRelativeCS: 0
  AddressOfRelocationTable: 0
  OverlayNumber: 0
  OEMid: 0
  OEMinfo: 0
  AddressOfNewExeHeader: 64
}
Sections [
  Section {
    Number: 1
    Name: .setup (2E 73 65 74 75 70 00 00)
    VirtualSize: 0x3000
    VirtualAddress: 0x1000
    RawDataSize: 12288
    PointerToRawData: 0x1000
    PointerToRelocations: 0x0
    PointerToLineNumbers: 0x0
    RelocationCount: 0
    LineNumberCount: 0
    Characteristics [ (0x42000040)
      IMAGE_SCN_CNT_INITIALIZED_DATA (0x40)
      IMAGE_SCN_MEM_DISCARDABLE (0x2000000)
      IMAGE_SCN_MEM_READ (0x40000000)
    ]
  }
  Section {
    Number: 2
    Name: .text (2E 74 65 78 74 00 00 00)
    VirtualSize: 0x8FB000
    VirtualAddress: 0x4000
    RawDataSize: 9416704
    PointerToRawData: 0x4000
    PointerToRelocations: 0x0
    PointerToLineNumbers: 0x0
    RelocationCount: 0
    LineNumberCount: 0
    Characteristics [ (0x60000020)
      IMAGE_SCN_CNT_CODE (0x20)
      IMAGE_SCN_MEM_EXECUTE (0x20000000)
      IMAGE_SCN_MEM_READ (0x40000000)
    ]
  }
  Section {
    Number: 3
    Name: .data (2E 64 61 74 61 00 00 00)
    VirtualSize: 0x3B000
    VirtualAddress: 0x8FF000
    RawDataSize: 1024
    PointerToRawData: 0x8FF000
    PointerToRelocations: 0x0
    PointerToLineNumbers: 0x0
    RelocationCount: 0
    LineNumberCount: 0
    Characteristics [ (0xC0000040)
      IMAGE_SCN_CNT_INITIALIZED_DATA (0x40)
      IMAGE_SCN_MEM_READ (0x40000000)
      IMAGE_SCN_MEM_WRITE (0x80000000)
    ]
  }
]
Relocations [
]
UnwindInformation [
]
Symbols [
]
Ard Biesheuvel July 22, 2024, 10:14 p.m. UTC | #4
On Mon, 22 Jul 2024 at 13:48, Shao, Marshall <Marshall.Shao@dell.com> wrote:
>
> Hi Ard,
>
> > Given that GRUB now supports the native EFI entrypoint properly,
> > the handover protocol is essentially deprecated.
>
> In my case, the systemd-boot jumped into the EFI stub code via
> handover protocol, this may not be an orthodox way to boot the kernel
> but it performs well on the others, I have tested on at least
> 6 firmware.
>

systemd-boot does not implement the EFI handover protocol.
systemd-stub does implement it (for UKIs) but only for kernel versions
v5.8 or older.

The EFI handover protocol is known to be problematic as the loaders
often fail to allocate memory for the entire image, and only allocate
enough pages to load the bzImage itself.

This means that clearing BSS will wipe unrelated memory if the region
after the image happens to be used already. It also means that not
clearing BSS is just a crutch, and the correct fix is to ensure that
systemd-stub allocates the correct number of pages, and clears the
ones that are not covered by the bzImage payload.

> I understand that the handover protocol is going to be deprecated.
> However, as of now, I can't guarantee which EFI loader will be
> used to load my bzImage. Although it’s not very common, booting
> from the handover protocol with uncleaned BSS memory is possible.
>

systemd-boot does not use the EFI handover protocol. Please try to
determine where this confusion comes from: are you using a UKI image
perhaps?

> >> memset(_bss+0x10000, 0, _ebss - _bss - 0x10000)
>
> > So now you are applying the memset only to part of BSS, right? How
> > does this help?
>
> This part doesn't work without increasing the BOOT_STACK_SIZE.
>

... because the 0x10000 value would be incorrect otherwise?

I am trying to understand *why* this particular change works around
the issue. Please elaborate.

My preliminary conclusion here is that your implementation of the EFI
handover protocol (which I fail to understand where it comes from) is
not allocating enough memory. This should be fixed on the bootloader
side, as not clearing the BSS does not prevent this memory from being
corrupted.
Shao, Marshall July 23, 2024, 2:21 p.m. UTC | #5
Hi Ard,

Many thanks for your reply.

> systemd-boot does not use the EFI handover protocol. Please try to
> determine where this confusion comes from: are you using a UKI image
> perhaps?

I can confirm that both systemd-boot and stub will be used, and you 
are correct about the stub part, because currently, the stub will lead 
the system to handover protocol and trigger the problem.

> I am trying to understand *why* this particular change works around
> the issue. Please elaborate.

When I removed the memset, and booted to efi_decompress_kernel, the
boot service crashed, and it indicated that the memory region from _bss
to the end of boot_heap cannot be overwritten. Upon inspecting the data 
in the BSS region found one thing is that the _bss address is not fixed on 
each boot (when the MOR bit is set to 1), and it changes randomly. 

For example, in normal boot the _bss address is 0xffee0000, if I set MOR 
to 1, then the address shifts to 0xff990000 or 0xff991000 or
 0xff993000. I cannot predict which will be the starting address for the 
next boot.

Since the entire BSS region was not cleaned, and it contains zeros and 
other data, so I tried to increase the boot_stack size by 0x3000 to 
cover the 'fragile' part. 

> My preliminary conclusion here is that your implementation of the EFI
> handover protocol (which I fail to understand where it comes from) is
> not allocating enough memory. This should be fixed on the bootloader
> side, as not clearing the BSS does not prevent this memory from being
> corrupted.

I understand that the handover protocol is nearing the end of its support 
and it seems I am only one experiencing the issue. However,
from the perspective of backward compatibility, I think this patch maybe 
useful.
Ard Biesheuvel July 23, 2024, 2:26 p.m. UTC | #6
On Tue, 23 Jul 2024 at 16:21, Shao, Marshall <Marshall.Shao@dell.com> wrote:
>
> Hi Ard,
>
> Many thanks for your reply.
>
> > systemd-boot does not use the EFI handover protocol. Please try to
> > determine where this confusion comes from: are you using a UKI image
> > perhaps?
>
> I can confirm that both systemd-boot and stub will be used, and you
> are correct about the stub part, because currently, the stub will lead
> the system to handover protocol and trigger the problem.
>

Can you explain why this is the case? systemd-stub should only use the
EFI handover protocol for v5.8 or older.

> > I am trying to understand *why* this particular change works around
> > the issue. Please elaborate.
>
> When I removed the memset, and booted to efi_decompress_kernel, the
> boot service crashed, and it indicated that the memory region from _bss
> to the end of boot_heap cannot be overwritten. Upon inspecting the data
> in the BSS region found one thing is that the _bss address is not fixed on
> each boot (when the MOR bit is set to 1), and it changes randomly.
>
> For example, in normal boot the _bss address is 0xffee0000, if I set MOR
> to 1, then the address shifts to 0xff990000 or 0xff991000 or
>  0xff993000. I cannot predict which will be the starting address for the
> next boot.
>
> Since the entire BSS region was not cleaned, and it contains zeros and
> other data, so I tried to increase the boot_stack size by 0x3000 to
> cover the 'fragile' part.
>

This is not a proper fix. As I indicated in my previous reply, even if
you omit the memset() of BSS, the running code will still treat it as
usable memory, whereas the memory is already allocated for something
else.

> > My preliminary conclusion here is that your implementation of the EFI
> > handover protocol (which I fail to understand where it comes from) is
> > not allocating enough memory. This should be fixed on the bootloader
> > side, as not clearing the BSS does not prevent this memory from being
> > corrupted.
>
> I understand that the handover protocol is nearing the end of its support
> and it seems I am only one experiencing the issue. However,
> from the perspective of backward compatibility, I think this patch maybe
> useful.
>

Again, this is not an issue in the implementation of the EFI handover
protocol. It is an issue in the implementation of the bootloader.

I filed an issue with systemd here:
https://github.com/systemd/systemd/issues/33816

I will not consider this patch for the kernel - please work with
systemd upstream and/or your downstream to clarify why the EFI
handover protocol is being used to begin with, and get it fixed in
your code base.
Shao, Marshall July 24, 2024, 1:48 a.m. UTC | #7
> Can you explain why this is the case? systemd-stub should only use the
> EFI handover protocol for v5.8 or older.

Are you referring to this comment? In systemd/src/boot/efi/linux.c

> This method works for Linux 5.8 and newer on ARM/Aarch64, x86/x68_64 
> and RISC-V.

I noticed that this was not present in systemd v249, that could be the reason. 
Upgrading the systemd version may solve the issue. I will give it a try.

Anyway, thanks for bringing the issue to systemd's attention.
diff mbox series

Patch

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b70e4a21c15f..bac5a3c55c2c 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -356,11 +356,9 @@  unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
 				void (*error)(char *x))
 {
 	unsigned long entry;
-
-	if (!free_mem_ptr) {
-		free_mem_ptr     = (unsigned long)boot_heap;
-		free_mem_end_ptr = (unsigned long)boot_heap + sizeof(boot_heap);
-	}
+	free_mem_ptr     = (unsigned long)boot_heap;
+	free_mem_end_ptr = (unsigned long)boot_heap + sizeof(boot_heap);
+	malloc_ptr = free_mem_ptr;
 
 	if (__decompress(input_data, input_len, NULL, NULL, outbuf, output_len,
 			 NULL, error) < 0)
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 3e5b111e619d..312bc87ab027 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -33,7 +33,7 @@ 
 #endif
 
 #ifdef CONFIG_X86_64
-# define BOOT_STACK_SIZE	0x4000
+# define BOOT_STACK_SIZE	0x10000
 
 /*
  * Used by decompressor's startup_32() to allocate page tables for identity
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index 1983fd3bf392..d92d2ccc709b 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -21,7 +21,6 @@ 
 #include "efistub.h"
 #include "x86-stub.h"
 
-extern char _bss[], _ebss[];
 
 const efi_system_table_t *efi_system_table;
 const efi_dxe_services_table_t *efi_dxe_table;
@@ -476,9 +475,6 @@  efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
 	efi_status_t status;
 	char *cmdline_ptr;
 
-	if (efi_is_native())
-		memset(_bss, 0, _ebss - _bss);
-
 	efi_system_table = sys_table_arg;
 
 	/* Check if we were booted by the EFI firmware */
@@ -1000,7 +996,6 @@  void __noreturn efi_stub_entry(efi_handle_t handle,
 void efi_handover_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg,
 			struct boot_params *boot_params)
 {
-	memset(_bss, 0, _ebss - _bss);
 	efi_stub_entry(handle, sys_table_arg, boot_params);
 }