diff mbox series

[v2,1/2] x86/kexec: Add EFI config table identity mapping for kexec kernel

Message ID 20240715185309.1637839-2-steve.wahl@hpe.com
State New
Headers show
Series Resolve problems with kexec identity mapping | expand

Commit Message

Steve Wahl July 15, 2024, 6:53 p.m. UTC
From: Tao Liu <ltao@redhat.com>

A kexec kernel boot failure is sometimes observed on AMD CPUs due to
an unmapped EFI config table array.  This can be seen when "nogbpages"
is on the kernel command line, and has been observed as a full BIOS
reboot rather than a successful kexec.

This was also the cause of reported regressions attributed to Commit
7143c5f4cf20 ("x86/mm/ident_map: Use gbpages only where full GB page
should be mapped.") which was subsequently reverted.

To avoid this page fault, explicitly include the EFI config table
array in the kexec identity map.

Further explanation:

The following 2 commits caused the EFI config table array to be
accessed when enabling SEV at kernel startup.

    commit ec1c66af3a30 ("x86/compressed/64: Detect/setup SEV/SME features
                          earlier during boot")
    commit c01fce9cef84 ("x86/compressed: Add SEV-SNP feature
                          detection/setup")

This is in the code that examines whether SEV should be enabled or
not, so it can even affect systems that are not SEV capable.

This may result in a page fault if the EFI config table array's
address is unmapped. Since the page fault occurs before the new kernel
establishes its own identity map and page fault routines, it is
unrecoverable and kexec fails.

Most often, this problem is not seen because the EFI config table
array gets included in the map by the luck of being placed at a memory
address close enough to other memory areas that *are* included in the
map created by kexec.

Both the "nogbpages" command line option and the "use gbpages only
where full GB page should be mapped" patch greatly reduce the chance
of being included in the map by luck, which is why the problem
appears.

Signed-off-by: Tao Liu <ltao@redhat.com>
Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
Tested-by: Pavin Joseph <me@pavinjoseph.com>
Tested-by: Sarah Brofeldt <srhb@dbc.dk>
Tested-by: Eric Hagberg <ehagberg@gmail.com>
---
 arch/x86/kernel/machine_kexec_64.c | 35 ++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

Comments

Ard Biesheuvel July 16, 2024, 3:17 p.m. UTC | #1
On Mon, 15 Jul 2024 at 11:53, Steve Wahl <steve.wahl@hpe.com> wrote:
>
> From: Tao Liu <ltao@redhat.com>
>
> A kexec kernel boot failure is sometimes observed on AMD CPUs due to
> an unmapped EFI config table array.  This can be seen when "nogbpages"
> is on the kernel command line, and has been observed as a full BIOS
> reboot rather than a successful kexec.
>
> This was also the cause of reported regressions attributed to Commit
> 7143c5f4cf20 ("x86/mm/ident_map: Use gbpages only where full GB page
> should be mapped.") which was subsequently reverted.
>
> To avoid this page fault, explicitly include the EFI config table
> array in the kexec identity map.
>
> Further explanation:
>
> The following 2 commits caused the EFI config table array to be
> accessed when enabling sev at kernel startup.
>
>     commit ec1c66af3a30 ("x86/compressed/64: Detect/setup SEV/SME features
>                           earlier during boot")
>     commit c01fce9cef84 ("x86/compressed: Add SEV-SNP feature
>                           detection/setup")
>
> This is in the code that examines whether SEV should be enabled or
> not, so it can even affect systems that are not SEV capable.
>
> This may result in a page fault if the EFI config table array's
> address is unmapped. Since the page fault occurs before the new kernel
> establishes its own identity map and page fault routines, it is
> unrecoverable and kexec fails.
>
> Most often, this problem is not seen because the EFI config table
> array gets included in the map by the luck of being placed at a memory
> address close enough to other memory areas that *are* included in the
> map created by kexec.
>
> Both the "nogbpages" command line option and the "use gbpages only
> where full GB page should be mapped" patch greatly reduce the chance
> of being included in the map by luck, which is why the problem
> appears.
>
> Signed-off-by: Tao Liu <ltao@redhat.com>
> Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
> Tested-by: Pavin Joseph <me@pavinjoseph.com>
> Tested-by: Sarah Brofeldt <srhb@dbc.dk>
> Tested-by: Eric Hagberg <ehagberg@gmail.com>
> ---
>  arch/x86/kernel/machine_kexec_64.c | 35 ++++++++++++++++++++++++++----
>  1 file changed, 31 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
> index cc0f7f70b17b..563d119f9f29 100644
> --- a/arch/x86/kernel/machine_kexec_64.c
> +++ b/arch/x86/kernel/machine_kexec_64.c
> @@ -28,6 +28,7 @@
>  #include <asm/setup.h>
>  #include <asm/set_memory.h>
>  #include <asm/cpu.h>
> +#include <asm/efi.h>
>
>  #ifdef CONFIG_ACPI
>  /*
> @@ -83,10 +84,12 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {
>  #endif
>
>  static int
> -map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)

I think we can keep the name - the array of EFI config table
references could be considered part of the system table, even though
it may live in a separate allocation.

> +map_efi_tables(struct x86_mapping_info *info, pgd_t *level4p)
>  {
>  #ifdef CONFIG_EFI
>         unsigned long mstart, mend;
> +       void *kaddr;
> +       int ret;
>
>         if (!efi_enabled(EFI_BOOT))
>                 return 0;
> @@ -102,6 +105,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
>         if (!mstart)
>                 return 0;
>
> +       ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
> +       if (ret)
> +               return ret;
> +
> +       kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
> +       if (!kaddr) {
> +               pr_err("Could not map UEFI system table\n");
> +               return -ENOMEM;
> +       }
> +
> +       mstart = efi_config_table;
> +
> +       if (efi_enabled(EFI_64BIT)) {
> +               efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr;
> +
> +               mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables;
> +       } else {
> +               efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr;
> +
> +               mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables;
> +       }
> +
> +       memunmap(kaddr);
> +
>         return kernel_ident_mapping_init(info, level4p, mstart, mend);
>  #endif
>         return 0;
> @@ -241,10 +268,10 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
>         }
>
>         /*
> -        * Prepare EFI systab and ACPI tables for kexec kernel since they are
> -        * not covered by pfn_mapped.
> +        * Prepare EFI systab, config table and ACPI tables for kexec kernel

Please avoid 'config table' here, as it is ambiguous. IMO you can just
drop this hunk (and the one below)

> +        * since they are not covered by pfn_mapped.
>          */
> -       result = map_efi_systab(&info, level4p);
> +       result = map_efi_tables(&info, level4p);
>         if (result)
>                 return result;
>
> --
> 2.26.2
>
diff mbox series

Patch

diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index cc0f7f70b17b..563d119f9f29 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -28,6 +28,7 @@ 
 #include <asm/setup.h>
 #include <asm/set_memory.h>
 #include <asm/cpu.h>
+#include <asm/efi.h>
 
 #ifdef CONFIG_ACPI
 /*
@@ -83,10 +84,12 @@  const struct kexec_file_ops * const kexec_file_loaders[] = {
 #endif
 
 static int
-map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
+map_efi_tables(struct x86_mapping_info *info, pgd_t *level4p)
 {
 #ifdef CONFIG_EFI
 	unsigned long mstart, mend;
+	void *kaddr;
+	int ret;
 
 	if (!efi_enabled(EFI_BOOT))
 		return 0;
@@ -102,6 +105,30 @@  map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
 	if (!mstart)
 		return 0;
 
+	ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
+	if (ret)
+		return ret;
+
+	kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
+	if (!kaddr) {
+		pr_err("Could not map UEFI system table\n");
+		return -ENOMEM;
+	}
+
+	mstart = efi_config_table;
+
+	if (efi_enabled(EFI_64BIT)) {
+		efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr;
+
+		mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables;
+	} else {
+		efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr;
+
+		mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables;
+	}
+
+	memunmap(kaddr);
+
 	return kernel_ident_mapping_init(info, level4p, mstart, mend);
 #endif
 	return 0;
@@ -241,10 +268,10 @@  static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 	}
 
 	/*
-	 * Prepare EFI systab and ACPI tables for kexec kernel since they are
-	 * not covered by pfn_mapped.
+	 * Prepare EFI systab, config table and ACPI tables for kexec kernel
+	 * since they are not covered by pfn_mapped.
 	 */
-	result = map_efi_systab(&info, level4p);
+	result = map_efi_tables(&info, level4p);
 	if (result)
 		return result;