diff mbox series

[RFC] efi/tpm: add efi.tpm_log as a reserved region in 820_table_firmware

Message ID 20240911104109.1831501-1-usamaarif642@gmail.com
State New
Headers show
Series [RFC] efi/tpm: add efi.tpm_log as a reserved region in 820_table_firmware | expand

Commit Message

Usama Arif Sept. 11, 2024, 10:41 a.m. UTC
Looking at the TPM spec [1]

If the ACPI TPM2 table contains the address and size of the Platform
Firmware TCG log, firmware “pins” the memory associated with the
Platform FirmwareTCG log, and reports this memory as “Reserved” memory
via the INT 15h/E820 interface.

It looks like the firmware should pass this as reserved in e820 memory
map. However, it doesn't seem to. The firmware being tested on is:
dmidecode -s bios-version
edk2-20240214-2.el9

When this area is not reserved, it comes up as usable in
/sys/firmware/memmap. This means that kexec, which uses that memmap
to find usable memory regions, can select the region where efi.tpm_log
is and overwrite it and relocate_kernel.

Having a fix in firmware can be difficult to get through. As a secondary
fix, this patch marks that region as reserved in e820_table_firmware if it
is currently E820_TYPE_RAM so that kexec doesn't use it for kernel segments.

[1] https://trustedcomputinggroup.org/wp-content/uploads/PC-ClientPlatform_Profile_for_TPM_2p0_Systems_v49_161114_public-review.pdf

Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 arch/x86/include/asm/e820/api.h | 2 ++
 arch/x86/kernel/e820.c          | 6 ++++++
 arch/x86/platform/efi/efi.c     | 9 +++++++++
 drivers/firmware/efi/tpm.c      | 2 +-
 include/linux/efi.h             | 7 +++++++
 5 files changed, 25 insertions(+), 1 deletion(-)

Comments

Usama Arif Sept. 11, 2024, 2:34 p.m. UTC | #1
On 11/09/2024 12:51, Ard Biesheuvel wrote:
> On Wed, 11 Sept 2024 at 12:41, Usama Arif <usamaarif642@gmail.com> wrote:
>>
>> Looking at the TPM spec [1]
>>
>> If the ACPI TPM2 table contains the address and size of the Platform
>> Firmware TCG log, firmware “pins” the memory associated with the
>> Platform FirmwareTCG log, and reports this memory as “Reserved” memory
>> via the INT 15h/E820 interface.
>>
>> It looks like the firmware should pass this as reserved in e820 memory
>> map. However, it doesn't seem to. The firmware being tested on is:
>> dmidecode -s bios-version
>> edk2-20240214-2.el9
>>
>> When this area is not reserved, it comes up as usable in
>> /sys/firmware/memmap. This means that kexec, which uses that memmap
>> to find usable memory regions, can select the region where efi.tpm_log
>> is and overwrite it and relocate_kernel.
>>
>> Having a fix in firmware can be difficult to get through. As a secondary
>> fix, this patch marks that region as reserved in e820_table_firmware if it
>> is currently E820_TYPE_RAM so that kexec doesn't use it for kernel segments.
>>
>> [1] https://trustedcomputinggroup.org/wp-content/uploads/PC-ClientPlatform_Profile_for_TPM_2p0_Systems_v49_161114_public-review.pdf
>>
>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Forgot to add:

Reported-by: Breno Leitao <leitao@debian.org>

> 
> I would expect the EFI memory map to E820 conversion implemented in
> the EFI stub to take care of this.
> 
> If you are not booting via the EFI stub, the bootloader is performing
> this conversion, and so it should be done there instead.
> 
I will look into this and report back.
Thanks

> 
>> ---
>>  arch/x86/include/asm/e820/api.h | 2 ++
>>  arch/x86/kernel/e820.c          | 6 ++++++
>>  arch/x86/platform/efi/efi.c     | 9 +++++++++
>>  drivers/firmware/efi/tpm.c      | 2 +-
>>  include/linux/efi.h             | 7 +++++++
>>  5 files changed, 25 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
>> index 2e74a7f0e935..4e9aa24f03bd 100644
>> --- a/arch/x86/include/asm/e820/api.h
>> +++ b/arch/x86/include/asm/e820/api.h
>> @@ -16,6 +16,8 @@ extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type);
>>
>>  extern void e820__range_add   (u64 start, u64 size, enum e820_type type);
>>  extern u64  e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type);
>> +extern u64  e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type,
>> +                                       enum e820_type new_type);
>>  extern u64  e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type);
>>  extern u64  e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type);
>>
>> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
>> index 4893d30ce438..912400161623 100644
>> --- a/arch/x86/kernel/e820.c
>> +++ b/arch/x86/kernel/e820.c
>> @@ -538,6 +538,12 @@ u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size,
>>         return __e820__range_update(t, start, size, old_type, new_type);
>>  }
>>
>> +u64 __init e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type,
>> +                                      enum e820_type new_type)
>> +{
>> +       return __e820__range_update(e820_table_firmware, start, size, old_type, new_type);
>> +}
>> +
>>  /* Remove a range of memory from the E820 table: */
>>  u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
>>  {
>> diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
>> index 88a96816de9a..aa95f77d7a30 100644
>> --- a/arch/x86/platform/efi/efi.c
>> +++ b/arch/x86/platform/efi/efi.c
>> @@ -171,6 +171,15 @@ static void __init do_add_efi_memmap(void)
>>         e820__update_table(e820_table);
>>  }
>>
>> +/* Reserve firmware area if it was marked as RAM */
>> +void arch_update_firmware_area(u64 addr, u64 size)
>> +{
>> +       if (e820__get_entry_type(addr, addr + size) == E820_TYPE_RAM) {
>> +               e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
>> +               e820__update_table(e820_table_firmware);
>> +       }
>> +}
>> +
>>  /*
>>   * Given add_efi_memmap defaults to 0 and there is no alternative
>>   * e820 mechanism for soft-reserved memory, import the full EFI memory
>> diff --git a/drivers/firmware/efi/tpm.c b/drivers/firmware/efi/tpm.c
>> index e8d69bd548f3..8e6e7131d718 100644
>> --- a/drivers/firmware/efi/tpm.c
>> +++ b/drivers/firmware/efi/tpm.c
>> @@ -60,6 +60,7 @@ int __init efi_tpm_eventlog_init(void)
>>         }
>>
>>         tbl_size = sizeof(*log_tbl) + log_tbl->size;
>> +       arch_update_firmware_area(efi.tpm_log, tbl_size);
>>         memblock_reserve(efi.tpm_log, tbl_size);
>>
>>         if (efi.tpm_final_log == EFI_INVALID_TABLE_ADDR) {
>> @@ -107,4 +108,3 @@ int __init efi_tpm_eventlog_init(void)
>>         early_memunmap(log_tbl, sizeof(*log_tbl));
>>         return ret;
>>  }
>> -
>> diff --git a/include/linux/efi.h b/include/linux/efi.h
>> index 6bf3c4fe8511..9c239cdff771 100644
>> --- a/include/linux/efi.h
>> +++ b/include/linux/efi.h
>> @@ -1371,4 +1371,11 @@ extern struct blocking_notifier_head efivar_ops_nh;
>>  void efivars_generic_ops_register(void);
>>  void efivars_generic_ops_unregister(void);
>>
>> +#ifdef CONFIG_X86_64
>> +void __init arch_update_firmware_area(u64 addr, u64 size);
>> +#else
>> +static inline void __init arch_update_firmware_area(u64 addr, u64 size)
>> +{
>> +}
>> +#endif
>>  #endif /* _LINUX_EFI_H */
>> --
>> 2.43.5
>>
Ard Biesheuvel Sept. 12, 2024, 10:51 a.m. UTC | #2
On Thu, 12 Sept 2024 at 12:23, Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 11/09/2024 12:51, Ard Biesheuvel wrote:
> > On Wed, 11 Sept 2024 at 12:41, Usama Arif <usamaarif642@gmail.com> wrote:
> >>
> >> Looking at the TPM spec [1]
> >>
> >> If the ACPI TPM2 table contains the address and size of the Platform
> >> Firmware TCG log, firmware “pins” the memory associated with the
> >> Platform FirmwareTCG log, and reports this memory as “Reserved” memory
> >> via the INT 15h/E820 interface.
> >>
> >> It looks like the firmware should pass this as reserved in e820 memory
> >> map. However, it doesn't seem to. The firmware being tested on is:
> >> dmidecode -s bios-version
> >> edk2-20240214-2.el9
> >>
> >> When this area is not reserved, it comes up as usable in
> >> /sys/firmware/memmap. This means that kexec, which uses that memmap
> >> to find usable memory regions, can select the region where efi.tpm_log
> >> is and overwrite it and relocate_kernel.
> >>
> >> Having a fix in firmware can be difficult to get through. As a secondary
> >> fix, this patch marks that region as reserved in e820_table_firmware if it
> >> is currently E820_TYPE_RAM so that kexec doesn't use it for kernel segments.
> >>
> >> [1] https://trustedcomputinggroup.org/wp-content/uploads/PC-ClientPlatform_Profile_for_TPM_2p0_Systems_v49_161114_public-review.pdf
> >>
> >> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >
> > I would expect the EFI memory map to E820 conversion implemented in
> > the EFI stub to take care of this.
> >
>
> So I have been making a prototype with EFI stub, and the unfinished version is looking like a
> horrible hack.
>
> The only way to do this in libstub is to pass log_tbl all the way from efi_retrieve_tcg2_eventlog
> to efi_stub_entry and from there to setup_e820.
> While going through the efi memory map and converting it to e820 table in setup_e820, you have to check
> if log_tbl falls in any of the ranges and if the range is E820_TYPE_RAM. If that condition is satisfied,
> then you have to split that range into 3. i.e. the E820_TYPE_RAM range before tpm_log, the tpm_log
> E820_TYPE_RESERVED range, and the E820_TYPE_RAM range after. There are no helper functions, so this
> splitting involves playing with a lot of pointers, and it looks quite ugly. I believe doing this
> way is more likely to introduce bugs.
>
> If we are having to compensate for an EFI bug, would it make sense to do it in the way done
> in RFC and do it in kernel rather than libstub? It is simple and very likely to be bug free.
>

I don't see how this could be an EFI bug, given that it does not deal
with E820 tables at all.
Breno Leitao Sept. 12, 2024, 1:03 p.m. UTC | #3
Hello Ard,

On Thu, Sep 12, 2024 at 12:51:57PM +0200, Ard Biesheuvel wrote:
> I don't see how this could be an EFI bug, given that it does not deal
> with E820 tables at all.

I want to back up a little bit and make sure I am following the
discussion.
Usama Arif Sept. 12, 2024, 1:54 p.m. UTC | #4
On 12/09/2024 14:10, Ard Biesheuvel wrote:
> Does the below help at all?
> 
> --- a/drivers/firmware/efi/tpm.c
> +++ b/drivers/firmware/efi/tpm.c
> @@ -60,7 +60,7 @@ int __init efi_tpm_eventlog_init(void)
>         }
> 
>         tbl_size = sizeof(*log_tbl) + log_tbl->size;
> -       memblock_reserve(efi.tpm_log, tbl_size);
> +       efi_mem_reserve(efi.tpm_log, tbl_size);
> 
>         if (efi.tpm_final_log == EFI_INVALID_TABLE_ADDR) {
>                 pr_info("TPM Final Events table not present\n");

Unfortunately not. efi_mem_reserve updates e820_table, while kexec looks at /sys/firmware/memmap
which is e820_table_firmware.

arch_update_firmware_area introduced in the RFC patch does the same thing as efi_mem_reserve does at
its end, just with e820_table_firmware instead of e820_table.
i.e. efi_mem_reserve does:
	e820__range_update(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
	e820__update_table(e820_table);

while arch_update_firmware_area does:
	e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
	e820__update_table(e820_table_firmware);

Thanks,
Usama
Ard Biesheuvel Sept. 12, 2024, 2:14 p.m. UTC | #5
(cc Dave)

Full thread here:
https://lore.kernel.org/all/CAMj1kXG1hbiafKRyC5qM1Vj5X7x-dmLndqqo2AYnHMRxDz-80w@mail.gmail.com/T/#u

On Thu, 12 Sept 2024 at 16:05, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Thu, 12 Sept 2024 at 15:55, Usama Arif <usamaarif642@gmail.com> wrote:
> >
> >
> >
> > On 12/09/2024 14:10, Ard Biesheuvel wrote:
> > > Does the below help at all?
> > >
> > > --- a/drivers/firmware/efi/tpm.c
> > > +++ b/drivers/firmware/efi/tpm.c
> > > @@ -60,7 +60,7 @@ int __init efi_tpm_eventlog_init(void)
> > >         }
> > >
> > >         tbl_size = sizeof(*log_tbl) + log_tbl->size;
> > > -       memblock_reserve(efi.tpm_log, tbl_size);
> > > +       efi_mem_reserve(efi.tpm_log, tbl_size);
> > >
> > >         if (efi.tpm_final_log == EFI_INVALID_TABLE_ADDR) {
> > >                 pr_info("TPM Final Events table not present\n");
> >
> > Unfortunately not. efi_mem_reserve updates e820_table, while kexec looks at /sys/firmware/memmap
> > which is e820_table_firmware.
> >
> > arch_update_firmware_area introduced in the RFC patch does the same thing as efi_mem_reserve does at
> > its end, just with e820_table_firmware instead of e820_table.
> > i.e. efi_mem_reserve does:
> >         e820__range_update(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
> >         e820__update_table(e820_table);
> >
> > while arch_update_firmware_area does:
> >         e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
> >         e820__update_table(e820_table_firmware);
> >
>
> Shame.
>
> Using efi_mem_reserve() is appropriate here in any case, but I guess
> kexec on x86 needs to be fixed to juggle the EFI memory map, memblock
> table, and 3 (!) versions of the E820 table in the correct way
> (e820_table, e820_table_kexec and e820_table_firmware)
>
> Perhaps we can put this additional logic in x86's implementation of
> efi_arch_mem_reserve()? AFAICT, all callers of efi_mem_reserve() deal
> with configuration tables produced by the firmware that may not be
> reserved correctly if kexec looks at e820_table_firmware[] only.
Ard Biesheuvel Sept. 12, 2024, 3:21 p.m. UTC | #6
On Thu, 12 Sept 2024 at 16:29, Breno Leitao <leitao@debian.org> wrote:
>
> On Thu, Sep 12, 2024 at 03:10:43PM +0200, Ard Biesheuvel wrote:
> > On Thu, 12 Sept 2024 at 15:03, Breno Leitao <leitao@debian.org> wrote:
> > > On Thu, Sep 12, 2024 at 12:51:57PM +0200, Ard Biesheuvel wrote:
> > > > I don't see how this could be an EFI bug, given that it does not deal
> > > > with E820 tables at all.
> > >
> > > I want to back up a little bit and make sure I am following the
> > > discussion.
> > >
> > > From what I understand from previous discussion, we have an EFI bug as
> > > the root cause of this issue.
> > >
> > > This happens because the EFI does NOT mark the EFI TPM event log memory
> > > region as reserved (EFI_RESERVED_TYPE).
> >
> > Why do you think EFI should use EFI_RESERVED_TYPE in this case?
> >
> > The EFI spec is very clear that EFI_RESERVED_TYPE really shouldn't be
> > used for anything by EFI itself. It is quite common for EFI
> > configuration tables to be passed as EfiRuntimeServicesData (SMBIOS),
> > EfiBootServicesData (ESRT) or EFiAcpiReclaim (ACPI tables).
> >
> > Reserved memory is mostly for memory that even the firmware does not
> > know what it is for, i.e., particular platform specific uses.
> >
> > In general, it is up to the OS to ensure that EFI configuration tables
> > that it cares about should be reserved in the correct way.
>
> Thanks for the explanation.
>
> So, if I understand what you meant here, the TPM event log memory range
> shouldn't be listed as a memory region in EFI memory map (as passed by
> the firmware to the OS).
>
> Hence, this is not a EFI firmware bug, but a OS/Kernel bug.
>
> Am I correct with the statements above?
>

No, not entirely. But I also missed the face that this table is
actually created by the EFI stub in Linux, not the firmware. It is
*not* the same memory region that the TPM2 ACPI table describes, and
so what the various specs say about that is entirely irrelevant.

The TPM event log configuration table is created by the EFI stub,
which uses the TCG2::GetEventLog () protocol method to obtain it. This
must be done by the EFI stub because these protocols will no longer be
available once the OS boots. But the data is not used by the EFI stub,
only by the OS, which is why it is passed in memory like this.

The memory in question is allocated as EFI_LOADER_DATA, and so we are
relying on the OS to know that this memory is special, and needs to be
preserved.

I think the solution here is to use a different memory type:

--- a/drivers/firmware/efi/libstub/tpm.c
+++ b/drivers/firmware/efi/libstub/tpm.c
@@ -96,7 +96,7 @@ static void efi_retrieve_tcg2_eventlog(int version,
efi_physical_addr_t log_loca
        }

        /* Allocate space for the logs and copy them. */
-       status = efi_bs_call(allocate_pool, EFI_LOADER_DATA,
+       status = efi_bs_call(allocate_pool, EFI_ACPI_RECLAIM_MEMORY,
                             sizeof(*log_tbl) + log_size, (void **)&log_tbl);

        if (status != EFI_SUCCESS) {

which will be treated appropriately by the existing EFI-to-E820
conversion logic.
Ard Biesheuvel Sept. 12, 2024, 3:45 p.m. UTC | #7
On Thu, 12 Sept 2024 at 17:35, Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 12/09/2024 16:21, Ard Biesheuvel wrote:
> > On Thu, 12 Sept 2024 at 16:29, Breno Leitao <leitao@debian.org> wrote:
> >>
> >> On Thu, Sep 12, 2024 at 03:10:43PM +0200, Ard Biesheuvel wrote:
> >>> On Thu, 12 Sept 2024 at 15:03, Breno Leitao <leitao@debian.org> wrote:
> >>>> On Thu, Sep 12, 2024 at 12:51:57PM +0200, Ard Biesheuvel wrote:
> >>>>> I don't see how this could be an EFI bug, given that it does not deal
> >>>>> with E820 tables at all.
> >>>>
> >>>> I want to back up a little bit and make sure I am following the
> >>>> discussion.
> >>>>
> >>>> From what I understand from previous discussion, we have an EFI bug as
> >>>> the root cause of this issue.
> >>>>
> >>>> This happens because the EFI does NOT mark the EFI TPM event log memory
> >>>> region as reserved (EFI_RESERVED_TYPE).
> >>>
> >>> Why do you think EFI should use EFI_RESERVED_TYPE in this case?
> >>>
> >>> The EFI spec is very clear that EFI_RESERVED_TYPE really shouldn't be
> >>> used for anything by EFI itself. It is quite common for EFI
> >>> configuration tables to be passed as EfiRuntimeServicesData (SMBIOS),
> >>> EfiBootServicesData (ESRT) or EFiAcpiReclaim (ACPI tables).
> >>>
> >>> Reserved memory is mostly for memory that even the firmware does not
> >>> know what it is for, i.e., particular platform specific uses.
> >>>
> >>> In general, it is up to the OS to ensure that EFI configuration tables
> >>> that it cares about should be reserved in the correct way.
> >>
> >> Thanks for the explanation.
> >>
> >> So, if I understand what you meant here, the TPM event log memory range
> >> shouldn't be listed as a memory region in EFI memory map (as passed by
> >> the firmware to the OS).
> >>
> >> Hence, this is not a EFI firmware bug, but a OS/Kernel bug.
> >>
> >> Am I correct with the statements above?
> >>
> >
> > No, not entirely. But I also missed the face that this table is
> > actually created by the EFI stub in Linux, not the firmware. It is
> > *not* the same memory region that the TPM2 ACPI table describes, and
> > so what the various specs say about that is entirely irrelevant.
> >
> > The TPM event log configuration table is created by the EFI stub,
> > which uses the TCG2::GetEventLog () protocol method to obtain it. This
> > must be done by the EFI stub because these protocols will no longer be
> > available once the OS boots. But the data is not used by the EFI stub,
> > only by the OS, which is why it is passed in memory like this.
> >
> > The memory in question is allocated as EFI_LOADER_DATA, and so we are
> > relying on the OS to know that this memory is special, and needs to be
> > preserved.
> >
> > I think the solution here is to use a different memory type:
> >
> > --- a/drivers/firmware/efi/libstub/tpm.c
> > +++ b/drivers/firmware/efi/libstub/tpm.c
> > @@ -96,7 +96,7 @@ static void efi_retrieve_tcg2_eventlog(int version,
> > efi_physical_addr_t log_loca
> >         }
> >
> >         /* Allocate space for the logs and copy them. */
> > -       status = efi_bs_call(allocate_pool, EFI_LOADER_DATA,
> > +       status = efi_bs_call(allocate_pool, EFI_ACPI_RECLAIM_MEMORY,
> >                              sizeof(*log_tbl) + log_size, (void **)&log_tbl);
> >
> >         if (status != EFI_SUCCESS) {
> >
> > which will be treated appropriately by the existing EFI-to-E820
> > conversion logic.
>
> I have tested above diff, and it works! No memory corruption.
>
> The region comes up as ACPI data:
> [    0.000000] BIOS-e820: [mem 0x000000007fb6d000-0x000000007fb7efff] ACPI data
>
> and kexec doesnt interfere with it.
>

Thanks, I'll take that as a tested-by
James Bottomley Sept. 12, 2024, 4:22 p.m. UTC | #8
On Thu, 2024-09-12 at 06:03 -0700, Breno Leitao wrote:
> Hello Ard,
> 
> On Thu, Sep 12, 2024 at 12:51:57PM +0200, Ard Biesheuvel wrote:
> > I don't see how this could be an EFI bug, given that it does not
> > deal with E820 tables at all.
> 
> I want to back up a little bit and make sure I am following the
> discussion.
> 
> From what I understand from previous discussion, we have an EFI bug
> as the root cause of this issue.
> 
> This happens because the EFI does NOT mark the EFI TPM event log
> memory region as reserved (EFI_RESERVED_TYPE). Not having an entry
> for the event table memory in EFI memory mapped, then libstub will
> ignore it completely (the TPM event log memory range) and not
> populate e820 table with it.

Wait, that's not correct.  The TPM log is in memory that doesn't
survive ExitBootServices (by design in case the OS doesn't care about
it).  So the EFI stub actually copies it over to a new configuration
table that is in reserved memory before it calls ExitBootServices. 
This new copy should be in kernel reserved memory regardless of its
e820 map status.

Regards,

James
Usama Arif Sept. 13, 2024, 11:06 a.m. UTC | #9
On 13/09/2024 11:56, Dave Young wrote:
> On Thu, 12 Sept 2024 at 22:15, Ard Biesheuvel <ardb@kernel.org> wrote:
>>
>> (cc Dave)
> 
> Thanks for ccing me.
> 
>>
>> Full thread here:
>> https://lore.kernel.org/all/CAMj1kXG1hbiafKRyC5qM1Vj5X7x-dmLndqqo2AYnHMRxDz-80w@mail.gmail.com/T/#u
>>
>> On Thu, 12 Sept 2024 at 16:05, Ard Biesheuvel <ardb@kernel.org> wrote:
>>>
>>> On Thu, 12 Sept 2024 at 15:55, Usama Arif <usamaarif642@gmail.com> wrote:
>>>>
>>>>
>>>>
>>>> On 12/09/2024 14:10, Ard Biesheuvel wrote:
>>>>> Does the below help at all?
>>>>>
>>>>> --- a/drivers/firmware/efi/tpm.c
>>>>> +++ b/drivers/firmware/efi/tpm.c
>>>>> @@ -60,7 +60,7 @@ int __init efi_tpm_eventlog_init(void)
>>>>>         }
>>>>>
>>>>>         tbl_size = sizeof(*log_tbl) + log_tbl->size;
>>>>> -       memblock_reserve(efi.tpm_log, tbl_size);
>>>>> +       efi_mem_reserve(efi.tpm_log, tbl_size);
>>>>>
>>>>>         if (efi.tpm_final_log == EFI_INVALID_TABLE_ADDR) {
>>>>>                 pr_info("TPM Final Events table not present\n");
>>>>
>>>> Unfortunately not. efi_mem_reserve updates e820_table, while kexec looks at /sys/firmware/memmap
>>>> which is e820_table_firmware.
>>>>
>>>> arch_update_firmware_area introduced in the RFC patch does the same thing as efi_mem_reserve does at
>>>> its end, just with e820_table_firmware instead of e820_table.
>>>> i.e. efi_mem_reserve does:
>>>>         e820__range_update(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
>>>>         e820__update_table(e820_table);
>>>>
>>>> while arch_update_firmware_area does:
>>>>         e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
>>>>         e820__update_table(e820_table_firmware);
>>>>
>>>
>>> Shame.
>>>
>>> Using efi_mem_reserve() is appropriate here in any case, but I guess
>>> kexec on x86 needs to be fixed to juggle the EFI memory map, memblock
>>> table, and 3 (!) versions of the E820 table in the correct way
>>> (e820_table, e820_table_kexec and e820_table_firmware)
>>>
>>> Perhaps we can put this additional logic in x86's implementation of
>>> efi_arch_mem_reserve()? AFAICT, all callers of efi_mem_reserve() deal
>>> with configuration tables produced by the firmware that may not be
>>> reserved correctly if kexec looks at e820_table_firmware[] only.
>>
> 
> I have not read all the conversations,  let me have a look and response later.
> 
> The first glance about the patch is that I think the kexec_file_load
> syscall (default of latest kexec-tools) will not use
> e820_table_firmware AFAIK.  it will only use e820_table_kexec.

I initially thought that as well. But it looks like kexec just reads /sys/firmware/memmap

https://github.com/horms/kexec-tools/blob/main/kexec/firmware_memmap.h#L29

which is e820_table_firmware.

The patch that Ard sent in https://lore.kernel.org/all/20240912155159.1951792-2-ardb+git@google.com/
is the right approach to it I believe, and I dont see the issue anymore after applying that patch.

> 
> Usama, can you confirm how you tested this?
> kexec -c -l  will use kexec_load syscall

I am currently testing in my VM setup with kexec_load. But production is running
kexec_file_load and has the same issue.

Thanks,
Usama

> kexec [-s] -l will use kexec_file_load syscall
> 
> Thanks
> Dave
>
Dave Young Sept. 13, 2024, 11:49 a.m. UTC | #10
On Fri, 13 Sept 2024 at 19:13, Dave Young <dyoung@redhat.com> wrote:
>
> On Fri, 13 Sept 2024 at 19:07, Usama Arif <usamaarif642@gmail.com> wrote:
> >
> >
> >
> > On 13/09/2024 11:56, Dave Young wrote:
> > > On Thu, 12 Sept 2024 at 22:15, Ard Biesheuvel <ardb@kernel.org> wrote:
> > >>
> > >> (cc Dave)
> > >
> > > Thanks for ccing me.
> > >
> > >>
> > >> Full thread here:
> > >> https://lore.kernel.org/all/CAMj1kXG1hbiafKRyC5qM1Vj5X7x-dmLndqqo2AYnHMRxDz-80w@mail.gmail.com/T/#u
> > >>
> > >> On Thu, 12 Sept 2024 at 16:05, Ard Biesheuvel <ardb@kernel.org> wrote:
> > >>>
> > >>> On Thu, 12 Sept 2024 at 15:55, Usama Arif <usamaarif642@gmail.com> wrote:
> > >>>>
> > >>>>
> > >>>>
> > >>>> On 12/09/2024 14:10, Ard Biesheuvel wrote:
> > >>>>> Does the below help at all?
> > >>>>>
> > >>>>> --- a/drivers/firmware/efi/tpm.c
> > >>>>> +++ b/drivers/firmware/efi/tpm.c
> > >>>>> @@ -60,7 +60,7 @@ int __init efi_tpm_eventlog_init(void)
> > >>>>>         }
> > >>>>>
> > >>>>>         tbl_size = sizeof(*log_tbl) + log_tbl->size;
> > >>>>> -       memblock_reserve(efi.tpm_log, tbl_size);
> > >>>>> +       efi_mem_reserve(efi.tpm_log, tbl_size);
> > >>>>>
> > >>>>>         if (efi.tpm_final_log == EFI_INVALID_TABLE_ADDR) {
> > >>>>>                 pr_info("TPM Final Events table not present\n");
> > >>>>
> > >>>> Unfortunately not. efi_mem_reserve updates e820_table, while kexec looks at /sys/firmware/memmap
> > >>>> which is e820_table_firmware.
> > >>>>
> > >>>> arch_update_firmware_area introduced in the RFC patch does the same thing as efi_mem_reserve does at
> > >>>> its end, just with e820_table_firmware instead of e820_table.
> > >>>> i.e. efi_mem_reserve does:
> > >>>>         e820__range_update(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
> > >>>>         e820__update_table(e820_table);
> > >>>>
> > >>>> while arch_update_firmware_area does:
> > >>>>         e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
> > >>>>         e820__update_table(e820_table_firmware);
> > >>>>
> > >>>
> > >>> Shame.
> > >>>
> > >>> Using efi_mem_reserve() is appropriate here in any case, but I guess
> > >>> kexec on x86 needs to be fixed to juggle the EFI memory map, memblock
> > >>> table, and 3 (!) versions of the E820 table in the correct way
> > >>> (e820_table, e820_table_kexec and e820_table_firmware)
> > >>>
> > >>> Perhaps we can put this additional logic in x86's implementation of
> > >>> efi_arch_mem_reserve()? AFAICT, all callers of efi_mem_reserve() deal
> > >>> with configuration tables produced by the firmware that may not be
> > >>> reserved correctly if kexec looks at e820_table_firmware[] only.
> > >>
> > >
> > > I have not read all the conversations,  let me have a look and response later.
> > >
> > > The first glance about the patch is that I think the kexec_file_load
> > > syscall (default of latest kexec-tools) will not use
> > > e820_table_firmware AFAIK.  it will only use e820_table_kexec.
> >
> > I initially thought that as well. But it looks like kexec just reads /sys/firmware/memmap
> >
> > https://github.com/horms/kexec-tools/blob/main/kexec/firmware_memmap.h#L29
> >
> > which is e820_table_firmware.
>
> That piece of code is only used by kexec_load
>
> >
> > The patch that Ard sent in https://lore.kernel.org/all/20240912155159.1951792-2-ardb+git@google.com/
> > is the right approach to it I believe, and I dont see the issue anymore after applying that patch.
> >
> > >
> > > Usama, can you confirm how you tested this?
> > > kexec -c -l  will use kexec_load syscall
> >
> > I am currently testing in my VM setup with kexec_load. But production is running
> > kexec_file_load and has the same issue.
>
> Ok, I mean efi_mem_reserve should be able to work if you retest with
> kexec_file_load.

Hold on,  I'm not sure about above :(

checking the efi_arch_mem_reserve(), currently it updates the e820
table, if you update the e820_table_kexec and e820_table_firmware then
I think both kexec_load and kexec_file_load will work.

Anyway I was not aware very much about the firmware e820 tables and
kexec tables when they were created.   I suspect that a cleanup and
revisit is needed.  I will have a look at that.

For Ard's fix to allocate it as ACPI memory, I think it should be good
and simpler.

>
> >
> > Thanks,
> > Usama
> >
> > > kexec [-s] -l will use kexec_file_load syscall
> > >
> > > Thanks
> > > Dave
> > >
> >
Breno Leitao Sept. 13, 2024, 11:57 a.m. UTC | #11
Hello James,

On Thu, Sep 12, 2024 at 12:22:01PM -0400, James Bottomley wrote:
> On Thu, 2024-09-12 at 06:03 -0700, Breno Leitao wrote:
> > Hello Ard,
> > 
> > On Thu, Sep 12, 2024 at 12:51:57PM +0200, Ard Biesheuvel wrote:
> > > I don't see how this could be an EFI bug, given that it does not
> > > deal with E820 tables at all.
> > 
> > I want to back up a little bit and make sure I am following the
> > discussion.
> > 
> > From what I understand from previous discussion, we have an EFI bug
> > as the root cause of this issue.
> > 
> > This happens because the EFI does NOT mark the EFI TPM event log
> > memory region as reserved (EFI_RESERVED_TYPE). Not having an entry
> > for the event table memory in EFI memory mapped, then libstub will
> > ignore it completely (the TPM event log memory range) and not
> > populate e820 table with it.
> 
> Wait, that's not correct.  The TPM log is in memory that doesn't
> survive ExitBootServices (by design in case the OS doesn't care about
> it).  So the EFI stub actually copies it over to a new configuration
> table that is in reserved memory before it calls ExitBootServices. 
> This new copy should be in kernel reserved memory regardless of its
> e820 map status.

First of all, thanks for clarifying some points here.

How should the TPM log table be passed to the next kernel when
kexecing() since it didn't surive ExitBootServices?
Dave Young Sept. 13, 2024, 12:52 p.m. UTC | #12
Hi Usama,

> > Anyway I was not aware very much about the firmware e820 tables and
> > kexec tables when they were created.   I suspect that a cleanup and
> > revisit is needed.  I will have a look at that.
>
> Yes, I feel like there is one too many tables! From reading the code
> I understand that /sys/firmware/memmap should contain the untouched map
> at time of boot, i.e. e820_table_firmware. But I would be in favour of
> getting rid of e820_table_firmware, and just having e820_table_kexec.
> And /sys/firmware/memmap gets data from e820_table_kexec.

Agreed, I have the same feelings.

Thanks
Dave
Dave Young Sept. 14, 2024, 6:46 a.m. UTC | #13
On Fri, 13 Sept 2024 at 18:56, Dave Young <dyoung@redhat.com> wrote:
>
> On Thu, 12 Sept 2024 at 22:15, Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > (cc Dave)
>
> Thanks for ccing me.
>
> >
> > Full thread here:
> > https://lore.kernel.org/all/CAMj1kXG1hbiafKRyC5qM1Vj5X7x-dmLndqqo2AYnHMRxDz-80w@mail.gmail.com/T/#u
> >
> > On Thu, 12 Sept 2024 at 16:05, Ard Biesheuvel <ardb@kernel.org> wrote:
> > >
> > > On Thu, 12 Sept 2024 at 15:55, Usama Arif <usamaarif642@gmail.com> wrote:
> > > >
> > > >
> > > >
> > > > On 12/09/2024 14:10, Ard Biesheuvel wrote:
> > > > > Does the below help at all?
> > > > >
> > > > > --- a/drivers/firmware/efi/tpm.c
> > > > > +++ b/drivers/firmware/efi/tpm.c
> > > > > @@ -60,7 +60,7 @@ int __init efi_tpm_eventlog_init(void)
> > > > >         }
> > > > >
> > > > >         tbl_size = sizeof(*log_tbl) + log_tbl->size;
> > > > > -       memblock_reserve(efi.tpm_log, tbl_size);
> > > > > +       efi_mem_reserve(efi.tpm_log, tbl_size);
> > > > >
> > > > >         if (efi.tpm_final_log == EFI_INVALID_TABLE_ADDR) {
> > > > >                 pr_info("TPM Final Events table not present\n");
> > > >
> > > > Unfortunately not. efi_mem_reserve updates e820_table, while kexec looks at /sys/firmware/memmap
> > > > which is e820_table_firmware.
> > > >
> > > > arch_update_firmware_area introduced in the RFC patch does the same thing as efi_mem_reserve does at
> > > > its end, just with e820_table_firmware instead of e820_table.
> > > > i.e. efi_mem_reserve does:
> > > >         e820__range_update(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
> > > >         e820__update_table(e820_table);
> > > >
> > > > while arch_update_firmware_area does:
> > > >         e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
> > > >         e820__update_table(e820_table_firmware);
> > > >
> > >
> > > Shame.
> > >
> > > Using efi_mem_reserve() is appropriate here in any case, but I guess
> > > kexec on x86 needs to be fixed to juggle the EFI memory map, memblock
> > > table, and 3 (!) versions of the E820 table in the correct way
> > > (e820_table, e820_table_kexec and e820_table_firmware)
> > >
> > > Perhaps we can put this additional logic in x86's implementation of
> > > efi_arch_mem_reserve()? AFAICT, all callers of efi_mem_reserve() deal
> > > with configuration tables produced by the firmware that may not be
> > > reserved correctly if kexec looks at e820_table_firmware[] only.
> >
>
> I have not read all the conversations,  let me have a look and response later.
>

I'm still confused after reading the code about why this issue can
still happen with a efi_mem_reserve.
Usama, Breno, could any of you share the exact steps on how to
reproduce this issue with a kvm guest?

Thanks
Daev
Ard Biesheuvel Sept. 14, 2024, 8:31 a.m. UTC | #14
On Sat, 14 Sept 2024 at 08:46, Dave Young <dyoung@redhat.com> wrote:
>
> On Fri, 13 Sept 2024 at 18:56, Dave Young <dyoung@redhat.com> wrote:
> >
> > On Thu, 12 Sept 2024 at 22:15, Ard Biesheuvel <ardb@kernel.org> wrote:
> > >
> > > (cc Dave)
> >
> > Thanks for ccing me.
> >
> > >
> > > Full thread here:
> > > https://lore.kernel.org/all/CAMj1kXG1hbiafKRyC5qM1Vj5X7x-dmLndqqo2AYnHMRxDz-80w@mail.gmail.com/T/#u
> > >
> > > On Thu, 12 Sept 2024 at 16:05, Ard Biesheuvel <ardb@kernel.org> wrote:
> > > >
> > > > On Thu, 12 Sept 2024 at 15:55, Usama Arif <usamaarif642@gmail.com> wrote:
> > > > >
> > > > >
> > > > >
> > > > > On 12/09/2024 14:10, Ard Biesheuvel wrote:
> > > > > > Does the below help at all?
> > > > > >
> > > > > > --- a/drivers/firmware/efi/tpm.c
> > > > > > +++ b/drivers/firmware/efi/tpm.c
> > > > > > @@ -60,7 +60,7 @@ int __init efi_tpm_eventlog_init(void)
> > > > > >         }
> > > > > >
> > > > > >         tbl_size = sizeof(*log_tbl) + log_tbl->size;
> > > > > > -       memblock_reserve(efi.tpm_log, tbl_size);
> > > > > > +       efi_mem_reserve(efi.tpm_log, tbl_size);
> > > > > >
> > > > > >         if (efi.tpm_final_log == EFI_INVALID_TABLE_ADDR) {
> > > > > >                 pr_info("TPM Final Events table not present\n");
> > > > >
> > > > > Unfortunately not. efi_mem_reserve updates e820_table, while kexec looks at /sys/firmware/memmap
> > > > > which is e820_table_firmware.
> > > > >
> > > > > arch_update_firmware_area introduced in the RFC patch does the same thing as efi_mem_reserve does at
> > > > > its end, just with e820_table_firmware instead of e820_table.
> > > > > i.e. efi_mem_reserve does:
> > > > >         e820__range_update(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
> > > > >         e820__update_table(e820_table);
> > > > >
> > > > > while arch_update_firmware_area does:
> > > > >         e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
> > > > >         e820__update_table(e820_table_firmware);
> > > > >
> > > >
> > > > Shame.
> > > >
> > > > Using efi_mem_reserve() is appropriate here in any case, but I guess
> > > > kexec on x86 needs to be fixed to juggle the EFI memory map, memblock
> > > > table, and 3 (!) versions of the E820 table in the correct way
> > > > (e820_table, e820_table_kexec and e820_table_firmware)
> > > >
> > > > Perhaps we can put this additional logic in x86's implementation of
> > > > efi_arch_mem_reserve()? AFAICT, all callers of efi_mem_reserve() deal
> > > > with configuration tables produced by the firmware that may not be
> > > > reserved correctly if kexec looks at e820_table_firmware[] only.
> > >
> >
> > I have not read all the conversations,  let me have a look and response later.
> >
>
> I'm still confused after reading the code about why this issue can
> still happen with a efi_mem_reserve.
> Usama, Breno, could any of you share the exact steps on how to
> reproduce this issue with a kvm guest?
>

The code does not use efi_mem_reserve() only memblock_reserve().
Dave Young Sept. 14, 2024, 9:24 a.m. UTC | #15
On Sat, 14 Sept 2024 at 16:31, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Sat, 14 Sept 2024 at 08:46, Dave Young <dyoung@redhat.com> wrote:
> >
> > On Fri, 13 Sept 2024 at 18:56, Dave Young <dyoung@redhat.com> wrote:
> > >
> > > On Thu, 12 Sept 2024 at 22:15, Ard Biesheuvel <ardb@kernel.org> wrote:
> > > >
> > > > (cc Dave)
> > >
> > > Thanks for ccing me.
> > >
> > > >
> > > > Full thread here:
> > > > https://lore.kernel.org/all/CAMj1kXG1hbiafKRyC5qM1Vj5X7x-dmLndqqo2AYnHMRxDz-80w@mail.gmail.com/T/#u
> > > >
> > > > On Thu, 12 Sept 2024 at 16:05, Ard Biesheuvel <ardb@kernel.org> wrote:
> > > > >
> > > > > On Thu, 12 Sept 2024 at 15:55, Usama Arif <usamaarif642@gmail.com> wrote:
> > > > > >
> > > > > >
> > > > > >
> > > > > > On 12/09/2024 14:10, Ard Biesheuvel wrote:
> > > > > > > Does the below help at all?
> > > > > > >
> > > > > > > --- a/drivers/firmware/efi/tpm.c
> > > > > > > +++ b/drivers/firmware/efi/tpm.c
> > > > > > > @@ -60,7 +60,7 @@ int __init efi_tpm_eventlog_init(void)
> > > > > > >         }
> > > > > > >
> > > > > > >         tbl_size = sizeof(*log_tbl) + log_tbl->size;
> > > > > > > -       memblock_reserve(efi.tpm_log, tbl_size);
> > > > > > > +       efi_mem_reserve(efi.tpm_log, tbl_size);
> > > > > > >
> > > > > > >         if (efi.tpm_final_log == EFI_INVALID_TABLE_ADDR) {
> > > > > > >                 pr_info("TPM Final Events table not present\n");
> > > > > >
> > > > > > Unfortunately not. efi_mem_reserve updates e820_table, while kexec looks at /sys/firmware/memmap
> > > > > > which is e820_table_firmware.

Updating e820_table should be good enough, it depends on where the
corruption is happening.

kexec will find a suitable memory for the kernel via searching through
the system ram resources.   So efi_mem_reserve will update e820_table,
then reserve in the resources list as E820_TYPE_RESERVED, thus it
should not be a problem.
During the 2nd kernel boot phase, it is carried as EFI_LOADER_DATA
with EFI_MEMORY_RUNTIME attribute, I think it is also fine,  and later
efi_mem_reserve will be called as what have been done in previous
kernel.

So I think no need to update the e820_table_kexec and e820_table_firmware

> > > > > >
> > > > > > arch_update_firmware_area introduced in the RFC patch does the same thing as efi_mem_reserve does at
> > > > > > its end, just with e820_table_firmware instead of e820_table.
> > > > > > i.e. efi_mem_reserve does:
> > > > > >         e820__range_update(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
> > > > > >         e820__update_table(e820_table);
> > > > > >
> > > > > > while arch_update_firmware_area does:
> > > > > >         e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
> > > > > >         e820__update_table(e820_table_firmware);
> > > > > >
> > > > >
> > > > > Shame.
> > > > >
> > > > > Using efi_mem_reserve() is appropriate here in any case, but I guess
> > > > > kexec on x86 needs to be fixed to juggle the EFI memory map, memblock
> > > > > table, and 3 (!) versions of the E820 table in the correct way
> > > > > (e820_table, e820_table_kexec and e820_table_firmware)
> > > > >
> > > > > Perhaps we can put this additional logic in x86's implementation of
> > > > > efi_arch_mem_reserve()? AFAICT, all callers of efi_mem_reserve() deal
> > > > > with configuration tables produced by the firmware that may not be
> > > > > reserved correctly if kexec looks at e820_table_firmware[] only.
> > > >
> > >
> > > I have not read all the conversations,  let me have a look and response later.
> > >
> >
> > I'm still confused after reading the code about why this issue can
> > still happen with a efi_mem_reserve.
> > Usama, Breno, could any of you share the exact steps on how to
> > reproduce this issue with a kvm guest?
> >
>
> The code does not use efi_mem_reserve() only memblock_reserve().

Yes, I see this, I just thought that Usama tested with changes to
efi_mem_reserve and it still did not work, this is what I'm confused
about.

But maybe Usama did not test and only checked the code and assumed
that we have to update the e820_table_kexec and e820_table_firmware.
See my reply inline above.

Thanks
Dave
>
Dave Young Sept. 14, 2024, 9:46 a.m. UTC | #16
On Sat, 14 Sept 2024 at 17:24, Dave Young <dyoung@redhat.com> wrote:
>
> On Sat, 14 Sept 2024 at 16:31, Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > On Sat, 14 Sept 2024 at 08:46, Dave Young <dyoung@redhat.com> wrote:
> > >
> > > On Fri, 13 Sept 2024 at 18:56, Dave Young <dyoung@redhat.com> wrote:
> > > >
> > > > On Thu, 12 Sept 2024 at 22:15, Ard Biesheuvel <ardb@kernel.org> wrote:
> > > > >
> > > > > (cc Dave)
> > > >
> > > > Thanks for ccing me.
> > > >
> > > > >
> > > > > Full thread here:
> > > > > https://lore.kernel.org/all/CAMj1kXG1hbiafKRyC5qM1Vj5X7x-dmLndqqo2AYnHMRxDz-80w@mail.gmail.com/T/#u
> > > > >
> > > > > On Thu, 12 Sept 2024 at 16:05, Ard Biesheuvel <ardb@kernel.org> wrote:
> > > > > >
> > > > > > On Thu, 12 Sept 2024 at 15:55, Usama Arif <usamaarif642@gmail.com> wrote:
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > On 12/09/2024 14:10, Ard Biesheuvel wrote:
> > > > > > > > Does the below help at all?
> > > > > > > >
> > > > > > > > --- a/drivers/firmware/efi/tpm.c
> > > > > > > > +++ b/drivers/firmware/efi/tpm.c
> > > > > > > > @@ -60,7 +60,7 @@ int __init efi_tpm_eventlog_init(void)
> > > > > > > >         }
> > > > > > > >
> > > > > > > >         tbl_size = sizeof(*log_tbl) + log_tbl->size;
> > > > > > > > -       memblock_reserve(efi.tpm_log, tbl_size);
> > > > > > > > +       efi_mem_reserve(efi.tpm_log, tbl_size);
> > > > > > > >
> > > > > > > >         if (efi.tpm_final_log == EFI_INVALID_TABLE_ADDR) {
> > > > > > > >                 pr_info("TPM Final Events table not present\n");
> > > > > > >
> > > > > > > Unfortunately not. efi_mem_reserve updates e820_table, while kexec looks at /sys/firmware/memmap
> > > > > > > which is e820_table_firmware.
>
> Updating e820_table should be good enough, it depends on where the
> corruption is happening.
>
> kexec will find a suitable memory for the kernel via searching through
> the system ram resources.   So efi_mem_reserve will update e820_table,
> then reserve in the resources list as E820_TYPE_RESERVED, thus it
> should not be a problem.
> During the 2nd kernel boot phase, it is carried as EFI_LOADER_DATA
> with EFI_MEMORY_RUNTIME attribute, I think it is also fine,  and later
> efi_mem_reserve will be called as what have been done in previous
> kernel.
>
> So I think no need to update the e820_table_kexec and e820_table_firmware


Hmm,  oops, I again forgot the kexec_load code in userspace kexec-tools.
The kexec-tools code still searching for memory ranges from e820_table_firmware

>
> > > > > > >
> > > > > > > arch_update_firmware_area introduced in the RFC patch does the same thing as efi_mem_reserve does at
> > > > > > > its end, just with e820_table_firmware instead of e820_table.
> > > > > > > i.e. efi_mem_reserve does:
> > > > > > >         e820__range_update(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
> > > > > > >         e820__update_table(e820_table);
> > > > > > >
> > > > > > > while arch_update_firmware_area does:
> > > > > > >         e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
> > > > > > >         e820__update_table(e820_table_firmware);
> > > > > > >
> > > > > >
> > > > > > Shame.
> > > > > >
> > > > > > Using efi_mem_reserve() is appropriate here in any case, but I guess
> > > > > > kexec on x86 needs to be fixed to juggle the EFI memory map, memblock
> > > > > > table, and 3 (!) versions of the E820 table in the correct way
> > > > > > (e820_table, e820_table_kexec and e820_table_firmware)
> > > > > >
> > > > > > Perhaps we can put this additional logic in x86's implementation of
> > > > > > efi_arch_mem_reserve()? AFAICT, all callers of efi_mem_reserve() deal
> > > > > > with configuration tables produced by the firmware that may not be
> > > > > > reserved correctly if kexec looks at e820_table_firmware[] only.
> > > > >
> > > >
> > > > I have not read all the conversations,  let me have a look and response later.
> > > >
> > >
> > > I'm still confused after reading the code about why this issue can
> > > still happen with a efi_mem_reserve.
> > > Usama, Breno, could any of you share the exact steps on how to
> > > reproduce this issue with a kvm guest?
> > >
> >
> > The code does not use efi_mem_reserve() only memblock_reserve().
>
> Yes, I see this, I just thought that Usama tested with changes to
> efi_mem_reserve and it still did not work, this is what I'm confused
> about.
>
> But maybe Usama did not test and only checked the code and assumed
> that we have to update the e820_table_kexec and e820_table_firmware.
> See my reply inline above.

Please ignore the above comment.   The userspace code does need the
e820_table_firmware.
So the best way to make it easier is to clean up the e820 tables and
maintain only one table then the kernel kexec_file_load behavior will
be the same as the userspace.   But need a closer look about the
details, eg. if the hibernate (mentioned in code comment) is happy.

Or to change userspace to go through the /proc/iomem instead of
checking the /sys/firmware/memmap

>
> Thanks
> Dave
> >
Eric W. Biederman Sept. 16, 2024, 8:20 p.m. UTC | #17
James Bottomley <James.Bottomley@HansenPartnership.com> writes:

> On Fri, 2024-09-13 at 04:57 -0700, Breno Leitao wrote:
>> Hello James,
>> 
>> On Thu, Sep 12, 2024 at 12:22:01PM -0400, James Bottomley wrote:
>> > On Thu, 2024-09-12 at 06:03 -0700, Breno Leitao wrote:
>> > > Hello Ard,
>> > > 
>> > > On Thu, Sep 12, 2024 at 12:51:57PM +0200, Ard Biesheuvel wrote:
>> > > > I don't see how this could be an EFI bug, given that it does
>> > > > not deal with E820 tables at all.
>> > > 
>> > > I want to back up a little bit and make sure I am following the
>> > > discussion.
>> > > 
>> > > From what I understand from previous discussion, we have an EFI
>> > > bug as the root cause of this issue.
>> > > 
>> > > This happens because the EFI does NOT mark the EFI TPM event log
>> > > memory region as reserved (EFI_RESERVED_TYPE). Not having an
>> > > entry for the event table memory in EFI memory mapped, then
>> > > libstub will ignore it completely (the TPM event log memory
>> > > range) and not populate e820 table with it.
>> > 
>> > Wait, that's not correct.  The TPM log is in memory that doesn't
>> > survive ExitBootServices (by design in case the OS doesn't care
>> > about it).  So the EFI stub actually copies it over to a new
>> > configuration table that is in reserved memory before it calls
>> > ExitBootServices.  This new copy should be in kernel reserved
>> > memory regardless of its e820 map status.
>> 
>> First of all, thanks for clarifying some points here.
>> 
>> How should the TPM log table be passed to the next kernel when
>> kexecing() since it didn't surive ExitBootServices?
>
> I've no idea.  I'm assuming you don't elaborately reconstruct the EFI
> boot services, so you can't enter the EFI boot stub before
> ExitBootServices is called?  So I'd guess you want to preserve the EFI
> table that copied the TPM data in to kernel memory.

This leaves two practical questions if I have been following everything
correctly.

1) How to get kexec to avoid picking that memory for the new kernel to
   run in before it initializes itself. (AKA the getting stomped by
   relocate kernel problem).

2) How to point the new kernel to preserved tpm_log.


This recommendation is from memory so it may be a bit off but
the general structure should work.  The idea is as follows.

- Pass the information between kernels.

  It is probably simplest for the kernel to have a command line option
  that tells the kernel the address and size of the tpm_log.

  We have a couple of mechanisms here.  Assuming you are loading a
  bzImage with kexec_file_load you should be able to have the in kernel
  loader to add those arguments to the kernel command line.

- Ensure that when the loader is finding an address to load the new
  kernel it treats the address of the tpm_log as unavailable.

Does that sound like a doable plan?

Eric
Ard Biesheuvel Sept. 17, 2024, 6:45 a.m. UTC | #18
Hi Eric,

Thanks for chiming in.

On Mon, 16 Sept 2024 at 22:21, Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> James Bottomley <James.Bottomley@HansenPartnership.com> writes:
>
> > On Fri, 2024-09-13 at 04:57 -0700, Breno Leitao wrote:
> >> Hello James,
> >>
> >> On Thu, Sep 12, 2024 at 12:22:01PM -0400, James Bottomley wrote:
> >> > On Thu, 2024-09-12 at 06:03 -0700, Breno Leitao wrote:
> >> > > Hello Ard,
> >> > >
> >> > > On Thu, Sep 12, 2024 at 12:51:57PM +0200, Ard Biesheuvel wrote:
> >> > > > I don't see how this could be an EFI bug, given that it does
> >> > > > not deal with E820 tables at all.
> >> > >
> >> > > I want to back up a little bit and make sure I am following the
> >> > > discussion.
> >> > >
> >> > > From what I understand from previous discussion, we have an EFI
> >> > > bug as the root cause of this issue.
> >> > >
> >> > > This happens because the EFI does NOT mark the EFI TPM event log
> >> > > memory region as reserved (EFI_RESERVED_TYPE). Not having an
> >> > > entry for the event table memory in EFI memory mapped, then
> >> > > libstub will ignore it completely (the TPM event log memory
> >> > > range) and not populate e820 table with it.
> >> >
> >> > Wait, that's not correct.  The TPM log is in memory that doesn't
> >> > survive ExitBootServices (by design in case the OS doesn't care
> >> > about it).  So the EFI stub actually copies it over to a new
> >> > configuration table that is in reserved memory before it calls
> >> > ExitBootServices.  This new copy should be in kernel reserved
> >> > memory regardless of its e820 map status.
> >>
> >> First of all, thanks for clarifying some points here.
> >>
> >> How should the TPM log table be passed to the next kernel when
> >> kexecing() since it didn't surive ExitBootServices?
> >
> > I've no idea.  I'm assuming you don't elaborately reconstruct the EFI
> > boot services, so you can't enter the EFI boot stub before
> > ExitBootServices is called?  So I'd guess you want to preserve the EFI
> > table that copied the TPM data in to kernel memory.
>
> This leaves two practical questions if I have been following everything
> correctly.
>
> 1) How to get kexec to avoid picking that memory for the new kernel to
>    run in before it initializes itself. (AKA the getting stomped by
>    relocate kernel problem).
>
> 2) How to point the new kernel to preserved tpm_log.
>
>
> This recommendation is from memory so it may be a bit off but
> the general structure should work.  The idea is as follows.
>
> - Pass the information between kernels.
>
>   It is probably simplest for the kernel to have a command line option
>   that tells the kernel the address and size of the tpm_log.
>
>   We have a couple of mechanisms here.  Assuming you are loading a
>   bzImage with kexec_file_load you should be able to have the in kernel
>   loader to add those arguments to the kernel command line.
>

This shouldn't be necessary, and I think it is actively harmful to
keep inventing special ways for the kexec kernel to learn about these
things that deviate from the methods used by the first kernel. This is
how we ended up with 5 sources of truth for the physical memory map
(EFI memory map, memblock and 3 different versions of the e820 memory
map).

We should try very hard to make kexec idempotent, and reuse the
existing methods where possible. In this case, the EFI configuration
table is already being exposed to the kexec kernel, which describes
the base of the allocation. The size of the allocation can be derived
from the table header.

> - Ensure that when the loader is finding an address to load the new
>   kernel it treats the address of the tpm_log as unavailable.
>

The TPM log is a table created by the EFI stub loader, which is part
of the kernel. So if we need to tweak this for kexec's benefit, I'd
prefer changing it in a way that can accommodate the first kernel too.
However, I think the current method already has that property so I
don't think we need to do anything (modulo fixing the bug)

That said, I am doubtful that the kexec kernel can make meaningful use
of the TPM log to begin with, given that the TPM will be out of sync
at this point. But it is still better to keep it for symmetry, letting
the higher level kexec/kdump logic running in user space reason about
whether the TPM log has any value to it.
Eric W. Biederman Sept. 17, 2024, 3:24 p.m. UTC | #19
Ard Biesheuvel <ardb@kernel.org> writes:

> Hi Eric,
>
> Thanks for chiming in.

It just looked like after James gave some expert input the
conversation got stuck, so I am just trying to move it along.

I don't think anyone knows what this whole elephant looks like,
which makes solving the problem tricky.

> On Mon, 16 Sept 2024 at 22:21, Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> James Bottomley <James.Bottomley@HansenPartnership.com> writes:
>>
>> > On Fri, 2024-09-13 at 04:57 -0700, Breno Leitao wrote:
>> >> Hello James,
>> >>
>> >> On Thu, Sep 12, 2024 at 12:22:01PM -0400, James Bottomley wrote:
>> >> > On Thu, 2024-09-12 at 06:03 -0700, Breno Leitao wrote:
>> >> > > Hello Ard,
>> >> > >
>> >> > > On Thu, Sep 12, 2024 at 12:51:57PM +0200, Ard Biesheuvel wrote:
>> >> > > > I don't see how this could be an EFI bug, given that it does
>> >> > > > not deal with E820 tables at all.
>> >> > >
>> >> > > I want to back up a little bit and make sure I am following the
>> >> > > discussion.
>> >> > >
>> >> > > From what I understand from previous discussion, we have an EFI
>> >> > > bug as the root cause of this issue.
>> >> > >
>> >> > > This happens because the EFI does NOT mark the EFI TPM event log
>> >> > > memory region as reserved (EFI_RESERVED_TYPE). Not having an
>> >> > > entry for the event table memory in EFI memory mapped, then
>> >> > > libstub will ignore it completely (the TPM event log memory
>> >> > > range) and not populate e820 table with it.
>> >> >
>> >> > Wait, that's not correct.  The TPM log is in memory that doesn't
>> >> > survive ExitBootServices (by design in case the OS doesn't care
>> >> > about it).  So the EFI stub actually copies it over to a new
>> >> > configuration table that is in reserved memory before it calls
>> >> > ExitBootServices.  This new copy should be in kernel reserved
>> >> > memory regardless of its e820 map status.
>> >>
>> >> First of all, thanks for clarifying some points here.
>> >>
>> >> How should the TPM log table be passed to the next kernel when
>> >> kexecing() since it didn't surive ExitBootServices?
>> >
>> > I've no idea.  I'm assuming you don't elaborately reconstruct the EFI
>> > boot services, so you can't enter the EFI boot stub before
>> > ExitBootServices is called?  So I'd guess you want to preserve the EFI
>> > table that copied the TPM data in to kernel memory.
>>
>> This leaves two practical questions if I have been following everything
>> correctly.
>>
>> 1) How to get kexec to avoid picking that memory for the new kernel to
>>    run in before it initializes itself. (AKA the getting stomped by
>>    relocate kernel problem).
>>
>> 2) How to point the new kernel to preserved tpm_log.
>>
>>
>> This recommendation is from memory so it may be a bit off but
>> the general structure should work.  The idea is as follows.
>>
>> - Pass the information between kernels.
>>
>>   It is probably simplest for the kernel to have a command line option
>>   that tells the kernel the address and size of the tpm_log.
>>
>>   We have a couple of mechanisms here.  Assuming you are loading a
>>   bzImage with kexec_file_load you should be able to have the in kernel
>>   loader to add those arguments to the kernel command line.
>>
>
> This shouldn't be necessary, and I think it is actively harmful to
> keep inventing special ways for the kexec kernel to learn about these
> things that deviate from the methods used by the first kernel. This is
> how we ended up with 5 sources of truth for the physical memory map
> (EFI memory map, memblock and 3 different versions of the e820 memory
> map).
>
> We should try very hard to make kexec idempotent, and reuse the
> existing methods where possible. In this case, the EFI configuration
> table is already being exposed to the kexec kernel, which describes
> the base of the allocation. The size of the allocation can be derived
> from the table header.
>
>> - Ensure that when the loader is finding an address to load the new
>>   kernel it treats the address of the tpm_log as unavailable.
>>
>
> The TPM log is a table created by the EFI stub loader, which is part
> of the kernel. So if we need to tweak this for kexec's benefit, I'd
> prefer changing it in a way that can accommodate the first kernel too.
> However, I think the current method already has that property so I
> don't think we need to do anything (modulo fixing the bug)

I am fine with not inventing a new mechanism, but I think we need
to reuse whatever mechanism the stub loader uses to pass it's
table to the kernel.  Not the EFI table that disappears at
ExitBootServices().

> That said, I am doubtful that the kexec kernel can make meaningful use
> of the TPM log to begin with, given that the TPM will be out of sync
> at this point. But it is still better to keep it for symmetry, letting
> the higher level kexec/kdump logic running in user space reason about
> whether the TPM log has any value to it.

Someone seems to think so or there would not be a complaint that it is
getting corrupted.

This should not be the kexec-on-panic kernel as that runs in memory
that is reserved solely for it's own use.  So we are talking something
like using kexec as a bootloader.

Eric
Ard Biesheuvel Sept. 17, 2024, 3:35 p.m. UTC | #20
On Tue, 17 Sept 2024 at 17:24, Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Ard Biesheuvel <ardb@kernel.org> writes:
>
> > Hi Eric,
> >
> > Thanks for chiming in.
>
> It just looked like after James gave some expert input the
> conversation got stuck, so I am just trying to move it along.
>
> I don't think anyone knows what this whole elephant looks like,
> which makes solving the problem tricky.
>
> > On Mon, 16 Sept 2024 at 22:21, Eric W. Biederman <ebiederm@xmission.com> wrote:
> >>
...
> >>
> >> This leaves two practical questions if I have been following everything
> >> correctly.
> >>
> >> 1) How to get kexec to avoid picking that memory for the new kernel to
> >>    run in before it initializes itself. (AKA the getting stomped by
> >>    relocate kernel problem).
> >>
> >> 2) How to point the new kernel to preserved tpm_log.
> >>
> >>
> >> This recommendation is from memory so it may be a bit off but
> >> the general structure should work.  The idea is as follows.
> >>
> >> - Pass the information between kernels.
> >>
> >>   It is probably simplest for the kernel to have a command line option
> >>   that tells the kernel the address and size of the tpm_log.
> >>
> >>   We have a couple of mechanisms here.  Assuming you are loading a
> >>   bzImage with kexec_file_load you should be able to have the in kernel
> >>   loader to add those arguments to the kernel command line.
> >>
> >
> > This shouldn't be necessary, and I think it is actively harmful to
> > keep inventing special ways for the kexec kernel to learn about these
> > things that deviate from the methods used by the first kernel. This is
> > how we ended up with 5 sources of truth for the physical memory map
> > (EFI memory map, memblock and 3 different versions of the e820 memory
> > map).
> >
> > We should try very hard to make kexec idempotent, and reuse the
> > existing methods where possible. In this case, the EFI configuration
> > table is already being exposed to the kexec kernel, which describes
> > the base of the allocation. The size of the allocation can be derived
> > from the table header.
> >
> >> - Ensure that when the loader is finding an address to load the new
> >>   kernel it treats the address of the tpm_log as unavailable.
> >>
> >
> > The TPM log is a table created by the EFI stub loader, which is part
> > of the kernel. So if we need to tweak this for kexec's benefit, I'd
> > prefer changing it in a way that can accommodate the first kernel too.
> > However, I think the current method already has that property so I
> > don't think we need to do anything (modulo fixing the bug)
>
> I am fine with not inventing a new mechanism, but I think we need
> to reuse whatever mechanism the stub loader uses to pass it's
> table to the kernel.  Not the EFI table that disappears at
> ExitBootServices().
>

Not sure what you mean here - the EFI table that gets clobbered by
kexec *is* the table that is created by the stub loader to pass the
TPM log to the kernel. Not sure what alternative you have in mind
here.

> > That said, I am doubtful that the kexec kernel can make meaningful use
> > of the TPM log to begin with, given that the TPM will be out of sync
> > at this point. But it is still better to keep it for symmetry, letting
> > the higher level kexec/kdump logic running in user space reason about
> > whether the TPM log has any value to it.
>
> Someone seems to think so or there would not be a complaint that it is
> getting corrupted.
>

No. The problem is that the size of the table is *in* the table, and
so if it gets corrupted, the code that attempts to memblock_reserve()
it goes off into the weeds. But that does not imply there is a point
to having access to this table from a kexec kernel in the first place.

> This should not be the kexec-on-panic kernel as that runs in memory
> that is reserved solely for it's own use.  So we are talking something
> like using kexec as a bootloader.
>

kexec as a bootloader under TPM based measured boot will need to do a
lot more than pass the firmware's event log to the next kernel. I'd
expect a properly engineered kexec to replace this table entirely, and
include the hashes of the assets it has loaded and measured into the
respective PCRs.

But let's stick to solving the actual issue here, rather than
philosophize on how kexec might work in this context.
Eric W. Biederman Sept. 18, 2024, 3:13 a.m. UTC | #21
Ard Biesheuvel <ardb@kernel.org> writes:

> On Tue, 17 Sept 2024 at 17:24, Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> Ard Biesheuvel <ardb@kernel.org> writes:
>>
>> > Hi Eric,
>> >
>> > Thanks for chiming in.
>>
>> It just looked like after James gave some expert input the
>> conversation got stuck, so I am just trying to move it along.
>>
>> I don't think anyone knows what this whole elephant looks like,
>> which makes solving the problem tricky.
>>
>> > On Mon, 16 Sept 2024 at 22:21, Eric W. Biederman <ebiederm@xmission.com> wrote:
>> >>
> ...
>> >>
>> >> This leaves two practical questions if I have been following everything
>> >> correctly.
>> >>
>> >> 1) How to get kexec to avoid picking that memory for the new kernel to
>> >>    run in before it initializes itself. (AKA the getting stomped by
>> >>    relocate kernel problem).
>> >>
>> >> 2) How to point the new kernel to preserved tpm_log.
>> >>
>> >>
>> >> This recommendation is from memory so it may be a bit off but
>> >> the general structure should work.  The idea is as follows.
>> >>
>> >> - Pass the information between kernels.
>> >>
>> >>   It is probably simplest for the kernel to have a command line option
>> >>   that tells the kernel the address and size of the tpm_log.
>> >>
>> >>   We have a couple of mechanisms here.  Assuming you are loading a
>> >>   bzImage with kexec_file_load you should be able to have the in kernel
>> >>   loader to add those arguments to the kernel command line.
>> >>
>> >
>> > This shouldn't be necessary, and I think it is actively harmful to
>> > keep inventing special ways for the kexec kernel to learn about these
>> > things that deviate from the methods used by the first kernel. This is
>> > how we ended up with 5 sources of truth for the physical memory map
>> > (EFI memory map, memblock and 3 different versions of the e820 memory
>> > map).
>> >
>> > We should try very hard to make kexec idempotent, and reuse the
>> > existing methods where possible. In this case, the EFI configuration
>> > table is already being exposed to the kexec kernel, which describes
>> > the base of the allocation. The size of the allocation can be derived
>> > from the table header.
>> >
>> >> - Ensure that when the loader is finding an address to load the new
>> >>   kernel it treats the address of the tpm_log as unavailable.
>> >>
>> >
>> > The TPM log is a table created by the EFI stub loader, which is part
>> > of the kernel. So if we need to tweak this for kexec's benefit, I'd
>> > prefer changing it in a way that can accommodate the first kernel too.
>> > However, I think the current method already has that property so I
>> > don't think we need to do anything (modulo fixing the bug)
>>
>> I am fine with not inventing a new mechanism, but I think we need
>> to reuse whatever mechanism the stub loader uses to pass it's
>> table to the kernel.  Not the EFI table that disappears at
>> ExitBootServices().
>>
>
> Not sure what you mean here - the EFI table that gets clobbered by
> kexec *is* the table that is created by the stub loader to pass the
> TPM log to the kernel. Not sure what alternative you have in mind
> here.

I was referring to whatever the EFI table that James Bottomley mentioned
that I presume the stub loader reads from when the stub loader
constructs the tpm_log that is passed to the kernel.

So I believe we are in agreement of everything except terminology.

>> > That said, I am doubtful that the kexec kernel can make meaningful use
>> > of the TPM log to begin with, given that the TPM will be out of sync
>> > at this point. But it is still better to keep it for symmetry, letting
>> > the higher level kexec/kdump logic running in user space reason about
>> > whether the TPM log has any value to it.
>>
>> Someone seems to think so or there would not be a complaint that it is
>> getting corrupted.
>>
>
> No. The problem is that the size of the table is *in* the table, and
> so if it gets corrupted, the code that attempts to memblock_reserve()
> it goes off into the weeds. But that does not imply there is a point
> to having access to this table from a kexec kernel in the first place.

If there is no point to having access to it then we should just not
pass anything to the loaded kernel, so the kernel does not think there
is anything there.

>> This should not be the kexec-on-panic kernel as that runs in memory
>> that is reserved solely for it's own use.  So we are talking something
>> like using kexec as a bootloader.
>>
>
> kexec as a bootloader under TPM based measured boot will need to do a
> lot more than pass the firmware's event log to the next kernel. I'd
> expect a properly engineered kexec to replace this table entirely, and
> include the hashes of the assets it has loaded and measured into the
> respective PCRs.
>
> But let's stick to solving the actual issue here, rather than
> philosophize on how kexec might work in this context.


I am fine with that.  The complaint I had seen was that the table was
being corrupted and asking how to solve that.  It seems I haven't read
the part of the conversation where it was made clear that no one wants
the tpm_log after kexec.

If someone wants the tpm_log then we need to solve this problem.

Eric
Ard Biesheuvel Sept. 18, 2024, 7:36 a.m. UTC | #22
On Wed, 18 Sept 2024 at 05:14, Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Ard Biesheuvel <ardb@kernel.org> writes:
>
> > On Tue, 17 Sept 2024 at 17:24, Eric W. Biederman <ebiederm@xmission.com> wrote:
> >>
> >> Ard Biesheuvel <ardb@kernel.org> writes:
> >>
> >> > Hi Eric,
> >> >
> >> > Thanks for chiming in.
> >>
> >> It just looked like after James gave some expert input the
> >> conversation got stuck, so I am just trying to move it along.
> >>
> >> I don't think anyone knows what this whole elephant looks like,
> >> which makes solving the problem tricky.
> >>
> >> > On Mon, 16 Sept 2024 at 22:21, Eric W. Biederman <ebiederm@xmission.com> wrote:
> >> >>
> > ...
> >> >>
> >> >> This leaves two practical questions if I have been following everything
> >> >> correctly.
> >> >>
> >> >> 1) How to get kexec to avoid picking that memory for the new kernel to
> >> >>    run in before it initializes itself. (AKA the getting stomped by
> >> >>    relocate kernel problem).
> >> >>
> >> >> 2) How to point the new kernel to preserved tpm_log.
> >> >>
> >> >>
> >> >> This recommendation is from memory so it may be a bit off but
> >> >> the general structure should work.  The idea is as follows.
> >> >>
> >> >> - Pass the information between kernels.
> >> >>
> >> >>   It is probably simplest for the kernel to have a command line option
> >> >>   that tells the kernel the address and size of the tpm_log.
> >> >>
> >> >>   We have a couple of mechanisms here.  Assuming you are loading a
> >> >>   bzImage with kexec_file_load you should be able to have the in kernel
> >> >>   loader to add those arguments to the kernel command line.
> >> >>
> >> >
> >> > This shouldn't be necessary, and I think it is actively harmful to
> >> > keep inventing special ways for the kexec kernel to learn about these
> >> > things that deviate from the methods used by the first kernel. This is
> >> > how we ended up with 5 sources of truth for the physical memory map
> >> > (EFI memory map, memblock and 3 different versions of the e820 memory
> >> > map).
> >> >
> >> > We should try very hard to make kexec idempotent, and reuse the
> >> > existing methods where possible. In this case, the EFI configuration
> >> > table is already being exposed to the kexec kernel, which describes
> >> > the base of the allocation. The size of the allocation can be derived
> >> > from the table header.
> >> >
> >> >> - Ensure that when the loader is finding an address to load the new
> >> >>   kernel it treats the address of the tpm_log as unavailable.
> >> >>
> >> >
> >> > The TPM log is a table created by the EFI stub loader, which is part
> >> > of the kernel. So if we need to tweak this for kexec's benefit, I'd
> >> > prefer changing it in a way that can accommodate the first kernel too.
> >> > However, I think the current method already has that property so I
> >> > don't think we need to do anything (modulo fixing the bug)
> >>
> >> I am fine with not inventing a new mechanism, but I think we need
> >> to reuse whatever mechanism the stub loader uses to pass it's
> >> table to the kernel.  Not the EFI table that disappears at
> >> ExitBootServices().
> >>
> >
> > Not sure what you mean here - the EFI table that gets clobbered by
> > kexec *is* the table that is created by the stub loader to pass the
> > TPM log to the kernel. Not sure what alternative you have in mind
> > here.
>
> I was referring to whatever the EFI table that James Bottomley mentioned
> that I presume the stub loader reads from when the stub loader
> constructs the tpm_log that is passed to the kernel.
>

There is no such table. The event log is exposed by the firmware via a
TCG2 protocol interface, which is no longer available after boot. So
the stub loader (which is the last kernel component that has access to
this interface) invokes this protocol and copies the output into a
table in memory which is exposed to the kernel proper as a EFI
configuration table.

So the main issue here is that EFI configuration tables are passed on
to kexec kernels, and we have to ensure (in the general case) that the
associated memory is not reused. The implication is that the stub
loader should always use EFI_ACPI_RECLAIM_MEMORY for allocations that
are referenced via EFI configuration tables, and doing so for this
table makes the bug go away.


> So I believe we are in agreement of everything except terminology.
>

Sure.

> >> > That said, I am doubtful that the kexec kernel can make meaningful use
> >> > of the TPM log to begin with, given that the TPM will be out of sync
> >> > at this point. But it is still better to keep it for symmetry, letting
> >> > the higher level kexec/kdump logic running in user space reason about
> >> > whether the TPM log has any value to it.
> >>
> >> Someone seems to think so or there would not be a complaint that it is
> >> getting corrupted.
> >>
> >
> > No. The problem is that the size of the table is *in* the table, and
> > so if it gets corrupted, the code that attempts to memblock_reserve()
> > it goes off into the weeds. But that does not imply there is a point
> > to having access to this table from a kexec kernel in the first place.
>
> If there is no point to having access to it then we should just not
> pass anything to the loaded kernel, so the kernel does not think there
> is anything there.
>
> >> This should not be the kexec-on-panic kernel as that runs in memory
> >> that is reserved solely for it's own use.  So we are talking something
> >> like using kexec as a bootloader.
> >>
> >
> > kexec as a bootloader under TPM based measured boot will need to do a
> > lot more than pass the firmware's event log to the next kernel. I'd
> > expect a properly engineered kexec to replace this table entirely, and
> > include the hashes of the assets it has loaded and measured into the
> > respective PCRs.
> >
> > But let's stick to solving the actual issue here, rather than
> > philosophize on how kexec might work in this context.
>
>
> I am fine with that.  The complaint I had seen was that the table was
> being corrupted and asking how to solve that.  It seems I haven't read
> the part of the conversation where it was made clear that no one wants
> the tpm_log after kexec.
>

It was not made clear, that is why I raised the question. I argued
that the TPM log has limited utility after a kexec, given that we will
be in one of two situations:
- the kexec boot chain cares about the TPM and measured boot, and will
therefore have extended the TPM's PCRs and the TPM log will be out of
sync;
- the kexec boot chain does not care, and so there is no point in
forwarding the TPM log.

Perhaps there is a third case where kdump wants to inspect the TPM log
that the crashed kernel accessed? But this is rather speculative.

> If someone wants the tpm_log then we need to solve this problem.
>

Agreed.
Jonathan McDowell Oct. 9, 2024, 9:10 a.m. UTC | #23
On Wed, Sep 18, 2024 at 09:36:06AM +0200, Ard Biesheuvel wrote:
> On Wed, 18 Sept 2024 at 05:14, Eric W. Biederman <ebiederm@xmission.com> wrote:
> > Ard Biesheuvel <ardb@kernel.org> writes:
> > > On Tue, 17 Sept 2024 at 17:24, Eric W. Biederman <ebiederm@xmission.com> wrote:
> > >> Ard Biesheuvel <ardb@kernel.org> writes:

> > >> This should not be the kexec-on-panic kernel as that runs in memory
> > >> that is reserved solely for it's own use.  So we are talking something
> > >> like using kexec as a bootloader.
> > >
> > > kexec as a bootloader under TPM based measured boot will need to do a
> > > lot more than pass the firmware's event log to the next kernel. I'd
> > > expect a properly engineered kexec to replace this table entirely, and
> > > include the hashes of the assets it has loaded and measured into the
> > > respective PCRs.
> > >
> > > But let's stick to solving the actual issue here, rather than
> > > philosophize on how kexec might work in this context.
> >
> > I am fine with that.  The complaint I had seen was that the table was
> > being corrupted and asking how to solve that.  It seems I haven't read
> > the part of the conversation where it was made clear that no one wants
> > the tpm_log after kexec.
> >
> It was not made clear, that is why I raised the question. I argued
> that the TPM log has limited utility after a kexec, given that we will
> be in one of two situations:
> - the kexec boot chain cares about the TPM and measured boot, and will
> therefore have extended the TPM's PCRs and the TPM log will be out of
> sync;
> - the kexec boot chain does not care, and so there is no point in
> forwarding the TPM log.
> 
> Perhaps there is a third case where kdump wants to inspect the TPM log
> that the crashed kernel accessed? But this is rather speculative.

Generally the kernel/host OS and the firmware are touching different
PCRs in the TPM.

The firmware eventlog covers what the firmware/bootloader measured;
itself, option ROMs, secure boot details, bootloader, initial
kernel/initrd (if we're talking grub as the initial bootloader). These
details are not changed by a kexec, and provide the anchor of the
software trust chain.

A kexec'd kernel will generally not touch the same PCRs. The primary way
I know to carry kexec measurements / logs over to new kernels is using
IMA, which will be configured to use one of the later PCRs (default of
10).

That means that the firmware event log is still completely valid to
subsequent kernels, and is still required to validate the
firmware/bootloader trust chain. You then probably _also_ want to make
use of the IMA log to validate the kexec'd kernel chain, but that's
separate.

> > If someone wants the tpm_log then we need to solve this problem.
> Agreed.

There's a real requirement and use for kexec'd kernels to be able to
continue to access the firmware TPM event log; to the extent that there
are also patches floating around to have this carried over via device
tree on non-UEFI platforms.

J.
Breno Leitao Oct. 9, 2024, 10:46 a.m. UTC | #24
On Wed, Oct 09, 2024 at 10:10:57AM +0100, Jonathan McDowell wrote:
> On Wed, Sep 18, 2024 at 09:36:06AM +0200, Ard Biesheuvel wrote:
> > On Wed, 18 Sept 2024 at 05:14, Eric W. Biederman <ebiederm@xmission.com> wrote:
> > > Ard Biesheuvel <ardb@kernel.org> writes:
> > > > On Tue, 17 Sept 2024 at 17:24, Eric W. Biederman <ebiederm@xmission.com> wrote:
> > > >> Ard Biesheuvel <ardb@kernel.org> writes:
> 
> > > >> This should not be the kexec-on-panic kernel as that runs in memory
> > > >> that is reserved solely for it's own use.  So we are talking something
> > > >> like using kexec as a bootloader.
> > > >
> > > > kexec as a bootloader under TPM based measured boot will need to do a
> > > > lot more than pass the firmware's event log to the next kernel. I'd
> > > > expect a properly engineered kexec to replace this table entirely, and
> > > > include the hashes of the assets it has loaded and measured into the
> > > > respective PCRs.
> > > >
> > > > But let's stick to solving the actual issue here, rather than
> > > > philosophize on how kexec might work in this context.
> > >
> > > I am fine with that.  The complaint I had seen was that the table was
> > > being corrupted and asking how to solve that.  It seems I haven't read
> > > the part of the conversation where it was made clear that no one wants
> > > the tpm_log after kexec.
> > >
> > It was not made clear, that is why I raised the question. I argued
> > that the TPM log has limited utility after a kexec, given that we will
> > be in one of two situations:
> > - the kexec boot chain cares about the TPM and measured boot, and will
> > therefore have extended the TPM's PCRs and the TPM log will be out of
> > sync;
> > - the kexec boot chain does not care, and so there is no point in
> > forwarding the TPM log.
> > 
> > Perhaps there is a third case where kdump wants to inspect the TPM log
> > that the crashed kernel accessed? But this is rather speculative.
> 
> Generally the kernel/host OS and the firmware are touching different
> PCRs in the TPM.
> 
> The firmware eventlog covers what the firmware/bootloader measured;
> itself, option ROMs, secure boot details, bootloader, initial
> kernel/initrd (if we're talking grub as the initial bootloader). These
> details are not changed by a kexec, and provide the anchor of the
> software trust chain.
> 
> A kexec'd kernel will generally not touch the same PCRs. The primary way
> I know to carry kexec measurements / logs over to new kernels is using
> IMA, which will be configured to use one of the later PCRs (default of
> 10).

What about in the case where you don't have Grub, but, use the kernel as
the bootloader, kexecing into the desired kernel?

Will the bootloader-kernel touch the same PCRs as GRUB, or it will only
touch PCRs above 10?

Thanks
--breno
Jonathan McDowell Oct. 9, 2024, 2:05 p.m. UTC | #25
On Wed, Oct 09, 2024 at 03:46:32AM -0700, Breno Leitao wrote:
> On Wed, Oct 09, 2024 at 10:10:57AM +0100, Jonathan McDowell wrote:
> > On Wed, Sep 18, 2024 at 09:36:06AM +0200, Ard Biesheuvel wrote:
> > > On Wed, 18 Sept 2024 at 05:14, Eric W. Biederman <ebiederm@xmission.com> wrote:
> > > > Ard Biesheuvel <ardb@kernel.org> writes:
> > > > > On Tue, 17 Sept 2024 at 17:24, Eric W. Biederman <ebiederm@xmission.com> wrote:
> > > > >> Ard Biesheuvel <ardb@kernel.org> writes:
> > 
> > > > >> This should not be the kexec-on-panic kernel as that runs in memory
> > > > >> that is reserved solely for it's own use.  So we are talking something
> > > > >> like using kexec as a bootloader.
> > > > >
> > > > > kexec as a bootloader under TPM based measured boot will need to do a
> > > > > lot more than pass the firmware's event log to the next kernel. I'd
> > > > > expect a properly engineered kexec to replace this table entirely, and
> > > > > include the hashes of the assets it has loaded and measured into the
> > > > > respective PCRs.
> > > > >
> > > > > But let's stick to solving the actual issue here, rather than
> > > > > philosophize on how kexec might work in this context.
> > > >
> > > > I am fine with that.  The complaint I had seen was that the table was
> > > > being corrupted and asking how to solve that.  It seems I haven't read
> > > > the part of the conversation where it was made clear that no one wants
> > > > the tpm_log after kexec.
> > > >
> > > It was not made clear, that is why I raised the question. I argued
> > > that the TPM log has limited utility after a kexec, given that we will
> > > be in one of two situations:
> > > - the kexec boot chain cares about the TPM and measured boot, and will
> > > therefore have extended the TPM's PCRs and the TPM log will be out of
> > > sync;
> > > - the kexec boot chain does not care, and so there is no point in
> > > forwarding the TPM log.
> > > 
> > > Perhaps there is a third case where kdump wants to inspect the TPM log
> > > that the crashed kernel accessed? But this is rather speculative.
> > 
> > Generally the kernel/host OS and the firmware are touching different
> > PCRs in the TPM.
> > 
> > The firmware eventlog covers what the firmware/bootloader measured;
> > itself, option ROMs, secure boot details, bootloader, initial
> > kernel/initrd (if we're talking grub as the initial bootloader). These
> > details are not changed by a kexec, and provide the anchor of the
> > software trust chain.
> > 
> > A kexec'd kernel will generally not touch the same PCRs. The primary way
> > I know to carry kexec measurements / logs over to new kernels is using
> > IMA, which will be configured to use one of the later PCRs (default of
> > 10).
> 
> What about in the case where you don't have Grub, but, use the kernel as
> the bootloader, kexecing into the desired kernel?
> 
> Will the bootloader-kernel touch the same PCRs as GRUB, or it will only
> touch PCRs above 10?

A kernel kexecing into another will generally use IMA if it wants to
measure into the TPM, which will use PCR 10 by default and not conflict
with the firmware PCRs (and you then use the IMA integrity log, which is
passed over a kexec, to work out the kexec side of things).

You still need the firmware event log in that case because the
"bootloader" kernel combo you load is measured into the TPM by the
firmware.

You _could_ technically configure things up to re-use some of the
firmware PCRs, but it generally wouldn't make a lot of sense to do so
and I've not seen any examples of that sort of configuration.

J.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 2e74a7f0e935..4e9aa24f03bd 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -16,6 +16,8 @@  extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type);
 
 extern void e820__range_add   (u64 start, u64 size, enum e820_type type);
 extern u64  e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type);
+extern u64  e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type,
+					enum e820_type new_type);
 extern u64  e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type);
 extern u64  e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type);
 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 4893d30ce438..912400161623 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -538,6 +538,12 @@  u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size,
 	return __e820__range_update(t, start, size, old_type, new_type);
 }
 
+u64 __init e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type,
+				       enum e820_type new_type)
+{
+	return __e820__range_update(e820_table_firmware, start, size, old_type, new_type);
+}
+
 /* Remove a range of memory from the E820 table: */
 u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
 {
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 88a96816de9a..aa95f77d7a30 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -171,6 +171,15 @@  static void __init do_add_efi_memmap(void)
 	e820__update_table(e820_table);
 }
 
+/* Reserve firmware area if it was marked as RAM */
+void arch_update_firmware_area(u64 addr, u64 size)
+{
+	if (e820__get_entry_type(addr, addr + size) == E820_TYPE_RAM) {
+		e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
+		e820__update_table(e820_table_firmware);
+	}
+}
+
 /*
  * Given add_efi_memmap defaults to 0 and there is no alternative
  * e820 mechanism for soft-reserved memory, import the full EFI memory
diff --git a/drivers/firmware/efi/tpm.c b/drivers/firmware/efi/tpm.c
index e8d69bd548f3..8e6e7131d718 100644
--- a/drivers/firmware/efi/tpm.c
+++ b/drivers/firmware/efi/tpm.c
@@ -60,6 +60,7 @@  int __init efi_tpm_eventlog_init(void)
 	}
 
 	tbl_size = sizeof(*log_tbl) + log_tbl->size;
+	arch_update_firmware_area(efi.tpm_log, tbl_size);
 	memblock_reserve(efi.tpm_log, tbl_size);
 
 	if (efi.tpm_final_log == EFI_INVALID_TABLE_ADDR) {
@@ -107,4 +108,3 @@  int __init efi_tpm_eventlog_init(void)
 	early_memunmap(log_tbl, sizeof(*log_tbl));
 	return ret;
 }
-
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 6bf3c4fe8511..9c239cdff771 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1371,4 +1371,11 @@  extern struct blocking_notifier_head efivar_ops_nh;
 void efivars_generic_ops_register(void);
 void efivars_generic_ops_unregister(void);
 
+#ifdef CONFIG_X86_64
+void __init arch_update_firmware_area(u64 addr, u64 size);
+#else
+static inline void __init arch_update_firmware_area(u64 addr, u64 size)
+{
+}
+#endif
 #endif /* _LINUX_EFI_H */