diff mbox

[RFC] arm64/efi: use id mapping for Runtime Services

Message ID 1406815909-26825-1-git-send-email-ard.biesheuvel@linaro.org
State New
Headers show

Commit Message

Ard Biesheuvel July 31, 2014, 2:11 p.m. UTC
There are 2 interesting pieces of information in the UEFI spec section 2.3.6
regarding the mapping of runtime regions:
(a) the firmware should not request a virtual mapping for configuration tables,
    even though they are marked as EfiRuntimeServicesData;
(b) calling SetVirtualAddressMap() is optional, and it is equally appropriate to
    call Runtime Services using an identity mapping.

So we can eliminate some of the complexity around UEFI Runtime Services by not
using a virtual mapping at all, and calling the services at their physical
address. This is especially useful under kexec, as SetVirtualAddressMap() may
only be called once, and there is no guarantee that mappings are stable between
different kexec'd kernels.

The fallout for other in-kernel users of UEFI data structures should be
negligible, as they cannot legally access those data structures through
pre-existing virtual mappings anyway (point (a) above)

It should also be noted that, as the kernel side of the address space (TTBR1) is
retained, the stack and pointer function arguments remain accessible to the
runtime service while the id mapping is active.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/include/asm/efi.h |  24 ++++++++--
 arch/arm64/kernel/efi.c      | 106 ++-----------------------------------------
 2 files changed, 23 insertions(+), 107 deletions(-)

Comments

Mark Salter July 31, 2014, 3:02 p.m. UTC | #1
On Thu, 2014-07-31 at 16:11 +0200, Ard Biesheuvel wrote:
> There are 2 interesting pieces of information in the UEFI spec section 2.3.6
> regarding the mapping of runtime regions:
> (a) the firmware should not request a virtual mapping for configuration tables,
>     even though they are marked as EfiRuntimeServicesData;
> (b) calling SetVirtualAddressMap() is optional, and it is equally appropriate to
>     call Runtime Services using an identity mapping.
> 
> So we can eliminate some of the complexity around UEFI Runtime Services by not
> using a virtual mapping at all, and calling the services at their physical
> address. This is especially useful under kexec, as SetVirtualAddressMap() may
> only be called once, and there is no guarantee that mappings are stable between
> different kexec'd kernels.
> 
> The fallout for other in-kernel users of UEFI data structures should be
> negligible, as they cannot legally access those data structures through
> pre-existing virtual mappings anyway (point (a) above)
> 
> It should also be noted that, as the kernel side of the address space (TTBR1) is
> retained, the stack and pointer function arguments remain accessible to the
> runtime service while the id mapping is active.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  arch/arm64/include/asm/efi.h |  24 ++++++++--
>  arch/arm64/kernel/efi.c      | 106 ++-----------------------------------------
>  2 files changed, 23 insertions(+), 107 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
> index a34fd3b12e2b..d42a21e79b39 100644
> --- a/arch/arm64/include/asm/efi.h
> +++ b/arch/arm64/include/asm/efi.h
> @@ -1,8 +1,10 @@
>  #ifndef _ASM_EFI_H
>  #define _ASM_EFI_H
>  
> +#include <asm/cacheflush.h>
>  #include <asm/io.h>
>  #include <asm/neon.h>
> +#include <asm/tlbflush.h>
>  
>  #ifdef CONFIG_EFI
>  extern void efi_init(void);
> @@ -12,23 +14,37 @@ extern void efi_idmap_init(void);
>  #define efi_idmap_init()
>  #endif
>  
> +static inline void switch_pgd(pgd_t *pgd, struct mm_struct *mm)
> +{
> +	cpu_switch_mm(pgd, mm);
> +	flush_tlb_all();
> +	if (icache_is_aivivt())
> +		__flush_icache_all();
> +}
> +
>  #define efi_call_virt(f, ...)						\
>  ({									\
> -	efi_##f##_t *__f = efi.systab->runtime->f;			\
> +	efi_##f##_t *__f;						\
>  	efi_status_t __s;						\
>  									\
> -	kernel_neon_begin();						\
> +	kernel_neon_begin(); /* disables preemption */			\
> +	switch_pgd(idmap_pg_dir, &init_mm);				\
> +	__f =  efi.systab->runtime->f;					\
>  	__s = __f(__VA_ARGS__);						\
> +	switch_pgd(current->active_mm->pgd, current->active_mm);	\
>  	kernel_neon_end();						\
>  	__s;								\
>  })
>  
>  #define __efi_call_virt(f, ...)						\
>  ({									\
> -	efi_##f##_t *__f = efi.systab->runtime->f;			\
> +	efi_##f##_t *__f;						\
>  									\
> -	kernel_neon_begin();						\
> +	kernel_neon_begin(); /* disables preemption */			\
> +	switch_pgd(idmap_pg_dir, &init_mm);				\
> +	__f =  efi.systab->runtime->f;					\
>  	__f(__VA_ARGS__);						\
> +	switch_pgd(current->active_mm->pgd, current->active_mm);	\
>  	kernel_neon_end();						\
>  })

If you replace the current user pgd with idmap pgd and there is
an exception in the firmware which would lead to the user task
being killed, what happens?

>  
> diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
> index e72f3100958f..d620a031e7bf 100644
> --- a/arch/arm64/kernel/efi.c
> +++ b/arch/arm64/kernel/efi.c
> @@ -27,8 +27,6 @@
>  
>  struct efi_memory_map memmap;
>  
> -static efi_runtime_services_t *runtime;
> -
>  static u64 efi_system_table;
>  
>  static int uefi_debug __initdata;
> @@ -340,51 +338,9 @@ void __init efi_idmap_init(void)
>  	efi_setup_idmap();
>  }
>  
> -static int __init remap_region(efi_memory_desc_t *md, void **new)
> -{
> -	u64 paddr, vaddr, npages, size;
> -
> -	paddr = md->phys_addr;
> -	npages = md->num_pages;
> -	memrange_efi_to_native(&paddr, &npages);
> -	size = npages << PAGE_SHIFT;
> -
> -	if (is_normal_ram(md))
> -		vaddr = (__force u64)ioremap_cache(paddr, size);
> -	else
> -		vaddr = (__force u64)ioremap(paddr, size);
> -
> -	if (!vaddr) {
> -		pr_err("Unable to remap 0x%llx pages @ %p\n",
> -		       npages, (void *)paddr);
> -		return 0;
> -	}
> -
> -	/* adjust for any rounding when EFI and system pagesize differs */
> -	md->virt_addr = vaddr + (md->phys_addr - paddr);
> -
> -	if (uefi_debug)
> -		pr_info("  EFI remap 0x%012llx => %p\n",
> -			md->phys_addr, (void *)md->virt_addr);
> -
> -	memcpy(*new, md, memmap.desc_size);
> -	*new += memmap.desc_size;
> -
> -	return 1;
> -}
> -
> -/*
> - * Switch UEFI from an identity map to a kernel virtual map
> - */
>  static int __init arm64_enter_virtual_mode(void)
>  {
> -	efi_memory_desc_t *md;
> -	phys_addr_t virtmap_phys;
> -	void *virtmap, *virt_md;
> -	efi_status_t status;
>  	u64 mapsize;
> -	int count = 0;
> -	unsigned long flags;
>  
>  	if (!efi_enabled(EFI_BOOT)) {
>  		pr_info("EFI services will not be available.\n");
> @@ -402,76 +358,20 @@ static int __init arm64_enter_virtual_mode(void)
>  
>  	efi.memmap = &memmap;
>  
> -	/* Map the runtime regions */
> -	virtmap = kmalloc(mapsize, GFP_KERNEL);
> -	if (!virtmap) {
> -		pr_err("Failed to allocate EFI virtual memmap\n");
> -		return -1;
> -	}
> -	virtmap_phys = virt_to_phys(virtmap);
> -	virt_md = virtmap;
> -
> -	for_each_efi_memory_desc(&memmap, md) {
> -		if (!(md->attribute & EFI_MEMORY_RUNTIME))
> -			continue;
> -		if (!remap_region(md, &virt_md))
> -			goto err_unmap;
> -		++count;
> -	}
> -
> -	efi.systab = (__force void *)efi_lookup_mapped_addr(efi_system_table);
> +	efi.systab = (__force void *)ioremap_cache(efi_system_table,
> +						   sizeof(efi_system_table_t));
>  	if (!efi.systab) {
> -		/*
> -		 * If we have no virtual mapping for the System Table at this
> -		 * point, the memory map doesn't cover the physical offset where
> -		 * it resides. This means the System Table will be inaccessible
> -		 * to Runtime Services themselves once the virtual mapping is
> -		 * installed.
> -		 */
>  		pr_err("Failed to remap EFI System Table -- buggy firmware?\n");
> -		goto err_unmap;
> +		return -1;
>  	}
>  	set_bit(EFI_SYSTEM_TABLES, &efi.flags);
>  
> -	local_irq_save(flags);
> -	cpu_switch_mm(idmap_pg_dir, &init_mm);
> -
> -	/* Call SetVirtualAddressMap with the physical address of the map */
> -	runtime = efi.systab->runtime;
> -	efi.set_virtual_address_map = runtime->set_virtual_address_map;
> -
> -	status = efi.set_virtual_address_map(count * memmap.desc_size,
> -					     memmap.desc_size,
> -					     memmap.desc_version,
> -					     (efi_memory_desc_t *)virtmap_phys);
> -	cpu_set_reserved_ttbr0();
> -	flush_tlb_all();
> -	local_irq_restore(flags);
> -
> -	kfree(virtmap);
> -
>  	free_boot_services();
>  
> -	if (status != EFI_SUCCESS) {
> -		pr_err("Failed to set EFI virtual address map! [%lx]\n",
> -			status);
> -		return -1;
> -	}
> -
>  	/* Set up runtime services function pointers */
> -	runtime = efi.systab->runtime;
>  	efi_native_runtime_setup();
>  	set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
>  
>  	return 0;
> -
> -err_unmap:
> -	/* unmap all mappings that succeeded: there are 'count' of those */
> -	for (virt_md = virtmap; count--; virt_md += memmap.desc_size) {
> -		md = virt_md;
> -		iounmap((__force void __iomem *)md->virt_addr);
> -	}
> -	kfree(virtmap);
> -	return -1;
>  }
>  early_initcall(arm64_enter_virtual_mode);
Ard Biesheuvel July 31, 2014, 6:37 p.m. UTC | #2
On 31 July 2014 17:02, Mark Salter <msalter@redhat.com> wrote:
> On Thu, 2014-07-31 at 16:11 +0200, Ard Biesheuvel wrote:
>> There are 2 interesting pieces of information in the UEFI spec section 2.3.6
>> regarding the mapping of runtime regions:
>> (a) the firmware should not request a virtual mapping for configuration tables,
>>     even though they are marked as EfiRuntimeServicesData;
>> (b) calling SetVirtualAddressMap() is optional, and it is equally appropriate to
>>     call Runtime Services using an identity mapping.
>>
>> So we can eliminate some of the complexity around UEFI Runtime Services by not
>> using a virtual mapping at all, and calling the services at their physical
>> address. This is especially useful under kexec, as SetVirtualAddressMap() may
>> only be called once, and there is no guarantee that mappings are stable between
>> different kexec'd kernels.
>>
>> The fallout for other in-kernel users of UEFI data structures should be
>> negligible, as they cannot legally access those data structures through
>> pre-existing virtual mappings anyway (point (a) above)
>>
>> It should also be noted that, as the kernel side of the address space (TTBR1) is
>> retained, the stack and pointer function arguments remain accessible to the
>> runtime service while the id mapping is active.
>>
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>> ---
>>  arch/arm64/include/asm/efi.h |  24 ++++++++--
>>  arch/arm64/kernel/efi.c      | 106 ++-----------------------------------------
>>  2 files changed, 23 insertions(+), 107 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
>> index a34fd3b12e2b..d42a21e79b39 100644
>> --- a/arch/arm64/include/asm/efi.h
>> +++ b/arch/arm64/include/asm/efi.h
>> @@ -1,8 +1,10 @@
>>  #ifndef _ASM_EFI_H
>>  #define _ASM_EFI_H
>>
>> +#include <asm/cacheflush.h>
>>  #include <asm/io.h>
>>  #include <asm/neon.h>
>> +#include <asm/tlbflush.h>
>>
>>  #ifdef CONFIG_EFI
>>  extern void efi_init(void);
>> @@ -12,23 +14,37 @@ extern void efi_idmap_init(void);
>>  #define efi_idmap_init()
>>  #endif
>>
>> +static inline void switch_pgd(pgd_t *pgd, struct mm_struct *mm)
>> +{
>> +     cpu_switch_mm(pgd, mm);
>> +     flush_tlb_all();
>> +     if (icache_is_aivivt())
>> +             __flush_icache_all();
>> +}
>> +
>>  #define efi_call_virt(f, ...)                                                \
>>  ({                                                                   \
>> -     efi_##f##_t *__f = efi.systab->runtime->f;                      \
>> +     efi_##f##_t *__f;                                               \
>>       efi_status_t __s;                                               \
>>                                                                       \
>> -     kernel_neon_begin();                                            \
>> +     kernel_neon_begin(); /* disables preemption */                  \
>> +     switch_pgd(idmap_pg_dir, &init_mm);                             \
>> +     __f =  efi.systab->runtime->f;                                  \
>>       __s = __f(__VA_ARGS__);                                         \
>> +     switch_pgd(current->active_mm->pgd, current->active_mm);        \
>>       kernel_neon_end();                                              \
>>       __s;                                                            \
>>  })
>>
>>  #define __efi_call_virt(f, ...)                                              \
>>  ({                                                                   \
>> -     efi_##f##_t *__f = efi.systab->runtime->f;                      \
>> +     efi_##f##_t *__f;                                               \
>>                                                                       \
>> -     kernel_neon_begin();                                            \
>> +     kernel_neon_begin(); /* disables preemption */                  \
>> +     switch_pgd(idmap_pg_dir, &init_mm);                             \
>> +     __f =  efi.systab->runtime->f;                                  \
>>       __f(__VA_ARGS__);                                               \
>> +     switch_pgd(current->active_mm->pgd, current->active_mm);        \
>>       kernel_neon_end();                                              \
>>  })
>
> If you replace the current user pgd with idmap pgd and there is
> an exception in the firmware which would lead to the user task
> being killed, what happens?
>

Changing the function pointer for GetNextVariable() to 0 gives me this:

root@genericarmv8:~# insmod ./efivars.ko
EFI Variables Facility v0.08 2004-May-17
Bad mode in Synchronous Abort handler detected, code 0x86000006
CPU: 0 PID: 829 Comm: insmod Not tainted 3.16.0-rc4+ #48
task: ffffffc8787d2c00 ti: ffffffc07e68c000 task.ti: ffffffc07e68c000
PC is at 0x0
LR is at virt_efi_get_next_variable+0x84/0xe8
pc : [<0000000000000000>] lr : [<ffffffc0003d5910>] pstate: 800001c5
sp : ffffffc07e68fb40
x29: ffffffc07e68fb40 x28: 0000000000000001
x27: ffffffc0006a3000 x26: 0000000000000000
x25: 0000000000000000 x24: ffffffc0006f9388
x23: 0000000000000400 x22: ffffffc07e655000
x21: ffffffc07e68fc10 x20: ffffffc0006d7000
x19: ffffffc0006d7000 x18: 0000007fd99f2770
x17: 0000007f9f904e20 x16: ffffffc0001bc768
x15: 0000007f9f982598 x14: 0fffffffffffffff
x13: 0000000000000020 x12: 0000000000000020
x11: 0101010101010101 x10: ffffffff7f7f7f7f
x9 : 0000000000000000 x8 : ffffffc07e655400
x7 : 0000000000000000 x6 : 00000000000003ff
x5 : 0000000000000000 x4 : 0000000000000008
x3 : 0000000000000000 x2 : ffffffc07e68fc00
x1 : ffffffc07e655000 x0 : ffffffc07e68fc10

Internal error: Oops - bad mode: 0 [#1] PREEMPT SMP
Modules linked in: efivars(+)
CPU: 0 PID: 829 Comm: insmod Not tainted 3.16.0-rc4+ #48
task: ffffffc8787d2c00 ti: ffffffc07e68c000 task.ti: ffffffc07e68c000
PC is at 0x0
LR is at virt_efi_get_next_variable+0x84/0xe8
pc : [<0000000000000000>] lr : [<ffffffc0003d5910>] pstate: 800001c5
sp : ffffffc07e68fb40
x29: ffffffc07e68fb40 x28: 0000000000000001
x27: ffffffc0006a3000 x26: 0000000000000000
x25: 0000000000000000 x24: ffffffc0006f9388
x23: 0000000000000400 x22: ffffffc07e655000
x21: ffffffc07e68fc10 x20: ffffffc0006d7000
x19: ffffffc0006d7000 x18: 0000007fd99f2770
x17: 0000007f9f904e20 x16: ffffffc0001bc768
x15: 0000007f9f982598 x14: 0fffffffffffffff
x13: 0000000000000020 x12: 0000000000000020
x11: 0101010101010101 x10: ffffffff7f7f7f7f
x9 : 0000000000000000 x8 : ffffffc07e655400
x7 : 0000000000000000 x6 : 00000000000003ff
x5 : 0000000000000000 x4 : 0000000000000008
x3 : 0000000000000000 x2 : ffffffc07e68fc00
x1 : ffffffc07e655000 x0 : ffffffc07e68fc10

Process insmod (pid: 829, stack limit = 0xffffffc07e68c058)
Stack: (0xffffffc07e68fb40 to 0xffffffc07e690000)
[...]
Call trace:
[<          (null)>]           (null)
[<ffffffc0003d47d4>] efivar_init+0x88/0x2d8
[<ffffffbffc000ddc>] init_module+0xb4/0x20c [efivars]
[<ffffffc0000814a4>] do_one_initcall+0x88/0x198
[<ffffffc000102204>] load_module+0x16e8/0x1ce4
[<ffffffc0001028bc>] SyS_init_module+0xbc/0xf0
Code: bad PC value
---[ end trace a3d5e8b14467fa1f ]---
note: insmod[829] exited with preempt_count 2
Segmentation fault
root@genericarmv8:~#

so it appears to handle this fairly well. The SIGSEV is not delivered
in the ordinary way, but do_exit(SIGSEGV) is called when taking (and
not handling) an exception at EL1 in process context, so the process
is terminated immediately. Data aborts will not trigger the page fault
handling for lack of fixups.
Will Deacon Aug. 6, 2014, 2:36 p.m. UTC | #3
On Thu, Jul 31, 2014 at 03:11:49PM +0100, Ard Biesheuvel wrote:
> There are 2 interesting pieces of information in the UEFI spec section 2.3.6
> regarding the mapping of runtime regions:
> (a) the firmware should not request a virtual mapping for configuration tables,
>     even though they are marked as EfiRuntimeServicesData;
> (b) calling SetVirtualAddressMap() is optional, and it is equally appropriate to
>     call Runtime Services using an identity mapping.
> 
> So we can eliminate some of the complexity around UEFI Runtime Services by not
> using a virtual mapping at all, and calling the services at their physical
> address. This is especially useful under kexec, as SetVirtualAddressMap() may
> only be called once, and there is no guarantee that mappings are stable between
> different kexec'd kernels.
> 
> The fallout for other in-kernel users of UEFI data structures should be
> negligible, as they cannot legally access those data structures through
> pre-existing virtual mappings anyway (point (a) above)
> 
> It should also be noted that, as the kernel side of the address space (TTBR1) is
> retained, the stack and pointer function arguments remain accessible to the
> runtime service while the id mapping is active.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  arch/arm64/include/asm/efi.h |  24 ++++++++--
>  arch/arm64/kernel/efi.c      | 106 ++-----------------------------------------
>  2 files changed, 23 insertions(+), 107 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
> index a34fd3b12e2b..d42a21e79b39 100644
> --- a/arch/arm64/include/asm/efi.h
> +++ b/arch/arm64/include/asm/efi.h
> @@ -1,8 +1,10 @@
>  #ifndef _ASM_EFI_H
>  #define _ASM_EFI_H
>  
> +#include <asm/cacheflush.h>
>  #include <asm/io.h>
>  #include <asm/neon.h>
> +#include <asm/tlbflush.h>
>  
>  #ifdef CONFIG_EFI
>  extern void efi_init(void);
> @@ -12,23 +14,37 @@ extern void efi_idmap_init(void);
>  #define efi_idmap_init()
>  #endif
>  
> +static inline void switch_pgd(pgd_t *pgd, struct mm_struct *mm)
> +{
> +	cpu_switch_mm(pgd, mm);
> +	flush_tlb_all();
> +	if (icache_is_aivivt())
> +		__flush_icache_all();
> +}
> +
>  #define efi_call_virt(f, ...)						\
>  ({									\
> -	efi_##f##_t *__f = efi.systab->runtime->f;			\
> +	efi_##f##_t *__f;						\
>  	efi_status_t __s;						\
>  									\
> -	kernel_neon_begin();						\
> +	kernel_neon_begin(); /* disables preemption */			\
> +	switch_pgd(idmap_pg_dir, &init_mm);				\
> +	__f =  efi.systab->runtime->f;					\
>  	__s = __f(__VA_ARGS__);						\
> +	switch_pgd(current->active_mm->pgd, current->active_mm);	\
>  	kernel_neon_end();						\
>  	__s;								\
>  })

This scares the bejesus out of me, but I can't put my finger on exactly why.
I think it does what you intend and I can't break it myself, so it would be
really good if the EFI folks could confirm that this looks good to them.

Cheers,

Will
Ard Biesheuvel Aug. 6, 2014, 3:15 p.m. UTC | #4
On 6 August 2014 16:36, Will Deacon <will.deacon@arm.com> wrote:
> On Thu, Jul 31, 2014 at 03:11:49PM +0100, Ard Biesheuvel wrote:
>> There are 2 interesting pieces of information in the UEFI spec section 2.3.6
>> regarding the mapping of runtime regions:
>> (a) the firmware should not request a virtual mapping for configuration tables,
>>     even though they are marked as EfiRuntimeServicesData;
>> (b) calling SetVirtualAddressMap() is optional, and it is equally appropriate to
>>     call Runtime Services using an identity mapping.
>>
>> So we can eliminate some of the complexity around UEFI Runtime Services by not
>> using a virtual mapping at all, and calling the services at their physical
>> address. This is especially useful under kexec, as SetVirtualAddressMap() may
>> only be called once, and there is no guarantee that mappings are stable between
>> different kexec'd kernels.
>>
>> The fallout for other in-kernel users of UEFI data structures should be
>> negligible, as they cannot legally access those data structures through
>> pre-existing virtual mappings anyway (point (a) above)
>>
>> It should also be noted that, as the kernel side of the address space (TTBR1) is
>> retained, the stack and pointer function arguments remain accessible to the
>> runtime service while the id mapping is active.
>>
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>> ---
>>  arch/arm64/include/asm/efi.h |  24 ++++++++--
>>  arch/arm64/kernel/efi.c      | 106 ++-----------------------------------------
>>  2 files changed, 23 insertions(+), 107 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
>> index a34fd3b12e2b..d42a21e79b39 100644
>> --- a/arch/arm64/include/asm/efi.h
>> +++ b/arch/arm64/include/asm/efi.h
>> @@ -1,8 +1,10 @@
>>  #ifndef _ASM_EFI_H
>>  #define _ASM_EFI_H
>>
>> +#include <asm/cacheflush.h>
>>  #include <asm/io.h>
>>  #include <asm/neon.h>
>> +#include <asm/tlbflush.h>
>>
>>  #ifdef CONFIG_EFI
>>  extern void efi_init(void);
>> @@ -12,23 +14,37 @@ extern void efi_idmap_init(void);
>>  #define efi_idmap_init()
>>  #endif
>>
>> +static inline void switch_pgd(pgd_t *pgd, struct mm_struct *mm)
>> +{
>> +     cpu_switch_mm(pgd, mm);
>> +     flush_tlb_all();
>> +     if (icache_is_aivivt())
>> +             __flush_icache_all();
>> +}
>> +
>>  #define efi_call_virt(f, ...)                                                \
>>  ({                                                                   \
>> -     efi_##f##_t *__f = efi.systab->runtime->f;                      \
>> +     efi_##f##_t *__f;                                               \
>>       efi_status_t __s;                                               \
>>                                                                       \
>> -     kernel_neon_begin();                                            \
>> +     kernel_neon_begin(); /* disables preemption */                  \
>> +     switch_pgd(idmap_pg_dir, &init_mm);                             \
>> +     __f =  efi.systab->runtime->f;                                  \
>>       __s = __f(__VA_ARGS__);                                         \
>> +     switch_pgd(current->active_mm->pgd, current->active_mm);        \
>>       kernel_neon_end();                                              \
>>       __s;                                                            \
>>  })
>
> This scares the bejesus out of me, but I can't put my finger on exactly why.
> I think it does what you intend and I can't break it myself, so it would be
> really good if the EFI folks could confirm that this looks good to them.
>

There is something similar in the x86 code (arch/x86/platform/efi/efi.c)
"""
 * The new method does a pagetable switch in a preemption-safe manner
 * so that we're in a different address space when calling a runtime
 * function. For function arguments passing we do copy the PGDs of the
 * kernel page table into ->trampoline_pgd prior to each call.
"""

How exactly this will turn out for arm64 (and ARM) is still under
discussion, though. My position is that if you are going to switch
pgd's anyway, why not just use the id mapping? And even if you feel it
is mandatory to install a virtual address mapping into UEFI (which I
think is /not/ the case), you could install an id mapping as well,
which means all the related machinery still gets invoked.
The alternative to using a TTBR0 mapping would be to reserve a slice
of kernel virtual memory so that the Runtime Services are guaranteed
to live at the same virtual address after a kexec, ideally the same
region on 4k and 64k pages ...

We are planning to discuss this further at Linaro Connect next month.
Will Deacon Aug. 6, 2014, 4:32 p.m. UTC | #5
On Wed, Aug 06, 2014 at 04:15:23PM +0100, Ard Biesheuvel wrote:
> On 6 August 2014 16:36, Will Deacon <will.deacon@arm.com> wrote:
> > On Thu, Jul 31, 2014 at 03:11:49PM +0100, Ard Biesheuvel wrote:
> >>  #define efi_call_virt(f, ...)                                                \
> >>  ({                                                                   \
> >> -     efi_##f##_t *__f = efi.systab->runtime->f;                      \
> >> +     efi_##f##_t *__f;                                               \
> >>       efi_status_t __s;                                               \
> >>                                                                       \
> >> -     kernel_neon_begin();                                            \
> >> +     kernel_neon_begin(); /* disables preemption */                  \
> >> +     switch_pgd(idmap_pg_dir, &init_mm);                             \
> >> +     __f =  efi.systab->runtime->f;                                  \
> >>       __s = __f(__VA_ARGS__);                                         \
> >> +     switch_pgd(current->active_mm->pgd, current->active_mm);        \
> >>       kernel_neon_end();                                              \
> >>       __s;                                                            \
> >>  })
> >
> > This scares the bejesus out of me, but I can't put my finger on exactly why.
> > I think it does what you intend and I can't break it myself, so it would be
> > really good if the EFI folks could confirm that this looks good to them.
> >
> 
> There is something similar in the x86 code (arch/x86/platform/efi/efi.c)
> """
>  * The new method does a pagetable switch in a preemption-safe manner
>  * so that we're in a different address space when calling a runtime
>  * function. For function arguments passing we do copy the PGDs of the
>  * kernel page table into ->trampoline_pgd prior to each call.
> """
> 
> How exactly this will turn out for arm64 (and ARM) is still under
> discussion, though. My position is that if you are going to switch
> pgd's anyway, why not just use the id mapping? And even if you feel it
> is mandatory to install a virtual address mapping into UEFI (which I
> think is /not/ the case), you could install an id mapping as well,
> which means all the related machinery still gets invoked.
> The alternative to using a TTBR0 mapping would be to reserve a slice
> of kernel virtual memory so that the Runtime Services are guaranteed
> to live at the same virtual address after a kexec, ideally the same
> region on 4k and 64k pages ...
> 
> We are planning to discuss this further at Linaro Connect next month.

Ok, thanks for the update. I'll hold off on this until you've discussed it
some more.

Cheers,

Will
Grant Likely Sept. 10, 2014, 1:44 p.m. UTC | #6
I had missed this conversation, but Ard pointed me at it yesterday, so
I'll chime in now.

Ard, you are correct that SetVirtualAddressMap() is optional in the
spec and we could simply not call it. By a strict reading of the spec
that is completely valid. However, the problem is not with the spec,
it is with actual implementations. Vendors are notorious for shipping
hardware that works enough to boot the operating system(s) they care
about even though the implementation is out of spec. If we don't call
SetVirtualAddressMap() it pretty much guarantees that we'll never be
able to reliably call SetVirtualAddressMap() on ARM platforms because
there will be a non-trivial set of platforms that implement it
completely wrong.

In the ARM server space, Linux will be the dominant OS for the near
future, not Windows, so we are in the happy position of hardware
vendors making sure their platform boots Linux instead of trying to
run Linux on a platform only tested with Windows. We want to call
SetVirtualAddressMap() because it is a pretty fundamental part of
runtime services, and we want to make sure it works. I'd even argue
that it would be a good idea to randomize the map layout on each boot
to flush out buggy assumptions about the map it will be given.

That said, putting runtime services into a separate mapping apart from
the kernel mappings is exactly the right thing to do. If runtime
services access any data outside of declared address map, we want to
know about it and prevent it from happening. Otherwise we've got no
idea if UEFI is corrupting kernel data structures.

As for kexec, we can pass the assigned mapping on to the next kernel.
We already are passing the address map via the boot dt. As long as the
kernel can detect if a mapping has already been set, which should be
easy, the new kernel can use it instead of assigning one of it's own.

g.


On Wed, Aug 6, 2014 at 4:15 PM, Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> On 6 August 2014 16:36, Will Deacon <will.deacon@arm.com> wrote:
>> On Thu, Jul 31, 2014 at 03:11:49PM +0100, Ard Biesheuvel wrote:
>>> There are 2 interesting pieces of information in the UEFI spec section 2.3.6
>>> regarding the mapping of runtime regions:
>>> (a) the firmware should not request a virtual mapping for configuration tables,
>>>     even though they are marked as EfiRuntimeServicesData;
>>> (b) calling SetVirtualAddressMap() is optional, and it is equally appropriate to
>>>     call Runtime Services using an identity mapping.
>>>
>>> So we can eliminate some of the complexity around UEFI Runtime Services by not
>>> using a virtual mapping at all, and calling the services at their physical
>>> address. This is especially useful under kexec, as SetVirtualAddressMap() may
>>> only be called once, and there is no guarantee that mappings are stable between
>>> different kexec'd kernels.
>>>
>>> The fallout for other in-kernel users of UEFI data structures should be
>>> negligible, as they cannot legally access those data structures through
>>> pre-existing virtual mappings anyway (point (a) above)
>>>
>>> It should also be noted that, as the kernel side of the address space (TTBR1) is
>>> retained, the stack and pointer function arguments remain accessible to the
>>> runtime service while the id mapping is active.
>>>
>>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>>> ---
>>>  arch/arm64/include/asm/efi.h |  24 ++++++++--
>>>  arch/arm64/kernel/efi.c      | 106 ++-----------------------------------------
>>>  2 files changed, 23 insertions(+), 107 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
>>> index a34fd3b12e2b..d42a21e79b39 100644
>>> --- a/arch/arm64/include/asm/efi.h
>>> +++ b/arch/arm64/include/asm/efi.h
>>> @@ -1,8 +1,10 @@
>>>  #ifndef _ASM_EFI_H
>>>  #define _ASM_EFI_H
>>>
>>> +#include <asm/cacheflush.h>
>>>  #include <asm/io.h>
>>>  #include <asm/neon.h>
>>> +#include <asm/tlbflush.h>
>>>
>>>  #ifdef CONFIG_EFI
>>>  extern void efi_init(void);
>>> @@ -12,23 +14,37 @@ extern void efi_idmap_init(void);
>>>  #define efi_idmap_init()
>>>  #endif
>>>
>>> +static inline void switch_pgd(pgd_t *pgd, struct mm_struct *mm)
>>> +{
>>> +     cpu_switch_mm(pgd, mm);
>>> +     flush_tlb_all();
>>> +     if (icache_is_aivivt())
>>> +             __flush_icache_all();
>>> +}
>>> +
>>>  #define efi_call_virt(f, ...)                                                \
>>>  ({                                                                   \
>>> -     efi_##f##_t *__f = efi.systab->runtime->f;                      \
>>> +     efi_##f##_t *__f;                                               \
>>>       efi_status_t __s;                                               \
>>>                                                                       \
>>> -     kernel_neon_begin();                                            \
>>> +     kernel_neon_begin(); /* disables preemption */                  \
>>> +     switch_pgd(idmap_pg_dir, &init_mm);                             \
>>> +     __f =  efi.systab->runtime->f;                                  \
>>>       __s = __f(__VA_ARGS__);                                         \
>>> +     switch_pgd(current->active_mm->pgd, current->active_mm);        \
>>>       kernel_neon_end();                                              \
>>>       __s;                                                            \
>>>  })
>>
>> This scares the bejesus out of me, but I can't put my finger on exactly why.
>> I think it does what you intend and I can't break it myself, so it would be
>> really good if the EFI folks could confirm that this looks good to them.
>>
>
> There is something similar in the x86 code (arch/x86/platform/efi/efi.c)
> """
>  * The new method does a pagetable switch in a preemption-safe manner
>  * so that we're in a different address space when calling a runtime
>  * function. For function arguments passing we do copy the PGDs of the
>  * kernel page table into ->trampoline_pgd prior to each call.
> """
>
> How exactly this will turn out for arm64 (and ARM) is still under
> discussion, though. My position is that if you are going to switch
> pgd's anyway, why not just use the id mapping? And even if you feel it
> is mandatory to install a virtual address mapping into UEFI (which I
> think is /not/ the case), you could install an id mapping as well,
> which means all the related machinery still gets invoked.
> The alternative to using a TTBR0 mapping would be to reserve a slice
> of kernel virtual memory so that the Runtime Services are guaranteed
> to live at the same virtual address after a kexec, ideally the same
> region on 4k and 64k pages ...
>
> We are planning to discuss this further at Linaro Connect next month.
>
> --
> Ard.
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
diff mbox

Patch

diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index a34fd3b12e2b..d42a21e79b39 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -1,8 +1,10 @@ 
 #ifndef _ASM_EFI_H
 #define _ASM_EFI_H
 
+#include <asm/cacheflush.h>
 #include <asm/io.h>
 #include <asm/neon.h>
+#include <asm/tlbflush.h>
 
 #ifdef CONFIG_EFI
 extern void efi_init(void);
@@ -12,23 +14,37 @@  extern void efi_idmap_init(void);
 #define efi_idmap_init()
 #endif
 
+static inline void switch_pgd(pgd_t *pgd, struct mm_struct *mm)
+{
+	cpu_switch_mm(pgd, mm);
+	flush_tlb_all();
+	if (icache_is_aivivt())
+		__flush_icache_all();
+}
+
 #define efi_call_virt(f, ...)						\
 ({									\
-	efi_##f##_t *__f = efi.systab->runtime->f;			\
+	efi_##f##_t *__f;						\
 	efi_status_t __s;						\
 									\
-	kernel_neon_begin();						\
+	kernel_neon_begin(); /* disables preemption */			\
+	switch_pgd(idmap_pg_dir, &init_mm);				\
+	__f =  efi.systab->runtime->f;					\
 	__s = __f(__VA_ARGS__);						\
+	switch_pgd(current->active_mm->pgd, current->active_mm);	\
 	kernel_neon_end();						\
 	__s;								\
 })
 
 #define __efi_call_virt(f, ...)						\
 ({									\
-	efi_##f##_t *__f = efi.systab->runtime->f;			\
+	efi_##f##_t *__f;						\
 									\
-	kernel_neon_begin();						\
+	kernel_neon_begin(); /* disables preemption */			\
+	switch_pgd(idmap_pg_dir, &init_mm);				\
+	__f =  efi.systab->runtime->f;					\
 	__f(__VA_ARGS__);						\
+	switch_pgd(current->active_mm->pgd, current->active_mm);	\
 	kernel_neon_end();						\
 })
 
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index e72f3100958f..d620a031e7bf 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -27,8 +27,6 @@ 
 
 struct efi_memory_map memmap;
 
-static efi_runtime_services_t *runtime;
-
 static u64 efi_system_table;
 
 static int uefi_debug __initdata;
@@ -340,51 +338,9 @@  void __init efi_idmap_init(void)
 	efi_setup_idmap();
 }
 
-static int __init remap_region(efi_memory_desc_t *md, void **new)
-{
-	u64 paddr, vaddr, npages, size;
-
-	paddr = md->phys_addr;
-	npages = md->num_pages;
-	memrange_efi_to_native(&paddr, &npages);
-	size = npages << PAGE_SHIFT;
-
-	if (is_normal_ram(md))
-		vaddr = (__force u64)ioremap_cache(paddr, size);
-	else
-		vaddr = (__force u64)ioremap(paddr, size);
-
-	if (!vaddr) {
-		pr_err("Unable to remap 0x%llx pages @ %p\n",
-		       npages, (void *)paddr);
-		return 0;
-	}
-
-	/* adjust for any rounding when EFI and system pagesize differs */
-	md->virt_addr = vaddr + (md->phys_addr - paddr);
-
-	if (uefi_debug)
-		pr_info("  EFI remap 0x%012llx => %p\n",
-			md->phys_addr, (void *)md->virt_addr);
-
-	memcpy(*new, md, memmap.desc_size);
-	*new += memmap.desc_size;
-
-	return 1;
-}
-
-/*
- * Switch UEFI from an identity map to a kernel virtual map
- */
 static int __init arm64_enter_virtual_mode(void)
 {
-	efi_memory_desc_t *md;
-	phys_addr_t virtmap_phys;
-	void *virtmap, *virt_md;
-	efi_status_t status;
 	u64 mapsize;
-	int count = 0;
-	unsigned long flags;
 
 	if (!efi_enabled(EFI_BOOT)) {
 		pr_info("EFI services will not be available.\n");
@@ -402,76 +358,20 @@  static int __init arm64_enter_virtual_mode(void)
 
 	efi.memmap = &memmap;
 
-	/* Map the runtime regions */
-	virtmap = kmalloc(mapsize, GFP_KERNEL);
-	if (!virtmap) {
-		pr_err("Failed to allocate EFI virtual memmap\n");
-		return -1;
-	}
-	virtmap_phys = virt_to_phys(virtmap);
-	virt_md = virtmap;
-
-	for_each_efi_memory_desc(&memmap, md) {
-		if (!(md->attribute & EFI_MEMORY_RUNTIME))
-			continue;
-		if (!remap_region(md, &virt_md))
-			goto err_unmap;
-		++count;
-	}
-
-	efi.systab = (__force void *)efi_lookup_mapped_addr(efi_system_table);
+	efi.systab = (__force void *)ioremap_cache(efi_system_table,
+						   sizeof(efi_system_table_t));
 	if (!efi.systab) {
-		/*
-		 * If we have no virtual mapping for the System Table at this
-		 * point, the memory map doesn't cover the physical offset where
-		 * it resides. This means the System Table will be inaccessible
-		 * to Runtime Services themselves once the virtual mapping is
-		 * installed.
-		 */
 		pr_err("Failed to remap EFI System Table -- buggy firmware?\n");
-		goto err_unmap;
+		return -1;
 	}
 	set_bit(EFI_SYSTEM_TABLES, &efi.flags);
 
-	local_irq_save(flags);
-	cpu_switch_mm(idmap_pg_dir, &init_mm);
-
-	/* Call SetVirtualAddressMap with the physical address of the map */
-	runtime = efi.systab->runtime;
-	efi.set_virtual_address_map = runtime->set_virtual_address_map;
-
-	status = efi.set_virtual_address_map(count * memmap.desc_size,
-					     memmap.desc_size,
-					     memmap.desc_version,
-					     (efi_memory_desc_t *)virtmap_phys);
-	cpu_set_reserved_ttbr0();
-	flush_tlb_all();
-	local_irq_restore(flags);
-
-	kfree(virtmap);
-
 	free_boot_services();
 
-	if (status != EFI_SUCCESS) {
-		pr_err("Failed to set EFI virtual address map! [%lx]\n",
-			status);
-		return -1;
-	}
-
 	/* Set up runtime services function pointers */
-	runtime = efi.systab->runtime;
 	efi_native_runtime_setup();
 	set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
 
 	return 0;
-
-err_unmap:
-	/* unmap all mappings that succeeded: there are 'count' of those */
-	for (virt_md = virtmap; count--; virt_md += memmap.desc_size) {
-		md = virt_md;
-		iounmap((__force void __iomem *)md->virt_addr);
-	}
-	kfree(virtmap);
-	return -1;
 }
 early_initcall(arm64_enter_virtual_mode);