[v5] arm64: fix VTTBR_BADDR_MASK

Message ID 20140818203604.4654.99905.stgit@joelaarch64.amd.com
State New

Commit Message

Joel Schopp Aug. 18, 2014, 8:36 p.m. UTC
The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
systems.  Rather than just add a bit, it seems like a good time to also set
things at run time instead of compile time to accommodate more hardware.

This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask at run time,
not compile time.

In ARMv8, the EL2 physical address size (TCR_EL2.PS) and stage2 input address
size (VTCR_EL2.T0SZ) cannot be determined at compile time since they
depend on hardware capability.

According to Table D4-23 and Table D4-25 in the ARM DDI 0487A.b document,
vttbr_x is calculated using different fixed values with consideration
of T0SZ, granule size and the level of translation tables. Therefore,
vttbr_baddr_mask should be determined dynamically.
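
As a worked example (assuming a CPU that reports a 40-bit PARange, with
4K pages and 3-level stage2 tables), following the formulas used below:

	t0sz             = 64 - 40 = 24
	vttbr_x          = 37 - t0sz = 13
	vttbr_baddr_mask = ((1ULL << (48 - 13)) - 1) << (13 - 1)

which sets bits [46:12] of the mask.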

Changes since v4:
More minor cleanups from review
Moved some functions into headers

Changes since v3:
Another rebase
Addressed minor comments from v2

Changes since v2:
Rebased on https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next branch

Changes since v1:
Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to
provide better long term fix.  Updated that patch to log error instead of
silently fail on unaligned vttbr.

Cc: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Sungjinn Chung <sungjinn.chung@samsung.com>
Signed-off-by: Jungseok Lee <jays.lee@samsung.com>
Signed-off-by: Joel Schopp <joel.schopp@amd.com>
---
 arch/arm/include/asm/kvm_mmu.h   |   12 ++++++
 arch/arm/kvm/arm.c               |   17 +++++++-
 arch/arm64/include/asm/kvm_arm.h |   17 +-------
 arch/arm64/include/asm/kvm_mmu.h |   78 ++++++++++++++++++++++++++++++++++++++
 arch/arm64/kvm/hyp-init.S        |   20 +++++++---
 5 files changed, 122 insertions(+), 22 deletions(-)

Comments

Christoffer Dall Aug. 19, 2014, 12:24 p.m. UTC | #1
On Mon, Aug 18, 2014 at 03:36:04PM -0500, Joel Schopp wrote:
> The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
> systems.  Rather than just add a bit, it seems like a good time to also set
> things at run time instead of compile time to accommodate more hardware.
> 
> This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask at run time,
> not compile time.
> 
> In ARMv8, the EL2 physical address size (TCR_EL2.PS) and stage2 input address
> size (VTCR_EL2.T0SZ) cannot be determined at compile time since they
> depend on hardware capability.
> 
> According to Table D4-23 and Table D4-25 in the ARM DDI 0487A.b document,
> vttbr_x is calculated using different fixed values with consideration
> of T0SZ, granule size and the level of translation tables. Therefore,
> vttbr_baddr_mask should be determined dynamically.
> 
> Changes since v4:
> More minor cleanups from review
> Moved some functions into headers
> 
> Changes since v3:
> Another rebase
> Addressed minor comments from v2
> 
> Changes since v2:
> Rebased on https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next branch
> 
> Changes since v1:
> Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to
> provide better long term fix.  Updated that patch to log error instead of
> silently fail on unaligned vttbr.
> 
> Cc: Christoffer Dall <christoffer.dall@linaro.org>
> Cc: Sungjinn Chung <sungjinn.chung@samsung.com>
> Signed-off-by: Jungseok Lee <jays.lee@samsung.com>
> Signed-off-by: Joel Schopp <joel.schopp@amd.com>
> ---
>  arch/arm/include/asm/kvm_mmu.h   |   12 ++++++
>  arch/arm/kvm/arm.c               |   17 +++++++-
>  arch/arm64/include/asm/kvm_arm.h |   17 +-------
>  arch/arm64/include/asm/kvm_mmu.h |   78 ++++++++++++++++++++++++++++++++++++++
>  arch/arm64/kvm/hyp-init.S        |   20 +++++++---
>  5 files changed, 122 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> index 5c7aa3c..73f6ff6 100644
> --- a/arch/arm/include/asm/kvm_mmu.h
> +++ b/arch/arm/include/asm/kvm_mmu.h
> @@ -166,6 +166,18 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>  
>  void stage2_flush_vm(struct kvm *kvm);
>  
> +static inline int kvm_get_phys_addr_shift(void)
> +{
> +	return KVM_PHYS_SHIFT;
> +}
> +
> +static inline int set_vttbr_baddr_mask(void)
> +{
> +	vttbr_baddr_mask = VTTBR_BADDR_MASK;
> +	return 0;
> +}
> +
> +
>  #endif	/* !__ASSEMBLY__ */
>  
>  #endif /* __ARM_KVM_MMU_H__ */
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 3c82b37..f396eb7 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -37,6 +37,7 @@
>  #include <asm/mman.h>
>  #include <asm/tlbflush.h>
>  #include <asm/cacheflush.h>
> +#include <asm/cputype.h>
>  #include <asm/virt.h>
>  #include <asm/kvm_arm.h>
>  #include <asm/kvm_asm.h>
> @@ -466,8 +467,14 @@ static void update_vttbr(struct kvm *kvm)
>  	/* update vttbr to be used with the new vmid */
>  	pgd_phys = virt_to_phys(kvm->arch.pgd);
>  	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
> -	kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
> -	kvm->arch.vttbr |= vmid;
> +
> +	/*
> +	 * If the VTTBR isn't aligned there is something wrong with the system
> +	 * or kernel.
> +	 */
> +	BUG_ON(pgd_phys & ~vttbr_baddr_mask);
> +
> +	kvm->arch.vttbr = pgd_phys | vmid;
>  
>  	spin_unlock(&kvm_vmid_lock);
>  }
> @@ -1052,6 +1059,12 @@ int kvm_arch_init(void *opaque)
>  		}
>  	}
>  
> +	err = set_vttbr_baddr_mask();
> +	if (err) {
> +		kvm_err("Cannot set vttbr_baddr_mask\n");
> +		return -EINVAL;
> +	}
> +
>  	cpu_notifier_register_begin();
>  
>  	err = init_hyp_mode();
> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
> index 3d69030..8dbef70 100644
> --- a/arch/arm64/include/asm/kvm_arm.h
> +++ b/arch/arm64/include/asm/kvm_arm.h
> @@ -94,7 +94,6 @@
>  /* TCR_EL2 Registers bits */
>  #define TCR_EL2_TBI	(1 << 20)
>  #define TCR_EL2_PS	(7 << 16)
> -#define TCR_EL2_PS_40B	(2 << 16)
>  #define TCR_EL2_TG0	(1 << 14)
>  #define TCR_EL2_SH0	(3 << 12)
>  #define TCR_EL2_ORGN0	(3 << 10)
> @@ -103,8 +102,6 @@
>  #define TCR_EL2_MASK	(TCR_EL2_TG0 | TCR_EL2_SH0 | \
>  			 TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
>  
> -#define TCR_EL2_FLAGS	(TCR_EL2_PS_40B)
> -
>  /* VTCR_EL2 Registers bits */
>  #define VTCR_EL2_PS_MASK	(7 << 16)
>  #define VTCR_EL2_TG0_MASK	(1 << 14)
> @@ -119,36 +116,28 @@
>  #define VTCR_EL2_SL0_MASK	(3 << 6)
>  #define VTCR_EL2_SL0_LVL1	(1 << 6)
>  #define VTCR_EL2_T0SZ_MASK	0x3f
> -#define VTCR_EL2_T0SZ_40B	24
> +#define VTCR_EL2_T0SZ(bits)	(64 - (bits))
>  
>  #ifdef CONFIG_ARM64_64K_PAGES
>  /*
>   * Stage2 translation configuration:
> - * 40bits output (PS = 2)
> - * 40bits input  (T0SZ = 24)
>   * 64kB pages (TG0 = 1)
>   * 2 level page tables (SL = 1)
>   */
>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> -#define VTTBR_X		(38 - VTCR_EL2_T0SZ_40B)
> +				 VTCR_EL2_SL0_LVL1)
>  #else
>  /*
>   * Stage2 translation configuration:
> - * 40bits output (PS = 2)
> - * 40bits input  (T0SZ = 24)
>   * 4kB pages (TG0 = 0)
>   * 3 level page tables (SL = 1)
>   */
>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> -#define VTTBR_X		(37 - VTCR_EL2_T0SZ_40B)
> +				 VTCR_EL2_SL0_LVL1)
>  #endif
>  
> -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
> -#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
>  #define VTTBR_VMID_SHIFT  (48LLU)
>  #define VTTBR_VMID_MASK	  (0xffLLU << VTTBR_VMID_SHIFT)
>  
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index 7d29847..b6ae83b 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -152,5 +152,83 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>  
>  void stage2_flush_vm(struct kvm *kvm);
>  
> +/*
> + * ARMv8 64K architecture limitations:
> + * 16 <= T0SZ <= 21 is valid under 3 level of translation tables
> + * 18 <= T0SZ <= 34 is valid under 2 level of translation tables
> + * 31 <= T0SZ <= 39 is valid under 1 level of translation tables
> + *
> + * ARMv8 4K architecture limitations:
> + * 16 <= T0SZ <= 24 is valid under 4 level of translation tables
> + * 21 <= T0SZ <= 33 is valid under 3 level of translation tables
> + * 30 <= T0SZ <= 39 is valid under 2 level of translation tables
> + *
> + * For 4K pages we only support 3 or 4 level, giving T0SZ a range of 16 to 33.
> + * For 64K pages we only support 2 or 3 level, giving T0SZ a range of 16 to 34.
> + *
> + * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out
> + * the origin of the hardcoded values, 38 and 37.
> + */
> +
> +#ifdef CONFIG_ARM64_64K_PAGES
> +static inline int t0sz_to_vttbr_x(int t0sz)
> +{
> +	if (t0sz < 16 || t0sz > 34) {
> +		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
> +		return 0;
> +	}
> +
> +	return 38 - t0sz;
> +}
> +#else /* 4K pages */
> +static inline int t0sz_to_vttbr_x(int t0sz)
> +{
> +	if (t0sz < 16 || t0sz > 33) {
> +		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
> +		return 0;

why not just return an error code directly here?  These are now
returning 0 on error which usually means everything was ok...

> +	}
> +	return 37 - t0sz;
> +}
> +#endif
> +static inline int kvm_get_phys_addr_shift(void)
> +{
> +	int pa_range = read_cpuid(ID_AA64MMFR0_EL1) & 0xf;
> +
> +	switch (pa_range) {
> +	case 0: return 32;
> +	case 1: return 36;
> +	case 2: return 40;
> +	case 3: return 42;
> +	case 4: return 44;
> +	case 5: return 48;
> +	default:
> +		BUG();
> +		return 0;
> +	}
> +}
> +
> +static u64 vttbr_baddr_mask;
> +
> +/**
> + * set_vttbr_baddr_mask - set mask value for vttbr base address
> + *
> + * In ARMv8, vttbr_baddr_mask cannot be determined in compile time since the
> + * stage2 input address size depends on hardware capability. Thus, we first
> + * need to read ID_AA64MMFR0_EL1.PARange and then set vttbr_baddr_mask with
> + * consideration of both the granule size and the level of translation tables.
> + */
> +static inline int set_vttbr_baddr_mask(void)
> +{
> +	int t0sz, vttbr_x;
> +
> +	t0sz = VTCR_EL2_T0SZ(kvm_get_phys_addr_shift());
> +	vttbr_x = t0sz_to_vttbr_x(t0sz);
> +	if (!vttbr_x)
> +		return -EINVAL;
> +	vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1));
> +
> +	return 0;
> +}
> +
>  #endif /* __ASSEMBLY__ */
>  #endif /* __ARM64_KVM_MMU_H__ */
> diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
> index d968796..c0f7634 100644
> --- a/arch/arm64/kvm/hyp-init.S
> +++ b/arch/arm64/kvm/hyp-init.S
> @@ -63,17 +63,21 @@ __do_hyp_init:
>  	mrs	x4, tcr_el1
>  	ldr	x5, =TCR_EL2_MASK
>  	and	x4, x4, x5
> -	ldr	x5, =TCR_EL2_FLAGS
> -	orr	x4, x4, x5
> -	msr	tcr_el2, x4
> -
> -	ldr	x4, =VTCR_EL2_FLAGS
>  	/*
>  	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
> -	 * VTCR_EL2.
> +	 * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2.
>  	 */
>  	mrs	x5, ID_AA64MMFR0_EL1
>  	bfi	x4, x5, #16, #3
> +	msr	tcr_el2, x4
> +
> +	ldr	x4, =VTCR_EL2_FLAGS
> +	bfi	x4, x5, #16, #3
> +	and	x5, x5, #0xf
> +	adr	x6, t0sz
> +	add	x6, x6, x5, lsl #2
> +	ldr	w5, [x6]
> +	orr	x4, x4, x5
>  	msr	vtcr_el2, x4
>  
>  	mrs	x4, mair_el1
> @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */
>  
>  	/* Hello, World! */
>  	eret
> +
> +t0sz:
> +	.word	VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40)
> +	.word	VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48)
>  ENDPROC(__kvm_hyp_init)
>  
>  	.ltorg
>
Joel Schopp Aug. 19, 2014, 2:23 p.m. UTC | #2
On 08/19/2014 07:24 AM, Christoffer Dall wrote:
> On Mon, Aug 18, 2014 at 03:36:04PM -0500, Joel Schopp wrote:
>> The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
>> systems.  Rather than just add a bit, it seems like a good time to also set
>> things at run time instead of compile time to accommodate more hardware.
>>
>> This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask at run time,
>> not compile time.
>>
>> In ARMv8, the EL2 physical address size (TCR_EL2.PS) and stage2 input address
>> size (VTCR_EL2.T0SZ) cannot be determined at compile time since they
>> depend on hardware capability.
>>
>> According to Table D4-23 and Table D4-25 in the ARM DDI 0487A.b document,
>> vttbr_x is calculated using different fixed values with consideration
>> of T0SZ, granule size and the level of translation tables. Therefore,
>> vttbr_baddr_mask should be determined dynamically.
>>
>> Changes since v4:
>> More minor cleanups from review
>> Moved some functions into headers
>>
>> Changes since v3:
>> Another rebase
>> Addressed minor comments from v2
>>
>> Changes since v2:
>> Rebased on https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next branch
>>
>> Changes since v1:
>> Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to
>> provide better long term fix.  Updated that patch to log error instead of
>> silently fail on unaligned vttbr.
>>
>> Cc: Christoffer Dall <christoffer.dall@linaro.org>
>> Cc: Sungjinn Chung <sungjinn.chung@samsung.com>
>> Signed-off-by: Jungseok Lee <jays.lee@samsung.com>
>> Signed-off-by: Joel Schopp <joel.schopp@amd.com>
>> ---
>>  arch/arm/include/asm/kvm_mmu.h   |   12 ++++++
>>  arch/arm/kvm/arm.c               |   17 +++++++-
>>  arch/arm64/include/asm/kvm_arm.h |   17 +-------
>>  arch/arm64/include/asm/kvm_mmu.h |   78 ++++++++++++++++++++++++++++++++++++++
>>  arch/arm64/kvm/hyp-init.S        |   20 +++++++---
>>  5 files changed, 122 insertions(+), 22 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
>> index 5c7aa3c..73f6ff6 100644
>> --- a/arch/arm/include/asm/kvm_mmu.h
>> +++ b/arch/arm/include/asm/kvm_mmu.h
>> @@ -166,6 +166,18 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>>  
>>  void stage2_flush_vm(struct kvm *kvm);
>>  
>> +static inline int kvm_get_phys_addr_shift(void)
>> +{
>> +	return KVM_PHYS_SHIFT;
>> +}
>> +
>> +static inline int set_vttbr_baddr_mask(void)
>> +{
>> +	vttbr_baddr_mask = VTTBR_BADDR_MASK;
>> +	return 0;
>> +}
>> +
>> +
>>  #endif	/* !__ASSEMBLY__ */
>>  
>>  #endif /* __ARM_KVM_MMU_H__ */
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index 3c82b37..f396eb7 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -37,6 +37,7 @@
>>  #include <asm/mman.h>
>>  #include <asm/tlbflush.h>
>>  #include <asm/cacheflush.h>
>> +#include <asm/cputype.h>
>>  #include <asm/virt.h>
>>  #include <asm/kvm_arm.h>
>>  #include <asm/kvm_asm.h>
>> @@ -466,8 +467,14 @@ static void update_vttbr(struct kvm *kvm)
>>  	/* update vttbr to be used with the new vmid */
>>  	pgd_phys = virt_to_phys(kvm->arch.pgd);
>>  	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
>> -	kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
>> -	kvm->arch.vttbr |= vmid;
>> +
>> +	/*
>> +	 * If the VTTBR isn't aligned there is something wrong with the system
>> +	 * or kernel.
>> +	 */
>> +	BUG_ON(pgd_phys & ~vttbr_baddr_mask);
>> +
>> +	kvm->arch.vttbr = pgd_phys | vmid;
>>  
>>  	spin_unlock(&kvm_vmid_lock);
>>  }
>> @@ -1052,6 +1059,12 @@ int kvm_arch_init(void *opaque)
>>  		}
>>  	}
>>  
>> +	err = set_vttbr_baddr_mask();
>> +	if (err) {
>> +		kvm_err("Cannot set vttbr_baddr_mask\n");
>> +		return -EINVAL;
>> +	}
>> +
>>  	cpu_notifier_register_begin();
>>  
>>  	err = init_hyp_mode();
>> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
>> index 3d69030..8dbef70 100644
>> --- a/arch/arm64/include/asm/kvm_arm.h
>> +++ b/arch/arm64/include/asm/kvm_arm.h
>> @@ -94,7 +94,6 @@
>>  /* TCR_EL2 Registers bits */
>>  #define TCR_EL2_TBI	(1 << 20)
>>  #define TCR_EL2_PS	(7 << 16)
>> -#define TCR_EL2_PS_40B	(2 << 16)
>>  #define TCR_EL2_TG0	(1 << 14)
>>  #define TCR_EL2_SH0	(3 << 12)
>>  #define TCR_EL2_ORGN0	(3 << 10)
>> @@ -103,8 +102,6 @@
>>  #define TCR_EL2_MASK	(TCR_EL2_TG0 | TCR_EL2_SH0 | \
>>  			 TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
>>  
>> -#define TCR_EL2_FLAGS	(TCR_EL2_PS_40B)
>> -
>>  /* VTCR_EL2 Registers bits */
>>  #define VTCR_EL2_PS_MASK	(7 << 16)
>>  #define VTCR_EL2_TG0_MASK	(1 << 14)
>> @@ -119,36 +116,28 @@
>>  #define VTCR_EL2_SL0_MASK	(3 << 6)
>>  #define VTCR_EL2_SL0_LVL1	(1 << 6)
>>  #define VTCR_EL2_T0SZ_MASK	0x3f
>> -#define VTCR_EL2_T0SZ_40B	24
>> +#define VTCR_EL2_T0SZ(bits)	(64 - (bits))
>>  
>>  #ifdef CONFIG_ARM64_64K_PAGES
>>  /*
>>   * Stage2 translation configuration:
>> - * 40bits output (PS = 2)
>> - * 40bits input  (T0SZ = 24)
>>   * 64kB pages (TG0 = 1)
>>   * 2 level page tables (SL = 1)
>>   */
>>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
>>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
>> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
>> -#define VTTBR_X		(38 - VTCR_EL2_T0SZ_40B)
>> +				 VTCR_EL2_SL0_LVL1)
>>  #else
>>  /*
>>   * Stage2 translation configuration:
>> - * 40bits output (PS = 2)
>> - * 40bits input  (T0SZ = 24)
>>   * 4kB pages (TG0 = 0)
>>   * 3 level page tables (SL = 1)
>>   */
>>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
>>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
>> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
>> -#define VTTBR_X		(37 - VTCR_EL2_T0SZ_40B)
>> +				 VTCR_EL2_SL0_LVL1)
>>  #endif
>>  
>> -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
>> -#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
>>  #define VTTBR_VMID_SHIFT  (48LLU)
>>  #define VTTBR_VMID_MASK	  (0xffLLU << VTTBR_VMID_SHIFT)
>>  
>> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
>> index 7d29847..b6ae83b 100644
>> --- a/arch/arm64/include/asm/kvm_mmu.h
>> +++ b/arch/arm64/include/asm/kvm_mmu.h
>> @@ -152,5 +152,83 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>>  
>>  void stage2_flush_vm(struct kvm *kvm);
>>  
>> +/*
>> + * ARMv8 64K architecture limitations:
>> + * 16 <= T0SZ <= 21 is valid under 3 level of translation tables
>> + * 18 <= T0SZ <= 34 is valid under 2 level of translation tables
>> + * 31 <= T0SZ <= 39 is valid under 1 level of translation tables
>> + *
>> + * ARMv8 4K architecture limitations:
>> + * 16 <= T0SZ <= 24 is valid under 4 level of translation tables
>> + * 21 <= T0SZ <= 33 is valid under 3 level of translation tables
>> + * 30 <= T0SZ <= 39 is valid under 2 level of translation tables
>> + *
>> + * For 4K pages we only support 3 or 4 level, giving T0SZ a range of 16 to 33.
>> + * For 64K pages we only support 2 or 3 level, giving T0SZ a range of 16 to 34.
>> + *
>> + * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out
>> + * the origin of the hardcoded values, 38 and 37.
>> + */
>> +
>> +#ifdef CONFIG_ARM64_64K_PAGES
>> +static inline int t0sz_to_vttbr_x(int t0sz)
>> +{
>> +	if (t0sz < 16 || t0sz > 34) {
>> +		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
>> +		return 0;
>> +	}
>> +
>> +	return 38 - t0sz;
>> +}
>> +#else /* 4K pages */
>> +static inline int t0sz_to_vttbr_x(int t0sz)
>> +{
>> +	if (t0sz < 16 || t0sz > 33) {
>> +		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
>> +		return 0;
> why not just return an error code directly here?  These are now
> returning 0 on error which usually means everything was ok...
The return is a value, not just an error code. Because of this, returning
an error overloads that value.  0 just seemed like a convenient invalid
value to check since a vttbr_x of 0 is invalid, but returning a negative
error code would be equally invalid.  If this is the only issue it
doesn't seem worth respinning the patch for, but I'll change it to
-EINVAL if for some reason a v6 is needed.
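
For illustration, a minimal sketch of that -EINVAL variant (4K case
shown, untested):

static inline int t0sz_to_vttbr_x(int t0sz)
{
	if (t0sz < 16 || t0sz > 33) {
		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
		return -EINVAL;
	}
	return 37 - t0sz;
}

with the caller in set_vttbr_baddr_mask() then checking for a negative
return instead of zero:

	vttbr_x = t0sz_to_vttbr_x(t0sz);
	if (vttbr_x < 0)
		return vttbr_x;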
Christoffer Dall Aug. 19, 2014, 2:38 p.m. UTC | #3
On Tue, Aug 19, 2014 at 09:23:57AM -0500, Joel Schopp wrote:
> 
> On 08/19/2014 07:24 AM, Christoffer Dall wrote:
> > On Mon, Aug 18, 2014 at 03:36:04PM -0500, Joel Schopp wrote:
> >> The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
> >> systems.  Rather than just add a bit, it seems like a good time to also set
> >> things at run time instead of compile time to accommodate more hardware.
> >>
> >> This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask at run time,
> >> not compile time.
> >>
> >> In ARMv8, the EL2 physical address size (TCR_EL2.PS) and stage2 input address
> >> size (VTCR_EL2.T0SZ) cannot be determined at compile time since they
> >> depend on hardware capability.
> >>
> >> According to Table D4-23 and Table D4-25 in the ARM DDI 0487A.b document,
> >> vttbr_x is calculated using different fixed values with consideration
> >> of T0SZ, granule size and the level of translation tables. Therefore,
> >> vttbr_baddr_mask should be determined dynamically.
> >>
> >> Changes since v4:
> >> More minor cleanups from review
> >> Moved some functions into headers
> >>
> >> Changes since v3:
> >> Another rebase
> >> Addressed minor comments from v2
> >>
> >> Changes since v2:
> >> Rebased on https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next branch
> >>
> >> Changes since v1:
> >> Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to
> >> provide better long term fix.  Updated that patch to log error instead of
> >> silently fail on unaligned vttbr.
> >>
> >> Cc: Christoffer Dall <christoffer.dall@linaro.org>
> >> Cc: Sungjinn Chung <sungjinn.chung@samsung.com>
> >> Signed-off-by: Jungseok Lee <jays.lee@samsung.com>
> >> Signed-off-by: Joel Schopp <joel.schopp@amd.com>
> >> ---
> >>  arch/arm/include/asm/kvm_mmu.h   |   12 ++++++
> >>  arch/arm/kvm/arm.c               |   17 +++++++-
> >>  arch/arm64/include/asm/kvm_arm.h |   17 +-------
> >>  arch/arm64/include/asm/kvm_mmu.h |   78 ++++++++++++++++++++++++++++++++++++++
> >>  arch/arm64/kvm/hyp-init.S        |   20 +++++++---
> >>  5 files changed, 122 insertions(+), 22 deletions(-)
> >>
> >> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> >> index 5c7aa3c..73f6ff6 100644
> >> --- a/arch/arm/include/asm/kvm_mmu.h
> >> +++ b/arch/arm/include/asm/kvm_mmu.h
> >> @@ -166,6 +166,18 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
> >>  
> >>  void stage2_flush_vm(struct kvm *kvm);
> >>  
> >> +static inline int kvm_get_phys_addr_shift(void)
> >> +{
> >> +	return KVM_PHYS_SHIFT;
> >> +}
> >> +
> >> +static inline int set_vttbr_baddr_mask(void)
> >> +{
> >> +	vttbr_baddr_mask = VTTBR_BADDR_MASK;
> >> +	return 0;
> >> +}
> >> +
> >> +
> >>  #endif	/* !__ASSEMBLY__ */
> >>  
> >>  #endif /* __ARM_KVM_MMU_H__ */
> >> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> >> index 3c82b37..f396eb7 100644
> >> --- a/arch/arm/kvm/arm.c
> >> +++ b/arch/arm/kvm/arm.c
> >> @@ -37,6 +37,7 @@
> >>  #include <asm/mman.h>
> >>  #include <asm/tlbflush.h>
> >>  #include <asm/cacheflush.h>
> >> +#include <asm/cputype.h>
> >>  #include <asm/virt.h>
> >>  #include <asm/kvm_arm.h>
> >>  #include <asm/kvm_asm.h>
> >> @@ -466,8 +467,14 @@ static void update_vttbr(struct kvm *kvm)
> >>  	/* update vttbr to be used with the new vmid */
> >>  	pgd_phys = virt_to_phys(kvm->arch.pgd);
> >>  	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
> >> -	kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
> >> -	kvm->arch.vttbr |= vmid;
> >> +
> >> +	/*
> >> +	 * If the VTTBR isn't aligned there is something wrong with the system
> >> +	 * or kernel.
> >> +	 */
> >> +	BUG_ON(pgd_phys & ~vttbr_baddr_mask);
> >> +
> >> +	kvm->arch.vttbr = pgd_phys | vmid;
> >>  
> >>  	spin_unlock(&kvm_vmid_lock);
> >>  }
> >> @@ -1052,6 +1059,12 @@ int kvm_arch_init(void *opaque)
> >>  		}
> >>  	}
> >>  
> >> +	err = set_vttbr_baddr_mask();
> >> +	if (err) {
> >> +		kvm_err("Cannot set vttbr_baddr_mask\n");
> >> +		return -EINVAL;
> >> +	}
> >> +
> >>  	cpu_notifier_register_begin();
> >>  
> >>  	err = init_hyp_mode();
> >> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
> >> index 3d69030..8dbef70 100644
> >> --- a/arch/arm64/include/asm/kvm_arm.h
> >> +++ b/arch/arm64/include/asm/kvm_arm.h
> >> @@ -94,7 +94,6 @@
> >>  /* TCR_EL2 Registers bits */
> >>  #define TCR_EL2_TBI	(1 << 20)
> >>  #define TCR_EL2_PS	(7 << 16)
> >> -#define TCR_EL2_PS_40B	(2 << 16)
> >>  #define TCR_EL2_TG0	(1 << 14)
> >>  #define TCR_EL2_SH0	(3 << 12)
> >>  #define TCR_EL2_ORGN0	(3 << 10)
> >> @@ -103,8 +102,6 @@
> >>  #define TCR_EL2_MASK	(TCR_EL2_TG0 | TCR_EL2_SH0 | \
> >>  			 TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
> >>  
> >> -#define TCR_EL2_FLAGS	(TCR_EL2_PS_40B)
> >> -
> >>  /* VTCR_EL2 Registers bits */
> >>  #define VTCR_EL2_PS_MASK	(7 << 16)
> >>  #define VTCR_EL2_TG0_MASK	(1 << 14)
> >> @@ -119,36 +116,28 @@
> >>  #define VTCR_EL2_SL0_MASK	(3 << 6)
> >>  #define VTCR_EL2_SL0_LVL1	(1 << 6)
> >>  #define VTCR_EL2_T0SZ_MASK	0x3f
> >> -#define VTCR_EL2_T0SZ_40B	24
> >> +#define VTCR_EL2_T0SZ(bits)	(64 - (bits))
> >>  
> >>  #ifdef CONFIG_ARM64_64K_PAGES
> >>  /*
> >>   * Stage2 translation configuration:
> >> - * 40bits output (PS = 2)
> >> - * 40bits input  (T0SZ = 24)
> >>   * 64kB pages (TG0 = 1)
> >>   * 2 level page tables (SL = 1)
> >>   */
> >>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
> >>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> >> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> >> -#define VTTBR_X		(38 - VTCR_EL2_T0SZ_40B)
> >> +				 VTCR_EL2_SL0_LVL1)
> >>  #else
> >>  /*
> >>   * Stage2 translation configuration:
> >> - * 40bits output (PS = 2)
> >> - * 40bits input  (T0SZ = 24)
> >>   * 4kB pages (TG0 = 0)
> >>   * 3 level page tables (SL = 1)
> >>   */
> >>  #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
> >>  				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> >> -				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> >> -#define VTTBR_X		(37 - VTCR_EL2_T0SZ_40B)
> >> +				 VTCR_EL2_SL0_LVL1)
> >>  #endif
> >>  
> >> -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
> >> -#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
> >>  #define VTTBR_VMID_SHIFT  (48LLU)
> >>  #define VTTBR_VMID_MASK	  (0xffLLU << VTTBR_VMID_SHIFT)
> >>  
> >> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> >> index 7d29847..b6ae83b 100644
> >> --- a/arch/arm64/include/asm/kvm_mmu.h
> >> +++ b/arch/arm64/include/asm/kvm_mmu.h
> >> @@ -152,5 +152,83 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
> >>  
> >>  void stage2_flush_vm(struct kvm *kvm);
> >>  
> >> +/*
> >> + * ARMv8 64K architecture limitations:
> >> + * 16 <= T0SZ <= 21 is valid under 3 level of translation tables
> >> + * 18 <= T0SZ <= 34 is valid under 2 level of translation tables
> >> + * 31 <= T0SZ <= 39 is valid under 1 level of translation tables
> >> + *
> >> + * ARMv8 4K architecture limitations:
> >> + * 16 <= T0SZ <= 24 is valid under 4 level of translation tables
> >> + * 21 <= T0SZ <= 33 is valid under 3 level of translation tables
> >> + * 30 <= T0SZ <= 39 is valid under 2 level of translation tables
> >> + *
> >> + * For 4K pages we only support 3 or 4 level, giving T0SZ a range of 16 to 33.
> >> + * For 64K pages we only support 2 or 3 level, giving T0SZ a range of 16 to 34.
> >> + *
> >> + * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out
> >> + * the origin of the hardcoded values, 38 and 37.
> >> + */
> >> +
> >> +#ifdef CONFIG_ARM64_64K_PAGES
> >> +static inline int t0sz_to_vttbr_x(int t0sz)
> >> +{
> >> +	if (t0sz < 16 || t0sz > 34) {
> >> +		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
> >> +		return 0;
> >> +	}
> >> +
> >> +	return 38 - t0sz;
> >> +}
> >> +#else /* 4K pages */
> >> +static inline int t0sz_to_vttbr_x(int t0sz)
> >> +{
> >> +	if (t0sz < 16 || t0sz > 33) {
> >> +		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
> >> +		return 0;
> > why not just return an error code directly here?  These are now
> > returning 0 on error which usually means everything was ok...
> The return is a value, not just an error code. Because of this, returning
> an error overloads that value.  0 just seemed like a convenient invalid
> value to check since a vttbr_x of 0 is invalid, but returning a negative
> error code would be equally invalid.  If this is the only issue it
> doesn't seem worth respinning the patch for, but I'll change it to
> -EINVAL if for some reason a v6 is needed.

Have you given up on doing the alignment check with the proper size on
the pgd allocation for this patch?

-Christoffer
Joel Schopp Aug. 19, 2014, 2:49 p.m. UTC | #4
>> The return is a value, not just an error code. Because of this, returning
>> an error overloads that value.  0 just seemed like a convenient invalid
>> value to check since a vttbr_x of 0 is invalid, but returning a negative
>> error code would be equally invalid.  If this is the only issue it
>> doesn't seem worth respinning the patch for, but I'll change it to
>> -EINVAL if for some reason a v6 is needed.
> Have you given up on doing the alignment check with the proper size on
> the pgd allocation for this patch?
Yes, I'd rather leave the extra check out of this patch.  If I were
changing the pgd allocation code I would make sure to add a check, or if
there were a static check there now I would update it for the dynamic
value from the hardware, but it seems unrelated to add several checks to
other parts of the code beyond those already in the patch.  I did leave
the functions in the headers such that checks like this could be added
when someone is updating the code for other reasons, say 4 level page
tables.

-Joel
Christoffer Dall Aug. 19, 2014, 2:59 p.m. UTC | #5
On Tue, Aug 19, 2014 at 09:49:07AM -0500, Joel Schopp wrote:
> 
> >> The return is a value, not just an error code. Because of this, returning
> >> an error overloads that value.  0 just seemed like a convenient invalid
> >> value to check since a vttbr_x of 0 is invalid, but returning a negative
> >> error code would be equally invalid.  If this is the only issue it
> >> doesn't seem worth respinning the patch for, but I'll change it to
> >> -EINVAL if for some reason a v6 is needed.
> > Have you given up on doing the alignment check with the proper size on
> > the pgd allocation for this patch?
> Yes, I'd rather leave the extra check out of this patch.  If I were
> changing the pgd allocation code I would make sure to add a check, or if
> there were a static check there now I would update it for the dynamic
> value from the hardware, but it seems unrelated to add several checks to
> other parts of the code beyond those already in the patch.  I did leave
> the functions in the headers such that checks like this could be added
> when someone is updating the code for other reasons, say 4 level page
> tables.
> 

Hmmm, the point is that we need to ensure that we have a properly
aligned PGD allocation; that's what this patch currently addresses, and as
you pointed out, the BUG_ON() just before trying to run a VM is not the
nicest solution - we should really be dealing with this properly at
allocation time.
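
To illustrate, an allocation-time check could take roughly this shape
(a sketch only; its exact placement in the pgd allocation path is
assumed, not settled here):

	pgd_phys = virt_to_phys(kvm->arch.pgd);
	if (pgd_phys & ~vttbr_baddr_mask) {
		kvm_err("stage2 pgd is not properly aligned\n");
		return -ENOMEM;
	}

so a misaligned allocation fails gracefully instead of hitting the
BUG_ON() when the VM first runs.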

But, if you don't have time to look at that, then ok, I'll have to pick
it up myself.

However, you are hinting that we do not support 4 levels of page tables,
yet you do allow the t0sz_to_vttbr_x function to pass even when using
t0sz values only supported under 4 levels of page tables...  What is
the rationale for that?

-Christoffer
Joel Schopp Aug. 19, 2014, 3:16 p.m. UTC | #6
> Hmmm, the point is that we need to ensure that we have a properly
> aligned PGD allocation; that's what this patch currently addresses, and as
> you pointed out, the BUG_ON() just before trying to run a VM is not the
> nicest solution - we should really be dealing with this properly at
> allocation time.
>
> But, if you don't have time to look at that, then ok, I'll have to pick
> it up myself.
>
> However, you are hinting that we do not support 4 levels of page tables,
> yet you do allow the t0sz_to_vttbr_x function to pass even when using
> t0sz values only supported under 4 levels of page tables...  What is
> the rationale for that?
Minimizing merge conflicts. I figure both are going in within the next
month or two.
Marc Zyngier Aug. 26, 2014, 5:02 p.m. UTC | #7
Hi Joel,

On Mon, Aug 18 2014 at  9:36:04 pm BST, Joel Schopp <joel.schopp@amd.com> wrote:
> The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
> systems.  Rather than just add a bit, it seems like a good time to also set
> things at run time instead of compile time to accommodate more hardware.
>
> This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask at run time,
> not compile time.
>
> In ARMv8, the EL2 physical address size (TCR_EL2.PS) and stage2 input address
> size (VTCR_EL2.T0SZ) cannot be determined at compile time since they
> depend on hardware capability.
>
> According to Table D4-23 and Table D4-25 in the ARM DDI 0487A.b document,
> vttbr_x is calculated using different fixed values with consideration
> of T0SZ, granule size and the level of translation tables. Therefore,
> vttbr_baddr_mask should be determined dynamically.
>
> Changes since v4:
> More minor cleanups from review
> Moved some functions into headers
>
> Changes since v3:
> Another rebase
> Addressed minor comments from v2
>
> Changes since v2:
> Rebased on https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next branch
>
> Changes since v1:
> Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to
> provide better long term fix.  Updated that patch to log error instead of
> silently fail on unaligned vttbr.
>
> Cc: Christoffer Dall <christoffer.dall@linaro.org>
> Cc: Sungjinn Chung <sungjinn.chung@samsung.com>
> Signed-off-by: Jungseok Lee <jays.lee@samsung.com>
> Signed-off-by: Joel Schopp <joel.schopp@amd.com>
> ---
>  arch/arm/include/asm/kvm_mmu.h   |   12 ++++++
>  arch/arm/kvm/arm.c               |   17 +++++++-
>  arch/arm64/include/asm/kvm_arm.h |   17 +-------
>  arch/arm64/include/asm/kvm_mmu.h |   78 ++++++++++++++++++++++++++++++++++++++
>  arch/arm64/kvm/hyp-init.S        |   20 +++++++---
>  5 files changed, 122 insertions(+), 22 deletions(-)
>
> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> index 5c7aa3c..73f6ff6 100644
> --- a/arch/arm/include/asm/kvm_mmu.h
> +++ b/arch/arm/include/asm/kvm_mmu.h
> @@ -166,6 +166,18 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>
>  void stage2_flush_vm(struct kvm *kvm);
>
> +static inline int kvm_get_phys_addr_shift(void)
> +{
> +       return KVM_PHYS_SHIFT;
> +}
> +
> +static inline int set_vttbr_baddr_mask(void)
> +{
> +       vttbr_baddr_mask = VTTBR_BADDR_MASK;

Have you tried compiling this?

Apart from the obvious missing definition of the variable, I'm not fond
of functions with side-effects hidden in an include file. What is wrong
with just returning the mask and letting the common code set it?
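
Something like this, roughly (a sketch of the suggested shape only; the
function name is assumed, not final):

static inline u64 get_vttbr_baddr_mask(void)
{
	return VTTBR_BADDR_MASK;
}

with the common code in kvm_arch_init() then doing:

	vttbr_baddr_mask = get_vttbr_baddr_mask();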

> +       return 0;
> +}
> +
> +
>  #endif /* !__ASSEMBLY__ */
>
>  #endif /* __ARM_KVM_MMU_H__ */
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 3c82b37..f396eb7 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -37,6 +37,7 @@
>  #include <asm/mman.h>
>  #include <asm/tlbflush.h>
>  #include <asm/cacheflush.h>
> +#include <asm/cputype.h>
>  #include <asm/virt.h>
>  #include <asm/kvm_arm.h>
>  #include <asm/kvm_asm.h>
> @@ -466,8 +467,14 @@ static void update_vttbr(struct kvm *kvm)
>         /* update vttbr to be used with the new vmid */
>         pgd_phys = virt_to_phys(kvm->arch.pgd);
>         vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
> -       kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
> -       kvm->arch.vttbr |= vmid;
> +
> +       /*
> +        * If the VTTBR isn't aligned there is something wrong with the system
> +        * or kernel.
> +        */
> +       BUG_ON(pgd_phys & ~vttbr_baddr_mask);
> +
> +       kvm->arch.vttbr = pgd_phys | vmid;
>
>         spin_unlock(&kvm_vmid_lock);
>  }
> @@ -1052,6 +1059,12 @@ int kvm_arch_init(void *opaque)
>                 }
>         }
>
> +       err = set_vttbr_baddr_mask();
> +       if (err) {
> +               kvm_err("Cannot set vttbr_baddr_mask\n");
> +               return -EINVAL;
> +       }
> +
>         cpu_notifier_register_begin();
>
>         err = init_hyp_mode();
> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
> index 3d69030..8dbef70 100644
> --- a/arch/arm64/include/asm/kvm_arm.h
> +++ b/arch/arm64/include/asm/kvm_arm.h
> @@ -94,7 +94,6 @@
>  /* TCR_EL2 Registers bits */
>  #define TCR_EL2_TBI    (1 << 20)
>  #define TCR_EL2_PS     (7 << 16)
> -#define TCR_EL2_PS_40B (2 << 16)
>  #define TCR_EL2_TG0    (1 << 14)
>  #define TCR_EL2_SH0    (3 << 12)
>  #define TCR_EL2_ORGN0  (3 << 10)
> @@ -103,8 +102,6 @@
>  #define TCR_EL2_MASK   (TCR_EL2_TG0 | TCR_EL2_SH0 | \
>                          TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
>
> -#define TCR_EL2_FLAGS  (TCR_EL2_PS_40B)
> -
>  /* VTCR_EL2 Registers bits */
>  #define VTCR_EL2_PS_MASK       (7 << 16)
>  #define VTCR_EL2_TG0_MASK      (1 << 14)
> @@ -119,36 +116,28 @@
>  #define VTCR_EL2_SL0_MASK      (3 << 6)
>  #define VTCR_EL2_SL0_LVL1      (1 << 6)
>  #define VTCR_EL2_T0SZ_MASK     0x3f
> -#define VTCR_EL2_T0SZ_40B      24
> +#define VTCR_EL2_T0SZ(bits)    (64 - (bits))
>
>  #ifdef CONFIG_ARM64_64K_PAGES
>  /*
>   * Stage2 translation configuration:
> - * 40bits output (PS = 2)
> - * 40bits input  (T0SZ = 24)
>   * 64kB pages (TG0 = 1)
>   * 2 level page tables (SL = 1)
>   */
>  #define VTCR_EL2_FLAGS         (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
>                                  VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> -                                VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> -#define VTTBR_X                (38 - VTCR_EL2_T0SZ_40B)
> +                                VTCR_EL2_SL0_LVL1)
>  #else
>  /*
>   * Stage2 translation configuration:
> - * 40bits output (PS = 2)
> - * 40bits input  (T0SZ = 24)
>   * 4kB pages (TG0 = 0)
>   * 3 level page tables (SL = 1)
>   */
>  #define VTCR_EL2_FLAGS         (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
>                                  VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> -                                VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> -#define VTTBR_X                (37 - VTCR_EL2_T0SZ_40B)
> +                                VTCR_EL2_SL0_LVL1)
>  #endif
>
> -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
> -#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
>  #define VTTBR_VMID_SHIFT  (48LLU)
>  #define VTTBR_VMID_MASK          (0xffLLU << VTTBR_VMID_SHIFT)
>
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index 7d29847..b6ae83b 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -152,5 +152,83 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>
>  void stage2_flush_vm(struct kvm *kvm);
>
> +/*
> + * ARMv8 64K architecture limitations:
> + * 16 <= T0SZ <= 21 is valid under 3 level of translation tables
> + * 18 <= T0SZ <= 34 is valid under 2 level of translation tables
> + * 31 <= T0SZ <= 39 is valid under 1 level of translation tables
> + *
> + * ARMv8 4K architecture limitations:
> + * 16 <= T0SZ <= 24 is valid under 4 level of translation tables
> + * 21 <= T0SZ <= 33 is valid under 3 level of translation tables
> + * 30 <= T0SZ <= 39 is valid under 2 level of translation tables
> + *
> + * For 4K pages we only support 3 or 4 level, giving T0SZ a range of 16 to 33.
> + * For 64K pages we only support 2 or 3 level, giving T0SZ a range of 16 to 34.
> + *
> + * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out
> + * the origin of the hardcoded values, 38 and 37.
> + */
> +
> +#ifdef CONFIG_ARM64_64K_PAGES
> +static inline int t0sz_to_vttbr_x(int t0sz)
> +{
> +       if (t0sz < 16 || t0sz > 34) {
> +               kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
> +               return 0;

0 is definitely a bad value for something that is an error
case. Consider -EINVAL instead.

Also, what if we're in a range that only deals with more levels of page
tables than the kernel can deal with (remember we use the kernel page
table accessors)? See the new ARM64_VA_BITS and ARM64_PGTABLE_LEVELS
symbols that are now available, and use them to validate the range you
have.

> +       }
> +
> +       return 38 - t0sz;
> +}
> +#else /* 4K pages */
> +static inline int t0sz_to_vttbr_x(int t0sz)
> +{
> +       if (t0sz < 16 || t0sz > 33) {
> +               kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
> +               return 0;

Same here.

> +       }
> +       return 37 - t0sz;
> +}
> +#endif
> +static inline int kvm_get_phys_addr_shift(void)
> +{
> +       int pa_range = read_cpuid(ID_AA64MMFR0_EL1) & 0xf;
> +
> +       switch (pa_range) {
> +       case 0: return 32;
> +       case 1: return 36;
> +       case 2: return 40;
> +       case 3: return 42;
> +       case 4: return 44;
> +       case 5: return 48;
> +       default:
> +               BUG();
> +               return 0;
> +       }
> +}
> +
> +static u64 vttbr_baddr_mask;

Now every compilation unit that includes kvm_mmu.h has an instance of
this variable. I doubt that it is the intended effect.

> +
> +/**
> + * set_vttbr_baddr_mask - set mask value for vttbr base address
> + *
> + * In ARMv8, vttbr_baddr_mask cannot be determined in compile time since the
> + * stage2 input address size depends on hardware capability. Thus, we first
> + * need to read ID_AA64MMFR0_EL1.PARange and then set vttbr_baddr_mask with
> + * consideration of both the granule size and the level of translation tables.
> + */
> +static inline int set_vttbr_baddr_mask(void)
> +{
> +       int t0sz, vttbr_x;
> +
> +       t0sz = VTCR_EL2_T0SZ(kvm_get_phys_addr_shift());
> +       vttbr_x = t0sz_to_vttbr_x(t0sz);
> +       if (!vttbr_x)
> +               return -EINVAL;
> +       vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1));

I think this can now be written as GENMASK_ULL(48, (vttbr_x - 1)).

> +       return 0;
> +}
> +
>  #endif /* __ASSEMBLY__ */
>  #endif /* __ARM64_KVM_MMU_H__ */
> diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
> index d968796..c0f7634 100644
> --- a/arch/arm64/kvm/hyp-init.S
> +++ b/arch/arm64/kvm/hyp-init.S
> @@ -63,17 +63,21 @@ __do_hyp_init:
>         mrs     x4, tcr_el1
>         ldr     x5, =TCR_EL2_MASK
>         and     x4, x4, x5
> -       ldr     x5, =TCR_EL2_FLAGS
> -       orr     x4, x4, x5
> -       msr     tcr_el2, x4
> -
> -       ldr     x4, =VTCR_EL2_FLAGS
>         /*
>          * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
> -        * VTCR_EL2.
> +        * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2.
>          */
>         mrs     x5, ID_AA64MMFR0_EL1
>         bfi     x4, x5, #16, #3
> +       msr     tcr_el2, x4
> +
> +       ldr     x4, =VTCR_EL2_FLAGS
> +       bfi     x4, x5, #16, #3
> +       and     x5, x5, #0xf
> +       adr     x6, t0sz
> +       add     x6, x6, x5, lsl #2
> +       ldr     w5, [x6]
> +       orr     x4, x4, x5

You'll need to validate the T0SZ value, and possibly adjust it so that
it is compatible with the addressing capability of the kernel. That
probably requires a slight change of the hyp-init API.

>         msr     vtcr_el2, x4
>
>         mrs     x4, mair_el1
> @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */
>
>         /* Hello, World! */
>         eret
> +
> +t0sz:
> +       .word   VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40)
> +       .word   VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48)
>  ENDPROC(__kvm_hyp_init)
>
>         .ltorg
>

Another element that doesn't appear in this patch is that we need a way
for the kernel to expose the maximum input address to userspace (and
validate that no one puts memory outside of that range). This should be a
separate patch, but it is conceptually tied to the same problem.

Thanks,

        M.
Joel Schopp Aug. 26, 2014, 6:35 p.m. UTC | #8
>> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
>> index 5c7aa3c..73f6ff6 100644
>> --- a/arch/arm/include/asm/kvm_mmu.h
>> +++ b/arch/arm/include/asm/kvm_mmu.h
>> @@ -166,6 +166,18 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>>
>>  void stage2_flush_vm(struct kvm *kvm);
>>
>> +static inline int kvm_get_phys_addr_shift(void)
>> +{
>> +       return KVM_PHYS_SHIFT;
>> +}
>> +
>> +static inline int set_vttbr_baddr_mask(void)
>> +{
>> +       vttbr_baddr_mask = VTTBR_BADDR_MASK;
> Have you tried compiling this?
>
> Apart from the obvious missing definition of the variable, I'm not fond
> of functions with side-effects hidden in an include file. What is wrong
> with just returning the mask and letting the common code set it?
I like that change, will do in v6.

>> +#ifdef CONFIG_ARM64_64K_PAGES
>> +static inline int t0sz_to_vttbr_x(int t0sz)
>> +{
>> +       if (t0sz < 16 || t0sz > 34) {
>> +               kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
>> +               return 0;
> 0 is definitely a bad value for something that is an error
> case. Consider -EINVAL instead.
OK.
>
> Also, what if we're in a range that only deals with more levels of page
> tables than the kernel can deal with (remember we use the kernel page
> table accessors)? See the new ARM64_VA_BITS and ARM64_PGTABLE_LEVELS
> symbols that are now available, and use them to validate the range you
> have.  
With the simple current tests I can look at them and see they are
correct, even if I can't make a scenario to test that they would fail. 
However, if I add in more complicated checks I'd really like to test
them catching the failure cases.  Can you describe a case where we can
boot a kernel and then have the checks still fail in kvm? 

>
>> +       }
>> +       return 37 - t0sz;
>> +}
>> +#endif
>> +static inline int kvm_get_phys_addr_shift(void)
>> +{
>> +       int pa_range = read_cpuid(ID_AA64MMFR0_EL1) & 0xf;
>> +
>> +       switch (pa_range) {
>> +       case 0: return 32;
>> +       case 1: return 36;
>> +       case 2: return 40;
>> +       case 3: return 42;
>> +       case 4: return 44;
>> +       case 5: return 48;
>> +       default:
>> +               BUG();
>> +               return 0;
>> +       }
>> +}
>> +
>> +static u64 vttbr_baddr_mask;
> Now every compilation unit that includes kvm_mmu.h has an instance of
> this variable. I doubt that it is the intended effect.
The change from the comment farther above, returning the mask and
having the common code set it, should resolve this as well.

>
>> +
>> +/**
>> + * set_vttbr_baddr_mask - set mask value for vttbr base address
>> + *
>> + * In ARMv8, vttbr_baddr_mask cannot be determined in compile time since the
>> + * stage2 input address size depends on hardware capability. Thus, we first
>> + * need to read ID_AA64MMFR0_EL1.PARange and then set vttbr_baddr_mask with
>> + * consideration of both the granule size and the level of translation tables.
>> + */
>> +static inline int set_vttbr_baddr_mask(void)
>> +{
>> +       int t0sz, vttbr_x;
>> +
>> +       t0sz = VTCR_EL2_T0SZ(kvm_get_phys_addr_shift());
>> +       vttbr_x = t0sz_to_vttbr_x(t0sz);
>> +       if (!vttbr_x)
>> +               return -EINVAL;
>> +       vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1));
> I think this can now be written as GENMASK_ULL(48, (vttbr_x - 1)).
That does improve readability, I like it.

>
>> +       return 0;
>> +}
>> +
>>  #endif /* __ASSEMBLY__ */
>>  #endif /* __ARM64_KVM_MMU_H__ */
>> diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
>> index d968796..c0f7634 100644
>> --- a/arch/arm64/kvm/hyp-init.S
>> +++ b/arch/arm64/kvm/hyp-init.S
>> @@ -63,17 +63,21 @@ __do_hyp_init:
>>         mrs     x4, tcr_el1
>>         ldr     x5, =TCR_EL2_MASK
>>         and     x4, x4, x5
>> -       ldr     x5, =TCR_EL2_FLAGS
>> -       orr     x4, x4, x5
>> -       msr     tcr_el2, x4
>> -
>> -       ldr     x4, =VTCR_EL2_FLAGS
>>         /*
>>          * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
>> -        * VTCR_EL2.
>> +        * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2.
>>          */
>>         mrs     x5, ID_AA64MMFR0_EL1
>>         bfi     x4, x5, #16, #3
>> +       msr     tcr_el2, x4
>> +
>> +       ldr     x4, =VTCR_EL2_FLAGS
>> +       bfi     x4, x5, #16, #3
>> +       and     x5, x5, #0xf
>> +       adr     x6, t0sz
>> +       add     x6, x6, x5, lsl #2
>> +       ldr     w5, [x6]
>> +       orr     x4, x4, x5
> You'll need to validate the T0SZ value, and possibly adjust it so that
> it is compatible with the addressing capability of the kernel. That
> probably requires a slight change of the hyp-init API.
In order to do that I really should test that path. Can you think of a
way to generate a t0sz value that is incompatible with the kernel, but
the kernel still boots so I can load kvm and test it?

>
>>         msr     vtcr_el2, x4
>>
>>         mrs     x4, mair_el1
>> @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */
>>
>>         /* Hello, World! */
>>         eret
>> +
>> +t0sz:
>> +       .word   VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40)
>> +       .word   VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48)
>>  ENDPROC(__kvm_hyp_init)
>>
>>         .ltorg
>>
> Another element that doesn't appear in this patch is that we need a way
> for the kernel to expose the maximum input address to userspace (and
> validate that no one puts memory outside of that range). This should be a
> separate patch, but it is conceptually tied to the same problem.
I do think that separate patch would be a nice addition and probably
dependent on this, but I do think this patch is useful on its own,
especially since the current VTTBR_BADDR_MASK is broken.
Marc Zyngier Aug. 29, 2014, 10:06 a.m. UTC | #9
On Tue, Aug 26 2014 at  7:35:21 pm BST, Joel Schopp <joel.schopp@amd.com> wrote:
>>> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
>>> index 5c7aa3c..73f6ff6 100644
>>> --- a/arch/arm/include/asm/kvm_mmu.h
>>> +++ b/arch/arm/include/asm/kvm_mmu.h
>>> @@ -166,6 +166,18 @@ static inline void
>>> coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
>>>
>>>  void stage2_flush_vm(struct kvm *kvm);
>>>
>>> +static inline int kvm_get_phys_addr_shift(void)
>>> +{
>>> +       return KVM_PHYS_SHIFT;
>>> +}
>>> +
>>> +static inline int set_vttbr_baddr_mask(void)
>>> +{
>>> +       vttbr_baddr_mask = VTTBR_BADDR_MASK;
>> Have you tried compiling this?
>>
>> Apart from the obvious missing definition of the variable, I'm not fond
>> of functions with side-effects hidden in an include file. What is wrong
>> with just returning the mask and letting the common code set it?
> I like that change, will do in v6.
>
>>> +#ifdef CONFIG_ARM64_64K_PAGES
>>> +static inline int t0sz_to_vttbr_x(int t0sz)
>>> +{
>>> +       if (t0sz < 16 || t0sz > 34) {
>>> +               kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
>>> +               return 0;
>> 0 is definitely a bad value for something that is an error
>> case. Consider -EINVAL instead.
> OK.
>>
>> Also, what if we're in a range that only deals with more levels of page
>> tables than the kernel can deal with (remember we use the kernel page
>> table accessors)? See the new ARM64_VA_BITS and ARM64_PGTABLE_LEVELS
>> symbols that are now available, and use them to validate the range you
>> have.  
> With the simple current tests I can look at them and see they are
> correct, even if I can't make a scenario to test that they would fail. 
> However, if I add in more complicated checks I'd really like to test
> them catching the failure cases.  Can you describe a case where we can
> boot a kernel and then have the checks still fail in kvm? 

You're looking at T0SZ values that are outside of what a given
configuration of the kernel can handle (T0SZ == 16, for example, is only
valid with 3 levels of page tables, and the current implementation only
deals with 2). This is a case where things may explode, as you allow
input addresses that we simply cannot support, and in this case you want
to cap T0SZ to something sensible that is compatible with what the
kernel (and thus KVM) can express in terms of translations.
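
A rough sketch of that capping (assuming CONFIG_ARM64_VA_BITS reflects
what the kernel page-table accessors can address; a smaller T0SZ means
a wider input range):

	int t0sz = VTCR_EL2_T0SZ(kvm_get_phys_addr_shift());

	/* Never configure a stage2 input range wider than what the
	 * kernel itself can map. */
	if (t0sz < VTCR_EL2_T0SZ(CONFIG_ARM64_VA_BITS))
		t0sz = VTCR_EL2_T0SZ(CONFIG_ARM64_VA_BITS);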

Testing is another thing, and I don't expect you to be able to test
every possible combination. Nonetheless, I do expect some reasonable
effort in the code to avoid deadly situations.

>>
>>> +       }
>>> +       return 37 - t0sz;
>>> +}
>>> +#endif
>>> +static inline int kvm_get_phys_addr_shift(void)
>>> +{
>>> +       int pa_range = read_cpuid(ID_AA64MMFR0_EL1) & 0xf;
>>> +
>>> +       switch (pa_range) {
>>> +       case 0: return 32;
>>> +       case 1: return 36;
>>> +       case 2: return 40;
>>> +       case 3: return 42;
>>> +       case 4: return 44;
>>> +       case 5: return 48;
>>> +       default:
>>> +               BUG();
>>> +               return 0;
>>> +       }
>>> +}
>>> +
>>> +static u64 vttbr_baddr_mask;
>> Now every compilation unit that includes kvm_mmu.h has an instance of
>> this variable. I doubt that it is the intended effect.
> The change from the comment farther above, returning the mask and
> having the common code set it, should resolve this as well.
>
>>
>>> +
>>> +/**
>>> + * set_vttbr_baddr_mask - set mask value for vttbr base address
>>> + *
>>> + * In ARMv8, vttbr_baddr_mask cannot be determined in compile time since the
>>> + * stage2 input address size depends on hardware capability. Thus, we first
>>> + * need to read ID_AA64MMFR0_EL1.PARange and then set vttbr_baddr_mask with
>>> + * consideration of both the granule size and the level of
>>> translation tables.
>>> + */
>>> +static inline int set_vttbr_baddr_mask(void)
>>> +{
>>> +       int t0sz, vttbr_x;
>>> +
>>> +       t0sz = VTCR_EL2_T0SZ(kvm_get_phys_addr_shift());
>>> +       vttbr_x = t0sz_to_vttbr_x(t0sz);
>>> +       if (!vttbr_x)
>>> +               return -EINVAL;
>>> +       vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1));
>> I think this can now be written as GENMASK_ULL(48, (vttbr_x - 1)).
> That does improve readability, I like it.
>
>>
>>> +       return 0;
>>> +}
>>> +
>>>  #endif /* __ASSEMBLY__ */
>>>  #endif /* __ARM64_KVM_MMU_H__ */
>>> diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
>>> index d968796..c0f7634 100644
>>> --- a/arch/arm64/kvm/hyp-init.S
>>> +++ b/arch/arm64/kvm/hyp-init.S
>>> @@ -63,17 +63,21 @@ __do_hyp_init:
>>>         mrs     x4, tcr_el1
>>>         ldr     x5, =TCR_EL2_MASK
>>>         and     x4, x4, x5
>>> -       ldr     x5, =TCR_EL2_FLAGS
>>> -       orr     x4, x4, x5
>>> -       msr     tcr_el2, x4
>>> -
>>> -       ldr     x4, =VTCR_EL2_FLAGS
>>>         /*
>>>          * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
>>> -        * VTCR_EL2.
>>> +        * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2.
>>>          */
>>>         mrs     x5, ID_AA64MMFR0_EL1
>>>         bfi     x4, x5, #16, #3
>>> +       msr     tcr_el2, x4
>>> +
>>> +       ldr     x4, =VTCR_EL2_FLAGS
>>> +       bfi     x4, x5, #16, #3
>>> +       and     x5, x5, #0xf
>>> +       adr     x6, t0sz
>>> +       add     x6, x6, x5, lsl #2
>>> +       ldr     w5, [x6]
>>> +       orr     x4, x4, x5
>> You'll need to validate the T0SZ value, and possibly adjust it so that
>> it is compatible with the addressing capability of the kernel. That
>> probably requires a slight change of the hyp-init API.
> In order to do that I really should test that path. Can you think of a
> way to generate a t0sz value that is incompatible with the kernel, but
> the kernel still boots so I can load kvm and test it?
>
>>
>>>         msr     vtcr_el2, x4
>>>
>>>         mrs     x4, mair_el1
>>> @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */
>>>
>>>         /* Hello, World! */
>>>         eret
>>> +
>>> +t0sz:
>>> +       .word   VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40)
>>> +       .word   VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48)
>>>  ENDPROC(__kvm_hyp_init)
>>>
>>>         .ltorg
>>>
>> Another element that doesn't appear in this patch is that we need a way
>> for the kernel to expose the maximum input address to userspace (and
>> validate that no one puts memory outside of that range). This should be a
>> separate patch, but it is conceptually tied to the same problem.
> I do think that separate patch would be a nice addition, and probably
> dependent on this one, but this patch is useful on its own, especially
> since the current VTTBR_BADDR_MASK is broken.
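The validation half could be quite small, e.g. (untested, and
kvm_phys_shift is a made-up variable for whatever we end up deriving
from PARange) in the arm kvm_arch_prepare_memory_region() hook:

/* Reject slots that end beyond the stage2 input address range. */
if (mem->guest_phys_addr + mem->memory_size > (1ULL << kvm_phys_shift))
        return -EINVAL;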

This patch is indeed useful on its own, but I want to see a complete
solution, not a set of point fixes.

Thanks,

	M.

Patch

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 5c7aa3c..73f6ff6 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -166,6 +166,18 @@  static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
 
 void stage2_flush_vm(struct kvm *kvm);
 
+static inline int kvm_get_phys_addr_shift(void)
+{
+	return KVM_PHYS_SHIFT;
+}
+
+static inline int set_vttbr_baddr_mask(void)
+{
+	vttbr_baddr_mask = VTTBR_BADDR_MASK;
+	return 0;
+}
+
+
 #endif	/* !__ASSEMBLY__ */
 
 #endif /* __ARM_KVM_MMU_H__ */
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 3c82b37..f396eb7 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -37,6 +37,7 @@ 
 #include <asm/mman.h>
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
+#include <asm/cputype.h>
 #include <asm/virt.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_asm.h>
@@ -466,8 +467,14 @@  static void update_vttbr(struct kvm *kvm)
 	/* update vttbr to be used with the new vmid */
 	pgd_phys = virt_to_phys(kvm->arch.pgd);
 	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
-	kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
-	kvm->arch.vttbr |= vmid;
+
+	/*
+	 * If the VTTBR isn't aligned there is something wrong with the system
+	 * or kernel.
+	 */
+	BUG_ON(pgd_phys & ~vttbr_baddr_mask);
+
+	kvm->arch.vttbr = pgd_phys | vmid;
 
 	spin_unlock(&kvm_vmid_lock);
 }
@@ -1052,6 +1059,12 @@  int kvm_arch_init(void *opaque)
 		}
 	}
 
+	err = set_vttbr_baddr_mask();
+	if (err) {
+		kvm_err("Cannot set vttbr_baddr_mask\n");
+		return -EINVAL;
+	}
+
 	cpu_notifier_register_begin();
 
 	err = init_hyp_mode();
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 3d69030..8dbef70 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -94,7 +94,6 @@ 
 /* TCR_EL2 Registers bits */
 #define TCR_EL2_TBI	(1 << 20)
 #define TCR_EL2_PS	(7 << 16)
-#define TCR_EL2_PS_40B	(2 << 16)
 #define TCR_EL2_TG0	(1 << 14)
 #define TCR_EL2_SH0	(3 << 12)
 #define TCR_EL2_ORGN0	(3 << 10)
@@ -103,8 +102,6 @@ 
 #define TCR_EL2_MASK	(TCR_EL2_TG0 | TCR_EL2_SH0 | \
 			 TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
 
-#define TCR_EL2_FLAGS	(TCR_EL2_PS_40B)
-
 /* VTCR_EL2 Registers bits */
 #define VTCR_EL2_PS_MASK	(7 << 16)
 #define VTCR_EL2_TG0_MASK	(1 << 14)
@@ -119,36 +116,28 @@ 
 #define VTCR_EL2_SL0_MASK	(3 << 6)
 #define VTCR_EL2_SL0_LVL1	(1 << 6)
 #define VTCR_EL2_T0SZ_MASK	0x3f
-#define VTCR_EL2_T0SZ_40B	24
+#define VTCR_EL2_T0SZ(bits)	(64 - (bits))
 
 #ifdef CONFIG_ARM64_64K_PAGES
 /*
  * Stage2 translation configuration:
- * 40bits output (PS = 2)
- * 40bits input  (T0SZ = 24)
  * 64kB pages (TG0 = 1)
  * 2 level page tables (SL = 1)
  */
 #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
 				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
-				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
-#define VTTBR_X		(38 - VTCR_EL2_T0SZ_40B)
+				 VTCR_EL2_SL0_LVL1)
 #else
 /*
  * Stage2 translation configuration:
- * 40bits output (PS = 2)
- * 40bits input  (T0SZ = 24)
  * 4kB pages (TG0 = 0)
  * 3 level page tables (SL = 1)
  */
 #define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
 				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
-				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
-#define VTTBR_X		(37 - VTCR_EL2_T0SZ_40B)
+				 VTCR_EL2_SL0_LVL1)
 #endif
 
-#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
-#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
 #define VTTBR_VMID_SHIFT  (48LLU)
 #define VTTBR_VMID_MASK	  (0xffLLU << VTTBR_VMID_SHIFT)
 
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 7d29847..b6ae83b 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -152,5 +152,83 @@  static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
 
 void stage2_flush_vm(struct kvm *kvm);
 
+/*
+ * ARMv8 64K architecture limitations:
+ * 16 <= T0SZ <= 21 is valid under 3 levels of translation tables
+ * 18 <= T0SZ <= 34 is valid under 2 levels of translation tables
+ * 31 <= T0SZ <= 39 is valid under 1 level of translation tables
+ *
+ * ARMv8 4K architecture limitations:
+ * 16 <= T0SZ <= 24 is valid under 4 levels of translation tables
+ * 21 <= T0SZ <= 33 is valid under 3 levels of translation tables
+ * 30 <= T0SZ <= 39 is valid under 2 levels of translation tables
+ *
+ * For 4K pages we only support 3 or 4 level, giving T0SZ a range of 16 to 33.
+ * For 64K pages we only support 2 or 3 level, giving T0SZ a range of 16 to 34.
+ *
+ * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out
+ * the origin of the hardcoded values, 38 and 37.
+ */
+
+#ifdef CONFIG_ARM64_64K_PAGES
+static inline int t0sz_to_vttbr_x(int t0sz)
+{
+	if (t0sz < 16 || t0sz > 34) {
+		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
+		return 0;
+	}
+
+	return 38 - t0sz;
+}
+#else /* 4K pages */
+static inline int t0sz_to_vttbr_x(int t0sz)
+{
+	if (t0sz < 16 || t0sz > 33) {
+		kvm_err("Cannot support %d-bit address space\n", 64 - t0sz);
+		return 0;
+	}
+	return 37 - t0sz;
+}
+#endif
+static inline int kvm_get_phys_addr_shift(void)
+{
+	int pa_range = read_cpuid(ID_AA64MMFR0_EL1) & 0xf;
+
+	switch (pa_range) {
+	case 0: return 32;
+	case 1: return 36;
+	case 2: return 40;
+	case 3: return 42;
+	case 4: return 44;
+	case 5: return 48;
+	default:
+		BUG();
+		return 0;
+	}
+}
+
+static u64 vttbr_baddr_mask;
+
+/**
+ * set_vttbr_baddr_mask - set mask value for vttbr base address
+ *
+ * In ARMv8, vttbr_baddr_mask cannot be determined at compile time since the
+ * stage2 input address size depends on hardware capability. Thus, we first
+ * need to read ID_AA64MMFR0_EL1.PARange and then set vttbr_baddr_mask with
+ * consideration of both the granule size and the level of translation tables.
+ */
+static inline int set_vttbr_baddr_mask(void)
+{
+	int t0sz, vttbr_x;
+
+	t0sz = VTCR_EL2_T0SZ(kvm_get_phys_addr_shift());
+	vttbr_x = t0sz_to_vttbr_x(t0sz);
+	if (!vttbr_x)
+		return -EINVAL;
+	vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1));
+
+	return 0;
+}
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index d968796..c0f7634 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -63,17 +63,21 @@  __do_hyp_init:
 	mrs	x4, tcr_el1
 	ldr	x5, =TCR_EL2_MASK
 	and	x4, x4, x5
-	ldr	x5, =TCR_EL2_FLAGS
-	orr	x4, x4, x5
-	msr	tcr_el2, x4
-
-	ldr	x4, =VTCR_EL2_FLAGS
 	/*
 	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
-	 * VTCR_EL2.
+	 * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2.
 	 */
 	mrs	x5, ID_AA64MMFR0_EL1
 	bfi	x4, x5, #16, #3
+	msr	tcr_el2, x4
+
+	ldr	x4, =VTCR_EL2_FLAGS
+	bfi	x4, x5, #16, #3
+	and	x5, x5, #0xf
+	adr	x6, t0sz
+	add	x6, x6, x5, lsl #2
+	ldr	w5, [x6]
+	orr	x4, x4, x5
 	msr	vtcr_el2, x4
 
 	mrs	x4, mair_el1
@@ -109,6 +113,10 @@  target: /* We're now in the trampoline code, switch page tables */
 
 	/* Hello, World! */
 	eret
+
+t0sz:
+	.word	VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40)
+	.word	VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48)
 ENDPROC(__kvm_hyp_init)
 
 	.ltorg