Message ID | 20140804143845.2851.85441.stgit@joelaarch64.amd.com |
---|---|
State | New |
Headers | show |
Since this fixes a real problem and didn't make it into 3.16 it would be good if this made it into 3.17. -Joel On 08/04/2014 09:38 AM, Joel Schopp wrote: > The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current > systems. Rather than just add a bit it seems like a good time to also set > things at run-time instead of compile time to accomodate more hardware. > > This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask in runtime, > not compile time. > > In ARMv8, EL2 physical address size (TCR_EL2.PS) and stage2 input address > size (VTCR_EL2.T0SZE) cannot be determined in compile time since they > depends on hardware capability. > > According to Table D4-23 and Table D4-25 in ARM DDI 0487A.b document, > vttbr_x is calculated using different hard-coded values with consideration > of T0SZ, granule size and the level of translation tables. Therefore, > vttbr_baddr_mask should be determined dynamically. > > Changes since v2: > Rebased on https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next branch > > Changes since v1: > Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to > provide better long term fix. Updated that patch to log error instead of > silently fail on unaligned vttbr. 
> > Cc: Christoffer Dall <christoffer.dall@linaro.org> > Cc: Sungjinn Chung <sungjinn.chung@samsung.com> > Signed-off-by: Jungseok Lee <jays.lee@samsung.com> > Signed-off-by: Joel Schopp <joel.schopp@amd.com> > --- > arch/arm/kvm/arm.c | 91 +++++++++++++++++++++++++++++++++++++- > arch/arm64/include/asm/kvm_arm.h | 17 +------ > arch/arm64/kvm/hyp-init.S | 20 ++++++-- > 3 files changed, 106 insertions(+), 22 deletions(-) > > diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c > index d7424ef..d7ca2f5 100644 > --- a/arch/arm/kvm/arm.c > +++ b/arch/arm/kvm/arm.c > @@ -37,6 +37,7 @@ > #include <asm/mman.h> > #include <asm/tlbflush.h> > #include <asm/cacheflush.h> > +#include <asm/cputype.h> > #include <asm/virt.h> > #include <asm/kvm_arm.h> > #include <asm/kvm_asm.h> > @@ -61,6 +62,9 @@ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); > static u8 kvm_next_vmid; > static DEFINE_SPINLOCK(kvm_vmid_lock); > > +/* VTTBR mask cannot be determined in complie time under ARMv8 */ > +static u64 vttbr_baddr_mask; > + > static bool vgic_present; > > static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) > @@ -376,6 +380,75 @@ static bool need_new_vmid_gen(struct kvm *kvm) > } > > /** > + * set_vttbr_baddr_mask - set mask value for vttbr base address > + * > + * In ARMv8, vttbr_baddr_mask cannot be determined in compile time since stage2 > + * input address size depends on hardware capability. Thus, it is needed to read > + * ID_AA64MMFR0_EL1.PARange first and then set vttbr_baddr_mask with > + * consideration of both granule size and the level of translation tables. 
> + */ > +static int set_vttbr_baddr_mask(void) > +{ > +#ifndef CONFIG_ARM64 > + vttbr_baddr_mask = VTTBR_BADDR_MASK; > +#else > + int pa_range, t0sz, vttbr_x; > + > + pa_range = read_cpuid(ID_AA64MMFR0_EL1) & 0xf; > + > + switch (pa_range) { > + case 0: > + t0sz = VTCR_EL2_T0SZ(32); > + break; > + case 1: > + t0sz = VTCR_EL2_T0SZ(36); > + break; > + case 2: > + t0sz = VTCR_EL2_T0SZ(40); > + break; > + case 3: > + t0sz = VTCR_EL2_T0SZ(42); > + break; > + case 4: > + t0sz = VTCR_EL2_T0SZ(44); > + break; > + default: > + t0sz = VTCR_EL2_T0SZ(48); > + } > + > + /* > + * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out > + * the origin of the hardcoded values, 38 and 37. > + */ > +#ifdef CONFIG_ARM64_64K_PAGES > + /* > + * 16 <= T0SZ <= 21 is valid under 3 level of translation tables > + * 18 <= T0SZ <= 34 is valid under 2 level of translation tables > + * 31 <= T0SZ <= 39 is valid under 1 level of transltaion tables > + */ > + if (t0sz <= 17) { > + kvm_err("Cannot support %d-bit address space\n", 64 - t0sz); > + return -EINVAL; > + } > + vttbr_x = 38 - t0sz; > +#else > + /* > + * 16 <= T0SZ <= 24 is valid under 4 level of translation tables > + * 21 <= T0SZ <= 30 is valid under 3 level of translation tables > + * 30 <= T0SZ <= 39 is valid under 2 level of translation tables > + */ > + if (t0sz <= 20) { > + kvm_err("Cannot support %d-bit address space\n", 64 - t0sz); > + return -EINVAL; > + } > + vttbr_x = 37 - t0sz; > +#endif > + vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1)); > +#endif > + return 0; > +} > + > +/** > * update_vttbr - Update the VTTBR with a valid VMID before the guest runs > * @kvm The guest that we are about to run > * > @@ -429,8 +502,16 @@ static void update_vttbr(struct kvm *kvm) > /* update vttbr to be used with the new vmid */ > pgd_phys = virt_to_phys(kvm->arch.pgd); > vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK; > - kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK; > - 
kvm->arch.vttbr |= vmid; > + > + /* > + * If the VTTBR isn't aligned there is something wrong with the system > + * or kernel. It is better to just fail and not mask it. But no need > + * to panic the host kernel with a BUG_ON(), instead just log the error. > + */ > + if (pgd_phys & ~vttbr_baddr_mask) > + kvm_err("VTTBR not aligned, expect guest to fail"); > + > + kvm->arch.vttbr = pgd_phys | vmid; > > spin_unlock(&kvm_vmid_lock); > } > @@ -1015,6 +1096,12 @@ int kvm_arch_init(void *opaque) > } > } > > + err = set_vttbr_baddr_mask(); > + if (err) { > + kvm_err("Cannot set vttbr_baddr_mask\n"); > + return -EINVAL; > + } > + > cpu_notifier_register_begin(); > > err = init_hyp_mode(); > diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h > index cc83520..ff4a4fa 100644 > --- a/arch/arm64/include/asm/kvm_arm.h > +++ b/arch/arm64/include/asm/kvm_arm.h > @@ -95,7 +95,6 @@ > /* TCR_EL2 Registers bits */ > #define TCR_EL2_TBI (1 << 20) > #define TCR_EL2_PS (7 << 16) > -#define TCR_EL2_PS_40B (2 << 16) > #define TCR_EL2_TG0 (1 << 14) > #define TCR_EL2_SH0 (3 << 12) > #define TCR_EL2_ORGN0 (3 << 10) > @@ -104,8 +103,6 @@ > #define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \ > TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ) > > -#define TCR_EL2_FLAGS (TCR_EL2_PS_40B) > - > /* VTCR_EL2 Registers bits */ > #define VTCR_EL2_PS_MASK (7 << 16) > #define VTCR_EL2_TG0_MASK (1 << 14) > @@ -120,36 +117,28 @@ > #define VTCR_EL2_SL0_MASK (3 << 6) > #define VTCR_EL2_SL0_LVL1 (1 << 6) > #define VTCR_EL2_T0SZ_MASK 0x3f > -#define VTCR_EL2_T0SZ_40B 24 > +#define VTCR_EL2_T0SZ(bits) (64 - (bits)) > > #ifdef CONFIG_ARM64_64K_PAGES > /* > * Stage2 translation configuration: > - * 40bits output (PS = 2) > - * 40bits input (T0SZ = 24) > * 64kB pages (TG0 = 1) > * 2 level page tables (SL = 1) > */ > #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \ > VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ > - VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B) > -#define 
VTTBR_X (38 - VTCR_EL2_T0SZ_40B) > + VTCR_EL2_SL0_LVL1) > #else > /* > * Stage2 translation configuration: > - * 40bits output (PS = 2) > - * 40bits input (T0SZ = 24) > * 4kB pages (TG0 = 0) > * 3 level page tables (SL = 1) > */ > #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \ > VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ > - VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B) > -#define VTTBR_X (37 - VTCR_EL2_T0SZ_40B) > + VTCR_EL2_SL0_LVL1) > #endif > > -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1) > -#define VTTBR_BADDR_MASK (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) > #define VTTBR_VMID_SHIFT (48LLU) > #define VTTBR_VMID_MASK (0xffLLU << VTTBR_VMID_SHIFT) > > diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S > index d968796..c0f7634 100644 > --- a/arch/arm64/kvm/hyp-init.S > +++ b/arch/arm64/kvm/hyp-init.S > @@ -63,17 +63,21 @@ __do_hyp_init: > mrs x4, tcr_el1 > ldr x5, =TCR_EL2_MASK > and x4, x4, x5 > - ldr x5, =TCR_EL2_FLAGS > - orr x4, x4, x5 > - msr tcr_el2, x4 > - > - ldr x4, =VTCR_EL2_FLAGS > /* > * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in > - * VTCR_EL2. > + * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2. > */ > mrs x5, ID_AA64MMFR0_EL1 > bfi x4, x5, #16, #3 > + msr tcr_el2, x4 > + > + ldr x4, =VTCR_EL2_FLAGS > + bfi x4, x5, #16, #3 > + and x5, x5, #0xf > + adr x6, t0sz > + add x6, x6, x5, lsl #2 > + ldr w5, [x6] > + orr x4, x4, x5 > msr vtcr_el2, x4 > > mrs x4, mair_el1 > @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */ > > /* Hello, World! */ > eret > + > +t0sz: > + .word VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40) > + .word VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48) > ENDPROC(__kvm_hyp_init) > > .ltorg >
On Mon, Aug 04, 2014 at 09:42:46AM -0500, Joel Schopp wrote: > Since this fixes a real problem and didn't make it into 3.16 it would be > good if this made it into 3.17. > It's too late for the merge window, we have to review and test as I told you in a private e-mail. We will review this and test it, and if it makes sense merge as a fix for 3.17 before the release. Please be a little patient given the number of patches on the list currently. Thanks, -Christoffer
On Mon, Aug 04, 2014 at 09:38:45AM -0500, Joel Schopp wrote: > The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current > systems. Rather than just add a bit it seems like a good time to also set > things at run-time instead of compile time to accomodate more hardware. > > This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask in runtime, > not compile time. > > In ARMv8, EL2 physical address size (TCR_EL2.PS) and stage2 input address > size (VTCR_EL2.T0SZE) cannot be determined in compile time since they > depends on hardware capability. depend > > According to Table D4-23 and Table D4-25 in ARM DDI 0487A.b document, > vttbr_x is calculated using different hard-coded values with consideration fixed values > of T0SZ, granule size and the level of translation tables. Therefore, > vttbr_baddr_mask should be determined dynamically. > > Changes since v2: > Rebased on https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next branch > > Changes since v1: > Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to > provide better long term fix. Updated that patch to log error instead of > silently fail on unaligned vttbr. 
> > Cc: Christoffer Dall <christoffer.dall@linaro.org> > Cc: Sungjinn Chung <sungjinn.chung@samsung.com> > Signed-off-by: Jungseok Lee <jays.lee@samsung.com> > Signed-off-by: Joel Schopp <joel.schopp@amd.com> > --- > arch/arm/kvm/arm.c | 91 +++++++++++++++++++++++++++++++++++++- > arch/arm64/include/asm/kvm_arm.h | 17 +------ > arch/arm64/kvm/hyp-init.S | 20 ++++++-- > 3 files changed, 106 insertions(+), 22 deletions(-) > > diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c > index d7424ef..d7ca2f5 100644 > --- a/arch/arm/kvm/arm.c > +++ b/arch/arm/kvm/arm.c > @@ -37,6 +37,7 @@ > #include <asm/mman.h> > #include <asm/tlbflush.h> > #include <asm/cacheflush.h> > +#include <asm/cputype.h> > #include <asm/virt.h> > #include <asm/kvm_arm.h> > #include <asm/kvm_asm.h> > @@ -61,6 +62,9 @@ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); > static u8 kvm_next_vmid; > static DEFINE_SPINLOCK(kvm_vmid_lock); > > +/* VTTBR mask cannot be determined in complie time under ARMv8 */ I don't think we need this comment. > +static u64 vttbr_baddr_mask; > + > static bool vgic_present; > > static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) > @@ -376,6 +380,75 @@ static bool need_new_vmid_gen(struct kvm *kvm) > } > > /** > + * set_vttbr_baddr_mask - set mask value for vttbr base address > + * > + * In ARMv8, vttbr_baddr_mask cannot be determined in compile time since stage2 the stage2 input address > + * input address size depends on hardware capability. Thus, it is needed to read Thus, we first need to read... > + * ID_AA64MMFR0_EL1.PARange first and then set vttbr_baddr_mask with > + * consideration of both granule size and the level of translation tables. the granule ... 
> + */ > +static int set_vttbr_baddr_mask(void) > +{ > +#ifndef CONFIG_ARM64 > + vttbr_baddr_mask = VTTBR_BADDR_MASK; > +#else > + int pa_range, t0sz, vttbr_x; > + > + pa_range = read_cpuid(ID_AA64MMFR0_EL1) & 0xf; > + > + switch (pa_range) { > + case 0: > + t0sz = VTCR_EL2_T0SZ(32); > + break; > + case 1: > + t0sz = VTCR_EL2_T0SZ(36); > + break; > + case 2: > + t0sz = VTCR_EL2_T0SZ(40); > + break; > + case 3: > + t0sz = VTCR_EL2_T0SZ(42); > + break; > + case 4: > + t0sz = VTCR_EL2_T0SZ(44); > + break; > + default: > + t0sz = VTCR_EL2_T0SZ(48); > + } the last case would be case 5 and the default case would be a BUG(). > + > + /* > + * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out > + * the origin of the hardcoded values, 38 and 37. > + */ > +#ifdef CONFIG_ARM64_64K_PAGES > + /* > + * 16 <= T0SZ <= 21 is valid under 3 level of translation tables > + * 18 <= T0SZ <= 34 is valid under 2 level of translation tables > + * 31 <= T0SZ <= 39 is valid under 1 level of transltaion tables > + */ so this scheme is with concatenated initial level stage-2 page tables. But we only ever allocate the amount of pages for our pgd according to what the host has, so I think this allocation needs to be locked down more tight, because the host is always using the appropriate amount for 39 bits virtual addresses. If you want to allow this full range, I think you need to change the allocation logic and go through arch/arm/kvm/mmu.c and look for all uses of pgd_offset() and friends and make sure we're doing the right thing. 
> + if (t0sz <= 17) { I think this is more readable if you say (t0sz < 18) and don't you also need to check the upper bound?: (t0sz < 18 || t0sz > 34) > + kvm_err("Cannot support %d-bit address space\n", 64 - t0sz); > + return -EINVAL; > + } > + vttbr_x = 38 - t0sz; > +#else > + /* > + * 16 <= T0SZ <= 24 is valid under 4 level of translation tables > + * 21 <= T0SZ <= 30 is valid under 3 level of translation tables under the current scheme, shouldn't this be: 21 <= T0SZ <= 33 but same comment as above applies. > + * 30 <= T0SZ <= 39 is valid under 2 level of translation tables > + */ > + if (t0sz <= 20) { same as above > + kvm_err("Cannot support %d-bit address space\n", 64 - t0sz); > + return -EINVAL; > + } > + vttbr_x = 37 - t0sz; > +#endif > + vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1)); > +#endif This nested ifdef is really quite horrible. Can you either factor these out into some static inlines in arch/arm[64]/include/asm/kvm_mmu.h or provide a per-architecture implementation in a .c file? > + return 0; > +} > + > +/** > * update_vttbr - Update the VTTBR with a valid VMID before the guest runs > * @kvm The guest that we are about to run > * > @@ -429,8 +502,16 @@ static void update_vttbr(struct kvm *kvm) > /* update vttbr to be used with the new vmid */ > pgd_phys = virt_to_phys(kvm->arch.pgd); > vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK; > - kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK; > - kvm->arch.vttbr |= vmid; > + > + /* > + * If the VTTBR isn't aligned there is something wrong with the system > + * or kernel. It is better to just fail and not mask it. But no need > + * to panic the host kernel with a BUG_ON(), instead just log the error. > + */ These last two sentences are not very helpful, because they don't describe the rationale for what you're doing, only what you are (and are not) doing. That said, I don't think this is doing the right thing. 
I think you want to refuse running the VM and avoid any stage-2 entried being created if this is not the case (actually, we may want to check this after set_vttbr_baddr_mask() or right aftert allocating the stage-2 pgd), because otherwise I think we may be overwriting memory not belonging to us with concatenated page tables in a 42-bit 4KB system, for example. > + if (pgd_phys & ~vttbr_baddr_mask) > + kvm_err("VTTBR not aligned, expect guest to fail"); > + > + kvm->arch.vttbr = pgd_phys | vmid; > > spin_unlock(&kvm_vmid_lock); > } > @@ -1015,6 +1096,12 @@ int kvm_arch_init(void *opaque) > } > } > > + err = set_vttbr_baddr_mask(); > + if (err) { > + kvm_err("Cannot set vttbr_baddr_mask\n"); > + return -EINVAL; > + } > + > cpu_notifier_register_begin(); > > err = init_hyp_mode(); > diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h > index cc83520..ff4a4fa 100644 > --- a/arch/arm64/include/asm/kvm_arm.h > +++ b/arch/arm64/include/asm/kvm_arm.h > @@ -95,7 +95,6 @@ > /* TCR_EL2 Registers bits */ > #define TCR_EL2_TBI (1 << 20) > #define TCR_EL2_PS (7 << 16) > -#define TCR_EL2_PS_40B (2 << 16) > #define TCR_EL2_TG0 (1 << 14) > #define TCR_EL2_SH0 (3 << 12) > #define TCR_EL2_ORGN0 (3 << 10) > @@ -104,8 +103,6 @@ > #define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \ > TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ) > > -#define TCR_EL2_FLAGS (TCR_EL2_PS_40B) > - > /* VTCR_EL2 Registers bits */ > #define VTCR_EL2_PS_MASK (7 << 16) > #define VTCR_EL2_TG0_MASK (1 << 14) > @@ -120,36 +117,28 @@ > #define VTCR_EL2_SL0_MASK (3 << 6) > #define VTCR_EL2_SL0_LVL1 (1 << 6) > #define VTCR_EL2_T0SZ_MASK 0x3f > -#define VTCR_EL2_T0SZ_40B 24 > +#define VTCR_EL2_T0SZ(bits) (64 - (bits)) > > #ifdef CONFIG_ARM64_64K_PAGES > /* > * Stage2 translation configuration: > - * 40bits output (PS = 2) > - * 40bits input (T0SZ = 24) > * 64kB pages (TG0 = 1) > * 2 level page tables (SL = 1) > */ > #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \ > 
VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ > - VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B) > -#define VTTBR_X (38 - VTCR_EL2_T0SZ_40B) > + VTCR_EL2_SL0_LVL1) > #else > /* > * Stage2 translation configuration: > - * 40bits output (PS = 2) > - * 40bits input (T0SZ = 24) > * 4kB pages (TG0 = 0) > * 3 level page tables (SL = 1) > */ > #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \ > VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ > - VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B) > -#define VTTBR_X (37 - VTCR_EL2_T0SZ_40B) > + VTCR_EL2_SL0_LVL1) > #endif > > -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1) > -#define VTTBR_BADDR_MASK (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) > #define VTTBR_VMID_SHIFT (48LLU) > #define VTTBR_VMID_MASK (0xffLLU << VTTBR_VMID_SHIFT) > > diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S > index d968796..c0f7634 100644 > --- a/arch/arm64/kvm/hyp-init.S > +++ b/arch/arm64/kvm/hyp-init.S > @@ -63,17 +63,21 @@ __do_hyp_init: > mrs x4, tcr_el1 > ldr x5, =TCR_EL2_MASK > and x4, x4, x5 > - ldr x5, =TCR_EL2_FLAGS > - orr x4, x4, x5 > - msr tcr_el2, x4 > - > - ldr x4, =VTCR_EL2_FLAGS > /* > * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in > - * VTCR_EL2. > + * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2. > */ > mrs x5, ID_AA64MMFR0_EL1 > bfi x4, x5, #16, #3 > + msr tcr_el2, x4 > + > + ldr x4, =VTCR_EL2_FLAGS > + bfi x4, x5, #16, #3 > + and x5, x5, #0xf > + adr x6, t0sz > + add x6, x6, x5, lsl #2 > + ldr w5, [x6] > + orr x4, x4, x5 > msr vtcr_el2, x4 > > mrs x4, mair_el1 > @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */ > > /* Hello, World! */ > eret > + > +t0sz: > + .word VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40) > + .word VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48) > ENDPROC(__kvm_hyp_init) > > .ltorg > Thanks for picking this up! -Christoffer
Thanks for the detailed review. > the last case would be case 5 and the default case would be a BUG(). I agree with the case, but rather than do a BUG() I'm going to print an error and return -EINVAL. Not worth stopping the host kernel just because kvm is messed up when we can gracefully exit from it. > >> + >> + /* >> + * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out >> + * the origin of the hardcoded values, 38 and 37. >> + */ >> +#ifdef CONFIG_ARM64_64K_PAGES >> + /* >> + * 16 <= T0SZ <= 21 is valid under 3 level of translation tables >> + * 18 <= T0SZ <= 34 is valid under 2 level of translation tables >> + * 31 <= T0SZ <= 39 is valid under 1 level of transltaion tables >> + */ > so this scheme is with concatenated initial level stage-2 page tables. > > But we only ever allocate the amount of pages for our pgd according to > what the host has, so I think this allocation needs to be locked down > more tight, because the host is always using the appropriate amount for > 39 bits virtual addresses. I'll narrow the sanity check of the range. I'll narrow it based on a 39 - 48 bit VA host range in anticipation of the 4 level 4k and 3 level 64k host patches going in. >> + kvm_err("Cannot support %d-bit address space\n", 64 - t0sz); >> + return -EINVAL; >> + } >> + vttbr_x = 37 - t0sz; >> +#endif >> + vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1)); >> +#endif > This nested ifdef is really quite horrible. Can you either factor these > out into some static inlines in arch/arm[64]/include/asm/kvm_mmu.h or > provide a per-architecture implementation in a .c file? I'll factor it out in the file to make it more readable and do away with the nested ifdef. My theory on putting things into .h files is to only do it if there is actually another file that uses it. 
> >> + return 0; >> +} >> + >> +/** >> * update_vttbr - Update the VTTBR with a valid VMID before the guest runs >> * @kvm The guest that we are about to run >> * >> @@ -429,8 +502,16 @@ static void update_vttbr(struct kvm *kvm) >> /* update vttbr to be used with the new vmid */ >> pgd_phys = virt_to_phys(kvm->arch.pgd); >> vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK; >> - kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK; >> - kvm->arch.vttbr |= vmid; >> + >> + /* >> + * If the VTTBR isn't aligned there is something wrong with the system >> + * or kernel. It is better to just fail and not mask it. But no need >> + * to panic the host kernel with a BUG_ON(), instead just log the error. >> + */ > These last two sentences are not very helpful, because they don't > describe the rationale for what you're doing, only what you are (and are > not) doing. I'll reword the comment. > > That said, I don't think this is doing the right thing. I think you > want to refuse running the VM and avoid any stage-2 entried being > created if this is not the case (actually, we may want to check this > after set_vttbr_baddr_mask() or right aftert allocating the stage-2 > pgd), because otherwise I think we may be overwriting memory not > belonging to us with concatenated page tables in a 42-bit 4KB system, > for example. My experience here was that the hardware actually catches the error on the first instruction load of the guest kernel and does a stage 2 translation abort. However, to be extra safe we could just log the error with the address of the vttbr and then zero out the pgd_phys part of vttbr altogether, leaving only the vmid. The guest would then die of natural causes and we wouldn't have to worry about the outside possibility of memory getting overwritten. I don't like the option of just calling BUG() because stopping the host kernel from running just because we can't run a guest seems a bit extreme. 
On the other hand adding a return code to update_vttbr and checking it (even with unlikely) in the kvm_arch_vcpu_ioctl_run() loop doesn't thrill me either just from a wasted cpu cycles point of view. > >> + if (pgd_phys & ~vttbr_baddr_mask) >> + kvm_err("VTTBR not aligned, expect guest to fail"); >> + >> + kvm->arch.vttbr = pgd_phys | vmid; >> >> spin_unlock(&kvm_vmid_lock); >> } >> @@ -1015,6 +1096,12 @@ int kvm_arch_init(void *opaque) >> } >> } >> >> + err = set_vttbr_baddr_mask(); >> + if (err) { >> + kvm_err("Cannot set vttbr_baddr_mask\n"); >> + return -EINVAL; >> + } >> + >> cpu_notifier_register_begin(); >> >> err = init_hyp_mode(); >> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h >> index cc83520..ff4a4fa 100644 >> --- a/arch/arm64/include/asm/kvm_arm.h >> +++ b/arch/arm64/include/asm/kvm_arm.h >> @@ -95,7 +95,6 @@ >> /* TCR_EL2 Registers bits */ >> #define TCR_EL2_TBI (1 << 20) >> #define TCR_EL2_PS (7 << 16) >> -#define TCR_EL2_PS_40B (2 << 16) >> #define TCR_EL2_TG0 (1 << 14) >> #define TCR_EL2_SH0 (3 << 12) >> #define TCR_EL2_ORGN0 (3 << 10) >> @@ -104,8 +103,6 @@ >> #define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \ >> TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ) >> >> -#define TCR_EL2_FLAGS (TCR_EL2_PS_40B) >> - >> /* VTCR_EL2 Registers bits */ >> #define VTCR_EL2_PS_MASK (7 << 16) >> #define VTCR_EL2_TG0_MASK (1 << 14) >> @@ -120,36 +117,28 @@ >> #define VTCR_EL2_SL0_MASK (3 << 6) >> #define VTCR_EL2_SL0_LVL1 (1 << 6) >> #define VTCR_EL2_T0SZ_MASK 0x3f >> -#define VTCR_EL2_T0SZ_40B 24 >> +#define VTCR_EL2_T0SZ(bits) (64 - (bits)) >> >> #ifdef CONFIG_ARM64_64K_PAGES >> /* >> * Stage2 translation configuration: >> - * 40bits output (PS = 2) >> - * 40bits input (T0SZ = 24) >> * 64kB pages (TG0 = 1) >> * 2 level page tables (SL = 1) >> */ >> #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \ >> VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ >> - VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B) >> -#define VTTBR_X 
(38 - VTCR_EL2_T0SZ_40B) >> + VTCR_EL2_SL0_LVL1) >> #else >> /* >> * Stage2 translation configuration: >> - * 40bits output (PS = 2) >> - * 40bits input (T0SZ = 24) >> * 4kB pages (TG0 = 0) >> * 3 level page tables (SL = 1) >> */ >> #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \ >> VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ >> - VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B) >> -#define VTTBR_X (37 - VTCR_EL2_T0SZ_40B) >> + VTCR_EL2_SL0_LVL1) >> #endif >> >> -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1) >> -#define VTTBR_BADDR_MASK (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) >> #define VTTBR_VMID_SHIFT (48LLU) >> #define VTTBR_VMID_MASK (0xffLLU << VTTBR_VMID_SHIFT) >> >> diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S >> index d968796..c0f7634 100644 >> --- a/arch/arm64/kvm/hyp-init.S >> +++ b/arch/arm64/kvm/hyp-init.S >> @@ -63,17 +63,21 @@ __do_hyp_init: >> mrs x4, tcr_el1 >> ldr x5, =TCR_EL2_MASK >> and x4, x4, x5 >> - ldr x5, =TCR_EL2_FLAGS >> - orr x4, x4, x5 >> - msr tcr_el2, x4 >> - >> - ldr x4, =VTCR_EL2_FLAGS >> /* >> * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in >> - * VTCR_EL2. >> + * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2. >> */ >> mrs x5, ID_AA64MMFR0_EL1 >> bfi x4, x5, #16, #3 >> + msr tcr_el2, x4 >> + >> + ldr x4, =VTCR_EL2_FLAGS >> + bfi x4, x5, #16, #3 >> + and x5, x5, #0xf >> + adr x6, t0sz >> + add x6, x6, x5, lsl #2 >> + ldr w5, [x6] >> + orr x4, x4, x5 >> msr vtcr_el2, x4 >> >> mrs x4, mair_el1 >> @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */ >> >> /* Hello, World! */ >> eret >> + >> +t0sz: >> + .word VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40) >> + .word VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48) >> ENDPROC(__kvm_hyp_init) >> >> .ltorg >> > Thanks for picking this up! Thanks for the detailed review. I'll try to turnaround v4 quickly. > -Christoffer
On Mon, Aug 11, 2014 at 10:20:41AM -0500, Joel Schopp wrote: > Thanks for the detailed review. > > the last case would be case 5 and the default case would be a BUG(). > I agree with the case, but rather than do a BUG() I'm going to print an > error and return -EINVAL. Not worth stopping the host kernel just > because kvm is messed up when we can gracefully exit from it. agreed > > > >> + > >> + /* > >> + * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out > >> + * the origin of the hardcoded values, 38 and 37. > >> + */ > >> +#ifdef CONFIG_ARM64_64K_PAGES > >> + /* > >> + * 16 <= T0SZ <= 21 is valid under 3 level of translation tables > >> + * 18 <= T0SZ <= 34 is valid under 2 level of translation tables > >> + * 31 <= T0SZ <= 39 is valid under 1 level of transltaion tables > >> + */ > > so this scheme is with concatenated initial level stage-2 page tables. > > > > But we only ever allocate the amount of pages for our pgd according to > > what the host has, so I think this allocation needs to be locked down > > more tight, because the host is always using the appropriate amount for > > 39 bits virtual addresses. > I'll narrow the sanity check of the range. I'll narrow it based on a 39 > - 48 bit VA host range in anticipation of the 4 level 4k and 3 level 64k > host patches going in. > > >> + kvm_err("Cannot support %d-bit address space\n", 64 - t0sz); > >> + return -EINVAL; > >> + } > >> + vttbr_x = 37 - t0sz; > >> +#endif > >> + vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1)); > >> +#endif > > This nested ifdef is really quite horrible. Can you either factor these > > out into some static inlines in arch/arm[64]/include/asm/kvm_mmu.h or > > provide a per-architecture implementation in a .c file? > I'll factor it out in the file to make it more readable and do away with > the nested ifdef. My theory on putting things into .h files is to only > do it if there is actually another file that uses it. 
that's not really the design principle between the split of kvm/arm and kvm/arm64 though, but I'll let you choose your preferred method when writing up the patches. > > > >> + return 0; > >> +} > >> + > >> +/** > >> * update_vttbr - Update the VTTBR with a valid VMID before the guest runs > >> * @kvm The guest that we are about to run > >> * > >> @@ -429,8 +502,16 @@ static void update_vttbr(struct kvm *kvm) > >> /* update vttbr to be used with the new vmid */ > >> pgd_phys = virt_to_phys(kvm->arch.pgd); > >> vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK; > >> - kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK; > >> - kvm->arch.vttbr |= vmid; > >> + > >> + /* > >> + * If the VTTBR isn't aligned there is something wrong with the system > >> + * or kernel. It is better to just fail and not mask it. But no need > >> + * to panic the host kernel with a BUG_ON(), instead just log the error. > >> + */ > > These last two sentences are not very helpful, because they don't > > describe the rationale for what you're doing, only what you are (and are > > not) doing. > I'll reword the comment. > > > > That said, I don't think this is doing the right thing. I think you > > want to refuse running the VM and avoid any stage-2 entried being > > created if this is not the case (actually, we may want to check this > > after set_vttbr_baddr_mask() or right aftert allocating the stage-2 > > pgd), because otherwise I think we may be overwriting memory not > > belonging to us with concatenated page tables in a 42-bit 4KB system, > > for example. > My experience here was that the hardware actually catches the error on > the first instruction load of the guest kernel and does a stage 2 > translation abort. However, to be extra safe we could just log the > error with the address of the vttbr and then zero out the pgd_phys part > of vttbr altogether, leaving only the vmid. 
The guest would then die of > natural causes and we wouldn't have to worry about the outside > possibility of memory getting overwritten. uh, putting zero in the pgd_phys part will just point to random memory if you happen to have memory based at address 0 though, right? I think we should check when we allocate the pgd that it is indeed of the right size and alignment, and if it isn't at this point, it truly is a BUG() and your kernel is terribly busted. > > I don't like the option of just calling BUG() because stopping the host > kernel from running just because we can't run a guest seems a bit > extreme. On the other hand adding a return code to update_vttbr and > checking it (even with unlikely) in the kvm_arch_vcpu_ioctl_run() loop > doesn't thrill me either just from a wasted cpu cycles point of view. Agreed that we shouldn't call BUG on something that could happen. We only should if we really do have a BUG. If we have a programming error in setting our stage-2 page tables, then we're hosed, and I think crashing the kernel is fine. You could make it a VM_BUG_ON() if you don't like having the check regularly, or create the error code return. 
> > > >> + if (pgd_phys & ~vttbr_baddr_mask) > >> + kvm_err("VTTBR not aligned, expect guest to fail"); > >> + > >> + kvm->arch.vttbr = pgd_phys | vmid; > >> > >> spin_unlock(&kvm_vmid_lock); > >> } > >> @@ -1015,6 +1096,12 @@ int kvm_arch_init(void *opaque) > >> } > >> } > >> > >> + err = set_vttbr_baddr_mask(); > >> + if (err) { > >> + kvm_err("Cannot set vttbr_baddr_mask\n"); > >> + return -EINVAL; > >> + } > >> + > >> cpu_notifier_register_begin(); > >> > >> err = init_hyp_mode(); > >> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h > >> index cc83520..ff4a4fa 100644 > >> --- a/arch/arm64/include/asm/kvm_arm.h > >> +++ b/arch/arm64/include/asm/kvm_arm.h > >> @@ -95,7 +95,6 @@ > >> /* TCR_EL2 Registers bits */ > >> #define TCR_EL2_TBI (1 << 20) > >> #define TCR_EL2_PS (7 << 16) > >> -#define TCR_EL2_PS_40B (2 << 16) > >> #define TCR_EL2_TG0 (1 << 14) > >> #define TCR_EL2_SH0 (3 << 12) > >> #define TCR_EL2_ORGN0 (3 << 10) > >> @@ -104,8 +103,6 @@ > >> #define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \ > >> TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ) > >> > >> -#define TCR_EL2_FLAGS (TCR_EL2_PS_40B) > >> - > >> /* VTCR_EL2 Registers bits */ > >> #define VTCR_EL2_PS_MASK (7 << 16) > >> #define VTCR_EL2_TG0_MASK (1 << 14) > >> @@ -120,36 +117,28 @@ > >> #define VTCR_EL2_SL0_MASK (3 << 6) > >> #define VTCR_EL2_SL0_LVL1 (1 << 6) > >> #define VTCR_EL2_T0SZ_MASK 0x3f > >> -#define VTCR_EL2_T0SZ_40B 24 > >> +#define VTCR_EL2_T0SZ(bits) (64 - (bits)) > >> > >> #ifdef CONFIG_ARM64_64K_PAGES > >> /* > >> * Stage2 translation configuration: > >> - * 40bits output (PS = 2) > >> - * 40bits input (T0SZ = 24) > >> * 64kB pages (TG0 = 1) > >> * 2 level page tables (SL = 1) > >> */ > >> #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \ > >> VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ > >> - VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B) > >> -#define VTTBR_X (38 - VTCR_EL2_T0SZ_40B) > >> + VTCR_EL2_SL0_LVL1) > >> #else > >> /* > >> * 
Stage2 translation configuration: > >> - * 40bits output (PS = 2) > >> - * 40bits input (T0SZ = 24) > >> * 4kB pages (TG0 = 0) > >> * 3 level page tables (SL = 1) > >> */ > >> #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \ > >> VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ > >> - VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B) > >> -#define VTTBR_X (37 - VTCR_EL2_T0SZ_40B) > >> + VTCR_EL2_SL0_LVL1) > >> #endif > >> > >> -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1) > >> -#define VTTBR_BADDR_MASK (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) > >> #define VTTBR_VMID_SHIFT (48LLU) > >> #define VTTBR_VMID_MASK (0xffLLU << VTTBR_VMID_SHIFT) > >> > >> diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S > >> index d968796..c0f7634 100644 > >> --- a/arch/arm64/kvm/hyp-init.S > >> +++ b/arch/arm64/kvm/hyp-init.S > >> @@ -63,17 +63,21 @@ __do_hyp_init: > >> mrs x4, tcr_el1 > >> ldr x5, =TCR_EL2_MASK > >> and x4, x4, x5 > >> - ldr x5, =TCR_EL2_FLAGS > >> - orr x4, x4, x5 > >> - msr tcr_el2, x4 > >> - > >> - ldr x4, =VTCR_EL2_FLAGS > >> /* > >> * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in > >> - * VTCR_EL2. > >> + * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2. > >> */ > >> mrs x5, ID_AA64MMFR0_EL1 > >> bfi x4, x5, #16, #3 > >> + msr tcr_el2, x4 > >> + > >> + ldr x4, =VTCR_EL2_FLAGS > >> + bfi x4, x5, #16, #3 > >> + and x5, x5, #0xf > >> + adr x6, t0sz > >> + add x6, x6, x5, lsl #2 > >> + ldr w5, [x6] > >> + orr x4, x4, x5 > >> msr vtcr_el2, x4 > >> > >> mrs x4, mair_el1 > >> @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */ > >> > >> /* Hello, World! */ > >> eret > >> + > >> +t0sz: > >> + .word VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40) > >> + .word VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48) > >> ENDPROC(__kvm_hyp_init) > >> > >> .ltorg > >> > > Thanks for picking this up! > Thanks for the detailed review. I'll try to turnaround v4 quickly. Cool! 
-Christoffer
>>> That said, I don't think this is doing the right thing. I think you >>> want to refuse running the VM and avoid any stage-2 entries being >>> created if this is not the case (actually, we may want to check this >>> after set_vttbr_baddr_mask() or right after allocating the stage-2 >>> pgd), because otherwise I think we may be overwriting memory not >>> belonging to us with concatenated page tables in a 42-bit 4KB system, >>> for example. >> My experience here was that the hardware actually catches the error on >> the first instruction load of the guest kernel and does a stage 2 >> translation abort. However, to be extra safe we could just log the >> error with the address of the vttbr and then zero out the pgd_phys part >> of vttbr altogether, leaving only the vmid. The guest would then die of >> natural causes and we wouldn't have to worry about the outside >> possibility of memory getting overwritten. > uh, putting zero in the pgd_phys part will just point to random memory > if you happen to have memory based at address 0 though, right? > > I think we should check when we allocate the pgd that it is indeed of > the right size and alignment, and if it isn't at this point, it truly is > a BUG() and your kernel is terribly busted. If I can't rely on 0 to be an invalid address I can't think of what I could rely on to be invalid. I'll just change this to BUG_ON(pgd_phys & ~vttbr_baddr_mask); and give up on my dream of the host kernel surviving the bug.
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index d7424ef..d7ca2f5 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -37,6 +37,7 @@ #include <asm/mman.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> +#include <asm/cputype.h> #include <asm/virt.h> #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> @@ -61,6 +62,9 @@ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u8 kvm_next_vmid; static DEFINE_SPINLOCK(kvm_vmid_lock); +/* VTTBR mask cannot be determined in complie time under ARMv8 */ +static u64 vttbr_baddr_mask; + static bool vgic_present; static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) @@ -376,6 +380,75 @@ static bool need_new_vmid_gen(struct kvm *kvm) } /** + * set_vttbr_baddr_mask - set mask value for vttbr base address + * + * In ARMv8, vttbr_baddr_mask cannot be determined in compile time since stage2 + * input address size depends on hardware capability. Thus, it is needed to read + * ID_AA64MMFR0_EL1.PARange first and then set vttbr_baddr_mask with + * consideration of both granule size and the level of translation tables. + */ +static int set_vttbr_baddr_mask(void) +{ +#ifndef CONFIG_ARM64 + vttbr_baddr_mask = VTTBR_BADDR_MASK; +#else + int pa_range, t0sz, vttbr_x; + + pa_range = read_cpuid(ID_AA64MMFR0_EL1) & 0xf; + + switch (pa_range) { + case 0: + t0sz = VTCR_EL2_T0SZ(32); + break; + case 1: + t0sz = VTCR_EL2_T0SZ(36); + break; + case 2: + t0sz = VTCR_EL2_T0SZ(40); + break; + case 3: + t0sz = VTCR_EL2_T0SZ(42); + break; + case 4: + t0sz = VTCR_EL2_T0SZ(44); + break; + default: + t0sz = VTCR_EL2_T0SZ(48); + } + + /* + * See Table D4-23 and Table D4-25 in ARM DDI 0487A.b to figure out + * the origin of the hardcoded values, 38 and 37. 
+ */ +#ifdef CONFIG_ARM64_64K_PAGES + /* + * 16 <= T0SZ <= 21 is valid under 3 level of translation tables + * 18 <= T0SZ <= 34 is valid under 2 level of translation tables + * 31 <= T0SZ <= 39 is valid under 1 level of transltaion tables + */ + if (t0sz <= 17) { + kvm_err("Cannot support %d-bit address space\n", 64 - t0sz); + return -EINVAL; + } + vttbr_x = 38 - t0sz; +#else + /* + * 16 <= T0SZ <= 24 is valid under 4 level of translation tables + * 21 <= T0SZ <= 30 is valid under 3 level of translation tables + * 30 <= T0SZ <= 39 is valid under 2 level of translation tables + */ + if (t0sz <= 20) { + kvm_err("Cannot support %d-bit address space\n", 64 - t0sz); + return -EINVAL; + } + vttbr_x = 37 - t0sz; +#endif + vttbr_baddr_mask = (((1LLU << (48 - vttbr_x)) - 1) << (vttbr_x - 1)); +#endif + return 0; +} + +/** * update_vttbr - Update the VTTBR with a valid VMID before the guest runs * @kvm The guest that we are about to run * @@ -429,8 +502,16 @@ static void update_vttbr(struct kvm *kvm) /* update vttbr to be used with the new vmid */ pgd_phys = virt_to_phys(kvm->arch.pgd); vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK; - kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK; - kvm->arch.vttbr |= vmid; + + /* + * If the VTTBR isn't aligned there is something wrong with the system + * or kernel. It is better to just fail and not mask it. But no need + * to panic the host kernel with a BUG_ON(), instead just log the error. 
+ */ + if (pgd_phys & ~vttbr_baddr_mask) + kvm_err("VTTBR not aligned, expect guest to fail"); + + kvm->arch.vttbr = pgd_phys | vmid; spin_unlock(&kvm_vmid_lock); } @@ -1015,6 +1096,12 @@ int kvm_arch_init(void *opaque) } } + err = set_vttbr_baddr_mask(); + if (err) { + kvm_err("Cannot set vttbr_baddr_mask\n"); + return -EINVAL; + } + cpu_notifier_register_begin(); err = init_hyp_mode(); diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index cc83520..ff4a4fa 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -95,7 +95,6 @@ /* TCR_EL2 Registers bits */ #define TCR_EL2_TBI (1 << 20) #define TCR_EL2_PS (7 << 16) -#define TCR_EL2_PS_40B (2 << 16) #define TCR_EL2_TG0 (1 << 14) #define TCR_EL2_SH0 (3 << 12) #define TCR_EL2_ORGN0 (3 << 10) @@ -104,8 +103,6 @@ #define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \ TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ) -#define TCR_EL2_FLAGS (TCR_EL2_PS_40B) - /* VTCR_EL2 Registers bits */ #define VTCR_EL2_PS_MASK (7 << 16) #define VTCR_EL2_TG0_MASK (1 << 14) @@ -120,36 +117,28 @@ #define VTCR_EL2_SL0_MASK (3 << 6) #define VTCR_EL2_SL0_LVL1 (1 << 6) #define VTCR_EL2_T0SZ_MASK 0x3f -#define VTCR_EL2_T0SZ_40B 24 +#define VTCR_EL2_T0SZ(bits) (64 - (bits)) #ifdef CONFIG_ARM64_64K_PAGES /* * Stage2 translation configuration: - * 40bits output (PS = 2) - * 40bits input (T0SZ = 24) * 64kB pages (TG0 = 1) * 2 level page tables (SL = 1) */ #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \ VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ - VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B) -#define VTTBR_X (38 - VTCR_EL2_T0SZ_40B) + VTCR_EL2_SL0_LVL1) #else /* * Stage2 translation configuration: - * 40bits output (PS = 2) - * 40bits input (T0SZ = 24) * 4kB pages (TG0 = 0) * 3 level page tables (SL = 1) */ #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \ VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \ - VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B) -#define VTTBR_X (37 - 
VTCR_EL2_T0SZ_40B) + VTCR_EL2_SL0_LVL1) #endif -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1) -#define VTTBR_BADDR_MASK (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) #define VTTBR_VMID_SHIFT (48LLU) #define VTTBR_VMID_MASK (0xffLLU << VTTBR_VMID_SHIFT) diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index d968796..c0f7634 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -63,17 +63,21 @@ __do_hyp_init: mrs x4, tcr_el1 ldr x5, =TCR_EL2_MASK and x4, x4, x5 - ldr x5, =TCR_EL2_FLAGS - orr x4, x4, x5 - msr tcr_el2, x4 - - ldr x4, =VTCR_EL2_FLAGS /* * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in - * VTCR_EL2. + * TCR_EL2 and both PS bits and T0SZ bits in VTCR_EL2. */ mrs x5, ID_AA64MMFR0_EL1 bfi x4, x5, #16, #3 + msr tcr_el2, x4 + + ldr x4, =VTCR_EL2_FLAGS + bfi x4, x5, #16, #3 + and x5, x5, #0xf + adr x6, t0sz + add x6, x6, x5, lsl #2 + ldr w5, [x6] + orr x4, x4, x5 msr vtcr_el2, x4 mrs x4, mair_el1 @@ -109,6 +113,10 @@ target: /* We're now in the trampoline code, switch page tables */ /* Hello, World! */ eret + +t0sz: + .word VTCR_EL2_T0SZ(32), VTCR_EL2_T0SZ(36), VTCR_EL2_T0SZ(40) + .word VTCR_EL2_T0SZ(42), VTCR_EL2_T0SZ(44), VTCR_EL2_T0SZ(48) ENDPROC(__kvm_hyp_init) .ltorg