Message ID: 20221001162318.153420-42-richard.henderson@linaro.org
State:      New
Series:     target/arm: Implement FEAT_HAFDBS
On Sat, 1 Oct 2022 at 18:04, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Perform the atomic update for hardware management of the access flag
> and the dirty bit.
>
> A limitation of the implementation so far is that the page table
> itself must already be writable, i.e. the dirty bit for the stage2
> page table must already be set, i.e. we cannot set both dirty bits
> at the same time.
>
> This is allowed because it is CONSTRAINED UNPREDICTABLE whether any
> atomic update happens at all. The implementation is allowed to simply
> fall back on software update at any time.

I can't see where in the Arm ARM this is stated.

In any case, if HA is set you can't simply return ARMFault_AccessFlag
without breaking the bit in D5.4.12 which says
"When hardware updates of the Access flag are enabled for a stage of
translation an address translation instruction that uses that stage
of translation will not report that the address will give rise to
an Access flag fault in the PAR"

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  docs/system/arm/emulation.rst |   1 +
>  target/arm/cpu64.c            |   1 +
>  target/arm/ptw.c              | 119 ++++++++++++++++++++++++++++++++--
>  3 files changed, 115 insertions(+), 6 deletions(-)
>
> diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst
> index be7bbffe59..c3582d075e 100644
> --- a/docs/system/arm/emulation.rst
> +++ b/docs/system/arm/emulation.rst
> @@ -31,6 +31,7 @@ the following architecture extensions:
>   - FEAT_FRINTTS (Floating-point to integer instructions)
>   - FEAT_FlagM (Flag manipulation instructions v2)
>   - FEAT_FlagM2 (Enhancements to flag manipulation instructions)
> + - FEAT_HAFDBS (Hardware management of the access flag and dirty bit state)
>   - FEAT_HCX (Support for the HCRX_EL2 register)
>   - FEAT_HPDS (Hierarchical permission disables)
>   - FEAT_I8MM (AArch64 Int8 matrix multiplication instructions)
> diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
> index e6314e86d2..b064dc7964 100644
> --- a/target/arm/cpu64.c
> +++ b/target/arm/cpu64.c
> @@ -1116,6 +1116,7 @@ static void aarch64_max_initfn(Object *obj)
>      cpu->isar.id_aa64mmfr0 = t;
>
>      t = cpu->isar.id_aa64mmfr1;
> +    t = FIELD_DP64(t, ID_AA64MMFR1, HAFDBS, 2);   /* FEAT_HAFDBS */

I think we should split the access flag update and the
dirty-bit update into separate commits. It might be useful
for bisection purposes later, and it looks like they're pretty
cleanly separable. (Though if you look at my last comment in this
email, maybe not quite so cleanly separable in the code as you
have it here.)
>      t = FIELD_DP64(t, ID_AA64MMFR1, VMIDBITS, 2); /* FEAT_VMID16 */
>      t = FIELD_DP64(t, ID_AA64MMFR1, VH, 1);       /* FEAT_VHE */
>      t = FIELD_DP64(t, ID_AA64MMFR1, HPDS, 1);     /* FEAT_HPDS */
> diff --git a/target/arm/ptw.c b/target/arm/ptw.c
> index 45734b0d28..14ab56d1b5 100644
> --- a/target/arm/ptw.c
> +++ b/target/arm/ptw.c
> @@ -223,6 +223,7 @@ static bool S2_attrs_are_device(uint64_t hcr, uint8_t attrs)
>  typedef struct {
>      bool is_secure;
>      bool be;
> +    bool rw;
>      void *hphys;
>      hwaddr gphys;
>  } S1TranslateResult;
> @@ -261,7 +262,8 @@ static bool S1_ptw_translate(CPUARMState *env, ARMMMUIdx mmu_idx,
>              pte_attrs = s2.cacheattrs.attrs;
>              pte_secure = s2.f.attrs.secure;
>          }
> -        res->hphys = NULL;
> +        res->hphys = NULL;  /* force slow path */
> +        res->rw = false;    /* debug never modifies */
>      } else {
>          CPUTLBEntryFull *full;
>          int flags;
> @@ -276,6 +278,7 @@ static bool S1_ptw_translate(CPUARMState *env, ARMMMUIdx mmu_idx,
>              goto fail;
>          }
>          res->gphys = full->phys_addr;
> +        res->rw = full->prot & PAGE_WRITE;
>          pte_attrs = full->pte_attrs;
>          pte_secure = full->attrs.secure;
>      }
> @@ -381,6 +384,56 @@ static uint64_t arm_ldq_ptw(CPUARMState *env, const S1TranslateResult *s1,
>      return data;
>  }
>
> +static uint64_t arm_casq_ptw(CPUARMState *env, uint64_t old_val,
> +                             uint64_t new_val, const S1TranslateResult *s1,
> +                             ARMMMUFaultInfo *fi)
> +{
> +    uint64_t cur_val;
> +
> +    if (unlikely(!s1->hphys)) {
> +        fi->type = ARMFault_UnsuppAtomicUpdate;
> +        fi->s1ptw = true;
> +        return 0;
> +    }
> +
> +#ifndef CONFIG_ATOMIC64
> +    /*
> +     * We can't support the atomic operation on the host.  We should be
> +     * running in round-robin mode though, which means that we would only
> +     * race with dma i/o.
> +     */
> +    qemu_mutex_lock_iothread();

Are there definitely no code paths where we might try to do
a page table walk with the iothread already locked ?

> +    if (s1->be) {
> +        cur_val = ldq_be_p(s1->hphys);
> +        if (cur_val == old_val) {
> +            stq_be_p(s1->hphys, new_val);
> +        }
> +    } else {
> +        cur_val = ldq_le_p(s1->hphys);
> +        if (cur_val == old_val) {
> +            stq_le_p(s1->hphys, new_val);
> +        }
> +    }
> +    qemu_mutex_unlock_iothread();
> +#else
> +    if (s1->be) {
> +        old_val = cpu_to_be64(old_val);
> +        new_val = cpu_to_be64(new_val);
> +        cur_val = qatomic_cmpxchg__nocheck((uint64_t *)s1->hphys,
> +                                           old_val, new_val);
> +        cur_val = be64_to_cpu(cur_val);
> +    } else {
> +        old_val = cpu_to_le64(old_val);
> +        new_val = cpu_to_le64(new_val);
> +        cur_val = qatomic_cmpxchg__nocheck((uint64_t *)s1->hphys,
> +                                           old_val, new_val);
> +        cur_val = le64_to_cpu(cur_val);
> +    }
> +#endif
> +
> +    return cur_val;
> +}
> +
>  static bool get_level1_table_address(CPUARMState *env, ARMMMUIdx mmu_idx,
>                                       uint32_t *table, uint32_t address)
>  {
> @@ -1290,6 +1343,7 @@ static bool get_phys_addr_lpae(CPUARMState *env, uint64_t address,
>          goto do_fault;
>      }
>
> + restart_atomic_update:
>      if (!(descriptor & 1) || (!(descriptor & 2) && (level == 3))) {
>          /* Invalid, or the Reserved level 3 encoding */
>          goto do_translation_fault;
> @@ -1365,10 +1419,28 @@ static bool get_phys_addr_lpae(CPUARMState *env, uint64_t address,
>       * Here descaddr is the final physical address, and attributes
>       * are all in attrs.
>       */
> -    if ((attrs & (1 << 10)) == 0) {
> +    if ((attrs & (1 << 10)) == 0 && !debug) {
>          /* Access flag */
> -        fi->type = ARMFault_AccessFlag;
> -        goto do_fault;
> +        uint64_t new_des, old_des;
> +
> +        /*
> +         * If HA is disabled, or if the pte is not writable,
> +         * pass on the access fault to software.
> +         */
> +        if (!param.ha || !s1.rw) {
> +            fi->type = ARMFault_AccessFlag;
> +            goto do_fault;
> +        }
> +
> +        old_des = descriptor;
> +        new_des = descriptor | (1 << 10); /* AF */
> +        descriptor = arm_casq_ptw(env, old_des, new_des, &s1, fi);
> +        if (fi->type != ARMFault_None) {
> +            goto do_fault;
> +        }
> +        if (old_des != descriptor) {
> +            goto restart_atomic_update;

Do we really need to go all the way back to restart_atomic_update?
My reading of the Arm ARM is that we just need the "write the Access
bit" to be atomic, not that everything we do with the descriptor must
be atomic. Though the pseudocode does do a complete loop back like
this, so I guess that's the 'safe' interpretation. Hmm...

> +        }
>      }
>
>      ap = extract32(attrs, 6, 2);
> @@ -1385,8 +1457,43 @@ static bool get_phys_addr_lpae(CPUARMState *env, uint64_t address,
>      }
>
>      if (!(result->f.prot & (1 << access_type))) {
> -        fi->type = ARMFault_Permission;
> -        goto do_fault;
> +        uint64_t new_des, old_des;
> +
> +        /* Writes may set dirty if DBM attribute is set. */
> +        if (!param.hd
> +            || access_type != MMU_DATA_STORE
> +            || !extract64(attrs, 51, 1) /* DBM */
> +            || !s1.rw) {
> +            fi->type = ARMFault_Permission;
> +            goto do_fault;
> +        }
> +
> +        old_des = descriptor;
> +        if (mmu_idx == ARMMMUIdx_Stage2 || mmu_idx == ARMMMUIdx_Stage2_S) {
> +            new_des = descriptor | (1ull << 7);   /* S2AP[1] */
> +        } else {
> +            new_des = descriptor & ~(1ull << 7);  /* AP[2] */
> +        }

If we update the prot bits, we need to also re-calculate the exec bit,
I think, because the execute-never stuff depends on whether the page
is writeable. Alternatively you can do it the way the pseudocode does
and pre-figure-out the final permission bits value (see
AArch64.S1ApplyOutputPerms() and AArch64.S2ApplyOutputPerms()) so you
only need to calculate the exec bit once.

> +
> +        /*
> +         * If the descriptor didn't change, then attributes weren't the
> +         * reason for the permission fault, so deliver it.
> +         */
> +        if (old_des == new_des) {
> +            fi->type = ARMFault_Permission;
> +            goto do_fault;
> +        }
> +
> +        descriptor = arm_casq_ptw(env, old_des, new_des, &s1, fi);

Are we allowed to do the access and dirty bit updates with separate
atomic accesses? This bit in D5.4.12 suggests the expectation is it
will be a single atomic rmw to update the whole thing:
"For a Block or Page Translation Table descriptor for which the AF bit
is 0, the DBM bit is 1, and either the value of the stage 1 AP[2] bit
is 1 or the value of the stage 2 S2AP[1] bit is 0, both AF can be set
to 1, and either AP[2] set to 0 or S2AP[1] set to 1, in a single
atomic read-modify-write operation, as a result of an attempted write
to a memory location that uses the translation table entry."
(though it depends how you interpret that "can"...)

> +        if (fi->type != ARMFault_None) {
> +            goto do_fault;
> +        }
> +        if (old_des != descriptor) {
> +            goto restart_atomic_update;
> +        }
> +
> +        /* Success: the page is now writable. */
> +        result->f.prot |= 1 << MMU_DATA_STORE;
>      }
>
>      if (ns) {
> --
> 2.34.1

thanks
-- PMM
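(For illustration only: the single-rmw reading of D5.4.12 would fold
both updates into one compare-and-swap. A minimal editorial sketch,
reusing the patch's arm_casq_ptw() helper and showing only the stage-1
case; this is not part of the patch:)

    /*
     * Hypothetical combined AF + dirty update, stage 1 only: set the
     * Access flag and clear AP[2] in a single compare-and-swap, so
     * both bits are published in one atomic read-modify-write.
     */
    uint64_t old_des = descriptor;
    uint64_t new_des = (descriptor | (1ull << 10))  /* AF */
                       & ~(1ull << 7);              /* AP[2]: writable */

    descriptor = arm_casq_ptw(env, old_des, new_des, &s1, fi);
    if (fi->type != ARMFault_None) {
        goto do_fault;
    }
    if (old_des != descriptor) {
        /* Lost the race: re-walk from the freshly read descriptor. */
        goto restart_atomic_update;
    }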
On Fri, 7 Oct 2022 at 14:47, Peter Maydell <peter.maydell@linaro.org> wrote:
> Do we really need to go all the way back to restart_atomic_update?

> Are we allowed to do the access and dirty bit updates with separate
> atomic accesses?

I've just discovered that the latest revision of the Arm ARM (rev I.a)
is clearer on this -- R_SGJBL and I_YZSVV clearly say that we need to
go back to restart_atomic_update for dirty bit updates, and it's a
reasonable assumption that this is true also for access flag updates.

And R_PRHKD says you're permitted to do everything in one rmw, which
clearly implies that you're permitted also not to. And if you restart
the descriptor handling it's architecturally not distinguishable
whether you did one rmw or two.

So both of these are fine the way you have them in your patch.

thanks
-- PMM
On 10/7/22 06:47, Peter Maydell wrote:
> On Sat, 1 Oct 2022 at 18:04, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> Perform the atomic update for hardware management of the access flag
>> and the dirty bit.
>>
>> A limitation of the implementation so far is that the page table
>> itself must already be writable, i.e. the dirty bit for the stage2
>> page table must already be set, i.e. we cannot set both dirty bits
>> at the same time.
>>
>> This is allowed because it is CONSTRAINED UNPREDICTABLE whether any
>> atomic update happens at all. The implementation is allowed to simply
>> fall back on software update at any time.
>
> I can't see where in the Arm ARM this is stated.
>
> In any case, if HA is set you can't simply return ARMFault_AccessFlag
> without breaking the bit in D5.4.12 which says
> "When hardware updates of the Access flag are enabled for a stage of
> translation an address translation instruction that uses that stage
> of translation will not report that the address will give rise to
> an Access flag fault in the PAR"

I think this may have been a loose (or incorrect) reading of what has
become R_QSPMS in I.a, via "access generates ... a Permission fault",
due to the pte page being read-only, due to the dirty bit being clear.

However, having performed the same atomic update conversion for x86,
I can see now that I merely need to perform the lookup for s1 ptw with
MMU_DATA_STORE rather than MMU_DATA_LOAD in order for the s1 pte page
to have its own dirty bit processed and become writable.

>> +    t = FIELD_DP64(t, ID_AA64MMFR1, HAFDBS, 2);   /* FEAT_HAFDBS */
>
> I think we should split the access flag update and the
> dirty-bit update into separate commits. It might be useful
> for bisection purposes later, and it looks like they're pretty
> cleanly separable. (Though if you look at my last comment in this
> email, maybe not quite so clean as in the code as you have it here.)

Shouldn't be too hard to split. I'll try, at least.

>> +static uint64_t arm_casq_ptw(CPUARMState *env, uint64_t old_val,
>> +                             uint64_t new_val, const S1TranslateResult *s1,
>> +                             ARMMMUFaultInfo *fi)
>> +{
>> +    uint64_t cur_val;
>> +
>> +    if (unlikely(!s1->hphys)) {
>> +        fi->type = ARMFault_UnsuppAtomicUpdate;
>> +        fi->s1ptw = true;
>> +        return 0;
>> +    }
>> +
>> +#ifndef CONFIG_ATOMIC64
>> +    /*
>> +     * We can't support the atomic operation on the host.  We should be
>> +     * running in round-robin mode though, which means that we would only
>> +     * race with dma i/o.
>> +     */
>> +    qemu_mutex_lock_iothread();
>
> Are there definitely no code paths where we might try to do
> a page table walk with the iothread already locked ?

I'll double-check, but another possibility is to simply perform the
atomic operation on the low 32-bits, where both AF and DB are located.
Another trick I learned from x86...

>> +        old_des = descriptor;
>> +        if (mmu_idx == ARMMMUIdx_Stage2 || mmu_idx == ARMMMUIdx_Stage2_S) {
>> +            new_des = descriptor | (1ull << 7);   /* S2AP[1] */
>> +        } else {
>> +            new_des = descriptor & ~(1ull << 7);  /* AP[2] */
>> +        }
>
> If we update the prot bits, we need to also re-calculate the exec bit,
> I think, because the execute-never stuff depends on whether the page
> is writeable. Alternatively you can do it the way the pseudocode does
> and pre-figure-out the final permission bits value (see
> AArch64.S1ApplyOutputPerms() and AArch64.S2ApplyOutputPerms()) so you
> only need to calculate the exec bit once.

Good catch.

r~
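(For concreteness, the low-32-bit idea above would look something like
the editorial sketch below, mirroring the 64-bit little-endian path in
arm_casq_ptw(). It is hypothetical, and as the next reply points out,
not sufficient as-is:)

    /*
     * Hypothetical 32-bit fallback: AF (bit 10) and AP[2]/S2AP[1]
     * (bit 7) both live in the low 32 bits of the descriptor, so a
     * 32-bit cmpxchg can update them even without CONFIG_ATOMIC64.
     * Little-endian descriptor layout shown; a big-endian pte would
     * need the compare done on the word at offset 4.
     */
    uint32_t old_lo = cpu_to_le32((uint32_t)old_val);
    uint32_t new_lo = cpu_to_le32((uint32_t)new_val);
    uint32_t cur_lo = qatomic_cmpxchg__nocheck((uint32_t *)s1->hphys,
                                               old_lo, new_lo);
    cur_lo = le32_to_cpu(cur_lo);
    /* Caveat: concurrent writes to the high 32 bits go undetected. */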
On Fri, 7 Oct 2022 at 17:45, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 10/7/22 06:47, Peter Maydell wrote:
> > Are there definitely no code paths where we might try to do
> > a page table walk with the iothread already locked ?
>
> I'll double-check, but another possibility is to simply perform the
> atomic operation on the low 32-bits, where both AF and DB are located.
> Another trick I learned from x86...

Doesn't that cause a problem where we don't detect that some other
CPU wrote to the high 32 bits of the descriptor ? We're supposed to
be using those high 32 bits, not the ones we have in hand...

If we do need the iothread lock, we could do it the way that
io_readx() does, I guess, where we track whether we needed to
lock it or not.

thanks
-- PMM
On 10/7/22 09:50, Peter Maydell wrote:
> On Fri, 7 Oct 2022 at 17:45, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> On 10/7/22 06:47, Peter Maydell wrote:
>>> Are there definitely no code paths where we might try to do
>>> a page table walk with the iothread already locked ?
>>
>> I'll double-check, but another possibility is to simply perform the
>> atomic operation on the low 32-bits, where both AF and DB are located.
>> Another trick I learned from x86...
>
> Doesn't that cause a problem where we don't detect that some other
> CPU wrote to the high 32 bits of the descriptor ? We're supposed to
> be using those high 32 bits, not the ones we have in hand...

Hmm, yes. Which now makes me wonder if the x86 case is in fact buggy...

> If we do need the iothread lock, we could do it the way that
> io_readx() does, I guess, where we track whether we needed to
> lock it or not.

yes.

r~
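(The io_readx()-style pattern referenced above would look roughly like
this in the non-CONFIG_ATOMIC64 branch of arm_casq_ptw() -- a minimal
editorial sketch, assuming the existing qemu_mutex_iothread_locked()
helper:)

    /* Take the big lock only if we don't already hold it. */
    bool locked = qemu_mutex_iothread_locked();

    if (!locked) {
        qemu_mutex_lock_iothread();
    }
    if (s1->be) {
        cur_val = ldq_be_p(s1->hphys);
        if (cur_val == old_val) {
            stq_be_p(s1->hphys, new_val);
        }
    } else {
        cur_val = ldq_le_p(s1->hphys);
        if (cur_val == old_val) {
            stq_le_p(s1->hphys, new_val);
        }
    }
    if (!locked) {
        qemu_mutex_unlock_iothread();
    }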
diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst
index be7bbffe59..c3582d075e 100644
--- a/docs/system/arm/emulation.rst
+++ b/docs/system/arm/emulation.rst
@@ -31,6 +31,7 @@ the following architecture extensions:
  - FEAT_FRINTTS (Floating-point to integer instructions)
  - FEAT_FlagM (Flag manipulation instructions v2)
  - FEAT_FlagM2 (Enhancements to flag manipulation instructions)
+ - FEAT_HAFDBS (Hardware management of the access flag and dirty bit state)
  - FEAT_HCX (Support for the HCRX_EL2 register)
  - FEAT_HPDS (Hierarchical permission disables)
  - FEAT_I8MM (AArch64 Int8 matrix multiplication instructions)
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index e6314e86d2..b064dc7964 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -1116,6 +1116,7 @@ static void aarch64_max_initfn(Object *obj)
     cpu->isar.id_aa64mmfr0 = t;
 
     t = cpu->isar.id_aa64mmfr1;
+    t = FIELD_DP64(t, ID_AA64MMFR1, HAFDBS, 2);   /* FEAT_HAFDBS */
     t = FIELD_DP64(t, ID_AA64MMFR1, VMIDBITS, 2); /* FEAT_VMID16 */
     t = FIELD_DP64(t, ID_AA64MMFR1, VH, 1);       /* FEAT_VHE */
     t = FIELD_DP64(t, ID_AA64MMFR1, HPDS, 1);     /* FEAT_HPDS */
diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index 45734b0d28..14ab56d1b5 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -223,6 +223,7 @@ static bool S2_attrs_are_device(uint64_t hcr, uint8_t attrs)
 typedef struct {
     bool is_secure;
     bool be;
+    bool rw;
     void *hphys;
     hwaddr gphys;
 } S1TranslateResult;
@@ -261,7 +262,8 @@ static bool S1_ptw_translate(CPUARMState *env, ARMMMUIdx mmu_idx,
             pte_attrs = s2.cacheattrs.attrs;
             pte_secure = s2.f.attrs.secure;
         }
-        res->hphys = NULL;
+        res->hphys = NULL;  /* force slow path */
+        res->rw = false;    /* debug never modifies */
     } else {
         CPUTLBEntryFull *full;
         int flags;
@@ -276,6 +278,7 @@ static bool S1_ptw_translate(CPUARMState *env, ARMMMUIdx mmu_idx,
             goto fail;
         }
         res->gphys = full->phys_addr;
+        res->rw = full->prot & PAGE_WRITE;
         pte_attrs = full->pte_attrs;
         pte_secure = full->attrs.secure;
     }
@@ -381,6 +384,56 @@ static uint64_t arm_ldq_ptw(CPUARMState *env, const S1TranslateResult *s1,
     return data;
 }
 
+static uint64_t arm_casq_ptw(CPUARMState *env, uint64_t old_val,
+                             uint64_t new_val, const S1TranslateResult *s1,
+                             ARMMMUFaultInfo *fi)
+{
+    uint64_t cur_val;
+
+    if (unlikely(!s1->hphys)) {
+        fi->type = ARMFault_UnsuppAtomicUpdate;
+        fi->s1ptw = true;
+        return 0;
+    }
+
+#ifndef CONFIG_ATOMIC64
+    /*
+     * We can't support the atomic operation on the host.  We should be
+     * running in round-robin mode though, which means that we would only
+     * race with dma i/o.
+     */
+    qemu_mutex_lock_iothread();
+    if (s1->be) {
+        cur_val = ldq_be_p(s1->hphys);
+        if (cur_val == old_val) {
+            stq_be_p(s1->hphys, new_val);
+        }
+    } else {
+        cur_val = ldq_le_p(s1->hphys);
+        if (cur_val == old_val) {
+            stq_le_p(s1->hphys, new_val);
+        }
+    }
+    qemu_mutex_unlock_iothread();
+#else
+    if (s1->be) {
+        old_val = cpu_to_be64(old_val);
+        new_val = cpu_to_be64(new_val);
+        cur_val = qatomic_cmpxchg__nocheck((uint64_t *)s1->hphys,
+                                           old_val, new_val);
+        cur_val = be64_to_cpu(cur_val);
+    } else {
+        old_val = cpu_to_le64(old_val);
+        new_val = cpu_to_le64(new_val);
+        cur_val = qatomic_cmpxchg__nocheck((uint64_t *)s1->hphys,
+                                           old_val, new_val);
+        cur_val = le64_to_cpu(cur_val);
+    }
+#endif
+
+    return cur_val;
+}
+
 static bool get_level1_table_address(CPUARMState *env, ARMMMUIdx mmu_idx,
                                      uint32_t *table, uint32_t address)
 {
@@ -1290,6 +1343,7 @@ static bool get_phys_addr_lpae(CPUARMState *env, uint64_t address,
         goto do_fault;
     }
 
+ restart_atomic_update:
     if (!(descriptor & 1) || (!(descriptor & 2) && (level == 3))) {
         /* Invalid, or the Reserved level 3 encoding */
         goto do_translation_fault;
@@ -1365,10 +1419,28 @@ static bool get_phys_addr_lpae(CPUARMState *env, uint64_t address,
      * Here descaddr is the final physical address, and attributes
      * are all in attrs.
      */
-    if ((attrs & (1 << 10)) == 0) {
+    if ((attrs & (1 << 10)) == 0 && !debug) {
         /* Access flag */
-        fi->type = ARMFault_AccessFlag;
-        goto do_fault;
+        uint64_t new_des, old_des;
+
+        /*
+         * If HA is disabled, or if the pte is not writable,
+         * pass on the access fault to software.
+         */
+        if (!param.ha || !s1.rw) {
+            fi->type = ARMFault_AccessFlag;
+            goto do_fault;
+        }
+
+        old_des = descriptor;
+        new_des = descriptor | (1 << 10); /* AF */
+        descriptor = arm_casq_ptw(env, old_des, new_des, &s1, fi);
+        if (fi->type != ARMFault_None) {
+            goto do_fault;
+        }
+        if (old_des != descriptor) {
+            goto restart_atomic_update;
+        }
     }
 
     ap = extract32(attrs, 6, 2);
@@ -1385,8 +1457,43 @@ static bool get_phys_addr_lpae(CPUARMState *env, uint64_t address,
     }
 
     if (!(result->f.prot & (1 << access_type))) {
-        fi->type = ARMFault_Permission;
-        goto do_fault;
+        uint64_t new_des, old_des;
+
+        /* Writes may set dirty if DBM attribute is set. */
+        if (!param.hd
+            || access_type != MMU_DATA_STORE
+            || !extract64(attrs, 51, 1) /* DBM */
+            || !s1.rw) {
+            fi->type = ARMFault_Permission;
+            goto do_fault;
+        }
+
+        old_des = descriptor;
+        if (mmu_idx == ARMMMUIdx_Stage2 || mmu_idx == ARMMMUIdx_Stage2_S) {
+            new_des = descriptor | (1ull << 7);   /* S2AP[1] */
+        } else {
+            new_des = descriptor & ~(1ull << 7);  /* AP[2] */
+        }
+
+        /*
+         * If the descriptor didn't change, then attributes weren't the
+         * reason for the permission fault, so deliver it.
+         */
+        if (old_des == new_des) {
+            fi->type = ARMFault_Permission;
+            goto do_fault;
+        }
+
+        descriptor = arm_casq_ptw(env, old_des, new_des, &s1, fi);
+        if (fi->type != ARMFault_None) {
+            goto do_fault;
+        }
+        if (old_des != descriptor) {
+            goto restart_atomic_update;
+        }
+
+        /* Success: the page is now writable. */
+        result->f.prot |= 1 << MMU_DATA_STORE;
     }
 
     if (ns) {
Perform the atomic update for hardware management of the access flag
and the dirty bit.

A limitation of the implementation so far is that the page table
itself must already be writable, i.e. the dirty bit for the stage2
page table must already be set, i.e. we cannot set both dirty bits
at the same time.

This is allowed because it is CONSTRAINED UNPREDICTABLE whether any
atomic update happens at all. The implementation is allowed to simply
fall back on software update at any time.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 docs/system/arm/emulation.rst |   1 +
 target/arm/cpu64.c            |   1 +
 target/arm/ptw.c              | 119 ++++++++++++++++++++++++++++++++--
 3 files changed, 115 insertions(+), 6 deletions(-)