Message ID | 20250430165655.605595-11-tabba@google.com
---|---
State | New
Series | KVM: Mapping guest_memfd backed memory at the host for software protected VMs
On Wed, Apr 30, 2025 at 9:57 AM Fuad Tabba <tabba@google.com> wrote: > > Add arm64 support for handling guest page faults on guest_memfd > backed memslots. > > For now, the fault granule is restricted to PAGE_SIZE. > > Signed-off-by: Fuad Tabba <tabba@google.com> > --- > arch/arm64/kvm/mmu.c | 65 +++++++++++++++++++++++++++------------- > include/linux/kvm_host.h | 5 ++++ > virt/kvm/kvm_main.c | 5 ---- > 3 files changed, 50 insertions(+), 25 deletions(-) > > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c > index 148a97c129de..d1044c7f78bb 100644 > --- a/arch/arm64/kvm/mmu.c > +++ b/arch/arm64/kvm/mmu.c > @@ -1466,6 +1466,30 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) > return vma->vm_flags & VM_MTE_ALLOWED; > } > > +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, > + gfn_t gfn, bool write_fault, bool *writable, > + struct page **page, bool is_gmem) > +{ > + kvm_pfn_t pfn; > + int ret; > + > + if (!is_gmem) > + return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page); > + > + *writable = false; > + > + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL); > + if (!ret) { > + *writable = !memslot_is_readonly(slot); > + return pfn; > + } > + > + if (ret == -EHWPOISON) > + return KVM_PFN_ERR_HWPOISON; > + > + return KVM_PFN_ERR_NOSLOT_MASK; > +} > + > static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > struct kvm_s2_trans *nested, > struct kvm_memory_slot *memslot, unsigned long hva, > @@ -1473,19 +1497,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > { > int ret = 0; > bool write_fault, writable; > - bool exec_fault, mte_allowed; > + bool exec_fault, mte_allowed = false; > bool device = false, vfio_allow_any_uc = false; > unsigned long mmu_seq; > phys_addr_t ipa = fault_ipa; > struct kvm *kvm = vcpu->kvm; > - struct vm_area_struct *vma; > + struct vm_area_struct *vma = NULL; > short vma_shift; > void *memcache; > - gfn_t gfn; > + gfn_t gfn = ipa >> PAGE_SHIFT; > kvm_pfn_t pfn; > bool logging_active = memslot_is_logging(memslot); > - bool force_pte = logging_active || is_protected_kvm_enabled(); > - long vma_pagesize, fault_granule; > + bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn); > + bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled(); > + long vma_pagesize, fault_granule = PAGE_SIZE; > enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; > struct kvm_pgtable *pgt; > struct page *page; > @@ -1522,16 +1547,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > return ret; > } > > + mmap_read_lock(current->mm); We don't have to take the mmap_lock for gmem faults, right? I think we should reorganize user_mem_abort() a bit (and I think vma_pagesize and maybe vma_shift should be renamed) given the changes we're making here. Below is a diff that I think might be a little cleaner. Let me know what you think. > + > /* > * Let's check if we will get back a huge page backed by hugetlbfs, or > * get block mapping for device MMIO region. 
> */ > - mmap_read_lock(current->mm); > - vma = vma_lookup(current->mm, hva); > - if (unlikely(!vma)) { > - kvm_err("Failed to find VMA for hva 0x%lx\n", hva); > - mmap_read_unlock(current->mm); > - return -EFAULT; > + if (!is_gmem) { > + vma = vma_lookup(current->mm, hva); > + if (unlikely(!vma)) { > + kvm_err("Failed to find VMA for hva 0x%lx\n", hva); > + mmap_read_unlock(current->mm); > + return -EFAULT; > + } > + > + vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; > + mte_allowed = kvm_vma_mte_allowed(vma); > } > > if (force_pte) > @@ -1602,18 +1633,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > ipa &= ~(vma_pagesize - 1); > } > > - gfn = ipa >> PAGE_SHIFT; > - mte_allowed = kvm_vma_mte_allowed(vma); > - > - vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; > - > /* Don't use the VMA after the unlock -- it may have vanished */ > vma = NULL; > > /* > * Read mmu_invalidate_seq so that KVM can detect if the results of > - * vma_lookup() or __kvm_faultin_pfn() become stale prior to > - * acquiring kvm->mmu_lock. > + * vma_lookup() or faultin_pfn() become stale prior to acquiring > + * kvm->mmu_lock. > * > * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs > * with the smp_wmb() in kvm_mmu_invalidate_end(). > @@ -1621,8 +1647,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > mmu_seq = vcpu->kvm->mmu_invalidate_seq; > mmap_read_unlock(current->mm); > > - pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, > - &writable, &page); > + pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem); > if (pfn == KVM_PFN_ERR_HWPOISON) { I think we need to take care to handle HWPOISON properly. I know that it is (or will most likely be) the case that GUP(hva) --> pfn, but with gmem, it *might* not be the case. So the following line isn't right. I think we need to handle HWPOISON for gmem using memory fault exits instead of sending a SIGBUS to userspace. This would be consistent with how KVM/x86 today handles getting a HWPOISON page back from kvm_gmem_get_pfn(). I'm not entirely sure how KVM/x86 is meant to handle HWPOISON on shared gmem pages yet; I need to keep reading your series. The reorganization diff below leaves this unfixed. > kvm_send_hwpoison_signal(hva, vma_shift); > return 0; > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index f3af6bff3232..1b2e4e9a7802 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn) > return gfn_to_memslot(kvm, gfn)->id; > } > > +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot) > +{ > + return slot->flags & KVM_MEM_READONLY; > +} > + > static inline gfn_t > hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) > { > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index c75d8e188eb7..d9bca5ba19dc 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -2640,11 +2640,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) > return size; > } > > -static bool memslot_is_readonly(const struct kvm_memory_slot *slot) > -{ > - return slot->flags & KVM_MEM_READONLY; > -} > - > static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn, > gfn_t *nr_pages, bool write) > { > -- > 2.49.0.901.g37484f566f-goog Thanks, Fuad! 
Here's the reorganization/rename diff: diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d1044c7f78bba..c9eb72fe9013b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1502,7 +1502,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long mmu_seq; phys_addr_t ipa = fault_ipa; struct kvm *kvm = vcpu->kvm; - struct vm_area_struct *vma = NULL; short vma_shift; void *memcache; gfn_t gfn = ipa >> PAGE_SHIFT; @@ -1510,7 +1509,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, bool logging_active = memslot_is_logging(memslot); bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn); bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled(); - long vma_pagesize, fault_granule = PAGE_SIZE; + long target_size = PAGE_SIZE, fault_granule = PAGE_SIZE; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; struct page *page; @@ -1547,13 +1546,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return ret; } - mmap_read_lock(current->mm); - /* * Let's check if we will get back a huge page backed by hugetlbfs, or * get block mapping for device MMIO region. */ if (!is_gmem) { + struct vm_area_struct *vma = NULL; + + mmap_read_lock(current->mm); + vma = vma_lookup(current->mm, hva); if (unlikely(!vma)) { kvm_err("Failed to find VMA for hva 0x%lx\n", hva); @@ -1563,38 +1564,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; mte_allowed = kvm_vma_mte_allowed(vma); - } - - if (force_pte) - vma_shift = PAGE_SHIFT; - else - vma_shift = get_vma_page_shift(vma, hva); + vma_shift = force_pte ? get_vma_page_shift(vma, hva) : PAGE_SHIFT; - switch (vma_shift) { + switch (vma_shift) { #ifndef __PAGETABLE_PMD_FOLDED - case PUD_SHIFT: - if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) - break; - fallthrough; + case PUD_SHIFT: + if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) + break; + fallthrough; #endif - case CONT_PMD_SHIFT: - vma_shift = PMD_SHIFT; - fallthrough; - case PMD_SHIFT: - if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) + case CONT_PMD_SHIFT: + vma_shift = PMD_SHIFT; + fallthrough; + case PMD_SHIFT: + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) + break; + fallthrough; + case CONT_PTE_SHIFT: + vma_shift = PAGE_SHIFT; + force_pte = true; + fallthrough; + case PAGE_SHIFT: break; - fallthrough; - case CONT_PTE_SHIFT: - vma_shift = PAGE_SHIFT; - force_pte = true; - fallthrough; - case PAGE_SHIFT: - break; - default: - WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); - } + default: + WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); + } - vma_pagesize = 1UL << vma_shift; + /* + * Read mmu_invalidate_seq so that KVM can detect if the results of + * vma_lookup() or faultin_pfn() become stale prior to acquiring + * kvm->mmu_lock. + * + * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs + * with the smp_wmb() in kvm_mmu_invalidate_end(). 
+ */ + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + mmap_read_unlock(current->mm); + + target_size = 1UL << vma_shift; + } if (nested) { unsigned long max_map_size; @@ -1620,7 +1628,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, max_map_size = PAGE_SIZE; force_pte = (max_map_size == PAGE_SIZE); - vma_pagesize = min(vma_pagesize, (long)max_map_size); + target_size = min(target_size, (long)max_map_size); } /* @@ -1628,27 +1636,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * ensure we find the right PFN and lay down the mapping in the right * place. */ - if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { - fault_ipa &= ~(vma_pagesize - 1); - ipa &= ~(vma_pagesize - 1); + if (target_size == PMD_SIZE || target_size == PUD_SIZE) { + fault_ipa &= ~(target_size - 1); + ipa &= ~(target_size - 1); } - /* Don't use the VMA after the unlock -- it may have vanished */ - vma = NULL; - - /* - * Read mmu_invalidate_seq so that KVM can detect if the results of - * vma_lookup() or faultin_pfn() become stale prior to acquiring - * kvm->mmu_lock. - * - * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs - * with the smp_wmb() in kvm_mmu_invalidate_end(). - */ - mmu_seq = vcpu->kvm->mmu_invalidate_seq; - mmap_read_unlock(current->mm); - pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem); if (pfn == KVM_PFN_ERR_HWPOISON) { + // TODO: Handle gmem properly. vma_shift + // intentionally left uninitialized. kvm_send_hwpoison_signal(hva, vma_shift); return 0; } @@ -1658,9 +1654,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (kvm_is_device_pfn(pfn)) { /* * If the page was identified as device early by looking at - * the VMA flags, vma_pagesize is already representing the + * the VMA flags, target_size is already representing the * largest quantity we can map. If instead it was mapped - * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE + * via __kvm_faultin_pfn(), target_size is set to PAGE_SIZE * and must not be upgraded. * * In both cases, we don't let transparent_hugepage_adjust() @@ -1699,7 +1695,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_fault_lock(kvm); pgt = vcpu->arch.hw_mmu->pgt; - if (mmu_invalidate_retry(kvm, mmu_seq)) { + if (!is_gmem && mmu_invalidate_retry(kvm, mmu_seq)) { ret = -EAGAIN; goto out_unlock; } @@ -1708,16 +1704,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * If we are not forced to use page mapping, check if we are * backed by a THP and thus use block mapping if possible. 
*/ - if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { + if (target_size == PAGE_SIZE && !(force_pte || device)) { if (fault_is_perm && fault_granule > PAGE_SIZE) - vma_pagesize = fault_granule; - else - vma_pagesize = transparent_hugepage_adjust(kvm, memslot, + target_size = fault_granule; + else if (!is_gmem) + target_size = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, &fault_ipa); - if (vma_pagesize < 0) { - ret = vma_pagesize; + if (target_size < 0) { + ret = target_size; goto out_unlock; } } @@ -1725,7 +1721,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (!fault_is_perm && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new disallowed VMA */ if (mte_allowed) { - sanitise_mte_tags(kvm, pfn, vma_pagesize); + sanitise_mte_tags(kvm, pfn, target_size); } else { ret = -EFAULT; goto out_unlock; @@ -1750,10 +1746,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* * Under the premise of getting a FSC_PERM fault, we just need to relax - * permissions only if vma_pagesize equals fault_granule. Otherwise, + * permissions only if target_size equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - if (fault_is_perm && vma_pagesize == fault_granule) { + if (fault_is_perm && target_size == fault_granule) { /* * Drop the SW bits in favour of those stored in the * PTE, which will be preserved. @@ -1761,7 +1757,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, prot &= ~KVM_NV_GUEST_MAP_SZ; ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); } else { - ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, target_size, __pfn_to_phys(pfn), prot, memcache, flags); }
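To make the HWPOISON suggestion above concrete, here is a minimal sketch of what reporting a poisoned gmem page via a memory fault exit could look like, mirroring how x86 reacts to -EHWPOISON from kvm_gmem_get_pfn(). It reuses the variable names from this series and the existing kvm_prepare_memory_fault_exit() helper, but it is not code from the series, and how shared gmem pages should set the exit flags is left open, as discussed above.

/*
 * Hedged sketch, not from the posted series: for gmem, exit to userspace
 * with KVM_EXIT_MEMORY_FAULT instead of queuing SIGBUS against an hva and
 * vma_shift that may be meaningless for guest_memfd-backed memory.
 */
	pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		if (is_gmem) {
			/* is_private handling for shared gmem is an open question. */
			kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
						      write_fault, exec_fault,
						      false);
			return -EFAULT;
		}
		kvm_send_hwpoison_signal(hva, vma_shift);
		return 0;
	}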
Hi James, On Fri, 9 May 2025 at 21:15, James Houghton <jthoughton@google.com> wrote: > > On Wed, Apr 30, 2025 at 9:57 AM Fuad Tabba <tabba@google.com> wrote: > > > > Add arm64 support for handling guest page faults on guest_memfd > > backed memslots. > > > > For now, the fault granule is restricted to PAGE_SIZE. > > > > Signed-off-by: Fuad Tabba <tabba@google.com> > > --- > > arch/arm64/kvm/mmu.c | 65 +++++++++++++++++++++++++++------------- > > include/linux/kvm_host.h | 5 ++++ > > virt/kvm/kvm_main.c | 5 ---- > > 3 files changed, 50 insertions(+), 25 deletions(-) > > > > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c > > index 148a97c129de..d1044c7f78bb 100644 > > --- a/arch/arm64/kvm/mmu.c > > +++ b/arch/arm64/kvm/mmu.c > > @@ -1466,6 +1466,30 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) > > return vma->vm_flags & VM_MTE_ALLOWED; > > } > > > > +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, > > + gfn_t gfn, bool write_fault, bool *writable, > > + struct page **page, bool is_gmem) > > +{ > > + kvm_pfn_t pfn; > > + int ret; > > + > > + if (!is_gmem) > > + return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page); > > + > > + *writable = false; > > + > > + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL); > > + if (!ret) { > > + *writable = !memslot_is_readonly(slot); > > + return pfn; > > + } > > + > > + if (ret == -EHWPOISON) > > + return KVM_PFN_ERR_HWPOISON; > > + > > + return KVM_PFN_ERR_NOSLOT_MASK; > > +} > > + > > static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > > struct kvm_s2_trans *nested, > > struct kvm_memory_slot *memslot, unsigned long hva, > > @@ -1473,19 +1497,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > > { > > int ret = 0; > > bool write_fault, writable; > > - bool exec_fault, mte_allowed; > > + bool exec_fault, mte_allowed = false; > > bool device = false, vfio_allow_any_uc = false; > > unsigned long mmu_seq; > > phys_addr_t ipa = fault_ipa; > > struct kvm *kvm = vcpu->kvm; > > - struct vm_area_struct *vma; > > + struct vm_area_struct *vma = NULL; > > short vma_shift; > > void *memcache; > > - gfn_t gfn; > > + gfn_t gfn = ipa >> PAGE_SHIFT; > > kvm_pfn_t pfn; > > bool logging_active = memslot_is_logging(memslot); > > - bool force_pte = logging_active || is_protected_kvm_enabled(); > > - long vma_pagesize, fault_granule; > > + bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn); > > + bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled(); > > + long vma_pagesize, fault_granule = PAGE_SIZE; > > enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; > > struct kvm_pgtable *pgt; > > struct page *page; > > @@ -1522,16 +1547,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > > return ret; > > } > > > > + mmap_read_lock(current->mm); > > We don't have to take the mmap_lock for gmem faults, right? > > I think we should reorganize user_mem_abort() a bit (and I think vma_pagesize > and maybe vma_shift should be renamed) given the changes we're making here. Good point. > Below is a diff that I think might be a little cleaner. Let me know what you > think. > > > + > > /* > > * Let's check if we will get back a huge page backed by hugetlbfs, or > > * get block mapping for device MMIO region. 
> > */ > > - mmap_read_lock(current->mm); > > - vma = vma_lookup(current->mm, hva); > > - if (unlikely(!vma)) { > > - kvm_err("Failed to find VMA for hva 0x%lx\n", hva); > > - mmap_read_unlock(current->mm); > > - return -EFAULT; > > + if (!is_gmem) { > > + vma = vma_lookup(current->mm, hva); > > + if (unlikely(!vma)) { > > + kvm_err("Failed to find VMA for hva 0x%lx\n", hva); > > + mmap_read_unlock(current->mm); > > + return -EFAULT; > > + } > > + > > + vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; > > + mte_allowed = kvm_vma_mte_allowed(vma); > > } > > > > if (force_pte) > > @@ -1602,18 +1633,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > > ipa &= ~(vma_pagesize - 1); > > } > > > > - gfn = ipa >> PAGE_SHIFT; > > - mte_allowed = kvm_vma_mte_allowed(vma); > > - > > - vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; > > - > > /* Don't use the VMA after the unlock -- it may have vanished */ > > vma = NULL; > > > > /* > > * Read mmu_invalidate_seq so that KVM can detect if the results of > > - * vma_lookup() or __kvm_faultin_pfn() become stale prior to > > - * acquiring kvm->mmu_lock. > > + * vma_lookup() or faultin_pfn() become stale prior to acquiring > > + * kvm->mmu_lock. > > * > > * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs > > * with the smp_wmb() in kvm_mmu_invalidate_end(). > > @@ -1621,8 +1647,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > > mmu_seq = vcpu->kvm->mmu_invalidate_seq; > > mmap_read_unlock(current->mm); > > > > - pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, > > - &writable, &page); > > + pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem); > > if (pfn == KVM_PFN_ERR_HWPOISON) { > > I think we need to take care to handle HWPOISON properly. I know that it is > (or will most likely be) the case that GUP(hva) --> pfn, but with gmem, > it *might* not be the case. So the following line isn't right. > > I think we need to handle HWPOISON for gmem using memory fault exits instead of > sending a SIGBUS to userspace. This would be consistent with how KVM/x86 > today handles getting a HWPOISON page back from kvm_gmem_get_pfn(). I'm not > entirely sure how KVM/x86 is meant to handle HWPOISON on shared gmem pages yet; > I need to keep reading your series. You're right. In the next respin (coming soon), Ackerley has added a patch that performs a best-effort check to ensure that hva matches the gfn. > The reorganization diff below leaves this unfixed. 
> > > kvm_send_hwpoison_signal(hva, vma_shift); > > return 0; > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > index f3af6bff3232..1b2e4e9a7802 100644 > > --- a/include/linux/kvm_host.h > > +++ b/include/linux/kvm_host.h > > @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn) > > return gfn_to_memslot(kvm, gfn)->id; > > } > > > > +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot) > > +{ > > + return slot->flags & KVM_MEM_READONLY; > > +} > > + > > static inline gfn_t > > hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) > > { > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > index c75d8e188eb7..d9bca5ba19dc 100644 > > --- a/virt/kvm/kvm_main.c > > +++ b/virt/kvm/kvm_main.c > > @@ -2640,11 +2640,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) > > return size; > > } > > > > -static bool memslot_is_readonly(const struct kvm_memory_slot *slot) > > -{ > > - return slot->flags & KVM_MEM_READONLY; > > -} > > - > > static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn, > > gfn_t *nr_pages, bool write) > > { > > -- > > 2.49.0.901.g37484f566f-goog > > Thanks, Fuad! Here's the reorganization/rename diff: Thank you James. This is very helpful. Cheers, /fuad > > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c > index d1044c7f78bba..c9eb72fe9013b 100644 > --- a/arch/arm64/kvm/mmu.c > +++ b/arch/arm64/kvm/mmu.c > @@ -1502,7 +1502,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > unsigned long mmu_seq; > phys_addr_t ipa = fault_ipa; > struct kvm *kvm = vcpu->kvm; > - struct vm_area_struct *vma = NULL; > short vma_shift; > void *memcache; > gfn_t gfn = ipa >> PAGE_SHIFT; > @@ -1510,7 +1509,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > bool logging_active = memslot_is_logging(memslot); > bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn); > bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled(); > - long vma_pagesize, fault_granule = PAGE_SIZE; > + long target_size = PAGE_SIZE, fault_granule = PAGE_SIZE; > enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; > struct kvm_pgtable *pgt; > struct page *page; > @@ -1547,13 +1546,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > return ret; > } > > - mmap_read_lock(current->mm); > - > /* > * Let's check if we will get back a huge page backed by hugetlbfs, or > * get block mapping for device MMIO region. > */ > if (!is_gmem) { > + struct vm_area_struct *vma = NULL; > + > + mmap_read_lock(current->mm); > + > vma = vma_lookup(current->mm, hva); > if (unlikely(!vma)) { > kvm_err("Failed to find VMA for hva 0x%lx\n", hva); > @@ -1563,38 +1564,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > > vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; > mte_allowed = kvm_vma_mte_allowed(vma); > - } > - > - if (force_pte) > - vma_shift = PAGE_SHIFT; > - else > - vma_shift = get_vma_page_shift(vma, hva); > + vma_shift = force_pte ? 
get_vma_page_shift(vma, hva) : PAGE_SHIFT; > > - switch (vma_shift) { > + switch (vma_shift) { > #ifndef __PAGETABLE_PMD_FOLDED > - case PUD_SHIFT: > - if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) > - break; > - fallthrough; > + case PUD_SHIFT: > + if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) > + break; > + fallthrough; > #endif > - case CONT_PMD_SHIFT: > - vma_shift = PMD_SHIFT; > - fallthrough; > - case PMD_SHIFT: > - if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) > + case CONT_PMD_SHIFT: > + vma_shift = PMD_SHIFT; > + fallthrough; > + case PMD_SHIFT: > + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) > + break; > + fallthrough; > + case CONT_PTE_SHIFT: > + vma_shift = PAGE_SHIFT; > + force_pte = true; > + fallthrough; > + case PAGE_SHIFT: > break; > - fallthrough; > - case CONT_PTE_SHIFT: > - vma_shift = PAGE_SHIFT; > - force_pte = true; > - fallthrough; > - case PAGE_SHIFT: > - break; > - default: > - WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); > - } > + default: > + WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); > + } > > - vma_pagesize = 1UL << vma_shift; > + /* > + * Read mmu_invalidate_seq so that KVM can detect if the results of > + * vma_lookup() or faultin_pfn() become stale prior to acquiring > + * kvm->mmu_lock. > + * > + * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs > + * with the smp_wmb() in kvm_mmu_invalidate_end(). > + */ > + mmu_seq = vcpu->kvm->mmu_invalidate_seq; > + mmap_read_unlock(current->mm); > + > + target_size = 1UL << vma_shift; > + } > > if (nested) { > unsigned long max_map_size; > @@ -1620,7 +1628,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > max_map_size = PAGE_SIZE; > > force_pte = (max_map_size == PAGE_SIZE); > - vma_pagesize = min(vma_pagesize, (long)max_map_size); > + target_size = min(target_size, (long)max_map_size); > } > > /* > @@ -1628,27 +1636,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > * ensure we find the right PFN and lay down the mapping in the right > * place. > */ > - if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { > - fault_ipa &= ~(vma_pagesize - 1); > - ipa &= ~(vma_pagesize - 1); > + if (target_size == PMD_SIZE || target_size == PUD_SIZE) { > + fault_ipa &= ~(target_size - 1); > + ipa &= ~(target_size - 1); > } > > - /* Don't use the VMA after the unlock -- it may have vanished */ > - vma = NULL; > - > - /* > - * Read mmu_invalidate_seq so that KVM can detect if the results of > - * vma_lookup() or faultin_pfn() become stale prior to acquiring > - * kvm->mmu_lock. > - * > - * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs > - * with the smp_wmb() in kvm_mmu_invalidate_end(). > - */ > - mmu_seq = vcpu->kvm->mmu_invalidate_seq; > - mmap_read_unlock(current->mm); > - > pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem); > if (pfn == KVM_PFN_ERR_HWPOISON) { > + // TODO: Handle gmem properly. vma_shift > + // intentionally left uninitialized. > kvm_send_hwpoison_signal(hva, vma_shift); > return 0; > } > @@ -1658,9 +1654,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > if (kvm_is_device_pfn(pfn)) { > /* > * If the page was identified as device early by looking at > - * the VMA flags, vma_pagesize is already representing the > + * the VMA flags, target_size is already representing the > * largest quantity we can map. 
If instead it was mapped > - * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE > + * via __kvm_faultin_pfn(), target_size is set to PAGE_SIZE > * and must not be upgraded. > * > * In both cases, we don't let transparent_hugepage_adjust() > @@ -1699,7 +1695,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > > kvm_fault_lock(kvm); > pgt = vcpu->arch.hw_mmu->pgt; > - if (mmu_invalidate_retry(kvm, mmu_seq)) { > + if (!is_gmem && mmu_invalidate_retry(kvm, mmu_seq)) { > ret = -EAGAIN; > goto out_unlock; > } > @@ -1708,16 +1704,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > * If we are not forced to use page mapping, check if we are > * backed by a THP and thus use block mapping if possible. > */ > - if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { > + if (target_size == PAGE_SIZE && !(force_pte || device)) { > if (fault_is_perm && fault_granule > PAGE_SIZE) > - vma_pagesize = fault_granule; > - else > - vma_pagesize = transparent_hugepage_adjust(kvm, memslot, > + target_size = fault_granule; > + else if (!is_gmem) > + target_size = transparent_hugepage_adjust(kvm, memslot, > hva, &pfn, > &fault_ipa); > > - if (vma_pagesize < 0) { > - ret = vma_pagesize; > + if (target_size < 0) { > + ret = target_size; > goto out_unlock; > } > } > @@ -1725,7 +1721,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > if (!fault_is_perm && !device && kvm_has_mte(kvm)) { > /* Check the VMM hasn't introduced a new disallowed VMA */ > if (mte_allowed) { > - sanitise_mte_tags(kvm, pfn, vma_pagesize); > + sanitise_mte_tags(kvm, pfn, target_size); > } else { > ret = -EFAULT; > goto out_unlock; > @@ -1750,10 +1746,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > > /* > * Under the premise of getting a FSC_PERM fault, we just need to relax > - * permissions only if vma_pagesize equals fault_granule. Otherwise, > + * permissions only if target_size equals fault_granule. Otherwise, > * kvm_pgtable_stage2_map() should be called to change block size. > */ > - if (fault_is_perm && vma_pagesize == fault_granule) { > + if (fault_is_perm && target_size == fault_granule) { > /* > * Drop the SW bits in favour of those stored in the > * PTE, which will be preserved. > @@ -1761,7 +1757,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > prot &= ~KVM_NV_GUEST_MAP_SZ; > ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); > } else { > - ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, > + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, target_size, > __pfn_to_phys(pfn), prot, > memcache, flags); > }
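For readers wondering what a "best-effort check to ensure that hva matches the gfn" might mean, the helper below is a hypothetical illustration of the idea using only existing kvm_memory_slot fields. It is not Ackerley's actual patch, and the check in the next respin may look quite different.

/*
 * Hypothetical sketch only: confirm the faulting hva is the address this
 * memslot would map to the gfn, so hva-based actions such as
 * kvm_send_hwpoison_signal() at least target the right mapping.
 */
static bool hva_matches_gfn(const struct kvm_memory_slot *slot,
			    unsigned long hva, gfn_t gfn)
{
	unsigned long offset;

	if (gfn < slot->base_gfn || gfn >= slot->base_gfn + slot->npages)
		return false;

	offset = (gfn - slot->base_gfn) << PAGE_SHIFT;
	return (hva & PAGE_MASK) == slot->userspace_addr + offset;
}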
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 148a97c129de..d1044c7f78bb 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1466,6 +1466,30 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) return vma->vm_flags & VM_MTE_ALLOWED; } +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, bool write_fault, bool *writable, + struct page **page, bool is_gmem) +{ + kvm_pfn_t pfn; + int ret; + + if (!is_gmem) + return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page); + + *writable = false; + + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL); + if (!ret) { + *writable = !memslot_is_readonly(slot); + return pfn; + } + + if (ret == -EHWPOISON) + return KVM_PFN_ERR_HWPOISON; + + return KVM_PFN_ERR_NOSLOT_MASK; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_s2_trans *nested, struct kvm_memory_slot *memslot, unsigned long hva, @@ -1473,19 +1497,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, { int ret = 0; bool write_fault, writable; - bool exec_fault, mte_allowed; + bool exec_fault, mte_allowed = false; bool device = false, vfio_allow_any_uc = false; unsigned long mmu_seq; phys_addr_t ipa = fault_ipa; struct kvm *kvm = vcpu->kvm; - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; short vma_shift; void *memcache; - gfn_t gfn; + gfn_t gfn = ipa >> PAGE_SHIFT; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); - bool force_pte = logging_active || is_protected_kvm_enabled(); - long vma_pagesize, fault_granule; + bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn); + bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled(); + long vma_pagesize, fault_granule = PAGE_SIZE; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; struct page *page; @@ -1522,16 +1547,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return ret; } + mmap_read_lock(current->mm); + /* * Let's check if we will get back a huge page backed by hugetlbfs, or * get block mapping for device MMIO region. */ - mmap_read_lock(current->mm); - vma = vma_lookup(current->mm, hva); - if (unlikely(!vma)) { - kvm_err("Failed to find VMA for hva 0x%lx\n", hva); - mmap_read_unlock(current->mm); - return -EFAULT; + if (!is_gmem) { + vma = vma_lookup(current->mm, hva); + if (unlikely(!vma)) { + kvm_err("Failed to find VMA for hva 0x%lx\n", hva); + mmap_read_unlock(current->mm); + return -EFAULT; + } + + vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; + mte_allowed = kvm_vma_mte_allowed(vma); } if (force_pte) @@ -1602,18 +1633,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ipa &= ~(vma_pagesize - 1); } - gfn = ipa >> PAGE_SHIFT; - mte_allowed = kvm_vma_mte_allowed(vma); - - vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; - /* Don't use the VMA after the unlock -- it may have vanished */ vma = NULL; /* * Read mmu_invalidate_seq so that KVM can detect if the results of - * vma_lookup() or __kvm_faultin_pfn() become stale prior to - * acquiring kvm->mmu_lock. + * vma_lookup() or faultin_pfn() become stale prior to acquiring + * kvm->mmu_lock. * * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs * with the smp_wmb() in kvm_mmu_invalidate_end(). 
@@ -1621,8 +1647,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmu_seq = vcpu->kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); - pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, - &writable, &page); + pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f3af6bff3232..1b2e4e9a7802 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn) return gfn_to_memslot(kvm, gfn)->id; } +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot) +{ + return slot->flags & KVM_MEM_READONLY; +} + static inline gfn_t hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c75d8e188eb7..d9bca5ba19dc 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2640,11 +2640,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) return size; } -static bool memslot_is_readonly(const struct kvm_memory_slot *slot) -{ - return slot->flags & KVM_MEM_READONLY; -} - static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages, bool write) {
Add arm64 support for handling guest page faults on guest_memfd
backed memslots.

For now, the fault granule is restricted to PAGE_SIZE.

Signed-off-by: Fuad Tabba <tabba@google.com>
---
 arch/arm64/kvm/mmu.c     | 65 +++++++++++++++++++++++++++-------------
 include/linux/kvm_host.h |  5 ++++
 virt/kvm/kvm_main.c      |  5 ----
 3 files changed, 50 insertions(+), 25 deletions(-)
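For context on when this new fault path runs, the userspace sketch below sets up a guest_memfd-backed memslot using the baseline UAPI (KVM_CREATE_GUEST_MEMFD, KVM_MEM_GUEST_MEMFD, KVM_SET_USER_MEMORY_REGION2). It is an assumption-laden illustration, not part of this series: the additional flags the series introduces for host-mappable/shared gmem are omitted because their final names were still under discussion on the list.

/*
 * Hedged sketch: configure a memslot whose stage-2 faults the patch above
 * services. Assumes a <linux/kvm.h> new enough for the guest_memfd UAPI;
 * error handling and slot bookkeeping are abbreviated.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int add_gmem_slot(int vm_fd, uint64_t gpa, uint64_t size, void *hva)
{
	struct kvm_create_guest_memfd gmem = { .size = size };
	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);

	if (gmem_fd < 0)
		return -1;

	struct kvm_userspace_memory_region2 region = {
		.slot = 0,
		.flags = KVM_MEM_GUEST_MEMFD,
		.guest_phys_addr = gpa,
		.memory_size = size,
		/* hva consulted by the non-gmem path in user_mem_abort() */
		.userspace_addr = (uint64_t)(uintptr_t)hva,
		.guest_memfd = gmem_fd,
		.guest_memfd_offset = 0,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}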