
[v8,10/13] KVM: arm64: Handle guest_memfd()-backed guest page faults

Message ID 20250430165655.605595-11-tabba@google.com
State New
Series KVM: Mapping guest_memfd backed memory at the host for software protected VMs

Commit Message

Fuad Tabba April 30, 2025, 4:56 p.m. UTC
Add arm64 support for handling guest page faults on guest_memfd
backed memslots.

For now, the fault granule is restricted to PAGE_SIZE.

Signed-off-by: Fuad Tabba <tabba@google.com>
---
 arch/arm64/kvm/mmu.c     | 65 +++++++++++++++++++++++++++-------------
 include/linux/kvm_host.h |  5 ++++
 virt/kvm/kvm_main.c      |  5 ----
 3 files changed, 50 insertions(+), 25 deletions(-)

Comments

James Houghton May 9, 2025, 8:15 p.m. UTC | #1
On Wed, Apr 30, 2025 at 9:57 AM Fuad Tabba <tabba@google.com> wrote:
>
> Add arm64 support for handling guest page faults on guest_memfd
> backed memslots.
>
> For now, the fault granule is restricted to PAGE_SIZE.
>
> Signed-off-by: Fuad Tabba <tabba@google.com>
> ---
>  arch/arm64/kvm/mmu.c     | 65 +++++++++++++++++++++++++++-------------
>  include/linux/kvm_host.h |  5 ++++
>  virt/kvm/kvm_main.c      |  5 ----
>  3 files changed, 50 insertions(+), 25 deletions(-)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 148a97c129de..d1044c7f78bb 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1466,6 +1466,30 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
>         return vma->vm_flags & VM_MTE_ALLOWED;
>  }
>
> +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> +                            gfn_t gfn, bool write_fault, bool *writable,
> +                            struct page **page, bool is_gmem)
> +{
> +       kvm_pfn_t pfn;
> +       int ret;
> +
> +       if (!is_gmem)
> +               return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page);
> +
> +       *writable = false;
> +
> +       ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
> +       if (!ret) {
> +               *writable = !memslot_is_readonly(slot);
> +               return pfn;
> +       }
> +
> +       if (ret == -EHWPOISON)
> +               return KVM_PFN_ERR_HWPOISON;
> +
> +       return KVM_PFN_ERR_NOSLOT_MASK;
> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                           struct kvm_s2_trans *nested,
>                           struct kvm_memory_slot *memslot, unsigned long hva,
> @@ -1473,19 +1497,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  {
>         int ret = 0;
>         bool write_fault, writable;
> -       bool exec_fault, mte_allowed;
> +       bool exec_fault, mte_allowed = false;
>         bool device = false, vfio_allow_any_uc = false;
>         unsigned long mmu_seq;
>         phys_addr_t ipa = fault_ipa;
>         struct kvm *kvm = vcpu->kvm;
> -       struct vm_area_struct *vma;
> +       struct vm_area_struct *vma = NULL;
>         short vma_shift;
>         void *memcache;
> -       gfn_t gfn;
> +       gfn_t gfn = ipa >> PAGE_SHIFT;
>         kvm_pfn_t pfn;
>         bool logging_active = memslot_is_logging(memslot);
> -       bool force_pte = logging_active || is_protected_kvm_enabled();
> -       long vma_pagesize, fault_granule;
> +       bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn);
> +       bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled();
> +       long vma_pagesize, fault_granule = PAGE_SIZE;
>         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
>         struct kvm_pgtable *pgt;
>         struct page *page;
> @@ -1522,16 +1547,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                         return ret;
>         }
>
> +       mmap_read_lock(current->mm);

We don't have to take the mmap_lock for gmem faults, right?

I think we should reorganize user_mem_abort() a bit (and I think vma_pagesize
and maybe vma_shift should be renamed) given the changes we're making here.

Below is a diff that I think might be a little cleaner. Let me know what you
think.

> +
>         /*
>          * Let's check if we will get back a huge page backed by hugetlbfs, or
>          * get block mapping for device MMIO region.
>          */
> -       mmap_read_lock(current->mm);
> -       vma = vma_lookup(current->mm, hva);
> -       if (unlikely(!vma)) {
> -               kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> -               mmap_read_unlock(current->mm);
> -               return -EFAULT;
> +       if (!is_gmem) {
> +               vma = vma_lookup(current->mm, hva);
> +               if (unlikely(!vma)) {
> +                       kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> +                       mmap_read_unlock(current->mm);
> +                       return -EFAULT;
> +               }
> +
> +               vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> +               mte_allowed = kvm_vma_mte_allowed(vma);
>         }
>
>         if (force_pte)
> @@ -1602,18 +1633,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                 ipa &= ~(vma_pagesize - 1);
>         }
>
> -       gfn = ipa >> PAGE_SHIFT;
> -       mte_allowed = kvm_vma_mte_allowed(vma);
> -
> -       vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> -
>         /* Don't use the VMA after the unlock -- it may have vanished */
>         vma = NULL;
>
>         /*
>          * Read mmu_invalidate_seq so that KVM can detect if the results of
> -        * vma_lookup() or __kvm_faultin_pfn() become stale prior to
> -        * acquiring kvm->mmu_lock.
> +        * vma_lookup() or faultin_pfn() become stale prior to acquiring
> +        * kvm->mmu_lock.
>          *
>          * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
>          * with the smp_wmb() in kvm_mmu_invalidate_end().
> @@ -1621,8 +1647,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         mmu_seq = vcpu->kvm->mmu_invalidate_seq;
>         mmap_read_unlock(current->mm);
>
> -       pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
> -                               &writable, &page);
> +       pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
>         if (pfn == KVM_PFN_ERR_HWPOISON) {

I think we need to take care to handle HWPOISON properly. I know that it is
(or will most likely be) the case that GUP(hva) --> pfn, but with gmem,
it *might* not be the case. So the following line isn't right.

I think we need to handle HWPOISON for gmem using memory fault exits instead of
sending a SIGBUS to userspace. This would be consistent with how KVM/x86
today handles getting a HWPOISON page back from kvm_gmem_get_pfn(). I'm not
entirely sure how KVM/x86 is meant to handle HWPOISON on shared gmem pages yet;
I need to keep reading your series.

The reorganization diff below leaves this unfixed.
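
Roughly what I have in mind for the gmem case (completely untested
sketch; I'm assuming the generic kvm_prepare_memory_fault_exit() helper
is the right reporting mechanism on arm64 too, and the exact flags are
up for discussion):

	if (pfn == KVM_PFN_ERR_HWPOISON) {
		if (is_gmem) {
			/*
			 * There is no hva we can trust to SIGBUS on; report
			 * the poisoned gpa to userspace via a memory fault
			 * exit instead, as x86 does when kvm_gmem_get_pfn()
			 * fails.
			 */
			kvm_prepare_memory_fault_exit(vcpu, fault_ipa,
						      PAGE_SIZE, write_fault,
						      exec_fault, false);
			return -EFAULT;
		}
		kvm_send_hwpoison_signal(hva, vma_shift);
		return 0;
	}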

>                 kvm_send_hwpoison_signal(hva, vma_shift);
>                 return 0;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f3af6bff3232..1b2e4e9a7802 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
>         return gfn_to_memslot(kvm, gfn)->id;
>  }
>
> +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> +{
> +       return slot->flags & KVM_MEM_READONLY;
> +}
> +
>  static inline gfn_t
>  hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
>  {
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index c75d8e188eb7..d9bca5ba19dc 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2640,11 +2640,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
>         return size;
>  }
>
> -static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> -{
> -       return slot->flags & KVM_MEM_READONLY;
> -}
> -
>  static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
>                                        gfn_t *nr_pages, bool write)
>  {
> --
> 2.49.0.901.g37484f566f-goog

Thanks, Fuad! Here's the reorganization/rename diff:

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index d1044c7f78bba..c9eb72fe9013b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1502,7 +1502,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	unsigned long mmu_seq;
 	phys_addr_t ipa = fault_ipa;
 	struct kvm *kvm = vcpu->kvm;
-	struct vm_area_struct *vma = NULL;
 	short vma_shift;
 	void *memcache;
 	gfn_t gfn = ipa >> PAGE_SHIFT;
@@ -1510,7 +1509,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	bool logging_active = memslot_is_logging(memslot);
 	bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn);
 	bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled();
-	long vma_pagesize, fault_granule = PAGE_SIZE;
+	long target_size = PAGE_SIZE, fault_granule = PAGE_SIZE;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt;
 	struct page *page;
@@ -1547,13 +1546,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			return ret;
 	}
 
-	mmap_read_lock(current->mm);
-
 	/*
 	 * Let's check if we will get back a huge page backed by hugetlbfs, or
 	 * get block mapping for device MMIO region.
 	 */
 	if (!is_gmem) {
+		struct vm_area_struct *vma = NULL;
+
+		mmap_read_lock(current->mm);
+
 		vma = vma_lookup(current->mm, hva);
 		if (unlikely(!vma)) {
 			kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
@@ -1563,38 +1564,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 
 		vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
 		mte_allowed = kvm_vma_mte_allowed(vma);
-	}
-
-	if (force_pte)
-		vma_shift = PAGE_SHIFT;
-	else
-		vma_shift = get_vma_page_shift(vma, hva);
+		vma_shift = force_pte ? PAGE_SHIFT : get_vma_page_shift(vma, hva);
 
-	switch (vma_shift) {
+		switch (vma_shift) {
 #ifndef __PAGETABLE_PMD_FOLDED
-	case PUD_SHIFT:
-		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
-			break;
-		fallthrough;
+		case PUD_SHIFT:
+			if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+				break;
+			fallthrough;
 #endif
-	case CONT_PMD_SHIFT:
-		vma_shift = PMD_SHIFT;
-		fallthrough;
-	case PMD_SHIFT:
-		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
+		case CONT_PMD_SHIFT:
+			vma_shift = PMD_SHIFT;
+			fallthrough;
+		case PMD_SHIFT:
+			if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
+				break;
+			fallthrough;
+		case CONT_PTE_SHIFT:
+			vma_shift = PAGE_SHIFT;
+			force_pte = true;
+			fallthrough;
+		case PAGE_SHIFT:
 			break;
-		fallthrough;
-	case CONT_PTE_SHIFT:
-		vma_shift = PAGE_SHIFT;
-		force_pte = true;
-		fallthrough;
-	case PAGE_SHIFT:
-		break;
-	default:
-		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
-	}
+		default:
+			WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
+		}
 
-	vma_pagesize = 1UL << vma_shift;
+		/*
+		 * Read mmu_invalidate_seq so that KVM can detect if the results of
+		 * vma_lookup() or faultin_pfn() become stale prior to acquiring
+		 * kvm->mmu_lock.
+		 *
+		 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
+		 * with the smp_wmb() in kvm_mmu_invalidate_end().
+		 */
+		mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+		mmap_read_unlock(current->mm);
+
+		target_size = 1UL << vma_shift;
+	}
 
 	if (nested) {
 		unsigned long max_map_size;
@@ -1620,7 +1628,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			max_map_size = PAGE_SIZE;
 
 		force_pte = (max_map_size == PAGE_SIZE);
-		vma_pagesize = min(vma_pagesize, (long)max_map_size);
+		target_size = min(target_size, (long)max_map_size);
 	}
 
 	/*
@@ -1628,27 +1636,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	 * ensure we find the right PFN and lay down the mapping in the right
 	 * place.
 	 */
-	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
-		fault_ipa &= ~(vma_pagesize - 1);
-		ipa &= ~(vma_pagesize - 1);
+	if (target_size == PMD_SIZE || target_size == PUD_SIZE) {
+		fault_ipa &= ~(target_size - 1);
+		ipa &= ~(target_size - 1);
 	}
 
-	/* Don't use the VMA after the unlock -- it may have vanished */
-	vma = NULL;
-
-	/*
-	 * Read mmu_invalidate_seq so that KVM can detect if the results of
-	 * vma_lookup() or faultin_pfn() become stale prior to acquiring
-	 * kvm->mmu_lock.
-	 *
-	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
-	 * with the smp_wmb() in kvm_mmu_invalidate_end().
-	 */
-	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
-	mmap_read_unlock(current->mm);
-
 	pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
 	if (pfn == KVM_PFN_ERR_HWPOISON) {
+		// TODO: Handle gmem properly. vma_shift
+		// intentionally left uninitialized.
 		kvm_send_hwpoison_signal(hva, vma_shift);
 		return 0;
 	}
@@ -1658,9 +1654,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (kvm_is_device_pfn(pfn)) {
 		/*
 		 * If the page was identified as device early by looking at
-		 * the VMA flags, vma_pagesize is already representing the
+		 * the VMA flags, target_size is already representing the
 		 * largest quantity we can map.  If instead it was mapped
-		 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
+		 * via __kvm_faultin_pfn(), target_size is set to PAGE_SIZE
 		 * and must not be upgraded.
 		 *
 		 * In both cases, we don't let transparent_hugepage_adjust()
@@ -1699,7 +1695,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 
 	kvm_fault_lock(kvm);
 	pgt = vcpu->arch.hw_mmu->pgt;
-	if (mmu_invalidate_retry(kvm, mmu_seq)) {
+	if (!is_gmem && mmu_invalidate_retry(kvm, mmu_seq)) {
 		ret = -EAGAIN;
 		goto out_unlock;
 	}
@@ -1708,16 +1704,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	 * If we are not forced to use page mapping, check if we are
 	 * backed by a THP and thus use block mapping if possible.
 	 */
-	if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
+	if (target_size == PAGE_SIZE && !(force_pte || device)) {
 		if (fault_is_perm && fault_granule > PAGE_SIZE)
-			vma_pagesize = fault_granule;
-		else
-			vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
+			target_size = fault_granule;
+		else if (!is_gmem)
+			target_size = transparent_hugepage_adjust(kvm, memslot,
 								   hva, &pfn,
 								   &fault_ipa);
 
-		if (vma_pagesize < 0) {
-			ret = vma_pagesize;
+		if (target_size < 0) {
+			ret = target_size;
 			goto out_unlock;
 		}
 	}
@@ -1725,7 +1721,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
 		/* Check the VMM hasn't introduced a new disallowed VMA */
 		if (mte_allowed) {
-			sanitise_mte_tags(kvm, pfn, vma_pagesize);
+			sanitise_mte_tags(kvm, pfn, target_size);
 		} else {
 			ret = -EFAULT;
 			goto out_unlock;
@@ -1750,10 +1746,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 
 	/*
 	 * Under the premise of getting a FSC_PERM fault, we just need to relax
-	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
+	 * permissions only if target_size equals fault_granule. Otherwise,
 	 * kvm_pgtable_stage2_map() should be called to change block size.
 	 */
-	if (fault_is_perm && vma_pagesize == fault_granule) {
+	if (fault_is_perm && target_size == fault_granule) {
 		/*
 		 * Drop the SW bits in favour of those stored in the
 		 * PTE, which will be preserved.
@@ -1761,7 +1757,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		prot &= ~KVM_NV_GUEST_MAP_SZ;
 		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
 	} else {
-		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
+		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, target_size,
 					     __pfn_to_phys(pfn), prot,
 					     memcache, flags);
 	}
Fuad Tabba May 12, 2025, 7:07 a.m. UTC | #2
Hi James,


On Fri, 9 May 2025 at 21:15, James Houghton <jthoughton@google.com> wrote:
>
> On Wed, Apr 30, 2025 at 9:57 AM Fuad Tabba <tabba@google.com> wrote:
> >
> > Add arm64 support for handling guest page faults on guest_memfd
> > backed memslots.
> >
> > For now, the fault granule is restricted to PAGE_SIZE.
> >
> > Signed-off-by: Fuad Tabba <tabba@google.com>
> > ---
> >  arch/arm64/kvm/mmu.c     | 65 +++++++++++++++++++++++++++-------------
> >  include/linux/kvm_host.h |  5 ++++
> >  virt/kvm/kvm_main.c      |  5 ----
> >  3 files changed, 50 insertions(+), 25 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index 148a97c129de..d1044c7f78bb 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -1466,6 +1466,30 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
> >         return vma->vm_flags & VM_MTE_ALLOWED;
> >  }
> >
> > +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > +                            gfn_t gfn, bool write_fault, bool *writable,
> > +                            struct page **page, bool is_gmem)
> > +{
> > +       kvm_pfn_t pfn;
> > +       int ret;
> > +
> > +       if (!is_gmem)
> > +               return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page);
> > +
> > +       *writable = false;
> > +
> > +       ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
> > +       if (!ret) {
> > +               *writable = !memslot_is_readonly(slot);
> > +               return pfn;
> > +       }
> > +
> > +       if (ret == -EHWPOISON)
> > +               return KVM_PFN_ERR_HWPOISON;
> > +
> > +       return KVM_PFN_ERR_NOSLOT_MASK;
> > +}
> > +
> >  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >                           struct kvm_s2_trans *nested,
> >                           struct kvm_memory_slot *memslot, unsigned long hva,
> > @@ -1473,19 +1497,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >  {
> >         int ret = 0;
> >         bool write_fault, writable;
> > -       bool exec_fault, mte_allowed;
> > +       bool exec_fault, mte_allowed = false;
> >         bool device = false, vfio_allow_any_uc = false;
> >         unsigned long mmu_seq;
> >         phys_addr_t ipa = fault_ipa;
> >         struct kvm *kvm = vcpu->kvm;
> > -       struct vm_area_struct *vma;
> > +       struct vm_area_struct *vma = NULL;
> >         short vma_shift;
> >         void *memcache;
> > -       gfn_t gfn;
> > +       gfn_t gfn = ipa >> PAGE_SHIFT;
> >         kvm_pfn_t pfn;
> >         bool logging_active = memslot_is_logging(memslot);
> > -       bool force_pte = logging_active || is_protected_kvm_enabled();
> > -       long vma_pagesize, fault_granule;
> > +       bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn);
> > +       bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled();
> > +       long vma_pagesize, fault_granule = PAGE_SIZE;
> >         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> >         struct kvm_pgtable *pgt;
> >         struct page *page;
> > @@ -1522,16 +1547,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >                         return ret;
> >         }
> >
> > +       mmap_read_lock(current->mm);
>
> We don't have to take the mmap_lock for gmem faults, right?
>
> I think we should reorganize user_mem_abort() a bit (and I think vma_pagesize
> and maybe vma_shift should be renamed) given the changes we're making here.

Good point.

> Below is a diff that I think might be a little cleaner. Let me know what you
> think.
>
> > +
> >         /*
> >          * Let's check if we will get back a huge page backed by hugetlbfs, or
> >          * get block mapping for device MMIO region.
> >          */
> > -       mmap_read_lock(current->mm);
> > -       vma = vma_lookup(current->mm, hva);
> > -       if (unlikely(!vma)) {
> > -               kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > -               mmap_read_unlock(current->mm);
> > -               return -EFAULT;
> > +       if (!is_gmem) {
> > +               vma = vma_lookup(current->mm, hva);
> > +               if (unlikely(!vma)) {
> > +                       kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > +                       mmap_read_unlock(current->mm);
> > +                       return -EFAULT;
> > +               }
> > +
> > +               vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> > +               mte_allowed = kvm_vma_mte_allowed(vma);
> >         }
> >
> >         if (force_pte)
> > @@ -1602,18 +1633,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >                 ipa &= ~(vma_pagesize - 1);
> >         }
> >
> > -       gfn = ipa >> PAGE_SHIFT;
> > -       mte_allowed = kvm_vma_mte_allowed(vma);
> > -
> > -       vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> > -
> >         /* Don't use the VMA after the unlock -- it may have vanished */
> >         vma = NULL;
> >
> >         /*
> >          * Read mmu_invalidate_seq so that KVM can detect if the results of
> > -        * vma_lookup() or __kvm_faultin_pfn() become stale prior to
> > -        * acquiring kvm->mmu_lock.
> > +        * vma_lookup() or faultin_pfn() become stale prior to acquiring
> > +        * kvm->mmu_lock.
> >          *
> >          * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
> >          * with the smp_wmb() in kvm_mmu_invalidate_end().
> > @@ -1621,8 +1647,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >         mmu_seq = vcpu->kvm->mmu_invalidate_seq;
> >         mmap_read_unlock(current->mm);
> >
> > -       pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
> > -                               &writable, &page);
> > +       pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
> >         if (pfn == KVM_PFN_ERR_HWPOISON) {
>
> I think we need to take care to handle HWPOISON properly. I know that it is
> (or will most likely be) the case that GUP(hva) --> pfn, but with gmem,
> it *might* not be the case. So the following line isn't right.
>
> I think we need to handle HWPOISON for gmem using memory fault exits instead of
> sending a SIGBUS to userspace. This would be consistent with how KVM/x86
> today handles getting a HWPOISON page back from kvm_gmem_get_pfn(). I'm not
> entirely sure how KVM/x86 is meant to handle HWPOISON on shared gmem pages yet;
> I need to keep reading your series.

You're right. For the next respin (coming soon), Ackerley has added a
patch that performs a best-effort check to ensure that the hva matches
the gfn.

> The reorganization diff below leaves this unfixed.
>
> >                 kvm_send_hwpoison_signal(hva, vma_shift);
> >                 return 0;
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index f3af6bff3232..1b2e4e9a7802 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
> >         return gfn_to_memslot(kvm, gfn)->id;
> >  }
> >
> > +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> > +{
> > +       return slot->flags & KVM_MEM_READONLY;
> > +}
> > +
> >  static inline gfn_t
> >  hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
> >  {
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index c75d8e188eb7..d9bca5ba19dc 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -2640,11 +2640,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
> >         return size;
> >  }
> >
> > -static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> > -{
> > -       return slot->flags & KVM_MEM_READONLY;
> > -}
> > -
> >  static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
> >                                        gfn_t *nr_pages, bool write)
> >  {
> > --
> > 2.49.0.901.g37484f566f-goog
>
> Thanks, Fuad! Here's the reorganization/rename diff:

Thank you James. This is very helpful.

Cheers,
/fuad

>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index d1044c7f78bba..c9eb72fe9013b 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1502,7 +1502,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         unsigned long mmu_seq;
>         phys_addr_t ipa = fault_ipa;
>         struct kvm *kvm = vcpu->kvm;
> -       struct vm_area_struct *vma = NULL;
>         short vma_shift;
>         void *memcache;
>         gfn_t gfn = ipa >> PAGE_SHIFT;
> @@ -1510,7 +1509,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         bool logging_active = memslot_is_logging(memslot);
>         bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn);
>         bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled();
> -       long vma_pagesize, fault_granule = PAGE_SIZE;
> +       long target_size = PAGE_SIZE, fault_granule = PAGE_SIZE;
>         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
>         struct kvm_pgtable *pgt;
>         struct page *page;
> @@ -1547,13 +1546,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                         return ret;
>         }
>
> -       mmap_read_lock(current->mm);
> -
>         /*
>          * Let's check if we will get back a huge page backed by hugetlbfs, or
>          * get block mapping for device MMIO region.
>          */
>         if (!is_gmem) {
> +               struct vm_area_struct *vma = NULL;
> +
> +               mmap_read_lock(current->mm);
> +
>                 vma = vma_lookup(current->mm, hva);
>                 if (unlikely(!vma)) {
>                         kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> @@ -1563,38 +1564,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>
>                 vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
>                 mte_allowed = kvm_vma_mte_allowed(vma);
> -       }
> -
> -       if (force_pte)
> -               vma_shift = PAGE_SHIFT;
> -       else
> -               vma_shift = get_vma_page_shift(vma, hva);
> +               vma_shift = force_pte ? PAGE_SHIFT : get_vma_page_shift(vma, hva);
>
> -       switch (vma_shift) {
> +               switch (vma_shift) {
>  #ifndef __PAGETABLE_PMD_FOLDED
> -       case PUD_SHIFT:
> -               if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
> -                       break;
> -               fallthrough;
> +               case PUD_SHIFT:
> +                       if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
> +                               break;
> +                       fallthrough;
>  #endif
> -       case CONT_PMD_SHIFT:
> -               vma_shift = PMD_SHIFT;
> -               fallthrough;
> -       case PMD_SHIFT:
> -               if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
> +               case CONT_PMD_SHIFT:
> +                       vma_shift = PMD_SHIFT;
> +                       fallthrough;
> +               case PMD_SHIFT:
> +                       if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
> +                               break;
> +                       fallthrough;
> +               case CONT_PTE_SHIFT:
> +                       vma_shift = PAGE_SHIFT;
> +                       force_pte = true;
> +                       fallthrough;
> +               case PAGE_SHIFT:
>                         break;
> -               fallthrough;
> -       case CONT_PTE_SHIFT:
> -               vma_shift = PAGE_SHIFT;
> -               force_pte = true;
> -               fallthrough;
> -       case PAGE_SHIFT:
> -               break;
> -       default:
> -               WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
> -       }
> +               default:
> +                       WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
> +               }
>
> -       vma_pagesize = 1UL << vma_shift;
> +               /*
> +                * Read mmu_invalidate_seq so that KVM can detect if the results of
> +                * vma_lookup() or faultin_pfn() become stale prior to acquiring
> +                * kvm->mmu_lock.
> +                *
> +                * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
> +                * with the smp_wmb() in kvm_mmu_invalidate_end().
> +                */
> +               mmu_seq = vcpu->kvm->mmu_invalidate_seq;
> +               mmap_read_unlock(current->mm);
> +
> +               target_size = 1UL << vma_shift;
> +       }
>
>         if (nested) {
>                 unsigned long max_map_size;
> @@ -1620,7 +1628,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                         max_map_size = PAGE_SIZE;
>
>                 force_pte = (max_map_size == PAGE_SIZE);
> -               vma_pagesize = min(vma_pagesize, (long)max_map_size);
> +               target_size = min(target_size, (long)max_map_size);
>         }
>
>         /*
> @@ -1628,27 +1636,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>          * ensure we find the right PFN and lay down the mapping in the right
>          * place.
>          */
> -       if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
> -               fault_ipa &= ~(vma_pagesize - 1);
> -               ipa &= ~(vma_pagesize - 1);
> +       if (target_size == PMD_SIZE || target_size == PUD_SIZE) {
> +               fault_ipa &= ~(target_size - 1);
> +               ipa &= ~(target_size - 1);
>         }
>
> -       /* Don't use the VMA after the unlock -- it may have vanished */
> -       vma = NULL;
> -
> -       /*
> -        * Read mmu_invalidate_seq so that KVM can detect if the results of
> -        * vma_lookup() or faultin_pfn() become stale prior to acquiring
> -        * kvm->mmu_lock.
> -        *
> -        * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
> -        * with the smp_wmb() in kvm_mmu_invalidate_end().
> -        */
> -       mmu_seq = vcpu->kvm->mmu_invalidate_seq;
> -       mmap_read_unlock(current->mm);
> -
>         pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
>         if (pfn == KVM_PFN_ERR_HWPOISON) {
> +               // TODO: Handle gmem properly. vma_shift
> +               // intentionally left uninitialized.
>                 kvm_send_hwpoison_signal(hva, vma_shift);
>                 return 0;
>         }
> @@ -1658,9 +1654,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         if (kvm_is_device_pfn(pfn)) {
>                 /*
>                  * If the page was identified as device early by looking at
> -                * the VMA flags, vma_pagesize is already representing the
> +                * the VMA flags, target_size is already representing the
>                  * largest quantity we can map.  If instead it was mapped
> -                * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
> +                * via __kvm_faultin_pfn(), target_size is set to PAGE_SIZE
>                  * and must not be upgraded.
>                  *
>                  * In both cases, we don't let transparent_hugepage_adjust()
> @@ -1699,7 +1695,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>
>         kvm_fault_lock(kvm);
>         pgt = vcpu->arch.hw_mmu->pgt;
> -       if (mmu_invalidate_retry(kvm, mmu_seq)) {
> +       if (!is_gmem && mmu_invalidate_retry(kvm, mmu_seq)) {
>                 ret = -EAGAIN;
>                 goto out_unlock;
>         }
> @@ -1708,16 +1704,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>          * If we are not forced to use page mapping, check if we are
>          * backed by a THP and thus use block mapping if possible.
>          */
> -       if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
> +       if (target_size == PAGE_SIZE && !(force_pte || device)) {
>                 if (fault_is_perm && fault_granule > PAGE_SIZE)
> -                       vma_pagesize = fault_granule;
> -               else
> -                       vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
> +                       target_size = fault_granule;
> +               else if (!is_gmem)
> +                       target_size = transparent_hugepage_adjust(kvm, memslot,
>                                                                    hva, &pfn,
>                                                                    &fault_ipa);
>
> -               if (vma_pagesize < 0) {
> -                       ret = vma_pagesize;
> +               if (target_size < 0) {
> +                       ret = target_size;
>                         goto out_unlock;
>                 }
>         }
> @@ -1725,7 +1721,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
>                 /* Check the VMM hasn't introduced a new disallowed VMA */
>                 if (mte_allowed) {
> -                       sanitise_mte_tags(kvm, pfn, vma_pagesize);
> +                       sanitise_mte_tags(kvm, pfn, target_size);
>                 } else {
>                         ret = -EFAULT;
>                         goto out_unlock;
> @@ -1750,10 +1746,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>
>         /*
>          * Under the premise of getting a FSC_PERM fault, we just need to relax
> -        * permissions only if vma_pagesize equals fault_granule. Otherwise,
> +        * permissions only if target_size equals fault_granule. Otherwise,
>          * kvm_pgtable_stage2_map() should be called to change block size.
>          */
> -       if (fault_is_perm && vma_pagesize == fault_granule) {
> +       if (fault_is_perm && target_size == fault_granule) {
>                 /*
>                  * Drop the SW bits in favour of those stored in the
>                  * PTE, which will be preserved.
> @@ -1761,7 +1757,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                 prot &= ~KVM_NV_GUEST_MAP_SZ;
>                 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
>         } else {
> -               ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
> +               ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, target_size,
>                                              __pfn_to_phys(pfn), prot,
>                                              memcache, flags);
>         }

Patch

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 148a97c129de..d1044c7f78bb 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1466,6 +1466,30 @@  static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_MTE_ALLOWED;
 }
 
+static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+			     gfn_t gfn, bool write_fault, bool *writable,
+			     struct page **page, bool is_gmem)
+{
+	kvm_pfn_t pfn;
+	int ret;
+
+	if (!is_gmem)
+		return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page);
+
+	*writable = false;
+
+	ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
+	if (!ret) {
+		*writable = !memslot_is_readonly(slot);
+		return pfn;
+	}
+
+	if (ret == -EHWPOISON)
+		return KVM_PFN_ERR_HWPOISON;
+
+	return KVM_PFN_ERR_NOSLOT_MASK;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_s2_trans *nested,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
@@ -1473,19 +1497,20 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 {
 	int ret = 0;
 	bool write_fault, writable;
-	bool exec_fault, mte_allowed;
+	bool exec_fault, mte_allowed = false;
 	bool device = false, vfio_allow_any_uc = false;
 	unsigned long mmu_seq;
 	phys_addr_t ipa = fault_ipa;
 	struct kvm *kvm = vcpu->kvm;
-	struct vm_area_struct *vma;
+	struct vm_area_struct *vma = NULL;
 	short vma_shift;
 	void *memcache;
-	gfn_t gfn;
+	gfn_t gfn = ipa >> PAGE_SHIFT;
 	kvm_pfn_t pfn;
 	bool logging_active = memslot_is_logging(memslot);
-	bool force_pte = logging_active || is_protected_kvm_enabled();
-	long vma_pagesize, fault_granule;
+	bool is_gmem = kvm_slot_has_gmem(memslot) && kvm_mem_from_gmem(kvm, gfn);
+	bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled();
+	long vma_pagesize, fault_granule = PAGE_SIZE;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt;
 	struct page *page;
@@ -1522,16 +1547,22 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			return ret;
 	}
 
+	mmap_read_lock(current->mm);
+
 	/*
 	 * Let's check if we will get back a huge page backed by hugetlbfs, or
 	 * get block mapping for device MMIO region.
 	 */
-	mmap_read_lock(current->mm);
-	vma = vma_lookup(current->mm, hva);
-	if (unlikely(!vma)) {
-		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
-		mmap_read_unlock(current->mm);
-		return -EFAULT;
+	if (!is_gmem) {
+		vma = vma_lookup(current->mm, hva);
+		if (unlikely(!vma)) {
+			kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
+			mmap_read_unlock(current->mm);
+			return -EFAULT;
+		}
+
+		vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
+		mte_allowed = kvm_vma_mte_allowed(vma);
 	}
 
 	if (force_pte)
@@ -1602,18 +1633,13 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		ipa &= ~(vma_pagesize - 1);
 	}
 
-	gfn = ipa >> PAGE_SHIFT;
-	mte_allowed = kvm_vma_mte_allowed(vma);
-
-	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
-
 	/* Don't use the VMA after the unlock -- it may have vanished */
 	vma = NULL;
 
 	/*
 	 * Read mmu_invalidate_seq so that KVM can detect if the results of
-	 * vma_lookup() or __kvm_faultin_pfn() become stale prior to
-	 * acquiring kvm->mmu_lock.
+	 * vma_lookup() or faultin_pfn() become stale prior to acquiring
+	 * kvm->mmu_lock.
 	 *
 	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
 	 * with the smp_wmb() in kvm_mmu_invalidate_end().
@@ -1621,8 +1647,7 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	mmap_read_unlock(current->mm);
 
-	pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
-				&writable, &page);
+	pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem);
 	if (pfn == KVM_PFN_ERR_HWPOISON) {
 		kvm_send_hwpoison_signal(hva, vma_shift);
 		return 0;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f3af6bff3232..1b2e4e9a7802 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1882,6 +1882,11 @@  static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
 	return gfn_to_memslot(kvm, gfn)->id;
 }
 
+static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot)
+{
+	return slot->flags & KVM_MEM_READONLY;
+}
+
 static inline gfn_t
 hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c75d8e188eb7..d9bca5ba19dc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2640,11 +2640,6 @@  unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
 	return size;
 }
 
-static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
-{
-	return slot->flags & KVM_MEM_READONLY;
-}
-
 static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
 				       gfn_t *nr_pages, bool write)
 {