diff mbox series

[RFC,v5,05/15] KVM: guest_memfd: Folio mappability states and functions that manage their transition

Message ID 20250117163001.2326672-6-tabba@google.com
State New
Headers show
Series KVM: Restricted mapping of guest_memfd at the host and arm64 support | expand

Commit Message

Fuad Tabba Jan. 17, 2025, 4:29 p.m. UTC
To allow restricted mapping of guest_memfd folios by the host,
guest_memfd needs to track whether they can be mapped and by whom,
since the mapping will only be allowed under conditions where it
is safe to access these folios. These conditions depend on the
folios being explicitly shared with the host, or not yet exposed
to the guest (e.g., at initialization).

This patch introduces states that determine whether the host and
the guest can fault in the folios as well as the functions that
manage transitioning between those states.

Signed-off-by: Fuad Tabba <tabba@google.com>
---
 include/linux/kvm_host.h |  53 ++++++++++++++
 virt/kvm/guest_memfd.c   | 153 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c      |  92 +++++++++++++++++++++++
 3 files changed, 298 insertions(+)

Comments

Kirill A. Shutemov Jan. 20, 2025, 10:30 a.m. UTC | #1
On Fri, Jan 17, 2025 at 04:29:51PM +0000, Fuad Tabba wrote:
> +/*
> + * Marks the range [start, end) as not mappable by the host. If the host doesn't
> + * have any references to a particular folio, then that folio is marked as
> + * mappable by the guest.
> + *
> + * However, if the host still has references to the folio, then the folio is
> + * marked and not mappable by anyone. Marking it is not mappable allows it to
> + * drain all references from the host, and to ensure that the hypervisor does
> + * not transition the folio to private, since the host still might access it.
> + *
> + * Usually called when guest unshares memory with the host.
> + */
> +static int gmem_clear_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> +{
> +	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> +	void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
> +	void *xval_none = xa_mk_value(KVM_GMEM_NONE_MAPPABLE);
> +	pgoff_t i;
> +	int r = 0;
> +
> +	filemap_invalidate_lock(inode->i_mapping);
> +	for (i = start; i < end; i++) {
> +		struct folio *folio;
> +		int refcount = 0;
> +
> +		folio = filemap_lock_folio(inode->i_mapping, i);
> +		if (!IS_ERR(folio)) {
> +			refcount = folio_ref_count(folio);
> +		} else {
> +			r = PTR_ERR(folio);
> +			if (WARN_ON_ONCE(r != -ENOENT))
> +				break;
> +
> +			folio = NULL;
> +		}
> +
> +		/* +1 references are expected because of filemap_lock_folio(). */
> +		if (folio && refcount > folio_nr_pages(folio) + 1) {

Looks racy.

What prevent anybody from obtaining a reference just after check?

Lock on folio doesn't stop random filemap_get_entry() from elevating the
refcount.

folio_ref_freeze() might be required.

> +			/*
> +			 * Outstanding references, the folio cannot be faulted
> +			 * in by anyone until they're dropped.
> +			 */
> +			r = xa_err(xa_store(mappable_offsets, i, xval_none, GFP_KERNEL));
> +		} else {
> +			/*
> +			 * No outstanding references. Transition the folio to
> +			 * guest mappable immediately.
> +			 */
> +			r = xa_err(xa_store(mappable_offsets, i, xval_guest, GFP_KERNEL));
> +		}
> +
> +		if (folio) {
> +			folio_unlock(folio);
> +			folio_put(folio);
> +		}
> +
> +		if (WARN_ON_ONCE(r))
> +			break;
> +	}
> +	filemap_invalidate_unlock(inode->i_mapping);
> +
> +	return r;
> +}
Fuad Tabba Jan. 20, 2025, 10:40 a.m. UTC | #2
On Mon, 20 Jan 2025 at 10:30, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Jan 17, 2025 at 04:29:51PM +0000, Fuad Tabba wrote:
> > +/*
> > + * Marks the range [start, end) as not mappable by the host. If the host doesn't
> > + * have any references to a particular folio, then that folio is marked as
> > + * mappable by the guest.
> > + *
> > + * However, if the host still has references to the folio, then the folio is
> > + * marked and not mappable by anyone. Marking it is not mappable allows it to
> > + * drain all references from the host, and to ensure that the hypervisor does
> > + * not transition the folio to private, since the host still might access it.
> > + *
> > + * Usually called when guest unshares memory with the host.
> > + */
> > +static int gmem_clear_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> > +{
> > +     struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> > +     void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
> > +     void *xval_none = xa_mk_value(KVM_GMEM_NONE_MAPPABLE);
> > +     pgoff_t i;
> > +     int r = 0;
> > +
> > +     filemap_invalidate_lock(inode->i_mapping);
> > +     for (i = start; i < end; i++) {
> > +             struct folio *folio;
> > +             int refcount = 0;
> > +
> > +             folio = filemap_lock_folio(inode->i_mapping, i);
> > +             if (!IS_ERR(folio)) {
> > +                     refcount = folio_ref_count(folio);
> > +             } else {
> > +                     r = PTR_ERR(folio);
> > +                     if (WARN_ON_ONCE(r != -ENOENT))
> > +                             break;
> > +
> > +                     folio = NULL;
> > +             }
> > +
> > +             /* +1 references are expected because of filemap_lock_folio(). */
> > +             if (folio && refcount > folio_nr_pages(folio) + 1) {
>
> Looks racy.
>
> What prevent anybody from obtaining a reference just after check?
>
> Lock on folio doesn't stop random filemap_get_entry() from elevating the
> refcount.
>
> folio_ref_freeze() might be required.

I thought the folio lock would be sufficient, but you're right,
nothing prevents getting a reference after the check. I'll use a
folio_ref_freeze() when I respin.

Thanks,
/fuad

> > +                     /*
> > +                      * Outstanding references, the folio cannot be faulted
> > +                      * in by anyone until they're dropped.
> > +                      */
> > +                     r = xa_err(xa_store(mappable_offsets, i, xval_none, GFP_KERNEL));
> > +             } else {
> > +                     /*
> > +                      * No outstanding references. Transition the folio to
> > +                      * guest mappable immediately.
> > +                      */
> > +                     r = xa_err(xa_store(mappable_offsets, i, xval_guest, GFP_KERNEL));
> > +             }
> > +
> > +             if (folio) {
> > +                     folio_unlock(folio);
> > +                     folio_put(folio);
> > +             }
> > +
> > +             if (WARN_ON_ONCE(r))
> > +                     break;
> > +     }
> > +     filemap_invalidate_unlock(inode->i_mapping);
> > +
> > +     return r;
> > +}
>
> --
>   Kiryl Shutsemau / Kirill A. Shutemov
diff mbox series

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cda3ed4c3c27..84aa7908a5dd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2564,4 +2564,57 @@  long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 				    struct kvm_pre_fault_memory *range);
 #endif
 
+#ifdef CONFIG_KVM_GMEM_MAPPABLE
+bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t start, gfn_t end);
+int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end);
+int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start, gfn_t end);
+int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot, gfn_t start,
+			       gfn_t end);
+int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, gfn_t start,
+				 gfn_t end);
+bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, gfn_t gfn);
+bool kvm_slot_gmem_is_guest_mappable(struct kvm_memory_slot *slot, gfn_t gfn);
+#else /* !CONFIG_KVM_GMEM_MAPPABLE: stubs WARN and return false/-EINVAL */
+static inline bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+	WARN_ON_ONCE(1);
+	return false;
+}
+static inline int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+	WARN_ON_ONCE(1);
+	return -EINVAL;
+}
+static inline int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start,
+					  gfn_t end)
+{
+	WARN_ON_ONCE(1);
+	return -EINVAL;
+}
+static inline int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot,
+					     gfn_t start, gfn_t end)
+{
+	WARN_ON_ONCE(1);
+	return -EINVAL;
+}
+static inline int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot,
+					       gfn_t start, gfn_t end)
+{
+	WARN_ON_ONCE(1);
+	return -EINVAL;
+}
+static inline bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot,
+					     gfn_t gfn)
+{
+	WARN_ON_ONCE(1);
+	return false;
+}
+static inline bool kvm_slot_gmem_is_guest_mappable(struct kvm_memory_slot *slot,
+						   gfn_t gfn)
+{
+	WARN_ON_ONCE(1);
+	return false;
+}
+#endif /* CONFIG_KVM_GMEM_MAPPABLE */
+
 #endif
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 0a7b6cf8bd8f..d1c192927cf7 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -375,6 +375,159 @@  static void kvm_gmem_init_mount(void)
 	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
 }
 
+#ifdef CONFIG_KVM_GMEM_MAPPABLE
+/*
+ * An enum of the valid states that describe who can map a folio.
+ * Bit 0: if set, the guest cannot map the page.
+ * Bit 1: if set, the host cannot map the page.
+ */
+enum folio_mappability {
+	KVM_GMEM_ALL_MAPPABLE	= 0b00,	/* Mappable by host and guest. */
+	KVM_GMEM_GUEST_MAPPABLE	= 0b10, /* Mappable only by guest. */
+	KVM_GMEM_NONE_MAPPABLE	= 0b11, /* Not mappable, transient state. */
+};
+
+/*
+ * Marks [start, end) as mappable by both host and guest; usually called when
+ * the guest shares memory with the host. Stops at the first xa_store() error.
+ */
+static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
+{
+	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
+	void *xval = xa_mk_value(KVM_GMEM_ALL_MAPPABLE);
+	pgoff_t i;
+	int r = 0;
+
+	filemap_invalidate_lock(inode->i_mapping);
+	for (i = start; i < end; i++) {
+		r = xa_err(xa_store(mappable_offsets, i, xval, GFP_KERNEL));
+		if (r)
+			break;
+	}
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	return r;
+}
+
+/*
+ * Marks the range [start, end) as not mappable by the host. A folio that the
+ * host holds no references to is marked as mappable by the guest.
+ *
+ * A folio that the host still references is marked as not mappable by anyone,
+ * which lets the host references drain and ensures that the hypervisor does
+ * not transition the folio to private while the host might still access it.
+ * NOTE(review): the refcount is sampled once under the folio lock; a new
+ * reference taken after that check is not detected - TODO confirm/fix.
+ *
+ * Usually called when guest unshares memory with the host.
+ */
+static int gmem_clear_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
+{
+	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
+	void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
+	void *xval_none = xa_mk_value(KVM_GMEM_NONE_MAPPABLE);
+	pgoff_t i;
+	int r = 0;
+
+	filemap_invalidate_lock(inode->i_mapping);
+	for (i = start; i < end; i++) {
+		struct folio *folio;
+		int refcount = 0;
+
+		folio = filemap_lock_folio(inode->i_mapping, i);
+		if (!IS_ERR(folio)) {
+			refcount = folio_ref_count(folio);
+		} else {
+			r = PTR_ERR(folio);
+			if (WARN_ON_ONCE(r != -ENOENT))
+				break;
+
+			folio = NULL;
+		}
+
+		/* +1 references are expected because of filemap_lock_folio(). */
+		if (folio && refcount > folio_nr_pages(folio) + 1) {
+			/*
+			 * Outstanding references, the folio cannot be faulted
+			 * in by anyone until they're dropped.
+			 */
+			r = xa_err(xa_store(mappable_offsets, i, xval_none, GFP_KERNEL));
+		} else {
+			/*
+			 * No outstanding references. Transition the folio to
+			 * guest mappable immediately.
+			 */
+			r = xa_err(xa_store(mappable_offsets, i, xval_guest, GFP_KERNEL));
+		}
+
+		if (folio) {
+			folio_unlock(folio);
+			folio_put(folio);
+		}
+
+		if (WARN_ON_ONCE(r))
+			break;
+	}
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	return r;
+}
+
+static bool gmem_is_mappable(struct inode *inode, pgoff_t pgoff)
+{
+	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
+	unsigned long r;
+
+	r = xa_to_value(xa_load(mappable_offsets, pgoff));
+	/* True only in the state that is mappable by both host and guest. */
+	return (r == KVM_GMEM_ALL_MAPPABLE);
+}
+
+static bool gmem_is_guest_mappable(struct inode *inode, pgoff_t pgoff)
+{
+	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
+	unsigned long r;
+
+	r = xa_to_value(xa_load(mappable_offsets, pgoff));
+	/* True in either state that leaves the page mappable by the guest. */
+	return (r == KVM_GMEM_ALL_MAPPABLE || r == KVM_GMEM_GUEST_MAPPABLE);
+}
+
+int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
+{
+	struct inode *inode = file_inode(slot->gmem.file);
+	pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
+	pgoff_t end_off = start_off + end - start;
+	/* gfn range [start, end) rebased to offsets from slot->gmem.pgoff. */
+	return gmem_set_mappable(inode, start_off, end_off);
+}
+
+int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
+{
+	struct inode *inode = file_inode(slot->gmem.file);
+	pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
+	pgoff_t end_off = start_off + end - start;
+	/* gfn range [start, end) rebased to offsets from slot->gmem.pgoff. */
+	return gmem_clear_mappable(inode, start_off, end_off);
+}
+
+bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	struct inode *inode = file_inode(slot->gmem.file);
+	unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
+	/* pgoff: gfn's distance from base_gfn, added to slot->gmem.pgoff. */
+	return gmem_is_mappable(inode, pgoff);
+}
+
+bool kvm_slot_gmem_is_guest_mappable(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	struct inode *inode = file_inode(slot->gmem.file);
+	unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
+	/* pgoff: gfn's distance from base_gfn, added to slot->gmem.pgoff. */
+	return gmem_is_guest_mappable(inode, pgoff);
+}
+#endif /* CONFIG_KVM_GMEM_MAPPABLE */
+
 static struct file_operations kvm_gmem_fops = {
 	.open		= generic_file_open,
 	.release	= kvm_gmem_release,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index de2c11dae231..fffff01cebe7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3094,6 +3094,98 @@  static int next_segment(unsigned long len, int offset)
 		return len;
 }
 
+#ifdef CONFIG_KVM_GMEM_MAPPABLE
+bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+	struct kvm_memslot_iter iter;
+	bool r = true;
+
+	mutex_lock(&kvm->slots_lock);
+	/* NOTE(review): r stays true if no gfn is checked - confirm intent. */
+	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
+		struct kvm_memory_slot *memslot = iter.slot;
+		gfn_t gfn_start, gfn_end, i;
+
+		if (!kvm_slot_can_be_private(memslot))
+			continue;
+
+		gfn_start = max(start, memslot->base_gfn);
+		gfn_end = min(end, memslot->base_gfn + memslot->npages);
+		if (WARN_ON_ONCE(gfn_start >= gfn_end))
+			continue;
+		/* NOTE(review): exits at the first mappable gfn - confirm. */
+		for (i = gfn_start; i < gfn_end; i++) {
+			r = kvm_slot_gmem_is_mappable(memslot, i);
+			if (r)
+				goto out;
+		}
+	}
+out:
+	mutex_unlock(&kvm->slots_lock);
+
+	return r;
+}
+
+int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+	struct kvm_memslot_iter iter;
+	int r = 0;
+
+	mutex_lock(&kvm->slots_lock);
+	/* Visit every overlapping memslot that can be private. */
+	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
+		struct kvm_memory_slot *memslot = iter.slot;
+		gfn_t gfn_start, gfn_end;
+
+		if (!kvm_slot_can_be_private(memslot))
+			continue;
+		/* Clamp [start, end) to this memslot; must be non-empty. */
+		gfn_start = max(start, memslot->base_gfn);
+		gfn_end = min(end, memslot->base_gfn + memslot->npages);
+		if (WARN_ON_ONCE(gfn_start >= gfn_end))
+			continue;
+
+		r = kvm_slot_gmem_set_mappable(memslot, gfn_start, gfn_end);
+		if (WARN_ON_ONCE(r))
+			break;
+	}
+
+	mutex_unlock(&kvm->slots_lock);
+
+	return r;
+}
+
+int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+	struct kvm_memslot_iter iter;
+	int r = 0;
+
+	mutex_lock(&kvm->slots_lock);
+	/* Visit every overlapping memslot that can be private. */
+	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
+		struct kvm_memory_slot *memslot = iter.slot;
+		gfn_t gfn_start, gfn_end;
+
+		if (!kvm_slot_can_be_private(memslot))
+			continue;
+		/* Clamp [start, end) to this memslot; must be non-empty. */
+		gfn_start = max(start, memslot->base_gfn);
+		gfn_end = min(end, memslot->base_gfn + memslot->npages);
+		if (WARN_ON_ONCE(gfn_start >= gfn_end))
+			continue;
+
+		r = kvm_slot_gmem_clear_mappable(memslot, gfn_start, gfn_end);
+		if (WARN_ON_ONCE(r))
+			break;
+	}
+
+	mutex_unlock(&kvm->slots_lock);
+
+	return r;
+}
+
+#endif /* CONFIG_KVM_GMEM_MAPPABLE */
+
 /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
 				 void *data, int offset, int len)