diff mbox series

[RFC,v5,06/15] KVM: guest_memfd: Handle final folio_put() of guestmem pages

Message ID 20250117163001.2326672-7-tabba@google.com
State New
Headers show
Series KVM: Restricted mapping of guest_memfd at the host and arm64 support | expand

Commit Message

Fuad Tabba Jan. 17, 2025, 4:29 p.m. UTC
Before transitioning a guest_memfd folio to unshared, thereby
disallowing access by the host and allowing the hypervisor to
transition its view of the guest page as private, we need to be
sure that the host doesn't have any references to the folio.

This patch introduces a new type for guest_memfd folios, and uses
that to register a callback that informs the guest_memfd
subsystem when the last reference is dropped, therefore knowing
that the host doesn't have any remaining references.

Signed-off-by: Fuad Tabba <tabba@google.com>
---
The function kvm_slot_gmem_register_callback() isn't used in this
series. It will be used later in code that performs unsharing of
memory. I have tested it with pKVM, based on downstream code [*].
It's included in this RFC since it demonstrates the plan to
handle unsharing of private folios.

[*] https://android-kvm.googlesource.com/linux/+/refs/heads/tabba/guestmem-6.13-v5-pkvm
---
 include/linux/kvm_host.h   |  11 +++
 include/linux/page-flags.h |   7 ++
 mm/debug.c                 |   1 +
 mm/swap.c                  |   4 +
 virt/kvm/guest_memfd.c     | 145 +++++++++++++++++++++++++++++++++++++
 5 files changed, 168 insertions(+)

Comments

Vlastimil Babka Jan. 20, 2025, 11:37 a.m. UTC | #1
On 1/17/25 17:29, Fuad Tabba wrote:
> Before transitioning a guest_memfd folio to unshared, thereby
> disallowing access by the host and allowing the hypervisor to
> transition its view of the guest page as private, we need to be
> sure that the host doesn't have any references to the folio.
> 
> This patch introduces a new type for guest_memfd folios, and uses
> that to register a callback that informs the guest_memfd
> subsystem when the last reference is dropped, therefore knowing
> that the host doesn't have any remaining references.
> 
> Signed-off-by: Fuad Tabba <tabba@google.com>
> ---
> The function kvm_slot_gmem_register_callback() isn't used in this
> series. It will be used later in code that performs unsharing of
> memory. I have tested it with pKVM, based on downstream code [*].
> It's included in this RFC since it demonstrates the plan to
> handle unsharing of private folios.
> 
> [*] https://android-kvm.googlesource.com/linux/+/refs/heads/tabba/guestmem-6.13-v5-pkvm

<snip>

> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -387,6 +387,28 @@ enum folio_mappability {
>  	KVM_GMEM_NONE_MAPPABLE	= 0b11, /* Not mappable, transient state. */
>  };
>  
> +/*
> + * Unregisters the __folio_put() callback from the folio.
> + *
> + * Restores a folio's refcount after all pending references have been released,
> + * and removes the folio type, thereby removing the callback. Now the folio can
> + * be freed normaly once all actual references have been dropped.
> + *
> + * Must be called with the filemap (inode->i_mapping) invalidate_lock held.
> + * Must also have exclusive access to the folio: folio must be either locked, or
> + * gmem holds the only reference.
> + */
> +static void __kvm_gmem_restore_pending_folio(struct folio *folio)
> +{
> +	if (WARN_ON_ONCE(folio_mapped(folio) || !folio_test_guestmem(folio)))
> +		return;
> +
> +	WARN_ON_ONCE(!folio_test_locked(folio) && folio_ref_count(folio) > 1);

Similar to Kirill's objection on the other patch, I think there might be a
speculative refcount increase (i.e. from a pfn scanner) as long as we have
refcount over 1. Probably not a problem here if we want to restore refcount
anyway? But the warning would be spurious.

> +
> +	__folio_clear_guestmem(folio);
> +	folio_ref_add(folio, folio_nr_pages(folio));
> +}
> +
>  /*
>   * Marks the range [start, end) as mappable by both the host and the guest.
>   * Usually called when guest shares memory with the host.
> @@ -400,7 +422,31 @@ static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
>  
>  	filemap_invalidate_lock(inode->i_mapping);
>  	for (i = start; i < end; i++) {
> +		struct folio *folio = NULL;
> +
> +		/*
> +		 * If the folio is NONE_MAPPABLE, it indicates that it is
> +		 * transitioning to private (GUEST_MAPPABLE). Transition it to
> +		 * shared (ALL_MAPPABLE) immediately, and remove the callback.
> +		 */
> +		if (xa_to_value(xa_load(mappable_offsets, i)) == KVM_GMEM_NONE_MAPPABLE) {
> +			folio = filemap_lock_folio(inode->i_mapping, i);
> +			if (WARN_ON_ONCE(IS_ERR(folio))) {
> +				r = PTR_ERR(folio);
> +				break;
> +			}
> +
> +			if (folio_test_guestmem(folio))
> +				__kvm_gmem_restore_pending_folio(folio);
> +		}
> +
>  		r = xa_err(xa_store(mappable_offsets, i, xval, GFP_KERNEL));
> +
> +		if (folio) {
> +			folio_unlock(folio);
> +			folio_put(folio);
> +		}
> +
>  		if (r)
>  			break;
>  	}
> @@ -473,6 +519,105 @@ static int gmem_clear_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
>  	return r;
>  }
>  
> +/*
> + * Registers a callback to __folio_put(), so that gmem knows that the host does
> + * not have any references to the folio. It does that by setting the folio type
> + * to guestmem.
> + *
> + * Returns 0 if the host doesn't have any references, or -EAGAIN if the host
> + * has references, and the callback has been registered.

Note this comment.

> + *
> + * Must be called with the following locks held:
> + * - filemap (inode->i_mapping) invalidate_lock
> + * - folio lock
> + */
> +static int __gmem_register_callback(struct folio *folio, struct inode *inode, pgoff_t idx)
> +{
> +	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> +	void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
> +	int refcount;
> +
> +	rwsem_assert_held_write_nolockdep(&inode->i_mapping->invalidate_lock);
> +	WARN_ON_ONCE(!folio_test_locked(folio));
> +
> +	if (folio_mapped(folio) || folio_test_guestmem(folio))
> +		return -EAGAIN;

But here we return -EAGAIN and no callback was registered?

> +
> +	/* Register a callback first. */
> +	__folio_set_guestmem(folio);
> +
> +	/*
> +	 * Check for references after setting the type to guestmem, to guard
> +	 * against potential races with the refcount being decremented later.
> +	 *
> +	 * At least one reference is expected because the folio is locked.
> +	 */
> +
> +	refcount = folio_ref_sub_return(folio, folio_nr_pages(folio));
> +	if (refcount == 1) {
> +		int r;
> +
> +		/* refcount isn't elevated, it's now faultable by the guest. */

Again this seems racy, somebody could have just speculatively increased it.
Maybe we need to freeze here as well?

> +		r = WARN_ON_ONCE(xa_err(xa_store(mappable_offsets, idx, xval_guest, GFP_KERNEL)));
> +		if (!r)
> +			__kvm_gmem_restore_pending_folio(folio);
> +
> +		return r;
> +	}
> +
> +	return -EAGAIN;
> +}
> +
> +int kvm_slot_gmem_register_callback(struct kvm_memory_slot *slot, gfn_t gfn)
> +{
> +	unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
> +	struct inode *inode = file_inode(slot->gmem.file);
> +	struct folio *folio;
> +	int r;
> +
> +	filemap_invalidate_lock(inode->i_mapping);
> +
> +	folio = filemap_lock_folio(inode->i_mapping, pgoff);
> +	if (WARN_ON_ONCE(IS_ERR(folio))) {
> +		r = PTR_ERR(folio);
> +		goto out;
> +	}
> +
> +	r = __gmem_register_callback(folio, inode, pgoff);
> +
> +	folio_unlock(folio);
> +	folio_put(folio);
> +out:
> +	filemap_invalidate_unlock(inode->i_mapping);
> +
> +	return r;
> +}
> +
> +/*
> + * Callback function for __folio_put(), i.e., called when all references by the
> + * host to the folio have been dropped. This allows gmem to transition the state
> + * of the folio to mappable by the guest, and allows the hypervisor to continue
> + * transitioning its state to private, since the host cannot attempt to access
> + * it anymore.
> + */
> +void kvm_gmem_handle_folio_put(struct folio *folio)
> +{
> +	struct xarray *mappable_offsets;
> +	struct inode *inode;
> +	pgoff_t index;
> +	void *xval;
> +
> +	inode = folio->mapping->host;
> +	index = folio->index;
> +	mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> +	xval = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
> +
> +	filemap_invalidate_lock(inode->i_mapping);
> +	__kvm_gmem_restore_pending_folio(folio);
> +	WARN_ON_ONCE(xa_err(xa_store(mappable_offsets, index, xval, GFP_KERNEL)));
> +	filemap_invalidate_unlock(inode->i_mapping);
> +}
> +
>  static bool gmem_is_mappable(struct inode *inode, pgoff_t pgoff)
>  {
>  	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
Fuad Tabba Jan. 20, 2025, 12:14 p.m. UTC | #2
Hi Vlastimil,

On Mon, 20 Jan 2025 at 11:37, Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 1/17/25 17:29, Fuad Tabba wrote:
> > Before transitioning a guest_memfd folio to unshared, thereby
> > disallowing access by the host and allowing the hypervisor to
> > transition its view of the guest page as private, we need to be
> > sure that the host doesn't have any references to the folio.
> >
> > This patch introduces a new type for guest_memfd folios, and uses
> > that to register a callback that informs the guest_memfd
> > subsystem when the last reference is dropped, therefore knowing
> > that the host doesn't have any remaining references.
> >
> > Signed-off-by: Fuad Tabba <tabba@google.com>
> > ---
> > The function kvm_slot_gmem_register_callback() isn't used in this
> > series. It will be used later in code that performs unsharing of
> > memory. I have tested it with pKVM, based on downstream code [*].
> > It's included in this RFC since it demonstrates the plan to
> > handle unsharing of private folios.
> >
> > [*] https://android-kvm.googlesource.com/linux/+/refs/heads/tabba/guestmem-6.13-v5-pkvm
>
> <snip>
>
> > --- a/virt/kvm/guest_memfd.c
> > +++ b/virt/kvm/guest_memfd.c
> > @@ -387,6 +387,28 @@ enum folio_mappability {
> >       KVM_GMEM_NONE_MAPPABLE  = 0b11, /* Not mappable, transient state. */
> >  };
> >
> > +/*
> > + * Unregisters the __folio_put() callback from the folio.
> > + *
> > + * Restores a folio's refcount after all pending references have been released,
> > + * and removes the folio type, thereby removing the callback. Now the folio can
> > + * be freed normaly once all actual references have been dropped.
> > + *
> > + * Must be called with the filemap (inode->i_mapping) invalidate_lock held.
> > + * Must also have exclusive access to the folio: folio must be either locked, or
> > + * gmem holds the only reference.
> > + */
> > +static void __kvm_gmem_restore_pending_folio(struct folio *folio)
> > +{
> > +     if (WARN_ON_ONCE(folio_mapped(folio) || !folio_test_guestmem(folio)))
> > +             return;
> > +
> > +     WARN_ON_ONCE(!folio_test_locked(folio) && folio_ref_count(folio) > 1);
>
> Similar to Kirill's objection on the other patch, I think there might be a
> speculative refcount increase (i.e. from a pfn scanner) as long as we have
> refcount over 1. Probably not a problem here if we want to restore refcount
> anyway? But the warning would be spurious.
>
> > +
> > +     __folio_clear_guestmem(folio);
> > +     folio_ref_add(folio, folio_nr_pages(folio));
> > +}
> > +
> >  /*
> >   * Marks the range [start, end) as mappable by both the host and the guest.
> >   * Usually called when guest shares memory with the host.
> > @@ -400,7 +422,31 @@ static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> >
> >       filemap_invalidate_lock(inode->i_mapping);
> >       for (i = start; i < end; i++) {
> > +             struct folio *folio = NULL;
> > +
> > +             /*
> > +              * If the folio is NONE_MAPPABLE, it indicates that it is
> > +              * transitioning to private (GUEST_MAPPABLE). Transition it to
> > +              * shared (ALL_MAPPABLE) immediately, and remove the callback.
> > +              */
> > +             if (xa_to_value(xa_load(mappable_offsets, i)) == KVM_GMEM_NONE_MAPPABLE) {
> > +                     folio = filemap_lock_folio(inode->i_mapping, i);
> > +                     if (WARN_ON_ONCE(IS_ERR(folio))) {
> > +                             r = PTR_ERR(folio);
> > +                             break;
> > +                     }
> > +
> > +                     if (folio_test_guestmem(folio))
> > +                             __kvm_gmem_restore_pending_folio(folio);
> > +             }
> > +
> >               r = xa_err(xa_store(mappable_offsets, i, xval, GFP_KERNEL));
> > +
> > +             if (folio) {
> > +                     folio_unlock(folio);
> > +                     folio_put(folio);
> > +             }
> > +
> >               if (r)
> >                       break;
> >       }
> > @@ -473,6 +519,105 @@ static int gmem_clear_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> >       return r;
> >  }
> >
> > +/*
> > + * Registers a callback to __folio_put(), so that gmem knows that the host does
> > + * not have any references to the folio. It does that by setting the folio type
> > + * to guestmem.
> > + *
> > + * Returns 0 if the host doesn't have any references, or -EAGAIN if the host
> > + * has references, and the callback has been registered.
>
> Note this comment.
>
> > + *
> > + * Must be called with the following locks held:
> > + * - filemap (inode->i_mapping) invalidate_lock
> > + * - folio lock
> > + */
> > +static int __gmem_register_callback(struct folio *folio, struct inode *inode, pgoff_t idx)
> > +{
> > +     struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> > +     void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
> > +     int refcount;
> > +
> > +     rwsem_assert_held_write_nolockdep(&inode->i_mapping->invalidate_lock);
> > +     WARN_ON_ONCE(!folio_test_locked(folio));
> > +
> > +     if (folio_mapped(folio) || folio_test_guestmem(folio))
> > +             return -EAGAIN;
>
> But here we return -EAGAIN and no callback was registered?

This is intentional. If the folio is still mapped (i.e., its mapcount
is elevated), then we cannot register the callback yet, so the
host/vmm needs to unmap first, then try again. That said, I see the
problem with the comment above, and I will clarify this.

> > +
> > +     /* Register a callback first. */
> > +     __folio_set_guestmem(folio);
> > +
> > +     /*
> > +      * Check for references after setting the type to guestmem, to guard
> > +      * against potential races with the refcount being decremented later.
> > +      *
> > +      * At least one reference is expected because the folio is locked.
> > +      */
> > +
> > +     refcount = folio_ref_sub_return(folio, folio_nr_pages(folio));
> > +     if (refcount == 1) {
> > +             int r;
> > +
> > +             /* refcount isn't elevated, it's now faultable by the guest. */
>
> Again this seems racy, somebody could have just speculatively increased it.
> Maybe we need to freeze here as well?

A speculative increase here is ok I think (famous last words). The
callback was registered before the check, therefore, such an increase
would trigger the callback.

Thanks,
/fuad


> > +             r = WARN_ON_ONCE(xa_err(xa_store(mappable_offsets, idx, xval_guest, GFP_KERNEL)));
> > +             if (!r)
> > +                     __kvm_gmem_restore_pending_folio(folio);
> > +
> > +             return r;
> > +     }
> > +
> > +     return -EAGAIN;
> > +}
> > +
> > +int kvm_slot_gmem_register_callback(struct kvm_memory_slot *slot, gfn_t gfn)
> > +{
> > +     unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
> > +     struct inode *inode = file_inode(slot->gmem.file);
> > +     struct folio *folio;
> > +     int r;
> > +
> > +     filemap_invalidate_lock(inode->i_mapping);
> > +
> > +     folio = filemap_lock_folio(inode->i_mapping, pgoff);
> > +     if (WARN_ON_ONCE(IS_ERR(folio))) {
> > +             r = PTR_ERR(folio);
> > +             goto out;
> > +     }
> > +
> > +     r = __gmem_register_callback(folio, inode, pgoff);
> > +
> > +     folio_unlock(folio);
> > +     folio_put(folio);
> > +out:
> > +     filemap_invalidate_unlock(inode->i_mapping);
> > +
> > +     return r;
> > +}
> > +
> > +/*
> > + * Callback function for __folio_put(), i.e., called when all references by the
> > + * host to the folio have been dropped. This allows gmem to transition the state
> > + * of the folio to mappable by the guest, and allows the hypervisor to continue
> > + * transitioning its state to private, since the host cannot attempt to access
> > + * it anymore.
> > + */
> > +void kvm_gmem_handle_folio_put(struct folio *folio)
> > +{
> > +     struct xarray *mappable_offsets;
> > +     struct inode *inode;
> > +     pgoff_t index;
> > +     void *xval;
> > +
> > +     inode = folio->mapping->host;
> > +     index = folio->index;
> > +     mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> > +     xval = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
> > +
> > +     filemap_invalidate_lock(inode->i_mapping);
> > +     __kvm_gmem_restore_pending_folio(folio);
> > +     WARN_ON_ONCE(xa_err(xa_store(mappable_offsets, index, xval, GFP_KERNEL)));
> > +     filemap_invalidate_unlock(inode->i_mapping);
> > +}
> > +
> >  static bool gmem_is_mappable(struct inode *inode, pgoff_t pgoff)
> >  {
> >       struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
>
Fuad Tabba Jan. 23, 2025, 11 a.m. UTC | #3
On Wed, 22 Jan 2025 at 22:24, Ackerley Tng <ackerleytng@google.com> wrote:
>
> Fuad Tabba <tabba@google.com> writes:
>
> >> > <snip>
> >> >
> >> > +/*
> >> > + * Registers a callback to __folio_put(), so that gmem knows that the host does
> >> > + * not have any references to the folio. It does that by setting the folio type
> >> > + * to guestmem.
> >> > + *
> >> > + * Returns 0 if the host doesn't have any references, or -EAGAIN if the host
> >> > + * has references, and the callback has been registered.
> >>
> >> Note this comment.
> >>
> >> > + *
> >> > + * Must be called with the following locks held:
> >> > + * - filemap (inode->i_mapping) invalidate_lock
> >> > + * - folio lock
> >> > + */
> >> > +static int __gmem_register_callback(struct folio *folio, struct inode *inode, pgoff_t idx)
> >> > +{
> >> > +     struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> >> > +     void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
> >> > +     int refcount;
> >> > +
> >> > +     rwsem_assert_held_write_nolockdep(&inode->i_mapping->invalidate_lock);
> >> > +     WARN_ON_ONCE(!folio_test_locked(folio));
> >> > +
> >> > +     if (folio_mapped(folio) || folio_test_guestmem(folio))
> >> > +             return -EAGAIN;
> >>
> >> But here we return -EAGAIN and no callback was registered?
> >
> > This is intentional. If the folio is still mapped (i.e., its mapcount
> > is elevated), then we cannot register the callback yet, so the
> > host/vmm needs to unmap first, then try again. That said, I see the
> > problem with the comment above, and I will clarify this.
> >
> >> > +
> >> > +     /* Register a callback first. */
> >> > +     __folio_set_guestmem(folio);
> >> > +
> >> > +     /*
> >> > +      * Check for references after setting the type to guestmem, to guard
> >> > +      * against potential races with the refcount being decremented later.
> >> > +      *
> >> > +      * At least one reference is expected because the folio is locked.
> >> > +      */
> >> > +
> >> > +     refcount = folio_ref_sub_return(folio, folio_nr_pages(folio));
> >> > +     if (refcount == 1) {
> >> > +             int r;
> >> > +
> >> > +             /* refcount isn't elevated, it's now faultable by the guest. */
> >>
> >> Again this seems racy, somebody could have just speculatively increased it.
> >> Maybe we need to freeze here as well?
> >
> > A speculative increase here is ok I think (famous last words). The
> > callback was registered before the check, therefore, such an increase
> > would trigger the callback.
> >
> > Thanks,
> > /fuad
> >
> >
>
> I checked the callback (kvm_gmem_handle_folio_put()) and agree with you
> that the mappability reset to KVM_GMEM_GUEST_MAPPABLE is handled
> correctly (since kvm_gmem_handle_folio_put() doesn't assume anything
> about the mappability state at callback-time).
>
> However, what if the new speculative reference writes to the page and
> guest goes on to fault/use the page?

I don't think that's a problem. At this point the page is in a
transient state, but still shared from the guest's point of view.
Moreover, no one can fault-in the page at the host at this point (we
check in kvm_gmem_fault()).

Let's have a look at the code:

+static int __gmem_register_callback(struct folio *folio, struct inode
*inode, pgoff_t idx)
+{
+       struct xarray *mappable_offsets =
&kvm_gmem_private(inode)->mappable_offsets;
+       void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
+       int refcount;

At this point the guest still perceives the page as shared, the state
of the page is KVM_GMEM_NONE_MAPPABLE (transient state). This means
that kvm_gmem_fault() doesn't fault-in the page at the host anymore.

+       rwsem_assert_held_write_nolockdep(&inode->i_mapping->invalidate_lock);
+       WARN_ON_ONCE(!folio_test_locked(folio));
+
+       if (folio_mapped(folio) || folio_test_guestmem(folio))
+               return -EAGAIN;
+
+       /* Register a callback first. */
+       __folio_set_guestmem(folio);

This (in addition to the state of the NONE_MAPPABLE), also ensures
that kvm_gmem_fault() doesn't fault-in the page at the host anymore.

+       /*
+        * Check for references after setting the type to guestmem, to guard
+        * against potential races with the refcount being decremented later.
+        *
+        * At least one reference is expected because the folio is locked.
+        */
+
+       refcount = folio_ref_sub_return(folio, folio_nr_pages(folio));
+       if (refcount == 1) {
+               int r;

At this point we know that guest_memfd has the only real reference.
Speculative references AFAIK do not access the page itself.
+
+               /* refcount isn't elevated, it's now faultable by the guest. */
+               r = WARN_ON_ONCE(xa_err(xa_store(mappable_offsets,
idx, xval_guest, GFP_KERNEL)));

Now it's safe so let the guest know that it can map the page.

+               if (!r)
+                       __kvm_gmem_restore_pending_folio(folio);
+
+               return r;
+       }
+
+       return -EAGAIN;
+}

Does this make sense, or did I miss something?

Thanks!
/fuad

> >> > +             r = WARN_ON_ONCE(xa_err(xa_store(mappable_offsets, idx, xval_guest, GFP_KERNEL)));
> >> > +             if (!r)
> >> > +                     __kvm_gmem_restore_pending_folio(folio);
> >> > +
> >> > +             return r;
> >> > +     }
> >> > +
> >> > +     return -EAGAIN;
> >> > +}
> >> > +
> >> > +int kvm_slot_gmem_register_callback(struct kvm_memory_slot *slot, gfn_t gfn)
> >> > +{
> >> > +     unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
> >> > +     struct inode *inode = file_inode(slot->gmem.file);
> >> > +     struct folio *folio;
> >> > +     int r;
> >> > +
> >> > +     filemap_invalidate_lock(inode->i_mapping);
> >> > +
> >> > +     folio = filemap_lock_folio(inode->i_mapping, pgoff);
> >> > +     if (WARN_ON_ONCE(IS_ERR(folio))) {
> >> > +             r = PTR_ERR(folio);
> >> > +             goto out;
> >> > +     }
> >> > +
> >> > +     r = __gmem_register_callback(folio, inode, pgoff);
> >> > +
> >> > +     folio_unlock(folio);
> >> > +     folio_put(folio);
> >> > +out:
> >> > +     filemap_invalidate_unlock(inode->i_mapping);
> >> > +
> >> > +     return r;
> >> > +}
> >> > +
> >> > +/*
> >> > + * Callback function for __folio_put(), i.e., called when all references by the
> >> > + * host to the folio have been dropped. This allows gmem to transition the state
> >> > + * of the folio to mappable by the guest, and allows the hypervisor to continue
> >> > + * transitioning its state to private, since the host cannot attempt to access
> >> > + * it anymore.
> >> > + */
> >> > +void kvm_gmem_handle_folio_put(struct folio *folio)
> >> > +{
> >> > +     struct xarray *mappable_offsets;
> >> > +     struct inode *inode;
> >> > +     pgoff_t index;
> >> > +     void *xval;
> >> > +
> >> > +     inode = folio->mapping->host;
> >> > +     index = folio->index;
> >> > +     mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> >> > +     xval = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
> >> > +
> >> > +     filemap_invalidate_lock(inode->i_mapping);
> >> > +     __kvm_gmem_restore_pending_folio(folio);
> >> > +     WARN_ON_ONCE(xa_err(xa_store(mappable_offsets, index, xval, GFP_KERNEL)));
> >> > +     filemap_invalidate_unlock(inode->i_mapping);
> >> > +}
> >> > +
> >> >  static bool gmem_is_mappable(struct inode *inode, pgoff_t pgoff)
> >> >  {
> >> >       struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> >>
diff mbox series

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 84aa7908a5dd..63e6d6dd98b3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2574,6 +2574,8 @@  int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, gfn_t start,
 				 gfn_t end);
 bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, gfn_t gfn);
 bool kvm_slot_gmem_is_guest_mappable(struct kvm_memory_slot *slot, gfn_t gfn);
+int kvm_slot_gmem_register_callback(struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_gmem_handle_folio_put(struct folio *folio);
 #else
 static inline bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t gfn, gfn_t end)
 {
@@ -2615,6 +2617,15 @@  static inline bool kvm_slot_gmem_is_guest_mappable(struct kvm_memory_slot *slot,
 	WARN_ON_ONCE(1);
 	return false;
 }
+static inline int kvm_slot_gmem_register_callback(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	WARN_ON_ONCE(1);
+	return -EINVAL;
+}
+static inline void kvm_gmem_handle_folio_put(struct folio *folio)
+{
+	WARN_ON_ONCE(1);
+}
 #endif /* CONFIG_KVM_GMEM_MAPPABLE */
 
 #endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 6615f2f59144..bab3cac1f93b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -942,6 +942,7 @@  enum pagetype {
 	PGTY_slab	= 0xf5,
 	PGTY_zsmalloc	= 0xf6,
 	PGTY_unaccepted	= 0xf7,
+	PGTY_guestmem	= 0xf8,
 
 	PGTY_mapcount_underflow = 0xff
 };
@@ -1091,6 +1092,12 @@  FOLIO_TYPE_OPS(hugetlb, hugetlb)
 FOLIO_TEST_FLAG_FALSE(hugetlb)
 #endif
 
+#ifdef CONFIG_KVM_GMEM_MAPPABLE
+FOLIO_TYPE_OPS(guestmem, guestmem)
+#else
+FOLIO_TEST_FLAG_FALSE(guestmem)
+#endif
+
 PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc)
 
 /*
diff --git a/mm/debug.c b/mm/debug.c
index 95b6ab809c0e..db93be385ed9 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -56,6 +56,7 @@  static const char *page_type_names[] = {
 	DEF_PAGETYPE_NAME(table),
 	DEF_PAGETYPE_NAME(buddy),
 	DEF_PAGETYPE_NAME(unaccepted),
+	DEF_PAGETYPE_NAME(guestmem),
 };
 
 static const char *page_type_name(unsigned int page_type)
diff --git a/mm/swap.c b/mm/swap.c
index 6f01b56bce13..15220eaabc86 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -37,6 +37,7 @@ 
 #include <linux/page_idle.h>
 #include <linux/local_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/kvm_host.h>
 
 #include "internal.h"
 
@@ -103,6 +104,9 @@  static void free_typed_folio(struct folio *folio)
 	case PGTY_offline:
 		/* Nothing to do, it's offline. */
 		return;
+	case PGTY_guestmem:
+		kvm_gmem_handle_folio_put(folio);
+		return;
 	default:
 		WARN_ON_ONCE(1);
 	}
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index d1c192927cf7..722afd9f8742 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -387,6 +387,28 @@  enum folio_mappability {
 	KVM_GMEM_NONE_MAPPABLE	= 0b11, /* Not mappable, transient state. */
 };
 
+/*
+ * Unregisters the __folio_put() callback from the folio.
+ *
+ * Restores a folio's refcount after all pending references have been released,
+ * and removes the folio type, thereby removing the callback. Now the folio can
+ * be freed normaly once all actual references have been dropped.
+ *
+ * Must be called with the filemap (inode->i_mapping) invalidate_lock held.
+ * Must also have exclusive access to the folio: folio must be either locked, or
+ * gmem holds the only reference.
+ */
+static void __kvm_gmem_restore_pending_folio(struct folio *folio)
+{
+	if (WARN_ON_ONCE(folio_mapped(folio) || !folio_test_guestmem(folio)))
+		return;
+
+	WARN_ON_ONCE(!folio_test_locked(folio) && folio_ref_count(folio) > 1);
+
+	__folio_clear_guestmem(folio);
+	folio_ref_add(folio, folio_nr_pages(folio));
+}
+
 /*
  * Marks the range [start, end) as mappable by both the host and the guest.
  * Usually called when guest shares memory with the host.
@@ -400,7 +422,31 @@  static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
 
 	filemap_invalidate_lock(inode->i_mapping);
 	for (i = start; i < end; i++) {
+		struct folio *folio = NULL;
+
+		/*
+		 * If the folio is NONE_MAPPABLE, it indicates that it is
+		 * transitioning to private (GUEST_MAPPABLE). Transition it to
+		 * shared (ALL_MAPPABLE) immediately, and remove the callback.
+		 */
+		if (xa_to_value(xa_load(mappable_offsets, i)) == KVM_GMEM_NONE_MAPPABLE) {
+			folio = filemap_lock_folio(inode->i_mapping, i);
+			if (WARN_ON_ONCE(IS_ERR(folio))) {
+				r = PTR_ERR(folio);
+				break;
+			}
+
+			if (folio_test_guestmem(folio))
+				__kvm_gmem_restore_pending_folio(folio);
+		}
+
 		r = xa_err(xa_store(mappable_offsets, i, xval, GFP_KERNEL));
+
+		if (folio) {
+			folio_unlock(folio);
+			folio_put(folio);
+		}
+
 		if (r)
 			break;
 	}
@@ -473,6 +519,105 @@  static int gmem_clear_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
 	return r;
 }
 
+/*
+ * Registers a callback to __folio_put(), so that gmem knows that the host does
+ * not have any references to the folio. It does that by setting the folio type
+ * to guestmem.
+ *
+ * Returns 0 if the host doesn't have any references, or -EAGAIN if the host
+ * has references, and the callback has been registered.
+ *
+ * Must be called with the following locks held:
+ * - filemap (inode->i_mapping) invalidate_lock
+ * - folio lock
+ */
+static int __gmem_register_callback(struct folio *folio, struct inode *inode, pgoff_t idx)
+{
+	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
+	void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
+	int refcount;
+
+	rwsem_assert_held_write_nolockdep(&inode->i_mapping->invalidate_lock);
+	WARN_ON_ONCE(!folio_test_locked(folio));
+
+	if (folio_mapped(folio) || folio_test_guestmem(folio))
+		return -EAGAIN;
+
+	/* Register a callback first. */
+	__folio_set_guestmem(folio);
+
+	/*
+	 * Check for references after setting the type to guestmem, to guard
+	 * against potential races with the refcount being decremented later.
+	 *
+	 * At least one reference is expected because the folio is locked.
+	 */
+
+	refcount = folio_ref_sub_return(folio, folio_nr_pages(folio));
+	if (refcount == 1) {
+		int r;
+
+		/* refcount isn't elevated, it's now faultable by the guest. */
+		r = WARN_ON_ONCE(xa_err(xa_store(mappable_offsets, idx, xval_guest, GFP_KERNEL)));
+		if (!r)
+			__kvm_gmem_restore_pending_folio(folio);
+
+		return r;
+	}
+
+	return -EAGAIN;
+}
+
+int kvm_slot_gmem_register_callback(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
+	struct inode *inode = file_inode(slot->gmem.file);
+	struct folio *folio;
+	int r;
+
+	filemap_invalidate_lock(inode->i_mapping);
+
+	folio = filemap_lock_folio(inode->i_mapping, pgoff);
+	if (WARN_ON_ONCE(IS_ERR(folio))) {
+		r = PTR_ERR(folio);
+		goto out;
+	}
+
+	r = __gmem_register_callback(folio, inode, pgoff);
+
+	folio_unlock(folio);
+	folio_put(folio);
+out:
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	return r;
+}
+
+/*
+ * Callback function for __folio_put(), i.e., called when all references by the
+ * host to the folio have been dropped. This allows gmem to transition the state
+ * of the folio to mappable by the guest, and allows the hypervisor to continue
+ * transitioning its state to private, since the host cannot attempt to access
+ * it anymore.
+ */
+void kvm_gmem_handle_folio_put(struct folio *folio)
+{
+	struct xarray *mappable_offsets;
+	struct inode *inode;
+	pgoff_t index;
+	void *xval;
+
+	inode = folio->mapping->host;
+	index = folio->index;
+	mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
+	xval = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
+
+	filemap_invalidate_lock(inode->i_mapping);
+	__kvm_gmem_restore_pending_folio(folio);
+	WARN_ON_ONCE(xa_err(xa_store(mappable_offsets, index, xval, GFP_KERNEL)));
+	filemap_invalidate_unlock(inode->i_mapping);
+}
+
 static bool gmem_is_mappable(struct inode *inode, pgoff_t pgoff)
 {
 	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;