[RFC,3/4] mm: guest_memfd: Add option to remove guest private memory from direct map

Message ID 20240805-guest-memfd-lib-v1-3-e5a29a4ff5d7@quicinc.com
State New
Series mm: Introduce guest_memfd library

Commit Message

Elliot Berman Aug. 5, 2024, 6:34 p.m. UTC
This patch was reworked from Patrick's patch:
https://lore.kernel.org/all/20240709132041.3625501-6-roypat@amazon.co.uk/

While guest_memfd is not available to be mapped by userspace, it is
still accessible through the kernel's direct map. This means that in
scenarios where guest-private memory is not hardware protected, it can
be speculatively read and its contents potentially leaked through
hardware side-channels. Removing guest-private memory from the direct
map thus mitigates a large class of speculative execution issues
[1, Table 1].

Direct map removal does not reuse the `.prepare` machinery, since
`prepare` can be called multiple times, and it is the responsibility of
the preparation routine to not "prepare" the same folio twice [2]. Thus,
instead, explicitly check whether `filemap_grab_folio` allocated a new
folio, and remove the returned folio from the direct map only if this
was the case.

The patch uses release_folio instead of free_folio to reinsert pages
back into the direct map because, by the time free_folio is called,
folio->mapping can already be NULL. This means that a call to
folio_inode inside free_folio might dereference a NULL pointer, leaving
no way to access the inode which stores the flags needed to determine
whether the page was removed from the direct map in the first place.

[1]: https://download.vusec.net/papers/quarantine_raid23.pdf

Cc: Patrick Roy <roypat@amazon.co.uk>
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
---
 include/linux/guest_memfd.h |  8 ++++++
 mm/guest_memfd.c            | 65 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 72 insertions(+), 1 deletion(-)

Comments

David Hildenbrand Aug. 6, 2024, 2:08 p.m. UTC | #1
On 05.08.24 20:34, Elliot Berman wrote:
> This patch was reworked from Patrick's patch:
> https://lore.kernel.org/all/20240709132041.3625501-6-roypat@amazon.co.uk/
> 
> While guest_memfd is not available to be mapped by userspace, it is
> still accessible through the kernel's direct map. This means that in
> scenarios where guest-private memory is not hardware protected, it can
> be speculatively read and its contents potentially leaked through
> hardware side-channels. Removing guest-private memory from the direct
> map thus mitigates a large class of speculative execution issues
> [1, Table 1].

I think you have to point out here that the speculative execution issues
are primarily an issue when guest_memfd private memory is used without
TDX and friends, where the memory would be encrypted either way.

Or am I wrong?

> 
> [...]
> 
> diff --git a/include/linux/guest_memfd.h b/include/linux/guest_memfd.h
> index be56d9d53067..f9e4a27aed67 100644
> --- a/include/linux/guest_memfd.h
> +++ b/include/linux/guest_memfd.h
> @@ -25,6 +25,14 @@ struct guest_memfd_operations {
>   	int (*release)(struct inode *inode);
>   };
>   
> +/**
> + * @GUEST_MEMFD_FLAG_NO_DIRECT_MAP: When making folios inaccessible by host, also
> + *                                  remove them from the kernel's direct map.
> + */

Should we start introducing the concept of private and shared first, 
such that we can then say that this only applies to private memory?

> +enum {
> +	GUEST_MEMFD_FLAG_NO_DIRECT_MAP		= BIT(0),
> +};
> +
>   /**
>    * @GUEST_MEMFD_GRAB_UPTODATE: Ensure pages are zeroed/up to date.
>    *                             If trusted hyp will do it, can ommit this flag
> diff --git a/mm/guest_memfd.c b/mm/guest_memfd.c
> index 580138b0f9d4..e9d8cab72b28 100644
> --- a/mm/guest_memfd.c
> +++ b/mm/guest_memfd.c
> @@ -7,9 +7,55 @@
>   #include <linux/falloc.h>
>   #include <linux/guest_memfd.h>
>   #include <linux/pagemap.h>
> +#include <linux/set_memory.h>
> +
> +static inline int guest_memfd_folio_private(struct folio *folio)
> +{
> +	unsigned long nr_pages = folio_nr_pages(folio);

guest_memfd only supports small folios, so this can be simplified.
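
E.g., something like this (an untested sketch; assumes the folio is
always a single page, so there is no partial-failure case to unwind):

	static inline int guest_memfd_folio_private(struct folio *folio)
	{
		int r = set_direct_map_invalid_noflush(folio_page(folio, 0));

		if (!r)
			folio_set_private(folio);
		return r;
	}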

> +	unsigned long i;
> +	int r;
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		struct page *page = folio_page(folio, i);
> +
> +		r = set_direct_map_invalid_noflush(page);
> +		if (r < 0)
> +			goto out_remap;
> +	}
> +
> +	folio_set_private(folio);
> +	return 0;
> +out_remap:
> +	for (; i > 0; i--) {
> +		struct page *page = folio_page(folio, i - 1);
> +
> +		BUG_ON(set_direct_map_default_noflush(page));
> +	}
> +	return r;
> +}
> +
> +static inline void guest_memfd_folio_clear_private(struct folio *folio)

The set/clear private semantics in this context are a bit confusing. I 
assume you mean "make inaccessible" / "make accessible", and using the 
PG_private flag is just an implementation detail.

> +{
> +	unsigned long start = (unsigned long)folio_address(folio);
> +	unsigned long nr = folio_nr_pages(folio);
> +	unsigned long i;
> +
> +	if (!folio_test_private(folio))
> +		return;
> +
> +	for (i = 0; i < nr; i++) {
> +		struct page *page = folio_page(folio, i);
> +
> +		BUG_ON(set_direct_map_default_noflush(page));
> +	}
> +	flush_tlb_kernel_range(start, start + folio_size(folio));
> +
> +	folio_clear_private(folio);
> +}
>   
>   struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags)
>   {
> +	unsigned long gmem_flags = (unsigned long)file->private_data;
>   	struct inode *inode = file_inode(file);
>   	struct guest_memfd_operations *ops = inode->i_private;
>   	struct folio *folio;
> @@ -43,6 +89,12 @@ struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
>   			goto out_err;
>   	}
>   
> +	if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
> +		r = guest_memfd_folio_private(folio);
> +		if (r)
> +			goto out_err;
> +	}
> +
>   	/*
>   	 * Ignore accessed, referenced, and dirty flags.  The memory is
>   	 * unevictable and there is no storage to write back to.
> @@ -213,14 +265,25 @@ static bool gmem_release_folio(struct folio *folio, gfp_t gfp)
>   	if (ops->invalidate_end)
>   		ops->invalidate_end(inode, offset, nr);
>   
> +	guest_memfd_folio_clear_private(folio);
> +
>   	return true;
>   }
>   
> +static void gmem_invalidate_folio(struct folio *folio, size_t offset, size_t len)
> +{
> +	/* not yet supported */
> +	BUG_ON(offset || len != folio_size(folio));
> +
> +	BUG_ON(!gmem_release_folio(folio, 0));

In general, no BUG_ON please. WARN_ON_ONCE() is sufficient.
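
For illustration, that could be (untested):

	static void gmem_invalidate_folio(struct folio *folio, size_t offset, size_t len)
	{
		/* Partial invalidation is not yet supported. */
		if (WARN_ON_ONCE(offset || len != folio_size(folio)))
			return;

		WARN_ON_ONCE(!gmem_release_folio(folio, 0));
	}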

> +}
> +
>   static const struct address_space_operations gmem_aops = {
>   	.dirty_folio = noop_dirty_folio,
>   	.migrate_folio = gmem_migrate_folio,
>   	.error_remove_folio = gmem_error_folio,
>   	.release_folio = gmem_release_folio,
> +	.invalidate_folio = gmem_invalidate_folio,
>   };
>   
>   static inline bool guest_memfd_check_ops(const struct guest_memfd_operations *ops)
> @@ -241,7 +304,7 @@ struct file *guest_memfd_alloc(const char *name,
>   	if (!guest_memfd_check_ops(ops))
>   		return ERR_PTR(-EINVAL);
>   
> -	if (flags)
> +	if (flags & ~GUEST_MEMFD_FLAG_NO_DIRECT_MAP)
>   		return ERR_PTR(-EINVAL);
>   
>   	/*
>
Manwaring, Derek Aug. 8, 2024, 12:14 a.m. UTC | #2
On 2024-08-06 07:10-0700 David Hildenbrand wrote:
> > While guest_memfd is not available to be mapped by userspace, it is
> > still accessible through the kernel's direct map. This means that in
> > scenarios where guest-private memory is not hardware protected, it can
> > be speculatively read and its contents potentially leaked through
> > hardware side-channels. Removing guest-private memory from the direct
> > map thus mitigates a large class of speculative execution issues
> > [1, Table 1].
>
> I think you have to point out here that the speculative execution issues
> are primarily only an issue when guest_memfd private memory is used
> without TDX and friends where the memory would be encrypted either way.
>
> Or am I wrong?

Actually, I'm not sure how much protection CoCo solutions offer in this
regard. I'd love to hear more from Intel and AMD on this, but it looks
like they are not targeting full coverage for these types of attacks
(beyond protecting guest mitigation settings from manipulation by the
host).  For example, see this selection from AMD's 2020 whitepaper [1]
on SEV-SNP:

"There are certain classes of attacks that are not in scope for any of
these three features. Architectural side channel attacks on CPU data
structures are not specifically prevented by any hardware means. As with
standard software security practices, code which is sensitive to such
side channel attacks (e.g., cryptographic libraries) should be written
in a way which helps prevent such attacks."

And:

"While SEV-SNP offers guests several options when it comes to protection
from speculative side channel attacks and SMT, it is not able to protect
against all possible side channel attacks. For example, traditional side
channel attacks on software such as PRIME+PROBE are not protected by
SEV-SNP."

Intel's docs also indicate guests need to protect themselves in some
cases, saying "TD software should be aware that potentially untrusted
software running outside a TD may be able to influence conditional
branch predictions of software running in a TD" [2] and "a TDX guest VM
is no different from a legacy guest VM in terms of protecting this
userspace <-> OS kernel boundary" [3]. But these focus on hardening
kernel & software within the guest.

What's not clear to me is what happens during transient execution when
the host kernel attempts to access a page in physical memory that
belongs to a guest. I assume if it only happens transiently, it will not
result in a machine check like it would if the instructions were
actually retired. As far as I can tell encryption happens between the
CPU & main memory, so cache contents will be plaintext. This seems to
leave open the possibility of the host kernel retrieving the plaintext
cache contents with a transient execution attack. I assume vendors have
controls in place to stop this, but Foreshadow/L1TF is a good example of
one place this fell apart for SGX [4].

All that said, we're also dependent on hardware not being subject to
L1TF-style issues for the currently proposed non-CoCo method to be
effective.  We're simply clearing the Present bit while the physmap PTE
still points to the guest physical page. This was found to be
exploitable across OS & VMM boundaries on Intel server parts before
Cascade Lake [5] (thanks to Claudio for highlighting this). So that's a
long way of saying TDX may offer similar protection, but not because of
encryption.

Derek

[1] https://www.amd.com/content/dam/amd/en/documents/epyc-business-docs/white-papers/SEV-SNP-strengthening-vm-isolation-with-integrity-protection-and-more.pdf#page=19
[2] https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/best-practices/trusted-domain-security-guidance-for-developers.html
[3] https://intel.github.io/ccc-linux-guest-hardening-docs/security-spec.html#transient-execution-attacks-and-their-mitigation
[4] https://foreshadowattack.eu/foreshadow.pdf
[5] https://foreshadowattack.eu/foreshadow-NG.pdf
Manwaring, Derek Aug. 15, 2024, 7:08 p.m. UTC | #3
On 2024-08-07 17:16-0700 Derek Manwaring wrote:
> All that said, we're also dependent on hardware not being subject to
> L1TF-style issues for the currently proposed non-CoCo method to be
> effective. We're simply clearing the Present bit while the physmap PTE
> still points to the guest physical page.

I was wrong here. The set_direct_map_invalid_noflush implementation
moves through __change_page_attr and pfn_pte, eventually arriving at
flip_protnone_guard where the PFN is inverted & thus no longer valid for
pages marked not present. So we do benefit from that prior work's extra
protection against L1TF.
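
For reference, the relevant logic, paraphrased from
arch/x86/include/asm/pgtable-invert.h:

	/* A nonzero PTE value without the Present bit needs inversion. */
	static inline bool __pte_needs_invert(u64 val)
	{
		return val && !(val & _PAGE_PRESENT);
	}

	/*
	 * When a PTE flips between present and not-present, invert its PFN
	 * bits so that a stale not-present entry no longer encodes a valid
	 * physical address that speculation could use (L1TF).
	 */
	static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
	{
		if (__pte_needs_invert(oldval) != __pte_needs_invert(val))
			val = (val & ~mask) | (~val & mask);
		return val;
	}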

Thank you for finding this, Patrick.

Derek
Mike Rapoport Aug. 19, 2024, 10:09 a.m. UTC | #4
On Mon, Aug 05, 2024 at 11:34:49AM -0700, Elliot Berman wrote:
> [...]
> 
> diff --git a/include/linux/guest_memfd.h b/include/linux/guest_memfd.h
> index be56d9d53067..f9e4a27aed67 100644
> --- a/include/linux/guest_memfd.h
> +++ b/include/linux/guest_memfd.h
> @@ -25,6 +25,14 @@ struct guest_memfd_operations {
>  	int (*release)(struct inode *inode);
>  };
>  
> +/**
> + * @GUEST_MEMFD_FLAG_NO_DIRECT_MAP: When making folios inaccessible by host, also
> + *                                  remove them from the kernel's direct map.
> + */
> +enum {

please name this enum, otherwise kernel-doc won't be happy
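
E.g. (the enum name here is just a placeholder):

	/**
	 * enum guest_memfd_flags - guest_memfd creation flags
	 * @GUEST_MEMFD_FLAG_NO_DIRECT_MAP: When making folios inaccessible by
	 *                                  host, also remove them from the
	 *                                  kernel's direct map.
	 */
	enum guest_memfd_flags {
		GUEST_MEMFD_FLAG_NO_DIRECT_MAP	= BIT(0),
	};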

> +	GUEST_MEMFD_FLAG_NO_DIRECT_MAP		= BIT(0),
> +};
> +
>  /**
>   * @GUEST_MEMFD_GRAB_UPTODATE: Ensure pages are zeroed/up to date.
>   *                             If trusted hyp will do it, can ommit this flag
> diff --git a/mm/guest_memfd.c b/mm/guest_memfd.c
> index 580138b0f9d4..e9d8cab72b28 100644
> --- a/mm/guest_memfd.c
> +++ b/mm/guest_memfd.c
> @@ -7,9 +7,55 @@
>  #include <linux/falloc.h>
>  #include <linux/guest_memfd.h>
>  #include <linux/pagemap.h>
> +#include <linux/set_memory.h>
> +
> +static inline int guest_memfd_folio_private(struct folio *folio)
> +{
> +	unsigned long nr_pages = folio_nr_pages(folio);
> +	unsigned long i;
> +	int r;
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		struct page *page = folio_page(folio, i);
> +
> +		r = set_direct_map_invalid_noflush(page);
> +		if (r < 0)
> +			goto out_remap;
> +	}
> +
> +	folio_set_private(folio);
> +	return 0;
> +out_remap:
> +	for (; i > 0; i--) {
> +		struct page *page = folio_page(folio, i - 1);
> +
> +		BUG_ON(set_direct_map_default_noflush(page));
> +	}
> +	return r;
> +}
> +
> +static inline void guest_memfd_folio_clear_private(struct folio *folio)
> +{
> +	unsigned long start = (unsigned long)folio_address(folio);
> +	unsigned long nr = folio_nr_pages(folio);
> +	unsigned long i;
> +
> +	if (!folio_test_private(folio))
> +		return;
> +
> +	for (i = 0; i < nr; i++) {
> +		struct page *page = folio_page(folio, i);
> +
> +		BUG_ON(set_direct_map_default_noflush(page));
> +	}
> +	flush_tlb_kernel_range(start, start + folio_size(folio));

I think that TLB flush should come after removing pages from the direct map
rather than after adding them back.
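
I.e., roughly (untested):

	unsigned long start = (unsigned long)folio_address(folio);

	for (i = 0; i < nr_pages; i++) {
		struct page *page = folio_page(folio, i);

		r = set_direct_map_invalid_noflush(page);
		if (r < 0)
			goto out_remap;
	}
	/* flush stale direct-map entries while the pages are inaccessible */
	flush_tlb_kernel_range(start, start + folio_size(folio));

	folio_set_private(folio);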

> +
> +	folio_clear_private(folio);
> +}
>  
>  struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags)
>  {
> +	unsigned long gmem_flags = (unsigned long)file->private_data;
>  	struct inode *inode = file_inode(file);
>  	struct guest_memfd_operations *ops = inode->i_private;
>  	struct folio *folio;
Elliot Berman Aug. 20, 2024, 4:56 p.m. UTC | #5
On Mon, Aug 19, 2024 at 01:09:52PM +0300, Mike Rapoport wrote:
> On Mon, Aug 05, 2024 at 11:34:49AM -0700, Elliot Berman wrote:
> > [...]
> > 
> > +	for (i = 0; i < nr; i++) {
> > +		struct page *page = folio_page(folio, i);
> > +
> > +		BUG_ON(set_direct_map_default_noflush(page));
> > +	}
> > +	flush_tlb_kernel_range(start, start + folio_size(folio));
> 
> I think that TLB flush should come after removing pages from the direct map
> rather than after adding them back.
> 

Gunyah flushes the tlb when it removes the stage 2 mapping, so we
skipped it on removal as a performance optimization. I remember seeing
that pKVM does the same (tlb flush for the stage 2 unmap & the
equivalent for x86). Patrick had also done the same in their patches.

Thanks,
Elliot
Mike Rapoport Aug. 21, 2024, 2:26 p.m. UTC | #6
On Tue, Aug 20, 2024 at 09:56:10AM -0700, Elliot Berman wrote:
> On Mon, Aug 19, 2024 at 01:09:52PM +0300, Mike Rapoport wrote:
> > On Mon, Aug 05, 2024 at 11:34:49AM -0700, Elliot Berman wrote:
> > > [...]
> > > 
> > > +	flush_tlb_kernel_range(start, start + folio_size(folio));
> > 
> > I think that TLB flush should come after removing pages from the direct map
> > rather than after adding them back.
> > 
> 
> Gunyah flushes the tlb when it removes the stage 2 mapping, so we
> skipped it on removal as a performance optimization. I remember seeing
> that pKVM does the same (tlb flush for the stage 2 unmap & the
> equivalent for x86). Patrick had also done the same in their patches.

Strictly from the API perspective, unmapping the pages from the direct map
would imply removing potentially stale TLB entries.
If all currently anticipated users do it elsewhere, at the very least there
should be a huge bold comment.

And what's the point of tlb flush after setting the direct map to default?
There should not be stale tlb entries for the unmapped pages.
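
Something along these lines, perhaps (wording is only a sketch):

	/*
	 * NOTE: no TLB flush when the pages are removed from the direct
	 * map. Current users (e.g. Gunyah, pKVM) flush the TLB as part of
	 * the stage-2 unmap; stale direct-map entries may persist until
	 * then, and callers that don't get that flush elsewhere must do
	 * it themselves.
	 */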
 
> Thanks,
> Elliot

Patch

diff --git a/include/linux/guest_memfd.h b/include/linux/guest_memfd.h
index be56d9d53067..f9e4a27aed67 100644
--- a/include/linux/guest_memfd.h
+++ b/include/linux/guest_memfd.h
@@ -25,6 +25,14 @@  struct guest_memfd_operations {
 	int (*release)(struct inode *inode);
 };
 
+/**
+ * @GUEST_MEMFD_FLAG_NO_DIRECT_MAP: When making folios inaccessible by host, also
+ *                                  remove them from the kernel's direct map.
+ */
+enum {
+	GUEST_MEMFD_FLAG_NO_DIRECT_MAP		= BIT(0),
+};
+
 /**
  * @GUEST_MEMFD_GRAB_UPTODATE: Ensure pages are zeroed/up to date.
  *                             If trusted hyp will do it, can ommit this flag
diff --git a/mm/guest_memfd.c b/mm/guest_memfd.c
index 580138b0f9d4..e9d8cab72b28 100644
--- a/mm/guest_memfd.c
+++ b/mm/guest_memfd.c
@@ -7,9 +7,55 @@ 
 #include <linux/falloc.h>
 #include <linux/guest_memfd.h>
 #include <linux/pagemap.h>
+#include <linux/set_memory.h>
+
+static inline int guest_memfd_folio_private(struct folio *folio)
+{
+	unsigned long nr_pages = folio_nr_pages(folio);
+	unsigned long i;
+	int r;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = folio_page(folio, i);
+
+		r = set_direct_map_invalid_noflush(page);
+		if (r < 0)
+			goto out_remap;
+	}
+
+	folio_set_private(folio);
+	return 0;
+out_remap:
+	for (; i > 0; i--) {
+		struct page *page = folio_page(folio, i - 1);
+
+		BUG_ON(set_direct_map_default_noflush(page));
+	}
+	return r;
+}
+
+static inline void guest_memfd_folio_clear_private(struct folio *folio)
+{
+	unsigned long start = (unsigned long)folio_address(folio);
+	unsigned long nr = folio_nr_pages(folio);
+	unsigned long i;
+
+	if (!folio_test_private(folio))
+		return;
+
+	for (i = 0; i < nr; i++) {
+		struct page *page = folio_page(folio, i);
+
+		BUG_ON(set_direct_map_default_noflush(page));
+	}
+	flush_tlb_kernel_range(start, start + folio_size(folio));
+
+	folio_clear_private(folio);
+}
 
 struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags)
 {
+	unsigned long gmem_flags = (unsigned long)file->private_data;
 	struct inode *inode = file_inode(file);
 	struct guest_memfd_operations *ops = inode->i_private;
 	struct folio *folio;
@@ -43,6 +89,12 @@  struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags
 			goto out_err;
 	}
 
+	if (gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
+		r = guest_memfd_folio_private(folio);
+		if (r)
+			goto out_err;
+	}
+
 	/*
 	 * Ignore accessed, referenced, and dirty flags.  The memory is
 	 * unevictable and there is no storage to write back to.
@@ -213,14 +265,25 @@  static bool gmem_release_folio(struct folio *folio, gfp_t gfp)
 	if (ops->invalidate_end)
 		ops->invalidate_end(inode, offset, nr);
 
+	guest_memfd_folio_clear_private(folio);
+
 	return true;
 }
 
+static void gmem_invalidate_folio(struct folio *folio, size_t offset, size_t len)
+{
+	/* not yet supported */
+	BUG_ON(offset || len != folio_size(folio));
+
+	BUG_ON(!gmem_release_folio(folio, 0));
+}
+
 static const struct address_space_operations gmem_aops = {
 	.dirty_folio = noop_dirty_folio,
 	.migrate_folio = gmem_migrate_folio,
 	.error_remove_folio = gmem_error_folio,
 	.release_folio = gmem_release_folio,
+	.invalidate_folio = gmem_invalidate_folio,
 };
 
 static inline bool guest_memfd_check_ops(const struct guest_memfd_operations *ops)
@@ -241,7 +304,7 @@  struct file *guest_memfd_alloc(const char *name,
 	if (!guest_memfd_check_ops(ops))
 		return ERR_PTR(-EINVAL);
 
-	if (flags)
+	if (flags & ~GUEST_MEMFD_FLAG_NO_DIRECT_MAP)
 		return ERR_PTR(-EINVAL);
 
 	/*