diff mbox series

[RFC,2/4] mm: add PTE_MARKER_GUARD PTE marker

Message ID 03570f8a0ad2a9c0a92cc0c594e375c4185eccdc.1727440966.git.lorenzo.stoakes@oracle.com
State Superseded
Headers show
Series implement lightweight guard pages | expand

Commit Message

Lorenzo Stoakes Sept. 27, 2024, 12:51 p.m. UTC
Add a new PTE marker that results in any access causing the accessing
process to segfault.

This is preferable to PTE_MARKER_POISONED, which results in the same
handling as hardware poisoned memory, and is thus undesirable for cases
where we simply wish to 'soft' poison a range.

This is in preparation for implementing the ability to specify guard pages
at the page table level, i.e. ranges that, when accessed, should cause
process termination.

Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the
function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single
purpose was simply incorrect.

We then reuse the same logic to determine whether a zap should clear a
guard entry - this should only be performed on teardown and never on
MADV_DONTNEED or the like.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 include/linux/mm_inline.h |  2 +-
 include/linux/swapops.h   | 26 ++++++++++++++++++++++++--
 mm/hugetlb.c              |  3 +++
 mm/memory.c               | 18 +++++++++++++++---
 4 files changed, 43 insertions(+), 6 deletions(-)

Comments

Jann Horn Oct. 11, 2024, 6:11 p.m. UTC | #1
On Fri, Sep 27, 2024 at 2:51 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
> Add a new PTE marker that results in any access causing the accessing
> process to segfault.
[...]
>  static inline int is_poisoned_swp_entry(swp_entry_t entry)
> +{
> +       /*
> +        * We treat guard pages as poisoned too as these have the same semantics
> +        * as poisoned ranges, only with different fault handling.
> +        */
> +       return is_pte_marker_entry(entry) &&
> +               (pte_marker_get(entry) &
> +                (PTE_MARKER_POISONED | PTE_MARKER_GUARD));
> +}

This means MADV_FREE will also clear guard PTEs, right?

> diff --git a/mm/memory.c b/mm/memory.c
> index 5c6486e33e63..6c413c3d72fd 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1457,7 +1457,7 @@ static inline bool should_zap_folio(struct zap_details *details,
>         return !folio_test_anon(folio);
>  }
>
> -static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
> +static inline bool zap_drop_markers(struct zap_details *details)
>  {
>         if (!details)
>                 return false;
> @@ -1478,7 +1478,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
>         if (vma_is_anonymous(vma))
>                 return;
>
> -       if (zap_drop_file_uffd_wp(details))
> +       if (zap_drop_markers(details))
>                 return;
>
>         for (;;) {
> @@ -1673,7 +1673,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>                          * drop the marker if explicitly requested.
>                          */
>                         if (!vma_is_anonymous(vma) &&
> -                           !zap_drop_file_uffd_wp(details))
> +                           !zap_drop_markers(details))
> +                               continue;
> +               } else if (is_guard_swp_entry(entry)) {
> +                       /*
> +                        * Ordinary zapping should not remove guard PTE
> +                        * markers. Only do so if we should remove PTE markers
> +                        * in general.
> +                        */
> +                       if (!zap_drop_markers(details))
>                                 continue;

Just a comment: It's nice that the feature is restricted to anonymous
VMAs, otherwise we'd have to figure out here what to do about
unmap_mapping_folio() (which sets ZAP_FLAG_DROP_MARKER together with
details.single_folio)...


>                 } else if (is_hwpoison_entry(entry) ||
>                            is_poisoned_swp_entry(entry)) {
> @@ -4005,6 +4013,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
>         if (marker & PTE_MARKER_POISONED)
>                 return VM_FAULT_HWPOISON;
>
> +       /* Hitting a guard page is always a fatal condition. */
> +       if (marker & PTE_MARKER_GUARD)
> +               return VM_FAULT_SIGSEGV;
> +
>         if (pte_marker_entry_uffd_wp(entry))
>                 return pte_marker_handle_uffd_wp(vmf);
>
> --
> 2.46.2
>
Lorenzo Stoakes Oct. 14, 2024, 10:23 a.m. UTC | #2
On Fri, Oct 11, 2024 at 08:11:32PM +0200, Jann Horn wrote:
> On Fri, Sep 27, 2024 at 2:51 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> > Add a new PTE marker that results in any access causing the accessing
> > process to segfault.
> [...]
> >  static inline int is_poisoned_swp_entry(swp_entry_t entry)
> > +{
> > +       /*
> > +        * We treat guard pages as poisoned too as these have the same semantics
> > +        * as poisoned ranges, only with different fault handling.
> > +        */
> > +       return is_pte_marker_entry(entry) &&
> > +               (pte_marker_get(entry) &
> > +                (PTE_MARKER_POISONED | PTE_MARKER_GUARD));
> > +}
>
> This means MADV_FREE will also clear guard PTEs, right?

Yes, this is expected, it acts like unmap in effect (with a delayed
effect), so we give it the same semantics. The same thing happens with
hardware poisoning.

You can see in the tests what expectations we have with different
operations, we assert there this specific behaviour:

	/* Lazyfree range. */
	ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_FREE), 0);

	/* This should simply clear the poison markers. */
	for (i = 0; i < 10; i++) {
		ASSERT_TRUE(try_read_write_buf(&ptr[i * page_size]));
	}

The tests somewhat self-document expected behaviour.

>
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 5c6486e33e63..6c413c3d72fd 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -1457,7 +1457,7 @@ static inline bool should_zap_folio(struct zap_details *details,
> >         return !folio_test_anon(folio);
> >  }
> >
> > -static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
> > +static inline bool zap_drop_markers(struct zap_details *details)
> >  {
> >         if (!details)
> >                 return false;
> > @@ -1478,7 +1478,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
> >         if (vma_is_anonymous(vma))
> >                 return;
> >
> > -       if (zap_drop_file_uffd_wp(details))
> > +       if (zap_drop_markers(details))
> >                 return;
> >
> >         for (;;) {
> > @@ -1673,7 +1673,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> >                          * drop the marker if explicitly requested.
> >                          */
> >                         if (!vma_is_anonymous(vma) &&
> > -                           !zap_drop_file_uffd_wp(details))
> > +                           !zap_drop_markers(details))
> > +                               continue;
> > +               } else if (is_guard_swp_entry(entry)) {
> > +                       /*
> > +                        * Ordinary zapping should not remove guard PTE
> > +                        * markers. Only do so if we should remove PTE markers
> > +                        * in general.
> > +                        */
> > +                       if (!zap_drop_markers(details))
> >                                 continue;
>
> Just a comment: It's nice that the feature is restricted to anonymous
> VMAs, otherwise we'd have to figure out here what to do about
> unmap_mapping_folio() (which sets ZAP_FLAG_DROP_MARKER together with
> details.single_folio)...

Yes this is not the only issue with file-backed mappings. Readahead being
another, and plenty more.

We will probably look at how we might do this once this patch set lands,
and tackle all of these fun things then...

>
>
> >                 } else if (is_hwpoison_entry(entry) ||
> >                            is_poisoned_swp_entry(entry)) {
> > @@ -4005,6 +4013,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> >         if (marker & PTE_MARKER_POISONED)
> >                 return VM_FAULT_HWPOISON;
> >
> > +       /* Hitting a guard page is always a fatal condition. */
> > +       if (marker & PTE_MARKER_GUARD)
> > +               return VM_FAULT_SIGSEGV;
> > +
> >         if (pte_marker_entry_uffd_wp(entry))
> >                 return pte_marker_handle_uffd_wp(vmf);
> >
> > --
> > 2.46.2
> >
diff mbox series

Patch

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 6f801c7b36e2..0d97a14bf051 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -531,7 +531,7 @@  static inline pte_marker copy_pte_marker(
 {
 	pte_marker srcm = pte_marker_get(entry);
 	/* Always copy error entries. */
-	pte_marker dstm = srcm & PTE_MARKER_POISONED;
+	pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD);
 
 	/* Only copy PTE markers if UFFD register matches. */
 	if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index cb468e418ea1..4d0606df0791 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -426,9 +426,15 @@  typedef unsigned long pte_marker;
  * "Poisoned" here is meant in the very general sense of "future accesses are
  * invalid", instead of referring very specifically to hardware memory errors.
  * This marker is meant to represent any of various different causes of this.
+ *
+ * Note that, when encountered by the faulting logic, PTEs with this marker will
+ * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error
+ * logic.
  */
 #define  PTE_MARKER_POISONED			BIT(1)
-#define  PTE_MARKER_MASK			(BIT(2) - 1)
+/* Indicates that, on fault, this PTE will case a SIGSEGV signal to be sent. */
+#define  PTE_MARKER_GUARD			BIT(2)
+#define  PTE_MARKER_MASK			(BIT(3) - 1)
 
 static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
 {
@@ -461,9 +467,25 @@  static inline swp_entry_t make_poisoned_swp_entry(void)
 }
 
 static inline int is_poisoned_swp_entry(swp_entry_t entry)
+{
+	/*
+	 * We treat guard pages as poisoned too as these have the same semantics
+	 * as poisoned ranges, only with different fault handling.
+	 */
+	return is_pte_marker_entry(entry) &&
+		(pte_marker_get(entry) &
+		 (PTE_MARKER_POISONED | PTE_MARKER_GUARD));
+}
+
+static inline swp_entry_t make_guard_swp_entry(void)
+{
+	return make_pte_marker_entry(PTE_MARKER_GUARD);
+}
+
+static inline int is_guard_swp_entry(swp_entry_t entry)
 {
 	return is_pte_marker_entry(entry) &&
-	    (pte_marker_get(entry) & PTE_MARKER_POISONED);
+		(pte_marker_get(entry) & PTE_MARKER_GUARD);
 }
 
 /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 190fa05635f4..daf69ac46360 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6348,6 +6348,9 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				ret = VM_FAULT_HWPOISON_LARGE |
 				      VM_FAULT_SET_HINDEX(hstate_index(h));
 				goto out_mutex;
+			} else if (marker & PTE_MARKER_GUARD) {
+				ret = VM_FAULT_SIGSEGV;
+				goto out_mutex;
 			}
 		}
 
diff --git a/mm/memory.c b/mm/memory.c
index 5c6486e33e63..6c413c3d72fd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1457,7 +1457,7 @@  static inline bool should_zap_folio(struct zap_details *details,
 	return !folio_test_anon(folio);
 }
 
-static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
+static inline bool zap_drop_markers(struct zap_details *details)
 {
 	if (!details)
 		return false;
@@ -1478,7 +1478,7 @@  zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
 	if (vma_is_anonymous(vma))
 		return;
 
-	if (zap_drop_file_uffd_wp(details))
+	if (zap_drop_markers(details))
 		return;
 
 	for (;;) {
@@ -1673,7 +1673,15 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			 * drop the marker if explicitly requested.
 			 */
 			if (!vma_is_anonymous(vma) &&
-			    !zap_drop_file_uffd_wp(details))
+			    !zap_drop_markers(details))
+				continue;
+		} else if (is_guard_swp_entry(entry)) {
+			/*
+			 * Ordinary zapping should not remove guard PTE
+			 * markers. Only do so if we should remove PTE markers
+			 * in general.
+			 */
+			if (!zap_drop_markers(details))
 				continue;
 		} else if (is_hwpoison_entry(entry) ||
 			   is_poisoned_swp_entry(entry)) {
@@ -4005,6 +4013,10 @@  static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 	if (marker & PTE_MARKER_POISONED)
 		return VM_FAULT_HWPOISON;
 
+	/* Hitting a guard page is always a fatal condition. */
+	if (marker & PTE_MARKER_GUARD)
+		return VM_FAULT_SIGSEGV;
+
 	if (pte_marker_entry_uffd_wp(entry))
 		return pte_marker_handle_uffd_wp(vmf);