diff mbox series

[v3,2/7] mm/munmap: Replace can_modify_mm with can_modify_vma

Message ID 20240817-mseal-depessimize-v3-2-d8d2e037df30@gmail.com
State New
Headers show
Series mm: Optimize mseal checks | expand

Commit Message

Pedro Falcato Aug. 17, 2024, 12:18 a.m. UTC
We were doing an extra mmap tree traversal just to check if the entire
range is modifiable. This can be done when we iterate through the VMAs
instead.

Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
---
 mm/mmap.c | 11 +----------
 mm/vma.c  | 19 ++++++++++++-------
 2 files changed, 13 insertions(+), 17 deletions(-)

Comments

Liam R. Howlett Aug. 19, 2024, 8:22 p.m. UTC | #1
* Pedro Falcato <pedro.falcato@gmail.com> [240816 20:18]:
> We were doing an extra mmap tree traversal just to check if the entire
> range is modifiable. This can be done when we iterate through the VMAs
> instead.
> 
> Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>

Although this has the side effect of potentially splitting the first vma
if it is not mseal()'ed, there really is no risk here since someone
making an invalid call that splits the vma for whatever reason can
modify the bad call to achieve the same split.  That is, this doesn't
help or hinder an attacker.

Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>

> ---
>  mm/mmap.c | 11 +----------
>  mm/vma.c  | 19 ++++++++++++-------
>  2 files changed, 13 insertions(+), 17 deletions(-)
> 
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 3af256bacef3..30ae4cb5cec9 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1740,16 +1740,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  		unsigned long start, unsigned long end, struct list_head *uf,
>  		bool unlock)
>  {
> -	struct mm_struct *mm = vma->vm_mm;
> -
> -	/*
> -	 * Check if memory is sealed, prevent unmapping a sealed VMA.
> -	 * can_modify_mm assumes we have acquired the lock on MM.
> -	 */
> -	if (unlikely(!can_modify_mm(mm, start, end)))
> -		return -EPERM;
> -
> -	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
> +	return do_vmi_align_munmap(vmi, vma, vma->vm_mm, start, end, uf, unlock);
>  }
>  
>  /*
> diff --git a/mm/vma.c b/mm/vma.c
> index 84965f2cd580..5850f7c0949b 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -712,6 +712,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  		if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
>  			goto map_count_exceeded;
>  
> +		/* Don't bother splitting the VMA if we can't unmap it anyway */
> +		if (!can_modify_vma(vma)) {
> +			error = -EPERM;
> +			goto start_split_failed;
> +		}
> +
>  		error = __split_vma(vmi, vma, start, 1);
>  		if (error)
>  			goto start_split_failed;
> @@ -723,6 +729,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	 */
>  	next = vma;
>  	do {
> +		if (!can_modify_vma(next)) {
> +			error = -EPERM;
> +			goto modify_vma_failed;
> +		}
> +
>  		/* Does it split the end? */
>  		if (next->vm_end > end) {
>  			error = __split_vma(vmi, next, end, 0);
> @@ -815,6 +826,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	__mt_destroy(&mt_detach);
>  	return 0;
>  
> +modify_vma_failed:
>  clear_tree_failed:
>  userfaultfd_error:
>  munmap_gather_failed:
> @@ -860,13 +872,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
>  	if (end == start)
>  		return -EINVAL;
>  
> -	/*
> -	 * Check if memory is sealed, prevent unmapping a sealed VMA.
> -	 * can_modify_mm assumes we have acquired the lock on MM.
> -	 */
> -	if (unlikely(!can_modify_mm(mm, start, end)))
> -		return -EPERM;
> -
>  	/* Find the first overlapping VMA */
>  	vma = vma_find(vmi, end);
>  	if (!vma) {
> 
> -- 
> 2.46.0
>
Lorenzo Stoakes Aug. 21, 2024, 6:40 a.m. UTC | #2
On Sat, Aug 17, 2024 at 01:18:29AM GMT, Pedro Falcato wrote:
> We were doing an extra mmap tree traversal just to check if the entire
> range is modifiable. This can be done when we iterate through the VMAs
> instead.
>
> Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
> ---
>  mm/mmap.c | 11 +----------
>  mm/vma.c  | 19 ++++++++++++-------
>  2 files changed, 13 insertions(+), 17 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 3af256bacef3..30ae4cb5cec9 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1740,16 +1740,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  		unsigned long start, unsigned long end, struct list_head *uf,
>  		bool unlock)
>  {
> -	struct mm_struct *mm = vma->vm_mm;
> -
> -	/*
> -	 * Check if memory is sealed, prevent unmapping a sealed VMA.
> -	 * can_modify_mm assumes we have acquired the lock on MM.
> -	 */
> -	if (unlikely(!can_modify_mm(mm, start, end)))
> -		return -EPERM;
> -
> -	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
> +	return do_vmi_align_munmap(vmi, vma, vma->vm_mm, start, end, uf, unlock);
>  }
>

Oh I like this. Want more mm/mmap.c stuff to look like this, abstracting
actual functionality to mm/vma.c...

>  /*
> diff --git a/mm/vma.c b/mm/vma.c
> index 84965f2cd580..5850f7c0949b 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -712,6 +712,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  		if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
>  			goto map_count_exceeded;
>
> +		/* Don't bother splitting the VMA if we can't unmap it anyway */
> +		if (!can_modify_vma(vma)) {
> +			error = -EPERM;
> +			goto start_split_failed;
> +		}
> +
>  		error = __split_vma(vmi, vma, start, 1);
>  		if (error)
>  			goto start_split_failed;
> @@ -723,6 +729,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	 */
>  	next = vma;
>  	do {
> +		if (!can_modify_vma(next)) {
> +			error = -EPERM;
> +			goto modify_vma_failed;
> +		}
> +
>  		/* Does it split the end? */
>  		if (next->vm_end > end) {
>  			error = __split_vma(vmi, next, end, 0);
> @@ -815,6 +826,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	__mt_destroy(&mt_detach);
>  	return 0;
>
> +modify_vma_failed:
>  clear_tree_failed:
>  userfaultfd_error:
>  munmap_gather_failed:
> @@ -860,13 +872,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
>  	if (end == start)
>  		return -EINVAL;
>
> -	/*
> -	 * Check if memory is sealed, prevent unmapping a sealed VMA.
> -	 * can_modify_mm assumes we have acquired the lock on MM.
> -	 */
> -	if (unlikely(!can_modify_mm(mm, start, end)))
> -		return -EPERM;
> -

This means we will arch_unmap() first, before realising we can't unmap,
however there are a number of other error conditions that would cause a
similar outcome in do_vmi_align_munmap() so I don't think that's a problem.

>  	/* Find the first overlapping VMA */
>  	vma = vma_find(vmi, end);
>  	if (!vma) {
>
> --
> 2.46.0
>

LGTM, Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Jeff Xu Aug. 21, 2024, 4:15 p.m. UTC | #3
On Fri, Aug 16, 2024 at 5:18 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
>
> We were doing an extra mmap tree traversal just to check if the entire
> range is modifiable. This can be done when we iterate through the VMAs
> instead.
>
> Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
> ---
>  mm/mmap.c | 11 +----------
>  mm/vma.c  | 19 ++++++++++++-------
>  2 files changed, 13 insertions(+), 17 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 3af256bacef3..30ae4cb5cec9 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1740,16 +1740,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>                 unsigned long start, unsigned long end, struct list_head *uf,
>                 bool unlock)
>  {
> -       struct mm_struct *mm = vma->vm_mm;
> -
> -       /*
> -        * Check if memory is sealed, prevent unmapping a sealed VMA.
> -        * can_modify_mm assumes we have acquired the lock on MM.
> -        */
> -       if (unlikely(!can_modify_mm(mm, start, end)))
> -               return -EPERM;
Another approach to improve perf  is to clone the vmi (since it
already point to the first vma), and pass the cloned vmi/vma into
can_modify_mm check, that will remove the cost of re-finding the first
VMA.

The can_modify_mm then continues from cloned VMI/vma till the end of
address range, there will be some perf cost there.  However,  most
address ranges in the real world are within a single VMA,  in
practice, the perf cost is the same as checking the single VMA, 99.9%
case.

This will help preserve the nice sealing feature (if one of the vma is
sealed, the entire address range is not modified)

> -
> -       return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
> +       return do_vmi_align_munmap(vmi, vma, vma->vm_mm, start, end, uf, unlock);
>  }
>
>  /*
> diff --git a/mm/vma.c b/mm/vma.c
> index 84965f2cd580..5850f7c0949b 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -712,6 +712,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>                 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
>                         goto map_count_exceeded;
>
> +               /* Don't bother splitting the VMA if we can't unmap it anyway */
> +               if (!can_modify_vma(vma)) {
> +                       error = -EPERM;
> +                       goto start_split_failed;
> +               }
> +
>                 error = __split_vma(vmi, vma, start, 1);
>                 if (error)
>                         goto start_split_failed;
> @@ -723,6 +729,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>          */
>         next = vma;
>         do {
> +               if (!can_modify_vma(next)) {
> +                       error = -EPERM;
> +                       goto modify_vma_failed;
> +               }
> +
>                 /* Does it split the end? */
>                 if (next->vm_end > end) {
>                         error = __split_vma(vmi, next, end, 0);
> @@ -815,6 +826,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>         __mt_destroy(&mt_detach);
>         return 0;
>
> +modify_vma_failed:
>  clear_tree_failed:
>  userfaultfd_error:
>  munmap_gather_failed:
> @@ -860,13 +872,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
>         if (end == start)
>                 return -EINVAL;
>
> -       /*
> -        * Check if memory is sealed, prevent unmapping a sealed VMA.
> -        * can_modify_mm assumes we have acquired the lock on MM.
> -        */
> -       if (unlikely(!can_modify_mm(mm, start, end)))
> -               return -EPERM;
> -
>         /* Find the first overlapping VMA */
>         vma = vma_find(vmi, end);
>         if (!vma) {
>
> --
> 2.46.0
>
Pedro Falcato Aug. 21, 2024, 4:23 p.m. UTC | #4
On Wed, Aug 21, 2024 at 5:16 PM Jeff Xu <jeffxu@chromium.org> wrote:
>
> On Fri, Aug 16, 2024 at 5:18 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
> >
> > We were doing an extra mmap tree traversal just to check if the entire
> > range is modifiable. This can be done when we iterate through the VMAs
> > instead.
> >
> > Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
> > ---
> >  mm/mmap.c | 11 +----------
> >  mm/vma.c  | 19 ++++++++++++-------
> >  2 files changed, 13 insertions(+), 17 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 3af256bacef3..30ae4cb5cec9 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1740,16 +1740,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >                 unsigned long start, unsigned long end, struct list_head *uf,
> >                 bool unlock)
> >  {
> > -       struct mm_struct *mm = vma->vm_mm;
> > -
> > -       /*
> > -        * Check if memory is sealed, prevent unmapping a sealed VMA.
> > -        * can_modify_mm assumes we have acquired the lock on MM.
> > -        */
> > -       if (unlikely(!can_modify_mm(mm, start, end)))
> > -               return -EPERM;
> Another approach to improve perf  is to clone the vmi (since it
> already point to the first vma), and pass the cloned vmi/vma into
> can_modify_mm check, that will remove the cost of re-finding the first
> VMA.
>
> The can_modify_mm then continues from cloned VMI/vma till the end of
> address range, there will be some perf cost there.  However,  most
> address ranges in the real world are within a single VMA,  in
> practice, the perf cost is the same as checking the single VMA, 99.9%
> case.
>
> This will help preserve the nice sealing feature (if one of the vma is
> sealed, the entire address range is not modified)

Please drop it. No one wants to preserve this. Everyone is in sync
when it comes to the solution except you.
Lorenzo Stoakes Aug. 21, 2024, 5:02 p.m. UTC | #5
On Wed, Aug 21, 2024 at 09:33:06AM GMT, Jeff Xu wrote:
> On Wed, Aug 21, 2024 at 9:24 AM Pedro Falcato <pedro.falcato@gmail.com> wrote:
> >
> > On Wed, Aug 21, 2024 at 5:16 PM Jeff Xu <jeffxu@chromium.org> wrote:
> > >
> > > On Fri, Aug 16, 2024 at 5:18 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
> > > >
> > > > We were doing an extra mmap tree traversal just to check if the entire
> > > > range is modifiable. This can be done when we iterate through the VMAs
> > > > instead.
> > > >
> > > > Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
> > > > ---
> > > >  mm/mmap.c | 11 +----------
> > > >  mm/vma.c  | 19 ++++++++++++-------
> > > >  2 files changed, 13 insertions(+), 17 deletions(-)
> > > >
> > > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > > index 3af256bacef3..30ae4cb5cec9 100644
> > > > --- a/mm/mmap.c
> > > > +++ b/mm/mmap.c
> > > > @@ -1740,16 +1740,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > >                 unsigned long start, unsigned long end, struct list_head *uf,
> > > >                 bool unlock)
> > > >  {
> > > > -       struct mm_struct *mm = vma->vm_mm;
> > > > -
> > > > -       /*
> > > > -        * Check if memory is sealed, prevent unmapping a sealed VMA.
> > > > -        * can_modify_mm assumes we have acquired the lock on MM.
> > > > -        */
> > > > -       if (unlikely(!can_modify_mm(mm, start, end)))
> > > > -               return -EPERM;
> > > Another approach to improve perf  is to clone the vmi (since it
> > > already point to the first vma), and pass the cloned vmi/vma into
> > > can_modify_mm check, that will remove the cost of re-finding the first
> > > VMA.
> > >
> > > The can_modify_mm then continues from cloned VMI/vma till the end of
> > > address range, there will be some perf cost there.  However,  most
> > > address ranges in the real world are within a single VMA,  in
> > > practice, the perf cost is the same as checking the single VMA, 99.9%
> > > case.
> > >
> > > This will help preserve the nice sealing feature (if one of the vma is
> > > sealed, the entire address range is not modified)
> >
> > Please drop it. No one wants to preserve this. Everyone is in sync
> > when it comes to the solution except you.
>
> Still, this is another option that will very likely address the perf issue.

Nack to your approach. Feel free to send a follow up series replacing
Pedro's with yours for review if you feel differently, and stop stalling
things. Thanks.

>
> -Jeff
>
> >
> > --
> > Pedro
Liam R. Howlett Aug. 21, 2024, 6:25 p.m. UTC | #6
* Jeff Xu <jeffxu@chromium.org> [240821 12:33]:
> On Wed, Aug 21, 2024 at 9:24 AM Pedro Falcato <pedro.falcato@gmail.com> wrote:
> >
> > On Wed, Aug 21, 2024 at 5:16 PM Jeff Xu <jeffxu@chromium.org> wrote:
> > >
> > > On Fri, Aug 16, 2024 at 5:18 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
> > > >
> > > > We were doing an extra mmap tree traversal just to check if the entire
> > > > range is modifiable. This can be done when we iterate through the VMAs
> > > > instead.
> > > >
> > > > Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
> > > > ---
> > > >  mm/mmap.c | 11 +----------
> > > >  mm/vma.c  | 19 ++++++++++++-------
> > > >  2 files changed, 13 insertions(+), 17 deletions(-)
> > > >
> > > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > > index 3af256bacef3..30ae4cb5cec9 100644
> > > > --- a/mm/mmap.c
> > > > +++ b/mm/mmap.c
> > > > @@ -1740,16 +1740,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > >                 unsigned long start, unsigned long end, struct list_head *uf,
> > > >                 bool unlock)
> > > >  {
> > > > -       struct mm_struct *mm = vma->vm_mm;
> > > > -
> > > > -       /*
> > > > -        * Check if memory is sealed, prevent unmapping a sealed VMA.
> > > > -        * can_modify_mm assumes we have acquired the lock on MM.
> > > > -        */
> > > > -       if (unlikely(!can_modify_mm(mm, start, end)))
> > > > -               return -EPERM;
> > > Another approach to improve perf  is to clone the vmi (since it
> > > already point to the first vma), and pass the cloned vmi/vma into
> > > can_modify_mm check, that will remove the cost of re-finding the first
> > > VMA.
> > >
> > > The can_modify_mm then continues from cloned VMI/vma till the end of
> > > address range, there will be some perf cost there.  However,  most
> > > address ranges in the real world are within a single VMA,  in
> > > practice, the perf cost is the same as checking the single VMA, 99.9%
> > > case.
> > >
> > > This will help preserve the nice sealing feature (if one of the vma is
> > > sealed, the entire address range is not modified)
> >
> > Please drop it. No one wants to preserve this. Everyone is in sync
> > when it comes to the solution except you.
> 
> Still, this is another option that will very likely address the perf issue.

The cost of cloning the vmi is a memory copy, while the cost of not
cloning the vmi is a re-walk of the tree.  Neither of these are free.

Both can be avoided by checking the vma flag during the existing loop,
which is what is done in this patch set.  This is obviously lower cost
of either of the above options since they do extra work and still have
to check the vma flag(s).

I think you are confusing the behaviour of the munmap() call when you
state 'partial success' with a potential split operation that may happen
prior to aborting a munmap() call.

What could happen in the failure scenario is that you'd end up with two
vmas instead of one mapping a particular area - but the mseal flag is
checked prior to allowing a split to happen, so it'll only split
non-mseal()'ed vmas.

So what mseal() used to avoid is a call that could potentially split a
vma that isn't mseal()'ed, while this patch will allow it to be split.
This is how it has been for a very long time and it's okay.

Considering how this affects the security model of mseal(), it means the
attacker could still accomplish the same feat of splitting that first
vma by altering the call to munmap() to avoid an mseal()'ed vma, so
there isn't much lost or gained here security wise - if any.

Thanks,
Liam
diff mbox series

Patch

diff --git a/mm/mmap.c b/mm/mmap.c
index 3af256bacef3..30ae4cb5cec9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1740,16 +1740,7 @@  int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		unsigned long start, unsigned long end, struct list_head *uf,
 		bool unlock)
 {
-	struct mm_struct *mm = vma->vm_mm;
-
-	/*
-	 * Check if memory is sealed, prevent unmapping a sealed VMA.
-	 * can_modify_mm assumes we have acquired the lock on MM.
-	 */
-	if (unlikely(!can_modify_mm(mm, start, end)))
-		return -EPERM;
-
-	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
+	return do_vmi_align_munmap(vmi, vma, vma->vm_mm, start, end, uf, unlock);
 }
 
 /*
diff --git a/mm/vma.c b/mm/vma.c
index 84965f2cd580..5850f7c0949b 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -712,6 +712,12 @@  do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
 			goto map_count_exceeded;
 
+		/* Don't bother splitting the VMA if we can't unmap it anyway */
+		if (!can_modify_vma(vma)) {
+			error = -EPERM;
+			goto start_split_failed;
+		}
+
 		error = __split_vma(vmi, vma, start, 1);
 		if (error)
 			goto start_split_failed;
@@ -723,6 +729,11 @@  do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	 */
 	next = vma;
 	do {
+		if (!can_modify_vma(next)) {
+			error = -EPERM;
+			goto modify_vma_failed;
+		}
+
 		/* Does it split the end? */
 		if (next->vm_end > end) {
 			error = __split_vma(vmi, next, end, 0);
@@ -815,6 +826,7 @@  do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	__mt_destroy(&mt_detach);
 	return 0;
 
+modify_vma_failed:
 clear_tree_failed:
 userfaultfd_error:
 munmap_gather_failed:
@@ -860,13 +872,6 @@  int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 	if (end == start)
 		return -EINVAL;
 
-	/*
-	 * Check if memory is sealed, prevent unmapping a sealed VMA.
-	 * can_modify_mm assumes we have acquired the lock on MM.
-	 */
-	if (unlikely(!can_modify_mm(mm, start, end)))
-		return -EPERM;
-
 	/* Find the first overlapping VMA */
 	vma = vma_find(vmi, end);
 	if (!vma) {