Message ID | 20210302000133.272579-2-axelrasmussen@google.com |
---|---|
State | Superseded |
Headers | show |
Series | userfaultfd: support minor fault handling for shmem | expand |
On 1 Mar 2021, at 19:01, Axel Rasmussen wrote: > Modify the userfaultfd register API to allow registering shmem VMAs in > minor mode. Modify the shmem mcopy implementation to support > UFFDIO_CONTINUE in order to resolve such faults. > > Combine the shmem mcopy handler functions into a single > shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how > the hugetlbfs implementation is structured, and lets us remove a good > chunk of boilerplate. > > Signed-off-by: Axel Rasmussen <axelrasmussen@google.com> > --- > fs/userfaultfd.c | 6 +-- > include/linux/shmem_fs.h | 26 ++++----- > include/uapi/linux/userfaultfd.h | 4 +- > mm/memory.c | 8 +-- > mm/shmem.c | 92 +++++++++++++++----------------- > mm/userfaultfd.c | 27 +++++----- > 6 files changed, 79 insertions(+), 84 deletions(-) > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > index 14f92285d04f..9f3b8684cf3c 100644 > --- a/fs/userfaultfd.c > +++ b/fs/userfaultfd.c > @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, > } > > if (vm_flags & VM_UFFD_MINOR) { > - /* FIXME: Add minor fault interception for shmem. 
*/ > - if (!is_vm_hugetlb_page(vma)) > + if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) > return false; > } > > @@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, > /* report all available features and ioctls to userland */ > uffdio_api.features = UFFD_API_FEATURES; > #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR > - uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS; > + uffdio_api.features &= > + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); > #endif > uffdio_api.ioctls = UFFD_API_IOCTLS; > ret = -EFAULT; > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h > index d82b6f396588..f0919c3722e7 100644 > --- a/include/linux/shmem_fs.h > +++ b/include/linux/shmem_fs.h > @@ -9,6 +9,7 @@ > #include <linux/percpu_counter.h> > #include <linux/xattr.h> > #include <linux/fs_parser.h> > +#include <linux/userfaultfd_k.h> > > /* inode in-kernel data */ > > @@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file) > extern bool shmem_charge(struct inode *inode, long pages); > extern void shmem_uncharge(struct inode *inode, long pages); > > +#ifdef CONFIG_USERFAULTFD > #ifdef CONFIG_SHMEM > -extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, > - struct vm_area_struct *dst_vma, > - unsigned long dst_addr, > - unsigned long src_addr, > - struct page **pagep); > -extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, > - pmd_t *dst_pmd, > - struct vm_area_struct *dst_vma, > - unsigned long dst_addr); > -#else > -#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ > - src_addr, pagep) ({ BUG(); 0; }) > -#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \ > - dst_addr) ({ BUG(); 0; }) > -#endif > +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, > + struct vm_area_struct *dst_vma, > + unsigned long dst_addr, unsigned long src_addr, > + enum mcopy_atomic_mode mode, struct page **pagep); > +#else /* !CONFIG_SHMEM */ > +#define 
shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \ > + src_addr, mode, pagep) ({ BUG(); 0; }) > +#endif /* CONFIG_SHMEM */ > +#endif /* CONFIG_USERFAULTFD */ > > #endif > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h > index bafbeb1a2624..47d9790d863d 100644 > --- a/include/uapi/linux/userfaultfd.h > +++ b/include/uapi/linux/userfaultfd.h > @@ -31,7 +31,8 @@ > UFFD_FEATURE_MISSING_SHMEM | \ > UFFD_FEATURE_SIGBUS | \ > UFFD_FEATURE_THREAD_ID | \ > - UFFD_FEATURE_MINOR_HUGETLBFS) > + UFFD_FEATURE_MINOR_HUGETLBFS | \ > + UFFD_FEATURE_MINOR_SHMEM) > #define UFFD_API_IOCTLS \ > ((__u64)1 << _UFFDIO_REGISTER | \ > (__u64)1 << _UFFDIO_UNREGISTER | \ > @@ -196,6 +197,7 @@ struct uffdio_api { > #define UFFD_FEATURE_SIGBUS (1<<7) > #define UFFD_FEATURE_THREAD_ID (1<<8) > #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) > +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) > __u64 features; > > __u64 ioctls; > diff --git a/mm/memory.c b/mm/memory.c > index c8e357627318..a1e5ff55027e 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) > * something). > */ > if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { > - ret = do_fault_around(vmf); > - if (ret) > - return ret; > + if (likely(!userfaultfd_minor(vmf->vma))) { > + ret = do_fault_around(vmf); > + if (ret) > + return ret; > + } > } > > ret = __do_fault(vmf); > diff --git a/mm/shmem.c b/mm/shmem.c > index b2db4ed0fbc7..6f81259fabb3 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt; > #include <linux/syscalls.h> > #include <linux/fcntl.h> > #include <uapi/linux/memfd.h> > -#include <linux/userfaultfd_k.h> > #include <linux/rmap.h> > #include <linux/uuid.h> > > @@ -1785,8 +1784,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, > * vm. 
If we swap it in we mark it dirty since we also free the swap > * entry since a page cannot live in both the swap and page cache. > * > - * vmf and fault_type are only supplied by shmem_fault: > - * otherwise they are NULL. > + * vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they > + * are NULL. > */ > static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, > struct page **pagep, enum sgp_type sgp, gfp_t gfp, > @@ -1830,6 +1829,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, > return error; > } > > + if (page && vma && userfaultfd_minor(vma)) { > + unlock_page(page); > + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); > + return 0; > + } > + > if (page) > hindex = page->index; > if (page && sgp == SGP_WRITE) > @@ -2354,14 +2359,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode > return inode; > } > > -static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, > - pmd_t *dst_pmd, > - struct vm_area_struct *dst_vma, > - unsigned long dst_addr, > - unsigned long src_addr, > - bool zeropage, > - struct page **pagep) > +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, > + struct vm_area_struct *dst_vma, > + unsigned long dst_addr, unsigned long src_addr, > + enum mcopy_atomic_mode mode, struct page **pagep) > { > + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); > struct inode *inode = file_inode(dst_vma->vm_file); > struct shmem_inode_info *info = SHMEM_I(inode); > struct address_space *mapping = inode->i_mapping; > @@ -2378,12 +2381,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, > if (!shmem_inode_acct_block(inode, 1)) > goto out; > > - if (!*pagep) { > + if (is_continue) { > + ret = -EFAULT; > + page = find_lock_page(mapping, pgoff); > + if (!page) > + goto out_unacct_blocks; > + } else if (!*pagep) { > page = shmem_alloc_page(gfp, info, pgoff); > if (!page) > goto out_unacct_blocks; > > - if (!zeropage) { /* mcopy_atomic */ > + 
if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */ > page_kaddr = kmap_atomic(page); > ret = copy_from_user(page_kaddr, > (const void __user *)src_addr, Hi Axel, the definition of shmem_mcopy_atomic_pte in mm/shmem.c is not guarded by CONFIG_USERFAULTFD, so it causes compilation errors when CONFIG_USERFAULTFD is not set, because its signature uses enum mcopy_atomic_mode. — Best Regards, Yan Zi
On Tue, Mar 9, 2021 at 11:52 AM Zi Yan <ziy@nvidia.com> wrote: > > On 1 Mar 2021, at 19:01, Axel Rasmussen wrote: > > > Modify the userfaultfd register API to allow registering shmem VMAs in > > minor mode. Modify the shmem mcopy implementation to support > > UFFDIO_CONTINUE in order to resolve such faults. > > > > Combine the shmem mcopy handler functions into a single > > shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how > > the hugetlbfs implementation is structured, and lets us remove a good > > chunk of boilerplate. > > > > Signed-off-by: Axel Rasmussen <axelrasmussen@google.com> > > --- > > fs/userfaultfd.c | 6 +-- > > include/linux/shmem_fs.h | 26 ++++----- > > include/uapi/linux/userfaultfd.h | 4 +- > > mm/memory.c | 8 +-- > > mm/shmem.c | 92 +++++++++++++++----------------- > > mm/userfaultfd.c | 27 +++++----- > > 6 files changed, 79 insertions(+), 84 deletions(-) > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > > index 14f92285d04f..9f3b8684cf3c 100644 > > --- a/fs/userfaultfd.c > > +++ b/fs/userfaultfd.c > > @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, > > } > > > > if (vm_flags & VM_UFFD_MINOR) { > > - /* FIXME: Add minor fault interception for shmem. 
*/ > > - if (!is_vm_hugetlb_page(vma)) > > + if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) > > return false; > > } > > > > @@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, > > /* report all available features and ioctls to userland */ > > uffdio_api.features = UFFD_API_FEATURES; > > #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR > > - uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS; > > + uffdio_api.features &= > > + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); > > #endif > > uffdio_api.ioctls = UFFD_API_IOCTLS; > > ret = -EFAULT; > > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h > > index d82b6f396588..f0919c3722e7 100644 > > --- a/include/linux/shmem_fs.h > > +++ b/include/linux/shmem_fs.h > > @@ -9,6 +9,7 @@ > > #include <linux/percpu_counter.h> > > #include <linux/xattr.h> > > #include <linux/fs_parser.h> > > +#include <linux/userfaultfd_k.h> > > > > /* inode in-kernel data */ > > > > @@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file) > > extern bool shmem_charge(struct inode *inode, long pages); > > extern void shmem_uncharge(struct inode *inode, long pages); > > > > +#ifdef CONFIG_USERFAULTFD > > #ifdef CONFIG_SHMEM > > -extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, > > - struct vm_area_struct *dst_vma, > > - unsigned long dst_addr, > > - unsigned long src_addr, > > - struct page **pagep); > > -extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, > > - pmd_t *dst_pmd, > > - struct vm_area_struct *dst_vma, > > - unsigned long dst_addr); > > -#else > > -#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ > > - src_addr, pagep) ({ BUG(); 0; }) > > -#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \ > > - dst_addr) ({ BUG(); 0; }) > > -#endif > > +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, > > + struct vm_area_struct *dst_vma, > > + unsigned long dst_addr, unsigned long src_addr, > > 
+ enum mcopy_atomic_mode mode, struct page **pagep); > > +#else /* !CONFIG_SHMEM */ > > +#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \ > > + src_addr, mode, pagep) ({ BUG(); 0; }) > > +#endif /* CONFIG_SHMEM */ > > +#endif /* CONFIG_USERFAULTFD */ > > > > #endif > > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h > > index bafbeb1a2624..47d9790d863d 100644 > > --- a/include/uapi/linux/userfaultfd.h > > +++ b/include/uapi/linux/userfaultfd.h > > @@ -31,7 +31,8 @@ > > UFFD_FEATURE_MISSING_SHMEM | \ > > UFFD_FEATURE_SIGBUS | \ > > UFFD_FEATURE_THREAD_ID | \ > > - UFFD_FEATURE_MINOR_HUGETLBFS) > > + UFFD_FEATURE_MINOR_HUGETLBFS | \ > > + UFFD_FEATURE_MINOR_SHMEM) > > #define UFFD_API_IOCTLS \ > > ((__u64)1 << _UFFDIO_REGISTER | \ > > (__u64)1 << _UFFDIO_UNREGISTER | \ > > @@ -196,6 +197,7 @@ struct uffdio_api { > > #define UFFD_FEATURE_SIGBUS (1<<7) > > #define UFFD_FEATURE_THREAD_ID (1<<8) > > #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) > > +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) > > __u64 features; > > > > __u64 ioctls; > > diff --git a/mm/memory.c b/mm/memory.c > > index c8e357627318..a1e5ff55027e 100644 > > --- a/mm/memory.c > > +++ b/mm/memory.c > > @@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) > > * something). 
> > */ > > if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { > > - ret = do_fault_around(vmf); > > - if (ret) > > - return ret; > > + if (likely(!userfaultfd_minor(vmf->vma))) { > > + ret = do_fault_around(vmf); > > + if (ret) > > + return ret; > > + } > > } > > > > ret = __do_fault(vmf); > > diff --git a/mm/shmem.c b/mm/shmem.c > > index b2db4ed0fbc7..6f81259fabb3 100644 > > --- a/mm/shmem.c > > +++ b/mm/shmem.c > > @@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt; > > #include <linux/syscalls.h> > > #include <linux/fcntl.h> > > #include <uapi/linux/memfd.h> > > -#include <linux/userfaultfd_k.h> > > #include <linux/rmap.h> > > #include <linux/uuid.h> > > > > @@ -1785,8 +1784,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, > > * vm. If we swap it in we mark it dirty since we also free the swap > > * entry since a page cannot live in both the swap and page cache. > > * > > - * vmf and fault_type are only supplied by shmem_fault: > > - * otherwise they are NULL. > > + * vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they > > + * are NULL. 
> > */ > > static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, > > struct page **pagep, enum sgp_type sgp, gfp_t gfp, > > @@ -1830,6 +1829,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, > > return error; > > } > > > > + if (page && vma && userfaultfd_minor(vma)) { > > + unlock_page(page); > > + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); > > + return 0; > > + } > > + > > if (page) > > hindex = page->index; > > if (page && sgp == SGP_WRITE) > > @@ -2354,14 +2359,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode > > return inode; > > } > > > > -static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, > > - pmd_t *dst_pmd, > > - struct vm_area_struct *dst_vma, > > - unsigned long dst_addr, > > - unsigned long src_addr, > > - bool zeropage, > > - struct page **pagep) > > +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, > > + struct vm_area_struct *dst_vma, > > + unsigned long dst_addr, unsigned long src_addr, > > + enum mcopy_atomic_mode mode, struct page **pagep) > > { > > + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); > > struct inode *inode = file_inode(dst_vma->vm_file); > > struct shmem_inode_info *info = SHMEM_I(inode); > > struct address_space *mapping = inode->i_mapping; > > @@ -2378,12 +2381,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, > > if (!shmem_inode_acct_block(inode, 1)) > > goto out; > > > > - if (!*pagep) { > > + if (is_continue) { > > + ret = -EFAULT; > > + page = find_lock_page(mapping, pgoff); > > + if (!page) > > + goto out_unacct_blocks; > > + } else if (!*pagep) { > > page = shmem_alloc_page(gfp, info, pgoff); > > if (!page) > > goto out_unacct_blocks; > > > > - if (!zeropage) { /* mcopy_atomic */ > > + if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */ > > page_kaddr = kmap_atomic(page); > > ret = copy_from_user(page_kaddr, > > (const void __user *)src_addr, > > Hi Axel, > > shmem_mcopy_atomic_pte is not 
guarded by CONFIG_USERFAULTFD, thus it is > causing compilation errors due to the use of enum mcopy_atomic_mode mode, > when CONFIG_USERFAULTFD is not set. Ah, my apologies, I guarded it in the header but forgot to do so in shmem.c. I'll send an updated patch today. > > > — > Best Regards, > Yan Zi
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 14f92285d04f..9f3b8684cf3c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, } if (vm_flags & VM_UFFD_MINOR) { - /* FIXME: Add minor fault interception for shmem. */ - if (!is_vm_hugetlb_page(vma)) + if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) return false; } @@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR - uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS; + uffdio_api.features &= + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d82b6f396588..f0919c3722e7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -9,6 +9,7 @@ #include <linux/percpu_counter.h> #include <linux/xattr.h> #include <linux/fs_parser.h> +#include <linux/userfaultfd_k.h> /* inode in-kernel data */ @@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); +#ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM -extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep); -extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr); -#else -#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ - src_addr, pagep) ({ BUG(); 0; }) -#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \ - dst_addr) ({ BUG(); 0; }) -#endif +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, 
pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep); +#else /* !CONFIG_SHMEM */ +#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \ + src_addr, mode, pagep) ({ BUG(); 0; }) +#endif /* CONFIG_SHMEM */ +#endif /* CONFIG_USERFAULTFD */ #endif diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index bafbeb1a2624..47d9790d863d 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -31,7 +31,8 @@ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \ - UFFD_FEATURE_MINOR_HUGETLBFS) + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -196,6 +197,7 @@ struct uffdio_api { #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) __u64 features; __u64 ioctls; diff --git a/mm/memory.c b/mm/memory.c index c8e357627318..a1e5ff55027e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(vmf); - if (ret) - return ret; + if (likely(!userfaultfd_minor(vmf->vma))) { + ret = do_fault_around(vmf); + if (ret) + return ret; + } } ret = __do_fault(vmf); diff --git a/mm/shmem.c b/mm/shmem.c index b2db4ed0fbc7..6f81259fabb3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt; #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> -#include <linux/userfaultfd_k.h> #include <linux/rmap.h> #include <linux/uuid.h> @@ -1785,8 +1784,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, * vm. 
If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * vmf and fault_type are only supplied by shmem_fault: - * otherwise they are NULL. + * vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they + * are NULL. */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, @@ -1830,6 +1829,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return error; } + if (page && vma && userfaultfd_minor(vma)) { + unlock_page(page); + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); + return 0; + } + if (page) hindex = page->index; if (page && sgp == SGP_WRITE) @@ -2354,14 +2359,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; } -static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - bool zeropage, - struct page **pagep) +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; @@ -2378,12 +2381,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!shmem_inode_acct_block(inode, 1)) goto out; - if (!*pagep) { + if (is_continue) { + ret = -EFAULT; + page = find_lock_page(mapping, pgoff); + if (!page) + goto out_unacct_blocks; + } else if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); if (!page) goto out_unacct_blocks; - if (!zeropage) { /* mcopy_atomic */ + if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */ page_kaddr = kmap_atomic(page); ret = copy_from_user(page_kaddr, 
(const void __user *)src_addr, @@ -2397,7 +2405,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, /* don't free the page */ return -ENOENT; } - } else { /* mfill_zeropage_atomic */ + } else { /* zeropage */ clear_highpage(page); } } else { @@ -2405,10 +2413,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; } - VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); - __SetPageLocked(page); - __SetPageSwapBacked(page); - __SetPageUptodate(page); + if (!is_continue) { + VM_BUG_ON(PageSwapBacked(page)); + VM_BUG_ON(PageLocked(page)); + __SetPageLocked(page); + __SetPageSwapBacked(page); + __SetPageUptodate(page); + } ret = -EFAULT; offset = linear_page_index(dst_vma, dst_addr); @@ -2416,10 +2427,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(offset >= max_off)) goto out_release; - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK, dst_mm); - if (ret) - goto out_release; + /* If page wasn't already in the page cache, add it. 
*/ + if (!is_continue) { + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, + gfp & GFP_RECLAIM_MASK, dst_mm); + if (ret) + goto out_release; + } _dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE) @@ -2446,13 +2460,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!pte_none(*dst_pte)) goto out_release_unlock; - lru_cache_add(page); + if (!is_continue) { + lru_cache_add(page); - spin_lock_irq(&info->lock); - info->alloced++; - inode->i_blocks += BLOCKS_PER_PAGE; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + spin_lock_irq(&info->lock); + info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + } inc_mm_counter(dst_mm, mm_counter_file(page)); page_add_file_rmap(page, false); @@ -2477,28 +2493,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, goto out; } -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep) -{ - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, false, pagep); -} - -int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) -{ - struct page *page = NULL; - - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, 0, true, &page); -} - #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ce6cb4760d2c..6cd7ab531aec 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -415,7 +415,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, unsigned long dst_addr, unsigned long src_addr, struct page **page, - bool zeropage, + enum mcopy_atomic_mode mode, bool wp_copy) { ssize_t err; @@ -431,22 
+431,24 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, * and not in the radix tree. */ if (!(dst_vma->vm_flags & VM_SHARED)) { - if (!zeropage) + switch (mode) { + case MCOPY_ATOMIC_NORMAL: err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, page, wp_copy); - else + break; + case MCOPY_ATOMIC_ZEROPAGE: err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr); + break; + case MCOPY_ATOMIC_CONTINUE: + err = -EINVAL; + break; + } } else { VM_WARN_ON_ONCE(wp_copy); - if (!zeropage) - err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, - dst_vma, dst_addr, - src_addr, page); - else - err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd, - dst_vma, dst_addr); + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + src_addr, mode, page); } return err; @@ -467,7 +469,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, long copied; struct page *page; bool wp_copy; - bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE); /* * Sanitize the command parameters: @@ -530,7 +531,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; - if (mcopy_mode == MCOPY_ATOMIC_CONTINUE) + if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE) goto out_unlock; /* @@ -578,7 +579,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, BUG_ON(pmd_trans_huge(*dst_pmd)); err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - src_addr, &page, zeropage, wp_copy); + src_addr, &page, mcopy_mode, wp_copy); cond_resched(); if (unlikely(err == -ENOENT)) {
Modify the userfaultfd register API to allow registering shmem VMAs in minor mode. Modify the shmem mcopy implementation to support UFFDIO_CONTINUE in order to resolve such faults. Combine the shmem mcopy handler functions into a single shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how the hugetlbfs implementation is structured, and lets us remove a good chunk of boilerplate. Signed-off-by: Axel Rasmussen <axelrasmussen@google.com> --- fs/userfaultfd.c | 6 +-- include/linux/shmem_fs.h | 26 ++++----- include/uapi/linux/userfaultfd.h | 4 +- mm/memory.c | 8 +-- mm/shmem.c | 92 +++++++++++++++----------------- mm/userfaultfd.c | 27 +++++----- 6 files changed, 79 insertions(+), 84 deletions(-)