Move the userfaultfd core over to the new vm_uffd_ops API.  After this
change, existing file systems will start to use the new API for
userfault operations.

While at it, move vma_can_userfault() into mm/userfaultfd.c, because it
has grown too big to stay inline.  It is only used in slow paths, so the
extra function call should not be an issue.

This also removes quite a few hard-coded checks for shmem and hugetlbfs.
All the old checks should keep working, now expressed via vm_uffd_ops.

Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
---
 include/linux/shmem_fs.h      |  14 -----
 include/linux/userfaultfd_k.h |  46 ++++----------
 mm/shmem.c                    |   2 +-
 mm/userfaultfd.c              | 115 +++++++++++++++++++++++++---------
 4 files changed, 101 insertions(+), 76 deletions(-)

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 6d0f9c599ff7..2f5b7b295cf6 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -195,20 +195,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof)
 extern bool shmem_charge(struct inode *inode, long pages);
 extern void shmem_uncharge(struct inode *inode, long pages);
 
-#ifdef CONFIG_USERFAULTFD
-#ifdef CONFIG_SHMEM
-extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
-				  struct vm_area_struct *dst_vma,
-				  unsigned long dst_addr,
-				  unsigned long src_addr,
-				  uffd_flags_t flags,
-				  struct folio **foliop);
-#else /* !CONFIG_SHMEM */
-#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
-			       src_addr, flags, foliop) ({ BUG(); 0; })
-#endif /* CONFIG_SHMEM */
-#endif /* CONFIG_USERFAULTFD */
-
 /*
  * Used space is stored as unsigned 64-bit value in bytes but
  * quota core supports only signed 64-bit values so use that
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e79c724b3b95..4e56ad423a4a 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -85,9 +85,14 @@ extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
 #define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1))
 
+static inline enum mfill_atomic_mode uffd_flags_get_mode(uffd_flags_t flags)
+{
+	return (enum mfill_atomic_mode)(flags & MFILL_ATOMIC_MODE_MASK);
+}
+
 static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected)
 {
-	return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected);
+	return uffd_flags_get_mode(flags) == expected;
 }
 
 static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode)
@@ -196,41 +201,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 	return vma->vm_flags & __VM_UFFD_FLAGS;
 }
 
-static inline bool vma_can_userfault(struct vm_area_struct *vma,
-				     unsigned long vm_flags,
-				     bool wp_async)
+static inline const vm_uffd_ops *vma_get_uffd_ops(struct vm_area_struct *vma)
 {
-	vm_flags &= __VM_UFFD_FLAGS;
-
-	if (vma->vm_flags & VM_DROPPABLE)
-		return false;
-
-	if ((vm_flags & VM_UFFD_MINOR) &&
-	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
-		return false;
-
-	/*
-	 * If wp async enabled, and WP is the only mode enabled, allow any
-	 * memory type.
-	 */
-	if (wp_async && (vm_flags == VM_UFFD_WP))
-		return true;
-
-#ifndef CONFIG_PTE_MARKER_UFFD_WP
-	/*
-	 * If user requested uffd-wp but not enabled pte markers for
-	 * uffd-wp, then shmem & hugetlbfs are not supported but only
-	 * anonymous.
-	 */
-	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
-		return false;
-#endif
-
-	/* By default, allow any of anon|shmem|hugetlb */
-	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
-	    vma_is_shmem(vma);
+	if (vma->vm_ops && vma->vm_ops->userfaultfd_ops)
+		return vma->vm_ops->userfaultfd_ops;
+	return NULL;
 }
 
+bool vma_can_userfault(struct vm_area_struct *vma,
+		       unsigned long vm_flags, bool wp_async);
+
 static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
 {
 	struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;
diff --git a/mm/shmem.c b/mm/shmem.c
index bd0a29000318..4d71fc7be358 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3158,7 +3158,7 @@ static int shmem_uffd_get_folio(struct inode *inode, pgoff_t pgoff,
 	return shmem_get_folio(inode, pgoff, 0, folio, SGP_NOALLOC);
 }
 
-int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
+static int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 			   struct vm_area_struct *dst_vma,
 			   unsigned long dst_addr,
 			   unsigned long src_addr,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 879505c6996f..61783ff2d335 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -14,12 +14,48 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/mmu_notifier.h>
 #include <linux/hugetlb.h>
-#include <linux/shmem_fs.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include "internal.h"
 #include "swap.h"
 
+bool vma_can_userfault(struct vm_area_struct *vma,
+		       unsigned long vm_flags, bool wp_async)
+{
+	unsigned long supported;
+
+	if (vma->vm_flags & VM_DROPPABLE)
+		return false;
+
+	vm_flags &= __VM_UFFD_FLAGS;
+
+#ifndef CONFIG_PTE_MARKER_UFFD_WP
+	/*
+	 * If the user requested uffd-wp but pte markers for uffd-wp are
+	 * not enabled, only anonymous memory is supported; file systems
+	 * (like shmem or hugetlbfs) are not.
+	 */
+	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
+		return false;
+#endif
+	/*
+	 * If wp async enabled, and WP is the only mode enabled, allow any
+	 * memory type.
+	 */
+	if (wp_async && (vm_flags == VM_UFFD_WP))
+		return true;
+
+	if (vma_is_anonymous(vma))
+		/* Anonymous memory has no page cache; MINOR is not supported */
+		supported = VM_UFFD_MISSING | VM_UFFD_WP;
+	else if (vma_get_uffd_ops(vma))
+		supported = vma_get_uffd_ops(vma)->uffd_features;
+	else
+		return false;
+
+	return !(vm_flags & (~supported));
+}
+
 static __always_inline
 bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
 {
@@ -384,11 +420,15 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
 {
 	struct inode *inode = file_inode(dst_vma->vm_file);
 	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+	const vm_uffd_ops *uffd_ops = vma_get_uffd_ops(dst_vma);
 	struct folio *folio;
 	struct page *page;
 	int ret;
 
-	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
+	if (WARN_ON_ONCE(!uffd_ops || !uffd_ops->uffd_get_folio))
+		return -EINVAL;
+
+	ret = uffd_ops->uffd_get_folio(inode, pgoff, &folio);
 	/* Our caller expects us to return -EFAULT if we failed to find folio */
 	if (ret == -ENOENT)
 		ret = -EFAULT;
@@ -504,18 +544,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 	u32 hash;
 	struct address_space *mapping;
 
-	/*
-	 * There is no default zero huge page for all huge page sizes as
-	 * supported by hugetlb. A PMD_SIZE huge pages may exist as used
-	 * by THP. Since we can not reliably insert a zero page, this
-	 * feature is not supported.
-	 */
-	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
-		up_read(&ctx->map_changing_lock);
-		uffd_mfill_unlock(dst_vma);
-		return -EINVAL;
-	}
-
 	src_addr = src_start;
 	dst_addr = dst_start;
 	copied = 0;
@@ -686,14 +714,55 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 			err = mfill_atomic_pte_zeropage(dst_pmd,
 							dst_vma, dst_addr);
 	} else {
-		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
-					     dst_addr, src_addr,
-					     flags, foliop);
+		const vm_uffd_ops *uffd_ops = vma_get_uffd_ops(dst_vma);
+
+		if (WARN_ON_ONCE(!uffd_ops || !uffd_ops->uffd_copy)) {
+			err = -EINVAL;
+		} else {
+			err = uffd_ops->uffd_copy(dst_pmd, dst_vma,
+						  dst_addr, src_addr,
+						  flags, foliop);
+		}
 	}
 
 	return err;
 }
 
+static inline bool
+vma_uffd_ops_supported(struct vm_area_struct *vma, uffd_flags_t flags)
+{
+	enum mfill_atomic_mode mode = uffd_flags_get_mode(flags);
+	const vm_uffd_ops *uffd_ops;
+	unsigned long uffd_ioctls;
+
+	if ((flags & MFILL_ATOMIC_WP) && !(vma->vm_flags & VM_UFFD_WP))
+		return false;
+
+	/* Anonymous supports everything except CONTINUE */
+	if (vma_is_anonymous(vma))
+		return mode != MFILL_ATOMIC_CONTINUE;
+
+	uffd_ops = vma_get_uffd_ops(vma);
+	if (!uffd_ops)
+		return false;
+
+	uffd_ioctls = uffd_ops->uffd_ioctls;
+	switch (mode) {
+	case MFILL_ATOMIC_COPY:
+		return uffd_ioctls & BIT(_UFFDIO_COPY);
+	case MFILL_ATOMIC_ZEROPAGE:
+		return uffd_ioctls & BIT(_UFFDIO_ZEROPAGE);
+	case MFILL_ATOMIC_CONTINUE:
+		if (!(vma->vm_flags & VM_SHARED))
+			return false;
+		return uffd_ioctls & BIT(_UFFDIO_CONTINUE);
+	case MFILL_ATOMIC_POISON:
+		return uffd_ioctls & BIT(_UFFDIO_POISON);
+	default:
+		return false;
+	}
+}
+
 static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 					    unsigned long dst_start,
 					    unsigned long src_start,
@@ -752,11 +821,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	    dst_vma->vm_flags & VM_SHARED))
 		goto out_unlock;
 
-	/*
-	 * validate 'mode' now that we know the dst_vma: don't allow
-	 * a wrprotect copy if the userfaultfd didn't register as WP.
-	 */
-	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
+	if (!vma_uffd_ops_supported(dst_vma, flags))
 		goto out_unlock;
 
 	/*
@@ -766,12 +831,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 		return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
 					    src_start, len, flags);
 
-	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
-		goto out_unlock;
-	if (!vma_is_shmem(dst_vma) &&
-	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
-		goto out_unlock;
-
 	while (src_addr < src_start + len) {
 		pmd_t dst_pmdval;
-- 
2.49.0


-- 
Peter Xu
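
P.S. For anyone reviewing the file system side of this: below is a
minimal, illustrative sketch of what hooking into the new API could look
like for a hypothetical page-cache-backed file system ("myfs", made up
here) that only wants minor faults plus UFFDIO_CONTINUE.  The
vm_uffd_ops field names and callback signatures are inferred from how
this patch consumes them (the struct itself is defined earlier in the
series), so treat this as a sketch under those assumptions, not as part
of the change:

/*
 * Illustrative only -- "myfs" is not a real file system.  Field names
 * and signatures follow their usage in this patch; the vm_uffd_ops
 * definition itself comes from an earlier patch in this series.
 */
static int myfs_uffd_get_folio(struct inode *inode, pgoff_t pgoff,
			       struct folio **foliop)
{
	struct folio *folio;

	/* Look up the existing page cache folio; never allocate here. */
	folio = filemap_lock_folio(inode->i_mapping, pgoff);
	if (IS_ERR(folio))
		return -ENOENT;	/* the core turns this into -EFAULT */

	*foliop = folio;
	return 0;
}

static const vm_uffd_ops myfs_uffd_ops = {
	/* Registration modes allowed, checked by vma_can_userfault() */
	.uffd_features	= VM_UFFD_MINOR,
	/* mfill ioctls implemented, checked by vma_uffd_ops_supported() */
	.uffd_ioctls	= BIT(_UFFDIO_CONTINUE),
	/* CONTINUE only needs folio lookup; uffd_copy is for COPY/ZEROPAGE */
	.uffd_get_folio	= myfs_uffd_get_folio,
};

static const struct vm_operations_struct myfs_vm_ops = {
	.fault		 = filemap_fault,
	/* This is the hook vma_get_uffd_ops() looks for */
	.userfaultfd_ops = &myfs_uffd_ops,
};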