Hi David, On Thu, 11 Sept 2025 at 21:09, David Hildenbrand <david@xxxxxxxxxx> wrote: > > On 11.09.25 11:55, Chunyan Zhang wrote: > > Some platforms can customize the PTE PMD entry soft-dirty bit making it > > unavailable even if the architecture provides the resource. > > > > Add an API which architectures can define their specific implementations > > to detect if soft-dirty bit is available on which device the kernel is > > running. > > Thinking to myself: maybe pgtable_supports_soft_dirty() would read better > Whatever you prefer. I will use pgtable_supports_* in the next version. > > > > Signed-off-by: Chunyan Zhang <zhangchunyan@xxxxxxxxxxx> > > --- > > fs/proc/task_mmu.c | 17 ++++++++++++++++- > > include/linux/pgtable.h | 12 ++++++++++++ > > mm/debug_vm_pgtable.c | 10 +++++----- > > mm/huge_memory.c | 13 +++++++------ > > mm/internal.h | 2 +- > > mm/mremap.c | 13 +++++++------ > > mm/userfaultfd.c | 10 ++++------ > > 7 files changed, 52 insertions(+), 25 deletions(-) > > > > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c > > index 29cca0e6d0ff..9e8083b6d4cd 100644 > > --- a/fs/proc/task_mmu.c > > +++ b/fs/proc/task_mmu.c > > @@ -1058,7 +1058,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) > > * -Werror=unterminated-string-initialization warning > > * with GCC 15 > > */ > > - static const char mnemonics[BITS_PER_LONG][3] = { > > + static char mnemonics[BITS_PER_LONG][3] = { > > /* > > * In case if we meet a flag we don't know about. > > */ > > @@ -1129,6 +1129,16 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) > > [ilog2(VM_SEALED)] = "sl", > > #endif > > }; > > +/* > > + * We should remove the VM_SOFTDIRTY flag if the soft-dirty bit is > > + * unavailable on which the kernel is running, even if the architecture > > + * provides the resource and soft-dirty is compiled in. 
> > + */ > > +#ifdef CONFIG_MEM_SOFT_DIRTY > > + if (!pgtable_soft_dirty_supported()) > > + mnemonics[ilog2(VM_SOFTDIRTY)][0] = 0; > > +#endif > > You can now drop the ifdef. Ok, you mean define VM_SOFTDIRTY 0x08000000 no matter if MEM_SOFT_DIRTY is compiled in, right? Then I need memcpy() to set mnemonics[ilog2(VM_SOFTDIRTY)] here. > > But, I wonder if could we instead just stop setting the flag. Then we don't > have to worry about any VM_SOFTDIRTY checks. > > Something like the following > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 892fe5dbf9de0..8b8bf63a32ef7 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -783,6 +783,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) > static inline void vm_flags_init(struct vm_area_struct *vma, > vm_flags_t flags) > { > + VM_WARN_ON_ONCE(!pgtable_soft_dirty_supported() && (flags & VM_SOFTDIRTY)); > ACCESS_PRIVATE(vma, __vm_flags) = flags; > } > > @@ -801,6 +802,7 @@ static inline void vm_flags_reset(struct vm_area_struct *vma, > static inline void vm_flags_reset_once(struct vm_area_struct *vma, > vm_flags_t flags) > { > + VM_WARN_ON_ONCE(!pgtable_soft_dirty_supported() && (flags & VM_SOFTDIRTY)); > vma_assert_write_locked(vma); > WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); > } > @@ -808,6 +810,7 @@ static inline void vm_flags_reset_once(struct vm_area_struct *vma, > static inline void vm_flags_set(struct vm_area_struct *vma, > vm_flags_t flags) > { > + VM_WARN_ON_ONCE(!pgtable_soft_dirty_supported() && (flags & VM_SOFTDIRTY)); > vma_start_write(vma); > ACCESS_PRIVATE(vma, __vm_flags) |= flags; > } > diff --git a/mm/mmap.c b/mm/mmap.c > index 5fd3b80fda1d5..40cb3fbf9a247 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -1451,8 +1451,10 @@ static struct vm_area_struct *__install_special_mapping( > return ERR_PTR(-ENOMEM); > > vma_set_range(vma, addr, addr + len, 0); > - vm_flags_init(vma, (vm_flags | mm->def_flags | > - VM_DONTEXPAND | VM_SOFTDIRTY) & 
~VM_LOCKED_MASK); > + vm_flags |= mm->def_flags | VM_DONTEXPAND; Why use '|=' rather than not directly setting vm_flags which is an uninitialized variable? > + if (pgtable_soft_dirty_supported()) > + vm_flags |= VM_SOFTDIRTY; > + vm_flags_init(vma, vm_flags & ~VM_LOCKED_MASK); > vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); > > vma->vm_ops = ops; > diff --git a/mm/vma.c b/mm/vma.c > index abe0da33c8446..16a1ed2a6199c 100644 > --- a/mm/vma.c > +++ b/mm/vma.c > @@ -2551,7 +2551,8 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) > * then new mapped in-place (which must be aimed as > * a completely new data area). > */ > - vm_flags_set(vma, VM_SOFTDIRTY); > + if (pgtable_soft_dirty_supported()) > + vm_flags_set(vma, VM_SOFTDIRTY); > > vma_set_page_prot(vma); > } > @@ -2819,7 +2820,8 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > mm->data_vm += len >> PAGE_SHIFT; > if (vm_flags & VM_LOCKED) > mm->locked_vm += (len >> PAGE_SHIFT); > - vm_flags_set(vma, VM_SOFTDIRTY); > + if (pgtable_soft_dirty_supported()) > + vm_flags_set(vma, VM_SOFTDIRTY); > return 0; > > mas_store_fail: > diff --git a/mm/vma_exec.c b/mm/vma_exec.c > index 922ee51747a68..c06732a5a620a 100644 > --- a/mm/vma_exec.c > +++ b/mm/vma_exec.c > @@ -107,6 +107,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) > int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, > unsigned long *top_mem_p) > { > + unsigned long flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; > int err; > struct vm_area_struct *vma = vm_area_alloc(mm); > > @@ -137,7 +138,9 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, > BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); > vma->vm_end = STACK_TOP_MAX; > vma->vm_start = vma->vm_end - PAGE_SIZE; > - vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); > + if (pgtable_soft_dirty_supported()) > + flags |= 
VM_SOFTDIRTY; > + vm_flags_init(vma, flags); > vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); > > err = insert_vm_struct(mm, vma); > > > > + > > size_t i; > > > > seq_puts(m, "VmFlags: "); > > @@ -1531,6 +1541,8 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, > > static inline void clear_soft_dirty(struct vm_area_struct *vma, > > unsigned long addr, pte_t *pte) > > { > > + if (!pgtable_soft_dirty_supported()) > > + return; > > /* > > * The soft-dirty tracker uses #PF-s to catch writes > > * to pages, so write-protect the pte as well. See the > > @@ -1566,6 +1578,9 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, > > { > > pmd_t old, pmd = *pmdp; > > > > + if (!pgtable_soft_dirty_supported()) > > + return; > > + > > if (pmd_present(pmd)) { > > /* See comment in change_huge_pmd() */ > > old = pmdp_invalidate(vma, addr, pmdp); > > That would all be handled with the above never-set-VM_SOFTDIRTY. Sorry I'm not sure I understand here, you mean no longer need #ifdef CONFIG_MEM_SOFT_DIRTY for these function definitions, right? > > > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h > > index 4c035637eeb7..2a3578a4ae4c 100644 > > --- a/include/linux/pgtable.h > > +++ b/include/linux/pgtable.h > > @@ -1537,6 +1537,18 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) > > #define arch_start_context_switch(prev) do {} while (0) > > #endif > > > > +/* > > + * Some platforms can customize the PTE soft-dirty bit making it unavailable > > + * even if the architecture provides the resource. > > + * Adding this API allows architectures to add their own checks for the > > + * devices on which the kernel is running. > > + * Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY > > + * is part of this macro. 
> > + */ > > +#ifndef pgtable_soft_dirty_supported > > +#define pgtable_soft_dirty_supported() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) > > +#endif > > + > > #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY > > #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION > > static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) > > diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c > > index 830107b6dd08..b32ce2b0b998 100644 > > --- a/mm/debug_vm_pgtable.c > > +++ b/mm/debug_vm_pgtable.c > > @@ -690,7 +690,7 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) > > { > > pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); > > > > - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) > > + if (!pgtable_soft_dirty_supported()) > > return; > > > > pr_debug("Validating PTE soft dirty\n"); > > @@ -702,7 +702,7 @@ static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args) > > { > > pte_t pte; > > > > - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) > > + if (!pgtable_soft_dirty_supported()) > > return; > > > > pr_debug("Validating PTE swap soft dirty\n"); > > @@ -718,7 +718,7 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) > > { > > pmd_t pmd; > > > > - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) > > + if (!pgtable_soft_dirty_supported()) > > return; > > > > if (!has_transparent_hugepage()) > > @@ -734,8 +734,8 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) > > { > > pmd_t pmd; > > > > - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) || > > - !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION)) > > + if (!pgtable_soft_dirty_supported() || > > + !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION)) > > return; > > > > if (!has_transparent_hugepage()) > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > > index 9c38a95e9f09..218d430a2ec6 100644 > > --- a/mm/huge_memory.c > > +++ b/mm/huge_memory.c > > @@ -2271,12 +2271,13 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, > > > > static pmd_t move_soft_dirty_pmd(pmd_t pmd) 
> > { > > -#ifdef CONFIG_MEM_SOFT_DIRTY > > - if (unlikely(is_pmd_migration_entry(pmd))) > > - pmd = pmd_swp_mksoft_dirty(pmd); > > - else if (pmd_present(pmd)) > > - pmd = pmd_mksoft_dirty(pmd); > > -#endif > > + if (pgtable_soft_dirty_supported()) { > > + if (unlikely(is_pmd_migration_entry(pmd))) > > + pmd = pmd_swp_mksoft_dirty(pmd); > > + else if (pmd_present(pmd)) > > + pmd = pmd_mksoft_dirty(pmd); > > + } > > + > > Wondering, should simply the arch take care of that and we can just clal > pmd_swp_mksoft_dirty / pmd_mksoft_dirty? Ok, I think I can do that in another patchset. > > > return pmd; > > } > > > > diff --git a/mm/internal.h b/mm/internal.h > > index 45b725c3dc03..c6ca62f8ecf3 100644 > > --- a/mm/internal.h > > +++ b/mm/internal.h > > @@ -1538,7 +1538,7 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) > > * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) > > * will be constantly true. > > */ > > - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) > > + if (!pgtable_soft_dirty_supported()) > > return false; > > > > That should be handled with the above never-set-VM_SOFTDIRTY. We don't need to check if (!pgtable_soft_dirty_supported()) if I understand correctly. Thanks for the review, Chunyan > > > /* > > diff --git a/mm/mremap.c b/mm/mremap.c > > index e618a706aff5..7beb3114dbf5 100644 > > --- a/mm/mremap.c > > +++ b/mm/mremap.c > > @@ -162,12 +162,13 @@ static pte_t move_soft_dirty_pte(pte_t pte) > > * Set soft dirty bit so we can notice > > * in userspace the ptes were moved. > > */ > > -#ifdef CONFIG_MEM_SOFT_DIRTY > > - if (pte_present(pte)) > > - pte = pte_mksoft_dirty(pte); > > - else if (is_swap_pte(pte)) > > - pte = pte_swp_mksoft_dirty(pte); > > -#endif > > + if (pgtable_soft_dirty_supported()) { > > + if (pte_present(pte)) > > + pte = pte_mksoft_dirty(pte); > > + else if (is_swap_pte(pte)) > > + pte = pte_swp_mksoft_dirty(pte); > > + } > > + > > return pte; > > } > > > -- > Cheers > > David / dhildenb >