On 18/08/2025 06:55, Yafang Shao wrote: > This patch introduces a new BPF struct_ops called bpf_thp_ops for dynamic > THP tuning. It includes a hook get_suggested_order() [0], allowing BPF > programs to influence THP order selection based on factors such as: > - Workload identity > For example, workloads running in specific containers or cgroups. > - Allocation context > Whether the allocation occurs during a page fault, khugepaged, or other > paths. > - System memory pressure > (May require new BPF helpers to accurately assess memory pressure.) > > Key Details: > - Only one BPF program can be attached at a time, but it can be updated > dynamically to adjust the policy. > - Supports automatic mTHP order selection and per-workload THP policies. > - Only functional when THP is set to madvise or always. > > It requires CONFIG_EXPERIMENTAL_BPF_ORDER_SELECTION to enable. [1] > This feature is unstable and may evolve in future kernel versions. > > Link: https://lwn.net/ml/all/9bc57721-5287-416c-aa30-46932d605f63@xxxxxxxxxx/ [0] > Link: https://lwn.net/ml/all/dda67ea5-2943-497c-a8e5-d81f0733047d@lucifer.local/ [1] > > Suggested-by: David Hildenbrand <david@xxxxxxxxxx> > Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> > Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx> > --- > include/linux/huge_mm.h | 15 +++ > include/linux/khugepaged.h | 12 ++- > mm/Kconfig | 12 +++ > mm/Makefile | 1 + > mm/bpf_thp.c | 186 +++++++++++++++++++++++++++++++++++++ > mm/huge_memory.c | 10 ++ > mm/khugepaged.c | 26 +++++- > mm/memory.c | 18 +++- > 8 files changed, 273 insertions(+), 7 deletions(-) > create mode 100644 mm/bpf_thp.c > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index 1ac0d06fb3c1..f0c91d7bd267 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -6,6 +6,8 @@ > > #include <linux/fs.h> /* only for vma_is_dax() */ > #include <linux/kobject.h> > +#include <linux/pgtable.h> > +#include <linux/mm.h> > > vm_fault_t 
do_huge_pmd_anonymous_page(struct vm_fault *vmf); > int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, > @@ -56,6 +58,7 @@ enum transparent_hugepage_flag { > TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, > TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, > TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, > + TRANSPARENT_HUGEPAGE_BPF_ATTACHED, /* BPF prog is attached */ > }; > > struct kobject; > @@ -195,6 +198,18 @@ static inline bool hugepage_global_always(void) > (1<<TRANSPARENT_HUGEPAGE_FLAG); > } > > +#ifdef CONFIG_EXPERIMENTAL_BPF_ORDER_SELECTION > +int get_suggested_order(struct mm_struct *mm, struct vm_area_struct *vma__nullable, > + u64 vma_flags, enum tva_type tva_flags, int orders); > +#else > +static inline int > +get_suggested_order(struct mm_struct *mm, struct vm_area_struct *vma__nullable, > + u64 vma_flags, enum tva_type tva_flags, int orders) > +{ > + return orders; > +} > +#endif > + > static inline int highest_order(unsigned long orders) > { > return fls_long(orders) - 1; > diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h > index eb1946a70cff..d81c1228a21f 100644 > --- a/include/linux/khugepaged.h > +++ b/include/linux/khugepaged.h > @@ -4,6 +4,8 @@ > > #include <linux/mm.h> > > +#include <linux/huge_mm.h> > + > extern unsigned int khugepaged_max_ptes_none __read_mostly; > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > extern struct attribute_group khugepaged_attr_group; > @@ -22,7 +24,15 @@ extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, > > static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) > { > - if (mm_flags_test(MMF_VM_HUGEPAGE, oldmm)) > + /* > + * THP allocation policy can be dynamically modified via BPF. Even if a > + * task was allowed to allocate THPs, BPF can decide whether its forked > + * child can allocate THPs. > + * > + * The MMF_VM_HUGEPAGE flag will be cleared by khugepaged. 
> + */ > + if (mm_flags_test(MMF_VM_HUGEPAGE, oldmm) && > + get_suggested_order(mm, NULL, 0, -1, BIT(PMD_ORDER))) Hi Yafang,