On 08/06/2025 08:35, Yafang Shao wrote:
> A new bpf_thp struct ops is introduced to provide finer-grained control
> over THP allocation policy. The struct ops includes two APIs for
> determining the THP allocator and reclaimer behavior:
>
> - THP allocator
>
>   int (*allocator)(unsigned long vm_flags, unsigned long tva_flags);
>
>   The BPF program returns either THP_ALLOC_CURRENT or THP_ALLOC_KHUGEPAGED,
>   indicating whether THP allocation should be performed synchronously
>   (current task) or asynchronously (khugepaged).
>
>   The decision is based on the current task context, VMA flags, and TVA
>   flags.
>
> - THP reclaimer
>
>   int (*reclaimer)(bool vma_madvised);
>
>   The BPF program returns either RECLAIMER_CURRENT or RECLAIMER_KSWAPD,
>   determining whether memory reclamation is handled by the current task or
>   kswapd.
>
>   The decision depends on the current task and VMA flags.
>
> Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
> ---
>  include/linux/huge_mm.h |  13 +--
>  mm/Makefile             |   3 +
>  mm/bpf_thp.c            | 184 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 190 insertions(+), 10 deletions(-)
>  create mode 100644 mm/bpf_thp.c
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 6a40ebf25f5c..0d02c9b56a85 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -54,6 +54,7 @@ enum transparent_hugepage_flag {
>          TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
>          TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
>          TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
> +        TRANSPARENT_HUGEPAGE_BPF_ATTACHED,      /* BPF prog is attached */
>  };
>
>  struct kobject;
> @@ -192,16 +193,8 @@ static inline bool hugepage_global_always(void)
>
>  #define THP_ALLOC_KHUGEPAGED (1 << 1)
>  #define THP_ALLOC_CURRENT (1 << 2)
> -static inline int bpf_thp_allocator(unsigned long vm_flags,
> -                                    unsigned long tva_flags)
> -{
> -        return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
> -}
> -
> -static inline gfp_t bpf_thp_gfp_mask(bool vma_madvised)
> -{
> -        return 0;
> -}

Adding code in earlier patches and then removing it here makes the review
quite confusing. I don't think the first 3 patches are needed, since most of
the code ends up in this patch anyway?

> +int bpf_thp_allocator(unsigned long vm_flags, unsigned long tva_flags);
> +gfp_t bpf_thp_gfp_mask(bool vma_madvised);
>
>  static inline int highest_order(unsigned long orders)
>  {
> diff --git a/mm/Makefile b/mm/Makefile
> index 1a7a11d4933d..e5f41cf3fd61 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -99,6 +99,9 @@ obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_NUMA) += memory-tiers.o
>  obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
>  obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
> +ifdef CONFIG_BPF_SYSCALL
> +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += bpf_thp.o
> +endif
>  obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
>  obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
>  obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
> diff --git a/mm/bpf_thp.c b/mm/bpf_thp.c
> new file mode 100644
> index 000000000000..894d6cb93107
> --- /dev/null
> +++ b/mm/bpf_thp.c
> @@ -0,0 +1,184 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/bpf.h>
> +#include <linux/btf.h>
> +#include <linux/huge_mm.h>
> +#include <linux/khugepaged.h>
> +
> +#define RECLAIMER_CURRENT (1 << 1)
> +#define RECLAIMER_KSWAPD (1 << 2)
> +#define RECLAIMER_BOTH (RECLAIMER_CURRENT | RECLAIMER_KSWAPD)
> +
> +struct bpf_thp_ops {
> +        /**
> +         * @allocator: Specifies whether the THP allocation is performed
> +         * by the current task or by khugepaged.
> +         * @vm_flags: Flags for the VMA in the current allocation context
> +         * @tva_flags: Flags for the TVA in the current allocation context
> +         *
> +         * Rerurn:
> +         * - THP_ALLOC_CURRENT: THP was allocated synchronously by the calling
> +         *   task's context.
> +         * - THP_ALLOC_KHUGEPAGED: THP was allocated asynchronously by the
> +         *   khugepaged kernel thread.
> +         * - 0: THP allocation is disallowed in the current context.
> +         */
> +        int (*allocator)(unsigned long vm_flags, unsigned long tva_flags);
> +        /**
> +         * @reclaimer: Specifies the entity performing page reclaim:
> +         * - current task context
> +         * - kswapd
> +         * - none (no reclaim)
> +         * @vma_madvised: MADV flags for this VMA (e.g., MADV_HUGEPAGE, MADV_NOHUGEPAGE)
> +         *
> +         * Return:
> +         * - RECLAIMER_CURRENT: Direct reclaim by the current task if THP
> +         *   allocation fails.
> +         * - RECLAIMER_KSWAPD: Wake kswapd to reclaim memory if THP allocation fails.
> +         * - RECLAIMER_ALL: Both current and kswapd will perform the reclaim
> +         * - 0: No reclaim will be attempted.
> +         */
> +        int (*reclaimer)(bool vma_madvised);
> +};
> +
> +static struct bpf_thp_ops bpf_thp;
> +
> +int bpf_thp_allocator(unsigned long vm_flags, unsigned long tva_flags)
> +{
> +        int allocator;
> +
> +        /* No BPF program is attached */
> +        if (!(transparent_hugepage_flags & (1<<TRANSPARENT_HUGEPAGE_BPF_ATTACHED)))
> +                return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
> +
> +        if (current_is_khugepaged())
> +                return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
> +        if (!bpf_thp.allocator)
> +                return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;

Probably make it:

        if (current_is_khugepaged() || !bpf_thp.allocator)
                return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;

> +
> +        allocator = bpf_thp.allocator(vm_flags, tva_flags);
> +        if (!allocator)
> +                return 0;
> +        /* invalid return value */
> +        if (allocator & ~(THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT))
> +                return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
> +        return allocator;
> +}
> +
> +gfp_t bpf_thp_gfp_mask(bool vma_madvised)
> +{
> +        int reclaimer;
> +
> +        if (!(transparent_hugepage_flags & (1<<TRANSPARENT_HUGEPAGE_BPF_ATTACHED)))
> +                return 0;
> +
> +        if (!bpf_thp.reclaimer)
> +                return 0;
> +
> +        reclaimer = bpf_thp.reclaimer(vma_madvised);
> +        switch (reclaimer) {
> +        case RECLAIMER_CURRENT:
> +                return GFP_TRANSHUGE | __GFP_NORETRY;
> +        case RECLAIMER_KSWAPD:
> +                return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
> +        case RECLAIMER_BOTH:
> +                return GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM | __GFP_NORETRY;
> +        default:
> +                return 0;

Maybe you could let userspace decide the GFP flags directly instead of having
the RECLAIMER_xyz values?
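
To illustrate the idea (an untested sketch, assuming ->reclaimer were changed
to return a gfp_t; the TRANSPARENT_HUGEPAGE_BPF_ATTACHED check is omitted for
brevity), the kernel would only need to mask the prog's return value down to
the reclaim-related bits it is willing to honour:

        /*
         * Hypothetical alternative: the BPF prog returns gfp flags directly
         * and the kernel keeps only a whitelisted set of reclaim modifiers.
         */
        gfp_t bpf_thp_gfp_mask(bool vma_madvised)
        {
                const gfp_t allowed = __GFP_DIRECT_RECLAIM |
                                      __GFP_KSWAPD_RECLAIM | __GFP_NORETRY;
                gfp_t gfp;

                if (!bpf_thp.reclaimer)
                        return 0;

                /* sanitize: drop anything outside the allowed reclaim bits */
                gfp = bpf_thp.reclaimer(vma_madvised) & allowed;
                return gfp ? GFP_TRANSHUGE_LIGHT | gfp : 0;
        }

That avoids adding a new RECLAIMER_* value for every policy, although it does
turn the gfp bits into a de facto ABI, which may be an argument for keeping
the explicit values after all.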
> +        }
> +}
> +
> +static bool bpf_thp_ops_is_valid_access(int off, int size,
> +                                        enum bpf_access_type type,
> +                                        const struct bpf_prog *prog,
> +                                        struct bpf_insn_access_aux *info)
> +{
> +        return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
> +}
> +
> +static const struct bpf_func_proto *
> +bpf_thp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +{
> +        return bpf_base_func_proto(func_id, prog);
> +}
> +
> +static const struct bpf_verifier_ops thp_bpf_verifier_ops = {
> +        .get_func_proto = bpf_thp_get_func_proto,
> +        .is_valid_access = bpf_thp_ops_is_valid_access,
> +};
> +
> +static int bpf_thp_reg(void *kdata, struct bpf_link *link)
> +{
> +        struct bpf_thp_ops *ops = kdata;
> +
> +        /* TODO: add support for multiple attaches */
> +        if (test_and_set_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
> +                             &transparent_hugepage_flags))
> +                return -EOPNOTSUPP;
> +        bpf_thp.allocator = ops->allocator;
> +        bpf_thp.reclaimer = ops->reclaimer;
> +        return 0;
> +}
> +
> +static void bpf_thp_unreg(void *kdata, struct bpf_link *link)
> +{
> +        clear_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, &transparent_hugepage_flags);
> +        bpf_thp.allocator = NULL;
> +        bpf_thp.reclaimer = NULL;
> +}
> +
> +static int bpf_thp_check_member(const struct btf_type *t,
> +                                const struct btf_member *member,
> +                                const struct bpf_prog *prog)
> +{
> +        return 0;
> +}
> +
> +static int bpf_thp_init_member(const struct btf_type *t,
> +                               const struct btf_member *member,
> +                               void *kdata, const void *udata)
> +{
> +        return 0;
> +}
> +
> +static int bpf_thp_init(struct btf *btf)
> +{
> +        return 0;
> +}
> +
> +static int allocator(unsigned long vm_flags, unsigned long tva_flags)
> +{
> +        return 0;
> +}
> +
> +static int reclaimer(bool vma_madvised)
> +{
> +        return 0;
> +}
> +
> +static struct bpf_thp_ops __bpf_thp_ops = {
> +        .allocator = allocator,
> +        .reclaimer = reclaimer,
> +};
> +
> +static struct bpf_struct_ops bpf_bpf_thp_ops = {
> +        .verifier_ops = &thp_bpf_verifier_ops,
> +        .init = bpf_thp_init,
> +        .check_member = bpf_thp_check_member,
> +        .init_member = bpf_thp_init_member,
> +        .reg = bpf_thp_reg,
> +        .unreg = bpf_thp_unreg,
> +        .name = "bpf_thp_ops",
> +        .cfi_stubs = &__bpf_thp_ops,
> +        .owner = THIS_MODULE,
> +};
> +
> +static int __init bpf_thp_ops_init(void)
> +{
> +        int err = register_bpf_struct_ops(&bpf_bpf_thp_ops, bpf_thp_ops);
> +
> +        if (err)
> +                pr_err("bpf_thp: Failed to register struct_ops (%d)\n", err);
> +        return err;
> +}
> +late_initcall(bpf_thp_ops_init);
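
FWIW, to double-check my understanding of the interface, this is roughly how
I would expect a policy to be wired up on the BPF side (untested sketch; the
THP_ALLOC_* / RECLAIMER_* values are not exported anywhere yet, so they have
to be duplicated in the prog):

        // SPDX-License-Identifier: GPL-2.0
        #include "vmlinux.h"
        #include <bpf/bpf_helpers.h>
        #include <bpf/bpf_tracing.h>

        /* duplicated from the kernel side; not part of any UAPI header yet */
        #define THP_ALLOC_KHUGEPAGED   (1 << 1)
        #define THP_ALLOC_CURRENT      (1 << 2)
        #define RECLAIMER_KSWAPD       (1 << 2)

        SEC("struct_ops/allocator")
        int BPF_PROG(thp_allocator, unsigned long vm_flags, unsigned long tva_flags)
        {
                /* example policy: defer all THP allocation to khugepaged */
                return THP_ALLOC_KHUGEPAGED;
        }

        SEC("struct_ops/reclaimer")
        int BPF_PROG(thp_reclaimer, bool vma_madvised)
        {
                /* example policy: only wake kswapd, never direct reclaim */
                return RECLAIMER_KSWAPD;
        }

        SEC(".struct_ops.link")
        struct bpf_thp_ops thp_policy = {
                .allocator = (void *)thp_allocator,
                .reclaimer = (void *)thp_reclaimer,
        };

        char _license[] SEC("license") = "GPL";

Userspace would then attach the thp_policy map with
bpf_map__attach_struct_ops().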