On Fri, Jul 18, 2025 at 2:21 AM Amery Hung <ameryhung@xxxxxxxxx> wrote: > > > > On 6/8/25 12:35 AM, Yafang Shao wrote: > > A new bpf_thp struct ops is introduced to provide finer-grained control > > over THP allocation policy. The struct ops includes two APIs for > > determining the THP allocator and reclaimer behavior: > > > > - THP allocator > > > > int (*allocator)(unsigned long vm_flags, unsigned long tva_flags); > > > > The BPF program returns either THP_ALLOC_CURRENT or THP_ALLOC_KHUGEPAGED, > > indicating whether THP allocation should be performed synchronously > > (current task) or asynchronously (khugepaged). > > > > The decision is based on the current task context, VMA flags, and TVA > > flags. > > > > - THP reclaimer > > > > int (*reclaimer)(bool vma_madvised); > > > > The BPF program returns either RECLAIMER_CURRENT or RECLAIMER_KSWAPD, > > determining whether memory reclamation is handled by the current task or > > kswapd. > > > > The decision depends on the current task and VMA flags. 
> > > > Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx> > > --- > > include/linux/huge_mm.h | 13 +-- > > mm/Makefile | 3 + > > mm/bpf_thp.c | 184 ++++++++++++++++++++++++++++++++++++++++ > > 3 files changed, 190 insertions(+), 10 deletions(-) > > create mode 100644 mm/bpf_thp.c > > > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > > index 6a40ebf25f5c..0d02c9b56a85 100644 > > --- a/include/linux/huge_mm.h > > +++ b/include/linux/huge_mm.h > > @@ -54,6 +54,7 @@ enum transparent_hugepage_flag { > > TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, > > TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, > > TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, > > + TRANSPARENT_HUGEPAGE_BPF_ATTACHED, /* BPF prog is attached */ > > }; > > > > struct kobject; > > @@ -192,16 +193,8 @@ static inline bool hugepage_global_always(void) > > > > #define THP_ALLOC_KHUGEPAGED (1 << 1) > > #define THP_ALLOC_CURRENT (1 << 2) > > -static inline int bpf_thp_allocator(unsigned long vm_flags, > > - unsigned long tva_flags) > > -{ > > - return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT; > > -} > > - > > -static inline gfp_t bpf_thp_gfp_mask(bool vma_madvised) > > -{ > > - return 0; > > -} > > +int bpf_thp_allocator(unsigned long vm_flags, unsigned long tva_flags); > > +gfp_t bpf_thp_gfp_mask(bool vma_madvised); > > > > static inline int highest_order(unsigned long orders) > > { > > diff --git a/mm/Makefile b/mm/Makefile > > index 1a7a11d4933d..e5f41cf3fd61 100644 > > --- a/mm/Makefile > > +++ b/mm/Makefile > > @@ -99,6 +99,9 @@ obj-$(CONFIG_MIGRATION) += migrate.o > > obj-$(CONFIG_NUMA) += memory-tiers.o > > obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o > > obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o > > +ifdef CONFIG_BPF_SYSCALL > > +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += bpf_thp.o > > +endif > > obj-$(CONFIG_PAGE_COUNTER) += page_counter.o > > obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o > > obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o > > diff --git a/mm/bpf_thp.c 
b/mm/bpf_thp.c > > new file mode 100644 > > index 000000000000..894d6cb93107 > > --- /dev/null > > +++ b/mm/bpf_thp.c > > @@ -0,0 +1,184 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > + > > +#include <linux/bpf.h> > > +#include <linux/btf.h> > > +#include <linux/huge_mm.h> > > +#include <linux/khugepaged.h> > > + > > +#define RECLAIMER_CURRENT (1 << 1) > > +#define RECLAIMER_KSWAPD (1 << 2) > > +#define RECLAIMER_BOTH (RECLAIMER_CURRENT | RECLAIMER_KSWAPD) > > + > > +struct bpf_thp_ops { > > + /** > > + * @allocator: Specifies whether the THP allocation is performed > > + * by the current task or by khugepaged. > > + * @vm_flags: Flags for the VMA in the current allocation context > > + * @tva_flags: Flags for the TVA in the current allocation context > > + * > > + * Return: > > + * - THP_ALLOC_CURRENT: THP was allocated synchronously by the calling > > + * task's context. > > + * - THP_ALLOC_KHUGEPAGED: THP was allocated asynchronously by the > > + * khugepaged kernel thread. > > + * - 0: THP allocation is disallowed in the current context. > > + */ > > + int (*allocator)(unsigned long vm_flags, unsigned long tva_flags); > > + /** > > + * @reclaimer: Specifies the entity performing page reclaim: > > + * - current task context > > + * - kswapd > > + * - none (no reclaim) > > + * @vma_madvised: MADV flags for this VMA (e.g., MADV_HUGEPAGE, MADV_NOHUGEPAGE) > > + * > > + * Return: > > + * - RECLAIMER_CURRENT: Direct reclaim by the current task if THP > > + * allocation fails. > > + * - RECLAIMER_KSWAPD: Wake kswapd to reclaim memory if THP allocation fails. > > + * - RECLAIMER_BOTH: Both current and kswapd will perform the reclaim > > + * - 0: No reclaim will be attempted. 
> > + */ > > + int (*reclaimer)(bool vma_madvised); > > +}; > > + > > +static struct bpf_thp_ops bpf_thp; > > + > > +int bpf_thp_allocator(unsigned long vm_flags, unsigned long tva_flags) > > +{ > > + int allocator; > > + > > + /* No BPF program is attached */ > > + if (!(transparent_hugepage_flags & (1<<TRANSPARENT_HUGEPAGE_BPF_ATTACHED))) > > + return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT; > > + > > + if (current_is_khugepaged()) > > + return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT; > > + if (!bpf_thp.allocator) > > + return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT; > > + > > + allocator = bpf_thp.allocator(vm_flags, tva_flags); > > + if (!allocator) > > + return 0; > > The check seems redundant. Is it? Right, thanks for pointing it out. > > > + /* invalid return value */ > > + if (allocator & ~(THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT)) > > + return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT; > > + return allocator; > > +} > > + > > +gfp_t bpf_thp_gfp_mask(bool vma_madvised) > > +{ > > + int reclaimer; > > + > > + if (!(transparent_hugepage_flags & (1<<TRANSPARENT_HUGEPAGE_BPF_ATTACHED))) > > + return 0; > > + > > + if (!bpf_thp.reclaimer) > > + return 0; > > + > > + reclaimer = bpf_thp.reclaimer(vma_madvised); > > + switch (reclaimer) { > > + case RECLAIMER_CURRENT: > > + return GFP_TRANSHUGE | __GFP_NORETRY; > > + case RECLAIMER_KSWAPD: > > + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; > > + case RECLAIMER_BOTH: > > + return GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM | __GFP_NORETRY; > > + default: > > + return 0; > > + } > > +} > > + > > +static bool bpf_thp_ops_is_valid_access(int off, int size, > > + enum bpf_access_type type, > > + const struct bpf_prog *prog, > > + struct bpf_insn_access_aux *info) > > +{ > > + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); > > +} > > + > > +static const struct bpf_func_proto * > > +bpf_thp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > > +{ > > + return 
bpf_base_func_proto(func_id, prog); > > +} > > + > > +static const struct bpf_verifier_ops thp_bpf_verifier_ops = { > > + .get_func_proto = bpf_thp_get_func_proto, > > + .is_valid_access = bpf_thp_ops_is_valid_access, > > +}; > > + > > +static int bpf_thp_reg(void *kdata, struct bpf_link *link) > > +{ > > + struct bpf_thp_ops *ops = kdata; > > + > > + /* TODO: add support for multiple attaches */ > > + if (test_and_set_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, > > + &transparent_hugepage_flags)) > > + return -EOPNOTSUPP; > > I think returning -EBUSY if the struct_ops is already attached is a > better choice. Makes sense. Thanks for the suggestion. > > > + bpf_thp.allocator = ops->allocator; > > + bpf_thp.reclaimer = ops->reclaimer; > > + return 0; > > +} > > + > > +static void bpf_thp_unreg(void *kdata, struct bpf_link *link) > > +{ > > + clear_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, &transparent_hugepage_flags); > > + bpf_thp.allocator = NULL; > > + bpf_thp.reclaimer = NULL; > > +} > > + > > +static int bpf_thp_check_member(const struct btf_type *t, > > + const struct btf_member *member, > > + const struct bpf_prog *prog) > > +{ > > + return 0; > > +} > > + > > [...] > > > +static int bpf_thp_init_member(const struct btf_type *t, > > + const struct btf_member *member, > > + void *kdata, const void *udata) > > +{ > > + return 0; > > +} > > + > > +static int bpf_thp_init(struct btf *btf) > > +{ > > + return 0; > > +} > > + > > +static int allocator(unsigned long vm_flags, unsigned long tva_flags) > > +{ > > + return 0; > > +} > > + > > +static int reclaimer(bool vma_madvised) > > +{ > > + return 0; > > +} > > + > > +static struct bpf_thp_ops __bpf_thp_ops = { > > + .allocator = allocator, > > + .reclaimer = reclaimer, > > +}; > > + > > +static struct bpf_struct_ops bpf_bpf_thp_ops = { > > + .verifier_ops = &thp_bpf_verifier_ops, > > + .init = bpf_thp_init, > > + .check_member = bpf_thp_check_member, > > nit. 
check_member doesn't need to be defined if it does not do anything. I will remove it. -- Regards Yafang