On Wed, Sep 10, 2025 at 10:44:39AM +0800, Yafang Shao wrote: > This patch introduces a new BPF struct_ops called bpf_thp_ops for dynamic > THP tuning. It includes a hook bpf_hook_thp_get_order(), allowing BPF > programs to influence THP order selection based on factors such as: > - Workload identity > For example, workloads running in specific containers or cgroups. > - Allocation context > Whether the allocation occurs during a page fault, khugepaged, swap or > other paths. > - VMA's memory advice settings > MADV_HUGEPAGE or MADV_NOHUGEPAGE > - Memory pressure > PSI system data or associated cgroup PSI metrics > > The kernel API of this new BPF hook is as follows, > > /** > * @thp_order_fn_t: Get the suggested THP orders from a BPF program for allocation > * @vma: vm_area_struct associated with the THP allocation > * @vma_type: The VMA type, such as BPF_THP_VM_HUGEPAGE if VM_HUGEPAGE is set > * BPF_THP_VM_NOHUGEPAGE if VM_NOHUGEPAGE is set, or BPF_THP_VM_NONE if > * neither is set. > * @tva_type: TVA type for current @vma > * @orders: Bitmask of requested THP orders for this allocation > * - PMD-mapped allocation if PMD_ORDER is set > * - mTHP allocation otherwise > * > * Return: The suggested THP order from the BPF program for allocation. It will > * not exceed the highest requested order in @orders. Return -1 to > * indicate that the original requested @orders should remain unchanged. > */ > typedef int thp_order_fn_t(struct vm_area_struct *vma, > enum bpf_thp_vma_type vma_type, > enum tva_type tva_type, > unsigned long orders); > > Only a single BPF program can be attached at any given time, though it can > be dynamically updated to adjust the policy. The implementation supports > anonymous THP, shmem THP, and mTHP, with future extensions planned for > file-backed THP. > > This functionality is only active when system-wide THP is configured to > madvise or always mode. It remains disabled in never mode. 
Additionally, > if THP is explicitly disabled for a specific task via prctl(), this BPF > functionality will also be unavailable for that task. > > This feature requires CONFIG_BPF_GET_THP_ORDER (marked EXPERIMENTAL) to be > enabled. Note that this capability is currently unstable and may undergo > significant changes—including potential removal—in future kernel versions. Thanks for highlighting. > > Suggested-by: David Hildenbrand <david@xxxxxxxxxx> > Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> > Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx> > --- > MAINTAINERS | 1 + > include/linux/huge_mm.h | 26 ++++- > mm/Kconfig | 12 ++ > mm/Makefile | 1 + > mm/huge_memory_bpf.c | 243 ++++++++++++++++++++++++++++++++++++++++ > 5 files changed, 280 insertions(+), 3 deletions(-) > create mode 100644 mm/huge_memory_bpf.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index 8fef05bc2224..d055a3c95300 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -16252,6 +16252,7 @@ F: include/linux/huge_mm.h > F: include/linux/khugepaged.h > F: include/trace/events/huge_memory.h > F: mm/huge_memory.c > +F: mm/huge_memory_bpf.c Thanks! 
> F: mm/khugepaged.c > F: mm/mm_slot.h > F: tools/testing/selftests/mm/khugepaged.c > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index 23f124493c47..f72a5fd04e4f 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -56,6 +56,7 @@ enum transparent_hugepage_flag { > TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, > TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, > TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, > + TRANSPARENT_HUGEPAGE_BPF_ATTACHED, /* BPF prog is attached */ > }; > > struct kobject; > @@ -270,6 +271,19 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, > enum tva_type type, > unsigned long orders); > > +#ifdef CONFIG_BPF_GET_THP_ORDER > +unsigned long > +bpf_hook_thp_get_orders(struct vm_area_struct *vma, vm_flags_t vma_flags, > + enum tva_type type, unsigned long orders); Thanks for renaming! > +#else > +static inline unsigned long > +bpf_hook_thp_get_orders(struct vm_area_struct *vma, vm_flags_t vma_flags, > + enum tva_type tva_flags, unsigned long orders) > +{ > + return orders; > +} > +#endif > + > /** > * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma > * @vma: the vm area to check > @@ -291,6 +305,12 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, > enum tva_type type, > unsigned long orders) > { > + unsigned long bpf_orders; > + > + bpf_orders = bpf_hook_thp_get_orders(vma, vm_flags, type, orders); > + if (!bpf_orders) > + return 0; I think it'd be easier to just do: /* The BPF-specified order overrides which order is selected. */ orders &= bpf_hook_thp_get_orders(vma, vm_flags, type, orders); if (!orders) return 0; > + > /* > * Optimization to check if required orders are enabled early. Only > * forced collapse ignores sysfs configs. 
> @@ -304,12 +324,12 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, > ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled())) > mask |= READ_ONCE(huge_anon_orders_inherit); > > - orders &= mask; > - if (!orders) > + bpf_orders &= mask; > + if (!bpf_orders) > return 0 With my suggested change this would remain the same. > } > > - return __thp_vma_allowable_orders(vma, vm_flags, type, orders); > + return __thp_vma_allowable_orders(vma, vm_flags, type, bpf_orders); With my suggested change this would remain the same. > } > > struct thpsize { > diff --git a/mm/Kconfig b/mm/Kconfig > index d1ed839ca710..4d89d2158f10 100644 > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -896,6 +896,18 @@ config NO_PAGE_MAPCOUNT > > EXPERIMENTAL because the impact of some changes is still unclear. > > +config BPF_GET_THP_ORDER Yeah, I think we maybe need to sledgehammer this as already Lance was confused as to the permanence of this, and I feel that users might be too, even with the '(EXPERIMENTAL)' bit. So maybe config BPF_GET_THP_ORDER_EXPERIMENTAL, just to hammer it home? > + bool "BPF-based THP order selection (EXPERIMENTAL)" > + depends on TRANSPARENT_HUGEPAGE && BPF_SYSCALL > + > + help > + Enable dynamic THP order selection using BPF programs. This > + experimental feature allows custom BPF logic to determine optimal > + transparent hugepage allocation sizes at runtime. > + > + WARNING: This feature is unstable and may change in future kernel > + versions. 
> + > endif # TRANSPARENT_HUGEPAGE > > # simple helper to make the code a bit easier to read > diff --git a/mm/Makefile b/mm/Makefile > index 21abb3353550..f180332f2ad0 100644 > --- a/mm/Makefile > +++ b/mm/Makefile > @@ -99,6 +99,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o > obj-$(CONFIG_NUMA) += memory-tiers.o > obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o > obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o > +obj-$(CONFIG_BPF_GET_THP_ORDER) += huge_memory_bpf.o > obj-$(CONFIG_PAGE_COUNTER) += page_counter.o > obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o > obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o > diff --git a/mm/huge_memory_bpf.c b/mm/huge_memory_bpf.c > new file mode 100644 > index 000000000000..525ee22ab598 > --- /dev/null > +++ b/mm/huge_memory_bpf.c > @@ -0,0 +1,243 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * BPF-based THP policy management > + * > + * Author: Yafang Shao <laoar.shao@xxxxxxxxx> > + */ > + > +#include <linux/bpf.h> > +#include <linux/btf.h> > +#include <linux/huge_mm.h> > +#include <linux/khugepaged.h> > + > +enum bpf_thp_vma_type { > + BPF_THP_VM_NONE = 0, > + BPF_THP_VM_HUGEPAGE, /* VM_HUGEPAGE */ > + BPF_THP_VM_NOHUGEPAGE, /* VM_NOHUGEPAGE */ > +}; I'm really not so sure how useful this is - can't a user just ascertain this from the VMA flags themselves? Let's keep the interface as minimal as possible. > + > +/** > + * @thp_order_fn_t: Get the suggested THP orders from a BPF program for allocation orders -> order? > + * @vma: vm_area_struct associated with the THP allocation > + * @vma_type: The VMA type, such as BPF_THP_VM_HUGEPAGE if VM_HUGEPAGE is set > + * BPF_THP_VM_NOHUGEPAGE if VM_NOHUGEPAGE is set, or BPF_THP_VM_NONE if > + * neither is set. Obv as above let's drop this probably :) > + * @tva_type: TVA type for current @vma > + * @orders: Bitmask of requested THP orders for this allocation Shouldn't requested = available? 
> + * - PMD-mapped allocation if PMD_ORDER is set > + * - mTHP allocation otherwise Not sure these 2 points are super useful. > + * > + * Return: The suggested THP order from the BPF program for allocation. It will > + * not exceed the highest requested order in @orders. Return -1 to > + * indicate that the original requested @orders should remain unchanged. > + */ > +typedef int thp_order_fn_t(struct vm_area_struct *vma, > + enum bpf_thp_vma_type vma_type, > + enum tva_type tva_type, > + unsigned long orders); > + > +struct bpf_thp_ops { > + thp_order_fn_t __rcu *thp_get_order; > +}; > + > +static struct bpf_thp_ops bpf_thp; > +static DEFINE_SPINLOCK(thp_ops_lock); > + > +/* > + * Returns the original @orders if no BPF program is attached or if the > + * suggested order is invalid. > + */ > +unsigned long bpf_hook_thp_get_orders(struct vm_area_struct *vma, > + vm_flags_t vma_flags, > + enum tva_type tva_type, > + unsigned long orders) > +{ > + thp_order_fn_t *bpf_hook_thp_get_order; > + unsigned long thp_orders = orders; > + enum bpf_thp_vma_type vma_type; > + int thp_order; > + > + /* No BPF program is attached */ > + if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, > + &transparent_hugepage_flags)) > + return orders; > + > + if (vma_flags & VM_HUGEPAGE) > + vma_type = BPF_THP_VM_HUGEPAGE; > + else if (vma_flags & VM_NOHUGEPAGE) > + vma_type = BPF_THP_VM_NOHUGEPAGE; > + else > + vma_type = BPF_THP_VM_NONE; As per above, not sure this is all that useful. > + > + rcu_read_lock(); > + bpf_hook_thp_get_order = rcu_dereference(bpf_thp.thp_get_order); > + if (!bpf_hook_thp_get_order) > + goto out; > + > + thp_order = bpf_hook_thp_get_order(vma, vma_type, tva_type, orders); > + if (thp_order < 0) > + goto out; > + /* > + * The maximum requested order is determined by the callsite. 
E.g.: > + * - PMD-mapped THP uses PMD_ORDER > + * - mTHP uses (PMD_ORDER - 1) I don't think this is quite right, highest_order() figures out the highest set bit, so mTHP can be PMD_ORDER - 1 or less (in theory ofc). I think we can just replace this with something simpler like - 'depending on where the BPF hook is invoked, we check for either PMD order or mTHP orders (less than PMD order)' or something. > + * > + * We must respect this upper bound to avoid undefined behavior. So the > + * highest suggested order can't exceed the highest requested order. > + */ I think this sentence is also unnecessary. > + if (thp_order <= highest_order(orders)) > + thp_orders = BIT(thp_order); > + > +out: > + rcu_read_unlock(); > + return thp_orders; > +} > + > +static bool bpf_thp_ops_is_valid_access(int off, int size, > + enum bpf_access_type type, > + const struct bpf_prog *prog, > + struct bpf_insn_access_aux *info) > +{ > + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); > +} > + > +static const struct bpf_func_proto * > +bpf_thp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > +{ > + return bpf_base_func_proto(func_id, prog); > +} > + > +static const struct bpf_verifier_ops thp_bpf_verifier_ops = { > + .get_func_proto = bpf_thp_get_func_proto, > + .is_valid_access = bpf_thp_ops_is_valid_access, > +}; > + > +static int bpf_thp_init(struct btf *btf) > +{ > + return 0; > +} > + > +static int bpf_thp_check_member(const struct btf_type *t, > + const struct btf_member *member, > + const struct bpf_prog *prog) > +{ > + /* The call site operates under RCU protection. 
*/ > + if (prog->sleepable) > + return -EINVAL; > + return 0; > +} > + > +static int bpf_thp_init_member(const struct btf_type *t, > + const struct btf_member *member, > + void *kdata, const void *udata) > +{ > + return 0; > +} > + > +static int bpf_thp_reg(void *kdata, struct bpf_link *link) > +{ > + struct bpf_thp_ops *ops = kdata; > + > + spin_lock(&thp_ops_lock); > + if (test_and_set_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, > + &transparent_hugepage_flags)) { > + spin_unlock(&thp_ops_lock); > + return -EBUSY; > + } > + WARN_ON_ONCE(rcu_access_pointer(bpf_thp.thp_get_order)); > + rcu_assign_pointer(bpf_thp.thp_get_order, ops->thp_get_order); > + spin_unlock(&thp_ops_lock); > + return 0; > +} > + > +static void bpf_thp_unreg(void *kdata, struct bpf_link *link) > +{ > + thp_order_fn_t *old_fn; > + > + spin_lock(&thp_ops_lock); > + clear_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, &transparent_hugepage_flags); > + old_fn = rcu_replace_pointer(bpf_thp.thp_get_order, NULL, > + lockdep_is_held(&thp_ops_lock)); > + WARN_ON_ONCE(!old_fn); > + spin_unlock(&thp_ops_lock); > + > + synchronize_rcu(); > +} > + > +static int bpf_thp_update(void *kdata, void *old_kdata, struct bpf_link *link) > +{ > + thp_order_fn_t *old_fn, *new_fn; > + struct bpf_thp_ops *old = old_kdata; > + struct bpf_thp_ops *ops = kdata; > + int ret = 0; > + > + if (!ops || !old) > + return -EINVAL; > + > + spin_lock(&thp_ops_lock); > + /* The prog has aleady been removed. 
*/ > + if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, > + &transparent_hugepage_flags)) { > + ret = -ENOENT; > + goto out; > + } > + > + new_fn = rcu_dereference(ops->thp_get_order); > + old_fn = rcu_replace_pointer(bpf_thp.thp_get_order, new_fn, > + lockdep_is_held(&thp_ops_lock)); > + WARN_ON_ONCE(!old_fn || !new_fn); > + > +out: > + spin_unlock(&thp_ops_lock); > + if (!ret) > + synchronize_rcu(); > + return ret; > +} > + > +static int bpf_thp_validate(void *kdata) > +{ > + struct bpf_thp_ops *ops = kdata; > + > + if (!ops->thp_get_order) { > + pr_err("bpf_thp: required ops isn't implemented\n"); > + return -EINVAL; > + } > + return 0; > +} > + > +static int bpf_thp_get_order(struct vm_area_struct *vma, > + enum bpf_thp_vma_type vma_type, > + enum tva_type tva_type, > + unsigned long orders) > +{ > + return -1; > +} > + > +static struct bpf_thp_ops __bpf_thp_ops = { > + .thp_get_order = (thp_order_fn_t __rcu *)bpf_thp_get_order, > +}; > + > +static struct bpf_struct_ops bpf_bpf_thp_ops = { > + .verifier_ops = &thp_bpf_verifier_ops, > + .init = bpf_thp_init, > + .check_member = bpf_thp_check_member, > + .init_member = bpf_thp_init_member, > + .reg = bpf_thp_reg, > + .unreg = bpf_thp_unreg, > + .update = bpf_thp_update, > + .validate = bpf_thp_validate, > + .cfi_stubs = &__bpf_thp_ops, > + .owner = THIS_MODULE, > + .name = "bpf_thp_ops", > +}; > + > +static int __init bpf_thp_ops_init(void) > +{ > + int err; > + > + err = register_bpf_struct_ops(&bpf_bpf_thp_ops, bpf_thp_ops); > + if (err) > + pr_err("bpf_thp: Failed to register struct_ops (%d)\n", err); > + return err; > +} > +late_initcall(bpf_thp_ops_init); > -- > 2.47.3 >