On Wed, Sep 10, 2025 at 10:44:39AM +0800, Yafang Shao wrote: > This patch introduces a new BPF struct_ops called bpf_thp_ops for dynamic > THP tuning. It includes a hook bpf_hook_thp_get_order(), allowing BPF > programs to influence THP order selection based on factors such as: > - Workload identity > For example, workloads running in specific containers or cgroups. > - Allocation context > Whether the allocation occurs during a page fault, khugepaged, swap or > other paths. > - VMA's memory advice settings > MADV_HUGEPAGE or MADV_NOHUGEPAGE > - Memory pressure > PSI system data or associated cgroup PSI metrics > > The kernel API of this new BPF hook is as follows, > > /** > * @thp_order_fn_t: Get the suggested THP orders from a BPF program for allocation > * @vma: vm_area_struct associated with the THP allocation > * @vma_type: The VMA type, such as BPF_THP_VM_HUGEPAGE if VM_HUGEPAGE is set > * BPF_THP_VM_NOHUGEPAGE if VM_NOHUGEPAGE is set, or BPF_THP_VM_NONE if > * neither is set. > * @tva_type: TVA type for current @vma > * @orders: Bitmask of requested THP orders for this allocation > * - PMD-mapped allocation if PMD_ORDER is set > * - mTHP allocation otherwise > * > * Return: The suggested THP order from the BPF program for allocation. It will > * not exceed the highest requested order in @orders. Return -1 to > * indicate that the original requested @orders should remain unchanged. > */ > typedef int thp_order_fn_t(struct vm_area_struct *vma, > enum bpf_thp_vma_type vma_type, > enum tva_type tva_type, > unsigned long orders); > > Only a single BPF program can be attached at any given time, though it can > be dynamically updated to adjust the policy. The implementation supports > anonymous THP, shmem THP, and mTHP, with future extensions planned for > file-backed THP. > > This functionality is only active when system-wide THP is configured to > madvise or always mode. It remains disabled in never mode. 
Additionally, > if THP is explicitly disabled for a specific task via prctl(), this BPF > functionality will also be unavailable for that task. > > This feature requires CONFIG_BPF_GET_THP_ORDER (marked EXPERIMENTAL) to be > enabled. Note that this capability is currently unstable and may undergo > significant changes—including potential removal—in future kernel versions. Thanks for highlighting. > > Suggested-by: David Hildenbrand <david@xxxxxxxxxx> > Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> > Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx> > --- > MAINTAINERS | 1 + > include/linux/huge_mm.h | 26 ++++- > mm/Kconfig | 12 ++ > mm/Makefile | 1 + > mm/huge_memory_bpf.c | 243 ++++++++++++++++++++++++++++++++++++++++ > 5 files changed, 280 insertions(+), 3 deletions(-) > create mode 100644 mm/huge_memory_bpf.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index 8fef05bc2224..d055a3c95300 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -16252,6 +16252,7 @@ F: include/linux/huge_mm.h > F: include/linux/khugepaged.h > F: include/trace/events/huge_memory.h > F: mm/huge_memory.c > +F: mm/huge_memory_bpf.c Thanks! 
> F: mm/khugepaged.c > F: mm/mm_slot.h > F: tools/testing/selftests/mm/khugepaged.c > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index 23f124493c47..f72a5fd04e4f 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -56,6 +56,7 @@ enum transparent_hugepage_flag { > TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, > TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, > TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, > + TRANSPARENT_HUGEPAGE_BPF_ATTACHED, /* BPF prog is attached */ > }; > > struct kobject; > @@ -270,6 +271,19 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, > enum tva_type type, > unsigned long orders); > > +#ifdef CONFIG_BPF_GET_THP_ORDER > +unsigned long > +bpf_hook_thp_get_orders(struct vm_area_struct *vma, vm_flags_t vma_flags, > + enum tva_type type, unsigned long orders); Thanks for renaming! > +#else > +static inline unsigned long > +bpf_hook_thp_get_orders(struct vm_area_struct *vma, vm_flags_t vma_flags, > + enum tva_type tva_flags, unsigned long orders) > +{ > + return orders; > +} > +#endif > + > /** > * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma > * @vma: the vm area to check > @@ -291,6 +305,12 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, > enum tva_type type, > unsigned long orders) > { > + unsigned long bpf_orders; > + > + bpf_orders = bpf_hook_thp_get_orders(vma, vm_flags, type, orders); > + if (!bpf_orders) > + return 0; I think it'd be easier to just do: /* The BPF-specified order overrides which order is selected. */ orders &= bpf_hook_thp_get_orders(vma, vm_flags, type, orders); if (!orders) return 0; > + > /* > * Optimization to check if required orders are enabled early. Only > * forced collapse ignores sysfs configs. 
> @@ -304,12 +324,12 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, > ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled())) > mask |= READ_ONCE(huge_anon_orders_inherit); > > - orders &= mask; > - if (!orders) > + bpf_orders &= mask; > + if (!bpf_orders) > return 0 With my suggested change this would remain the same. > } > > - return __thp_vma_allowable_orders(vma, vm_flags, type, orders); > + return __thp_vma_allowable_orders(vma, vm_flags, type, bpf_orders); With my suggested change this would remain the same. > } > > struct thpsize { > diff --git a/mm/Kconfig b/mm/Kconfig > index d1ed839ca710..4d89d2158f10 100644 > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -896,6 +896,18 @@ config NO_PAGE_MAPCOUNT > > EXPERIMENTAL because the impact of some changes is still unclear. > > +config BPF_GET_THP_ORDER Yeah, I think we maybe need to sledgehammer this as already Lance was confused as to the permanence of this, and I feel that users might be too, even with the '(EXPERIMENTAL)' bit. So maybe config BPF_GET_THP_ORDER_EXPERIMENTAL, just to hammer it home? > + bool "BPF-based THP order selection (EXPERIMENTAL)" > + depends on TRANSPARENT_HUGEPAGE && BPF_SYSCALL > + > + help > + Enable dynamic THP order selection using BPF programs. This > + experimental feature allows custom BPF logic to determine optimal > + transparent hugepage allocation sizes at runtime. > + > + WARNING: This feature is unstable and may change in future kernel > + versions. 
> + > endif # TRANSPARENT_HUGEPAGE > > # simple helper to make the code a bit easier to read > diff --git a/mm/Makefile b/mm/Makefile > index 21abb3353550..f180332f2ad0 100644 > --- a/mm/Makefile > +++ b/mm/Makefile > @@ -99,6 +99,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o > obj-$(CONFIG_NUMA) += memory-tiers.o > obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o > obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o > +obj-$(CONFIG_BPF_GET_THP_ORDER) += huge_memory_bpf.o > obj-$(CONFIG_PAGE_COUNTER) += page_counter.o > obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o > obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o > diff --git a/mm/huge_memory_bpf.c b/mm/huge_memory_bpf.c > new file mode 100644 > index 000000000000..525ee22ab598 > --- /dev/null > +++ b/mm/huge_memory_bpf.c > @@ -0,0 +1,243 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * BPF-based THP policy management > + * > + * Author: Yafang Shao <laoar.shao@xxxxxxxxx> > + */ > + > +#include <linux/bpf.h> > +#include <linux/btf.h> > +#include <linux/huge_mm.h> > +#include <linux/khugepaged.h> > + > +enum bpf_thp_vma_type { > + BPF_THP_VM_NONE = 0, > + BPF_THP_VM_HUGEPAGE, /* VM_HUGEPAGE */ > + BPF_THP_VM_NOHUGEPAGE, /* VM_NOHUGEPAGE */ > +}; I'm really not so sure how useful this is - can't a user just ascertain this from the VMA flags themselves? Let's keep the interface as minimal as possible. > + > +/** > + * @thp_order_fn_t: Get the suggested THP orders from a BPF program for allocation orders -> order? > + * @vma: vm_area_struct associated with the THP allocation > + * @vma_type: The VMA type, such as BPF_THP_VM_HUGEPAGE if VM_HUGEPAGE is set > + * BPF_THP_VM_NOHUGEPAGE if VM_NOHUGEPAGE is set, or BPF_THP_VM_NONE if > + * neither is set. Obv as above let's drop this probably :) > + * @tva_type: TVA type for current @vma > + * @orders: Bitmask of requested THP orders for this allocation Shouldn't requested = available? 
> + * - PMD-mapped allocation if PMD_ORDER is set > + * - mTHP allocation otherwise Not sure these 2 points are super useful. > + * > + * Return: The suggested THP order from the BPF program for allocation. It will > + * not exceed the highest requested order in @orders. Return -1 to > + * indicate that the original requested @orders should remain unchanged. > + */ > +typedef int thp_order_fn_t(struct vm_area_struct *vma, > + enum bpf_thp_vma_type vma_type, > + enum tva_type tva_type, > + unsigned long orders); > + > +struct bpf_thp_ops { > + thp_order_fn_t __rcu *thp_get_order; > +}; > + > +static struct bpf_thp_ops bpf_thp; > +static DEFINE_SPINLOCK(thp_ops_lock); > + > +/* > + * Returns the original @orders if no BPF program is attached or if the > + * suggested order is invalid. > + */ > +unsigned long bpf_hook_thp_get_orders(struct vm_area_struct *vma, > + vm_flags_t vma_flags, > + enum tva_type tva_type, > + unsigned long orders) > +{ > + thp_order_fn_t *bpf_hook_thp_get_order; > + unsigned long thp_orders = orders; > + enum bpf_thp_vma_type vma_type; > + int thp_order; > + > + /* No BPF program is attached */ > + if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, > + &transparent_hugepage_flags)) > + return orders; > + > + if (vma_flags & VM_HUGEPAGE) > + vma_type = BPF_THP_VM_HUGEPAGE; > + else if (vma_flags & VM_NOHUGEPAGE) > + vma_type = BPF_THP_VM_NOHUGEPAGE; > + else > + vma_type = BPF_THP_VM_NONE; As per above, not sure this is all that useful. > + > + rcu_read_lock(); > + bpf_hook_thp_get_order = rcu_dereference(bpf_thp.thp_get_order); > + if (!bpf_hook_thp_get_order) > + goto out; > + > + thp_order = bpf_hook_thp_get_order(vma, vma_type, tva_type, orders); > + if (thp_order < 0) > + goto out; > + /* > + * The maximum requested order is determined by the callsite. 
E.g.: > + * - PMD-mapped THP uses PMD_ORDER > + * - mTHP uses (PMD_ORDER - 1) I don't think this is quite right, highest_order() figures out the highest set bit, so mTHP can be PMD_ORDER - 1 or less (in theory ofc). I think we can just replace this with something simpler like - 'depending on where the BPF hook is invoked, we check for either PMD order or mTHP orders (less than PMD order)' or something. > + * > + * We must respect this upper bound to avoid undefined behavior. So the > + * highest suggested order can't exceed the highest requested order. > + */ I think this sentence is also unnecessary. > + if (thp_order <= highest_order(orders)) > + thp_orders = BIT(thp_order); > + > +out: > + rcu_read_unlock(); > + return thp_orders; > +} > + > +static bool bpf_thp_ops_is_valid_access(int off, int size, > + enum bpf_access_type type, > + const struct bpf_prog *prog, > + struct bpf_insn_access_aux *info) > +{ > + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); > +} > + > +static const struct bpf_func_proto * > +bpf_thp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > +{ > + return bpf_base_func_proto(func_id, prog); > +} > + > +static const struct bpf_verifier_ops thp_bpf_verifier_ops = { > + .get_func_proto = bpf_thp_get_func_proto, > + .is_valid_access = bpf_thp_ops_is_valid_access, > +}; > + > +static int bpf_thp_init(struct btf *btf) > +{ > + return 0; > +} > + > +static int bpf_thp_check_member(const struct btf_type *t, > + const struct btf_member *member, > + const struct bpf_prog *prog) > +{ > + /* The call site operates under RCU protection. 
*/ > + if (prog->sleepable) > + return -EINVAL; > + return 0; > +} > + > +static int bpf_thp_init_member(const struct btf_type *t, > + const struct btf_member *member, > + void *kdata, const void *udata) > +{ > + return 0; > +} > + > +static int bpf_thp_reg(void *kdata, struct bpf_link *link) > +{ > + struct bpf_thp_ops *ops = kdata; > + > + spin_lock(&thp_ops_lock); > + if (test_and_set_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, > + &transparent_hugepage_flags)) { > + spin_unlock(&thp_ops_lock); > + return -EBUSY; > + } > + WARN_ON_ONCE(rcu_access_pointer(bpf_thp.thp_get_order)); > + rcu_assign_pointer(bpf_thp.thp_get_order, ops->thp_get_order); > + spin_unlock(&thp_ops_lock); > + return 0; > +} > + > +static void bpf_thp_unreg(void *kdata, struct bpf_link *link) > +{ > + thp_order_fn_t *old_fn; > + > + spin_lock(&thp_ops_lock); > + clear_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, &transparent_hugepage_flags); > + old_fn = rcu_replace_pointer(bpf_thp.thp_get_order, NULL, > + lockdep_is_held(&thp_ops_lock)); > + WARN_ON_ONCE(!old_fn); > + spin_unlock(&thp_ops_lock); > + > + synchronize_rcu(); > +} > + > +static int bpf_thp_update(void *kdata, void *old_kdata, struct bpf_link *link) > +{ > + thp_order_fn_t *old_fn, *new_fn; > + struct bpf_thp_ops *old = old_kdata; > + struct bpf_thp_ops *ops = kdata; > + int ret = 0; > + > + if (!ops || !old) > + return -EINVAL; > + > + spin_lock(&thp_ops_lock); > + /* The prog has aleady been removed. 
*/ > + if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, > + &transparent_hugepage_flags)) { > + ret = -ENOENT; > + goto out; > + } > + > + new_fn = rcu_dereference(ops->thp_get_order); > + old_fn = rcu_replace_pointer(bpf_thp.thp_get_order, new_fn, > + lockdep_is_held(&thp_ops_lock)); > + WARN_ON_ONCE(!old_fn || !new_fn); > + > +out: > + spin_unlock(&thp_ops_lock); > + if (!ret) > + synchronize_rcu(); > + return ret; > +} > + > +static int bpf_thp_validate(void *kdata) > +{ > + struct bpf_thp_ops *ops = kdata; > + > + if (!ops->thp_get_order) { > + pr_err("bpf_thp: required ops isn't implemented\n"); > + return -EINVAL; > + } > + return 0; > +} > + > +static int bpf_thp_get_order(struct vm_area_struct *vma, > + enum bpf_thp_vma_type vma_type, > + enum tva_type tva_type, > + unsigned long orders) > +{ > + return -1; > +} > + > +static struct bpf_thp_ops __bpf_thp_ops = { > + .thp_get_order = (thp_order_fn_t __rcu *)bpf_thp_get_order, > +}; > + > +static struct bpf_struct_ops bpf_bpf_thp_ops = { > + .verifier_ops = &thp_bpf_verifier_ops, > + .init = bpf_thp_init, > + .check_member = bpf_thp_check_member, > + .init_member = bpf_thp_init_member, > + .reg = bpf_thp_reg, > + .unreg = bpf_thp_unreg, > + .update = bpf_thp_update, > + .validate = bpf_thp_validate, > + .cfi_stubs = &__bpf_thp_ops, > + .owner = THIS_MODULE, > + .name = "bpf_thp_ops", > +}; > + > +static int __init bpf_thp_ops_init(void) > +{ > + int err; > + > + err = register_bpf_struct_ops(&bpf_bpf_thp_ops, bpf_thp_ops); > + if (err) > + pr_err("bpf_thp: Failed to register struct_ops (%d)\n", err); > + return err; > +} > +late_initcall(bpf_thp_ops_init); > -- > 2.47.3 >