On 6/8/25 12:35 AM, Yafang Shao wrote:
A new bpf_thp struct ops is introduced to provide finer-grained control
over THP allocation policy. The struct ops includes two APIs for
determining the THP allocator and reclaimer behavior:
- THP allocator
int (*allocator)(unsigned long vm_flags, unsigned long tva_flags);
The BPF program returns THP_ALLOC_CURRENT, THP_ALLOC_KHUGEPAGED, both, or
0, indicating whether THP allocation should be performed synchronously
(current task), asynchronously (khugepaged), either way, or not at all.
The decision is based on the current task context, VMA flags, and TVA
flags.
- THP reclaimer
int (*reclaimer)(bool vma_madvised);
The BPF program returns RECLAIMER_CURRENT, RECLAIMER_KSWAPD, both, or 0,
determining whether memory reclamation is handled by the current task, by
kswapd, by both, or not attempted at all.
The decision depends on whether the VMA is madvised.
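
To make the interface concrete, a hypothetical BPF-side policy program for
this struct_ops could look like the sketch below (not part of this patch;
it assumes a vmlinux.h generated from a kernel carrying this series, and
the constants simply mirror the kernel-side definitions introduced here):

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Mirrors of the kernel-side values from this patch. */
#define THP_ALLOC_KHUGEPAGED	(1 << 1)
#define THP_ALLOC_CURRENT	(1 << 2)
#define RECLAIMER_CURRENT	(1 << 1)
#define RECLAIMER_KSWAPD	(1 << 2)

SEC("struct_ops/allocator")
int BPF_PROG(thp_allocator, unsigned long vm_flags, unsigned long tva_flags)
{
	/* Defer all THP allocation to khugepaged. */
	return THP_ALLOC_KHUGEPAGED;
}

SEC("struct_ops/reclaimer")
int BPF_PROG(thp_reclaimer, bool vma_madvised)
{
	/* Only wake kswapd for madvised VMAs; otherwise skip reclaim. */
	return vma_madvised ? RECLAIMER_KSWAPD : 0;
}

SEC(".struct_ops.link")
struct bpf_thp_ops thp_ops = {
	.allocator = (void *)thp_allocator,
	.reclaimer = (void *)thp_reclaimer,
};

char _license[] SEC("license") = "GPL";

Loading this through the standard struct_ops attach path ends up in
bpf_thp_reg() further down in the patch.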
Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
---
include/linux/huge_mm.h | 13 +--
mm/Makefile | 3 +
mm/bpf_thp.c | 184 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 190 insertions(+), 10 deletions(-)
create mode 100644 mm/bpf_thp.c
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 6a40ebf25f5c..0d02c9b56a85 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -54,6 +54,7 @@ enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_BPF_ATTACHED, /* BPF prog is attached */
};
struct kobject;
@@ -192,16 +193,8 @@ static inline bool hugepage_global_always(void)
#define THP_ALLOC_KHUGEPAGED (1 << 1)
#define THP_ALLOC_CURRENT (1 << 2)
-static inline int bpf_thp_allocator(unsigned long vm_flags,
- unsigned long tva_flags)
-{
- return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
-}
-
-static inline gfp_t bpf_thp_gfp_mask(bool vma_madvised)
-{
- return 0;
-}
+int bpf_thp_allocator(unsigned long vm_flags, unsigned long tva_flags);
+gfp_t bpf_thp_gfp_mask(bool vma_madvised);
static inline int highest_order(unsigned long orders)
{
diff --git a/mm/Makefile b/mm/Makefile
index 1a7a11d4933d..e5f41cf3fd61 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -99,6 +99,9 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
+ifdef CONFIG_BPF_SYSCALL
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += bpf_thp.o
+endif
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
diff --git a/mm/bpf_thp.c b/mm/bpf_thp.c
new file mode 100644
index 000000000000..894d6cb93107
--- /dev/null
+++ b/mm/bpf_thp.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/huge_mm.h>
+#include <linux/khugepaged.h>
+
+#define RECLAIMER_CURRENT (1 << 1)
+#define RECLAIMER_KSWAPD (1 << 2)
+#define RECLAIMER_BOTH (RECLAIMER_CURRENT | RECLAIMER_KSWAPD)
+
+struct bpf_thp_ops {
+ /**
+ * @allocator: Specifies whether the THP allocation is performed
+ * by the current task or by khugepaged.
+ * @vm_flags: Flags for the VMA in the current allocation context
+ * @tva_flags: Flags for the TVA in the current allocation context
+ *
+ * Return:
+ * - THP_ALLOC_CURRENT: allocate the THP synchronously, in the calling
+ * task's context.
+ * - THP_ALLOC_KHUGEPAGED: defer the allocation to the khugepaged
+ * kernel thread.
+ * - 0: THP allocation is disallowed in the current context.
+ */
+ int (*allocator)(unsigned long vm_flags, unsigned long tva_flags);
+ /**
+ * @reclaimer: Specifies the entity performing page reclaim:
+ * - current task context
+ * - kswapd
+ * - none (no reclaim)
+ * @vma_madvised: true if MADV_HUGEPAGE was applied to this VMA
+ *
+ * Return:
+ * - RECLAIMER_CURRENT: Direct reclaim by the current task if THP
+ * allocation fails.
+ * - RECLAIMER_KSWAPD: Wake kswapd to reclaim memory if THP allocation fails.
+ * - RECLAIMER_BOTH: Both the current task and kswapd will perform the reclaim.
+ * - 0: No reclaim will be attempted.
+ */
+ int (*reclaimer)(bool vma_madvised);
+};
+
+static struct bpf_thp_ops bpf_thp;
+
+int bpf_thp_allocator(unsigned long vm_flags, unsigned long tva_flags)
+{
+ int allocator;
+
+ /* No BPF program is attached */
+ if (!(transparent_hugepage_flags & (1<<TRANSPARENT_HUGEPAGE_BPF_ATTACHED)))
+ return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
+
+ if (current_is_khugepaged())
+ return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
+ if (!bpf_thp.allocator)
+ return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
+
+ allocator = bpf_thp.allocator(vm_flags, tva_flags);
+ if (!allocator)
+ return 0;
The check seems redundant. Is it?
+ /* invalid return value */
+ if (allocator & ~(THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT))
+ return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
+ return allocator;
+}
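
For context, a hypothetical call site (the real wiring presumably arrives in
a later patch of the series) could consult this helper in the fault path to
decide whether the faulting task may allocate the THP itself:

/* Illustrative only; the function name and placement are made up. */
static bool thp_fault_alloc_allowed(struct vm_area_struct *vma,
				    unsigned long tva_flags)
{
	int allocator = bpf_thp_allocator(vma->vm_flags, tva_flags);

	/*
	 * Allocate in the faulting task only if the policy permits it.
	 * THP_ALLOC_KHUGEPAGED alone leaves the work to khugepaged, and
	 * 0 disallows THP in this context entirely.
	 */
	return allocator & THP_ALLOC_CURRENT;
}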
+
+gfp_t bpf_thp_gfp_mask(bool vma_madvised)
+{
+ int reclaimer;
+
+ if (!(transparent_hugepage_flags & (1<<TRANSPARENT_HUGEPAGE_BPF_ATTACHED)))
+ return 0;
+
+ if (!bpf_thp.reclaimer)
+ return 0;
+
+ reclaimer = bpf_thp.reclaimer(vma_madvised);
+ switch (reclaimer) {
+ case RECLAIMER_CURRENT:
+ return GFP_TRANSHUGE | __GFP_NORETRY;
+ case RECLAIMER_KSWAPD:
+ return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+ case RECLAIMER_BOTH:
+ return GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM | __GFP_NORETRY;
+ default:
+ return 0;
+ }
+}
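
Similarly, a hypothetical consumer of bpf_thp_gfp_mask() (again illustrative
only; no call site is added in this patch) could treat a zero return as "no
BPF policy" and fall back to the existing heuristics:

/* Illustrative only. */
static gfp_t thp_fault_gfp_mask(struct vm_area_struct *vma)
{
	bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
	gfp_t bpf_mask = bpf_thp_gfp_mask(vma_madvised);

	/* 0 means no program attached or no preference expressed. */
	return bpf_mask ? bpf_mask : vma_thp_gfp_mask(vma);
}

(vma_thp_gfp_mask() stands in here for whatever default the caller already
uses.)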
+
+static bool bpf_thp_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static const struct bpf_func_proto *
+bpf_thp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id, prog);
+}
+
+static const struct bpf_verifier_ops thp_bpf_verifier_ops = {
+ .get_func_proto = bpf_thp_get_func_proto,
+ .is_valid_access = bpf_thp_ops_is_valid_access,
+};
+
+static int bpf_thp_reg(void *kdata, struct bpf_link *link)
+{
+ struct bpf_thp_ops *ops = kdata;
+
+ /* TODO: add support for multiple attaches */
+ if (test_and_set_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
+ &transparent_hugepage_flags))
+ return -EOPNOTSUPP;
I think returning -EBUSY if the struct_ops is already attached is a
better choice
+ bpf_thp.allocator = ops->allocator;
+ bpf_thp.reclaimer = ops->reclaimer;
+ return 0;
+}
+
+static void bpf_thp_unreg(void *kdata, struct bpf_link *link)
+{
+ clear_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, &transparent_hugepage_flags);
+ bpf_thp.allocator = NULL;
+ bpf_thp.reclaimer = NULL;
+}
+
+static int bpf_thp_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ return 0;
+}
+
[...]
+static int bpf_thp_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ return 0;
+}
+
+static int bpf_thp_init(struct btf *btf)
+{
+ return 0;
+}
+
+static int allocator(unsigned long vm_flags, unsigned long tva_flags)
+{
+ return 0;
+}
+
+static int reclaimer(bool vma_madvised)
+{
+ return 0;
+}
+
+static struct bpf_thp_ops __bpf_thp_ops = {
+ .allocator = allocator,
+ .reclaimer = reclaimer,
+};
+
+static struct bpf_struct_ops bpf_bpf_thp_ops = {
+ .verifier_ops = &thp_bpf_verifier_ops,
+ .init = bpf_thp_init,
+ .check_member = bpf_thp_check_member,
nit. check_member doesn't need to be defined if it does not do anything.
+ .init_member = bpf_thp_init_member,
+ .reg = bpf_thp_reg,
+ .unreg = bpf_thp_unreg,
+ .name = "bpf_thp_ops",
+ .cfi_stubs = &__bpf_thp_ops,
+ .owner = THIS_MODULE,
+};
+
+static int __init bpf_thp_ops_init(void)
+{
+ int err = register_bpf_struct_ops(&bpf_bpf_thp_ops, bpf_thp_ops);
+
+ if (err)
+ pr_err("bpf_thp: Failed to register struct_ops (%d)\n", err);
+ return err;
+}
+late_initcall(bpf_thp_ops_init);
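
From userspace, attaching a policy would follow the usual libbpf struct_ops
flow. A rough sketch, assuming a skeleton named thp_policy generated from the
BPF object sketched earlier (both names are made up):

#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>
#include "thp_policy.skel.h"	/* hypothetical skeleton */

int main(void)
{
	struct thp_policy *skel;
	struct bpf_link *link;

	skel = thp_policy__open_and_load();
	if (!skel) {
		fprintf(stderr, "failed to open/load BPF object\n");
		return 1;
	}

	/* Registering the bpf_thp_ops map triggers bpf_thp_reg() in the kernel. */
	link = bpf_map__attach_struct_ops(skel->maps.thp_ops);
	if (!link) {
		fprintf(stderr, "failed to attach bpf_thp_ops\n");
		thp_policy__destroy(skel);
		return 1;
	}

	/* The policy stays in effect while the link is held. */
	pause();

	bpf_link__destroy(link);
	thp_policy__destroy(skel);
	return 0;
}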