There are many places in the kernel where we need to zero out larger chunks, but the maximum segment we can zero out at a time via ZERO_PAGE is limited by PAGE_SIZE. This is especially annoying in block devices and filesystems where we attach multiple ZERO_PAGEs to the bio in different bvecs. With multipage bvec support in the block layer, it is much more efficient to send out larger zero pages as part of a single bvec. This concern was raised during the review of adding LBS support to XFS[1][2]. Usually huge_zero_folio is allocated on demand, and it will be deallocated by the shrinker if there are no users of it left. Add a config option STATIC_PMD_ZERO_PAGE that will always allocate the huge_zero_folio, and it will never be freed. This makes it possible to use the huge_zero_folio without having to pass any mm struct and without calling put_folio in the destructor. We can enable it by default for x86_64, where the PMD size is 2M. It is a good compromise between memory use and efficiency. As a THP zero page might be wasteful for architectures with bigger page sizes, let's not enable it for them. 
[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@xxxxxx/ [2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@xxxxxxxxxxxxx/ Suggested-by: David Hildenbrand <david@xxxxxxxxxx> Signed-off-by: Pankaj Raghav <p.raghav@xxxxxxxxxxx> --- arch/x86/Kconfig | 1 + mm/Kconfig | 12 ++++++++++++ mm/memory.c | 30 ++++++++++++++++++++++++++---- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 055204dc211d..96f99b4f96ea 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -152,6 +152,7 @@ config X86 select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64 select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 + select ARCH_WANTS_STATIC_PMD_ZERO_PAGE if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT select CLKEVT_I8253 diff --git a/mm/Kconfig b/mm/Kconfig index bd08e151fa1b..8f50f5c3f7a7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -826,6 +826,18 @@ config ARCH_WANTS_THP_SWAP config MM_ID def_bool n +config ARCH_WANTS_STATIC_PMD_ZERO_PAGE + bool + +config STATIC_PMD_ZERO_PAGE + def_bool y + depends on ARCH_WANTS_STATIC_PMD_ZERO_PAGE + help + Typically huge_zero_folio, which is a PMD page of zeroes, is allocated + on demand and deallocated when not in use. This option will always + allocate huge_zero_folio for zeroing and it is never deallocated. + Not suitable for memory constrained systems. + menuconfig TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT diff --git a/mm/memory.c b/mm/memory.c index 11edc4d66e74..ab8c16d04307 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -203,9 +203,17 @@ static void put_huge_zero_page(void) BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } +/* + * If STATIC_PMD_ZERO_PAGE is enabled, @mm can be NULL, i.e, the huge_zero_folio + * is not associated with any mm_struct. 
+*/ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { - if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + if (!IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE) && !mm) + return NULL; + + if (IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE) || + test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) return READ_ONCE(huge_zero_folio); if (!get_huge_zero_page()) @@ -219,6 +227,9 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) void mm_put_huge_zero_folio(struct mm_struct *mm) { + if (IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE)) + return; + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) put_huge_zero_page(); } @@ -246,15 +257,26 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, static int __init init_huge_zero_page(void) { + int ret = 0; + + if (IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE)) { + if (!get_huge_zero_page()) + ret = -ENOMEM; + goto out; + } + huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero"); - if (!huge_zero_page_shrinker) - return -ENOMEM; + if (!huge_zero_page_shrinker) { + ret = -ENOMEM; + goto out; + } huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count; huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan; shrinker_register(huge_zero_page_shrinker); - return 0; +out: + return ret; } early_initcall(init_huge_zero_page); -- 2.47.2