On Mon, Apr 14, 2025, James Houghton wrote: > By using MGLRU's debugfs for invoking test_young() and clear_young(), we > avoid page_idle's incompatibility with MGLRU, and we can mark pages as > idle (clear_young()) much faster. > > The ability to use page_idle is left in, as it is useful for kernels > that do not have MGLRU built in. If MGLRU is enabled but is not usable > (e.g. we can't access the debugfs mount), the test will fail, as > page_idle is not compatible with MGLRU. > > cgroup utility functions have been borrowed so that, when running with > MGLRU, we can create a memcg in which to run our test. > > Other MGLRU-debugfs-specific parsing code has been added to > lru_gen_util.{c,h}. This is not a proper changelog, at least not by upstream KVM standards. Please rewrite it describe the changes being made, using imperative mood/tone. From Documentation/process/maintainer-kvm-x86.rst: Changelog ~~~~~~~~~ Most importantly, write changelogs using imperative mood and avoid pronouns. > @@ -354,7 +459,12 @@ static int access_tracking_unreliable(void) > puts("Skipping idle page count sanity check, because NUMA balancing is enabled"); > return 1; > } > + return 0; > +} > > +int run_test_in_cg(const char *cgroup, void *arg) static > +{ > + for_each_guest_mode(run_test, arg); Having "separate" flows for MGLRU vs. page_idle is unnecessary. Give the helper a more common name and use it for both: static int run_test_for_each_guest_mode(const char *cgroup, void *arg) { for_each_guest_mode(run_test, arg); return 0; } > return 0; > } > > @@ -372,7 +482,7 @@ static void help(char *name) > printf(" -v: specify the number of vCPUs to run.\n"); > printf(" -o: Overlap guest memory accesses instead of partitioning\n" > " them into a separate region of memory for each vCPU.\n"); > - printf(" -w: Control whether the test warns or fails if more than 10%\n" > + printf(" -w: Control whether the test warns or fails if more than 10%%\n" > " of pages are still seen as idle/old after accessing guest\n" > " memory. >0 == warn only, 0 == fail, <0 == auto. For auto\n" > " mode, the test fails by default, but switches to warn only\n" > @@ -383,6 +493,12 @@ static void help(char *name) > exit(0); > } > > +void destroy_cgroup(char *cg) static. But this is a pointless wrapper, just delete it. > +{ > + printf("Destroying cgroup: %s\n", cg); > + cg_destroy(cg); > +} > + > int main(int argc, char *argv[]) > { > struct test_params params = { > @@ -390,6 +506,7 @@ int main(int argc, char *argv[]) > .vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE, > .nr_vcpus = 1, > }; > + char *new_cg = NULL; > int page_idle_fd; > int opt; > > @@ -424,15 +541,53 @@ int main(int argc, char *argv[]) > } > } > > - page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); > - __TEST_REQUIRE(page_idle_fd >= 0, > - "CONFIG_IDLE_PAGE_TRACKING is not enabled"); > - close(page_idle_fd); > + if (lru_gen_usable()) { Using MGLRU on my home box fails. It's full cgroup v2, and has both CONFIG_IDLE_PAGE_TRACKING=y and MGLRU enabled. ==== Test Assertion Failure ==== access_tracking_perf_test.c:244: false pid=114670 tid=114670 errno=17 - File exists 1 0x00000000004032a9: find_generation at access_tracking_perf_test.c:244 2 0x00000000004032da: lru_gen_mark_memory_idle at access_tracking_perf_test.c:272 3 0x00000000004034e4: mark_memory_idle at access_tracking_perf_test.c:391 4 (inlined by) run_test at access_tracking_perf_test.c:431 5 0x0000000000403d84: for_each_guest_mode at guest_modes.c:96 6 0x0000000000402c61: run_test_for_each_guest_mode at access_tracking_perf_test.c:492 7 0x000000000041d8e2: cg_run at cgroup_util.c:382 8 0x00000000004027fa: main at access_tracking_perf_test.c:572 9 0x00007fa1cb629d8f: ?? ??:0 10 0x00007fa1cb629e3f: ?? ??:0 11 0x00000000004029d4: _start at ??:? Could not find a generation with 90% of guest memory (235929 pages). Interestingly, if I force the test to use /sys/kernel/mm/page_idle/bitmap, it passes. Please try to reproduce the failure (assuming you haven't already tested that exact combination of cgroups v2, MGLRU=y, and CONFIG_IDLE_PAGE_TRACKING=y). I don't have bandwidth to dig any further at this time. > + if (cg_find_unified_root(cgroup_root, sizeof(cgroup_root), NULL)) > + ksft_exit_skip("cgroup v2 isn't mounted\n"); > + > + new_cg = cg_name(cgroup_root, TEST_MEMCG_NAME); > + printf("Creating cgroup: %s\n", new_cg); > + if (cg_create(new_cg) && errno != EEXIST) > + ksft_exit_skip("could not create new cgroup: %s\n", new_cg); > + > + use_lru_gen = true; > + } else { > + page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); > + __TEST_REQUIRE(page_idle_fd >= 0, > + "Couldn't open /sys/kernel/mm/page_idle/bitmap. " > + "Is CONFIG_IDLE_PAGE_TRACKING enabled?"); > + > + close(page_idle_fd); > + } Splitting the "check" and "execute" into separate if-else statements results in some compilers complaining about new_cg possibly being unused. The compiler is probably being a bit stupid, but the code is just as must to blame. There's zero reason to split this in two, just do everything after the idle_pages_warn_only and total_pages processing. Code at the bottom (note, you'll have to rebase on my not-yet-posted series, or undo the use of __open_path_or_exit()). > > if (idle_pages_warn_only == -1) > idle_pages_warn_only = access_tracking_unreliable(); > > - for_each_guest_mode(run_test, ¶ms); > + /* > + * If guest_page_size is larger than the host's page size, the > + * guest (memstress) will only fault in a subset of the host's pages. > + */ > + total_pages = params.nr_vcpus * params.vcpu_memory_bytes / > + max(memstress_args.guest_page_size, > + (uint64_t)getpagesize()); > + > + if (use_lru_gen) { > + int ret; > + > + puts("Using lru_gen for aging"); > + /* > + * This will fork off a new process to run the test within > + * a new memcg, so we need to properly propagate the return > + * value up. > + */ > + ret = cg_run(new_cg, &run_test_in_cg, ¶ms); > + destroy_cgroup(new_cg); > + if (ret) > + return ret; > + } else { > + puts("Using page_idle for aging"); > + for_each_guest_mode(run_test, ¶ms); > + } static int run_test_for_each_guest_mode(const char *cgroup, void *arg) { for_each_guest_mode(run_test, arg); return 0; } int main(int argc, char *argv[]) { struct test_params params = { .backing_src = DEFAULT_VM_MEM_SRC, .vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE, .nr_vcpus = 1, }; int page_idle_fd; int opt; guest_modes_append_default(); while ((opt = getopt(argc, argv, "hm:b:v:os:w:")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); break; case 'b': params.vcpu_memory_bytes = parse_size(optarg); break; case 'v': params.nr_vcpus = atoi_positive("Number of vCPUs", optarg); break; case 'o': overlap_memory_access = true; break; case 's': params.backing_src = parse_backing_src_type(optarg); break; case 'w': idle_pages_warn_only = atoi_non_negative("Idle pages warning", optarg); break; case 'h': default: help(argv[0]); break; } } if (idle_pages_warn_only == -1) idle_pages_warn_only = access_tracking_unreliable(); /* * If guest_page_size is larger than the host's page size, the * guest (memstress) will only fault in a subset of the host's pages. */ total_pages = params.nr_vcpus * params.vcpu_memory_bytes / max_t(uint64_t, memstress_args.guest_page_size, getpagesize()); if (lru_gen_usable()) { bool cg_created = true; char *test_cg = NULL; int ret; puts("Using lru_gen for aging"); use_lru_gen = true; if (cg_find_controller_root(cgroup_root, sizeof(cgroup_root), "memory")) ksft_exit_skip("Cannot find memory group controller\n"); test_cg = cg_name(cgroup_root, TEST_MEMCG_NAME); printf("Creating cgroup: %s\n", test_cg); if (cg_create(test_cg)) { if (errno == EEXIST) cg_created = false; else ksft_exit_skip("could not create new cgroup: %s\n", test_cg); } /* * This will fork off a new process to run the test within * a new memcg, so we need to properly propagate the return * value up. */ ret = cg_run(test_cg, &run_test_for_each_guest_mode, ¶ms); if (cg_created) cg_destroy(test_cg); return ret; } puts("Using page_idle for aging"); page_idle_fd = __open_path_or_exit("/sys/kernel/mm/page_idle/bitmap", O_RDWR, "Is CONFIG_IDLE_PAGE_TRACKING enabled?"); close(page_idle_fd); run_test_for_each_guest_mode(NULL, ¶ms); return 0; }