On Mon, Aug 18, 2025 at 10:05 AM Roman Gushchin <roman.gushchin@xxxxxxxxx> wrote: > > Implement a pseudo-realistic test for the OOM handling > functionality. > > The OOM handling policy which is implemented in bpf is to > kill all tasks belonging to the biggest leaf cgroup, which > doesn't contain unkillable tasks (tasks with oom_score_adj > set to -1000). Pagecache size is excluded from the accounting. > > The test creates a hierarchy of memory cgroups, causes an > OOM at the top level, checks that the expected process will be > killed and checks memcg's oom statistics. > > Signed-off-by: Roman Gushchin <roman.gushchin@xxxxxxxxx> > --- > .../selftests/bpf/prog_tests/test_oom.c | 229 ++++++++++++++++++ > tools/testing/selftests/bpf/progs/test_oom.c | 108 +++++++++ > 2 files changed, 337 insertions(+) > create mode 100644 tools/testing/selftests/bpf/prog_tests/test_oom.c > create mode 100644 tools/testing/selftests/bpf/progs/test_oom.c > > diff --git a/tools/testing/selftests/bpf/prog_tests/test_oom.c b/tools/testing/selftests/bpf/prog_tests/test_oom.c > new file mode 100644 > index 000000000000..eaeb14a9d18f > --- /dev/null > +++ b/tools/testing/selftests/bpf/prog_tests/test_oom.c > @@ -0,0 +1,229 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +#include <test_progs.h> > +#include <bpf/btf.h> > +#include <bpf/bpf.h> > + > +#include "cgroup_helpers.h" > +#include "test_oom.skel.h" > + > +struct cgroup_desc { > + const char *path; > + int fd; > + unsigned long long id; > + int pid; > + size_t target; > + size_t max; > + int oom_score_adj; > + bool victim; > +}; > + > +#define MB (1024 * 1024) > +#define OOM_SCORE_ADJ_MIN (-1000) > +#define OOM_SCORE_ADJ_MAX 1000 > + > +static struct cgroup_desc cgroups[] = { > + { .path = "/oom_test", .max = 80 * MB}, > + { .path = "/oom_test/cg1", .target = 10 * MB, > + .oom_score_adj = OOM_SCORE_ADJ_MAX }, > + { .path = "/oom_test/cg2", .target = 40 * MB, > + .oom_score_adj = OOM_SCORE_ADJ_MIN }, > + { .path = "/oom_test/cg3" }, > 
+ { .path = "/oom_test/cg3/cg4", .target = 30 * MB, > + .victim = true }, > + { .path = "/oom_test/cg3/cg5", .target = 20 * MB }, > +}; > + > +static int spawn_task(struct cgroup_desc *desc) > +{ > + char *ptr; > + int pid; > + > + pid = fork(); > + if (pid < 0) > + return pid; > + > + if (pid > 0) { > + /* parent */ > + desc->pid = pid; > + return 0; > + } > + > + /* child */ > + if (desc->oom_score_adj) { > + char buf[64]; > + int fd = open("/proc/self/oom_score_adj", O_WRONLY); > + > + if (fd < 0) > + return -1; > + > + snprintf(buf, sizeof(buf), "%d", desc->oom_score_adj); > + write(fd, buf, sizeof(buf)); > + close(fd); > + } > + > + ptr = (char *)malloc(desc->target); > + if (!ptr) > + return -ENOMEM; > + > + memset(ptr, 'a', desc->target); > + > + while (1) > + sleep(1000); > + > + return 0; > +} > + > +static void setup_environment(void) > +{ > + int i, err; > + > + err = setup_cgroup_environment(); > + if (!ASSERT_OK(err, "setup_cgroup_environment")) > + goto cleanup; > + > + for (i = 0; i < ARRAY_SIZE(cgroups); i++) { > + cgroups[i].fd = create_and_get_cgroup(cgroups[i].path); > + if (!ASSERT_GE(cgroups[i].fd, 0, "create_and_get_cgroup")) > + goto cleanup; > + > + cgroups[i].id = get_cgroup_id(cgroups[i].path); > + if (!ASSERT_GT(cgroups[i].id, 0, "get_cgroup_id")) > + goto cleanup; > + > + /* Freeze the top-level cgroup */ > + if (i == 0) { > + /* Freeze the top-level cgroup */ > + err = write_cgroup_file(cgroups[i].path, "cgroup.freeze", "1"); > + if (!ASSERT_OK(err, "freeze cgroup")) > + goto cleanup; > + } > + > + /* Recursively enable the memory controller */ > + if (!cgroups[i].target) { > + > + err = write_cgroup_file(cgroups[i].path, "cgroup.subtree_control", > + "+memory"); > + if (!ASSERT_OK(err, "enable memory controller")) > + goto cleanup; > + } > + > + /* Set memory.max */ > + if (cgroups[i].max) { > + char buf[256]; > + > + snprintf(buf, sizeof(buf), "%lu", cgroups[i].max); > + err = write_cgroup_file(cgroups[i].path, "memory.max", buf); > + 
if (!ASSERT_OK(err, "set memory.max")) > + goto cleanup; > + > + snprintf(buf, sizeof(buf), "0"); > + write_cgroup_file(cgroups[i].path, "memory.swap.max", buf); > + > + } > + > + /* Spawn tasks creating memory pressure */ > + if (cgroups[i].target) { > + char buf[256]; > + > + err = spawn_task(&cgroups[i]); > + if (!ASSERT_OK(err, "spawn task")) > + goto cleanup; > + > + snprintf(buf, sizeof(buf), "%d", cgroups[i].pid); > + err = write_cgroup_file(cgroups[i].path, "cgroup.procs", buf); > + if (!ASSERT_OK(err, "put child into a cgroup")) > + goto cleanup; > + } > + } > + > + return; > + > +cleanup: > + cleanup_cgroup_environment(); > +} > + > +static int run_and_wait_for_oom(void) > +{ > + int ret = -1; > + bool first = true; > + char buf[4096] = {}; > + size_t size; > + > + /* Unfreeze the top-level cgroup */ > + ret = write_cgroup_file(cgroups[0].path, "cgroup.freeze", "0"); > + if (!ASSERT_OK(ret, "freeze cgroup")) > + return -1; > + > + for (;;) { > + int i, status; > + pid_t pid = wait(&status); > + > + if (pid == -1) { > + if (errno == EINTR) > + continue; > + /* ECHILD */ > + break; > + } > + > + if (!first) > + continue; > + > + first = false; > + > + /* Check which process was terminated first */ > + for (i = 0; i < ARRAY_SIZE(cgroups); i++) { > + if (!ASSERT_OK(cgroups[i].victim != > + (pid == cgroups[i].pid), > + "correct process was killed")) { > + ret = -1; > + break; > + } > + > + if (!cgroups[i].victim) > + continue; > + > + /* Check the memcg oom counter */ > + size = read_cgroup_file(cgroups[i].path, > + "memory.events", > + buf, sizeof(buf)); > + if (!ASSERT_OK(size <= 0, "read memory.events")) { > + ret = -1; > + break; > + } > + > + if (!ASSERT_OK(strstr(buf, "oom_kill 1") == NULL, > + "oom_kill count check")) { > + ret = -1; > + break; > + } > + } > + > + /* Kill all remaining tasks */ > + for (i = 0; i < ARRAY_SIZE(cgroups); i++) > + if (cgroups[i].pid && cgroups[i].pid != pid) > + kill(cgroups[i].pid, SIGKILL); > + } > + > + return ret; > +} 
> + > +void test_oom(void) > +{ > + struct test_oom *skel; > + int err; > + > + setup_environment(); > + > + skel = test_oom__open_and_load(); > + err = test_oom__attach(skel); > + if (CHECK_FAIL(err)) > + goto cleanup; > + > + /* Unfreeze all child tasks and create the memory pressure */ > + err = run_and_wait_for_oom(); > + CHECK_FAIL(err); > + > +cleanup: > + cleanup_cgroup_environment(); > + test_oom__destroy(skel); > +} > diff --git a/tools/testing/selftests/bpf/progs/test_oom.c b/tools/testing/selftests/bpf/progs/test_oom.c > new file mode 100644 > index 000000000000..ca83563fc9a8 > --- /dev/null > +++ b/tools/testing/selftests/bpf/progs/test_oom.c > @@ -0,0 +1,108 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +#include "vmlinux.h" > +#include <bpf/bpf_helpers.h> > +#include <bpf/bpf_tracing.h> > + > +char _license[] SEC("license") = "GPL"; > + > +#define OOM_SCORE_ADJ_MIN (-1000) > + > +void bpf_rcu_read_lock(void) __ksym; > +void bpf_rcu_read_unlock(void) __ksym; > +struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; > +void bpf_task_release(struct task_struct *p) __ksym; > +struct mem_cgroup *bpf_get_root_mem_cgroup(void) __ksym; > +struct mem_cgroup *bpf_get_mem_cgroup(struct cgroup_subsys_state *css) __ksym; > +void bpf_put_mem_cgroup(struct mem_cgroup *memcg) __ksym; > +int bpf_oom_kill_process(struct oom_control *oc, struct task_struct *task, > + const char *message__str) __ksym; These declarations should come from vmlinux.h; if you don't get them, you might not have a recent enough pahole. At the very least these should all be __ksym __weak, not just __ksym (though I'd rather not add them at all). [...]