Hello, I am hitting a kernel panic on the latest linux-next while running rcutorture tests, at commit 37ff6e9a2ce3 ("Add linux-next specific files for 20250502"). Reverting this patch fixes it: 3b2339eeb032 ("sched-numa-add-statistics-of-numa-balance-task-migration-v3") https://web.git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/kernel/sched/core.c?id=3b2339eeb032627e9329daf70a4ba8cd62c9cc8d Resolving the faulting RIP: $ ./scripts/faddr2line vmlinux __migrate_swap_task+0x2e/0x180 __migrate_swap_task+0x2e/0x180: count_memcg_events_mm at include/linux/memcontrol.h:987 (inlined by) count_memcg_events_mm at include/linux/memcontrol.h:978 (inlined by) __migrate_swap_task at kernel/sched/core.c:3356 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); mm->owner -> NULL Attaching kernel logs below: [ 1070.635450] rcu-torture: rcu_torture_read_exit: End of episode [ 1074.047617] BUG: kernel NULL pointer dereference, address: 0000000000000498 [ 1074.054577] #PF: supervisor read access in kernel mode [ 1074.059718] #PF: error_code(0x0000) - not-present page [ 1074.064856] PGD 0 P4D 0 [ 1074.067395] Oops: Oops: 0000 [#1] SMP NOPTI [ 1074.071583] CPU: 48 UID: 0 PID: 307 Comm: migration/48 Not tainted 6.15.0-rc4-next-20250502-37ff6e9a2ce3-1746413815614 #1 PREEMPT(voluntary) [ 1074.084258] Hardware name: Dell Inc. 
PowerEdge R6515/0R4CNN, BIOS 2.16.0 07/09/2024 [ 1074.091913] Stopper: multi_cpu_stop+0x0/0x130 <- migrate_swap+0xad/0x120 [ 1074.098619] RIP: 0010:__migrate_swap_task+0x2e/0x180 [ 1074.103585] Code: 00 55 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 63 de 48 83 87 a0 03 00 00 01 66 90 4c 8b af 50 09 00 00 e8 c2 47 07 00 <49> 8b bd 98 04 00 00 e8 26 11 36 00 48 89 c7 48 85 c0 74 0a be 3b [ 1074.122332] RSP: 0018:ffffa4bc4d54bdb0 EFLAGS: 00010002 [ 1074.127557] RAX: 0000000000000001 RBX: 0000000000000007 RCX: 0000000000000000 [ 1074.134688] RDX: ffff8d80c01fcec0 RSI: 0000000000000007 RDI: ffff8d2153c93480 [ 1074.141822] RBP: ffffa4bc4d54bdd8 R08: 000000fa1239fb41 R09: ffff8d9f3e832380 [ 1074.148955] R10: 0000000000000004 R11: 0000000000000001 R12: ffff8d2153c93480 [ 1074.156088] R13: 0000000000000000 R14: ffff8d60dc9ac14c R15: ffff8d2153c9414c [ 1074.163218] FS: 0000000000000000(0000) GS:ffff8d9f8a626000(0000) knlGS:0000000000000000 [ 1074.171306] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1074.177051] CR2: 000000000000049op+0x10/0x10 [ 1074.203665] cpu_stopper_thread+0xa6/0x160 [ 1074.207767] smpboot_thread_fn+0x122/0x280 [ 1074.211866] kthread+0x11a/0x230 [ 1074.215098] ? __pfx_smpboot_thread_fn+0x10/0x10 [ 1074.219717] ? _raw_spin_unlock_irq+0x28/0x50 [ 1074.224076] ? __pfx_kthread+0x10/0x10 [ 1074.227829] ret_from_fork+0x40/0x60 [ 1074.231407] ? 
__pfx_kthread+0x10/0x10 [ 1074.235161] ret_from_fork_asm+0x1a/0x30 [ 1074.239089] </TASK> [ 1074.241279] Modules linked in: rcutorture torture xt_tcpudp nft_compat nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nf_tables nfnetlink binfmt_misc ipmi_ssif nls_iso8859_1 intel_rapl_msr intel_rapl_common amd64_edac edac_mce_amd kvm_amd dell_smbios wmi_bmof kvm dell_wmi_descriptor dcdbas rapl ccp k10temp acpi_power_meter ptdma wmi ipmi_si acpi_ipmi ipmi_devintf ipmi_msghandler mac_hid sch_fq_codel dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr fuse efi_pstore ip_tables x_tables autofs4 btrfs blake2b_generic raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq raid1 raid0 mgag200 drm_client_lib i2c_algo_bit drm_shmem_helper drm_kms_helper nvme ghash_clmulni_intel drm tg3 mpt3sas nvme_core ahci bnxt_en i2c_piix4 raid_class libahci i2c_smbus scsi_transport_sas aesni_intel [last unloaded: torture] [ 1074.316817] CR2: 0000000000000498 [ 1074.320135] ---[ end trace 0000000000000000 ]--- [ 1074.418846] pstore: backend (erst) writing error (-28) [ 1074.423983] RIP: 0010:__migrate_swap_task+0x2e/0x180 [ 1074.428949] Code: 00 55 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 63 de 48 83 87 a0 03 00 00 01 66 90 4c 8b af 50 09 00 00 e8 c2 47 07 00 <49> 8b bd 98 04 00 00 e8 26 11 36 00 48 89 c7 48 85 c0 74 0a be 3b [ 1074.447694] RSP: 0018:ffffa4bc4d54bdb0 EFLAGS: 00010002 [ 1074.452919] RAX: 0000000000000001 RBX: 0000000000000007 RCX: 0000000000000000 [ 1074.460051] RDX: ffff8d80c01fcec0 RSI: 0000000000000007 RDI: ffff8d2153c93480 [ 1074.467184] RBP: ffffa4bc4d54bdd8 R08: 000000fa1239fb41 R09: ffff8d9f3e832380 [ 1074.474317] R10: 0000000000000004 R11: 0000000000000001 R12: ffff8d2153c93480 [ 1074.481450] R13: 0000000000000000 R14: ffff8d60dc9ac14c R15: ffff8d2153c9414c [ 1074.488581] FS: 0000000000000000(0000) GS:ffff8d9f8a626000(0000) knlGS:0000000000000000 [ 1074.496666] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 
1074.502414] CR2: 0000000000000498 CR3: 000000409341a002 CR4: 0000000000770ef0 [ 1074.509547] PKRU: 55555554 [ 1074.512258] note: migration/48[307] exited with irqs disabled [ 1084.683268] watchdog: CPU6: Watchdog detected hard LOCKUP on cpu 6 [ 1084.683274] Modules linked in: rcutorture torture xt_tcpudp nft_compat nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nf_tables nfnetlink binfmt_misc ipmi_ssif nls_iso8859_1 intel_rapl_msr intel_rapl_common amd64_edac edac_mce_amd kvm_amd dell_smbios wmi_bmof kvm dell_wmi_descriptor dcdbas rapl ccp k10temp acpi_power_meter ptdma wmi ipmi_si acpi_ipmi ipmi_devintf ipmi_msghandler mac_hid sch_fq_codel dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr fuse efi_pstore ip_tables x_tables autofs4 btrfs blake2b_generic raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq raid1 raid0 mgag200 drm_client_lib i2c_algo_bit drm_shmem_helper drm_kms_helper nvme ghash_clmulni_intel drm tg3 mpt3sas nvme_core ahci bnxt_en i2c_piix4 raid_class libahci i2c_smbus scsi_transport_sas aesni_intel [last unloaded: torture] [ 1084.683352] CPU: 6 UID: 0 PID: 83659 Comm: rcu_torture_rea Tainted: G D 6.15.0-rc4-next-20250502-37ff6e9a2ce3-1746413815614 #1 PREEMPT(voluntary) [ 1084.683357] Tainted: [D]=DIE [ 1084.683358] Hardware name: Dell Inc. 
PowerEdge R6515/0R4CNN, BIOS 2.16.0 07/09/2024 [ 1084.683360] RIP: 0010:native_queued_spin_lock_slowpath+0x2b4/0x300 [ 1084.683368] Code: 63 ff 4c 8d a8 c0 d1 20 b4 49 81 ff ff 1f 00 00 77 46 4e 03 2c fd e0 5e f7 b2 49 89 5d 00 8b 43 08 85 c0 75 09 f3 90 8b 43 08 <85> c0 74 f7 48 8b 13 48 85 d2 0f 84 5e ff ff ff 0f 0d 0a e9 56 ff [ 1084.683370] RSP: 0018:ffffa4bc6b503a28 EFLAGS: 00000046 [ 1084.683373] RAX: 0000000000000000 RBX: ffff8d403f9b31c0 RCX: 0000000000000008 [ 1084.683375] RDX: 0000000000000047 RSI: 00000000011c0100 RDI: ffff8d403f9f2280 [ 1084.683376] RBP: ffffa4bc6b503a50 R08: 0000000000000080 R09: ffffffffffffff00 [ 1084.683377] R10: 0000000000000000 R11: 0000000000000080 R12: ffff8d403f9f2280 [ 1084.683379] R13: ffff8d403fdb31c0 R14: 00000000001c0000 R15: 0000000000000046 [ 1084.683380] FS: 0000000000000000(0000) GS:ffff8d408b7a6000(0000) knlGS:0000000000000000 [ 1084.683382] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1084.683384] CR2: 00007f54f32a3010 CR3: 000000209f547008 CR4: 0000000000770ef0 [ 1084.683385] PKRU: 55555554 [ 1084.683387] Call Trace: [ 1084.683388] <TASK> [ 1084.683395] _raw_spin_lock+0x3c/0x50 [ 1084.683399] raw_spin_rq_lock_nested+0x28/0xa0 [ 1084.683404] _raw_spin_rq_lock_irqsave+0x29/0x60 [ 1084.683408] sched_balance_rq+0x6c8/0x1430 [ 1084.683412] ? srso_alias_return_thunk+0x5/0xfbef5 [ 1084.683422] sched_balance_newidle+0x1ba/0x450 [ 1084.683426] pick_next_task_fair+0x39/0x500 [ 1084.683429] ? srso_alias_return_thunk+0x5/0xfbef5 [ 1084.683431] ? dequeue_task_fair+0xb1/0x1b0 [ 1084.683433] ? srso_alias_return_thunk+0x5/0xfbef5 [ 1084.683436] __pick_next_task+0x43/0x1b0 [ 1084.683440] __schedule+0x20c/0x15b0 [ 1084.683443] ? trace_preempt_on+0x1f/0x70 [ 1084.683447] ? srso_alias_return_thunk+0x5/0xfbef5 [ 1084.683450] ? preempt_count_sub+0x50/0x80 [ 1084.683452] ? srso_alias_return_thunk+0x5/0xfbef5 [ 1084.683455] ? hrtimer_start_range_ns+0x137/0x4b0 [ 1084.683459] ? 
srso_alias_return_thunk+0x5/0xfbef5 [ 1084.683463] schedule+0x_us+0x23/0x30 [torture] [ 1084.683489] rcu_torture_reader+0x138/0x200 [rcutorture] [ 1084.683496] ? __pfx_rcu_torture_timer+0x10/0x10 [rcutorture] [ 1084.683503] kthread+0x11a/0x230 [ 1084.683507] ? __pfx_rcu_torture_reader+0x10/0x10 [rcutorture] [ 1084.683512] ? _raw_spin_unlock_irq+0x28/0x50 [ 1084.683516] ? __pfx_kthread+0x10/0x10 [ 1084.683519] ret_from_fork+0x40/0x60 [ 1084.683524] ? __pfx_kthread+0x10/0x10 [ 1084.683527] ret_from_fork_asm+0x1a/0x30 [ 1084.683535] </TASK> [ 1084.683537] Kernel panic - not syncing: Hard LOCKUP [ 1086.154471] Shutting down cpus with NMI [ 1086.169269] Kernel Offset: 0x30200000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 1086.583614] ---[ end Kernel panic - not syncing: Hard LOCKUP ]--- Steps to reproduce: 1. Load the rcutorture module on the machine 2. Toggle CPU status (online/offline) https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/generic/rcutorture.py Also reported at: https://lore.kernel.org/linux-next/8f746aa3-9ee6-45a8-84b1-da335be17c2e@xxxxxxx/T/#mc98b701dcd3667ff8a18de8581936ee257238884 Reported-by: Ayush Jain <Ayush.jain3@xxxxxxx> Let me know if more details are needed from my end. Thanks and Regards, Ayush Jain On 4/30/2025 4:06 PM, Chen Yu wrote: > On systems with NUMA balancing enabled, it is found that tracking > the task activities due to NUMA balancing is helpful. NUMA balancing > has two mechanisms for task migration: one is to migrate the task to > an idle CPU in its preferred node, the other is to swap tasks on > different nodes if they are on each other's preferred node. > > The kernel already has NUMA page migration statistics in > /sys/fs/cgroup/mytest/memory.stat and /proc/{PID}/sched, > but does not have statistics for task migration/swap. > Add the task migration and swap count accordingly. 
> > The following two new fields: > > numa_task_migrated > numa_task_swapped > > will be displayed in both > /sys/fs/cgroup/{GROUP}/memory.stat and /proc/{PID}/sched > > Introducing both per-task and per-memcg NUMA balancing statistics helps > to quickly evaluate the performance and resource usage of the target > workload. For example, the user can first identify the container which > has high NUMA balance activity and then narrow down to a specific task > within that group, and tune the memory policy of that task. > In summary, it is plausible to iterate the /proc/$pid/sched to find the > offending task, but the introduction of per memcg tasks' NUMA balancing > aggregated activity can further help users identify the task in a > divide-and-conquer way. > > Tested-by: K Prateek Nayak <kprateek.nayak@xxxxxxx> > Tested-by: Madadi Vineeth Reddy <vineethr@xxxxxxxxxxxxx> > Acked-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx> > Signed-off-by: Chen Yu <yu.c.chen@xxxxxxxxx> > --- > v2->v3: > Remove unnecessary p->mm check because kernel threads are > not supported by NUMA balancing. (Libo Chen) > v1->v2: > Update the Documentation/admin-guide/cgroup-v2.rst. (Michal) > --- > Documentation/admin-guide/cgroup-v2.rst | 6 ++++++ > include/linux/sched.h | 4 ++++ > include/linux/vm_event_item.h | 2 ++ > kernel/sched/core.c | 7 +++++-- > kernel/sched/debug.c | 4 ++++ > mm/memcontrol.c | 2 ++ > mm/vmstat.c | 2 ++ > 7 files changed, 25 insertions(+), 2 deletions(-) > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst > index 1a16ce68a4d7..d346f3235945 100644 > --- a/Documentation/admin-guide/cgroup-v2.rst > +++ b/Documentation/admin-guide/cgroup-v2.rst > @@ -1670,6 +1670,12 @@ The following nested keys are defined. > numa_hint_faults (npn) > Number of NUMA hinting faults. > > + numa_task_migrated (npn) > + Number of task migration by NUMA balancing. > + > + numa_task_swapped (npn) > + Number of task swap by NUMA balancing. 
> + > pgdemote_kswapd > Number of pages demoted by kswapd. > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index f96ac1982893..1c50e30b5c01 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -549,6 +549,10 @@ struct sched_statistics { > u64 nr_failed_migrations_running; > u64 nr_failed_migrations_hot; > u64 nr_forced_migrations; > +#ifdef CONFIG_NUMA_BALANCING > + u64 numa_task_migrated; > + u64 numa_task_swapped; > +#endif > > u64 nr_wakeups; > u64 nr_wakeups_sync; > diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h > index 9e15a088ba38..91a3ce9a2687 100644 > --- a/include/linux/vm_event_item.h > +++ b/include/linux/vm_event_item.h > @@ -66,6 +66,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, > NUMA_HINT_FAULTS, > NUMA_HINT_FAULTS_LOCAL, > NUMA_PAGE_MIGRATE, > + NUMA_TASK_MIGRATE, > + NUMA_TASK_SWAP, > #endif > #ifdef CONFIG_MIGRATION > PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index c81cf642dba0..25a92f2abda4 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -3352,6 +3352,9 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) > #ifdef CONFIG_NUMA_BALANCING > static void __migrate_swap_task(struct task_struct *p, int cpu) > { > + __schedstat_inc(p->stats.numa_task_swapped); > + count_memcg_events_mm(p->mm, NUMA_TASK_SWAP, 1); > + > if (task_on_rq_queued(p)) { > struct rq *src_rq, *dst_rq; > struct rq_flags srf, drf; > @@ -7953,8 +7956,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu) > if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) > return -EINVAL; > > - /* TODO: This is not properly updating schedstats */ > - > + __schedstat_inc(p->stats.numa_task_migrated); > + count_memcg_events_mm(p->mm, NUMA_TASK_MIGRATE, 1); > trace_sched_move_numa(p, curr_cpu, target_cpu); > return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); > } > diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c > index 
56ae54e0ce6a..f971c2af7912 100644 > --- a/kernel/sched/debug.c > +++ b/kernel/sched/debug.c > @@ -1206,6 +1206,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, > P_SCHEDSTAT(nr_failed_migrations_running); > P_SCHEDSTAT(nr_failed_migrations_hot); > P_SCHEDSTAT(nr_forced_migrations); > +#ifdef CONFIG_NUMA_BALANCING > + P_SCHEDSTAT(numa_task_migrated); > + P_SCHEDSTAT(numa_task_swapped); > +#endif > P_SCHEDSTAT(nr_wakeups); > P_SCHEDSTAT(nr_wakeups_sync); > P_SCHEDSTAT(nr_wakeups_migrate); > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index c96c1f2b9cf5..cdaab8a957f3 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -463,6 +463,8 @@ static const unsigned int memcg_vm_event_stat[] = { > NUMA_PAGE_MIGRATE, > NUMA_PTE_UPDATES, > NUMA_HINT_FAULTS, > + NUMA_TASK_MIGRATE, > + NUMA_TASK_SWAP, > #endif > }; > > diff --git a/mm/vmstat.c b/mm/vmstat.c > index 4c268ce39ff2..ed08bb384ae4 100644 > --- a/mm/vmstat.c > +++ b/mm/vmstat.c > @@ -1347,6 +1347,8 @@ const char * const vmstat_text[] = { > "numa_hint_faults", > "numa_hint_faults_local", > "numa_pages_migrated", > + "numa_task_migrated", > + "numa_task_swapped", > #endif > #ifdef CONFIG_MIGRATION > "pgmigrate_success",