On 8/11/2025 2:36 PM, Shivank Garg wrote: > Add dedicated inode structure (kvm_gmem_inode_info) and slab-allocated > inode cache for guest memory backing, similar to how shmem handles inodes. > > This adds the necessary allocation/destruction functions and prepares > for upcoming guest_memfd NUMA policy support changes. > > Signed-off-by: Shivank Garg <shivankg@xxxxxxx> > --- > virt/kvm/guest_memfd.c | 69 ++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 67 insertions(+), 2 deletions(-) > > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c > index 0e93323fc839..d9c23401e770 100644 > --- a/virt/kvm/guest_memfd.c > +++ b/virt/kvm/guest_memfd.c > @@ -17,6 +17,15 @@ struct kvm_gmem { > struct list_head entry; > }; > > +struct kvm_gmem_inode_info { > + struct inode vfs_inode; > +}; > + > +static inline struct kvm_gmem_inode_info *KVM_GMEM_I(struct inode *inode) > +{ > + return container_of(inode, struct kvm_gmem_inode_info, vfs_inode); > +} > + > /** > * folio_file_pfn - like folio_file_page, but return a pfn. > * @folio: The folio which contains this index. 
> @@ -389,13 +398,46 @@ static struct file_operations kvm_gmem_fops = { > .fallocate = kvm_gmem_fallocate, > }; > > +static struct kmem_cache *kvm_gmem_inode_cachep; > + > +static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) > +{ > + struct kvm_gmem_inode_info *info; > + > + info = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL); > + if (!info) > + return NULL; > + > + return &info->vfs_inode; > +} > + > +static void kvm_gmem_destroy_inode(struct inode *inode) > +{ > +} > + > +static void kvm_gmem_free_inode(struct inode *inode) > +{ > + kmem_cache_free(kvm_gmem_inode_cachep, KVM_GMEM_I(inode)); > +} > + > +static const struct super_operations kvm_gmem_super_operations = { > + .statfs = simple_statfs, > + .alloc_inode = kvm_gmem_alloc_inode, > + .destroy_inode = kvm_gmem_destroy_inode, > + .free_inode = kvm_gmem_free_inode, > +}; > + > static int kvm_gmem_init_fs_context(struct fs_context *fc) > { > + struct pseudo_fs_context *ctx; > + > if (!init_pseudo(fc, GUEST_MEMFD_MAGIC)) > return -ENOMEM; > > fc->s_iflags |= SB_I_NOEXEC; > fc->s_iflags |= SB_I_NODEV; > + ctx = fc->fs_private; > + ctx->ops = &kvm_gmem_super_operations; > > return 0; > } > @@ -417,17 +459,40 @@ static int kvm_gmem_init_mount(void) > return 0; > } > > +static void kvm_gmem_init_inode(void *foo) > +{ > + struct kvm_gmem_inode_info *info = foo; > + > + inode_init_once(&info->vfs_inode); > +} > + > int kvm_gmem_init(struct module *module) > { > - kvm_gmem_fops.owner = module; > + int ret; > + struct kmem_cache_args args = { > + .align = 0, > + .ctor = kvm_gmem_init_inode, > + }; > > - return kvm_gmem_init_mount(); > + kvm_gmem_fops.owner = module; > + kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache", > + sizeof(struct kvm_gmem_inode_info), > + &args, SLAB_ACCOUNT); > + if (!kvm_gmem_inode_cachep) > + return -ENOMEM; > + ret = kvm_gmem_init_mount(); > + if (ret) { > + kmem_cache_destroy(kvm_gmem_inode_cachep); > + return ret; > + } > + return 0; > } > > void 
kvm_gmem_exit(void) > { > kern_unmount(kvm_gmem_mnt); > kvm_gmem_mnt = NULL; > + kmem_cache_destroy(kvm_gmem_inode_cachep); > } While testing my code, I discovered a bug that occurs when unloading the kvm_amd module after a guest_memfd-backed VM has run. dmesg logs: [ 610.075763] ============================================================================= [ 610.083933] BUG kvm_gmem_inode_cache (Not tainted): Objects remaining on __kmem_cache_shutdown() [ 610.092711] ----------------------------------------------------------------------------- [ 610.102368] Object 0x000000008ee52a58 @offset=19200 [ 610.107247] Slab 0x000000004b1b088c objects=51 used=1 fp=0x000000007c55fc00 flags=0x57ffffc0000240(workingset|head|node=1|zone=2|lastcpupid=0x1fffff) [ 610.120733] Disabling lock debugging due to kernel taint [ 610.120741] ------------[ cut here ]------------ [ 610.120742] WARNING: CPU: 7 PID: 7554 at mm/slub.c:1171 __kmem_cache_shutdown+0x264/0x370 [ 610.120751] Modules linked in: xt_set ip_set xt_addrtype xfrm_user xfrm_algo xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT nf_reject_ipv4 nft_compat nff_defrag_ipv4 nf_tables overlay bridge stp llc cfg80211 rfkill binfmt_misc ipmi_ssif amd_atl intel_rapl_msr wmi_bmof intel_rapl_common amd64_edac edac_mce_amdmem_helper drm_kms_helper i2c_piix4 ptdma i2c_smbus k10temp wmi acpi_power_meter ipmi_si acpi_ipmi ipmi_devintf ipmi_msghandler sg dm_multipath fuse drm dm_mo56 async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 sd_mod kvm_amd(-) ahci libahci kvm nvme tg3 libata ccp irqbypass nvme_c [ 610.120831] CPU: 7 UID: 0 PID: 7554 Comm: rmmod Kdump: loaded Tainted: G B 6.16.0+ #10 PREEMPT(none) [ 610.120835] Tainted: [B]=BAD_PAGE [ 610.120836] Hardware name: Dell Inc. 
PowerEdge R6525/024PW1, BIOS 2.16.2 07/09/2024 [ 610.120838] RIP: 0010:__kmem_cache_shutdown+0x264/0x370 [ 610.120841] Code: 89 f1 4c 89 f6 4d 8b 46 20 48 c7 c7 08 08 ec 87 81 e2 ff 7f 00 00 e8 fb a7 d7 ff be 01 00 00 00 bf 05 00 00 00 e8 dc e9 cd ff <0f> 0b 48 fe ff ff [ 610.120843] RSP: 0018:ffffcd6962963cb8 EFLAGS: 00010046 [ 610.120846] RAX: 0000000000000000 RBX: ffff89fde07d21c0 RCX: 0000000000000027 [ 610.120848] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff89fcbe5dbe80 [ 610.120850] RBP: ffff89fde07d21c0 R08: 0000000000000000 R09: 0000000000000003 [ 610.120851] R10: ffffcd6962963b58 R11: ffffffff889db908 R12: ffff89fdcccd7f80 [ 610.120852] R13: ffff89fdcccd0000 R14: fffff96802333400 R15: ffff89fdd6ab6c00 [ 610.120854] FS: 00007f066eaab080(0000) GS:ffff89fd3516f000(0000) knlGS:0000000000000000 [ 610.120856] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 610.120857] CR2: 00007ffefd577828 CR3: 0000000220406004 CR4: 0000000000770ef0 [ 610.120859] PKRU: 55555554 [ 610.120860] Call Trace: [ 610.120862] <TASK> [ 610.120866] kmem_cache_destroy+0x3a/0x150 [ 610.120872] kvm_exit+0x7b/0xa0 [kvm] [ 610.120919] svm_exit+0x5/0x10 [kvm_amd] [ 610.120926] __do_sys_delete_module.isra.0+0x18b/0x2e0 [ 610.120933] ? srso_alias_return_thunk+0x5/0xfbef5 [ 610.120937] ? syscall_trace_enter+0xfa/0x1a0 [ 610.120941] do_syscall_64+0x7b/0x2c0 [ 610.120947] ? srso_alias_return_thunk+0x5/0xfbef5 [ 610.120950] ? __handle_mm_fault+0x2aa/0x670 [ 610.120954] ? iterate_dir+0x11e/0x230 [ 610.120960] ? srso_alias_return_thunk+0x5/0xfbef5 [ 610.120963] ? count_memcg_events+0xb2/0x160 [ 610.120967] ? srso_alias_return_thunk+0x5/0xfbef5 [ 610.120969] ? handle_mm_fault+0xb2/0x2f0 [ 610.120972] ? srso_alias_return_thunk+0x5/0xfbef5 [ 610.120975] ? do_user_addr_fault+0x16f/0x6f0 [ 610.120981] ? 
srso_alias_return_thunk+0x5/0xfbef5 [ 610.120984] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 610.120986] RIP: 0033:0x7f066e12ac9b [ 610.120989] Code: 73 01 c3 48 8b 0d 7d 81 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 b0 00 00 00 0f 05 <48> 3d 01 89 01 48 [ 610.120990] RSP: 002b:00007ffc629f1878 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 [ 610.120993] RAX: ffffffffffffffda RBX: 00005630e80256f0 RCX: 00007f066e12ac9b [ 610.120994] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 00005630e8025758 [ 610.120996] RBP: 00007ffc629f18a0 R08: 1999999999999999 R09: 0000000000000000 [ 610.120997] R10: 00007f066e1b1fc0 R11: 0000000000000206 R12: 0000000000000000 [ 610.120999] R13: 00007ffc629f1af0 R14: 00005630e80256f0 R15: 0000000000000000 [ 610.121003] </TASK> [ 610.121004] ---[ end trace 0000000000000000 ]--- [ 610.121017] ------------[ cut here ]------------ There is a race condition here: kern_unmount() -> mntput() -> cleanup_mnt() -> deactivate_super() -> deactivate_locked_super() -> fs->kill_sb() (guest_memfd kill_sb) -> generic_shutdown_super() -> evict_inodes() -> destroy_inode() -> call_rcu() I should wait for the pending RCU callbacks to finish before calling kmem_cache_destroy(); otherwise an inode whose free was deferred via call_rcu() can still be allocated in the cache when it is destroyed, which is what the "Objects remaining" report above shows. To fix this, I added rcu_barrier(), as dax_fs_exit() does: @@ -561,6 +566,7 @@ void kvm_gmem_exit(void) { kern_unmount(kvm_gmem_mnt); kvm_gmem_mnt = NULL; + rcu_barrier(); kmem_cache_destroy(kvm_gmem_inode_cachep); } I'll incorporate this fix into the next version. Thanks, Shivank