On Tue, 24 Jun 2025, Li Lingfeng wrote:
> Hi,
>
> During my validation of this fix patch, I encountered a deadlock issue
> that wasn't present during last Saturday's verification.
> Process 2761 holds the nfsd_mutex while sending an upcall request,
> triggering process 2776.
> This process must complete writing to end_grace before responding to the
> upcall, but it gets blocked while attempting to acquire the nfsd_mutex,
> causing a deadlock.

This is harder to resolve than I hoped.

I think the possibility of the same deadlock is already present.
If nfs4_laundromat() calls nfsd4_end_grace() and gets to
nfsd4_record_grace_done() just as user-space sets the number of threads
to zero so that nfsd is stopped, that code will take nfsd_mutex and call
nfs4_state_shutdown_net(), which will call
        cancel_delayed_work_sync(&nn->laundromat_work);
which will wait for nfsd4_record_grace_done() while holding nfsd_mutex.
If grace_done waits for an upcall and nfsdcltrack ends up blocking on
nfsd_mutex, as I think your traces show, then we get the same deadlock.

I can think of two approaches to resolving this:
1/ change the upcall wait to abort when nfsd enters shutdown.
2/ cause the /proc/fs/nfsd/ files to fail without taking nfsd_mutex if
   shutdown is happening.

It might be easiest to use a timeout in the upcall wait and abort once
shutdown is noticed. So there could be a delay (1 second?) but not a
full deadlock.
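
Something along these lines is what I have in mind for the upcall side --
only an untested sketch, where nfsd_shutting_down() and the completion
argument are placeholders for whatever the real upcall path ends up using,
and the error code choice is arbitrary:

        /*
         * Sketch only -- not current nfsd code.  Poll the completion
         * roughly once a second instead of blocking indefinitely, and
         * give up once shutdown has started.
         */
        static int upcall_wait_abortable(struct completion *done)
        {
                while (!wait_for_completion_timeout(done, HZ)) {
                        if (nfsd_shutting_down())       /* placeholder */
                                return -ETIMEDOUT;
                }
                return 0;
        }

The caller would treat that error as a failed upcall, so a shutdown racing
with an upcall would cost roughly a second of delay rather than a deadlock.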

I'll explore some more.

NeilBrown

> I revalidated the previous solution and found the same issue.
> I'm unsure what changed between validations.
> Additionally, why was the locking mechanism changed from guard(mutex) in
> the previous version to explicit mutex_lock/mutex_unlock in this version?
>
> Thanks.
>
> Base:
> commit 86731a2a651e58953fc949573895f2fa6d456841 (tag: v6.16-rc3, origin/master, origin/HEAD)
> Author: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Date:   Sun Jun 22 13:30:08 2025 -0700
>
>     Linux 6.16-rc3
>
> Diff:
> diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> index 82785db730d9..0c6f0fbecc02 100644
> --- a/fs/nfsd/nfs4recover.c
> +++ b/fs/nfsd/nfs4recover.c
> @@ -1008,7 +1008,8 @@ __nfsd4_init_cld_pipe(struct net *net)
>          if (nn->cld_net)
>                  return 0;
>
> -        cn = kzalloc(sizeof(*cn), GFP_KERNEL);
> +        cn = NULL;//kzalloc(sizeof(*cn), GFP_KERNEL);
> +        printk("%s force err inject\n", __func__);
>          if (!cn) {
>                  ret = -ENOMEM;
>                  goto err;
> @@ -1478,6 +1479,7 @@ nfs4_cld_state_init(struct net *net)
>          nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE,
>                                                  sizeof(struct list_head),
>                                                  GFP_KERNEL);
> +        printk("%s get nn->reclaim_str_hashtbl %px\n", __func__, nn->reclaim_str_hashtbl);
>          if (!nn->reclaim_str_hashtbl)
>                  return -ENOMEM;
>
> @@ -1496,7 +1498,12 @@ nfs4_cld_state_shutdown(struct net *net)
>          struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>
>          nn->track_reclaim_completes = false;
> +        printk("%s free nn->reclaim_str_hashtbl %px\n", __func__, nn->reclaim_str_hashtbl);
>          kfree(nn->reclaim_str_hashtbl);
> +        printk("%s free nn->reclaim_str_hashtbl %px done\n", __func__, nn->reclaim_str_hashtbl);
> +        printk("%s sleep after free...\n", __func__);
> +        msleep(10 * 1000);
> +        printk("%s sleep done\n", __func__);
>  }
>
>  static bool
>
>
> CLIENT A:
> [root@nfs_test3 ~]# mount /dev/sdb /mnt/sdb
> [root@nfs_test3 ~]# echo "/mnt *(rw,no_root_squash,fsid=0)" > /etc/exports
> [root@nfs_test3 ~]# echo "/mnt/sdb *(rw,no_root_squash,fsid=1)" >> /etc/exports
> [root@nfs_test3 ~]# systemctl restart nfs-server
> [ 229.107168][ T2761] nfs4_cld_state_init get nn->reclaim_str_hashtbl ffff88810e20ba00
> [ 229.108175][ T2761] __nfsd4_init_cld_pipe force err inject
> [ 229.108819][ T2761] NFSD: unable to create nfsdcld upcall pipe (-12)
> [ 229.109568][ T2761] nfs4_cld_state_shutdown free nn->reclaim_str_hashtbl ffff88810e20ba00
> [ 229.110601][ T2761] nfs4_cld_state_shutdown free nn->reclaim_str_hashtbl ffff88810e20ba00 done
> [ 229.111599][ T2761] nfs4_cld_state_shutdown sleep after free...
> [ 239.282399][ T2761] nfs4_cld_state_shutdown sleep done
> [ 239.283083][ T2761] __nfsd4_init_cld_pipe force err inject
> [ 239.283734][ T2761] NFSD: unable to create nfsdcld upcall pipe (-12)
> [ 453.554502][ T101] INFO: task bash:2644 blocked for more than 147 seconds.
> [ 453.555494][ T101] Not tainted 6.16.0-rc3-00001-g4e8c356736df-dirty #84
> [ 453.556408][ T101] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 453.558022][ T101] INFO: task bash:2644 is blocked on a mutex likely owned by task rpc.nfsd:2761.
> [ 453.559949][ T101] INFO: task rpc.nfsd:2761 blocked for more than 147 seconds.
> [ 453.560868][ T101] Not tainted 6.16.0-rc3-00001-g4e8c356736df-dirty #84
> [ 453.561770][ T101] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 453.563573][ T101] INFO: task nfsdcltrack:2776 blocked for more than 147 seconds.
> [ 453.564516][ T101] Not tainted 6.16.0-rc3-00001-g4e8c356736df-dirty #84
> [ 453.565431][ T101] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 453.566945][ T101] INFO: task nfsdcltrack:2776 is blocked on a mutex likely owned by task rpc.nfsd:2761.
> [ 453.568860][ T101]
> [ 453.568860][ T101] Showing all locks held in the system:
> [ 453.569816][ T101] 1 lock held by khungtaskd/101:
> [ 453.570469][ T101] #0: ffffffffa6f66120 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire.constprop.0+0x7/0x30
> [ 453.571753][ T101] 2 locks held by kworker/u64:4/149:
> [ 453.572402][ T101] #0: ffff8881001a5948 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x72b/0x8a0
> [ 453.573758][ T101] #1: ffffc90000bcfd60 ((work_completion)(&sub_info->work)){+.+.}-{0:0}, at: process_one_work+0x72b/0x8a0
> [ 453.575172][ T101] 2 locks held by bash/2644:
> [ 453.575742][ T101] #0: ffff88810e204400 (sb_writers#12){.+.+}-{0:0}, at: ksys_write+0xc9/0x160
> [ 453.576856][ T101] #1: ffffffffa738e508 (nfsd_mutex){+.+.}-{4:4}, at: write_v4_end_grace+0x94/0x160
> [ 453.578005][ T101] 2 locks held by rpc.nfsd/2761:
> [ 453.578644][ T101] #0: ffff88810e204400 (sb_writers#12){.+.+}-{0:0}, at: ksys_write+0xc9/0x160
> [ 453.579750][ T101] #1: ffffffffa738e508 (nfsd_mutex){+.+.}-{4:4}, at: write_threads+0x14e/0x210
> [ 453.580871][ T101] 2 locks held by nfsdcltrack/2776:
> [ 453.581511][ T101] #0: ffff88810e204400 (sb_writers#12){.+.+}-{0:0}, at: ksys_write+0xc9/0x160
> [ 453.582644][ T101] #1: ffffffffa738e508 (nfsd_mutex){+.+.}-{4:4}, at: write_v4_end_grace+0x94/0x160
> [ 453.583793][ T101]
> [ 453.584085][ T101] =============================================
> [ 453.584085][ T101]
>
> CLIENT B:
> write /proc/fs/nfsd/v4_end_grace between "nfs4_cld_state_shutdown sleep after free..." and "nfs4_cld_state_shutdown sleep done"
> [root@nfs_test3 ~]# echo 1 > /proc/fs/nfsd/v4_end_grace
>
> Processes:
> [root@nfs_test3 ~]# ps aux | grep D
> USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
> root 378 0.1 0.0 97948 7008 ? Ss 10:04 0:00 /usr/sbin/sshd -D
> root 2644 1.3 0.0 127676 8656 pts/0 Ds+ 10:06 0:00 -bash
> root 2745 0.5 0.0 270324 3356 ? Ssl 10:06 0:00 /usr/sbin/gssproxy -D
> root 2761 0.9 0.0 33740 2808 ? Ds 10:06 0:00 /usr/sbin/rpc.nfsd -V 2 8
> root 2776 0.0 0.0 19272 3076 ? D 10:06 0:00 /sbin/nfsdcltrack init
> root 2885 0.0 0.0 119476 2320 pts/1 S+ 10:07 0:00 grep --color=auto D
> [root@nfs_test3 ~]# cat /proc/2644/stack
> [<0>] write_v4_end_grace+0x94/0x160
> [<0>] nfsctl_transaction_write+0x8b/0xf0
> [<0>] vfs_write+0x160/0x7f0
> [<0>] ksys_write+0xc9/0x160
> [<0>] do_syscall_64+0x72/0x390
> [<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [root@nfs_test3 ~]# cat /proc/2761/stack
> [<0>] call_usermodehelper_exec+0x283/0x2d0
> [<0>] nfsd4_umh_cltrack_upcall+0xfc/0x1a0
> [<0>] nfsd4_umh_cltrack_init+0x60/0x80
> [<0>] nfsd4_client_tracking_init+0x1a7/0x260
> [<0>] nfs4_state_start_net+0x63/0x90
> [<0>] nfsd_startup_net+0x261/0x320
> [<0>] nfsd_svc+0x103/0x1a0
> [<0>] write_threads+0x170/0x210
> [<0>] nfsctl_transaction_write+0x8b/0xf0
> [<0>] vfs_write+0x160/0x7f0
> [<0>] ksys_write+0xc9/0x160
> [<0>] do_syscall_64+0x72/0x390
> [<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [root@nfs_test3 ~]# cat /proc/2776/stack
> [<0>] write_v4_end_grace+0x94/0x160
> [<0>] nfsctl_transaction_write+0x8b/0xf0
> [<0>] vfs_write+0x160/0x7f0
> [<0>] ksys_write+0xc9/0x160
> [<0>] do_syscall_64+0x72/0x390
> [<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [root@nfs_test3 ~]#
>
>
> On 2025/6/23 10:47, NeilBrown wrote:
> > write_foo functions are called to handle IO to files in /proc/fs/nfsd/.
> > They can be called at any time and so generally need locking to ensure
> > they don't happen at an awkward time.
> >
> > Many already take nfsd_mutex and check if nfsd_serv has been set. This
> > ensures they only run when the server is fully configured.
> >
> > write_filehandle() does *not* need locking. It interacts with the
> > export table which is set up when the netns is set up, so it is always
> > valid and it has its own locking. write_filehandle() is needed before
> > the nfs server is started so checking nfsd_serv would be wrong.
> >
> > The remaining files which do not have any locking are
> > write_v4_end_grace(), write_unlock_ip(), and write_unlock_fs().
> > None of these make sense when the nfs server is not running and there is
> > evidence that write_v4_end_grace() can race with ->client_tracking_op
> > setup/shutdown and cause problems.
> >
> > This patch adds locking to these three and ensures the "unlock"
> > functions abort if ->nfsd_serv is not set.
> >
> > Reported-and-tested-by: Li Lingfeng <lilingfeng3@xxxxxxxxxx>
> > Signed-off-by: NeilBrown <neil@xxxxxxxxxx>
> > ---
> >  fs/nfsd/nfsctl.c | 40 +++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 37 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> > index 3f3e9f6c4250..6b0cad81b5fa 100644
> > --- a/fs/nfsd/nfsctl.c
> > +++ b/fs/nfsd/nfsctl.c
> > @@ -214,13 +214,18 @@ static inline struct net *netns(struct file *file)
> >   *            returns one if one or more locks were not released
> >   * On error:  return code is negative errno value
> >   */
> > -static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
> > +static ssize_t __write_unlock_ip(struct file *file, char *buf, size_t size)
> >  {
> >          struct sockaddr_storage address;
> >          struct sockaddr *sap = (struct sockaddr *)&address;
> >          size_t salen = sizeof(address);
> >          char *fo_path;
> >          struct net *net = netns(file);
> > +        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +
> > +        if (!nn->nfsd_serv)
> > +                /* There cannot be any files to unlock */
> > +                return -EINVAL;
> >
> >          /* sanity check */
> >          if (size == 0)
> > @@ -240,6 +245,16 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
> >          return nlmsvc_unlock_all_by_ip(sap);
> >  }
> >
> > +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
> > +{
> > +        ssize_t ret;
> > +
> > +        mutex_lock(&nfsd_mutex);
> > +        ret = __write_unlock_ip(file, buf, size);
> > +        mutex_unlock(&nfsd_mutex);
> > +        return ret;
> > +}
> > +
> >  /*
> >   * write_unlock_fs - Release all locks on a local file system
> >   *
> > @@ -254,11 +269,16 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
> >   *            returns one if one or more locks were not released
> >   * On error:  return code is negative errno value
> >   */
> > -static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
> > +static ssize_t __write_unlock_fs(struct file *file, char *buf, size_t size)
> >  {
> >          struct path path;
> >          char *fo_path;
> >          int error;
> > +        struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
> > +
> > +        if (!nn->nfsd_serv)
> > +                /* There cannot be any files to unlock */
> > +                return -EINVAL;
> >
> >          /* sanity check */
> >          if (size == 0)
> > @@ -291,6 +311,16 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
> >          return error;
> >  }
> >
> > +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
> > +{
> > +        ssize_t ret;
> > +
> > +        mutex_lock(&nfsd_mutex);
> > +        ret = __write_unlock_fs(file, buf, size);
> > +        mutex_unlock(&nfsd_mutex);
> > +        return ret;
> > +}
> > +
> >  /*
> >   * write_filehandle - Get a variable-length NFS file handle by path
> >   *
> > @@ -1082,10 +1112,14 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
> >          case 'Y':
> >          case 'y':
> >          case '1':
> > -                if (!nn->nfsd_serv)
> > +                mutex_lock(&nfsd_mutex);
> > +                if (!nn->nfsd_serv) {
> > +                        mutex_unlock(&nfsd_mutex);
> >                          return -EBUSY;
> > +                }
> >                  trace_nfsd_end_grace(netns(file));
> >                  nfsd4_end_grace(nn);
> > +                mutex_unlock(&nfsd_mutex);
> >                  break;
> >          default:
> >                  return -EINVAL;
>
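
(For reference on the guard(mutex) question above: with the scoped-guard
helper from <linux/cleanup.h>, one of these wrappers would look roughly
like this -- an illustrative sketch, not the actual earlier version:

        static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
        {
                /* nfsd_mutex is dropped automatically on any return path */
                guard(mutex)(&nfsd_mutex);

                return __write_unlock_ip(file, buf, size);
        }

Both forms take and release nfsd_mutex around the same region; the guard()
form simply cannot miss an unlock on an early return.)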