Re: [PATCH 1/4] nfsd: provide proper locking for all write_ function

Li Lingfeng <lilingfeng3@xxxxxxxxxx> · Tue, 24 Jun 2025 10:52:08 +0800

Hi,

During my validation of this fix patch, I encountered a deadlock issue
that wasn't present during last Saturday's verification.
Process 2761 holds the nfsd_mutex while sending an upcall request,
triggering process 2776.
This process must complete writing to end_grace before responding to the
upcall, but it gets blocked while attempting to acquire the nfsd_mutex,
causing a deadlock.
I revalidated the previous solution and found the same issue.
I'm unsure what changed between validations.
Additionally, why was the locking mechanism changed from guard(mutex) in
the previous version to explicit mutex_lock/mutex_unlock in this version?

Thanks.

Base:
commit 86731a2a651e58953fc949573895f2fa6d456841 (tag: v6.16-rc3, 
origin/master, origin/HEAD)
Author: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Date:   Sun Jun 22 13:30:08 2025 -0700

    Linux 6.16-rc3

Diff:

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 82785db730d9..0c6f0fbecc02 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1008,7 +1008,8 @@ __nfsd4_init_cld_pipe(struct net *net)
        if (nn->cld_net)
                return 0;

-       cn = kzalloc(sizeof(*cn), GFP_KERNEL);
+       cn = NULL;//kzalloc(sizeof(*cn), GFP_KERNEL);
+       printk("%s force err inject\n", __func__);
        if (!cn) {
                ret = -ENOMEM;
                goto err;
@@ -1478,6 +1479,7 @@ nfs4_cld_state_init(struct net *net)
        nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE,
                                                sizeof(struct list_head),
                                                GFP_KERNEL);
+       printk("%s get nn->reclaim_str_hashtbl %px\n", __func__, 
nn->reclaim_str_hashtbl);
        if (!nn->reclaim_str_hashtbl)
                return -ENOMEM;

@@ -1496,7 +1498,12 @@ nfs4_cld_state_shutdown(struct net *net)
        struct nfsd_net *nn = net_generic(net, nfsd_net_id);

        nn->track_reclaim_completes = false;
+       printk("%s free nn->reclaim_str_hashtbl %px\n", __func__, 
nn->reclaim_str_hashtbl);
        kfree(nn->reclaim_str_hashtbl);
+       printk("%s free nn->reclaim_str_hashtbl %px done\n", __func__, 
nn->reclaim_str_hashtbl);
+       printk("%s sleep after free...\n", __func__);
+       msleep(10 * 1000);
+       printk("%s sleep done\n", __func__);
 }

 static bool


CLIENT A:
[root@nfs_test3 ~]# mount /dev/sdb /mnt/sdb
[root@nfs_test3 ~]# echo "/mnt *(rw,no_root_squash,fsid=0)" > /etc/exports
[root@nfs_test3 ~]# echo "/mnt/sdb *(rw,no_root_squash,fsid=1)" >> 
/etc/exports
[root@nfs_test3 ~]# systemctl restart nfs-server
[  229.107168][ T2761] nfs4_cld_state_init get nn->reclaim_str_hashtbl 
ffff88810e20ba00
[  229.108175][ T2761] __nfsd4_init_cld_pipe force err inject
[  229.108819][ T2761] NFSD: unable to create nfsdcld upcall pipe (-12)
[  229.109568][ T2761] nfs4_cld_state_shutdown free 
nn->reclaim_str_hashtbl ffff88810e20ba00
[  229.110601][ T2761] nfs4_cld_state_shutdown free 
nn->reclaim_str_hashtbl ffff88810e20ba00 done
[  229.111599][ T2761] nfs4_cld_state_shutdown sleep after free...
[  239.282399][ T2761] nfs4_cld_state_shutdown sleep done
[  239.283083][ T2761] __nfsd4_init_cld_pipe force err inject
[  239.283734][ T2761] NFSD: unable to create nfsdcld upcall pipe (-12)
[  453.554502][  T101] INFO: task bash:2644 blocked for more than 147 
seconds.
[  453.555494][  T101]       Not tainted 
6.16.0-rc3-00001-g4e8c356736df-dirty #84
[  453.556408][  T101] "echo 0 > 
/proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  453.558022][  T101] INFO: task bash:2644 is blocked on a mutex likely 
owned by task rpc.nfsd:2761.
[  453.559949][  T101] INFO: task rpc.nfsd:2761 blocked for more than 
147 seconds.
[  453.560868][  T101]       Not tainted 
6.16.0-rc3-00001-g4e8c356736df-dirty #84
[  453.561770][  T101] "echo 0 > 
/proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  453.563573][  T101] INFO: task nfsdcltrack:2776 blocked for more than 
147 seconds.
[  453.564516][  T101]       Not tainted 
6.16.0-rc3-00001-g4e8c356736df-dirty #84
[  453.565431][  T101] "echo 0 > 
/proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  453.566945][  T101] INFO: task nfsdcltrack:2776 is blocked on a mutex 
likely owned by task rpc.nfsd:2761.
[  453.568860][  T101]
[  453.568860][  T101] Showing all locks held in the system:
[  453.569816][  T101] 1 lock held by khungtaskd/101:
[  453.570469][  T101]  #0: ffffffffa6f66120 
(rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire.constprop.0+0x7/0x30
[  453.571753][  T101] 2 locks held by kworker/u64:4/149:
[  453.572402][  T101]  #0: ffff8881001a5948 
((wq_completion)events_unbound){+.+.}-{0:0}, at: 
process_one_work+0x72b/0x8a0
[  453.573758][  T101]  #1: ffffc90000bcfd60 
((work_completion)(&sub_info->work)){+.+.}-{0:0}, at: 
process_one_work+0x72b/0x8a0
[  453.575172][  T101] 2 locks held by bash/2644:
[  453.575742][  T101]  #0: ffff88810e204400 
(sb_writers#12){.+.+}-{0:0}, at: ksys_write+0xc9/0x160
[  453.576856][  T101]  #1: ffffffffa738e508 (nfsd_mutex){+.+.}-{4:4}, 
at: write_v4_end_grace+0x94/0x160
[  453.578005][  T101] 2 locks held by rpc.nfsd/2761:
[  453.578644][  T101]  #0: ffff88810e204400 
(sb_writers#12){.+.+}-{0:0}, at: ksys_write+0xc9/0x160
[  453.579750][  T101]  #1: ffffffffa738e508 (nfsd_mutex){+.+.}-{4:4}, 
at: write_threads+0x14e/0x210
[  453.580871][  T101] 2 locks held by nfsdcltrack/2776:
[  453.581511][  T101]  #0: ffff88810e204400 
(sb_writers#12){.+.+}-{0:0}, at: ksys_write+0xc9/0x160
[  453.582644][  T101]  #1: ffffffffa738e508 (nfsd_mutex){+.+.}-{4:4}, 
at: write_v4_end_grace+0x94/0x160
[  453.583793][  T101]
[  453.584085][  T101] =============================================
[  453.584085][  T101]

CLIENT B:
write /proc/fs/nfsd/v4_end_grace between "nfs4_cld_state_shutdown sleep 
after free..." and "nfs4_cld_state_shutdown sleep done"
[root@nfs_test3 ~]# echo 1 > /proc/fs/nfsd/v4_end_grace

Processes:
[root@nfs_test3 ~]# ps aux | grep D
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root       378  0.1  0.0  97948  7008 ?        Ss   10:04   0:00 
/usr/sbin/sshd -D
root      2644  1.3  0.0 127676  8656 pts/0    Ds+  10:06   0:00 -bash
root      2745  0.5  0.0 270324  3356 ?        Ssl  10:06   0:00 
/usr/sbin/gssproxy -D
root      2761  0.9  0.0  33740  2808 ?        Ds   10:06   0:00 
/usr/sbin/rpc.nfsd -V 2 8
root      2776  0.0  0.0  19272  3076 ?        D    10:06   0:00 
/sbin/nfsdcltrack init
root      2885  0.0  0.0 119476  2320 pts/1    S+   10:07   0:00 grep 
--color=auto D
[root@nfs_test3 ~]# cat /proc/2644/stack
[<0>] write_v4_end_grace+0x94/0x160
[<0>] nfsctl_transaction_write+0x8b/0xf0
[<0>] vfs_write+0x160/0x7f0
[<0>] ksys_write+0xc9/0x160
[<0>] do_syscall_64+0x72/0x390
[<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[root@nfs_test3 ~]# cat /proc/2761/stack
[<0>] call_usermodehelper_exec+0x283/0x2d0
[<0>] nfsd4_umh_cltrack_upcall+0xfc/0x1a0
[<0>] nfsd4_umh_cltrack_init+0x60/0x80
[<0>] nfsd4_client_tracking_init+0x1a7/0x260
[<0>] nfs4_state_start_net+0x63/0x90
[<0>] nfsd_startup_net+0x261/0x320
[<0>] nfsd_svc+0x103/0x1a0
[<0>] write_threads+0x170/0x210
[<0>] nfsctl_transaction_write+0x8b/0xf0
[<0>] vfs_write+0x160/0x7f0
[<0>] ksys_write+0xc9/0x160
[<0>] do_syscall_64+0x72/0x390
[<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[root@nfs_test3 ~]# cat /proc/2776/stack
[<0>] write_v4_end_grace+0x94/0x160
[<0>] nfsctl_transaction_write+0x8b/0xf0
[<0>] vfs_write+0x160/0x7f0
[<0>] ksys_write+0xc9/0x160
[<0>] do_syscall_64+0x72/0x390
[<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[root@nfs_test3 ~]#


在 2025/6/23 10:47, NeilBrown 写道:
write_foo functions are called to handle IO to files in /proc/fs/nfsd/.
They can be called at any time and so generally need locking to ensure
they don't happen at an awkward time.

Many already take nfsd_mutex and check if nfsd_serv has been set.  This
ensures they only run when the server is fully configured.

write_filehandle() does *not* need locking.  It interacts with the
export table which is set up when the netns is set up, so it is always
valid and it has its own locking.  write_filehandle() is needed before
the nfs server is started so checking nfsd_serv would be wrong.

The remaining files which do not have any locking are
write_v4_end_grace(), write_unlock_ip(), and write_unlock_fs().
None of these make sense when the nfs server is not running and there is
evidence that write_v4_end_grace() can race with ->client_tracking_op
setup/shutdown and cause problems.

This patch adds locking to these three and ensures the "unlock"
functions abort if ->nfsd_serv is not set.

Reported-and-tested-by: Li Lingfeng <lilingfeng3@xxxxxxxxxx>
Signed-off-by: NeilBrown <neil@xxxxxxxxxx>
---
  fs/nfsd/nfsctl.c | 40 +++++++++++++++++++++++++++++++++++++---
  1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3f3e9f6c4250..6b0cad81b5fa 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -214,13 +214,18 @@ static inline struct net *netns(struct file *file)
   *			returns one if one or more locks were not released
   *	On error:	return code is negative errno value
   */
-static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
+static ssize_t __write_unlock_ip(struct file *file, char *buf, size_t size)
  {
  	struct sockaddr_storage address;
  	struct sockaddr *sap = (struct sockaddr *)&address;
  	size_t salen = sizeof(address);
  	char *fo_path;
  	struct net *net = netns(file);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (!nn->nfsd_serv)
+		/* There cannot be any files to unlock */
+		return -EINVAL;
  
  	/* sanity check */
  	if (size == 0)
@@ -240,6 +245,16 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
  	return nlmsvc_unlock_all_by_ip(sap);
  }
  
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
+{
+	ssize_t ret;
+
+	mutex_lock(&nfsd_mutex);
+	ret = __write_unlock_ip(file, buf, size);
+	mutex_unlock(&nfsd_mutex);
+	return ret;
+}
+
  /*
   * write_unlock_fs - Release all locks on a local file system
   *
@@ -254,11 +269,16 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
   *			returns one if one or more locks were not released
   *	On error:	return code is negative errno value
   */
-static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
+static ssize_t __write_unlock_fs(struct file *file, char *buf, size_t size)
  {
  	struct path path;
  	char *fo_path;
  	int error;
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+
+	if (!nn->nfsd_serv)
+		/* There cannot be any files to unlock */
+		return -EINVAL;
  
  	/* sanity check */
  	if (size == 0)
@@ -291,6 +311,16 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
  	return error;
  }
  
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
+{
+	ssize_t ret;
+
+	mutex_lock(&nfsd_mutex);
+	ret = __write_unlock_fs(file, buf, size);
+	mutex_unlock(&nfsd_mutex);
+	return ret;
+}
+
  /*
   * write_filehandle - Get a variable-length NFS file handle by path
   *
@@ -1082,10 +1112,14 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
  		case 'Y':
  		case 'y':
  		case '1':
-			if (!nn->nfsd_serv)
+			mutex_lock(&nfsd_mutex);
+			if (!nn->nfsd_serv) {
+				mutex_unlock(&nfsd_mutex);
  				return -EBUSY;
+			}
  			trace_nfsd_end_grace(netns(file));
  			nfsd4_end_grace(nn);
+			mutex_unlock(&nfsd_mutex);
  			break;
  		default:
  			return -EINVAL;