Re: [PATCH 1/4] nfsd: provide proper locking for all write_ function

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, 24 Jun 2025, Li Lingfeng wrote:
> Hi,
> 
> During my validation of this fix patch, I encountered a deadlock issue
> that wasn't present during last Saturday's verification.
> Process 2761 holds the nfsd_mutex while sending an upcall request,
> triggering process 2776.

Thanks for testing!  Holding a mutex while sending an upcall is
definitely bad.  We might have to drop and retake the mutex and add more
checks after the retake.  I'll have a look.

> This process must complete writing to end_grace before responding to the
> upcall, but it gets blocked while attempting to acquire the nfsd_mutex,
> causing a deadlock.
> I revalidated the previous solution and found the same issue.
> I'm unsure what changed between validations.
> Additionally, why was the locking mechanism changed from guard(mutex) in
> the previous version to explicit mutex_lock/mutex_unlock in this version?

I thought the mutex_lock/unlock version would be easier to backport.  We
haven't had guard() for all that long.  The behaviour should be the
same.

Thanks,
NeilBrown



> 
> Thanks.
> 
> Base:
> commit 86731a2a651e58953fc949573895f2fa6d456841 (tag: v6.16-rc3, 
> origin/master, origin/HEAD)
> Author: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Date:   Sun Jun 22 13:30:08 2025 -0700
> 
>      Linux 6.16-rc3
> 
> Diff:
> diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> index 82785db730d9..0c6f0fbecc02 100644
> --- a/fs/nfsd/nfs4recover.c
> +++ b/fs/nfsd/nfs4recover.c
> @@ -1008,7 +1008,8 @@ __nfsd4_init_cld_pipe(struct net *net)
>          if (nn->cld_net)
>                  return 0;
> 
> -       cn = kzalloc(sizeof(*cn), GFP_KERNEL);
> +       cn = NULL;//kzalloc(sizeof(*cn), GFP_KERNEL);
> +       printk("%s force err inject\n", __func__);
>          if (!cn) {
>                  ret = -ENOMEM;
>                  goto err;
> @@ -1478,6 +1479,7 @@ nfs4_cld_state_init(struct net *net)
>          nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE,
>                                                  sizeof(struct list_head),
>                                                  GFP_KERNEL);
> +       printk("%s get nn->reclaim_str_hashtbl %px\n", __func__, 
> nn->reclaim_str_hashtbl);
>          if (!nn->reclaim_str_hashtbl)
>                  return -ENOMEM;
> 
> @@ -1496,7 +1498,12 @@ nfs4_cld_state_shutdown(struct net *net)
>          struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> 
>          nn->track_reclaim_completes = false;
> +       printk("%s free nn->reclaim_str_hashtbl %px\n", __func__, 
> nn->reclaim_str_hashtbl);
>          kfree(nn->reclaim_str_hashtbl);
> +       printk("%s free nn->reclaim_str_hashtbl %px done\n", __func__, 
> nn->reclaim_str_hashtbl);
> +       printk("%s sleep after free...\n", __func__);
> +       msleep(10 * 1000);
> +       printk("%s sleep done\n", __func__);
>   }
> 
>   static bool
> 
> 
> CLIENT A:
> [root@nfs_test3 ~]# mount /dev/sdb /mnt/sdb
> [root@nfs_test3 ~]# echo "/mnt *(rw,no_root_squash,fsid=0)" > /etc/exports
> [root@nfs_test3 ~]# echo "/mnt/sdb *(rw,no_root_squash,fsid=1)" >> 
> /etc/exports
> [root@nfs_test3 ~]# systemctl restart nfs-server
> [  229.107168][ T2761] nfs4_cld_state_init get nn->reclaim_str_hashtbl 
> ffff88810e20ba00
> [  229.108175][ T2761] __nfsd4_init_cld_pipe force err inject
> [  229.108819][ T2761] NFSD: unable to create nfsdcld upcall pipe (-12)
> [  229.109568][ T2761] nfs4_cld_state_shutdown free 
> nn->reclaim_str_hashtbl ffff88810e20ba00
> [  229.110601][ T2761] nfs4_cld_state_shutdown free 
> nn->reclaim_str_hashtbl ffff88810e20ba00 done
> [  229.111599][ T2761] nfs4_cld_state_shutdown sleep after free...
> [  239.282399][ T2761] nfs4_cld_state_shutdown sleep done
> [  239.283083][ T2761] __nfsd4_init_cld_pipe force err inject
> [  239.283734][ T2761] NFSD: unable to create nfsdcld upcall pipe (-12)
> [  453.554502][  T101] INFO: task bash:2644 blocked for more than 147 
> seconds.
> [  453.555494][  T101]       Not tainted 
> 6.16.0-rc3-00001-g4e8c356736df-dirty #84
> [  453.556408][  T101] "echo 0 > 
> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  453.558022][  T101] INFO: task bash:2644 is blocked on a mutex likely 
> owned by task rpc.nfsd:2761.
> [  453.559949][  T101] INFO: task rpc.nfsd:2761 blocked for more than 
> 147 seconds.
> [  453.560868][  T101]       Not tainted 
> 6.16.0-rc3-00001-g4e8c356736df-dirty #84
> [  453.561770][  T101] "echo 0 > 
> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  453.563573][  T101] INFO: task nfsdcltrack:2776 blocked for more than 
> 147 seconds.
> [  453.564516][  T101]       Not tainted 
> 6.16.0-rc3-00001-g4e8c356736df-dirty #84
> [  453.565431][  T101] "echo 0 > 
> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  453.566945][  T101] INFO: task nfsdcltrack:2776 is blocked on a mutex 
> likely owned by task rpc.nfsd:2761.
> [  453.568860][  T101]
> [  453.568860][  T101] Showing all locks held in the system:
> [  453.569816][  T101] 1 lock held by khungtaskd/101:
> [  453.570469][  T101]  #0: ffffffffa6f66120 
> (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire.constprop.0+0x7/0x30
> [  453.571753][  T101] 2 locks held by kworker/u64:4/149:
> [  453.572402][  T101]  #0: ffff8881001a5948 
> ((wq_completion)events_unbound){+.+.}-{0:0}, at: 
> process_one_work+0x72b/0x8a0
> [  453.573758][  T101]  #1: ffffc90000bcfd60 
> ((work_completion)(&sub_info->work)){+.+.}-{0:0}, at: 
> process_one_work+0x72b/0x8a0
> [  453.575172][  T101] 2 locks held by bash/2644:
> [  453.575742][  T101]  #0: ffff88810e204400 
> (sb_writers#12){.+.+}-{0:0}, at: ksys_write+0xc9/0x160
> [  453.576856][  T101]  #1: ffffffffa738e508 (nfsd_mutex){+.+.}-{4:4}, 
> at: write_v4_end_grace+0x94/0x160
> [  453.578005][  T101] 2 locks held by rpc.nfsd/2761:
> [  453.578644][  T101]  #0: ffff88810e204400 
> (sb_writers#12){.+.+}-{0:0}, at: ksys_write+0xc9/0x160
> [  453.579750][  T101]  #1: ffffffffa738e508 (nfsd_mutex){+.+.}-{4:4}, 
> at: write_threads+0x14e/0x210
> [  453.580871][  T101] 2 locks held by nfsdcltrack/2776:
> [  453.581511][  T101]  #0: ffff88810e204400 
> (sb_writers#12){.+.+}-{0:0}, at: ksys_write+0xc9/0x160
> [  453.582644][  T101]  #1: ffffffffa738e508 (nfsd_mutex){+.+.}-{4:4}, 
> at: write_v4_end_grace+0x94/0x160
> [  453.583793][  T101]
> [  453.584085][  T101] =============================================
> [  453.584085][  T101]
> 
> CLIENT B:
> write /proc/fs/nfsd/v4_end_grace between "nfs4_cld_state_shutdown sleep 
> after free..." and "nfs4_cld_state_shutdown sleep done"
> [root@nfs_test3 ~]# echo 1 > /proc/fs/nfsd/v4_end_grace
> 
> Processes:
> [root@nfs_test3 ~]# ps aux | grep D
> USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
> root       378  0.1  0.0  97948  7008 ?        Ss   10:04   0:00 
> /usr/sbin/sshd -D
> root      2644  1.3  0.0 127676  8656 pts/0    Ds+  10:06   0:00 -bash
> root      2745  0.5  0.0 270324  3356 ?        Ssl  10:06   0:00 
> /usr/sbin/gssproxy -D
> root      2761  0.9  0.0  33740  2808 ?        Ds   10:06   0:00 
> /usr/sbin/rpc.nfsd -V 2 8
> root      2776  0.0  0.0  19272  3076 ?        D    10:06   0:00 
> /sbin/nfsdcltrack init
> root      2885  0.0  0.0 119476  2320 pts/1    S+   10:07   0:00 grep 
> --color=auto D
> [root@nfs_test3 ~]# cat /proc/2644/stack
> [<0>] write_v4_end_grace+0x94/0x160
> [<0>] nfsctl_transaction_write+0x8b/0xf0
> [<0>] vfs_write+0x160/0x7f0
> [<0>] ksys_write+0xc9/0x160
> [<0>] do_syscall_64+0x72/0x390
> [<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [root@nfs_test3 ~]# cat /proc/2761/stack
> [<0>] call_usermodehelper_exec+0x283/0x2d0
> [<0>] nfsd4_umh_cltrack_upcall+0xfc/0x1a0
> [<0>] nfsd4_umh_cltrack_init+0x60/0x80
> [<0>] nfsd4_client_tracking_init+0x1a7/0x260
> [<0>] nfs4_state_start_net+0x63/0x90
> [<0>] nfsd_startup_net+0x261/0x320
> [<0>] nfsd_svc+0x103/0x1a0
> [<0>] write_threads+0x170/0x210
> [<0>] nfsctl_transaction_write+0x8b/0xf0
> [<0>] vfs_write+0x160/0x7f0
> [<0>] ksys_write+0xc9/0x160
> [<0>] do_syscall_64+0x72/0x390
> [<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [root@nfs_test3 ~]# cat /proc/2776/stack
> [<0>] write_v4_end_grace+0x94/0x160
> [<0>] nfsctl_transaction_write+0x8b/0xf0
> [<0>] vfs_write+0x160/0x7f0
> [<0>] ksys_write+0xc9/0x160
> [<0>] do_syscall_64+0x72/0x390
> [<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [root@nfs_test3 ~]#
> 
> 
> 在 2025/6/23 10:47, NeilBrown 写道:
> > write_foo functions are called to handle IO to files in /proc/fs/nfsd/.
> > They can be called at any time and so generally need locking to ensure
> > they don't happen at an awkward time.
> >
> > Many already take nfsd_mutex and check if nfsd_serv has been set.  This
> > ensures they only run when the server is fully configured.
> >
> > write_filehandle() does *not* need locking.  It interacts with the
> > export table which is set up when the netns is set up, so it is always
> > valid and it has its own locking.  write_filehandle() is needed before
> > the nfs server is started so checking nfsd_serv would be wrong.
> >
> > The remaining files which do not have any locking are
> > write_v4_end_grace(), write_unlock_ip(), and write_unlock_fs().
> > None of these make sense when the nfs server is not running and there is
> > evidence that write_v4_end_grace() can race with ->client_tracking_op
> > setup/shutdown and cause problems.
> >
> > This patch adds locking to these three and ensures the "unlock"
> > functions abort if ->nfsd_serv is not set.
> >
> > Reported-and-tested-by: Li Lingfeng <lilingfeng3@xxxxxxxxxx>
> > Signed-off-by: NeilBrown <neil@xxxxxxxxxx>
> > ---
> >   fs/nfsd/nfsctl.c | 40 +++++++++++++++++++++++++++++++++++++---
> >   1 file changed, 37 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> > index 3f3e9f6c4250..6b0cad81b5fa 100644
> > --- a/fs/nfsd/nfsctl.c
> > +++ b/fs/nfsd/nfsctl.c
> > @@ -214,13 +214,18 @@ static inline struct net *netns(struct file *file)
> >    *			returns one if one or more locks were not released
> >    *	On error:	return code is negative errno value
> >    */
> > -static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
> > +static ssize_t __write_unlock_ip(struct file *file, char *buf, size_t size)
> >   {
> >   	struct sockaddr_storage address;
> >   	struct sockaddr *sap = (struct sockaddr *)&address;
> >   	size_t salen = sizeof(address);
> >   	char *fo_path;
> >   	struct net *net = netns(file);
> > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +
> > +	if (!nn->nfsd_serv)
> > +		/* There cannot be any files to unlock */
> > +		return -EINVAL;
> >   
> >   	/* sanity check */
> >   	if (size == 0)
> > @@ -240,6 +245,16 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
> >   	return nlmsvc_unlock_all_by_ip(sap);
> >   }
> >   
> > +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
> > +{
> > +	ssize_t ret;
> > +
> > +	mutex_lock(&nfsd_mutex);
> > +	ret = __write_unlock_ip(file, buf, size);
> > +	mutex_unlock(&nfsd_mutex);
> > +	return ret;
> > +}
> > +
> >   /*
> >    * write_unlock_fs - Release all locks on a local file system
> >    *
> > @@ -254,11 +269,16 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
> >    *			returns one if one or more locks were not released
> >    *	On error:	return code is negative errno value
> >    */
> > -static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
> > +static ssize_t __write_unlock_fs(struct file *file, char *buf, size_t size)
> >   {
> >   	struct path path;
> >   	char *fo_path;
> >   	int error;
> > +	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
> > +
> > +	if (!nn->nfsd_serv)
> > +		/* There cannot be any files to unlock */
> > +		return -EINVAL;
> >   
> >   	/* sanity check */
> >   	if (size == 0)
> > @@ -291,6 +311,16 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
> >   	return error;
> >   }
> >   
> > +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
> > +{
> > +	ssize_t ret;
> > +
> > +	mutex_lock(&nfsd_mutex);
> > +	ret = __write_unlock_fs(file, buf, size);
> > +	mutex_unlock(&nfsd_mutex);
> > +	return ret;
> > +}
> > +
> >   /*
> >    * write_filehandle - Get a variable-length NFS file handle by path
> >    *
> > @@ -1082,10 +1112,14 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
> >   		case 'Y':
> >   		case 'y':
> >   		case '1':
> > -			if (!nn->nfsd_serv)
> > +			mutex_lock(&nfsd_mutex);
> > +			if (!nn->nfsd_serv) {
> > +				mutex_unlock(&nfsd_mutex);
> >   				return -EBUSY;
> > +			}
> >   			trace_nfsd_end_grace(netns(file));
> >   			nfsd4_end_grace(nn);
> > +			mutex_unlock(&nfsd_mutex);
> >   			break;
> >   		default:
> >   			return -EINVAL;
> 






[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux