Re: [PATCH RFT] nfsd: provide locking for v4_end_grace

Li Lingfeng <lilingfeng3@xxxxxxxxxx> · Tue, 1 Jul 2025 14:44:57 +0800

Hi, Neil
I tested this patch, but unfortunately it did not resolve the issue.

I think the concurrency situation should be like this:
nfsd_svc
 nfsd_startup_net
  nfs4_state_start_net
   nfs4_state_create_net
    INIT_DELAYED_WORK
    // nn->laundromat_work has been initialized
   nfsd4_client_tracking_init
    nfsd4_cld_tracking_init
                                                write_v4_end_grace
                                                 nfsd4_end_grace
nfsd4_record_grace_done
 nfsd4_cld_grace_done
alloc_cld_upcall
                                                     cn = nn->cld_net
                                                     spin_lock // 
cn->cn_lock
                                                     // null-ptr-deref
     __nfsd4_init_cld_pipe
      cn = kzalloc
      nn->cld_net = cn

The problem I encountered is writing v4_end_grace and service startup 
concurrency.
This patch moves the immediate tasks of write_v4_end_grace to 
laundromat_work.
I believe it can prevent deadlock issues, but it does not actually 
address the
null pointer dereferences caused by concurrency.

Thanks,
Lingfeng.

Base:
commit 86731a2a651e58953fc949573895f2fa6d456841 (tag: v6.16-rc3,
origin/master, origin/HEAD)
Author: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Date:   Sun Jun 22 13:30:08 2025 -0700

     Linux 6.16-rc3

Diff:

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 82785db730d9..0c6f0fbecc02 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1008,7 +1008,8 @@ __nfsd4_init_cld_pipe(struct net *net)
        if (nn->cld_net)
                return 0;

-       cn = kzalloc(sizeof(*cn), GFP_KERNEL);
+       cn = NULL;//kzalloc(sizeof(*cn), GFP_KERNEL);
+       printk("%s force err inject\n", __func__);
        if (!cn) {
                ret = -ENOMEM;
                goto err;
@@ -1478,6 +1479,7 @@ nfs4_cld_state_init(struct net *net)
        nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE,
                                                sizeof(struct list_head),
                                                GFP_KERNEL);
+       printk("%s get nn->reclaim_str_hashtbl %px\n", __func__, 
nn->reclaim_str_hashtbl);
        if (!nn->reclaim_str_hashtbl)
                return -ENOMEM;

@@ -1496,7 +1498,12 @@ nfs4_cld_state_shutdown(struct net *net)
        struct nfsd_net *nn = net_generic(net, nfsd_net_id);

        nn->track_reclaim_completes = false;
+       printk("%s free nn->reclaim_str_hashtbl %px\n", __func__, 
nn->reclaim_str_hashtbl);
        kfree(nn->reclaim_str_hashtbl);
+       printk("%s free nn->reclaim_str_hashtbl %px done\n", __func__, 
nn->reclaim_str_hashtbl);
+       printk("%s sleep after free...\n", __func__);
+       msleep(10 * 1000);
+       printk("%s sleep done\n", __func__);
 }

 static bool

CLIENT A:
[root@nfs_test3 ~]# mount /dev/sdb /mnt/sdb
[root@nfs_test3 ~]# echo "/mnt *(rw,no_root_squash,fsid=0)" > /etc/exports
[root@nfs_test3 ~]# echo "/mnt/sdb *(rw,no_root_squash,fsid=1)" >> 
/etc/exports
[root@nfs_test3 ~]# systemctl restart nfs-server
[  142.103887][ T2762] nfs4_cld_state_init get nn->reclaim_str_hashtbl 
ffff88819366f000
[  142.104971][ T2762] __nfsd4_init_cld_pipe force err inject
[  142.105631][ T2762] NFSD: unable to create nfsdcld upcall pipe (-12)
[  142.106384][ T2762] nfs4_cld_state_shutdown free 
nn->reclaim_str_hashtbl ffff88819366f000
[  142.107366][ T2762] nfs4_cld_state_shutdown free 
nn->reclaim_str_hashtbl ffff88819366f000 done
[  142.108373][ T2762] nfs4_cld_state_shutdown sleep after free...
[  142.872490][  T149] Oops: general protection fault, probably for 
non-canonical address 0xdffffc0000000004: 0000 [#1] SMP KASAN PTI
[  142.874022][  T149] KASAN: null-ptr-deref in range 
[0x0000000000000020-0x0000000000000027]
[  142.875066][  T149] CPU: 10 UID: 0 PID: 149 Comm: kworker/u64:4 Not 
tainted 6.16.0-rc3-00001-g4c30cab673ec-dirty #87 PREEMPT(undef)
[  142.876545][  T149] Hardware name: QEMU Standard PC (i440FX + PIIX, 
1996), BIOS 1.16.3-2.fc40 04/01/2014
[  142.877725][  T149] Workqueue: nfsd4 laundromat_main
[  142.878381][  T149] RIP: 0010:kasan_byte_accessible+0x15/0x30
[  142.879112][  T149] Code: 00 00 0f 1f 00 90 90 90 90 90 90 90 90 90 
90 90 90 90 90 90 90 66 0f 1f 00 48 b8 00 00 00 00 00 fc ff df 48 c1 ef 
03 48 01 c7 <0f> b6 07 3c 07 0f 96 c00
[  142.881501][  T149] RSP: 0018:ffffc900015d7968 EFLAGS: 00010282
[  142.882251][  T149] RAX: dffffc0000000000 RBX: 0000000000000020 RCX: 
0000000000000000
[  142.883229][  T149] RDX: 0000000000000000 RSI: ffffffffa27a4620 RDI: 
dffffc0000000004
[  142.884216][  T149] RBP: 0000000000000020 R08: 0000000000000001 R09: 
0000000000000000
[  142.885195][  T149] R10: ffffffffa17ffd3a R11: ffffffffa0d66dbe R12: 
ffffffffa27a4620
[  142.886176][  T149] R13: 0000000000000001 R14: 0000000000000000 R15: 
0000000000000000
[  142.887150][  T149] FS:  0000000000000000(0000) 
GS:ffff888e52e1b000(0000) knlGS:0000000000000000
[  142.888246][  T149] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  142.889059][  T149] CR2: 0000557d474b9308 CR3: 0000000f2408e000 CR4: 
00000000000006f0
[  142.890029][  T149] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[  142.891008][  T149] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
0000000000000400
[  142.891977][  T149] Call Trace:
[  142.892390][  T149]  <TASK>
[  142.892753][  T149]  __kasan_check_byte+0x13/0x50
[  142.893363][  T149]  lock_acquire.part.0+0x36/0x220
[  142.893990][  T149]  ? rcu_is_watching+0x20/0x50
[  142.894592][  T149]  ? lock_acquire+0xf2/0x140
[  142.895162][  T149]  ? alloc_cld_upcall+0x6a/0x1e0
[  142.895776][  T149]  _raw_spin_lock+0x30/0x40
[  142.896339][  T149]  ? alloc_cld_upcall+0x6a/0x1e0
[  142.896948][  T149]  alloc_cld_upcall+0x6a/0x1e0
[  142.897543][  T149]  nfsd4_cld_grace_done+0x93/0x270
[  142.898174][  T149]  ? __pfx_nfsd4_cld_grace_done+0x10/0x10
[  142.898889][  T149]  ? find_held_lock+0x2b/0x80
[  142.899473][  T149]  ? nfs4_laundromat+0x8e/0xc30
[  142.900071][  T149]  ? nfs4_laundromat+0x8e/0xc30
[  142.900660][  T149]  ? mark_held_locks+0x40/0x70
[  142.901259][  T149]  nfsd4_end_grace.part.0+0x51/0x120
[  142.901913][  T149]  nfs4_laundromat+0x29a/0xc30
[  142.902519][  T149]  ? __pfx_nfs4_laundromat+0x10/0x10
[  142.903173][  T149]  ? __lock_release.isra.0+0x5f/0x180
[  142.903850][  T149]  ? lock_acquire.part.0+0xac/0x220
[  142.904493][  T149]  ? process_one_work+0x72b/0x8a0
[  142.905112][  T149]  ? rcu_is_watching+0x20/0x50
[  142.905708][  T149]  ? process_one_work+0x72b/0x8a0
[  142.906336][  T149]  laundromat_main+0x19/0x40
[  142.906903][  T149]  process_one_work+0x7ab/0x8a0
[  142.907509][  T149]  ? process_one_work+0x72b/0x8a0
[  142.908130][  T149]  ? process_one_work+0x72b/0x8a0
[  142.908764][  T149]  ? __pfx_process_one_work+0x10/0x10
[  142.909433][  T149]  ? __list_add_valid_or_report+0x37/0x100
[  142.910160][  T149]  worker_thread+0x390/0x690
[  142.910738][  T149]  ? __kthread_parkme+0xe8/0x110
[  142.911355][  T149]  ? __pfx_worker_thread+0x10/0x10
[  142.911990][  T149]  kthread+0x23c/0x3d0
[  142.912499][  T149]  ? kthread+0x12d/0x3d0
[  142.913024][  T149]  ? __pfx_kthread+0x10/0x10
[  142.913601][  T149]  ? ret_from_fork+0x1d/0x2b0
[  142.914180][  T149]  ? __lock_release.isra.0+0x5f/0x180
[  142.914853][  T149]  ? rcu_is_watching+0x20/0x50
[  142.915450][  T149]  ? __pfx_kthread+0x10/0x10
[  142.916018][  T149]  ret_from_fork+0x21e/0x2b0
[  142.916588][  T149]  ? __pfx_kthread+0x10/0x10
[  142.917167][  T149]  ret_from_fork_asm+0x1a/0x30
[  142.917766][  T149]  </TASK>
[  142.918146][  T149] Modules linked in:
[  142.918674][  T149] ---[ end trace 0000000000000000 ]---
[  142.919360][  T149] RIP: 0010:kasan_byte_accessible+0x15/0x30
[  142.920099][  T149] Code: 00 00 0f 1f 00 90 90 90 90 90 90 90 90 90 
90 90 90 90 90 90 90 66 0f 1f 00 48 b8 00 00 00 00 00 fc ff df 48 c1 ef 
03 48 01 c7 <0f> b6 07 3c 07 0f 96 c00
[  142.922522][  T149] RSP: 0018:ffffc900015d7968 EFLAGS: 00010282
[  142.923276][  T149] RAX: dffffc0000000000 RBX: 0000000000000020 RCX: 
0000000000000000
[  142.924271][  T149] RDX: 0000000000000000 RSI: ffffffffa27a4620 RDI: 
dffffc0000000004
[  142.925297][  T149] RBP: 0000000000000020 R08: 0000000000000001 R09: 
0000000000000000
[  142.926282][  T149] R10: ffffffffa17ffd3a R11: ffffffffa0d66dbe R12: 
ffffffffa27a4620
[  142.927280][  T149] R13: 0000000000000001 R14: 0000000000000000 R15: 
0000000000000000
[  142.928259][  T149] FS:  0000000000000000(0000) 
GS:ffff888e52e1b000(0000) knlGS:0000000000000000
[  142.929382][  T149] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  142.930198][  T149] CR2: 0000557d474b9308 CR3: 0000000f2408e000 CR4: 
00000000000006f0
[  142.931185][  T149] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[  142.932173][  T149] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
0000000000000400
[  142.933175][  T149] Kernel panic - not syncing: Fatal exception
[  142.934551][  T149] Kernel Offset: 0x1fa00000 from 0xffffffff81000000 
(relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[  142.935979][  T149] ---[ end Kernel panic - not syncing: Fatal 
exception ]---

CLIENT B:
write /proc/fs/nfsd/v4_end_grace between "nfs4_cld_state_shutdown sleep
after free..." and "nfs4_cld_state_shutdown sleep done"
[root@nfs_test3 ~]# echo 1 > /proc/fs/nfsd/v4_end_grace

在 2025/7/1 10:14, NeilBrown 写道:
Writing to v4_end_grace can race with server shutdown and result in
memory being accessed after it was freed - reclaim_str_hashtbl in
particular.

We cannot hold nfsd_mutex across the nfsd4_end_grace() call as that is
held while client_tracking_op->init() is called and that can wait for
an upcall to nfsdcltrack which can write to v4_end_grace, resulting in a
deadlock.

nfsd4_end_grace() is also called by the landromat work queue and this
doesn't require locking as server shutdown will stop the work and wait
for it before freeing anything that nfsd4_end_grace() might access.

However, we must be sure that writing to v4_end_grace doesn't restart
the work item after shutdown has already waited for it.  For this we can
use disable_delayed_work_sync() instead of cancel_delayed_work_sync().

So this patch adds a nfsd_net field "grace_end_forced", sets that when
v4_end_grace is written, and schedules the laundromat (providing it
hasn't been disabled).  This field bypasses other checks for whether the
grace period has finished.  The delayed work is disabled before
nfsd4_client_tracking_exit() is call to shutdown client tracking.

This resolves a race which can result in use-after-free.

Note that disable_delayed_work_sync() was added in v6.10.  To backport
to an earlier kernel without that interface the exclusion could be
provided by some spinlock that was released in the shutdown path
after ->nfsd_serv is set to NULL.  It would need to be taken before
the test on nfsd_serv in write_v4_end_grace() and released after
nfsd4_force_end_grace() is called.

Reported-by: Li Lingfeng <lilingfeng3@xxxxxxxxxx>
Cc: stable@xxxxxxxxxxxxxxx
Signed-off-by: NeilBrown <neil@xxxxxxxxxx>
---
  fs/nfsd/netns.h     |  1 +
  fs/nfsd/nfs4state.c | 19 ++++++++++++++++---
  fs/nfsd/nfsctl.c    |  7 ++++++-
  fs/nfsd/state.h     |  2 +-
  4 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 3e2d0fde80a7..d83c68872c4c 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -66,6 +66,7 @@ struct nfsd_net {
  
  	struct lock_manager nfsd4_manager;
  	bool grace_ended;
+	bool grace_end_forced;
  	time64_t boot_time;
  
  	struct dentry *nfsd_client_dir;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d5694987f86f..b34f157334e6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -84,7 +84,7 @@ static u64 current_sessionid = 1;
  /* forward declarations */
  static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
  static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
-void nfsd4_end_grace(struct nfsd_net *nn);
+static void nfsd4_end_grace(struct nfsd_net *nn);
  static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps);
  static void nfsd4_file_hash_remove(struct nfs4_file *fi);
  static void deleg_reaper(struct nfsd_net *nn);
@@ -6458,7 +6458,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
  	return nfs_ok;
  }
  
-void
+static void
  nfsd4_end_grace(struct nfsd_net *nn)
  {
  	/* do nothing if grace period already ended */
@@ -6491,6 +6491,16 @@ nfsd4_end_grace(struct nfsd_net *nn)
  	 */
  }
  
+void
+nfsd4_force_end_grace(struct nfsd_net *nn)
+{
+	nn->grace_end_forced = true;
+	/* This is a no-op after nfs4_state_shutdown_net() has called
+	 * disable_delayed_work_sync()
+	 */
+	mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+}
+
  /*
   * If we've waited a lease period but there are still clients trying to
   * reclaim, wait a little longer to give them a chance to finish.
@@ -6500,6 +6510,8 @@ static bool clients_still_reclaiming(struct nfsd_net *nn)
  	time64_t double_grace_period_end = nn->boot_time +
  					   2 * nn->nfsd4_lease;
  
+	if (nn->grace_end_forced)
+		return false;
  	if (nn->track_reclaim_completes &&
  			atomic_read(&nn->nr_reclaim_complete) ==
  			nn->reclaim_str_hashtbl_size)
@@ -8807,6 +8819,7 @@ static int nfs4_state_create_net(struct net *net)
  	nn->unconf_name_tree = RB_ROOT;
  	nn->boot_time = ktime_get_real_seconds();
  	nn->grace_ended = false;
+	nn->grace_end_forced = false;
  	nn->nfsd4_manager.block_opens = true;
  	INIT_LIST_HEAD(&nn->nfsd4_manager.list);
  	INIT_LIST_HEAD(&nn->client_lru);
@@ -8935,7 +8948,7 @@ nfs4_state_shutdown_net(struct net *net)
  
  	shrinker_free(nn->nfsd_client_shrinker);
  	cancel_work_sync(&nn->nfsd_shrinker_work);
-	cancel_delayed_work_sync(&nn->laundromat_work);
+	disable_delayed_work_sync(&nn->laundromat_work);
  	locks_end_grace(&nn->nfsd4_manager);
  
  	INIT_LIST_HEAD(&reaplist);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3f3e9f6c4250..a9e6c2a155da 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1082,10 +1082,15 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
  		case 'Y':
  		case 'y':
  		case '1':
+			/* This test ensures we don't try to
+			 * end grace before the server has been started,
+			 * but doesn't guarantee we don't end grace
+			 * while the server is being shut down.
+			 */
  			if (!nn->nfsd_serv)
  				return -EBUSY;
  			trace_nfsd_end_grace(netns(file));
-			nfsd4_end_grace(nn);
+			nfsd4_force_end_grace(nn);
  			break;
  		default:
  			return -EINVAL;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 1995bca158b8..e2bea9908fa8 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -836,7 +836,7 @@ static inline void nfsd4_revoke_states(struct net *net, struct super_block *sb)
  #endif
  
  /* grace period management */
-void nfsd4_end_grace(struct nfsd_net *nn);
+void nfsd4_force_end_grace(struct nfsd_net *nn);
  
  /* nfs4recover operations */
  extern int nfsd4_client_tracking_init(struct net *net);