[PATCH] [UNTESTED] fs/namespace: defer RCU sync for MNT_DETACH umount

Defer releasing the detached filesystem in namespace_unlock() during a
lazy umount so that the syscall returns faster.

When requesting MNT_DETACH, the caller does not expect the filesystem
to be shut down by the time the syscall returns. Calling
synchronize_rcu_expedited() has a significant cost on PREEMPT_RT
kernels, which default to rcupdate.rcu_normal_after_boot=1 and thereby
turn the expedited grace period into a normal, much slower one.
Instead, queue the detached struct mounts on a separate list and
release them from a workqueue after an RCU grace period has elapsed.
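
For reference, this builds on the generic rcu_work facility from
<linux/workqueue.h>: queue_rcu_work() waits for a normal
(non-expedited) RCU grace period and then schedules the work item on
the given workqueue. A minimal sketch of the pattern, with
illustrative names rather than the ones used in the patch below:

	#include <linux/slab.h>
	#include <linux/workqueue.h>

	struct deferred_release {
		struct rcu_work rwork;
		/* payload to be released after the grace period */
	};

	static void deferred_release_fn(struct work_struct *work)
	{
		struct deferred_release *d =
			container_of(to_rcu_work(work), struct deferred_release, rwork);

		/* all pre-existing RCU readers are guaranteed to be done */
		kfree(d);
	}

	static void schedule_deferred_release(struct deferred_release *d)
	{
		INIT_RCU_WORK(&d->rwork, deferred_release_fn);
		/* runs deferred_release_fn() after an RCU grace period */
		queue_rcu_work(system_unbound_wq, &d->rwork);
	}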

w/o patch, 6.15-rc1 PREEMPT_RT:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
    0.02455 +- 0.00107 seconds time elapsed  ( +-  4.36% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
    0.02555 +- 0.00114 seconds time elapsed  ( +-  4.46% )

w/ patch, 6.15-rc1 PREEMPT_RT:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
    0.026311 +- 0.000869 seconds time elapsed  ( +-  3.30% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
    0.003194 +- 0.000160 seconds time elapsed  ( +-  5.01% )

Signed-off-by: Alexander Larsson <alexl@xxxxxxxxxx>
Signed-off-by: Lucas Karpinski <lkarpins@xxxxxxxxxx>
Signed-off-by: Eric Chanudet <echanude@xxxxxxxxxx>
Link: https://lore.kernel.org/20250408210350.749901-12-echanude@xxxxxxxxxx
Not-Tested-by: Christian Brauner <brauner@xxxxxxxxxx>
Massaged-With-Great-Shame-by: Christian Brauner <brauner@xxxxxxxxxx>
---
 fs/namespace.c | 78 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 62 insertions(+), 16 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index bc23c0e1fb9d..c36debbc5135 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init;
 static unsigned int mp_hash_mask __ro_after_init;
 static unsigned int mp_hash_shift __ro_after_init;
 
+struct deferred_free_mounts {
+	struct rcu_work rwork;
+	struct hlist_head release_list;
+};
+
 static __initdata unsigned long mhash_entries;
 static int __init set_mhash_entries(char *str)
 {
@@ -77,8 +82,9 @@ static struct hlist_head *mount_hashtable __ro_after_init;
 static struct hlist_head *mountpoint_hashtable __ro_after_init;
 static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
-static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
-static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static bool defer_unmount;		/* protected by namespace_sem */
+static HLIST_HEAD(unmounted);		/* protected by namespace_sem */
+static LIST_HEAD(ex_mountpoints);	/* protected by namespace_sem */
 static DEFINE_SEQLOCK(mnt_ns_tree_lock);
 
 #ifdef CONFIG_FSNOTIFY
@@ -1412,7 +1418,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	return ERR_PTR(err);
 }
 
-static void cleanup_mnt(struct mount *mnt)
+static void __mntput_no_expire(struct mount *mnt, bool cleanup_sync);
+
+static void cleanup_mnt(struct mount *mnt, bool cleanup_sync)
 {
 	struct hlist_node *p;
 	struct mount *m;
@@ -1428,7 +1436,9 @@ static void cleanup_mnt(struct mount *mnt)
 		mnt_pin_kill(mnt);
 	hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
 		hlist_del(&m->mnt_umount);
-		mntput(&m->mnt);
+		if (unlikely(m->mnt_expiry_mark))
+			WRITE_ONCE(m->mnt_expiry_mark, 0);
+		__mntput_no_expire(m, cleanup_sync);
 	}
 	fsnotify_vfsmount_delete(&mnt->mnt);
 	dput(mnt->mnt.mnt_root);
@@ -1439,7 +1449,7 @@ static void cleanup_mnt(struct mount *mnt)
 
 static void __cleanup_mnt(struct rcu_head *head)
 {
-	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
+	cleanup_mnt(container_of(head, struct mount, mnt_rcu), false /* cleanup_sync */);
 }
 
 static LLIST_HEAD(delayed_mntput_list);
@@ -1449,11 +1459,11 @@ static void delayed_mntput(struct work_struct *unused)
 	struct mount *m, *t;
 
 	llist_for_each_entry_safe(m, t, node, mnt_llist)
-		cleanup_mnt(m);
+		cleanup_mnt(m, false /* cleanup_sync */);
 }
 static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
 
-static void mntput_no_expire(struct mount *mnt)
+static void __mntput_no_expire(struct mount *mnt, bool cleanup_sync)
 {
 	LIST_HEAD(list);
 	int count;
@@ -1507,7 +1517,7 @@ static void mntput_no_expire(struct mount *mnt)
 	unlock_mount_hash();
 	shrink_dentry_list(&list);
 
-	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
+	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL) && !cleanup_sync)) {
 		struct task_struct *task = current;
 		if (likely(!(task->flags & PF_KTHREAD))) {
 			init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
@@ -1518,7 +1528,12 @@ static void mntput_no_expire(struct mount *mnt)
 			schedule_delayed_work(&delayed_mntput_work, 1);
 		return;
 	}
-	cleanup_mnt(mnt);
+	cleanup_mnt(mnt, cleanup_sync);
+}
+
+static inline void mntput_no_expire(struct mount *mnt)
+{
+	__mntput_no_expire(mnt, false);
 }
 
 void mntput(struct vfsmount *mnt)
@@ -1789,15 +1804,37 @@ static bool need_notify_mnt_list(void)
 }
 #endif
 
-static void namespace_unlock(void)
+static void free_mounts(struct hlist_head *mount_list, bool cleanup_sync)
 {
-	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
+
+	hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		if (unlikely(m->mnt_expiry_mark))
+			WRITE_ONCE(m->mnt_expiry_mark, 0);
+		__mntput_no_expire(m, cleanup_sync);
+	}
+}
+
+static void defer_free_mounts(struct work_struct *work)
+{
+	struct deferred_free_mounts *d;
+
+	d = container_of(to_rcu_work(work), struct deferred_free_mounts, rwork);
+	free_mounts(&d->release_list, true /* cleanup_sync */);
+	kfree(d);
+}
+
+static void namespace_unlock(void)
+{
+	HLIST_HEAD(head);
 	LIST_HEAD(list);
+	bool defer = defer_unmount;
 
 	hlist_move_list(&unmounted, &head);
 	list_splice_init(&ex_mountpoints, &list);
+	defer_unmount = false;
 
 	if (need_notify_mnt_list()) {
 		/*
@@ -1817,12 +1854,19 @@ static void namespace_unlock(void)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu_expedited();
+	if (defer) {
+		struct deferred_free_mounts *d;
 
-	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
-		hlist_del(&m->mnt_umount);
-		mntput(&m->mnt);
+		d = kmalloc(sizeof(*d), GFP_KERNEL);
+		if (d) {
+			hlist_move_list(&head, &d->release_list);
+			INIT_RCU_WORK(&d->rwork, defer_free_mounts);
+			queue_rcu_work(system_unbound_wq, &d->rwork);
+			return;
+		}
 	}
+	synchronize_rcu_expedited();
+	free_mounts(&head, false /* cleanup_sync */);
 }
 
 static inline void namespace_lock(void)
@@ -2044,8 +2088,10 @@ static int do_umount(struct mount *mnt, int flags)
 
 	event++;
 	if (flags & MNT_DETACH) {
-		if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
+		if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) {
 			umount_tree(mnt, UMOUNT_PROPAGATE);
+			defer_unmount = true;
+		}
 		retval = 0;
 	} else {
 		shrink_submounts(mnt);
-- 
2.47.2

