On Thu, Apr 17, 2025 at 05:31:26PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
> > > So if there's some userspace process with a broken NFS server and it
> > > does umount(MNT_DETACH) it will end up hanging every other
> > > umount(MNT_DETACH) on the system because the delayed_mntput_work
> > > workqueue (to my understanding) cannot make progress.
> >
> > Ok, "to my understanding" has been updated after going back and reading
> > the delayed work code. Luckily it's not as bad as I thought because
> > it's queued on system_wq, which is multi-threaded, so it's at least not
> > causing everyone with MNT_DETACH to get stuck. I'm still skeptical
> > about how safe this all is.
>
> I would (again) throw system_unbound_wq into the game because the former
> will remain on the CPU on which it has been enqueued (if speaking about
> multi-threading).

Yes, good point. However, what about using polled grace periods?

A first simple-minded thing to do would be to record the grace period
after umount_tree() has finished and then check it in namespace_unlock():

diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..1e7ebcdd1ebc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -77,6 +77,7 @@ static struct hlist_head *mount_hashtable __ro_after_init;
 static struct hlist_head *mountpoint_hashtable __ro_after_init;
 static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
+static unsigned long rcu_unmount_seq; /* protected by namespace_sem */
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 static DEFINE_SEQLOCK(mnt_ns_tree_lock);
@@ -1794,6 +1795,7 @@ static void namespace_unlock(void)
 	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
+	unsigned long unmount_seq = rcu_unmount_seq;
 	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
@@ -1817,7 +1819,7 @@ static void namespace_unlock(void)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu_expedited();
+	cond_synchronize_rcu_expedited(unmount_seq);
 
 	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
 		hlist_del(&m->mnt_umount);
@@ -1939,6 +1941,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 		 */
 		mnt_notify_add(p);
 	}
+
+	rcu_unmount_seq = get_state_synchronize_rcu();
 }
 
 static void shrink_submounts(struct mount *mnt);

I'm not sure how much that would buy us.
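For reference, the polled grace-period API used here works like this, as
far as I understand the documented contract (a bare usage sketch, not
part of the patch):

	unsigned long seq;

	/* Snapshot the current grace-period state; this never blocks. */
	seq = get_state_synchronize_rcu();

	/*
	 * poll_state_synchronize_rcu() returns true once a full grace
	 * period has elapsed since the snapshot; it never blocks either.
	 */
	if (!poll_state_synchronize_rcu(seq))
		synchronize_rcu_expedited();

	/* cond_synchronize_rcu_expedited(seq) is shorthand for the above. */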
If it doesn't then it should be possible to play with the following,
possibly strange, idea:

diff --git a/fs/mount.h b/fs/mount.h
index 7aecf2a60472..51b86300dc50 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -61,6 +61,7 @@ struct mount {
 		struct rb_node mnt_node; /* node in the ns->mounts rbtree */
 		struct rcu_head mnt_rcu;
 		struct llist_node mnt_llist;
+		unsigned long mnt_rcu_unmount_seq;
 	};
 #ifdef CONFIG_SMP
 	struct mnt_pcp __percpu *mnt_pcp;
diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..aae9df75beed 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
 	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
+	bool needs_synchronize_rcu = false;
 	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
@@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu_expedited();
+	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
+		if (poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
+			continue;
+
+		needs_synchronize_rcu = true;
+		break;
+	}
+
+	if (needs_synchronize_rcu)
+		synchronize_rcu_expedited();
 
 	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
 		hlist_del(&m->mnt_umount);
@@ -1923,8 +1933,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 			}
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
-		if (disconnect)
+		if (disconnect) {
+			p->mnt_rcu_unmount_seq = get_state_synchronize_rcu();
 			hlist_add_head(&p->mnt_umount, &unmounted);
+		}
 
 		/*
 		 * At this point p->mnt_ns is NULL, notification will be queued

This would allow us to elide the synchronize_rcu_expedited() call if a
grace period has already elapsed for each mount since we moved it to
the unmounted list.
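If the elision rarely triggers because no grace period manages to elapse
between umount_tree() and namespace_unlock(), start_poll_synchronize_rcu()
might help: it returns the same kind of cookie as
get_state_synchronize_rcu() but also starts a grace period if none is in
progress. A variant of the umount_tree() hunk along those lines (untested,
and starting a grace period per mount is not free):

	if (disconnect) {
		/*
		 * Snapshot the grace-period state and kick off a grace
		 * period now, so it has a chance to elapse before
		 * namespace_unlock() runs.
		 */
		p->mnt_rcu_unmount_seq = start_poll_synchronize_rcu();
		hlist_add_head(&p->mnt_umount, &unmounted);
	}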