On Sat, Apr 19, 2025 at 09:24:31AM +0800, Ian Kent wrote:
> 
> On 18/4/25 16:47, Christian Brauner wrote:
> > On Fri, Apr 18, 2025 at 09:20:52AM +0800, Ian Kent wrote:
> > > On 18/4/25 09:13, Ian Kent wrote:
> > > > On 18/4/25 00:28, Christian Brauner wrote:
> > > > > On Thu, Apr 17, 2025 at 05:31:26PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > > On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
> > > > > > > > So if there's some userspace process with a broken NFS server and it
> > > > > > > > does umount(MNT_DETACH) it will end up hanging every other
> > > > > > > > umount(MNT_DETACH) on the system because the delayed_mntput_work
> > > > > > > > workqueue (to my understanding) cannot make progress.
> > > > > > > Ok, "to my understanding" has been updated after going back and reading
> > > > > > > the delayed work code. Luckily it's not as bad as I thought, because it's
> > > > > > > queued on system_wq which is multi-threaded, so it's at least not causing
> > > > > > > everyone with MNT_DETACH to get stuck. I'm still skeptical about how safe
> > > > > > > this all is.
> > > > > > I would (again) throw system_unbound_wq into the game because the former
> > > > > > will remain on the CPU on which it has been enqueued (if speaking about
> > > > > > multi-threading).
> > > > > Yes, good point.
> > > > > 
> > > > > However, what about using polled grace periods?
> > > > > 
> > > > > A first simple-minded thing to do would be to record the grace period
> > > > > after umount_tree() has finished and then check it in namespace_unlock():
> > > > > 
> > > > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > > > index d9ca80dcc544..1e7ebcdd1ebc 100644
> > > > > --- a/fs/namespace.c
> > > > > +++ b/fs/namespace.c
> > > > > @@ -77,6 +77,7 @@ static struct hlist_head *mount_hashtable __ro_after_init;
> > > > >  static struct hlist_head *mountpoint_hashtable __ro_after_init;
> > > > >  static struct kmem_cache *mnt_cache __ro_after_init;
> > > > >  static DECLARE_RWSEM(namespace_sem);
> > > > > +static unsigned long rcu_unmount_seq; /* protected by namespace_sem */
> > > > >  static HLIST_HEAD(unmounted); /* protected by namespace_sem */
> > > > >  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> > > > >  static DEFINE_SEQLOCK(mnt_ns_tree_lock);
> > > > > @@ -1794,6 +1795,7 @@ static void namespace_unlock(void)
> > > > >          struct hlist_head head;
> > > > >          struct hlist_node *p;
> > > > >          struct mount *m;
> > > > > +        unsigned long unmount_seq = rcu_unmount_seq;
> > > > >          LIST_HEAD(list);
> > > > > 
> > > > >          hlist_move_list(&unmounted, &head);
> > > > > @@ -1817,7 +1819,7 @@ static void namespace_unlock(void)
> > > > >          if (likely(hlist_empty(&head)))
> > > > >                  return;
> > > > > 
> > > > > -        synchronize_rcu_expedited();
> > > > > +        cond_synchronize_rcu_expedited(unmount_seq);
> > > > > 
> > > > >          hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> > > > >                  hlist_del(&m->mnt_umount);
> > > > > @@ -1939,6 +1941,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
> > > > >                   */
> > > > >                  mnt_notify_add(p);
> > > > >          }
> > > > > +
> > > > > +        rcu_unmount_seq = get_state_synchronize_rcu();
> > > > >  }
> > > > > 
> > > > >  static void shrink_submounts(struct mount *mnt);
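
For anyone not familiar with the polled grace-period API the diff above leans
on: the point is to take a cookie early, do other work, and only pay for a
grace period at the end if one has not already elapsed on its own in the
meantime. A minimal sketch of the pattern; the RCU calls are the real API, but
the cookie variable and helper names here are made up and this is not the
fs/namespace.c code:

#include <linux/rcupdate.h>

/* Illustrative only; not actual kernel code. */
static unsigned long example_cookie;

static void record_grace_period(void)
{
        /* Snapshot the current RCU grace-period state; cheap, no waiting. */
        example_cookie = get_state_synchronize_rcu();
}

static void wait_if_still_needed(void)
{
        /*
         * Returns immediately if a full grace period has already elapsed
         * since the cookie was taken; otherwise it behaves like
         * synchronize_rcu_expedited().
         */
        cond_synchronize_rcu_expedited(example_cookie);
}

That is what recording rcu_unmount_seq in umount_tree() and calling
cond_synchronize_rcu_expedited() in namespace_unlock() amounts to: if enough
time passes between taking the cookie and reaching the sync, the expedited
synchronization becomes a no-op.
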
> > > > > I'm not sure how much that would buy us. If it doesn't then it should be
> > > > > possible to play with the following possibly strange idea:
> > > > > 
> > > > > diff --git a/fs/mount.h b/fs/mount.h
> > > > > index 7aecf2a60472..51b86300dc50 100644
> > > > > --- a/fs/mount.h
> > > > > +++ b/fs/mount.h
> > > > > @@ -61,6 +61,7 @@ struct mount {
> > > > >                  struct rb_node mnt_node; /* node in the ns->mounts rbtree */
> > > > >                  struct rcu_head mnt_rcu;
> > > > >                  struct llist_node mnt_llist;
> > > > > +                unsigned long mnt_rcu_unmount_seq;
> > > > >          };
> > > > >  #ifdef CONFIG_SMP
> > > > >          struct mnt_pcp __percpu *mnt_pcp;
> > > > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > > > index d9ca80dcc544..aae9df75beed 100644
> > > > > --- a/fs/namespace.c
> > > > > +++ b/fs/namespace.c
> > > > > @@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
> > > > >          struct hlist_head head;
> > > > >          struct hlist_node *p;
> > > > >          struct mount *m;
> > > > > +        bool needs_synchronize_rcu = false;
> > > > >          LIST_HEAD(list);
> > > > > 
> > > > >          hlist_move_list(&unmounted, &head);
> > > > > @@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
> > > > >          if (likely(hlist_empty(&head)))
> > > > >                  return;
> > > > > 
> > > > > -        synchronize_rcu_expedited();
> > > > > +        hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> > > > > +                if (!poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
> > > > > +                        continue;
> > This has a bug. This needs to be:
> > 
> > /* A grace period has already elapsed. */
> > if (poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
> >         continue;
> > 
> > /* Oh oh, we have to pay up. */
> > needs_synchronize_rcu = true;
> > break;
> > 
> > which I'm pretty sure will eradicate most of the performance gain you've
> > seen because fundamentally the two versions shouldn't be different. (Note,
> > I drafted this while on my way out the door.)
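
The inverted check is an easy mistake to make because
poll_state_synchronize_rcu() returns true once a full grace period has
already elapsed since the cookie was handed out. Spelled out with the
surrounding loop, the corrected logic amounts to roughly this (illustrative
fragment only, reusing @head, @m, @p and needs_synchronize_rcu from the diff
above):

hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
        /* Grace period already over for this mount, nothing to wait for. */
        if (poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
                continue;

        /* At least one mount still needs a grace period, pay for it once. */
        needs_synchronize_rcu = true;
        break;
}

if (needs_synchronize_rcu)
        synchronize_rcu_expedited();

i.e. the full expedited sync is only skipped when every unmounted mount has
already seen a grace period.
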
> > I would test the following version where we pay the cost of the
> > smp_mb() from poll_state_synchronize_rcu() exactly one time:
> > 
> > diff --git a/fs/mount.h b/fs/mount.h
> > index 7aecf2a60472..51b86300dc50 100644
> > --- a/fs/mount.h
> > +++ b/fs/mount.h
> > @@ -61,6 +61,7 @@ struct mount {
> >                  struct rb_node mnt_node; /* node in the ns->mounts rbtree */
> >                  struct rcu_head mnt_rcu;
> >                  struct llist_node mnt_llist;
> > +                unsigned long mnt_rcu_unmount_seq;
> >          };
> >  #ifdef CONFIG_SMP
> >          struct mnt_pcp __percpu *mnt_pcp;
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index d9ca80dcc544..dd367c54bc29 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
> >          struct hlist_head head;
> >          struct hlist_node *p;
> >          struct mount *m;
> > +        unsigned long mnt_rcu_unmount_seq = 0;
> >          LIST_HEAD(list);
> > 
> >          hlist_move_list(&unmounted, &head);
> > @@ -1817,7 +1818,10 @@ static void namespace_unlock(void)
> >          if (likely(hlist_empty(&head)))
> >                  return;
> > 
> > -        synchronize_rcu_expedited();
> > +        hlist_for_each_entry_safe(m, p, &head, mnt_umount)
> > +                mnt_rcu_unmount_seq = max(m->mnt_rcu_unmount_seq, mnt_rcu_unmount_seq);
> > +
> > +        cond_synchronize_rcu_expedited(mnt_rcu_unmount_seq);
> > 
> >          hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> >                  hlist_del(&m->mnt_umount);
> > @@ -1923,8 +1927,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
> >                  }
> >          }
> >          change_mnt_propagation(p, MS_PRIVATE);
> > -        if (disconnect)
> > +        if (disconnect) {
> > +                p->mnt_rcu_unmount_seq = get_state_synchronize_rcu();
> >                  hlist_add_head(&p->mnt_umount, &unmounted);
> > +        }
> > 
> >          /*
> >           * At this point p->mnt_ns is NULL, notification will be queued
> > 
> > If this doesn't help I had considered recording the rcu sequence number
> > during __legitimize_mnt() in the mounts. But we likely can't do that
> > because get_state_synchronize_rcu() is expensive: it inserts an smp_mb()
> > and that would likely be noticeable during path lookup. This would also
> > hinge on the notion that the last store of the rcu sequence number is
> > guaranteed to be seen when we check them in namespace_unlock().
> > 
> > Another possibly insane idea (haven't fully thought it out but throwing
> > it out there to test): allocate a percpu counter for each mount and
> > increment it each time we enter __legitimize_mnt() and decrement it when
> > we leave __legitimize_mnt(). During umount_tree() check the percpu sum
> > for each mount after it's been added to the @unmounted list.
> 
> I had been thinking that a completion in the mount with a counter (say
> walker_cnt) could be used. Because the mounts are unhashed there won't
> be new walks, so if/once the count is 0 the walker could call complete()
> and wait_for_completion() replaces the rcu sync completely. The catch is
> that managing walker_cnt correctly could be racy or expensive.
> 
> I thought this would not be received too well due to the additional fields.

Path walking absolutely has to be as fast as possible, unmounting doesn't.
Anything that writes to a shared field from e.g., __legitimize_mnt() will
cause cacheline pingpong and will very likely be noticeable. And people care
about even slight decreases in performance there. That's why we have the
percpu counter and why I didn't even consider something like the completion
stuff.
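
To make the percpu idea above a bit more concrete, a rough sketch of what the
write and read sides could look like. Everything here is hypothetical (there
is no mnt_walkers field in struct mount and these helpers don't exist); it
only illustrates where the cost would land: the walker touches a per-CPU slot,
and the cross-CPU sum is only computed on the unmount slow path once the mount
is unhashed and no new walkers can show up.

#include <linux/percpu.h>
#include <linux/cpumask.h>

/* Hypothetical: would be a new member of struct mount, e.g. int __percpu *mnt_walkers. */

static inline void walker_get(int __percpu *walkers)
{
        /* Fast path: per-CPU increment, no shared cacheline is written. */
        this_cpu_inc(*walkers);
}

static inline void walker_put(int __percpu *walkers)
{
        this_cpu_dec(*walkers);
}

static long walkers_sum(int __percpu *walkers)
{
        long sum = 0;
        int cpu;

        /* Slow path only: walk all CPUs after the mount has been unhashed. */
        for_each_possible_cpu(cpu)
                sum += *per_cpu_ptr(walkers, cpu);
        return sum;
}

Whether waiting for that sum to drop to zero could actually replace the rcu
sync is exactly the open question; the sketch only shows why the fast path
would stay cheap.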