On Fri, Apr 18, 2025 at 09:59:23PM +0200, Christian Brauner wrote:
> On Fri, Apr 18, 2025 at 10:47:10AM +0200, Christian Brauner wrote:
> > On Fri, Apr 18, 2025 at 09:20:52AM +0800, Ian Kent wrote:
> > > > On 18/4/25 00:28, Christian Brauner wrote:
> > > > > I'm not sure how much that would buy us. If it doesn't then it should be
> > > > > possible to play with the following possibly strange idea:
> > > > >
> > > > > diff --git a/fs/mount.h b/fs/mount.h
> > > > > index 7aecf2a60472..51b86300dc50 100644
> > > > > --- a/fs/mount.h
> > > > > +++ b/fs/mount.h
> > > > > @@ -61,6 +61,7 @@ struct mount {
> > > > >  		struct rb_node mnt_node; /* node in the ns->mounts rbtree */
> > > > >  		struct rcu_head mnt_rcu;
> > > > >  		struct llist_node mnt_llist;
> > > > > +		unsigned long mnt_rcu_unmount_seq;
> > > > >  	};
> > > > >  #ifdef CONFIG_SMP
> > > > >  	struct mnt_pcp __percpu *mnt_pcp;
> > > > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > > > index d9ca80dcc544..aae9df75beed 100644
> > > > > --- a/fs/namespace.c
> > > > > +++ b/fs/namespace.c
> > > > > @@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
> > > > >  	struct hlist_head head;
> > > > >  	struct hlist_node *p;
> > > > >  	struct mount *m;
> > > > > +	bool needs_synchronize_rcu = false;
> > > > >  	LIST_HEAD(list);
> > > > >
> > > > >  	hlist_move_list(&unmounted, &head);
> > > > > @@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
> > > > >  	if (likely(hlist_empty(&head)))
> > > > >  		return;
> > > > >
> > > > > -	synchronize_rcu_expedited();
> > > > > +	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> > > > > +		if (!poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
> > > > > +			continue;
> >
> > This has a bug. This needs to be:
> >
> > 	/* A grace period has already elapsed. */
> > 	if (poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
> > 		continue;
> >
> > 	/* Oh oh, we have to pay up. */
> > 	needs_synchronize_rcu = true;
> > 	break;
> >
> > which I'm pretty sure will eradicate most of the performance gain you've
> > seen because fundamentally the two versions shouldn't be different (Note,
> > I drafted this while on my way out the door.

I failed to notice... Once corrected, the numbers are back to those of
mainline, same as with the other version.
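For readers skimming the thread, here is a condensed, untested sketch of the
corrected check spelled out above: the expedited synchronization is only
skipped when every mount on the unmounted list has already had a grace
period elapse since its cookie was recorded. The helper name
all_grace_periods_elapsed() is made up for illustration; the
mnt_rcu_unmount_seq field and the RCU calls are the ones used in the quoted
diffs.

/* Illustration only, not the actual patch. */
static bool all_grace_periods_elapsed(struct hlist_head *head)
{
	struct mount *m;

	hlist_for_each_entry(m, head, mnt_umount) {
		/* A grace period has already elapsed for this mount. */
		if (poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
			continue;
		/* At least one mount still has to pay for a grace period. */
		return false;
	}
	return true;
}

In namespace_unlock(), the unconditional synchronize_rcu_expedited() would
then become:

	if (!all_grace_periods_elapsed(&head))
		synchronize_rcu_expedited();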
> > I would test the following version where we pay the cost of the
> > smp_mb() from poll_state_synchronize_rcu() exactly one time:
> >
> > diff --git a/fs/mount.h b/fs/mount.h
> > index 7aecf2a60472..51b86300dc50 100644
> > --- a/fs/mount.h
> > +++ b/fs/mount.h
> > @@ -61,6 +61,7 @@ struct mount {
> >  		struct rb_node mnt_node; /* node in the ns->mounts rbtree */
> >  		struct rcu_head mnt_rcu;
> >  		struct llist_node mnt_llist;
> > +		unsigned long mnt_rcu_unmount_seq;
> >  	};
> >  #ifdef CONFIG_SMP
> >  	struct mnt_pcp __percpu *mnt_pcp;
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index d9ca80dcc544..dd367c54bc29 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
> >  	struct hlist_head head;
> >  	struct hlist_node *p;
> >  	struct mount *m;
> > +	unsigned long mnt_rcu_unmount_seq = 0;
> >  	LIST_HEAD(list);
> >
> >  	hlist_move_list(&unmounted, &head);
> > @@ -1817,7 +1818,10 @@ static void namespace_unlock(void)
> >  	if (likely(hlist_empty(&head)))
> >  		return;
> >
> > -	synchronize_rcu_expedited();
> > +	hlist_for_each_entry_safe(m, p, &head, mnt_umount)
> > +		mnt_rcu_unmount_seq = max(m->mnt_rcu_unmount_seq, mnt_rcu_unmount_seq);
> > +
> > +	cond_synchronize_rcu_expedited(mnt_rcu_unmount_seq);
> >
> >  	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> >  		hlist_del(&m->mnt_umount);
> > @@ -1923,8 +1927,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
> >  			}
> >  		}
> >  		change_mnt_propagation(p, MS_PRIVATE);
> > -		if (disconnect)
> > +		if (disconnect) {
> > +			p->mnt_rcu_unmount_seq = get_state_synchronize_rcu();
> >  			hlist_add_head(&p->mnt_umount, &unmounted);
> > +		}
> >
> >  		/*
> >  		 * At this point p->mnt_ns is NULL, notification will be queued

That did not show improved performance. ftrace confirms the time is spent
in synchronize_rcu as before, e.g.:

 5)               |  namespace_unlock() {
 5)               |    cond_synchronize_rcu_expedited() {
 5)               |      synchronize_rcu_expedited() {
 5)   0.088 us    |        rcu_gp_is_normal();
 5)               |        synchronize_rcu_normal() {
 5) * 15797.30 us |        }
 5) * 15797.70 us |      }
 5) * 15797.94 us |    }
 5) * 15856.13 us |  }

(rcu_expedited=1 was mentioned upthread IIRC; PREEMPT_RT discards it, but
the trace above was taken with it set:
    # cat /sys/kernel/rcu_expedited
    1
)

0001-UNTESTED-fs-namespace-defer-RCU-sync-for-MNT_DETACH-.patch yields the
expected performance improvement, although I'm still seeing the corruption
reported earlier, which does not happen on mainline.

> I'm appending a patch that improves on the first version of this patch.
> Instead of simply sampling the current rcu state and hoping that the rcu
> grace period has elapsed by the time we get to put the mounts we sample
> the rcu state and kick off a new grace period at the end of
> umount_tree(). That could even get us some performance improvement on
> non-RT kernels. I have no clue how well this will fare on RT though.

0001-UNTESTED-mount-sample-and-kick-of-grace-period-durin.patch does not
show the expected performance improvement:

# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs /mnt' -- umount /mnt
    0.022602 +- 0.000757 seconds time elapsed ( +- 3.35% )

# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs /mnt' -- umount -l /mnt
    0.02145 +- 0.00130 seconds time elapsed ( +- 6.05% )

Best,

-- 
Eric Chanudet
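P.S. For anyone reading the trace above without the polled grace-period API
in mind, the fallback it shows boils down to roughly the following (a
paraphrase of the kernel's cond_synchronize_rcu_expedited(), not code added
by any of the patches in this thread):

void cond_synchronize_rcu_expedited(unsigned long oldstate)
{
	/*
	 * Skip the wait only if the grace period recorded in oldstate
	 * has already elapsed...
	 */
	if (!poll_state_synchronize_rcu(oldstate))
		/* ...otherwise pay for a full expedited synchronization. */
		synchronize_rcu_expedited();
}

With forced-normal grace periods in effect (as on PREEMPT_RT, per the note
above about rcu_expedited being discarded), that expedited path is further
turned into a normal grace period, which is what the
rcu_gp_is_normal()/synchronize_rcu_normal() calls in the trace show and is
consistent with the grace period sampled in umount_tree() not having
elapsed by the time namespace_unlock() runs.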