On Thu, Apr 17, 2025 at 05:31:26PM +0200, Sebastian Andrzej Siewior wrote:
> On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
> > > So if there's some userspace process with a broken NFS server and it
> > > does umount(MNT_DETACH) it will end up hanging every other
> > > umount(MNT_DETACH) on the system because the delayed_mntput_work
> > > workqueue (to my understanding) cannot make progress.
> >
> > Ok, "to my understanding" has been updated after going back and reading
> > the delayed work code. Luckily it's not as bad as I thought because
> > it's queued on system_wq, which is multi-threaded, so it's at least not
> > causing everyone with MNT_DETACH to get stuck. I'm still skeptical
> > about how safe this all is.
>
> I would (again) throw system_unbound_wq into the game because the former
> will remain on the CPU on which it has been enqueued (if speaking about
> multi-threading).

Yes, good point. However, what about using polled grace periods?

A first simple-minded thing to do would be to record the grace period
after umount_tree() has finished and then check it in namespace_unlock():

diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..1e7ebcdd1ebc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -77,6 +77,7 @@ static struct hlist_head *mount_hashtable __ro_after_init;
 static struct hlist_head *mountpoint_hashtable __ro_after_init;
 static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
+static unsigned long rcu_unmount_seq; /* protected by namespace_sem */
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 static DEFINE_SEQLOCK(mnt_ns_tree_lock);
@@ -1794,6 +1795,7 @@ static void namespace_unlock(void)
 	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
+	unsigned long unmount_seq = rcu_unmount_seq;
 	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
@@ -1817,7 +1819,7 @@ static void namespace_unlock(void)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu_expedited();
+	cond_synchronize_rcu_expedited(unmount_seq);
 
 	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
 		hlist_del(&m->mnt_umount);
@@ -1939,6 +1941,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 		 */
 		mnt_notify_add(p);
 	}
+
+	rcu_unmount_seq = get_state_synchronize_rcu();
 }
 
 static void shrink_submounts(struct mount *mnt);

I'm not sure how much that would buy us.
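For reference, the polled grace-period API used here works like this, as
far as I understand the documented contract (a bare usage sketch, not
part of the patch):

	unsigned long seq;

	/* Snapshot the current grace-period state; this never blocks. */
	seq = get_state_synchronize_rcu();

	/*
	 * poll_state_synchronize_rcu() returns true once a full grace
	 * period has elapsed since the snapshot; it never blocks either.
	 */
	if (!poll_state_synchronize_rcu(seq))
		synchronize_rcu_expedited();

	/* cond_synchronize_rcu_expedited(seq) is shorthand for the above. */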
If it doesn't then it should be possible to play with the following,
possibly strange, idea:

diff --git a/fs/mount.h b/fs/mount.h
index 7aecf2a60472..51b86300dc50 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -61,6 +61,7 @@ struct mount {
 		struct rb_node mnt_node; /* node in the ns->mounts rbtree */
 		struct rcu_head mnt_rcu;
 		struct llist_node mnt_llist;
+		unsigned long mnt_rcu_unmount_seq;
 	};
 #ifdef CONFIG_SMP
 	struct mnt_pcp __percpu *mnt_pcp;
diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..aae9df75beed 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
 	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
+	bool needs_synchronize_rcu = false;
 	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
@@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu_expedited();
+	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
+		if (poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
+			continue;
+
+		needs_synchronize_rcu = true;
+		break;
+	}
+
+	if (needs_synchronize_rcu)
+		synchronize_rcu_expedited();
 
 	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
 		hlist_del(&m->mnt_umount);
@@ -1923,8 +1933,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 			}
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
-		if (disconnect)
+		if (disconnect) {
+			p->mnt_rcu_unmount_seq = get_state_synchronize_rcu();
 			hlist_add_head(&p->mnt_umount, &unmounted);
+		}
 
 		/*
 		 * At this point p->mnt_ns is NULL, notification will be queued

This would allow us to elide the synchronize_rcu_expedited() call if a
grace period has already elapsed for each mount since we moved it to
the unmounted list.
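If the elision rarely triggers because no grace period manages to elapse
between umount_tree() and namespace_unlock(), start_poll_synchronize_rcu()
might help: it returns the same kind of cookie as
get_state_synchronize_rcu() but also starts a grace period if none is in
progress. A variant of the umount_tree() hunk along those lines (untested,
and starting a grace period per mount is not free):

	if (disconnect) {
		/*
		 * Snapshot the grace-period state and kick off a grace
		 * period now, so it has a chance to elapse before
		 * namespace_unlock() runs.
		 */
		p->mnt_rcu_unmount_seq = start_poll_synchronize_rcu();
		hlist_add_head(&p->mnt_umount, &unmounted);
	}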