On Thu, Apr 17, 2025 at 06:28:20PM +0200, Christian Brauner wrote:
> On Thu, Apr 17, 2025 at 05:31:26PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2025-04-17 17:28:20 [+0200], Christian Brauner wrote:
> > > > (1) The real issue is with the same process P1 doing stupid stuff that
> > > >     just happened to work. For example if there's code out there that
> > > >     does a MNT_DETACH on a filesystem that uses a loop device
> > > >     immediately followed by the desire to reuse the loop device:
> > > >
> > > >     It doesn't matter whether such code must in theory already be
> > > >     prepared to handle the case of seeing EBUSY after the MNT_DETACH.
> > > >     If this currently just happens to work because we guarantee that
> > > >     the last mntput() and cleanup_mnt() will have been done when the
> > > >     caller returns to userspace, it's a uapi break plain and simple.
> > > >
> > > >     This implicit guarantee isn't there anymore after your patch,
> > > >     because the final mntput() is done from the system_unbound_wq,
> > > >     which has the consequence that the cleanup_mnt() is done from the
> > > >     delayed_mntput_work workqueue. And that leads to problem number
> > > >     (2).

The following script runs fine on mainline, but consistently fails with
the version of this patch that came out of the discussions upstream:

#! /usr/bin/bash -eux

mntpath="/mnt"
umount -R "$mntpath" || true

fspath="/root/fs.ext4"
dd if=/dev/zero of="$fspath" bs=1M count=500
mkfs.ext4 "$fspath"

loopdev="$(losetup -f "$fspath" --show)"

for i in $(seq 0 99); do
    mkfs.ext4 -Fq "$fspath"
    mount "$loopdev" "$mntpath"
    touch "$mntpath/f1"
    umount -l "$mntpath"
done

losetup -d "$loopdev"

Failure looks like:

+ for i in $(seq 0 99)
+ mkfs.ext4 -Fq /root/fs.ext4
/root/fs.ext4 contains a ext4 file system
	created on Thu Apr 17 20:42:04 2025
+ mount /dev/loop0 /mnt
+ touch /mnt/f1
+ umount -l /mnt
+ for i in $(seq 0 99)
+ mkfs.ext4 -Fq /root/fs.ext4
/root/fs.ext4 contains a ext4 file system
	created on Thu Apr 17 20:42:04 2025
+ mount /dev/loop0 /mnt
mount: /mnt: mount(2) system call failed: Structure needs cleaning.

[    9.352478] EXT4-fs (loop0): mounted filesystem 3c5c632e-24d1-4027-b378-f51e67972883 r/w with ordered data mode. Quota mode: none.
[    9.449121] EXT4-fs (loop0): unmounting filesystem 3c5c632e-24d1-4027-b378-f51e67972883.
[    9.462093] EXT4-fs (loop0): ext4_check_descriptors: Checksum for group 32 failed (10605!=64170)
[    9.462099] EXT4-fs (loop0): group descriptors corrupted!

Seems worse than EBUSY :(.
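To make the ordering change concrete: as I understand it, the deferral has
roughly the shape sketched below. This is only a simplified illustration,
loosely modelled on the existing delayed_mntput() machinery in
fs/namespace.c; the deferred_* names are made up and this is not the code
from the patch.

/* Simplified sketch, not the actual patch: the last reference drop no
 * longer tears the mount down synchronously but punts cleanup_mnt() to
 * a workqueue, so umount2(MNT_DETACH) can return to userspace before
 * the superblock is actually shut down. */
static LLIST_HEAD(deferred_mnt_list);

static void deferred_mntput(struct work_struct *unused)
{
	struct llist_node *node = llist_del_all(&deferred_mnt_list);
	struct mount *m, *t;

	/* cleanup_mnt() -> deactivate_super() now runs here, possibly
	 * long after the syscall that dropped the last ref returned. */
	llist_for_each_entry_safe(m, t, node, mnt_llist)
		cleanup_mnt(m);
}
static DECLARE_DELAYED_WORK(deferred_mntput_work, deferred_mntput);

static void mntput_deferred(struct mount *mnt)
{
	if (llist_add(&mnt->mnt_llist, &deferred_mnt_list))
		schedule_delayed_work(&deferred_mntput_work, 1);
}

That deferred teardown is what the reproducer above races against: mkfs
rewrites the backing file while the old superblock is still being shut
down through the loop device.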
> However, what about using polled grace periods?
>
> A first simple-minded thing to do would be to record the grace period
> after umount_tree() has finished and then check it in namespace_unlock():
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d9ca80dcc544..1e7ebcdd1ebc 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -77,6 +77,7 @@ static struct hlist_head *mount_hashtable __ro_after_init;
>  static struct hlist_head *mountpoint_hashtable __ro_after_init;
>  static struct kmem_cache *mnt_cache __ro_after_init;
>  static DECLARE_RWSEM(namespace_sem);
> +static unsigned long rcu_unmount_seq; /* protected by namespace_sem */
>  static HLIST_HEAD(unmounted); /* protected by namespace_sem */
>  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
>  static DEFINE_SEQLOCK(mnt_ns_tree_lock);
> @@ -1794,6 +1795,7 @@ static void namespace_unlock(void)
>  	struct hlist_head head;
>  	struct hlist_node *p;
>  	struct mount *m;
> +	unsigned long unmount_seq = rcu_unmount_seq;
>  	LIST_HEAD(list);
> 
>  	hlist_move_list(&unmounted, &head);
> @@ -1817,7 +1819,7 @@ static void namespace_unlock(void)
>  	if (likely(hlist_empty(&head)))
>  		return;
> 
> -	synchronize_rcu_expedited();
> +	cond_synchronize_rcu_expedited(unmount_seq);
> 
>  	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
>  		hlist_del(&m->mnt_umount);
> @@ -1939,6 +1941,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
>  		 */
>  		mnt_notify_add(p);
>  	}
> +
> +	rcu_unmount_seq = get_state_synchronize_rcu();
>  }
> 
>  static void shrink_submounts(struct mount *mnt);
>
> I'm not sure how much that would buy us. If it doesn't then it should be
> possible to play with the following possibly strange idea:

This did not improve the lazy unmount, but there was no corruption
running the script. QEMU x86_64, 8 CPUs, PREEMPT_RT:

# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs /mnt' -- umount /mnt
    0.019498 +- 0.000944 seconds time elapsed  ( +-  4.84% )
# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs /mnt' -- umount -l /mnt
    0.020635 +- 0.000959 seconds time elapsed  ( +-  4.65% )

> diff --git a/fs/mount.h b/fs/mount.h
> index 7aecf2a60472..51b86300dc50 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -61,6 +61,7 @@ struct mount {
>  		struct rb_node mnt_node; /* node in the ns->mounts rbtree */
>  		struct rcu_head mnt_rcu;
>  		struct llist_node mnt_llist;
> +		unsigned long mnt_rcu_unmount_seq;
>  	};
>  #ifdef CONFIG_SMP
>  	struct mnt_pcp __percpu *mnt_pcp;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d9ca80dcc544..aae9df75beed 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1794,6 +1794,7 @@ static void namespace_unlock(void)
>  	struct hlist_head head;
>  	struct hlist_node *p;
>  	struct mount *m;
> +	bool needs_synchronize_rcu = false;
>  	LIST_HEAD(list);
> 
>  	hlist_move_list(&unmounted, &head);
> @@ -1817,7 +1818,16 @@ static void namespace_unlock(void)
>  	if (likely(hlist_empty(&head)))
>  		return;
> 
> -	synchronize_rcu_expedited();
> +	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
> +		if (!poll_state_synchronize_rcu(m->mnt_rcu_unmount_seq))
> +			continue;
> +
> +		needs_synchronize_rcu = true;
> +		break;
> +	}
> +
> +	if (needs_synchronize_rcu)
> +		synchronize_rcu_expedited();
> 
>  	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
>  		hlist_del(&m->mnt_umount);
> @@ -1923,8 +1933,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
>  			}
>  		}
>  		change_mnt_propagation(p, MS_PRIVATE);
> -		if (disconnect)
> +		if (disconnect) {
> +			p->mnt_rcu_unmount_seq = get_state_synchronize_rcu();
>  			hlist_add_head(&p->mnt_umount, &unmounted);
> +		}
> 
>  		/*
>  		 * At this point p->mnt_ns is NULL, notification will be queued
>
> This would allow eliding the synchronize_rcu calls if a grace period has
> already elapsed in the meantime, since we moved that mount to the
> unmounted list.
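As a quick aside for anyone skimming: both diffs build on the polled
grace-period API. get_state_synchronize_rcu() records a cookie when the
mount is retired, and poll_state_synchronize_rcu() later reports whether a
full grace period has already elapsed since then, so the expedited wait
can be skipped when it has. Roughly (an illustrative sketch only, not code
from either patch):

	/* When retiring the object (e.g. from umount_tree()): */
	unsigned long cookie = get_state_synchronize_rcu();

	/* Later, before tearing it down (e.g. in namespace_unlock()): */
	if (!poll_state_synchronize_rcu(cookie))
		synchronize_rcu_expedited();	/* grace period not over yet */

cond_synchronize_rcu_expedited(cookie), as used in the first diff, bundles
the same check and wait.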
Faster umount, lazy or not, and no corruption running the script.
QEMU x86_64, 8 CPUs, PREEMPT_RT:

# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs /mnt' -- umount /mnt
    0.001482 +- 0.000121 seconds time elapsed  ( +-  8.17% )
# perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs /mnt' -- umount -l /mnt
    0.002248 +- 0.000845 seconds time elapsed  ( +- 37.58% )

The crun test from v1 without your patch:
# perf stat -r 10 --null -- crun run test
    0.08166 +- 0.00476 seconds time elapsed  ( +-  5.83% )

The crun test from v1 with your patch:
# perf stat -r 10 --null -- crun run test
    0.01449 +- 0.00457 seconds time elapsed  ( +- 31.55% )

I have not run the LTP fs tests with that last patch yet, but that looks
like quite an improvement.

Best,

-- 
Eric Chanudet