On Tue, May 20, 2025 at 05:27:55PM -0500, Eric W. Biederman wrote: > I have only skimmed this so far, and I am a bit confused what we > are using MNT_MARK for. I would think we should be able to use > MNT_MARK instead of stealing another bit. MNT_MARK is used to avoid walking the same ancestry chain again and again - once we have done that once and removed all non-overmounts, we no longer will run into any when reaching that chain. Otherwise we can run into unpleasant situation: 1000-long chain of overmounts, with a bush on top of it. Basically, when we see something in that bush with strictly internal non-candidate, we need to take out all non-overmounts in the chain of ancestor candidates. Walking the same 1000-long chunk for each of those would be not only pointless, it would give O(N^2) worst case time complexity. Anyway, I have something I believe is a more readable approach - have an explicit MNT_UMOUNT_CANDIDATE set by gather_candidates(), use it for is_candidate(m) and instead of remove_candidate() in trim_ancestors() just clean the MNT_UMOUNT_CANDIDATE there. Then, in the beginning of trim_one() *and* handle_locked() have if (!is_candidate(m)) { remove_from_candidate_list(m); return; } Ta-da - trim_one() can be used with list_for_each_entry_safe() now, so it doesn't need to return anything. gather_candidates() doesn't need the list of already seen elements of original set - we just set MNT_UMOUNT_CANDIDATE on encountered mounts, still put the ones not in original set on candidates and and leave the original ones on the original list. Instead of dissolving linkage in 'visited' we loop through the original set and remove those MNT_UMOUNT_CANDIDATE there. What's more, that allows to kill mnt_umounting linkage - we never use both at the same time now, so 'candidates' can use mnt_list linkage instead. I've started tests right now, will update D/f/propagate_umount.txt, convert away from global lists and push if it survives the tests. In the meanwhile, the incremental I'm testing right now is this: diff --git a/fs/mount.h b/fs/mount.h index 7aecf2a60472..65275d031e61 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -84,7 +84,6 @@ struct mount { struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ struct hlist_node mnt_umount; }; - struct list_head mnt_umounting; /* list entry for umount propagation */ #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; diff --git a/fs/namespace.c b/fs/namespace.c index 34b47bd82c38..028db59e2b26 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -382,7 +382,6 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); - INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; diff --git a/fs/pnode.c b/fs/pnode.c index 9b2f1ac80f25..605bb22011e0 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -470,14 +470,12 @@ static LIST_HEAD(candidates); // undecided unmount candidates static inline struct mount *first_candidate(void) { - if (list_empty(&candidates)) - return NULL; - return list_first_entry(&candidates, struct mount, mnt_umounting); + return list_first_entry_or_null(&candidates, struct mount, mnt_list); } static inline bool is_candidate(struct mount *m) { - return !list_empty(&m->mnt_umounting); + return m->mnt.mnt_flags & MNT_UMOUNT_CANDIDATE; } static inline bool will_be_unmounted(struct mount *m) @@ -492,29 +490,26 @@ static void umount_one(struct mount *m) move_from_ns(m, &to_umount); } -static void remove_candidate(struct mount *m) +static void remove_from_candidate_list(struct mount *m) { - CLEAR_MNT_MARK(m); // unmark on removal from candidates - list_del_init(&m->mnt_umounting); + m->mnt.mnt_flags &= ~(MNT_MARKED | MNT_UMOUNT_CANDIDATE); + list_del_init(&m->mnt_list); } static void gather_candidates(struct list_head *set) { - LIST_HEAD(visited); struct mount *m, *p, *q; list_for_each_entry(m, set, mnt_list) { if (is_candidate(m)) continue; - list_add(&m->mnt_umounting, &visited); + m->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE; p = m->mnt_parent; q = propagation_next(p, p); while (q) { struct mount *child = __lookup_mnt(&q->mnt, m->mnt_mountpoint); if (child) { - struct list_head *head; - /* * We might've already run into this one. That * must've happened on earlier iteration of the @@ -526,17 +521,15 @@ static void gather_candidates(struct list_head *set) q = skip_propagation_subtree(q, p); continue; } - if (will_be_unmounted(child)) - head = &visited; - else - head = &candidates; - list_add(&child->mnt_umounting, head); + child->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE; + if (!will_be_unmounted(child)) + list_add(&child->mnt_list, &candidates); } q = propagation_next(q, p); } } - while (!list_empty(&visited)) // empty visited - list_del_init(visited.next); + list_for_each_entry(m, set, mnt_list) + m->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE; } /* @@ -553,7 +546,7 @@ static void trim_ancestors(struct mount *m) return; SET_MNT_MARK(m); if (m->mnt_mountpoint != p->mnt.mnt_root) - remove_candidate(p); + p->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE; } } @@ -563,13 +556,19 @@ static void trim_ancestors(struct mount *m) * If we can immediately tell that @m is OK to unmount (unlocked * and all children are already committed to unmounting) commit * to unmounting it. - * Returns the next candidate to be trimmed. + * Only @m itself might be taken from the candidates list; + * anything found by trim_ancestors() is marked non-candidate + * and left on the list. */ -static struct mount *trim_one(struct mount *m) +static void trim_one(struct mount *m) { bool remove_this = false, found = false, umount_this = false; struct mount *n; - struct list_head *next; + + if (!is_candidate(m)) { // trim_ancestors() left it on list + remove_from_candidate_list(m); + return; + } list_for_each_entry(n, &m->mnt_mounts, mnt_child) { if (!is_candidate(n)) { @@ -586,24 +585,23 @@ static struct mount *trim_one(struct mount *m) remove_this = true; umount_this = true; } - next = m->mnt_umounting.next; if (remove_this) { - remove_candidate(m); + remove_from_candidate_list(m); if (umount_this) umount_one(m); } - if (next == &candidates) - return NULL; - - return list_entry(next, struct mount, mnt_umounting); } static void handle_locked(struct mount *m) { struct mount *cutoff = m, *p; + if (!is_candidate(m)) { // trim_ancestors() left it on list + remove_from_candidate_list(m); + return; + } for (p = m; is_candidate(p); p = p->mnt_parent) { - remove_candidate(p); + remove_from_candidate_list(p); if (!IS_MNT_LOCKED(p)) cutoff = p->mnt_parent; } @@ -657,14 +655,14 @@ static void reparent(struct mount *m) */ void propagate_umount(struct list_head *set) { - struct mount *m; + struct mount *m, *p; // collect all candidates gather_candidates(set); // reduce the set until it's non-shifting - for (m = first_candidate(); m; m = trim_one(m)) - ; + list_for_each_entry_safe(m, p, &candidates, mnt_list) + trim_one(m); // ... and non-revealing while (!list_empty(&candidates)) diff --git a/include/linux/mount.h b/include/linux/mount.h index dcc17ce8a959..33af9c4058fa 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,10 +50,12 @@ struct path; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \ + MNT_UMOUNT_CANDIDATE) #define MNT_INTERNAL 0x4000 +#define MNT_UMOUNT_CANDIDATE 0x020000 #define MNT_LOCK_ATIME 0x040000 #define MNT_LOCK_NOEXEC 0x080000 #define MNT_LOCK_NOSUID 0x100000