In a prior patch series we tried to cleanly differentiate between:

(1) The task has already been reaped.
(2) The caller requested a pidfd for a thread-group leader but the pid
    actually references a struct pid that isn't used as a thread-group
    leader.

as this was causing issues for non-threaded workloads. But there are
cases where the current simple logic is wrong. Specifically, if the pid
is a thread-group leader pid, the check can race with
__unhash_process(). Stabilize this by using a sequence counter
associated with tasklist_lock and retrying while __unhash_process() is
in progress. The sequence counter might be useful independently of
pidfs.

Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
---
 include/linux/pid.h |  1 +
 kernel/exit.c       | 11 +++++++++++
 kernel/fork.c       | 22 ++++++++++++----------
 kernel/pid.c        |  1 +
 4 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 311ecebd7d56..b54a4c1ef602 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -65,6 +65,7 @@ struct pid
         struct hlist_head inodes;
         /* wait queue for pidfd notifications */
         wait_queue_head_t wait_pidfd;
+        seqcount_rwlock_t pid_seq;
         struct rcu_head rcu;
         struct upid numbers[];
 };
diff --git a/kernel/exit.c b/kernel/exit.c
index 1b51dc099f1e..8050572fe682 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -133,17 +133,28 @@ struct release_task_post {
 static void __unhash_process(struct release_task_post *post, struct task_struct *p,
                              bool group_dead)
 {
+        struct pid *pid;
+
+        lockdep_assert_held_write(&tasklist_lock);
+
         nr_threads--;
+
+        pid = task_pid(p);
+        raw_write_seqcount_begin(&pid->pid_seq);
         detach_pid(post->pids, p, PIDTYPE_PID);
         if (group_dead) {
                 detach_pid(post->pids, p, PIDTYPE_TGID);
                 detach_pid(post->pids, p, PIDTYPE_PGID);
                 detach_pid(post->pids, p, PIDTYPE_SID);
+        }
+        raw_write_seqcount_end(&pid->pid_seq);
 
+        if (group_dead) {
                 list_del_rcu(&p->tasks);
                 list_del_init(&p->sibling);
                 __this_cpu_dec(process_counts);
         }
+
         list_del_rcu(&p->thread_node);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 4a2080b968c8..1480bf6f5f38 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2109,24 +2109,26 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
 int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
 {
         int err = 0;
+        unsigned int seq;
 
-        if (!(flags & PIDFD_THREAD)) {
+        do {
+                seq = raw_seqcount_begin(&pid->pid_seq);
                 /*
                  * If this is struct pid isn't used as a thread-group
                  * leader pid but the caller requested to create a
                  * thread-group leader pidfd then report ENOENT to the
                  * caller as a hint.
                  */
-                if (!pid_has_task(pid, PIDTYPE_TGID))
+                if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
                         err = -ENOENT;
-        }
-
-        /*
-         * If this wasn't a thread-group leader struct pid or the task
-         * got reaped in the meantime report -ESRCH to userspace.
-         */
-        if (!pid_has_task(pid, PIDTYPE_PID))
-                err = -ESRCH;
+                /*
+                 * If this wasn't a thread-group leader struct pid or
+                 * the task got reaped in the meantime report -ESRCH to
+                 * userspace.
+                 */
+                if (!pid_has_task(pid, PIDTYPE_PID))
+                        err = -ESRCH;
+        } while (read_seqcount_retry(&pid->pid_seq, seq));
 
         if (err)
                 return err;
diff --git a/kernel/pid.c b/kernel/pid.c
index 4ac2ce46817f..bbca61f62faa 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -271,6 +271,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
         upid = pid->numbers + ns->level;
         idr_preload(GFP_KERNEL);
         spin_lock(&pidmap_lock);
+        seqcount_rwlock_init(&pid->pid_seq, &tasklist_lock);
         if (!(ns->pid_allocated & PIDNS_ADDING))
                 goto out_unlock;
         pidfs_add_pid(pid);
-- 
2.47.2
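Not part of the patch, just an illustration of the read-side pattern
pidfd_prepare() now relies on: sample pid->pid_seq, perform the checks,
and retry if __unhash_process() ran concurrently. The helper name
pid_stable_has_task() below is hypothetical; it only assumes the
pid_seq counter added above plus the existing pid_has_task() helper:

/*
 * Sketch only: pid_stable_has_task() is a hypothetical helper, not part
 * of this patch. It pairs with the raw_write_seqcount_begin()/end()
 * section in __unhash_process(): sample the sequence counter, do the
 * check, and retry if a writer (holding tasklist_lock for writing) was
 * detaching pids in the meantime.
 */
static bool pid_stable_has_task(struct pid *pid, enum pid_type type)
{
        unsigned int seq;
        bool has_task;

        do {
                seq = raw_seqcount_begin(&pid->pid_seq);
                has_task = pid_has_task(pid, type);
        } while (read_seqcount_retry(&pid->pid_seq, seq));

        return has_task;
}

Because pid_seq is a seqcount_rwlock_t initialized against
tasklist_lock, lockdep can check that every writer holds tasklist_lock
for writing, so readers only ever observe the counter change across a
complete detach.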