Am Di., 3. Juni 2025 um 15:32 Uhr schrieb Christian Brauner <brauner@xxxxxxxxxx>: > > Extend the coredump socket to allow the coredump server to tell the > kernel how to process individual coredumps. > > When the crashing task connects to the coredump socket the kernel will > send a struct coredump_req to the coredump server. The kernel will set > the size member of struct coredump_req allowing the coredump server how > much data can be read. > > The coredump server uses MSG_PEEK to peek the size of struct > coredump_req. If the kernel uses a newer struct coredump_req the > coredump server just reads the size it knows and discard any remaining > bytes in the buffer. If the kernel uses an older struct coredump_req > the coredump server just reads the size the kernel knows. > > The returned struct coredump_req will inform the coredump server what > features the kernel supports. The coredump_req->mask member is set to > the currently know features. > > The coredump server may only use features whose bits were raised by the > kernel in coredump_req->mask. > > In response to a coredump_req from the kernel the coredump server sends > a struct coredump_ack to the kernel. The kernel informs the coredump > server what version of struct coredump_ack it supports by setting struct > coredump_req->size_ack to the size it knows about. The coredump server > may only send as many bytes as coredump_req->size_ack indicates (a > smaller size is fine of course). The coredump server must set > coredump_ack->size accordingly. > > The coredump server sets the features it wants to use in struct > coredump_ack->mask. Only bits returned in struct coredump_req->mask may > be used. > > In case an invalid struct coredump_ack is sent to the kernel an > out-of-band byte will be sent by the kernel indicating the reason why > the coredump_ack was rejected. > > The out-of-band markers allow advanced userspace to infer failure. They > are optional and can be ignored by not listening for POLLPRI events and > aren't necessary for the coredump server to function correctly. > > In the initial version the following features are supported in > coredump_{req,ack}->mask: > > * COREDUMP_KERNEL > The kernel will write the coredump data to the socket. > > * COREDUMP_USERSPACE > The kernel will not write coredump data but will indicate to the > parent that a coredump has been generated. This is used when userspace > generates its own coredumps. > > * COREDUMP_REJECT > The kernel will skip generating a coredump for this task. > > * COREDUMP_WAIT > The kernel will prevent the task from exiting until the coredump > server has shutdown the socket connection. > > The flexible coredump socket can be enabled by using the "@@" prefix > instead of the single "@" prefix for the regular coredump socket: > > @@/run/systemd/coredump.socket > > will enable flexible coredump handling. Current kernels already enforce > that "@" must be followed by "/" and will reject anything else. So > extending this is backward and forward compatible. > > Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx> Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@xxxxxxxxxxxxx> > --- > fs/coredump.c | 130 +++++++++++++++++++++++++++++++++++++++--- > include/uapi/linux/coredump.h | 104 +++++++++++++++++++++++++++++++++ > 2 files changed, 227 insertions(+), 7 deletions(-) > > diff --git a/fs/coredump.c b/fs/coredump.c > index f217ebf2b3b6..e79f37d3eefb 100644 > --- a/fs/coredump.c > +++ b/fs/coredump.c > @@ -51,6 +51,7 @@ > #include <net/sock.h> > #include <uapi/linux/pidfd.h> > #include <uapi/linux/un.h> > +#include <uapi/linux/coredump.h> > > #include <linux/uaccess.h> > #include <asm/mmu_context.h> > @@ -83,15 +84,17 @@ static int core_name_size = CORENAME_MAX_SIZE; > unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT; > > enum coredump_type_t { > - COREDUMP_FILE = 1, > - COREDUMP_PIPE = 2, > - COREDUMP_SOCK = 3, > + COREDUMP_FILE = 1, > + COREDUMP_PIPE = 2, > + COREDUMP_SOCK = 3, > + COREDUMP_SOCK_REQ = 4, > }; > > struct core_name { > char *corename; > int used, size; > enum coredump_type_t core_type; > + u64 mask; > }; > > static int expand_corename(struct core_name *cn, int size) > @@ -235,6 +238,9 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, > int pid_in_pattern = 0; > int err = 0; > > + cn->mask = COREDUMP_KERNEL; > + if (core_pipe_limit) > + cn->mask |= COREDUMP_WAIT; > cn->used = 0; > cn->corename = NULL; > if (*pat_ptr == '|') > @@ -264,6 +270,13 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, > pat_ptr++; > if (!(*pat_ptr)) > return -ENOMEM; > + if (*pat_ptr == '@') { > + pat_ptr++; > + if (!(*pat_ptr)) > + return -ENOMEM; > + > + cn->core_type = COREDUMP_SOCK_REQ; > + } > > err = cn_printf(cn, "%s", pat_ptr); > if (err) > @@ -632,6 +645,93 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) > return 0; > } > > +#ifdef CONFIG_UNIX > +static inline bool coredump_sock_recv(struct file *file, struct coredump_ack *ack, size_t size, int flags) > +{ > + struct msghdr msg = {}; > + struct kvec iov = { .iov_base = ack, .iov_len = size }; > + ssize_t ret; > + > + memset(ack, 0, size); > + ret = kernel_recvmsg(sock_from_file(file), &msg, &iov, 1, size, flags); > + return ret == size; > +} > + > +static inline bool coredump_sock_send(struct file *file, struct coredump_req *req) > +{ > + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL }; > + struct kvec iov = { .iov_base = req, .iov_len = sizeof(*req) }; > + ssize_t ret; > + > + ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(*req)); > + return ret == sizeof(*req); > +} > + > +static_assert(sizeof(enum coredump_oob) == sizeof(__u8)); > + > +static inline bool coredump_sock_oob(struct file *file, enum coredump_oob oob) > +{ > +#ifdef CONFIG_AF_UNIX_OOB > + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_OOB }; > + struct kvec iov = { .iov_base = &oob, .iov_len = sizeof(oob) }; > + > + kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(oob)); > +#endif > + coredump_report_failure("Coredump socket ack failed %u", oob); > + return false; > +} > + > +static bool coredump_request(struct core_name *cn, struct coredump_params *cprm) > +{ > + struct coredump_req req = { > + .size = sizeof(struct coredump_req), > + .mask = COREDUMP_KERNEL | COREDUMP_USERSPACE | > + COREDUMP_REJECT | COREDUMP_WAIT, > + .size_ack = sizeof(struct coredump_ack), > + }; > + struct coredump_ack ack = {}; > + ssize_t usize; > + > + if (cn->core_type != COREDUMP_SOCK_REQ) > + return true; > + > + /* Let userspace know what we support. */ > + if (!coredump_sock_send(cprm->file, &req)) > + return false; > + > + /* Peek the size of the coredump_ack. */ > + if (!coredump_sock_recv(cprm->file, &ack, sizeof(ack.size), > + MSG_PEEK | MSG_WAITALL)) > + return false; > + > + /* Refuse unknown coredump_ack sizes. */ > + usize = ack.size; > + if (usize < COREDUMP_ACK_SIZE_VER0 || usize > sizeof(ack)) > + return coredump_sock_oob(cprm->file, COREDUMP_OOB_INVALIDSIZE); > + > + /* Now retrieve the coredump_ack. */ > + if (!coredump_sock_recv(cprm->file, &ack, usize, MSG_WAITALL)) > + return false; > + if (ack.size != usize) > + return false; > + > + /* Refuse unknown coredump_ack flags. */ > + if (ack.mask & ~req.mask) > + return coredump_sock_oob(cprm->file, COREDUMP_OOB_UNSUPPORTED); > + > + /* Refuse mutually exclusive options. */ > + if (hweight64(ack.mask & (COREDUMP_USERSPACE | COREDUMP_KERNEL | > + COREDUMP_REJECT)) != 1) > + return coredump_sock_oob(cprm->file, COREDUMP_OOB_CONFLICTING); > + > + if (ack.spare) > + return coredump_sock_oob(cprm->file, COREDUMP_OOB_UNSUPPORTED); > + > + cn->mask = ack.mask; > + return true; > +} > +#endif > + > void do_coredump(const kernel_siginfo_t *siginfo) > { > struct core_state core_state; > @@ -850,6 +950,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) > } > break; > } > + case COREDUMP_SOCK_REQ: > + fallthrough; > case COREDUMP_SOCK: { > #ifdef CONFIG_UNIX > struct file *file __free(fput) = NULL; > @@ -918,6 +1020,9 @@ void do_coredump(const kernel_siginfo_t *siginfo) > > cprm.limit = RLIM_INFINITY; > cprm.file = no_free_ptr(file); > + > + if (!coredump_request(&cn, &cprm)) > + goto close_fail; > #else > coredump_report_failure("Core dump socket support %s disabled", cn.corename); > goto close_fail; > @@ -929,12 +1034,17 @@ void do_coredump(const kernel_siginfo_t *siginfo) > goto close_fail; > } > > + /* Don't even generate the coredump. */ > + if (cn.mask & COREDUMP_REJECT) > + goto close_fail; > + > /* get us an unshared descriptor table; almost always a no-op */ > /* The cell spufs coredump code reads the file descriptor tables */ > retval = unshare_files(); > if (retval) > goto close_fail; > - if (!dump_interrupted()) { > + > + if ((cn.mask & COREDUMP_KERNEL) && !dump_interrupted()) { > /* > * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would > * have this set to NULL. > @@ -968,17 +1078,23 @@ void do_coredump(const kernel_siginfo_t *siginfo) > kernel_sock_shutdown(sock_from_file(cprm.file), SHUT_WR); > #endif > > + /* Let the parent know that a coredump was generated. */ > + if (cn.mask & COREDUMP_USERSPACE) > + core_dumped = true; > + > /* > * When core_pipe_limit is set we wait for the coredump server > * or usermodehelper to finish before exiting so it can e.g., > * inspect /proc/<pid>. > */ > - if (core_pipe_limit) { > + if (cn.mask & COREDUMP_WAIT) { > switch (cn.core_type) { > case COREDUMP_PIPE: > wait_for_dump_helpers(cprm.file); > break; > #ifdef CONFIG_UNIX > + case COREDUMP_SOCK_REQ: > + fallthrough; > case COREDUMP_SOCK: { > ssize_t n; > > @@ -1249,8 +1365,8 @@ static inline bool check_coredump_socket(void) > if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns) > return false; > > - /* Must be an absolute path. */ > - if (*(core_pattern + 1) != '/') > + /* Must be an absolute path or the socket request. */ > + if (*(core_pattern + 1) != '/' && *(core_pattern + 1) != '@') > return false; > > return true; > diff --git a/include/uapi/linux/coredump.h b/include/uapi/linux/coredump.h > new file mode 100644 > index 000000000000..4fa7d1f9d062 > --- /dev/null > +++ b/include/uapi/linux/coredump.h > @@ -0,0 +1,104 @@ > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > + > +#ifndef _UAPI_LINUX_COREDUMP_H > +#define _UAPI_LINUX_COREDUMP_H > + > +#include <linux/types.h> > + > +/** > + * coredump_{req,ack} flags > + * @COREDUMP_KERNEL: kernel writes coredump > + * @COREDUMP_USERSPACE: userspace writes coredump > + * @COREDUMP_REJECT: don't generate coredump > + * @COREDUMP_WAIT: wait for coredump server > + */ > +enum { > + COREDUMP_KERNEL = (1ULL << 0), > + COREDUMP_USERSPACE = (1ULL << 1), > + COREDUMP_REJECT = (1ULL << 2), > + COREDUMP_WAIT = (1ULL << 3), > +}; > + > +/** > + * struct coredump_req - message kernel sends to userspace > + * @size: size of struct coredump_req > + * @size_ack: known size of struct coredump_ack on this kernel > + * @mask: supported features > + * > + * When a coredump happens the kernel will connect to the coredump > + * socket and send a coredump request to the coredump server. The @size > + * member is set to the size of struct coredump_req and provides a hint > + * to userspace how much data can be read. Userspace may use MSG_PEEK to > + * peek the size of struct coredump_req and then choose to consume it in > + * one go. Userspace may also simply read a COREDUMP_ACK_SIZE_VER0 > + * request. If the size the kernel sends is larger userspace simply > + * discards any remaining data. > + * > + * The coredump_req->mask member is set to the currently know features. > + * Userspace may only set coredump_ack->mask to the bits raised by the > + * kernel in coredump_req->mask. > + * > + * The coredump_req->size_ack member is set by the kernel to the size of > + * struct coredump_ack the kernel knows. Userspace may only send up to > + * coredump_req->size_ack bytes to the kernel and must set > + * coredump_ack->size accordingly. > + */ > +struct coredump_req { > + __u32 size; > + __u32 size_ack; > + __u64 mask; > +}; > + > +enum { > + COREDUMP_REQ_SIZE_VER0 = 16U, /* size of first published struct */ > +}; > + > +/** > + * struct coredump_ack - message userspace sends to kernel > + * @size: size of the struct > + * @spare: unused > + * @mask: features kernel is supposed to use > + * > + * The @size member must be set to the size of struct coredump_ack. It > + * may never exceed what the kernel returned in coredump_req->size_ack > + * but it may of course be smaller (>= COREDUMP_ACK_SIZE_VER0 and <= > + * coredump_req->size_ack). > + * > + * The @mask member must be set to the features the coredump server > + * wants the kernel to use. Only bits the kernel returned in > + * coredump_req->mask may be set. > + */ > +struct coredump_ack { > + __u32 size; > + __u32 spare; > + __u64 mask; > +}; > + > +enum { > + COREDUMP_ACK_SIZE_VER0 = 16U, /* size of first published struct */ > +}; > + > +/** > + * enum coredump_oob - Out-of-band markers for the coredump socket > + * > + * The kernel will place a single byte coredump_oob marker on the > + * coredump socket. An interested coredump server can listen for POLLPRI > + * and figure out why the provided coredump_ack was invalid. > + * > + * The out-of-band markers allow advanced userspace to infer more details > + * about a coredump ack. They are optional and can be ignored. They > + * aren't necessary for the coredump server to function correctly. > + * > + * @COREDUMP_OOB_INVALIDSIZE: the provided coredump_ack size was invalid > + * @COREDUMP_OOB_UNSUPPORTED: the provided coredump_ack mask was invalid > + * @COREDUMP_OOB_CONFLICTING: the provided coredump_ack mask has conflicting options > + * @__COREDUMP_OOB_MAX: the maximum value for coredump_oob > + */ > +enum coredump_oob { > + COREDUMP_OOB_INVALIDSIZE = 1U, > + COREDUMP_OOB_UNSUPPORTED = 2U, > + COREDUMP_OOB_CONFLICTING = 3U, > + __COREDUMP_OOB_MAX = 255U, > +} __attribute__ ((__packed__)); > + > +#endif /* _UAPI_LINUX_COREDUMP_H */ > > -- > 2.47.2 >