Let's restore sock_create_kern() that holds a netns reference. Now, it's the same as the version before commit 26abe14379f8 ("net: Modify sk_alloc to not reference count the netns of kernel sockets."). Back then, after creating a socket in init_net, we used sk_change_net() to drop the netns ref and switch to another netns, but now we can simply use __sock_create_kern() instead. $ git blame -L:sk_change_net include/net/sock.h 26abe14379f8~ DEBUG_NET_WARN_ON_ONCE() is to catch a path calling sock_create_kern() from __net_init functions, since doing so would leak the netns as __net_exit functions cannot run until the socket is removed. Signed-off-by: Kuniyuki Iwashima <kuniyu@xxxxxxxxxx> --- v2: s/ret/err/ in sock_create_kern() for clarity --- include/linux/net.h | 2 ++ net/socket.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/linux/net.h b/include/linux/net.h index 12180e00f882..b60e3afab344 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -254,6 +254,8 @@ bool sock_is_registered(int family); int sock_create(int family, int type, int proto, struct socket **res); int __sock_create_kern(struct net *net, int family, int type, int proto, struct socket **res); +int sock_create_kern(struct net *net, int family, int type, int proto, + struct socket **res); int sock_create_lite(int family, int type, int proto, struct socket **res); struct socket *sock_alloc(void); void sock_release(struct socket *sock); diff --git a/net/socket.c b/net/socket.c index 7c4474c966c0..9ad352183fae 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1632,6 +1632,48 @@ int __sock_create_kern(struct net *net, int family, int type, int protocol, stru } EXPORT_SYMBOL(__sock_create_kern); +/** + * sock_create_kern - creates a socket for kernel space + * + * @net: net namespace + * @family: protocol family (AF_INET, ...) + * @type: communication type (SOCK_STREAM, ...) + * @protocol: protocol (0, ...) 
+ * @res: new socket + * + * Creates a new socket and assigns it to @res. + * + * The socket is for kernel space and should not be exposed to + * userspace via a file descriptor or BPF hooks except for LSM + * (see inet_create(), inet_release(), etc). + * + * The socket bypasses some LSMs that take care of @kern in + * security_socket_create() and security_socket_post_create(). + * + * The socket holds a reference count of @net so that the caller + * does not need to care about @net's lifetime. + * + * This MUST NOT be called from the __net_init path and @net MUST + * be alive when calling sock_create_kern(). + * + * Context: Process context. This function internally uses GFP_KERNEL. + * Return: 0 or an error. + */ +int sock_create_kern(struct net *net, int family, int type, int protocol, + struct socket **res) +{ + int err; + + DEBUG_NET_WARN_ON_ONCE(!net_initialized(net)); + + err = __sock_create(net, family, type, protocol, res, 1); + if (!err) + sk_net_refcnt_upgrade((*res)->sk); + + return err; +} +EXPORT_SYMBOL(sock_create_kern); + static struct socket *__sys_socket_create(int family, int type, int protocol) { struct socket *sock; -- 2.49.0