Add the reserved "linuxafsk/" prefix for AF_UNIX sockets and require CAP_NET_ADMIN in the owning user namespace of the network namespace to bind it. Binding the bare prefix itself is always refused. This will be used in the next patches to support the coredump socket but is a generally useful concept. The collision risk is so low that we can just start using it. Userspace must already be prepared to retry if a given abstract address isn't usable anyway. Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx> --- include/uapi/linux/un.h | 2 ++ net/unix/af_unix.c | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/un.h b/include/uapi/linux/un.h index 0ad59dc8b686..bbd5ad508dfa 100644 --- a/include/uapi/linux/un.h +++ b/include/uapi/linux/un.h @@ -5,6 +5,8 @@ #include <linux/socket.h> #define UNIX_PATH_MAX 108 +/* reserved AF_UNIX socket namespace. */ +#define UNIX_SOCKET_NAMESPACE "linuxafsk/" struct sockaddr_un { __kernel_sa_family_t sun_family; /* AF_UNIX */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 472f8aa9ea15..148d008862e7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -114,6 +114,13 @@ static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; +static const struct sockaddr_un linuxafsk_addr = { + .sun_family = AF_UNIX, + .sun_path = "\0"UNIX_SOCKET_NAMESPACE, +}; + +#define UNIX_SOCKET_NAMESPACE_ADDR_LEN (offsetof(struct sockaddr_un, sun_path) + sizeof(UNIX_SOCKET_NAMESPACE)) + /* SMP locking strategy: * hash table is protected with spinlock. * each socket state is protected by separate spinlock. 
@@ -436,6 +443,30 @@ static struct sock *__unix_find_socket_byname(struct net *net, return NULL; } +static int unix_may_bind_name(struct net *net, struct sockaddr_un *sunname, + int len, unsigned int hash) +{ + struct sock *s; + + s = __unix_find_socket_byname(net, sunname, len, hash); + if (s) + return -EADDRINUSE; + + /* + * Check whether this is our reserved prefix and if so ensure + * that only privileged processes can bind it. + */ + if (UNIX_SOCKET_NAMESPACE_ADDR_LEN <= len && + !memcmp(&linuxafsk_addr, sunname, UNIX_SOCKET_NAMESPACE_ADDR_LEN)) { + /* Don't bind the namespace itself. */ + if (UNIX_SOCKET_NAMESPACE_ADDR_LEN == len) + return -ECONNREFUSED; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -ECONNREFUSED; + } + return 0; +} + static inline struct sock *unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsigned int hash) @@ -1258,10 +1289,10 @@ static int unix_autobind(struct sock *sk) new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(net, old_hash, new_hash); - if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { + if (unix_may_bind_name(net, addr->name, addr->len, new_hash)) { unix_table_double_unlock(net, old_hash, new_hash); - /* __unix_find_socket_byname() may take long time if many names + /* unix_may_bind_name() may take long time if many names * are already in use. 
*/ cond_resched(); @@ -1379,7 +1410,8 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(net, old_hash, new_hash); - if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) + err = unix_may_bind_name(net, addr->name, addr->len, new_hash); + if (err) goto out_spin; __unix_set_addr_hash(net, sk, addr, new_hash); @@ -1389,7 +1421,6 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, out_spin: unix_table_double_unlock(net, old_hash, new_hash); - err = -EADDRINUSE; out_mutex: mutex_unlock(&u->bindlock); out: -- 2.47.2