In the tcp sendmsg handler, permit a write in LISTENING state if a MSG_PRELOAD flag is used. Copy from iovec to a linear sk_buff for placement on the socket write queue. Signed-off-by: Jeremy Harris <jgh@xxxxxxxx> --- include/linux/socket.h | 1 + net/ipv4/tcp.c | 112 ++++++++++++++++++ .../perf/trace/beauty/include/linux/socket.h | 1 + tools/perf/trace/beauty/msg_flags.c | 3 + 4 files changed, 117 insertions(+) diff --git a/include/linux/socket.h b/include/linux/socket.h index 3b262487ec06..b41f4cd4dc97 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -330,6 +330,7 @@ struct ucred { #define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ +#define MSG_PRELOAD 0x10000000 /* Preload tx data while listening */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f64f8276a73c..c0a787c1649d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1057,6 +1057,115 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, return err; } +/* Cut-down version of tcp_sendmsg_locked(), for writing on a listen socket: + * fills at most one skb and returns bytes copied or an error code. */ +static int tcp_sendmsg_preload(struct sock *sk, struct msghdr *msg) +{ + struct sk_buff *skb; + int flags, err, copied = 0; + int size_goal; + int process_backlog = 0; + long timeo; + + if (sk->sk_state != TCP_LISTEN) + return -EINVAL; + + flags = msg->msg_flags; + + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + /* Ok commence sending. 
 */ +restart: + /* Use an arbitrary "mss" value as the per-skb size goal */ + size_goal = 1000; + + err = -EPIPE; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + + while (msg_data_left(msg)) { + ssize_t copy = 0; + + skb = tcp_write_queue_tail(sk); + if (skb) + copy = size_goal - skb->len; + + trace_tcp_sendmsg_locked(sk, msg, skb, size_goal); + + if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { + bool first_skb = !skb; + + /* Only one skb allowed on the sk write queue: stop if one exists */ + + if (!first_skb) + goto out_nopush; + + if (!sk_stream_memory_free(sk)) + goto wait_for_space; + + if (unlikely(process_backlog >= 16)) { + process_backlog = 0; + if (sk_flush_backlog(sk)) + goto restart; + } + + skb = tcp_stream_alloc_skb(sk, sk->sk_allocation, + first_skb); + if (!skb) + goto wait_for_space; + + process_backlog++; + +#ifdef CONFIG_SKB_DECRYPTED + skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); +#endif + tcp_skb_entail(sk, skb); + copy = size_goal; + } + + /* Try to append data to the end of skb, within its tailroom. 
 */ + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); + + copy = min_t(int, copy, skb_tailroom(skb)); + err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); + if (err) + goto do_error; + + TCP_SKB_CB(skb)->end_seq += copy; + tcp_skb_pcount_set(skb, 0); + + copied += copy; + goto out_nopush; + +wait_for_space: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + tcp_remove_empty_skb(sk); + + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) + goto do_error; + } + +out_nopush: + return copied; + +do_error: + tcp_remove_empty_skb(sk); + + if (copied) + goto out_nopush; + + err = sk_stream_error(sk, flags, err); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { + sk->sk_write_space(sk); + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); + } + + return err; +} + int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct net_devmem_dmabuf_binding *binding = NULL; @@ -1129,6 +1238,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } + if (unlikely(flags & MSG_PRELOAD)) + return tcp_sendmsg_preload(sk, msg); + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); tcp_rate_check_app_limited(sk); /* is sending application-limited? 
 */ diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index c3322eb3d686..e9ea498169f3 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -330,6 +330,7 @@ struct ucred { #define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ +#define MSG_PRELOAD 0x10000000 /* Preload tx data while listening */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through diff --git a/tools/perf/trace/beauty/msg_flags.c b/tools/perf/trace/beauty/msg_flags.c index 2da581ff0c80..27e40da9b02d 100644 --- a/tools/perf/trace/beauty/msg_flags.c +++ b/tools/perf/trace/beauty/msg_flags.c @@ -20,6 +20,9 @@ #ifndef MSG_SPLICE_PAGES #define MSG_SPLICE_PAGES 0x8000000 #endif +#ifndef MSG_PRELOAD +#define MSG_PRELOAD 0x10000000 +#endif #ifndef MSG_FASTOPEN #define MSG_FASTOPEN 0x20000000 #endif -- 2.49.0