On Sat, Jun 21, 2025 at 12:37 PM <chia-yu.chang@xxxxxxxxxxxxxxxxxxx> wrote: > > From: Ilpo Järvinen <ij@xxxxxxxxxx> > > These three byte counters track IP ECN field payload byte sums for > all arriving (acceptable) packets for ECT0, ECT1, and CE. The > AccECN option (added by a later patch in the series) echoes these > counters back to sender side; therefore, it is placed within the > group of tcp_sock_write_txrx. > > Below are the pahole outcomes before and after this patch, in which > the group size of tcp_sock_write_txrx is increased from 95 + 4 to > 107 + 4 and an extra 4-byte hole is created but will be exploited > in later patches: > > [BEFORE THIS PATCH] > struct tcp_sock { > [...] > u32 delivered_ce; /* 2640 4 */ > u32 received_ce; /* 2644 4 */ > u32 app_limited; /* 2648 4 */ > u32 rcv_wnd; /* 2652 4 */ > struct tcp_options_received rx_opt; /* 2656 24 */ > __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2680 0 */ > > [...] > /* size: 3264, cachelines: 51, members: 169 */ > } > > [AFTER THIS PATCH] > struct tcp_sock { > [...] > u32 delivered_ce; /* 2640 4 */ > u32 received_ce; /* 2644 4 */ > u32 received_ecn_bytes[3];/* 2648 12 */ > u32 app_limited; /* 2660 4 */ > u32 rcv_wnd; /* 2664 4 */ > struct tcp_options_received rx_opt; /* 2668 24 */ > __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2692 0 */ > /* XXX 4 bytes hole, try to pack */ > > [...] > /* size: 3264, cachelines: 51, members: 170 */ > } > > Signed-off-by: Ilpo Järvinen <ij@xxxxxxxxxx> > Signed-off-by: Neal Cardwell <ncardwell@xxxxxxxxxx> > Co-developed-by: Chia-Yu Chang <chia-yu.chang@xxxxxxxxxxxxxxxxxxx> > Signed-off-by: Chia-Yu Chang <chia-yu.chang@xxxxxxxxxxxxxxxxxxx> > > --- > v8: > - Add new helper function tcp_ecn_received_counters_payload() > --- > .../networking/net_cachelines/tcp_sock.rst | 1 + > include/linux/tcp.h | 4 ++++ > include/net/tcp.h | 20 +++++++++++++++++- > net/ipv4/tcp.c | 3 ++- > net/ipv4/tcp_input.c | 21 +++++++++++++++---- > net/ipv4/tcp_minisocks.c | 2 +- > 6 files changed, 44 insertions(+), 7 deletions(-) > > diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst > index 22ac668fe6c7..804480d39132 100644 > --- a/Documentation/networking/net_cachelines/tcp_sock.rst > +++ b/Documentation/networking/net_cachelines/tcp_sock.rst > @@ -102,6 +102,7 @@ u32 prr_out read_mostly read_m > u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) > u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) > u32 received_ce read_mostly read_write > +u32[3] received_ecn_bytes read_mostly read_write > u8:4 received_ce_pending read_mostly read_write > u8:2 syn_ect_snt write_mostly read_write > u8:2 syn_ect_rcv read_mostly read_write > diff --git a/include/linux/tcp.h b/include/linux/tcp.h > index 1d8301f2883c..0c2331e186e8 100644 > --- a/include/linux/tcp.h > +++ b/include/linux/tcp.h > @@ -306,6 +306,10 @@ struct tcp_sock { > u32 delivered; /* Total data packets delivered incl. rexmits */ > u32 delivered_ce; /* Like the above but only ECE marked packets */ > u32 received_ce; /* Like the above but for rcvd CE marked pkts */ > + u32 received_ecn_bytes[3]; /* received byte counters for three ECN > + * types: INET_ECN_ECT_1, INET_ECN_ECT_0, > + * and INET_ECN_CE > + */ > u32 app_limited; /* limited until "delivered" reaches this val */ > u32 rcv_wnd; /* Current receiver window */ > /* > diff --git a/include/net/tcp.h b/include/net/tcp.h > index 4d6325fa3f67..b861941a2213 100644 > --- a/include/net/tcp.h > +++ b/include/net/tcp.h > @@ -467,7 +467,10 @@ static inline int tcp_accecn_extract_syn_ect(u8 ace) > bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect); > void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, > u8 syn_ect_snt); > -void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb); > +void tcp_ecn_received_counters_payload(struct sock *sk, > + const struct sk_buff *skb); > +void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, > + u32 payload_len); > > enum tcp_tw_status { > TCP_TW_SUCCESS = 0, > @@ -1035,6 +1038,20 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) > * See draft-ietf-tcpm-accurate-ecn for the latest values. > */ > #define TCP_ACCECN_CEP_INIT_OFFSET 5 > +#define TCP_ACCECN_E1B_INIT_OFFSET 1 > +#define TCP_ACCECN_E0B_INIT_OFFSET 1 > +#define TCP_ACCECN_CEB_INIT_OFFSET 0 > + > +static inline void __tcp_accecn_init_bytes_counters(int *counter_array) > +{ > + BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); > + BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); > + BUILD_BUG_ON(INET_ECN_CE != 0x3); > + > + counter_array[INET_ECN_ECT_1 - 1] = 0; > + counter_array[INET_ECN_ECT_0 - 1] = 0; > + counter_array[INET_ECN_CE - 1] = 0; > +} > > /* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is > * attemped to be negotiated and requested for incoming connection > @@ -1053,6 +1070,7 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) > { > tp->received_ce = 0; > tp->received_ce_pending = 0; > + __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); > } > > /* State flags for sacked in struct tcp_skb_cb */ > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > index e6d7b5420c88..0e779de3ce01 100644 > --- a/net/ipv4/tcp.c > +++ b/net/ipv4/tcp.c > @@ -5122,6 +5122,7 @@ static void __init tcp_struct_check(void) > CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); > CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); > CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); > + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); > CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); > CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); > CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); > @@ -5129,7 +5130,7 @@ static void __init tcp_struct_check(void) > /* 32bit arches with 8byte alignment on u64 fields might need padding > * before tcp_clock_cache. > */ > - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); > + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 107 + 4); > > /* RX read-write hotpath cache lines */ > CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > index c986452302cb..5c0159cc0083 100644 > --- a/net/ipv4/tcp_input.c > +++ b/net/ipv4/tcp_input.c > @@ -6013,8 +6013,17 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t > } > } > > +void tcp_ecn_received_counters_payload(struct sock *sk, > + const struct sk_buff *skb) > +{ > + const struct tcphdr *th = (const struct tcphdr *)skb->data; > + > + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); > +} > + > /* Updates Accurate ECN received counters from the received IP ECN field */ > -void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) > +void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, > + u32 payload_len) > { > u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; > u8 is_ce = INET_ECN_is_ce(ecnfield); > @@ -6034,6 +6043,9 @@ void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) > tp->received_ce += pcount; > tp->received_ce_pending = min(tp->received_ce_pending + pcount, > 0xfU); > + > + if (payload_len > 0) > + tp->received_ecn_bytes[ecnfield - 1] += payload_len; > } > } > > @@ -6307,7 +6319,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) > flag |= __tcp_replace_ts_recent(tp, > delta); > > - tcp_ecn_received_counters(sk, skb); > + tcp_ecn_received_counters(sk, skb, 0); > > /* We know that such packets are checksummed > * on entry. > @@ -6353,7 +6365,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) > /* Bulk data transfer: receiver */ > tcp_cleanup_skb(skb); > __skb_pull(skb, tcp_header_len); > - tcp_ecn_received_counters(sk, skb); > + tcp_ecn_received_counters(sk, skb, > + len - tcp_header_len); > eaten = tcp_queue_rcv(sk, skb, &fragstolen); > > tcp_event_data_recv(sk, skb); > @@ -6400,7 +6413,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) > tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); > tcp_fast_path_on(tp); > } > - tcp_ecn_received_counters(sk, skb); > + tcp_ecn_received_counters_payload(sk, skb); I missed this from a prior patch, but is it expected to account bytes even if the packet is dropped ? > > reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); > if ((int)reason < 0) { > diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c > index 779a206a5ca6..f1e698c323f3 100644 > --- a/net/ipv4/tcp_minisocks.c > +++ b/net/ipv4/tcp_minisocks.c > @@ -500,7 +500,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, > tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); > tp->syn_ect_snt = treq->syn_ect_snt; > tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); > - tcp_ecn_received_counters(sk, skb); > + tcp_ecn_received_counters_payload(sk, skb); > } else { > tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? > TCP_ECN_MODE_RFC3168 : > -- > 2.34.1 >