Hi Bert, Thanks for your report and analysis! On Sun, May 18, 2025 at 3:49 AM Bert Karwatzki <spasswolf@xxxxxx> wrote: > > Am Samstag, dem 17.05.2025 um 13:34 +0200 schrieb Bert Karwatzki: > > Am Freitag, dem 16.05.2025 um 20:19 +0200 schrieb Bert Karwatzki: > > > I've added a debugging statement: > > > > > > diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c > > > index 3bd5ee0995fe..853493eca4f5 100644 > > > --- a/net/mac80211/tx.c > > > +++ b/net/mac80211/tx.c > > > @@ -4586,7 +4586,11 @@ static noinline void ieee80211_8023_xmit_clang_debug_helper(struct sk_buff *skb, What is the caller of it? It's the function that you customized? > > > struct ieee80211_local *local, > > > struct ieee80211_tx_info *info) > > > { > > > - if (unlikely(skb->sk && sock_flag(skb->sk, SOCK_WIFI_STATUS))) { > > > + if (unlikely(skb->sk && ((skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) || > > > + sock_flag(skb->sk, SOCK_WIFI_STATUS)))) { > > > + if ((skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) ^ sock_flag(skb->sk, SOCK_WIFI_STATUS)) > > > + printk(KERN_INFO "%s: skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS = %u sock_flag(skb->sk, > > > SOCK_WIFI_STATUS) = %u\n", > > > + __func__, (skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS), sock_flag(skb->sk, > > > SOCK_WIFI_STATUS)); > > > info->status_data = ieee80211_store_ack_skb(local, skb, > > > &info->flags, NULL); > > > if (info->status_data) > > > > > > This gives the following logoutput (and a lockup), indicating that sock_flag(skb->sk, SOCK_WIFI_STATUS) and > > > (skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) are actually NOT equivalent (when compiled with clang and > > > PREEMPT_RT=y) Moving skc_flags out of the union can solve the issue, right? Simple modification looks like this: diff --git a/include/net/sock.h b/include/net/sock.h index 3e15d7105ad2..5810c7b80507 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -189,13 +189,13 @@ struct sock_common { atomic64_t skc_cookie; + unsigned long skc_flags; /* following fields are padding to force * offset(struct sock, sk_refcnt) == 128 on 64bit arches * assuming IPV6 is enabled. We use this padding differently * for different kind of 'sockets' */ union { - unsigned long skc_flags; struct sock *skc_listener; /* request_sock */ struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */ }; Can you give it a try? > > > > I've added more debugging output: > > > > diff --git a/include/net/sock.h b/include/net/sock.h > > index e223102337c7..e13560b5b7a8 100644 > > --- a/include/net/sock.h > > +++ b/include/net/sock.h > > @@ -2735,8 +2735,10 @@ static inline void _sock_tx_timestamp(struct sock *sk, > > *tskey = atomic_inc_return(&sk->sk_tskey) - 1; > > } > > } > > - if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) > > + if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) { > > + printk(KERN_INFO "%s: setting SKBTX_WIFI_STATUS for sk = %px\n", __func__, sk); > > *tx_flags |= SKBTX_WIFI_STATUS; > > + } > > } > > > > static inline void sock_tx_timestamp(struct sock *sk, > > diff --git a/net/core/sock.c b/net/core/sock.c > > index e02a78538e3e..f6589ad5ba36 100644 > > --- a/net/core/sock.c > > +++ b/net/core/sock.c > > @@ -1548,6 +1548,7 @@ int sk_setsockopt(struct sock *sk, int level, int optname, > > break; > > > > case SO_WIFI_STATUS: > > + printk(KERN_INFO "%s: setting SOCK_WIFI_STATUS to %u for sk = %px\n", __func__, valbool, sk); > > sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); > > break; > > > > diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c > > index 853493eca4f5..eee2f80949c6 100644 > > --- a/net/mac80211/tx.c > > +++ b/net/mac80211/tx.c > > @@ -4588,9 +4588,12 @@ static noinline void ieee80211_8023_xmit_clang_debug_helper(struct sk_buff *skb, > > { > > if (unlikely(skb->sk && ((skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) || > > sock_flag(skb->sk, SOCK_WIFI_STATUS)))) { > > - if ((skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) ^ sock_flag(skb->sk, SOCK_WIFI_STATUS)) > > + if ((skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) ^ sock_flag(skb->sk, SOCK_WIFI_STATUS)) { > > printk(KERN_INFO "%s: skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS = %u sock_flag(skb->sk, SOCK_WIFI_STATUS) = %u\n", > > __func__, (skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS), sock_flag(skb->sk, SOCK_WIFI_STATUS)); > > + printk(KERN_INFO "%s: skb->sk = %px skb->sk->sk_flags = 0x%lx\n", __func__, skb->sk, skb->sk->sk_flags); > > + return; // This should make this case non-fatal. > > + } > > info->status_data = ieee80211_store_ack_skb(local, skb, > > &info->flags, NULL); > > if (info->status_data) > > > > > > > > This gives after ~15min uptime > > > > [ 189.337797] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS = 0 sock_flag(skb->sk, SOCK_WIFI_STATUS) = 1 > > [ 189.337803] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb->sk = ffff8c1b798c4e00 skb->sk->sk_flags = 0xffffffffb4efe640 > > [ 191.325256] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS = 0 sock_flag(skb->sk, SOCK_WIFI_STATUS) = 1 > > [ 191.325259] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb->sk = ffff8c1b798c5a00 skb->sk->sk_flags = 0xffffffffb4efe640 > > [ 257.591831] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS = 0 sock_flag(skb->sk, SOCK_WIFI_STATUS) = 1 > > [ 257.591844] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb->sk = ffff8c1baf3bca00 skb->sk->sk_flags = 0xffffffffb4efe640 > > [ 301.786963] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS = 0 sock_flag(skb->sk, SOCK_WIFI_STATUS) = 1 > > [ 301.786967] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb->sk = ffff8c1c1bc40100 skb->sk->sk_flags = 0xffffffffb4efe640 > > [ 302.780881] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS = 0 sock_flag(skb->sk, SOCK_WIFI_STATUS) = 1 > > [ 302.780884] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb->sk = ffff8c1a44cf6000 skb->sk->sk_flags = 0xffffffffb4efe640 > > [ 482.792298] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS = 0 sock_flag(skb->sk, SOCK_WIFI_STATUS) = 1 > > [ 482.792304] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb->sk = ffff8c1da0f4de00 skb->sk->sk_flags = 0xffffffffb4efe640 > > [ 482.806144] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS = 0 sock_flag(skb->sk, SOCK_WIFI_STATUS) = 1 > > [ 482.806148] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb->sk = ffff8c1da0f4c500 skb->sk->sk_flags = 0xffffffffb4efe640 > > [ 482.817280] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS = 0 sock_flag(skb->sk, SOCK_WIFI_STATUS) = 1 > > [ 482.817284] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb->sk = ffff8c1da0f4df00 skb->sk->sk_flags = 0xffffffffb4efe640 > > [ 552.327291] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS = 0 sock_flag(skb->sk, SOCK_WIFI_STATUS) = 1 > > [ 552.327295] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb->sk = ffff8c1da0f4de00 skb->sk->sk_flags = 0xffffffffb4efe640 > > [ 916.971599] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS = 0 sock_flag(skb->sk, SOCK_WIFI_STATUS) = 1 > > [ 916.971607] [ T576] ieee80211_8023_xmit_clang_debug_helper: skb->sk = ffff8c1a62834000 skb->sk->sk_flags = 0xffffffffb4efe640 > > > > The printk()s in sk_set_sockopt() and _sock_tx_timestamp() are not called at all so the flag > > SOCK_WIFI_STATUS is actually nevers set! What is printed when printing skb->sk->sk_flags looks > > suspiciously like a pointer, and as sk_flags is actually a member of a union in struct sock_common > > it seems clang is using sk_flags for one of the other union members here > > > > struct sock_common { > > [...] > > union { > > unsigned long skc_flags; > > struct sock *skc_listener; /* request_sock */ > > struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */ > > }; > > [...] > > } > > > > Bert Karwatzki > > I added even more debugging output and found out why commit 76a853f86c97 (" wifi: free > SKBTX_WIFI_STATUS skb tx_flags flag") does not work. > > diff --git a/include/net/sock.h b/include/net/sock.h > index e13560b5b7a8..6e1291d2e5a1 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -2738,6 +2738,8 @@ static inline void _sock_tx_timestamp(struct sock *sk, > if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) { > printk(KERN_INFO "%s: setting SKBTX_WIFI_STATUS for sk = %px\n", __func__, sk); > *tx_flags |= SKBTX_WIFI_STATUS; > + } else { > + printk(KERN_INFO "%s: NOT setting SKBTX_WIFI_STATUS for sk = %px\n", __func__, sk); > } > } > > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c > index 20915895bdaa..4913b09c0617 100644 > --- a/net/ipv4/inet_connection_sock.c > +++ b/net/ipv4/inet_connection_sock.c > @@ -912,6 +912,7 @@ reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, > return NULL; > } > req->rsk_listener = sk_listener; > + printk(KERN_INFO "%s: sk_listener = %px\n", __func__, sk_listener); > } > req->rsk_ops = ops; > req_to_sk(req)->sk_prot = sk_listener->sk_prot; > @@ -986,6 +987,7 @@ static struct request_sock *inet_reqsk_clone(struct request_sock *req, > nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu; > > nreq->rsk_listener = sk; > + printk(KERN_INFO "%s: rsk_listener =%px\n", __func__, sk); > > /* We need not acquire fastopenq->lock > * because the child socket is locked in inet_csk_listen_stop(). > diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c > index 67efe9501581..1a3108ec7503 100644 > --- a/net/ipv4/inet_timewait_sock.c > +++ b/net/ipv4/inet_timewait_sock.c > @@ -190,6 +190,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, > const struct inet_sock *inet = inet_sk(sk); > > tw->tw_dr = dr; > + printk(KERN_INFO "%s: sk = %px tw_dr = %px\n", __func__, sk, dr); > /* Give us an identity. */ > tw->tw_daddr = inet->inet_daddr; > tw->tw_rcv_saddr = inet->inet_rcv_saddr; > diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c > index eee2f80949c6..227b86427e06 100644 > --- a/net/mac80211/tx.c > +++ b/net/mac80211/tx.c > @@ -4586,6 +4586,8 @@ static noinline void ieee80211_8023_xmit_clang_debug_helper(struct sk_buff *skb, > struct ieee80211_local *local, > struct ieee80211_tx_info *info) > { > + if (skb->sk) > + printk(KERN_INFO "%s: skb->sk = %px skb->sk->sk_flags = 0x%lx\n", __func__, skb->sk, skb->sk->sk_flags); > if (unlikely(skb->sk && ((skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) || > sock_flag(skb->sk, SOCK_WIFI_STATUS)))) { > if ((skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) ^ sock_flag(skb->sk, SOCK_WIFI_STATUS)) { > > > This monitor the value of skb->sk->sk_flags not only in the error case but in all cases, and also monitors > the places where the other members of the sk_flags union are set. The error occurs when at the start > of ieee80211_8023_xmit_clang_debug_helper() sk_flags is not actually the skc_flags member of the union > but insted is skc_tw_dr which is only interpreted is flags. > So why does it work with gcc but fail with clang? sock_flag(skb->sk, SOCK_WIFI_STATUS) test bit 19 of > skb->sk->sk_flags Could you say more about this? I don't follow it. Why does the gcc test just miss the crash issue? Is there anything (like call trace) different between them? My worry is that all the callers calling sock_flag might have such potential risk... Thanks, Jason > > Here are the important snippets of debug output: > > clang: > [ T575] ieee80211_8023_xmit_clang_debug_helper: skb->sk = ffff8f1bebba4300 skb->sk->sk_flags = 0xffffffffa16fe640 > > Here test_bit(0xffffffffa16fe640, SOCK_WIFI_STATUS) is 1. > > gcc: > [ T600] ieee80211_8023_xmit_clang_debug_helper: skb->sk = ffff8d3506bec700 skb->sk->sk_flags = 0xffffffff93d40100 > Here test_bit(0xffffffff93d40100, SOCK_WIFI_STATUS) is 0. > > So that this works with gcc just seems like luck. I've not yet test why it works with clang when PREEMPT_RT is not > enabled but my guess is that in that case we have a tw_dr pointer which fails the test_bit(). > > Bert Karwatzki > > > > > > >