On 06/18, Jordan Rife wrote:
> Require that iter->batch always contains a full bucket snapshot. This
> invariant is important to avoid skipping or repeating sockets during
> iteration when combined with the next few patches. Before, there were
> two cases where a call to bpf_iter_tcp_batch may only capture part of a
> bucket:
>
> 1. When bpf_iter_tcp_realloc_batch() returns -ENOMEM.
> 2. When more sockets are added to the bucket while calling
>    bpf_iter_tcp_realloc_batch(), making the updated batch size
>    insufficient.
>
> In cases where the batch size only covers part of a bucket, it is
> possible to forget which sockets were already visited, especially if we
> have to process a bucket in more than two batches. This forces us to
> choose between repeating or skipping sockets, so don't allow this:
>
> 1. Stop iteration and propagate -ENOMEM up to userspace if reallocation
>    fails instead of continuing with a partial batch.
> 2. Try bpf_iter_tcp_realloc_batch() with GFP_USER just as before, but if
>    we still aren't able to capture the full bucket, call
>    bpf_iter_tcp_realloc_batch() again while holding the bucket lock to
>    guarantee the bucket does not change. On the second attempt use
>    GFP_NOWAIT since we hold onto the spin lock.
>
> I did some manual testing to exercise the code paths where GFP_NOWAIT is
> used and where ERR_PTR(err) is returned. I used the realloc test cases
> included later in this series to trigger a scenario where a realloc
> happens inside bpf_iter_tcp_batch and made a small code tweak to force
> the first realloc attempt to allocate a too-small batch, thus requiring
> another attempt with GFP_NOWAIT. Some printks showed both reallocs with
> the tests passing:
>
> May 09 18:18:55 crow kernel: resize batch TCP_SEQ_STATE_LISTENING
> May 09 18:18:55 crow kernel: again GFP_USER
> May 09 18:18:55 crow kernel: resize batch TCP_SEQ_STATE_LISTENING
> May 09 18:18:55 crow kernel: again GFP_NOWAIT
> May 09 18:18:57 crow kernel: resize batch TCP_SEQ_STATE_ESTABLISHED
> May 09 18:18:57 crow kernel: again GFP_USER
> May 09 18:18:57 crow kernel: resize batch TCP_SEQ_STATE_ESTABLISHED
> May 09 18:18:57 crow kernel: again GFP_NOWAIT
>
> With this setup, I also forced each of the bpf_iter_tcp_realloc_batch
> calls to return -ENOMEM to ensure that iteration ends and that the
> read() in userspace fails.
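
As a quick mental model of point 2 above, here is a minimal userspace
sketch (all names and the "nowait" flag stand-in are invented; this is
not the kernel helper) of the two reallocation behaviours: a resize done
outside the bucket lock throws the partial snapshot away and rebuilds it,
while the non-blocking resize under the lock carries the partial snapshot
over so nothing is skipped or repeated.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_batch {
	int *items;
	size_t end;	/* like iter->end_sk: entries captured so far */
	size_t max;	/* like iter->max_sk: capacity */
};

static int toy_realloc_batch(struct toy_batch *b, size_t new_max, int nowait)
{
	int *nb = calloc(new_max, sizeof(*nb));

	if (!nb)
		return -ENOMEM;	/* propagated to the caller; iteration stops */

	if (nowait) {
		/* Lock held: the bucket cannot change, so keep the partial
		 * snapshot and continue filling after the resize.
		 */
		memcpy(nb, b->items, b->end * sizeof(*nb));
	} else {
		/* Resized outside the lock: the bucket may have changed,
		 * so the partial snapshot is dropped and rebuilt.
		 */
		b->end = 0;
	}
	free(b->items);
	b->items = nb;
	b->max = new_max;
	return 0;
}

int main(void)
{
	struct toy_batch b = { .items = calloc(2, sizeof(int)), .end = 2, .max = 2 };

	toy_realloc_batch(&b, 4, /*nowait=*/0);
	printf("blocking resize keeps %zu entries\n", b.end);	/* 0 */

	b.end = 2;
	toy_realloc_batch(&b, 8, /*nowait=*/1);
	printf("non-blocking resize keeps %zu entries\n", b.end);	/* 2 */

	free(b.items);
	return 0;
}

The real helper also manages socket references (the conditional
bpf_iter_tcp_put_batch in the first hunk below), which the toy leaves out.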
>
> Signed-off-by: Jordan Rife <jordan@xxxxxxxx>
> Reviewed-by: Kuniyuki Iwashima <kuniyu@xxxxxxxxxx>
> ---
>  net/ipv4/tcp_ipv4.c | 96 ++++++++++++++++++++++++++++++++-------------
>  1 file changed, 68 insertions(+), 28 deletions(-)
>
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 2e40af6aff37..69c976a07434 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -3057,7 +3057,10 @@ static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
>  	if (!new_batch)
>  		return -ENOMEM;
>
> -	bpf_iter_tcp_put_batch(iter);
> +	if (flags != GFP_NOWAIT)
> +		bpf_iter_tcp_put_batch(iter);
> +
> +	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
>  	kvfree(iter->batch);
>  	iter->batch = new_batch;
>  	iter->max_sk = new_batch_sz;
> @@ -3066,69 +3069,85 @@
>  }
>
>  static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
> -						 struct sock *start_sk)
> +						 struct sock **start_sk)
>  {
> -	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
>  	struct bpf_tcp_iter_state *iter = seq->private;
> -	struct tcp_iter_state *st = &iter->state;
>  	struct hlist_nulls_node *node;
>  	unsigned int expected = 1;
>  	struct sock *sk;
>
> -	sock_hold(start_sk);
> -	iter->batch[iter->end_sk++] = start_sk;
> +	sock_hold(*start_sk);
> +	iter->batch[iter->end_sk++] = *start_sk;
>
> -	sk = sk_nulls_next(start_sk);
> +	sk = sk_nulls_next(*start_sk);
> +	*start_sk = NULL;
>  	sk_nulls_for_each_from(sk, node) {
>  		if (seq_sk_match(seq, sk)) {
>  			if (iter->end_sk < iter->max_sk) {
>  				sock_hold(sk);
>  				iter->batch[iter->end_sk++] = sk;
> +			} else if (!*start_sk) {
> +				/* Remember where we left off. */
> +				*start_sk = sk;
>  			}
>  			expected++;
>  		}
>  	}
> -	spin_unlock(&hinfo->lhash2[st->bucket].lock);
>
>  	return expected;
>  }
>
>  static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
> -						   struct sock *start_sk)
> +						   struct sock **start_sk)
>  {
> -	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
>  	struct bpf_tcp_iter_state *iter = seq->private;
> -	struct tcp_iter_state *st = &iter->state;
>  	struct hlist_nulls_node *node;
>  	unsigned int expected = 1;
>  	struct sock *sk;
>
> -	sock_hold(start_sk);
> -	iter->batch[iter->end_sk++] = start_sk;
> +	sock_hold(*start_sk);
> +	iter->batch[iter->end_sk++] = *start_sk;
>
> -	sk = sk_nulls_next(start_sk);
> +	sk = sk_nulls_next(*start_sk);
> +	*start_sk = NULL;
>  	sk_nulls_for_each_from(sk, node) {
>  		if (seq_sk_match(seq, sk)) {
>  			if (iter->end_sk < iter->max_sk) {
>  				sock_hold(sk);
>  				iter->batch[iter->end_sk++] = sk;
> +			} else if (!*start_sk) {
> +				/* Remember where we left off. */
> +				*start_sk = sk;
>  			}
>  			expected++;
>  		}
>  	}
> -	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
>
>  	return expected;
>  }
>
> +static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
> +{
> +	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
> +	struct bpf_tcp_iter_state *iter = seq->private;
> +	struct tcp_iter_state *st = &iter->state;
> +
> +	if (st->state == TCP_SEQ_STATE_LISTENING)
> +		spin_unlock(&hinfo->lhash2[st->bucket].lock);
> +	else
> +		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
> +}
> +
>  static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
>  {
>  	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
>  	struct bpf_tcp_iter_state *iter = seq->private;
>  	struct tcp_iter_state *st = &iter->state;
> +	int prev_bucket, prev_state;
>  	unsigned int expected;
> -	bool resized = false;
> +	int resizes = 0;
>  	struct sock *sk;
> +	int err;
>
>  	/* The st->bucket is done. Directly advance to the next
>  	 * bucket instead of having the tcp_seek_last_pos() to skip
> @@ -3149,29 +3168,50 @@ static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
>  	/* Get a new batch */
>  	iter->cur_sk = 0;
>  	iter->end_sk = 0;
> -	iter->st_bucket_done = false;
> +	iter->st_bucket_done = true;
>
> +	prev_bucket = st->bucket;
> +	prev_state = st->state;
>  	sk = tcp_seek_last_pos(seq);
>  	if (!sk)
>  		return NULL; /* Done */
> +	if (st->bucket != prev_bucket || st->state != prev_state)
> +		resizes = 0;
> +	expected = 0;
>
> +fill_batch:
>  	if (st->state == TCP_SEQ_STATE_LISTENING)
> -		expected = bpf_iter_tcp_listening_batch(seq, sk);
> +		expected += bpf_iter_tcp_listening_batch(seq, &sk);
>  	else
> -		expected = bpf_iter_tcp_established_batch(seq, sk);
> +		expected += bpf_iter_tcp_established_batch(seq, &sk);
>
> -	if (iter->end_sk == expected) {
> -		iter->st_bucket_done = true;
> -		return sk;
> -	}

[..]

> +	if (unlikely(resizes <= 1 && iter->end_sk != expected)) {
> +		resizes++;
> +
> +		if (resizes == 1) {
> +			bpf_iter_tcp_unlock_bucket(seq);
>
> -	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
> -						    GFP_USER)) {
> -		resized = true;
> -		goto again;
> +			err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
> +							 GFP_USER);
> +			if (err)
> +				return ERR_PTR(err);
> +			goto again;
> +		}
> +
> +		err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
> +		if (err) {
> +			bpf_iter_tcp_unlock_bucket(seq);
> +			return ERR_PTR(err);
> +		}
> +
> +		expected = iter->end_sk;
> +		goto fill_batch;

Can we try to unroll this? Add new helpers to hide the repeating parts,
store extra state in iter if needed. AFAIU, we want the following:

1. find sk, try to fill the batch, if it fits -> bail out
2. try to allocate a new batch with GFP_USER, try to fill again -> bail out
3. otherwise, attempt GFP_NOWAIT and do that dance where you copy over
   the previous partial result

The conditional call to bpf_iter_tcp_put_batch does not look nice :-(
Same for the unconditional memcpy (which, if I understand correctly, is
only needed for the GFP_NOWAIT case). I'm 99% sure your current version
works, but it's a bit hard to follow :-(

Untested code to illustrate the idea below. Any reason it won't work?
/* fast path */
sk = tcp_seek_last_pos(seq);
if (!sk)
	return NULL;

fits = bpf_iter_tcp_fill_batch(...);
bpf_iter_tcp_unlock_bucket(iter);
if (fits)
	return sk;

/* not enough space to store full batch, try to reallocate with GFP_USER */
bpf_iter_tcp_free_batch(iter);
if (bpf_iter_tcp_alloc_batch(iter, GFP_USER)) {
	/* allocated 'expected' size, try to fill again */
	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL;

	fits = bpf_iter_tcp_fill_batch(...);
	if (fits) {
		bpf_iter_tcp_unlock_bucket(iter);
		return sk;
	}
}

/* the bucket is still locked here, sk points to the correct one,
 * we have a partial result in iter->batch
 */
old_batch = iter->batch;
if (!bpf_iter_tcp_alloc_batch(iter, GFP_NOWAIT)) {
	/* no luck, bail out */
	bpf_iter_tcp_unlock_bucket(iter);
	bpf_iter_tcp_free_batch(iter); /* or put? */
	return ERR_PTR(-ENOMEM);
}

if (old_batch) {
	/* copy partial result from the previous run if needed? */
	memcpy(iter->batch, old_batch, ...);
	kvfree(old_batch);
}

/* TODO: somehow fill the remainder */

bpf_iter_tcp_unlock_bucket(iter);
return ..;

....

bool bpf_iter_tcp_fill_batch(...)
{
	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	/* TODO: store expected into the iter for future resizing */
	/* TODO: make bpf_iter_tcp_xxx_batch store start_sk in iter */

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return true;
	}

	return false;
}

void bpf_iter_tcp_free_batch(...)
{
	bpf_iter_tcp_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = NULL;
}
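
To see the unrolled flow end to end, including one way the "TODO: somehow
fill the remainder" step could work, here is a small self-contained
userspace toy that mirrors the same shape (every name, helper, and field
below is invented for illustration only; it is not proposed kernel code):
fill, then reallocate with a blocking-style allocation and refill from
scratch, and as a last resort reallocate non-blocking, copy the partial
result over, and resume filling from the saved position.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_iter {
	int *batch;
	size_t end, max;
	size_t resume;	/* plays the role of the remembered start_sk */
};

/* Fill from it->resume; return 1 if the whole bucket now fits. */
static int toy_fill_batch(struct toy_iter *it, const int *bucket, size_t n)
{
	while (it->resume < n && it->end < it->max)
		it->batch[it->end++] = bucket[it->resume++];
	return it->resume == n;
}

static void toy_free_batch(struct toy_iter *it)
{
	free(it->batch);
	it->batch = NULL;
	it->end = it->max = it->resume = 0;
}

/* Returns 1 on success, 0 on allocation failure, mirroring the sketch. */
static int toy_alloc_batch(struct toy_iter *it, size_t max)
{
	int *nb = calloc(max, sizeof(*nb));

	if (!nb)
		return 0;
	it->batch = nb;
	it->max = max;
	return 1;
}

int main(void)
{
	int bucket[] = { 10, 20, 30, 40, 50 };
	size_t n = sizeof(bucket) / sizeof(bucket[0]);
	struct toy_iter it = { 0 };
	int *old_batch;

	/* 1. fast path: the initial batch is too small on purpose */
	if (!toy_alloc_batch(&it, 2))
		return 1;
	if (toy_fill_batch(&it, bucket, n))
		goto done;

	/* 2. "GFP_USER" retry: drop the partial result and refill from
	 *    scratch; pretend the bucket grew again so it is still short.
	 */
	toy_free_batch(&it);
	if (toy_alloc_batch(&it, n - 1) && toy_fill_batch(&it, bucket, n))
		goto done;

	/* 3. "GFP_NOWAIT" retry: keep the partial result and fill the
	 *    remainder from the saved position.
	 */
	old_batch = it.batch;
	if (!toy_alloc_batch(&it, n))
		return 1;	/* like returning ERR_PTR(-ENOMEM) */
	if (old_batch) {
		memcpy(it.batch, old_batch, it.end * sizeof(*it.batch));
		free(old_batch);
	}
	toy_fill_batch(&it, bucket, n);
done:
	for (size_t i = 0; i < it.end; i++)
		printf("%d ", it.batch[i]);
	printf("\n");
	free(it.batch);
	return 0;
}

The point the toy tries to show is that the non-blocking reallocation
preserves both the partial batch and the position to resume from, so the
final fill only has to append whatever did not fit earlier.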