Require that iter->batch always contains a full bucket snapshot. This
invariant is important to avoid skipping or repeating sockets during
iteration when combined with the next few patches. Before, there were
two cases where a call to bpf_iter_udp_batch may capture only part of a
bucket:

1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1].
2. When more sockets are added to the bucket while calling
   bpf_iter_udp_realloc_batch(), making the updated batch size
   insufficient [2].

In cases where the batch size only covers part of a bucket, it is
possible to forget which sockets were already visited, especially if we
have to process a bucket in more than two batches. This forces us to
choose between repeating or skipping sockets, so don't allow this:

1. Stop iteration and propagate -ENOMEM up to userspace if reallocation
   fails instead of continuing with a partial batch.
2. Retry bpf_iter_udp_realloc_batch() up to two times if we fail to
   capture the full bucket. On the third attempt, hold onto the bucket
   lock (hslot2->lock) through bpf_iter_udp_realloc_batch() with
   GFP_ATOMIC to guarantee that the bucket size doesn't change before
   our next attempt. Try with GFP_USER first to improve the chances
   that memory allocation succeeds; only use GFP_ATOMIC as a last
   resort.

Testing all scenarios directly is a bit difficult, but I did some
manual testing to exercise the code paths where GFP_ATOMIC is used and
where ERR_PTR(err) is returned to make sure there are no deadlocks. I
used the realloc test case included later in this series to trigger a
scenario where a realloc happens inside bpf_iter_udp_realloc_batch and
made a small code tweak to force the first two realloc attempts to
allocate a too-small buffer, thus requiring another attempt until the
GFP_ATOMIC case is hit. Some printks showed three reallocs with the
tests passing:

  Apr 16 00:08:32 crow kernel: go again (mem_flags=GFP_USER)
  Apr 16 00:08:32 crow kernel: go again (mem_flags=GFP_USER)
  Apr 16 00:08:32 crow kernel: go again (mem_flags=GFP_ATOMIC)

With this setup, I also forced bpf_iter_udp_realloc_batch to return
-ENOMEM on one of the retries to ensure that iteration ends and that
the read() in userspace fails, forced the hlist_empty condition to be
true on the GFP_ATOMIC pass to exercise the first WARN_ON_ONCE code
path, and walked back iter->end_sk on the GFP_ATOMIC pass to exercise
the second WARN_ON_ONCE code path. In each case, locks were released
and the loop terminated.
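To make the retry scheme concrete, here is a rough user-space analogue
(illustrative only, not part of this patch; snapshot_list() and struct
shared_list are made-up names). The first attempts reallocate with the
lock dropped and then recheck the required size; the final attempt
keeps the lock held so the size cannot change underneath us, which is
roughly what the GFP_USER -> GFP_ATOMIC fallback achieves in the
kernel:

#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

#define MAX_REALLOC_ATTEMPTS 3

struct shared_list {
	pthread_mutex_t lock;
	size_t count;	/* may grow whenever the lock is not held */
};

static int snapshot_list(struct shared_list *l, void ***batch, size_t *cap)
{
	int resizes = MAX_REALLOC_ATTEMPTS;
	size_t need;
	void **tmp;

	pthread_mutex_lock(&l->lock);
	while ((need = l->count) > *cap) {
		/* Out of attempts even though the last one held the lock:
		 * give up rather than take a partial snapshot.
		 */
		if (!resizes--) {
			pthread_mutex_unlock(&l->lock);
			return -EINVAL;
		}
		if (resizes) {
			/* Optimistic pass: drop the lock across the
			 * allocation, then recheck the size.
			 */
			pthread_mutex_unlock(&l->lock);
			tmp = realloc(*batch, need * sizeof(*tmp));
			pthread_mutex_lock(&l->lock);
		} else {
			/* Last resort: reallocate with the lock still held
			 * so 'count' cannot change before the next check.
			 */
			tmp = realloc(*batch, need * sizeof(*tmp));
		}
		if (!tmp) {
			/* Propagate the failure instead of going partial. */
			pthread_mutex_unlock(&l->lock);
			return -ENOMEM;
		}
		*batch = tmp;
		*cap = need;
	}
	/* ... copy 'need' entries into *batch while still holding the lock ... */
	pthread_mutex_unlock(&l->lock);
	return 0;
}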
[1]: https://lore.kernel.org/bpf/CABi4-ogUtMrH8-NVB6W8Xg_F_KDLq=yy-yu-tKr2udXE2Mu1Lg@xxxxxxxxxxxxxx/
[2]: https://lore.kernel.org/bpf/7ed28273-a716-4638-912d-f86f965e54bb@xxxxxxxxx/

Signed-off-by: Jordan Rife <jordan@xxxxxxxx>
Suggested-by: Martin KaFai Lau <martin.lau@xxxxxxxxx>
---
 net/ipv4/udp.c | 57 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 44 insertions(+), 13 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0ac31dec339a..4802d3fa37ed 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3377,6 +3377,7 @@ int udp4_seq_show(struct seq_file *seq, void *v)
 }
 
 #ifdef CONFIG_BPF_SYSCALL
+#define MAX_REALLOC_ATTEMPTS 3
 struct bpf_iter__udp {
 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
 	__bpf_md_ptr(struct udp_sock *, udp_sk);
@@ -3401,11 +3402,13 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
 	struct bpf_udp_iter_state *iter = seq->private;
 	struct udp_iter_state *state = &iter->state;
 	struct net *net = seq_file_net(seq);
+	int resizes = MAX_REALLOC_ATTEMPTS;
 	int resume_bucket, resume_offset;
 	struct udp_table *udptable;
 	unsigned int batch_sks = 0;
-	bool resized = false;
+	spinlock_t *lock = NULL;
 	struct sock *sk;
+	int err = 0;
 
 	resume_bucket = state->bucket;
 	resume_offset = iter->offset;
@@ -3433,10 +3436,13 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
 		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;
 
 		if (hlist_empty(&hslot2->head))
-			continue;
+			goto next_bucket;
 
 		iter->offset = 0;
-		spin_lock_bh(&hslot2->lock);
+		if (!lock) {
+			lock = &hslot2->lock;
+			spin_lock_bh(lock);
+		}
 		udp_portaddr_for_each_entry(sk, &hslot2->head) {
 			if (seq_sk_match(seq, sk)) {
 				/* Resume from the last iterated socket at the
@@ -3454,15 +3460,26 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
 				batch_sks++;
 			}
 		}
-		spin_unlock_bh(&hslot2->lock);
 
 		if (iter->end_sk)
 			break;
+next_bucket:
+		/* Somehow the bucket was emptied or all matching sockets were
+		 * removed while we held onto its lock. This should not happen.
+		 */
+		if (WARN_ON_ONCE(!resizes))
+			/* Best effort; reset the resize budget and move on. */
+			resizes = MAX_REALLOC_ATTEMPTS;
+		if (lock)
+			spin_unlock_bh(lock);
+		lock = NULL;
 	}
 
 	/* All done: no batch made. */
 	if (!iter->end_sk)
-		return NULL;
+		goto done;
+
+	sk = iter->batch[0];
 
 	if (iter->end_sk == batch_sks) {
 		/* Batching is done for the current bucket; return the first
@@ -3471,16 +3488,30 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
 		iter->st_bucket_done = true;
 		goto done;
 	}
-	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2,
-						    GFP_USER)) {
-		resized = true;
-		/* After allocating a larger batch, retry one more time to grab
-		 * the whole bucket.
-		 */
-		goto again;
+
+	/* Somehow the batch size still wasn't big enough even though we held
+	 * a lock on the bucket. This should not happen.
+	 */
+	if (WARN_ON_ONCE(!resizes))
+		goto done;
+
+	resizes--;
+	if (resizes) {
+		spin_unlock_bh(lock);
+		lock = NULL;
 	}
+	err = bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2,
+					 resizes ? GFP_USER : GFP_ATOMIC);
+	if (err) {
+		sk = ERR_PTR(err);
+		goto done;
+	}
+
+	goto again;
 done:
-	return iter->batch[0];
+	if (lock)
+		spin_unlock_bh(lock);
+	return sk;
 }
 
 static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-- 
2.43.0