On Tue, Jul 22, 2025 at 2:58 PM Bernd Schubert <bschubert@xxxxxxx> wrote:
>
> Add per-CPU and per-NUMA node bitmasks to track which
> io-uring queues are available for new requests.
>
> - Global queue availability (avail_q_mask)
> - Per-NUMA node queue availability (per_numa_avail_q_mask)
> - Global queue registration (registered_q_mask)
> - Per-NUMA node queue registration (numa_registered_q_mask)
>
> Note that these bitmasks are not lock protected, accessing them
> will not be absolutely accurate. Goal is to determine which
> queues are aproximately idle and might be better suited for
> a request.
>
> Signed-off-by: Bernd Schubert <bschubert@xxxxxxx>
> ---
>  fs/fuse/dev_uring.c   | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/fuse/dev_uring_i.h | 18 ++++++++++
>  2 files changed, 117 insertions(+)
>
> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> index 0f5ab27dacb66c9f5f10eac2713d9bd3eb4c26da..c2bc20848bc54541ede9286562177994e7ca5879 100644
> --- a/fs/fuse/dev_uring.c
> +++ b/fs/fuse/dev_uring.c
> +static int fuse_ring_create_q_masks(struct fuse_ring *ring)
> +{
> +        if (!zalloc_cpumask_var(&ring->avail_q_mask, GFP_KERNEL_ACCOUNT))
> +                return -ENOMEM;
>
> +        if (!zalloc_cpumask_var(&ring->registered_q_mask, GFP_KERNEL_ACCOUNT))
> +                return -ENOMEM;
>
> +        ring->per_numa_avail_q_mask = kcalloc(ring->nr_numa_nodes,
> +                                              sizeof(struct cpumask *),

nit: sizeof(cpumask_var_t) since per_numa_avail_q_mask gets defined in
the struct as a cpumask_var_t?

> +                                              GFP_KERNEL_ACCOUNT);
> +        if (!ring->per_numa_avail_q_mask)
> +                return -ENOMEM;
> +        for (int node = 0; node < ring->nr_numa_nodes; node++)

nit: afaik, the general convention has the int declared at the top of
the function instead of inside the loop scope

> +                if (!zalloc_cpumask_var(&ring->per_numa_avail_q_mask[node],
> +                                        GFP_KERNEL_ACCOUNT))
> @@ -472,11 +539,18 @@ void fuse_uring_stop_queues(struct fuse_ring *ring)
>
>          for (qid = 0; qid < ring->max_nr_queues; qid++) {
>                  struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
> +                int node;
>
>                  if (!queue)
>                          continue;
>
>                  fuse_uring_teardown_entries(queue);
> +
> +                node = queue->numa_node;
> +                cpumask_clear_cpu(qid, ring->registered_q_mask);
> +                cpumask_clear_cpu(qid, ring->avail_q_mask);
> +                cpumask_clear_cpu(qid, ring->numa_registered_q_mask[node]);
> +                cpumask_clear_cpu(qid, ring->per_numa_avail_q_mask[node]);

Would it be more efficient to clear them all at once (eg with
cpumask_clear()) outside the loop instead of clearing them bit by bit
here?

>          }
>
>          if (atomic_read(&ring->queue_refs) > 0) {
> @@ -744,9 +818,18 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
>  static void fuse_uring_ent_avail(struct fuse_ring_ent *ent,
>                                   struct fuse_ring_queue *queue)
>  {
> +        struct fuse_ring *ring = queue->ring;
> +        int node = queue->numa_node;
> +
>          WARN_ON_ONCE(!ent->cmd);
>          list_move(&ent->list, &queue->ent_avail_queue);
>          ent->state = FRRS_AVAILABLE;
> +
> +        if (list_is_singular(&queue->ent_avail_queue) &&

Did you mean to include "list_is_singular()" here? I think even if
queue->ent_avail_queue has more than one entry on it, we still need to
take this branch, in case queue->nr_reqs was previously at or above
FUSE_URING_QUEUE_THRESHOLD?

> +            queue->nr_reqs <= FUSE_URING_QUEUE_THRESHOLD) {

Should this be <? Afaict, if queue->nr_reqs == FUSE_URING_QUEUE_THRESHOLD
then the queue is considered full (and no longer available)?
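
Something like the below (completely untested, just to illustrate what
I mean, and assuming FUSE_URING_QUEUE_THRESHOLD is the count at which a
queue stops being considered available) is roughly what I would have
expected here:

        /*
         * Mark the queue available whenever it is below the threshold,
         * even if ent_avail_queue already had entries on it;
         * cpumask_set_cpu() on an already-set bit is a no-op, so
         * re-setting is harmless.
         */
        if (queue->nr_reqs < FUSE_URING_QUEUE_THRESHOLD) {
                cpumask_set_cpu(queue->qid, ring->avail_q_mask);
                cpumask_set_cpu(queue->qid, ring->per_numa_avail_q_mask[node]);
        }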
> +                cpumask_set_cpu(queue->qid, ring->avail_q_mask);
> +                cpumask_set_cpu(queue->qid, ring->per_numa_avail_q_mask[node]);
> +        }
>  }
>
>  /* Used to find the request on SQE commit */
> @@ -769,6 +852,8 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
>                                             struct fuse_req *req)
>  {
>          struct fuse_ring_queue *queue = ent->queue;
> +        struct fuse_ring *ring = queue->ring;
> +        int node = queue->numa_node;
>
>          lockdep_assert_held(&queue->lock);
>
> @@ -783,6 +868,16 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
>          ent->state = FRRS_FUSE_REQ;
>          list_move_tail(&ent->list, &queue->ent_w_req_queue);
>          fuse_uring_add_to_pq(ent, req);
> +
> +        /*
> +         * If there are no more available entries, mark the queue as unavailable
> +         * in both global and per-NUMA node masks
> +         */
> +        if (list_empty(&queue->ent_avail_queue)) {
> +                cpumask_clear_cpu(queue->qid, ring->avail_q_mask);
> +                cpumask_clear_cpu(queue->qid,
> +                                  ring->per_numa_avail_q_mask[node]);
> +        }
>  }
>
>  /* Fetch the next fuse request if available */
> @@ -982,6 +1077,7 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent,
>          struct fuse_ring *ring = queue->ring;
>          struct fuse_conn *fc = ring->fc;
>          struct fuse_iqueue *fiq = &fc->iq;
> +        int node = queue->numa_node;
>
>          fuse_uring_prepare_cancel(cmd, issue_flags, ent);
>
> @@ -990,6 +1086,9 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent,
>          fuse_uring_ent_avail(ent, queue);
>          spin_unlock(&queue->lock);
>
> +        cpumask_set_cpu(queue->qid, ring->registered_q_mask);
> +        cpumask_set_cpu(queue->qid, ring->numa_registered_q_mask[node]);
> +
>          if (!ring->ready) {
>                  bool ready = is_ring_ready(ring, queue->qid);
>
> diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
> index 708412294982566919122a1a0d7f741217c763ce..0457dbc6737c8876dd7a7d4c9c724da05e553e6a 100644
> --- a/fs/fuse/dev_uring_i.h
> +++ b/fs/fuse/dev_uring_i.h
> @@ -125,6 +131,18 @@ struct fuse_ring {
>           */
>          unsigned int stop_debug_log : 1;
>
> +        /* Tracks which queues are available (empty) globally */

nit: is "(empty)" accurate here? afaict, the queue is available as long
as it has <= FUSE_URING_QUEUE_THRESHOLD requests (eg even if it's not
empty)?

> +        cpumask_var_t avail_q_mask;

Should avail_q_mask also get updated accordingly when requests are
queued (eg fuse_uring_queue_fuse_req(), fuse_uring_queue_bq_req()) or
completed by userspace (eg fuse_uring_req_end()), when queue->nr_reqs
crosses FUSE_URING_QUEUE_THRESHOLD?

Thanks,
Joanne

> +
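
(Rough, untested sketch of the queueing-side update I was asking about
above, reusing the masks/names from this patch and assuming
queue->nr_reqs has already been incremented under queue->lock when the
request is queued:)

        /* eg in fuse_uring_queue_fuse_req() / fuse_uring_queue_bq_req():
         * once the queue reaches the threshold, drop it from the
         * global and per-NUMA availability masks.
         */
        if (queue->nr_reqs >= FUSE_URING_QUEUE_THRESHOLD) {
                cpumask_clear_cpu(queue->qid, ring->avail_q_mask);
                cpumask_clear_cpu(queue->qid,
                                  ring->per_numa_avail_q_mask[queue->numa_node]);
        }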