On Mon, Jun 9, 2025 at 6:22 PM Ming Lei <ming.lei@xxxxxxxxxx> wrote:
>
> On Mon, Jun 09, 2025 at 10:14:21AM -0700, Caleb Sander Mateos wrote:
> > On Mon, Jun 9, 2025 at 2:02 AM Ming Lei <ming.lei@xxxxxxxxxx> wrote:
> > >
> > > On Fri, Jun 06, 2025 at 03:40:10PM -0600, Caleb Sander Mateos wrote:
> > > > ublk_register_io_buf() performs an expensive atomic refcount increment,
> > > > as well as a lot of pointer chasing to look up the struct request.
> > > >
> > > > Create a separate ublk_daemon_register_io_buf() for the daemon task to
> > > > call. Initialize ublk_rq_data's reference count to a large number, count
> > > > the number of buffers registered on the daemon task nonatomically, and
> > > > atomically subtract the large number minus the number of registered
> > > > buffers in ublk_commit_and_fetch().
> > > >
> > > > Also obtain the struct request directly from ublk_io's req field instead
> > > > of looking it up on the tagset.
> > > >
> > > > Signed-off-by: Caleb Sander Mateos <csander@xxxxxxxxxxxxxxx>
> > > > ---
> > > >  drivers/block/ublk_drv.c | 59 ++++++++++++++++++++++++++++++++++------
> > > >  1 file changed, 50 insertions(+), 9 deletions(-)
> > > >
> > > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > > > index 2084bbdd2cbb..ec9e0fd21b0e 100644
> > > > --- a/drivers/block/ublk_drv.c
> > > > +++ b/drivers/block/ublk_drv.c
> > > > @@ -81,12 +81,20 @@
> > > >  #define UBLK_PARAM_TYPE_ALL                                \
> > > >  	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> > > >  	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
> > > >  	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
> > > >
> > > > +/*
> > > > + * Initialize refcount to a large number to include any registered buffers.
> > > > + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
> > > > + * any buffers registered on the io daemon task.
> > > > + */
> > > > +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
> > > > +
> > > >  struct ublk_rq_data {
> > > >  	refcount_t ref;
> > > > +	unsigned buffers_registered;
> > > >
> > > >  	/* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
> > > >  	u16 buf_index;
> > > >  	void *buf_ctx_handle;
> > > >  };
> > > > @@ -677,11 +685,12 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
> > > >  		struct request *req)
> > > >  {
> > > >  	if (ublk_need_req_ref(ubq)) {
> > > >  		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > >
> > > > -		refcount_set(&data->ref, 1);
> > > > +		refcount_set(&data->ref, UBLK_REFCOUNT_INIT);
> > > > +		data->buffers_registered = 0;
> > > >  	}
> > > >  }
> > > >
> > > >  static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
> > > >  		struct request *req)
> > > > @@ -706,10 +715,19 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
> > > >  	} else {
> > > >  		__ublk_complete_rq(req);
> > > >  	}
> > > >  }
> > > >
> > > > +static inline void ublk_sub_req_ref(struct request *req)
> > > > +{
> > > > +	struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > > +	unsigned sub_refs = UBLK_REFCOUNT_INIT - data->buffers_registered;
> > > > +
> > > > +	if (refcount_sub_and_test(sub_refs, &data->ref))
> > > > +		__ublk_complete_rq(req);
> > > > +}
> > > > +
> > > >  static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
> > > >  {
> > > >  	return ubq->flags & UBLK_F_NEED_GET_DATA;
> > > >  }
> > > >
> > > > @@ -1184,14 +1202,12 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
> > > >
> > > >  static void ublk_auto_buf_reg_fallback(struct request *req)
> > > >  {
> > > >  	const struct ublk_queue *ubq = req->mq_hctx->driver_data;
> > > >  	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> > > > -	struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > >
> > > >  	iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
> > > > -	refcount_set(&data->ref, 1);
> > > >  }
> > > >
> > > >  static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > >  		unsigned int issue_flags)
> > > >  {
> > > > @@ -1207,13 +1223,12 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > >  			return true;
> > > >  		}
> > > >  		blk_mq_end_request(req, BLK_STS_IOERR);
> > > >  		return false;
> > > >  	}
> > > > -	/* one extra reference is dropped by ublk_io_release */
> > > > -	refcount_set(&data->ref, 2);
> > > >
> > > > +	data->buffers_registered = 1;
> > > >  	data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
> > > >  	/* store buffer index in request payload */
> > > >  	data->buf_index = pdu->buf.index;
> > > >  	io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
> > > >  	return true;
> > > > @@ -1221,14 +1236,14 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> > > >
> > > >  static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
> > > >  		struct request *req, struct ublk_io *io,
> > > >  		unsigned int issue_flags)
> > > >  {
> > > > +	ublk_init_req_ref(ubq, req);
> > > >  	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
> > > >  		return ublk_auto_buf_reg(req, io, issue_flags);
> > > >
> > > > -	ublk_init_req_ref(ubq, req);
> > > >  	return true;
> > > >  }
> > > >
> > > >  static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
> > > >  		struct ublk_io *io)
> > > > @@ -2019,10 +2034,31 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> > > >  	}
> > > >
> > > >  	return 0;
> > > >  }
> > > >
> > > > +static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
> > > > +		const struct ublk_queue *ubq,
> > > > +		const struct ublk_io *io,
> > > > +		unsigned index, unsigned issue_flags)
> > > > +{
> > > > +	struct request *req = io->req;
> > > > +	struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > > > +	int ret;
> > > > +
> > > > +	if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
> > > > +		return -EINVAL;
> > > > +
> > > > +	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
> > > > +			issue_flags);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > +	data->buffers_registered++;
> > > This optimization replaces one ublk_get_req_ref()/refcount_inc_not_zero()
> > > with data->buffers_registered++ in case of registering io buffer from
> > > daemon context.
> > >
> > > And in typical implementation, the unregistering io buffer should be done
> > > in daemon context too, then I am wondering if any user-visible improvement
> > > can be observed in this more complicated & fragile way:
> >
> > Yes, generally I would expect the unregister to happen on the daemon
> > task too. But it's possible (with patch "ublk: allow
> > UBLK_IO_(UN)REGISTER_IO_BUF on any task") for the
> > UBLK_IO_UNREGISTER_IO_BUF to be issued on another task. And even if
> > the unregister happens on the daemon task, it's up to the io_uring
> > layer to actually call ublk_io_release() once all requests using the
> > registered buffer have completed. Assuming only the daemon task issues
> > io_uring requests using the buffer, I believe ublk_io_release() will
> > always currently be called on that task. But I'd rather not make
> > assumptions about the io_uring layer. It's also possible in principle
> > for ublk_io_release() to check whether it's called on the daemon task
> > and have a fast path in that case (similar to UBLK_IO_REGISTER_IO_BUF).
>
> Yes, my point is that optimization should focus on common case.

Of course, I agree. I think the common case is for both register and
unregister to be issued on the daemon task. I only optimized the
register so far because it appears significantly more expensive (due to
the request lookup on the tagset and the CAS loop to increment the
refcount). I can test optimizing the unregister as well and see if it's
a net win.

>
> > But I'm not sure it's worth the cost of an additional ubq/io lookup.
> >
> > >
> > > - __ublk_check_and_get_req() is bypassed.
> > >
> > > - buggy application may overflow ->buffers_registered
> >
> > Isn't it already possible in principle for a ublk server to overflow
> > ublk_rq_data's refcount_t by registering the same ublk request with
> > lots of buffers? But sure, if you're concerned about this overflow, I
> > can easily add a check for it.
>
> At least refcount_inc_not_zero() will warn if it happens.
> >
> > >
> > > So can you share any data about this optimization on workload with local
> > > registering & remote un-registering io buffer? Also is this usage
> > > really one common case?
> > Sure, I can provide some performance measurements for this
> > optimization. From looking at CPU profiles in the past, the reference
> > counting and request lookup accounted for around 2% of the ublk server
> > CPU time. To be clear, in our use case, both the register and
> > unregister happen on the daemon task. I just haven't bothered
> > optimizing the reference-counting for the unregister yet because it
> > doesn't appear nearly as expensive.
>
> If you are talking about per-io-task, I guess it may not make a difference
> from user viewpoint from both iops and cpu utilization since the counter is
> basically per-task variable, and atomic cost shouldn't mean something for us.

Atomic RMW operations are more expensive than non-atomic ones even when
the cache line isn't contended (at least in my experience on x86). The
effect is greater in systems with large numbers of CPUs (and especially
with multiple NUMA nodes). Let me collect some precise performance
numbers, but I recall it being a significant hotspot.

Best,
Caleb
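
P.S. To make the counting scheme in the patch concrete, here is a
minimal userspace model of it (a sketch only: plain C11 atomics stand in
for refcount_t, and the fake_rq type and helper names are invented for
illustration, not taken from the driver):

/*
 * Simplified model of the patch's scheme: the per-request ref starts at
 * a large value, buffers registered on the daemon task are counted with
 * a plain (non-atomic) increment, each buffer release drops one
 * reference, and commit-and-fetch atomically subtracts the unused
 * remainder.  The request completes when the count reaches zero.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

#define REF_INIT (1u << 30)		/* stand-in for UBLK_REFCOUNT_INIT */

struct fake_rq {
	atomic_uint ref;		/* models ublk_rq_data->ref */
	unsigned buffers_registered;	/* touched only by the daemon task */
	bool completed;
};

static void rq_start(struct fake_rq *rq)
{
	atomic_init(&rq->ref, REF_INIT);
	rq->buffers_registered = 0;
	rq->completed = false;
}

/* daemon-task buffer-register fast path: no atomic RMW */
static void rq_register_buf(struct fake_rq *rq)
{
	rq->buffers_registered++;
}

/* called once per registered buffer when it is released */
static void rq_release_buf(struct fake_rq *rq)
{
	if (atomic_fetch_sub(&rq->ref, 1) == 1)
		rq->completed = true;
}

/* commit-and-fetch: drop every reference except the registered buffers' */
static void rq_commit(struct fake_rq *rq)
{
	unsigned sub = REF_INIT - rq->buffers_registered;

	if (atomic_fetch_sub(&rq->ref, sub) == sub)
		rq->completed = true;
}

int main(void)
{
	struct fake_rq rq;

	rq_start(&rq);
	rq_register_buf(&rq);	/* two buffers registered on the daemon task */
	rq_register_buf(&rq);
	rq_commit(&rq);		/* ref drops to 2; request still in flight */
	assert(!rq.completed);
	rq_release_buf(&rq);
	rq_release_buf(&rq);	/* last buffer release completes the request */
	assert(rq.completed);
	return 0;
}

Either ordering of commit and release works, since completion only
happens when the count actually reaches zero.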