On Mon, Jun 9, 2025 at 2:02 AM Ming Lei <ming.lei@xxxxxxxxxx> wrote:
>
> On Fri, Jun 06, 2025 at 03:40:10PM -0600, Caleb Sander Mateos wrote:
> > ublk_register_io_buf() performs an expensive atomic refcount increment,
> > as well as a lot of pointer chasing to look up the struct request.
> >
> > Create a separate ublk_daemon_register_io_buf() for the daemon task to
> > call. Initialize ublk_rq_data's reference count to a large number, count
> > the number of buffers registered on the daemon task nonatomically, and
> > atomically subtract the large number minus the number of registered
> > buffers in ublk_commit_and_fetch().
> >
> > Also obtain the struct request directly from ublk_io's req field instead
> > of looking it up on the tagset.
> >
> > Signed-off-by: Caleb Sander Mateos <csander@xxxxxxxxxxxxxxx>
> > ---
> >  drivers/block/ublk_drv.c | 59 ++++++++++++++++++++++++++++++++++------
> >  1 file changed, 50 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > index 2084bbdd2cbb..ec9e0fd21b0e 100644
> > --- a/drivers/block/ublk_drv.c
> > +++ b/drivers/block/ublk_drv.c
> > @@ -81,12 +81,20 @@
> >  #define UBLK_PARAM_TYPE_ALL                                \
> >  	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
> >  	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
> >  	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
> >
> > +/*
> > + * Initialize refcount to a large number to include any registered buffers.
> > + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
> > + * any buffers registered on the io daemon task.
> > + */
> > +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
> > +
> >  struct ublk_rq_data {
> >  	refcount_t ref;
> > +	unsigned buffers_registered;
> >
> >  	/* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
> >  	u16 buf_index;
> >  	void *buf_ctx_handle;
> >  };
> > @@ -677,11 +685,12 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
> >  				     struct request *req)
> >  {
> >  	if (ublk_need_req_ref(ubq)) {
> >  		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> >
> > -		refcount_set(&data->ref, 1);
> > +		refcount_set(&data->ref, UBLK_REFCOUNT_INIT);
> > +		data->buffers_registered = 0;
> >  	}
> >  }
> >
> >  static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
> >  				    struct request *req)
> > @@ -706,10 +715,19 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
> >  	} else {
> >  		__ublk_complete_rq(req);
> >  	}
> >  }
> >
> > +static inline void ublk_sub_req_ref(struct request *req)
> > +{
> > +	struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > +	unsigned sub_refs = UBLK_REFCOUNT_INIT - data->buffers_registered;
> > +
> > +	if (refcount_sub_and_test(sub_refs, &data->ref))
> > +		__ublk_complete_rq(req);
> > +}
> > +
> >  static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
> >  {
> >  	return ubq->flags & UBLK_F_NEED_GET_DATA;
> >  }
> >
> > @@ -1184,14 +1202,12 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
> >
> >  static void ublk_auto_buf_reg_fallback(struct request *req)
> >  {
> >  	const struct ublk_queue *ubq = req->mq_hctx->driver_data;
> >  	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
> > -	struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> >
> >  	iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
> > -	refcount_set(&data->ref, 1);
> >  }
> >
> >  static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> >  			      unsigned int issue_flags)
> >  {
> > @@ -1207,13 +1223,12 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> >  			return true;
> >  		}
> >  		blk_mq_end_request(req, BLK_STS_IOERR);
> >  		return false;
> >  	}
> > -	/* one extra reference is dropped by ublk_io_release */
> > -	refcount_set(&data->ref, 2);
> >
> > +	data->buffers_registered = 1;
> >  	data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
> >  	/* store buffer index in request payload */
> >  	data->buf_index = pdu->buf.index;
> >  	io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
> >  	return true;
> > @@ -1221,14 +1236,14 @@ static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
> >
> >  static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
> >  				   struct request *req, struct ublk_io *io,
> >  				   unsigned int issue_flags)
> >  {
> > +	ublk_init_req_ref(ubq, req);
> >  	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
> >  		return ublk_auto_buf_reg(req, io, issue_flags);
> >
> > -	ublk_init_req_ref(ubq, req);
> >  	return true;
> >  }
> >
> >  static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
> >  			  struct ublk_io *io)
> > @@ -2019,10 +2034,31 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
> >  	}
> >
> >  	return 0;
> >  }
> >
> > +static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
> > +				       const struct ublk_queue *ubq,
> > +				       const struct ublk_io *io,
> > +				       unsigned index, unsigned issue_flags)
> > +{
> > +	struct request *req = io->req;
> > +	struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
> > +	int ret;
> > +
> > +	if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
> > +		return -EINVAL;
> > +
> > +	ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
> > +				      issue_flags);
> > +	if (ret)
> > +		return ret;
> > +
> > +	data->buffers_registered++;
>
> This optimization replaces one ublk_get_req_ref()/refcount_inc_not_zero()
> with data->buffers_registered++ in case of registering io buffer from
> daemon context.
>
> And in typical implementation, the unregistering io buffer should be done
> in daemon context too, then I am wondering if any user-visible improvement
> can be observed in this more complicated & fragile way:

Yes, generally I would expect the unregister to happen on the daemon
task too. But it's possible (with patch "ublk: allow
UBLK_IO_(UN)REGISTER_IO_BUF on any task") for the
UBLK_IO_UNREGISTER_IO_BUF to be issued on another task. And even if the
unregister happens on the daemon task, it's up to the io_uring layer to
actually call ublk_io_release() once all requests using the registered
buffer have completed. Assuming only the daemon task issues io_uring
requests using the buffer, I believe ublk_io_release() will always
currently be called on that task. But I'd rather not make assumptions
about the io_uring layer. It's also possible in principle for
ublk_io_release() to check whether it's called on the daemon task and
have a fast path in that case (similar to UBLK_IO_REGISTER_IO_BUF). But
I'm not sure it's worth the cost of an additional ubq/io lookup.

>
> - __ublk_check_and_get_req() is bypassed.
>
> - buggy application may overflow ->buffers_registered

Isn't it already possible in principle for a ublk server to overflow
ublk_rq_data's refcount_t by registering the same ublk request with
lots of buffers? But sure, if you're concerned about this overflow, I
can easily add a check for it; a rough sketch is in the P.S. below.

>
> So can you share any data about this optimization on workload with local
> registering & remote un-registering io buffer? Also is this usage
> really one common case?

Sure, I can provide some performance measurements for this optimization.
From looking at CPU profiles in the past, the reference counting and
request lookup accounted for around 2% of the ublk server CPU time.

To be clear, in our use case, both the register and unregister happen
on the daemon task. I just haven't bothered optimizing the
reference-counting for the unregister yet because it doesn't appear
nearly as expensive.

Best,
Caleb
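
P.S. To make the overflow check concrete, it would be roughly something
like the following untested sketch, placed in
ublk_daemon_register_io_buf() before the io_buffer_register_bvec() call
(the -EOVERFLOW error code is just a placeholder):

	/*
	 * Cap the number of buffers registered on the daemon task so the
	 * subtraction in ublk_sub_req_ref() can never underflow and buffer
	 * releases alone can never drop the refcount to zero early.
	 */
	if (data->buffers_registered >= UBLK_REFCOUNT_INIT - 1)
		return -EOVERFLOW;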