Re: [PATCH 6/8] ublk: implement ->queue_rqs()

Caleb Sander Mateos <csander@xxxxxxxxxxxxxxx> · Wed, 26 Mar 2025 14:30:56 -0700

On Mon, Mar 24, 2025 at 6:49 AM Ming Lei <ming.lei@xxxxxxxxxx> wrote:
>
> Implement ->queue_rqs() for improving perf in case of MQ.
>
> In this way, we just need to call io_uring_cmd_complete_in_task() once for
> one batch, then both io_uring and ublk server can get exact batch from
> client side.
>
> Follows IOPS improvement:
>
> - tests
>
>         tools/testing/selftests/ublk/kublk add -t null -q 2 [-z]
>
>         fio/t/io_uring -p0 /dev/ublkb0
>
> - results:
>
>         more than 10% IOPS boost observed
>
> Pass all ublk selftests, especially the io dispatch order test.
>
> Cc: Uday Shankar <ushankar@xxxxxxxxxxxxxxx>
> Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
> ---
>  drivers/block/ublk_drv.c | 85 ++++++++++++++++++++++++++++++++++++----
>  1 file changed, 77 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 53a463681a41..86621fde7fde 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -83,6 +83,7 @@ struct ublk_rq_data {
>  struct ublk_uring_cmd_pdu {
>         struct ublk_queue *ubq;
>         u16 tag;
> +       struct rq_list list;
>  };
>
>  /*
> @@ -1258,6 +1259,32 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
>         io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
>  }
>
> +static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
> +               unsigned int issue_flags)
> +{
> +       struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> +       struct ublk_queue *ubq = pdu->ubq;
> +       struct request *rq;
> +
> +       while ((rq = rq_list_pop(&pdu->list))) {
> +               struct ublk_io *io = &ubq->ios[rq->tag];
> +
> +               ublk_rq_task_work_cb(io->cmd, issue_flags);

ublk_rq_task_work_cb() is duplicating the lookup of ubq, rq, and io.
Could you factor out a helper that takes those values instead of cmd?

> +       }
> +}
> +
> +static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
> +{
> +       struct request *rq = l->head;
> +       struct ublk_io *io = &ubq->ios[rq->tag];
> +       struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(io->cmd);
> +
> +       pdu->ubq = ubq;

Why does pdu->ubq need to be set here but not in ublk_queue_cmd()? I
would have thought it would already be set to ubq because pdu comes
from a rq belonging to this ubq.

> +       pdu->list = *l;
> +       rq_list_init(l);
> +       io_uring_cmd_complete_in_task(io->cmd, ublk_cmd_list_tw_cb);

Could store io->cmd in a variable to avoid looking it up twice.

> +}
> +
>  static enum blk_eh_timer_return ublk_timeout(struct request *rq)
>  {
>         struct ublk_queue *ubq = rq->mq_hctx->driver_data;
> @@ -1296,16 +1323,13 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
>         return BLK_EH_RESET_TIMER;
>  }
>
> -static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> -               const struct blk_mq_queue_data *bd)
> +static blk_status_t ublk_prep_rq_batch(struct request *rq)

naming nit: why "batch"?

>  {
> -       struct ublk_queue *ubq = hctx->driver_data;
> -       struct request *rq = bd->rq;
> +       struct ublk_queue *ubq = rq->mq_hctx->driver_data;
>         blk_status_t res;
>
> -       if (unlikely(ubq->fail_io)) {
> +       if (unlikely(ubq->fail_io))
>                 return BLK_STS_TARGET;
> -       }
>
>         /* fill iod to slot in io cmd buffer */
>         res = ublk_setup_iod(ubq, rq);
> @@ -1324,17 +1348,58 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>         if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
>                 return BLK_STS_IOERR;
>
> +       if (unlikely(ubq->canceling))
> +               return BLK_STS_IOERR;

Why is ubq->cancelling treated differently for ->queue_rq() vs. ->queue_rqs()?

> +
> +       blk_mq_start_request(rq);
> +       return BLK_STS_OK;
> +}
> +
> +static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> +               const struct blk_mq_queue_data *bd)
> +{
> +       struct ublk_queue *ubq = hctx->driver_data;
> +       struct request *rq = bd->rq;
> +       blk_status_t res;
> +
>         if (unlikely(ubq->canceling)) {
>                 __ublk_abort_rq(ubq, rq);
>                 return BLK_STS_OK;
>         }
>
> -       blk_mq_start_request(bd->rq);
> -       ublk_queue_cmd(ubq, rq);
> +       res = ublk_prep_rq_batch(rq);
> +       if (res != BLK_STS_OK)
> +               return res;
>
> +       ublk_queue_cmd(ubq, rq);
>         return BLK_STS_OK;
>  }
>
> +static void ublk_queue_rqs(struct rq_list *rqlist)
> +{
> +       struct rq_list requeue_list = { };
> +       struct rq_list submit_list = { };
> +       struct ublk_queue *ubq = NULL;
> +       struct request *req;
> +
> +       while ((req = rq_list_pop(rqlist))) {
> +               struct ublk_queue *this_q = req->mq_hctx->driver_data;
> +
> +               if (ubq && ubq != this_q && !rq_list_empty(&submit_list))
> +                       ublk_queue_cmd_list(ubq, &submit_list);
> +               ubq = this_q;

Probably could avoid the extra ->driver_data dereference on every rq
by comparing the mq_hctx pointers instead. The ->driver_data
dereference could be moved to the ublk_queue_cmd_list() calls.

> +
> +               if (ublk_prep_rq_batch(req) == BLK_STS_OK)
> +                       rq_list_add_tail(&submit_list, req);
> +               else
> +                       rq_list_add_tail(&requeue_list, req);
> +       }
> +
> +       if (ubq && !rq_list_empty(&submit_list))
> +               ublk_queue_cmd_list(ubq, &submit_list);
> +       *rqlist = requeue_list;
> +}
> +
>  static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
>                 unsigned int hctx_idx)
>  {
> @@ -1347,6 +1412,7 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
>
>  static const struct blk_mq_ops ublk_mq_ops = {
>         .queue_rq       = ublk_queue_rq,
> +       .queue_rqs      = ublk_queue_rqs,
>         .init_hctx      = ublk_init_hctx,
>         .timeout        = ublk_timeout,
>  };
> @@ -3147,6 +3213,9 @@ static int __init ublk_init(void)
>         BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
>                         UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
>
> +       BUILD_BUG_ON(sizeof(struct ublk_uring_cmd_pdu) >
> +                       sizeof_field(struct io_uring_cmd, pdu));

Looks like Uday also suggested this, but if you change
ublk_get_uring_cmd_pdu() to use io_uring_cmd_to_pdu(), you get this
check for free.

Best,
Caleb

> +
>         init_waitqueue_head(&ublk_idr_wq);
>
>         ret = misc_register(&ublk_misc);
> --
> 2.47.0
>