On Mon, Sep 1, 2025 at 3:03 AM Ming Lei <ming.lei@xxxxxxxxxx> wrote:
>
> Add new command UBLK_U_IO_PREP_IO_CMDS, which is the batch version of
> UBLK_IO_FETCH_REQ.
>
> Add new command UBLK_U_IO_COMMIT_IO_CMDS, which is for committing io command
> result only, still the batch version.
>
> The new command header type is `struct ublk_batch_io`, and fixed buffer is
> required for these two uring_cmd.

The commit message could be clearer that this doesn't actually implement
these commands yet, it only validates the SQE fields.

>
> Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
> ---
>  drivers/block/ublk_drv.c      | 102 +++++++++++++++++++++++++++++++++-
>  include/uapi/linux/ublk_cmd.h |  49 ++++++++++++++++
>  2 files changed, 149 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 46be5b656f22..4da0dbbd7e16 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -85,6 +85,11 @@
>                  UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \
>                  UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
>
> +#define UBLK_BATCH_F_ALL \
> +        (UBLK_BATCH_F_HAS_ZONE_LBA | \
> +         UBLK_BATCH_F_HAS_BUF_ADDR | \
> +         UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
> +
>  struct ublk_uring_cmd_pdu {
>          /*
>           * Store requests in same batch temporarily for queuing them to
> @@ -108,6 +113,11 @@ struct ublk_uring_cmd_pdu {
>          u16 tag;
>  };
>
> +struct ublk_batch_io_data {
> +        struct ublk_queue *ubq;
> +        struct io_uring_cmd *cmd;
> +};
> +
>  /*
>   * io command is active: sqe cmd is received, and its cqe isn't done
>   *
> @@ -277,7 +287,7 @@ static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
>          return ub->dev_info.flags & UBLK_F_ZONED;
>  }
>
> -static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
> +static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)

This change could go in a separate commit.

>  {
>          return ubq->flags & UBLK_F_ZONED;
>  }
> @@ -2528,10 +2538,98 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
>          return ublk_ch_uring_cmd_local(cmd, issue_flags);
>  }
>
> +static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
> +{
> +        const unsigned short mask = UBLK_BATCH_F_HAS_BUF_ADDR |
> +                UBLK_BATCH_F_HAS_ZONE_LBA;

Can we use a fixed-size integer type, i.e. u16?

> +
> +        if (uc->flags & ~UBLK_BATCH_F_ALL)
> +                return -EINVAL;
> +
> +        /* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */
> +        if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
> +                        (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
> +                return -EINVAL;
> +
> +        switch (uc->flags & mask) {
> +        case 0:
> +                if (uc->elem_bytes != 8)

sizeof(struct ublk_elem_header)?

> +                        return -EINVAL;
> +                break;
> +        case UBLK_BATCH_F_HAS_ZONE_LBA:
> +        case UBLK_BATCH_F_HAS_BUF_ADDR:
> +                if (uc->elem_bytes != 8 + 8)

sizeof(u64)?

> +                        return -EINVAL;
> +                break;
> +        case UBLK_BATCH_F_HAS_ZONE_LBA | UBLK_BATCH_F_HAS_BUF_ADDR:
> +                if (uc->elem_bytes != 8 + 8 + 8)
> +                        return -EINVAL;

So elem_bytes is redundant with flags? Do we really need a separate
field then?

> +                break;
> +        default:
> +                return -EINVAL;

default case is unreachable?

> +        }
> +
> +        return 0;
> +}
> +
> +static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data,
> +                                const struct ublk_batch_io *uc)
> +{
> +        if (!(data->cmd->flags & IORING_URING_CMD_FIXED))
> +                return -EINVAL;
> +
> +        if (uc->nr_elem * uc->elem_bytes > data->cmd->sqe->len)

Cast nr_elem and/or elem_bytes to u32 to avoid overflow concerns?
Should also use READ_ONCE() to read the userspace-mapped sqe->len.
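Something like this is what I have in mind (untested, just to show the
widened multiply plus a READ_ONCE() of the SQE field):

        if ((u32)uc->nr_elem * uc->elem_bytes > READ_ONCE(data->cmd->sqe->len))
                return -E2BIG;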
> +                return -E2BIG;
> +
> +        if (uc->nr_elem > data->ubq->q_depth)
> +                return -E2BIG;
> +
> +        if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
> +                        !ublk_queue_is_zoned(data->ubq))
> +                return -EINVAL;
> +
> +        if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
> +                        !ublk_need_map_io(data->ubq))
> +                return -EINVAL;
> +
> +        if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
> +                        !ublk_support_auto_buf_reg(data->ubq))
> +                return -EINVAL;
> +
> +        if (uc->reserved || uc->reserved2)
> +                return -EINVAL;
> +
> +        return ublk_check_batch_cmd_flags(uc);
> +}
> +
>  static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
>                                        unsigned int issue_flags)
>  {
> -        return -EOPNOTSUPP;
> +        const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe);
> +        struct ublk_device *ub = cmd->file->private_data;
> +        struct ublk_batch_io_data data = {
> +                .cmd = cmd,
> +        };
> +        u32 cmd_op = cmd->cmd_op;
> +        int ret = -EINVAL;
> +
> +        if (uc->q_id >= ub->dev_info.nr_hw_queues)
> +                goto out;
> +        data.ubq = ublk_get_queue(ub, uc->q_id);

Should be using READ_ONCE() to read from userspace-mapped memory.

> +
> +        switch (cmd_op) {
> +        case UBLK_U_IO_PREP_IO_CMDS:
> +        case UBLK_U_IO_COMMIT_IO_CMDS:
> +                ret = ublk_check_batch_cmd(&data, uc);
> +                if (ret)
> +                        goto out;
> +                ret = -EOPNOTSUPP;
> +                break;
> +        default:
> +                ret = -EOPNOTSUPP;
> +        }
> +out:
> +        return ret;
>  }
>
>  static inline bool ublk_check_ubuf_dir(const struct request *req,
> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
> index ec77dabba45b..01d3af52cfb4 100644
> --- a/include/uapi/linux/ublk_cmd.h
> +++ b/include/uapi/linux/ublk_cmd.h
> @@ -102,6 +102,10 @@
>          _IOWR('u', 0x23, struct ublksrv_io_cmd)
>  #define UBLK_U_IO_UNREGISTER_IO_BUF \
>          _IOWR('u', 0x24, struct ublksrv_io_cmd)
> +#define UBLK_U_IO_PREP_IO_CMDS \
> +        _IOWR('u', 0x25, struct ublk_batch_io)
> +#define UBLK_U_IO_COMMIT_IO_CMDS \
> +        _IOWR('u', 0x26, struct ublk_batch_io)
>
>  /* only ABORT means that no re-fetch */
>  #define UBLK_IO_RES_OK 0
> @@ -525,6 +529,51 @@ struct ublksrv_io_cmd {
>          };
>  };
>
> +struct ublk_elem_header {
> +        __u16 tag;        /* IO tag */
> +
> +        /*
> +         * Buffer index for incoming io command, only valid iff
> +         * UBLK_F_AUTO_BUF_REG is set
> +         */
> +        __u16 buf_index;
> +        __u32 result;     /* I/O completion result (commit only) */

The result is unsigned? So there's no way to specify a request failure?

> +};
> +
> +/*
> + * uring_cmd buffer structure

Add "for batch commands"?

> + *
> + * buffer includes multiple elements, which number is specified by
> + * `nr_elem`. Each element buffer is organized in the following order:
> + *
> + * struct ublk_elem_buffer {
> + *        // Mandatory fields (8 bytes)
> + *        struct ublk_elem_header header;
> + *
> + *        // Optional fields (8 bytes each, included based on flags)
> + *
> + *        // Buffer address (if UBLK_BATCH_F_HAS_BUF_ADDR) for copying data
> + *        // between ublk request and ublk server buffer
> + *        __u64 buf_addr;
> + *
> + *        // returned Zone append LBA (if UBLK_BATCH_F_HAS_ZONE_LBA)
> + *        __u64 zone_lba;
> + * }
> + *
> + * Used for `UBLK_U_IO_PREP_IO_CMDS` and `UBLK_U_IO_COMMIT_IO_CMDS`
> + */
> +struct ublk_batch_io {
> +        __u16 q_id;

So this doesn't allow batching completions across ublk queues? That
seems like it significantly limits the usefulness of this feature. A
ublk server thread may be handling ublk requests from a number of
client threads which are submitting to different ublk queues.
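Going back to the q_id read further up: by READ_ONCE() I mean taking a
snapshot of the field and using that copy for both the bounds check and
the queue lookup, e.g. (untested):

        u16 q_id = READ_ONCE(uc->q_id);

        if (q_id >= ub->dev_info.nr_hw_queues)
                goto out;
        data.ubq = ublk_get_queue(ub, q_id);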
Best,
Caleb

> +#define UBLK_BATCH_F_HAS_ZONE_LBA (1 << 0)
> +#define UBLK_BATCH_F_HAS_BUF_ADDR (1 << 1)
> +#define UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK (1 << 2)
> +        __u16 flags;
> +        __u16 nr_elem;
> +        __u8  elem_bytes;
> +        __u8  reserved;
> +        __u64 reserved2;
> +};
> +
>  struct ublk_param_basic {
>  #define UBLK_ATTR_READ_ONLY (1 << 0)
>  #define UBLK_ATTR_ROTATIONAL (1 << 1)
> --
> 2.47.0
>