Re: [PATCH 3/4] ublk: add feature UBLK_F_AUTO_ZERO_COPY

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sat, Apr 26, 2025 at 2:41 AM Ming Lei <ming.lei@xxxxxxxxxx> wrote:
>
> UBLK_F_SUPPORT_ZERO_COPY requires ublk server to issue explicit buffer
> register/unregister uring_cmd for each IO, this way is not only inefficient,
> but also introduce dependency between buffer consumer and buffer register/
> unregister uring_cmd, please see tools/testing/selftests/ublk/stripe.c
> in which backing file IO has to be issued one by one by IOSQE_IO_LINK.

This is a great idea!

>
> Add feature UBLK_F_AUTO_ZERO_COPY for addressing the existed zero copy
> limit:

nit: "existed" -> "existing", "limit" -> "limitation"

>
> - register request buffer automatically before delivering io command to
> ublk server
>
> - unregister request buffer automatically when completing the request
>
> - io_uring will unregister the buffer automatically when uring is
>   exiting, so we needn't worry about accident exit
>
> Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
> ---
>  drivers/block/ublk_drv.c      | 87 +++++++++++++++++++++++++++--------
>  include/uapi/linux/ublk_cmd.h | 20 ++++++++
>  2 files changed, 89 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 347790b3a633..453767f91431 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -64,7 +64,8 @@
>                 | UBLK_F_CMD_IOCTL_ENCODE \
>                 | UBLK_F_USER_COPY \
>                 | UBLK_F_ZONED \
> -               | UBLK_F_USER_RECOVERY_FAIL_IO)
> +               | UBLK_F_USER_RECOVERY_FAIL_IO \
> +               | UBLK_F_AUTO_ZERO_COPY)
>
>  #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
>                 | UBLK_F_USER_RECOVERY_REISSUE \
> @@ -131,6 +132,14 @@ struct ublk_uring_cmd_pdu {
>   */
>  #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
>
> +/*
> + * request buffer is registered automatically, so we have to unregister it
> + * before completing this request.
> + *
> + * io_uring will unregister buffer automatically for us during exiting.
> + */
> +#define UBLK_IO_FLAG_UNREG_BUF         0x10
> +
>  /* atomic RW with ubq->cancel_lock */
>  #define UBLK_IO_FLAG_CANCELED  0x80000000
>
> @@ -207,7 +216,8 @@ static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
>                                                    int tag);
>  static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
>  {
> -       return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY);
> +       return ub->dev_info.flags & (UBLK_F_USER_COPY |
> +                       UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_ZERO_COPY);
>  }
>
>  static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
> @@ -614,6 +624,11 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
>         return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
>  }
>
> +static inline bool ublk_support_auto_zc(const struct ublk_queue *ubq)
> +{
> +       return ubq->flags & UBLK_F_AUTO_ZERO_COPY;
> +}
> +
>  static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
>  {
>         return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY);
> @@ -621,7 +636,7 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
>
>  static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
>  {
> -       return !ublk_support_user_copy(ubq);
> +       return !ublk_support_user_copy(ubq) && !ublk_support_auto_zc(ubq);
>  }
>
>  static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
> @@ -629,17 +644,21 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
>         /*
>          * read()/write() is involved in user copy, so request reference
>          * has to be grabbed
> +        *
> +        * For auto zc, ublk server still may issue UBLK_IO_COMMIT_AND_FETCH_REQ
> +        * before one registered buffer is used up, so reference is
> +        * required too.
>          */
> -       return ublk_support_user_copy(ubq);
> +       return ublk_support_user_copy(ubq) || ublk_support_auto_zc(ubq);

Since both places where ublk_support_user_copy() is used are being
adjusted to also check ublk_support_auto_zc(), maybe
ublk_support_user_copy() should just be changed to check
UBLK_F_AUTO_ZERO_COPY too?

>  }
>
>  static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
> -               struct request *req)
> +               struct request *req, int init_ref)
>  {
>         if (ublk_need_req_ref(ubq)) {
>                 struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
>
> -               kref_init(&data->ref);
> +               refcount_set(&data->ref.refcount, init_ref);

It might be nicer not to mix kref and raw reference count operations.
Maybe you could add a prep patch that switches from struct kref to
refcount_t?

>         }
>  }
>
> @@ -667,6 +686,15 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
>         }
>  }
>
> +/* for ublk zero copy */
> +static void ublk_io_release(void *priv)
> +{
> +       struct request *rq = priv;
> +       struct ublk_queue *ubq = rq->mq_hctx->driver_data;
> +
> +       ublk_put_req_ref(ubq, rq);
> +}
> +
>  static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
>  {
>         return ubq->flags & UBLK_F_NEED_GET_DATA;
> @@ -1231,7 +1259,22 @@ static void ublk_dispatch_req(struct ublk_queue *ubq,
>                         mapped_bytes >> 9;
>         }
>
> -       ublk_init_req_ref(ubq, req);
> +       if (ublk_support_auto_zc(ubq) && ublk_rq_has_data(req)) {
> +               int ret;
> +
> +               /* one extra reference is dropped by ublk_io_release */
> +               ublk_init_req_ref(ubq, req, 2);
> +               ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
> +                                             tag, issue_flags);

Using the ublk request tag as the registered buffer index may be too
limiting. It would cause collisions if there are multiple ublk devices
or queues handled on a single io_uring. It also requires offsetting
any registered user buffers so their indices come after all the ublk
ones, which may be difficult if ublk devices are added dynamically.
Perhaps the UBLK_IO_FETCH_REQ operation could specify the registered
buffer index to use for the request?

This also requires the io_uring issuing the zero-copy I/Os to be the
same as the one submitting the ublk fetch requests. That would also
make it difficult for us to adopt for our ublk server, which uses
separate io_urings. Not sure if there is a simple way the ublk server
could specify what io_uring to register the ublk zero-copy buffers
with.

> +               if (ret) {
> +                       blk_mq_end_request(req, BLK_STS_IOERR);
> +                       return;

Does this leak the ublk fetch request? Seems like it should be
completed to the ublk server with an error code.

> +               }
> +               io->flags |= UBLK_IO_FLAG_UNREG_BUF;
> +       } else {
> +               ublk_init_req_ref(ubq, req, 1);
> +       }
> +
>         ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
>  }
>
> @@ -1593,7 +1636,8 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
>  }
>
>  static void ublk_commit_completion(struct ublk_device *ub,
> -               const struct ublksrv_io_cmd *ub_cmd)
> +               const struct ublksrv_io_cmd *ub_cmd,
> +               unsigned int issue_flags)
>  {
>         u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
>         struct ublk_queue *ubq = ublk_get_queue(ub, qid);
> @@ -1604,6 +1648,15 @@ static void ublk_commit_completion(struct ublk_device *ub,
>         io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
>         io->res = ub_cmd->result;
>
> +       if (io->flags & UBLK_IO_FLAG_UNREG_BUF) {
> +               int ret;
> +
> +               ret = io_buffer_unregister_bvec(io->cmd, tag, issue_flags);
> +               if (ret)
> +                       io->res = ret;

I think it would be confusing to report an error to the application
submitting the I/O if unregistration fails. The only scenario where it
seems possible for this to fail is if userspace issues an
IORING_REGISTER_BUFFERS_UPDATE that overwrites the registered buffer
slot while the ublk request is being handled by the ublk server. I
would either ignore any error from io_buffer_unregister_bvec() or
return it to the ublk server.

> +               io->flags &= ~UBLK_IO_FLAG_UNREG_BUF;
> +       }
> +
>         /* find the io request and complete */
>         req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
>         if (WARN_ON_ONCE(unlikely(!req)))
> @@ -1942,14 +1995,6 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
>         io_uring_cmd_mark_cancelable(cmd, issue_flags);
>  }
>
> -static void ublk_io_release(void *priv)
> -{
> -       struct request *rq = priv;
> -       struct ublk_queue *ubq = rq->mq_hctx->driver_data;
> -
> -       ublk_put_req_ref(ubq, rq);
> -}
> -
>  static int ublk_register_io_buf(struct io_uring_cmd *cmd,
>                                 struct ublk_queue *ubq, unsigned int tag,
>                                 unsigned int index, unsigned int issue_flags)
> @@ -2124,7 +2169,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
>                 }
>
>                 ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
> -               ublk_commit_completion(ub, ub_cmd);
> +               ublk_commit_completion(ub, ub_cmd, issue_flags);
>                 break;
>         case UBLK_IO_NEED_GET_DATA:
>                 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
> @@ -2730,6 +2775,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
>                 return -EINVAL;
>         }
>
> +       /* F_AUTO_ZERO_COPY and F_SUPPORT_ZERO_COPY can't co-exist */
> +       if ((info.flags & UBLK_F_AUTO_ZERO_COPY) &&
> +                       (info.flags & UBLK_F_SUPPORT_ZERO_COPY))
> +               return -EINVAL;
> +
>         /*
>          * unprivileged device can't be trusted, but RECOVERY and
>          * RECOVERY_REISSUE still may hang error handling, so can't
> @@ -2747,7 +2797,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
>                  * buffer by pwrite() to ublk char device, which can't be
>                  * used for unprivileged device
>                  */
> -               if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
> +               if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
> +                                       UBLK_F_AUTO_ZERO_COPY))
>                         return -EINVAL;
>         }
>
> diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
> index 583b86681c93..d8bb5e55cce7 100644
> --- a/include/uapi/linux/ublk_cmd.h
> +++ b/include/uapi/linux/ublk_cmd.h
> @@ -211,6 +211,26 @@
>   */
>  #define UBLK_F_USER_RECOVERY_FAIL_IO (1ULL << 9)
>
> +/*
> + * request buffer is registered automatically before delivering this io
> + * command to ublk server, meantime it is un-registered automatically
> + * when completing this io command.
> + *
> + * request tag has to be used as the buffer index, and ublk server has to
> + * pass this IO's tag as buffer index for using the registered zero copy
> + * buffer
> + *
> + * This way avoids extra uring_cmd cost, but also simplifies backend
> + * implementation, such as, the dependency on IO_REGISTER_IO_BUF and
> + * IO_UNREGISTER_IO_BUF becomes not necessary.
> + *
> + * For using this feature, ublk server has to register buffer table
> + * in sparse way, and buffer number has to be >= ublk queue depth.
> + *
> + * This feature is preferred to UBLK_F_SUPPORT_ZERO_COPY.
> + */
> +#define UBLK_F_AUTO_ZERO_COPY  (1ULL << 10)

This flag is already taken by UBLK_F_UPDATE_SIZE in commit "ublk: Add
UBLK_U_CMD_UPDATE_SIZE". You may need to rebase on for-6.16/block.

Best,
Caleb

> +
>  /* device state */
>  #define UBLK_S_DEV_DEAD        0
>  #define UBLK_S_DEV_LIVE        1
> --
> 2.47.0
>





[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux