On Sat, Apr 26, 2025 at 03:42:59PM -0700, Caleb Sander Mateos wrote: > On Sat, Apr 26, 2025 at 2:41 AM Ming Lei <ming.lei@xxxxxxxxxx> wrote: > > > > UBLK_F_SUPPORT_ZERO_COPY requires ublk server to issue explicit buffer > > register/unregister uring_cmd for each IO, this way is not only inefficient, > > but also introduce dependency between buffer consumer and buffer register/ > > unregister uring_cmd, please see tools/testing/selftests/ublk/stripe.c > > in which backing file IO has to be issued one by one by IOSQE_IO_LINK. > > This is a great idea! > > > > > Add feature UBLK_F_AUTO_ZERO_COPY for addressing the existed zero copy > > limit: > > nit: "existed" -> "existing", "limit" -> "limitation" > > > > > - register request buffer automatically before delivering io command to > > ublk server > > > > - unregister request buffer automatically when completing the request > > > > - io_uring will unregister the buffer automatically when uring is > > exiting, so we needn't worry about accident exit > > > > Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx> > > --- > > drivers/block/ublk_drv.c | 87 +++++++++++++++++++++++++++-------- > > include/uapi/linux/ublk_cmd.h | 20 ++++++++ > > 2 files changed, 89 insertions(+), 18 deletions(-) > > > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c > > index 347790b3a633..453767f91431 100644 > > --- a/drivers/block/ublk_drv.c > > +++ b/drivers/block/ublk_drv.c > > @@ -64,7 +64,8 @@ > > | UBLK_F_CMD_IOCTL_ENCODE \ > > | UBLK_F_USER_COPY \ > > | UBLK_F_ZONED \ > > - | UBLK_F_USER_RECOVERY_FAIL_IO) > > + | UBLK_F_USER_RECOVERY_FAIL_IO \ > > + | UBLK_F_AUTO_ZERO_COPY) > > > > #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ > > | UBLK_F_USER_RECOVERY_REISSUE \ > > @@ -131,6 +132,14 @@ struct ublk_uring_cmd_pdu { > > */ > > #define UBLK_IO_FLAG_NEED_GET_DATA 0x08 > > > > +/* > > + * request buffer is registered automatically, so we have to unregister it > > + * before completing this request. > > + * > > + * io_uring will unregister buffer automatically for us during exiting. > > + */ > > +#define UBLK_IO_FLAG_UNREG_BUF 0x10 > > + > > /* atomic RW with ubq->cancel_lock */ > > #define UBLK_IO_FLAG_CANCELED 0x80000000 > > > > @@ -207,7 +216,8 @@ static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, > > int tag); > > static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) > > { > > - return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); > > + return ub->dev_info.flags & (UBLK_F_USER_COPY | > > + UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_ZERO_COPY); > > } > > > > static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) > > @@ -614,6 +624,11 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) > > return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; > > } > > > > +static inline bool ublk_support_auto_zc(const struct ublk_queue *ubq) > > +{ > > + return ubq->flags & UBLK_F_AUTO_ZERO_COPY; > > +} > > + > > static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) > > { > > return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); > > @@ -621,7 +636,7 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) > > > > static inline bool ublk_need_map_io(const struct ublk_queue *ubq) > > { > > - return !ublk_support_user_copy(ubq); > > + return !ublk_support_user_copy(ubq) && !ublk_support_auto_zc(ubq); > > } > > > > static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) > > @@ -629,17 +644,21 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) > > /* > > * read()/write() is involved in user copy, so request reference > > * has to be grabbed > > + * > > + * For auto zc, ublk server still may issue UBLK_IO_COMMIT_AND_FETCH_REQ > > + * before one registered buffer is used up, so reference is > > + * required too. > > */ > > - return ublk_support_user_copy(ubq); > > + return ublk_support_user_copy(ubq) || ublk_support_auto_zc(ubq); > > Since both places where ublk_support_user_copy() is used are being > adjusted to also check ublk_support_auto_zc(), maybe > ublk_support_user_copy() should just be changed to check > UBLK_F_AUTO_ZERO_COPY too? I think that isn't good, we should have let each flag to cover its feature only. That said it was not good to add zero copy check in both ublk_support_user_copy() and ublk_dev_is_user_copy(). If ublk server needs user copy, it should have enabled it explicitly. > > > } > > > > static inline void ublk_init_req_ref(const struct ublk_queue *ubq, > > - struct request *req) > > + struct request *req, int init_ref) > > { > > if (ublk_need_req_ref(ubq)) { > > struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); > > > > - kref_init(&data->ref); > > + refcount_set(&data->ref.refcount, init_ref); > > It might be nicer not to mix kref and raw reference count operations. > Maybe you could add a prep patch that switches from struct kref to > refcount_t? That is fine, or probably kref need to provide one variant of __kref_init(nr). > > > } > > } > > > > @@ -667,6 +686,15 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq, > > } > > } > > > > +/* for ublk zero copy */ > > +static void ublk_io_release(void *priv) > > +{ > > + struct request *rq = priv; > > + struct ublk_queue *ubq = rq->mq_hctx->driver_data; > > + > > + ublk_put_req_ref(ubq, rq); > > +} > > + > > static inline bool ublk_need_get_data(const struct ublk_queue *ubq) > > { > > return ubq->flags & UBLK_F_NEED_GET_DATA; > > @@ -1231,7 +1259,22 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, > > mapped_bytes >> 9; > > } > > > > - ublk_init_req_ref(ubq, req); > > + if (ublk_support_auto_zc(ubq) && ublk_rq_has_data(req)) { > > + int ret; > > + > > + /* one extra reference is dropped by ublk_io_release */ > > + ublk_init_req_ref(ubq, req, 2); > > + ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release, > > + tag, issue_flags); > > Using the ublk request tag as the registered buffer index may be too > limiting. It would cause collisions if there are multiple ublk devices > or queues handled on a single io_uring. It also requires offsetting > any registered user buffers so their indices come after all the ublk > ones, which may be difficult if ublk devices are added dynamically. > Perhaps the UBLK_IO_FETCH_REQ operation could specify the registered > buffer index to use for the request? > > This also requires the io_uring issuing the zero-copy I/Os to be the > same as the one submitting the ublk fetch requests. That would also > make it difficult for us to adopt for our ublk server, which uses > separate io_urings. Not sure if there is a simple way the ublk server > could specify what io_uring to register the ublk zero-copy buffers > with. I think it can be done by passing `io_uring fd` and buffer 'index' via uring_cmd_header, there is still one u64 (->addr) left for zero copy, then the buffer's `io_uring fd/fixed_fd` and 'index' can be provided to io_uring core for registering buffer, this way should be flexible enough for covering any case. > > > + if (ret) { > > + blk_mq_end_request(req, BLK_STS_IOERR); > > + return; > > Does this leak the ublk fetch request? Seems like it should be It won't leak ublk uring_cmd which is just for delivering ublk io command to ublk server. > completed to the ublk server with an error code. It could be hard for ublk server to deal with, I'd suggest to start with one simple implementation first. It is usually one bug, and user will get notified from client's failure, then complain and ublk server gets fixed. > > > + } > > + io->flags |= UBLK_IO_FLAG_UNREG_BUF; > > + } else { > > + ublk_init_req_ref(ubq, req, 1); > > + } > > + > > ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags); > > } > > > > @@ -1593,7 +1636,8 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) > > } > > > > static void ublk_commit_completion(struct ublk_device *ub, > > - const struct ublksrv_io_cmd *ub_cmd) > > + const struct ublksrv_io_cmd *ub_cmd, > > + unsigned int issue_flags) > > { > > u32 qid = ub_cmd->q_id, tag = ub_cmd->tag; > > struct ublk_queue *ubq = ublk_get_queue(ub, qid); > > @@ -1604,6 +1648,15 @@ static void ublk_commit_completion(struct ublk_device *ub, > > io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; > > io->res = ub_cmd->result; > > > > + if (io->flags & UBLK_IO_FLAG_UNREG_BUF) { > > + int ret; > > + > > + ret = io_buffer_unregister_bvec(io->cmd, tag, issue_flags); > > + if (ret) > > + io->res = ret; > > I think it would be confusing to report an error to the application > submitting the I/O if unregistration fails. The only scenario where it > seems possible for this to fail is if userspace issues an > IORING_REGISTER_BUFFERS_UPDATE that overwrites the registered buffer > slot while the ublk request is being handled by the ublk server. I > would either ignore any error from io_buffer_unregister_bvec() or > return it to the ublk server. Fair enough, maybe WARN_ON_ONCE() is enough. > > > + io->flags &= ~UBLK_IO_FLAG_UNREG_BUF; > > + } > > + > > /* find the io request and complete */ > > req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); > > if (WARN_ON_ONCE(unlikely(!req))) > > @@ -1942,14 +1995,6 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, > > io_uring_cmd_mark_cancelable(cmd, issue_flags); > > } > > > > -static void ublk_io_release(void *priv) > > -{ > > - struct request *rq = priv; > > - struct ublk_queue *ubq = rq->mq_hctx->driver_data; > > - > > - ublk_put_req_ref(ubq, rq); > > -} > > - > > static int ublk_register_io_buf(struct io_uring_cmd *cmd, > > struct ublk_queue *ubq, unsigned int tag, > > unsigned int index, unsigned int issue_flags) > > @@ -2124,7 +2169,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, > > } > > > > ublk_fill_io_cmd(io, cmd, ub_cmd->addr); > > - ublk_commit_completion(ub, ub_cmd); > > + ublk_commit_completion(ub, ub_cmd, issue_flags); > > break; > > case UBLK_IO_NEED_GET_DATA: > > if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) > > @@ -2730,6 +2775,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) > > return -EINVAL; > > } > > > > + /* F_AUTO_ZERO_COPY and F_SUPPORT_ZERO_COPY can't co-exist */ > > + if ((info.flags & UBLK_F_AUTO_ZERO_COPY) && > > + (info.flags & UBLK_F_SUPPORT_ZERO_COPY)) > > + return -EINVAL; > > + > > /* > > * unprivileged device can't be trusted, but RECOVERY and > > * RECOVERY_REISSUE still may hang error handling, so can't > > @@ -2747,7 +2797,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) > > * buffer by pwrite() to ublk char device, which can't be > > * used for unprivileged device > > */ > > - if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)) > > + if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | > > + UBLK_F_AUTO_ZERO_COPY)) > > return -EINVAL; > > } > > > > diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h > > index 583b86681c93..d8bb5e55cce7 100644 > > --- a/include/uapi/linux/ublk_cmd.h > > +++ b/include/uapi/linux/ublk_cmd.h > > @@ -211,6 +211,26 @@ > > */ > > #define UBLK_F_USER_RECOVERY_FAIL_IO (1ULL << 9) > > > > +/* > > + * request buffer is registered automatically before delivering this io > > + * command to ublk server, meantime it is un-registered automatically > > + * when completing this io command. > > + * > > + * request tag has to be used as the buffer index, and ublk server has to > > + * pass this IO's tag as buffer index for using the registered zero copy > > + * buffer > > + * > > + * This way avoids extra uring_cmd cost, but also simplifies backend > > + * implementation, such as, the dependency on IO_REGISTER_IO_BUF and > > + * IO_UNREGISTER_IO_BUF becomes not necessary. > > + * > > + * For using this feature, ublk server has to register buffer table > > + * in sparse way, and buffer number has to be >= ublk queue depth. > > + * > > + * This feature is preferred to UBLK_F_SUPPORT_ZERO_COPY. > > + */ > > +#define UBLK_F_AUTO_ZERO_COPY (1ULL << 10) > > This flag is already taken by UBLK_F_UPDATE_SIZE in commit "ublk: Add > UBLK_U_CMD_UPDATE_SIZE". You may need to rebase on for-6.16/block. Good catch, will fix it in V2. Thanks, Ming