On Mon, Apr 14, 2025 at 07:25:47PM +0800, Ming Lei wrote: > From: Uday Shankar <ushankar@xxxxxxxxxxxxxxx> > > There are currently two ways in which ublk server exit is detected by > ublk_drv: > > 1. uring_cmd cancellation. If there are any outstanding uring_cmds which > have not been completed to the ublk server when it exits, io_uring > calls the uring_cmd callback with a special cancellation flag as the > issuing task is exiting. > 2. I/O timeout. This is needed in addition to the above to handle the > "saturated queue" case, when all I/Os for a given queue are in the > ublk server, and therefore there are no outstanding uring_cmds to > cancel when the ublk server exits. > > There are a couple of issues with this approach: > > - It is complex and inelegant to have two methods to detect the same > condition > - The second method detects ublk server exit only after a long delay > (~30s, the default timeout assigned by the block layer). This delays > the nosrv behavior from kicking in and potential subsequent recovery > of the device. > > The second issue is brought to light with the new test_generic_06 which > will be added in following patch. It fails before this fix: > > selftests: ublk: test_generic_06.sh > dev id is 0 > dd: error writing '/dev/ublkb0': Input/output error > 1+0 records in > 0+0 records out > 0 bytes copied, 30.0611 s, 0.0 kB/s > DEAD > dd took 31 seconds to exit (>= 5s tolerance)! > generic_06 : [FAIL] > > Fix this by instead detecting and handling ublk server exit in the > character file release callback. This has several advantages: > > - This one place can handle both saturated and unsaturated queues. Thus, > it replaces both preexisting methods of detecting ublk server exit. > - It runs quickly on ublk server exit - there is no 30s delay. > - It starts the process of removing task references in ublk_drv. This is > needed if we want to relax restrictions in the driver like letting > only one thread serve each queue > > There is also the disadvantage that the character file release callback > can also be triggered by intentional close of the file, which is a > significant behavior change. Preexisting ublk servers (libublksrv) are > dependent on the ability to open/close the file multiple times. To > address this, only transition to a nosrv state if the file is released > while the ublk device is live. This allows for programs to open/close > the file multiple times during setup. It is still a behavior change if a > ublk server decides to close/reopen the file while the device is LIVE > (i.e. while it is responsible for serving I/O), but that would be highly > unusual. This behavior is in line with what is done by FUSE, which is > very similar to ublk in that a userspace daemon is providing services > traditionally provided by the kernel. > > With this change in, the new test (and all other selftests, and all > ublksrv tests) pass: > > selftests: ublk: test_generic_06.sh > dev id is 0 > dd: error writing '/dev/ublkb0': Input/output error > 1+0 records in > 0+0 records out > 0 bytes copied, 0.0376731 s, 0.0 kB/s > DEAD > generic_04 : [PASS] > > Signed-off-by: Uday Shankar <ushankar@xxxxxxxxxxxxxxx> > Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx> > --- > drivers/block/ublk_drv.c | 231 ++++++++++++++++++++++----------------- > 1 file changed, 131 insertions(+), 100 deletions(-) > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c > index b68bd4172fa8..e02f205f6da4 100644 > --- a/drivers/block/ublk_drv.c > +++ b/drivers/block/ublk_drv.c > @@ -199,8 +199,6 @@ struct ublk_device { > struct completion completion; > unsigned int nr_queues_ready; > unsigned int nr_privileged_daemon; > - > - struct work_struct nosrv_work; > }; > > /* header of ublk_params */ > @@ -209,8 +207,9 @@ struct ublk_params_header { > __u32 types; > }; > > -static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq); > - > +static void ublk_stop_dev_unlocked(struct ublk_device *ub); > +static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); > +static void __ublk_quiesce_dev(struct ublk_device *ub); > static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, > struct ublk_queue *ubq, int tag, size_t offset); > static inline unsigned int ublk_req_build_flags(struct request *req); > @@ -1336,8 +1335,6 @@ static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) > static enum blk_eh_timer_return ublk_timeout(struct request *rq) > { > struct ublk_queue *ubq = rq->mq_hctx->driver_data; > - unsigned int nr_inflight = 0; > - int i; > > if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) { > if (!ubq->timeout) { > @@ -1348,26 +1345,6 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq) > return BLK_EH_DONE; > } > > - if (!ubq_daemon_is_dying(ubq)) > - return BLK_EH_RESET_TIMER; > - > - for (i = 0; i < ubq->q_depth; i++) { > - struct ublk_io *io = &ubq->ios[i]; > - > - if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) > - nr_inflight++; > - } > - > - /* cancelable uring_cmd can't help us if all commands are in-flight */ > - if (nr_inflight == ubq->q_depth) { > - struct ublk_device *ub = ubq->dev; > - > - if (ublk_abort_requests(ub, ubq)) { > - schedule_work(&ub->nosrv_work); > - } > - return BLK_EH_DONE; > - } > - > return BLK_EH_RESET_TIMER; > } > > @@ -1525,13 +1502,112 @@ static void ublk_reset_ch_dev(struct ublk_device *ub) > ub->nr_privileged_daemon = 0; > } > > +static struct gendisk *ublk_get_disk(struct ublk_device *ub) > +{ > + struct gendisk *disk; > + > + spin_lock(&ub->lock); > + disk = ub->ub_disk; > + if (disk) > + get_device(disk_to_dev(disk)); > + spin_unlock(&ub->lock); > + > + return disk; > +} > + > +static void ublk_put_disk(struct gendisk *disk) > +{ > + if (disk) > + put_device(disk_to_dev(disk)); > +} > + > static int ublk_ch_release(struct inode *inode, struct file *filp) > { > struct ublk_device *ub = filp->private_data; > + struct gendisk *disk; > + int i; > + > + /* > + * disk isn't attached yet, either device isn't live, or it has > + * been removed already, so we needn't to do anything > + */ > + disk = ublk_get_disk(ub); > + if (!disk) > + goto out; > + > + /* > + * All uring_cmd are done now, so abort any request outstanding to > + * the ublk server > + * > + * This can be done in lockless way because ublk server has been > + * gone > + * > + * More importantly, we have to provide forward progress guarantee > + * without holding ub->mutex, otherwise control task grabbing > + * ub->mutex triggers deadlock > + * > + * All requests may be inflight, so ->canceling may not be set, set > + * it now. > + */ > + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { > + struct ublk_queue *ubq = ublk_get_queue(ub, i); > + > + ubq->canceling = true; > + ublk_abort_queue(ub, ubq); > + } > + blk_mq_kick_requeue_list(disk->queue); > + > + /* > + * All infligh requests have been completed or requeued and any new > + * request will be failed or requeued via `->canceling` now, so it is > + * fine to grab ub->mutex now. > + */ > + mutex_lock(&ub->mutex); > + > + /* double check after grabbing lock */ > + if (!ub->ub_disk) > + goto unlock; > + > + /* > + * Transition the device to the nosrv state. What exactly this > + * means depends on the recovery flags > + */ > + blk_mq_quiesce_queue(disk->queue); > + if (ublk_nosrv_should_stop_dev(ub)) { > + /* > + * Allow any pending/future I/O to pass through quickly > + * with an error. This is needed because del_gendisk > + * waits for all pending I/O to complete > + */ > + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) > + ublk_get_queue(ub, i)->force_abort = true; > + blk_mq_unquiesce_queue(disk->queue); > + > + ublk_stop_dev_unlocked(ub); > + } else { > + if (ublk_nosrv_dev_should_queue_io(ub)) { > + /* > + * keep request queue as quiesced for queuing new IO > + * during QUIESCED state > + * > + * request queue will be unquiesced in ending > + * recovery, see ublk_ctrl_end_recovery > + */ Does this comment need an update after [PATCH 4/9] ublk: rely on ->canceling for dealing with ublk_nosrv_dev_should_queue_io If I read that one right, actually the request queue is not quiesced anymore during the UBLK_S_DEV_QUIESCED state > + __ublk_quiesce_dev(ub); > + } else { > + ub->dev_info.state = UBLK_S_DEV_FAIL_IO; > + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) > + ublk_get_queue(ub, i)->fail_io = true; > + } > + blk_mq_unquiesce_queue(disk->queue); > + } > +unlock: > + mutex_unlock(&ub->mutex); > + ublk_put_disk(disk); > > /* all uring_cmd has been done now, reset device & ubq */ > ublk_reset_ch_dev(ub); > - > +out: > clear_bit(UB_STATE_OPEN, &ub->state); > return 0; > } > @@ -1627,37 +1703,22 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) > } > > /* Must be called when queue is frozen */ > -static bool ublk_mark_queue_canceling(struct ublk_queue *ubq) > +static void ublk_mark_queue_canceling(struct ublk_queue *ubq) > { > - bool canceled; > - > spin_lock(&ubq->cancel_lock); > - canceled = ubq->canceling; > - if (!canceled) > + if (!ubq->canceling) > ubq->canceling = true; > spin_unlock(&ubq->cancel_lock); > - > - return canceled; > } > > -static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) > +static void ublk_start_cancel(struct ublk_queue *ubq) > { > - bool was_canceled = ubq->canceling; > - struct gendisk *disk; > - > - if (was_canceled) > - return false; > - > - spin_lock(&ub->lock); > - disk = ub->ub_disk; > - if (disk) > - get_device(disk_to_dev(disk)); > - spin_unlock(&ub->lock); > + struct ublk_device *ub = ubq->dev; > + struct gendisk *disk = ublk_get_disk(ub); > > /* Our disk has been dead */ > if (!disk) > - return false; > - > + return; > /* > * Now we are serialized with ublk_queue_rq() > * > @@ -1666,15 +1727,9 @@ static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) > * touch completed uring_cmd > */ > blk_mq_quiesce_queue(disk->queue); > - was_canceled = ublk_mark_queue_canceling(ubq); > - if (!was_canceled) { > - /* abort queue is for making forward progress */ > - ublk_abort_queue(ub, ubq); > - } > + ublk_mark_queue_canceling(ubq); > blk_mq_unquiesce_queue(disk->queue); > - put_device(disk_to_dev(disk)); > - > - return !was_canceled; > + ublk_put_disk(disk); > } > > static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io, > @@ -1698,6 +1753,17 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io, > /* > * The ublk char device won't be closed when calling cancel fn, so both > * ublk device and queue are guaranteed to be live > + * > + * Two-stage cancel: > + * > + * - make every active uring_cmd done in ->cancel_fn() > + * > + * - aborting inflight ublk IO requests in ublk char device release handler, > + * which depends on 1st stage because device can only be closed iff all > + * uring_cmd are done > + * > + * Do _not_ try to acquire ub->mutex before all inflight requests are > + * aborted, otherwise deadlock may be caused. > */ > static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, > unsigned int issue_flags) > @@ -1705,8 +1771,6 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, > struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); > struct ublk_queue *ubq = pdu->ubq; > struct task_struct *task; > - struct ublk_device *ub; > - bool need_schedule; > struct ublk_io *io; > > if (WARN_ON_ONCE(!ubq)) > @@ -1719,16 +1783,12 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, > if (WARN_ON_ONCE(task && task != ubq->ubq_daemon)) > return; > > - ub = ubq->dev; > - need_schedule = ublk_abort_requests(ub, ubq); > + if (!ubq->canceling) > + ublk_start_cancel(ubq); > > io = &ubq->ios[pdu->tag]; > WARN_ON_ONCE(io->cmd != cmd); > ublk_cancel_cmd(ubq, io, issue_flags); > - > - if (need_schedule) { > - schedule_work(&ub->nosrv_work); > - } > } > > static inline bool ublk_queue_ready(struct ublk_queue *ubq) > @@ -1779,6 +1839,7 @@ static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub) > } > } > > +/* Now all inflight requests have been aborted */ > static void __ublk_quiesce_dev(struct ublk_device *ub) > { > int i; > @@ -1787,13 +1848,11 @@ static void __ublk_quiesce_dev(struct ublk_device *ub) > __func__, ub->dev_info.dev_id, > ub->dev_info.state == UBLK_S_DEV_LIVE ? > "LIVE" : "QUIESCED"); > - blk_mq_quiesce_queue(ub->ub_disk->queue); > /* mark every queue as canceling */ > for (i = 0; i < ub->dev_info.nr_hw_queues; i++) > ublk_get_queue(ub, i)->canceling = true; > ublk_wait_tagset_rqs_idle(ub); > ub->dev_info.state = UBLK_S_DEV_QUIESCED; > - blk_mq_unquiesce_queue(ub->ub_disk->queue); > } > > static void ublk_force_abort_dev(struct ublk_device *ub) > @@ -1830,50 +1889,25 @@ static struct gendisk *ublk_detach_disk(struct ublk_device *ub) > return disk; > } > > -static void ublk_stop_dev(struct ublk_device *ub) > +static void ublk_stop_dev_unlocked(struct ublk_device *ub) > + __must_hold(&ub->mutex) > { > struct gendisk *disk; > > - mutex_lock(&ub->mutex); > if (!ub->ub_disk) > - goto unlock; > + return; > + > if (ublk_nosrv_dev_should_queue_io(ub)) > ublk_force_abort_dev(ub); > del_gendisk(ub->ub_disk); > disk = ublk_detach_disk(ub); > put_disk(disk); > - unlock: > - mutex_unlock(&ub->mutex); > - ublk_cancel_dev(ub); > } > > -static void ublk_nosrv_work(struct work_struct *work) > +static void ublk_stop_dev(struct ublk_device *ub) > { > - struct ublk_device *ub = > - container_of(work, struct ublk_device, nosrv_work); > - int i; > - > - if (ublk_nosrv_should_stop_dev(ub)) { > - ublk_stop_dev(ub); > - return; > - } > - > mutex_lock(&ub->mutex); > - if (ub->dev_info.state != UBLK_S_DEV_LIVE) > - goto unlock; > - > - if (ublk_nosrv_dev_should_queue_io(ub)) { > - __ublk_quiesce_dev(ub); > - } else { > - blk_mq_quiesce_queue(ub->ub_disk->queue); > - ub->dev_info.state = UBLK_S_DEV_FAIL_IO; > - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { > - ublk_get_queue(ub, i)->fail_io = true; > - } > - blk_mq_unquiesce_queue(ub->ub_disk->queue); > - } > - > - unlock: > + ublk_stop_dev_unlocked(ub); > mutex_unlock(&ub->mutex); > ublk_cancel_dev(ub); > } > @@ -2491,7 +2525,6 @@ static void ublk_remove(struct ublk_device *ub) > bool unprivileged; > > ublk_stop_dev(ub); > - cancel_work_sync(&ub->nosrv_work); > cdev_device_del(&ub->cdev, &ub->cdev_dev); > unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; > ublk_put_device(ub); > @@ -2776,7 +2809,6 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) > goto out_unlock; > mutex_init(&ub->mutex); > spin_lock_init(&ub->lock); > - INIT_WORK(&ub->nosrv_work, ublk_nosrv_work); > > ret = ublk_alloc_dev_number(ub, header->dev_id); > if (ret < 0) > @@ -2908,7 +2940,6 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) > static int ublk_ctrl_stop_dev(struct ublk_device *ub) > { > ublk_stop_dev(ub); > - cancel_work_sync(&ub->nosrv_work); > return 0; > } > > -- > 2.47.0 >