On 7/18/25 05:58, Bart Van Assche wrote:
> Support pipelining of zoned writes if the block driver preserves the write
> order per hardware queue. Track per zone to which software queue writes
> have been queued. If zoned writes are pipelined, submit new writes to the
> same software queue as the writes that are already in progress. This
> prevents reordering by submitting requests for the same zone to different
> software or hardware queues.
> 
> Cc: Christoph Hellwig <hch@xxxxxx>
> Cc: Damien Le Moal <dlemoal@xxxxxxxxxx>
> Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx>
> ---
>  block/blk-mq.c    |  4 +--
>  block/blk-zoned.c | 66 ++++++++++++++++++++++++++++++++++++++---------
>  2 files changed, 56 insertions(+), 14 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index c1035a2bbda8..56384b4aadd9 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3145,8 +3145,8 @@ void blk_mq_submit_bio(struct bio *bio)
>  	/*
>  	 * A BIO that was released from a zone write plug has already been
>  	 * through the preparation in this function, already holds a reference
> -	 * on the queue usage counter, and is the only write BIO in-flight for
> -	 * the target zone. Go straight to preparing a request for it.
> +	 * on the queue usage counter. Go straight to preparing a request for
> +	 * it.
>  	 */
>  	if (bio_zone_write_plugging(bio)) {
>  		nr_segs = bio->__bi_nr_segments;
> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
> index 6ef53f78fa3b..3813e4bc8b0b 100644
> --- a/block/blk-zoned.c
> +++ b/block/blk-zoned.c
> @@ -53,6 +53,8 @@ static const char *const zone_cond_name[] = {
>   * @zone_no: The number of the zone the plug is managing.
>   * @wp_offset: The zone write pointer location relative to the start of the zone
>   *	as a number of 512B sectors.
> + * @from_cpu: Software queue to submit writes from for drivers that preserve
> + *	the write order.
>   * @bio_list: The list of BIOs that are currently plugged.
>   * @bio_work: Work struct to handle issuing of plugged BIOs
>   * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
> @@ -65,6 +67,7 @@ struct blk_zone_wplug {
>  	unsigned int		flags;
>  	unsigned int		zone_no;
>  	unsigned int		wp_offset;
> +	int			from_cpu;
>  	struct bio_list		bio_list;
>  	struct work_struct	bio_work;
>  	struct rcu_head		rcu_head;
> @@ -74,8 +77,7 @@ struct blk_zone_wplug {
>  /*
>   * Zone write plug flags bits:
>   *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
> - *    that is, that write BIOs are being throttled due to a write BIO already
> - *    being executed or the zone write plug bio list is not empty.
> + *    that is, that write BIOs are being throttled.
>   *  - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
>   *    write pointer offset and need to update it.
>   *  - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
> @@ -572,6 +574,7 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
>  	zwplug->flags = 0;
>  	zwplug->zone_no = zno;
>  	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
> +	zwplug->from_cpu = -1;
>  	bio_list_init(&zwplug->bio_list);
>  	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
>  	zwplug->disk = disk;
> @@ -768,14 +771,19 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
>  static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
>  					      struct blk_zone_wplug *zwplug)
>  {
> +	lockdep_assert_held(&zwplug->lock);

Unrelated change. Please move this to a prep patch.
> +
>  	/*
>  	 * Take a reference on the zone write plug and schedule the submission
>  	 * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
>  	 * reference we take here.
>  	 */
> -	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));

Why do you remove this warning?

>  	refcount_inc(&zwplug->ref);
> -	queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
> +	if (zwplug->from_cpu >= 0)
> +		queue_work_on(zwplug->from_cpu, disk->zone_wplugs_wq,
> +			      &zwplug->bio_work);
> +	else
> +		queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
>  }
>  
>  static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
> @@ -972,9 +980,12 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
>  	return true;
>  }
>  
> -static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
> +static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs,
> +					int from_cpu)
>  {
>  	struct gendisk *disk = bio->bi_bdev->bd_disk;
> +	const bool ordered_hwq = bio_op(bio) != REQ_OP_ZONE_APPEND &&
> +		disk->queue->limits.features & BLK_FEAT_ORDERED_HWQ;

This is not correct. If the BIO is a zone append and
blk_zone_wplug_handle_write() is called, it means that we need to handle the
BIO using zone append emulation, that is, the BIO will be a regular write. So
you must treat it as if it originally was a regular write.

>  	sector_t sector = bio->bi_iter.bi_sector;
>  	bool schedule_bio_work = false;
>  	struct blk_zone_wplug *zwplug;
> @@ -1034,15 +1045,38 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
>  	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
>  		goto add_to_bio_list;
>  
> +	if (ordered_hwq && zwplug->from_cpu < 0) {
> +		/* No zoned writes are in progress. Select the current CPU. */
> +		zwplug->from_cpu = raw_smp_processor_id();
> +	}
> +
> +	if (ordered_hwq && zwplug->from_cpu == from_cpu) {
> +		/*
> +		 * The block driver preserves the write order, zoned writes have
> +		 * not been plugged and the zoned write will be submitted from
> +		 * zwplug->from_cpu. Let the caller submit the bio.
> +		 */
> +	} else if (ordered_hwq) {
> +		/*
> +		 * The block driver preserves the write order but the caller
> +		 * allocated a request from another CPU. Submit the bio from
> +		 * zwplug->from_cpu.
> +		 */
> +		goto plug;
> +	} else {
> +		/*
> +		 * The block driver does not preserve the write order. Plug and
> +		 * let the caller submit the BIO.
> +		 */
> +		zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
> +	}

On the last round of comments, I already suggested a much nicer way of writing
this that does not repeat the if (ordered_hwq) and does not have an empty if
clause:

	if (ordered_hwq) {
		/*
		 * The block driver preserves the write order, zoned writes have
		 * not been plugged and the zoned write will be submitted from
		 * zwplug->from_cpu. Let the caller submit the bio.
		 */
		if (zwplug->from_cpu < 0) {
			/*
			 * No zoned writes are in progress: select the
			 * current CPU.
			 */
			zwplug->from_cpu = raw_smp_processor_id();
		} else if (zwplug->from_cpu != raw_smp_processor_id()) {
			/*
			 * The caller allocated a request from another CPU.
			 * Submit the bio from zwplug->from_cpu.
			 */
			goto plug;
		}
	} else {
		/*
		 * The block driver does not preserve the write order. Plug and
		 * let the caller submit the BIO.
		 */
		zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
	}

>  	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {

You moved setting the BLK_ZONE_WPLUG_PLUGGED flag above this point. So if this
fails, you need to clear this flag and also reset zwplug->from_cpu to -1.
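I.e. something along these lines (an untested sketch only; you may want to
reset from_cpu only when no other zoned writes are in flight for the zone):

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		/* Undo the state that was set just above before failing the BIO. */
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		zwplug->from_cpu = -1;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		bio_io_error(bio);
		return true;
	}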
>  		spin_unlock_irqrestore(&zwplug->lock, flags);
>  		bio_io_error(bio);
>  		return true;
>  	}
>  
> -	/* Otherwise, plug and submit the BIO. */
> -	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
> -
>  	spin_unlock_irqrestore(&zwplug->lock, flags);
>  
>  	return false;
> @@ -1150,7 +1184,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs, int rq_cpu)
>  		fallthrough;
>  	case REQ_OP_WRITE:
>  	case REQ_OP_WRITE_ZEROES:
> -		return blk_zone_wplug_handle_write(bio, nr_segs);
> +		return blk_zone_wplug_handle_write(bio, nr_segs, rq_cpu);
>  	case REQ_OP_ZONE_RESET:
>  		return blk_zone_wplug_handle_reset_or_finish(bio, 0);
>  	case REQ_OP_ZONE_FINISH:
> @@ -1182,6 +1216,9 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
>  
>  	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
>  
> +	if (refcount_read(&zwplug->ref) == 2)

This needs a comment explaining why you use the plug ref count instead of
unconditionally clearing from_cpu.

> +		zwplug->from_cpu = -1;
> +
>  	/*
>  	 * If the zone is full (it was fully written or finished, or empty
>  	 * (it was reset), remove its zone write plug from the hash table.
> @@ -1283,6 +1320,8 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
>  	struct blk_zone_wplug *zwplug =
>  		container_of(work, struct blk_zone_wplug, bio_work);
>  	struct block_device *bdev;
> +	bool ordered_hwq = zwplug->disk->queue->limits.features &
> +		BLK_FEAT_ORDERED_HWQ;

Splitting the line after the "=" would be nicer.

>  	struct bio *bio;
>  
>  	do {
> @@ -1323,7 +1362,7 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
>  		} else {
>  			blk_mq_submit_bio(bio);
>  		}
> -	} while (0);
> +	} while (ordered_hwq);
>  
>  put_zwplug:
>  	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
> @@ -1850,6 +1889,7 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
>  	unsigned int zwp_zone_no, zwp_ref;
>  	unsigned int zwp_bio_list_size;
>  	unsigned long flags;
> +	int from_cpu;
>  
>  	spin_lock_irqsave(&zwplug->lock, flags);
>  	zwp_zone_no = zwplug->zone_no;
> @@ -1857,10 +1897,12 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
>  	zwp_ref = refcount_read(&zwplug->ref);
>  	zwp_wp_offset = zwplug->wp_offset;
>  	zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
> +	from_cpu = zwplug->from_cpu;
>  	spin_unlock_irqrestore(&zwplug->lock, flags);
>  
> -	seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref,
> -		   zwp_wp_offset, zwp_bio_list_size);
> +	seq_printf(m, "zone_no %u flags 0x%x ref %u wp_offset %u bio_list_size %u from_cpu %d\n",
> +		   zwp_zone_no, zwp_flags, zwp_ref, zwp_wp_offset,
> +		   zwp_bio_list_size, from_cpu);
>  }
>  
>  int queue_zone_wplugs_show(void *data, struct seq_file *m)

-- 
Damien Le Moal
Western Digital Research