On 4/22/25 2:59 PM, Ming Lei wrote:
> On Tue, Apr 22, 2025 at 09:04:32AM +0200, Christoph Hellwig wrote:
>> On Mon, Apr 21, 2025 at 10:39:24AM +0800, Ming Lei wrote:
>>>> In my view, RCU is optimized for read-heavy workloads with:
>>>> - Non-blocking readers
>>>
>>> srcu allows blocking reader
>>
>> It does.  But it's certainly not optimized for long blocking readers.
>>
>>> Basically I agree with you that a rwsem (instead of a rwlock) should
>>> match this case in theory, but I feel that a rwsem is stronger than
>>> srcu from a locking viewpoint, and we will add a new dependency if the
>>> rwsem is held inside ->store(), such as the following splat.
>>
>> How does manually implementing a reader/writer lock using SRCU avoid
>> that dependency vs just hiding it?
>>
>> I'd rather sort this out as a rwsem is very natural here, as Nilay
>> pointed out, and it also avoids the whole give-up-and-retry pattern.
>
> Thinking about it further, the warning triggered from the rwsem is a
> false positive, because an elevator switch can't happen until the disk
> is added, and the splat can be avoided by switching to
> down_read_nested() in elevator_change(), which needs to be nested too.
>
> So it looks fine to use rwsem.
>

Instead of using down_read_nested(), why can't we move the reader-lock
inside elv_iosched_store()?

With your patchset, elevator_change() is now called from three places:

- elevator_set_default
- elevator_set_none
- elv_iosched_store

As both elevator_set_default and elevator_set_none are expected to run
with the reader-lock already held by their callers, I think we should
remove the reader-lock from elevator_change() and move it inside
elv_iosched_store().

In fact, I quickly prototyped the rwsem replacing the srcu on top of your
changes and ran blktests. The rwsem changes pass blktests and I didn't
encounter any lockdep splat.

For reference, my (quick) prototype changes are attached below.

Thanks,
--Nilay
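For comparison, the down_read_nested() variant Ming mentioned would keep
the reader inside elevator_change() and only needs a lockdep annotation.
A rough, untested sketch (the SINGLE_DEPTH_NESTING subclass is only
illustrative, and the existing freeze/switch body of the function is
abbreviated behind the __elevator_change() call here):

static int elevator_change(struct request_queue *q,
		struct elv_change_ctx *ctx)
{
	struct blk_mq_tag_set *set = q->tag_set;
	int ret;

	/*
	 * Annotate the reader with its own lockdep subclass: the splat is
	 * a false positive because an elevator switch through sysfs can't
	 * happen before the disk is added.
	 */
	down_read_nested(&set->update_nr_hwq_lock, SINGLE_DEPTH_NESTING);

	/* existing freeze + switch + elevator_change_done() body, abbreviated */
	ret = __elevator_change(q, ctx);

	up_read(&set->update_nr_hwq_lock);
	return ret;
}

That would silence the false positive, but moving the reader up into
elv_iosched_store(), as in the prototype below, avoids the nesting
annotation entirely.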
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5c9f7391af92..bbe3dd40f1e8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4773,18 +4773,14 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 			goto out_free_srcu;
 	}
 
-	mutex_init(&set->update_nr_hwq_lock);
-	init_waitqueue_head(&set->update_nr_hwq_wq);
-	ret = init_srcu_struct(&set->update_nr_hwq_srcu);
-	if (ret)
-		goto out_cleanup_srcu;
+	init_rwsem(&set->update_nr_hwq_lock);
 
 	ret = -ENOMEM;
 	set->tags = kcalloc_node(set->nr_hw_queues,
 				 sizeof(struct blk_mq_tags *),
 				 GFP_KERNEL, set->numa_node);
 	if (!set->tags)
-		goto out_cleanup_hwq_srcu;
+		goto out_cleanup_srcu;
 
 	for (i = 0; i < set->nr_maps; i++) {
 		set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
@@ -4813,8 +4809,6 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	}
 	kfree(set->tags);
 	set->tags = NULL;
-out_cleanup_hwq_srcu:
-	cleanup_srcu_struct(&set->update_nr_hwq_srcu);
 out_cleanup_srcu:
 	if (set->flags & BLK_MQ_F_BLOCKING)
 		cleanup_srcu_struct(set->srcu);
@@ -5007,21 +5001,16 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 {
-	mutex_lock(&set->update_nr_hwq_lock);
+	down_write(&set->update_nr_hwq_lock);
 	/*
 	 * Mark us in updating nr_hw_queues for preventing reader of
 	 * nr_hw_queues, such as adding/deleting disk.
 	 */
-	set->updating_nr_hwq = true;
-	synchronize_srcu(&set->update_nr_hwq_srcu);
-
 	mutex_lock(&set->tag_list_lock);
 	__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
 	mutex_unlock(&set->tag_list_lock);
 
-	set->updating_nr_hwq = false;
-	wake_up_all(&set->update_nr_hwq_wq);
-	mutex_unlock(&set->update_nr_hwq_lock);
+	up_write(&set->update_nr_hwq_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
diff --git a/block/elevator.c b/block/elevator.c
index 378553fce5d8..69193d04fffa 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -743,15 +743,8 @@ int __elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
 static int elevator_change(struct request_queue *q,
 		struct elv_change_ctx *ctx)
 {
-	struct blk_mq_tag_set *set = q->tag_set;
 	unsigned int memflags;
-	int ret, idx;
-
-	idx = srcu_read_lock(&set->update_nr_hwq_srcu);
-	if (set->updating_nr_hwq) {
-		ret = -EBUSY;
-		goto exit;
-	}
+	int ret;
 
 	memflags = blk_mq_freeze_queue(q);
 	/*
@@ -771,8 +764,6 @@ static int elevator_change(struct request_queue *q,
 
 	if (!ret)
 		ret = elevator_change_done(q, ctx);
-exit:
-	srcu_read_unlock(&set->update_nr_hwq_srcu, idx);
 	return ret;
 }
 
@@ -792,6 +783,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
 			  size_t count)
 {
 	struct request_queue *q = disk->queue;
+	struct blk_mq_tag_set *set = q->tag_set;
 	char elevator_name[ELV_NAME_MAX];
 	struct elv_change_ctx ctx = {
 		.uevent = true,
@@ -808,9 +800,12 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
 
 	elv_iosched_load_module(ctx.name);
 
+	down_read(&set->update_nr_hwq_lock);
 	ret = elevator_change(q, &ctx);
 	if (!ret)
 		ret = count;
+	up_read(&set->update_nr_hwq_lock);
+
 	return ret;
 }
 
diff --git a/block/genhd.c b/block/genhd.c
index de227aa923ed..359a55bf47ce 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -407,19 +407,12 @@ static int retry_on_updating_nr_hwq(struct gendisk_data *data,
 	set = disk->queue->tag_set;
 
 	do {
-		int idx, ret;
+		int ret;
 
-		idx = srcu_read_lock(&set->update_nr_hwq_srcu);
-		if (set->updating_nr_hwq) {
-			srcu_read_unlock(&set->update_nr_hwq_srcu, idx);
-			goto wait;
-		}
+		down_read(&set->update_nr_hwq_lock);
 		ret = cb(data);
-		srcu_read_unlock(&set->update_nr_hwq_srcu, idx);
+		up_read(&set->update_nr_hwq_lock);
 		return ret;
-wait:
-		wait_event_interruptible(set->update_nr_hwq_wq,
-					 !set->updating_nr_hwq);
 	} while (true);
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index afe76dcfaa3c..ef84d53095a6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -9,6 +9,7 @@
 #include <linux/prefetch.h>
 #include <linux/srcu.h>
 #include <linux/rw_hint.h>
+#include <linux/rwsem.h>
 
 struct blk_mq_tags;
 struct blk_flush_queue;
@@ -528,10 +529,7 @@ struct blk_mq_tag_set {
 	struct list_head	tag_list;
 	struct srcu_struct	*srcu;
 
-	bool			updating_nr_hwq;
-	struct mutex		update_nr_hwq_lock;
-	struct srcu_struct	update_nr_hwq_srcu;
-	wait_queue_head_t	update_nr_hwq_wq;
+	struct rw_semaphore	update_nr_hwq_lock;
 };
 
 /**
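One more possible cleanup on top of the above: with the rwsem there is
nothing left to retry, since down_read() simply blocks until
blk_mq_update_nr_hw_queues() drops the writer, so the do/while in
retry_on_updating_nr_hwq() could go away as well. Roughly (untested
sketch; the callback signature and the disk member of struct
gendisk_data are assumed from your patchset):

static int retry_on_updating_nr_hwq(struct gendisk_data *data,
				    int (*cb)(struct gendisk_data *data))
{
	/* assuming gendisk_data carries the gendisk, as in your patch */
	struct blk_mq_tag_set *set = data->disk->queue->tag_set;
	int ret;

	/* blocks while blk_mq_update_nr_hw_queues() holds the writer */
	down_read(&set->update_nr_hwq_lock);
	ret = cb(data);
	up_read(&set->update_nr_hwq_lock);

	return ret;
}

(And the "retry_on_" part of the name probably wants to go at that point,
since there is no retry any more.)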