Move scheduler tags (sched_tags) allocation and deallocation outside
both the ->elevator_lock and ->freeze_lock when updating nr_hw_queues.
This change breaks the dependency chain from the percpu allocator lock
to the elevator lock, helping to prevent potential deadlocks, as
observed in the reported lockdep splat[1].

This commit introduces batch allocation and deallocation helpers for
sched_tags, which are now used from within __blk_mq_update_nr_hw_queues
routine while iterating through the tagset.

With this change, all sched_tags memory management is handled entirely
outside the ->elevator_lock and the ->freeze_lock context, thereby
eliminating the lock dependency that could otherwise manifest during
nr_hw_queues updates.

[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@xxxxxxxxxxxxx/

Reported-by: Stefan Haberland <sth@xxxxxxxxxxxxx>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@xxxxxxxxxxxxx/
Reviewed-by: Ming Lei <ming.lei@xxxxxxxxxx>
Reviewed-by: Christoph Hellwig <hch@xxxxxx>
Signed-off-by: Nilay Shroff <nilay@xxxxxxxxxxxxx>
---
 block/blk-mq-sched.c | 65 ++++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-sched.h |  4 +++
 block/blk-mq.c       | 16 +++++++++--
 block/blk.h          |  3 +-
 block/elevator.c     |  6 ++--
 5 files changed, 88 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 2d6d1ebdd8fb..e2ce4a28e6c9 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -427,6 +427,32 @@ void blk_mq_free_sched_tags(struct elevator_tags *et,
 	kfree(et);
 }
 
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+	struct elevator_tags *et;
+
+	lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		/*
+		 * Accessing q->elevator without holding q->elevator_lock is
+		 * safe because we're holding here set->update_nr_hwq_lock in
+		 * the writer context. So, scheduler update/switch code (which
+		 * acquires the same lock but in the reader context) can't run
+		 * concurrently.
+		 */
+		if (q->elevator) {
+			et = xa_load(et_table, q->id);
+			if (unlikely(!et))
+				WARN_ON_ONCE(1);
+			else
+				blk_mq_free_sched_tags(et, set);
+		}
+	}
+}
+
 struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
 		unsigned int nr_hw_queues)
 {
@@ -477,6 +503,45 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
 	return NULL;
 }
 
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
+{
+	struct request_queue *q;
+	struct elevator_tags *et;
+	gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+	lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		/*
+		 * Accessing q->elevator without holding q->elevator_lock is
+		 * safe because we're holding here set->update_nr_hwq_lock in
+		 * the writer context. So, scheduler update/switch code (which
+		 * acquires the same lock but in the reader context) can't run
+		 * concurrently.
+		 */
+		if (q->elevator) {
+			et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
+			if (!et)
+				goto out_unwind;
+			if (xa_insert(et_table, q->id, et, gfp))
+				goto out_free_tags;
+		}
+	}
+	return 0;
+out_free_tags:
+	blk_mq_free_sched_tags(et, set);
+out_unwind:
+	list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
+		if (q->elevator) {
+			et = xa_load(et_table, q->id);
+			if (et)
+				blk_mq_free_sched_tags(et, set);
+		}
+	}
+	return -ENOMEM;
+}
+
 /* caller must have a reference to @e, will grab another one if successful */
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
 		struct elevator_tags *et)
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 0cde00cd1c47..b554e1d55950 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -25,8 +25,12 @@ void blk_mq_sched_free_rqs(struct request_queue *q);
 
 struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
 		unsigned int nr_hw_queues);
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
 void blk_mq_free_sched_tags(struct elevator_tags *et,
 		struct blk_mq_tag_set *set);
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set);
 
 static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4806b867e37d..af7aae699ede 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4972,6 +4972,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	struct request_queue *q;
 	int prev_nr_hw_queues = set->nr_hw_queues;
 	unsigned int memflags;
+	struct xarray et_table;
+	struct elevator_tags *et;
 	int i;
 
 	lockdep_assert_held(&set->tag_list_lock);
@@ -4984,6 +4986,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 		return;
 
 	memflags = memalloc_noio_save();
+
+	xa_init(&et_table);
+	if (blk_mq_alloc_sched_tags_batch(&et_table, set, nr_hw_queues) < 0)
+		goto out_memalloc_restore;
+
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_debugfs_unregister_hctxs(q);
 		blk_mq_sysfs_unregister_hctxs(q);
@@ -4995,6 +5002,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
 		list_for_each_entry(q, &set->tag_list, tag_set_list)
 			blk_mq_unfreeze_queue_nomemrestore(q);
+		blk_mq_free_sched_tags_batch(&et_table, set);
 		goto reregister;
 	}
 
@@ -5018,8 +5026,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	}
 
 	/* elv_update_nr_hw_queues() unfreeze queue for us */
-	list_for_each_entry(q, &set->tag_list, tag_set_list)
-		elv_update_nr_hw_queues(q);
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		et = xa_load(&et_table, q->id);
+		elv_update_nr_hw_queues(q, et);
+	}
 
 reregister:
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
@@ -5029,7 +5039,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 		blk_mq_remove_hw_queues_cpuhp(q);
 		blk_mq_add_hw_queues_cpuhp(q);
 	}
+out_memalloc_restore:
 	memalloc_noio_restore(memflags);
+	xa_destroy(&et_table);
 
 	/* Free the excess tags when nr_hw_queues shrink. */
 	for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
diff --git a/block/blk.h b/block/blk.h
index 37ec459fe656..a312518fb8f3 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -12,6 +12,7 @@
 #include "blk-crypto-internal.h"
 
 struct elevator_type;
+struct elevator_tags;
 
 #define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9)
 #define BLK_MIN_SEGMENT_SIZE 4096
@@ -321,7 +322,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
 
 bool blk_insert_flush(struct request *rq);
 
-void elv_update_nr_hw_queues(struct request_queue *q);
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_tags *et);
 void elevator_set_default(struct request_queue *q);
 void elevator_set_none(struct request_queue *q);
 
diff --git a/block/elevator.c b/block/elevator.c
index 50f4b78efe66..92a0a4851583 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -705,7 +705,7 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
  * The I/O scheduler depends on the number of hardware queues, this forces a
  * reattachment when nr_hw_queues changes.
  */
-void elv_update_nr_hw_queues(struct request_queue *q)
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_tags *et)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
 	struct elv_change_ctx ctx = {};
@@ -720,11 +720,11 @@ void elv_update_nr_hw_queues(struct request_queue *q)
 	 * acquires same lock in the reader context) can't run concurrently.
 	 */
 	if (q->elevator) {
-		ctx.et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
-		if (!ctx.et) {
+		if (!et) {
 			WARN_ON_ONCE(1);
 			return;
 		}
+		ctx.et = et;
 	}
 
 	mutex_lock(&q->elevator_lock);
-- 
2.50.0
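For context on the pattern the two batch helpers follow, here is a minimal
stand-alone userspace sketch (plain C with pthreads, not kernel code): every
allocation that can block happens before the lock is taken, the critical
section only consumes what was pre-allocated, and a mid-way allocation
failure unwinds the entries already set up. All names in it
(prealloc_tags_batch, free_tags_batch, elevator_lock, struct tags) are
hypothetical stand-ins for the xarray-based helpers added by this patch.

/*
 * Illustrative userspace sketch of the pre-allocate-outside-the-lock /
 * consume-under-the-lock pattern. Not kernel code; all names hypothetical.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NQUEUES 4

struct tags { int nr_hw_queues; };

static pthread_mutex_t elevator_lock = PTHREAD_MUTEX_INITIALIZER;

/* Allocate one tags object per queue before taking any lock. */
static int prealloc_tags_batch(struct tags **table, int nr_hw_queues)
{
	int i;

	for (i = 0; i < NQUEUES; i++) {
		table[i] = malloc(sizeof(*table[i]));
		if (!table[i])
			goto unwind;
		table[i]->nr_hw_queues = nr_hw_queues;
	}
	return 0;
unwind:
	/* Free whatever was already allocated, in reverse order. */
	while (--i >= 0) {
		free(table[i]);
		table[i] = NULL;
	}
	return -1;
}

/* Free any entries still present in the table, again with no lock held. */
static void free_tags_batch(struct tags **table)
{
	int i;

	for (i = 0; i < NQUEUES; i++) {
		free(table[i]);
		table[i] = NULL;
	}
}

int main(void)
{
	struct tags *table[NQUEUES] = { NULL };
	int i;

	/* Step 1: allocate everything with no locks held. */
	if (prealloc_tags_batch(table, 8))
		return 1;

	/* Step 2: the critical section only consumes pre-allocated memory. */
	pthread_mutex_lock(&elevator_lock);
	for (i = 0; i < NQUEUES; i++)
		printf("queue %d switched to %d hw queues\n",
		       i, table[i]->nr_hw_queues);
	pthread_mutex_unlock(&elevator_lock);

	/* Step 3: tear-down also happens outside the lock. */
	free_tags_batch(table);
	return 0;
}

The same ordering is what lets __blk_mq_update_nr_hw_queues() keep sched_tags
memory management outside both ->elevator_lock and ->freeze_lock: by the time
those locks are taken, no allocation remains to be done.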