Move scheduler tags (sched_tags) allocation and deallocation outside
both the ->elevator_lock and ->freeze_lock when updating nr_hw_queues.
This change breaks the dependency chain from the percpu allocator lock
to the elevator lock, helping to prevent potential deadlocks, as
observed in the reported lockdep splat[1].

This commit introduces batch allocation and deallocation helpers for
sched_tags, which are now used from within the
__blk_mq_update_nr_hw_queues() routine while iterating through the
tagset.

With this change, all sched_tags memory management is handled entirely
outside the ->elevator_lock and the ->freeze_lock context, thereby
eliminating the lock dependency that could otherwise manifest during
nr_hw_queues updates.

[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@xxxxxxxxxxxxx/

Reported-by: Stefan Haberland <sth@xxxxxxxxxxxxx>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@xxxxxxxxxxxxx/
Signed-off-by: Nilay Shroff <nilay@xxxxxxxxxxxxx>
---
 block/blk-mq-sched.c | 63 ++++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-sched.h |  4 +++
 block/blk-mq.c       | 11 +++++++-
 block/blk.h          |  2 +-
 block/elevator.c     |  4 +--
 5 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 7bc15b4cff89..dcc4036647b0 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -427,6 +427,30 @@ void blk_mq_free_sched_tags(struct elevator_tags *et,
 	kfree(et);
 }
 
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+	struct elevator_tags *et;
+
+	lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		/*
+		 * Accessing q->elevator without holding q->elevator_lock is
+		 * safe because we're holding here set->update_nr_hwq_lock in
+		 * the writer context. So, scheduler update/switch code (which
+		 * acquires the same lock but in the reader context) can't run
+		 * concurrently.
+		 */
+		if (q->elevator) {
+			et = xa_load(et_table, q->id);
+			if (et)
+				blk_mq_free_sched_tags(et, set);
+		}
+	}
+}
+
 struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
 		unsigned int nr_hw_queues)
 {
@@ -476,6 +500,45 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
 	return NULL;
 }
 
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
+{
+	struct request_queue *q;
+	struct elevator_tags *et;
+	gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+	lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		/*
+		 * Accessing q->elevator without holding q->elevator_lock is
+		 * safe because we're holding here set->update_nr_hwq_lock in
+		 * the writer context. So, scheduler update/switch code (which
+		 * acquires the same lock but in the reader context) can't run
+		 * concurrently.
+		 */
+		if (q->elevator) {
+			et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
+			if (!et)
+				goto out_unwind;
+			if (xa_insert(et_table, q->id, et, gfp))
+				goto out_free_tags;
+		}
+	}
+	return 0;
+out_free_tags:
+	blk_mq_free_sched_tags(et, set);
+out_unwind:
+	list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
+		if (q->elevator) {
+			et = xa_load(et_table, q->id);
+			if (et)
+				blk_mq_free_sched_tags(et, set);
+		}
+	}
+	return -ENOMEM;
+}
+
 /* caller must have a reference to @e, will grab another one if successful */
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
 		struct elevator_tags *et)
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 0cde00cd1c47..b554e1d55950 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -25,8 +25,12 @@ void blk_mq_sched_free_rqs(struct request_queue *q);
 
 struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
 		unsigned int nr_hw_queues);
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
 void blk_mq_free_sched_tags(struct elevator_tags *et,
 		struct blk_mq_tag_set *set);
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+		struct blk_mq_tag_set *set);
 
 static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4806b867e37d..a68b658ce07b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4972,6 +4972,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	struct request_queue *q;
 	int prev_nr_hw_queues = set->nr_hw_queues;
 	unsigned int memflags;
+	struct xarray et_table;
 	int i;
 
 	lockdep_assert_held(&set->tag_list_lock);
@@ -4984,6 +4985,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 		return;
 
 	memflags = memalloc_noio_save();
+
+	xa_init(&et_table);
+	if (blk_mq_alloc_sched_tags_batch(&et_table, set, nr_hw_queues) < 0)
+		goto out_memalloc_restore;
+
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_debugfs_unregister_hctxs(q);
 		blk_mq_sysfs_unregister_hctxs(q);
@@ -4995,6 +5001,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
 		list_for_each_entry(q, &set->tag_list, tag_set_list)
 			blk_mq_unfreeze_queue_nomemrestore(q);
+		blk_mq_free_sched_tags_batch(&et_table, set);
 		goto reregister;
 	}
 
@@ -5019,7 +5026,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 
 	/* elv_update_nr_hw_queues() unfreeze queue for us */
 	list_for_each_entry(q, &set->tag_list, tag_set_list)
-		elv_update_nr_hw_queues(q);
+		elv_update_nr_hw_queues(q, &et_table);
 
 reregister:
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
@@ -5029,7 +5036,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 		blk_mq_remove_hw_queues_cpuhp(q);
 		blk_mq_add_hw_queues_cpuhp(q);
 	}
+out_memalloc_restore:
 	memalloc_noio_restore(memflags);
+	xa_destroy(&et_table);
 
 	/* Free the excess tags when nr_hw_queues shrink. */
 	for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
diff --git a/block/blk.h b/block/blk.h
index 37ec459fe656..c6d1d1458388 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -321,7 +321,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
 
 bool blk_insert_flush(struct request *rq);
 
-void elv_update_nr_hw_queues(struct request_queue *q);
+void elv_update_nr_hw_queues(struct request_queue *q, struct xarray *et_table);
 void elevator_set_default(struct request_queue *q);
 void elevator_set_none(struct request_queue *q);
 
diff --git a/block/elevator.c b/block/elevator.c
index 50f4b78efe66..8ba8b869d5a4 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -705,7 +705,7 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
  * The I/O scheduler depends on the number of hardware queues, this forces a
  * reattachment when nr_hw_queues changes.
  */
-void elv_update_nr_hw_queues(struct request_queue *q)
+void elv_update_nr_hw_queues(struct request_queue *q, struct xarray *et_table)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
 	struct elv_change_ctx ctx = {};
@@ -720,7 +720,7 @@ void elv_update_nr_hw_queues(struct request_queue *q)
 	 * acquires same lock in the reader context) can't run concurrently.
 	 */
 	if (q->elevator) {
-		ctx.et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
+		ctx.et = xa_load(et_table, q->id);
 		if (!ctx.et) {
 			WARN_ON_ONCE(1);
 			return;
-- 
2.49.0
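
[Not part of the patch proper: a minimal sketch of the call flow this
patch establishes in __blk_mq_update_nr_hw_queues(), for readers who
want the shape at a glance. Queue freezing, hctx remapping and the
debugfs/sysfs re-registration are elided, and the bare "out" label
below stands in for the patch's out_memalloc_restore path; everything
else is taken from the diff above.]

	struct xarray et_table;

	xa_init(&et_table);

	/* Pre-allocate sched_tags for every elevator-backed queue,
	 * outside both ->elevator_lock and ->freeze_lock. */
	if (blk_mq_alloc_sched_tags_batch(&et_table, set, nr_hw_queues) < 0)
		goto out;

	/* ... freeze queues, blk_mq_realloc_tag_set_tags(), remap ... */

	/* Each elv_update_nr_hw_queues() call now picks up its
	 * pre-allocated tags via xa_load() keyed by q->id instead of
	 * allocating them while holding the locks. */
	list_for_each_entry(q, &set->tag_list, tag_set_list)
		elv_update_nr_hw_queues(q, &et_table);

out:
	xa_destroy(&et_table);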