On Tue, Jun 24, 2025 at 06:47:04PM +0530, Nilay Shroff wrote:
> Recent lockdep reports [1] have revealed a potential deadlock caused by a
> lock dependency between the percpu allocator lock and the elevator lock.
> This issue can be avoided by ensuring that the allocation and release of
> scheduler tags (sched_tags) are performed outside the elevator lock.
> Furthermore, the queue does not need to remain frozen during these
> operations.
>
> To address this, move all sched_tags allocations and deallocations outside
> of both the ->elevator_lock and the ->freeze_lock. Since the lifetime of
> the elevator queue and its associated sched_tags is closely tied, the
> allocated sched_tags are now stored in the elevator queue structure. Then,
> during the actual elevator switch (which runs under ->freeze_lock and
> ->elevator_lock), the pre-allocated sched_tags are assigned to the
> appropriate q->hctx. Once the elevator switch is complete and the locks
> are released, the old elevator queue and its associated sched_tags are
> freed.
>
> This commit specifically addresses the allocation/deallocation of
> sched_tags during elevator switching. Note that sched_tags may also be
> allocated in other contexts, such as during nr_hw_queues updates.
> Supporting that use case will require batch allocation/deallocation,
> which will be handled in a follow-up patch.
>
> This restructuring ensures that sched_tags memory management occurs
> entirely outside of the ->elevator_lock and ->freeze_lock context,
> eliminating the lock dependency problem seen during scheduler updates.
>
> [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@xxxxxxxxxxxxx/
>
> Reported-by: Stefan Haberland <sth@xxxxxxxxxxxxx>
> Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@xxxxxxxxxxxxx/
> Signed-off-by: Nilay Shroff <nilay@xxxxxxxxxxxxx>
> ---
>  block/blk-mq-sched.c | 186 ++++++++++++++++++++++++++-----------------
>  block/blk-mq-sched.h |  14 +++-
>  block/elevator.c     |  66 +++++++++++++--
>  block/elevator.h     |  19 ++++-
>  4 files changed, 204 insertions(+), 81 deletions(-)
>
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 359e0704e09b..5d3132ac7777 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
>
> -static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
> -                                          struct blk_mq_hw_ctx *hctx,
> -                                          unsigned int hctx_idx)
> -{
> -        if (blk_mq_is_shared_tags(q->tag_set->flags)) {
> -                hctx->sched_tags = q->sched_shared_tags;
> -                return 0;
> -        }
> -
> -        hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
> -                                                    q->nr_requests);
> -
> -        if (!hctx->sched_tags)
> -                return -ENOMEM;
> -        return 0;
> -}
> -
> -static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
> -{
> -        blk_mq_free_rq_map(queue->sched_shared_tags);
> -        queue->sched_shared_tags = NULL;
> -}
> -
>  /* called in queue's release handler, tagset has gone away */
>  static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
>  {
>          struct blk_mq_hw_ctx *hctx;
>          unsigned long i;
>
> -        queue_for_each_hw_ctx(q, hctx, i) {
> -                if (hctx->sched_tags) {
> -                        if (!blk_mq_is_shared_tags(flags))
> -                                blk_mq_free_rq_map(hctx->sched_tags);
> -                        hctx->sched_tags = NULL;
> -                }
> -        }
> +        queue_for_each_hw_ctx(q, hctx, i)
> +                hctx->sched_tags = NULL;
>
>          if (blk_mq_is_shared_tags(flags))
> -                blk_mq_exit_sched_shared_tags(q);
> -}
> -
> -static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
> -{
> -        struct blk_mq_tag_set *set = queue->tag_set;
> -
> -        /*
> -         * Set initial depth at max so that we don't need to reallocate for
> -         * updating nr_requests.
> -         */
> -        queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
> -                                                            BLK_MQ_NO_HCTX_IDX,
> -                                                            MAX_SCHED_RQ);
> -        if (!queue->sched_shared_tags)
> -                return -ENOMEM;
> -
> -        blk_mq_tag_update_sched_shared_tags(queue);
> -
> -        return 0;
> +                q->sched_shared_tags = NULL;
>  }
>
>  void blk_mq_sched_reg_debugfs(struct request_queue *q)
> @@ -458,8 +411,106 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
>          mutex_unlock(&q->debugfs_mutex);
>  }
>
> +void __blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
> +                struct blk_mq_tags **tags, unsigned int nr_hw_queues)
> +{
> +        unsigned long i;
> +
> +        if (!tags)
> +                return;
> +
> +        /* Shared tags are stored at index 0 in @tags. */
> +        if (blk_mq_is_shared_tags(set->flags))
> +                blk_mq_free_map_and_rqs(set, tags[0], BLK_MQ_NO_HCTX_IDX);
> +        else {
> +                for (i = 0; i < nr_hw_queues; i++)
> +                        blk_mq_free_map_and_rqs(set, tags[i], i);
> +        }
> +
> +        kfree(tags);
> +}
> +
> +void blk_mq_free_sched_tags(struct elevator_tags *et,
> +                struct blk_mq_tag_set *set, int id)
> +{
> +        struct blk_mq_tags **tags;
> +
> +        tags = xa_load(&et->tags_table, id);
> +        __blk_mq_free_sched_tags(set, tags, et->nr_hw_queues);
> +}
> +
> +struct blk_mq_tags **__blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
> +                unsigned int nr_hw_queues, unsigned int nr_requests,
> +                gfp_t gfp)
> +{
> +        int i, nr_tags;
> +        struct blk_mq_tags **tags;
> +
> +        if (blk_mq_is_shared_tags(set->flags))
> +                nr_tags = 1;
> +        else
> +                nr_tags = nr_hw_queues;
> +
> +        tags = kcalloc(nr_tags, sizeof(struct blk_mq_tags *), gfp);
> +        if (!tags)
> +                return NULL;
> +
> +        if (blk_mq_is_shared_tags(set->flags)) {
> +                /* Shared tags are stored at index 0 in @tags. */
> +                tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
> +                                MAX_SCHED_RQ);
> +                if (!tags[0])
> +                        goto out;
> +        } else {
> +                for (i = 0; i < nr_hw_queues; i++) {
> +                        tags[i] = blk_mq_alloc_map_and_rqs(set, i, nr_requests);
> +                        if (!tags[i])
> +                                goto out_unwind;
> +                }
> +        }
> +
> +        return tags;
> +out_unwind:
> +        while (--i >= 0)
> +                blk_mq_free_map_and_rqs(set, tags[i], i);
> +out:
> +        kfree(tags);
> +        return NULL;
> +}
> +
> +int blk_mq_alloc_sched_tags(struct elevator_tags *et,
> +                struct blk_mq_tag_set *set, int id)
> +{
> +        struct blk_mq_tags **tags;
> +        gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
> +
> +        /*
> +         * Default to double of smaller one between hw queue_depth and
> +         * 128, since we don't split into sync/async like the old code
> +         * did. Additionally, this is a per-hw queue depth.
> +         */
> +        et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
> +                        BLKDEV_DEFAULT_RQ);
> +
> +        tags = __blk_mq_alloc_sched_tags(set, et->nr_hw_queues,
> +                        et->nr_requests, gfp);
> +        if (!tags)
> +                return -ENOMEM;
> +
> +        if (xa_insert(&et->tags_table, id, tags, gfp))
> +                goto out_free_tags;

Not sure why you add `tags_table` into `elevator_tags`; it looks very
confusing just from the naming, not to mention that only a single element
is stored in the table in the elevator switch code path, since the queue
id is the key.

Wrt. the xarray suggestion, I meant adding an xarray local variable in
__blk_mq_update_nr_hw_queues(); then you can store the allocated
`elevator_tags` in the xarray via the ->id of the request_queue.

Thanks,
Ming
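
P.S. Roughly what I have in mind, just to illustrate the idea. This is an
untested sketch, not the actual patch: alloc_elevator_tags()/
free_elevator_tags() are made-up stand-ins for whatever per-queue
sched_tags allocation helpers the series ends up with; only the local
xarray keyed by q->id is the point here.

        /*
         * Sketch: pre-allocate elevator tags for every queue in the tagset
         * outside of ->freeze_lock/->elevator_lock, keyed by q->id in a
         * local xarray on the stack of __blk_mq_update_nr_hw_queues().
         */
        static int blk_mq_prealloc_elevator_tags(struct blk_mq_tag_set *set,
                        struct xarray *elv_tables)
        {
                struct request_queue *q;
                struct elevator_tags *et;

                list_for_each_entry(q, &set->tag_list, tag_set_list) {
                        et = alloc_elevator_tags(set, q);   /* hypothetical */
                        if (!et)
                                goto out_unwind;
                        if (xa_insert(elv_tables, q->id, et, GFP_KERNEL)) {
                                free_elevator_tags(et);     /* hypothetical */
                                goto out_unwind;
                        }
                }
                return 0;

        out_unwind:
                /* Drop whatever was already stored for earlier queues. */
                list_for_each_entry(q, &set->tag_list, tag_set_list) {
                        et = xa_erase(elv_tables, q->id);
                        if (et)
                                free_elevator_tags(et);
                }
                return -ENOMEM;
        }

Then __blk_mq_update_nr_hw_queues() would do something like:

        struct xarray elv_tables;

        xa_init(&elv_tables);
        if (blk_mq_prealloc_elevator_tags(set, &elv_tables))
                return;
        /*
         * ... freeze queues and take ->elevator_lock, then fetch the
         * pre-allocated tags with xa_load(&elv_tables, q->id) during the
         * actual elevator switch, and xa_destroy(&elv_tables) at the end.
         */

That way nothing specific to the nr_hw_queues update path needs to live
inside `elevator_tags` itself.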