Recent lockdep reports [1] have revealed a potential deadlock caused by a
lock dependency between the percpu allocator lock and the elevator lock.
This issue can be avoided by ensuring that the allocation and release of
scheduler tags (sched_tags) are performed outside the elevator lock.
Furthermore, the queue does not need to remain frozen during these
operations.

To address this, move all sched_tags allocations and deallocations outside
of both the ->elevator_lock and the ->freeze_lock. Since the lifetimes of
the elevator queue and its associated sched_tags are closely tied, the
allocated sched_tags are now stored in the elevator queue structure. Then,
during the actual elevator switch (which runs under ->freeze_lock and
->elevator_lock), the pre-allocated sched_tags are assigned to the
appropriate q->hctx. Once the elevator switch is complete and the locks
are released, the old elevator queue and its associated sched_tags are
freed.

This commit specifically addresses the allocation/deallocation of
sched_tags during elevator switching. Note that sched_tags may also be
allocated in other contexts, such as during nr_hw_queues updates.
Supporting that use case will require batch allocation/deallocation, which
will be handled in a follow-up patch.

This restructuring ensures that sched_tags memory management occurs
entirely outside of the ->elevator_lock and ->freeze_lock context,
eliminating the lock dependency problem seen during scheduler updates.

[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@xxxxxxxxxxxxx/

Reported-by: Stefan Haberland <sth@xxxxxxxxxxxxx>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@xxxxxxxxxxxxx/
Signed-off-by: Nilay Shroff <nilay@xxxxxxxxxxxxx>
---
 block/blk-mq-sched.c | 186 ++++++++++++++++++++++++++-----------------
 block/blk-mq-sched.h |  14 +++-
 block/elevator.c     |  66 +++++++++++++--
 block/elevator.h     |  19 ++++-
 4 files changed, 204 insertions(+), 81 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 359e0704e09b..5d3132ac7777 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
 
-static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
-					  struct blk_mq_hw_ctx *hctx,
-					  unsigned int hctx_idx)
-{
-	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
-		hctx->sched_tags = q->sched_shared_tags;
-		return 0;
-	}
-
-	hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
-						    q->nr_requests);
-
-	if (!hctx->sched_tags)
-		return -ENOMEM;
-	return 0;
-}
-
-static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
-{
-	blk_mq_free_rq_map(queue->sched_shared_tags);
-	queue->sched_shared_tags = NULL;
-}
-
 /* called in queue's release handler, tagset has gone away */
 static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
 {
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->sched_tags) {
-			if (!blk_mq_is_shared_tags(flags))
-				blk_mq_free_rq_map(hctx->sched_tags);
-			hctx->sched_tags = NULL;
-		}
-	}
+	queue_for_each_hw_ctx(q, hctx, i)
+		hctx->sched_tags = NULL;
 
 	if (blk_mq_is_shared_tags(flags))
-		blk_mq_exit_sched_shared_tags(q);
-}
-
-static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
-{
-	struct blk_mq_tag_set *set = queue->tag_set;
-
-	/*
-	 * Set initial depth at max so that we don't need to reallocate for
-	 * updating nr_requests.
-	 */
-	queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
-						BLK_MQ_NO_HCTX_IDX,
-						MAX_SCHED_RQ);
-	if (!queue->sched_shared_tags)
-		return -ENOMEM;
-
-	blk_mq_tag_update_sched_shared_tags(queue);
-
-	return 0;
+		q->sched_shared_tags = NULL;
 }
 
 void blk_mq_sched_reg_debugfs(struct request_queue *q)
@@ -458,8 +411,106 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
 	mutex_unlock(&q->debugfs_mutex);
 }
 
+void __blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
+		struct blk_mq_tags **tags, unsigned int nr_hw_queues)
+{
+	unsigned long i;
+
+	if (!tags)
+		return;
+
+	/* Shared tags are stored at index 0 in @tags. */
+	if (blk_mq_is_shared_tags(set->flags))
+		blk_mq_free_map_and_rqs(set, tags[0], BLK_MQ_NO_HCTX_IDX);
+	else {
+		for (i = 0; i < nr_hw_queues; i++)
+			blk_mq_free_map_and_rqs(set, tags[i], i);
+	}
+
+	kfree(tags);
+}
+
+void blk_mq_free_sched_tags(struct elevator_tags *et,
+		struct blk_mq_tag_set *set, int id)
+{
+	struct blk_mq_tags **tags;
+
+	tags = xa_load(&et->tags_table, id);
+	__blk_mq_free_sched_tags(set, tags, et->nr_hw_queues);
+}
+
+struct blk_mq_tags **__blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+		unsigned int nr_hw_queues,
+		unsigned int nr_requests,
+		gfp_t gfp)
+{
+	int i, nr_tags;
+	struct blk_mq_tags **tags;
+
+	if (blk_mq_is_shared_tags(set->flags))
+		nr_tags = 1;
+	else
+		nr_tags = nr_hw_queues;
+
+	tags = kcalloc(nr_tags, sizeof(struct blk_mq_tags *), gfp);
+	if (!tags)
+		return NULL;
+
+	if (blk_mq_is_shared_tags(set->flags)) {
+		/* Shared tags are stored at index 0 in @tags. */
+		tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
+				MAX_SCHED_RQ);
+		if (!tags[0])
+			goto out;
+	} else {
+		for (i = 0; i < nr_hw_queues; i++) {
+			tags[i] = blk_mq_alloc_map_and_rqs(set, i, nr_requests);
+			if (!tags[i])
+				goto out_unwind;
+		}
+	}
+
+	return tags;
+out_unwind:
+	while (--i >= 0)
+		blk_mq_free_map_and_rqs(set, tags[i], i);
+out:
+	kfree(tags);
+	return NULL;
+}
+
+int blk_mq_alloc_sched_tags(struct elevator_tags *et,
+		struct blk_mq_tag_set *set, int id)
+{
+	struct blk_mq_tags **tags;
+	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
+
+	/*
+	 * Default to double of smaller one between hw queue_depth and
+	 * 128, since we don't split into sync/async like the old code
+	 * did. Additionally, this is a per-hw queue depth.
+	 */
+	et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
+			BLKDEV_DEFAULT_RQ);
+
+	tags = __blk_mq_alloc_sched_tags(set, et->nr_hw_queues,
+			et->nr_requests, gfp);
+	if (!tags)
+		return -ENOMEM;
+
+	if (xa_insert(&et->tags_table, id, tags, gfp))
+		goto out_free_tags;
+
+	return 0;
+
+out_free_tags:
+	__blk_mq_free_sched_tags(set, tags, et->nr_hw_queues);
+	return -ENOMEM;
+}
+
 /* caller must have a reference to @e, will grab another one if successful */
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+		struct elevator_tags *et)
 {
 	unsigned int flags = q->tag_set->flags;
 	struct blk_mq_hw_ctx *hctx;
@@ -467,40 +518,33 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 	unsigned long i;
 	int ret;
 
-	/*
-	 * Default to double of smaller one between hw queue_depth and 128,
-	 * since we don't split into sync/async like the old code did.
-	 * Additionally, this is a per-hw queue depth.
-	 */
-	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
-				   BLKDEV_DEFAULT_RQ);
-
-	eq = elevator_alloc(q, e);
+	eq = elevator_alloc(q, e, et);
 	if (!eq)
 		return -ENOMEM;
 
+	q->nr_requests = et->nr_requests;
+
 	if (blk_mq_is_shared_tags(flags)) {
-		ret = blk_mq_init_sched_shared_tags(q);
-		if (ret)
-			goto err_put_elevator;
+		/* Shared tags are stored at index 0 in @eq->tags. */
+		q->sched_shared_tags = eq->tags[0];
+		blk_mq_tag_update_sched_shared_tags(q);
 	}
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
-		if (ret)
-			goto err_free_map_and_rqs;
+		if (blk_mq_is_shared_tags(flags))
+			hctx->sched_tags = q->sched_shared_tags;
+		else
+			hctx->sched_tags = eq->tags[i];
 	}
 
 	ret = e->ops.init_sched(q, eq);
 	if (ret)
-		goto err_free_map_and_rqs;
+		goto out;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (e->ops.init_hctx) {
 			ret = e->ops.init_hctx(hctx, i);
 			if (ret) {
-				eq = q->elevator;
-				blk_mq_sched_free_rqs(q);
 				blk_mq_exit_sched(q, eq);
 				kobject_put(&eq->kobj);
 				return ret;
@@ -509,10 +553,8 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 	}
 
 	return 0;
-err_free_map_and_rqs:
-	blk_mq_sched_free_rqs(q);
+out:
 	blk_mq_sched_tags_teardown(q, flags);
-err_put_elevator:
 	kobject_put(&eq->kobj);
 	q->elevator = NULL;
 	return ret;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 1326526bb733..92aa50b8376a 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -18,7 +18,19 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
 
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
+int blk_mq_alloc_sched_tags(struct elevator_tags *et,
+		struct blk_mq_tag_set *set, int id);
+void blk_mq_free_sched_tags(struct elevator_tags *et,
+		struct blk_mq_tag_set *set, int id);
+struct blk_mq_tags **__blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+		unsigned int nr_hw_queues,
+		unsigned int nr_requests,
+		gfp_t gfp);
+void __blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
+		struct blk_mq_tags **tags, unsigned int nr_hw_queues);
+
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+		struct elevator_tags *et);
 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
 void blk_mq_sched_free_rqs(struct request_queue *q);
diff --git a/block/elevator.c b/block/elevator.c
index 770874040f79..1408894c0396 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -54,6 +54,8 @@ struct elv_change_ctx {
 	struct elevator_queue *old;
 	/* for registering new elevator */
 	struct elevator_queue *new;
+	/* holds sched tags data */
+	struct elevator_tags *et;
 };
 
 static DEFINE_SPINLOCK(elv_list_lock);
@@ -132,7 +134,7 @@ static struct elevator_type *elevator_find_get(const char *name)
 static const struct kobj_type elv_ktype;
 
 struct elevator_queue *elevator_alloc(struct request_queue *q,
-				  struct elevator_type *e)
+		struct elevator_type *e, struct elevator_tags *et)
 {
 	struct elevator_queue *eq;
 
@@ -145,8 +147,15 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
 	kobject_init(&eq->kobj, &elv_ktype);
 	mutex_init(&eq->sysfs_lock);
 	hash_init(eq->hash);
+	eq->nr_hw_queues = et->nr_hw_queues;
+	eq->tags = xa_load(&et->tags_table, q->id);
+	if (WARN_ON_ONCE(!eq->tags))
+		goto out;
 
 	return eq;
+out:
+	kobject_put(&eq->kobj);
+	return NULL;
 }
 
 static void elevator_release(struct kobject *kobj)
@@ -165,7 +174,6 @@ static void elevator_exit(struct request_queue *q)
 	lockdep_assert_held(&q->elevator_lock);
 
 	ioc_clear_queue(q);
-	blk_mq_sched_free_rqs(q);
 
 	mutex_lock(&e->sysfs_lock);
 	blk_mq_exit_sched(q, e);
@@ -591,7 +599,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
 	}
 
 	if (new_e) {
-		ret = blk_mq_init_sched(q, new_e);
+		ret = blk_mq_init_sched(q, new_e, ctx->et);
 		if (ret)
 			goto out_unfreeze;
 		ctx->new = q->elevator;
@@ -626,8 +634,10 @@ static void elv_exit_and_release(struct request_queue *q)
 	elevator_exit(q);
 	mutex_unlock(&q->elevator_lock);
 	blk_mq_unfreeze_queue(q, memflags);
-	if (e)
+	if (e) {
+		__blk_mq_free_sched_tags(q->tag_set, e->tags, e->nr_hw_queues);
 		kobject_put(&e->kobj);
+	}
 }
 
 static int elevator_change_done(struct request_queue *q,
@@ -640,6 +650,8 @@ static int elevator_change_done(struct request_queue *q,
 				&ctx->old->flags);
 
 		elv_unregister_queue(q, ctx->old);
+		__blk_mq_free_sched_tags(q->tag_set, ctx->old->tags,
+				ctx->old->nr_hw_queues);
 		kobject_put(&ctx->old->kobj);
 		if (enable_wbt)
 			wbt_enable_default(q->disk);
@@ -658,9 +670,20 @@ static int elevator_change_done(struct request_queue *q,
 static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
 {
 	unsigned int memflags;
+	struct elevator_tags et;
+	struct blk_mq_tag_set *set = q->tag_set;
 	int ret = 0;
 
-	lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
+	lockdep_assert_held(&set->update_nr_hwq_lock);
+
+	et.nr_hw_queues = set->nr_hw_queues;
+	xa_init(&et.tags_table);
+	ctx->et = &et;
+	if (strncmp(ctx->name, "none", 4)) {
+		ret = blk_mq_alloc_sched_tags(ctx->et, set, q->id);
+		if (ret)
+			goto out;
+	}
 
 	memflags = blk_mq_freeze_queue(q);
 	/*
@@ -680,7 +703,13 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
 	blk_mq_unfreeze_queue(q, memflags);
 	if (!ret)
 		ret = elevator_change_done(q, ctx);
-
+	/*
+	 * Free sched tags if it's allocated but we couldn't switch elevator.
+	 */
+	if (!xa_empty(&ctx->et->tags_table) && !ctx->new)
+		blk_mq_free_sched_tags(ctx->et, set, q->id);
+out:
+	xa_destroy(&ctx->et->tags_table);
 	return ret;
 }
 
@@ -690,11 +719,29 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
  */
 void elv_update_nr_hw_queues(struct request_queue *q)
 {
+	struct blk_mq_tag_set *set = q->tag_set;
+	struct elevator_tags et;
 	struct elv_change_ctx ctx = {};
 	int ret = -ENODEV;
 
 	WARN_ON_ONCE(q->mq_freeze_depth == 0);
 
+	et.nr_hw_queues = set->nr_hw_queues;
+	xa_init(&et.tags_table);
+	ctx.et = &et;
+
+	/*
+	 * Accessing q->elevator without holding q->elevator_lock is safe here
+	 * because nr_hw_queue update is protected by set->update_nr_hwq_lock
+	 * in the writer context. So, scheduler update/switch code (which
+	 * acquires same lock in the reader context) can't run concurrently.
+	 */
+	if (q->elevator) {
+		if (blk_mq_alloc_sched_tags(ctx.et, set, q->id)) {
+			WARN_ON_ONCE(1);
+			goto out;
+		}
+	}
+
 	mutex_lock(&q->elevator_lock);
 	if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
 		ctx.name = q->elevator->type->elevator_name;
@@ -706,6 +753,13 @@ void elv_update_nr_hw_queues(struct request_queue *q)
 	blk_mq_unfreeze_queue_nomemrestore(q);
 	if (!ret)
 		WARN_ON_ONCE(elevator_change_done(q, &ctx));
+	/*
+	 * Free sched tags if it's allocated but we couldn't switch elevator.
+	 */
+	if (!xa_empty(&ctx.et->tags_table) && !ctx.new)
+		blk_mq_free_sched_tags(ctx.et, set, q->id);
+out:
+	xa_destroy(&ctx.et->tags_table);
 }
 
 /*
diff --git a/block/elevator.h b/block/elevator.h
index a4de5f9ad790..849c264031ca 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -23,6 +23,19 @@ enum elv_merge {
 struct blk_mq_alloc_data;
 struct blk_mq_hw_ctx;
 
+struct elevator_tags {
+	/* num. of hardware queues for which tags are allocated */
+	unsigned int nr_hw_queues;
+	/* depth used while allocating tags */
+	unsigned int nr_requests;
+	/*
+	 * An Xarray table storing shared and per hardware queue sched tags.
+	 * The key to store an entry in Xarray is q->id. Please note that
+	 * shared sched tags are always stored at index 0.
+	 */
+	struct xarray tags_table;
+};
+
 struct elevator_mq_ops {
 	int (*init_sched)(struct request_queue *, struct elevator_queue *);
 	void (*exit_sched)(struct elevator_queue *);
@@ -113,6 +126,8 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
 struct elevator_queue
 {
 	struct elevator_type *type;
+	struct blk_mq_tags **tags;
+	unsigned int nr_hw_queues;
 	void *elevator_data;
 	struct kobject kobj;
 	struct mutex sysfs_lock;
@@ -152,8 +167,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *page);
 ssize_t elv_iosched_store(struct gendisk *disk, const char *page,
 		size_t count);
 
 extern bool elv_bio_merge_ok(struct request *, struct bio *);
-extern struct elevator_queue *elevator_alloc(struct request_queue *,
-		struct elevator_type *);
+struct elevator_queue *elevator_alloc(struct request_queue *,
+		struct elevator_type *, struct elevator_tags *);
 
 /*
  * Helper functions.
-- 
2.49.0
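
P.S. (illustration only, not part of the patch): the ordering this change
enforces is "allocate before locking, publish under the lock, free after
unlocking". Below is a minimal, self-contained userspace C sketch of that
pattern; struct demo_queue, demo_switch() and the pthread mutex are
stand-ins of my own invention, not kernel APIs, so read it as an analogy
for how elevator_change() now sequences sched_tags handling, not as the
implementation itself.

/*
 * Userspace analogy -- NOT kernel code.
 * Build with: cc -pthread demo.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_tags {			/* stand-in for sched_tags */
	unsigned int nr_requests;
};

struct demo_queue {			/* stand-in for request_queue */
	pthread_mutex_t elevator_lock;
	struct demo_tags *tags;		/* protected by elevator_lock */
};

static int demo_switch(struct demo_queue *q, unsigned int depth)
{
	struct demo_tags *new_tags, *old_tags;

	/* Step 1: allocate with no locks held, so a blocking allocator
	 * can never nest inside elevator_lock and form a lock cycle. */
	new_tags = malloc(sizeof(*new_tags));
	if (!new_tags)
		return -1;
	new_tags->nr_requests = depth;

	/* Step 2: only the cheap pointer swap runs under the lock. */
	pthread_mutex_lock(&q->elevator_lock);
	old_tags = q->tags;
	q->tags = new_tags;
	pthread_mutex_unlock(&q->elevator_lock);

	/* Step 3: free the old tags after the lock is dropped. */
	free(old_tags);
	return 0;
}

int main(void)
{
	struct demo_queue q = {
		.elevator_lock = PTHREAD_MUTEX_INITIALIZER,
		.tags = NULL,
	};

	if (demo_switch(&q, 128))
		return 1;
	printf("new depth: %u\n", q.tags->nr_requests);
	free(q.tags);
	return 0;
}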