Recent lockdep reports [1] have revealed a potential deadlock caused by a
lock dependency between the percpu allocator lock and the elevator lock.
This issue can be avoided by ensuring that the allocation and release of
scheduler tags (sched_tags) are performed outside the elevator lock.
Furthermore, the queue does not need to remain frozen during these
operations.

To address this, move all sched_tags allocations and deallocations outside
of both the ->elevator_lock and the ->freeze_lock. Moreover, since the
lifetimes of the elevator queue and its associated sched_tags are closely
tied, the allocated sched_tags are now stored in the elevator queue
structure. Then, during the actual elevator switch (which runs under
->freeze_lock and ->elevator_lock), the pre-allocated sched_tags are
assigned to the appropriate q->hctx. Once the elevator switch is complete
and the locks are released, the old elevator queue and its associated
sched_tags are freed.

This restructuring ensures that sched_tags memory management occurs
entirely outside of the ->elevator_lock and ->freeze_lock context,
eliminating the lock dependency problem seen during scheduler updates or
hardware queue updates.

[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@xxxxxxxxxxxxx/

Reported-by: Stefan Haberland <sth@xxxxxxxxxxxxx>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@xxxxxxxxxxxxx/
Signed-off-by: Nilay Shroff <nilay@xxxxxxxxxxxxx>
---
 block/blk-mq-sched.c | 244 +++++++++++++++++++++++++++++++------------
 block/blk-mq-sched.h |  13 ++-
 block/blk-mq.c       |  14 ++-
 block/blk.h          |   4 +-
 block/elevator.c     |  82 +++++++++++----
 block/elevator.h     |  28 ++++-
 6 files changed, 296 insertions(+), 89 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index d914eb9d61a6..dd00d8c9fb78 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -374,27 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
 
-static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
-                                          struct blk_mq_hw_ctx *hctx,
-                                          unsigned int hctx_idx)
+static void blk_mq_init_sched_tags(struct request_queue *q,
+                                   struct blk_mq_hw_ctx *hctx,
+                                   unsigned int hctx_idx,
+                                   struct elevator_queue *eq)
 {
         if (blk_mq_is_shared_tags(q->tag_set->flags)) {
                 hctx->sched_tags = q->sched_shared_tags;
-                return 0;
+                return;
         }
 
-        hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
-                                                    q->nr_requests);
-
-        if (!hctx->sched_tags)
-                return -ENOMEM;
-        return 0;
-}
-
-static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
-{
-        blk_mq_free_rq_map(queue->sched_shared_tags);
-        queue->sched_shared_tags = NULL;
+        hctx->sched_tags = eq->tags->u.tags[hctx_idx];
 }
 
 /* called in queue's release handler, tagset has gone away */
@@ -403,35 +393,18 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int fla
         struct blk_mq_hw_ctx *hctx;
         unsigned long i;
 
-        queue_for_each_hw_ctx(q, hctx, i) {
-                if (hctx->sched_tags) {
-                        if (!blk_mq_is_shared_tags(flags))
-                                blk_mq_free_rq_map(hctx->sched_tags);
-                        hctx->sched_tags = NULL;
-                }
-        }
+        queue_for_each_hw_ctx(q, hctx, i)
+                hctx->sched_tags = NULL;
 
         if (blk_mq_is_shared_tags(flags))
-                blk_mq_exit_sched_shared_tags(q);
+                q->sched_shared_tags = NULL;
 }
 
-static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
+static void blk_mq_init_sched_shared_tags(struct request_queue *queue,
+                                          struct elevator_queue *eq)
 {
-        struct blk_mq_tag_set *set = queue->tag_set;
-
-        /*
-         * Set initial depth at max so that we don't need to reallocate for
-         * updating nr_requests.
-         */
-        queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
-                                                BLK_MQ_NO_HCTX_IDX,
-                                                MAX_SCHED_RQ);
-        if (!queue->sched_shared_tags)
-                return -ENOMEM;
-
+        queue->sched_shared_tags = eq->tags->u.shared_tags;
         blk_mq_tag_update_sched_shared_tags(queue);
-
-        return 0;
 }
 
 void blk_mq_sched_reg_debugfs(struct request_queue *q)
@@ -458,8 +431,165 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
         mutex_unlock(&q->debugfs_mutex);
 }
 
+void __blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
+                struct elevator_tags *tags)
+{
+        unsigned long i;
+
+        if (!tags)
+                return;
+
+        if (blk_mq_is_shared_tags(set->flags)) {
+                if (tags->u.shared_tags) {
+                        blk_mq_free_rqs(set, tags->u.shared_tags,
+                                        BLK_MQ_NO_HCTX_IDX);
+                        blk_mq_free_rq_map(tags->u.shared_tags);
+                }
+                goto out;
+        }
+
+        if (!tags->u.tags)
+                goto out;
+
+        for (i = 0; i < tags->nr_hw_queues; i++) {
+                if (tags->u.tags[i]) {
+                        blk_mq_free_rqs(set, tags->u.tags[i], i);
+                        blk_mq_free_rq_map(tags->u.tags[i]);
+                }
+        }
+
+        kfree(tags->u.tags);
+out:
+        kfree(tags);
+}
+
+void blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
+                struct elevator_tags **tags)
+{
+        unsigned long i, count = 0;
+        struct request_queue *q;
+
+        lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+        if (!tags)
+                return;
+
+        /*
+         * Accessing q->elevator without holding q->elevator_lock is safe
+         * because we're holding here set->update_nr_hwq_lock in the writer
+         * context. So, scheduler update/switch code (which acquires the same
+         * lock but in the reader context) can't run concurrently.
+         */
+        list_for_each_entry(q, &set->tag_list, tag_set_list) {
+                if (q->elevator)
+                        count++;
+        }
+
+        for (i = 0; i < count; i++)
+                __blk_mq_free_sched_tags(set, tags[i]);
+
+        kfree(tags);
+}
+
+struct elevator_tags *__blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+                unsigned int nr_hw_queues, int id)
+{
+        unsigned int i;
+        struct elevator_tags *tags;
+        gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
+
+        tags = kcalloc(1, sizeof(struct elevator_tags), gfp);
+        if (!tags)
+                return NULL;
+
+        tags->id = id;
+        /*
+         * Default to double of smaller one between hw queue_depth and
+         * 128, since we don't split into sync/async like the old code
+         * did. Additionally, this is a per-hw queue depth.
+         */
+        tags->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
+                        BLKDEV_DEFAULT_RQ);
+
+        if (blk_mq_is_shared_tags(set->flags)) {
+
+                tags->u.shared_tags = blk_mq_alloc_map_and_rqs(set,
+                                BLK_MQ_NO_HCTX_IDX,
+                                MAX_SCHED_RQ);
+                if (!tags->u.shared_tags)
+                        goto out;
+
+                return tags;
+        }
+
+        tags->u.tags = kcalloc(nr_hw_queues, sizeof(struct blk_mq_tags *), gfp);
+        if (!tags->u.tags)
+                goto out;
+
+        tags->nr_hw_queues = nr_hw_queues;
+        for (i = 0; i < nr_hw_queues; i++) {
+                tags->u.tags[i] = blk_mq_alloc_map_and_rqs(set, i,
+                                tags->nr_requests);
+                if (!tags->u.tags[i])
+                        goto out;
+        }
+
+        return tags;
+
+out:
+        __blk_mq_free_sched_tags(set, tags);
+        return NULL;
+}
+
+int blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+                unsigned int nr_hw_queues,
+                struct elevator_tags ***elv_tags)
+{
+        unsigned long idx = 0, count = 0;
+        struct request_queue *q;
+        struct elevator_tags **tags;
+        gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
+
+        lockdep_assert_held_write(&set->update_nr_hwq_lock);
+        /*
+         * Accessing q->elevator without holding q->elevator_lock is safe
+         * because we're holding here set->update_nr_hwq_lock in the writer
+         * context. So, scheduler update/switch code (which acquires the same
+         * lock but in the reader context) can't run concurrently.
+         */
+        list_for_each_entry(q, &set->tag_list, tag_set_list) {
+                if (q->elevator)
+                        count++;
+        }
+
+        if (!count)
+                return 0;
+
+        tags = kcalloc(count, sizeof(struct elevator_tags *), gfp);
+        if (!tags)
+                return -ENOMEM;
+
+        list_for_each_entry(q, &set->tag_list, tag_set_list) {
+                if (q->elevator) {
+                        tags[idx] = __blk_mq_alloc_sched_tags(set, nr_hw_queues,
+                                        q->id);
+                        if (!tags[idx])
+                                goto out;
+
+                        idx++;
+                }
+        }
+
+        *elv_tags = tags;
+        return count;
+out:
+        blk_mq_free_sched_tags(set, tags);
+        return -ENOMEM;
+}
+
 /* caller must have a reference to @e, will grab another one if successful */
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+                struct elevator_tags *tags)
 {
         unsigned int flags = q->tag_set->flags;
         struct blk_mq_hw_ctx *hctx;
@@ -467,40 +597,26 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
         unsigned long i;
         int ret;
 
-        /*
-         * Default to double of smaller one between hw queue_depth and 128,
-         * since we don't split into sync/async like the old code did.
-         * Additionally, this is a per-hw queue depth.
-         */
-        q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
-                                   BLKDEV_DEFAULT_RQ);
-
-        eq = elevator_alloc(q, e);
+        eq = elevator_alloc(q, e, tags);
         if (!eq)
                 return -ENOMEM;
 
-        if (blk_mq_is_shared_tags(flags)) {
-                ret = blk_mq_init_sched_shared_tags(q);
-                if (ret)
-                        return ret;
-        }
+        q->nr_requests = tags->nr_requests;
 
-        queue_for_each_hw_ctx(q, hctx, i) {
-                ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
-                if (ret)
-                        goto err_free_map_and_rqs;
-        }
+        if (blk_mq_is_shared_tags(flags))
+                blk_mq_init_sched_shared_tags(q, eq);
+
+        queue_for_each_hw_ctx(q, hctx, i)
+                blk_mq_init_sched_tags(q, hctx, i, eq);
 
         ret = e->ops.init_sched(q, eq);
         if (ret)
-                goto err_free_map_and_rqs;
+                goto out;
 
         queue_for_each_hw_ctx(q, hctx, i) {
                 if (e->ops.init_hctx) {
                         ret = e->ops.init_hctx(hctx, i);
                         if (ret) {
-                                eq = q->elevator;
-                                blk_mq_sched_free_rqs(q);
                                 blk_mq_exit_sched(q, eq);
                                 kobject_put(&eq->kobj);
                                 return ret;
@@ -509,10 +625,8 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
         }
         return 0;
 
-err_free_map_and_rqs:
-        blk_mq_sched_free_rqs(q);
+out:
         blk_mq_sched_tags_teardown(q, flags);
-
         kobject_put(&eq->kobj);
         q->elevator = NULL;
         return ret;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 1326526bb733..01c43192b98a 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -18,7 +18,18 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
 
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
+void __blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
+                struct elevator_tags *tags);
+void blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
+                struct elevator_tags **elv_t);
+int blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+                unsigned int nr_hw_queues,
+                struct elevator_tags ***tags);
+struct elevator_tags *__blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+                unsigned int nr_hw_queues, int id);
+
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+                struct elevator_tags *elv_t);
 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
 void blk_mq_sched_free_rqs(struct request_queue *q);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4806b867e37d..b8b616c79742 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4970,6 +4970,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
                                          int nr_hw_queues)
 {
         struct request_queue *q;
+        int count;
+        struct elevator_tags **elv_tags = NULL;
         int prev_nr_hw_queues = set->nr_hw_queues;
         unsigned int memflags;
         int i;
@@ -4984,6 +4986,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
                 return;
 
         memflags = memalloc_noio_save();
+
+        count = blk_mq_alloc_sched_tags(set, nr_hw_queues, &elv_tags);
+        if (count < 0)
+                goto out;
+
         list_for_each_entry(q, &set->tag_list, tag_set_list) {
                 blk_mq_debugfs_unregister_hctxs(q);
                 blk_mq_sysfs_unregister_hctxs(q);
@@ -4995,6 +5002,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
         if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
                 list_for_each_entry(q, &set->tag_list, tag_set_list)
                         blk_mq_unfreeze_queue_nomemrestore(q);
+
+                blk_mq_free_sched_tags(set, elv_tags);
+                elv_tags = NULL;
                 goto reregister;
         }
 
@@ -5019,7 +5029,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 
         /* elv_update_nr_hw_queues() unfreeze queue for us */
         list_for_each_entry(q, &set->tag_list, tag_set_list)
-                elv_update_nr_hw_queues(q);
+                elv_update_nr_hw_queues(q, elv_tags, count);
 
 reregister:
         list_for_each_entry(q, &set->tag_list, tag_set_list) {
@@ -5029,6 +5039,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
                 blk_mq_remove_hw_queues_cpuhp(q);
                 blk_mq_add_hw_queues_cpuhp(q);
         }
+        kfree(elv_tags);
+out:
         memalloc_noio_restore(memflags);
 
         /* Free the excess tags when nr_hw_queues shrink. */
diff --git a/block/blk.h b/block/blk.h
index 37ec459fe656..cd50305da84e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -10,6 +10,7 @@
 #include <linux/timekeeping.h>
 #include <xen/xen.h>
 #include "blk-crypto-internal.h"
+#include "elevator.h"
 
 struct elevator_type;
 
@@ -321,7 +322,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
 
 bool blk_insert_flush(struct request *rq);
 
-void elv_update_nr_hw_queues(struct request_queue *q);
+void elv_update_nr_hw_queues(struct request_queue *q,
+                struct elevator_tags **elv_tags, int count);
 void elevator_set_default(struct request_queue *q);
 void elevator_set_none(struct request_queue *q);
 
diff --git a/block/elevator.c b/block/elevator.c
index ab22542e6cf0..849dbe347cb2 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -45,17 +45,6 @@
 #include "blk-wbt.h"
 #include "blk-cgroup.h"
 
-/* Holding context data for changing elevator */
-struct elv_change_ctx {
-        const char *name;
-        bool no_uevent;
-
-        /* for unregistering old elevator */
-        struct elevator_queue *old;
-        /* for registering new elevator */
-        struct elevator_queue *new;
-};
-
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
 
@@ -132,7 +121,7 @@ static struct elevator_type *elevator_find_get(const char *name)
 
 static const struct kobj_type elv_ktype;
 
 struct elevator_queue *elevator_alloc(struct request_queue *q,
-                struct elevator_type *e)
+                struct elevator_type *e, struct elevator_tags *tags)
 {
         struct elevator_queue *eq;
 
@@ -142,6 +131,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
 
         __elevator_get(e);
         eq->type = e;
+        eq->tags = tags;
         kobject_init(&eq->kobj, &elv_ktype);
         mutex_init(&eq->sysfs_lock);
         hash_init(eq->hash);
@@ -166,13 +156,25 @@ static void elevator_exit(struct request_queue *q)
         lockdep_assert_held(&q->elevator_lock);
 
         ioc_clear_queue(q);
-        blk_mq_sched_free_rqs(q);
 
         mutex_lock(&e->sysfs_lock);
         blk_mq_exit_sched(q, e);
         mutex_unlock(&e->sysfs_lock);
 }
 
+static struct elevator_tags *elevator_tags_lookup(struct elevator_tags **tags,
+                struct request_queue *q, int count)
+{
+        int i;
+
+        for (i = 0; i < count; i++) {
+                if (tags[i]->id == q->id)
+                        return tags[i];
+        }
+
+        return NULL;
+}
+
 static inline void __elv_rqhash_del(struct request *rq)
 {
         hash_del(&rq->hash);
 }
@@ -570,7 +572,8 @@ EXPORT_SYMBOL_GPL(elv_unregister);
  * If switching fails, we are most likely running out of memory and not able
  * to restore the old io scheduler, so leaving the io scheduler being none.
  */
-static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
+static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx,
+                struct elevator_tags *tags)
 {
         struct elevator_type *new_e = NULL;
         int ret = 0;
@@ -592,7 +595,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
         }
 
         if (new_e) {
-                ret = blk_mq_init_sched(q, new_e);
+                ret = blk_mq_init_sched(q, new_e, tags);
                 if (ret)
                         goto out_unfreeze;
                 ctx->new = q->elevator;
@@ -627,8 +630,10 @@ static void elv_exit_and_release(struct request_queue *q)
         elevator_exit(q);
         mutex_unlock(&q->elevator_lock);
         blk_mq_unfreeze_queue(q, memflags);
-        if (e)
+        if (e) {
+                __blk_mq_free_sched_tags(q->tag_set, e->tags);
                 kobject_put(&e->kobj);
+        }
 }
 
 static int elevator_change_done(struct request_queue *q,
@@ -641,6 +646,7 @@ static int elevator_change_done(struct request_queue *q,
                                 &ctx->old->flags);
 
                 elv_unregister_queue(q, ctx->old);
+                __blk_mq_free_sched_tags(q->tag_set, ctx->old->tags);
                 kobject_put(&ctx->old->kobj);
                 if (enable_wbt)
                         wbt_enable_default(q->disk);
@@ -659,9 +665,17 @@ static int elevator_change_done(struct request_queue *q,
 static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
 {
         unsigned int memflags;
+        struct elevator_tags *tags = NULL;
+        struct blk_mq_tag_set *set = q->tag_set;
         int ret = 0;
 
-        lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
+        lockdep_assert_held(&set->update_nr_hwq_lock);
+
+        if (strncmp(ctx->name, "none", 4)) {
+                tags = __blk_mq_alloc_sched_tags(set, set->nr_hw_queues, q->id);
+                if (!tags)
+                        return -ENOMEM;
+        }
 
         memflags = blk_mq_freeze_queue(q);
         /*
@@ -676,11 +690,16 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
         blk_mq_cancel_work_sync(q);
         mutex_lock(&q->elevator_lock);
         if (!(q->elevator && elevator_match(q->elevator->type, ctx->name)))
-                ret = elevator_switch(q, ctx);
+                ret = elevator_switch(q, ctx, tags);
         mutex_unlock(&q->elevator_lock);
         blk_mq_unfreeze_queue(q, memflags);
         if (!ret)
                 ret = elevator_change_done(q, ctx);
+        /*
+         * Free sched tags if it's allocated but we couldn't switch elevator.
+         */
+        if (tags && !ctx->new)
+                __blk_mq_free_sched_tags(set, tags);
 
         return ret;
 }
@@ -689,24 +708,47 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
  * The I/O scheduler depends on the number of hardware queues, this forces a
  * reattachment when nr_hw_queues changes.
  */
-void elv_update_nr_hw_queues(struct request_queue *q)
+void elv_update_nr_hw_queues(struct request_queue *q,
+                struct elevator_tags **elv_tags, int count)
 {
+        struct elevator_tags *tags = NULL;
+        struct blk_mq_tag_set *set = q->tag_set;
         struct elv_change_ctx ctx = {};
         int ret = -ENODEV;
 
         WARN_ON_ONCE(q->mq_freeze_depth == 0);
 
+        /*
+         * Accessing q->elevator without holding q->elevator_lock is safe here
+         * because nr_hw_queue update is protected by set->update_nr_hwq_lock
+         * in the writer context. So, scheduler update/switch code (which
+         * acquires same lock in the reader context) can't run concurrently.
+         */
+        if (q->elevator) {
+                tags = elevator_tags_lookup(elv_tags, q, count);
+                if (!tags) {
+                        WARN_ON(1);
+                        goto out_unfreeze;
+                }
+        }
+
         mutex_lock(&q->elevator_lock);
         if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
                 ctx.name = q->elevator->type->elevator_name;
 
                 /* force to reattach elevator after nr_hw_queue is updated */
-                ret = elevator_switch(q, &ctx);
+                ret = elevator_switch(q, &ctx, tags);
         }
         mutex_unlock(&q->elevator_lock);
+out_unfreeze:
         blk_mq_unfreeze_queue_nomemrestore(q);
         if (!ret)
                 WARN_ON_ONCE(elevator_change_done(q, &ctx));
+        /*
+         * Free sched tags if it's allocated but we couldn't switch elevator.
+         */
+        if (tags && !ctx.new)
+                __blk_mq_free_sched_tags(set, tags);
 }
 
 /*
diff --git a/block/elevator.h b/block/elevator.h
index a4de5f9ad790..0b92121005cf 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -23,6 +23,17 @@ enum elv_merge {
 struct blk_mq_alloc_data;
 struct blk_mq_hw_ctx;
 
+/* Holding context data for changing elevator */
+struct elv_change_ctx {
+        const char *name;
+        bool no_uevent;
+
+        /* for unregistering old elevator */
+        struct elevator_queue *old;
+        /* for registering new elevator */
+        struct elevator_queue *new;
+};
+
 struct elevator_mq_ops {
         int (*init_sched)(struct request_queue *, struct elevator_queue *);
         void (*exit_sched)(struct elevator_queue *);
@@ -107,12 +118,27 @@ void elv_rqhash_add(struct request_queue *q, struct request *rq);
 void elv_rqhash_reposition(struct request_queue *q, struct request *rq);
 struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
 
+struct elevator_tags {
+        /* num. of hardware queues for which tags are allocated */
+        unsigned int nr_hw_queues;
+        /* depth used while allocating tags */
+        unsigned int nr_requests;
+        /* request queue id (q->id) used during elevator_tags_lookup() */
+        int id;
+        union {
+                /* tags shared across all queues */
+                struct blk_mq_tags *shared_tags;
+                /* array of blk_mq_tags pointers, one per hardware queue. */
+                struct blk_mq_tags **tags;
+        } u;
+};
 /*
  * each queue has an elevator_queue associated with it
  */
 struct elevator_queue
 {
         struct elevator_type *type;
+        struct elevator_tags *tags;
         void *elevator_data;
         struct kobject kobj;
         struct mutex sysfs_lock;
@@ -153,7 +179,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);
 
 extern bool elv_bio_merge_ok(struct request *, struct bio *);
 extern struct elevator_queue *elevator_alloc(struct request_queue *,
-                struct elevator_type *);
+                struct elevator_type *, struct elevator_tags *);
 
 /*
  * Helper functions.
-- 
2.49.0