On 6/30/25 07:21, Nilay Shroff wrote:
Recent lockdep reports [1] have revealed a potential deadlock caused by a
lock dependency between the percpu allocator lock and the elevator lock.
This issue can be avoided by ensuring that the allocation and release of
scheduler tags (sched_tags) are performed outside the elevator lock.
Furthermore, the queue does not need to remain frozen during these
operations.
To address this, move all sched_tags allocations and deallocations outside
of both the ->elevator_lock and the ->freeze_lock. Since the lifetimes of
the elevator queue and its associated sched_tags are closely tied, the
allocated sched_tags are now stored in the elevator queue structure. Then,
during the actual elevator switch (which runs under ->freeze_lock and
->elevator_lock), the pre-allocated sched_tags are assigned to the
appropriate q->hctx. Once the elevator switch is complete and the locks
are released, the old elevator queue and its associated sched_tags are
freed.
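In rough terms, the reworked switch path looks like the sketch below. This is
only an illustration of the ordering described above, not the literal patch
code: the function name elevator_switch_sketch() and the elided lock/freeze
steps are hypothetical, while blk_mq_alloc_sched_tags() and
blk_mq_free_sched_tags() are the helpers introduced further down.

	/* Sketch only: illustrates the ordering described above. */
	static int elevator_switch_sketch(struct request_queue *q,
					  struct elevator_type *new_e)
	{
		struct elevator_tags *et;

		/* Step 1: allocate sched_tags with neither lock held. */
		et = blk_mq_alloc_sched_tags(q->tag_set, q->nr_hw_queues);
		if (!et)
			return -ENOMEM;

		/*
		 * Step 2: under ->freeze_lock and ->elevator_lock, do the
		 * switch itself; the critical section only assigns the
		 * pre-allocated et->tags[i] to each hctx->sched_tags and
		 * never allocates or frees memory.
		 */

		/*
		 * Step 3: after both locks are dropped, the old elevator
		 * queue and its sched_tags are freed via
		 * blk_mq_free_sched_tags().
		 */
		return 0;
	}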
This commit specifically addresses the allocation/deallocation of sched_tags
during elevator switching. Note that sched_tags may also be allocated
in other contexts, such as during nr_hw_queues updates. Supporting that
use case will require batch allocation/deallocation, which will be handled
in a follow-up patch.
This restructuring ensures that sched_tags memory management occurs
entirely outside of the ->elevator_lock and ->freeze_lock context,
eliminating the lock dependency problem seen during scheduler updates.
[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@xxxxxxxxxxxxx/
Reported-by: Stefan Haberland <sth@xxxxxxxxxxxxx>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@xxxxxxxxxxxxx/
Signed-off-by: Nilay Shroff <nilay@xxxxxxxxxxxxx>
---
block/blk-mq-sched.c | 155 +++++++++++++++++++++++--------------------
block/blk-mq-sched.h | 8 ++-
block/elevator.c | 47 +++++++++++--
block/elevator.h | 14 +++-
4 files changed, 144 insertions(+), 80 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 359e0704e09b..2d6d1ebdd8fb 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
-static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
-{
- if (blk_mq_is_shared_tags(q->tag_set->flags)) {
- hctx->sched_tags = q->sched_shared_tags;
- return 0;
- }
-
- hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
- q->nr_requests);
-
- if (!hctx->sched_tags)
- return -ENOMEM;
- return 0;
-}
-
-static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
-{
- blk_mq_free_rq_map(queue->sched_shared_tags);
- queue->sched_shared_tags = NULL;
-}
-
/* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
struct blk_mq_hw_ctx *hctx;
unsigned long i;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (hctx->sched_tags) {
- if (!blk_mq_is_shared_tags(flags))
- blk_mq_free_rq_map(hctx->sched_tags);
- hctx->sched_tags = NULL;
- }
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ hctx->sched_tags = NULL;
if (blk_mq_is_shared_tags(flags))
- blk_mq_exit_sched_shared_tags(q);
-}
-
-static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
-{
- struct blk_mq_tag_set *set = queue->tag_set;
-
- /*
- * Set initial depth at max so that we don't need to reallocate for
- * updating nr_requests.
- */
- queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
- BLK_MQ_NO_HCTX_IDX,
- MAX_SCHED_RQ);
- if (!queue->sched_shared_tags)
- return -ENOMEM;
-
- blk_mq_tag_update_sched_shared_tags(queue);
-
- return 0;
+ q->sched_shared_tags = NULL;
}
void blk_mq_sched_reg_debugfs(struct request_queue *q)
@@ -458,8 +411,75 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
mutex_unlock(&q->debugfs_mutex);
}
+void blk_mq_free_sched_tags(struct elevator_tags *et,
+ struct blk_mq_tag_set *set)
+{
+ unsigned long i;
+
+ /* Shared tags are stored at index 0 in @tags. */
+ if (blk_mq_is_shared_tags(set->flags))
+ blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX);
+ else {
+ for (i = 0; i < et->nr_hw_queues; i++)
+ blk_mq_free_map_and_rqs(set, et->tags[i], i);
+ }
+
+ kfree(et);
+}
+
+struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+ unsigned int nr_hw_queues)
+{
+ unsigned int nr_tags;
+ int i;
+ struct elevator_tags *et;
+ gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+ if (blk_mq_is_shared_tags(set->flags))
+ nr_tags = 1;
+ else
+ nr_tags = nr_hw_queues;
+
+ et = kmalloc(sizeof(struct elevator_tags) +
+ nr_tags * sizeof(struct blk_mq_tags *), gfp);
+ if (!et)
+ return NULL;
+ /*
+ * Default to double of smaller one between hw queue_depth and
+ * 128, since we don't split into sync/async like the old code
+ * did. Additionally, this is a per-hw queue depth.
+ */
+ et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
+ BLKDEV_DEFAULT_RQ);
+ et->nr_hw_queues = nr_hw_queues;
+
+ if (blk_mq_is_shared_tags(set->flags)) {
+ /* Shared tags are stored at index 0 in @tags. */
+ et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
+ MAX_SCHED_RQ);
+ if (!et->tags[0])
+ goto out;
+ } else {
+ for (i = 0; i < et->nr_hw_queues; i++) {
+ et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
+ et->nr_requests);
+ if (!et->tags[i])
+ goto out_unwind;
+ }
+ }
+
+ return et;
+out_unwind:
+ while (--i >= 0)
+ blk_mq_free_map_and_rqs(set, et->tags[i], i);
+out:
+ kfree(et);
+ return NULL;
+}
+
As smatch stated, the unwind pattern is a bit odd.
Maybe move the unwind into the 'else' branch, and use a conditional
to invoke it:
	if (i < et->nr_hw_queues)
		while (--i >= 0)
			blk_mq_free_map_and_rqs(set, et->tags[i], i);
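i.e. something along these lines (just a sketch of the suggested
restructuring, reusing the helpers from the patch; with the unwind living
inside the loop the conditional may even become unnecessary):

	} else {
		for (i = 0; i < et->nr_hw_queues; i++) {
			et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
							et->nr_requests);
			if (!et->tags[i]) {
				/* Unwind only what this loop allocated. */
				while (--i >= 0)
					blk_mq_free_map_and_rqs(set,
							et->tags[i], i);
				goto out;
			}
		}
	}

	return et;
out:
	kfree(et);
	return NULL;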
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@xxxxxxx +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich