Hi,
On 2025/08/05 16:33, Yu Kuai wrote:
Hi,
On 2025/08/04 19:32, Ming Lei wrote:
On Mon, Aug 04, 2025 at 02:30:43PM +0800, Yu Kuai wrote:
Hi,
On 2025/08/01 19:44, Ming Lei wrote:
Replace the spinlock in blk_mq_find_and_get_req() with an SRCU read lock around the tag iterators.
This is done by:
- Holding the SRCU read lock in blk_mq_queue_tag_busy_iter(),
blk_mq_tagset_busy_iter(), and blk_mq_hctx_has_requests().
- Removing the now-redundant tags->lock from blk_mq_find_and_get_req().
This change improves performance by replacing a spinlock with a more scalable SRCU lock, and it fixes a lockup issue in scsi_host_busy() in case of shost->host_blocked.
Meanwhile, it becomes possible to use blk_mq_in_driver_rw() for IO accounting.
Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
---
block/blk-mq-tag.c | 12 ++++++++----
block/blk-mq.c     | 24 ++++--------------------
2 files changed, 12 insertions(+), 24 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 6c2f5881e0de..7ae431077a32 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -256,13 +256,10 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
		unsigned int bitnr)
{
struct request *rq;
- unsigned long flags;
- spin_lock_irqsave(&tags->lock, flags);
rq = tags->rqs[bitnr];
if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
rq = NULL;
- spin_unlock_irqrestore(&tags->lock, flags);
return rq;
}
Just wondering, is the lockup problem due to tags->lock contention from concurrent scsi_host_busy() calls?
Yes.
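To spell out the pairing that makes dropping tags->lock safe here, a minimal sketch (not the actual patch; the helper names are made up, and it assumes the tagset->tags_srcu instance added earlier in this series, with the freeing side calling synchronize_srcu() before the tag maps go away):

#include <linux/blk-mq.h>
#include <linux/srcu.h>

/* Reader side: every tag walk runs inside a tagset-wide SRCU read section. */
static void tag_walk_reader(struct blk_mq_tag_set *set)
{
	int idx = srcu_read_lock(&set->tags_srcu);

	/* ... iterate tags; each lookup relies on req_ref_inc_not_zero() ... */

	srcu_read_unlock(&set->tags_srcu, idx);
}

/* Updater side: wait out all walkers before tags->rqs[] can be freed. */
static void tag_map_teardown(struct blk_mq_tag_set *set)
{
	/* ... clear the stale tags->rqs[] entries ... */
	synchronize_srcu(&set->tags_srcu);
	/* ... no walker can still see the old pointers; safe to free ... */
}

With that in place, concurrent scsi_host_busy() callers only enter an SRCU read section instead of serializing on a single per-tagset spinlock for every tag they inspect.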
@@ -440,7 +437,9 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv)
{
unsigned int flags = tagset->flags;
- int i, nr_tags;
+ int i, nr_tags, srcu_idx;
+
+ srcu_idx = srcu_read_lock(&tagset->tags_srcu);
nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
@@ -449,6 +448,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
BT_TAG_ITER_STARTED);
}
+ srcu_read_unlock(&tagset->tags_srcu, srcu_idx);
And should we add cond_resched() after finishing the iteration of each tags? Even with the SRCU change, it looks like a lockup is still possible with many CPU cores and a deep queue depth.
The main trouble is from the big tags->lock.
IMO it isn't needed, because the max queue depth is just 10K, which is already much bigger than any actual queue depth. We can add it when someone shows it is really needed.
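Just to make the cond_resched() suggestion concrete, a hypothetical fragment of the loop in blk_mq_tagset_busy_iter() (not part of either patch):

	for (i = 0; i < nr_tags; i++) {
		if (tagset->tags && tagset->tags[i])
			__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
					      BT_TAG_ITER_STARTED);
		/*
		 * Hypothetical preemption point between per-hctx walks.
		 * Sleeping here is fine under the SRCU read lock taken
		 * above (SRCU readers may sleep), but it would not be
		 * legal inside a plain rcu_read_lock() section.
		 */
		cond_resched();
	}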
If we don't want this, why do we need SRCU here at all? It looks like just using rcu_read_lock() and rcu_read_unlock() to protect blk_mq_find_and_get_req() would be enough.
Like the following patch:
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index d880c50629d6..e2381ee9747d 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -255,11 +255,11 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
struct request *rq;
unsigned long flags;
- spin_lock_irqsave(&tags->lock, flags);
+ rcu_read_lock();
rq = tags->rqs[bitnr];
if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
rq = NULL;
- spin_unlock_irqrestore(&tags->lock, flags);
+ rcu_read_unlock();
return rq;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b1d81839679f..a70959cad692 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3442,12 +3442,8 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
/*
* Wait until all pending iteration is done.
- *
- * Request reference is cleared and it is guaranteed to be observed
- * after the ->lock is released.
*/
- spin_lock_irqsave(&drv_tags->lock, flags);
- spin_unlock_irqrestore(&drv_tags->lock, flags);
+ synchronize_rcu();
}
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
@@ -3912,12 +3908,8 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
/*
* Wait until all pending iteration is done.
- *
- * Request reference is cleared and it is guaranteed to be observed
- * after the ->lock is released.
*/
- spin_lock_irqsave(&tags->lock, flags);
- spin_unlock_irqrestore(&tags->lock, flags);
+ synchronize_rcu();
}
/* hctx->ctxs will be freed in queue's release handler */
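To spell out the ordering this alternative relies on, a minimal model (illustration only, with a made-up helper name; the real clearing happens in blk_mq_clear_rq_mapping() and blk_mq_clear_flush_rq_mapping()):

/*
 * The rqs[] pointers to the dying requests are cleared first,
 * synchronize_rcu() then waits out every walker that may have loaded
 * one of the old pointers under rcu_read_lock(), and only after that
 * may the caller free the request pages.
 */
static void clear_mapping_and_wait(struct blk_mq_tags *drv_tags)
{
	/* ... set the matching drv_tags->rqs[] slots to NULL ... */

	synchronize_rcu();	/* all rcu_read_lock() walkers are done */

	/* ... caller can now free the requests backing those slots ... */
}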
Thanks,
Kuai
Thanks,
Ming