Re: [PATCH V2 07/20] block: prevent adding/deleting disk during updating nr_hw_queues

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 4/22/25 2:59 PM, Ming Lei wrote:
> On Tue, Apr 22, 2025 at 09:04:32AM +0200, Christoph Hellwig wrote:
>> On Mon, Apr 21, 2025 at 10:39:24AM +0800, Ming Lei wrote:
>>>> In my view, RCU is optimized for read-heavy workloads with:
>>>> - Non-blocking readers
>>>
>>> srcu allows blocking reader
>>
>> It does.  But it's certainly not optimized for long blocking readers.
>>
>>> Basically I agree with you that rwsem(instead of rwlock) should match with
>>> this case in theory, but I feel that rwsem is stronger than srcu from lock
>>> viewpoint, and we will add new dependency if rwsem is held inside
>>> ->store(), such as the following splat.
>>
>> How does manually implementing a reader/write lock using SRCU avoid
>> that dependency vs just hiding it?
>>
>> I'd rather sort this out as a rwsem is very natural her as Nilay pointed
>> out, and also avoids the whole giv up and retry pattern.
>  
> Thinking of further, the warning triggered from rwsem is false positive,
> because elevator switch can't happen until disk is added, and the splat
> can be avoided by switching to down_read_nested() in elevator_change(),
> which need to be nested too.
> 
> So looks fine to use rwsem.
> 
Instead of using down_read_nested() why can't we move the reader-lock 
inside elv_iosched_store() ?

With your patchset, now elevator_change() is called from three places:
- elevator_set_default
- elevator_set_none
- elv_iosched_store

As both elevator_set_default and elevator_set_none should run holding
reader-lock, I think we should remove reader-lock from  elevator_change and 
move it inside elv_iosched_store.

In fact, I just quickly prototyped rwsem replacing srcu on top of your 
changes and ran blktests. The rwsem changes sustain blktests and I didn't
encounter any lockdep splat. For reference, attached my (quick) prototype
changes.

Thanks,
--Nilay
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5c9f7391af92..bbe3dd40f1e8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4773,18 +4773,14 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 			goto out_free_srcu;
 	}
 
-	mutex_init(&set->update_nr_hwq_lock);
-	init_waitqueue_head(&set->update_nr_hwq_wq);
-	ret = init_srcu_struct(&set->update_nr_hwq_srcu);
-	if (ret)
-		goto out_cleanup_srcu;
+	init_rwsem(&set->update_nr_hwq_lock);
 
 	ret = -ENOMEM;
 	set->tags = kcalloc_node(set->nr_hw_queues,
 				 sizeof(struct blk_mq_tags *), GFP_KERNEL,
 				 set->numa_node);
 	if (!set->tags)
-		goto out_cleanup_hwq_srcu;
+		goto out_cleanup_srcu;
 
 	for (i = 0; i < set->nr_maps; i++) {
 		set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
@@ -4813,8 +4809,6 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	}
 	kfree(set->tags);
 	set->tags = NULL;
-out_cleanup_hwq_srcu:
-	cleanup_srcu_struct(&set->update_nr_hwq_srcu);
 out_cleanup_srcu:
 	if (set->flags & BLK_MQ_F_BLOCKING)
 		cleanup_srcu_struct(set->srcu);
@@ -5007,21 +5001,16 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 {
-	mutex_lock(&set->update_nr_hwq_lock);
+	down_write(&set->update_nr_hwq_lock);
 	/*
 	 * Mark us in updating nr_hw_queues for preventing reader of
 	 * nr_hw_queues, such as adding/deleting disk.
 	 */
-	set->updating_nr_hwq = true;
-	synchronize_srcu(&set->update_nr_hwq_srcu);
-
 	mutex_lock(&set->tag_list_lock);
 	__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
 	mutex_unlock(&set->tag_list_lock);
 
-	set->updating_nr_hwq = false;
-	wake_up_all(&set->update_nr_hwq_wq);
-	mutex_unlock(&set->update_nr_hwq_lock);
+	up_write(&set->update_nr_hwq_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
diff --git a/block/elevator.c b/block/elevator.c
index 378553fce5d8..69193d04fffa 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -743,15 +743,8 @@ int __elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
 static int elevator_change(struct request_queue *q,
 			   struct elv_change_ctx *ctx)
 {
-	struct blk_mq_tag_set *set = q->tag_set;
 	unsigned int memflags;
-	int ret, idx;
-
-	idx = srcu_read_lock(&set->update_nr_hwq_srcu);
-	if (set->updating_nr_hwq) {
-		ret = -EBUSY;
-		goto exit;
-	}
+	int ret;
 
 	memflags = blk_mq_freeze_queue(q);
 	/*
@@ -771,8 +764,6 @@ static int elevator_change(struct request_queue *q,
 	if (!ret)
 		ret = elevator_change_done(q, ctx);
 
-exit:
-	srcu_read_unlock(&set->update_nr_hwq_srcu, idx);
 	return ret;
 }
 
@@ -792,6 +783,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
 			  size_t count)
 {
 	struct request_queue *q = disk->queue;
+	struct blk_mq_tag_set *set = q->tag_set;
 	char elevator_name[ELV_NAME_MAX];
 	struct elv_change_ctx ctx = {
 		.uevent = true,
@@ -808,9 +800,12 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
 
 	elv_iosched_load_module(ctx.name);
 
+	down_read(&set->update_nr_hwq_lock);
 	ret = elevator_change(q, &ctx);
 	if (!ret)
 		ret = count;
+	up_read(&set->update_nr_hwq_lock);
+
 	return ret;
 }
 
diff --git a/block/genhd.c b/block/genhd.c
index de227aa923ed..359a55bf47ce 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -407,19 +407,12 @@ static int retry_on_updating_nr_hwq(struct gendisk_data *data,
 
 	set = disk->queue->tag_set;
 	do {
-		int idx, ret;
+		int ret;
 
-		idx = srcu_read_lock(&set->update_nr_hwq_srcu);
-		if (set->updating_nr_hwq) {
-			srcu_read_unlock(&set->update_nr_hwq_srcu, idx);
-			goto wait;
-		}
+		down_read(&set->update_nr_hwq_lock);
 		ret = cb(data);
-		srcu_read_unlock(&set->update_nr_hwq_srcu, idx);
+		up_read(&set->update_nr_hwq_lock);
 		return ret;
- wait:
-		wait_event_interruptible(set->update_nr_hwq_wq,
-				!set->updating_nr_hwq);
 	} while (true);
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index afe76dcfaa3c..ef84d53095a6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -9,6 +9,7 @@
 #include <linux/prefetch.h>
 #include <linux/srcu.h>
 #include <linux/rw_hint.h>
+#include <linux/rwsem.h>
 
 struct blk_mq_tags;
 struct blk_flush_queue;
@@ -528,10 +529,7 @@ struct blk_mq_tag_set {
 	struct list_head	tag_list;
 	struct srcu_struct	*srcu;
 
-	bool			updating_nr_hwq;
-	struct mutex		update_nr_hwq_lock;
-	struct srcu_struct	update_nr_hwq_srcu;
-	wait_queue_head_t	update_nr_hwq_wq;
+	struct rw_semaphore	update_nr_hwq_lock;
 };
 
 /**

[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux