On 10/09/2025 22:32, Bart Van Assche wrote:
The SCSI core uses the budget map to enforce the cmd_per_lun limit.
That's not strictly true. As I mentioned in
https://lore.kernel.org/linux-scsi/e7708546-c001-4f31-b895-69720755c3ac@xxxxxxx/T/#m16d3bf6266faefee60addb48ae4b5cdd65e90a68
cmd_per_lun may be completely ignored by an LLD that sets its own sdev
queue depth.
That limit cannot be exceeded if host->cmd_per_lun >= host->can_queue
Can host->cmd_per_lun > host->can_queue ever be true?
and if the host tag set is shared across all hardware queues.
Sure, but what about the single HW queue scenario? We should also enforce
host->cmd_per_lun <= host->can_queue && sdev->max_queue_depth <=
host->can_queue for that, right?
Most/all single HW queue SCSI LLDs do not set .host_tagset (even though
they could).
Since scsi_mq_get_budget() shows up in all CPU profiles for fast SCSI
devices, do not allocate a budget map if cmd_per_lun >= can_queue and
if the host tag set is shared across all hardware queues.
On my UFS 4 test setup this patch improves IOPS by 1% and reduces the
time spent in scsi_mq_get_budget() from 0.22% to 0.01%.
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxxxxxxxxx>
Cc: Ming Lei <ming.lei@xxxxxxxxxx>
Cc: John Garry <john.g.garry@xxxxxxxxxx>
Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx>
---
drivers/scsi/scsi.c | 7 ++++-
drivers/scsi/scsi_lib.c | 60 +++++++++++++++++++++++++++++++++-----
drivers/scsi/scsi_scan.c | 11 ++++++-
include/scsi/scsi_device.h | 5 +---
4 files changed, 70 insertions(+), 13 deletions(-)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 9a0f467264b3..06066b694d8a 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -216,6 +216,8 @@ int scsi_device_max_queue_depth(struct scsi_device *sdev)
*/
int scsi_change_queue_depth(struct scsi_device *sdev, int depth)
{
+ struct Scsi_Host *shost = sdev->host;
+
depth = min_t(int, depth, scsi_device_max_queue_depth(sdev));
if (depth > 0) {
@@ -226,7 +228,10 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth)
if (sdev->request_queue)
blk_set_queue_depth(sdev->request_queue, depth);
- sbitmap_resize(&sdev->budget_map, sdev->queue_depth);
+ if (shost->host_tagset && depth >= shost->can_queue)
+ sbitmap_free(&sdev->budget_map);
eh, what happens if we call this twice?
+ else
+ sbitmap_resize(&sdev->budget_map, sdev->queue_depth);
What if we set queue_depth = shost->can_queue (and free the budget map)
and then later set it lower than shost->can_queue (and try to reference
the budget map)?
return sdev->queue_depth;
}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 0c65ecfedfbd..c546514d1049 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -396,7 +396,8 @@ void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd)
if (starget->can_queue > 0)
atomic_dec(&starget->target_busy);
- sbitmap_put(&sdev->budget_map, cmd->budget_token);
+ if (sdev->budget_map.map)
+ sbitmap_put(&sdev->budget_map, cmd->budget_token);
cmd->budget_token = -1;
}
@@ -445,6 +446,47 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev)
spin_unlock_irqrestore(shost->host_lock, flags);
}
+struct sdev_in_flight_data {
+ const struct scsi_device *sdev;
+ int count;
+};
+
+static bool scsi_device_check_in_flight(struct request *rq, void *data)
So, unlike scsi_host_check_in_flight(), this does not check the cmd
state, yet it uses the same naming pattern (scsi_xxx_check_in_flight).
+{
+ struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
+ struct sdev_in_flight_data *sifd = data;
+
+ if (cmd->device == sifd->sdev)
+ sifd->count++;
+
+ return true;
+}
+
+/**
+ * scsi_device_busy() - Number of commands allocated for a SCSI device
+ * @sdev: SCSI device.
+ *
+ * Note: There is a subtle difference between this function and
+ * scsi_host_busy(). scsi_host_busy() counts the number of commands that have
+ * been started. This function counts the number of commands that have been
+ * allocated. At least the UFS driver depends on this function counting commands
+ * that have already been allocated but that have not yet been started.
+ */
+int scsi_device_busy(const struct scsi_device *sdev)
+{
+ struct sdev_in_flight_data sifd = { .sdev = sdev };
+ struct blk_mq_tag_set *set = &sdev->host->tag_set;
+
+ if (sdev->budget_map.map)
I really dislike these checks
+ return sbitmap_weight(&sdev->budget_map);
+ if (WARN_ON_ONCE(!set->shared_tags))
+ return 0;
+ blk_mq_all_tag_iter(set->shared_tags, scsi_device_check_in_flight,
+ &sifd);
+ return sifd.count;
+}
+EXPORT_SYMBOL(scsi_device_busy);
+
static inline bool scsi_device_is_busy(struct scsi_device *sdev)
{
if (scsi_device_busy(sdev) >= sdev->queue_depth)
@@ -1358,11 +1400,13 @@ scsi_device_state_check(struct scsi_device *sdev, struct request *req)
static inline int scsi_dev_queue_ready(struct request_queue *q,
struct scsi_device *sdev)
{
- int token;
+ int token = INT_MAX;
- token = sbitmap_get(&sdev->budget_map);
- if (token < 0)
- return -1;
+ if (sdev->budget_map.map) {
this can race with a call to scsi_change_queue_depth() (which may free
sdev->budget_map.map), right?
scsi_change_queue_depth() does not seem to do any queue freezing.
+ token = sbitmap_get(&sdev->budget_map);
+ if (token < 0)
+ return -1;
+ }
thanks,
John