On Sat, Apr 12, 2025 at 3:39 PM Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote:
>
> From: Yu Kuai <yukuai3@xxxxxxxxxx>
>
> Currently, if the sync speed is above speed_min and below speed_max,
> md_do_sync() will wait for all sync IOs to be done before issuing new
> sync IO, which means the sync IO depth is limited to just 1.
>
> This limit is too low. To prevent the sync speed from dropping
> conspicuously after is_mddev_idle() is fixed in the next patch, add a
> new API for limiting sync IO depth; the default value is 32.
>
> Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
> ---
>  drivers/md/md.c | 103 +++++++++++++++++++++++++++++++++++++++---------
>  drivers/md/md.h |   1 +
>  2 files changed, 85 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 438e71e45c16..8966c4afc62a 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -111,32 +111,42 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
>  /* Default safemode delay: 200 msec */
>  #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
>  /*
> - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
> - * is 1000 KB/sec, so the extra system load does not show up that much.
> - * Increase it if you want to have more _guaranteed_ speed. Note that
> - * the RAID driver will use the maximum available bandwidth if the IO
> - * subsystem is idle. There is also an 'absolute maximum' reconstruction
> - * speed limit - in case reconstruction slows down your system despite
> - * idle IO detection.

These comments are useful: they describe the meaning of those control
values. Would it be better to keep them?

> + * Background sync IO speed control:
>   *
> - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
> - * or /sys/block/mdX/md/sync_speed_{min,max}
> + * - below speed min:
> + *   no limit;
> + * - above speed min and below speed max:
> + *   a) if mddev is idle, then no limit;
> + *   b) if mddev is busy handling normal IO, then limit inflight sync IO
> + *   to sync_io_depth;
> + * - above speed max:
> + *   sync IO can't be issued;
> + *
> + * Following configurations can be changed via /proc/sys/dev/raid/ for system
> + * or /sys/block/mdX/md/ for one array.
>   */
> -
>  static int sysctl_speed_limit_min = 1000;
>  static int sysctl_speed_limit_max = 200000;
> -static inline int speed_min(struct mddev *mddev)
> +static int sysctl_sync_io_depth = 32;
> +
> +static int speed_min(struct mddev *mddev)
>  {
>          return mddev->sync_speed_min ?
>                  mddev->sync_speed_min : sysctl_speed_limit_min;
>  }
>
> -static inline int speed_max(struct mddev *mddev)
> +static int speed_max(struct mddev *mddev)
>  {
>          return mddev->sync_speed_max ?
>                  mddev->sync_speed_max : sysctl_speed_limit_max;
>  }
>
> +static int sync_io_depth(struct mddev *mddev)
> +{
> +        return mddev->sync_io_depth ?
> +                mddev->sync_io_depth : sysctl_sync_io_depth;
> +}
> +
>  static void rdev_uninit_serial(struct md_rdev *rdev)
>  {
>          if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
> @@ -293,14 +303,21 @@ static const struct ctl_table raid_table[] = {
>                  .procname = "speed_limit_min",
>                  .data = &sysctl_speed_limit_min,
>                  .maxlen = sizeof(int),
> -                .mode = S_IRUGO|S_IWUSR,
> +                .mode = 0644,

Is it better to use a macro rather than a number directly here?
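For reference, a minimal side-by-side (my own illustration, not part of
the patch); both spellings denote the same mode bits:

        .mode = S_IRUGO | S_IWUSR,      /* symbolic: 0444 | 0200 == 0644 */
        .mode = 0644,                   /* plain octal, same value */

If I remember correctly, checkpatch nowadays warns about symbolic
permissions and suggests the octal form, so the change may simply be
following that guidance.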
>                  .proc_handler = proc_dointvec,
>          },
>          {
>                  .procname = "speed_limit_max",
>                  .data = &sysctl_speed_limit_max,
>                  .maxlen = sizeof(int),
> -                .mode = S_IRUGO|S_IWUSR,
> +                .mode = 0644,
> +                .proc_handler = proc_dointvec,
> +        },
> +        {
> +                .procname = "sync_io_depth",
> +                .data = &sysctl_sync_io_depth,
> +                .maxlen = sizeof(int),
> +                .mode = 0644,
>                  .proc_handler = proc_dointvec,
>          },
>  };
> @@ -5091,7 +5108,7 @@ static ssize_t
>  sync_min_show(struct mddev *mddev, char *page)
>  {
>          return sprintf(page, "%d (%s)\n", speed_min(mddev),
> -                        mddev->sync_speed_min ? "local": "system");
> +                        mddev->sync_speed_min ? "local" : "system");
>  }
>
>  static ssize_t
> @@ -5100,7 +5117,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len)
>          unsigned int min;
>          int rv;
>
> -        if (strncmp(buf, "system", 6)==0) {
> +        if (strncmp(buf, "system", 6) == 0) {
>                  min = 0;
>          } else {
>                  rv = kstrtouint(buf, 10, &min);
> @@ -5120,7 +5137,7 @@ static ssize_t
>  sync_max_show(struct mddev *mddev, char *page)
>  {
>          return sprintf(page, "%d (%s)\n", speed_max(mddev),
> -                        mddev->sync_speed_max ? "local": "system");
> +                        mddev->sync_speed_max ? "local" : "system");
>  }
>
>  static ssize_t
> @@ -5129,7 +5146,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
>          unsigned int max;
>          int rv;
>
> -        if (strncmp(buf, "system", 6)==0) {
> +        if (strncmp(buf, "system", 6) == 0) {
>                  max = 0;
>          } else {
>                  rv = kstrtouint(buf, 10, &max);
> @@ -5145,6 +5162,35 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
>  static struct md_sysfs_entry md_sync_max =
>  __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
>
> +static ssize_t
> +sync_io_depth_show(struct mddev *mddev, char *page)
> +{
> +        return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
> +                       mddev->sync_io_depth ? "local" : "system");
> +}
> +
> +static ssize_t
> +sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
> +{
> +        unsigned int max;
> +        int rv;
> +
> +        if (strncmp(buf, "system", 6) == 0) {
> +                max = 0;
> +        } else {
> +                rv = kstrtouint(buf, 10, &max);
> +                if (rv < 0)
> +                        return rv;
> +                if (max == 0)
> +                        return -EINVAL;
> +        }
> +        mddev->sync_io_depth = max;
> +        return len;
> +}
> +
> +static struct md_sysfs_entry md_sync_io_depth =
> +__ATTR_RW(sync_io_depth);
> +
>  static ssize_t
>  degraded_show(struct mddev *mddev, char *page)
>  {
> @@ -5671,6 +5717,7 @@ static struct attribute *md_redundancy_attrs[] = {
>          &md_mismatches.attr,
>          &md_sync_min.attr,
>          &md_sync_max.attr,
> +        &md_sync_io_depth.attr,
>          &md_sync_speed.attr,
>          &md_sync_force_parallel.attr,
>          &md_sync_completed.attr,
> @@ -8927,6 +8974,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
>          }
>  }
>
> +static bool sync_io_within_limit(struct mddev *mddev)
> +{
> +        int io_sectors;
> +
> +        /*
> +         * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
> +         * RESYNC_PAGES(64k) per IO.
> +         */
> +        if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
> +                io_sectors = 8;
> +        else
> +                io_sectors = 128;
> +
> +        return atomic_read(&mddev->recovery_active) <
> +               io_sectors * sync_io_depth(mddev);
> +}
> +
>  #define SYNC_MARKS 10
>  #define SYNC_MARK_STEP (3*HZ)
>  #define UPDATE_FREQUENCY (5*60*HZ)
> @@ -9195,7 +9259,8 @@ void md_do_sync(struct md_thread *thread)
>                          msleep(500);
>                          goto repeat;
>                  }
> -                if (!is_mddev_idle(mddev, 0)) {
> +                if (!sync_io_within_limit(mddev) &&
> +                    !is_mddev_idle(mddev, 0)) {
>                          /*
>                           * Give other IO more of a chance.
>                           * The faster the devices, the less we wait.
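One worked example to double check my reading of sync_io_within_limit()
above (my arithmetic, not something stated in the patch): recovery_active
is counted in sectors, so with the default sync_io_depth of 32 the
in-flight budget works out to

        raid4/5/6:    8 sectors/IO   * 32 = 256 sectors  (128 KiB)
        other levels: 128 sectors/IO * 32 = 4096 sectors (2 MiB)

While recovery_active stays below that, the new condition in
md_do_sync() short-circuits and is_mddev_idle() is not consulted at all.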
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 1cf00a04bcdd..63be622467c6 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -483,6 +483,7 @@ struct mddev {
>          /* if zero, use the system-wide default */
>          int sync_speed_min;
>          int sync_speed_max;
> +        int sync_io_depth;
>
>          /* resync even though the same disks are shared among md-devices */
>          int parallel_resync;
> --
> 2.39.2
>

This part looks good to me.

Acked-by: Xiao Ni <xni@xxxxxxxxxx>
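P.S. For anyone who wants to try this out, the knobs should be reachable
as below once the patch is applied (paths per the patch; md0 and the
value 64 are just examples):

        echo 64 > /proc/sys/dev/raid/sync_io_depth      # system-wide default
        echo 64 > /sys/block/md0/md/sync_io_depth       # override for one array
        echo system > /sys/block/md0/md/sync_io_depth   # back to system default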