On Sat, Apr 12, 2025 at 3:39 PM Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote:
>
> From: Yu Kuai <yukuai3@xxxxxxxxxx>
>
> Currently, if the sync speed is above speed_min and below speed_max,
> md_do_sync() will wait for all sync IOs to be done before issuing new
> sync IO, which means the sync IO depth is limited to just 1.
>
> This limit is too low. To prevent the sync speed from dropping
> conspicuously after is_mddev_idle() is fixed in the next patch, add a
> new API for limiting sync IO depth; the default value is 32.
>
> Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
> ---
>  drivers/md/md.c | 103 +++++++++++++++++++++++++++++++++++++++---------
>  drivers/md/md.h |   1 +
>  2 files changed, 85 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 438e71e45c16..8966c4afc62a 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -111,32 +111,42 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
>  /* Default safemode delay: 200 msec */
>  #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
>  /*
> - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
> - * is 1000 KB/sec, so the extra system load does not show up that much.
> - * Increase it if you want to have more _guaranteed_ speed. Note that
> - * the RAID driver will use the maximum available bandwidth if the IO
> - * subsystem is idle. There is also an 'absolute maximum' reconstruction
> - * speed limit - in case reconstruction slows down your system despite
> - * idle IO detection.

These comments are useful: they describe the meaning of those control
values. Would it be better to keep them?

> + * Background sync IO speed control:
>   *
> - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
> - * or /sys/block/mdX/md/sync_speed_{min,max}
> + * - below speed min:
> + *   no limit;
> + * - above speed min and below speed max:
> + *   a) if mddev is idle, then no limit;
> + *   b) if mddev is busy handling normal IO, then limit inflight sync IO
> + *   to sync_io_depth;
> + * - above speed max:
> + *   sync IO can't be issued;
> + *
> + * Following configurations can be changed via /proc/sys/dev/raid/ for system
> + * or /sys/block/mdX/md/ for one array.
>   */
> -
>  static int sysctl_speed_limit_min = 1000;
>  static int sysctl_speed_limit_max = 200000;
> -static inline int speed_min(struct mddev *mddev)
> +static int sysctl_sync_io_depth = 32;
> +
> +static int speed_min(struct mddev *mddev)
>  {
>          return mddev->sync_speed_min ?
>                  mddev->sync_speed_min : sysctl_speed_limit_min;
>  }
>
> -static inline int speed_max(struct mddev *mddev)
> +static int speed_max(struct mddev *mddev)
>  {
>          return mddev->sync_speed_max ?
>                  mddev->sync_speed_max : sysctl_speed_limit_max;
>  }
>
> +static int sync_io_depth(struct mddev *mddev)
> +{
> +        return mddev->sync_io_depth ?
> +                mddev->sync_io_depth : sysctl_sync_io_depth;
> +}
> +
>  static void rdev_uninit_serial(struct md_rdev *rdev)
>  {
>          if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
> @@ -293,14 +303,21 @@ static const struct ctl_table raid_table[] = {
>                  .procname = "speed_limit_min",
>                  .data = &sysctl_speed_limit_min,
>                  .maxlen = sizeof(int),
> -                .mode = S_IRUGO|S_IWUSR,
> +                .mode = 0644,

Is it better to use a macro rather than a number directly here?
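For reference, a minimal side-by-side (my own illustration, not part of
the patch); both spellings denote the same mode bits:

        .mode = S_IRUGO | S_IWUSR,      /* symbolic: 0444 | 0200 == 0644 */
        .mode = 0644,                   /* plain octal, same value */

If I remember correctly, checkpatch nowadays warns about symbolic
permissions and suggests the octal form, so the change may simply be
following that guidance.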
>                  .proc_handler = proc_dointvec,
>          },
>          {
>                  .procname = "speed_limit_max",
>                  .data = &sysctl_speed_limit_max,
>                  .maxlen = sizeof(int),
> -                .mode = S_IRUGO|S_IWUSR,
> +                .mode = 0644,
> +                .proc_handler = proc_dointvec,
> +        },
> +        {
> +                .procname = "sync_io_depth",
> +                .data = &sysctl_sync_io_depth,
> +                .maxlen = sizeof(int),
> +                .mode = 0644,
>                  .proc_handler = proc_dointvec,
>          },
>  };
> @@ -5091,7 +5108,7 @@ static ssize_t
>  sync_min_show(struct mddev *mddev, char *page)
>  {
>          return sprintf(page, "%d (%s)\n", speed_min(mddev),
> -                        mddev->sync_speed_min ? "local": "system");
> +                        mddev->sync_speed_min ? "local" : "system");
>  }
>
>  static ssize_t
> @@ -5100,7 +5117,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len)
>          unsigned int min;
>          int rv;
>
> -        if (strncmp(buf, "system", 6)==0) {
> +        if (strncmp(buf, "system", 6) == 0) {
>                  min = 0;
>          } else {
>                  rv = kstrtouint(buf, 10, &min);
> @@ -5120,7 +5137,7 @@ static ssize_t
>  sync_max_show(struct mddev *mddev, char *page)
>  {
>          return sprintf(page, "%d (%s)\n", speed_max(mddev),
> -                        mddev->sync_speed_max ? "local": "system");
> +                        mddev->sync_speed_max ? "local" : "system");
>  }
>
>  static ssize_t
> @@ -5129,7 +5146,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
>          unsigned int max;
>          int rv;
>
> -        if (strncmp(buf, "system", 6)==0) {
> +        if (strncmp(buf, "system", 6) == 0) {
>                  max = 0;
>          } else {
>                  rv = kstrtouint(buf, 10, &max);
> @@ -5145,6 +5162,35 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
>  static struct md_sysfs_entry md_sync_max =
>  __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
>
> +static ssize_t
> +sync_io_depth_show(struct mddev *mddev, char *page)
> +{
> +        return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
> +                       mddev->sync_io_depth ? "local" : "system");
> +}
> +
> +static ssize_t
> +sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
> +{
> +        unsigned int max;
> +        int rv;
> +
> +        if (strncmp(buf, "system", 6) == 0) {
> +                max = 0;
> +        } else {
> +                rv = kstrtouint(buf, 10, &max);
> +                if (rv < 0)
> +                        return rv;
> +                if (max == 0)
> +                        return -EINVAL;
> +        }
> +        mddev->sync_io_depth = max;
> +        return len;
> +}
> +
> +static struct md_sysfs_entry md_sync_io_depth =
> +__ATTR_RW(sync_io_depth);
> +
>  static ssize_t
>  degraded_show(struct mddev *mddev, char *page)
>  {
> @@ -5671,6 +5717,7 @@ static struct attribute *md_redundancy_attrs[] = {
>          &md_mismatches.attr,
>          &md_sync_min.attr,
>          &md_sync_max.attr,
> +        &md_sync_io_depth.attr,
>          &md_sync_speed.attr,
>          &md_sync_force_parallel.attr,
>          &md_sync_completed.attr,
> @@ -8927,6 +8974,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
>          }
>  }
>
> +static bool sync_io_within_limit(struct mddev *mddev)
> +{
> +        int io_sectors;
> +
> +        /*
> +         * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
> +         * RESYNC_PAGES(64k) per IO.
> +         */
> +        if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
> +                io_sectors = 8;
> +        else
> +                io_sectors = 128;
> +
> +        return atomic_read(&mddev->recovery_active) <
> +               io_sectors * sync_io_depth(mddev);
> +}
> +
>  #define SYNC_MARKS 10
>  #define SYNC_MARK_STEP (3*HZ)
>  #define UPDATE_FREQUENCY (5*60*HZ)
> @@ -9195,7 +9259,8 @@ void md_do_sync(struct md_thread *thread)
>                          msleep(500);
>                          goto repeat;
>                  }
> -                if (!is_mddev_idle(mddev, 0)) {
> +                if (!sync_io_within_limit(mddev) &&
> +                    !is_mddev_idle(mddev, 0)) {
>                          /*
>                           * Give other IO more of a chance.
>                           * The faster the devices, the less we wait.
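One worked example to double check my reading of sync_io_within_limit()
above (my arithmetic, not something stated in the patch): recovery_active
is counted in sectors, so with the default sync_io_depth of 32 the
in-flight budget works out to

        raid4/5/6:    8 sectors/IO   * 32 = 256 sectors  (128 KiB)
        other levels: 128 sectors/IO * 32 = 4096 sectors (2 MiB)

While recovery_active stays below that, the new condition in
md_do_sync() short-circuits and is_mddev_idle() is not consulted at all.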
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 1cf00a04bcdd..63be622467c6 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -483,6 +483,7 @@ struct mddev {
>          /* if zero, use the system-wide default */
>          int sync_speed_min;
>          int sync_speed_max;
> +        int sync_io_depth;
>
>          /* resync even though the same disks are shared among md-devices */
>          int parallel_resync;
> --
> 2.39.2
>

This part looks good to me.

Acked-by: Xiao Ni <xni@xxxxxxxxxx>
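P.S. For anyone who wants to try this out, the knobs should be reachable
as below once the patch is applied (paths per the patch; md0 and the
value 64 are just examples):

        echo 64 > /proc/sys/dev/raid/sync_io_depth      # system-wide default
        echo 64 > /sys/block/md0/md/sync_io_depth       # override for one array
        echo system > /sys/block/md0/md/sync_io_depth   # back to system default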