Presently we start garbage collection late - when we start running out of free zones to backfill max_open_zones. This is a reasonable default as it minimizes write amplification. The longer we wait, the more blocks are invalidated and reclaim cost less in terms of blocks to relocate. Starting this late however introduces a risk of GC being outcompeted by user writes. If GC can't keep up, user writes will be forced to wait for free zones with high tail latencies as a result. This is not a problem under normal circumstances, but if fragmentation is bad and user write pressure is high (multiple full-throttle writers) we will "bottom out" of free zones. To mitigate this, introduce a gc_pressure mount option that lets the user specify a percentage of how much of the unused space that gc should keep available for writing. A high value will reclaim more of the space occupied by unused blocks, creating a larger buffer against write bursts. This comes at a cost as write amplification is increased. To illustrate this using a sample workload, setting gc_pressure to 60% avoids high (500ms) max latencies while increasing write amplification by 15%. Signed-off-by: Hans Holmberg <hans.holmberg@xxxxxxx> --- A patch for xfsprogs documenting the option will follow (if it makes it beyond RFC) fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_super.c | 14 +++++++++++++- fs/xfs/xfs_zone_alloc.c | 5 +++++ fs/xfs/xfs_zone_gc.c | 16 ++++++++++++++-- 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 799b84220ebb..af595024de00 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -229,6 +229,7 @@ typedef struct xfs_mount { bool m_finobt_nores; /* no per-AG finobt resv. */ bool m_update_sb; /* sb needs update in mount */ unsigned int m_max_open_zones; + unsigned int m_gc_pressure; /* * Bitsets of per-fs metadata that have been checked and/or are sick. diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index af5e63cb6a99..656f228cc3d9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -111,7 +111,7 @@ enum { Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones, - Opt_lifetime, Opt_nolifetime, + Opt_lifetime, Opt_nolifetime, Opt_gc_pressure, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -159,6 +159,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_u32("max_open_zones", Opt_max_open_zones), fsparam_flag("lifetime", Opt_lifetime), fsparam_flag("nolifetime", Opt_nolifetime), + fsparam_u32("gc_pressure", Opt_gc_pressure), {} }; @@ -242,6 +243,9 @@ xfs_fs_show_options( if (mp->m_max_open_zones) seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones); + if (mp->m_gc_pressure) + seq_printf(m, ",gc_pressure=%u", mp->m_gc_pressure); + return 0; } @@ -1100,6 +1104,11 @@ xfs_finish_flags( "nolifetime mount option only supported on zoned file systems."); return -EINVAL; } + if (mp->m_gc_pressure) { + xfs_warn(mp, +"gc_pressure mount option only supported on zoned file systems."); + return -EINVAL; + } } return 0; @@ -1500,6 +1509,9 @@ xfs_fs_parse_param( case Opt_max_open_zones: parsing_mp->m_max_open_zones = result.uint_32; return 0; + case Opt_gc_pressure: + parsing_mp->m_gc_pressure = result.uint_32; + return 0; case Opt_lifetime: parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME; return 0; diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 734b73470ef9..f75247c16600 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1162,6 +1162,11 @@ xfs_mount_zones( return -EFSCORRUPTED; } + if (mp->m_gc_pressure > 100) { + xfs_notice(mp, "gc_pressure must not exceed 100 percent."); + return -EINVAL; + } + error = xfs_calc_open_zones(mp); if (error) return error; diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index a4abaac0fbc5..002a943860f5 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -162,18 +162,30 @@ struct xfs_zone_gc_data { /* * We aim to keep enough zones free in stock to fully use the open zone limit - * for data placement purposes. + * for data placement purposes. Additionally, the gc_pressure mount option + * can be set to make sure a fraction of the unused/free blocks are available + * for writing. */ bool xfs_zoned_need_gc( struct xfs_mount *mp) { + s64 available, free; + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) return false; - if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) < + + available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); + + if (available < mp->m_groups[XG_TYPE_RTG].blocks * (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) return true; + + free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); + if (available < div_s64(free * mp->m_gc_pressure, 100)) + return true; + return false; } -- 2.34.1