Holding XFS_ILOCK_SHARED over log force in xfs_fsync_flush_log() significantly increases contention on ILOCK for O_DSYNC | O_DIRECT writes to file preallocated with fallocate (thus DIO happens to unwritten extents and we need ILOCK in exclusive mode for timestamp modifications and extent conversions). But holding ILOCK over the log force doesn't seem strictly necessary for correctness. We are just using it for a mechanism to make sure parallel fsyncs all wait for log force to complete but that can be also achieved without holding ILOCK. With this patch DB2 database restore operation speeds up by a factor of about 2.5x in a VM with 4 CPUs, 16GB of RAM and NVME SSD as a backing store. Signed-off-by: Jan Kara <jack@xxxxxxx> --- fs/xfs/xfs_file.c | 48 ++++++++++++++--------------------------- fs/xfs/xfs_inode.c | 2 -- fs/xfs/xfs_inode_item.c | 12 ++++++++--- fs/xfs/xfs_inode_item.h | 4 +++- 4 files changed, 28 insertions(+), 38 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b04c59d87378..c5b8ebddc179 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -75,30 +75,10 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); } -static xfs_csn_t -xfs_fsync_seq( - struct xfs_inode *ip, - bool datasync) -{ - if (!xfs_ipincount(ip)) - return 0; - if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - return 0; - return ip->i_itemp->ili_commit_seq; -} - /* * All metadata updates are logged, which means that we just have to flush the - * log up to the latest LSN that touched the inode. - * - * If we have concurrent fsync/fdatasync() calls, we need them to all block on - * the log force before we clear the ili_fsync_fields field. This ensures that - * we don't get a racing sync operation that does not wait for the metadata to - * hit the journal before returning. If we race with clearing ili_fsync_fields, - * then all that will happen is the log force will do nothing as the lsn will - * already be on disk. We can't race with setting ili_fsync_fields because that - * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock - * shared until after the ili_fsync_fields is cleared. + * log up to the latest LSN that modified the inode metadata relevant for the + * fsync/fdatasync(). */ static int xfs_fsync_flush_log( @@ -106,21 +86,25 @@ xfs_fsync_flush_log( bool datasync, int *log_flushed) { - int error = 0; + struct xfs_inode_log_item *iip = ip->i_itemp; xfs_csn_t seq; xfs_ilock(ip, XFS_ILOCK_SHARED); - seq = xfs_fsync_seq(ip, datasync); - if (seq) { - error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, - log_flushed); - - spin_lock(&ip->i_itemp->ili_lock); - ip->i_itemp->ili_fsync_fields = 0; - spin_unlock(&ip->i_itemp->ili_lock); + if (!xfs_ipincount(ip)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return 0; } + + if (datasync) + seq = iip->ili_datasync_commit_seq; + else + seq = iip->ili_commit_seq; xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; + + if (!seq) + return 0; + + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, log_flushed); } STATIC int diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9c39251961a3..209b8aba6238 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1656,7 +1656,6 @@ xfs_ifree_mark_inode_stale( spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); @@ -2502,7 +2501,6 @@ xfs_iflush( spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags); spin_unlock(&iip->ili_lock); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 829675700fcd..27f28bea0a8f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -153,7 +153,10 @@ xfs_inode_item_precommit( * (ili_fields) correctly tracks that the version has changed. */ spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); + if (flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP)) + iip->ili_datasync_tx = true; + else + iip->ili_datasync_tx = false; if (flags & XFS_ILOG_IVERSION) flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); @@ -863,7 +866,11 @@ xfs_inode_item_committing( struct xfs_log_item *lip, xfs_csn_t seq) { - INODE_ITEM(lip)->ili_commit_seq = seq; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + + iip->ili_commit_seq = seq; + if (iip->ili_datasync_tx) + iip->ili_datasync_commit_seq = seq; return xfs_inode_item_release(lip); } @@ -1055,7 +1062,6 @@ xfs_iflush_abort_clean( { iip->ili_last_fields = 0; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; iip->ili_flush_lsn = 0; iip->ili_item.li_buf = NULL; list_del_init(&iip->ili_item.li_bio_list); diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index ba92ce11a011..7a3364f3a6b4 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -32,9 +32,11 @@ struct xfs_inode_log_item { spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - unsigned int ili_fsync_fields; /* logged since last fsync */ + bool ili_datasync_tx; /* does fdatasync need to flush current tx? */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ xfs_csn_t ili_commit_seq; /* last transaction commit */ + /* last transaction commit with changes relevant for fdatasync */ + xfs_csn_t ili_datasync_commit_seq; }; static inline int xfs_inode_clean(struct xfs_inode *ip) -- 2.51.0 --vgxg43pdz5f5qtun--