[PATCH RFC] xfs: Don't hold XFS_ILOCK_SHARED over log force during fsync

Jan Kara <jack@xxxxxxx> · Mon, 8 Sep 2025 17:12:49 +0200

Holding XFS_ILOCK_SHARED over log force in xfs_fsync_flush_log()
significantly increases contention on ILOCK for O_DSYNC | O_DIRECT
writes to file preallocated with fallocate (thus DIO happens to
unwritten extents and we need ILOCK in exclusive mode for timestamp
modifications and extent conversions). But holding ILOCK over the log
force doesn't seem strictly necessary for correctness. We are just using
it for a mechanism to make sure parallel fsyncs all wait for log force
to complete but that can be also achieved without holding ILOCK.

With this patch DB2 database restore operation speeds up by a factor of
about 2.5x in a VM with 4 CPUs, 16GB of RAM and NVME SSD as a backing
store.

Signed-off-by: Jan Kara <jack@xxxxxxx>
---
 fs/xfs/xfs_file.c       | 33 ++++++++++++++++++++++-----------
 fs/xfs/xfs_inode_item.c |  1 +
 fs/xfs/xfs_inode_item.h |  1 +
 3 files changed, 24 insertions(+), 11 deletions(-)

I've chosen adding ili_fsync_flushing_fields to xfs_inode_log_item since that
seemed in line with how the code is structured. Arguably that is unnecessarily
wasteful since in practice we use just one bit of information from
ili_fsync_fields and one bit from ili_fsync_flushing_fields. If people prefer
more space efficient solution, I can certainly do that.

This is marked as RFC because I'm not quite sure I didn't miss some subtlety
in XFS logging mechanisms and this will crash & burn badly in some corner
case (but fstests seem to be passing fine for me).

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b04c59d87378..2bb793c8c179 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -80,9 +80,13 @@ xfs_fsync_seq(
 	struct xfs_inode	*ip,
 	bool			datasync)
 {
+	unsigned int sync_fields;
+
 	if (!xfs_ipincount(ip))
 		return 0;
-	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+	sync_fields = ip->i_itemp->ili_fsync_fields |
+		      ip->i_itemp->ili_fsync_flushing_fields;
+	if (datasync && !(sync_fields & ~XFS_ILOG_TIMESTAMP))
 		return 0;
 	return ip->i_itemp->ili_commit_seq;
 }
@@ -92,13 +96,14 @@ xfs_fsync_seq(
  * log up to the latest LSN that touched the inode.
  *
  * If we have concurrent fsync/fdatasync() calls, we need them to all block on
- * the log force before we clear the ili_fsync_fields field. This ensures that
- * we don't get a racing sync operation that does not wait for the metadata to
- * hit the journal before returning.  If we race with clearing ili_fsync_fields,
- * then all that will happen is the log force will do nothing as the lsn will
- * already be on disk.  We can't race with setting ili_fsync_fields because that
- * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
- * shared until after the ili_fsync_fields is cleared.
+ * the log force until it is finished. Thus we clear ili_fsync_fields so that
+ * new modifications since starting log force can accumulate there and just
+ * save old ili_fsync_fields value to ili_fsync_flushing_fields so that
+ * concurrent fsyncs can use that to determine whether they need to wait for
+ * running log force or not. This ensures that we don't get a racing sync
+ * operation that does not wait for the metadata to hit the journal before
+ * returning.  If we race with clearing ili_fsync_fields, then all that will
+ * happen is the log force will do nothing as the lsn will already be on disk.
  */
 static  int
 xfs_fsync_flush_log(
@@ -112,14 +117,20 @@ xfs_fsync_flush_log(
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	seq = xfs_fsync_seq(ip, datasync);
 	if (seq) {
+		spin_lock(&ip->i_itemp->ili_lock);
+		ip->i_itemp->ili_fsync_flushing_fields =
+						ip->i_itemp->ili_fsync_fields;
+		ip->i_itemp->ili_fsync_fields = 0;
+		spin_unlock(&ip->i_itemp->ili_lock);
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
 					  log_flushed);
-
 		spin_lock(&ip->i_itemp->ili_lock);
-		ip->i_itemp->ili_fsync_fields = 0;
+		ip->i_itemp->ili_fsync_flushing_fields = 0;
 		spin_unlock(&ip->i_itemp->ili_lock);
+	} else {
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	}
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 829675700fcd..39d15eb9311d 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -1056,6 +1056,7 @@ xfs_iflush_abort_clean(
 	iip->ili_last_fields = 0;
 	iip->ili_fields = 0;
 	iip->ili_fsync_fields = 0;
+	iip->ili_fsync_flushing_fields = 0;
 	iip->ili_flush_lsn = 0;
 	iip->ili_item.li_buf = NULL;
 	list_del_init(&iip->ili_item.li_bio_list);
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index ba92ce11a011..018ba4cd3fab 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -33,6 +33,7 @@ struct xfs_inode_log_item {
 	unsigned int		ili_last_fields;   /* fields when flushed */
 	unsigned int		ili_fields;	   /* fields to be logged */
 	unsigned int		ili_fsync_fields;  /* logged since last fsync */
+	unsigned int		ili_fsync_flushing_fields;  /* fields currently being committed by fsync */
 	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
 	xfs_csn_t		ili_commit_seq;	   /* last transaction commit */
 };
-- 
2.51.0