[PATCH v2] xfs: Don't hold XFS_ILOCK_SHARED over log force during fsync

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Holding XFS_ILOCK_SHARED over log force in xfs_fsync_flush_log()
significantly increases contention on ILOCK for O_DSYNC | O_DIRECT
writes to file preallocated with fallocate (thus DIO happens to
unwritten extents and we need ILOCK in exclusive mode for timestamp
modifications and extent conversions). But holding ILOCK over the log
force doesn't seem strictly necessary for correctness. We are just using
it for a mechanism to make sure parallel fsyncs all wait for log force
to complete but that can be also achieved without holding ILOCK.

With this patch DB2 database restore operation speeds up by a factor of
about 2.5x in a VM with 4 CPUs, 16GB of RAM and NVME SSD as a backing
store.

Signed-off-by: Jan Kara <jack@xxxxxxx>
---
 fs/xfs/xfs_file.c       | 48 ++++++++++++++---------------------------
 fs/xfs/xfs_inode.c      |  2 --
 fs/xfs/xfs_inode_item.c | 12 ++++++++---
 fs/xfs/xfs_inode_item.h |  4 +++-
 4 files changed, 28 insertions(+), 38 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b04c59d87378..c5b8ebddc179 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -75,30 +75,10 @@ xfs_dir_fsync(
 	return xfs_log_force_inode(ip);
 }
 
-static xfs_csn_t
-xfs_fsync_seq(
-	struct xfs_inode	*ip,
-	bool			datasync)
-{
-	if (!xfs_ipincount(ip))
-		return 0;
-	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
-		return 0;
-	return ip->i_itemp->ili_commit_seq;
-}
-
 /*
  * All metadata updates are logged, which means that we just have to flush the
- * log up to the latest LSN that touched the inode.
- *
- * If we have concurrent fsync/fdatasync() calls, we need them to all block on
- * the log force before we clear the ili_fsync_fields field. This ensures that
- * we don't get a racing sync operation that does not wait for the metadata to
- * hit the journal before returning.  If we race with clearing ili_fsync_fields,
- * then all that will happen is the log force will do nothing as the lsn will
- * already be on disk.  We can't race with setting ili_fsync_fields because that
- * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
- * shared until after the ili_fsync_fields is cleared.
+ * log up to the latest LSN that modified the inode metadata relevant for the
+ * fsync/fdatasync().
  */
 static  int
 xfs_fsync_flush_log(
@@ -106,21 +86,25 @@ xfs_fsync_flush_log(
 	bool			datasync,
 	int			*log_flushed)
 {
-	int			error = 0;
+	struct xfs_inode_log_item *iip = ip->i_itemp;
 	xfs_csn_t		seq;
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	seq = xfs_fsync_seq(ip, datasync);
-	if (seq) {
-		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
-					  log_flushed);
-
-		spin_lock(&ip->i_itemp->ili_lock);
-		ip->i_itemp->ili_fsync_fields = 0;
-		spin_unlock(&ip->i_itemp->ili_lock);
+	if (!xfs_ipincount(ip)) {
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return 0;
 	}
+
+	if (datasync)
+		seq = iip->ili_datasync_commit_seq;
+	else
+		seq = iip->ili_commit_seq;
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-	return error;
+
+	if (!seq)
+		return 0;
+
+	return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, log_flushed);
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9c39251961a3..209b8aba6238 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1656,7 +1656,6 @@ xfs_ifree_mark_inode_stale(
 	spin_lock(&iip->ili_lock);
 	iip->ili_last_fields = iip->ili_fields;
 	iip->ili_fields = 0;
-	iip->ili_fsync_fields = 0;
 	spin_unlock(&iip->ili_lock);
 	ASSERT(iip->ili_last_fields);
 
@@ -2502,7 +2501,6 @@ xfs_iflush(
 	spin_lock(&iip->ili_lock);
 	iip->ili_last_fields = iip->ili_fields;
 	iip->ili_fields = 0;
-	iip->ili_fsync_fields = 0;
 	set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
 	spin_unlock(&iip->ili_lock);
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 829675700fcd..27f28bea0a8f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -153,7 +153,10 @@ xfs_inode_item_precommit(
 	 * (ili_fields) correctly tracks that the version has changed.
 	 */
 	spin_lock(&iip->ili_lock);
-	iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
+	if (flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP))
+		iip->ili_datasync_tx = true;
+	else
+		iip->ili_datasync_tx = false;
 	if (flags & XFS_ILOG_IVERSION)
 		flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
 
@@ -863,7 +866,11 @@ xfs_inode_item_committing(
 	struct xfs_log_item	*lip,
 	xfs_csn_t		seq)
 {
-	INODE_ITEM(lip)->ili_commit_seq = seq;
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+
+	iip->ili_commit_seq = seq;
+	if (iip->ili_datasync_tx)
+		iip->ili_datasync_commit_seq = seq;
 	return xfs_inode_item_release(lip);
 }
 
@@ -1055,7 +1062,6 @@ xfs_iflush_abort_clean(
 {
 	iip->ili_last_fields = 0;
 	iip->ili_fields = 0;
-	iip->ili_fsync_fields = 0;
 	iip->ili_flush_lsn = 0;
 	iip->ili_item.li_buf = NULL;
 	list_del_init(&iip->ili_item.li_bio_list);
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index ba92ce11a011..7a3364f3a6b4 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -32,9 +32,11 @@ struct xfs_inode_log_item {
 	spinlock_t		ili_lock;	   /* flush state lock */
 	unsigned int		ili_last_fields;   /* fields when flushed */
 	unsigned int		ili_fields;	   /* fields to be logged */
-	unsigned int		ili_fsync_fields;  /* logged since last fsync */
+	bool			ili_datasync_tx;   /* does fdatasync need to flush current tx? */
 	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
 	xfs_csn_t		ili_commit_seq;	   /* last transaction commit */
+	/* last transaction commit with changes relevant for fdatasync */
+	xfs_csn_t		ili_datasync_commit_seq;
 };
 
 static inline int xfs_inode_clean(struct xfs_inode *ip)
-- 
2.51.0


--vgxg43pdz5f5qtun--




[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux