On Mon, Jul 28, 2025 at 10:30:26PM +0200, Andrey Albershteyn wrote: > Add integration with fs-verity. XFS stores fs-verity descriptor in > the extended file attributes and the Merkle tree is store in data fork > beyond EOF. > > The descriptor is stored under "vdesc" extended attribute. > > The Merkle tree reading/writing is done through iomap interface. The > data itself are at offset 1 << 53 in the inode's page cache. When XFS > reads from this region iomap doesn't call into fsverity to verify it > against Merkle tree. For data, verification is done on BIO completion. > > When fs-verity is enabled on an inode, the XFS_IVERITY_CONSTRUCTION > flag is set meaning that the Merkle tree is being build. The > initialization ends with storing of verity descriptor and setting > inode on-disk flag (XFS_DIFLAG2_VERITY). > > Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> /me wonders if this is really SoB me since it's rather different from the last time I even touched this patch. It also means I can't RVB it. ;) > Signed-off-by: Andrey Albershteyn <aalbersh@xxxxxxxxxx> > --- > fs/xfs/Makefile | 1 + > fs/xfs/libxfs/xfs_da_format.h | 4 + > fs/xfs/xfs_bmap_util.c | 7 + > fs/xfs/xfs_fsverity.c | 311 ++++++++++++++++++++++++++++++++++++++++++ > fs/xfs/xfs_fsverity.h | 28 ++++ > fs/xfs/xfs_inode.h | 6 + > fs/xfs/xfs_super.c | 20 +++ > 7 files changed, 377 insertions(+) > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile > index 5bf501cf8271..ad66439db7bf 100644 > --- a/fs/xfs/Makefile > +++ b/fs/xfs/Makefile > @@ -147,6 +147,7 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o > xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o > xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o > xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o > +xfs-$(CONFIG_FS_VERITY) += xfs_fsverity.o > > # notify failure > ifeq ($(CONFIG_MEMORY_FAILURE),y) > diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h > index e5274be2fe9c..b17fdbbb48aa 100644 > --- a/fs/xfs/libxfs/xfs_da_format.h > +++ b/fs/xfs/libxfs/xfs_da_format.h > @@ -909,4 +909,8 @@ struct xfs_parent_rec { > __be32 p_gen; > } __packed; > > +/* fs-verity ondisk xattr name used for the fsverity descriptor */ > +#define XFS_VERITY_DESCRIPTOR_NAME "vdesc" > +#define XFS_VERITY_DESCRIPTOR_NAME_LEN (sizeof(XFS_VERITY_DESCRIPTOR_NAME) - 1) > + > #endif /* __XFS_DA_FORMAT_H__ */ > diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c > index 06ca11731e43..af1933129647 100644 > --- a/fs/xfs/xfs_bmap_util.c > +++ b/fs/xfs/xfs_bmap_util.c > @@ -31,6 +31,7 @@ > #include "xfs_rtbitmap.h" > #include "xfs_rtgroup.h" > #include "xfs_zone_alloc.h" > +#include <linux/fsverity.h> > > /* Kernel only BMAP related definitions and functions */ > > @@ -553,6 +554,12 @@ xfs_can_free_eofblocks( > if (last_fsb <= end_fsb) > return false; > > + /* > + * Nothing to clean on fsverity inodes as they are read-only > + */ > + if (IS_VERITY(VFS_I(ip))) > + return false; > + > /* > * Check if there is an post-EOF extent to free. If there are any > * delalloc blocks attached to the inode (data fork delalloc > diff --git a/fs/xfs/xfs_fsverity.c b/fs/xfs/xfs_fsverity.c > new file mode 100644 > index 000000000000..ec7dea4289d5 > --- /dev/null > +++ b/fs/xfs/xfs_fsverity.c > @@ -0,0 +1,311 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2023 Red Hat, Inc. > + */ > +#include "xfs.h" > +#include "xfs_shared.h" > +#include "xfs_format.h" > +#include "xfs_da_format.h" > +#include "xfs_da_btree.h" > +#include "xfs_trans_resv.h" > +#include "xfs_mount.h" > +#include "xfs_inode.h" > +#include "xfs_log_format.h" > +#include "xfs_attr.h" > +#include "xfs_bmap_util.h" > +#include "xfs_log_format.h" > +#include "xfs_trans.h" > +#include "xfs_attr_leaf.h" > +#include "xfs_trace.h" > +#include "xfs_quota.h" > +#include "xfs_ag.h" > +#include "xfs_fsverity.h" > +#include "xfs_iomap.h" > +#include <linux/fsverity.h> > +#include <linux/pagemap.h> > + > +/* > + * Initialize an args structure to load or store the fsverity descriptor. > + * Caller must ensure @args is zeroed except for value and valuelen. > + */ > +static inline void > +xfs_fsverity_init_vdesc_args( > + struct xfs_inode *ip, > + struct xfs_da_args *args) > +{ > + args->geo = ip->i_mount->m_attr_geo; > + args->whichfork = XFS_ATTR_FORK; > + args->attr_filter = XFS_ATTR_VERITY; > + args->op_flags = XFS_DA_OP_OKNOENT; > + args->dp = ip; > + args->owner = ip->i_ino; > + args->name = XFS_VERITY_DESCRIPTOR_NAME; > + args->namelen = XFS_VERITY_DESCRIPTOR_NAME_LEN; > + xfs_attr_sethash(args); > +} > + > +/* Delete the verity descriptor. */ > +static int > +xfs_fsverity_delete_descriptor( > + struct xfs_inode *ip) > +{ > + struct xfs_da_args args = { }; > + > + xfs_fsverity_init_vdesc_args(ip, &args); > + return xfs_attr_set(&args, XFS_ATTRUPDATE_REMOVE, false); > +} > + > +/* Retrieve the verity descriptor. */ > +static int > +xfs_fsverity_get_descriptor( > + struct inode *inode, > + void *buf, > + size_t buf_size) > +{ > + struct xfs_inode *ip = XFS_I(inode); > + struct xfs_da_args args = { > + .value = buf, > + .valuelen = buf_size, > + }; > + int error = 0; > + > + /* > + * The fact that (returned attribute size) == (provided buf_size) is > + * checked by xfs_attr_copy_value() (returns -ERANGE). No descriptor > + * is treated as a short read so that common fsverity code will > + * complain. > + */ > + xfs_fsverity_init_vdesc_args(ip, &args); > + error = xfs_attr_get(&args); > + if (error == -ENOATTR) > + return 0; > + if (error) > + return error; > + > + return args.valuelen; > +} > + > +/* Try to remove all the fsverity metadata after a failed enablement. */ > +static int > +xfs_fsverity_delete_metadata( > + struct xfs_inode *ip) > +{ > + struct xfs_trans *tp; > + struct xfs_mount *mp = ip->i_mount; > + int error; > + > + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); > + if (error) { > + ASSERT(xfs_is_shutdown(mp)); > + return error; > + } > + > + xfs_ilock(ip, XFS_ILOCK_EXCL); > + xfs_trans_ijoin(tp, ip, 0); > + > + /* > + * As only merkle tree is getting removed, no need to change inode size > + */ > + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, XFS_ISIZE(ip)); > + if (error) > + goto err_cancel; > + > + error = xfs_trans_commit(tp); > + xfs_iunlock(ip, XFS_ILOCK_EXCL); > + if (error) > + return error; > + > + error = xfs_fsverity_delete_descriptor(ip); > + return error != -ENOATTR ? error : 0; Do you need to re-truncate the file to EOF to clear out the post-eof blocks? > + > +err_cancel: > + xfs_trans_cancel(tp); > + return error; > +} > + > + > +/* Prepare to enable fsverity by clearing old metadata. */ > +static int > +xfs_fsverity_begin_enable( > + struct file *filp) > +{ > + struct inode *inode = file_inode(filp); > + struct xfs_inode *ip = XFS_I(inode); > + int error; > + > + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); > + > + if (IS_DAX(inode)) > + return -EINVAL; > + > + if (inode->i_size > XFS_FSVERITY_MTREE_OFFSET) > + return -EFBIG; > + > + if (xfs_iflags_test_and_set(ip, XFS_VERITY_CONSTRUCTION)) > + return -EBUSY; Hm. Do we actually flush the pagecache to disk in the _begin_enable path? /me looked briefly but didn't see anything. > + > + error = xfs_qm_dqattach(ip); > + if (error) > + return error; > + > + return xfs_fsverity_delete_metadata(ip); > +} > + > +/* Complete (or fail) the process of enabling fsverity. */ > +static int > +xfs_fsverity_end_enable( > + struct file *filp, > + const void *desc, > + size_t desc_size, > + u64 merkle_tree_size) > +{ > + struct xfs_da_args args = { > + .value = (void *)desc, > + .valuelen = desc_size, > + }; > + struct inode *inode = file_inode(filp); > + struct xfs_inode *ip = XFS_I(inode); > + struct xfs_mount *mp = ip->i_mount; > + struct xfs_trans *tp; > + int error = 0; > + > + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); > + > + /* fs-verity failed, just cleanup */ > + if (desc == NULL) > + goto out; > + > + xfs_fsverity_init_vdesc_args(ip, &args); > + error = xfs_attr_set(&args, XFS_ATTRUPDATE_UPSERT, false); > + if (error) > + goto out; > + > + /* > + * Wait for Merkel tree get written to disk before setting on-disk inode > + * flag and clering XFS_VERITY_CONSTRUCTION s/Merkel/Merkle/ s/clering/clearing/ , sorry if that's my typo > + */ > + error = filemap_write_and_wait(inode->i_mapping); > + if (error) > + return error; > + > + /* Set fsverity inode flag */ > + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_ichange, > + 0, 0, false, &tp); > + if (error) > + goto out; > + > + /* > + * Ensure that we've persisted the verity information before we enable > + * it on the inode and tell the caller we have sealed the inode. > + */ > + ip->i_diflags2 |= XFS_DIFLAG2_VERITY; > + > + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); > + xfs_trans_set_sync(tp); > + > + error = xfs_trans_commit(tp); > + xfs_iunlock(ip, XFS_ILOCK_EXCL); > + > + if (!error) > + inode->i_flags |= S_VERITY; > + > +out: > + if (error) { > + int error2; > + > + error2 = xfs_fsverity_delete_metadata(ip); > + if (error2) > + xfs_alert(ip->i_mount, > +"ino 0x%llx failed to clean up new fsverity metadata, err %d", > + ip->i_ino, error2); > + } > + > + xfs_iflags_clear(ip, XFS_VERITY_CONSTRUCTION); > + return error; > +} > + > +static void > +xfs_fsverity_adjust_read( > + struct ioregion *region) > +{ > + u8 log_blocksize; > + unsigned int block_size; > + u64 tree_size; > + u64 position = region->pos & XFS_FSVERITY_MTREE_MASK; > + > + fsverity_merkle_tree_geometry(region->inode, &log_blocksize, > + &block_size, &tree_size); > + > + if (position + region->length < tree_size) > + return; > + > + region->length = tree_size - position; > +} > + > +/* Retrieve a merkle tree block. */ > +static struct page * > +xfs_fsverity_read_merkle( > + struct inode *inode, > + pgoff_t index, > + unsigned long num_ra_pages) > +{ > + u64 position = (index << PAGE_SHIFT) | XFS_FSVERITY_MTREE_OFFSET; Is this shift safe on 32-bit where pgoff_t is unsigned long? I don't think it is....? > + struct ioregion region = { > + .inode = inode, > + .pos = position, > + .length = PAGE_SIZE, > + .ops = &xfs_read_iomap_ops, > + }; > + int error; > + struct folio *folio; > + > + /* > + * As region->length is PAGE_SIZE we have to adjust the length for the > + * end of the tree. The case when tree blocks size are smaller then > + * PAGE_SIZE. Hrm. I wonder if we want to try to create large folios for merkle trees? Since the comment references PAGE_SIZE, what happens if this is an 8k fsblock XFS filesystem where folios will always be at least 8k? > + */ > + xfs_fsverity_adjust_read(®ion); > + > + folio = iomap_read_region(®ion); > + if (IS_ERR(folio)) > + return ERR_PTR(-EIO); > + > + /* Wait for buffered read to finish */ > + error = folio_wait_locked_killable(folio); > + if (error) > + return ERR_PTR(error); > + if (IS_ERR(folio) || !folio_test_uptodate(folio)) > + return ERR_PTR(-EFSCORRUPTED); > + > + return folio_file_page(folio, 0); > +} > + > +/* Write a merkle tree block. */ > +static int > +xfs_fsverity_write_merkle( > + struct inode *inode, > + const void *buf, > + u64 pos, > + unsigned int size) > +{ > + struct ioregion region = { > + .inode = inode, > + .pos = pos | XFS_FSVERITY_MTREE_OFFSET, > + .buf = buf, > + .length = size, > + .ops = &xfs_buffered_write_iomap_ops, > + }; > + > + if (region.pos + region.length > inode->i_sb->s_maxbytes) > + return -EFBIG; > + > + return iomap_write_region(®ion); > +} > + > +const struct fsverity_operations xfs_fsverity_ops = { > + .begin_enable_verity = xfs_fsverity_begin_enable, > + .end_enable_verity = xfs_fsverity_end_enable, > + .get_verity_descriptor = xfs_fsverity_get_descriptor, > + .read_merkle_tree_page = xfs_fsverity_read_merkle, > + .write_merkle_tree_block = xfs_fsverity_write_merkle, > +}; > diff --git a/fs/xfs/xfs_fsverity.h b/fs/xfs/xfs_fsverity.h > new file mode 100644 > index 000000000000..e063b7288dc0 > --- /dev/null > +++ b/fs/xfs/xfs_fsverity.h > @@ -0,0 +1,28 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2022 Red Hat, Inc. > + */ > +#ifndef __XFS_FSVERITY_H__ > +#define __XFS_FSVERITY_H__ > + > +/* Merkle tree location in page cache. We take memory region from the inode's > + * address space for Merkle tree. > + * > + * At maximum of 8 levels with 128 hashes per block (32 bytes SHA-256) maximum Aren't sha512 hashes allowed too? In which case it would be 64 bytes per hash? I suppose that doesn't really matter since... an 8EB file on a 1k fsblock filesystem with 1k merkle tree blocks will do this: # xfs_db /dev/sda -c "btheight -b 1024 -n $(( (2**63 - 1) / 1024 )) 64:64:0:merkle -w min" 64:64:0:merkle: worst case per 1024-byte block: 8 records (leaf) / 8 keyptrs (node) level 0: 9007199254740991 records, 1125899906842624 blocks level 1: 1125899906842624 records, 140737488355328 blocks level 2: 140737488355328 records, 17592186044416 blocks level 3: 17592186044416 records, 2199023255552 blocks level 4: 2199023255552 records, 274877906944 blocks level 5: 274877906944 records, 34359738368 blocks level 6: 34359738368 records, 4294967296 blocks level 7: 4294967296 records, 536870912 blocks level 8: 536870912 records, 67108864 blocks level 9: 67108864 records, 8388608 blocks level 10: 8388608 records, 1048576 blocks level 11: 1048576 records, 131072 blocks level 12: 131072 records, 16384 blocks level 13: 16384 records, 2048 blocks level 14: 2048 records, 256 blocks level 15: 256 records, 32 blocks level 16: 32 records, 4 blocks level 17: 4 records, 1 block 18 levels, 1286742750677285 blocks total log2(1286742750677285) is 50, so your choice of 53 is fine. Though this will far exceed the max merkle tree height anyway. > + * tree size is ((128^8 − 1)/(128 − 1)) = 567*10^12 blocks. This should fit in 53 > + * bits address space. > + * > + * At this Merkle tree size we can cover 295EB large file. This is much larger > + * than the currently supported file size. > + * > + * As we allocate some pagecache space for fsverity tree we need to limit > + * maximum fsverity file size. > + */ > +#define XFS_FSVERITY_MTREE_OFFSET (1ULL << 53) > +#define XFS_FSVERITY_MTREE_MASK (XFS_FSVERITY_MTREE_OFFSET - 1) This is part of the ondisk format, so it belongs in xfs_fs.h. --D > +#ifdef CONFIG_FS_VERITY > +extern const struct fsverity_operations xfs_fsverity_ops; > +#endif /* CONFIG_FS_VERITY */ > + > +#endif /* __XFS_FSVERITY_H__ */ > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index d7e2b902ef5c..033e1a71e64b 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -404,6 +404,12 @@ static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip) > */ > #define XFS_IREMAPPING (1U << 15) > > +/* > + * fs-verity's Merkle tree is under construction. The file is read-only, the > + * only writes happening is the ones with Merkle tree blocks. > + */ > +#define XFS_VERITY_CONSTRUCTION (1U << 16) > + > /* All inode state flags related to inode reclaim. */ > #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ > XFS_IRECLAIM | \ > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index 7b1ace75955c..10fb23ac672a 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -30,6 +30,7 @@ > #include "xfs_filestream.h" > #include "xfs_quota.h" > #include "xfs_sysfs.h" > +#include "xfs_fsverity.h" > #include "xfs_ondisk.h" > #include "xfs_rmap_item.h" > #include "xfs_refcount_item.h" > @@ -54,6 +55,7 @@ > #include <linux/fs_context.h> > #include <linux/fs_parser.h> > #include <linux/fsverity.h> > +#include <linux/iomap.h> > > static const struct super_operations xfs_super_operations; > > @@ -1711,6 +1713,9 @@ xfs_fs_fill_super( > sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; > #endif > sb->s_op = &xfs_super_operations; > +#ifdef CONFIG_FS_VERITY > + sb->s_vop = &xfs_fsverity_ops; > +#endif > > /* > * Delay mount work if the debug hook is set. This is debug > @@ -1964,10 +1969,25 @@ xfs_fs_fill_super( > xfs_set_resuming_quotaon(mp); > mp->m_qflags &= ~XFS_QFLAGS_MNTOPTS; > > + if (xfs_has_verity(mp)) > + xfs_warn(mp, > + "EXPERIMENTAL fsverity feature in use. Use at your own risk!"); > + > error = xfs_mountfs(mp); > if (error) > goto out_filestream_unmount; > > +#ifdef CONFIG_FS_VERITY > + /* > + * Don't use a high priority workqueue like the other fsverity > + * implementations because that will lead to conflicts with the xfs log > + * workqueue. > + */ > + error = iomap_init_fsverity(mp->m_super, 0, 0); > + if (error) > + goto out_unmount; > +#endif > + > root = igrab(VFS_I(mp->m_rootip)); > if (!root) { > error = -ENOENT; > > -- > 2.50.0 > >