On Wed, Apr 09, 2025 at 10:30:15AM -0700, Darrick J. Wong wrote:
> Then it occurred to me to look at set_blocksize again:
> 
> 	/* Don't change the size if it is same as current */
> 	if (inode->i_blkbits != blksize_bits(size)) {
> 		sync_blockdev(bdev);
> 		inode->i_blkbits = blksize_bits(size);
> 		mapping_set_folio_order_range(inode->i_mapping,
> 				get_order(size), get_order(size));
> 		kill_bdev(bdev);
> 	}
> 
> (Note that I changed mapping_set_folio_min_order here to
> mapping_set_folio_order_range to shut up a folio migration bug that I
> reported elsewhere on fsdevel yesterday, and willy suggested forcing the
> max order as a temporary workaround.)
> 
> The update of i_blkbits and the order bits of mapping->flags are
> performed before kill_bdev truncates the pagecache, which means there's
> a window where there can be a !uptodate order-0 folio in the pagecache
> but i_blkbits > PAGE_SHIFT (in this case, 13).  The debugging assertion
> above is from someone trying to install a too-small folio into the
> pagecache.  I think the "FARK" message I captured overnight is from
> readahead trying to bring in contents from disk for this too-small folio
> and failing.
> 
> So I think the answer is that set_blocksize needs to lock out folio_add,
> flush the dirty folios, invalidate the entire bdev pagecache, set
> i_blkbits and the folio order, and only then allow new additions to the
> pagecache.
> 
> But then, which lock(s)?  Were this a file on XFS I'd say that one has
> to take i_rwsem and mmap_invalidate_lock before truncating the pagecache
> but by my recollection bdev devices don't take either lock in their IO
> paths.

Here's my shabby attempt to lock my way out of this mess.  My reproducer
no longer trips, but I don't think that means much.

--D

From: Darrick J. Wong <djwong@xxxxxxxxxx>
Subject: [PATCH] block: fix race between set_blocksize and IO paths

With the new large sector size support, it's now the case that
set_blocksize needs to change i_blkbits and the folio order with no
folios in the pagecache, because the geometry changes cause problems
with the bufferhead code.  Therefore, truncate the page cache after
flushing but before updating i_blkbits.

However, that's not enough -- we also need to lock out file IO and page
faults during the update.  Take both the i_rwsem and the invalidate_lock
in exclusive mode for invalidations, and in shared mode for read/write
operations.

I don't know if this is the correct fix.

Signed-off-by: "Darrick J. Wong" <djwong@xxxxxxxxxx>
---
 block/bdev.c      | 12 ++++++++++++
 block/blk-zoned.c |  5 ++++-
 block/fops.c      |  7 +++++++
 block/ioctl.c     |  6 ++++++
 4 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/block/bdev.c b/block/bdev.c
index 7b4e35a661b0c9..0cbdac46d98d86 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -169,11 +169,23 @@ int set_blocksize(struct file *file, int size)
 
 	/* Don't change the size if it is same as current */
 	if (inode->i_blkbits != blksize_bits(size)) {
+		/* Prevent concurrent IO operations */
+		inode_lock(inode);
+		filemap_invalidate_lock(inode->i_mapping);
+
+		/*
+		 * Flush and truncate the pagecache before we reconfigure the
+		 * mapping geometry because folio sizes are variable now.
+		 */
 		sync_blockdev(bdev);
+		kill_bdev(bdev);
+
 		inode->i_blkbits = blksize_bits(size);
 		mapping_set_folio_order_range(inode->i_mapping,
 				get_order(size), get_order(size));
 		kill_bdev(bdev);
+		filemap_invalidate_unlock(inode->i_mapping);
+		inode_unlock(inode);
 	}
 	return 0;
 }
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 0c77244a35c92e..8f15d1aa6eb89a 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -343,6 +343,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
 		op = REQ_OP_ZONE_RESET;
 
 		/* Invalidate the page cache, including dirty pages. */
+		inode_lock(bdev->bd_mapping->host);
 		filemap_invalidate_lock(bdev->bd_mapping);
 		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
 		if (ret)
@@ -364,8 +365,10 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
 	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
 
 fail:
-	if (cmd == BLKRESETZONE)
+	if (cmd == BLKRESETZONE) {
 		filemap_invalidate_unlock(bdev->bd_mapping);
+		inode_unlock(bdev->bd_mapping->host);
+	}
 
 	return ret;
 }
diff --git a/block/fops.c b/block/fops.c
index be9f1dbea9ce0a..f46ae08fac33dd 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -746,7 +746,9 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		ret = direct_write_fallback(iocb, from, ret,
 				blkdev_buffered_write(iocb, from));
 	} else {
+		inode_lock_shared(bd_inode);
 		ret = blkdev_buffered_write(iocb, from);
+		inode_unlock_shared(bd_inode);
 	}
 
 	if (ret > 0)
@@ -757,6 +759,7 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
+	struct inode *bd_inode = bdev_file_inode(iocb->ki_filp);
 	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	loff_t size = bdev_nr_bytes(bdev);
 	loff_t pos = iocb->ki_pos;
@@ -793,7 +796,9 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			goto reexpand;
 	}
 
+	inode_lock_shared(bd_inode);
 	ret = filemap_read(iocb, to, ret);
+	inode_unlock_shared(bd_inode);
 
 reexpand:
 	if (unlikely(shorted))
@@ -836,6 +841,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 	if ((start | len) & (bdev_logical_block_size(bdev) - 1))
 		return -EINVAL;
 
+	inode_lock(inode);
 	filemap_invalidate_lock(inode->i_mapping);
 
 	/*
@@ -868,6 +874,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 
  fail:
 	filemap_invalidate_unlock(inode->i_mapping);
+	inode_unlock(inode);
 	return error;
 }
diff --git a/block/ioctl.c b/block/ioctl.c
index faa40f383e2736..e472cc1030c60c 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -142,6 +142,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
 	if (err)
 		return err;
 
+	inode_lock(bdev->bd_mapping->host);
 	filemap_invalidate_lock(bdev->bd_mapping);
 	err = truncate_bdev_range(bdev, mode, start, start + len - 1);
 	if (err)
@@ -174,6 +175,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
 
 	blk_finish_plug(&plug);
 fail:
 	filemap_invalidate_unlock(bdev->bd_mapping);
+	inode_unlock(bdev->bd_mapping->host);
 	return err;
 }
@@ -199,12 +201,14 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
 	    end > bdev_nr_bytes(bdev))
 		return -EINVAL;
 
+	inode_lock(bdev->bd_mapping->host);
 	filemap_invalidate_lock(bdev->bd_mapping);
 	err = truncate_bdev_range(bdev, mode, start, end - 1);
 	if (!err)
 		err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9,
 						GFP_KERNEL);
 	filemap_invalidate_unlock(bdev->bd_mapping);
+	inode_unlock(bdev->bd_mapping->host);
 
 	return err;
 }
@@ -236,6 +240,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
 		return -EINVAL;
 
 	/* Invalidate the page cache, including dirty pages */
+	inode_lock(bdev->bd_mapping->host);
 	filemap_invalidate_lock(bdev->bd_mapping);
 	err = truncate_bdev_range(bdev, mode, start, end);
 	if (err)
@@ -246,6 +251,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
 
 fail:
 	filemap_invalidate_unlock(bdev->bd_mapping);
+	inode_unlock(bdev->bd_mapping->host);
 	return err;
 }
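
For readability, here is how the reconfiguration block in set_blocksize()
reads once the bdev.c hunk above is applied -- this is just the diff
re-flowed as plain C for reference, not separately tested code:

	/* Don't change the size if it is same as current */
	if (inode->i_blkbits != blksize_bits(size)) {
		/* Prevent concurrent IO operations */
		inode_lock(inode);
		filemap_invalidate_lock(inode->i_mapping);

		/*
		 * Flush and truncate the pagecache before we reconfigure the
		 * mapping geometry because folio sizes are variable now.
		 */
		sync_blockdev(bdev);
		kill_bdev(bdev);

		inode->i_blkbits = blksize_bits(size);
		mapping_set_folio_order_range(inode->i_mapping,
				get_order(size), get_order(size));
		kill_bdev(bdev);
		filemap_invalidate_unlock(inode->i_mapping);
		inode_unlock(inode);
	}
	return 0;

Note that both kill_bdev() calls survive: the new one empties the
pagecache under the old geometry before i_blkbits and the folio order
change, and the pre-existing one runs again after the update, all while
i_rwsem and the invalidate_lock are held to keep file IO and page faults
out of the window described above.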