On Fri, Aug 01, 2025 at 09:23:57AM -0600, Keith Busch wrote: > On Thu, Jul 10, 2025 at 12:12:29PM -0400, Mike Snitzer wrote: > > All said, in practice I haven't had any issues with this patch. But > > it could just be I don't have the stars aligned to test the case that > > might have problems. If you know of such a case I'd welcome > > suggestions. > > This is something I threw together that appears to be successful with > NVMe through raw block direct-io. This will defer catching an invalid io > vector to much later in the block stack, which should be okay, and > removes one of the vector walks in the fast path, so that's a bonus. > > While this is testing okay with NVMe so far, I haven't tested any more > complicated setups yet, and I probably need to get filesystems using > this relaxed limit too. Ship it! ;) Many thanks for this, I'll review closely and circle back! Mike > --- > diff --git a/block/bio.c b/block/bio.c > index 92c512e876c8d..634b2031c4829 100644 > --- a/block/bio.c > +++ b/block/bio.c > @@ -1227,13 +1227,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) > extraction_flags |= ITER_ALLOW_P2PDMA; > > - /* > - * Each segment in the iov is required to be a block size multiple. > - * However, we may not be able to get the entire segment if it spans > - * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the > - * result to ensure the bio's total size is correct. The remainder of > - * the iov data will be picked up in the next bio iteration. > - */ > size = iov_iter_extract_pages(iter, &pages, > UINT_MAX - bio->bi_iter.bi_size, > nr_pages, extraction_flags, &offset); > @@ -1241,18 +1234,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > return size ? size : -EFAULT; > > nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); > - > - if (bio->bi_bdev) { > - size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); > - iov_iter_revert(iter, trim); > - size -= trim; > - } > - > - if (unlikely(!size)) { > - ret = -EFAULT; > - goto out; > - } > - > for (left = size, i = 0; left > 0; left -= len, i += num_pages) { > struct page *page = pages[i]; > struct folio *folio = page_folio(page); > @@ -1297,6 +1278,23 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > return ret; > } > > +static int bio_align_to_bs(struct bio *bio, struct iov_iter *iter) > +{ > + unsigned int mask = bdev_logical_block_size(bio->bi_bdev) - 1; > + unsigned int total = bio->bi_iter.bi_size; > + size_t trim = total & mask; > + > + if (!trim) > + return 0; > + > + /* FIXME: might be leaking pages */ > + bio_revert(bio, trim); > + iov_iter_revert(iter, trim); > + if (total == trim) > + return -EFAULT; > + return 0; > +} > + > /** > * bio_iov_iter_get_pages - add user or kernel pages to a bio > * @bio: bio to add pages to > @@ -1327,7 +1325,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > if (iov_iter_is_bvec(iter)) { > bio_iov_bvec_set(bio, iter); > iov_iter_advance(iter, bio->bi_iter.bi_size); > - return 0; > + return bio_align_to_bs(bio, iter); > } > > if (iov_iter_extract_will_pin(iter)) > @@ -1336,6 +1334,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > ret = __bio_iov_iter_get_pages(bio, iter); > } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); > > + ret = bio_align_to_bs(bio, iter); > return bio->bi_vcnt ? 0 : ret; > } > EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); > diff --git a/block/blk-merge.c b/block/blk-merge.c > index 70d704615be52..a3acfef8eb81d 100644 > --- a/block/blk-merge.c > +++ b/block/blk-merge.c > @@ -298,6 +298,9 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, > unsigned nsegs = 0, bytes = 0; > > bio_for_each_bvec(bv, bio, iter) { > + if (bv.bv_offset & lim->dma_alignment) > + return -EFAULT; > + > /* > * If the queue doesn't support SG gaps and adding this > * offset would create a gap, disallow it. > @@ -341,6 +344,8 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, > * we do not use the full hardware limits. > */ > bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim)); > + if (!bytes) > + return -EFAULT; > > /* > * Bio splitting may cause subtle trouble such as hang when doing sync > diff --git a/block/fops.c b/block/fops.c > index 82451ac8ff25d..820902cf10730 100644 > --- a/block/fops.c > +++ b/block/fops.c > @@ -38,8 +38,8 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb) > static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, > struct iov_iter *iter) > { > - return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) || > - !bdev_iter_is_aligned(bdev, iter); > + return (iocb->ki_pos | iov_iter_count(iter)) & > + (bdev_logical_block_size(bdev) - 1); > } > > #define DIO_INLINE_BIO_VECS 4 > diff --git a/include/linux/bio.h b/include/linux/bio.h > index 46ffac5caab78..d3ddf78d1f35e 100644 > --- a/include/linux/bio.h > +++ b/include/linux/bio.h > @@ -169,6 +169,22 @@ static inline void bio_advance(struct bio *bio, unsigned int nbytes) > > #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) > > +static inline void bio_revert(struct bio *bio, unsigned int nbytes) > +{ > + bio->bi_iter.bi_size -= nbytes; > + > + while (nbytes) { > + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; > + > + if (nbytes < bv->bv_len) { > + bv->bv_len -= nbytes; > + return; > + } > + bio->bi_vcnt--; > + nbytes -= bv->bv_len; > + } > +} > + > static inline unsigned bio_segments(struct bio *bio) > { > unsigned segs = 0; > --