From: Keith Busch <kbusch@xxxxxxxxxx>

Various storage protocols can support offloading block data copies.
Enhance the block layer to know about the device's copying capabilities,
introduce the new REQ_OP_COPY operation, and provide the infrastructure
to iterate, split, and merge these kinds of requests.

A copy command must provide the device with a list of source LBAs and
their lengths, and a destination LBA. The 'struct bio' type doesn't
readily have a way to describe such a thing. But a copy request doesn't
use host memory for data, so the bio's bio_vec is unused space. Give the
bio_vec a second purpose: it can carry a vector of source sector ranges
instead of memory pages.

Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx>
---
 block/bio.c               | 25 ++++++++++++++
 block/blk-core.c          |  4 +++
 block/blk-lib.c           | 47 ++++++++++++++++++++++-----
 block/blk-merge.c         | 28 +++++++++++++++-
 block/blk-sysfs.c         |  9 ++++++
 block/blk.h               | 17 +++++++++-
 include/linux/bio.h       | 20 ++++++++++++
 include/linux/blk-mq.h    |  5 +++
 include/linux/blk_types.h |  2 ++
 include/linux/blkdev.h    | 14 ++++++++
 include/linux/bvec.h      | 68 +++++++++++++++++++++++++++++++++++++--
 11 files changed, 226 insertions(+), 13 deletions(-)
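Illustrative usage note, not part of the patch itself: the sketch below
shows how a low-level driver might walk a REQ_OP_COPY request with the
new sector-based iterator. The destination start comes from the request
position and each bio_vec describes one source sector range;
example_prep_copy() and fill_copy_descriptor() are made-up placeholder
names standing in for whatever hardware-specific command setup a real
driver would do.

#include <linux/blk-mq.h>

static void example_prep_copy(struct request *rq)
{
        sector_t dst = blk_rq_pos(rq);  /* contiguous destination start */
        struct req_iterator iter;
        struct bio_vec bv;

        rq_for_each_copy_bvec(bv, rq, iter) {
                /*
                 * One source range: sectors bv.bv_sector through
                 * bv.bv_sector + bv.bv_sectors - 1, already clamped to
                 * the queue's max_copy_segment_sectors by the iterator.
                 */
                fill_copy_descriptor(rq, dst, bv.bv_sector, bv.bv_sectors);
                dst += bv.bv_sectors;
        }
}

In-kernel submitters keep calling blkdev_copy() as before; with this
patch it routes to blkdev_copy_offload() when the device reports a copy
limit and falls back to the existing bounce-buffer read/write loop
otherwise.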
diff --git a/block/bio.c b/block/bio.c
index 3c0a558c90f52..9c73a895c987b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1156,6 +1156,31 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
 	bio_set_flag(bio, BIO_CLONED);
 }
 
+static bool bvec_try_merge_copy_src(struct bio *bio, struct bio_vec *src)
+{
+	struct bio_vec *bv;
+
+	if (!bio->bi_vcnt)
+		return false;
+
+	bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+	if (bv->bv_sector + bv->bv_sectors != src->bv_sector)
+		return false;
+
+	bv->bv_sectors += src->bv_sectors;
+	return true;
+}
+
+int bio_add_copy_src(struct bio *bio, struct bio_vec *src)
+{
+	if (bvec_try_merge_copy_src(bio, src))
+		return 0;
+	if (bio->bi_vcnt >= bio->bi_max_vecs)
+		return -EINVAL;
+	bio->bi_io_vec[bio->bi_vcnt++] = *src;
+	return 0;
+}
+
 static unsigned int get_contig_folio_len(unsigned int *num_pages,
 					 struct page **pages, unsigned int i,
 					 struct folio *folio, size_t left,
diff --git a/block/blk-core.c b/block/blk-core.c
index b862c66018f25..cb3d9879e2d65 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -837,6 +837,10 @@ void submit_bio_noacct(struct bio *bio)
 		if (!bdev_max_discard_sectors(bdev))
 			goto not_supported;
 		break;
+	case REQ_OP_COPY:
+		if (!bdev_copy_sectors(bdev))
+			goto not_supported;
+		break;
 	case REQ_OP_SECURE_ERASE:
 		if (!bdev_max_secure_erase_sectors(bdev))
 			goto not_supported;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index a819ded0ed3a9..a538acbaa2cd7 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -369,14 +369,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_secure_erase);
 
-/**
- * blkdev_copy - copy source sectors to a destination on the same block device
- * @dst_sector: start sector of the destination to copy to
- * @src_sector: start sector of the source to copy from
- * @nr_sects: number of sectors to copy
- * @gfp: allocation flags to use
- */
-int blkdev_copy(struct block_device *bdev, sector_t dst_sector,
+static int __blkdev_copy(struct block_device *bdev, sector_t dst_sector,
 		sector_t src_sector, sector_t nr_sects, gfp_t gfp)
 {
 	unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects);
@@ -429,4 +422,42 @@ int blkdev_copy(struct block_device *bdev, sector_t dst_sector,
 	kvfree(buf);
 	return ret;
 }
+
+static int blkdev_copy_offload(struct block_device *bdev, sector_t dst_sector,
+		sector_t src_sector, sector_t nr_sects, gfp_t gfp)
+{
+	struct bio *bio;
+	int ret;
+
+	struct bio_vec bv = {
+		.bv_sector = src_sector,
+		.bv_sectors = nr_sects,
+	};
+
+	bio = bio_alloc(bdev, 1, REQ_OP_COPY, gfp);
+	bio_add_copy_src(bio, &bv);
+	bio->bi_iter.bi_sector = dst_sector;
+	bio->bi_iter.bi_size = nr_sects << SECTOR_SHIFT;
+
+	ret = submit_bio_wait(bio);
+	bio_put(bio);
+	return ret;
+
+}
+
+/**
+ * blkdev_copy - copy source sectors to a destination on the same block device
+ * @dst_sector: start sector of the destination to copy to
+ * @src_sector: start sector of the source to copy from
+ * @nr_sects: number of sectors to copy
+ * @gfp: allocation flags to use
+ */
+int blkdev_copy(struct block_device *bdev, sector_t dst_sector,
+		sector_t src_sector, sector_t nr_sects, gfp_t gfp)
+{
+	if (bdev_copy_sectors(bdev))
+		return blkdev_copy_offload(bdev, dst_sector, src_sector,
+					   nr_sects, gfp);
+	return __blkdev_copy(bdev, dst_sector, src_sector, nr_sects, gfp);
+}
 EXPORT_SYMBOL_GPL(blkdev_copy);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3af1d284add50..8085fc0a27c2f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -399,6 +399,31 @@ struct bio *bio_split_write_zeroes(struct bio *bio,
 	return bio_submit_split(bio, max_sectors);
 }
 
+struct bio *bio_split_copy(struct bio *bio, const struct queue_limits *lim,
+		unsigned *nr_segs)
+{
+	unsigned nsegs = 0, sectors = 0, mcss = lim->max_copy_segment_sectors;
+	struct bvec_iter iter;
+	struct bio_vec bv;
+
+	bio_for_each_copy_bvec(bv, bio, iter, mcss) {
+		unsigned s;
+
+		s = min(lim->max_copy_sectors - sectors, bv.bv_sectors);
+		nsegs += 1;
+		sectors += s;
+
+		if (nsegs >= lim->max_copy_segments || sectors >= lim->max_copy_sectors)
+			break;
+	}
+
+	if (sectors == bio_sectors(bio))
+		sectors = 0;
+
+	*nr_segs = nsegs;
+	return bio_submit_split(bio, sectors);
+}
+
 /**
  * bio_split_to_limits - split a bio to fit the queue limits
  * @bio: bio to be split
@@ -467,6 +492,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
 	if (!boundary_sectors ||
 	    req_op(rq) == REQ_OP_DISCARD ||
+	    req_op(rq) == REQ_OP_COPY ||
 	    req_op(rq) == REQ_OP_SECURE_ERASE)
 		return max_sectors;
 	return min(max_sectors,
@@ -753,7 +779,7 @@ static struct request *attempt_merge(struct request_queue *q,
 	req->__data_len += blk_rq_bytes(next);
 
-	if (!blk_discard_mergable(req))
+	if (!blk_discard_mergable(req) && !blk_copy_mergable(req))
 		elv_merge_requests(q, req, next);
 
 	blk_crypto_rq_put_keyslot(next);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b2b9b89d6967c..93ce41f399363 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -132,6 +132,7 @@ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page)	\
 
 QUEUE_SYSFS_LIMIT_SHOW(max_segments)
 QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments)
+QUEUE_SYSFS_LIMIT_SHOW(max_copy_segments)
 QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments)
 QUEUE_SYSFS_LIMIT_SHOW(max_segment_size)
 QUEUE_SYSFS_LIMIT_SHOW(max_write_streams)
@@ -160,6 +161,8 @@ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page)	\
 
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_copy_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_copy_segment_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
@@ -501,10 +504,13 @@ QUEUE_LIM_RO_ENTRY(queue_io_min, "minimum_io_size");
 QUEUE_LIM_RO_ENTRY(queue_io_opt, "optimal_io_size");
 
 QUEUE_LIM_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
+QUEUE_LIM_RO_ENTRY(queue_max_copy_segments, "max_copy_segments");
 QUEUE_LIM_RO_ENTRY(queue_discard_granularity, "discard_granularity");
 QUEUE_LIM_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes");
 QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes");
 QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
+QUEUE_RO_ENTRY(queue_max_copy_sectors, "copy_max_bytes");
+QUEUE_RO_ENTRY(queue_max_copy_segment_sectors, "copy_segment_max_bytes");
 
 QUEUE_LIM_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes");
 QUEUE_LIM_RO_ENTRY(queue_atomic_write_boundary_sectors,
@@ -644,6 +650,7 @@ static struct attribute *queue_attrs[] = {
 	&queue_max_sectors_entry.attr,
 	&queue_max_segments_entry.attr,
 	&queue_max_discard_segments_entry.attr,
+	&queue_max_copy_segments_entry.attr,
 	&queue_max_integrity_segments_entry.attr,
 	&queue_max_segment_size_entry.attr,
 	&queue_max_write_streams_entry.attr,
@@ -657,6 +664,8 @@ static struct attribute *queue_attrs[] = {
 	&queue_discard_granularity_entry.attr,
 	&queue_max_discard_sectors_entry.attr,
 	&queue_max_hw_discard_sectors_entry.attr,
+	&queue_max_copy_sectors_entry.attr,
+	&queue_max_copy_segment_sectors_entry.attr,
 	&queue_atomic_write_max_sectors_entry.attr,
 	&queue_atomic_write_boundary_sectors_entry.attr,
 	&queue_atomic_write_unit_min_entry.attr,
diff --git a/block/blk.h b/block/blk.h
index 37ec459fe6562..685f3eeca46e0 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -185,10 +185,20 @@ static inline bool blk_discard_mergable(struct request *req)
 	return false;
 }
 
+static inline bool blk_copy_mergable(struct request *req)
+{
+	if (req_op(req) == REQ_OP_COPY &&
+	    queue_max_copy_segments(req->q) > 1)
+		return true;
+	return false;
+}
+
 static inline unsigned int blk_rq_get_max_segments(struct request *rq)
 {
 	if (req_op(rq) == REQ_OP_DISCARD)
 		return queue_max_discard_segments(rq->q);
+	if (req_op(rq) == REQ_OP_COPY)
+		return queue_max_copy_segments(rq->q);
 	return queue_max_segments(rq->q);
 }
 
@@ -200,7 +210,8 @@ static inline unsigned int blk_queue_get_max_sectors(struct request *rq)
 	if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
 		return min(q->limits.max_discard_sectors,
 			   UINT_MAX >> SECTOR_SHIFT);
-
+	if (unlikely(op == REQ_OP_COPY))
+		return q->limits.max_copy_sectors;
 	if (unlikely(op == REQ_OP_WRITE_ZEROES))
 		return q->limits.max_write_zeroes_sectors;
 
@@ -347,6 +358,8 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 		unsigned *nr_segs);
 struct bio *bio_split_zone_append(struct bio *bio,
 		const struct queue_limits *lim, unsigned *nr_segs);
+struct bio *bio_split_copy(struct bio *bio, const struct queue_limits *lim,
+		unsigned *nsegs);
 
 /*
  * All drivers must accept single-segments bios that are smaller than PAGE_SIZE.
@@ -397,6 +410,8 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio,
 		return bio_split_discard(bio, lim, nr_segs);
 	case REQ_OP_WRITE_ZEROES:
 		return bio_split_write_zeroes(bio, lim, nr_segs);
+	case REQ_OP_COPY:
+		return bio_split_copy(bio, lim, nr_segs);
 	default:
 		/* other operations can't be split */
 		*nr_segs = 0;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9c37c66ef9ca3..e25bcde9ec59d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -54,6 +54,7 @@ static inline bool bio_has_data(struct bio *bio)
 	if (bio &&
 	    bio->bi_iter.bi_size &&
 	    bio_op(bio) != REQ_OP_DISCARD &&
+	    bio_op(bio) != REQ_OP_COPY &&
 	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
 	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
 		return true;
@@ -68,6 +69,11 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
 	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
 }
 
+static inline bool bio_sector_advance_iter(const struct bio *bio)
+{
+	return bio_op(bio) == REQ_OP_COPY;
+}
+
 static inline void *bio_data(struct bio *bio)
 {
 	if (bio_has_data(bio))
@@ -100,6 +106,8 @@ static inline void bio_advance_iter(const struct bio *bio,
 
 	if (bio_no_advance_iter(bio))
 		iter->bi_size -= bytes;
+	else if (bio_sector_advance_iter(bio))
+		bvec_iter_sector_advance(bio->bi_io_vec, iter, bytes);
 	else
 		bvec_iter_advance(bio->bi_io_vec, iter, bytes);
 		/* TODO: It is reasonable to complete bio with error here. */
@@ -114,6 +122,8 @@ static inline void bio_advance_iter_single(const struct bio *bio,
 
 	if (bio_no_advance_iter(bio))
 		iter->bi_size -= bytes;
+	else if (bio_sector_advance_iter(bio))
+		bvec_iter_sector_advance_single(bio->bi_io_vec, iter, bytes);
 	else
 		bvec_iter_advance_single(bio->bi_io_vec, iter, bytes);
 }
@@ -155,6 +165,15 @@ static inline void bio_advance(struct bio *bio, unsigned int nbytes)
 		((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \
 	     bio_advance_iter_single((bio), &(iter), (bvl).bv_len))
 
+#define __bio_for_each_copy_bvec(bvl, bio, iter, start, max)		\
+	for (iter = (start);						\
+	     (iter).bi_size &&						\
+		((bvl = copy_bvec_iter_bvec((bio)->bi_io_vec, (iter), max)), 1); \
+	     bio_advance_iter_single((bio), &(iter), (bvl).bv_sectors << SECTOR_SHIFT))
+
+#define bio_for_each_copy_bvec(bvl, bio, iter, max)			\
+	__bio_for_each_copy_bvec(bvl, bio, iter, (bio)->bi_iter, max)
+
 /* iterate over multi-page bvec */
 #define bio_for_each_bvec(bvl, bio, iter)				\
 	__bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter)
@@ -409,6 +428,7 @@ extern void bio_uninit(struct bio *);
 void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf);
 void bio_chain(struct bio *, struct bio *);
 
+int bio_add_copy_src(struct bio *bio, struct bio_vec *src);
 int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len,
 			      unsigned off);
 bool __must_check bio_add_folio(struct bio *bio, struct folio *folio,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index de8c85a03bb7f..49816e7f7df7d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1049,6 +1049,11 @@ struct req_iterator {
 	struct bio *bio;
 };
 
+#define rq_for_each_copy_bvec(bvl, _rq, _iter)				\
+	__rq_for_each_bio(_iter.bio, _rq)				\
+		bio_for_each_copy_bvec(bvl, _iter.bio, _iter.iter,	\
+				_rq->q->limits.max_copy_segment_sectors)
+
 #define __rq_for_each_bio(_bio, rq)	\
 	if ((rq->bio))			\
 		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3d1577f07c1c8..361d44c0d1317 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -355,6 +355,8 @@ enum req_op {
 	REQ_OP_ZONE_RESET	= (__force blk_opf_t)13,
 	/* reset all the zone present on the device */
 	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)15,
+	/* Copy offload sectors to the device */
+	REQ_OP_COPY		= (__force blk_opf_t)17,
 
 	/* Driver private requests */
 	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b7d71b126ec9b..e39ba0e91d43e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -399,9 +399,13 @@ struct queue_limits {
 	unsigned int		atomic_write_hw_unit_max;
 	unsigned int		atomic_write_unit_max;
 
+	unsigned int		max_copy_sectors;
+	unsigned int		max_copy_segment_sectors;
+
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
 	unsigned short		max_discard_segments;
+	unsigned short		max_copy_segments;
 	unsigned short		max_write_streams;
 	unsigned int		write_stream_granularity;
 
@@ -1271,6 +1275,11 @@ static inline unsigned short queue_max_discard_segments(const struct request_que
 	return q->limits.max_discard_segments;
 }
 
+static inline unsigned short queue_max_copy_segments(const struct request_queue *q)
+{
+	return q->limits.max_copy_segments;
+}
+
 static inline unsigned int queue_max_segment_size(const struct request_queue *q)
 {
 	return q->limits.max_segment_size;
@@ -1380,6 +1389,11 @@ static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
 	return bdev_limits(bdev)->max_write_zeroes_sectors;
 }
 
+static inline unsigned int bdev_copy_sectors(struct block_device *bdev)
+{
+	return bdev_limits(bdev)->max_copy_sectors;
+}
+
 static inline bool bdev_nonrot(struct block_device *bdev)
 {
 	return blk_queue_nonrot(bdev_get_queue(bdev));
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 204b22a99c4ba..7cc82738ede8a 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -21,6 +21,8 @@ struct page;
 * @bv_page:   First page associated with the address range.
 * @bv_len:    Number of bytes in the address range.
 * @bv_offset: Start of the address range relative to the start of @bv_page.
+ * @bv_sector: Start sector associated with the source block range
+ * @bv_sectors: Number of sectors in the block range
 *
 * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len:
 *
@@ -29,9 +31,17 @@ struct page;
 * This holds because page_is_mergeable() checks the above property.
 */
 struct bio_vec {
-	struct page	*bv_page;
-	unsigned int	bv_len;
-	unsigned int	bv_offset;
+	union {
+		struct {
+			struct page	*bv_page;
+			unsigned int	bv_len;
+			unsigned int	bv_offset;
+		};
+		struct {
+			sector_t	bv_sector;
+			sector_t	bv_sectors;
+		};
+	};
 };
 
 /**
@@ -118,6 +128,21 @@ struct bvec_iter_all {
 	.bv_offset	= mp_bvec_iter_offset((bvec), (iter)),	\
 })
 
+/* sector based bvec helpers */
+#define copy_bvec_iter_sector(bvec, iter)				\
+	(__bvec_iter_bvec((bvec), (iter))->bv_sector) +			\
+		((iter).bi_bvec_done >> 9)
+
+#define copy_bvec_iter_sectors(bvec, iter)				\
+	(__bvec_iter_bvec((bvec), (iter))->bv_sectors) -		\
+		((iter).bi_bvec_done >> 9)
+
+#define copy_bvec_iter_bvec(bvec, iter, max)				\
+((struct bio_vec) {							\
+	.bv_sector	= copy_bvec_iter_sector((bvec), (iter)),	\
+	.bv_sectors	= min(max, copy_bvec_iter_sectors((bvec), (iter))), \
+})
+
 /* For building single-page bvec in flight */
 #define bvec_iter_offset(bvec, iter)				\
 	(mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE)
@@ -161,6 +186,30 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv,
 	return true;
 }
 
+static inline bool bvec_iter_sector_advance(const struct bio_vec *bv,
+		struct bvec_iter *iter, unsigned bytes)
+{
+	unsigned int idx = iter->bi_idx;
+
+	if (WARN_ONCE(bytes > iter->bi_size,
+		     "Attempted to advance past end of bvec iter\n")) {
+		iter->bi_size = 0;
+		return false;
+	}
+
+	iter->bi_size -= bytes;
+	bytes += iter->bi_bvec_done;
+
+	while (bytes && bytes >> 9 >= bv[idx].bv_sectors) {
+		bytes -= bv[idx].bv_sectors << 9;
+		idx++;
+	}
+
+	iter->bi_idx = idx;
+	iter->bi_bvec_done = bytes;
+	return true;
+}
+
 /*
  * A simpler version of bvec_iter_advance(), @bytes should not span
  * across multiple bvec entries, i.e. bytes <= bv[i->bi_idx].bv_len
@@ -178,6 +227,19 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv,
 	iter->bi_size -= bytes;
 }
 
+static inline void bvec_iter_sector_advance_single(const struct bio_vec *bv,
+		struct bvec_iter *iter, unsigned bytes)
+{
+	unsigned int done = iter->bi_bvec_done + bytes;
+
+	if (done == bv[iter->bi_idx].bv_sectors << 9) {
+		done = 0;
+		iter->bi_idx++;
+	}
+	iter->bi_bvec_done = done;
+	iter->bi_size -= bytes;
+}
+
 #define for_each_bvec(bvl, bio_vec, iter, start)			\
 	for (iter = (start);						\
 	     (iter).bi_size &&						\
-- 
2.47.1