On 7/29/2025 5:06 PM, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@xxxxxxxxxx>
>
> As discussed [1], hold rcu for copying data from/to page is too heavy.
> it's better to protect page with rcu around for page lookup and then
> grab a reference to prevent page to be freed by discard.
>
> [1] https://lore.kernel.org/all/eb41cab3-5946-4fe3-a1be-843dd6fca159@xxxxxxxxx/
>
> Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
> ---
> Changes from v1:
>  - refer to filemap_get_entry(), use xas_load + xas_reload to fix
>    concurrent problems.
>
>  drivers/block/brd.c | 73 ++++++++++++++++++++++++++++-----------------
>  1 file changed, 46 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/block/brd.c b/drivers/block/brd.c
> index 0c2eabe14af3..b7a0448ca928 100644
> --- a/drivers/block/brd.c
> +++ b/drivers/block/brd.c
> @@ -44,45 +44,72 @@ struct brd_device {
>  };
>
>  /*
> - * Look up and return a brd's page for a given sector.
> + * Look up and return a brd's page with reference grabbed for a given sector.
>   */
>  static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
>  {
> -        return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
> +        struct page *page;
> +        XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
> +
> +        rcu_read_lock();
> +repeat:
> +        xas_reset(&xas);

Is it better to move xas_reset() to the failing branches instead of
adding an extra xas_reset() for the success branch? Something like the
sketch after the quoted function is what I have in mind.

> +        page = xas_load(&xas);
> +        if (xas_retry(&xas, page))
> +                goto repeat;
> +
> +        if (!page || xa_is_value(page)) {
> +                page = NULL;
> +                goto out;
> +        }

brd will not store special values in the xarray, so the xa_is_value()
check is unnecessary.

> +
> +        if (!get_page_unless_zero(page))
> +                goto repeat;
> +
> +        if (unlikely(page != xas_reload(&xas))) {
> +                put_page(page);
> +                goto repeat;
> +        }
> +out:
> +        rcu_read_unlock();
> +
> +        return page;
>  }
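Untested, and just to illustrate the two comments above, roughly:

static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
        struct page *page;
        XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);

        rcu_read_lock();
repeat:
        page = xas_load(&xas);
        /* xas_retry() already does xas_reset() when it sees a retry entry */
        if (xas_retry(&xas, page))
                goto repeat;
        if (!page)
                goto out;

        /* the page is being freed by discard, look the slot up again */
        if (!get_page_unless_zero(page)) {
                xas_reset(&xas);
                goto repeat;
        }

        /* the slot changed under us, drop the stale reference and retry */
        if (unlikely(page != xas_reload(&xas))) {
                put_page(page);
                xas_reset(&xas);
                goto repeat;
        }
out:
        rcu_read_unlock();

        return page;
}

XA_STATE() already starts in the restart state, so the first lookup does
not need a reset either. The difference should be marginal though, so
keeping the filemap_get_entry() style as-is also works for me.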

>
>  /*
>   * Insert a new page for a given sector, if one does not already exist.
> + * The returned page will grab reference.
>   */
>  static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
>                  blk_opf_t opf)
> -        __releases(rcu)
> -        __acquires(rcu)
>  {
>          gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
>          struct page *page, *ret;
>
> -        rcu_read_unlock();
>          page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
> -        if (!page) {
> -                rcu_read_lock();
> +        if (!page)
>                  return ERR_PTR(-ENOMEM);
> -        }
>
>          xa_lock(&brd->brd_pages);
>          ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
>                          page, gfp);
> -        rcu_read_lock();
> -        if (ret) {
> +        if (!ret) {
> +                brd->brd_nr_pages++;
> +                get_page(page);
>                  xa_unlock(&brd->brd_pages);
> -                __free_page(page);
> -                if (xa_is_err(ret))
> -                        return ERR_PTR(xa_err(ret));
> +                return page;
> +        }
> +
> +        if (!xa_is_err(ret)) {
> +                get_page(ret);
> +                xa_unlock(&brd->brd_pages);
> +                put_page(page);
>                  return ret;
>          }
> -        brd->brd_nr_pages++;
> +
>          xa_unlock(&brd->brd_pages);
> -        return page;
> +        put_page(page);
> +        return ERR_PTR(xa_err(ret));
>  }
>
>  /*
> @@ -95,7 +122,7 @@ static void brd_free_pages(struct brd_device *brd)
>          pgoff_t idx;
>
>          xa_for_each(&brd->brd_pages, idx, page) {
> -                __free_page(page);
> +                put_page(page);
>                  cond_resched();
>          }
>
> @@ -117,7 +144,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>
>          bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
>
> -        rcu_read_lock();
>          page = brd_lookup_page(brd, sector);
>          if (!page && op_is_write(opf)) {
>                  page = brd_insert_page(brd, sector, opf);
> @@ -135,13 +161,13 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>                          memset(kaddr, 0, bv.bv_len);
>          }
>          kunmap_local(kaddr);
> -        rcu_read_unlock();
>
>          bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
> +        if (page)
> +                put_page(page);
>          return true;
>
>  out_error:
> -        rcu_read_unlock();
>          if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
>                  bio_wouldblock_error(bio);
>          else
> @@ -149,13 +175,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
>          return false;
>  }
>
> -static void brd_free_one_page(struct rcu_head *head)
> -{
> -        struct page *page = container_of(head, struct page, rcu_head);
> -
> -        __free_page(page);
> -}
> -
>  static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
>  {
>          sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
> @@ -170,7 +189,7 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
>          while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
>                  page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
>                  if (page) {
> -                        call_rcu(&page->rcu_head, brd_free_one_page);
> +                        put_page(page);
>                          brd->brd_nr_pages--;
>                  }
>                  aligned_sector += PAGE_SECTORS;