On Fri, Aug 29, 2025 at 04:56:26PM -0700, Joanne Koong wrote:
> Do readahead in fuse using iomap. This gives us granular uptodate
> tracking for large folios, which optimizes how much data needs to be
> read in. If some portions of the folio are already uptodate (eg through
> a prior write), we only need to read in the non-uptodate portions.
>
> Signed-off-by: Joanne Koong <joannelkoong@xxxxxxxxx>
> ---
>  fs/fuse/file.c | 214 +++++++++++++++++++++++++++----------------------
>  1 file changed, 118 insertions(+), 96 deletions(-)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index bdfb13cdee4b..1659603f4cb6 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -844,8 +844,73 @@ static const struct iomap_ops fuse_iomap_ops = {
>
>  struct fuse_fill_read_data {
>  	struct file *file;
> +	/*
> +	 * We need to track this because non-readahead requests can't be sent
> +	 * asynchronously.
> +	 */
> +	bool readahead : 1;
> +
> +	/*
> +	 * Fields below are used if sending the read request
> +	 * asynchronously.
> +	 */
> +	struct fuse_conn *fc;
> +	struct readahead_control *rac;
> +	struct fuse_io_args *ia;
> +	unsigned int nr_bytes;
>  };
>
> +/* forward declarations */
> +static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
> +				  unsigned len, struct fuse_args_pages *ap,
> +				  unsigned cur_bytes, bool write);
> +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
> +				unsigned int count, bool async);
> +
> +static int fuse_handle_readahead(struct folio *folio,
> +				 struct fuse_fill_read_data *data, loff_t pos,
> +				 size_t len)
> +{
> +	struct fuse_io_args *ia = data->ia;
> +	size_t off = offset_in_folio(folio, pos);
> +	struct fuse_conn *fc = data->fc;
> +	struct fuse_args_pages *ap;
> +
> +	if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes,
> +					false)) {
> +		fuse_send_readpages(ia, data->file, data->nr_bytes,
> +				    fc->async_read);
> +		data->nr_bytes = 0;
> +		ia = NULL;
> +	}
> +	if (!ia) {
> +		struct readahead_control *rac = data->rac;
> +		unsigned nr_pages = min(fc->max_pages, readahead_count(rac));
> +
> +		if (fc->num_background >= fc->congestion_threshold &&
> +		    rac->ra->async_size >= readahead_count(rac))
> +			/*
> +			 * Congested and only async pages left, so skip the
> +			 * rest.
> +			 */
> +			return -EAGAIN;
> +
> +		data->ia = fuse_io_alloc(NULL, nr_pages);
> +		if (!data->ia)
> +			return -ENOMEM;
> +		ia = data->ia;
> +	}
> +	folio_get(folio);
> +	ap = &ia->ap;
> +	ap->folios[ap->num_folios] = folio;
> +	ap->descs[ap->num_folios].offset = off;
> +	ap->descs[ap->num_folios].length = len;
> +	data->nr_bytes += len;
> +	ap->num_folios++;
> +
> +	return 0;
> +}
> +
>  static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter,
>  					     struct folio *folio, loff_t pos,
>  					     size_t len)
> @@ -855,13 +920,24 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter,
>  	size_t off = offset_in_folio(folio, pos);
>  	int ret;
>
> -	/*
> -	 * for non-readahead read requests, do reads synchronously since
> -	 * it's not guaranteed that the server can handle out-of-order reads
> -	 */
>  	iomap_start_folio_read(folio, len);
> -	ret = fuse_do_readfolio(file, folio, off, len);
> -	iomap_finish_folio_read(folio, off, len, ret);
> +	if (data->readahead) {
> +		ret = fuse_handle_readahead(folio, data, pos, len);
> +		/*
> +		 * If fuse_handle_readahead was successful, fuse_readpages_end
> +		 * will do the iomap_finish_folio_read, else we need to call it
> +		 * here
> +		 */
> +		if (ret)
> +			iomap_finish_folio_read(folio, off, len, ret);
> +	} else {
> +		/*
> +		 * for non-readahead read requests, do reads synchronously since
> +		 * it's not guaranteed that the server can handle out-of-order reads
> +		 */
> +		ret = fuse_do_readfolio(file, folio, off, len);
> +		iomap_finish_folio_read(folio, off, len, ret);
> +	}
>  	return ret;
>  }
>
> @@ -923,7 +999,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
>  	}
>
>  	for (i = 0; i < ap->num_folios; i++) {
> -		folio_end_read(ap->folios[i], !err);
> +		iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset,
> +					ap->descs[i].length, err);
>  		folio_put(ap->folios[i]);
>  	}
>  	if (ia->ff)
> @@ -933,7 +1010,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
>  }
>
>  static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
> -				unsigned int count)
> +				unsigned int count, bool async)
>  {
>  	struct fuse_file *ff = file->private_data;
>  	struct fuse_mount *fm = ff->fm;
> @@ -955,7 +1032,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
>
>  	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
>  	ia->read.attr_ver = fuse_get_attr_version(fm->fc);
> -	if (fm->fc->async_read) {
> +	if (async) {
>  		ia->ff = fuse_file_get(ff);
>  		ap->args.end = fuse_readpages_end;
>  		err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
> @@ -972,81 +1049,20 @@ static void fuse_readahead(struct readahead_control *rac)
>  {
>  	struct inode *inode = rac->mapping->host;
>  	struct fuse_conn *fc = get_fuse_conn(inode);
> -	unsigned int max_pages, nr_pages;
> -	struct folio *folio = NULL;
> +	struct fuse_fill_read_data data = {
> +		.file = rac->file,
> +		.readahead = true,
> +		.fc = fc,
> +		.rac = rac,
> +	};
>
>  	if (fuse_is_bad(inode))
>  		return;
>
> -	max_pages = min_t(unsigned int, fc->max_pages,
> -			  fc->max_read / PAGE_SIZE);
> -
> -	/*
> -	 * This is only accurate the first time through, since readahead_folio()
> -	 * doesn't update readahead_count() from the previous folio until the
> -	 * next call. Grab nr_pages here so we know how many pages we're going
> -	 * to have to process. This means that we will exit here with
> -	 * readahead_count() == folio_nr_pages(last_folio), but we will have
> -	 * consumed all of the folios, and read_pages() will call
> -	 * readahead_folio() again which will clean up the rac.
> -	 */
> -	nr_pages = readahead_count(rac);
> -
> -	while (nr_pages) {
> -		struct fuse_io_args *ia;
> -		struct fuse_args_pages *ap;
> -		unsigned cur_pages = min(max_pages, nr_pages);
> -		unsigned int pages = 0;
> -
> -		if (fc->num_background >= fc->congestion_threshold &&
> -		    rac->ra->async_size >= readahead_count(rac))
> -			/*
> -			 * Congested and only async pages left, so skip the
> -			 * rest.
> -			 */
> -			break;
> -
> -		ia = fuse_io_alloc(NULL, cur_pages);
> -		if (!ia)
> -			break;
> -		ap = &ia->ap;
> -
> -		while (pages < cur_pages) {
> -			unsigned int folio_pages;
> -
> -			/*
> -			 * This returns a folio with a ref held on it.
> -			 * The ref needs to be held until the request is
> -			 * completed, since the splice case (see
> -			 * fuse_try_move_page()) drops the ref after it's
> -			 * replaced in the page cache.
> -			 */
> -			if (!folio)
> -				folio = __readahead_folio(rac);
> -
> -			folio_pages = folio_nr_pages(folio);
> -			if (folio_pages > cur_pages - pages) {
> -				/*
> -				 * Large folios belonging to fuse will never
> -				 * have more pages than max_pages.
> -				 */
> -				WARN_ON(!pages);
> -				break;
> -			}
> -
> -			ap->folios[ap->num_folios] = folio;
> -			ap->descs[ap->num_folios].length = folio_size(folio);
> -			ap->num_folios++;
> -			pages += folio_pages;
> -			folio = NULL;
> -		}
> -		fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT);
> -		nr_pages -= pages;
> -	}
> -	if (folio) {
> -		folio_end_read(folio, false);
> -		folio_put(folio);
> -	}
> +	iomap_readahead(rac, &fuse_iomap_ops, &fuse_iomap_read_ops, &data);
> +	if (data.ia)
> +		fuse_send_readpages(data.ia, data.file, data.nr_bytes,
> +				    fc->async_read);
>  }
>
>  static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
> @@ -2077,7 +2093,7 @@ struct fuse_fill_wb_data {
>  	struct fuse_file *ff;
>  	unsigned int max_folios;
>  	/*
> -	 * nr_bytes won't overflow since fuse_writepage_need_send() caps
> +	 * nr_bytes won't overflow since fuse_folios_need_send() caps
>  	 * wb requests to never exceed fc->max_pages (which has an upper bound
>  	 * of U16_MAX).
>  	 */
> @@ -2122,14 +2138,15 @@ static void fuse_writepages_send(struct inode *inode,
>  	spin_unlock(&fi->lock);
>  }
>
> -static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
> -				     unsigned len, struct fuse_args_pages *ap,
> -				     struct fuse_fill_wb_data *data)
> +static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
> +				  unsigned len, struct fuse_args_pages *ap,
> +				  unsigned cur_bytes, bool write)
>  {
>  	struct folio *prev_folio;
>  	struct fuse_folio_desc prev_desc;
> -	unsigned bytes = data->nr_bytes + len;
> +	unsigned bytes = cur_bytes + len;
>  	loff_t prev_pos;
> +	size_t max_bytes = write ? fc->max_write : fc->max_read;
>
>  	WARN_ON(!ap->num_folios);
>
> @@ -2137,8 +2154,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
>  	if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages)
>  		return true;
>
> -	/* Reached max write bytes */
> -	if (bytes > fc->max_write)
> +	if (bytes > max_bytes)
>  		return true;
>
>  	/* Discontinuity */
> @@ -2148,11 +2164,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
>  	if (prev_pos != pos)
>  		return true;
>
> -	/* Need to grow the pages array?  If so, did the expansion fail? */
> -	if (ap->num_folios == data->max_folios &&
> -	    !fuse_pages_realloc(data, fc->max_pages))
> -		return true;
> -
>  	return false;
>  }
>
> @@ -2176,10 +2187,21 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
>  		return -EIO;
>  	}
>
> -	if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) {
> -		fuse_writepages_send(inode, data);
> -		data->wpa = NULL;
> -		data->nr_bytes = 0;
> +	if (wpa) {
> +		bool send = fuse_folios_need_send(fc, pos, len, ap, data->nr_bytes,
> +						  true);
> +
> +		if (!send) {
> +			/* Need to grow the pages array? If so, did the expansion fail? */
> +			send = (ap->num_folios == data->max_folios) &&
> +				!fuse_pages_realloc(data, fc->max_pages);
> +		}

What purpose does this code relocation serve?  I gather the idea here is
that writes need to reallocate the pages array, whereas readahead can
simply constrain itself to whatever's already allocated?

--D

> +
> +		if (send) {
> +			fuse_writepages_send(inode, data);
> +			data->wpa = NULL;
> +			data->nr_bytes = 0;
> +		}
>  	}
>
>  	if (data->wpa == NULL) {
> --
> 2.47.3
>
>
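(For anyone following the thread: below is a quick userspace model of the
flush rule discussed above, since the write/read asymmetry is easier to see
in isolation. To be clear about what is invented: need_send() and struct
conn_limits are illustrative stand-ins, not kernel types; only the three
flush conditions mirror fuse_folios_need_send(). The relocated
fuse_pages_realloc() attempt is the extra step the write path takes before
concluding send = true; the read path never grows its folio array.)

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

/* Invented stand-in for the fuse_conn limits consulted by the check. */
struct conn_limits {
	unsigned max_pages;	/* folio budget per request */
	size_t max_write;	/* byte budget for WRITE requests */
	size_t max_read;	/* byte budget for READ requests */
};

/*
 * Models fuse_folios_need_send(): flush the pending request before
 * adding [pos, pos+len) if the result would overflow the folio budget,
 * overflow the byte budget (max_write for writeback, max_read for
 * readahead), or be discontiguous with the previously queued range.
 */
static bool need_send(const struct conn_limits *fc, long long pos,
		      unsigned len, unsigned cur_bytes, long long prev_end,
		      bool write)
{
	size_t bytes = (size_t)cur_bytes + len;
	size_t max_bytes = write ? fc->max_write : fc->max_read;

	if ((bytes + PAGE_SIZE - 1) / PAGE_SIZE > fc->max_pages)
		return true;
	if (bytes > max_bytes)
		return true;
	if (prev_end != pos)
		return true;
	return false;
}

int main(void)
{
	struct conn_limits fc = {
		.max_pages = 32,
		.max_write = 128 * 1024,
		.max_read = 64 * 1024,
	};

	/* 64k already queued, next 4k is contiguous: full for reads only. */
	printf("readahead must flush: %d\n",
	       need_send(&fc, 65536, 4096, 65536, 65536, false));
	printf("writeback must flush: %d\n",
	       need_send(&fc, 65536, 4096, 65536, 65536, true));
	return 0;
}

This prints 1 for the readahead case and 0 for the writeback case. It also
suggests why the realloc check could not stay inside the now-shared helper:
it operates on the writeback-specific struct fuse_fill_wb_data, which the
readahead caller does not have.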