On 25/07/04 11:13AM, Amir Goldstein wrote: > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@xxxxxxxxxx> wrote: > > > > This commit fills in read/write/mmap handling for famfs files. The > > dev_dax_iomap interface is used - just like xfs in fs-dax mode. > > > > Signed-off-by: John Groves <john@xxxxxxxxxx> > > --- > > fs/fuse/famfs.c | 436 +++++++++++++++++++++++++++++++++++++++++++++++ > > fs/fuse/file.c | 14 ++ > > fs/fuse/fuse_i.h | 3 + > > 3 files changed, 453 insertions(+) > > > > diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c > > index f5e01032b825..1973eb10b60b 100644 > > --- a/fs/fuse/famfs.c > > +++ b/fs/fuse/famfs.c > > @@ -585,3 +585,439 @@ famfs_file_init_dax( > > return rc; > > } > > > > +/********************************************************************* > > + * iomap_operations > > + * > > + * This stuff uses the iomap (dax-related) helpers to resolve file offsets to > > + * offsets within a dax device. > > + */ > > + > > +static ssize_t famfs_file_bad(struct inode *inode); > > + > > +static int > > +famfs_interleave_fileofs_to_daxofs(struct inode *inode, struct iomap *iomap, > > + loff_t file_offset, off_t len, unsigned int flags) > > +{ > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + struct famfs_file_meta *meta = fi->famfs_meta; > > + struct fuse_conn *fc = get_fuse_conn(inode); > > + loff_t local_offset = file_offset; > > + int i; > > + > > + /* This function is only for extent_type INTERLEAVED_EXTENT */ > > + if (meta->fm_extent_type != INTERLEAVED_EXTENT) { > > + pr_err("%s: bad extent type\n", __func__); > > + goto err_out; > > + } > > + > > + if (famfs_file_bad(inode)) > > + goto err_out; > > + > > + iomap->offset = file_offset; > > + > > + for (i = 0; i < meta->fm_niext; i++) { > > + struct famfs_meta_interleaved_ext *fei = &meta->ie[i]; > > + u64 chunk_size = fei->fie_chunk_size; > > + u64 nstrips = fei->fie_nstrips; > > + u64 ext_size = fei->fie_nbytes; > > + > > + ext_size = min_t(u64, ext_size, meta->file_size); > > + > > + if (ext_size == 0) { > > + pr_err("%s: ext_size=%lld file_size=%ld\n", > > + __func__, fei->fie_nbytes, meta->file_size); > > + goto err_out; > > + } > > + > > + /* Is the data is in this striped extent? */ > > + if (local_offset < ext_size) { > > + u64 chunk_num = local_offset / chunk_size; > > + u64 chunk_offset = local_offset % chunk_size; > > + u64 stripe_num = chunk_num / nstrips; > > + u64 strip_num = chunk_num % nstrips; > > + u64 chunk_remainder = chunk_size - chunk_offset; > > + u64 strip_offset = chunk_offset + (stripe_num * chunk_size); > > + u64 strip_dax_ofs = fei->ie_strips[strip_num].ext_offset; > > + u64 strip_devidx = fei->ie_strips[strip_num].dev_index; > > + > > + if (!fc->dax_devlist->devlist[strip_devidx].valid) { > > + pr_err("%s: daxdev=%lld invalid\n", __func__, > > + strip_devidx); > > + goto err_out; > > + } > > + iomap->addr = strip_dax_ofs + strip_offset; > > + iomap->offset = file_offset; > > + iomap->length = min_t(loff_t, len, chunk_remainder); > > + > > + iomap->dax_dev = fc->dax_devlist->devlist[strip_devidx].devp; > > + > > + iomap->type = IOMAP_MAPPED; > > + iomap->flags = flags; > > + > > + return 0; > > + } > > + local_offset -= ext_size; /* offset is beyond this striped extent */ > > + } > > + > > + err_out: > > + pr_err("%s: err_out\n", __func__); > > + > > + /* We fell out the end of the extent list. > > + * Set iomap to zero length in this case, and return 0 > > + * This just means that the r/w is past EOF > > + */ > > + iomap->addr = 0; /* there is no valid dax device offset */ > > + iomap->offset = file_offset; /* file offset */ > > + iomap->length = 0; /* this had better result in no access to dax mem */ > > + iomap->dax_dev = NULL; > > + iomap->type = IOMAP_MAPPED; > > + iomap->flags = flags; > > + > > + return 0; > > +} > > + > > +/** > > + * famfs_fileofs_to_daxofs() - Resolve (file, offset, len) to (daxdev, offset, len) > > + * > > + * This function is called by famfs_fuse_iomap_begin() to resolve an offset in a > > + * file to an offset in a dax device. This is upcalled from dax from calls to > > + * both * dax_iomap_fault() and dax_iomap_rw(). Dax finishes the job resolving > > + * a fault to a specific physical page (the fault case) or doing a memcpy > > + * variant (the rw case) > > + * > > + * Pages can be PTE (4k), PMD (2MiB) or (theoretically) PuD (1GiB) > > + * (these sizes are for X86; may vary on other cpu architectures > > + * > > + * @inode: The file where the fault occurred > > + * @iomap: To be filled in to indicate where to find the right memory, > > + * relative to a dax device. > > + * @file_offset: Within the file where the fault occurred (will be page boundary) > > + * @len: The length of the faulted mapping (will be a page multiple) > > + * (will be trimmed in *iomap if it's disjoint in the extent list) > > + * @flags: > > + * > > + * Return values: 0. (info is returned in a modified @iomap struct) > > + */ > > +static int > > +famfs_fileofs_to_daxofs(struct inode *inode, struct iomap *iomap, > > + loff_t file_offset, off_t len, unsigned int flags) > > +{ > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + struct famfs_file_meta *meta = fi->famfs_meta; > > + struct fuse_conn *fc = get_fuse_conn(inode); > > + loff_t local_offset = file_offset; > > + int i; > > + > > + if (!fc->dax_devlist) { > > + pr_err("%s: null dax_devlist\n", __func__); > > + goto err_out; > > + } > > + > > + if (famfs_file_bad(inode)) > > + goto err_out; > > + > > + if (meta->fm_extent_type == INTERLEAVED_EXTENT) > > + return famfs_interleave_fileofs_to_daxofs(inode, iomap, > > + file_offset, > > + len, flags); > > + > > + iomap->offset = file_offset; > > + > > + for (i = 0; i < meta->fm_nextents; i++) { > > + /* TODO: check devindex too */ > > + loff_t dax_ext_offset = meta->se[i].ext_offset; > > + loff_t dax_ext_len = meta->se[i].ext_len; > > + u64 daxdev_idx = meta->se[i].dev_index; > > + > > + if ((dax_ext_offset == 0) && > > + (meta->file_type != FAMFS_SUPERBLOCK)) > > + pr_warn("%s: zero offset on non-superblock file!!\n", > > + __func__); > > + > > + /* local_offset is the offset minus the size of extents skipped > > + * so far; If local_offset < dax_ext_len, the data of interest > > + * starts in this extent > > + */ > > + if (local_offset < dax_ext_len) { > > + loff_t ext_len_remainder = dax_ext_len - local_offset; > > + struct famfs_daxdev *dd; > > + > > + dd = &fc->dax_devlist->devlist[daxdev_idx]; > > + > > + if (!dd->valid || dd->error) { > > + pr_err("%s: daxdev=%lld %s\n", __func__, > > + daxdev_idx, > > + dd->valid ? "error" : "invalid"); > > + goto err_out; > > + } > > + > > + /* > > + * OK, we found the file metadata extent where this > > + * data begins > > + * @local_offset - The offset within the current > > + * extent > > + * @ext_len_remainder - Remaining length of ext after > > + * skipping local_offset > > + * Outputs: > > + * iomap->addr: the offset within the dax device where > > + * the data starts > > + * iomap->offset: the file offset > > + * iomap->length: the valid length resolved here > > + */ > > + iomap->addr = dax_ext_offset + local_offset; > > + iomap->offset = file_offset; > > + iomap->length = min_t(loff_t, len, ext_len_remainder); > > + > > + iomap->dax_dev = fc->dax_devlist->devlist[daxdev_idx].devp; > > + > > + iomap->type = IOMAP_MAPPED; > > + iomap->flags = flags; > > + return 0; > > + } > > + local_offset -= dax_ext_len; /* Get ready for the next extent */ > > + } > > + > > + err_out: > > + pr_err("%s: err_out\n", __func__); > > + > > + /* We fell out the end of the extent list. > > + * Set iomap to zero length in this case, and return 0 > > + * This just means that the r/w is past EOF > > + */ > > + iomap->addr = 0; /* there is no valid dax device offset */ > > + iomap->offset = file_offset; /* file offset */ > > + iomap->length = 0; /* this had better result in no access to dax mem */ > > + iomap->dax_dev = NULL; > > + iomap->type = IOMAP_MAPPED; > > + iomap->flags = flags; > > + > > + return 0; > > +} > > + > > +/** > > + * famfs_fuse_iomap_begin() - Handler for iomap_begin upcall from dax > > + * > > + * This function is pretty simple because files are > > + * * never partially allocated > > + * * never have holes (never sparse) > > + * * never "allocate on write" > > + * > > + * @inode: inode for the file being accessed > > + * @offset: offset within the file > > + * @length: Length being accessed at offset > > + * @flags: > > + * @iomap: iomap struct to be filled in, resolving (offset, length) to > > + * (daxdev, offset, len) > > + * @srcmap: > > + */ > > +static int > > +famfs_fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, > > + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) > > +{ > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + struct famfs_file_meta *meta = fi->famfs_meta; > > + size_t size; > > + > > + size = i_size_read(inode); > > + > > + WARN_ON(size != meta->file_size); > > + > > + return famfs_fileofs_to_daxofs(inode, iomap, offset, length, flags); > > +} > > + > > +/* Note: We never need a special set of write_iomap_ops because famfs never > > + * performs allocation on write. > > + */ > > +const struct iomap_ops famfs_iomap_ops = { > > + .iomap_begin = famfs_fuse_iomap_begin, > > +}; > > + > > +/********************************************************************* > > + * vm_operations > > + */ > > +static vm_fault_t > > +__famfs_fuse_filemap_fault(struct vm_fault *vmf, unsigned int pe_size, > > + bool write_fault) > > +{ > > + struct inode *inode = file_inode(vmf->vma->vm_file); > > + vm_fault_t ret; > > + pfn_t pfn; > > + > > + if (!IS_DAX(file_inode(vmf->vma->vm_file))) { > > + pr_err("%s: file not marked IS_DAX!!\n", __func__); > > + return VM_FAULT_SIGBUS; > > + } > > + > > + if (write_fault) { > > + sb_start_pagefault(inode->i_sb); > > + file_update_time(vmf->vma->vm_file); > > + } > > + > > + ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &famfs_iomap_ops); > > + if (ret & VM_FAULT_NEEDDSYNC) > > + ret = dax_finish_sync_fault(vmf, pe_size, pfn); > > + > > + if (write_fault) > > + sb_end_pagefault(inode->i_sb); > > + > > + return ret; > > +} > > + > > +static inline bool > > +famfs_is_write_fault(struct vm_fault *vmf) > > +{ > > + return (vmf->flags & FAULT_FLAG_WRITE) && > > + (vmf->vma->vm_flags & VM_SHARED); > > +} > > + > > +static vm_fault_t > > +famfs_filemap_fault(struct vm_fault *vmf) > > +{ > > + return __famfs_fuse_filemap_fault(vmf, 0, famfs_is_write_fault(vmf)); > > +} > > + > > +static vm_fault_t > > +famfs_filemap_huge_fault(struct vm_fault *vmf, unsigned int pe_size) > > +{ > > + return __famfs_fuse_filemap_fault(vmf, pe_size, famfs_is_write_fault(vmf)); > > +} > > + > > +static vm_fault_t > > +famfs_filemap_page_mkwrite(struct vm_fault *vmf) > > +{ > > + return __famfs_fuse_filemap_fault(vmf, 0, true); > > +} > > + > > +static vm_fault_t > > +famfs_filemap_pfn_mkwrite(struct vm_fault *vmf) > > +{ > > + return __famfs_fuse_filemap_fault(vmf, 0, true); > > +} > > + > > +static vm_fault_t > > +famfs_filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, > > + pgoff_t end_pgoff) > > +{ > > + return filemap_map_pages(vmf, start_pgoff, end_pgoff); > > +} > > + > > +const struct vm_operations_struct famfs_file_vm_ops = { > > + .fault = famfs_filemap_fault, > > + .huge_fault = famfs_filemap_huge_fault, > > + .map_pages = famfs_filemap_map_pages, > > + .page_mkwrite = famfs_filemap_page_mkwrite, > > + .pfn_mkwrite = famfs_filemap_pfn_mkwrite, > > +}; > > + > > +/********************************************************************* > > + * file_operations > > + */ > > + > > +/** > > + * famfs_file_bad() - Check for files that aren't in a valid state > > + * > > + * @inode - inode > > + * > > + * Returns: 0=success > > + * -errno=failure > > + */ > > +static ssize_t > > +famfs_file_bad(struct inode *inode) > > +{ > > + struct fuse_inode *fi = get_fuse_inode(inode); > > + struct famfs_file_meta *meta = fi->famfs_meta; > > + size_t i_size = i_size_read(inode); > > + > > + if (!meta) { > > + pr_err("%s: un-initialized famfs file\n", __func__); > > + return -EIO; > > + } > > + if (meta->error) { > > + pr_debug("%s: previously detected metadata errors\n", __func__); > > + return -EIO; > > + } > > + if (i_size != meta->file_size) { > > + pr_warn("%s: i_size overwritten from %ld to %ld\n", > > + __func__, meta->file_size, i_size); > > + meta->error = true; > > + return -ENXIO; > > + } > > + if (!IS_DAX(inode)) { > > + pr_debug("%s: inode %llx IS_DAX is false\n", __func__, (u64)inode); > > + return -ENXIO; > > + } > > + return 0; > > +} > > + > > +static ssize_t > > +famfs_fuse_rw_prep(struct kiocb *iocb, struct iov_iter *ubuf) > > +{ > > + struct inode *inode = iocb->ki_filp->f_mapping->host; > > + size_t i_size = i_size_read(inode); > > + size_t count = iov_iter_count(ubuf); > > + size_t max_count; > > + ssize_t rc; > > + > > + rc = famfs_file_bad(inode); > > + if (rc) > > + return rc; > > + > > + max_count = max_t(size_t, 0, i_size - iocb->ki_pos); > > + > > + if (count > max_count) > > + iov_iter_truncate(ubuf, max_count); > > + > > + if (!iov_iter_count(ubuf)) > > + return 0; > > + > > + return rc; > > +} > > + > > +ssize_t > > +famfs_fuse_read_iter(struct kiocb *iocb, struct iov_iter *to) > > +{ > > + ssize_t rc; > > + > > + rc = famfs_fuse_rw_prep(iocb, to); > > + if (rc) > > + return rc; > > + > > + if (!iov_iter_count(to)) > > + return 0; > > + > > + rc = dax_iomap_rw(iocb, to, &famfs_iomap_ops); > > + > > + file_accessed(iocb->ki_filp); > > + return rc; > > +} > > + > > +ssize_t > > +famfs_fuse_write_iter(struct kiocb *iocb, struct iov_iter *from) > > +{ > > + ssize_t rc; > > + > > + rc = famfs_fuse_rw_prep(iocb, from); > > + if (rc) > > + return rc; > > + > > + if (!iov_iter_count(from)) > > + return 0; > > + > > + return dax_iomap_rw(iocb, from, &famfs_iomap_ops); > > +} > > + > > +int > > +famfs_fuse_mmap(struct file *file, struct vm_area_struct *vma) > > +{ > > + struct inode *inode = file_inode(file); > > + ssize_t rc; > > + > > + rc = famfs_file_bad(inode); > > + if (rc) > > + return (int)rc; > > + > > + file_accessed(file); > > + vma->vm_ops = &famfs_file_vm_ops; > > + vm_flags_set(vma, VM_HUGEPAGE); > > + return 0; > > +} > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > index 5d205eadb48f..24a14b176510 100644 > > --- a/fs/fuse/file.c > > +++ b/fs/fuse/file.c > > @@ -1874,6 +1874,8 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) > > > > if (FUSE_IS_VIRTIO_DAX(fi)) > > return fuse_dax_read_iter(iocb, to); > > + if (fuse_file_famfs(fi)) > > + return famfs_fuse_read_iter(iocb, to); > > > > /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ > > if (ff->open_flags & FOPEN_DIRECT_IO) > > @@ -1896,6 +1898,8 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) > > > > if (FUSE_IS_VIRTIO_DAX(fi)) > > return fuse_dax_write_iter(iocb, from); > > + if (fuse_file_famfs(fi)) > > + return famfs_fuse_write_iter(iocb, from); > > > > /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ > > if (ff->open_flags & FOPEN_DIRECT_IO) > > @@ -1911,10 +1915,14 @@ static ssize_t fuse_splice_read(struct file *in, loff_t *ppos, > > unsigned int flags) > > { > > struct fuse_file *ff = in->private_data; > > + struct inode *inode = file_inode(in); > > + struct fuse_inode *fi = get_fuse_inode(inode); > > > > /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ > > if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) > > return fuse_passthrough_splice_read(in, ppos, pipe, len, flags); > > + else if (fuse_file_famfs(fi)) > > + return -EIO; /* direct I/O doesn't make sense in dax_iomap */ > > else > > return filemap_splice_read(in, ppos, pipe, len, flags); > > } > > @@ -1923,10 +1931,14 @@ static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, > > loff_t *ppos, size_t len, unsigned int flags) > > { > > struct fuse_file *ff = out->private_data; > > + struct inode *inode = file_inode(out); > > + struct fuse_inode *fi = get_fuse_inode(inode); > > > > /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ > > if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) > > return fuse_passthrough_splice_write(pipe, out, ppos, len, flags); > > + else if (fuse_file_famfs(fi)) > > + return -EIO; /* direct I/O doesn't make sense in dax_iomap */ > > else > > return iter_file_splice_write(pipe, out, ppos, len, flags); > > } > > This looks odd. > > Usually, the methods first check for FUSE_IS_VIRTIO_DAX() and > fuse_file_famfs() to get this condition out of the way so I never needed > to think about whether or not the code verifies that fuse_file_passthrough() > and fuse_file_famfs() cannot co-exist. > > Is there a reason why you did not follow the same pattern here? I think I just got a little sloppy. I'll do the famfs test first. Unless we can rule out this path for famfs. But unlike virtiofs, famfs doesn't have separate file_operations, so I suppose it must be checked. > > Also, your comment makes no sense. > splice is not the case of direct I/O - quite the contrary. Sorry, brain fart on the comment. Splice doesn't make sense for famfs because famfs doesn't use the page cache. Will fix the comment too. > > Thanks, > Amir. Thank you Amir! John