On Wed, May 14, 2025 at 04:42:17PM -0700, Ackerley Tng wrote:
> +static int kvm_gmem_convert_execute_work(struct inode *inode,
> +					 struct conversion_work *work,
> +					 bool to_shared)
> +{
> +	enum shareability m;
> +	int ret;
> +
> +	m = to_shared ? SHAREABILITY_ALL : SHAREABILITY_GUEST;
> +	ret = kvm_gmem_shareability_apply(inode, work, m);
> +	if (ret)
> +		return ret;
> +	/*
> +	 * Apply shareability first so split/merge can operate on new
> +	 * shareability state.
> +	 */
> +	ret = kvm_gmem_restructure_folios_in_range(
> +		inode, work->start, work->nr_pages, to_shared);
> +
> +	return ret;
> +}
> +
>  static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
>  				  size_t nr_pages, bool shared,
>  				  pgoff_t *error_index)
> @@ -371,18 +539,21 @@ static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
>
>  	list_for_each_entry(work, &work_list, list) {
>  		rollback_stop_item = work;
> -		ret = kvm_gmem_shareability_apply(inode, work, m);
> +
> +		ret = kvm_gmem_convert_execute_work(inode, work, shared);
>  		if (ret)
>  			break;
>  	}
>
>  	if (ret) {
> -		m = shared ? SHAREABILITY_GUEST : SHAREABILITY_ALL;
>  		list_for_each_entry(work, &work_list, list) {
> +			int r;
> +
> +			r = kvm_gmem_convert_execute_work(inode, work, !shared);
> +			WARN_ON(r);
> +
>  			if (work == rollback_stop_item)
>  				break;
> -
> -			WARN_ON(kvm_gmem_shareability_apply(inode, work, m));

Could kvm_gmem_shareability_apply() fail here?

>  		}
>  	}
>
> @@ -434,6 +605,277 @@ static int kvm_gmem_ioctl_convert_range(struct file *file,
>  	return ret;
>  }
>
> +#ifdef CONFIG_KVM_GMEM_HUGETLB
> +
> +static inline void __filemap_remove_folio_for_restructuring(struct folio *folio)
> +{
> +	struct address_space *mapping = folio->mapping;
> +
> +	spin_lock(&mapping->host->i_lock);
> +	xa_lock_irq(&mapping->i_pages);
> +
> +	__filemap_remove_folio(folio, NULL);
> +
> +	xa_unlock_irq(&mapping->i_pages);
> +	spin_unlock(&mapping->host->i_lock);
> +}
> +
> +/**
> + * filemap_remove_folio_for_restructuring() - Remove @folio from filemap for
> + *                                            split/merge.
> + *
> + * @folio: the folio to be removed.
> + *
> + * Similar to filemap_remove_folio(), but skips LRU-related calls (meaningless
> + * for guest_memfd), and skips call to ->free_folio() to maintain folio flags.
> + *
> + * Context: Expects only the filemap's refcounts to be left on the folio. Will
> + *          freeze these refcounts away so that no other users will interfere
> + *          with restructuring.
> + */
> +static inline void filemap_remove_folio_for_restructuring(struct folio *folio)
> +{
> +	int filemap_refcount;
> +
> +	filemap_refcount = folio_nr_pages(folio);
> +	while (!folio_ref_freeze(folio, filemap_refcount)) {
> +		/*
> +		 * At this point only filemap refcounts are expected, hence okay
> +		 * to spin until speculative refcounts go away.
> +		 */
> +		WARN_ONCE(1, "Spinning on folio=%p refcount=%d", folio, folio_ref_count(folio));
> +	}
> +
> +	folio_lock(folio);
> +	__filemap_remove_folio_for_restructuring(folio);
> +	folio_unlock(folio);
> +}
> +
> +/**
> + * kvm_gmem_split_folio_in_filemap() - Split @folio within filemap in @inode.
> + *
> + * @inode: inode containing the folio.
> + * @folio: folio to be split.
> + *
> + * Split a folio into folios of size PAGE_SIZE. Will clean up folio from filemap
> + * and add back the split folios.
> + *
> + * Context: Expects that before this call, folio's refcount is just the
> + *          filemap's refcounts. After this function returns, the split folios'
> + *          refcounts will also be filemap's refcounts.
> + * Return: 0 on success or negative error otherwise.
> + */
> +static int kvm_gmem_split_folio_in_filemap(struct inode *inode, struct folio *folio)
> +{
> +	size_t orig_nr_pages;
> +	pgoff_t orig_index;
> +	size_t i, j;
> +	int ret;
> +
> +	orig_nr_pages = folio_nr_pages(folio);
> +	if (orig_nr_pages == 1)
> +		return 0;
> +
> +	orig_index = folio->index;
> +
> +	filemap_remove_folio_for_restructuring(folio);
> +
> +	ret = kvm_gmem_allocator_ops(inode)->split_folio(folio);
> +	if (ret)
> +		goto err;
> +
> +	for (i = 0; i < orig_nr_pages; ++i) {
> +		struct folio *f = page_folio(folio_page(folio, i));
> +
> +		ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, f,
> +						   orig_index + i);

Why does a failure of __kvm_gmem_filemap_add_folio() here lead to a
rollback, while a failure of the call in the rollback path below only
triggers a WARN_ON()?

> +		if (ret)
> +			goto rollback;
> +	}
> +
> +	return ret;
> +
> +rollback:
> +	for (j = 0; j < i; ++j) {
> +		struct folio *f = page_folio(folio_page(folio, j));
> +
> +		filemap_remove_folio_for_restructuring(f);
> +	}
> +
> +	kvm_gmem_allocator_ops(inode)->merge_folio(folio);
> +err:
> +	WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, folio, orig_index));
> +
> +	return ret;
> +}
> +
> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
> +						       struct folio *folio)
> +{
> +	size_t to_nr_pages;
> +	void *priv;
> +
> +	if (!kvm_gmem_has_custom_allocator(inode))
> +		return 0;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_page(priv);
> +
> +	if (kvm_gmem_has_some_shared(inode, folio->index, to_nr_pages))
> +		return kvm_gmem_split_folio_in_filemap(inode, folio);
> +
> +	return 0;
> +}
> +
> +/**
> + * kvm_gmem_merge_folio_in_filemap() - Merge @first_folio within filemap in
> + *                                     @inode.
> + *
> + * @inode: inode containing the folio.
> + * @first_folio: first folio among folios to be merged.
> + *
> + * Will clean up subfolios from filemap and add back the merged folio.
> + *
> + * Context: Expects that before this call, all subfolios only have filemap
> + *          refcounts. After this function returns, the merged folio will only
> + *          have filemap refcounts.
> + * Return: 0 on success or negative error otherwise.
> + */
> +static int kvm_gmem_merge_folio_in_filemap(struct inode *inode,
> +					   struct folio *first_folio)
> +{
> +	size_t to_nr_pages;
> +	pgoff_t index;
> +	void *priv;
> +	size_t i;
> +	int ret;
> +
> +	index = first_folio->index;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> +	if (folio_nr_pages(first_folio) == to_nr_pages)
> +		return 0;
> +
> +	for (i = 0; i < to_nr_pages; ++i) {
> +		struct folio *f = page_folio(folio_page(first_folio, i));
> +
> +		filemap_remove_folio_for_restructuring(f);
> +	}
> +
> +	kvm_gmem_allocator_ops(inode)->merge_folio(first_folio);
> +
> +	ret = __kvm_gmem_filemap_add_folio(inode->i_mapping, first_folio, index);
> +	if (ret)
> +		goto err_split;
> +
> +	return ret;
> +
> +err_split:
> +	WARN_ON(kvm_gmem_allocator_ops(inode)->split_folio(first_folio));

guestmem_hugetlb_split_folio() can fail here. e.g., after the stash has
been freed by guestmem_hugetlb_unstash_free_metadata() in
guestmem_hugetlb_merge_folio(), the stash allocation in
guestmem_hugetlb_stash_metadata(), called from
guestmem_hugetlb_split_folio(), can return -ENOMEM.
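
To spell out the chain I have in mind (the excerpt is from this patch;
the comments are only my reading of it, not verified):

err_split:
	/*
	 * By this point ->merge_folio() (guestmem_hugetlb_merge_folio()) has
	 * already freed the stashed metadata via
	 * guestmem_hugetlb_unstash_free_metadata(), so ->split_folio() has to
	 * re-stash it, and guestmem_hugetlb_stash_metadata() can return
	 * -ENOMEM. The WARN_ON() drops that error, yet the loop quoted below
	 * still re-adds the subfolios to the filemap as if the split had
	 * succeeded.
	 */
	WARN_ON(kvm_gmem_allocator_ops(inode)->split_folio(first_folio));

If so, this error seems like it needs to be handled (or the rollback
made infallible) rather than WARN()ed away.
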
> +	for (i = 0; i < to_nr_pages; ++i) {
> +		struct folio *f = page_folio(folio_page(first_folio, i));
> +
> +		WARN_ON(__kvm_gmem_filemap_add_folio(inode->i_mapping, f, index + i));
> +	}
> +
> +	return ret;
> +}
> +
> +static inline int kvm_gmem_try_merge_folio_in_filemap(struct inode *inode,
> +						       struct folio *first_folio)
> +{
> +	size_t to_nr_pages;
> +	void *priv;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> +
> +	if (kvm_gmem_has_some_shared(inode, first_folio->index, to_nr_pages))
> +		return 0;
> +
> +	return kvm_gmem_merge_folio_in_filemap(inode, first_folio);
> +}
> +
> +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
> +						pgoff_t start, size_t nr_pages,
> +						bool is_split_operation)
> +{
> +	size_t to_nr_pages;
> +	pgoff_t index;
> +	pgoff_t end;
> +	void *priv;
> +	int ret;
> +
> +	if (!kvm_gmem_has_custom_allocator(inode))
> +		return 0;
> +
> +	end = start + nr_pages;
> +
> +	/* Round to allocator page size, to check all (huge) pages in range. */
> +	priv = kvm_gmem_allocator_private(inode);
> +	to_nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> +
> +	start = round_down(start, to_nr_pages);
> +	end = round_up(end, to_nr_pages);
> +
> +	for (index = start; index < end; index += to_nr_pages) {
> +		struct folio *f;
> +
> +		f = filemap_get_folio(inode->i_mapping, index);
> +		if (IS_ERR(f))
> +			continue;
> +
> +		/* Leave just filemap's refcounts on the folio. */
> +		folio_put(f);
> +
> +		if (is_split_operation)
> +			ret = kvm_gmem_split_folio_in_filemap(inode, f);

Should this be kvm_gmem_try_split_folio_in_filemap()?

> +		else
> +			ret = kvm_gmem_try_merge_folio_in_filemap(inode, f);
> +
> +		if (ret)
> +			goto rollback;
> +	}
> +	return ret;
> +
> +rollback:
> +	for (index -= to_nr_pages; index >= start; index -= to_nr_pages) {
> +		struct folio *f;
> +
> +		f = filemap_get_folio(inode->i_mapping, index);
> +		if (IS_ERR(f))
> +			continue;
> +
> +		/* Leave just filemap's refcounts on the folio. */
> +		folio_put(f);
> +
> +		if (is_split_operation)
> +			WARN_ON(kvm_gmem_merge_folio_in_filemap(inode, f));
> +		else
> +			WARN_ON(kvm_gmem_split_folio_in_filemap(inode, f));

Is it safe to just leave WARN_ON()s in the rollback case?

Besides, aren't the kvm_gmem_merge_folio_in_filemap() and
kvm_gmem_split_folio_in_filemap() calls here duplicated by the
kvm_gmem_split_folio_in_filemap() and kvm_gmem_try_merge_folio_in_filemap()
calls made from the subsequent
"r = kvm_gmem_convert_execute_work(inode, work, !shared)"?

> +	}
> +
> +	return ret;
> +}
> +
> +#else
> +
> +static inline int kvm_gmem_try_split_folio_in_filemap(struct inode *inode,
> +						       struct folio *folio)
> +{
> +	return 0;
> +}
> +
> +static int kvm_gmem_restructure_folios_in_range(struct inode *inode,
> +						pgoff_t start, size_t nr_pages,
> +						bool is_split_operation)
> +{
> +	return 0;
> +}
> +
> +#endif
> +
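
For that second question, to make the possible overlap concrete, this is
how I read the failure path (rough call sketch only, using the names from
this patch, so please correct me if I'm misreading the rollback):

	kvm_gmem_convert_range()
	  kvm_gmem_convert_execute_work(inode, work, shared)
	    kvm_gmem_shareability_apply()
	    kvm_gmem_restructure_folios_in_range()
	      /* fails part-way; its internal rollback loop already
	       * re-merges/re-splits the folios it had restructured */
	  /* ret != 0, so the outer rollback in kvm_gmem_convert_range() runs: */
	  kvm_gmem_convert_execute_work(inode, work, !shared)
	    kvm_gmem_shareability_apply()
	      /* reverts shareability */
	    kvm_gmem_restructure_folios_in_range()
	      /* walks the same range a second time, in the opposite
	       * direction, over folios the internal rollback already
	       * restored */

If that is what happens, it might be simpler to have only one of the two
layers own the restructuring rollback.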