If a large swap entry or a large folio returned from find_get_entries()
is split before it is truncated, only the first entry produced by the
split is truncated, leaving the remaining entries untruncated.

To fix this, introduce a new helper, shmem_find_get_entries(), which is
similar to find_get_entries() but also returns the order of each entry
it finds. A split that happens after the initial lookup can then be
detected by comparing the entry's current order with the order returned
by shmem_find_get_entries(), and the lookup is retried when a split is
detected.

The race with large swap entries was introduced by commit 12885cbe88dd
("mm: shmem: split large entry if the swapin folio is not large"). The
race with large folios appears to be a long-standing issue, possibly
related to the conversion to xarray, the conversion to folios and other
changes, so it is hard to track down the specific commit that directly
introduced it.

Signed-off-by: Kemeng Shi <shikemeng@xxxxxxxxxxxxxxx>
---
 mm/filemap.c  |  2 +-
 mm/internal.h |  2 ++
 mm/shmem.c    | 81 ++++++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 7b90cbeb4a1a..672844b94d3a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2015,7 +2015,7 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 }
 EXPORT_SYMBOL(__filemap_get_folio);
 
-static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
+struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
 		xa_mark_t mark)
 {
 	struct folio *folio;
diff --git a/mm/internal.h b/mm/internal.h
index 6b8ed2017743..9573b3a9e8c0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -446,6 +446,8 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
 	force_page_cache_ra(&ractl, nr_to_read);
 }
 
+struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
+		xa_mark_t mark);
 unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
 		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
 unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
diff --git a/mm/shmem.c b/mm/shmem.c
index f1062910a4de..2349673b239b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -949,18 +949,29 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
  * the number of pages being freed. 0 means entry not found in XArray (0 pages
  * being freed).
  */
-static long shmem_free_swap(struct address_space *mapping,
-			    pgoff_t index, void *radswap)
+static long shmem_free_swap(struct address_space *mapping, pgoff_t index,
+			    int order, void *radswap)
 {
-	int order = xa_get_order(&mapping->i_pages, index);
+	int old_order;
 	void *old;
 
-	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
-	if (old != radswap)
+	xa_lock_irq(&mapping->i_pages);
+	old_order = xa_get_order(&mapping->i_pages, index);
+	/* free swap anyway if input order is -1 */
+	if (order != -1 && old_order != order) {
+		xa_unlock_irq(&mapping->i_pages);
+		return 0;
+	}
+
+	old = __xa_cmpxchg(&mapping->i_pages, index, radswap, NULL, 0);
+	if (old != radswap) {
+		xa_unlock_irq(&mapping->i_pages);
 		return 0;
-	free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);
+	}
+	xa_unlock_irq(&mapping->i_pages);
 
-	return 1 << order;
+	free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << old_order);
+	return 1 << old_order;
 }
 
 /*
@@ -1077,6 +1088,39 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
 	return folio;
 }
 
+/*
+ * Similar to find_get_entries(), but also returns the order of found entries.
+ */
+static unsigned shmem_find_get_entries(struct address_space *mapping,
+		pgoff_t *start, pgoff_t end, struct folio_batch *fbatch,
+		pgoff_t *indices, int *orders)
+{
+	XA_STATE(xas, &mapping->i_pages, *start);
+	struct folio *folio;
+
+	rcu_read_lock();
+	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
+		indices[fbatch->nr] = xas.xa_index;
+		if (!xa_is_value(folio))
+			orders[fbatch->nr] = folio_order(folio);
+		else
+			orders[fbatch->nr] = xas_get_order(&xas);
+		if (!folio_batch_add(fbatch, folio))
+			break;
+	}
+
+	if (folio_batch_count(fbatch)) {
+		unsigned long nr;
+		int idx = folio_batch_count(fbatch) - 1;
+
+		nr = 1 << orders[idx];
+		*start = round_down(indices[idx] + nr, nr);
+	}
+	rcu_read_unlock();
+
+	return folio_batch_count(fbatch);
+}
+
 /*
  * Remove range of pages and swap entries from page cache, and free them.
  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
@@ -1090,6 +1134,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
 	struct folio_batch fbatch;
 	pgoff_t indices[PAGEVEC_SIZE];
+	int orders[PAGEVEC_SIZE];
 	struct folio *folio;
 	bool same_folio;
 	long nr_swaps_freed = 0;
@@ -1113,7 +1158,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			if (unfalloc)
 				continue;
 			nr_swaps_freed += shmem_free_swap(mapping,
-						indices[i], folio);
+						indices[i], -1, folio);
 			continue;
 		}
 
@@ -1166,8 +1211,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 	while (index < end) {
 		cond_resched();
 
-		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
-				indices)) {
+		if (!shmem_find_get_entries(mapping, &index, end - 1, &fbatch,
+				indices, orders)) {
 			/* If all gone or hole-punch or unfalloc, we're done */
 			if (index == start || end != -1)
 				break;
@@ -1183,9 +1228,13 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 
 				if (unfalloc)
 					continue;
-				swaps_freed = shmem_free_swap(mapping, indices[i], folio);
+				swaps_freed = shmem_free_swap(mapping,
+						indices[i], orders[i], folio);
+				/*
+				 * Swap was replaced by page or was
+				 * split: retry
+				 */
 				if (!swaps_freed) {
-					/* Swap was replaced by page: retry */
 					index = indices[i];
 					break;
 				}
@@ -1196,8 +1245,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 
 			folio_lock(folio);
 			if (!unfalloc || !folio_test_uptodate(folio)) {
-				if (folio_mapping(folio) != mapping) {
-					/* Page was replaced by swap: retry */
+				/*
+				 * Page was replaced by swap or was
+				 * split: retry
+				 */
+				if (folio_mapping(folio) != mapping ||
+				    folio_order(folio) != orders[i]) {
 					folio_unlock(folio);
 					index = indices[i];
 					break;
-- 
2.30.0
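
For reference, below is a small stand-alone userspace sketch of the
order-recheck-and-retry pattern the patch applies in shmem_undo_range():
record each entry's order when the batch is gathered, re-check it before
freeing, and restart the scan from that index if the entry was split in
between. Everything here (toy_entry, gather_batch, split_entry,
free_range, the batch size) is made up for illustration and is not
kernel code.

#include <stdbool.h>
#include <stdio.h>

#define NR_SLOTS	16
#define BATCH		4

/* One toy entry per "head" slot; order is log2 of the slots it covers. */
struct toy_entry {
	bool present;
	int order;
};

static struct toy_entry store[NR_SLOTS];
static bool inject_split = true;

/* Roughly what shmem_find_get_entries() does: report indices and orders. */
static int gather_batch(int start, int end, int *indices, int *orders)
{
	int n = 0;

	for (int i = start; i < end && n < BATCH; i++) {
		if (!store[i].present)
			continue;
		indices[n] = i;
		orders[n] = store[i].order;
		i += (1 << orders[n]) - 1;	/* skip the slots it covers */
		n++;
	}
	return n;
}

/* What a concurrent swapin may do: split a large entry into order-0 ones. */
static void split_entry(int index)
{
	int nr = 1 << store[index].order;

	for (int i = index; i < index + nr; i++)
		store[i] = (struct toy_entry){ .present = true, .order = 0 };
}

/* Loosely modeled on the second loop of shmem_undo_range(). */
static void free_range(int start, int end)
{
	int indices[BATCH], orders[BATCH];
	int index = start, n;

	while (index < end && (n = gather_batch(index, end, indices, orders))) {
		/* Pretend another task splits the second entry right here. */
		if (inject_split && n > 1) {
			inject_split = false;
			split_entry(indices[1]);
		}

		for (int i = 0; i < n; i++) {
			if (store[indices[i]].order != orders[i]) {
				/* Entry was split after the lookup: rescan. */
				index = indices[i];
				break;
			}
			store[indices[i]].present = false;
			printf("freed %d slot(s) at index %d\n",
			       1 << orders[i], indices[i]);
			index = indices[i] + (1 << orders[i]);
		}
	}
}

int main(void)
{
	store[0] = (struct toy_entry){ .present = true, .order = 2 }; /* 0-3 */
	store[4] = (struct toy_entry){ .present = true, .order = 2 }; /* 4-7 */

	free_range(0, NR_SLOTS);
	return 0;
}

With the injected split, the sketch frees the intact order-2 entry at
index 0, notices that the entry at index 4 no longer has the recorded
order, and rescans from index 4, so all four order-0 entries left by the
split are freed rather than only the first one.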