If a large swap entry or a large folio returned from find_get_entries()
is split before it is truncated, only the first entry produced by the
split is truncated, leaving the remaining entries untruncated.

To fix this, introduce a new helper, shmem_find_get_entries(), which is
similar to find_get_entries() but also returns the order of each entry
it finds. A split that happens after the initial lookup can then be
detected by comparing the entry's current order with the order returned
by shmem_find_get_entries(), and the lookup is retried when a split is
detected.

The race with large swap entries was introduced by commit 12885cbe88dd
("mm: shmem: split large entry if the swapin folio is not large"). The
race with large folios appears to be a long-standing issue, possibly
related to the conversion to xarray, the conversion to folios and other
changes, so it is hard to track down the specific commit that directly
introduced it.

Signed-off-by: Kemeng Shi <shikemeng@xxxxxxxxxxxxxxx>
---
 mm/filemap.c  |  2 +-
 mm/internal.h |  2 ++
 mm/shmem.c    | 81 ++++++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 7b90cbeb4a1a..672844b94d3a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2015,7 +2015,7 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 }
 EXPORT_SYMBOL(__filemap_get_folio);
 
-static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
+struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
 		xa_mark_t mark)
 {
 	struct folio *folio;
diff --git a/mm/internal.h b/mm/internal.h
index 6b8ed2017743..9573b3a9e8c0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -446,6 +446,8 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
 	force_page_cache_ra(&ractl, nr_to_read);
 }
 
+struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
+		xa_mark_t mark);
 unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
 		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
 unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
diff --git a/mm/shmem.c b/mm/shmem.c
index f1062910a4de..2349673b239b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -949,18 +949,29 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
  * the number of pages being freed. 0 means entry not found in XArray (0 pages
  * being freed).
  */
-static long shmem_free_swap(struct address_space *mapping,
-			    pgoff_t index, void *radswap)
+static long shmem_free_swap(struct address_space *mapping, pgoff_t index,
+			    int order, void *radswap)
 {
-	int order = xa_get_order(&mapping->i_pages, index);
+	int old_order;
 	void *old;
 
-	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
-	if (old != radswap)
+	xa_lock_irq(&mapping->i_pages);
+	old_order = xa_get_order(&mapping->i_pages, index);
+	/* free swap anyway if input order is -1 */
+	if (order != -1 && old_order != order) {
+		xa_unlock_irq(&mapping->i_pages);
+		return 0;
+	}
+
+	old = __xa_cmpxchg(&mapping->i_pages, index, radswap, NULL, 0);
+	if (old != radswap) {
+		xa_unlock_irq(&mapping->i_pages);
 		return 0;
-	free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);
+	}
+	xa_unlock_irq(&mapping->i_pages);
 
-	return 1 << order;
+	free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << old_order);
+	return 1 << old_order;
 }
 
 /*
@@ -1077,6 +1088,39 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
 	return folio;
 }
 
+/*
+ * Similar to find_get_entries(), but also returns the order of found entries.
+ */
+static unsigned shmem_find_get_entries(struct address_space *mapping,
+		pgoff_t *start, pgoff_t end, struct folio_batch *fbatch,
+		pgoff_t *indices, int *orders)
+{
+	XA_STATE(xas, &mapping->i_pages, *start);
+	struct folio *folio;
+
+	rcu_read_lock();
+	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
+		indices[fbatch->nr] = xas.xa_index;
+		if (!xa_is_value(folio))
+			orders[fbatch->nr] = folio_order(folio);
+		else
+			orders[fbatch->nr] = xas_get_order(&xas);
+		if (!folio_batch_add(fbatch, folio))
+			break;
+	}
+
+	if (folio_batch_count(fbatch)) {
+		unsigned long nr;
+		int idx = folio_batch_count(fbatch) - 1;
+
+		nr = 1 << orders[idx];
+		*start = round_down(indices[idx] + nr, nr);
+	}
+	rcu_read_unlock();
+
+	return folio_batch_count(fbatch);
+}
+
 /*
  * Remove range of pages and swap entries from page cache, and free them.
  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
@@ -1090,6 +1134,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
 	struct folio_batch fbatch;
 	pgoff_t indices[PAGEVEC_SIZE];
+	int orders[PAGEVEC_SIZE];
 	struct folio *folio;
 	bool same_folio;
 	long nr_swaps_freed = 0;
@@ -1113,7 +1158,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			if (unfalloc)
 				continue;
 			nr_swaps_freed += shmem_free_swap(mapping,
-						indices[i], folio);
+						indices[i], -1, folio);
 			continue;
 		}
 
@@ -1166,8 +1211,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 	while (index < end) {
 		cond_resched();
 
-		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
-				indices)) {
+		if (!shmem_find_get_entries(mapping, &index, end - 1, &fbatch,
+				indices, orders)) {
 			/* If all gone or hole-punch or unfalloc, we're done */
 			if (index == start || end != -1)
 				break;
@@ -1183,9 +1228,13 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 
 				if (unfalloc)
 					continue;
-				swaps_freed = shmem_free_swap(mapping, indices[i], folio);
+				swaps_freed = shmem_free_swap(mapping,
+						indices[i], orders[i], folio);
+				/*
+				 * Swap was replaced by page or was
+				 * split: retry
+				 */
 				if (!swaps_freed) {
-					/* Swap was replaced by page: retry */
 					index = indices[i];
 					break;
 				}
@@ -1196,8 +1245,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 
 			folio_lock(folio);
 			if (!unfalloc || !folio_test_uptodate(folio)) {
-				if (folio_mapping(folio) != mapping) {
-					/* Page was replaced by swap: retry */
+				/*
+				 * Page was replaced by swap or was
+				 * split: retry
+				 */
+				if (folio_mapping(folio) != mapping ||
+				    folio_order(folio) != orders[i]) {
 					folio_unlock(folio);
 					index = indices[i];
 					break;
-- 
2.30.0
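
For reference, below is a small stand-alone userspace sketch of the
order-recheck-and-retry pattern the patch applies in shmem_undo_range():
record each entry's order when the batch is gathered, re-check it before
freeing, and restart the scan from that index if the entry was split in
between. Everything here (toy_entry, gather_batch, split_entry,
free_range, the batch size) is made up for illustration and is not
kernel code.

#include <stdbool.h>
#include <stdio.h>

#define NR_SLOTS	16
#define BATCH		4

/* One toy entry per "head" slot; order is log2 of the slots it covers. */
struct toy_entry {
	bool present;
	int order;
};

static struct toy_entry store[NR_SLOTS];
static bool inject_split = true;

/* Roughly what shmem_find_get_entries() does: report indices and orders. */
static int gather_batch(int start, int end, int *indices, int *orders)
{
	int n = 0;

	for (int i = start; i < end && n < BATCH; i++) {
		if (!store[i].present)
			continue;
		indices[n] = i;
		orders[n] = store[i].order;
		i += (1 << orders[n]) - 1;	/* skip the slots it covers */
		n++;
	}
	return n;
}

/* What a concurrent swapin may do: split a large entry into order-0 ones. */
static void split_entry(int index)
{
	int nr = 1 << store[index].order;

	for (int i = index; i < index + nr; i++)
		store[i] = (struct toy_entry){ .present = true, .order = 0 };
}

/* Loosely modeled on the second loop of shmem_undo_range(). */
static void free_range(int start, int end)
{
	int indices[BATCH], orders[BATCH];
	int index = start, n;

	while (index < end && (n = gather_batch(index, end, indices, orders))) {
		/* Pretend another task splits the second entry right here. */
		if (inject_split && n > 1) {
			inject_split = false;
			split_entry(indices[1]);
		}

		for (int i = 0; i < n; i++) {
			if (store[indices[i]].order != orders[i]) {
				/* Entry was split after the lookup: rescan. */
				index = indices[i];
				break;
			}
			store[indices[i]].present = false;
			printf("freed %d slot(s) at index %d\n",
			       1 << orders[i], indices[i]);
			index = indices[i] + (1 << orders[i]);
		}
	}
}

int main(void)
{
	store[0] = (struct toy_entry){ .present = true, .order = 2 }; /* 0-3 */
	store[4] = (struct toy_entry){ .present = true, .order = 2 }; /* 4-7 */

	free_range(0, NR_SLOTS);
	return 0;
}

With the injected split, the sketch frees the intact order-2 entry at
index 0, notices that the entry at index 4 no longer has the recorded
order, and rescans from index 4, so all four order-0 entries left by the
split are freed rather than only the first one.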