@@ -108,7 +108,10 @@ extern void shmem_unlock_mapping(struct address_space *mapping);
extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
+
+#ifndef CONFIG_VIRTUAL_SWAP
int shmem_unuse(unsigned int type);
+#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
unsigned long shmem_allowable_huge_orders(struct inode *inode,
@@ -764,6 +764,7 @@ void vswap_store_folio(swp_entry_t entry, struct folio *folio);
void swap_zeromap_folio_set(struct folio *folio);
void vswap_assoc_zswap(swp_entry_t entry, struct zswap_entry *zswap_entry);
bool vswap_can_swapin_thp(swp_entry_t entry, int nr);
+void vswap_swapoff(swp_entry_t entry, struct folio *folio, swp_slot_t slot);
static inline bool trylock_swapoff(swp_entry_t entry,
struct swap_info_struct **si)
@@ -1380,6 +1380,7 @@ static void shmem_evict_inode(struct inode *inode)
#endif
}
+#ifndef CONFIG_VIRTUAL_SWAP
static int shmem_find_swap_entries(struct address_space *mapping,
pgoff_t start, struct folio_batch *fbatch,
pgoff_t *indices, unsigned int type)
@@ -1525,6 +1526,7 @@ int shmem_unuse(unsigned int type)
return error;
}
+#endif /* CONFIG_VIRTUAL_SWAP */
/*
* Move the page from the page cache to the swap cache.
@@ -2053,6 +2053,163 @@ unsigned int count_swap_pages(int type, int free)
}
#endif /* CONFIG_HIBERNATION */
+/*
+ * Scan swap_map from current position to next entry still in use.
+ * Return 0 if there are no inuse entries after prev till end of
+ * the map.
+ */
+static unsigned int find_next_to_unuse(struct swap_info_struct *si,
+ unsigned int prev)
+{
+ unsigned int i;
+ unsigned char count;
+
+ /*
+ * No need for swap_lock here: we're just looking
+ * for whether an entry is in use, not modifying it; false
+ * hits are okay, and sys_swapoff() has already prevented new
+ * allocations from this area (while holding swap_lock).
+ */
+ for (i = prev + 1; i < si->max; i++) {
+ count = READ_ONCE(si->swap_map[i]);
+ if (count && swap_count(count) != SWAP_MAP_BAD)
+ break;
+ if ((i % LATENCY_LIMIT) == 0)
+ cond_resched();
+ }
+
+ if (i == si->max)
+ i = 0;
+
+ return i;
+}
+
+#ifdef CONFIG_VIRTUAL_SWAP
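+/*
+ * Iterate over the remaining in-use offsets on the swap device, stopping
+ * early if a signal is pending or no allocated slots remain.
+ */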
+#define for_each_allocated_offset(si, offset) \
+ while (swap_usage_in_pages(si) && \
+ !signal_pending(current) && \
+ (offset = find_next_to_unuse(si, offset)) != 0)
+
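+/*
+ * Bring the folio backing @entry into the swap cache, submitting the read
+ * ourselves if we were the one who allocated the swap cache folio.
+ */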
+static struct folio *pagein(swp_entry_t entry, struct swap_iocb **splug,
+ struct mempolicy *mpol)
+{
+ bool folio_was_allocated;
+ struct folio *folio = __read_swap_cache_async(entry, GFP_KERNEL, mpol,
+ NO_INTERLEAVE_INDEX, &folio_was_allocated, false);
+
+ if (folio_was_allocated)
+ swap_read_folio(folio, splug);
+ return folio;
+}
+
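+/*
+ * With virtual swap, swapoff no longer needs a page table walk: walk the
+ * allocated physical slots, map each back to its virtual swap slot, fault
+ * the data into the swap cache, and rebind the virtual slot to the folio.
+ */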
+static int try_to_unuse(unsigned int type)
+{
+ struct swap_info_struct *si = swap_info[type];
+ struct swap_iocb *splug = NULL;
+ struct mempolicy *mpol;
+ struct blk_plug plug;
+ unsigned long offset;
+ struct folio *folio;
+ swp_entry_t entry;
+ swp_slot_t slot;
+ int ret = 0;
+
+ if (!atomic_long_read(&si->inuse_pages))
+ goto success;
+
+ mpol = get_task_policy(current);
+ blk_start_plug(&plug);
+
+ /* first round - submit the reads */
+ offset = 0;
+ for_each_allocated_offset(si, offset) {
+ slot = swp_slot(type, offset);
+ entry = swp_slot_to_swp_entry(slot);
+ if (!entry.val)
+ continue;
+
+ folio = pagein(entry, &splug, mpol);
+ if (folio)
+ folio_put(folio);
+ }
+ blk_finish_plug(&plug);
+ swap_read_unplug(splug);
+ lru_add_drain();
+
+ /* second round - updating the virtual swap slots' backing state */
+ offset = 0;
+ for_each_allocated_offset(si, offset) {
+ slot = swp_slot(type, offset);
+retry:
+ entry = swp_slot_to_swp_entry(slot);
+ if (!entry.val)
+ continue;
+
+ /* try to allocate swap cache folio */
+ folio = pagein(entry, &splug, mpol);
+ if (!folio) {
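+ /* the physical slot was freed under us - nothing left to page in */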
+ if (!swp_slot_to_swp_entry(swp_slot(type, offset)).val)
+ continue;
+
+ ret = -ENOMEM;
+ pr_err("swapoff: unable to allocate swap cache folio for %lu\n",
+ entry.val);
+ goto finish;
+ }
+
+ folio_lock(folio);
+ /*
+ * We need to check if the folio is still in the swap cache. We can,
+ * for instance, race with zswap writeback, obtaining the temporary
+ * folio it allocated for decompression and writeback, which is
+ * promptly deleted from the swap cache. By the time we lock that
+ * folio, it might already contain stale data.
+ *
+ * Concurrent swap operations might also have come in before we
+ * re-acquired the lock, deleting the folio from the swap cache,
+ * invalidating the virtual swap slot, then swapping the folio out
+ * again.
+ *
+ * In all of these cases, we must retry the physical -> virtual lookup.
+ *
+ * Note that if everything is still valid, the virtual swap slot must
+ * correspond to the head page (since all previous swap slots have
+ * been freed).
+ */
+ if (!folio_test_swapcache(folio) || folio->swap.val != entry.val) {
+ folio_unlock(folio);
+ folio_put(folio);
+ if (signal_pending(current))
+ break;
+ schedule_timeout_uninterruptible(1);
+ goto retry;
+ }
+
+ folio_wait_writeback(folio);
+ vswap_swapoff(entry, folio, slot);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+finish:
+ if (ret == -ENOMEM)
+ return ret;
+
+ /* concurrent swappers might still be releasing physical swap slots... */
+ while (swap_usage_in_pages(si)) {
+ if (signal_pending(current))
+ return -EINTR;
+ schedule_timeout_uninterruptible(1);
+ }
+
+success:
+ /*
+ * Make sure that further cleanups after try_to_unuse() returns happen
+ * after swap_range_free() reduces si->inuse_pages to 0.
+ */
+ smp_mb();
+ return 0;
+}
+#else
static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
return pte_same(pte_swp_clear_flags(pte), swp_pte);
@@ -2340,37 +2497,6 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
return ret;
}
-/*
- * Scan swap_map from current position to next entry still in use.
- * Return 0 if there are no inuse entries after prev till end of
- * the map.
- */
-static unsigned int find_next_to_unuse(struct swap_info_struct *si,
- unsigned int prev)
-{
- unsigned int i;
- unsigned char count;
-
- /*
- * No need for swap_lock here: we're just looking
- * for whether an entry is in use, not modifying it; false
- * hits are okay, and sys_swapoff() has already prevented new
- * allocations from this area (while holding swap_lock).
- */
- for (i = prev + 1; i < si->max; i++) {
- count = READ_ONCE(si->swap_map[i]);
- if (count && swap_count(count) != SWAP_MAP_BAD)
- break;
- if ((i % LATENCY_LIMIT) == 0)
- cond_resched();
- }
-
- if (i == si->max)
- i = 0;
-
- return i;
-}
-
static int try_to_unuse(unsigned int type)
{
struct mm_struct *prev_mm;
@@ -2474,6 +2600,7 @@ static int try_to_unuse(unsigned int type)
smp_mb();
return 0;
}
+#endif /* CONFIG_VIRTUAL_SWAP */
/*
* After a successful try_to_unuse, if no swap is now in use, we know
@@ -1289,6 +1289,67 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
swapcache_clear(NULL, entry, nr);
}
+/**
+ * vswap_swapoff - unlink a range of virtual swap slots from their backing
+ * physical swap slots on a swapfile that is being swapped off,
+ * and associate them with the swapped-in folio.
+ * @entry: the first virtual swap slot in the range.
+ * @folio: the folio swapped in and loaded into the swap cache.
+ * @slot: the first physical swap slot in the range.
+ */
+void vswap_swapoff(swp_entry_t entry, struct folio *folio, swp_slot_t slot)
+{
+ int i = 0, nr = folio_nr_pages(folio);
+ struct swp_desc *desc;
+ unsigned int type = swp_slot_type(slot);
+ unsigned int offset = swp_slot_offset(slot);
+
+ XA_STATE(xas, &vswap_map, entry.val);
+
+ rcu_read_lock();
+ xas_for_each(&xas, desc, entry.val + nr - 1) {
+ if (xas_retry(&xas, desc))
+ continue;
+
+ write_lock(&desc->lock);
+ /*
+ * Concurrent swap operations might have invalidated the originally
+ * obtained virtual swap slot (allowing it to be re-allocated), or
+ * changed its backing state.
+ *
+ * We must re-check here to make sure we are not performing bogus backing
+ * store changes.
+ */
+ if (desc->type != VSWAP_SWAPFILE ||
+ swp_slot_type(desc->slot) != type) {
+ /* there should not be mixed backing states among the subpages */
+ VM_WARN_ON(i);
+ write_unlock(&desc->lock);
+ break;
+ }
+
+ VM_WARN_ON(swp_slot_offset(desc->slot) != offset + i);
+
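+ /*
+ * Drop the physical -> virtual reverse mapping and point the
+ * descriptor at the in-memory folio instead.
+ */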
+ xa_erase(&vswap_rmap, desc->slot.val);
+ desc->type = VSWAP_FOLIO;
+ desc->folio = folio;
+ write_unlock(&desc->lock);
+ i++;
+ }
+ rcu_read_unlock();
+
+ if (i) {
+ /*
+ * If we updated any of the virtual swap slots' backing, mark the folio
+ * as dirty so that reclaimers will try to page it out again.
+ */
+ folio_mark_dirty(folio);
+ swap_slot_free_nr(slot, nr);
+ /* folio is in swap cache, so entries are guaranteed to be valid */
+ mem_cgroup_uncharge_swap(entry, nr);
+ }
+}
+
#ifdef CONFIG_MEMCG
static unsigned short vswap_cgroup_record(swp_entry_t entry,
unsigned short memcgid, unsigned int nr_ents)
This patch presents the second application of the virtual swap design:
simplifying and optimizing swapoff. With virtual swap slots stored in
the page table entries and used as indices into the various
swap-related data structures, we no longer have to perform a page
table walk in swapoff. We simply iterate through all the allocated
swap slots on the swapfile, invoke the backward map, and fault them in.

This is significantly cleaner, as well as slightly more performant,
especially when there are a lot of unrelated VMAs (since the old
swapoff code would have to traverse all of them).

In a simple benchmark, in which we swap off a 32 GB swapfile that is
50% full while a process maps a 128 GB file into memory:

Baseline:
real: 25.54s
user: 0.00s
sys: 11.48s

New Design:
real: 11.69s
user: 0.00s
sys: 9.96s

Disregarding the real time reduction (which is mostly due to more IO
asynchrony), the new design reduces the kernel CPU time by about 13%.

Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
 include/linux/shmem_fs.h |   3 +
 include/linux/swap.h     |   1 +
 mm/shmem.c               |   2 +
 mm/swapfile.c            | 189 ++++++++++++++++++++++++++++++++-------
 mm/vswap.c               |  61 +++++++++++++
 5 files changed, 225 insertions(+), 31 deletions(-)