@@ -679,6 +679,23 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
+
+void __mem_cgroup_record_swap(struct folio *folio, swp_entry_t entry);
+static inline void mem_cgroup_record_swap(struct folio *folio,
+ swp_entry_t entry)
+{
+ if (!mem_cgroup_disabled())
+ __mem_cgroup_record_swap(folio, entry);
+}
+
+void __mem_cgroup_unrecord_swap(swp_entry_t entry, unsigned int nr_pages);
+static inline void mem_cgroup_unrecord_swap(swp_entry_t entry,
+ unsigned int nr_pages)
+{
+ if (!mem_cgroup_disabled())
+ __mem_cgroup_unrecord_swap(entry, nr_pages);
+}
+
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry);
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
swp_entry_t entry)
@@ -5020,6 +5020,46 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
css_put(&memcg->css);
}
+/**
+ * __mem_cgroup_record_swap - record the folio's cgroup for the swap entries.
+ * @folio: folio being swapped out.
+ * @entry: the first swap entry in the range.
+ *
+ * In the virtual swap implementation, we only record the folio's cgroup
+ * for the virtual swap slots at allocation time. We only charge
+ * physical swap slots towards the cgroup's swap usage, i.e. when physical
+ * swap slots are allocated for zswap writeback or as a fallback from a
+ * failed zswap store.
+ */
+void __mem_cgroup_record_swap(struct folio *folio, swp_entry_t entry)
+{
+ unsigned int nr_pages = folio_nr_pages(folio);
+ struct mem_cgroup *memcg;
+
+ memcg = folio_memcg(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
+ if (!memcg)
+ return;
+
+ memcg = mem_cgroup_id_get_online(memcg);
+ if (nr_pages > 1)
+ mem_cgroup_id_get_many(memcg, nr_pages - 1);
+ swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+}
+
+void __mem_cgroup_unrecord_swap(swp_entry_t entry, unsigned int nr_pages)
+{
+ unsigned short id = swap_cgroup_clear(entry, nr_pages);
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (memcg)
+ mem_cgroup_id_put_many(memcg, nr_pages);
+ rcu_read_unlock();
+}
+
/**
* __mem_cgroup_try_charge_swap - try charging swap space for a folio
* @folio: folio being added to swap
@@ -5038,34 +5078,47 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
if (do_memsw_account())
return 0;
- memcg = folio_memcg(folio);
+ if (IS_ENABLED(CONFIG_VIRTUAL_SWAP)) {
+ /*
+ * In the virtual swap implementation, the cgroup is already recorded
+ * at virtual swap slot allocation. Note that the virtual swap slot
+ * holds a reference to the memcg, so this lookup should be safe.
+ */
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(lookup_swap_cgroup_id(entry));
+ rcu_read_unlock();
+ } else {
+ memcg = folio_memcg(folio);
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
- return 0;
+ VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
+ if (!memcg)
+ return 0;
- if (!entry.val) {
- memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
- return 0;
- }
+ if (!entry.val) {
+ memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+ return 0;
+ }
- memcg = mem_cgroup_id_get_online(memcg);
+ memcg = mem_cgroup_id_get_online(memcg);
+ }
if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
- mem_cgroup_id_put(memcg);
+ if (!IS_ENABLED(CONFIG_VIRTUAL_SWAP))
+ mem_cgroup_id_put(memcg);
return -ENOMEM;
}
- /* Get references for the tail pages, too */
- if (nr_pages > 1)
- mem_cgroup_id_get_many(memcg, nr_pages - 1);
+ if (!IS_ENABLED(CONFIG_VIRTUAL_SWAP)) {
+ /* Get references for the tail pages, too */
+ if (nr_pages > 1)
+ mem_cgroup_id_get_many(memcg, nr_pages - 1);
+ swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+ }
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
- swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
-
return 0;
}
@@ -5079,7 +5132,11 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
struct mem_cgroup *memcg;
unsigned short id;
- id = swap_cgroup_clear(entry, nr_pages);
+ if (IS_ENABLED(CONFIG_VIRTUAL_SWAP))
+ id = lookup_swap_cgroup_id(entry);
+ else
+ id = swap_cgroup_clear(entry, nr_pages);
+
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
@@ -5090,7 +5147,8 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
page_counter_uncharge(&memcg->swap, nr_pages);
}
mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
- mem_cgroup_id_put_many(memcg, nr_pages);
+ if (!IS_ENABLED(CONFIG_VIRTUAL_SWAP))
+ mem_cgroup_id_put_many(memcg, nr_pages);
}
rcu_read_unlock();
}
@@ -5099,7 +5157,7 @@ static bool mem_cgroup_may_zswap(struct mem_cgroup *original_memcg);
long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
- long nr_swap_pages, nr_zswap_pages = 0;
+ long nr_swap_pages;
/*
* If swap is virtualized and zswap is enabled, we can still use zswap even
@@ -5108,10 +5166,14 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
if (IS_ENABLED(CONFIG_VIRTUAL_SWAP) && zswap_is_enabled() &&
(mem_cgroup_disabled() || do_memsw_account() ||
mem_cgroup_may_zswap(memcg))) {
- nr_zswap_pages = PAGE_COUNTER_MAX;
+ /*
+ * No need to check swap cgroup limits, since zswap is not charged
+ * towards swap consumption.
+ */
+ return PAGE_COUNTER_MAX;
}
- nr_swap_pages = max_t(long, nr_zswap_pages, get_nr_swap_pages());
+ nr_swap_pages = get_nr_swap_pages();
if (mem_cgroup_disabled() || do_memsw_account())
return nr_swap_pages;
for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
@@ -349,6 +349,7 @@ static inline void release_backing(swp_entry_t entry, int nr)
swap_slot_free_nr(slot, nr);
swap_slot_put_swap_info(si);
}
+ mem_cgroup_uncharge_swap(entry, nr);
}
}
@@ -367,7 +368,7 @@ static void vswap_free(swp_entry_t entry)
virt_clear_shadow_from_swap_cache(entry);
release_backing(entry, 1);
- mem_cgroup_uncharge_swap(entry, 1);
+ mem_cgroup_unrecord_swap(entry, 1);
/* erase forward mapping and release the virtual slot for reallocation */
release_vswap_slot(entry.val);
kfree_rcu(desc, rcu);
@@ -392,27 +393,13 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
{
swp_entry_t entry;
struct swp_desc *desc;
- int i, nr = folio_nr_pages(folio);
+ int nr = folio_nr_pages(folio);
entry = vswap_alloc(nr);
if (!entry.val)
return entry;
- /*
- * XXX: for now, we charge towards the memory cgroup's swap limit on virtual
- * swap slots allocation. This will be changed soon - we will only charge on
- * physical swap slots allocation.
- */
- if (mem_cgroup_try_charge_swap(folio, entry)) {
- for (i = 0; i < nr; i++) {
- vswap_free(entry);
- entry.val++;
- }
- atomic_add(nr, &vswap_alloc_reject);
- entry.val = 0;
- return entry;
- }
-
+ mem_cgroup_record_swap(folio, entry);
XA_STATE(xas, &vswap_map, entry.val);
rcu_read_lock();
@@ -454,6 +441,9 @@ swp_slot_t vswap_alloc_swap_slot(struct folio *folio)
if (!slot.val)
return slot;
+ if (mem_cgroup_try_charge_swap(folio, entry))
+ goto free_phys_swap;
+
/* establish the vrtual <-> physical swap slots linkages. */
for (i = 0; i < nr; i++) {
err = xa_insert(&vswap_rmap, slot.val + i,
@@ -462,13 +452,7 @@ swp_slot_t vswap_alloc_swap_slot(struct folio *folio)
if (err) {
while (--i >= 0)
xa_erase(&vswap_rmap, slot.val + i);
- /*
- * We have not updated the backing type of the virtual swap slot.
- * Simply free up the physical swap slots here!
- */
- swap_slot_free_nr(slot, nr);
- slot.val = 0;
- return slot;
+ goto uncharge;
}
}
@@ -505,6 +489,17 @@ swp_slot_t vswap_alloc_swap_slot(struct folio *folio)
}
rcu_read_unlock();
return slot;
+
+uncharge:
+ mem_cgroup_uncharge_swap(entry, nr);
+free_phys_swap:
+ /*
+ * We have not updated the backing type of the virtual swap slot.
+ * Simply free up the physical swap slots here!
+ */
+ swap_slot_free_nr(slot, nr);
+ slot.val = 0;
+ return slot;
}
/**
Now that zswap and the zero-filled swap page optimization no longer take
up any physical swap space, we should not charge towards the memcg's swap
usage and limit in these cases. We now only record the memcg id on virtual
swap slot allocation, and defer physical swap charging (i.e. towards
memory.swap.current) until the virtual swap slot is backed by an actual
physical swap slot (on zswap store failure fallback or zswap writeback).

Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
 include/linux/swap.h |  17 ++++++++
 mm/memcontrol.c      | 102 ++++++++++++++++++++++++++++++++++---------
 mm/vswap.c           |  43 ++++++++----------
 3 files changed, 118 insertions(+), 44 deletions(-)
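
A minimal user-space sketch of the deferred charging scheme above, using
hypothetical toy names (toy_memcg, record_swap, try_charge_swap, etc.)
purely for illustration, not kernel code: the owning cgroup id is only
recorded when the virtual slot is allocated, and swap usage is charged
only once a physical slot is attached to it.

/* toy_vswap_charge.c: build with `cc -o toy toy_vswap_charge.c` */
#include <stdio.h>

#define NR_VSLOTS 8

struct toy_memcg {
	int id;
	long swap_usage;	/* models memory.swap.current */
	long swap_max;		/* models memory.swap.max */
};

/* vslot -> owning cgroup id, 0 == unrecorded (like swap_cgroup_record()) */
static int vslot_owner[NR_VSLOTS];

/* Record the owner at virtual slot allocation: no charge yet. */
static void record_swap(struct toy_memcg *memcg, int vslot)
{
	vslot_owner[vslot] = memcg->id;
}

/* Charge only when a physical slot is attached to the virtual one. */
static int try_charge_swap(struct toy_memcg *memcg, int vslot, long nr_pages)
{
	if (vslot_owner[vslot] != memcg->id)
		return -1;	/* not the recorded owner */
	if (memcg->swap_usage + nr_pages > memcg->swap_max)
		return -1;	/* would exceed memory.swap.max */
	memcg->swap_usage += nr_pages;
	return 0;
}

/* Uncharge when the physical slot is released. */
static void uncharge_swap(struct toy_memcg *memcg, long nr_pages)
{
	memcg->swap_usage -= nr_pages;
}

/* Unrecord when the virtual slot itself is freed. */
static void unrecord_swap(int vslot)
{
	vslot_owner[vslot] = 0;
}

int main(void)
{
	struct toy_memcg cg = { .id = 1, .swap_usage = 0, .swap_max = 4 };

	/* Folio swapped out to zswap: record only, usage stays 0. */
	record_swap(&cg, 3);
	printf("after record: usage=%ld\n", cg.swap_usage);

	/* Zswap writeback needs a physical slot: now we charge. */
	if (!try_charge_swap(&cg, 3, 1))
		printf("after charge: usage=%ld\n", cg.swap_usage);

	/* Physical slot freed, then the virtual slot itself. */
	uncharge_swap(&cg, 1);
	unrecord_swap(3);
	printf("after free: usage=%ld\n", cg.swap_usage);
	return 0;
}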