diff mbox series

mm: net: memcg accounting for TCP rx zerocopy

Message ID 20210112214105.1440932-1-shakeelb@google.com
State New
Headers show
Series mm: net: memcg accounting for TCP rx zerocopy | expand

Commit Message

Shakeel Butt Jan. 12, 2021, 9:41 p.m. UTC
From: Arjun Roy <arjunroy@google.com>

TCP zerocopy receive is used by high performance network applications to
further scale. For RX zerocopy, the memory containing the network data
filled by network driver is directly mapped into the address space of
high performance applications. To keep the TLB cost low, these
applications unmaps the network memory in big batches. So, this memory
can remain mapped for long time. This can cause memory isolation issue
as this memory becomes unaccounted after getting mapped into the
application address space. This patch adds the memcg accounting for such
memory.

Accounting the network memory comes with its own unique challenge. The
high performance NIC drivers use page pooling to reuse the pages to
eliminate/reduce the expensive setup steps like IOMMU. These drivers
keep an extra reference on the pages and thus we can not depends on the
page reference for the uncharging. The page in the pool may keep a memcg
pinned for arbitrary long time or may get used by other memcg.

This patch decouples the uncharging of the page from the refcnt and
associate it with the map count i.e. the page gets uncharged when the
last address space unmaps it. Now the question what if the driver drops
its reference while the page is still mapped. That is fine as the
address space also holds a reference to the page i.e. the reference
count can not drop to zero before the map count.

Signed-off-by: Arjun Roy <arjunroy@google.com>
Co-developed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Shakeel Butt <shakeelb@google.com>
---
 include/linux/memcontrol.h | 34 +++++++++++++++++++--
 mm/memcontrol.c            | 60 ++++++++++++++++++++++++++++++++++++++
 mm/rmap.c                  |  3 ++
 net/ipv4/tcp.c             | 27 +++++++++++++----
 4 files changed, 116 insertions(+), 8 deletions(-)

Comments

Roman Gushchin Jan. 12, 2021, 11:48 p.m. UTC | #1
On Tue, Jan 12, 2021 at 03:36:18PM -0800, Arjun Roy wrote:
> On Tue, Jan 12, 2021 at 3:31 PM Roman Gushchin <guro@fb.com> wrote:

> >

> > On Tue, Jan 12, 2021 at 01:41:05PM -0800, Shakeel Butt wrote:

> > > From: Arjun Roy <arjunroy@google.com>

> > >

> > > TCP zerocopy receive is used by high performance network applications to

> > > further scale. For RX zerocopy, the memory containing the network data

> > > filled by network driver is directly mapped into the address space of

> > > high performance applications. To keep the TLB cost low, these

> > > applications unmaps the network memory in big batches. So, this memory

> > > can remain mapped for long time. This can cause memory isolation issue

> > > as this memory becomes unaccounted after getting mapped into the

> > > application address space. This patch adds the memcg accounting for such

> > > memory.

> > >

> > > Accounting the network memory comes with its own unique challenge. The

> > > high performance NIC drivers use page pooling to reuse the pages to

> > > eliminate/reduce the expensive setup steps like IOMMU. These drivers

> > > keep an extra reference on the pages and thus we can not depends on the

> > > page reference for the uncharging. The page in the pool may keep a memcg

> > > pinned for arbitrary long time or may get used by other memcg.

> > >

> > > This patch decouples the uncharging of the page from the refcnt and

> > > associate it with the map count i.e. the page gets uncharged when the

> > > last address space unmaps it. Now the question what if the driver drops

> > > its reference while the page is still mapped. That is fine as the

> > > address space also holds a reference to the page i.e. the reference

> > > count can not drop to zero before the map count.

> > >

> > > Signed-off-by: Arjun Roy <arjunroy@google.com>

> > > Co-developed-by: Shakeel Butt <shakeelb@google.com>

> > > Signed-off-by: Shakeel Butt <shakeelb@google.com>

> > > ---

> > >  include/linux/memcontrol.h | 34 +++++++++++++++++++--

> > >  mm/memcontrol.c            | 60 ++++++++++++++++++++++++++++++++++++++

> > >  mm/rmap.c                  |  3 ++

> > >  net/ipv4/tcp.c             | 27 +++++++++++++----

> > >  4 files changed, 116 insertions(+), 8 deletions(-)

> > >

> > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

> > > index 7a38a1517a05..0b0e3b4615cf 100644

> > > --- a/include/linux/memcontrol.h

> > > +++ b/include/linux/memcontrol.h

> > > @@ -349,11 +349,13 @@ extern struct mem_cgroup *root_mem_cgroup;

> > >

> > >  enum page_memcg_data_flags {

> > >       /* page->memcg_data is a pointer to an objcgs vector */

> > > -     MEMCG_DATA_OBJCGS = (1UL << 0),

> > > +     MEMCG_DATA_OBJCGS       = (1UL << 0),

> > >       /* page has been accounted as a non-slab kernel page */

> > > -     MEMCG_DATA_KMEM = (1UL << 1),

> > > +     MEMCG_DATA_KMEM         = (1UL << 1),

> > > +     /* page has been accounted as network memory */

> > > +     MEMCG_DATA_SOCK         = (1UL << 2),

> > >       /* the next bit after the last actual flag */

> > > -     __NR_MEMCG_DATA_FLAGS  = (1UL << 2),

> > > +     __NR_MEMCG_DATA_FLAGS   = (1UL << 3),

> > >  };

> > >

> > >  #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)

> > > @@ -444,6 +446,11 @@ static inline bool PageMemcgKmem(struct page *page)

> > >       return page->memcg_data & MEMCG_DATA_KMEM;

> > >  }

> > >

> > > +static inline bool PageMemcgSock(struct page *page)

> > > +{

> > > +     return page->memcg_data & MEMCG_DATA_SOCK;

> > > +}

> > > +

> > >  #ifdef CONFIG_MEMCG_KMEM

> > >  /*

> > >   * page_objcgs - get the object cgroups vector associated with a page

> > > @@ -1095,6 +1102,11 @@ static inline bool PageMemcgKmem(struct page *page)

> > >       return false;

> > >  }

> > >

> > > +static inline bool PageMemcgSock(struct page *page)

> > > +{

> > > +     return false;

> > > +}

> > > +

> > >  static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)

> > >  {

> > >       return true;

> > > @@ -1561,6 +1573,10 @@ extern struct static_key_false memcg_sockets_enabled_key;

> > >  #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)

> > >  void mem_cgroup_sk_alloc(struct sock *sk);

> > >  void mem_cgroup_sk_free(struct sock *sk);

> > > +int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg, struct page **pages,

> > > +                              unsigned int nr_pages);

> > > +void mem_cgroup_uncharge_sock_pages(struct page **pages, unsigned int nr_pages);

> > > +

> > >  static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)

> > >  {

> > >       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)

> > > @@ -1589,6 +1605,18 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,

> > >                                         int nid, int shrinker_id)

> > >  {

> > >  }

> > > +

> > > +static inline int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg,

> > > +                                            struct page **pages,

> > > +                                            unsigned int nr_pages)

> > > +{

> > > +     return 0;

> > > +}

> > > +

> > > +static inline void mem_cgroup_uncharge_sock_pages(struct page **pages,

> > > +                                               unsigned int nr_pages)

> > > +{

> > > +}

> > >  #endif

> > >

> > >  #ifdef CONFIG_MEMCG_KMEM

> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c

> > > index db9836f4b64b..38e94538e081 100644

> > > --- a/mm/memcontrol.c

> > > +++ b/mm/memcontrol.c

> > > @@ -7061,6 +7061,66 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)

> > >       refill_stock(memcg, nr_pages);

> > >  }

> > >

> > > +/**

> > > + * mem_cgroup_charge_sock_pages - charge socket memory

> > > + * @memcg: memcg to charge

> > > + * @pages: array of pages to charge

> > > + * @nr_pages: number of pages

> > > + *

> > > + * Charges all @pages to current's memcg. The caller should have a reference on

> > > + * the given memcg.

> > > + *

> > > + * Returns 0 on success.

> > > + */

> > > +int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg, struct page **pages,

> > > +                              unsigned int nr_pages)

> > > +{

> > > +     int ret = 0;

> > > +

> > > +     if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))

> > > +             goto out;

> > > +

> > > +     ret = try_charge(memcg, GFP_KERNEL, nr_pages);

> > > +

> > > +     if (!ret) {

> > > +             int i;

> > > +

> > > +             for (i = 0; i < nr_pages; i++)

> > > +                     pages[i]->memcg_data = (unsigned long)memcg |

> > > +                             MEMCG_DATA_SOCK;

> > > +             css_get_many(&memcg->css, nr_pages);

> > > +     }

> > > +out:

> > > +     return ret;

> > > +}

> > > +

> > > +/**

> > > + * mem_cgroup_uncharge_sock_pages - uncharge socket pages

> > > + * @pages: array of pages to uncharge

> > > + * @nr_pages: number of pages

> > > + *

> > > + * This assumes all pages are charged to the same memcg.

> > > + */

> > > +void mem_cgroup_uncharge_sock_pages(struct page **pages, unsigned int nr_pages)

> > > +{

> > > +     int i;

> > > +     struct mem_cgroup *memcg;

> > > +

> > > +     if (mem_cgroup_disabled())

> > > +             return;

> > > +

> > > +     memcg = page_memcg(pages[0]);

> > > +

> > > +     if (unlikely(!memcg))

> > > +             return;

> > > +

> > > +     refill_stock(memcg, nr_pages);

> > > +

> > > +     for (i = 0; i < nr_pages; i++)

> > > +             pages[i]->memcg_data = 0;

> > > +     css_put_many(&memcg->css, nr_pages);

> > > +}

> >

> > What about statistics? Should it be accounted towards "sock", "slab/kmem" or deserves

> > a separate counter? Do we plan to eventually have shrinkers for this type of memory?

> >

> 

> While the pages in question are part of an sk_buff, they may be

> accounted towards sockmem. However, that charge is unaccounted when

> the skb is freed after the receive operation. When they are in use by

> the user application I do not think sockmem is the right place to have

> a break-out counter.


Does it mean that a page can be accounted twice (even temporarily)?

Historically we have a corresponding vmstat counter to each charged page.
It helps with finding accounting/stastistics issues: we can check that
memory.current ~= anon + file + sock + slab + percpu + stack.
It would be nice to preserve such ability.

> 

> To double check, what do you mean by shrinker?


I mean do we plan to implement a mechanism to reclaim memory from these drivers
on-demand, if a cgroup is experiencing high memory pressure.
Arjun Roy Jan. 13, 2021, 12:12 a.m. UTC | #2
On Tue, Jan 12, 2021 at 3:48 PM Roman Gushchin <guro@fb.com> wrote:
>

> On Tue, Jan 12, 2021 at 03:36:18PM -0800, Arjun Roy wrote:

> > On Tue, Jan 12, 2021 at 3:31 PM Roman Gushchin <guro@fb.com> wrote:

> > >

> > > On Tue, Jan 12, 2021 at 01:41:05PM -0800, Shakeel Butt wrote:

> > > > From: Arjun Roy <arjunroy@google.com>

> > > >

> > > > TCP zerocopy receive is used by high performance network applications to

> > > > further scale. For RX zerocopy, the memory containing the network data

> > > > filled by network driver is directly mapped into the address space of

> > > > high performance applications. To keep the TLB cost low, these

> > > > applications unmaps the network memory in big batches. So, this memory

> > > > can remain mapped for long time. This can cause memory isolation issue

> > > > as this memory becomes unaccounted after getting mapped into the

> > > > application address space. This patch adds the memcg accounting for such

> > > > memory.

> > > >

> > > > Accounting the network memory comes with its own unique challenge. The

> > > > high performance NIC drivers use page pooling to reuse the pages to

> > > > eliminate/reduce the expensive setup steps like IOMMU. These drivers

> > > > keep an extra reference on the pages and thus we can not depends on the

> > > > page reference for the uncharging. The page in the pool may keep a memcg

> > > > pinned for arbitrary long time or may get used by other memcg.

> > > >

> > > > This patch decouples the uncharging of the page from the refcnt and

> > > > associate it with the map count i.e. the page gets uncharged when the

> > > > last address space unmaps it. Now the question what if the driver drops

> > > > its reference while the page is still mapped. That is fine as the

> > > > address space also holds a reference to the page i.e. the reference

> > > > count can not drop to zero before the map count.

> > > >

> > > > Signed-off-by: Arjun Roy <arjunroy@google.com>

> > > > Co-developed-by: Shakeel Butt <shakeelb@google.com>

> > > > Signed-off-by: Shakeel Butt <shakeelb@google.com>

> > > > ---

> > > >  include/linux/memcontrol.h | 34 +++++++++++++++++++--

> > > >  mm/memcontrol.c            | 60 ++++++++++++++++++++++++++++++++++++++

> > > >  mm/rmap.c                  |  3 ++

> > > >  net/ipv4/tcp.c             | 27 +++++++++++++----

> > > >  4 files changed, 116 insertions(+), 8 deletions(-)

> > > >

> > > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

> > > > index 7a38a1517a05..0b0e3b4615cf 100644

> > > > --- a/include/linux/memcontrol.h

> > > > +++ b/include/linux/memcontrol.h

> > > > @@ -349,11 +349,13 @@ extern struct mem_cgroup *root_mem_cgroup;

> > > >

> > > >  enum page_memcg_data_flags {

> > > >       /* page->memcg_data is a pointer to an objcgs vector */

> > > > -     MEMCG_DATA_OBJCGS = (1UL << 0),

> > > > +     MEMCG_DATA_OBJCGS       = (1UL << 0),

> > > >       /* page has been accounted as a non-slab kernel page */

> > > > -     MEMCG_DATA_KMEM = (1UL << 1),

> > > > +     MEMCG_DATA_KMEM         = (1UL << 1),

> > > > +     /* page has been accounted as network memory */

> > > > +     MEMCG_DATA_SOCK         = (1UL << 2),

> > > >       /* the next bit after the last actual flag */

> > > > -     __NR_MEMCG_DATA_FLAGS  = (1UL << 2),

> > > > +     __NR_MEMCG_DATA_FLAGS   = (1UL << 3),

> > > >  };

> > > >

> > > >  #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)

> > > > @@ -444,6 +446,11 @@ static inline bool PageMemcgKmem(struct page *page)

> > > >       return page->memcg_data & MEMCG_DATA_KMEM;

> > > >  }

> > > >

> > > > +static inline bool PageMemcgSock(struct page *page)

> > > > +{

> > > > +     return page->memcg_data & MEMCG_DATA_SOCK;

> > > > +}

> > > > +

> > > >  #ifdef CONFIG_MEMCG_KMEM

> > > >  /*

> > > >   * page_objcgs - get the object cgroups vector associated with a page

> > > > @@ -1095,6 +1102,11 @@ static inline bool PageMemcgKmem(struct page *page)

> > > >       return false;

> > > >  }

> > > >

> > > > +static inline bool PageMemcgSock(struct page *page)

> > > > +{

> > > > +     return false;

> > > > +}

> > > > +

> > > >  static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)

> > > >  {

> > > >       return true;

> > > > @@ -1561,6 +1573,10 @@ extern struct static_key_false memcg_sockets_enabled_key;

> > > >  #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)

> > > >  void mem_cgroup_sk_alloc(struct sock *sk);

> > > >  void mem_cgroup_sk_free(struct sock *sk);

> > > > +int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg, struct page **pages,

> > > > +                              unsigned int nr_pages);

> > > > +void mem_cgroup_uncharge_sock_pages(struct page **pages, unsigned int nr_pages);

> > > > +

> > > >  static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)

> > > >  {

> > > >       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)

> > > > @@ -1589,6 +1605,18 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,

> > > >                                         int nid, int shrinker_id)

> > > >  {

> > > >  }

> > > > +

> > > > +static inline int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg,

> > > > +                                            struct page **pages,

> > > > +                                            unsigned int nr_pages)

> > > > +{

> > > > +     return 0;

> > > > +}

> > > > +

> > > > +static inline void mem_cgroup_uncharge_sock_pages(struct page **pages,

> > > > +                                               unsigned int nr_pages)

> > > > +{

> > > > +}

> > > >  #endif

> > > >

> > > >  #ifdef CONFIG_MEMCG_KMEM

> > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c

> > > > index db9836f4b64b..38e94538e081 100644

> > > > --- a/mm/memcontrol.c

> > > > +++ b/mm/memcontrol.c

> > > > @@ -7061,6 +7061,66 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)

> > > >       refill_stock(memcg, nr_pages);

> > > >  }

> > > >

> > > > +/**

> > > > + * mem_cgroup_charge_sock_pages - charge socket memory

> > > > + * @memcg: memcg to charge

> > > > + * @pages: array of pages to charge

> > > > + * @nr_pages: number of pages

> > > > + *

> > > > + * Charges all @pages to current's memcg. The caller should have a reference on

> > > > + * the given memcg.

> > > > + *

> > > > + * Returns 0 on success.

> > > > + */

> > > > +int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg, struct page **pages,

> > > > +                              unsigned int nr_pages)

> > > > +{

> > > > +     int ret = 0;

> > > > +

> > > > +     if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))

> > > > +             goto out;

> > > > +

> > > > +     ret = try_charge(memcg, GFP_KERNEL, nr_pages);

> > > > +

> > > > +     if (!ret) {

> > > > +             int i;

> > > > +

> > > > +             for (i = 0; i < nr_pages; i++)

> > > > +                     pages[i]->memcg_data = (unsigned long)memcg |

> > > > +                             MEMCG_DATA_SOCK;

> > > > +             css_get_many(&memcg->css, nr_pages);

> > > > +     }

> > > > +out:

> > > > +     return ret;

> > > > +}

> > > > +

> > > > +/**

> > > > + * mem_cgroup_uncharge_sock_pages - uncharge socket pages

> > > > + * @pages: array of pages to uncharge

> > > > + * @nr_pages: number of pages

> > > > + *

> > > > + * This assumes all pages are charged to the same memcg.

> > > > + */

> > > > +void mem_cgroup_uncharge_sock_pages(struct page **pages, unsigned int nr_pages)

> > > > +{

> > > > +     int i;

> > > > +     struct mem_cgroup *memcg;

> > > > +

> > > > +     if (mem_cgroup_disabled())

> > > > +             return;

> > > > +

> > > > +     memcg = page_memcg(pages[0]);

> > > > +

> > > > +     if (unlikely(!memcg))

> > > > +             return;

> > > > +

> > > > +     refill_stock(memcg, nr_pages);

> > > > +

> > > > +     for (i = 0; i < nr_pages; i++)

> > > > +             pages[i]->memcg_data = 0;

> > > > +     css_put_many(&memcg->css, nr_pages);

> > > > +}

> > >

> > > What about statistics? Should it be accounted towards "sock", "slab/kmem" or deserves

> > > a separate counter? Do we plan to eventually have shrinkers for this type of memory?

> > >

> >

> > While the pages in question are part of an sk_buff, they may be

> > accounted towards sockmem. However, that charge is unaccounted when

> > the skb is freed after the receive operation. When they are in use by

> > the user application I do not think sockmem is the right place to have

> > a break-out counter.

>

> Does it mean that a page can be accounted twice (even temporarily)?

>


This was an actual consideration for this patchset that we went back
and forth on a little bit.
Short answer, for this patch in its current form: yes. We're calling
mem_cgroup_charge_sock_pages() immediately prior to vm_insert_pages();
and the skb isn't cleaned up until afterwards. Thus we have a period
of double charging.

The pseudocode for the approach in this patch is:

while skb = next skb in queue is not null:
    charge_skb_pages(skb.pages) // sets page.memcg for each page
    // at this point pages are double counted
    vm_insert_pages(skb.pages)
    free(skb) // unrefs the pages, no longer double counted

An alternative version of this patch went the other way: have a short
period of undercharging.

while skb = next skb in queue is not null:
    for page in skb.pages set page.memcg
    vm_insert_pages(skb.pages)
    free(skb) // unrefs the pages. pages are now undercounted
charge_skb_pages(nr_pages_mapped, FORCE_CHARGE) // count is now correct again
ret

The latter would have the benefit of having less charge operations
(less pricey atomics etc.) but would require the charge to be forced
to succeed since we already made it available to the user. But if one
assumes that in the common case, that sock->memcg == current->memcg,
then perhaps a forced charge is ok in that case.


> Historically we have a corresponding vmstat counter to each charged page.

> It helps with finding accounting/stastistics issues: we can check that

> memory.current ~= anon + file + sock + slab + percpu + stack.

> It would be nice to preserve such ability.

>


Perhaps one option would be to have it count as a file page, or have a
new category.

> >

> > To double check, what do you mean by shrinker?

>

> I mean do we plan to implement a mechanism to reclaim memory from these drivers

> on-demand, if a cgroup is experiencing high memory pressure.


Not that I'm aware of.

Thanks,
-Arjun
Shakeel Butt Jan. 13, 2021, 12:12 a.m. UTC | #3
On Tue, Jan 12, 2021 at 3:48 PM Roman Gushchin <guro@fb.com> wrote:
>

> On Tue, Jan 12, 2021 at 03:36:18PM -0800, Arjun Roy wrote:

> > On Tue, Jan 12, 2021 at 3:31 PM Roman Gushchin <guro@fb.com> wrote:

> > >

> > > On Tue, Jan 12, 2021 at 01:41:05PM -0800, Shakeel Butt wrote:

> > > > From: Arjun Roy <arjunroy@google.com>

> > > >

> > > > TCP zerocopy receive is used by high performance network applications to

> > > > further scale. For RX zerocopy, the memory containing the network data

> > > > filled by network driver is directly mapped into the address space of

> > > > high performance applications. To keep the TLB cost low, these

> > > > applications unmaps the network memory in big batches. So, this memory

> > > > can remain mapped for long time. This can cause memory isolation issue

> > > > as this memory becomes unaccounted after getting mapped into the

> > > > application address space. This patch adds the memcg accounting for such

> > > > memory.

> > > >

> > > > Accounting the network memory comes with its own unique challenge. The

> > > > high performance NIC drivers use page pooling to reuse the pages to

> > > > eliminate/reduce the expensive setup steps like IOMMU. These drivers

> > > > keep an extra reference on the pages and thus we can not depends on the

> > > > page reference for the uncharging. The page in the pool may keep a memcg

> > > > pinned for arbitrary long time or may get used by other memcg.

> > > >

> > > > This patch decouples the uncharging of the page from the refcnt and

> > > > associate it with the map count i.e. the page gets uncharged when the

> > > > last address space unmaps it. Now the question what if the driver drops

> > > > its reference while the page is still mapped. That is fine as the

> > > > address space also holds a reference to the page i.e. the reference

> > > > count can not drop to zero before the map count.

> > > >

> > > > Signed-off-by: Arjun Roy <arjunroy@google.com>

> > > > Co-developed-by: Shakeel Butt <shakeelb@google.com>

> > > > Signed-off-by: Shakeel Butt <shakeelb@google.com>

> > > > ---

> > > >  include/linux/memcontrol.h | 34 +++++++++++++++++++--

> > > >  mm/memcontrol.c            | 60 ++++++++++++++++++++++++++++++++++++++

> > > >  mm/rmap.c                  |  3 ++

> > > >  net/ipv4/tcp.c             | 27 +++++++++++++----

> > > >  4 files changed, 116 insertions(+), 8 deletions(-)

> > > >

> > > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

> > > > index 7a38a1517a05..0b0e3b4615cf 100644

> > > > --- a/include/linux/memcontrol.h

> > > > +++ b/include/linux/memcontrol.h

> > > > @@ -349,11 +349,13 @@ extern struct mem_cgroup *root_mem_cgroup;

> > > >

> > > >  enum page_memcg_data_flags {

> > > >       /* page->memcg_data is a pointer to an objcgs vector */

> > > > -     MEMCG_DATA_OBJCGS = (1UL << 0),

> > > > +     MEMCG_DATA_OBJCGS       = (1UL << 0),

> > > >       /* page has been accounted as a non-slab kernel page */

> > > > -     MEMCG_DATA_KMEM = (1UL << 1),

> > > > +     MEMCG_DATA_KMEM         = (1UL << 1),

> > > > +     /* page has been accounted as network memory */

> > > > +     MEMCG_DATA_SOCK         = (1UL << 2),

> > > >       /* the next bit after the last actual flag */

> > > > -     __NR_MEMCG_DATA_FLAGS  = (1UL << 2),

> > > > +     __NR_MEMCG_DATA_FLAGS   = (1UL << 3),

> > > >  };

> > > >

> > > >  #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)

> > > > @@ -444,6 +446,11 @@ static inline bool PageMemcgKmem(struct page *page)

> > > >       return page->memcg_data & MEMCG_DATA_KMEM;

> > > >  }

> > > >

> > > > +static inline bool PageMemcgSock(struct page *page)

> > > > +{

> > > > +     return page->memcg_data & MEMCG_DATA_SOCK;

> > > > +}

> > > > +

> > > >  #ifdef CONFIG_MEMCG_KMEM

> > > >  /*

> > > >   * page_objcgs - get the object cgroups vector associated with a page

> > > > @@ -1095,6 +1102,11 @@ static inline bool PageMemcgKmem(struct page *page)

> > > >       return false;

> > > >  }

> > > >

> > > > +static inline bool PageMemcgSock(struct page *page)

> > > > +{

> > > > +     return false;

> > > > +}

> > > > +

> > > >  static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)

> > > >  {

> > > >       return true;

> > > > @@ -1561,6 +1573,10 @@ extern struct static_key_false memcg_sockets_enabled_key;

> > > >  #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)

> > > >  void mem_cgroup_sk_alloc(struct sock *sk);

> > > >  void mem_cgroup_sk_free(struct sock *sk);

> > > > +int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg, struct page **pages,

> > > > +                              unsigned int nr_pages);

> > > > +void mem_cgroup_uncharge_sock_pages(struct page **pages, unsigned int nr_pages);

> > > > +

> > > >  static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)

> > > >  {

> > > >       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)

> > > > @@ -1589,6 +1605,18 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,

> > > >                                         int nid, int shrinker_id)

> > > >  {

> > > >  }

> > > > +

> > > > +static inline int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg,

> > > > +                                            struct page **pages,

> > > > +                                            unsigned int nr_pages)

> > > > +{

> > > > +     return 0;

> > > > +}

> > > > +

> > > > +static inline void mem_cgroup_uncharge_sock_pages(struct page **pages,

> > > > +                                               unsigned int nr_pages)

> > > > +{

> > > > +}

> > > >  #endif

> > > >

> > > >  #ifdef CONFIG_MEMCG_KMEM

> > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c

> > > > index db9836f4b64b..38e94538e081 100644

> > > > --- a/mm/memcontrol.c

> > > > +++ b/mm/memcontrol.c

> > > > @@ -7061,6 +7061,66 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)

> > > >       refill_stock(memcg, nr_pages);

> > > >  }

> > > >

> > > > +/**

> > > > + * mem_cgroup_charge_sock_pages - charge socket memory

> > > > + * @memcg: memcg to charge

> > > > + * @pages: array of pages to charge

> > > > + * @nr_pages: number of pages

> > > > + *

> > > > + * Charges all @pages to current's memcg. The caller should have a reference on

> > > > + * the given memcg.

> > > > + *

> > > > + * Returns 0 on success.

> > > > + */

> > > > +int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg, struct page **pages,

> > > > +                              unsigned int nr_pages)

> > > > +{

> > > > +     int ret = 0;

> > > > +

> > > > +     if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))

> > > > +             goto out;

> > > > +

> > > > +     ret = try_charge(memcg, GFP_KERNEL, nr_pages);

> > > > +

> > > > +     if (!ret) {

> > > > +             int i;

> > > > +

> > > > +             for (i = 0; i < nr_pages; i++)

> > > > +                     pages[i]->memcg_data = (unsigned long)memcg |

> > > > +                             MEMCG_DATA_SOCK;

> > > > +             css_get_many(&memcg->css, nr_pages);

> > > > +     }

> > > > +out:

> > > > +     return ret;

> > > > +}

> > > > +

> > > > +/**

> > > > + * mem_cgroup_uncharge_sock_pages - uncharge socket pages

> > > > + * @pages: array of pages to uncharge

> > > > + * @nr_pages: number of pages

> > > > + *

> > > > + * This assumes all pages are charged to the same memcg.

> > > > + */

> > > > +void mem_cgroup_uncharge_sock_pages(struct page **pages, unsigned int nr_pages)

> > > > +{

> > > > +     int i;

> > > > +     struct mem_cgroup *memcg;

> > > > +

> > > > +     if (mem_cgroup_disabled())

> > > > +             return;

> > > > +

> > > > +     memcg = page_memcg(pages[0]);

> > > > +

> > > > +     if (unlikely(!memcg))

> > > > +             return;

> > > > +

> > > > +     refill_stock(memcg, nr_pages);

> > > > +

> > > > +     for (i = 0; i < nr_pages; i++)

> > > > +             pages[i]->memcg_data = 0;

> > > > +     css_put_many(&memcg->css, nr_pages);

> > > > +}

> > >

> > > What about statistics? Should it be accounted towards "sock", "slab/kmem" or deserves

> > > a separate counter? Do we plan to eventually have shrinkers for this type of memory?

> > >

> >

> > While the pages in question are part of an sk_buff, they may be

> > accounted towards sockmem. However, that charge is unaccounted when

> > the skb is freed after the receive operation. When they are in use by

> > the user application I do not think sockmem is the right place to have

> > a break-out counter.

>

> Does it mean that a page can be accounted twice (even temporarily)?

>


Actually yes depending on the environment. For applications running in
cgroup v2 where the skmem is charged against the memcg's memory
counter and if sk->sk_memcg is equal to current's memcg there is a
small window where the memory is double charged. However that is not
the case for cgroup v1 or if sk->sk_memcg is different from current's
memcg. IMO this small window of double charging is fine as it is
somewhat similar to recv*() syscalls where the application has to
pre-allocate memory where the kernel copies the network data which is
charged to sk->sk_memcg.

> Historically we have a corresponding vmstat counter to each charged page.

> It helps with finding accounting/stastistics issues: we can check that

> memory.current ~= anon + file + sock + slab + percpu + stack.

> It would be nice to preserve such ability.


I think it can be either sock memcg stat or a new one. I will think of
something.

>

> >

> > To double check, what do you mean by shrinker?

>

> I mean do we plan to implement a mechanism to reclaim memory from these drivers

> on-demand, if a cgroup is experiencing high memory pressure.


These pages are not really reclaimable as these are not really LRU or
page cache pages. These will get freed when application unmaps them.
Shakeel Butt Jan. 13, 2021, 12:18 a.m. UTC | #4
On Tue, Jan 12, 2021 at 4:12 PM Arjun Roy <arjunroy@google.com> wrote:
>

> On Tue, Jan 12, 2021 at 3:48 PM Roman Gushchin <guro@fb.com> wrote:

> >

[snip]
> > Historically we have a corresponding vmstat counter to each charged page.

> > It helps with finding accounting/stastistics issues: we can check that

> > memory.current ~= anon + file + sock + slab + percpu + stack.

> > It would be nice to preserve such ability.

> >

>

> Perhaps one option would be to have it count as a file page, or have a

> new category.

>


Oh these are actually already accounted for in NR_FILE_MAPPED.
Roman Gushchin Jan. 13, 2021, 6:43 p.m. UTC | #5
On Tue, Jan 12, 2021 at 04:18:44PM -0800, Shakeel Butt wrote:
> On Tue, Jan 12, 2021 at 4:12 PM Arjun Roy <arjunroy@google.com> wrote:

> >

> > On Tue, Jan 12, 2021 at 3:48 PM Roman Gushchin <guro@fb.com> wrote:

> > >

> [snip]

> > > Historically we have a corresponding vmstat counter to each charged page.

> > > It helps with finding accounting/stastistics issues: we can check that

> > > memory.current ~= anon + file + sock + slab + percpu + stack.

> > > It would be nice to preserve such ability.

> > >

> >

> > Perhaps one option would be to have it count as a file page, or have a

> > new category.

> >

> 

> Oh these are actually already accounted for in NR_FILE_MAPPED.


Well, it's confusing. Can't we fix this by looking at the new page memcg flag?
Roman Gushchin Jan. 13, 2021, 6:47 p.m. UTC | #6
On Tue, Jan 12, 2021 at 04:12:08PM -0800, Arjun Roy wrote:
> On Tue, Jan 12, 2021 at 3:48 PM Roman Gushchin <guro@fb.com> wrote:

> >

> > On Tue, Jan 12, 2021 at 03:36:18PM -0800, Arjun Roy wrote:

> > > On Tue, Jan 12, 2021 at 3:31 PM Roman Gushchin <guro@fb.com> wrote:

> > > >

> > > > On Tue, Jan 12, 2021 at 01:41:05PM -0800, Shakeel Butt wrote:

> > > > > From: Arjun Roy <arjunroy@google.com>

> > > > >

> > > > > TCP zerocopy receive is used by high performance network applications to

> > > > > further scale. For RX zerocopy, the memory containing the network data

> > > > > filled by network driver is directly mapped into the address space of

> > > > > high performance applications. To keep the TLB cost low, these

> > > > > applications unmaps the network memory in big batches. So, this memory

> > > > > can remain mapped for long time. This can cause memory isolation issue

> > > > > as this memory becomes unaccounted after getting mapped into the

> > > > > application address space. This patch adds the memcg accounting for such

> > > > > memory.

> > > > >

> > > > > Accounting the network memory comes with its own unique challenge. The

> > > > > high performance NIC drivers use page pooling to reuse the pages to

> > > > > eliminate/reduce the expensive setup steps like IOMMU. These drivers

> > > > > keep an extra reference on the pages and thus we can not depends on the

> > > > > page reference for the uncharging. The page in the pool may keep a memcg

> > > > > pinned for arbitrary long time or may get used by other memcg.

> > > > >

> > > > > This patch decouples the uncharging of the page from the refcnt and

> > > > > associate it with the map count i.e. the page gets uncharged when the

> > > > > last address space unmaps it. Now the question what if the driver drops

> > > > > its reference while the page is still mapped. That is fine as the

> > > > > address space also holds a reference to the page i.e. the reference

> > > > > count can not drop to zero before the map count.

> > > > >

> > > > > Signed-off-by: Arjun Roy <arjunroy@google.com>

> > > > > Co-developed-by: Shakeel Butt <shakeelb@google.com>

> > > > > Signed-off-by: Shakeel Butt <shakeelb@google.com>

> > > > > ---

> > > > >  include/linux/memcontrol.h | 34 +++++++++++++++++++--

> > > > >  mm/memcontrol.c            | 60 ++++++++++++++++++++++++++++++++++++++

> > > > >  mm/rmap.c                  |  3 ++

> > > > >  net/ipv4/tcp.c             | 27 +++++++++++++----

> > > > >  4 files changed, 116 insertions(+), 8 deletions(-)

> > > > >

> > > > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

> > > > > index 7a38a1517a05..0b0e3b4615cf 100644

> > > > > --- a/include/linux/memcontrol.h

> > > > > +++ b/include/linux/memcontrol.h

> > > > > @@ -349,11 +349,13 @@ extern struct mem_cgroup *root_mem_cgroup;

> > > > >

> > > > >  enum page_memcg_data_flags {

> > > > >       /* page->memcg_data is a pointer to an objcgs vector */

> > > > > -     MEMCG_DATA_OBJCGS = (1UL << 0),

> > > > > +     MEMCG_DATA_OBJCGS       = (1UL << 0),

> > > > >       /* page has been accounted as a non-slab kernel page */

> > > > > -     MEMCG_DATA_KMEM = (1UL << 1),

> > > > > +     MEMCG_DATA_KMEM         = (1UL << 1),

> > > > > +     /* page has been accounted as network memory */

> > > > > +     MEMCG_DATA_SOCK         = (1UL << 2),

> > > > >       /* the next bit after the last actual flag */

> > > > > -     __NR_MEMCG_DATA_FLAGS  = (1UL << 2),

> > > > > +     __NR_MEMCG_DATA_FLAGS   = (1UL << 3),

> > > > >  };

> > > > >

> > > > >  #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)

> > > > > @@ -444,6 +446,11 @@ static inline bool PageMemcgKmem(struct page *page)

> > > > >       return page->memcg_data & MEMCG_DATA_KMEM;

> > > > >  }

> > > > >

> > > > > +static inline bool PageMemcgSock(struct page *page)

> > > > > +{

> > > > > +     return page->memcg_data & MEMCG_DATA_SOCK;

> > > > > +}

> > > > > +

> > > > >  #ifdef CONFIG_MEMCG_KMEM

> > > > >  /*

> > > > >   * page_objcgs - get the object cgroups vector associated with a page

> > > > > @@ -1095,6 +1102,11 @@ static inline bool PageMemcgKmem(struct page *page)

> > > > >       return false;

> > > > >  }

> > > > >

> > > > > +static inline bool PageMemcgSock(struct page *page)

> > > > > +{

> > > > > +     return false;

> > > > > +}

> > > > > +

> > > > >  static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)

> > > > >  {

> > > > >       return true;

> > > > > @@ -1561,6 +1573,10 @@ extern struct static_key_false memcg_sockets_enabled_key;

> > > > >  #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)

> > > > >  void mem_cgroup_sk_alloc(struct sock *sk);

> > > > >  void mem_cgroup_sk_free(struct sock *sk);

> > > > > +int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg, struct page **pages,

> > > > > +                              unsigned int nr_pages);

> > > > > +void mem_cgroup_uncharge_sock_pages(struct page **pages, unsigned int nr_pages);

> > > > > +

> > > > >  static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)

> > > > >  {

> > > > >       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)

> > > > > @@ -1589,6 +1605,18 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,

> > > > >                                         int nid, int shrinker_id)

> > > > >  {

> > > > >  }

> > > > > +

> > > > > +static inline int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg,

> > > > > +                                            struct page **pages,

> > > > > +                                            unsigned int nr_pages)

> > > > > +{

> > > > > +     return 0;

> > > > > +}

> > > > > +

> > > > > +static inline void mem_cgroup_uncharge_sock_pages(struct page **pages,

> > > > > +                                               unsigned int nr_pages)

> > > > > +{

> > > > > +}

> > > > >  #endif

> > > > >

> > > > >  #ifdef CONFIG_MEMCG_KMEM

> > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c

> > > > > index db9836f4b64b..38e94538e081 100644

> > > > > --- a/mm/memcontrol.c

> > > > > +++ b/mm/memcontrol.c

> > > > > @@ -7061,6 +7061,66 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)

> > > > >       refill_stock(memcg, nr_pages);

> > > > >  }

> > > > >

> > > > > +/**

> > > > > + * mem_cgroup_charge_sock_pages - charge socket memory

> > > > > + * @memcg: memcg to charge

> > > > > + * @pages: array of pages to charge

> > > > > + * @nr_pages: number of pages

> > > > > + *

> > > > > + * Charges all @pages to current's memcg. The caller should have a reference on

> > > > > + * the given memcg.

> > > > > + *

> > > > > + * Returns 0 on success.

> > > > > + */

> > > > > +int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg, struct page **pages,

> > > > > +                              unsigned int nr_pages)

> > > > > +{

> > > > > +     int ret = 0;

> > > > > +

> > > > > +     if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))

> > > > > +             goto out;

> > > > > +

> > > > > +     ret = try_charge(memcg, GFP_KERNEL, nr_pages);

> > > > > +

> > > > > +     if (!ret) {

> > > > > +             int i;

> > > > > +

> > > > > +             for (i = 0; i < nr_pages; i++)

> > > > > +                     pages[i]->memcg_data = (unsigned long)memcg |

> > > > > +                             MEMCG_DATA_SOCK;

> > > > > +             css_get_many(&memcg->css, nr_pages);

> > > > > +     }

> > > > > +out:

> > > > > +     return ret;

> > > > > +}

> > > > > +

> > > > > +/**

> > > > > + * mem_cgroup_uncharge_sock_pages - uncharge socket pages

> > > > > + * @pages: array of pages to uncharge

> > > > > + * @nr_pages: number of pages

> > > > > + *

> > > > > + * This assumes all pages are charged to the same memcg.

> > > > > + */

> > > > > +void mem_cgroup_uncharge_sock_pages(struct page **pages, unsigned int nr_pages)

> > > > > +{

> > > > > +     int i;

> > > > > +     struct mem_cgroup *memcg;

> > > > > +

> > > > > +     if (mem_cgroup_disabled())

> > > > > +             return;

> > > > > +

> > > > > +     memcg = page_memcg(pages[0]);

> > > > > +

> > > > > +     if (unlikely(!memcg))

> > > > > +             return;

> > > > > +

> > > > > +     refill_stock(memcg, nr_pages);

> > > > > +

> > > > > +     for (i = 0; i < nr_pages; i++)

> > > > > +             pages[i]->memcg_data = 0;

> > > > > +     css_put_many(&memcg->css, nr_pages);

> > > > > +}

> > > >

> > > > What about statistics? Should it be accounted towards "sock", "slab/kmem" or deserves

> > > > a separate counter? Do we plan to eventually have shrinkers for this type of memory?

> > > >

> > >

> > > While the pages in question are part of an sk_buff, they may be

> > > accounted towards sockmem. However, that charge is unaccounted when

> > > the skb is freed after the receive operation. When they are in use by

> > > the user application I do not think sockmem is the right place to have

> > > a break-out counter.

> >

> > Does it mean that a page can be accounted twice (even temporarily)?

> >

> 

> This was an actual consideration for this patchset that we went back

> and forth on a little bit.

> Short answer, for this patch in its current form: yes. We're calling

> mem_cgroup_charge_sock_pages() immediately prior to vm_insert_pages();

> and the skb isn't cleaned up until afterwards. Thus we have a period

> of double charging.

> 

> The pseudocode for the approach in this patch is:

> 

> while skb = next skb in queue is not null:

>     charge_skb_pages(skb.pages) // sets page.memcg for each page

>     // at this point pages are double counted

>     vm_insert_pages(skb.pages)

>     free(skb) // unrefs the pages, no longer double counted

> 

> An alternative version of this patch went the other way: have a short

> period of undercharging.

> 

> while skb = next skb in queue is not null:

>     for page in skb.pages set page.memcg

>     vm_insert_pages(skb.pages)

>     free(skb) // unrefs the pages. pages are now undercounted

> charge_skb_pages(nr_pages_mapped, FORCE_CHARGE) // count is now correct again

> ret


I have to think more, but at the first look the second approach is better.
IMO forcing the charge is less of a problem than double accounting
(we're forcing sock memory charging anyway). I'm afraid that even if the
double counting is temporarily for each individual page, with a constant
traffic it will create a permanent difference.

Btw, what is a typical size of the TCP zerocopy data per-memcg? MBs? GBs?

Thanks!
Arjun Roy Jan. 13, 2021, 7:09 p.m. UTC | #7
On Wed, Jan 13, 2021 at 10:48 AM Roman Gushchin <guro@fb.com> wrote:
>

> On Tue, Jan 12, 2021 at 04:12:08PM -0800, Arjun Roy wrote:

> > On Tue, Jan 12, 2021 at 3:48 PM Roman Gushchin <guro@fb.com> wrote:

> > >

> > > On Tue, Jan 12, 2021 at 03:36:18PM -0800, Arjun Roy wrote:

> > > > On Tue, Jan 12, 2021 at 3:31 PM Roman Gushchin <guro@fb.com> wrote:

> > > > >

> > > > > On Tue, Jan 12, 2021 at 01:41:05PM -0800, Shakeel Butt wrote:

> > > > > > From: Arjun Roy <arjunroy@google.com>

> > > > > >

> > > > > > TCP zerocopy receive is used by high performance network applications to

> > > > > > further scale. For RX zerocopy, the memory containing the network data

> > > > > > filled by network driver is directly mapped into the address space of

> > > > > > high performance applications. To keep the TLB cost low, these

> > > > > > applications unmaps the network memory in big batches. So, this memory

> > > > > > can remain mapped for long time. This can cause memory isolation issue

> > > > > > as this memory becomes unaccounted after getting mapped into the

> > > > > > application address space. This patch adds the memcg accounting for such

> > > > > > memory.

> > > > > >

> > > > > > Accounting the network memory comes with its own unique challenge. The

> > > > > > high performance NIC drivers use page pooling to reuse the pages to

> > > > > > eliminate/reduce the expensive setup steps like IOMMU. These drivers

> > > > > > keep an extra reference on the pages and thus we can not depends on the

> > > > > > page reference for the uncharging. The page in the pool may keep a memcg

> > > > > > pinned for arbitrary long time or may get used by other memcg.

> > > > > >

> > > > > > This patch decouples the uncharging of the page from the refcnt and

> > > > > > associate it with the map count i.e. the page gets uncharged when the

> > > > > > last address space unmaps it. Now the question what if the driver drops

> > > > > > its reference while the page is still mapped. That is fine as the

> > > > > > address space also holds a reference to the page i.e. the reference

> > > > > > count can not drop to zero before the map count.

> > > > > >

> > > > > > Signed-off-by: Arjun Roy <arjunroy@google.com>

> > > > > > Co-developed-by: Shakeel Butt <shakeelb@google.com>

> > > > > > Signed-off-by: Shakeel Butt <shakeelb@google.com>

> > > > > > ---

> > > > > >  include/linux/memcontrol.h | 34 +++++++++++++++++++--

> > > > > >  mm/memcontrol.c            | 60 ++++++++++++++++++++++++++++++++++++++

> > > > > >  mm/rmap.c                  |  3 ++

> > > > > >  net/ipv4/tcp.c             | 27 +++++++++++++----

> > > > > >  4 files changed, 116 insertions(+), 8 deletions(-)

> > > > > >

> > > > > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

> > > > > > index 7a38a1517a05..0b0e3b4615cf 100644

> > > > > > --- a/include/linux/memcontrol.h

> > > > > > +++ b/include/linux/memcontrol.h

> > > > > > @@ -349,11 +349,13 @@ extern struct mem_cgroup *root_mem_cgroup;

> > > > > >

> > > > > >  enum page_memcg_data_flags {

> > > > > >       /* page->memcg_data is a pointer to an objcgs vector */

> > > > > > -     MEMCG_DATA_OBJCGS = (1UL << 0),

> > > > > > +     MEMCG_DATA_OBJCGS       = (1UL << 0),

> > > > > >       /* page has been accounted as a non-slab kernel page */

> > > > > > -     MEMCG_DATA_KMEM = (1UL << 1),

> > > > > > +     MEMCG_DATA_KMEM         = (1UL << 1),

> > > > > > +     /* page has been accounted as network memory */

> > > > > > +     MEMCG_DATA_SOCK         = (1UL << 2),

> > > > > >       /* the next bit after the last actual flag */

> > > > > > -     __NR_MEMCG_DATA_FLAGS  = (1UL << 2),

> > > > > > +     __NR_MEMCG_DATA_FLAGS   = (1UL << 3),

> > > > > >  };

> > > > > >

> > > > > >  #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)

> > > > > > @@ -444,6 +446,11 @@ static inline bool PageMemcgKmem(struct page *page)

> > > > > >       return page->memcg_data & MEMCG_DATA_KMEM;

> > > > > >  }

> > > > > >

> > > > > > +static inline bool PageMemcgSock(struct page *page)

> > > > > > +{

> > > > > > +     return page->memcg_data & MEMCG_DATA_SOCK;

> > > > > > +}

> > > > > > +

> > > > > >  #ifdef CONFIG_MEMCG_KMEM

> > > > > >  /*

> > > > > >   * page_objcgs - get the object cgroups vector associated with a page

> > > > > > @@ -1095,6 +1102,11 @@ static inline bool PageMemcgKmem(struct page *page)

> > > > > >       return false;

> > > > > >  }

> > > > > >

> > > > > > +static inline bool PageMemcgSock(struct page *page)

> > > > > > +{

> > > > > > +     return false;

> > > > > > +}

> > > > > > +

> > > > > >  static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)

> > > > > >  {

> > > > > >       return true;

> > > > > > @@ -1561,6 +1573,10 @@ extern struct static_key_false memcg_sockets_enabled_key;

> > > > > >  #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)

> > > > > >  void mem_cgroup_sk_alloc(struct sock *sk);

> > > > > >  void mem_cgroup_sk_free(struct sock *sk);

> > > > > > +int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg, struct page **pages,

> > > > > > +                              unsigned int nr_pages);

> > > > > > +void mem_cgroup_uncharge_sock_pages(struct page **pages, unsigned int nr_pages);

> > > > > > +

> > > > > >  static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)

> > > > > >  {

> > > > > >       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)

> > > > > > @@ -1589,6 +1605,18 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,

> > > > > >                                         int nid, int shrinker_id)

> > > > > >  {

> > > > > >  }

> > > > > > +

> > > > > > +static inline int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg,

> > > > > > +                                            struct page **pages,

> > > > > > +                                            unsigned int nr_pages)

> > > > > > +{

> > > > > > +     return 0;

> > > > > > +}

> > > > > > +

> > > > > > +static inline void mem_cgroup_uncharge_sock_pages(struct page **pages,

> > > > > > +                                               unsigned int nr_pages)

> > > > > > +{

> > > > > > +}

> > > > > >  #endif

> > > > > >

> > > > > >  #ifdef CONFIG_MEMCG_KMEM

> > > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c

> > > > > > index db9836f4b64b..38e94538e081 100644

> > > > > > --- a/mm/memcontrol.c

> > > > > > +++ b/mm/memcontrol.c

> > > > > > @@ -7061,6 +7061,66 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)

> > > > > >       refill_stock(memcg, nr_pages);

> > > > > >  }

> > > > > >

> > > > > > +/**

> > > > > > + * mem_cgroup_charge_sock_pages - charge socket memory

> > > > > > + * @memcg: memcg to charge

> > > > > > + * @pages: array of pages to charge

> > > > > > + * @nr_pages: number of pages

> > > > > > + *

> > > > > > + * Charges all @pages to current's memcg. The caller should have a reference on

> > > > > > + * the given memcg.

> > > > > > + *

> > > > > > + * Returns 0 on success.

> > > > > > + */

> > > > > > +int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg, struct page **pages,

> > > > > > +                              unsigned int nr_pages)

> > > > > > +{

> > > > > > +     int ret = 0;

> > > > > > +

> > > > > > +     if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))

> > > > > > +             goto out;

> > > > > > +

> > > > > > +     ret = try_charge(memcg, GFP_KERNEL, nr_pages);

> > > > > > +

> > > > > > +     if (!ret) {

> > > > > > +             int i;

> > > > > > +

> > > > > > +             for (i = 0; i < nr_pages; i++)

> > > > > > +                     pages[i]->memcg_data = (unsigned long)memcg |

> > > > > > +                             MEMCG_DATA_SOCK;

> > > > > > +             css_get_many(&memcg->css, nr_pages);

> > > > > > +     }

> > > > > > +out:

> > > > > > +     return ret;

> > > > > > +}

> > > > > > +

> > > > > > +/**

> > > > > > + * mem_cgroup_uncharge_sock_pages - uncharge socket pages

> > > > > > + * @pages: array of pages to uncharge

> > > > > > + * @nr_pages: number of pages

> > > > > > + *

> > > > > > + * This assumes all pages are charged to the same memcg.

> > > > > > + */

> > > > > > +void mem_cgroup_uncharge_sock_pages(struct page **pages, unsigned int nr_pages)

> > > > > > +{

> > > > > > +     int i;

> > > > > > +     struct mem_cgroup *memcg;

> > > > > > +

> > > > > > +     if (mem_cgroup_disabled())

> > > > > > +             return;

> > > > > > +

> > > > > > +     memcg = page_memcg(pages[0]);

> > > > > > +

> > > > > > +     if (unlikely(!memcg))

> > > > > > +             return;

> > > > > > +

> > > > > > +     refill_stock(memcg, nr_pages);

> > > > > > +

> > > > > > +     for (i = 0; i < nr_pages; i++)

> > > > > > +             pages[i]->memcg_data = 0;

> > > > > > +     css_put_many(&memcg->css, nr_pages);

> > > > > > +}

> > > > >

> > > > > What about statistics? Should it be accounted towards "sock", "slab/kmem" or deserves

> > > > > a separate counter? Do we plan to eventually have shrinkers for this type of memory?

> > > > >

> > > >

> > > > While the pages in question are part of an sk_buff, they may be

> > > > accounted towards sockmem. However, that charge is unaccounted when

> > > > the skb is freed after the receive operation. When they are in use by

> > > > the user application I do not think sockmem is the right place to have

> > > > a break-out counter.

> > >

> > > Does it mean that a page can be accounted twice (even temporarily)?

> > >

> >

> > This was an actual consideration for this patchset that we went back

> > and forth on a little bit.

> > Short answer, for this patch in its current form: yes. We're calling

> > mem_cgroup_charge_sock_pages() immediately prior to vm_insert_pages();

> > and the skb isn't cleaned up until afterwards. Thus we have a period

> > of double charging.

> >

> > The pseudocode for the approach in this patch is:

> >

> > while skb = next skb in queue is not null:

> >     charge_skb_pages(skb.pages) // sets page.memcg for each page

> >     // at this point pages are double counted

> >     vm_insert_pages(skb.pages)

> >     free(skb) // unrefs the pages, no longer double counted

> >

> > An alternative version of this patch went the other way: have a short

> > period of undercharging.

> >

> > while skb = next skb in queue is not null:

> >     for page in skb.pages set page.memcg

> >     vm_insert_pages(skb.pages)

> >     free(skb) // unrefs the pages. pages are now undercounted

> > charge_skb_pages(nr_pages_mapped, FORCE_CHARGE) // count is now correct again

> > ret

>

> I have to think more, but at the first look the second approach is better.

> IMO forcing the charge is less of a problem than double accounting

> (we're forcing sock memory charging anyway). I'm afraid that even if the

> double counting is temporarily for each individual page, with a constant

> traffic it will create a permanent difference.

>

> Btw, what is a typical size of the TCP zerocopy data per-memcg? MBs? GBs?

>


This will depend a lot on the application - the flow is:
1. Packet arrives at NIC => ... => skb in queue with page frag
2. Zerocopy receive maps page to userspace, now it's charged to memcg
3. <application can hold onto this page for potentially arbitrary
amount of time>
4. Application calls munmap, or MADV_DONTNEED: now it's uncharged from memcg.

Depending on what happens in step 3, I think it's hard to say what is
typical. But I will note that it's a few tens of milliseconds at line
rate to get to 1GB of backlog data, and I can imagine an application
taking at least a few milliseconds to handle a request depending on
what kind of computation is required.

-Arjun


> Thanks!
Shakeel Butt Jan. 13, 2021, 7:12 p.m. UTC | #8
On Wed, Jan 13, 2021 at 10:43 AM Roman Gushchin <guro@fb.com> wrote:
>

> On Tue, Jan 12, 2021 at 04:18:44PM -0800, Shakeel Butt wrote:

> > On Tue, Jan 12, 2021 at 4:12 PM Arjun Roy <arjunroy@google.com> wrote:

> > >

> > > On Tue, Jan 12, 2021 at 3:48 PM Roman Gushchin <guro@fb.com> wrote:

> > > >

> > [snip]

> > > > Historically we have a corresponding vmstat counter to each charged page.

> > > > It helps with finding accounting/stastistics issues: we can check that

> > > > memory.current ~= anon + file + sock + slab + percpu + stack.

> > > > It would be nice to preserve such ability.

> > > >

> > >

> > > Perhaps one option would be to have it count as a file page, or have a

> > > new category.

> > >

> >

> > Oh these are actually already accounted for in NR_FILE_MAPPED.

>

> Well, it's confusing. Can't we fix this by looking at the new page memcg flag?


Yes we can. I am inclined more towards just using NR_FILE_PAGES (as
Arjun suggested) instead of adding a new metric.
Shakeel Butt Jan. 13, 2021, 7:34 p.m. UTC | #9
On Wed, Jan 13, 2021 at 10:48 AM Roman Gushchin <guro@fb.com> wrote:
>

[snip]
> > > Does it mean that a page can be accounted twice (even temporarily)?

> > >

> >

> > This was an actual consideration for this patchset that we went back

> > and forth on a little bit.

> > Short answer, for this patch in its current form: yes. We're calling

> > mem_cgroup_charge_sock_pages() immediately prior to vm_insert_pages();

> > and the skb isn't cleaned up until afterwards. Thus we have a period

> > of double charging.

> >

> > The pseudocode for the approach in this patch is:

> >

> > while skb = next skb in queue is not null:

> >     charge_skb_pages(skb.pages) // sets page.memcg for each page

> >     // at this point pages are double counted

> >     vm_insert_pages(skb.pages)

> >     free(skb) // unrefs the pages, no longer double counted

> >

> > An alternative version of this patch went the other way: have a short

> > period of undercharging.

> >

> > while skb = next skb in queue is not null:

> >     for page in skb.pages set page.memcg

> >     vm_insert_pages(skb.pages)

> >     free(skb) // unrefs the pages. pages are now undercounted

> > charge_skb_pages(nr_pages_mapped, FORCE_CHARGE) // count is now correct again

> > ret

>

> I have to think more, but at the first look the second approach is better.

> IMO forcing the charge is less of a problem than double accounting

> (we're forcing sock memory charging anyway). I'm afraid that even if the

> double counting is temporarily for each individual page, with a constant

> traffic it will create a permanent difference.

>


The double accounting behavior is a bit different in cgroup v1 vs v2
world as skmem is accounted in memory for v2 and a separate tcp
counter for v1. I am fine with either approaches mentioned by Arjun
but I would prefer to not add complexity by doing one approach for v1
and the other for v2.
Yang Shi Jan. 13, 2021, 7:49 p.m. UTC | #10
On Wed, Jan 13, 2021 at 11:13 AM Shakeel Butt <shakeelb@google.com> wrote:
>

> On Wed, Jan 13, 2021 at 10:43 AM Roman Gushchin <guro@fb.com> wrote:

> >

> > On Tue, Jan 12, 2021 at 04:18:44PM -0800, Shakeel Butt wrote:

> > > On Tue, Jan 12, 2021 at 4:12 PM Arjun Roy <arjunroy@google.com> wrote:

> > > >

> > > > On Tue, Jan 12, 2021 at 3:48 PM Roman Gushchin <guro@fb.com> wrote:

> > > > >

> > > [snip]

> > > > > Historically we have a corresponding vmstat counter to each charged page.

> > > > > It helps with finding accounting/stastistics issues: we can check that

> > > > > memory.current ~= anon + file + sock + slab + percpu + stack.

> > > > > It would be nice to preserve such ability.

> > > > >

> > > >

> > > > Perhaps one option would be to have it count as a file page, or have a

> > > > new category.

> > > >

> > >

> > > Oh these are actually already accounted for in NR_FILE_MAPPED.

> >

> > Well, it's confusing. Can't we fix this by looking at the new page memcg flag?

>

> Yes we can. I am inclined more towards just using NR_FILE_PAGES (as

> Arjun suggested) instead of adding a new metric.


IMHO I tend to agree with Roman, it sounds confusing. I'm not sure how
people relies on the counter to have ballpark estimation about the
amount of reclaimable memory for specific memcg, but they are
unreclaimable. And, I don't think they are accounted to
NR_ACTIVE_FILE/NR_INACTIVE_FILE, right? So, the disparity between
NR_FILE_PAGES and NR_{IN}ACTIVE_FILE may be confusing either.

>
Shakeel Butt Jan. 13, 2021, 7:55 p.m. UTC | #11
On Wed, Jan 13, 2021 at 11:49 AM Yang Shi <shy828301@gmail.com> wrote:
>

> On Wed, Jan 13, 2021 at 11:13 AM Shakeel Butt <shakeelb@google.com> wrote:

> >

> > On Wed, Jan 13, 2021 at 10:43 AM Roman Gushchin <guro@fb.com> wrote:

> > >

> > > On Tue, Jan 12, 2021 at 04:18:44PM -0800, Shakeel Butt wrote:

> > > > On Tue, Jan 12, 2021 at 4:12 PM Arjun Roy <arjunroy@google.com> wrote:

> > > > >

> > > > > On Tue, Jan 12, 2021 at 3:48 PM Roman Gushchin <guro@fb.com> wrote:

> > > > > >

> > > > [snip]

> > > > > > Historically we have a corresponding vmstat counter to each charged page.

> > > > > > It helps with finding accounting/stastistics issues: we can check that

> > > > > > memory.current ~= anon + file + sock + slab + percpu + stack.

> > > > > > It would be nice to preserve such ability.

> > > > > >

> > > > >

> > > > > Perhaps one option would be to have it count as a file page, or have a

> > > > > new category.

> > > > >

> > > >

> > > > Oh these are actually already accounted for in NR_FILE_MAPPED.

> > >

> > > Well, it's confusing. Can't we fix this by looking at the new page memcg flag?

> >

> > Yes we can. I am inclined more towards just using NR_FILE_PAGES (as

> > Arjun suggested) instead of adding a new metric.

>

> IMHO I tend to agree with Roman, it sounds confusing. I'm not sure how

> people relies on the counter to have ballpark estimation about the

> amount of reclaimable memory for specific memcg, but they are

> unreclaimable. And, I don't think they are accounted to

> NR_ACTIVE_FILE/NR_INACTIVE_FILE, right? So, the disparity between

> NR_FILE_PAGES and NR_{IN}ACTIVE_FILE may be confusing either.

>


Please note that due to shmem/tmpfs there is already disparity between
NR_FILE_PAGES and NR_{IN}ACTIVE_FILE.

BTW I don't have a strong opinion against adding a new metric. If
there is consensus we can add one.
Arjun Roy Jan. 20, 2021, 3:31 a.m. UTC | #12
On Wed, Jan 13, 2021 at 11:55 AM Shakeel Butt <shakeelb@google.com> wrote:
>

> On Wed, Jan 13, 2021 at 11:49 AM Yang Shi <shy828301@gmail.com> wrote:

> >

> > On Wed, Jan 13, 2021 at 11:13 AM Shakeel Butt <shakeelb@google.com> wrote:

> > >

> > > On Wed, Jan 13, 2021 at 10:43 AM Roman Gushchin <guro@fb.com> wrote:

> > > >

> > > > On Tue, Jan 12, 2021 at 04:18:44PM -0800, Shakeel Butt wrote:

> > > > > On Tue, Jan 12, 2021 at 4:12 PM Arjun Roy <arjunroy@google.com> wrote:

> > > > > >

> > > > > > On Tue, Jan 12, 2021 at 3:48 PM Roman Gushchin <guro@fb.com> wrote:

> > > > > > >

> > > > > [snip]

> > > > > > > Historically we have a corresponding vmstat counter to each charged page.

> > > > > > > It helps with finding accounting/stastistics issues: we can check that

> > > > > > > memory.current ~= anon + file + sock + slab + percpu + stack.

> > > > > > > It would be nice to preserve such ability.

> > > > > > >

> > > > > >

> > > > > > Perhaps one option would be to have it count as a file page, or have a

> > > > > > new category.

> > > > > >

> > > > >

> > > > > Oh these are actually already accounted for in NR_FILE_MAPPED.

> > > >

> > > > Well, it's confusing. Can't we fix this by looking at the new page memcg flag?

> > >

> > > Yes we can. I am inclined more towards just using NR_FILE_PAGES (as

> > > Arjun suggested) instead of adding a new metric.

> >

> > IMHO I tend to agree with Roman, it sounds confusing. I'm not sure how

> > people relies on the counter to have ballpark estimation about the

> > amount of reclaimable memory for specific memcg, but they are

> > unreclaimable. And, I don't think they are accounted to

> > NR_ACTIVE_FILE/NR_INACTIVE_FILE, right? So, the disparity between

> > NR_FILE_PAGES and NR_{IN}ACTIVE_FILE may be confusing either.

> >

>

> Please note that due to shmem/tmpfs there is already disparity between

> NR_FILE_PAGES and NR_{IN}ACTIVE_FILE.

>

> BTW I don't have a strong opinion against adding a new metric. If

> there is consensus we can add one.


Just wanted to see if there were any thoughts/consensus on what the
best way to proceed is - should there be a v2 patch with specific
changes? Or is NR_FILE_PAGES alright?

And similar query, for pre-charging vs. post charging.

Thanks,
-Arjun
Roman Gushchin Jan. 20, 2021, 3:49 a.m. UTC | #13
On Tue, Jan 19, 2021 at 07:31:51PM -0800, Arjun Roy wrote:
> On Wed, Jan 13, 2021 at 11:55 AM Shakeel Butt <shakeelb@google.com> wrote:

> >

> > On Wed, Jan 13, 2021 at 11:49 AM Yang Shi <shy828301@gmail.com> wrote:

> > >

> > > On Wed, Jan 13, 2021 at 11:13 AM Shakeel Butt <shakeelb@google.com> wrote:

> > > >

> > > > On Wed, Jan 13, 2021 at 10:43 AM Roman Gushchin <guro@fb.com> wrote:

> > > > >

> > > > > On Tue, Jan 12, 2021 at 04:18:44PM -0800, Shakeel Butt wrote:

> > > > > > On Tue, Jan 12, 2021 at 4:12 PM Arjun Roy <arjunroy@google.com> wrote:

> > > > > > >

> > > > > > > On Tue, Jan 12, 2021 at 3:48 PM Roman Gushchin <guro@fb.com> wrote:

> > > > > > > >

> > > > > > [snip]

> > > > > > > > Historically we have a corresponding vmstat counter to each charged page.

> > > > > > > > It helps with finding accounting/stastistics issues: we can check that

> > > > > > > > memory.current ~= anon + file + sock + slab + percpu + stack.

> > > > > > > > It would be nice to preserve such ability.

> > > > > > > >

> > > > > > >

> > > > > > > Perhaps one option would be to have it count as a file page, or have a

> > > > > > > new category.

> > > > > > >

> > > > > >

> > > > > > Oh these are actually already accounted for in NR_FILE_MAPPED.

> > > > >

> > > > > Well, it's confusing. Can't we fix this by looking at the new page memcg flag?

> > > >

> > > > Yes we can. I am inclined more towards just using NR_FILE_PAGES (as

> > > > Arjun suggested) instead of adding a new metric.

> > >

> > > IMHO I tend to agree with Roman, it sounds confusing. I'm not sure how

> > > people relies on the counter to have ballpark estimation about the

> > > amount of reclaimable memory for specific memcg, but they are

> > > unreclaimable. And, I don't think they are accounted to

> > > NR_ACTIVE_FILE/NR_INACTIVE_FILE, right? So, the disparity between

> > > NR_FILE_PAGES and NR_{IN}ACTIVE_FILE may be confusing either.

> > >

> >

> > Please note that due to shmem/tmpfs there is already disparity between

> > NR_FILE_PAGES and NR_{IN}ACTIVE_FILE.

> >

> > BTW I don't have a strong opinion against adding a new metric. If

> > there is consensus we can add one.

> 

> Just wanted to see if there were any thoughts/consensus on what the

> best way to proceed is - should there be a v2 patch with specific

> changes? Or is NR_FILE_PAGES alright?


I struggle to see why these pages should be considered file pages.
(NR_FILE_MAPPED is a different story).
I'm ok with slab/kmem, sock and a new metric, we can discuss what's
the best option out of three.

> And similar query, for pre-charging vs. post charging.


IMO double accounting is bad. If it means post charging, I vote for
post charging.

Thanks!
diff mbox series

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7a38a1517a05..0b0e3b4615cf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -349,11 +349,13 @@  extern struct mem_cgroup *root_mem_cgroup;
 
 enum page_memcg_data_flags {
 	/* page->memcg_data is a pointer to an objcgs vector */
-	MEMCG_DATA_OBJCGS = (1UL << 0),
+	MEMCG_DATA_OBJCGS	= (1UL << 0),
 	/* page has been accounted as a non-slab kernel page */
-	MEMCG_DATA_KMEM = (1UL << 1),
+	MEMCG_DATA_KMEM		= (1UL << 1),
+	/* page has been accounted as network memory */
+	MEMCG_DATA_SOCK		= (1UL << 2),
 	/* the next bit after the last actual flag */
-	__NR_MEMCG_DATA_FLAGS  = (1UL << 2),
+	__NR_MEMCG_DATA_FLAGS	= (1UL << 3),
 };
 
 #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
@@ -444,6 +446,11 @@  static inline bool PageMemcgKmem(struct page *page)
 	return page->memcg_data & MEMCG_DATA_KMEM;
 }
 
+static inline bool PageMemcgSock(struct page *page)
+{
+	return page->memcg_data & MEMCG_DATA_SOCK;
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 /*
  * page_objcgs - get the object cgroups vector associated with a page
@@ -1095,6 +1102,11 @@  static inline bool PageMemcgKmem(struct page *page)
 	return false;
 }
 
+static inline bool PageMemcgSock(struct page *page)
+{
+	return false;
+}
+
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
 	return true;
@@ -1561,6 +1573,10 @@  extern struct static_key_false memcg_sockets_enabled_key;
 #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
 void mem_cgroup_sk_alloc(struct sock *sk);
 void mem_cgroup_sk_free(struct sock *sk);
+int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg, struct page **pages,
+				 unsigned int nr_pages);
+void mem_cgroup_uncharge_sock_pages(struct page **pages, unsigned int nr_pages);
+
 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)
@@ -1589,6 +1605,18 @@  static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
 					  int nid, int shrinker_id)
 {
 }
+
+static inline int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg,
+					       struct page **pages,
+					       unsigned int nr_pages)
+{
+	return 0;
+}
+
+static inline void mem_cgroup_uncharge_sock_pages(struct page **pages,
+						  unsigned int nr_pages)
+{
+}
 #endif
 
 #ifdef CONFIG_MEMCG_KMEM
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index db9836f4b64b..38e94538e081 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -7061,6 +7061,66 @@  void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 	refill_stock(memcg, nr_pages);
 }
 
+/**
+ * mem_cgroup_charge_sock_pages - charge socket memory
+ * @memcg: memcg to charge
+ * @pages: array of pages to charge
+ * @nr_pages: number of pages
+ *
+ * Charges all @pages to current's memcg. The caller should have a reference on
+ * the given memcg.
+ *
+ * Returns 0 on success.
+ */
+int mem_cgroup_charge_sock_pages(struct mem_cgroup *memcg, struct page **pages,
+				 unsigned int nr_pages)
+{
+	int ret = 0;
+
+	if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
+		goto out;
+
+	ret = try_charge(memcg, GFP_KERNEL, nr_pages);
+
+	if (!ret) {
+		int i;
+
+		for (i = 0; i < nr_pages; i++)
+			pages[i]->memcg_data = (unsigned long)memcg |
+				MEMCG_DATA_SOCK;
+		css_get_many(&memcg->css, nr_pages);
+	}
+out:
+	return ret;
+}
+
+/**
+ * mem_cgroup_uncharge_sock_pages - uncharge socket pages
+ * @pages: array of pages to uncharge
+ * @nr_pages: number of pages
+ *
+ * This assumes all pages are charged to the same memcg.
+ */
+void mem_cgroup_uncharge_sock_pages(struct page **pages, unsigned int nr_pages)
+{
+	int i;
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	memcg = page_memcg(pages[0]);
+
+	if (unlikely(!memcg))
+		return;
+
+	refill_stock(memcg, nr_pages);
+
+	for (i = 0; i < nr_pages; i++)
+		pages[i]->memcg_data = 0;
+	css_put_many(&memcg->css, nr_pages);
+}
+
 static int __init cgroup_memory(char *s)
 {
 	char *token;
diff --git a/mm/rmap.c b/mm/rmap.c
index 5ebf16fae4b9..ea6b09757215 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1284,6 +1284,9 @@  static void page_remove_file_rmap(struct page *page, bool compound)
 
 	if (unlikely(PageMlocked(page)))
 		clear_page_mlock(page);
+
+	if (unlikely(PageMemcgSock(page)))
+		mem_cgroup_uncharge_sock_pages(&page, 1);
 }
 
 static void page_remove_anon_compound_rmap(struct page *page)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2267d21c73a6..af0cec677aa0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1934,6 +1934,8 @@  static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
 					      u32 total_bytes_to_map,
 					      int err)
 {
+	unsigned int pages_mapped = 0;
+
 	/* At least one page did not map. Try zapping if we skipped earlier. */
 	if (err == -EBUSY &&
 	    zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
@@ -1954,7 +1956,8 @@  static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
 		err = vm_insert_pages(vma, *address,
 				      pending_pages,
 				      &pages_remaining);
-		bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
+		pages_mapped = leftover_pages - pages_remaining;
+		bytes_mapped = PAGE_SIZE * pages_mapped;
 		*seq += bytes_mapped;
 		*address += bytes_mapped;
 	}
@@ -1968,11 +1971,16 @@  static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
 
 		*length -= bytes_not_mapped;
 		zc->recv_skip_hint += bytes_not_mapped;
+
+		/* Cancel the memcg charge for remaining pages. */
+		mem_cgroup_uncharge_sock_pages(pending_pages + pages_mapped,
+					       pages_remaining);
 	}
 	return err;
 }
 
 static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
+					struct mem_cgroup *memcg,
 					struct page **pages,
 					unsigned int pages_to_map,
 					unsigned long *address,
@@ -1986,6 +1994,11 @@  static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
 	unsigned int bytes_mapped;
 	int err;
 
+	err = mem_cgroup_charge_sock_pages(memcg, pages, pages_to_map);
+
+	if (unlikely(err))
+		return err;
+
 	err = vm_insert_pages(vma, *address, pages, &pages_remaining);
 	pages_mapped = pages_to_map - (unsigned int)pages_remaining;
 	bytes_mapped = PAGE_SIZE * pages_mapped;
@@ -2011,6 +2024,7 @@  static int tcp_zerocopy_receive(struct sock *sk,
 	u32 length = 0, offset, vma_len, avail_len, copylen = 0;
 	unsigned long address = (unsigned long)zc->address;
 	struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
+	struct mem_cgroup *memcg;
 	s32 copybuf_len = zc->copybuf_len;
 	struct tcp_sock *tp = tcp_sk(sk);
 	const skb_frag_t *frags = NULL;
@@ -2062,6 +2076,7 @@  static int tcp_zerocopy_receive(struct sock *sk,
 		zc->length = avail_len;
 		zc->recv_skip_hint = avail_len;
 	}
+	memcg = get_mem_cgroup_from_mm(current->mm);
 	ret = 0;
 	while (length + PAGE_SIZE <= zc->length) {
 		int mappable_offset;
@@ -2101,7 +2116,7 @@  static int tcp_zerocopy_receive(struct sock *sk,
 			/* Either full batch, or we're about to go to next skb
 			 * (and we cannot unroll failed ops across skbs).
 			 */
-			ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+			ret = tcp_zerocopy_vm_insert_batch(vma, memcg, pages,
 							   pages_to_map,
 							   &address, &length,
 							   &seq, zc,
@@ -2112,9 +2127,10 @@  static int tcp_zerocopy_receive(struct sock *sk,
 		}
 	}
 	if (pages_to_map) {
-		ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
-						   &address, &length, &seq,
-						   zc, total_bytes_to_map);
+		ret = tcp_zerocopy_vm_insert_batch(vma, memcg, pages,
+						   pages_to_map, &address,
+						   &length, &seq, zc,
+						   total_bytes_to_map);
 	}
 out:
 	mmap_read_unlock(current->mm);
@@ -2138,6 +2154,7 @@  static int tcp_zerocopy_receive(struct sock *sk,
 			ret = -EIO;
 	}
 	zc->length = length;
+	mem_cgroup_put(memcg);
 	return ret;
 }
 #endif