diff mbox series

mm: proc: add Sock to /proc/meminfo

Message ID 20201010103854.66746-1-songmuchun@bytedance.com
State New
Headers show
Series mm: proc: add Sock to /proc/meminfo | expand

Commit Message

Muchun Song Oct. 10, 2020, 10:38 a.m. UTC
The amount of memory allocated to socket buffers can become significant.
However, we do not display the amount of memory consumed by socket
buffers. In this case, knowing where the memory is consumed by the kernel
is very difficult. On our server with 500GB RAM, sometimes we can see
25GB unaccounted for in /proc/meminfo. With page_owner enabled, our
analysis found that the following memory allocation path consumes the
memory.

  849698 times:
  Page allocated via order 3, mask 0x4052c0(GFP_NOWAIT|__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP)
   __alloc_pages_nodemask+0x11d/0x290
   skb_page_frag_refill+0x68/0xf0
   sk_page_frag_refill+0x19/0x70
   tcp_sendmsg_locked+0x2f4/0xd10
   tcp_sendmsg+0x29/0xa0
   sock_sendmsg+0x30/0x40
   sock_write_iter+0x8f/0x100
   __vfs_write+0x10b/0x190
   vfs_write+0xb0/0x190
   ksys_write+0x5a/0xd0
   do_syscall_64+0x5d/0x110
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 drivers/base/node.c      |  2 ++
 drivers/net/virtio_net.c |  3 +--
 fs/proc/meminfo.c        |  1 +
 include/linux/mmzone.h   |  1 +
 include/linux/skbuff.h   | 43 ++++++++++++++++++++++++++++++++++++++--
 kernel/exit.c            |  3 +--
 mm/page_alloc.c          |  7 +++++--
 mm/vmstat.c              |  1 +
 net/core/sock.c          |  8 ++++----
 net/ipv4/tcp.c           |  3 +--
 net/xfrm/xfrm_state.c    |  3 +--
 11 files changed, 59 insertions(+), 16 deletions(-)

Comments

Randy Dunlap Oct. 10, 2020, 4:36 p.m. UTC | #1
Hi,

On 10/10/20 3:38 AM, Muchun Song wrote:
> The amount of memory allocated to sockets buffer can become significant.
> However, we do not display the amount of memory consumed by sockets
> buffer. In this case, knowing where the memory is consumed by the kernel
> is very difficult. On our server with 500GB RAM, sometimes we can see
> 25GB disappear through /proc/meminfo. After our analysis, we found the
> following memory allocation path which consumes the memory with page_owner
> enabled.
> 
>   849698 times:
>   Page allocated via order 3, mask 0x4052c0(GFP_NOWAIT|__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP)
>    __alloc_pages_nodemask+0x11d/0x290
>    skb_page_frag_refill+0x68/0xf0
>    sk_page_frag_refill+0x19/0x70
>    tcp_sendmsg_locked+0x2f4/0xd10
>    tcp_sendmsg+0x29/0xa0
>    sock_sendmsg+0x30/0x40
>    sock_write_iter+0x8f/0x100
>    __vfs_write+0x10b/0x190
>    vfs_write+0xb0/0x190
>    ksys_write+0x5a/0xd0
>    do_syscall_64+0x5d/0x110
>    entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> ---
>  drivers/base/node.c      |  2 ++
>  drivers/net/virtio_net.c |  3 +--
>  fs/proc/meminfo.c        |  1 +
>  include/linux/mmzone.h   |  1 +
>  include/linux/skbuff.h   | 43 ++++++++++++++++++++++++++++++++++++++--
>  kernel/exit.c            |  3 +--
>  mm/page_alloc.c          |  7 +++++--
>  mm/vmstat.c              |  1 +
>  net/core/sock.c          |  8 ++++----
>  net/ipv4/tcp.c           |  3 +--
>  net/xfrm/xfrm_state.c    |  3 +--
>  11 files changed, 59 insertions(+), 16 deletions(-)

Thanks for finding that.

Please update Documentation/filesystems/proc.rst "meminfo" section also.
Muchun Song Oct. 11, 2020, 4:42 a.m. UTC | #2
On Sun, Oct 11, 2020 at 12:37 AM Randy Dunlap <rdunlap@infradead.org> wrote:
>
> Hi,
>
> On 10/10/20 3:38 AM, Muchun Song wrote:
> > The amount of memory allocated to sockets buffer can become significant.
> > However, we do not display the amount of memory consumed by sockets
> > buffer. In this case, knowing where the memory is consumed by the kernel
> > is very difficult. On our server with 500GB RAM, sometimes we can see
> > 25GB disappear through /proc/meminfo. After our analysis, we found the
> > following memory allocation path which consumes the memory with page_owner
> > enabled.
> >
> >   849698 times:
> >   Page allocated via order 3, mask 0x4052c0(GFP_NOWAIT|__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP)
> >    __alloc_pages_nodemask+0x11d/0x290
> >    skb_page_frag_refill+0x68/0xf0
> >    sk_page_frag_refill+0x19/0x70
> >    tcp_sendmsg_locked+0x2f4/0xd10
> >    tcp_sendmsg+0x29/0xa0
> >    sock_sendmsg+0x30/0x40
> >    sock_write_iter+0x8f/0x100
> >    __vfs_write+0x10b/0x190
> >    vfs_write+0xb0/0x190
> >    ksys_write+0x5a/0xd0
> >    do_syscall_64+0x5d/0x110
> >    entry_SYSCALL_64_after_hwframe+0x44/0xa9
> >
> > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > ---
> >  drivers/base/node.c      |  2 ++
> >  drivers/net/virtio_net.c |  3 +--
> >  fs/proc/meminfo.c        |  1 +
> >  include/linux/mmzone.h   |  1 +
> >  include/linux/skbuff.h   | 43 ++++++++++++++++++++++++++++++++++++++--
> >  kernel/exit.c            |  3 +--
> >  mm/page_alloc.c          |  7 +++++--
> >  mm/vmstat.c              |  1 +
> >  net/core/sock.c          |  8 ++++----
> >  net/ipv4/tcp.c           |  3 +--
> >  net/xfrm/xfrm_state.c    |  3 +--
> >  11 files changed, 59 insertions(+), 16 deletions(-)
>
> Thanks for finding that.
>
> Please update Documentation/filesystems/proc.rst "meminfo" section also.

Will do. Thanks for your suggestions.

>
> --
> ~Randy
>
Mike Rapoport Oct. 11, 2020, 1:52 p.m. UTC | #3
On Sat, Oct 10, 2020 at 06:38:54PM +0800, Muchun Song wrote:
> The amount of memory allocated to sockets buffer can become significant.
> However, we do not display the amount of memory consumed by sockets
> buffer. In this case, knowing where the memory is consumed by the kernel
> is very difficult. On our server with 500GB RAM, sometimes we can see
> 25GB disappear through /proc/meminfo. After our analysis, we found the
> following memory allocation path which consumes the memory with page_owner
> enabled.
 
I have a high-level question.
There is accounting of the socket memory for memcg that gets called from
the networking layer. Did you check if the same call sites can be used
for the system-wide accounting as well?

>   849698 times:
>   Page allocated via order 3, mask 0x4052c0(GFP_NOWAIT|__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP)
>    __alloc_pages_nodemask+0x11d/0x290
>    skb_page_frag_refill+0x68/0xf0
>    sk_page_frag_refill+0x19/0x70
>    tcp_sendmsg_locked+0x2f4/0xd10
>    tcp_sendmsg+0x29/0xa0
>    sock_sendmsg+0x30/0x40
>    sock_write_iter+0x8f/0x100
>    __vfs_write+0x10b/0x190
>    vfs_write+0xb0/0x190
>    ksys_write+0x5a/0xd0
>    do_syscall_64+0x5d/0x110
>    entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> ---
>  drivers/base/node.c      |  2 ++
>  drivers/net/virtio_net.c |  3 +--

Is virtio-net the only driver that required an update?

>  fs/proc/meminfo.c        |  1 +
>  include/linux/mmzone.h   |  1 +
>  include/linux/skbuff.h   | 43 ++++++++++++++++++++++++++++++++++++++--
>  kernel/exit.c            |  3 +--
>  mm/page_alloc.c          |  7 +++++--
>  mm/vmstat.c              |  1 +
>  net/core/sock.c          |  8 ++++----
>  net/ipv4/tcp.c           |  3 +--
>  net/xfrm/xfrm_state.c    |  3 +--
>  11 files changed, 59 insertions(+), 16 deletions(-)
>
Muchun Song Oct. 11, 2020, 4 p.m. UTC | #4
On Sun, Oct 11, 2020 at 9:53 PM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Sat, Oct 10, 2020 at 06:38:54PM +0800, Muchun Song wrote:
> > The amount of memory allocated to sockets buffer can become significant.
> > However, we do not display the amount of memory consumed by sockets
> > buffer. In this case, knowing where the memory is consumed by the kernel
> > is very difficult. On our server with 500GB RAM, sometimes we can see
> > 25GB disappear through /proc/meminfo. After our analysis, we found the
> > following memory allocation path which consumes the memory with page_owner
> > enabled.
>
> I have a high-level question.
> There is accounting of the socket memory for memcg that gets called from
> the networking layer. Did you check if the same call sites can be used
> for the system-wide accounting as well?

I also thought about this. But we do not pass the `struct page` parameter to
the sock accounting memcg API, so we do not know which NUMA node
the socket buffer memory was allocated from and cannot do node-level
statistics. In addition, there is another problem. If the user sends a 4096-byte
message, we only charge one page to the memcg but the system allocates 8
pages. So if we reuse the same call sites for the system-wide accounting,
the statistical count we get is always smaller than the actual situation.

>
> >   849698 times:
> >   Page allocated via order 3, mask 0x4052c0(GFP_NOWAIT|__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP)
> >    __alloc_pages_nodemask+0x11d/0x290
> >    skb_page_frag_refill+0x68/0xf0
> >    sk_page_frag_refill+0x19/0x70
> >    tcp_sendmsg_locked+0x2f4/0xd10
> >    tcp_sendmsg+0x29/0xa0
> >    sock_sendmsg+0x30/0x40
> >    sock_write_iter+0x8f/0x100
> >    __vfs_write+0x10b/0x190
> >    vfs_write+0xb0/0x190
> >    ksys_write+0x5a/0xd0
> >    do_syscall_64+0x5d/0x110
> >    entry_SYSCALL_64_after_hwframe+0x44/0xa9
> >
> > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > ---
> >  drivers/base/node.c      |  2 ++
> >  drivers/net/virtio_net.c |  3 +--
>
> Is virtio-net the only driver that required an update?

Yeah, only virtio-net needs an update, because it is the only one that uses
the skb_page_frag_refill() API.

>
> >  fs/proc/meminfo.c        |  1 +
> >  include/linux/mmzone.h   |  1 +
> >  include/linux/skbuff.h   | 43 ++++++++++++++++++++++++++++++++++++++--
> >  kernel/exit.c            |  3 +--
> >  mm/page_alloc.c          |  7 +++++--
> >  mm/vmstat.c              |  1 +
> >  net/core/sock.c          |  8 ++++----
> >  net/ipv4/tcp.c           |  3 +--
> >  net/xfrm/xfrm_state.c    |  3 +--
> >  11 files changed, 59 insertions(+), 16 deletions(-)
> >
Cong Wang Oct. 11, 2020, 6:39 p.m. UTC | #5
On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
>
> The amount of memory allocated to sockets buffer can become significant.
> However, we do not display the amount of memory consumed by sockets
> buffer. In this case, knowing where the memory is consumed by the kernel

We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
rather than /proc/meminfo?

>  static inline void __skb_frag_unref(skb_frag_t *frag)
>  {
> -       put_page(skb_frag_page(frag));
> +       struct page *page = skb_frag_page(frag);
> +
> +       if (put_page_testzero(page)) {
> +               dec_sock_node_page_state(page);
> +               __put_page(page);
> +       }
>  }

You mix socket page frag with skb frag at least, not sure this is exactly
what you want, because clearly skb page frags are frequently used
by network drivers rather than sockets.

Also, which one matches this dec_sock_node_page_state()? Clearly
not skb_fill_page_desc() or __skb_frag_ref().

Thanks.
Muchun Song Oct. 12, 2020, 4:22 a.m. UTC | #6
On Mon, Oct 12, 2020 at 2:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
> >
> > The amount of memory allocated to sockets buffer can become significant.
> > However, we do not display the amount of memory consumed by sockets
> > buffer. In this case, knowing where the memory is consumed by the kernel
>
> We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
> rather than /proc/meminfo?

If the system has little free memory, we can find out where the memory is via
/proc/meminfo. If a lot of memory is consumed by socket buffers, we cannot
know it when Sock is not shown in /proc/meminfo. If an unaware user
does not think of the socket buffers, naturally they will not run `ss -m`. The
end result is that we still don’t know where the memory is consumed. And we add
Sock to /proc/meminfo just like the memcg does (the 'sock' item in the cgroup
v2 memory.stat). So I think that adding it to /proc/meminfo is sufficient.

>
> >  static inline void __skb_frag_unref(skb_frag_t *frag)
> >  {
> > -       put_page(skb_frag_page(frag));
> > +       struct page *page = skb_frag_page(frag);
> > +
> > +       if (put_page_testzero(page)) {
> > +               dec_sock_node_page_state(page);
> > +               __put_page(page);
> > +       }
> >  }
>
> You mix socket page frag with skb frag at least, not sure this is exactly
> what you want, because clearly skb page frags are frequently used
> by network drivers rather than sockets.
>
> Also, which one matches this dec_sock_node_page_state()? Clearly
> not skb_fill_page_desc() or __skb_frag_ref().

Yeah, we call inc_sock_node_page_state() in the skb_page_frag_refill().
So if someone gets the page returned by skb_page_frag_refill(), it must
put the page via __skb_frag_unref()/skb_frag_unref(). We use PG_private
to indicate that we need to dec the node page state when the refcount of
page reaches zero.

Thanks.

>
> Thanks.
Eric Dumazet Oct. 12, 2020, 7:42 a.m. UTC | #7
On Mon, Oct 12, 2020 at 6:22 AM Muchun Song <songmuchun@bytedance.com> wrote:
>
> On Mon, Oct 12, 2020 at 2:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
> > >
> > > The amount of memory allocated to sockets buffer can become significant.
> > > However, we do not display the amount of memory consumed by sockets
> > > buffer. In this case, knowing where the memory is consumed by the kernel
> >
> > We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
> > rather than /proc/meminfo?
>
> If the system has little free memory, we can know where the memory is via
> /proc/meminfo. If a lot of memory is consumed by socket buffer, we cannot
> know it when the Sock is not shown in the /proc/meminfo. If the unaware user
> can't think of the socket buffer, naturally they will not `ss -m`. The
> end result
> is that we still don’t know where the memory is consumed. And we add the
> Sock to the /proc/meminfo just like the memcg does('sock' item in the cgroup
> v2 memory.stat). So I think that adding to /proc/meminfo is sufficient.
>
> >
> > >  static inline void __skb_frag_unref(skb_frag_t *frag)
> > >  {
> > > -       put_page(skb_frag_page(frag));
> > > +       struct page *page = skb_frag_page(frag);
> > > +
> > > +       if (put_page_testzero(page)) {
> > > +               dec_sock_node_page_state(page);
> > > +               __put_page(page);
> > > +       }
> > >  }
> >
> > You mix socket page frag with skb frag at least, not sure this is exactly
> > what you want, because clearly skb page frags are frequently used
> > by network drivers rather than sockets.
> >
> > Also, which one matches this dec_sock_node_page_state()? Clearly
> > not skb_fill_page_desc() or __skb_frag_ref().
>
> Yeah, we call inc_sock_node_page_state() in the skb_page_frag_refill().
> So if someone gets the page returned by skb_page_frag_refill(), it must
> put the page via __skb_frag_unref()/skb_frag_unref(). We use PG_private
> to indicate that we need to dec the node page state when the refcount of
> page reaches zero.
>

Pages can be transferred from pipe to socket, socket to pipe (splice()
and zerocopy friends...)

 If you want to track TCP memory allocations, you always can look at
/proc/net/sockstat,
without adding yet another expensive memory accounting.
Muchun Song Oct. 12, 2020, 8:39 a.m. UTC | #8
On Mon, Oct 12, 2020 at 3:42 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Oct 12, 2020 at 6:22 AM Muchun Song <songmuchun@bytedance.com> wrote:
> >
> > On Mon, Oct 12, 2020 at 2:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
> > > >
> > > > The amount of memory allocated to sockets buffer can become significant.
> > > > However, we do not display the amount of memory consumed by sockets
> > > > buffer. In this case, knowing where the memory is consumed by the kernel
> > >
> > > We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
> > > rather than /proc/meminfo?
> >
> > If the system has little free memory, we can know where the memory is via
> > /proc/meminfo. If a lot of memory is consumed by socket buffer, we cannot
> > know it when the Sock is not shown in the /proc/meminfo. If the unaware user
> > can't think of the socket buffer, naturally they will not `ss -m`. The
> > end result
> > is that we still don’t know where the memory is consumed. And we add the
> > Sock to the /proc/meminfo just like the memcg does('sock' item in the cgroup
> > v2 memory.stat). So I think that adding to /proc/meminfo is sufficient.
> >
> > >
> > > >  static inline void __skb_frag_unref(skb_frag_t *frag)
> > > >  {
> > > > -       put_page(skb_frag_page(frag));
> > > > +       struct page *page = skb_frag_page(frag);
> > > > +
> > > > +       if (put_page_testzero(page)) {
> > > > +               dec_sock_node_page_state(page);
> > > > +               __put_page(page);
> > > > +       }
> > > >  }
> > >
> > > You mix socket page frag with skb frag at least, not sure this is exactly
> > > what you want, because clearly skb page frags are frequently used
> > > by network drivers rather than sockets.
> > >
> > > Also, which one matches this dec_sock_node_page_state()? Clearly
> > > not skb_fill_page_desc() or __skb_frag_ref().
> >
> > Yeah, we call inc_sock_node_page_state() in the skb_page_frag_refill().
> > So if someone gets the page returned by skb_page_frag_refill(), it must
> > put the page via __skb_frag_unref()/skb_frag_unref(). We use PG_private
> > to indicate that we need to dec the node page state when the refcount of
> > page reaches zero.
> >
>
> Pages can be transferred from pipe to socket, socket to pipe (splice()
> and zerocopy friends...)
>
>  If you want to track TCP memory allocations, you always can look at
> /proc/net/sockstat,
> without adding yet another expensive memory accounting.

The 'mem' item in the /proc/net/sockstat does not represent real
memory usage. This is just the total amount of charged memory.

For example, if a task sends a 10-byte message, it only charges one
page to memcg. But the system may allocate 8 pages. Therefore, it
does not truly reflect the memory allocated by the above memory
allocation path. We can see the difference via the following message.

cat /proc/net/sockstat
  sockets: used 698
  TCP: inuse 70 orphan 0 tw 617 alloc 134 mem 13
  UDP: inuse 90 mem 4
  UDPLITE: inuse 0
  RAW: inuse 1
  FRAG: inuse 0 memory 0

cat /proc/meminfo | grep Sock
  Sock:              13664 kB

The /proc/net/sockstat only shows us that there are 17*4 kB TCP
memory allocations. But with this patch applied, we can see that we truly
allocate 13664 kB (possibly more than this value because of the per-cpu
stat cache). Of course the load in this example is not high. In
some high load cases, I believe the difference here will be even
greater.
Eric Dumazet Oct. 12, 2020, 9:24 a.m. UTC | #9
On 10/12/20 10:39 AM, Muchun Song wrote:
> On Mon, Oct 12, 2020 at 3:42 PM Eric Dumazet <edumazet@google.com> wrote:
>>
>> On Mon, Oct 12, 2020 at 6:22 AM Muchun Song <songmuchun@bytedance.com> wrote:
>>>
>>> On Mon, Oct 12, 2020 at 2:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>
>>>> On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
>>>>>
>>>>> The amount of memory allocated to sockets buffer can become significant.
>>>>> However, we do not display the amount of memory consumed by sockets
>>>>> buffer. In this case, knowing where the memory is consumed by the kernel
>>>>
>>>> We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
>>>> rather than /proc/meminfo?
>>>
>>> If the system has little free memory, we can know where the memory is via
>>> /proc/meminfo. If a lot of memory is consumed by socket buffer, we cannot
>>> know it when the Sock is not shown in the /proc/meminfo. If the unaware user
>>> can't think of the socket buffer, naturally they will not `ss -m`. The
>>> end result
>>> is that we still don’t know where the memory is consumed. And we add the
>>> Sock to the /proc/meminfo just like the memcg does('sock' item in the cgroup
>>> v2 memory.stat). So I think that adding to /proc/meminfo is sufficient.
>>>
>>>>
>>>>>  static inline void __skb_frag_unref(skb_frag_t *frag)
>>>>>  {
>>>>> -       put_page(skb_frag_page(frag));
>>>>> +       struct page *page = skb_frag_page(frag);
>>>>> +
>>>>> +       if (put_page_testzero(page)) {
>>>>> +               dec_sock_node_page_state(page);
>>>>> +               __put_page(page);
>>>>> +       }
>>>>>  }
>>>>
>>>> You mix socket page frag with skb frag at least, not sure this is exactly
>>>> what you want, because clearly skb page frags are frequently used
>>>> by network drivers rather than sockets.
>>>>
>>>> Also, which one matches this dec_sock_node_page_state()? Clearly
>>>> not skb_fill_page_desc() or __skb_frag_ref().
>>>
>>> Yeah, we call inc_sock_node_page_state() in the skb_page_frag_refill().
>>> So if someone gets the page returned by skb_page_frag_refill(), it must
>>> put the page via __skb_frag_unref()/skb_frag_unref(). We use PG_private
>>> to indicate that we need to dec the node page state when the refcount of
>>> page reaches zero.
>>>
>>
>> Pages can be transferred from pipe to socket, socket to pipe (splice()
>> and zerocopy friends...)
>>
>>  If you want to track TCP memory allocations, you always can look at
>> /proc/net/sockstat,
>> without adding yet another expensive memory accounting.
> 
> The 'mem' item in the /proc/net/sockstat does not represent real
> memory usage. This is just the total amount of charged memory.
> 
> For example, if a task sends a 10-byte message, it only charges one
> page to memcg. But the system may allocate 8 pages. Therefore, it
> does not truly reflect the memory allocated by the above memory
> allocation path. We can see the difference via the following message.
> 
> cat /proc/net/sockstat
>   sockets: used 698
>   TCP: inuse 70 orphan 0 tw 617 alloc 134 mem 13
>   UDP: inuse 90 mem 4
>   UDPLITE: inuse 0
>   RAW: inuse 1
>   FRAG: inuse 0 memory 0
> 
> cat /proc/meminfo | grep Sock
>   Sock:              13664 kB
> 
> The /proc/net/sockstat only shows us that there are 17*4 kB TCP
> memory allocations. But apply this patch, we can see that we truly
> allocate 13664 kB(May be greater than this value because of per-cpu
> stat cache). Of course the load of the example here is not high. In
> some high load cases, I believe the difference here will be even
> greater.
> 

This is great, but you have not addressed my feedback.

TCP memory allocations are bounded by /proc/sys/net/ipv4/tcp_mem

Whether the memory is forward allocated or not is a detail.

If you think we must pre-allocate memory, instead of forward allocations,
your patch does not address this. Adding one line per consumer in /proc/meminfo looks
wrong to me.

If you do not want 9.37 % of physical memory being possibly used by TCP,
just change /proc/sys/net/ipv4/tcp_mem accordingly ?
Muchun Song Oct. 12, 2020, 9:53 a.m. UTC | #10
On Mon, Oct 12, 2020 at 5:24 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 10/12/20 10:39 AM, Muchun Song wrote:
> > On Mon, Oct 12, 2020 at 3:42 PM Eric Dumazet <edumazet@google.com> wrote:
> >>
> >> On Mon, Oct 12, 2020 at 6:22 AM Muchun Song <songmuchun@bytedance.com> wrote:
> >>>
> >>> On Mon, Oct 12, 2020 at 2:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>>
> >>>> On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
> >>>>>
> >>>>> The amount of memory allocated to sockets buffer can become significant.
> >>>>> However, we do not display the amount of memory consumed by sockets
> >>>>> buffer. In this case, knowing where the memory is consumed by the kernel
> >>>>
> >>>> We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
> >>>> rather than /proc/meminfo?
> >>>
> >>> If the system has little free memory, we can know where the memory is via
> >>> /proc/meminfo. If a lot of memory is consumed by socket buffer, we cannot
> >>> know it when the Sock is not shown in the /proc/meminfo. If the unaware user
> >>> can't think of the socket buffer, naturally they will not `ss -m`. The
> >>> end result
> >>> is that we still don’t know where the memory is consumed. And we add the
> >>> Sock to the /proc/meminfo just like the memcg does('sock' item in the cgroup
> >>> v2 memory.stat). So I think that adding to /proc/meminfo is sufficient.
> >>>
> >>>>
> >>>>>  static inline void __skb_frag_unref(skb_frag_t *frag)
> >>>>>  {
> >>>>> -       put_page(skb_frag_page(frag));
> >>>>> +       struct page *page = skb_frag_page(frag);
> >>>>> +
> >>>>> +       if (put_page_testzero(page)) {
> >>>>> +               dec_sock_node_page_state(page);
> >>>>> +               __put_page(page);
> >>>>> +       }
> >>>>>  }
> >>>>
> >>>> You mix socket page frag with skb frag at least, not sure this is exactly
> >>>> what you want, because clearly skb page frags are frequently used
> >>>> by network drivers rather than sockets.
> >>>>
> >>>> Also, which one matches this dec_sock_node_page_state()? Clearly
> >>>> not skb_fill_page_desc() or __skb_frag_ref().
> >>>
> >>> Yeah, we call inc_sock_node_page_state() in the skb_page_frag_refill().
> >>> So if someone gets the page returned by skb_page_frag_refill(), it must
> >>> put the page via __skb_frag_unref()/skb_frag_unref(). We use PG_private
> >>> to indicate that we need to dec the node page state when the refcount of
> >>> page reaches zero.
> >>>
> >>
> >> Pages can be transferred from pipe to socket, socket to pipe (splice()
> >> and zerocopy friends...)
> >>
> >>  If you want to track TCP memory allocations, you always can look at
> >> /proc/net/sockstat,
> >> without adding yet another expensive memory accounting.
> >
> > The 'mem' item in the /proc/net/sockstat does not represent real
> > memory usage. This is just the total amount of charged memory.
> >
> > For example, if a task sends a 10-byte message, it only charges one
> > page to memcg. But the system may allocate 8 pages. Therefore, it
> > does not truly reflect the memory allocated by the above memory
> > allocation path. We can see the difference via the following message.
> >
> > cat /proc/net/sockstat
> >   sockets: used 698
> >   TCP: inuse 70 orphan 0 tw 617 alloc 134 mem 13
> >   UDP: inuse 90 mem 4
> >   UDPLITE: inuse 0
> >   RAW: inuse 1
> >   FRAG: inuse 0 memory 0
> >
> > cat /proc/meminfo | grep Sock
> >   Sock:              13664 kB
> >
> > The /proc/net/sockstat only shows us that there are 17*4 kB TCP
> > memory allocations. But apply this patch, we can see that we truly
> > allocate 13664 kB(May be greater than this value because of per-cpu
> > stat cache). Of course the load of the example here is not high. In
> > some high load cases, I believe the difference here will be even
> > greater.
> >
>
> This is great, but you have not addressed my feedback.
>
> TCP memory allocations are bounded by /proc/sys/net/ipv4/tcp_mem
>
> Fact that the memory is forward allocated or not is a detail.
>
> If you think we must pre-allocate memory, instead of forward allocations,
> your patch does not address this. Adding one line per consumer in /proc/meminfo looks
> wrong to me.

I think that the consumer which consumes a lot of memory should be added
to the /proc/meminfo. This can help us know the user of large memory.

>
> If you do not want 9.37 % of physical memory being possibly used by TCP,
> just change /proc/sys/net/ipv4/tcp_mem accordingly ?

We are not complaining about TCP using too much memory, but asking how we
can know that TCP uses a lot of memory. When I first faced this problem,
I did not know who used the 25GB of memory, and it was not shown in /proc/meminfo.
If we could see the amount of socket buffer memory via /proc/meminfo, we
might not need to spend a lot of time troubleshooting this problem. Not everyone
knows that a lot of memory may be used here. But I believe many people
know to check /proc/meminfo to identify memory users.

Thanks.

>
>
Cong Wang Oct. 12, 2020, 9:46 p.m. UTC | #11
On Sun, Oct 11, 2020 at 9:22 PM Muchun Song <songmuchun@bytedance.com> wrote:
>
> On Mon, Oct 12, 2020 at 2:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
> > >
> > > The amount of memory allocated to sockets buffer can become significant.
> > > However, we do not display the amount of memory consumed by sockets
> > > buffer. In this case, knowing where the memory is consumed by the kernel
> >
> > We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
> > rather than /proc/meminfo?
>
> If the system has little free memory, we can know where the memory is via
> /proc/meminfo. If a lot of memory is consumed by socket buffer, we cannot
> know it when the Sock is not shown in the /proc/meminfo. If the unaware user
> can't think of the socket buffer, naturally they will not `ss -m`. The
> end result

Interesting, we already have a few counters related to socket buffers,
are you saying these are not accounted in /proc/meminfo either?
If yes, why are page frags so special here? If not, they are more
important than page frags, so you probably want to deal with them
first.


> is that we still don’t know where the memory is consumed. And we add the
> Sock to the /proc/meminfo just like the memcg does('sock' item in the cgroup
> v2 memory.stat). So I think that adding to /proc/meminfo is sufficient.

It looks like actually the socket page frag is already accounted,
for example, the tcp_sendmsg_locked():

                        copy = min_t(int, copy, pfrag->size - pfrag->offset);

                        if (!sk_wmem_schedule(sk, copy))
                                goto wait_for_memory;


>
> >
> > >  static inline void __skb_frag_unref(skb_frag_t *frag)
> > >  {
> > > -       put_page(skb_frag_page(frag));
> > > +       struct page *page = skb_frag_page(frag);
> > > +
> > > +       if (put_page_testzero(page)) {
> > > +               dec_sock_node_page_state(page);
> > > +               __put_page(page);
> > > +       }
> > >  }
> >
> > You mix socket page frag with skb frag at least, not sure this is exactly
> > what you want, because clearly skb page frags are frequently used
> > by network drivers rather than sockets.
> >
> > Also, which one matches this dec_sock_node_page_state()? Clearly
> > not skb_fill_page_desc() or __skb_frag_ref().
>
> Yeah, we call inc_sock_node_page_state() in the skb_page_frag_refill().

How is skb_page_frag_refill() possibly paired with __skb_frag_unref()?

> So if someone gets the page returned by skb_page_frag_refill(), it must
> put the page via __skb_frag_unref()/skb_frag_unref(). We use PG_private
> to indicate that we need to dec the node page state when the refcount of
> page reaches zero.

skb_page_frag_refill() is called on frags not within an skb, for instance,
sk_page_frag_refill() uses it for a per-socket or per-process page frag.
But, __skb_frag_unref() is specifically used for skb frags, which are
supposed to be filled by skb_fill_page_desc() (page is allocated by driver).

They are different things and you are mixing them up, which looks clearly
wrong or at least misleading.

Thanks.
Cong Wang Oct. 12, 2020, 10:12 p.m. UTC | #12
On Mon, Oct 12, 2020 at 2:53 AM Muchun Song <songmuchun@bytedance.com> wrote:
> We are not complaining about TCP using too much memory, but how do
> we know that TCP uses a lot of memory. When I firstly face this problem,
> I do not know who uses the 25GB memory and it is not shown in the /proc/meminfo.
> If we can know the amount memory of the socket buffer via /proc/meminfo, we
> may not need to spend a lot of time troubleshooting this problem. Not everyone
> knows that a lot of memory may be used here. But I believe many people
> should know /proc/meminfo to confirm memory users.

Well, I'd bet networking people know `ss -m` better than /proc/meminfo,
generally speaking.

The practice here is that if you want some networking-specific counters,
add it to where networking people know better, that is, `ss -m` or /proc/net/...

Or maybe the problem you described is not specific to networking at all,
there must be some other places where pages are allocated but not charged.
If so, adding a general mm counter in /proc/meminfo makes sense, but
it won't be specific to networking.

Thanks.
Muchun Song Oct. 13, 2020, 3:29 a.m. UTC | #13
On Tue, Oct 13, 2020 at 5:47 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Sun, Oct 11, 2020 at 9:22 PM Muchun Song <songmuchun@bytedance.com> wrote:
> >
> > On Mon, Oct 12, 2020 at 2:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
> > > >
> > > > The amount of memory allocated to sockets buffer can become significant.
> > > > However, we do not display the amount of memory consumed by sockets
> > > > buffer. In this case, knowing where the memory is consumed by the kernel
> > >
> > > We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
> > > rather than /proc/meminfo?
> >
> > If the system has little free memory, we can know where the memory is via
> > /proc/meminfo. If a lot of memory is consumed by socket buffer, we cannot
> > know it when the Sock is not shown in the /proc/meminfo. If the unaware user
> > can't think of the socket buffer, naturally they will not `ss -m`. The
> > end result
>
> Interesting, we already have a few counters related to socket buffers,
> are you saying these are not accounted in /proc/meminfo either?

Yeah, these are not accounted for in /proc/meminfo.

> If yes, why are page frags so special here? If not, they are more
> important than page frags, so you probably want to deal with them
> first.
>
>
> > is that we still don’t know where the memory is consumed. And we add the
> > Sock to the /proc/meminfo just like the memcg does('sock' item in the cgroup
> > v2 memory.stat). So I think that adding to /proc/meminfo is sufficient.
>
> It looks like actually the socket page frag is already accounted,
> for example, the tcp_sendmsg_locked():
>
>                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
>
>                         if (!sk_wmem_schedule(sk, copy))
>                                 goto wait_for_memory;
>

Yeah, it is already accounted for. But it does not represent real memory
usage. This is just the total amount of charged memory.

For example, if a task sends a 10-byte message, it only charges one
page to memcg. But the system may allocate 8 pages. Therefore, it
does not truly reflect the memory allocated by the page frag memory
allocation path.

>
> >
> > >
> > > >  static inline void __skb_frag_unref(skb_frag_t *frag)
> > > >  {
> > > > -       put_page(skb_frag_page(frag));
> > > > +       struct page *page = skb_frag_page(frag);
> > > > +
> > > > +       if (put_page_testzero(page)) {
> > > > +               dec_sock_node_page_state(page);
> > > > +               __put_page(page);
> > > > +       }
> > > >  }
> > >
> > > You mix socket page frag with skb frag at least, not sure this is exactly
> > > what you want, because clearly skb page frags are frequently used
> > > by network drivers rather than sockets.
> > >
> > > Also, which one matches this dec_sock_node_page_state()? Clearly
> > > not skb_fill_page_desc() or __skb_frag_ref().
> >
> > Yeah, we call inc_sock_node_page_state() in the skb_page_frag_refill().
>
> How is skb_page_frag_refill() possibly paired with __skb_frag_unref()?
>
> > So if someone gets the page returned by skb_page_frag_refill(), it must
> > put the page via __skb_frag_unref()/skb_frag_unref(). We use PG_private
> > to indicate that we need to dec the node page state when the refcount of
> > page reaches zero.
>
> skb_page_frag_refill() is called on frags not within an skb, for instance,
> sk_page_frag_refill() uses it for a per-socket or per-process page frag.
> But, __skb_frag_unref() is specifically used for skb frags, which are
> supposed to be filled by skb_fill_page_desc() (page is allocated by driver).
>
> They are different things you are mixing them up, which looks clearly
> wrong or at least misleading.

Yeah, it looks a little strange. I just want to account for page frag
allocations. So I have to use PG_private in __skb_frag_unref() to
distinguish pages that come from the page frag path from other pages.
If the page was allocated by skb_page_frag_refill(), we should decrease
the statistics.

Thanks.

>
> Thanks.
Muchun Song Oct. 13, 2020, 3:52 a.m. UTC | #14
On Tue, Oct 13, 2020 at 6:12 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Mon, Oct 12, 2020 at 2:53 AM Muchun Song <songmuchun@bytedance.com> wrote:
> > We are not complaining about TCP using too much memory, but how do
> > we know that TCP uses a lot of memory. When I firstly face this problem,
> > I do not know who uses the 25GB memory and it is not shown in the /proc/meminfo.
> > If we can know the amount memory of the socket buffer via /proc/meminfo, we
> > may not need to spend a lot of time troubleshooting this problem. Not everyone
> > knows that a lot of memory may be used here. But I believe many people
> > should know /proc/meminfo to confirm memory users.
>
> Well, I'd bet networking people know `ss -m` better than /proc/meminfo,

I agree with you. But if someone (not a networking person) faces the same
problem, he may suspect that there is a memory leak or think that a certain
driver allocates memory but has no statistics. He only sees that the memory
has disappeared from /proc/meminfo.

> generally speaking.
>
> The practice here is that if you want some networking-specific counters,
> add it to where networking people know better, that is, `ss -m` or /proc/net/...
>
> Or maybe the problem you described is not specific to networking at all,
> there must be some other places where pages are allocated but not charged.

Yeah, it is not charged. The allocation path is as follows. This allocation
consumes 25GB memory on our server. And it belongs to the network core.

Thanks.

   __alloc_pages_nodemask+0x11d/0x290
   skb_page_frag_refill+0x68/0xf0
   sk_page_frag_refill+0x19/0x70
   tcp_sendmsg_locked+0x2f4/0xd10
   tcp_sendmsg+0x29/0xa0
   sock_sendmsg+0x30/0x40
   sock_write_iter+0x8f/0x100
   __vfs_write+0x10b/0x190
   vfs_write+0xb0/0x190
   ksys_write+0x5a/0xd0
   do_syscall_64+0x5d/0x110
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

> If so, adding a general mm counter in /proc/meminfo makes sense, but
> it won't be specific to networking.
>
> Thanks.
Eric Dumazet Oct. 13, 2020, 6:55 a.m. UTC | #15
On 10/12/20 11:53 AM, Muchun Song wrote:

> We are not complaining about TCP using too much memory, but how do
> we know that TCP uses a lot of memory. When I firstly face this problem,
> I do not know who uses the 25GB memory and it is not shown in the /proc/meminfo.
> If we can know the amount memory of the socket buffer via /proc/meminfo, we
> may not need to spend a lot of time troubleshooting this problem. Not everyone
> knows that a lot of memory may be used here. But I believe many people
> should know /proc/meminfo to confirm memory users.

Adding yet another operation to the networking fast path is a high cost to pay
just to add one extra line in /proc/meminfo, while /proc/net/sockstat
is already a good proxy, with per-protocol details, instead of a single bucket.

I reiterate that zero copy would make this counter out of sync,
unless special support is added (adding yet another operation?)

Also your patch does not address gazillions of page allocations from drivers
in RX path.

Here at Google the majority of networking mm usage when hosts are under stress
is in RX path, when out of order queues start to grow in TCP sockets.

Allocations in TX path were greatly reduced and optimally sized with the introduction
of /proc/sys/net/ipv4/tcp_notsent_lowat.

We have gazillions of put_page()/__free_page()/__free_pages()/alloc_page()/... all
over the place; adding yet another tracking of "this page is used by networking stacks"
is going to be quite a big project.

I thought memcg was a better goal in the long run, let's focus on it.
Mike Rapoport Oct. 13, 2020, 8:09 a.m. UTC | #16
On Mon, Oct 12, 2020 at 05:53:01PM +0800, Muchun Song wrote:
> On Mon, Oct 12, 2020 at 5:24 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > On 10/12/20 10:39 AM, Muchun Song wrote:
> > > On Mon, Oct 12, 2020 at 3:42 PM Eric Dumazet <edumazet@google.com> wrote:
> > >>
> > >> On Mon, Oct 12, 2020 at 6:22 AM Muchun Song <songmuchun@bytedance.com> wrote:
> > >>>
> > >>> On Mon, Oct 12, 2020 at 2:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >>>>
> > >>>> On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
> > >>>>>
> > >>>>> The amount of memory allocated to sockets buffer can become significant.
> > >>>>> However, we do not display the amount of memory consumed by sockets
> > >>>>> buffer. In this case, knowing where the memory is consumed by the kernel
> > >>>>
> > >>>> We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
> > >>>> rather than /proc/meminfo?
> > >>>
> > >>> If the system has little free memory, we can know where the memory is via
> > >>> /proc/meminfo. If a lot of memory is consumed by socket buffer, we cannot
> > >>> know it when the Sock is not shown in the /proc/meminfo. If the unaware user
> > >>> can't think of the socket buffer, naturally they will not `ss -m`. The
> > >>> end result
> > >>> is that we still don’t know where the memory is consumed. And we add the
> > >>> Sock to the /proc/meminfo just like the memcg does('sock' item in the cgroup
> > >>> v2 memory.stat). So I think that adding to /proc/meminfo is sufficient.
> > >>>
> > >>>>
> > >>>>>  static inline void __skb_frag_unref(skb_frag_t *frag)
> > >>>>>  {
> > >>>>> -       put_page(skb_frag_page(frag));
> > >>>>> +       struct page *page = skb_frag_page(frag);
> > >>>>> +
> > >>>>> +       if (put_page_testzero(page)) {
> > >>>>> +               dec_sock_node_page_state(page);
> > >>>>> +               __put_page(page);
> > >>>>> +       }
> > >>>>>  }
> > >>>>
> > >>>> You mix socket page frag with skb frag at least, not sure this is exactly
> > >>>> what you want, because clearly skb page frags are frequently used
> > >>>> by network drivers rather than sockets.
> > >>>>
> > >>>> Also, which one matches this dec_sock_node_page_state()? Clearly
> > >>>> not skb_fill_page_desc() or __skb_frag_ref().
> > >>>
> > >>> Yeah, we call inc_sock_node_page_state() in the skb_page_frag_refill().
> > >>> So if someone gets the page returned by skb_page_frag_refill(), it must
> > >>> put the page via __skb_frag_unref()/skb_frag_unref(). We use PG_private
> > >>> to indicate that we need to dec the node page state when the refcount of
> > >>> page reaches zero.
> > >>>
> > >>
> > >> Pages can be transferred from pipe to socket, socket to pipe (splice()
> > >> and zerocopy friends...)
> > >>
> > >>  If you want to track TCP memory allocations, you always can look at
> > >> /proc/net/sockstat,
> > >> without adding yet another expensive memory accounting.
> > >
> > > The 'mem' item in the /proc/net/sockstat does not represent real
> > > memory usage. This is just the total amount of charged memory.
> > >
> > > For example, if a task sends a 10-byte message, it only charges one
> > > page to memcg. But the system may allocate 8 pages. Therefore, it
> > > does not truly reflect the memory allocated by the above memory
> > > allocation path. We can see the difference via the following message.
> > >
> > > cat /proc/net/sockstat
> > >   sockets: used 698
> > >   TCP: inuse 70 orphan 0 tw 617 alloc 134 mem 13
> > >   UDP: inuse 90 mem 4
> > >   UDPLITE: inuse 0
> > >   RAW: inuse 1
> > >   FRAG: inuse 0 memory 0
> > >
> > > cat /proc/meminfo | grep Sock
> > >   Sock:              13664 kB
> > >
> > > The /proc/net/sockstat only shows us that there are 17*4 kB TCP
> > > memory allocations. But apply this patch, we can see that we truly
> > > allocate 13664 kB(May be greater than this value because of per-cpu
> > > stat cache). Of course the load of the example here is not high. In
> > > some high load cases, I believe the difference here will be even
> > > greater.
> > >
> >
> > This is great, but you have not addressed my feedback.
> >
> > TCP memory allocations are bounded by /proc/sys/net/ipv4/tcp_mem
> >
> > Fact that the memory is forward allocated or not is a detail.
> >
> > If you think we must pre-allocate memory, instead of forward allocations,
> > your patch does not address this. Adding one line per consumer in /proc/meminfo looks
> > wrong to me.
> 
> I think that the consumer which consumes a lot of memory should be added
> to the /proc/meminfo. This can help us know the user of large memory.
> 
> >
> > If you do not want 9.37 % of physical memory being possibly used by TCP,
> > just change /proc/sys/net/ipv4/tcp_mem accordingly ?
> 
> We are not complaining about TCP using too much memory, but how do
> we know that TCP uses a lot of memory. When I firstly face this problem,
> I do not know who uses the 25GB memory and it is not shown in the /proc/meminfo.
> If we can know the amount memory of the socket buffer via /proc/meminfo, we
> may not need to spend a lot of time troubleshooting this problem. Not everyone
> knows that a lot of memory may be used here. But I believe many people
> should know /proc/meminfo to confirm memory users.

If I understand correctly, the problem you are trying to solve is to
simplify troubleshooting of memory usage for people who may not be aware
that the networking stack can be a large memory consumer.

For that, a paragraph in 'man 5 proc' may be a good start:
Randy Dunlap Oct. 13, 2020, 2:43 p.m. UTC | #17
On 10/13/20 1:09 AM, Mike Rapoport wrote:
> On Mon, Oct 12, 2020 at 05:53:01PM +0800, Muchun Song wrote:
>> On Mon, Oct 12, 2020 at 5:24 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>
>>> On 10/12/20 10:39 AM, Muchun Song wrote:
>>>> On Mon, Oct 12, 2020 at 3:42 PM Eric Dumazet <edumazet@google.com> wrote:
>>>>>
>>>>> On Mon, Oct 12, 2020 at 6:22 AM Muchun Song <songmuchun@bytedance.com> wrote:
>>>>>>
>>>>>> On Mon, Oct 12, 2020 at 2:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>>>>
>>>>>>> On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
>>>>>>>>
>>>>>>>> The amount of memory allocated to sockets buffer can become significant.
>>>>>>>> However, we do not display the amount of memory consumed by sockets
>>>>>>>> buffer. In this case, knowing where the memory is consumed by the kernel
>>>>>>>
>>>>>>> We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
>>>>>>> rather than /proc/meminfo?
>>>>>>
>>>>>> If the system has little free memory, we can know where the memory is via
>>>>>> /proc/meminfo. If a lot of memory is consumed by socket buffer, we cannot
>>>>>> know it when the Sock is not shown in the /proc/meminfo. If the unaware user
>>>>>> can't think of the socket buffer, naturally they will not `ss -m`. The
>>>>>> end result
>>>>>> is that we still don’t know where the memory is consumed. And we add the
>>>>>> Sock to the /proc/meminfo just like the memcg does('sock' item in the cgroup
>>>>>> v2 memory.stat). So I think that adding to /proc/meminfo is sufficient.
>>>>>>
>>>>>>>
>>>>>>>>  static inline void __skb_frag_unref(skb_frag_t *frag)
>>>>>>>>  {
>>>>>>>> -       put_page(skb_frag_page(frag));
>>>>>>>> +       struct page *page = skb_frag_page(frag);
>>>>>>>> +
>>>>>>>> +       if (put_page_testzero(page)) {
>>>>>>>> +               dec_sock_node_page_state(page);
>>>>>>>> +               __put_page(page);
>>>>>>>> +       }
>>>>>>>>  }
>>>>>>>
>>>>>>> You mix socket page frag with skb frag at least, not sure this is exactly
>>>>>>> what you want, because clearly skb page frags are frequently used
>>>>>>> by network drivers rather than sockets.
>>>>>>>
>>>>>>> Also, which one matches this dec_sock_node_page_state()? Clearly
>>>>>>> not skb_fill_page_desc() or __skb_frag_ref().
>>>>>>
>>>>>> Yeah, we call inc_sock_node_page_state() in the skb_page_frag_refill().
>>>>>> So if someone gets the page returned by skb_page_frag_refill(), it must
>>>>>> put the page via __skb_frag_unref()/skb_frag_unref(). We use PG_private
>>>>>> to indicate that we need to dec the node page state when the refcount of
>>>>>> page reaches zero.
>>>>>>
>>>>>
>>>>> Pages can be transferred from pipe to socket, socket to pipe (splice()
>>>>> and zerocopy friends...)
>>>>>
>>>>>  If you want to track TCP memory allocations, you always can look at
>>>>> /proc/net/sockstat,
>>>>> without adding yet another expensive memory accounting.
>>>>
>>>> The 'mem' item in the /proc/net/sockstat does not represent real
>>>> memory usage. This is just the total amount of charged memory.
>>>>
>>>> For example, if a task sends a 10-byte message, it only charges one
>>>> page to memcg. But the system may allocate 8 pages. Therefore, it
>>>> does not truly reflect the memory allocated by the above memory
>>>> allocation path. We can see the difference via the following message.
>>>>
>>>> cat /proc/net/sockstat
>>>>   sockets: used 698
>>>>   TCP: inuse 70 orphan 0 tw 617 alloc 134 mem 13
>>>>   UDP: inuse 90 mem 4
>>>>   UDPLITE: inuse 0
>>>>   RAW: inuse 1
>>>>   FRAG: inuse 0 memory 0
>>>>
>>>> cat /proc/meminfo | grep Sock
>>>>   Sock:              13664 kB
>>>>
>>>> The /proc/net/sockstat only shows us that there are 17*4 kB TCP
>>>> memory allocations. But apply this patch, we can see that we truly
>>>> allocate 13664 kB(May be greater than this value because of per-cpu
>>>> stat cache). Of course the load of the example here is not high. In
>>>> some high load cases, I believe the difference here will be even
>>>> greater.
>>>>
>>>
>>> This is great, but you have not addressed my feedback.
>>>
>>> TCP memory allocations are bounded by /proc/sys/net/ipv4/tcp_mem
>>>
>>> Fact that the memory is forward allocated or not is a detail.
>>>
>>> If you think we must pre-allocate memory, instead of forward allocations,
>>> your patch does not address this. Adding one line per consumer in /proc/meminfo looks
>>> wrong to me.
>>
>> I think that the consumer which consumes a lot of memory should be added
>> to the /proc/meminfo. This can help us know the user of large memory.
>>
>>>
>>> If you do not want 9.37 % of physical memory being possibly used by TCP,
>>> just change /proc/sys/net/ipv4/tcp_mem accordingly ?
>>
>> We are not complaining about TCP using too much memory, but how do
>> we know that TCP uses a lot of memory. When I firstly face this problem,
>> I do not know who uses the 25GB memory and it is not shown in the /proc/meminfo.
>> If we can know the amount memory of the socket buffer via /proc/meminfo, we
>> may not need to spend a lot of time troubleshooting this problem. Not everyone
>> knows that a lot of memory may be used here. But I believe many people
>> should know /proc/meminfo to confirm memory users.
> 
> If I undestand correctly, the problem you are trying to solve is to
> simplify troubleshooting of memory usage for people who may not be aware
> that networking stack can be a large memory consumer.
> 
> For that a paragraph in 'man 5 proc' maybe a good start:
> 
>>From ddbcf38576d1a2b0e36fe25a27350d566759b664 Mon Sep 17 00:00:00 2001
> From: Mike Rapoport <rppt@linux.ibm.com>
> Date: Tue, 13 Oct 2020 11:07:35 +0300
> Subject: [PATCH] proc.5: meminfo: add not anout network stack memory
>  consumption
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  man5/proc.5 | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/man5/proc.5 b/man5/proc.5
> index ed309380b..8414676f1 100644
> --- a/man5/proc.5
> +++ b/man5/proc.5
> @@ -3478,6 +3478,14 @@ Except as noted below,
>  all of the fields have been present since at least Linux 2.6.0.
>  Some fields are displayed only if the kernel was configured
>  with various options; those dependencies are noted in the list.
> +.IP
> +Note that significant part of memory allocated by the network stack
> +is not accounted in the file.
> +The memory consumption of the network stack can be queried
> +using
> +.IR /proc/net/sockstat
> +or
> +.BR ss (8)
>  .RS
>  .TP
>  .IR MemTotal " %lu"

Hi Mike,

Could you tell us what units those values are in?
or is that already explained somewhere else?

thanks.
Mike Rapoport Oct. 13, 2020, 3:12 p.m. UTC | #18
On Tue, Oct 13, 2020 at 07:43:59AM -0700, Randy Dunlap wrote:
> On 10/13/20 1:09 AM, Mike Rapoport wrote:
> > On Mon, Oct 12, 2020 at 05:53:01PM +0800, Muchun Song wrote:
> >> On Mon, Oct 12, 2020 at 5:24 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >>>
> >>> On 10/12/20 10:39 AM, Muchun Song wrote:
> >>>> On Mon, Oct 12, 2020 at 3:42 PM Eric Dumazet <edumazet@google.com> wrote:
> >>
> >> We are not complaining about TCP using too much memory, but how do
> >> we know that TCP uses a lot of memory. When I firstly face this problem,
> >> I do not know who uses the 25GB memory and it is not shown in the /proc/meminfo.
> >> If we can know the amount memory of the socket buffer via /proc/meminfo, we
> >> may not need to spend a lot of time troubleshooting this problem. Not everyone
> >> knows that a lot of memory may be used here. But I believe many people
> >> should know /proc/meminfo to confirm memory users.
> > 
> > If I undestand correctly, the problem you are trying to solve is to
> > simplify troubleshooting of memory usage for people who may not be aware
> > that networking stack can be a large memory consumer.
> > 
> > For that a paragraph in 'man 5 proc' maybe a good start:
> > 
> >>From ddbcf38576d1a2b0e36fe25a27350d566759b664 Mon Sep 17 00:00:00 2001
> > From: Mike Rapoport <rppt@linux.ibm.com>
> > Date: Tue, 13 Oct 2020 11:07:35 +0300
> > Subject: [PATCH] proc.5: meminfo: add not anout network stack memory
> >  consumption
> > 
> > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > ---
> >  man5/proc.5 | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> > 
> > diff --git a/man5/proc.5 b/man5/proc.5
> > index ed309380b..8414676f1 100644
> > --- a/man5/proc.5
> > +++ b/man5/proc.5
> > @@ -3478,6 +3478,14 @@ Except as noted below,
> >  all of the fields have been present since at least Linux 2.6.0.
> >  Some fields are displayed only if the kernel was configured
> >  with various options; those dependencies are noted in the list.
> > +.IP
> > +Note that significant part of memory allocated by the network stack
> > +is not accounted in the file.
> > +The memory consumption of the network stack can be queried
> > +using
> > +.IR /proc/net/sockstat
> > +or
> > +.BR ss (8)
> >  .RS
> >  .TP
> >  .IR MemTotal " %lu"
> 
> Hi Mike,
> 
> Could you tell us what units those values are in?
> or is that already explained somewhere else?

It is described a few lines above and anyway, "MemTotal" is a part of
the diff context ;-)

> thanks.
> -- 
> ~Randy
> 
>
Randy Dunlap Oct. 13, 2020, 3:21 p.m. UTC | #19
On 10/13/20 8:12 AM, Mike Rapoport wrote:
> On Tue, Oct 13, 2020 at 07:43:59AM -0700, Randy Dunlap wrote:
>> On 10/13/20 1:09 AM, Mike Rapoport wrote:
>>> On Mon, Oct 12, 2020 at 05:53:01PM +0800, Muchun Song wrote:
>>>> On Mon, Oct 12, 2020 at 5:24 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>>
>>>>> On 10/12/20 10:39 AM, Muchun Song wrote:
>>>>>> On Mon, Oct 12, 2020 at 3:42 PM Eric Dumazet <edumazet@google.com> wrote:
>>>>
>>>> We are not complaining about TCP using too much memory, but how do
>>>> we know that TCP uses a lot of memory. When I firstly face this problem,
>>>> I do not know who uses the 25GB memory and it is not shown in the /proc/meminfo.
>>>> If we can know the amount memory of the socket buffer via /proc/meminfo, we
>>>> may not need to spend a lot of time troubleshooting this problem. Not everyone
>>>> knows that a lot of memory may be used here. But I believe many people
>>>> should know /proc/meminfo to confirm memory users.
>>>
>>> If I undestand correctly, the problem you are trying to solve is to
>>> simplify troubleshooting of memory usage for people who may not be aware
>>> that networking stack can be a large memory consumer.
>>>
>>> For that a paragraph in 'man 5 proc' maybe a good start:
>>>
>>> >From ddbcf38576d1a2b0e36fe25a27350d566759b664 Mon Sep 17 00:00:00 2001
>>> From: Mike Rapoport <rppt@linux.ibm.com>
>>> Date: Tue, 13 Oct 2020 11:07:35 +0300
>>> Subject: [PATCH] proc.5: meminfo: add not anout network stack memory
>>>  consumption
>>>
>>> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
>>> ---
>>>  man5/proc.5 | 8 ++++++++
>>>  1 file changed, 8 insertions(+)
>>>
>>> diff --git a/man5/proc.5 b/man5/proc.5
>>> index ed309380b..8414676f1 100644
>>> --- a/man5/proc.5
>>> +++ b/man5/proc.5
>>> @@ -3478,6 +3478,14 @@ Except as noted below,
>>>  all of the fields have been present since at least Linux 2.6.0.
>>>  Some fields are displayed only if the kernel was configured
>>>  with various options; those dependencies are noted in the list.
>>> +.IP
>>> +Note that significant part of memory allocated by the network stack
>>> +is not accounted in the file.
>>> +The memory consumption of the network stack can be queried
>>> +using
>>> +.IR /proc/net/sockstat
>>> +or
>>> +.BR ss (8)
>>>  .RS
>>>  .TP
>>>  .IR MemTotal " %lu"
>>
>> Hi Mike,
>>
>> Could you tell us what units those values are in?
>> or is that already explained somewhere else?
> 
> It is described a few lines above and anyway, "MemTotal" is a part of
> the diff context ;-)

with no units AFAICT.

But I was unclear. I wasn't referring to /proc/meminfo, but instead
to /proc/net/sockstat and its units:

sockets: used 1224
TCP: inuse 11 orphan 1 tw 1 alloc 26 mem 3
UDP: inuse 4 mem 2
UDPLITE: inuse 0
RAW: inuse 0
FRAG: inuse 0 memory 0

E.g., for TCP and UDP, are those socket counts or some unit of memory?
If units of memory, what unit size?

thanks.
Muchun Song Oct. 13, 2020, 3:28 p.m. UTC | #20
On Tue, Oct 13, 2020 at 4:09 PM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Mon, Oct 12, 2020 at 05:53:01PM +0800, Muchun Song wrote:
> > On Mon, Oct 12, 2020 at 5:24 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > >
> > > On 10/12/20 10:39 AM, Muchun Song wrote:
> > > > On Mon, Oct 12, 2020 at 3:42 PM Eric Dumazet <edumazet@google.com> wrote:
> > > >>
> > > >> On Mon, Oct 12, 2020 at 6:22 AM Muchun Song <songmuchun@bytedance.com> wrote:
> > > >>>
> > > >>> On Mon, Oct 12, 2020 at 2:39 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >>>>
> > > >>>> On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
> > > >>>>>
> > > >>>>> The amount of memory allocated to sockets buffer can become significant.
> > > >>>>> However, we do not display the amount of memory consumed by sockets
> > > >>>>> buffer. In this case, knowing where the memory is consumed by the kernel
> > > >>>>
> > > >>>> We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
> > > >>>> rather than /proc/meminfo?
> > > >>>
> > > >>> If the system has little free memory, we can know where the memory is via
> > > >>> /proc/meminfo. If a lot of memory is consumed by socket buffer, we cannot
> > > >>> know it when the Sock is not shown in the /proc/meminfo. If the unaware user
> > > >>> can't think of the socket buffer, naturally they will not `ss -m`. The
> > > >>> end result
> > > >>> is that we still don’t know where the memory is consumed. And we add the
> > > >>> Sock to the /proc/meminfo just like the memcg does('sock' item in the cgroup
> > > >>> v2 memory.stat). So I think that adding to /proc/meminfo is sufficient.
> > > >>>
> > > >>>>
> > > >>>>>  static inline void __skb_frag_unref(skb_frag_t *frag)
> > > >>>>>  {
> > > >>>>> -       put_page(skb_frag_page(frag));
> > > >>>>> +       struct page *page = skb_frag_page(frag);
> > > >>>>> +
> > > >>>>> +       if (put_page_testzero(page)) {
> > > >>>>> +               dec_sock_node_page_state(page);
> > > >>>>> +               __put_page(page);
> > > >>>>> +       }
> > > >>>>>  }
> > > >>>>
> > > >>>> You mix socket page frag with skb frag at least, not sure this is exactly
> > > >>>> what you want, because clearly skb page frags are frequently used
> > > >>>> by network drivers rather than sockets.
> > > >>>>
> > > >>>> Also, which one matches this dec_sock_node_page_state()? Clearly
> > > >>>> not skb_fill_page_desc() or __skb_frag_ref().
> > > >>>
> > > >>> Yeah, we call inc_sock_node_page_state() in the skb_page_frag_refill().
> > > >>> So if someone gets the page returned by skb_page_frag_refill(), it must
> > > >>> put the page via __skb_frag_unref()/skb_frag_unref(). We use PG_private
> > > >>> to indicate that we need to dec the node page state when the refcount of
> > > >>> page reaches zero.
> > > >>>
> > > >>
> > > >> Pages can be transferred from pipe to socket, socket to pipe (splice()
> > > >> and zerocopy friends...)
> > > >>
> > > >>  If you want to track TCP memory allocations, you always can look at
> > > >> /proc/net/sockstat,
> > > >> without adding yet another expensive memory accounting.
> > > >
> > > > The 'mem' item in the /proc/net/sockstat does not represent real
> > > > memory usage. This is just the total amount of charged memory.
> > > >
> > > > For example, if a task sends a 10-byte message, it only charges one
> > > > page to memcg. But the system may allocate 8 pages. Therefore, it
> > > > does not truly reflect the memory allocated by the above memory
> > > > allocation path. We can see the difference via the following message.
> > > >
> > > > cat /proc/net/sockstat
> > > >   sockets: used 698
> > > >   TCP: inuse 70 orphan 0 tw 617 alloc 134 mem 13
> > > >   UDP: inuse 90 mem 4
> > > >   UDPLITE: inuse 0
> > > >   RAW: inuse 1
> > > >   FRAG: inuse 0 memory 0
> > > >
> > > > cat /proc/meminfo | grep Sock
> > > >   Sock:              13664 kB
> > > >
> > > > The /proc/net/sockstat only shows us that there are 17*4 kB TCP
> > > > memory allocations. But apply this patch, we can see that we truly
> > > > allocate 13664 kB(May be greater than this value because of per-cpu
> > > > stat cache). Of course the load of the example here is not high. In
> > > > some high load cases, I believe the difference here will be even
> > > > greater.
> > > >
> > >
> > > This is great, but you have not addressed my feedback.
> > >
> > > TCP memory allocations are bounded by /proc/sys/net/ipv4/tcp_mem
> > >
> > > Fact that the memory is forward allocated or not is a detail.
> > >
> > > If you think we must pre-allocate memory, instead of forward allocations,
> > > your patch does not address this. Adding one line per consumer in /proc/meminfo looks
> > > wrong to me.
> >
> > I think that the consumer which consumes a lot of memory should be added
> > to the /proc/meminfo. This can help us know the user of large memory.
> >
> > >
> > > If you do not want 9.37 % of physical memory being possibly used by TCP,
> > > just change /proc/sys/net/ipv4/tcp_mem accordingly ?
> >
> > We are not complaining about TCP using too much memory, but how do
> > we know that TCP uses a lot of memory. When I firstly face this problem,
> > I do not know who uses the 25GB memory and it is not shown in the /proc/meminfo.
> > If we can know the amount memory of the socket buffer via /proc/meminfo, we
> > may not need to spend a lot of time troubleshooting this problem. Not everyone
> > knows that a lot of memory may be used here. But I believe many people
> > should know /proc/meminfo to confirm memory users.
>
> If I undestand correctly, the problem you are trying to solve is to
> simplify troubleshooting of memory usage for people who may not be aware
> that networking stack can be a large memory consumer.

Yeah, you are right. Although the information provided by /proc/net/sockstat
is not accurate, it can still provide some valuable information. I think that it
might be better if we add the total amount of socket buffer memory to /proc/meminfo.
The socket buffer statistics can be taken directly from /proc/net/sockstat.

Thanks.

>
> For that, a paragraph in 'man 5 proc' may be a good start:
>
> From ddbcf38576d1a2b0e36fe25a27350d566759b664 Mon Sep 17 00:00:00 2001
> From: Mike Rapoport <rppt@linux.ibm.com>
> Date: Tue, 13 Oct 2020 11:07:35 +0300
> Subject: [PATCH] proc.5: meminfo: add note about network stack memory
>  consumption
>
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  man5/proc.5 | 8 ++++++++
>  1 file changed, 8 insertions(+)
>
> diff --git a/man5/proc.5 b/man5/proc.5
> index ed309380b..8414676f1 100644
> --- a/man5/proc.5
> +++ b/man5/proc.5
> @@ -3478,6 +3478,14 @@ Except as noted below,
>  all of the fields have been present since at least Linux 2.6.0.
>  Some fields are displayed only if the kernel was configured
>  with various options; those dependencies are noted in the list.
> +.IP
> +Note that significant part of memory allocated by the network stack
> +is not accounted in the file.
> +The memory consumption of the network stack can be queried
> +using
> +.IR /proc/net/sockstat
> +or
> +.BR ss (8)
>  .RS
>  .TP
>  .IR MemTotal " %lu"
> --
> 2.25.4
>
>
Mike Rapoport Oct. 14, 2020, 5:34 a.m. UTC | #21
On Tue, Oct 13, 2020 at 08:21:13AM -0700, Randy Dunlap wrote:
> On 10/13/20 8:12 AM, Mike Rapoport wrote:

> > On Tue, Oct 13, 2020 at 07:43:59AM -0700, Randy Dunlap wrote:

> >> On 10/13/20 1:09 AM, Mike Rapoport wrote:

> >>> On Mon, Oct 12, 2020 at 05:53:01PM +0800, Muchun Song wrote:

> >>>> On Mon, Oct 12, 2020 at 5:24 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:

> >>>>>

> >>>>> On 10/12/20 10:39 AM, Muchun Song wrote:

> >>>>>> On Mon, Oct 12, 2020 at 3:42 PM Eric Dumazet <edumazet@google.com> wrote:

> >>>>

> >>>> We are not complaining about TCP using too much memory, but how do

> >>>> we know that TCP uses a lot of memory. When I firstly face this problem,

> >>>> I do not know who uses the 25GB memory and it is not shown in the /proc/meminfo.

> >>>> If we can know the amount memory of the socket buffer via /proc/meminfo, we

> >>>> may not need to spend a lot of time troubleshooting this problem. Not everyone

> >>>> knows that a lot of memory may be used here. But I believe many people

> >>>> should know /proc/meminfo to confirm memory users.

> >>>

> >>> If I undestand correctly, the problem you are trying to solve is to

> >>> simplify troubleshooting of memory usage for people who may not be aware

> >>> that networking stack can be a large memory consumer.

> >>>

> >>> For that a paragraph in 'man 5 proc' maybe a good start:

> >>>

> >>> >From ddbcf38576d1a2b0e36fe25a27350d566759b664 Mon Sep 17 00:00:00 2001

> >>> From: Mike Rapoport <rppt@linux.ibm.com>

> >>> Date: Tue, 13 Oct 2020 11:07:35 +0300

> >>> Subject: [PATCH] proc.5: meminfo: add not anout network stack memory

> >>>  consumption

> >>>

> >>> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>

> >>> ---

> >>>  man5/proc.5 | 8 ++++++++

> >>>  1 file changed, 8 insertions(+)

> >>>

> >>> diff --git a/man5/proc.5 b/man5/proc.5

> >>> index ed309380b..8414676f1 100644

> >>> --- a/man5/proc.5

> >>> +++ b/man5/proc.5

> >>> @@ -3478,6 +3478,14 @@ Except as noted below,

> >>>  all of the fields have been present since at least Linux 2.6.0.

> >>>  Some fields are displayed only if the kernel was configured

> >>>  with various options; those dependencies are noted in the list.

> >>> +.IP

> >>> +Note that significant part of memory allocated by the network stack

> >>> +is not accounted in the file.

> >>> +The memory consumption of the network stack can be queried

> >>> +using

> >>> +.IR /proc/net/sockstat

> >>> +or

> >>> +.BR ss (8)

> >>>  .RS

> >>>  .TP

> >>>  .IR MemTotal " %lu"

> >>

> >> Hi Mike,

> >>

> >> Could you tell us what units those values are in?

> >> or is that already explained somewhere else?

> > 

> > It is described a few lines above and anyway, "MemTotal" is a part of

> > the diff context ;-)

> 

> with no units AFAICT.

> 

> But I was unclear. I wasn't referring to /proc/meminfo, but instead

> to /proc/net/sockstat and its units:

> 

> sockets: used 1224

> TCP: inuse 11 orphan 1 tw 1 alloc 26 mem 3

> UDP: inuse 4 mem 2

> UDPLITE: inuse 0

> RAW: inuse 0

> FRAG: inuse 0 memory 0

> 

> E.g., for TCP and UDP, are those socket counts or some unit of memory?

> If units of memory, what unit size?


Ah, these are in 4k pages, AFAIU.
And, as it seems /proc/net/sockstat lacks a description in proc.5 at
all...

> thanks.

> -- 

> ~Randy

> 

> 


-- 
Sincerely yours,
Mike.
Vlastimil Babka Oct. 16, 2020, 3:38 p.m. UTC | #22
On 10/13/20 10:09 AM, Mike Rapoport wrote:
>> We are not complaining about TCP using too much memory, but how do
>> we know that TCP uses a lot of memory. When I firstly face this problem,
>> I do not know who uses the 25GB memory and it is not shown in the /proc/meminfo.
>> If we can know the amount memory of the socket buffer via /proc/meminfo, we
>> may not need to spend a lot of time troubleshooting this problem. Not everyone
>> knows that a lot of memory may be used here. But I believe many people
>> should know /proc/meminfo to confirm memory users.
> If I undestand correctly, the problem you are trying to solve is to
> simplify troubleshooting of memory usage for people who may not be aware
> that networking stack can be a large memory consumer.
> 
> For that a paragraph in 'man 5 proc' maybe a good start:

Yeah. Another major consumer that I've seen at some point was xfs buffers. And 
there might be others, and adding everything to /proc/meminfo is not feasible. I 
have once proposed adding a counter called "Unaccounted:" which would at least 
tell the user easily if a significant portion is occupied by memory not 
explained by the other meminfo counters, and look for trends (increase = 
potential memory leak?). For specific prominent consumers not covered by meminfo 
but that have some kind of internal counters, we could document where to look, 
such as /proc/net/sockstat or maybe create some /proc/ or /sys directory with 
file per consumer so that it's still easy to check, but without the overhead of 
global counters and bloated /proc/meminfo?

>  From ddbcf38576d1a2b0e36fe25a27350d566759b664 Mon Sep 17 00:00:00 2001
> From: Mike Rapoport<rppt@linux.ibm.com>
> Date: Tue, 13 Oct 2020 11:07:35 +0300
> Subject: [PATCH] proc.5: meminfo: add not anout network stack memory
>   consumption
> 
> Signed-off-by: Mike Rapoport<rppt@linux.ibm.com>
> ---
>   man5/proc.5 | 8 ++++++++
>   1 file changed, 8 insertions(+)
> 
> diff --git a/man5/proc.5 b/man5/proc.5
> index ed309380b..8414676f1 100644
> --- a/man5/proc.5
> +++ b/man5/proc.5
> @@ -3478,6 +3478,14 @@ Except as noted below,
>   all of the fields have been present since at least Linux 2.6.0.
>   Some fields are displayed only if the kernel was configured
>   with various options; those dependencies are noted in the list.
> +.IP
> +Note that significant part of memory allocated by the network stack
> +is not accounted in the file.
> +The memory consumption of the network stack can be queried
> +using
> +.IR /proc/net/sockstat
> +or
> +.BR ss (8)
>   .RS
>   .TP
>   .IR MemTotal " %lu"
> -- 2.25.4
Minchan Kim Oct. 16, 2020, 8:53 p.m. UTC | #23
On Fri, Oct 16, 2020 at 05:38:26PM +0200, Vlastimil Babka wrote:
> On 10/13/20 10:09 AM, Mike Rapoport wrote:
> > > We are not complaining about TCP using too much memory, but how do
> > > we know that TCP uses a lot of memory. When I firstly face this problem,
> > > I do not know who uses the 25GB memory and it is not shown in the /proc/meminfo.
> > > If we can know the amount memory of the socket buffer via /proc/meminfo, we
> > > may not need to spend a lot of time troubleshooting this problem. Not everyone
> > > knows that a lot of memory may be used here. But I believe many people
> > > should know /proc/meminfo to confirm memory users.
> > If I undestand correctly, the problem you are trying to solve is to
> > simplify troubleshooting of memory usage for people who may not be aware
> > that networking stack can be a large memory consumer.
> > 
> > For that a paragraph in 'man 5 proc' maybe a good start:
> 
> Yeah. Another major consumer that I've seen at some point was xfs buffers.

Likewise, there are various other types of memory consumers in the embedded world,
depending on the features they support. They are often tempted to add
their memory consumption to /proc/meminfo or /proc/vmstat, too, to get
memory visibility.

> And there might be others, and adding everything to /proc/meminfo is not
> feasible. I have once proposed adding a counter called "Unaccounted:" which
> would at least tell the user easily if a significant portion is occupied by
> memory not explained by the other meminfo counters, and look for trends
> (increase = potential memory leak?). For specific prominent consumers not
> covered by meminfo but that have some kind of internal counters, we could
> document where to look, such as /proc/net/sockstat or maybe create some
> /proc/ or /sys directory with file per consumer so that it's still easy to
> check, but without the overhead of global counters and bloated
> /proc/meminfo?

What I have in mind is to provide a simple, general sysfs infrastructure from MM for
drivers/subsystems rather than having each one create its own memory stat. The API
could support flexible accounting, such as just global memory consumption and/or
consumption by key (e.g., pid or a subsystem-specific key) for the details.

So they would all be shown in detail under /sys/kernel/mm/misc/, as well as
in /proc/meminfo as a single line for the global total.

Furthermore, I'd like to plug the infrastructure into the OOM message so that, once an OOM occurs,
we could print each driver's otherwise hidden memory usage if the driver
believes it could be a memory hog. That would make it easier to detect
such leaks or hogging, and would also improve maintainability.
Shakeel Butt Oct. 19, 2020, 5:23 p.m. UTC | #24
CCed: Paolo Bonzini

On Fri, Oct 16, 2020 at 1:53 PM Minchan Kim <minchan@kernel.org> wrote:
[snip]
> > And there might be others, and adding everything to /proc/meminfo is not
> > feasible. I have once proposed adding a counter called "Unaccounted:" which
> > would at least tell the user easily if a significant portion is occupied by
> > memory not explained by the other meminfo counters, and look for trends
> > (increase = potential memory leak?). For specific prominent consumers not
> > covered by meminfo but that have some kind of internal counters, we could
> > document where to look, such as /proc/net/sockstat or maybe create some
> > /proc/ or /sys directory with file per consumer so that it's still easy to
> > check, but without the overhead of global counters and bloated
> > /proc/meminfo?
>
> What have in my mind is to support simple general sysfs infra from MM for
> driver/subysstems rather than creating each own memory stat. The API
> could support flexible accounting like just global memory consumption and/or
> consmption by key(e.g,. pid or each own special) for the detail.
>
> So, they are all shown under /sys/kernel/mm/misc/ with detail as well as
> /proc/meminfo with simple line for global.

This reminds me of statsfs [1]. I am wondering if this can be another
useful use-case for statsfs.

[1] https://lkml.org/lkml/2020/5/26/332
diff mbox series

Patch

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 508b80f6329b..6f92775da85c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -418,6 +418,7 @@  static ssize_t node_read_meminfo(struct device *dev,
 #ifdef CONFIG_SHADOW_CALL_STACK
 		       "Node %d ShadowCallStack:%8lu kB\n"
 #endif
+		       "Node %d Sock:           %8lu kB\n"
 		       "Node %d PageTables:     %8lu kB\n"
 		       "Node %d NFS_Unstable:   %8lu kB\n"
 		       "Node %d Bounce:         %8lu kB\n"
@@ -441,6 +442,7 @@  static ssize_t node_read_meminfo(struct device *dev,
 		       nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
 		       nid, K(i.sharedram),
 		       nid, node_page_state(pgdat, NR_KERNEL_STACK_KB),
+		       nid, K(node_page_state(pgdat, NR_SOCK)),
 #ifdef CONFIG_SHADOW_CALL_STACK
 		       nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
 #endif
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 263b005981bd..e7183f67ae4a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2646,8 +2646,7 @@  static void free_receive_page_frags(struct virtnet_info *vi)
 {
 	int i;
 	for (i = 0; i < vi->max_queue_pairs; i++)
-		if (vi->rq[i].alloc_frag.page)
-			put_page(vi->rq[i].alloc_frag.page);
+		put_page_frag(&vi->rq[i].alloc_frag);
 }
 
 static void free_unused_bufs(struct virtnet_info *vi)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 887a5532e449..1dcf3120d831 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -106,6 +106,7 @@  static int meminfo_proc_show(struct seq_file *m, void *v)
 	seq_printf(m, "ShadowCallStack:%8lu kB\n",
 		   global_node_page_state(NR_KERNEL_SCS_KB));
 #endif
+	show_val_kb(m, "Sock:           ", global_node_page_state(NR_SOCK));
 	show_val_kb(m, "PageTables:     ",
 		    global_zone_page_state(NR_PAGETABLE));
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 31712bb61f7f..1996713d2c6b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -207,6 +207,7 @@  enum node_stat_item {
 #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
 	NR_KERNEL_SCS_KB,	/* measured in KiB */
 #endif
+	NR_SOCK,                /* Count of socket buffer pages */
 	NR_VM_NODE_STAT_ITEMS
 };
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fcd53f97c186..7e5108da4d84 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -19,7 +19,8 @@ 
 #include <linux/rbtree.h>
 #include <linux/socket.h>
 #include <linux/refcount.h>
-
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
 #include <linux/atomic.h>
 #include <asm/types.h>
 #include <linux/spinlock.h>
@@ -3003,6 +3004,25 @@  static inline void skb_frag_ref(struct sk_buff *skb, int f)
 	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
 }
 
+static inline void inc_sock_node_page_state(struct page *page)
+{
+	mod_node_page_state(page_pgdat(page), NR_SOCK, compound_nr(page));
+	/*
+	 * Indicate that we need to decrease the Sock page state when
+	 * the page freed.
+	 */
+	SetPagePrivate(page);
+}
+
+static inline void dec_sock_node_page_state(struct page *page)
+{
+	if (PagePrivate(page)) {
+		ClearPagePrivate(page);
+		mod_node_page_state(page_pgdat(page), NR_SOCK,
+				    -compound_nr(page));
+	}
+}
+
 /**
  * __skb_frag_unref - release a reference on a paged fragment.
  * @frag: the paged fragment
@@ -3011,7 +3031,12 @@  static inline void skb_frag_ref(struct sk_buff *skb, int f)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag)
 {
-	put_page(skb_frag_page(frag));
+	struct page *page = skb_frag_page(frag);
+
+	if (put_page_testzero(page)) {
+		dec_sock_node_page_state(page);
+		__put_page(page);
+	}
 }
 
 /**
@@ -3091,6 +3116,20 @@  static inline void skb_frag_set_page(struct sk_buff *skb, int f,
 	__skb_frag_set_page(&skb_shinfo(skb)->frags[f], page);
 }
 
+static inline bool put_page_frag(struct page_frag *pfrag)
+{
+	struct page *page = pfrag->page;
+
+	if (page) {
+		if (put_page_testzero(page)) {
+			dec_sock_node_page_state(page);
+			__put_page(page);
+		}
+		return true;
+	}
+	return false;
+}
+
 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio);
 
 /**
diff --git a/kernel/exit.c b/kernel/exit.c
index 62912406d74a..58d373767d16 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -841,8 +841,7 @@  void __noreturn do_exit(long code)
 	if (tsk->splice_pipe)
 		free_pipe_info(tsk->splice_pipe);
 
-	if (tsk->task_frag.page)
-		put_page(tsk->task_frag.page);
+	put_page_frag(&tsk->task_frag);
 
 	validate_creds_for_do_exit(tsk);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cefbef32bf4a..6c543158aa06 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5379,7 +5379,7 @@  void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 		" unevictable:%lu dirty:%lu writeback:%lu\n"
 		" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
 		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
-		" free:%lu free_pcp:%lu free_cma:%lu\n",
+		" free:%lu free_pcp:%lu free_cma:%lu sock:%lu\n",
 		global_node_page_state(NR_ACTIVE_ANON),
 		global_node_page_state(NR_INACTIVE_ANON),
 		global_node_page_state(NR_ISOLATED_ANON),
@@ -5397,7 +5397,8 @@  void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 		global_zone_page_state(NR_BOUNCE),
 		global_zone_page_state(NR_FREE_PAGES),
 		free_pcp,
-		global_zone_page_state(NR_FREE_CMA_PAGES));
+		global_zone_page_state(NR_FREE_CMA_PAGES),
+		global_node_page_state(NR_SOCK));
 
 	for_each_online_pgdat(pgdat) {
 		if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
@@ -5425,6 +5426,7 @@  void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 #ifdef CONFIG_SHADOW_CALL_STACK
 			" shadow_call_stack:%lukB"
 #endif
+			" sock:%lukB"
 			" all_unreclaimable? %s"
 			"\n",
 			pgdat->node_id,
@@ -5450,6 +5452,7 @@  void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 #ifdef CONFIG_SHADOW_CALL_STACK
 			node_page_state(pgdat, NR_KERNEL_SCS_KB),
 #endif
+			K(node_page_state(pgdat, NR_SOCK)),
 			pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
 				"yes" : "no");
 	}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b05dec387557..ceaf6f85c155 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1220,6 +1220,7 @@  const char * const vmstat_text[] = {
 #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
 	"nr_shadow_call_stack",
 #endif
+	"nr_sock",
 
 	/* enum writeback_stat_item counters */
 	"nr_dirty_threshold",
diff --git a/net/core/sock.c b/net/core/sock.c
index 5972d26f03ae..1661b423802b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1780,10 +1780,8 @@  static void __sk_destruct(struct rcu_head *head)
 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
 			 __func__, atomic_read(&sk->sk_omem_alloc));
 
-	if (sk->sk_frag.page) {
-		put_page(sk->sk_frag.page);
+	if (put_page_frag(&sk->sk_frag))
 		sk->sk_frag.page = NULL;
-	}
 
 	if (sk->sk_peer_cred)
 		put_cred(sk->sk_peer_cred);
@@ -2456,7 +2454,7 @@  bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 		}
 		if (pfrag->offset + sz <= pfrag->size)
 			return true;
-		put_page(pfrag->page);
+		put_page_frag(pfrag);
 	}
 
 	pfrag->offset = 0;
@@ -2469,12 +2467,14 @@  bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 					  SKB_FRAG_PAGE_ORDER);
 		if (likely(pfrag->page)) {
 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
+			inc_sock_node_page_state(pfrag->page);
 			return true;
 		}
 	}
 	pfrag->page = alloc_page(gfp);
 	if (likely(pfrag->page)) {
 		pfrag->size = PAGE_SIZE;
+		inc_sock_node_page_state(pfrag->page);
 		return true;
 	}
 	return false;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 57a568875539..583761844b4f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2751,8 +2751,7 @@  int tcp_disconnect(struct sock *sk, int flags)
 
 	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
 
-	if (sk->sk_frag.page) {
-		put_page(sk->sk_frag.page);
+	if (put_page_frag(&sk->sk_frag)) {
 		sk->sk_frag.page = NULL;
 		sk->sk_frag.offset = 0;
 	}
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 69520ad3d83b..0f7c16679e49 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -495,8 +495,7 @@  static void ___xfrm_state_destroy(struct xfrm_state *x)
 		x->type->destructor(x);
 		xfrm_put_type(x->type);
 	}
-	if (x->xfrag.page)
-		put_page(x->xfrag.page);
+	put_page_frag(&x->xfrag);
 	xfrm_dev_state_free(x);
 	security_xfrm_state_free(x);
 	xfrm_state_free(x);