Message ID: 20210121122723.3446-9-rppt@kernel.org
State: New
Series: [v16,01/11] mm: add definition of PMD_PAGE_ORDER
On Thu, Jan 21, 2021 at 02:27:20PM +0200, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
>
> Account memory consumed by secretmem to memcg. The accounting is updated
> when the memory is actually allocated and freed.

I think this is wrong. It fails to account subsequent allocators from
the same PMD. If you want to track like this, you need separate pools
per memcg.

I think you shouldn't try to track like this; better to just track on
a per-page basis. After all, the page allocator doesn't track order-10
pages to the memcg that initially caused them to be split.

> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> Acked-by: Roman Gushchin <guro@fb.com>
> Reviewed-by: Shakeel Butt <shakeelb@google.com>
[...]
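For comparison, the per-page tracking suggested above would charge each
small page to the memcg of the task that actually faults it in, instead of
charging the whole PMD to whoever first grew the pool. A minimal sketch,
assuming order-0 charge/uncharge hooks in the fault and free paths (the
helper names are illustrative, not from the posted series):

static int secretmem_charge_one(struct page *page, gfp_t gfp)
{
        /* Order-0 charge at fault time: each memcg pays only for the
         * pages its tasks actually touch within a shared PMD pool. */
        return memcg_kmem_charge_page(page, gfp, 0);
}

static void secretmem_uncharge_one(struct page *page)
{
        memcg_kmem_uncharge_page(page, 0);
}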
On Thu 21-01-21 14:27:20, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
>
> Account memory consumed by secretmem to memcg. The accounting is updated
> when the memory is actually allocated and freed.

What does this mean? What are the lifetime rules?

[...]

> +static int secretmem_account_pages(struct page *page, gfp_t gfp, int order)
> +{
> +        int err;
> +
> +        err = memcg_kmem_charge_page(page, gfp, order);
> +        if (err)
> +                return err;
> +
> +        /*
> +         * secretmem caches are unreclaimable kernel allocations, so treat
> +         * them as unreclaimable slab memory for VM statistics purposes
> +         */
> +        mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
> +                              PAGE_SIZE << order);

A lot of memcg-accounted memory is not reclaimable. Why do you abuse the
SLAB counter when this is not slab-owned memory? Why do you use the kmem
accounting API when __GFP_ACCOUNT should give you the same without these
details?
--
Michal Hocko
SUSE Labs
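For context on the __GFP_ACCOUNT remark: that flag makes the page
allocator charge the current task's memcg implicitly, so no explicit
charge call is needed. A sketch of the two equivalent forms, assuming a
plain alloc_pages() call (it does not directly apply to secretmem, whose
pages come from CMA):

/* Implicit: the allocator charges current's memcg for the allocation. */
struct page *p = alloc_pages(GFP_KERNEL | __GFP_ACCOUNT, order);

/* Explicit: what the patch does by hand after a non-accounted alloc. */
struct page *q = alloc_pages(GFP_KERNEL, order);
if (q && memcg_kmem_charge_page(q, GFP_KERNEL, order)) {
        __free_pages(q, order);
        q = NULL;
}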
On Mon, Jan 25, 2021 at 8:20 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, Jan 21, 2021 at 02:27:20PM +0200, Mike Rapoport wrote:
> > From: Mike Rapoport <rppt@linux.ibm.com>
> >
> > Account memory consumed by secretmem to memcg. The accounting is updated
> > when the memory is actually allocated and freed.
>
> I think this is wrong. It fails to account subsequent allocators from
> the same PMD. If you want to track like this, you need separate pools
> per memcg.
>

Are these secretmem pools shared between different jobs/memcgs?
On Mon, Jan 25, 2021 at 09:18:04AM -0800, Shakeel Butt wrote:
> On Mon, Jan 25, 2021 at 8:20 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Thu, Jan 21, 2021 at 02:27:20PM +0200, Mike Rapoport wrote:
> > > From: Mike Rapoport <rppt@linux.ibm.com>
> > >
> > > Account memory consumed by secretmem to memcg. The accounting is updated
> > > when the memory is actually allocated and freed.

I thought about doing per-page accounting, but then one would be able to
create a lot of secretmem file descriptors and use only a page from each,
while the actual memory consumption would be way higher.

> > I think this is wrong. It fails to account subsequent allocators from
> > the same PMD. If you want to track like this, you need separate pools
> > per memcg.
>
> Are these secretmem pools shared between different jobs/memcgs?

A secretmem pool is per anonymous file descriptor, and this file
descriptor can be shared only explicitly between several processes. So a
secretmem pool should not be shared between different jobs/memcgs.

Of course, it's possible to spread threads of a process across different
memcgs, but in that case the accounting will be similar to what happens
today with sl*b: the first thread to cause kmalloc() will be charged for
the allocation of the entire slab, and subsequent allocations from that
slab will not be accounted.

That said, having a pool per memcg would add a ton of complexity with
very dubious value.

--
Sincerely yours,
Mike.
On Mon, Jan 25, 2021 at 05:54:51PM +0100, Michal Hocko wrote:
> On Thu 21-01-21 14:27:20, Mike Rapoport wrote:
> > From: Mike Rapoport <rppt@linux.ibm.com>
> >
> > Account memory consumed by secretmem to memcg. The accounting is updated
> > when the memory is actually allocated and freed.
>
> What does this mean?

It means that the accounting is updated when secretmem does cma_alloc()
and cma_release().

> What are the lifetime rules?

Hmm, what do you mean by lifetime rules?

> [...]
>
> > +        mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
> > +                              PAGE_SIZE << order);
>
> A lot of memcg-accounted memory is not reclaimable. Why do you abuse the
> SLAB counter when this is not slab-owned memory? Why do you use the kmem
> accounting API when __GFP_ACCOUNT should give you the same without these
> details?

I cannot use __GFP_ACCOUNT because cma_alloc() does not take a gfp mask.
Besides, kmem accounting with __GFP_ACCOUNT does not seem to update the
stats, and there was an explicit request for statistics:

https://lore.kernel.org/lkml/CALo0P13aq3GsONnZrksZNU9RtfhMsZXGWhK1n=xYJWQizCd4Zw@mail.gmail.com/

As for (ab)using NR_SLAB_UNRECLAIMABLE_B, that was already discussed here:

https://lore.kernel.org/lkml/20201129172625.GD557259@kernel.org/

I think that a dedicated stats counter would be too much at the moment,
and NR_SLAB_UNRECLAIMABLE_B is the only explicit stat for unreclaimable
memory.

--
Sincerely yours,
Mike.
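The constraint Mike points at is visible in the cma_alloc() prototype of
that era, which carries no gfp mask that __GFP_ACCOUNT could piggyback on
(quoted from memory as of roughly v5.11, so treat as approximate; Michal
notes below that work was under way to change this):

/* include/linux/cma.h, circa v5.11 -- no gfp_t parameter */
struct page *cma_alloc(struct cma *cma, size_t count,
                       unsigned int align, bool no_warn);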
On Mon 25-01-21 23:38:17, Mike Rapoport wrote:
> On Mon, Jan 25, 2021 at 05:54:51PM +0100, Michal Hocko wrote:
> > On Thu 21-01-21 14:27:20, Mike Rapoport wrote:
> > > From: Mike Rapoport <rppt@linux.ibm.com>
> > >
> > > Account memory consumed by secretmem to memcg. The accounting is updated
> > > when the memory is actually allocated and freed.
> >
> > What does this mean?
>
> It means that the accounting is updated when secretmem does cma_alloc()
> and cma_release().
>
> > What are the lifetime rules?
>
> Hmm, what do you mean by lifetime rules?

OK, so let's start with reservation time (mmap time, right?) and then the
instantiation time (faulting in memory). What if the calling process of
the former has a different memcg context than the latter? E.g. when you
send your fd, or an inherited fd after fork moves to a different memcg.

What about the freeing path? E.g. when you punch a hole in the middle of
a mapping?

Please make sure to document all this.

> > [...]
> >
> > > +        mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
> > > +                              PAGE_SIZE << order);
> >
> > A lot of memcg-accounted memory is not reclaimable. Why do you abuse the
> > SLAB counter when this is not slab-owned memory? Why do you use the kmem
> > accounting API when __GFP_ACCOUNT should give you the same without these
> > details?
>
> I cannot use __GFP_ACCOUNT because cma_alloc() does not take a gfp mask.

Other people are working on changing this. But OK, I do see that this can
be done later, although it looks rather awkward.

> Besides, kmem accounting with __GFP_ACCOUNT does not seem to update the
> stats, and there was an explicit request for statistics:
>
> https://lore.kernel.org/lkml/CALo0P13aq3GsONnZrksZNU9RtfhMsZXGWhK1n=xYJWQizCd4Zw@mail.gmail.com/

Charging and stats are two different things. You can still take care of
your stats without explicitly using the charging API. But this is a mere
detail; it just caught my eye.

> As for (ab)using NR_SLAB_UNRECLAIMABLE_B, that was already discussed here:
>
> https://lore.kernel.org/lkml/20201129172625.GD557259@kernel.org/

Those arguments should be a part of the changelog.

> I think that a dedicated stats counter would be too much at the moment,
> and NR_SLAB_UNRECLAIMABLE_B is the only explicit stat for unreclaimable
> memory.

Why do you think it would be too much? If secret memory becomes a
prevalent memory user because it happens to back whole virtual machines,
then hiding it in an existing counter would be less than useful.

Please note that this all is user-visible stuff that will become a PITA
(if it is possible at all) to change later on. You should really have
strong arguments in your justification here.
--
Michal Hocko
SUSE Labs
On Tue, Jan 26, 2021 at 08:31:42AM +0100, Michal Hocko wrote:
> On Mon 25-01-21 23:38:17, Mike Rapoport wrote:
[...]
> > > What are the lifetime rules?
> >
> > Hmm, what do you mean by lifetime rules?
>
> OK, so let's start with reservation time (mmap time, right?) and then the
> instantiation time (faulting in memory). What if the calling process of
> the former has a different memcg context than the latter? E.g. when you
> send your fd, or an inherited fd after fork moves to a different memcg.
>
> What about the freeing path? E.g. when you punch a hole in the middle of
> a mapping?
>
> Please make sure to document all this.

So, does something like this answer your question:

---
The memory cgroup is charged when secretmem allocates pages from CMA to
increase the large page pool during ->fault() processing.

The pages are uncharged from the memory cgroup when they are released
back to CMA at the time the secretmem inode is evicted.
---

[...]

> > I think that a dedicated stats counter would be too much at the moment,
> > and NR_SLAB_UNRECLAIMABLE_B is the only explicit stat for unreclaimable
> > memory.
>
> Why do you think it would be too much? If secret memory becomes a
> prevalent memory user because it happens to back whole virtual machines,
> then hiding it in an existing counter would be less than useful.
>
> Please note that this all is user-visible stuff that will become a PITA
> (if it is possible at all) to change later on. You should really have
> strong arguments in your justification here.

I think that adding a dedicated counter for a few 2M areas per container
is not worth the churn. When we get to the point that secretmem can be
used to back the entire guest memory, we can add a new counter, and that
does not seem like a PITA to me.

--
Sincerely yours,
Mike.
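The lifetime rules sketched in Mike's draft could also live next to the
charge/uncharge helpers as a comment. A possible wording, distilled from
this thread rather than taken from the posted patch:

/*
 * Memcg lifetime of a PMD-sized secretmem chunk:
 *  - charged to the memcg of the task whose ->fault() grows the pool,
 *    i.e. at cma_alloc() time, not at file creation or mmap() time;
 *  - sharing the fd with tasks in other memcgs does not re-charge:
 *    the whole chunk stays charged to the original memcg;
 *  - uncharged when the chunk is released back to CMA, which happens
 *    on secretmem inode eviction, not on hole-punch or munmap().
 */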
On Tue 26-01-21 10:56:54, Mike Rapoport wrote:
> On Tue, Jan 26, 2021 at 08:31:42AM +0100, Michal Hocko wrote:
[...]
> > Please make sure to document all this.
>
> So, does something like this answer your question:
>
> ---
> The memory cgroup is charged when secretmem allocates pages from CMA to
> increase the large page pool during ->fault() processing.

OK, so that is when the memory is faulted in. Good, that is a standard
model we have. The memcg context of the creator of the secret memory is
not really important, so whoever has created it is not charged.

> The pages are uncharged from the memory cgroup when they are released
> back to CMA at the time the secretmem inode is evicted.
> ---

So effectively when they are unmapped, right? This is similar to
anonymous memory. As I've said, it would be really great to have this
life cycle documented properly.

> > Please note that this all is user-visible stuff that will become a PITA
> > (if it is possible at all) to change later on. You should really have
> > strong arguments in your justification here.
>
> I think that adding a dedicated counter for a few 2M areas per container
> is not worth the churn.

What kind of churn do you have in mind? What is the downside?

> When we get to the point that secretmem can be used to back the entire
> guest memory, we can add a new counter, and that does not seem like a
> PITA to me.

What really prevents a larger use with this implementation?
--
Michal Hocko
SUSE Labs
On Mon, Jan 25, 2021 at 11:38:17PM +0200, Mike Rapoport wrote:
> I cannot use __GFP_ACCOUNT because cma_alloc() does not take a gfp mask.
> Besides, kmem accounting with __GFP_ACCOUNT does not seem to update the
> stats, and there was an explicit request for statistics:
>
> https://lore.kernel.org/lkml/CALo0P13aq3GsONnZrksZNU9RtfhMsZXGWhK1n=xYJWQizCd4Zw@mail.gmail.com/
>
> As for (ab)using NR_SLAB_UNRECLAIMABLE_B, that was already discussed here:
>
> https://lore.kernel.org/lkml/20201129172625.GD557259@kernel.org/
>
> I think that a dedicated stats counter would be too much at the moment,
> and NR_SLAB_UNRECLAIMABLE_B is the only explicit stat for unreclaimable
> memory.

That's not true -- Mlocked is also unreclaimable. And doesn't this feel
more like mlocked memory than unreclaimable slab? It's also Unevictable,
so it could be counted there instead.
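Mechanically, counting these pages as unevictable instead of slab would be
a small substitution in the two helpers; note that NR_UNEVICTABLE is kept
in pages rather than bytes, so the deltas change accordingly. An untested
sketch, which also sidesteps Michal's point below that these pages are not
actually on any LRU:

/* charge side: account the chunk as unevictable user memory */
mod_lruvec_page_state(page, NR_UNEVICTABLE, 1 << order);

/* uncharge side */
mod_lruvec_page_state(page, NR_UNEVICTABLE, -(1 << order));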
On Tue 26-01-21 14:48:38, Matthew Wilcox wrote:
> On Mon, Jan 25, 2021 at 11:38:17PM +0200, Mike Rapoport wrote:
[...]
> > I think that a dedicated stats counter would be too much at the moment,
> > and NR_SLAB_UNRECLAIMABLE_B is the only explicit stat for unreclaimable
> > memory.
>
> That's not true -- Mlocked is also unreclaimable. And doesn't this feel
> more like mlocked memory than unreclaimable slab? It's also Unevictable,
> so it could be counted there instead.

Yes, that is indeed true, except the unevictable counter is tracking the
unevictable LRUs. These pages are not on any LRU, and that can cause some
confusion. Maybe they shouldn't be so special; they could live on the
unevictable LRU and get their stats automagically.

I definitely do agree that this would be a better fit than the NR_SLAB
abuse. But considering that this is somehow even more special than mlock,
a dedicated counter sounds like an even better fit.
--
Michal Hocko
SUSE Labs
On Tue, Jan 26, 2021 at 04:05:55PM +0100, Michal Hocko wrote:
> On Tue 26-01-21 14:48:38, Matthew Wilcox wrote:
[...]
> > That's not true -- Mlocked is also unreclaimable. And doesn't this feel
> > more like mlocked memory than unreclaimable slab? It's also Unevictable,
> > so it could be counted there instead.
>
> Yes, that is indeed true, except the unevictable counter is tracking the
> unevictable LRUs. These pages are not on any LRU, and that can cause some
> confusion. Maybe they shouldn't be so special; they could live on the
> unevictable LRU and get their stats automagically.
>
> I definitely do agree that this would be a better fit than the NR_SLAB
> abuse. But considering that this is somehow even more special than mlock,
> a dedicated counter sounds like an even better fit.

I think it depends on how large these areas will be in practice. If they
are measured in single or double digits of MBs, a separate entry is
hardly a good choice: because of the batching, the displayed value will
be in the noise range, and every new vmstat item adds to the struct
mem_cgroup size.

If they are measured in GBs, of course, a separate counter is preferred.
So I'd suggest going with NR_SLAB (which should have been named NR_KMEM)
for now and conditionally switching to a separate counter later.

Thanks!
On Wed 27-01-21 10:42:13, Roman Gushchin wrote:
> On Tue, Jan 26, 2021 at 04:05:55PM +0100, Michal Hocko wrote:
[...]
> I think it depends on how large these areas will be in practice. If they
> are measured in single or double digits of MBs, a separate entry is
> hardly a good choice: because of the batching, the displayed value will
> be in the noise range, and every new vmstat item adds to the struct
> mem_cgroup size.
>
> If they are measured in GBs, of course, a separate counter is preferred.
> So I'd suggest going with NR_SLAB (which should have been named NR_KMEM)
> for now and conditionally switching to a separate counter later.

I really do not think the overall usage matters when it comes to abusing
other counters. Changing this in the future will always be tricky, and
there will always be our favorite "can this break userspace" question.
Yes, we dared to change the meaning of some counters, but this is not
generally possible. Just have a look at how accounting shmem as page
cache has turned out to be much trickier than many would like.

Really, if a separate counter is a big deal, for which I do not see any
big reason, then this should be accounted as unevictable (as suggested
by Matthew), and ideally pages of those mappings should be sitting on
the unevictable LRU as well, unless there is a strong reason against.
--
Michal Hocko
SUSE Labs
On Wed, Jan 27, 2021 at 11:59 PM Michal Hocko <mhocko@suse.com> wrote:
>
> On Wed 27-01-21 10:42:13, Roman Gushchin wrote:
[...]
> I really do not think the overall usage matters when it comes to abusing
> other counters. Changing this in the future will always be tricky, and
> there will always be our favorite "can this break userspace" question.
> Yes, we dared to change the meaning of some counters, but this is not
> generally possible. Just have a look at how accounting shmem as page
> cache has turned out to be much trickier than many would like.
>
> Really, if a separate counter is a big deal, for which I do not see any
> big reason, then this should be accounted as unevictable (as suggested
> by Matthew), and ideally pages of those mappings should be sitting on
> the unevictable LRU as well, unless there is a strong reason against.
>

Why not decide based on the movability of these pages? If movable, then
the unevictable LRU seems like the right way; otherwise, NR_SLAB.
On Thu 28-01-21 06:05:11, Shakeel Butt wrote:
> On Wed, Jan 27, 2021 at 11:59 PM Michal Hocko <mhocko@suse.com> wrote:
[...]
> > Really, if a separate counter is a big deal, for which I do not see any
> > big reason, then this should be accounted as unevictable (as suggested
> > by Matthew), and ideally pages of those mappings should be sitting on
> > the unevictable LRU as well, unless there is a strong reason against.
>
> Why not decide based on the movability of these pages? If movable, then
> the unevictable LRU seems like the right way; otherwise, NR_SLAB.

I really do not follow. If the page is unevictable, then why does
movability matter? I also fail to see why NR_SLAB is even considered,
given this is completely outside of slab proper.

Really, what is the point? What are we trying to achieve with the stats?
Do we want to know how much secret memory is used because that is
interesting/important information, or do we just want to make some
accounting?

Just think of it from a practical point of view. I want to know how much
slab memory is used because it can give me an idea whether the kernel is
consuming an unexpected amount of memory. Now I have to subtract _some_
number to get that information. Where do I get that number?

We have been creative with counters before, and it tends to kick back
much more often than it helps.

I really do not want this to turn into an endless bike shed, but either
this should be accounted as a general type of memory (unevictable would
be a good fit because this is userspace memory which is not reclaimable)
or it needs its own counter to tell how much of this specific type of
memory is used for this purpose.
--
Michal Hocko
SUSE Labs
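A dedicated counter of the kind argued for here would amount to one new
node stat plus its vmstat plumbing. A hypothetical sketch -- NR_SECRETMEM
is an invented name, no such counter existed at the time, and context
lines are elided:

--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ enum node_stat_item {
+	NR_SECRETMEM,	/* hypothetical: secretmem pages, direct map removed */
 	NR_VM_NODE_STAT_ITEMS
 };
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ const char * const vmstat_text[] = {
+	"nr_secretmem",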
On Thu, Jan 28, 2021 at 6:22 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Thu 28-01-21 06:05:11, Shakeel Butt wrote:
[...]
> > Why not decide based on the movability of these pages? If movable, then
> > the unevictable LRU seems like the right way; otherwise, NR_SLAB.
>
> I really do not follow. If the page is unevictable, then why does
> movability matter?

My point was that if these pages are very similar to our existing
definition of unevictable LRU pages, then it makes more sense to account
them in the unevictable stat.

> I also fail to see why NR_SLAB is even considered, given this is
> completely outside of slab proper.
>
> Really, what is the point? What are we trying to achieve with the stats?
> Do we want to know how much secret memory is used because that is
> interesting/important information, or do we just want to make some
> accounting?
>
> Just think of it from a practical point of view. I want to know how much
> slab memory is used because it can give me an idea whether the kernel is
> consuming an unexpected amount of memory. Now I have to subtract _some_
> number to get that information. Where do I get that number?
>
> We have been creative with counters before, and it tends to kick back
> much more often than it helps.
>
> I really do not want this to turn into an endless bike shed, but either
> this should be accounted as a general type of memory (unevictable would
> be a good fit because this is userspace memory which is not reclaimable)
> or it needs its own counter to tell how much of this specific type of
> memory is used for this purpose.
>

I suggested having a separate counter in a previous version but got shot
down based on the not-yet-clear benefit of a separate stat for it. There
is also the option of neither adding a new stat nor using an existing one
at this moment. Once there are clearer use-cases and real usage of
secretmem, adding a new stat at that time would be much simpler than
changing the definition of existing stats.
On Mon, Jan 25, 2021 at 1:35 PM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Mon, Jan 25, 2021 at 09:18:04AM -0800, Shakeel Butt wrote:
[...]
> I thought about doing per-page accounting, but then one would be able to
> create a lot of secretmem file descriptors and use only a page from each,
> while the actual memory consumption would be way higher.
>
> > Are these secretmem pools shared between different jobs/memcgs?
>
> A secretmem pool is per anonymous file descriptor, and this file
> descriptor can be shared only explicitly between several processes. So a
> secretmem pool should not be shared between different jobs/memcgs.
>
> Of course, it's possible to spread threads of a process across different
> memcgs, but in that case the accounting will be similar to what happens
> today with sl*b:

I don't think memcg accounting for sl*b works like that.

> the first thread to cause kmalloc() will be charged for the allocation
> of the entire slab, and subsequent allocations from that slab will not
> be accounted.

The latest kernel does object-level memcg accounting, so each allocation
from these threads will correctly charge its own memcg.
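The behavior Shakeel describes comes from the per-object slab accounting
rework (merged around v5.9, if memory serves): each accounted allocation
charges the allocating task's own memcg, regardless of which slab page
the object lands in. Roughly:

/* Called from a task in memcg A: this object is charged to A. */
void *a = kmalloc(64, GFP_KERNEL_ACCOUNT);

/* Called from a task in memcg B: charged to B, even if the object
 * comes from the very same slab page as the allocation above. */
void *b = kmalloc(64, GFP_KERNEL_ACCOUNT);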
diff --git a/mm/filemap.c b/mm/filemap.c
index 2d0c6721879d..bb28dd6d9e22 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,6 +42,7 @@
 #include <linux/psi.h>
 #include <linux/ramfs.h>
 #include <linux/page_idle.h>
+#include <linux/secretmem.h>
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
@@ -839,7 +840,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
 	page->mapping = mapping;
 	page->index = offset;
 
-	if (!huge) {
+	if (!huge && !page_is_secretmem(page)) {
 		error = mem_cgroup_charge(page, current->mm, gfp);
 		if (error)
 			goto error;
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 469211c7cc3a..05026460e2ee 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -18,6 +18,7 @@
 #include <linux/memblock.h>
 #include <linux/pseudo_fs.h>
 #include <linux/secretmem.h>
+#include <linux/memcontrol.h>
 #include <linux/set_memory.h>
 #include <linux/sched/signal.h>
 
@@ -44,6 +45,32 @@ struct secretmem_ctx {
 
 static struct cma *secretmem_cma;
 
+static int secretmem_account_pages(struct page *page, gfp_t gfp, int order)
+{
+	int err;
+
+	err = memcg_kmem_charge_page(page, gfp, order);
+	if (err)
+		return err;
+
+	/*
+	 * secretmem caches are unreclaimable kernel allocations, so treat
+	 * them as unreclaimable slab memory for VM statistics purposes
+	 */
+	mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+			      PAGE_SIZE << order);
+
+	return 0;
+}
+
+static void secretmem_unaccount_pages(struct page *page, int order)
+{
+	mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+			      -PAGE_SIZE << order);
+	memcg_kmem_uncharge_page(page, order);
+}
+
 static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
 {
 	unsigned long nr_pages = (1 << PMD_PAGE_ORDER);
@@ -56,6 +83,10 @@ static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
 	if (!page)
 		return -ENOMEM;
 
+	err = secretmem_account_pages(page, gfp, PMD_PAGE_ORDER);
+	if (err)
+		goto err_cma_release;
+
 	/*
 	 * clear the data left from the previous user before dropping the
 	 * pages from the direct map
@@ -65,7 +96,7 @@ static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
 
 	err = set_direct_map_invalid_noflush(page, nr_pages);
 	if (err)
-		goto err_cma_release;
+		goto err_memcg_uncharge;
 
 	addr = (unsigned long)page_address(page);
 	err = gen_pool_add(pool, addr, PMD_SIZE, NUMA_NO_NODE);
@@ -83,6 +114,8 @@ static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
 	 * won't fail
 	 */
 	set_direct_map_default_noflush(page, nr_pages);
+err_memcg_uncharge:
+	secretmem_unaccount_pages(page, PMD_PAGE_ORDER);
 err_cma_release:
 	cma_release(secretmem_cma, page, nr_pages);
 	return err;
@@ -314,6 +347,7 @@ static void secretmem_cleanup_chunk(struct gen_pool *pool,
 	int i;
 
 	set_direct_map_default_noflush(page, nr_pages);
+	secretmem_unaccount_pages(page, PMD_PAGE_ORDER);
 
 	for (i = 0; i < nr_pages; i++)
 		clear_highpage(page + i);