diff mbox series

mm/zsmalloc.c: Fix zsmalloc 32-bit PAE support

Message ID 20181210142105.6750-1-rafael.tinoco@linaro.org
State New
Headers show
Series mm/zsmalloc.c: Fix zsmalloc 32-bit PAE support | expand

Commit Message

Rafael David Tinoco Dec. 10, 2018, 2:21 p.m. UTC
On 32-bit systems, zsmalloc uses HIGHMEM and, when PAE is enabled, the
physical frame number might be so big that zsmalloc obj encoding (to
location) will break, causing:

BUG: KASAN: null-ptr-deref in zs_map_object+0xa4/0x2bc
Read of size 4 at addr 00000000 by task mkfs.ext4/623
CPU: 2 PID: 623 Comm: mkfs.ext4 Not tainted 4.19.0-rc8-00017-g8239bc6d3307-dirty #15
Hardware name: Generic DT based system
[<c0418f7c>] (unwind_backtrace) from [<c0410ca4>] (show_stack+0x20/0x24)
[<c0410ca4>] (show_stack) from [<c16bd540>] (dump_stack+0xbc/0xe8)
[<c16bd540>] (dump_stack) from [<c06cab74>] (kasan_report+0x248/0x390)
[<c06cab74>] (kasan_report) from [<c06c94e8>] (__asan_load4+0x78/0xb4)
[<c06c94e8>] (__asan_load4) from [<c06ddc24>] (zs_map_object+0xa4/0x2bc)
[<c06ddc24>] (zs_map_object) from [<bf0bbbd8>] (zram_bvec_rw.constprop.2+0x324/0x8e4 [zram])
[<bf0bbbd8>] (zram_bvec_rw.constprop.2 [zram]) from [<bf0bc3cc>] (zram_make_request+0x234/0x46c [zram])
[<bf0bc3cc>] (zram_make_request [zram]) from [<c09aff9c>] (generic_make_request+0x304/0x63c)
[<c09aff9c>] (generic_make_request) from [<c09b0320>] (submit_bio+0x4c/0x1c8)
[<c09b0320>] (submit_bio) from [<c0743570>] (submit_bh_wbc.constprop.15+0x238/0x26c)
[<c0743570>] (submit_bh_wbc.constprop.15) from [<c0746cf8>] (__block_write_full_page+0x524/0x76c)
[<c0746cf8>] (__block_write_full_page) from [<c07472c4>] (block_write_full_page+0x1bc/0x1d4)
[<c07472c4>] (block_write_full_page) from [<c0748eb4>] (blkdev_writepage+0x24/0x28)
[<c0748eb4>] (blkdev_writepage) from [<c064a780>] (__writepage+0x44/0x78)
[<c064a780>] (__writepage) from [<c064b50c>] (write_cache_pages+0x3b8/0x800)
[<c064b50c>] (write_cache_pages) from [<c064dd78>] (generic_writepages+0x74/0xa0)
[<c064dd78>] (generic_writepages) from [<c0748e64>] (blkdev_writepages+0x18/0x1c)
[<c0748e64>] (blkdev_writepages) from [<c064e890>] (do_writepages+0x68/0x134)
[<c064e890>] (do_writepages) from [<c06368a4>] (__filemap_fdatawrite_range+0xb0/0xf4)
[<c06368a4>] (__filemap_fdatawrite_range) from [<c0636b68>] (file_write_and_wait_range+0x64/0xd0)
[<c0636b68>] (file_write_and_wait_range) from [<c0747af8>] (blkdev_fsync+0x54/0x84)
[<c0747af8>] (blkdev_fsync) from [<c0739dac>] (vfs_fsync_range+0x70/0xd4)
[<c0739dac>] (vfs_fsync_range) from [<c0739e98>] (do_fsync+0x4c/0x80)
[<c0739e98>] (do_fsync) from [<c073a26c>] (sys_fsync+0x1c/0x20)
[<c073a26c>] (sys_fsync) from [<c0401000>] (ret_fast_syscall+0x0/0x2c)

when trying to decode (the pfn) and map the object.

That happens because one architecture might not re-define
MAX_PHYSMEM_BITS, like in this ARM 32-bit w/ LPAE enabled example. For
32-bit systems, if not re-defined, MAX_POSSIBLE_PHYSMEM_BITS will
default to BITS_PER_LONG (32) in most cases, and, with PAE enabled,
_PFN_BITS might be wrong: which may cause obj variable to overflow if
frame number is in HIGHMEM and referencing a page above the 4GB
watermark.

commit 6e00ec00b1a7 ("staging: zsmalloc: calculate MAX_PHYSMEM_BITS if
not defined") realized MAX_PHYSMEM_BITS depended on SPARSEMEM headers
and "fixed" it by calculating it using BITS_PER_LONG if SPARSEMEM wasn't
used, like in the example given above.

Systems with potential for PAE exist for a long time and assuming
BITS_PER_LONG seems inadequate. Defining MAX_PHYSMEM_BITS looks better,
however it is NOT a constant anymore for x86.

SO, instead, MAX_POSSIBLE_PHYSMEM_BITS should be defined by every
architecture using zsmalloc, together with a sanity check for
MAX_POSSIBLE_PHYSMEM_BITS being too big on 32-bit systems.

Link: https://bugs.linaro.org/show_bug.cgi?id=3765#c17
Signed-off-by: Rafael David Tinoco <rafael.tinoco@linaro.org>

---
 arch/arm/include/asm/pgtable-2level-types.h |  2 ++
 arch/arm/include/asm/pgtable-3level-types.h |  2 ++
 arch/arm64/include/asm/pgtable-types.h      |  2 ++
 arch/ia64/include/asm/page.h                |  2 ++
 arch/mips/include/asm/page.h                |  2 ++
 arch/powerpc/include/asm/mmu.h              |  2 ++
 arch/s390/include/asm/page.h                |  2 ++
 arch/sh/include/asm/page.h                  |  2 ++
 arch/sparc/include/asm/page_32.h            |  2 ++
 arch/sparc/include/asm/page_64.h            |  2 ++
 arch/x86/include/asm/pgtable-2level_types.h |  2 ++
 arch/x86/include/asm/pgtable-3level_types.h |  3 +-
 arch/x86/include/asm/pgtable_64_types.h     |  4 +--
 mm/zsmalloc.c                               | 35 +++++++++++----------
 14 files changed, 45 insertions(+), 19 deletions(-)

-- 
2.20.0.rc1

Comments

Robin Murphy Dec. 10, 2018, 2:35 p.m. UTC | #1
On 10/12/2018 14:21, Rafael David Tinoco wrote:
> On 32-bit systems, zsmalloc uses HIGHMEM and, when PAE is enabled, the

> physical frame number might be so big that zsmalloc obj encoding (to

> location) will break, causing:

> 

> BUG: KASAN: null-ptr-deref in zs_map_object+0xa4/0x2bc

> Read of size 4 at addr 00000000 by task mkfs.ext4/623

> CPU: 2 PID: 623 Comm: mkfs.ext4 Not tainted 4.19.0-rc8-00017-g8239bc6d3307-dirty #15

> Hardware name: Generic DT based system

> [<c0418f7c>] (unwind_backtrace) from [<c0410ca4>] (show_stack+0x20/0x24)

> [<c0410ca4>] (show_stack) from [<c16bd540>] (dump_stack+0xbc/0xe8)

> [<c16bd540>] (dump_stack) from [<c06cab74>] (kasan_report+0x248/0x390)

> [<c06cab74>] (kasan_report) from [<c06c94e8>] (__asan_load4+0x78/0xb4)

> [<c06c94e8>] (__asan_load4) from [<c06ddc24>] (zs_map_object+0xa4/0x2bc)

> [<c06ddc24>] (zs_map_object) from [<bf0bbbd8>] (zram_bvec_rw.constprop.2+0x324/0x8e4 [zram])

> [<bf0bbbd8>] (zram_bvec_rw.constprop.2 [zram]) from [<bf0bc3cc>] (zram_make_request+0x234/0x46c [zram])

> [<bf0bc3cc>] (zram_make_request [zram]) from [<c09aff9c>] (generic_make_request+0x304/0x63c)

> [<c09aff9c>] (generic_make_request) from [<c09b0320>] (submit_bio+0x4c/0x1c8)

> [<c09b0320>] (submit_bio) from [<c0743570>] (submit_bh_wbc.constprop.15+0x238/0x26c)

> [<c0743570>] (submit_bh_wbc.constprop.15) from [<c0746cf8>] (__block_write_full_page+0x524/0x76c)

> [<c0746cf8>] (__block_write_full_page) from [<c07472c4>] (block_write_full_page+0x1bc/0x1d4)

> [<c07472c4>] (block_write_full_page) from [<c0748eb4>] (blkdev_writepage+0x24/0x28)

> [<c0748eb4>] (blkdev_writepage) from [<c064a780>] (__writepage+0x44/0x78)

> [<c064a780>] (__writepage) from [<c064b50c>] (write_cache_pages+0x3b8/0x800)

> [<c064b50c>] (write_cache_pages) from [<c064dd78>] (generic_writepages+0x74/0xa0)

> [<c064dd78>] (generic_writepages) from [<c0748e64>] (blkdev_writepages+0x18/0x1c)

> [<c0748e64>] (blkdev_writepages) from [<c064e890>] (do_writepages+0x68/0x134)

> [<c064e890>] (do_writepages) from [<c06368a4>] (__filemap_fdatawrite_range+0xb0/0xf4)

> [<c06368a4>] (__filemap_fdatawrite_range) from [<c0636b68>] (file_write_and_wait_range+0x64/0xd0)

> [<c0636b68>] (file_write_and_wait_range) from [<c0747af8>] (blkdev_fsync+0x54/0x84)

> [<c0747af8>] (blkdev_fsync) from [<c0739dac>] (vfs_fsync_range+0x70/0xd4)

> [<c0739dac>] (vfs_fsync_range) from [<c0739e98>] (do_fsync+0x4c/0x80)

> [<c0739e98>] (do_fsync) from [<c073a26c>] (sys_fsync+0x1c/0x20)

> [<c073a26c>] (sys_fsync) from [<c0401000>] (ret_fast_syscall+0x0/0x2c)

> 

> when trying to decode (the pfn) and map the object.

> 

> That happens because one architecture might not re-define

> MAX_PHYSMEM_BITS, like in this ARM 32-bit w/ LPAE enabled example. For

> 32-bit systems, if not re-defined, MAX_POSSIBLE_PHYSMEM_BITS will

> default to BITS_PER_LONG (32) in most cases, and, with PAE enabled,

> _PFN_BITS might be wrong: which may cause obj variable to overflow if

> frame number is in HIGHMEM and referencing a page above the 4GB

> watermark.

> 

> commit 6e00ec00b1a7 ("staging: zsmalloc: calculate MAX_PHYSMEM_BITS if

> not defined") realized MAX_PHYSMEM_BITS depended on SPARSEMEM headers

> and "fixed" it by calculating it using BITS_PER_LONG if SPARSEMEM wasn't

> used, like in the example given above.

> 

> Systems with potential for PAE exist for a long time and assuming

> BITS_PER_LONG seems inadequate. Defining MAX_PHYSMEM_BITS looks better,

> however it is NOT a constant anymore for x86.

> 

> SO, instead, MAX_POSSIBLE_PHYSMEM_BITS should be defined by every

> architecture using zsmalloc, together with a sanity check for

> MAX_POSSIBLE_PHYSMEM_BITS being too big on 32-bit systems.

> 

> Link: https://bugs.linaro.org/show_bug.cgi?id=3765#c17

> Signed-off-by: Rafael David Tinoco <rafael.tinoco@linaro.org>

> ---

>   arch/arm/include/asm/pgtable-2level-types.h |  2 ++

>   arch/arm/include/asm/pgtable-3level-types.h |  2 ++

>   arch/arm64/include/asm/pgtable-types.h      |  2 ++

>   arch/ia64/include/asm/page.h                |  2 ++

>   arch/mips/include/asm/page.h                |  2 ++

>   arch/powerpc/include/asm/mmu.h              |  2 ++

>   arch/s390/include/asm/page.h                |  2 ++

>   arch/sh/include/asm/page.h                  |  2 ++

>   arch/sparc/include/asm/page_32.h            |  2 ++

>   arch/sparc/include/asm/page_64.h            |  2 ++

>   arch/x86/include/asm/pgtable-2level_types.h |  2 ++

>   arch/x86/include/asm/pgtable-3level_types.h |  3 +-

>   arch/x86/include/asm/pgtable_64_types.h     |  4 +--

>   mm/zsmalloc.c                               | 35 +++++++++++----------

>   14 files changed, 45 insertions(+), 19 deletions(-)

> 

> diff --git a/arch/arm/include/asm/pgtable-2level-types.h b/arch/arm/include/asm/pgtable-2level-types.h

> index 66cb5b0e89c5..552dba411324 100644

> --- a/arch/arm/include/asm/pgtable-2level-types.h

> +++ b/arch/arm/include/asm/pgtable-2level-types.h

> @@ -64,4 +64,6 @@ typedef pteval_t pgprot_t;

>   

>   #endif /* STRICT_MM_TYPECHECKS */

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS 32

> +

>   #endif	/* _ASM_PGTABLE_2LEVEL_TYPES_H */

> diff --git a/arch/arm/include/asm/pgtable-3level-types.h b/arch/arm/include/asm/pgtable-3level-types.h

> index 921aa30259c4..664c39e6517c 100644

> --- a/arch/arm/include/asm/pgtable-3level-types.h

> +++ b/arch/arm/include/asm/pgtable-3level-types.h

> @@ -67,4 +67,6 @@ typedef pteval_t pgprot_t;

>   

>   #endif	/* STRICT_MM_TYPECHECKS */

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS 36


Nit: with LPAE, physical addresses go up to 40 bits, not just 36.

Robin.

> +

>   #endif	/* _ASM_PGTABLE_3LEVEL_TYPES_H */

> diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h

> index 345a072b5856..45c3834eb4c8 100644

> --- a/arch/arm64/include/asm/pgtable-types.h

> +++ b/arch/arm64/include/asm/pgtable-types.h

> @@ -64,4 +64,6 @@ typedef struct { pteval_t pgprot; } pgprot_t;

>   #include <asm-generic/5level-fixup.h>

>   #endif

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS CONFIG_ARM64_PA_BITS

> +

>   #endif	/* __ASM_PGTABLE_TYPES_H */

> diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h

> index 5798bd2b462c..a3e055979e46 100644

> --- a/arch/ia64/include/asm/page.h

> +++ b/arch/ia64/include/asm/page.h

> @@ -235,4 +235,6 @@ get_order (unsigned long size)

>   

>   #define __HAVE_ARCH_GATE_AREA	1

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS 50

> +

>   #endif /* _ASM_IA64_PAGE_H */

> diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h

> index e8cc328fce2d..f6a5dea1a66c 100644

> --- a/arch/mips/include/asm/page.h

> +++ b/arch/mips/include/asm/page.h

> @@ -263,4 +263,6 @@ extern int __virt_addr_valid(const volatile void *kaddr);

>   #include <asm-generic/memory_model.h>

>   #include <asm-generic/getorder.h>

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS 48

> +

>   #endif /* _ASM_PAGE_H */

> diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h

> index eb20eb3b8fb0..2ebc1d2d9a5c 100644

> --- a/arch/powerpc/include/asm/mmu.h

> +++ b/arch/powerpc/include/asm/mmu.h

> @@ -324,6 +324,8 @@ static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)

>   #define MAX_PHYSMEM_BITS        46

>   #endif

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS

> +

>   #ifdef CONFIG_PPC_BOOK3S_64

>   #include <asm/book3s/64/mmu.h>

>   #else /* CONFIG_PPC_BOOK3S_64 */

> diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h

> index a4d38092530a..8abec1461bf7 100644

> --- a/arch/s390/include/asm/page.h

> +++ b/arch/s390/include/asm/page.h

> @@ -180,4 +180,6 @@ static inline int devmem_is_allowed(unsigned long pfn)

>   #include <asm-generic/memory_model.h>

>   #include <asm-generic/getorder.h>

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS CONFIG_MAX_PHYSMEM_BITS

> +

>   #endif /* _S390_PAGE_H */

> diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h

> index 5eef8be3e59f..40c7e12cf09e 100644

> --- a/arch/sh/include/asm/page.h

> +++ b/arch/sh/include/asm/page.h

> @@ -205,4 +205,6 @@ typedef struct page *pgtable_t;

>   #define ARCH_SLAB_MINALIGN	8

>   #endif

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS 32

> +

>   #endif /* __ASM_SH_PAGE_H */

> diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h

> index b76d59edec8c..14e9ca4659d7 100644

> --- a/arch/sparc/include/asm/page_32.h

> +++ b/arch/sparc/include/asm/page_32.h

> @@ -139,4 +139,6 @@ extern unsigned long pfn_base;

>   #include <asm-generic/memory_model.h>

>   #include <asm-generic/getorder.h>

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS 32

> +

>   #endif /* _SPARC_PAGE_H */

> diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h

> index e80f2d5bf62f..6d6f3654ead1 100644

> --- a/arch/sparc/include/asm/page_64.h

> +++ b/arch/sparc/include/asm/page_64.h

> @@ -163,4 +163,6 @@ extern unsigned long PAGE_OFFSET;

>   

>   #include <asm-generic/getorder.h>

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYS_ADDRESS_BITS

> +

>   #endif /* _SPARC64_PAGE_H */

> diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h

> index 6deb6cd236e3..c2eae59e6505 100644

> --- a/arch/x86/include/asm/pgtable-2level_types.h

> +++ b/arch/x86/include/asm/pgtable-2level_types.h

> @@ -38,4 +38,6 @@ typedef union {

>   /* This covers all VMSPLIT_* and VMSPLIT_*_OPT variants */

>   #define PGD_KERNEL_START	(CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS 32

> +

>   #endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */

> diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h

> index 33845d36897c..5fce514a49a0 100644

> --- a/arch/x86/include/asm/pgtable-3level_types.h

> +++ b/arch/x86/include/asm/pgtable-3level_types.h

> @@ -45,7 +45,8 @@ typedef union {

>    */

>   #define PTRS_PER_PTE	512

>   

> -#define MAX_POSSIBLE_PHYSMEM_BITS	36

>   #define PGD_KERNEL_START	(CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS 36

> +

>   #endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */

> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h

> index 84bd9bdc1987..d808cfde3d19 100644

> --- a/arch/x86/include/asm/pgtable_64_types.h

> +++ b/arch/x86/include/asm/pgtable_64_types.h

> @@ -64,8 +64,6 @@ extern unsigned int ptrs_per_p4d;

>   #define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)

>   #define P4D_MASK		(~(P4D_SIZE - 1))

>   

> -#define MAX_POSSIBLE_PHYSMEM_BITS	52

> -

>   #else /* CONFIG_X86_5LEVEL */

>   

>   /*

> @@ -154,4 +152,6 @@ extern unsigned int ptrs_per_p4d;

>   

>   #define PGD_KERNEL_START	((PAGE_SIZE / 2) / sizeof(pgd_t))

>   

> +#define MAX_POSSIBLE_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)

> +

>   #endif /* _ASM_X86_PGTABLE_64_DEFS_H */

> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c

> index 0787d33b80d8..132c20b6fd4f 100644

> --- a/mm/zsmalloc.c

> +++ b/mm/zsmalloc.c

> @@ -80,23 +80,7 @@

>    * as single (unsigned long) handle value.

>    *

>    * Note that object index <obj_idx> starts from 0.

> - *

> - * This is made more complicated by various memory models and PAE.

> - */

> -

> -#ifndef MAX_POSSIBLE_PHYSMEM_BITS

> -#ifdef MAX_PHYSMEM_BITS

> -#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS

> -#else

> -/*

> - * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just

> - * be PAGE_SHIFT

>    */

> -#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG

> -#endif

> -#endif

> -

> -#define _PFN_BITS		(MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)

>   

>   /*

>    * Memory for allocating for handle keeps object position by

> @@ -116,6 +100,25 @@

>    */

>   #define OBJ_ALLOCATED_TAG 1

>   #define OBJ_TAG_BITS 1

> +

> +/*

> + * MAX_POSSIBLE_PHYSMEM_BITS should be defined by all archs using zsmalloc:

> + * Trying to guess it from MAX_PHYSMEM_BITS, or considering it BITS_PER_LONG,

> + * proved to be wrong by not considering PAE capabilities, or using SPARSEMEM

> + * only headers, leading to bad object encoding due to object index overflow.

> + */

> +#ifndef MAX_POSSIBLE_PHYSMEM_BITS

> + #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG

> + #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";

> +#else

> + #ifndef CONFIG_64BIT

> +  #if (MAX_POSSIBLE_PHYSMEM_BITS >= (BITS_PER_LONG + PAGE_SHIFT - OBJ_TAG_BITS))

> +   #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";

> +  #endif

> + #endif

> +#endif

> +

> +#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)

>   #define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)

>   #define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

>   

>
Russell King (Oracle) Dec. 10, 2018, 3:05 p.m. UTC | #2
On Mon, Dec 10, 2018 at 02:35:55PM +0000, Robin Murphy wrote:
> On 10/12/2018 14:21, Rafael David Tinoco wrote:

> >On 32-bit systems, zsmalloc uses HIGHMEM and, when PAE is enabled, the

> >physical frame number might be so big that zsmalloc obj encoding (to

> >location) will break, causing:

> >

> >BUG: KASAN: null-ptr-deref in zs_map_object+0xa4/0x2bc

> >Read of size 4 at addr 00000000 by task mkfs.ext4/623

> >CPU: 2 PID: 623 Comm: mkfs.ext4 Not tainted 4.19.0-rc8-00017-g8239bc6d3307-dirty #15

> >Hardware name: Generic DT based system

> >[<c0418f7c>] (unwind_backtrace) from [<c0410ca4>] (show_stack+0x20/0x24)

> >[<c0410ca4>] (show_stack) from [<c16bd540>] (dump_stack+0xbc/0xe8)

> >[<c16bd540>] (dump_stack) from [<c06cab74>] (kasan_report+0x248/0x390)

> >[<c06cab74>] (kasan_report) from [<c06c94e8>] (__asan_load4+0x78/0xb4)

> >[<c06c94e8>] (__asan_load4) from [<c06ddc24>] (zs_map_object+0xa4/0x2bc)

> >[<c06ddc24>] (zs_map_object) from [<bf0bbbd8>] (zram_bvec_rw.constprop.2+0x324/0x8e4 [zram])

> >[<bf0bbbd8>] (zram_bvec_rw.constprop.2 [zram]) from [<bf0bc3cc>] (zram_make_request+0x234/0x46c [zram])

> >[<bf0bc3cc>] (zram_make_request [zram]) from [<c09aff9c>] (generic_make_request+0x304/0x63c)

> >[<c09aff9c>] (generic_make_request) from [<c09b0320>] (submit_bio+0x4c/0x1c8)

> >[<c09b0320>] (submit_bio) from [<c0743570>] (submit_bh_wbc.constprop.15+0x238/0x26c)

> >[<c0743570>] (submit_bh_wbc.constprop.15) from [<c0746cf8>] (__block_write_full_page+0x524/0x76c)

> >[<c0746cf8>] (__block_write_full_page) from [<c07472c4>] (block_write_full_page+0x1bc/0x1d4)

> >[<c07472c4>] (block_write_full_page) from [<c0748eb4>] (blkdev_writepage+0x24/0x28)

> >[<c0748eb4>] (blkdev_writepage) from [<c064a780>] (__writepage+0x44/0x78)

> >[<c064a780>] (__writepage) from [<c064b50c>] (write_cache_pages+0x3b8/0x800)

> >[<c064b50c>] (write_cache_pages) from [<c064dd78>] (generic_writepages+0x74/0xa0)

> >[<c064dd78>] (generic_writepages) from [<c0748e64>] (blkdev_writepages+0x18/0x1c)

> >[<c0748e64>] (blkdev_writepages) from [<c064e890>] (do_writepages+0x68/0x134)

> >[<c064e890>] (do_writepages) from [<c06368a4>] (__filemap_fdatawrite_range+0xb0/0xf4)

> >[<c06368a4>] (__filemap_fdatawrite_range) from [<c0636b68>] (file_write_and_wait_range+0x64/0xd0)

> >[<c0636b68>] (file_write_and_wait_range) from [<c0747af8>] (blkdev_fsync+0x54/0x84)

> >[<c0747af8>] (blkdev_fsync) from [<c0739dac>] (vfs_fsync_range+0x70/0xd4)

> >[<c0739dac>] (vfs_fsync_range) from [<c0739e98>] (do_fsync+0x4c/0x80)

> >[<c0739e98>] (do_fsync) from [<c073a26c>] (sys_fsync+0x1c/0x20)

> >[<c073a26c>] (sys_fsync) from [<c0401000>] (ret_fast_syscall+0x0/0x2c)

> >

> >when trying to decode (the pfn) and map the object.

> >

> >That happens because one architecture might not re-define

> >MAX_PHYSMEM_BITS, like in this ARM 32-bit w/ LPAE enabled example. For

> >32-bit systems, if not re-defined, MAX_POSSIBLE_PHYSMEM_BITS will

> >default to BITS_PER_LONG (32) in most cases, and, with PAE enabled,

> >_PFN_BITS might be wrong: which may cause obj variable to overflow if

> >frame number is in HIGHMEM and referencing a page above the 4GB

> >watermark.

> >

> >commit 6e00ec00b1a7 ("staging: zsmalloc: calculate MAX_PHYSMEM_BITS if

> >not defined") realized MAX_PHYSMEM_BITS depended on SPARSEMEM headers

> >and "fixed" it by calculating it using BITS_PER_LONG if SPARSEMEM wasn't

> >used, like in the example given above.

> >

> >Systems with potential for PAE exist for a long time and assuming

> >BITS_PER_LONG seems inadequate. Defining MAX_PHYSMEM_BITS looks better,

> >however it is NOT a constant anymore for x86.

> >

> >SO, instead, MAX_POSSIBLE_PHYSMEM_BITS should be defined by every

> >architecture using zsmalloc, together with a sanity check for

> >MAX_POSSIBLE_PHYSMEM_BITS being too big on 32-bit systems.

> >

> >Link: https://bugs.linaro.org/show_bug.cgi?id=3765#c17

> >Signed-off-by: Rafael David Tinoco <rafael.tinoco@linaro.org>

> >---

> >  arch/arm/include/asm/pgtable-2level-types.h |  2 ++

> >  arch/arm/include/asm/pgtable-3level-types.h |  2 ++

> >  arch/arm64/include/asm/pgtable-types.h      |  2 ++

> >  arch/ia64/include/asm/page.h                |  2 ++

> >  arch/mips/include/asm/page.h                |  2 ++

> >  arch/powerpc/include/asm/mmu.h              |  2 ++

> >  arch/s390/include/asm/page.h                |  2 ++

> >  arch/sh/include/asm/page.h                  |  2 ++

> >  arch/sparc/include/asm/page_32.h            |  2 ++

> >  arch/sparc/include/asm/page_64.h            |  2 ++

> >  arch/x86/include/asm/pgtable-2level_types.h |  2 ++

> >  arch/x86/include/asm/pgtable-3level_types.h |  3 +-

> >  arch/x86/include/asm/pgtable_64_types.h     |  4 +--

> >  mm/zsmalloc.c                               | 35 +++++++++++----------

> >  14 files changed, 45 insertions(+), 19 deletions(-)

> >

> >diff --git a/arch/arm/include/asm/pgtable-2level-types.h b/arch/arm/include/asm/pgtable-2level-types.h

> >index 66cb5b0e89c5..552dba411324 100644

> >--- a/arch/arm/include/asm/pgtable-2level-types.h

> >+++ b/arch/arm/include/asm/pgtable-2level-types.h

> >@@ -64,4 +64,6 @@ typedef pteval_t pgprot_t;

> >  #endif /* STRICT_MM_TYPECHECKS */

> >+#define MAX_POSSIBLE_PHYSMEM_BITS 32

> >+

> >  #endif	/* _ASM_PGTABLE_2LEVEL_TYPES_H */

> >diff --git a/arch/arm/include/asm/pgtable-3level-types.h b/arch/arm/include/asm/pgtable-3level-types.h

> >index 921aa30259c4..664c39e6517c 100644

> >--- a/arch/arm/include/asm/pgtable-3level-types.h

> >+++ b/arch/arm/include/asm/pgtable-3level-types.h

> >@@ -67,4 +67,6 @@ typedef pteval_t pgprot_t;

> >  #endif	/* STRICT_MM_TYPECHECKS */

> >+#define MAX_POSSIBLE_PHYSMEM_BITS 36

> 

> Nit: with LPAE, physical addresses go up to 40 bits, not just 36.


Right, with that set at 40, we get:

#define _PFN_BITS               (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)

== 28

#define OBJ_TAG_BITS 1
#define OBJ_INDEX_BITS  (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)

== 3

#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)

== 4

#define ZS_MIN_ALLOC_SIZE \
        MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))

== 4 << 12 >> 3 = 2048

or half-page allocations.

Given this in Documentation/vm/zsmalloc.rst:

On the other hand, if we just use single
(0-order) pages, it would suffer from very high fragmentation --
any object of size PAGE_SIZE/2 or larger would occupy an entire page.
This was one of the major issues with its predecessor (xvmalloc).

It seems that would not be acceptable, so I have to ask whether
using an unsigned long to store PFN + object ID is really acceptable.
Maybe the real solution to this problem is to stop using an
unsigned long to contain both the PFN and object ID?

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 12.1Mbps down 622kbps up
According to speedtest.net: 11.9Mbps down 500kbps up
Kirill A. Shutemov Dec. 10, 2018, 3:15 p.m. UTC | #3
On Mon, Dec 10, 2018 at 12:21:05PM -0200, Rafael David Tinoco wrote:
> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h

> index 84bd9bdc1987..d808cfde3d19 100644

> --- a/arch/x86/include/asm/pgtable_64_types.h

> +++ b/arch/x86/include/asm/pgtable_64_types.h

> @@ -64,8 +64,6 @@ extern unsigned int ptrs_per_p4d;

>  #define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)

>  #define P4D_MASK		(~(P4D_SIZE - 1))

>  

> -#define MAX_POSSIBLE_PHYSMEM_BITS	52

> -

>  #else /* CONFIG_X86_5LEVEL */

>  

>  /*

> @@ -154,4 +152,6 @@ extern unsigned int ptrs_per_p4d;

>  

>  #define PGD_KERNEL_START	((PAGE_SIZE / 2) / sizeof(pgd_t))

>  

> +#define MAX_POSSIBLE_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)

> +


...

>  #endif /* _ASM_X86_PGTABLE_64_DEFS_H */

> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c

> index 0787d33b80d8..132c20b6fd4f 100644

> --- a/mm/zsmalloc.c

> +++ b/mm/zsmalloc.c


...

> @@ -116,6 +100,25 @@

>   */

>  #define OBJ_ALLOCATED_TAG 1

>  #define OBJ_TAG_BITS 1

> +

> +/*

> + * MAX_POSSIBLE_PHYSMEM_BITS should be defined by all archs using zsmalloc:

> + * Trying to guess it from MAX_PHYSMEM_BITS, or considering it BITS_PER_LONG,

> + * proved to be wrong by not considering PAE capabilities, or using SPARSEMEM

> + * only headers, leading to bad object encoding due to object index overflow.

> + */

> +#ifndef MAX_POSSIBLE_PHYSMEM_BITS

> + #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG

> + #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";

> +#else

> + #ifndef CONFIG_64BIT

> +  #if (MAX_POSSIBLE_PHYSMEM_BITS >= (BITS_PER_LONG + PAGE_SHIFT - OBJ_TAG_BITS))

> +   #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";

> +  #endif

> + #endif

> +#endif

> +

> +#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)

>  #define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)

>  #define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)


Have you tested it with CONFIG_X86_5LEVEL=y?

ASAICS, the patch makes OBJ_INDEX_BITS and what depends from it dynamic --
it depends what paging mode we are booting in. ZS_SIZE_CLASSES depends
indirectly on OBJ_INDEX_BITS and I don't see how struct zs_pool definition
can compile with dynamic ZS_SIZE_CLASSES.

Hm?

-- 
 Kirill A. Shutemov
Rafael David Tinoco Dec. 10, 2018, 4:48 p.m. UTC | #4
On 12/10/18 1:15 PM, Kirill A. Shutemov wrote:
> On Mon, Dec 10, 2018 at 12:21:05PM -0200, Rafael David Tinoco wrote:

>> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h

>> index 84bd9bdc1987..d808cfde3d19 100644

>> --- a/arch/x86/include/asm/pgtable_64_types.h

>> +++ b/arch/x86/include/asm/pgtable_64_types.h

>> @@ -64,8 +64,6 @@ extern unsigned int ptrs_per_p4d;

>>  #define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)

>>  #define P4D_MASK		(~(P4D_SIZE - 1))

>>  

>> -#define MAX_POSSIBLE_PHYSMEM_BITS	52

>> -

>>  #else /* CONFIG_X86_5LEVEL */

>>  

>>  /*

>> @@ -154,4 +152,6 @@ extern unsigned int ptrs_per_p4d;

>>  

>>  #define PGD_KERNEL_START	((PAGE_SIZE / 2) / sizeof(pgd_t))

>>  

>> +#define MAX_POSSIBLE_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)

>> +

> 

> ...

> 

>>  #endif /* _ASM_X86_PGTABLE_64_DEFS_H */

>> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c

>> index 0787d33b80d8..132c20b6fd4f 100644

>> --- a/mm/zsmalloc.c

>> +++ b/mm/zsmalloc.c

> 

> ...

> 

>> @@ -116,6 +100,25 @@

>>   */

>>  #define OBJ_ALLOCATED_TAG 1

>>  #define OBJ_TAG_BITS 1

>> +

>> +/*

>> + * MAX_POSSIBLE_PHYSMEM_BITS should be defined by all archs using zsmalloc:

>> + * Trying to guess it from MAX_PHYSMEM_BITS, or considering it BITS_PER_LONG,

>> + * proved to be wrong by not considering PAE capabilities, or using SPARSEMEM

>> + * only headers, leading to bad object encoding due to object index overflow.

>> + */

>> +#ifndef MAX_POSSIBLE_PHYSMEM_BITS

>> + #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG

>> + #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";

>> +#else

>> + #ifndef CONFIG_64BIT

>> +  #if (MAX_POSSIBLE_PHYSMEM_BITS >= (BITS_PER_LONG + PAGE_SHIFT - OBJ_TAG_BITS))

>> +   #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";

>> +  #endif

>> + #endif

>> +#endif

>> +

>> +#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)

>>  #define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)

>>  #define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

> 

> Have you tested it with CONFIG_X86_5LEVEL=y?

> 

> ASAICS, the patch makes OBJ_INDEX_BITS and what depends from it dynamic --

> it depends what paging mode we are booting in. ZS_SIZE_CLASSES depends

> indirectly on OBJ_INDEX_BITS and I don't see how struct zs_pool definition

> can compile with dynamic ZS_SIZE_CLASSES.

> 

> Hm?

> 


You're right, terribly sorry. This was a last time change.

mm/zsmalloc.c:256:21: error: variably modified ‘size_class’ at file scope

I'll revisit the patch. Any other comments are welcome.

Thank you
Rafael David Tinoco Dec. 10, 2018, 4:53 p.m. UTC | #5
On 12/10/18 1:05 PM, Russell King - ARM Linux wrote:
> On Mon, Dec 10, 2018 at 02:35:55PM +0000, Robin Murphy wrote:

>> On 10/12/2018 14:21, Rafael David Tinoco wrote:

>>> On 32-bit systems, zsmalloc uses HIGHMEM and, when PAE is enabled, the

>>> physical frame number might be so big that zsmalloc obj encoding (to

>>> location) will break, causing:

>>>

>>> BUG: KASAN: null-ptr-deref in zs_map_object+0xa4/0x2bc

>>> Read of size 4 at addr 00000000 by task mkfs.ext4/623

>>> CPU: 2 PID: 623 Comm: mkfs.ext4 Not tainted 4.19.0-rc8-00017-g8239bc6d3307-dirty #15

>>> Hardware name: Generic DT based system

>>> [<c0418f7c>] (unwind_backtrace) from [<c0410ca4>] (show_stack+0x20/0x24)

>>> [<c0410ca4>] (show_stack) from [<c16bd540>] (dump_stack+0xbc/0xe8)

>>> [<c16bd540>] (dump_stack) from [<c06cab74>] (kasan_report+0x248/0x390)

>>> [<c06cab74>] (kasan_report) from [<c06c94e8>] (__asan_load4+0x78/0xb4)

>>> [<c06c94e8>] (__asan_load4) from [<c06ddc24>] (zs_map_object+0xa4/0x2bc)

>>> [<c06ddc24>] (zs_map_object) from [<bf0bbbd8>] (zram_bvec_rw.constprop.2+0x324/0x8e4 [zram])

>>> [<bf0bbbd8>] (zram_bvec_rw.constprop.2 [zram]) from [<bf0bc3cc>] (zram_make_request+0x234/0x46c [zram])

>>> [<bf0bc3cc>] (zram_make_request [zram]) from [<c09aff9c>] (generic_make_request+0x304/0x63c)

>>> [<c09aff9c>] (generic_make_request) from [<c09b0320>] (submit_bio+0x4c/0x1c8)

>>> [<c09b0320>] (submit_bio) from [<c0743570>] (submit_bh_wbc.constprop.15+0x238/0x26c)

>>> [<c0743570>] (submit_bh_wbc.constprop.15) from [<c0746cf8>] (__block_write_full_page+0x524/0x76c)

>>> [<c0746cf8>] (__block_write_full_page) from [<c07472c4>] (block_write_full_page+0x1bc/0x1d4)

>>> [<c07472c4>] (block_write_full_page) from [<c0748eb4>] (blkdev_writepage+0x24/0x28)

>>> [<c0748eb4>] (blkdev_writepage) from [<c064a780>] (__writepage+0x44/0x78)

>>> [<c064a780>] (__writepage) from [<c064b50c>] (write_cache_pages+0x3b8/0x800)

>>> [<c064b50c>] (write_cache_pages) from [<c064dd78>] (generic_writepages+0x74/0xa0)

>>> [<c064dd78>] (generic_writepages) from [<c0748e64>] (blkdev_writepages+0x18/0x1c)

>>> [<c0748e64>] (blkdev_writepages) from [<c064e890>] (do_writepages+0x68/0x134)

>>> [<c064e890>] (do_writepages) from [<c06368a4>] (__filemap_fdatawrite_range+0xb0/0xf4)

>>> [<c06368a4>] (__filemap_fdatawrite_range) from [<c0636b68>] (file_write_and_wait_range+0x64/0xd0)

>>> [<c0636b68>] (file_write_and_wait_range) from [<c0747af8>] (blkdev_fsync+0x54/0x84)

>>> [<c0747af8>] (blkdev_fsync) from [<c0739dac>] (vfs_fsync_range+0x70/0xd4)

>>> [<c0739dac>] (vfs_fsync_range) from [<c0739e98>] (do_fsync+0x4c/0x80)

>>> [<c0739e98>] (do_fsync) from [<c073a26c>] (sys_fsync+0x1c/0x20)

>>> [<c073a26c>] (sys_fsync) from [<c0401000>] (ret_fast_syscall+0x0/0x2c)

>>>

>>> when trying to decode (the pfn) and map the object.

>>>

>>> That happens because one architecture might not re-define

>>> MAX_PHYSMEM_BITS, like in this ARM 32-bit w/ LPAE enabled example. For

>>> 32-bit systems, if not re-defined, MAX_POSSIBLE_PHYSMEM_BITS will

>>> default to BITS_PER_LONG (32) in most cases, and, with PAE enabled,

>>> _PFN_BITS might be wrong: which may cause obj variable to overflow if

>>> frame number is in HIGHMEM and referencing a page above the 4GB

>>> watermark.

>>>

>>> commit 6e00ec00b1a7 ("staging: zsmalloc: calculate MAX_PHYSMEM_BITS if

>>> not defined") realized MAX_PHYSMEM_BITS depended on SPARSEMEM headers

>>> and "fixed" it by calculating it using BITS_PER_LONG if SPARSEMEM wasn't

>>> used, like in the example given above.

>>>

>>> Systems with potential for PAE exist for a long time and assuming

>>> BITS_PER_LONG seems inadequate. Defining MAX_PHYSMEM_BITS looks better,

>>> however it is NOT a constant anymore for x86.

>>>

>>> SO, instead, MAX_POSSIBLE_PHYSMEM_BITS should be defined by every

>>> architecture using zsmalloc, together with a sanity check for

>>> MAX_POSSIBLE_PHYSMEM_BITS being too big on 32-bit systems.

>>>

>>> Link: https://bugs.linaro.org/show_bug.cgi?id=3765#c17

>>> Signed-off-by: Rafael David Tinoco <rafael.tinoco@linaro.org>

>>> ---

>>>  arch/arm/include/asm/pgtable-2level-types.h |  2 ++

>>>  arch/arm/include/asm/pgtable-3level-types.h |  2 ++

>>>  arch/arm64/include/asm/pgtable-types.h      |  2 ++

>>>  arch/ia64/include/asm/page.h                |  2 ++

>>>  arch/mips/include/asm/page.h                |  2 ++

>>>  arch/powerpc/include/asm/mmu.h              |  2 ++

>>>  arch/s390/include/asm/page.h                |  2 ++

>>>  arch/sh/include/asm/page.h                  |  2 ++

>>>  arch/sparc/include/asm/page_32.h            |  2 ++

>>>  arch/sparc/include/asm/page_64.h            |  2 ++

>>>  arch/x86/include/asm/pgtable-2level_types.h |  2 ++

>>>  arch/x86/include/asm/pgtable-3level_types.h |  3 +-

>>>  arch/x86/include/asm/pgtable_64_types.h     |  4 +--

>>>  mm/zsmalloc.c                               | 35 +++++++++++----------

>>>  14 files changed, 45 insertions(+), 19 deletions(-)

>>>

>>> diff --git a/arch/arm/include/asm/pgtable-2level-types.h b/arch/arm/include/asm/pgtable-2level-types.h

>>> index 66cb5b0e89c5..552dba411324 100644

>>> --- a/arch/arm/include/asm/pgtable-2level-types.h

>>> +++ b/arch/arm/include/asm/pgtable-2level-types.h

>>> @@ -64,4 +64,6 @@ typedef pteval_t pgprot_t;

>>>  #endif /* STRICT_MM_TYPECHECKS */

>>> +#define MAX_POSSIBLE_PHYSMEM_BITS 32

>>> +

>>>  #endif	/* _ASM_PGTABLE_2LEVEL_TYPES_H */

>>> diff --git a/arch/arm/include/asm/pgtable-3level-types.h b/arch/arm/include/asm/pgtable-3level-types.h

>>> index 921aa30259c4..664c39e6517c 100644

>>> --- a/arch/arm/include/asm/pgtable-3level-types.h

>>> +++ b/arch/arm/include/asm/pgtable-3level-types.h

>>> @@ -67,4 +67,6 @@ typedef pteval_t pgprot_t;

>>>  #endif	/* STRICT_MM_TYPECHECKS */

>>> +#define MAX_POSSIBLE_PHYSMEM_BITS 36

>>

>> Nit: with LPAE, physical addresses go up to 40 bits, not just 36.


Hum, I got it from where it was being defined when having SPARSEMEM
enabled (#define MAX_PHYSMEM_BITS 36 in
arch/arm/include/asm/sparsemem.h), since without SPARSEMEM it would
default to BITS_PER_LONG.

> Right, with that set at 40, we get:

> 

> #define _PFN_BITS               (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)

> 

> == 28

> 

> #define OBJ_TAG_BITS 1

> #define OBJ_INDEX_BITS  (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)

> 

> == 3

> 

> #define ZS_MAX_ZSPAGE_ORDER 2

> #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)

> 

> == 4

> 

> #define ZS_MIN_ALLOC_SIZE \

>         MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))

> 

> == 4 << 12 >> 3 = 2048

> 

> or half-page allocations.

> 

> Given this in Documentation/vm/zsmalloc.rst:

> 

> On the other hand, if we just use single

> (0-order) pages, it would suffer from very high fragmentation --

> any object of size PAGE_SIZE/2 or larger would occupy an entire page.

> This was one of the major issues with its predecessor (xvmalloc).

> 

> It seems that would not be acceptable, so I have to ask whether

> using an unsigned long to store PFN + object ID is really acceptable.

> Maybe the real solution to this problem is to stop using an

> unsigned long to contain both the PFN and object ID?

>
kernel test robot Dec. 13, 2018, 6:17 a.m. UTC | #6
Hi Rafael,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.20-rc6 next-20181212]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Rafael-David-Tinoco/mm-zsmalloc-c-Fix-zsmalloc-32-bit-PAE-support/20181211-020704
config: mips-allmodconfig (attached as .config)
compiler: mips-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=mips 

All error/warnings (new ones prefixed by >>):

>> mm/zsmalloc.c:116:5: error: #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";

       #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";
        ^~~~~
   In file included from include/linux/cache.h:5:0,
                    from arch/mips/include/asm/cpu-info.h:15,
                    from arch/mips/include/asm/cpu-features.h:13,
                    from arch/mips/include/asm/bitops.h:21,
                    from include/linux/bitops.h:19,
                    from include/linux/kernel.h:11,
                    from include/linux/list.h:9,
                    from include/linux/module.h:9,
                    from mm/zsmalloc.c:33:
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/uapi/linux/kernel.h:13:40: note: in definition of macro '__KERNEL_DIV_ROUND_UP'
    #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
                                           ^
>> mm/zsmalloc.c:133:2: note: in expansion of macro 'MAX'

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
     ^~~
>> mm/zsmalloc.c:151:59: note: in expansion of macro 'ZS_MIN_ALLOC_SIZE'

    #define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
                                                              ^~~~~~~~~~~~~~~~~
>> mm/zsmalloc.c:256:32: note: in expansion of macro 'ZS_SIZE_CLASSES'

     struct size_class *size_class[ZS_SIZE_CLASSES];
                                   ^~~~~~~~~~~~~~~
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/uapi/linux/kernel.h:13:40: note: in definition of macro '__KERNEL_DIV_ROUND_UP'
    #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
                                           ^
>> mm/zsmalloc.c:133:2: note: in expansion of macro 'MAX'

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
     ^~~
>> mm/zsmalloc.c:151:59: note: in expansion of macro 'ZS_MIN_ALLOC_SIZE'

    #define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
                                                              ^~~~~~~~~~~~~~~~~
>> mm/zsmalloc.c:256:32: note: in expansion of macro 'ZS_SIZE_CLASSES'

     struct size_class *size_class[ZS_SIZE_CLASSES];
                                   ^~~~~~~~~~~~~~~
>> mm/zsmalloc.c:256:21: error: variably modified 'size_class' at file scope

     struct size_class *size_class[ZS_SIZE_CLASSES];
                        ^~~~~~~~~~
   In file included from include/linux/kernel.h:10:0,
                    from include/linux/list.h:9,
                    from include/linux/module.h:9,
                    from mm/zsmalloc.c:33:
   mm/zsmalloc.c: In function 'get_size_class_index':
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/linux/compiler.h:76:40: note: in definition of macro 'likely'
    # define likely(x) __builtin_expect(!!(x), 1)
                                           ^
>> mm/zsmalloc.c:133:2: note: in expansion of macro 'MAX'

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
     ^~~
   mm/zsmalloc.c:543:20: note: in expansion of macro 'ZS_MIN_ALLOC_SIZE'
     if (likely(size > ZS_MIN_ALLOC_SIZE))
                       ^~~~~~~~~~~~~~~~~
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/linux/compiler.h:76:40: note: in definition of macro 'likely'
    # define likely(x) __builtin_expect(!!(x), 1)
                                           ^
>> mm/zsmalloc.c:133:2: note: in expansion of macro 'MAX'

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
     ^~~
   mm/zsmalloc.c:543:20: note: in expansion of macro 'ZS_MIN_ALLOC_SIZE'
     if (likely(size > ZS_MIN_ALLOC_SIZE))
                       ^~~~~~~~~~~~~~~~~
   In file included from include/linux/cache.h:5:0,
                    from arch/mips/include/asm/cpu-info.h:15,
                    from arch/mips/include/asm/cpu-features.h:13,
                    from arch/mips/include/asm/bitops.h:21,
                    from include/linux/bitops.h:19,
                    from include/linux/kernel.h:11,
                    from include/linux/list.h:9,
                    from include/linux/module.h:9,
                    from mm/zsmalloc.c:33:
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/uapi/linux/kernel.h:13:40: note: in definition of macro '__KERNEL_DIV_ROUND_UP'
    #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
                                           ^
>> mm/zsmalloc.c:133:2: note: in expansion of macro 'MAX'

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
     ^~~
   mm/zsmalloc.c:544:29: note: in expansion of macro 'ZS_MIN_ALLOC_SIZE'
      idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
                                ^~~~~~~~~~~~~~~~~
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/uapi/linux/kernel.h:13:40: note: in definition of macro '__KERNEL_DIV_ROUND_UP'
    #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
                                           ^
>> mm/zsmalloc.c:133:2: note: in expansion of macro 'MAX'

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
     ^~~
   mm/zsmalloc.c:544:29: note: in expansion of macro 'ZS_MIN_ALLOC_SIZE'
      idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
                                ^~~~~~~~~~~~~~~~~
   In file included from include/linux/list.h:9:0,
                    from include/linux/module.h:9,
                    from mm/zsmalloc.c:33:
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]

     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/linux/kernel.h:861:27: note: in definition of macro '__cmp'
    #define __cmp(x, y, op) ((x) op (y) ? (x) : (y))
                              ^
   include/linux/kernel.h:937:27: note: in expansion of macro '__careful_cmp'
    #define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <)
                              ^~~~~~~~~~~~~
>> mm/zsmalloc.c:547:9: note: in expansion of macro 'min_t'

     return min_t(int, ZS_SIZE_CLASSES - 1, idx);
            ^~~~~

vim +116 mm/zsmalloc.c

    32	
  > 33	#include <linux/module.h>

    34	#include <linux/kernel.h>
    35	#include <linux/sched.h>
    36	#include <linux/magic.h>
    37	#include <linux/bitops.h>
    38	#include <linux/errno.h>
    39	#include <linux/highmem.h>
    40	#include <linux/string.h>
    41	#include <linux/slab.h>
    42	#include <asm/tlbflush.h>
    43	#include <asm/pgtable.h>
    44	#include <linux/cpumask.h>
    45	#include <linux/cpu.h>
    46	#include <linux/vmalloc.h>
    47	#include <linux/preempt.h>
    48	#include <linux/spinlock.h>
    49	#include <linux/shrinker.h>
    50	#include <linux/types.h>
    51	#include <linux/debugfs.h>
    52	#include <linux/zsmalloc.h>
    53	#include <linux/zpool.h>
    54	#include <linux/mount.h>
    55	#include <linux/migrate.h>
    56	#include <linux/pagemap.h>
    57	#include <linux/fs.h>
    58	
    59	#define ZSPAGE_MAGIC	0x58
    60	
    61	/*
    62	 * This must be power of 2 and greater than of equal to sizeof(link_free).
    63	 * These two conditions ensure that any 'struct link_free' itself doesn't
    64	 * span more than 1 page which avoids complex case of mapping 2 pages simply
    65	 * to restore link_free pointer values.
    66	 */
    67	#define ZS_ALIGN		8
    68	
    69	/*
    70	 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
    71	 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
    72	 */
    73	#define ZS_MAX_ZSPAGE_ORDER 2
    74	#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
    75	
    76	#define ZS_HANDLE_SIZE (sizeof(unsigned long))
    77	
    78	/*
    79	 * Object location (<PFN>, <obj_idx>) is encoded as
    80	 * as single (unsigned long) handle value.
    81	 *
    82	 * Note that object index <obj_idx> starts from 0.
    83	 */
    84	
    85	/*
    86	 * Memory for allocating for handle keeps object position by
    87	 * encoding <page, obj_idx> and the encoded value has a room
    88	 * in least bit(ie, look at obj_to_location).
    89	 * We use the bit to synchronize between object access by
    90	 * user and migration.
    91	 */
    92	#define HANDLE_PIN_BIT	0
    93	
    94	/*
    95	 * Head in allocated object should have OBJ_ALLOCATED_TAG
    96	 * to identify the object was allocated or not.
    97	 * It's okay to add the status bit in the least bit because
    98	 * header keeps handle which is 4byte-aligned address so we
    99	 * have room for two bit at least.
   100	 */
   101	#define OBJ_ALLOCATED_TAG 1
   102	#define OBJ_TAG_BITS 1
   103	
   104	/*
   105	 * MAX_POSSIBLE_PHYSMEM_BITS should be defined by all archs using zsmalloc:
   106	 * Trying to guess it from MAX_PHYSMEM_BITS, or considering it BITS_PER_LONG,
   107	 * proved to be wrong by not considering PAE capabilities, or using SPARSEMEM
   108	 * only headers, leading to bad object encoding due to object index overflow.
   109	 */
   110	#ifndef MAX_POSSIBLE_PHYSMEM_BITS
   111	 #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
   112	 #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";
   113	#else
   114	 #ifndef CONFIG_64BIT
   115	  #if (MAX_POSSIBLE_PHYSMEM_BITS >= (BITS_PER_LONG + PAGE_SHIFT - OBJ_TAG_BITS))
 > 116	   #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";

   117	  #endif
   118	 #endif
   119	#endif
   120	
   121	#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
   122	#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
 > 123	#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

   124	
   125	#define FULLNESS_BITS	2
   126	#define CLASS_BITS	8
   127	#define ISOLATED_BITS	3
   128	#define MAGIC_VAL_BITS	8
   129	
   130	#define MAX(a, b) ((a) >= (b) ? (a) : (b))
   131	/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
   132	#define ZS_MIN_ALLOC_SIZE \
 > 133		MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))

   134	/* each chunk includes extra space to keep handle */
   135	#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
   136	
   137	/*
   138	 * On systems with 4K page size, this gives 255 size classes! There is a
   139	 * trader-off here:
   140	 *  - Large number of size classes is potentially wasteful as free page are
   141	 *    spread across these classes
   142	 *  - Small number of size classes causes large internal fragmentation
   143	 *  - Probably its better to use specific size classes (empirically
   144	 *    determined). NOTE: all those class sizes must be set as multiple of
   145	 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
   146	 *
   147	 *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
   148	 *  (reason above)
   149	 */
   150	#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> CLASS_BITS)
 > 151	#define ZS_SIZE_CLASSES	(DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \

   152					      ZS_SIZE_CLASS_DELTA) + 1)
   153	
   154	enum fullness_group {
   155		ZS_EMPTY,
   156		ZS_ALMOST_EMPTY,
   157		ZS_ALMOST_FULL,
   158		ZS_FULL,
   159		NR_ZS_FULLNESS,
   160	};
   161	
   162	enum zs_stat_type {
   163		CLASS_EMPTY,
   164		CLASS_ALMOST_EMPTY,
   165		CLASS_ALMOST_FULL,
   166		CLASS_FULL,
   167		OBJ_ALLOCATED,
   168		OBJ_USED,
   169		NR_ZS_STAT_TYPE,
   170	};
   171	
   172	struct zs_size_stat {
   173		unsigned long objs[NR_ZS_STAT_TYPE];
   174	};
   175	
   176	#ifdef CONFIG_ZSMALLOC_STAT
   177	static struct dentry *zs_stat_root;
   178	#endif
   179	
   180	#ifdef CONFIG_COMPACTION
   181	static struct vfsmount *zsmalloc_mnt;
   182	#endif
   183	
   184	/*
   185	 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
   186	 *	n <= N / f, where
   187	 * n = number of allocated objects
   188	 * N = total number of objects zspage can store
   189	 * f = fullness_threshold_frac
   190	 *
   191	 * Similarly, we assign zspage to:
   192	 *	ZS_ALMOST_FULL	when n > N / f
   193	 *	ZS_EMPTY	when n == 0
   194	 *	ZS_FULL		when n == N
   195	 *
   196	 * (see: fix_fullness_group())
   197	 */
   198	static const int fullness_threshold_frac = 4;
   199	static size_t huge_class_size;
   200	
   201	struct size_class {
   202		spinlock_t lock;
   203		struct list_head fullness_list[NR_ZS_FULLNESS];
   204		/*
   205		 * Size of objects stored in this class. Must be multiple
   206		 * of ZS_ALIGN.
   207		 */
   208		int size;
   209		int objs_per_zspage;
   210		/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
   211		int pages_per_zspage;
   212	
   213		unsigned int index;
   214		struct zs_size_stat stats;
   215	};
   216	
   217	/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
   218	static void SetPageHugeObject(struct page *page)
   219	{
   220		SetPageOwnerPriv1(page);
   221	}
   222	
   223	static void ClearPageHugeObject(struct page *page)
   224	{
   225		ClearPageOwnerPriv1(page);
   226	}
   227	
   228	static int PageHugeObject(struct page *page)
   229	{
   230		return PageOwnerPriv1(page);
   231	}
   232	
   233	/*
   234	 * Placed within free objects to form a singly linked list.
   235	 * For every zspage, zspage->freeobj gives head of this list.
   236	 *
   237	 * This must be power of 2 and less than or equal to ZS_ALIGN
   238	 */
   239	struct link_free {
   240		union {
   241			/*
   242			 * Free object index;
   243			 * It's valid for non-allocated object
   244			 */
   245			unsigned long next;
   246			/*
   247			 * Handle of allocated object.
   248			 */
   249			unsigned long handle;
   250		};
   251	};
   252	
   253	struct zs_pool {
   254		const char *name;
   255	
 > 256		struct size_class *size_class[ZS_SIZE_CLASSES];

   257		struct kmem_cache *handle_cachep;
   258		struct kmem_cache *zspage_cachep;
   259	
   260		atomic_long_t pages_allocated;
   261	
   262		struct zs_pool_stats stats;
   263	
   264		/* Compact classes */
   265		struct shrinker shrinker;
   266	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot Dec. 13, 2018, 7:27 a.m. UTC | #7
Hi Rafael,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.20-rc6 next-20181212]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Rafael-David-Tinoco/mm-zsmalloc-c-Fix-zsmalloc-32-bit-PAE-support/20181211-020704
config: xtensa-allmodconfig (attached as .config)
compiler: xtensa-linux-gcc (GCC) 8.1.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=8.1.0 make.cross ARCH=xtensa 

All errors (new ones prefixed by >>):

>> mm/zsmalloc.c:112:3: error: #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";

     #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";
      ^~~~~

vim +112 mm/zsmalloc.c

   103	
   104	/*
   105	 * MAX_POSSIBLE_PHYSMEM_BITS should be defined by all archs using zsmalloc:
   106	 * Trying to guess it from MAX_PHYSMEM_BITS, or considering it BITS_PER_LONG,
   107	 * proved to be wrong by not considering PAE capabilities, or using SPARSEMEM
   108	 * only headers, leading to bad object encoding due to object index overflow.
   109	 */
   110	#ifndef MAX_POSSIBLE_PHYSMEM_BITS
   111	 #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
 > 112	 #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";

   113	#else
   114	 #ifndef CONFIG_64BIT
   115	  #if (MAX_POSSIBLE_PHYSMEM_BITS >= (BITS_PER_LONG + PAGE_SHIFT - OBJ_TAG_BITS))
   116	   #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";
   117	  #endif
   118	 #endif
   119	#endif
   120	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
diff mbox series

Patch

diff --git a/arch/arm/include/asm/pgtable-2level-types.h b/arch/arm/include/asm/pgtable-2level-types.h
index 66cb5b0e89c5..552dba411324 100644
--- a/arch/arm/include/asm/pgtable-2level-types.h
+++ b/arch/arm/include/asm/pgtable-2level-types.h
@@ -64,4 +64,6 @@  typedef pteval_t pgprot_t;
 
 #endif /* STRICT_MM_TYPECHECKS */
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 32
+
 #endif	/* _ASM_PGTABLE_2LEVEL_TYPES_H */
diff --git a/arch/arm/include/asm/pgtable-3level-types.h b/arch/arm/include/asm/pgtable-3level-types.h
index 921aa30259c4..664c39e6517c 100644
--- a/arch/arm/include/asm/pgtable-3level-types.h
+++ b/arch/arm/include/asm/pgtable-3level-types.h
@@ -67,4 +67,6 @@  typedef pteval_t pgprot_t;
 
 #endif	/* STRICT_MM_TYPECHECKS */
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 36
+
 #endif	/* _ASM_PGTABLE_3LEVEL_TYPES_H */
diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h
index 345a072b5856..45c3834eb4c8 100644
--- a/arch/arm64/include/asm/pgtable-types.h
+++ b/arch/arm64/include/asm/pgtable-types.h
@@ -64,4 +64,6 @@  typedef struct { pteval_t pgprot; } pgprot_t;
 #include <asm-generic/5level-fixup.h>
 #endif
 
+#define MAX_POSSIBLE_PHYSMEM_BITS CONFIG_ARM64_PA_BITS
+
 #endif	/* __ASM_PGTABLE_TYPES_H */
diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
index 5798bd2b462c..a3e055979e46 100644
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -235,4 +235,6 @@  get_order (unsigned long size)
 
 #define __HAVE_ARCH_GATE_AREA	1
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 50
+
 #endif /* _ASM_IA64_PAGE_H */
diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h
index e8cc328fce2d..f6a5dea1a66c 100644
--- a/arch/mips/include/asm/page.h
+++ b/arch/mips/include/asm/page.h
@@ -263,4 +263,6 @@  extern int __virt_addr_valid(const volatile void *kaddr);
 #include <asm-generic/memory_model.h>
 #include <asm-generic/getorder.h>
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 48
+
 #endif /* _ASM_PAGE_H */
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index eb20eb3b8fb0..2ebc1d2d9a5c 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -324,6 +324,8 @@  static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
 #define MAX_PHYSMEM_BITS        46
 #endif
 
+#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
+
 #ifdef CONFIG_PPC_BOOK3S_64
 #include <asm/book3s/64/mmu.h>
 #else /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index a4d38092530a..8abec1461bf7 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -180,4 +180,6 @@  static inline int devmem_is_allowed(unsigned long pfn)
 #include <asm-generic/memory_model.h>
 #include <asm-generic/getorder.h>
 
+#define MAX_POSSIBLE_PHYSMEM_BITS CONFIG_MAX_PHYSMEM_BITS
+
 #endif /* _S390_PAGE_H */
diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h
index 5eef8be3e59f..40c7e12cf09e 100644
--- a/arch/sh/include/asm/page.h
+++ b/arch/sh/include/asm/page.h
@@ -205,4 +205,6 @@  typedef struct page *pgtable_t;
 #define ARCH_SLAB_MINALIGN	8
 #endif
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 32
+
 #endif /* __ASM_SH_PAGE_H */
diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h
index b76d59edec8c..14e9ca4659d7 100644
--- a/arch/sparc/include/asm/page_32.h
+++ b/arch/sparc/include/asm/page_32.h
@@ -139,4 +139,6 @@  extern unsigned long pfn_base;
 #include <asm-generic/memory_model.h>
 #include <asm-generic/getorder.h>
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 32
+
 #endif /* _SPARC_PAGE_H */
diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index e80f2d5bf62f..6d6f3654ead1 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -163,4 +163,6 @@  extern unsigned long PAGE_OFFSET;
 
 #include <asm-generic/getorder.h>
 
+#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYS_ADDRESS_BITS
+
 #endif /* _SPARC64_PAGE_H */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index 6deb6cd236e3..c2eae59e6505 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -38,4 +38,6 @@  typedef union {
 /* This covers all VMSPLIT_* and VMSPLIT_*_OPT variants */
 #define PGD_KERNEL_START	(CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 32
+
 #endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 33845d36897c..5fce514a49a0 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -45,7 +45,8 @@  typedef union {
  */
 #define PTRS_PER_PTE	512
 
-#define MAX_POSSIBLE_PHYSMEM_BITS	36
 #define PGD_KERNEL_START	(CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 36
+
 #endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 84bd9bdc1987..d808cfde3d19 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -64,8 +64,6 @@  extern unsigned int ptrs_per_p4d;
 #define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)
 #define P4D_MASK		(~(P4D_SIZE - 1))
 
-#define MAX_POSSIBLE_PHYSMEM_BITS	52
-
 #else /* CONFIG_X86_5LEVEL */
 
 /*
@@ -154,4 +152,6 @@  extern unsigned int ptrs_per_p4d;
 
 #define PGD_KERNEL_START	((PAGE_SIZE / 2) / sizeof(pgd_t))
 
+#define MAX_POSSIBLE_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)
+
 #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0787d33b80d8..132c20b6fd4f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -80,23 +80,7 @@ 
  * as single (unsigned long) handle value.
  *
  * Note that object index <obj_idx> starts from 0.
- *
- * This is made more complicated by various memory models and PAE.
- */
-
-#ifndef MAX_POSSIBLE_PHYSMEM_BITS
-#ifdef MAX_PHYSMEM_BITS
-#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
-#else
-/*
- * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
- * be PAGE_SHIFT
  */
-#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
-#endif
-#endif
-
-#define _PFN_BITS		(MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
 
 /*
  * Memory for allocating for handle keeps object position by
@@ -116,6 +100,25 @@ 
  */
 #define OBJ_ALLOCATED_TAG 1
 #define OBJ_TAG_BITS 1
+
+/*
+ * MAX_POSSIBLE_PHYSMEM_BITS should be defined by all archs using zsmalloc:
+ * Trying to guess it from MAX_PHYSMEM_BITS, or considering it BITS_PER_LONG,
+ * proved to be wrong by not considering PAE capabilities, or using SPARSEMEM
+ * only headers, leading to bad object encoding due to object index overflow.
+ */
+#ifndef MAX_POSSIBLE_PHYSMEM_BITS
+ #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
+ #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";
+#else
+ #ifndef CONFIG_64BIT
+  #if (MAX_POSSIBLE_PHYSMEM_BITS >= (BITS_PER_LONG + PAGE_SHIFT - OBJ_TAG_BITS))
+   #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";
+  #endif
+ #endif
+#endif
+
+#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
 #define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
 #define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)