mbox series

[bpf-next,v2,0/3] xdpsock: allow mmap2 usage for 32bits

Message ID 20190815121356.8848-1-ivan.khoronzhuk@linaro.org
Headers show
Series xdpsock: allow mmap2 usage for 32bits | expand

Message

Ivan Khoronzhuk Aug. 15, 2019, 12:13 p.m. UTC
This patchset contains several improvements for af_xdp socket umem
mappings on 32-bit systems. Also, there is one more patch outside of
this series that is in the linux-next tree and is related to mmap2 af_xdp
umem offsets: "mm: mmap: increase sockets maximum memory size pgoff for 32bits"
https://lkml.org/lkml/2019/8/12/549

Based on bpf-next/master

Prev: https://lkml.org/lkml/2019/8/13/437

v2..v1:
	- replaced "libbpf: add asm/unistd.h to xsk to get __NR_mmap2" with
	 "libbpf: use LFS (_FILE_OFFSET_BITS) instead of direct mmap2
	 syscall"
	- use vmap along with page_address to avoid overkill
	- define the mmap syscall program in tracex5 only if __NR_mmap is defined

Ivan Khoronzhuk (3):
  libbpf: use LFS (_FILE_OFFSET_BITS) instead of direct mmap2 syscall
  xdp: xdp_umem: replace kmap on vmap for umem map
  samples: bpf: syscal_nrs: use mmap2 if defined

 net/xdp/xdp_umem.c         | 36 +++++++++++++++++++++++-----
 samples/bpf/syscall_nrs.c  |  6 +++++
 samples/bpf/tracex5_kern.c | 13 ++++++++++
 tools/lib/bpf/Makefile     |  1 +
 tools/lib/bpf/xsk.c        | 49 +++++++++++---------------------------
 5 files changed, 64 insertions(+), 41 deletions(-)

-- 
2.17.1

Comments

Jonathan Lemon Aug. 15, 2019, 6:23 p.m. UTC | #1
On 15 Aug 2019, at 5:13, Ivan Khoronzhuk wrote:

> For 64-bit there is no reason to use vmap/vunmap, so use page_address

> as it was initially. For 32 bits, in some apps, like the xdpsock_user.c

> sample, when the number of pages in use is quite big, the kmap

> memory may not be enough. Besides this, kmap appears to be

> deprecated in such cases, as it can block, and should rather be used

> for dynamic mm.

>

> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>

> ---

>  net/xdp/xdp_umem.c | 36 ++++++++++++++++++++++++++++++------

>  1 file changed, 30 insertions(+), 6 deletions(-)

>

> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c

> index a0607969f8c0..d740c4f8810c 100644

> --- a/net/xdp/xdp_umem.c

> +++ b/net/xdp/xdp_umem.c

> @@ -14,7 +14,7 @@

>  #include <linux/netdevice.h>

>  #include <linux/rtnetlink.h>

>  #include <linux/idr.h>

> -#include <linux/highmem.h>

> +#include <linux/vmalloc.h>

>

>  #include "xdp_umem.h"

>  #include "xsk_queue.h"

> @@ -170,7 +170,30 @@ static void xdp_umem_unmap_pages(struct xdp_umem 

> *umem)

>  	unsigned int i;

>

>  	for (i = 0; i < umem->npgs; i++)

> -		kunmap(umem->pgs[i]);

> +		if (PageHighMem(umem->pgs[i]))

> +			vunmap(umem->pages[i].addr);

> +}

> +

> +static int xdp_umem_map_pages(struct xdp_umem *umem)

> +{

> +	unsigned int i;

> +	void *addr;

> +

> +	for (i = 0; i < umem->npgs; i++) {

> +		if (PageHighMem(umem->pgs[i]))

> +			addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);

> +		else

> +			addr = page_address(umem->pgs[i]);

> +

> +		if (!addr) {

> +			xdp_umem_unmap_pages(umem);

> +			return -ENOMEM;

> +		}

> +

> +		umem->pages[i].addr = addr;

> +	}

> +

> +	return 0;

>  }


You'll want a __xdp_umem_unmap_pages() helper here that takes a
count of the number of pages to unmap, so it can be called from
xdp_umem_unmap_pages() in the normal case, and xdp_umem_map_pages()
in the error case.  Otherwise the error case ends up calling
PageHighMem on a null page.
-- 
Jonathan

>  static void xdp_umem_unpin_pages(struct xdp_umem *umem)

> @@ -312,7 +335,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, 

> struct xdp_umem_reg *mr)

>  	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;

>  	unsigned int chunks, chunks_per_page;

>  	u64 addr = mr->addr, size = mr->len;

> -	int size_chk, err, i;

> +	int size_chk, err;

>

>  	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) 

> {

>  		/* Strictly speaking we could support this, if:

> @@ -378,10 +401,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, 

> struct xdp_umem_reg *mr)

>  		goto out_account;

>  	}

>

> -	for (i = 0; i < umem->npgs; i++)

> -		umem->pages[i].addr = kmap(umem->pgs[i]);

> +	err = xdp_umem_map_pages(umem);

> +	if (!err)

> +		return 0;

>

> -	return 0;

> +	kfree(umem->pages);

>

>  out_account:

>  	xdp_umem_unaccount_pages(umem);

> -- 

> 2.17.1
Ivan Khoronzhuk Aug. 15, 2019, 7:19 p.m. UTC | #2
On Thu, Aug 15, 2019 at 11:23:16AM -0700, Jonathan Lemon wrote:
>On 15 Aug 2019, at 5:13, Ivan Khoronzhuk wrote:

>

>>For 64-bit there is no reason to use vmap/vunmap, so use page_address

>>as it was initially. For 32 bits, in some apps, like in samples

>>xdpsock_user.c when number of pgs in use is quite big, the kmap

>>memory can be not enough, despite on this, kmap looks like is

>>deprecated in such cases as it can block and should be used rather

>>for dynamic mm.

>>

>>Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>

>>---

>> net/xdp/xdp_umem.c | 36 ++++++++++++++++++++++++++++++------

>> 1 file changed, 30 insertions(+), 6 deletions(-)

>>

>>diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c

>>index a0607969f8c0..d740c4f8810c 100644

>>--- a/net/xdp/xdp_umem.c

>>+++ b/net/xdp/xdp_umem.c

>>@@ -14,7 +14,7 @@

>> #include <linux/netdevice.h>

>> #include <linux/rtnetlink.h>

>> #include <linux/idr.h>

>>-#include <linux/highmem.h>

>>+#include <linux/vmalloc.h>

>>

>> #include "xdp_umem.h"

>> #include "xsk_queue.h"

>>@@ -170,7 +170,30 @@ static void xdp_umem_unmap_pages(struct 

>>xdp_umem *umem)

>> 	unsigned int i;

>>

>> 	for (i = 0; i < umem->npgs; i++)

>>-		kunmap(umem->pgs[i]);

>>+		if (PageHighMem(umem->pgs[i]))

>>+			vunmap(umem->pages[i].addr);

>>+}

>>+

>>+static int xdp_umem_map_pages(struct xdp_umem *umem)

>>+{

>>+	unsigned int i;

>>+	void *addr;

>>+

>>+	for (i = 0; i < umem->npgs; i++) {

>>+		if (PageHighMem(umem->pgs[i]))

>>+			addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);

>>+		else

>>+			addr = page_address(umem->pgs[i]);

>>+

>>+		if (!addr) {

>>+			xdp_umem_unmap_pages(umem);

>>+			return -ENOMEM;

>>+		}

>>+

>>+		umem->pages[i].addr = addr;

>>+	}

>>+

>>+	return 0;

>> }

>

>You'll want a __xdp_umem_unmap_pages() helper here that takes an

>count of the number of pages to unmap, so it can be called from

>xdp_umem_unmap_pages() in the normal case, and xdp_umem_map_pages()

>in the error case.  Otherwise the error case ends up calling

>PageHighMem on a null page.

>-- 

>Jonathan


Do you mean a null address?
If so, vunmap() does nothing when the address is null, and addr is null if it
has not been assigned... and it is not assigned without a correct mapping...

If you mean a null page, that is not possible, since all the pages are
pinned above, in xdp_umem_pin_pages(), and thus assigned.

Or did I miss something?

Apart from this, it seems there should be one more patch that adds unpinning of
the pages in the error path, but that is not related to this change. I will do
this in a follow-up fix patch, if there is no objection to my explanation, of course.

-- 
Regards,
Ivan Khoronzhuk
Jonathan Lemon Aug. 15, 2019, 7:32 p.m. UTC | #3
On 15 Aug 2019, at 12:19, Ivan Khoronzhuk wrote:

> On Thu, Aug 15, 2019 at 11:23:16AM -0700, Jonathan Lemon wrote:

>> On 15 Aug 2019, at 5:13, Ivan Khoronzhuk wrote:

>>

>>> For 64-bit there is no reason to use vmap/vunmap, so use 

>>> page_address

>>> as it was initially. For 32 bits, in some apps, like in samples

>>> xdpsock_user.c when number of pgs in use is quite big, the kmap

>>> memory can be not enough, despite on this, kmap looks like is

>>> deprecated in such cases as it can block and should be used rather

>>> for dynamic mm.

>>>

>>> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>

>>> ---

>>> net/xdp/xdp_umem.c | 36 ++++++++++++++++++++++++++++++------

>>> 1 file changed, 30 insertions(+), 6 deletions(-)

>>>

>>> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c

>>> index a0607969f8c0..d740c4f8810c 100644

>>> --- a/net/xdp/xdp_umem.c

>>> +++ b/net/xdp/xdp_umem.c

>>> @@ -14,7 +14,7 @@

>>> #include <linux/netdevice.h>

>>> #include <linux/rtnetlink.h>

>>> #include <linux/idr.h>

>>> -#include <linux/highmem.h>

>>> +#include <linux/vmalloc.h>

>>>

>>> #include "xdp_umem.h"

>>> #include "xsk_queue.h"

>>> @@ -170,7 +170,30 @@ static void xdp_umem_unmap_pages(struct 

>>> xdp_umem *umem)

>>> 	unsigned int i;

>>>

>>> 	for (i = 0; i < umem->npgs; i++)

>>> -		kunmap(umem->pgs[i]);

>>> +		if (PageHighMem(umem->pgs[i]))

>>> +			vunmap(umem->pages[i].addr);

>>> +}

>>> +

>>> +static int xdp_umem_map_pages(struct xdp_umem *umem)

>>> +{

>>> +	unsigned int i;

>>> +	void *addr;

>>> +

>>> +	for (i = 0; i < umem->npgs; i++) {

>>> +		if (PageHighMem(umem->pgs[i]))

>>> +			addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);

>>> +		else

>>> +			addr = page_address(umem->pgs[i]);

>>> +

>>> +		if (!addr) {

>>> +			xdp_umem_unmap_pages(umem);

>>> +			return -ENOMEM;

>>> +		}

>>> +

>>> +		umem->pages[i].addr = addr;

>>> +	}

>>> +

>>> +	return 0;

>>> }

>>

>> You'll want a __xdp_umem_unmap_pages() helper here that takes an

>> count of the number of pages to unmap, so it can be called from

>> xdp_umem_unmap_pages() in the normal case, and xdp_umem_map_pages()

>> in the error case.  Otherwise the error case ends up calling

>> PageHighMem on a null page.

>> -- 

>> Jonathan

>

> Do you mean null address?

> If so, then vunmap do nothing if it's null, and addr is null if it's 

> not

> assigned... and it's not assigned w/o correct mapping...

>

> If you mean null page, then it is not possible after all they are

> pinned above, here: xdp_umem_pin_pages(), thus assigned.

>

> Or I missed smth?


No - I forgot about umem_pin_pages() - feel free to ignore my comments.
--
Jonathan

>

> Despite of this, seems like here should be one more patch, adding 

> unpinning page

> in error path, but this not related to this change. Will do this in 

> follow up

> fix patch, if no objection to my explanation, ofc.

>

> -- 

> Regards,

> Ivan Khoronzhuk
Jonathan Lemon Aug. 19, 2019, 9:17 p.m. UTC | #4
On 15 Aug 2019, at 5:13, Ivan Khoronzhuk wrote:

> For arm32 xdp sockets mmap2 is preferred, so use it if it's defined.

> The declaration of __NR_mmap can be absent there, which breaks the build.

>

> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>


Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>



> ---

>  samples/bpf/syscall_nrs.c  |  6 ++++++

>  samples/bpf/tracex5_kern.c | 13 +++++++++++++

>  2 files changed, 19 insertions(+)

>

> diff --git a/samples/bpf/syscall_nrs.c b/samples/bpf/syscall_nrs.c

> index 516e255cbe8f..88f940052450 100644

> --- a/samples/bpf/syscall_nrs.c

> +++ b/samples/bpf/syscall_nrs.c

> @@ -9,5 +9,11 @@ void syscall_defines(void)

>  	COMMENT("Linux system call numbers.");

>  	SYSNR(__NR_write);

>  	SYSNR(__NR_read);

> +#ifdef __NR_mmap2

> +	SYSNR(__NR_mmap2);

> +#endif

> +#ifdef __NR_mmap

>  	SYSNR(__NR_mmap);

> +#endif

> +

>  }

> diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c

> index f57f4e1ea1ec..35cb0eed3be5 100644

> --- a/samples/bpf/tracex5_kern.c

> +++ b/samples/bpf/tracex5_kern.c

> @@ -68,12 +68,25 @@ PROG(SYS__NR_read)(struct pt_regs *ctx)

>  	return 0;

>  }

>

> +#ifdef __NR_mmap2

> +PROG(SYS__NR_mmap2)(struct pt_regs *ctx)

> +{

> +	char fmt[] = "mmap2\n";

> +

> +	bpf_trace_printk(fmt, sizeof(fmt));

> +	return 0;

> +}

> +#endif

> +

> +#ifdef __NR_mmap

>  PROG(SYS__NR_mmap)(struct pt_regs *ctx)

>  {

>  	char fmt[] = "mmap\n";

> +

>  	bpf_trace_printk(fmt, sizeof(fmt));

>  	return 0;

>  }

> +#endif

>

>  char _license[] SEC("license") = "GPL";

>  u32 _version SEC("version") = LINUX_VERSION_CODE;

> -- 

> 2.17.1
Daniel Borkmann Aug. 21, 2019, 12:35 p.m. UTC | #5
On 8/15/19 2:13 PM, Ivan Khoronzhuk wrote:
> This patchset contains several improvements for af_xdp socket umem

> mappings for 32bit systems. Also, there is one more patch outside of

> this series that on linux-next tree and related to mmap2 af_xdp umem

> offsets: "mm: mmap: increase sockets maximum memory size pgoff for 32bits"

> https://lkml.org/lkml/2019/8/12/549

> 

> Based on bpf-next/master

> 

> Prev: https://lkml.org/lkml/2019/8/13/437

> 

> v2..v1:

> 	- replaced "libbpf: add asm/unistd.h to xsk to get __NR_mmap2" on

> 	 "libbpf: use LFS (_FILE_OFFSET_BITS) instead of direct mmap2

> 	 syscall"

> 	- use vmap along with page_address to avoid overkill

> 	- define mmap syscall trace5 for mmap if defined

> 

> Ivan Khoronzhuk (3):

>    libbpf: use LFS (_FILE_OFFSET_BITS) instead of direct mmap2 syscall

>    xdp: xdp_umem: replace kmap on vmap for umem map

>    samples: bpf: syscal_nrs: use mmap2 if defined

> 

>   net/xdp/xdp_umem.c         | 36 +++++++++++++++++++++++-----

>   samples/bpf/syscall_nrs.c  |  6 +++++

>   samples/bpf/tracex5_kern.c | 13 ++++++++++

>   tools/lib/bpf/Makefile     |  1 +

>   tools/lib/bpf/xsk.c        | 49 +++++++++++---------------------------

>   5 files changed, 64 insertions(+), 41 deletions(-)

> 


Applied, and fixed up typo in last one's subject, thanks!