diff mbox series

[net-next,RFC,1/2] page_pool: add page recycling support based on elevated refcnt

Message ID 1625044676-12441-2-git-send-email-linyunsheng@huawei.com
State New
Headers show
Series add elevated refcnt support for page pool | expand

Commit Message

Yunsheng Lin June 30, 2021, 9:17 a.m. UTC
Currently page pool only supports page recycling when the
refcnt of the page is one, which means it cannot support the
split page recycling implemented in most ethernet drivers.

So add elevated refcnt support in page pool, and support
allocating page frag to enable multi-frames-per-page based
on the elevated refcnt support.

As the elevated refcnt is per page, and there is no space
for that in "struct page" now, add a dynamically allocated
"struct page_pool_info" to record the page pool ptr and refcnt
corresponding to a page for now. Later, we can recycle the
"struct page_pool_info" too, or use part of page memory to
record pp_info.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 drivers/net/ethernet/marvell/mvneta.c           |   6 +-
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c |   2 +-
 include/linux/mm_types.h                        |   2 +-
 include/linux/skbuff.h                          |   4 +-
 include/net/page_pool.h                         |  30 +++-
 net/core/page_pool.c                            | 215 ++++++++++++++++++++----
 6 files changed, 207 insertions(+), 52 deletions(-)

Comments

Jesper Dangaard Brouer July 2, 2021, 9:42 a.m. UTC | #1
On 30/06/2021 11.17, Yunsheng Lin wrote:
> Currently page pool only support page recycling only when

> refcnt of page is one, which means it can not support the

> split page recycling implemented in the most ethernet driver.


Cc. Alex Duyck as I consider him an expert in this area.


> So add elevated refcnt support in page pool, and support

> allocating page frag to enable multi-frames-per-page based

> on the elevated refcnt support.

>

> As the elevated refcnt is per page, and there is no space

> for that in "struct page" now, so add a dynamically allocated

> "struct page_pool_info" to record page pool ptr and refcnt

> corrsponding to a page for now. Later, we can recycle the

> "struct page_pool_info" too, or use part of page memory to

> record pp_info.


I'm not happy with allocating a memory (slab) object "struct 
page_pool_info" per page.

This also gives us an extra level of indirection.


You are also adding a page "frag" API inside page pool, which I'm not 
100% convinced belongs inside page_pool APIs.

Please notice the APIs that Alex Duyck added in mm/page_alloc.c:

  __page_frag_cache_refill() + __page_frag_cache_drain() + 
page_frag_alloc_align()


No more comments below, but kept it if Alex wants to review the details.

> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>

> ---

>   drivers/net/ethernet/marvell/mvneta.c           |   6 +-

>   drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c |   2 +-

>   include/linux/mm_types.h                        |   2 +-

>   include/linux/skbuff.h                          |   4 +-

>   include/net/page_pool.h                         |  30 +++-

>   net/core/page_pool.c                            | 215 ++++++++++++++++++++----

>   6 files changed, 207 insertions(+), 52 deletions(-)

>

> diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c

> index 88a7550..5a29af2 100644

> --- a/drivers/net/ethernet/marvell/mvneta.c

> +++ b/drivers/net/ethernet/marvell/mvneta.c

> @@ -2327,7 +2327,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

>   	if (!skb)

>   		return ERR_PTR(-ENOMEM);

>   

> -	skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);

> +	skb_mark_for_recycle(skb);

>   

>   	skb_reserve(skb, xdp->data - xdp->data_hard_start);

>   	skb_put(skb, xdp->data_end - xdp->data);

> @@ -2339,10 +2339,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

>   		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,

>   				skb_frag_page(frag), skb_frag_off(frag),

>   				skb_frag_size(frag), PAGE_SIZE);

> -		/* We don't need to reset pp_recycle here. It's already set, so

> -		 * just mark fragments for recycling.

> -		 */

> -		page_pool_store_mem_info(skb_frag_page(frag), pool);

>   	}

>   

>   	return skb;

> diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

> index 3135220..540e387 100644

> --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

> +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

> @@ -3997,7 +3997,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,

>   		}

>   

>   		if (pp)

> -			skb_mark_for_recycle(skb, page, pp);

> +			skb_mark_for_recycle(skb);

>   		else

>   			dma_unmap_single_attrs(dev->dev.parent, dma_addr,

>   					       bm_pool->buf_size, DMA_FROM_DEVICE,

> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

> index 862f88a..cf613df 100644

> --- a/include/linux/mm_types.h

> +++ b/include/linux/mm_types.h

> @@ -101,7 +101,7 @@ struct page {

>   			 * page_pool allocated pages.

>   			 */

>   			unsigned long pp_magic;

> -			struct page_pool *pp;

> +			struct page_pool_info *pp_info;

>   			unsigned long _pp_mapping_pad;

>   			/**

>   			 * @dma_addr: might require a 64-bit value on

> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

> index b2db9cd..7795979 100644

> --- a/include/linux/skbuff.h

> +++ b/include/linux/skbuff.h

> @@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)

>   }

>   

>   #ifdef CONFIG_PAGE_POOL

> -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,

> -					struct page_pool *pp)

> +static inline void skb_mark_for_recycle(struct sk_buff *skb)

>   {

>   	skb->pp_recycle = 1;

> -	page_pool_store_mem_info(page, pp);

>   }

>   #endif

>   

> diff --git a/include/net/page_pool.h b/include/net/page_pool.h

> index 3dd62dd..44e7545 100644

> --- a/include/net/page_pool.h

> +++ b/include/net/page_pool.h

> @@ -45,7 +45,9 @@

>   					* Please note DMA-sync-for-CPU is still

>   					* device driver responsibility

>   					*/

> -#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)

> +#define PP_FLAG_PAGECNT_BIAS	BIT(2)	/* Enable elevated refcnt */

> +#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV |\

> +				 PP_FLAG_PAGECNT_BIAS)

>   

>   /*

>    * Fast allocation side cache array/stack

> @@ -77,6 +79,7 @@ struct page_pool_params {

>   	enum dma_data_direction dma_dir; /* DMA mapping direction */

>   	unsigned int	max_len; /* max DMA sync memory size */

>   	unsigned int	offset;  /* DMA addr offset */

> +	unsigned int	frag_size;

>   };

>   

>   struct page_pool {

> @@ -88,6 +91,8 @@ struct page_pool {

>   	unsigned long defer_warn;

>   

>   	u32 pages_state_hold_cnt;

> +	unsigned int frag_offset;

> +	struct page *frag_page;

>   

>   	/*

>   	 * Data structure for allocation side

> @@ -128,6 +133,11 @@ struct page_pool {

>   	u64 destroy_cnt;

>   };

>   

> +struct page_pool_info {

> +	struct page_pool *pp;

> +	int pagecnt_bias;

> +};

> +

>   struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);

>   

>   static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

> @@ -137,6 +147,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

>   	return page_pool_alloc_pages(pool, gfp);

>   }

>   

> +struct page *page_pool_alloc_frag(struct page_pool *pool,

> +				  unsigned int *offset, gfp_t gfp);

> +

> +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,

> +						    unsigned int *offset)

> +{

> +	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);

> +

> +	return page_pool_alloc_frag(pool, offset, gfp);

> +}

> +

>   /* get the stored dma direction. A driver might decide to treat this locally and

>    * avoid the extra cache line from page_pool to determine the direction

>    */

> @@ -253,11 +274,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool)

>   		spin_unlock_bh(&pool->ring.producer_lock);

>   }

>   

> -/* Store mem_info on struct page and use it while recycling skb frags */

> -static inline

> -void page_pool_store_mem_info(struct page *page, struct page_pool *pp)

> -{

> -	page->pp = pp;

> -}

> -

>   #endif /* _NET_PAGE_POOL_H */

> diff --git a/net/core/page_pool.c b/net/core/page_pool.c

> index 5e4eb45..95d94a7 100644

> --- a/net/core/page_pool.c

> +++ b/net/core/page_pool.c

> @@ -206,6 +206,49 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)

>   	return true;

>   }

>   

> +static int page_pool_set_pp_info(struct page_pool *pool,

> +				 struct page *page, gfp_t gfp)

> +{

> +	struct page_pool_info *pp_info;

> +

> +	pp_info = kzalloc_node(sizeof(*pp_info), gfp, pool->p.nid);

> +	if (!pp_info)

> +		return -ENOMEM;

> +

> +	if (pool->p.flags & PP_FLAG_PAGECNT_BIAS) {

> +		page_ref_add(page, USHRT_MAX);

> +		pp_info->pagecnt_bias = USHRT_MAX;

> +	} else {

> +		pp_info->pagecnt_bias = 0;

> +	}

> +

> +	page->pp_magic |= PP_SIGNATURE;

> +	pp_info->pp = pool;

> +	page->pp_info = pp_info;

> +	return 0;

> +}

> +

> +static int page_pool_clear_pp_info(struct page *page)

> +{

> +	struct page_pool_info *pp_info = page->pp_info;

> +	int bias;

> +

> +	bias = pp_info->pagecnt_bias;

> +

> +	kfree(pp_info);

> +	page->pp_info = NULL;

> +	page->pp_magic = 0;

> +

> +	return bias;

> +}

> +

> +static void page_pool_clear_and_drain_page(struct page *page)

> +{

> +	int bias = page_pool_clear_pp_info(page);

> +

> +	__page_frag_cache_drain(page, bias + 1);

> +}

> +

>   static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

>   						 gfp_t gfp)

>   {

> @@ -216,13 +259,16 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

>   	if (unlikely(!page))

>   		return NULL;

>   

> -	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

> -	    unlikely(!page_pool_dma_map(pool, page))) {

> +	if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

>   		put_page(page);

>   		return NULL;

>   	}

>   

> -	page->pp_magic |= PP_SIGNATURE;

> +	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

> +	    unlikely(!page_pool_dma_map(pool, page))) {

> +		page_pool_clear_and_drain_page(page);

> +		return NULL;

> +	}

>   

>   	/* Track how many pages are held 'in-flight' */

>   	pool->pages_state_hold_cnt++;

> @@ -261,12 +307,17 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

>   	 */

>   	for (i = 0; i < nr_pages; i++) {

>   		page = pool->alloc.cache[i];

> +		if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

> +			put_page(page);

> +			continue;

> +		}

> +

>   		if ((pp_flags & PP_FLAG_DMA_MAP) &&

>   		    unlikely(!page_pool_dma_map(pool, page))) {

> -			put_page(page);

> +			page_pool_clear_and_drain_page(page);

>   			continue;

>   		}

> -		page->pp_magic |= PP_SIGNATURE;

> +

>   		pool->alloc.cache[pool->alloc.count++] = page;

>   		/* Track how many pages are held 'in-flight' */

>   		pool->pages_state_hold_cnt++;

> @@ -284,6 +335,25 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

>   	return page;

>   }

>   

> +static void page_pool_sub_bias(struct page *page, int nr)

> +{

> +	struct page_pool_info *pp_info = page->pp_info;

> +

> +	/* "pp_info->pagecnt_bias == 0" indicates the PAGECNT_BIAS

> +	 * flags is not set.

> +	 */

> +	if (!pp_info->pagecnt_bias)

> +		return;

> +

> +	/* Make sure pagecnt_bias > 0 for elevated refcnt case */

> +	if (unlikely(pp_info->pagecnt_bias <= nr)) {

> +		page_ref_add(page, USHRT_MAX);

> +		pp_info->pagecnt_bias += USHRT_MAX;

> +	}

> +

> +	pp_info->pagecnt_bias -= nr;

> +}

> +

>   /* For using page_pool replace: alloc_pages() API calls, but provide

>    * synchronization guarantee for allocation side.

>    */

> @@ -293,15 +363,66 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)

>   

>   	/* Fast-path: Get a page from cache */

>   	page = __page_pool_get_cached(pool);

> -	if (page)

> +	if (page) {

> +		page_pool_sub_bias(page, 1);

>   		return page;

> +	}

>   

>   	/* Slow-path: cache empty, do real allocation */

>   	page = __page_pool_alloc_pages_slow(pool, gfp);

> +	if (page)

> +		page_pool_sub_bias(page, 1);

> +

>   	return page;

>   }

>   EXPORT_SYMBOL(page_pool_alloc_pages);

>   

> +struct page *page_pool_alloc_frag(struct page_pool *pool,

> +				  unsigned int *offset, gfp_t gfp)

> +{

> +	unsigned int frag_offset = pool->frag_offset;

> +	unsigned int frag_size = pool->p.frag_size;

> +	struct page *frag_page = pool->frag_page;

> +	unsigned int max_len = pool->p.max_len;

> +

> +	if (!frag_page || frag_offset + frag_size > max_len) {

> +		frag_page = page_pool_alloc_pages(pool, gfp);

> +		if (unlikely(!frag_page)) {

> +			pool->frag_page = NULL;

> +			return NULL;

> +		}

> +

> +		pool->frag_page = frag_page;

> +		frag_offset = 0;

> +

> +		page_pool_sub_bias(frag_page, max_len / frag_size - 1);

> +	}

> +

> +	*offset = frag_offset;

> +	pool->frag_offset = frag_offset + frag_size;

> +

> +	return frag_page;

> +}

> +EXPORT_SYMBOL(page_pool_alloc_frag);

> +

> +static void page_pool_empty_frag(struct page_pool *pool)

> +{

> +	unsigned int frag_offset = pool->frag_offset;

> +	unsigned int frag_size = pool->p.frag_size;

> +	struct page *frag_page = pool->frag_page;

> +	unsigned int max_len = pool->p.max_len;

> +

> +	if (!frag_page)

> +		return;

> +

> +	while (frag_offset + frag_size <= max_len) {

> +		page_pool_put_full_page(pool, frag_page, false);

> +		frag_offset += frag_size;

> +	}

> +

> +	pool->frag_page = NULL;

> +}

> +

>   /* Calculate distance between two u32 values, valid if distance is below 2^(31)

>    *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution

>    */

> @@ -326,10 +447,11 @@ static s32 page_pool_inflight(struct page_pool *pool)

>    * a regular page (that will eventually be returned to the normal

>    * page-allocator via put_page).

>    */

> -void page_pool_release_page(struct page_pool *pool, struct page *page)

> +static int __page_pool_release_page(struct page_pool *pool,

> +				    struct page *page)

>   {

>   	dma_addr_t dma;

> -	int count;

> +	int bias, count;

>   

>   	if (!(pool->p.flags & PP_FLAG_DMA_MAP))

>   		/* Always account for inflight pages, even if we didn't

> @@ -345,22 +467,29 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)

>   			     DMA_ATTR_SKIP_CPU_SYNC);

>   	page_pool_set_dma_addr(page, 0);

>   skip_dma_unmap:

> -	page->pp_magic = 0;

> +	bias = page_pool_clear_pp_info(page);

>   

>   	/* This may be the last page returned, releasing the pool, so

>   	 * it is not safe to reference pool afterwards.

>   	 */

>   	count = atomic_inc_return(&pool->pages_state_release_cnt);

>   	trace_page_pool_state_release(pool, page, count);

> +	return bias;

> +}

> +

> +void page_pool_release_page(struct page_pool *pool, struct page *page)

> +{

> +	int bias = __page_pool_release_page(pool, page);

> +

> +	WARN_ONCE(bias, "PAGECNT_BIAS is not supposed to be enabled\n");

>   }

>   EXPORT_SYMBOL(page_pool_release_page);

>   

>   /* Return a page to the page allocator, cleaning up our state */

>   static void page_pool_return_page(struct page_pool *pool, struct page *page)

>   {

> -	page_pool_release_page(pool, page);

> +	__page_frag_cache_drain(page, __page_pool_release_page(pool, page) + 1);

>   

> -	put_page(page);

>   	/* An optimization would be to call __free_pages(page, pool->p.order)

>   	 * knowing page is not part of page-cache (thus avoiding a

>   	 * __page_cache_release() call).

> @@ -395,7 +524,16 @@ static bool page_pool_recycle_in_cache(struct page *page,

>   	return true;

>   }

>   

> -/* If the page refcnt == 1, this will try to recycle the page.

> +static bool page_pool_bias_page_recyclable(struct page *page, int bias)

> +{

> +	int ref = page_ref_dec_return(page);

> +

> +	WARN_ON(ref < bias);

> +	return ref == bias + 1;

> +}

> +

> +/* If pagecnt_bias == 0 and the page refcnt == 1, this will try to

> + * recycle the page.

>    * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for

>    * the configured size min(dma_sync_size, pool->max_len).

>    * If the page refcnt != 1, then the page will be returned to memory

> @@ -405,16 +543,35 @@ static __always_inline struct page *

>   __page_pool_put_page(struct page_pool *pool, struct page *page,

>   		     unsigned int dma_sync_size, bool allow_direct)

>   {

> -	/* This allocator is optimized for the XDP mode that uses

> +	int bias = page->pp_info->pagecnt_bias;

> +

> +	/* Handle the elevated refcnt case first:

> +	 * multi-frames-per-page, it is likely from the skb, which

> +	 * is likely called in non-sofrirq context, so do not recycle

> +	 * it in pool->alloc.

> +	 *

> +	 * Then handle non-elevated refcnt case:

>   	 * one-frame-per-page, but have fallbacks that act like the

>   	 * regular page allocator APIs.

> -	 *

>   	 * refcnt == 1 means page_pool owns page, and can recycle it.

>   	 *

>   	 * page is NOT reusable when allocated when system is under

>   	 * some pressure. (page_is_pfmemalloc)

>   	 */

> -	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {

> +	if (bias) {

> +		/* We have gave some refcnt to the stack, so wait for

> +		 * all refcnt of the stack to be decremented before

> +		 * enabling recycling.

> +		 */

> +		if (!page_pool_bias_page_recyclable(page, bias))

> +			return NULL;

> +

> +		/* only enable recycling when it is not pfmemalloced */

> +		if (!page_is_pfmemalloc(page))

> +			return page;

> +

> +	} else if (likely(page_ref_count(page) == 1 &&

> +			  !page_is_pfmemalloc(page))) {

>   		/* Read barrier done in page_ref_count / READ_ONCE */

>   

>   		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)

> @@ -428,22 +585,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,

>   		/* Page found as candidate for recycling */

>   		return page;

>   	}

> -	/* Fallback/non-XDP mode: API user have elevated refcnt.

> -	 *

> -	 * Many drivers split up the page into fragments, and some

> -	 * want to keep doing this to save memory and do refcnt based

> -	 * recycling. Support this use case too, to ease drivers

> -	 * switching between XDP/non-XDP.

> -	 *

> -	 * In-case page_pool maintains the DMA mapping, API user must

> -	 * call page_pool_put_page once.  In this elevated refcnt

> -	 * case, the DMA is unmapped/released, as driver is likely

> -	 * doing refcnt based recycle tricks, meaning another process

> -	 * will be invoking put_page.

> -	 */

> -	/* Do not replace this with page_pool_return_page() */

> +

>   	page_pool_release_page(pool, page);

> -	put_page(page);

>   

>   	return NULL;

>   }

> @@ -452,6 +595,7 @@ void page_pool_put_page(struct page_pool *pool, struct page *page,

>   			unsigned int dma_sync_size, bool allow_direct)

>   {

>   	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);

> +

>   	if (page && !page_pool_recycle_in_ring(pool, page)) {

>   		/* Cache full, fallback to free pages */

>   		page_pool_return_page(pool, page);

> @@ -503,8 +647,11 @@ static void page_pool_empty_ring(struct page_pool *pool)

>   

>   	/* Empty recycle ring */

>   	while ((page = ptr_ring_consume_bh(&pool->ring))) {

> -		/* Verify the refcnt invariant of cached pages */

> -		if (!(page_ref_count(page) == 1))

> +		/* Verify the refcnt invariant of cached pages for

> +		 * non elevated refcnt case.

> +		 */

> +		if (!(pool->p.flags & PP_FLAG_PAGECNT_BIAS) &&

> +		    !(page_ref_count(page) == 1))

>   			pr_crit("%s() page_pool refcnt %d violation\n",

>   				__func__, page_ref_count(page));

>   

> @@ -544,6 +691,7 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)

>   

>   static void page_pool_scrub(struct page_pool *pool)

>   {

> +	page_pool_empty_frag(pool);

>   	page_pool_empty_alloc_cache_once(pool);

>   	pool->destroy_cnt++;

>   

> @@ -637,14 +785,13 @@ bool page_pool_return_skb_page(struct page *page)

>   	if (unlikely(page->pp_magic != PP_SIGNATURE))

>   		return false;

>   

> -	pp = page->pp;

> +	pp = page->pp_info->pp;

>   

>   	/* Driver set this to memory recycling info. Reset it on recycle.

>   	 * This will *not* work for NIC using a split-page memory model.

>   	 * The page will be returned to the pool here regardless of the

>   	 * 'flipped' fragment being in use or not.

>   	 */

> -	page->pp = NULL;

>   	page_pool_put_full_page(pp, page, false);

>   

>   	return true;
Yunsheng Lin July 2, 2021, 10:15 a.m. UTC | #2
On 2021/7/2 17:42, Jesper Dangaard Brouer wrote:
> 

> On 30/06/2021 11.17, Yunsheng Lin wrote:

>> Currently page pool only support page recycling only when

>> refcnt of page is one, which means it can not support the

>> split page recycling implemented in the most ethernet driver.

> 

> Cc. Alex Duyck as I consider him an expert in this area.


Thanks.

> 

> 

>> So add elevated refcnt support in page pool, and support

>> allocating page frag to enable multi-frames-per-page based

>> on the elevated refcnt support.

>>

>> As the elevated refcnt is per page, and there is no space

>> for that in "struct page" now, so add a dynamically allocated

>> "struct page_pool_info" to record page pool ptr and refcnt

>> corrsponding to a page for now. Later, we can recycle the

>> "struct page_pool_info" too, or use part of page memory to

>> record pp_info.

> 

> I'm not happy with allocating a memory (slab) object "struct page_pool_info" per page.

> 

> This also gives us an extra level of indirection.


I'm not happy with that either, if there is better way to
avoid that, I will be happy to change it:)

> 

> 

> You are also adding a page "frag" API inside page pool, which I'm not 100% convinced belongs inside page_pool APIs.

> 

> Please notice the APIs that Alex Duyck added in mm/page_alloc.c:


Actually, that is where the idea of using "page frag" come from.

Aside from the performance improvement, there is a memory usage
decrease for a 64K page size kernel, which means a 64K page can
be used by 32 descriptors with a 2K buffer size, and that is a
lot of memory saved for a 64K page size kernel compared to the
current split page reuse implemented in the driver.


> 

>  __page_frag_cache_refill() + __page_frag_cache_drain() + page_frag_alloc_align()

> 

> 


[...]
Ilias Apalodimas July 6, 2021, 4:54 a.m. UTC | #3
Hi Yunsheng,

Thanks for having a look!

On Fri, Jul 02, 2021 at 06:15:13PM +0800, Yunsheng Lin wrote:
> On 2021/7/2 17:42, Jesper Dangaard Brouer wrote:

> > 

> > On 30/06/2021 11.17, Yunsheng Lin wrote:

> >> Currently page pool only support page recycling only when

> >> refcnt of page is one, which means it can not support the

> >> split page recycling implemented in the most ethernet driver.

> > 

> > Cc. Alex Duyck as I consider him an expert in this area.

> 

> Thanks.

> 

> > 

> > 

> >> So add elevated refcnt support in page pool, and support

> >> allocating page frag to enable multi-frames-per-page based

> >> on the elevated refcnt support.

> >>

> >> As the elevated refcnt is per page, and there is no space

> >> for that in "struct page" now, so add a dynamically allocated

> >> "struct page_pool_info" to record page pool ptr and refcnt

> >> corrsponding to a page for now. Later, we can recycle the

> >> "struct page_pool_info" too, or use part of page memory to

> >> record pp_info.

> > 

> > I'm not happy with allocating a memory (slab) object "struct page_pool_info" per page.

> > 

> > This also gives us an extra level of indirection.

> 

> I'm not happy with that either, if there is better way to

> avoid that, I will be happy to change it:)


I think what we have to answer here is, do we want and does it make sense
for page_pool to do the housekeeping of the buffer splitting or are we
better off having each driver do that.  IIRC your previous patch on top of
the original recycling patchset was just 'atomic' refcnts on top of page pool.

I think I'd prefer each driver having its own meta-data of how it splits
the page, mostly due to hardware diversity, but tbh I don't have any
strong preference atm.

> 

> > 

> > 

> > You are also adding a page "frag" API inside page pool, which I'm not 100% convinced belongs inside page_pool APIs.

> > 

> > Please notice the APIs that Alex Duyck added in mm/page_alloc.c:

> 

> Actually, that is where the idea of using "page frag" come from.

> 

> Aside from the performance improvement, there is memory usage

> decrease for 64K page size kernel, which means a 64K page can

> be used by 32 description with 2k buffer size, and that is a

> lot of memory saving for 64 page size kernel comparing to the

> current split page reusing implemented in the driver.

> 


Whether the driver or page_pool itself keeps the meta-data, the outcome
here won't change.  We'll still be able to use page frags.


Cheers
/Ilias
> 

> > 

> >  __page_frag_cache_refill() + __page_frag_cache_drain() + page_frag_alloc_align()

> > 

> > 

> 

> [...]
Yunsheng Lin July 6, 2021, 6:46 a.m. UTC | #4
On 2021/7/6 12:54, Ilias Apalodimas wrote:
> Hi Yunsheng,

> 

> Thanks for having a look!


Hi,

Thanks for reviewing.

> 

> On Fri, Jul 02, 2021 at 06:15:13PM +0800, Yunsheng Lin wrote:

>> On 2021/7/2 17:42, Jesper Dangaard Brouer wrote:

>>>

>>> On 30/06/2021 11.17, Yunsheng Lin wrote:

>>>> Currently page pool only support page recycling only when

>>>> refcnt of page is one, which means it can not support the

>>>> split page recycling implemented in the most ethernet driver.

>>>

>>> Cc. Alex Duyck as I consider him an expert in this area.

>>

>> Thanks.

>>

>>>

>>>

>>>> So add elevated refcnt support in page pool, and support

>>>> allocating page frag to enable multi-frames-per-page based

>>>> on the elevated refcnt support.

>>>>

>>>> As the elevated refcnt is per page, and there is no space

>>>> for that in "struct page" now, so add a dynamically allocated

>>>> "struct page_pool_info" to record page pool ptr and refcnt

>>>> corrsponding to a page for now. Later, we can recycle the

>>>> "struct page_pool_info" too, or use part of page memory to

>>>> record pp_info.

>>>

>>> I'm not happy with allocating a memory (slab) object "struct page_pool_info" per page.

>>>

>>> This also gives us an extra level of indirection.

>>

>> I'm not happy with that either, if there is better way to

>> avoid that, I will be happy to change it:)

> 

> I think what we have to answer here is, do we want and does it make sense

> for page_pool to do the housekeeping of the buffer splitting or are we

> better of having each driver do that.  IIRC your previous patch on top of

> the original recycling patchset was just 'atomic' refcnts on top of page pool.


You are right that the driver was doing the buffer splitting in the previous
patch.

The reason why I abandoned that is:
1. Currently the meta-data of a page in the driver is per desc, which means
   it might not be able to use the first half of a page for a desc, and the
   second half of the same page for another desc. This ping-pong way of
   reusing the whole page for only one desc in the driver seems unnecessary
   and wastes a lot of memory when there is already reusing in the page pool.

2. Easy use of API for the driver too, which means the driver uses
   page_pool_dev_alloc_frag() and page_pool_put_full_page() for elevated
   refcnt case, corresponding to page_pool_dev_alloc_pages() and
   page_pool_put_full_page() for non-elevated refcnt case, the driver does
   not need to worry about the meta-data of a page.

> 

> I think I'd prefer each driver having it's own meta-data of how he splits

> the page, mostly due to hardware diversity, but tbh I don't have any

> strong preference atm.


Usually how the driver splits the page is fixed for a given rx configuration
(like MTU), so the driver is able to pass that info to page pool.


> 

>>

>>>

>>>

>>> You are also adding a page "frag" API inside page pool, which I'm not 100% convinced belongs inside page_pool APIs.

>>>

>>> Please notice the APIs that Alex Duyck added in mm/page_alloc.c:

>>

>> Actually, that is where the idea of using "page frag" come from.

>>

>> Aside from the performance improvement, there is memory usage

>> decrease for 64K page size kernel, which means a 64K page can

>> be used by 32 description with 2k buffer size, and that is a

>> lot of memory saving for 64 page size kernel comparing to the

>> current split page reusing implemented in the driver.

>>

> 

> Whether the driver or page_pool itself keeps the meta-data, the outcome

> here won't change.  We'll still be able to use page frags.


As above, it is the ping-pong way of reusing when the driver keeps the
meta-data, and it is page-frag way of reusing when the page pool keeps
the meta-data.

I am not sure if the page-frag way of reusing is possible when we still
keep the meta-data in the driver, which seems very complex at first
thought.

> 

> 

> Cheers

> /Ilias

>>

>>>

>>>  __page_frag_cache_refill() + __page_frag_cache_drain() + page_frag_alloc_align()

>>>

>>>

>>

>> [...]

> .

>
Ilias Apalodimas July 6, 2021, 8:18 a.m. UTC | #5
> >>


[...]

> >>>

> >>>

> >>>> So add elevated refcnt support in page pool, and support

> >>>> allocating page frag to enable multi-frames-per-page based

> >>>> on the elevated refcnt support.

> >>>>

> >>>> As the elevated refcnt is per page, and there is no space

> >>>> for that in "struct page" now, so add a dynamically allocated

> >>>> "struct page_pool_info" to record page pool ptr and refcnt

> >>>> corrsponding to a page for now. Later, we can recycle the

> >>>> "struct page_pool_info" too, or use part of page memory to

> >>>> record pp_info.

> >>>

> >>> I'm not happy with allocating a memory (slab) object "struct page_pool_info" per page.

> >>>

> >>> This also gives us an extra level of indirection.

> >>

> >> I'm not happy with that either, if there is better way to

> >> avoid that, I will be happy to change it:)

> > 

> > I think what we have to answer here is, do we want and does it make sense

> > for page_pool to do the housekeeping of the buffer splitting or are we

> > better of having each driver do that.  IIRC your previous patch on top of

> > the original recycling patchset was just 'atomic' refcnts on top of page pool.

> 

> You are right that driver was doing the the buffer splitting in previous

> patch.

> 

> The reason why I abandoned that is:

> 1. Currently the meta-data of page in the driver is per desc, which means

>    it might not be able to use first half of a page for a desc, and the

>    second half of the same page for another desc, this ping-pong way of

>    reusing the whole page for only one desc in the driver seems unnecessary

>    and waste a lot of memory when there is already reusing in the page pool.

> 

> 2. Easy use of API for the driver too, which means the driver uses

>    page_pool_dev_alloc_frag() and page_pool_put_full_page() for elevated

>    refcnt case, corresponding to page_pool_dev_alloc_pages() and

>    page_pool_put_full_page() for non-elevated refcnt case, the driver does

>    not need to worry about the meta-data of a page.

> 


Ok that makes sense.  We'll need the complexity anyway and I said I don't
have any strong opinions yet, we might as well make page_pool responsible
for it.
What we need to keep in mind is that page_pool was primarily used for XDP
packets.  We need to make sure we have no performance regressions there.
However I don't have access to > 10gbit NICs with XDP support. Can anyone
apply the patchset and check the performance?

> > 

> >>


[...]

> >> Aside from the performance improvement, there is memory usage

> >> decrease for 64K page size kernel, which means a 64K page can

> >> be used by 32 description with 2k buffer size, and that is a

> >> lot of memory saving for 64 page size kernel comparing to the

> >> current split page reusing implemented in the driver.

> >>

> > 

> > Whether the driver or page_pool itself keeps the meta-data, the outcome

> > here won't change.  We'll still be able to use page frags.

> 

> As above, it is the ping-pong way of reusing when the driver keeps the

> meta-data, and it is page-frag way of reusing when the page pool keeps

> the meta-data.

> 

> I am not sure if the page-frag way of reusing is possible when we still

> keep the meta-data in the driver, which seems very complex at the initial

> thinking.

> 


Fair enough. It's complex in both scenarios so if people think it's useful
I am not against adding it in the API.


Thanks
/Ilias
> > 

> > 

> > Cheers

> > /Ilias

> >>

> >>>

> >>>  __page_frag_cache_refill() + __page_frag_cache_drain() + page_frag_alloc_align()

> >>>

> >>>

> >>

> >> [...]

> > .

> >
Alexander Duyck July 6, 2021, 8:45 p.m. UTC | #6
On Wed, Jun 30, 2021 at 2:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>

> Currently page pool only support page recycling only when

> refcnt of page is one, which means it can not support the

> split page recycling implemented in the most ethernet driver.

>

> So add elevated refcnt support in page pool, and support

> allocating page frag to enable multi-frames-per-page based

> on the elevated refcnt support.

>

> As the elevated refcnt is per page, and there is no space

> for that in "struct page" now, so add a dynamically allocated

> "struct page_pool_info" to record page pool ptr and refcnt

> corrsponding to a page for now. Later, we can recycle the

> "struct page_pool_info" too, or use part of page memory to

> record pp_info.

>

> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>


So this isn't going to work with the current recycling logic. The
expectation there is that we can safely unmap the entire page as soon
as the reference count is greater than 1.

In addition I think I need to look over that code better as I am
wondering if there are potential issues assuming a path such as a
skb_clone followed by pskb_expand_head may lead to memory corruptions
since the clone will still have pp_recycle set but none of the pages
will be part of the page pool anymore.

For us the pagecnt_bias would really represent the number of
additional mappings beyond the current page that are being held. I
have already been playing around with something similar. However the
general idea is that we want to keep track of how many references to
the page the device is holding onto. When that hits 0 and the actual
page count is 1 we can refill both, however if we hit 0 and there are
multiple references to the page still floating around we should just
unmap the page and turn it over to the stack or free it.

> ---

>  drivers/net/ethernet/marvell/mvneta.c           |   6 +-

>  drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c |   2 +-

>  include/linux/mm_types.h                        |   2 +-

>  include/linux/skbuff.h                          |   4 +-

>  include/net/page_pool.h                         |  30 +++-

>  net/core/page_pool.c                            | 215 ++++++++++++++++++++----

>  6 files changed, 207 insertions(+), 52 deletions(-)

>

> diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c

> index 88a7550..5a29af2 100644

> --- a/drivers/net/ethernet/marvell/mvneta.c

> +++ b/drivers/net/ethernet/marvell/mvneta.c

> @@ -2327,7 +2327,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

>         if (!skb)

>                 return ERR_PTR(-ENOMEM);

>

> -       skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);

> +       skb_mark_for_recycle(skb);

>

>         skb_reserve(skb, xdp->data - xdp->data_hard_start);

>         skb_put(skb, xdp->data_end - xdp->data);

> @@ -2339,10 +2339,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

>                 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,

>                                 skb_frag_page(frag), skb_frag_off(frag),

>                                 skb_frag_size(frag), PAGE_SIZE);

> -               /* We don't need to reset pp_recycle here. It's already set, so

> -                * just mark fragments for recycling.

> -                */

> -               page_pool_store_mem_info(skb_frag_page(frag), pool);

>         }

>

>         return skb;


So as I mentioned earlier the problem with recycling is that splitting
up the ownership of the page makes it difficult for us to clean it up.
Technically speaking if the pages are being allowed to leave while
holding references to DMA addresses that we cannot revoke then we
should be holding references to the device.

That is one of the reasons why the previous code was just clearing the
mapping as soon as the refcount was greater than 1. However for this
to work out correctly we would have to track how many DMA mappings we
have outstanding in addition to the one we are working on currently.

> diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

> index 3135220..540e387 100644

> --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

> +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

> @@ -3997,7 +3997,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,

>                 }

>

>                 if (pp)

> -                       skb_mark_for_recycle(skb, page, pp);

> +                       skb_mark_for_recycle(skb);

>                 else

>                         dma_unmap_single_attrs(dev->dev.parent, dma_addr,

>                                                bm_pool->buf_size, DMA_FROM_DEVICE,

> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

> index 862f88a..cf613df 100644

> --- a/include/linux/mm_types.h

> +++ b/include/linux/mm_types.h

> @@ -101,7 +101,7 @@ struct page {

>                          * page_pool allocated pages.

>                          */

>                         unsigned long pp_magic;

> -                       struct page_pool *pp;

> +                       struct page_pool_info *pp_info;

>                         unsigned long _pp_mapping_pad;

>                         /**

>                          * @dma_addr: might require a 64-bit value on


So the problem here is that this is creating a pointer chase, and the
need to allocate yet another structure to store it is going to be
expensive.

As far as storing the pagecnt_bias it might make more sense to
repurpose the lower 12 bits of the dma address. A DMA mapping should
be page aligned anyway so the lower 12 bits would be reserved 0. When
we decrement the value so that the lower 12 bits are 0 we should be
unmapping the page anyway, or resetting the pagecnt_bias to PAGE_SIZE
- 1 and adding back the bias to the page to effectively reset it for
reuse.

> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

> index b2db9cd..7795979 100644

> --- a/include/linux/skbuff.h

> +++ b/include/linux/skbuff.h

> @@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)

>  }

>

>  #ifdef CONFIG_PAGE_POOL

> -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,

> -                                       struct page_pool *pp)

> +static inline void skb_mark_for_recycle(struct sk_buff *skb)

>  {

>         skb->pp_recycle = 1;

> -       page_pool_store_mem_info(page, pp);

>  }

>  #endif


I am not a fan of the pp_recycle flag either. We duplicate it via
skb_clone and from what I can tell if we call pskb_expand_head
afterwards I don't see how we avoid recycling the page frags twice.

> diff --git a/include/net/page_pool.h b/include/net/page_pool.h

> index 3dd62dd..44e7545 100644

> --- a/include/net/page_pool.h

> +++ b/include/net/page_pool.h

> @@ -45,7 +45,9 @@

>                                         * Please note DMA-sync-for-CPU is still

>                                         * device driver responsibility

>                                         */

> -#define PP_FLAG_ALL            (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)

> +#define PP_FLAG_PAGECNT_BIAS   BIT(2)  /* Enable elevated refcnt */

> +#define PP_FLAG_ALL            (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV |\

> +                                PP_FLAG_PAGECNT_BIAS)

>

>  /*


It might be better to just put each flag on a separate line for
PP_FLAG_ALL just to make it easier to read due to the wrapping. Either
that or you could look at converting this over to an enum with a MAX
value and then define the flags based on those enums, and PP_FLAG_ALL
being BIT(MAX) - 1.

>   * Fast allocation side cache array/stack

> @@ -77,6 +79,7 @@ struct page_pool_params {

>         enum dma_data_direction dma_dir; /* DMA mapping direction */

>         unsigned int    max_len; /* max DMA sync memory size */

>         unsigned int    offset;  /* DMA addr offset */

> +       unsigned int    frag_size;

>  };

>

>  struct page_pool {

> @@ -88,6 +91,8 @@ struct page_pool {

>         unsigned long defer_warn;

>

>         u32 pages_state_hold_cnt;

> +       unsigned int frag_offset;

> +       struct page *frag_page;

>

>         /*

>          * Data structure for allocation side

> @@ -128,6 +133,11 @@ struct page_pool {

>         u64 destroy_cnt;

>  };

>

> +struct page_pool_info {

> +       struct page_pool *pp;

> +       int pagecnt_bias;

> +};

> +


Rather than having a top-down structure here it might be better to
work bottom up. If you assume you are keeping a pagecnt_bias per page
it might make more sense to store this in the driver somewhere rather
than having it as a separate allocated buffer. One advantage of the
Intel drivers doing this was that we had the pagecnt_bias in a structure
that also pointed to the page. That way we were only updating that
count if we dropped the page and didn't have to even touch the page.
You could use that to batch updates to the pagecnt_bias if we did use
the lower 12 bits of the DMA address to store it as well.

I'm assuming the idea with this is that you will be having multiple
buffers received off of a single page and so doing it that way you
should only have one update on allocation, maybe a trickle of updates
for XDP_TX, and another large update when the page is fully consumed
and you drop the remaining pagecnt_bias for Rx.

>  struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);

>

>  static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

> @@ -137,6 +147,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

>         return page_pool_alloc_pages(pool, gfp);

>  }

>

> +struct page *page_pool_alloc_frag(struct page_pool *pool,

> +                                 unsigned int *offset, gfp_t gfp);

> +

> +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,

> +                                                   unsigned int *offset)

> +{

> +       gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);

> +

> +       return page_pool_alloc_frag(pool, offset, gfp);

> +}

> +

>  /* get the stored dma direction. A driver might decide to treat this locally and

>   * avoid the extra cache line from page_pool to determine the direction

>   */

> @@ -253,11 +274,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool)

>                 spin_unlock_bh(&pool->ring.producer_lock);

>  }

>

> -/* Store mem_info on struct page and use it while recycling skb frags */

> -static inline

> -void page_pool_store_mem_info(struct page *page, struct page_pool *pp)

> -{

> -       page->pp = pp;

> -}

> -

>  #endif /* _NET_PAGE_POOL_H */


So the issue as I see it with the page_pool recycling patch set is
that I don't think we had proper guarantees in place that the page->pp
value was flushed in all cases where skb->dev was changed. Basically
the logic we need to have in place to address those issues is that when
skb->dev is changed we need to invalidate the DMA mappings on the
page_pool page.

I honestly wonder if it wouldn't be better for the recycling to just
make use of the page->lru pointers to keep a list of pages that are
outstanding so that it could release them if it is under DMA pressure.

> diff --git a/net/core/page_pool.c b/net/core/page_pool.c

> index 5e4eb45..95d94a7 100644

> --- a/net/core/page_pool.c

> +++ b/net/core/page_pool.c

> @@ -206,6 +206,49 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)

>         return true;

>  }

>

> +static int page_pool_set_pp_info(struct page_pool *pool,

> +                                struct page *page, gfp_t gfp)

> +{

> +       struct page_pool_info *pp_info;

> +

> +       pp_info = kzalloc_node(sizeof(*pp_info), gfp, pool->p.nid);

> +       if (!pp_info)

> +               return -ENOMEM;

> +

> +       if (pool->p.flags & PP_FLAG_PAGECNT_BIAS) {

> +               page_ref_add(page, USHRT_MAX);

> +               pp_info->pagecnt_bias = USHRT_MAX;

> +       } else {

> +               pp_info->pagecnt_bias = 0;

> +       }

> +

> +       page->pp_magic |= PP_SIGNATURE;

> +       pp_info->pp = pool;

> +       page->pp_info = pp_info;

> +       return 0;

> +}

> +


Having to perform a kzalloc in this path pretty much ruins the whole
point of the page_pool API in my opinion. We would be much better off
having a static structure that is to be maintained somewhere rather
than doing this dynamically as you would just make a memory hog able
to hold that much more memory.

> +static int page_pool_clear_pp_info(struct page *page)

> +{

> +       struct page_pool_info *pp_info = page->pp_info;

> +       int bias;

> +

> +       bias = pp_info->pagecnt_bias;

> +

> +       kfree(pp_info);

> +       page->pp_info = NULL;

> +       page->pp_magic = 0;

> +

> +       return bias;

> +}

> +

> +static void page_pool_clear_and_drain_page(struct page *page)

> +{

> +       int bias = page_pool_clear_pp_info(page);

> +

> +       __page_frag_cache_drain(page, bias + 1);

> +}

> +

>  static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

>                                                  gfp_t gfp)

>  {

> @@ -216,13 +259,16 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

>         if (unlikely(!page))

>                 return NULL;

>

> -       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

> -           unlikely(!page_pool_dma_map(pool, page))) {

> +       if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

>                 put_page(page);

>                 return NULL;

>         }

>

> -       page->pp_magic |= PP_SIGNATURE;

> +       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

> +           unlikely(!page_pool_dma_map(pool, page))) {

> +               page_pool_clear_and_drain_page(page);

> +               return NULL;

> +       }

>

>         /* Track how many pages are held 'in-flight' */

>         pool->pages_state_hold_cnt++;

> @@ -261,12 +307,17 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

>          */

>         for (i = 0; i < nr_pages; i++) {

>                 page = pool->alloc.cache[i];

> +               if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

> +                       put_page(page);

> +                       continue;

> +               }

> +

>                 if ((pp_flags & PP_FLAG_DMA_MAP) &&

>                     unlikely(!page_pool_dma_map(pool, page))) {

> -                       put_page(page);

> +                       page_pool_clear_and_drain_page(page);

>                         continue;

>                 }


This seems backwards to me. I would have the pp_info populated after
you have generated the DMA mapping.

> -               page->pp_magic |= PP_SIGNATURE;

> +

>                 pool->alloc.cache[pool->alloc.count++] = page;

>                 /* Track how many pages are held 'in-flight' */

>                 pool->pages_state_hold_cnt++;

> @@ -284,6 +335,25 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

>         return page;

>  }

>

> +static void page_pool_sub_bias(struct page *page, int nr)

> +{

> +       struct page_pool_info *pp_info = page->pp_info;

> +

> +       /* "pp_info->pagecnt_bias == 0" indicates the PAGECNT_BIAS

> +        * flags is not set.

> +        */

> +       if (!pp_info->pagecnt_bias)

> +               return;

> +

> +       /* Make sure pagecnt_bias > 0 for elevated refcnt case */

> +       if (unlikely(pp_info->pagecnt_bias <= nr)) {

> +               page_ref_add(page, USHRT_MAX);

> +               pp_info->pagecnt_bias += USHRT_MAX;

> +       }

> +

> +       pp_info->pagecnt_bias -= nr;


So we should never have a case where pagecnt_bias is less than the
value we are subtracting. If we have that then it is a bug.

The general idea with the pagecnt_bias is that we want to batch the
release of the page from the device. So the assumption is we are going
to pull multiple references from the page and rather than doing
page_ref_inc repeatedly we want to batch it at the start, and we have
to perform a __page_frag_cache_drain to remove any unused references
when we need to free it.

What we should probably be checking for is "pp_info->pagecnt_bias -
page_count(page) > 1" when we hit the end of the page. If that is true
then we cannot recycle the page and so when we hit PAGE_SIZE for the
offset we have to drop the mapping and free the page subtracting any
remaining pagecnt_bias we are holding. If I recall I actually ran this
the other way and ran toward 0 in my implementation before as that
allows for not having to track via a value and instead simply checking
for a signed result.

> +}

> +

>  /* For using page_pool replace: alloc_pages() API calls, but provide

>   * synchronization guarantee for allocation side.

>   */

> @@ -293,15 +363,66 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)

>

>         /* Fast-path: Get a page from cache */

>         page = __page_pool_get_cached(pool);

> -       if (page)

> +       if (page) {

> +               page_pool_sub_bias(page, 1);

>                 return page;

> +       }


I'm not sure we should be subtracting from the bias here. Ideally if
you are getting a page you are getting the full 4K page. So having a
bias other than PAGE_SIZE - 1 wouldn't make much sense here.

>

>         /* Slow-path: cache empty, do real allocation */

>         page = __page_pool_alloc_pages_slow(pool, gfp);

> +       if (page)

> +               page_pool_sub_bias(page, 1);

> +


Same here. Really in both cases we should be getting initialized
pages, not ones that are already decrementing.

>         return page;

>  }

>  EXPORT_SYMBOL(page_pool_alloc_pages);

>

> +struct page *page_pool_alloc_frag(struct page_pool *pool,

> +                                 unsigned int *offset, gfp_t gfp)

> +{

> +       unsigned int frag_offset = pool->frag_offset;

> +       unsigned int frag_size = pool->p.frag_size;

> +       struct page *frag_page = pool->frag_page;

> +       unsigned int max_len = pool->p.max_len;

> +

> +       if (!frag_page || frag_offset + frag_size > max_len) {


These are two very different cases. If frag_page is set and just out
of space we need to be freeing the unused references.

> +               frag_page = page_pool_alloc_pages(pool, gfp);


So as per my comment above the page should be coming in with a
pagecnt_bias of PAGE_SIZE - 1, and an actual page_ref_count of
PAGE_SIZE.

> +               if (unlikely(!frag_page)) {

> +                       pool->frag_page = NULL;

> +                       return NULL;

> +               }

> +

> +               pool->frag_page = frag_page;

> +               frag_offset = 0;

> +

> +               page_pool_sub_bias(frag_page, max_len / frag_size - 1);


Why are you doing division here? We should just be subtracting 1 from
the pagecnt_bias since that is the number of buffers that are being
used. The general idea is that when pagecnt_bias is 0 we cut the page
loose for potential recycling or freeing, otherwise we just subtract
our new value from pagecnt_bias until we reach it.

> +       }

> +

> +       *offset = frag_offset;

> +       pool->frag_offset = frag_offset + frag_size;

> +

> +       return frag_page;

> +}

> +EXPORT_SYMBOL(page_pool_alloc_frag);

> +

> +static void page_pool_empty_frag(struct page_pool *pool)

> +{

> +       unsigned int frag_offset = pool->frag_offset;

> +       unsigned int frag_size = pool->p.frag_size;

> +       struct page *frag_page = pool->frag_page;

> +       unsigned int max_len = pool->p.max_len;

> +

> +       if (!frag_page)

> +               return;

> +

> +       while (frag_offset + frag_size <= max_len) {

> +               page_pool_put_full_page(pool, frag_page, false);

> +               frag_offset += frag_size;

> +       }

> +

> +       pool->frag_page = NULL;

> +}

> +


It would be good to look over the page_frag_alloc_align and
__page_frag_cache_drain functions for examples of how to do most of
this. The one complication is that we have the dma mappings and
page_pool logic to deal with.

>  /* Calculate distance between two u32 values, valid if distance is below 2^(31)

>   *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution

>   */

> @@ -326,10 +447,11 @@ static s32 page_pool_inflight(struct page_pool *pool)

>   * a regular page (that will eventually be returned to the normal

>   * page-allocator via put_page).

>   */

> -void page_pool_release_page(struct page_pool *pool, struct page *page)

> +static int __page_pool_release_page(struct page_pool *pool,

> +                                   struct page *page)

>  {

>         dma_addr_t dma;

> -       int count;

> +       int bias, count;

>

>         if (!(pool->p.flags & PP_FLAG_DMA_MAP))

>                 /* Always account for inflight pages, even if we didn't

> @@ -345,22 +467,29 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)

>                              DMA_ATTR_SKIP_CPU_SYNC);

>         page_pool_set_dma_addr(page, 0);

>  skip_dma_unmap:

> -       page->pp_magic = 0;

> +       bias = page_pool_clear_pp_info(page);

>

>         /* This may be the last page returned, releasing the pool, so

>          * it is not safe to reference pool afterwards.

>          */

>         count = atomic_inc_return(&pool->pages_state_release_cnt);

>         trace_page_pool_state_release(pool, page, count);

> +       return bias;

> +}

> +

> +void page_pool_release_page(struct page_pool *pool, struct page *page)

> +{

> +       int bias = __page_pool_release_page(pool, page);

> +

> +       WARN_ONCE(bias, "PAGECNT_BIAS is not supposed to be enabled\n");

>  }

>  EXPORT_SYMBOL(page_pool_release_page);

>

>  /* Return a page to the page allocator, cleaning up our state */

>  static void page_pool_return_page(struct page_pool *pool, struct page *page)

>  {

> -       page_pool_release_page(pool, page);

> +       __page_frag_cache_drain(page, __page_pool_release_page(pool, page) + 1);

>

> -       put_page(page);

>         /* An optimization would be to call __free_pages(page, pool->p.order)

>          * knowing page is not part of page-cache (thus avoiding a

>          * __page_cache_release() call).

> @@ -395,7 +524,16 @@ static bool page_pool_recycle_in_cache(struct page *page,

>         return true;

>  }

>

> -/* If the page refcnt == 1, this will try to recycle the page.

> +static bool page_pool_bias_page_recyclable(struct page *page, int bias)

> +{

> +       int ref = page_ref_dec_return(page);

> +

> +       WARN_ON(ref < bias);

> +       return ref == bias + 1;

> +}

> +

> +/* If pagecnt_bias == 0 and the page refcnt == 1, this will try to

> + * recycle the page.

>   * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for

>   * the configured size min(dma_sync_size, pool->max_len).

>   * If the page refcnt != 1, then the page will be returned to memory

> @@ -405,16 +543,35 @@ static __always_inline struct page *

>  __page_pool_put_page(struct page_pool *pool, struct page *page,

>                      unsigned int dma_sync_size, bool allow_direct)

>  {

> -       /* This allocator is optimized for the XDP mode that uses

> +       int bias = page->pp_info->pagecnt_bias;

> +

> +       /* Handle the elevated refcnt case first:

> +        * multi-frames-per-page, it is likely from the skb, which

> +        * is likely called in non-sofrirq context, so do not recycle

> +        * it in pool->alloc.

> +        *

> +        * Then handle non-elevated refcnt case:

>          * one-frame-per-page, but have fallbacks that act like the

>          * regular page allocator APIs.

> -        *

>          * refcnt == 1 means page_pool owns page, and can recycle it.

>          *

>          * page is NOT reusable when allocated when system is under

>          * some pressure. (page_is_pfmemalloc)

>          */

> -       if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {

> +       if (bias) {

> +               /* We have gave some refcnt to the stack, so wait for

> +                * all refcnt of the stack to be decremented before

> +                * enabling recycling.

> +                */

> +               if (!page_pool_bias_page_recyclable(page, bias))

> +                       return NULL;

> +

> +               /* only enable recycling when it is not pfmemalloced */

> +               if (!page_is_pfmemalloc(page))

> +                       return page;

> +


So this would be fine if this was only accessed from the driver. The
problem is the recycling code made it so that this is accessed in the
generic skb freeing path. As such I think this is prone to races since
you have to guarantee the ordering of things between the reference
count and pagecnt_bias.

> +       } else if (likely(page_ref_count(page) == 1 &&

> +                         !page_is_pfmemalloc(page))) {

>                 /* Read barrier done in page_ref_count / READ_ONCE */

>

>                 if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)

> @@ -428,22 +585,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,

>                 /* Page found as candidate for recycling */

>                 return page;

>         }

> -       /* Fallback/non-XDP mode: API user have elevated refcnt.

> -        *

> -        * Many drivers split up the page into fragments, and some

> -        * want to keep doing this to save memory and do refcnt based

> -        * recycling. Support this use case too, to ease drivers

> -        * switching between XDP/non-XDP.

> -        *

> -        * In-case page_pool maintains the DMA mapping, API user must

> -        * call page_pool_put_page once.  In this elevated refcnt

> -        * case, the DMA is unmapped/released, as driver is likely

> -        * doing refcnt based recycle tricks, meaning another process

> -        * will be invoking put_page.

> -        */

> -       /* Do not replace this with page_pool_return_page() */

> +

>         page_pool_release_page(pool, page);

> -       put_page(page);

>

>         return NULL;

>  }

> @@ -452,6 +595,7 @@ void page_pool_put_page(struct page_pool *pool, struct page *page,

>                         unsigned int dma_sync_size, bool allow_direct)

>  {

>         page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);

> +

>         if (page && !page_pool_recycle_in_ring(pool, page)) {

>                 /* Cache full, fallback to free pages */

>                 page_pool_return_page(pool, page);

> @@ -503,8 +647,11 @@ static void page_pool_empty_ring(struct page_pool *pool)

>

>         /* Empty recycle ring */

>         while ((page = ptr_ring_consume_bh(&pool->ring))) {

> -               /* Verify the refcnt invariant of cached pages */

> -               if (!(page_ref_count(page) == 1))

> +               /* Verify the refcnt invariant of cached pages for

> +                * non elevated refcnt case.

> +                */

> +               if (!(pool->p.flags & PP_FLAG_PAGECNT_BIAS) &&

> +                   !(page_ref_count(page) == 1))

>                         pr_crit("%s() page_pool refcnt %d violation\n",

>                                 __func__, page_ref_count(page));

>

> @@ -544,6 +691,7 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)

>

>  static void page_pool_scrub(struct page_pool *pool)

>  {

> +       page_pool_empty_frag(pool);

>         page_pool_empty_alloc_cache_once(pool);

>         pool->destroy_cnt++;

>

> @@ -637,14 +785,13 @@ bool page_pool_return_skb_page(struct page *page)

>         if (unlikely(page->pp_magic != PP_SIGNATURE))

>                 return false;

>

> -       pp = page->pp;

> +       pp = page->pp_info->pp;

>

>         /* Driver set this to memory recycling info. Reset it on recycle.

>          * This will *not* work for NIC using a split-page memory model.

>          * The page will be returned to the pool here regardless of the

>          * 'flipped' fragment being in use or not.

>          */

> -       page->pp = NULL;

>         page_pool_put_full_page(pp, page, false);

>

>         return true;

> --

> 2.7.4

>
Yunsheng Lin July 7, 2021, 3:05 a.m. UTC | #7
On 2021/7/7 4:45, Alexander Duyck wrote:
> On Wed, Jun 30, 2021 at 2:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:

>>

>> Currently page pool only support page recycling only when

>> refcnt of page is one, which means it can not support the

>> split page recycling implemented in the most ethernet driver.

>>

>> So add elevated refcnt support in page pool, and support

>> allocating page frag to enable multi-frames-per-page based

>> on the elevated refcnt support.

>>

>> As the elevated refcnt is per page, and there is no space

>> for that in "struct page" now, so add a dynamically allocated

>> "struct page_pool_info" to record page pool ptr and refcnt

>> corrsponding to a page for now. Later, we can recycle the

>> "struct page_pool_info" too, or use part of page memory to

>> record pp_info.

>>

>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>


Hi, Alexander

Thanks for detailed reviewing.

> 

> So this isn't going to work with the current recycling logic. The

> expectation there is that we can safely unmap the entire page as soon

> as the reference count is greater than 1.


Yes, the expectation is changed: we can always recycle the page
once the last user has dropped the refcnt that was given to it, as
long as the page is not pfmemalloced.

The above expectation is based on the assumption that the last user
will always call page_pool_put_full_page() in order to do the recycling
or do the resource cleanup (DMA unmapping, etc.).

As skb_free_head() and skb_release_data() both check
skb->pp_recycle to call page_pool_put_full_page() if needed, I
think we are safe for most cases. The one case I am not so sure about
is the rx zero copy, which seems to also bump up the refcnt before
mapping the page to user space; we might need to ensure rx zero copy
is not the last user of the page or, if it is the last user, make sure
it calls page_pool_put_full_page() too.


> 

> In addition I think I need to look over that code better as I am

> wondering if there are potential issues assuming a path such as a

> skb_clone followed by pskb_expand_head may lead to memory corruptions

> since the clone will still have pp_recycle set but none of the pages

> will be part of the page pool anymore.


There is still page->pp_magic that decides if the page is from
page_pool or not.

> 

> For us the pagecnt_bias would really represent the number of

> additional mappings beyond the current page that are being held. I

> have already been playing around with something similar. However the

> general idea is that we want to keep track of how many references to

> the page the device is holding onto. When that hits 0 and the actual

> page count is 1 we can refill both, however if we hit 0 and there are

> multiple references to the page still floating around we should just

> unmap the page and turn it over to the stack or free it.


I am not sure I understood the above.

As in the page reusing of the hns3 driver, pagecnt_bias means how many
refcnts the driver is holding, and (page_count(cb->priv) - pagecnt_bias)
means how many refcnts the stack is holding, see [1].

static bool hns3_can_reuse_page(struct hns3_desc_cb *cb)
{
	return (page_count(cb->priv) - cb->pagecnt_bias) == 1;
}

The check in hns3_can_reuse_page() compares
(page_count(cb->priv) - cb->pagecnt_bias) against one instead of zero
because there is a "pagecnt_bias--" before hns3_can_reuse_page() is
called in hns3_nic_reuse_page().

"pagecnt_bias--" means the driver gives one of its refcnts to the
stack; it is the stack's job to release that refcnt when the skb is
passed to the stack.

1. https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L2870

> 

>> ---

>>  drivers/net/ethernet/marvell/mvneta.c           |   6 +-

>>  drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c |   2 +-

>>  include/linux/mm_types.h                        |   2 +-

>>  include/linux/skbuff.h                          |   4 +-

>>  include/net/page_pool.h                         |  30 +++-

>>  net/core/page_pool.c                            | 215 ++++++++++++++++++++----

>>  6 files changed, 207 insertions(+), 52 deletions(-)

>>

>> diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c

>> index 88a7550..5a29af2 100644

>> --- a/drivers/net/ethernet/marvell/mvneta.c

>> +++ b/drivers/net/ethernet/marvell/mvneta.c

>> @@ -2327,7 +2327,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

>>         if (!skb)

>>                 return ERR_PTR(-ENOMEM);

>>

>> -       skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);

>> +       skb_mark_for_recycle(skb);

>>

>>         skb_reserve(skb, xdp->data - xdp->data_hard_start);

>>         skb_put(skb, xdp->data_end - xdp->data);

>> @@ -2339,10 +2339,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

>>                 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,

>>                                 skb_frag_page(frag), skb_frag_off(frag),

>>                                 skb_frag_size(frag), PAGE_SIZE);

>> -               /* We don't need to reset pp_recycle here. It's already set, so

>> -                * just mark fragments for recycling.

>> -                */

>> -               page_pool_store_mem_info(skb_frag_page(frag), pool);

>>         }

>>

>>         return skb;

> 

> So as I mentioned earlier the problem with recycling is that splitting

> up the ownership of the page makes it difficult for us to clean it up.

> Technically speaking if the pages are being allowed to leave while

> holding references to DMA addresses that we cannot revoke then we

> should be holding references to the device.

> 

> That is one of the reasons why the previous code was just clearing the

> mapping as soon as the refcount was greater than 1. However for this

> to work out correctly we would have to track how many DMA mappings we

> have outstanding in addition to the one we are working on currently.


I think page pool has already handled the above case if I understand
correctly, see page_pool_release().

> 

>> diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

>> index 3135220..540e387 100644

>> --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

>> +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

>> @@ -3997,7 +3997,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,

>>                 }

>>

>>                 if (pp)

>> -                       skb_mark_for_recycle(skb, page, pp);

>> +                       skb_mark_for_recycle(skb);

>>                 else

>>                         dma_unmap_single_attrs(dev->dev.parent, dma_addr,

>>                                                bm_pool->buf_size, DMA_FROM_DEVICE,

>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

>> index 862f88a..cf613df 100644

>> --- a/include/linux/mm_types.h

>> +++ b/include/linux/mm_types.h

>> @@ -101,7 +101,7 @@ struct page {

>>                          * page_pool allocated pages.

>>                          */

>>                         unsigned long pp_magic;

>> -                       struct page_pool *pp;

>> +                       struct page_pool_info *pp_info;

>>                         unsigned long _pp_mapping_pad;

>>                         /**

>>                          * @dma_addr: might require a 64-bit value on

> 

> So the problem here is that this is creating a pointer chase, and the

> need to allocate yet another structure to store it is going to be

> expensive.

> 

> As far as storing the pagecnt_bias it might make more sense to

> repurpose the lower 12 bits of the dma address. A DMA mapping should

> be page aligned anyway so the lower 12 bits would be reserved 0. When

> we decrement the value so that the lower 12 bits are 0 we should be

> unmapping the page anyway, or resetting the pagecnt_bias to PAGE_SIZE

> - 1 and adding back the bias to the page to effectively reset it for

> reuse.


Yes, that is a great idea. I like it very much, supposing that batching
the page refcnt updates by "PAGE_SIZE - 1" is enough for performance's sake.

Will take a look about it.

> 

>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

>> index b2db9cd..7795979 100644

>> --- a/include/linux/skbuff.h

>> +++ b/include/linux/skbuff.h

>> @@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)

>>  }

>>

>>  #ifdef CONFIG_PAGE_POOL

>> -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,

>> -                                       struct page_pool *pp)

>> +static inline void skb_mark_for_recycle(struct sk_buff *skb)

>>  {

>>         skb->pp_recycle = 1;

>> -       page_pool_store_mem_info(page, pp);

>>  }

>>  #endif

> 

> I am not a fan of the pp_recycle flag either. We duplicate it via

> skb_clone and from what I can tell if we call pskb_expand_head

> afterwards I don't see how we avoid recycling the page frags twice.


Actually skb->pp_recycle is kind of redundant, as there is
still page->pp_magic to avoid recycling the page frags twice.

The argument for adding skb->pp_recycle in the previous discussion
seems to be that it short-cuts the code path for the non-page_pool
case, see [2].

2. https://lore.kernel.org/linux-mm/074b0d1d-9531-57f3-8e0e-a447387478d1@huawei.com/

> 

>> diff --git a/include/net/page_pool.h b/include/net/page_pool.h

>> index 3dd62dd..44e7545 100644

>> --- a/include/net/page_pool.h

>> +++ b/include/net/page_pool.h

>> @@ -45,7 +45,9 @@

>>                                         * Please note DMA-sync-for-CPU is still

>>                                         * device driver responsibility

>>                                         */

>> -#define PP_FLAG_ALL            (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)

>> +#define PP_FLAG_PAGECNT_BIAS   BIT(2)  /* Enable elevated refcnt */

>> +#define PP_FLAG_ALL            (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV |\

>> +                                PP_FLAG_PAGECNT_BIAS)

>>

>>  /*

> 

> It might be better to just put each flag on a seperate line for

> PP_FLAG_ALL just to make it easier to read due to the wrapping. Either

> that or you could look at converting this over to an enum with a MAX

> value and then define the flags based on those enums, and PP_FLAG_ALL

> being BIT(MAX) - 1.


Will do the wrapping first:)

> 

>>   * Fast allocation side cache array/stack

>> @@ -77,6 +79,7 @@ struct page_pool_params {

>>         enum dma_data_direction dma_dir; /* DMA mapping direction */

>>         unsigned int    max_len; /* max DMA sync memory size */

>>         unsigned int    offset;  /* DMA addr offset */

>> +       unsigned int    frag_size;

>>  };

>>

>>  struct page_pool {

>> @@ -88,6 +91,8 @@ struct page_pool {

>>         unsigned long defer_warn;

>>

>>         u32 pages_state_hold_cnt;

>> +       unsigned int frag_offset;

>> +       struct page *frag_page;

>>

>>         /*

>>          * Data structure for allocation side

>> @@ -128,6 +133,11 @@ struct page_pool {

>>         u64 destroy_cnt;

>>  };

>>

>> +struct page_pool_info {

>> +       struct page_pool *pp;

>> +       int pagecnt_bias;

>> +};

>> +

> 

> Rather than having a top-down structure here it might be better to

> work bottom up. If you assume you are keeping a pagecnt_bias per page

> it might make more sense to store this in the driver somewhere rather

> than having it as a separate allocated buffer. One advantage of the

> Intel drivers was doing this as we had the pagecnt_bias in a structure

> that also pointed to the page. That way we were only updating that

> count if we dropped the page and didn't have to even touch the page.

> You could use that to batch updates to the pagecnt_bias if we did use

> the lower 12 bits of the DMA address to store it as well.


I am not sure I understand what "we dropped the page" means.
The driver does not really need to call page_pool_put_full_page()
if the page of an skb has been passed to the stack; the driver mainly
calls page_pool_put_full_page() when unloading or uninitializing,
for pages that have not been passed to the stack yet.


> I'm assuming the idea with this is that you will be having multiple

> buffers received off of a single page and so doing it that way you

> should only have one update on allocation, maybe a trickle of updates

> for XDP_TX, and another large update when the page is fully consumed

> and you drop the remaining pagecnt_bias for Rx.


I suppose "having multiple buffers received off of a single page" means
using the first half of a page for one desc and the second half of the
same page for another desc, instead of the ping-pong way of reusing
implemented in most drivers currently?

I am not familiar enough with XDP to understand the latter part of the comment either.

> 

>>  struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);

>>

>>  static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

>> @@ -137,6 +147,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

>>         return page_pool_alloc_pages(pool, gfp);

>>  }

>>

>> +struct page *page_pool_alloc_frag(struct page_pool *pool,

>> +                                 unsigned int *offset, gfp_t gfp);

>> +

>> +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,

>> +                                                   unsigned int *offset)

>> +{

>> +       gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);

>> +

>> +       return page_pool_alloc_frag(pool, offset, gfp);

>> +}

>> +

>>  /* get the stored dma direction. A driver might decide to treat this locally and

>>   * avoid the extra cache line from page_pool to determine the direction

>>   */

>> @@ -253,11 +274,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool)

>>                 spin_unlock_bh(&pool->ring.producer_lock);

>>  }

>>

>> -/* Store mem_info on struct page and use it while recycling skb frags */

>> -static inline

>> -void page_pool_store_mem_info(struct page *page, struct page_pool *pp)

>> -{

>> -       page->pp = pp;

>> -}

>> -

>>  #endif /* _NET_PAGE_POOL_H */

> 

> So the issue as I see it with the page_pool recycling patch set is

> that I don't think we had proper guarantees in place that the page->pp

> value was flushed in all cases where skb->dev was changed. Basically

> the logic we need to have in place to address those issues is that

> skb->dev is changed we need to invalidate the DMA mappings on the

> page_pool page.


The DMA mapping invalidation is based on pool->p.dev. Is there
any reason why the DMA mappings need invalidating when skb->dev is
changed? As far as I can tell, the tx path is not aware of page pool, so
when the skb is redirected, the page of the skb is always DMA mapped
according to skb->dev before xmitting.

Or is it about XDP redirect?

Is there something obvious I missed here?

> 

> I honestly wonder if it wouldn't be better for the recycling to just

> make use of the page->lru pointers to keep a list of pages that are

> outstanding so that it could release them if it is under DMA pressure.

> 

>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c

>> index 5e4eb45..95d94a7 100644

>> --- a/net/core/page_pool.c

>> +++ b/net/core/page_pool.c

>> @@ -206,6 +206,49 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)

>>         return true;

>>  }

>>

>> +static int page_pool_set_pp_info(struct page_pool *pool,

>> +                                struct page *page, gfp_t gfp)

>> +{

>> +       struct page_pool_info *pp_info;

>> +

>> +       pp_info = kzalloc_node(sizeof(*pp_info), gfp, pool->p.nid);

>> +       if (!pp_info)

>> +               return -ENOMEM;

>> +

>> +       if (pool->p.flags & PP_FLAG_PAGECNT_BIAS) {

>> +               page_ref_add(page, USHRT_MAX);

>> +               pp_info->pagecnt_bias = USHRT_MAX;

>> +       } else {

>> +               pp_info->pagecnt_bias = 0;

>> +       }

>> +

>> +       page->pp_magic |= PP_SIGNATURE;

>> +       pp_info->pp = pool;

>> +       page->pp_info = pp_info;

>> +       return 0;

>> +}

>> +

> 

> Having to perform a kzalloc in this path pretty much ruins the whole

> point of the page_pool API in my opinion. We would be much better off

> having a static structure that is to be maintained somewhere rather

> than doing this dynamically as you would just make a memory hog able

> to hold that much more memory.


Let's see if repurposing the lower 12 bits of the dma address makes sense?

> 

>> +static int page_pool_clear_pp_info(struct page *page)

>> +{

>> +       struct page_pool_info *pp_info = page->pp_info;

>> +       int bias;

>> +

>> +       bias = pp_info->pagecnt_bias;

>> +

>> +       kfree(pp_info);

>> +       page->pp_info = NULL;

>> +       page->pp_magic = 0;

>> +

>> +       return bias;

>> +}

>> +

>> +static void page_pool_clear_and_drain_page(struct page *page)

>> +{

>> +       int bias = page_pool_clear_pp_info(page);

>> +

>> +       __page_frag_cache_drain(page, bias + 1);

>> +}

>> +

>>  static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

>>                                                  gfp_t gfp)

>>  {

>> @@ -216,13 +259,16 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

>>         if (unlikely(!page))

>>                 return NULL;

>>

>> -       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

>> -           unlikely(!page_pool_dma_map(pool, page))) {

>> +       if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

>>                 put_page(page);

>>                 return NULL;

>>         }

>>

>> -       page->pp_magic |= PP_SIGNATURE;

>> +       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

>> +           unlikely(!page_pool_dma_map(pool, page))) {

>> +               page_pool_clear_and_drain_page(page);

>> +               return NULL;

>> +       }

>>

>>         /* Track how many pages are held 'in-flight' */

>>         pool->pages_state_hold_cnt++;

>> @@ -261,12 +307,17 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

>>          */

>>         for (i = 0; i < nr_pages; i++) {

>>                 page = pool->alloc.cache[i];

>> +               if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

>> +                       put_page(page);

>> +                       continue;

>> +               }

>> +

>>                 if ((pp_flags & PP_FLAG_DMA_MAP) &&

>>                     unlikely(!page_pool_dma_map(pool, page))) {

>> -                       put_page(page);

>> +                       page_pool_clear_and_drain_page(page);

>>                         continue;

>>                 }

> 

> This seems backwards to me. I would have the pp_info populated after

> you have generated the DMA mapping.


Ok.

> 

>> -               page->pp_magic |= PP_SIGNATURE;

>> +

>>                 pool->alloc.cache[pool->alloc.count++] = page;

>>                 /* Track how many pages are held 'in-flight' */

>>                 pool->pages_state_hold_cnt++;

>> @@ -284,6 +335,25 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

>>         return page;

>>  }

>>

>> +static void page_pool_sub_bias(struct page *page, int nr)

>> +{

>> +       struct page_pool_info *pp_info = page->pp_info;

>> +

>> +       /* "pp_info->pagecnt_bias == 0" indicates the PAGECNT_BIAS

>> +        * flags is not set.

>> +        */

>> +       if (!pp_info->pagecnt_bias)

>> +               return;

>> +

>> +       /* Make sure pagecnt_bias > 0 for elevated refcnt case */

>> +       if (unlikely(pp_info->pagecnt_bias <= nr)) {

>> +               page_ref_add(page, USHRT_MAX);

>> +               pp_info->pagecnt_bias += USHRT_MAX;

>> +       }

>> +

>> +       pp_info->pagecnt_bias -= nr;

> 

> So we should never have a case where pagecnt_bias is less than the

> value we are subtracting. If we have that then it is a bug.


Yes.

> 

> The general idea with the pagecnt_bias is that we want to batch the

> release of the page from the device. So the assumption is we are going

> to pull multiple references from the page and rather than doing

> page_ref_inc repeatedly we want to batch it at the start, and we have

> to perform a __page_frag_cache_drain to remove any unused references

> when we need to free it.


Yes, it is about batching the page_ref_inc() operation.

> 

> What we should probably be checking for is "pp_info->pagecnt_bias -

> page_count(page) > 1" when we hit the end of the page. If that is true

> then we cannot recycle the page and so when we hit PAGE_SIZE for the

> offset we have to drop the mapping and free the page subtracting any

> remaining pagecnt_bias we are holding. If I recall I actually ran this

> the other way and ran toward 0 in my implementation before as that

> allows for not having to track via a value and instead simply checking

> for a signed result.



When allocating a page for frags, we have already decided how many users
will be using the page; that is the
"page_pool_sub_bias(frag_page, max_len / frag_size - 1)"
in page_pool_alloc_frag().

So it is up to the driver or stack to call page_pool_put_full_page()
multiple times for the same page.
Otherwise the page pool will call page_pool_put_full_page() in
page_pool_empty_frag() if some of the page frags have not been allocated
to the driver yet.

It seems you are suggesting a slightly different way to do frag reusing.

> 

>> +}

>> +

>>  /* For using page_pool replace: alloc_pages() API calls, but provide

>>   * synchronization guarantee for allocation side.

>>   */

>> @@ -293,15 +363,66 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)

>>

>>         /* Fast-path: Get a page from cache */

>>         page = __page_pool_get_cached(pool);

>> -       if (page)

>> +       if (page) {

>> +               page_pool_sub_bias(page, 1);

>>                 return page;

>> +       }

> 

> I'm not sure we should be subtracting from the bias here. Ideally if

> you are getting a page you are getting the full 4K page. So having a

> bias other than PAGE_SIZE - 1 wouldn't make much sense here.


It seems we have a different understanding of pagecnt_bias here.
As the pagecnt_bias is hidden in the page pool now, the subtraction
here means we give one refcnt to the caller of page_pool_alloc_pages().
And in page_pool_alloc_frag(), we give different parts of the page to the
driver, which means more users too, so there is also a subtraction in
page_pool_alloc_frag().

> 

>>

>>         /* Slow-path: cache empty, do real allocation */

>>         page = __page_pool_alloc_pages_slow(pool, gfp);

>> +       if (page)

>> +               page_pool_sub_bias(page, 1);

>> +

> 

> Same here. Really in both cases we should be getting initialized

> pages, not ones that are already decrementing.

> 

>>         return page;

>>  }

>>  EXPORT_SYMBOL(page_pool_alloc_pages);

>>

>> +struct page *page_pool_alloc_frag(struct page_pool *pool,

>> +                                 unsigned int *offset, gfp_t gfp)

>> +{

>> +       unsigned int frag_offset = pool->frag_offset;

>> +       unsigned int frag_size = pool->p.frag_size;

>> +       struct page *frag_page = pool->frag_page;

>> +       unsigned int max_len = pool->p.max_len;

>> +

>> +       if (!frag_page || frag_offset + frag_size > max_len) {

> 

> These are two very different cases. If frag_page is set and just out

> of space we need to be freeing the unused references.


As mentioned above, we are depending on the last user to do the
recycling or to free the unused references.

> 

>> +               frag_page = page_pool_alloc_pages(pool, gfp);

> 

> So as per my comment above the page should be coming in with a

> pagecnt_bias of PAGE_SIZE - 1, and an actual page_ref_count of

> PAGE_SIZE.


Let's align the understanding of pagecnt_bias first?

pagecnt_bias means how many refcnts of a page belong to the page
pool, and (page_ref_count() - pagecnt_bias) means how many refcnts
of a page belong to users of the page pool.

> 

>> +               if (unlikely(!frag_page)) {

>> +                       pool->frag_page = NULL;

>> +                       return NULL;

>> +               }

>> +

>> +               pool->frag_page = frag_page;

>> +               frag_offset = 0;

>> +

>> +               page_pool_sub_bias(frag_page, max_len / frag_size - 1);

> 

> Why are you doing division here? We should just be subtracting 1 from

> the pagecnt_bias since that is the number of buffers that are being

> used. The general idea is that when pagecnt_bias is 0 we cut the page

> loose for potential recycling or freeing, otherwise we just subtract

> our new value from pagecnt_bias until we reach it.


As mentioned above, the division is used to find out how many users may
be using the page.

> 

>> +       }

>> +

>> +       *offset = frag_offset;

>> +       pool->frag_offset = frag_offset + frag_size;

>> +

>> +       return frag_page;

>> +}

>> +EXPORT_SYMBOL(page_pool_alloc_frag);

>> +

>> +static void page_pool_empty_frag(struct page_pool *pool)

>> +{

>> +       unsigned int frag_offset = pool->frag_offset;

>> +       unsigned int frag_size = pool->p.frag_size;

>> +       struct page *frag_page = pool->frag_page;

>> +       unsigned int max_len = pool->p.max_len;

>> +

>> +       if (!frag_page)

>> +               return;

>> +

>> +       while (frag_offset + frag_size <= max_len) {

>> +               page_pool_put_full_page(pool, frag_page, false);

>> +               frag_offset += frag_size;

>> +       }

>> +

>> +       pool->frag_page = NULL;

>> +}

>> +

> 

> It would be good to look over the page_frag_alloc_align and

> __page_frag_cache_drain functions for examples of how to do most of

> this. The one complication is that we have the dma mappings and

> page_pool logic to deal with.


Is it ok to rely on the user providing an aligned frag_size, so
that we do not need to handle alignment here?

> 

>>  /* Calculate distance between two u32 values, valid if distance is below 2^(31)

>>   *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution

>>   */

>> @@ -326,10 +447,11 @@ static s32 page_pool_inflight(struct page_pool *pool)

>>   * a regular page (that will eventually be returned to the normal

>>   * page-allocator via put_page).

>>   */

>> -void page_pool_release_page(struct page_pool *pool, struct page *page)

>> +static int __page_pool_release_page(struct page_pool *pool,

>> +                                   struct page *page)

>>  {

>>         dma_addr_t dma;

>> -       int count;

>> +       int bias, count;

>>

>>         if (!(pool->p.flags & PP_FLAG_DMA_MAP))

>>                 /* Always account for inflight pages, even if we didn't

>> @@ -345,22 +467,29 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)

>>                              DMA_ATTR_SKIP_CPU_SYNC);

>>         page_pool_set_dma_addr(page, 0);

>>  skip_dma_unmap:

>> -       page->pp_magic = 0;

>> +       bias = page_pool_clear_pp_info(page);

>>

>>         /* This may be the last page returned, releasing the pool, so

>>          * it is not safe to reference pool afterwards.

>>          */

>>         count = atomic_inc_return(&pool->pages_state_release_cnt);

>>         trace_page_pool_state_release(pool, page, count);

>> +       return bias;

>> +}

>> +

>> +void page_pool_release_page(struct page_pool *pool, struct page *page)

>> +{

>> +       int bias = __page_pool_release_page(pool, page);

>> +

>> +       WARN_ONCE(bias, "PAGECNT_BIAS is not supposed to be enabled\n");

>>  }

>>  EXPORT_SYMBOL(page_pool_release_page);

>>

>>  /* Return a page to the page allocator, cleaning up our state */

>>  static void page_pool_return_page(struct page_pool *pool, struct page *page)

>>  {

>> -       page_pool_release_page(pool, page);

>> +       __page_frag_cache_drain(page, __page_pool_release_page(pool, page) + 1);

>>

>> -       put_page(page);

>>         /* An optimization would be to call __free_pages(page, pool->p.order)

>>          * knowing page is not part of page-cache (thus avoiding a

>>          * __page_cache_release() call).

>> @@ -395,7 +524,16 @@ static bool page_pool_recycle_in_cache(struct page *page,

>>         return true;

>>  }

>>

>> -/* If the page refcnt == 1, this will try to recycle the page.

>> +static bool page_pool_bias_page_recyclable(struct page *page, int bias)

>> +{

>> +       int ref = page_ref_dec_return(page);

>> +

>> +       WARN_ON(ref < bias);

>> +       return ref == bias + 1;

>> +}

>> +

>> +/* If pagecnt_bias == 0 and the page refcnt == 1, this will try to

>> + * recycle the page.

>>   * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for

>>   * the configured size min(dma_sync_size, pool->max_len).

>>   * If the page refcnt != 1, then the page will be returned to memory

>> @@ -405,16 +543,35 @@ static __always_inline struct page *

>>  __page_pool_put_page(struct page_pool *pool, struct page *page,

>>                      unsigned int dma_sync_size, bool allow_direct)

>>  {

>> -       /* This allocator is optimized for the XDP mode that uses

>> +       int bias = page->pp_info->pagecnt_bias;

>> +

>> +       /* Handle the elevated refcnt case first:

>> +        * multi-frames-per-page, it is likely from the skb, which

>> +        * is likely called in non-sofrirq context, so do not recycle

>> +        * it in pool->alloc.

>> +        *

>> +        * Then handle non-elevated refcnt case:

>>          * one-frame-per-page, but have fallbacks that act like the

>>          * regular page allocator APIs.

>> -        *

>>          * refcnt == 1 means page_pool owns page, and can recycle it.

>>          *

>>          * page is NOT reusable when allocated when system is under

>>          * some pressure. (page_is_pfmemalloc)

>>          */

>> -       if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {

>> +       if (bias) {

>> +               /* We have gave some refcnt to the stack, so wait for

>> +                * all refcnt of the stack to be decremented before

>> +                * enabling recycling.

>> +                */

>> +               if (!page_pool_bias_page_recyclable(page, bias))

>> +                       return NULL;

>> +

>> +               /* only enable recycling when it is not pfmemalloced */

>> +               if (!page_is_pfmemalloc(page))

>> +                       return page;

>> +

> 

> So this would be fine if this was only accessed from the driver. The

> problem is the recycling code made it so that this is accessed in the

> generic skb freeing path. As such I think this is prone to races since

> you have to guarantee the ordering of things between the reference

> count and pagecnt_bias.


As the reference count is handled atomically in page_pool_bias_page_recyclable(),
and pagecnt_bias is changed before any page is handed to the stack (maybe
some READ_ONCE/WRITE_ONCE or barrier is still needed, will check it again),
I suppose the ordering is correct?

> 

>> +       } else if (likely(page_ref_count(page) == 1 &&

>> +                         !page_is_pfmemalloc(page))) {

>>                 /* Read barrier done in page_ref_count / READ_ONCE */

>>

>>                 if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)

>> @@ -428,22 +585,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,

>>                 /* Page found as candidate for recycling */

>>                 return page;

>>         }

>> -       /* Fallback/non-XDP mode: API user have elevated refcnt.

>> -        *

>> -        * Many drivers split up the page into fragments, and some

>> -        * want to keep doing this to save memory and do refcnt based

>> -        * recycling. Support this use case too, to ease drivers

>> -        * switching between XDP/non-XDP.

>> -        *

>> -        * In-case page_pool maintains the DMA mapping, API user must

>> -        * call page_pool_put_page once.  In this elevated refcnt

>> -        * case, the DMA is unmapped/released, as driver is likely

>> -        * doing refcnt based recycle tricks, meaning another process

>> -        * will be invoking put_page.

>> -        */

>> -       /* Do not replace this with page_pool_return_page() */

>> +

>>         page_pool_release_page(pool, page);

>> -       put_page(page);

>>

>>         return NULL;

>>  }

>> @@ -452,6 +595,7 @@ void page_pool_put_page(struct page_pool *pool, struct page *page,

>>                         unsigned int dma_sync_size, bool allow_direct)

>>  {

>>         page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);

>> +

>>         if (page && !page_pool_recycle_in_ring(pool, page)) {

>>                 /* Cache full, fallback to free pages */

>>                 page_pool_return_page(pool, page);

>> @@ -503,8 +647,11 @@ static void page_pool_empty_ring(struct page_pool *pool)

>>

>>         /* Empty recycle ring */

>>         while ((page = ptr_ring_consume_bh(&pool->ring))) {

>> -               /* Verify the refcnt invariant of cached pages */

>> -               if (!(page_ref_count(page) == 1))

>> +               /* Verify the refcnt invariant of cached pages for

>> +                * non elevated refcnt case.

>> +                */

>> +               if (!(pool->p.flags & PP_FLAG_PAGECNT_BIAS) &&

>> +                   !(page_ref_count(page) == 1))

>>                         pr_crit("%s() page_pool refcnt %d violation\n",

>>                                 __func__, page_ref_count(page));

>>

>> @@ -544,6 +691,7 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)

>>

>>  static void page_pool_scrub(struct page_pool *pool)

>>  {

>> +       page_pool_empty_frag(pool);

>>         page_pool_empty_alloc_cache_once(pool);

>>         pool->destroy_cnt++;

>>

>> @@ -637,14 +785,13 @@ bool page_pool_return_skb_page(struct page *page)

>>         if (unlikely(page->pp_magic != PP_SIGNATURE))

>>                 return false;

>>

>> -       pp = page->pp;

>> +       pp = page->pp_info->pp;

>>

>>         /* Driver set this to memory recycling info. Reset it on recycle.

>>          * This will *not* work for NIC using a split-page memory model.

>>          * The page will be returned to the pool here regardless of the

>>          * 'flipped' fragment being in use or not.

>>          */

>> -       page->pp = NULL;

>>         page_pool_put_full_page(pp, page, false);

>>

>>         return true;

>> --

>> 2.7.4

>>

> .

>
Alexander Duyck July 7, 2021, 3:01 p.m. UTC | #8
On Tue, Jul 6, 2021 at 8:05 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>

> On 2021/7/7 4:45, Alexander Duyck wrote:

> > On Wed, Jun 30, 2021 at 2:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:

> >>

> >> Currently page pool only support page recycling only when

> >> refcnt of page is one, which means it can not support the

> >> split page recycling implemented in the most ethernet driver.

> >>

> >> So add elevated refcnt support in page pool, and support

> >> allocating page frag to enable multi-frames-per-page based

> >> on the elevated refcnt support.

> >>

> >> As the elevated refcnt is per page, and there is no space

> >> for that in "struct page" now, so add a dynamically allocated

> >> "struct page_pool_info" to record page pool ptr and refcnt

> >> corresponding to a page for now. Later, we can recycle the

> >> "struct page_pool_info" too, or use part of page memory to

> >> record pp_info.

> >>

> >> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>

>

> Hi, Alexander

>

> Thanks for detailed reviewing.

>

> >

> > So this isn't going to work with the current recycling logic. The

> > expectation there is that we can safely unmap the entire page as soon

> > as the reference count is greater than 1.

>

> Yes, the expectation is changed to we can always recycle the page

> when the last user has dropped the refcnt that has given to it when

> the page is not pfmemalloced.

>

> The above expectation is based on that the last user will always

> call page_pool_put_full_page() in order to do the recycling or do

> the resource cleanup (DMA unmapping, etc.).

>

> As the skb_free_head() and skb_release_data() have both checked the

> skb->pp_recycle to call the page_pool_put_full_page() if needed, I

> think we are safe for most cases; the one case I am not so sure about

> is the rx zero copy, which seems to also bump up the refcnt before

> mapping the page to user space, we might need to ensure rx zero copy

> is not the last user of the page or if it is the last user, make sure

> it calls page_pool_put_full_page() too.


Yes, but the skb->pp_recycle value is per skb, not per page. So my
concern is that carrying around that value can be problematic as there
are a number of possible cases where the pages might be
unintentionally recycled. All it would take is for a packet to get
cloned a few times and then somebody starts using pskb_expand_head and
you would have multiple cases, possibly simultaneously, of entities
trying to free the page. I just worry it opens us up to a number of
possible races.

> >

> > In addition I think I need to look over that code better as I am

> > wondering if there are potential issues assuming a path such as a

> > skb_clone followed by pskb_expand_head may lead to memory corruptions

> > since the clone will still have pp_recycle set but none of the pages

> > will be part of the page pool anymore.

>

> There is still page->pp_magic that decides if the page is from

> page_pool or not.


The problem with pp_magic is that it doesn't prevent races. The page
pool code was meant to be protected by NAPI to prevent simultaneous
access. With us now allowing the stack to be a part of the handling we
open things up to potential races in the code.

> >

> > For us the pagecnt_bias would really represent the number of

> > additional mappings beyond the current page that are being held. I

> > have already been playing around with something similar. However the

> > general idea is that we want to keep track of how many references to

> > the page the device is holding onto. When that hits 0 and the actual

> > page count is 1 we can refill both, however if we hit 0 and there are

> > multiple references to the page still floating around we should just

> > unmap the page and turn it over to the stack or free it.

>

> I am not sure I understood the above.


As I have already mentioned, the fundamental problem with sharing a
page and using the page pool is that the page pool assumes that it can
unmap if it has a reference count greater than 0. That will no longer
be the case. It has to wait until all of the pagecnt_bias has been
cleared before it can unmap the page. Using get_page/put_page is fine
since it will have no impact on the DMA mappings, but we have to hold
off on calling things like page_pool_put_full_page or update it so
that it will not unmap as long as there is still pagecnt_bias in
place.

> As page reusing in hns3 driver, pagecnt_bias means how many refcnt the

> driver is holding, and (page_count(cb->priv) - pagecnt_bias) means how

> many refcnt the stack is holding, see [1].

>

> static bool hns3_can_reuse_page(struct hns3_desc_cb *cb)

> {

>         return (page_count(cb->priv) - cb->pagecnt_bias) == 1;

> }


So one thing we have to be careful of is letting the page_count hit 0.
My preference is to keep the bias as one less than the total
page_count so that we always have the 1 around. So if pagecnt_bias
hits 0 and we have a page_count of 1 it means that the current thread
owns the only reference to the page.

> checking (page_count(cb->priv) - cb->pagecnt_bias) against one instead

> of zero is in hns3_can_reuse_page because there is "pagecnt_bias--"

> before checking hns3_can_reuse_page() in hns3_nic_reuse_page().

>

> "pagecnt_bias--" means the driver gives the one of its refcnt to the

> stack, it is the stack's job to release the refcnt when the skb is passed

> to the stack.

>

> 1. https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L2870


It is mostly just a matter of preference. As long as the difference is
a predictable value it can be worked with.

> >

> >> ---

> >>  drivers/net/ethernet/marvell/mvneta.c           |   6 +-

> >>  drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c |   2 +-

> >>  include/linux/mm_types.h                        |   2 +-

> >>  include/linux/skbuff.h                          |   4 +-

> >>  include/net/page_pool.h                         |  30 +++-

> >>  net/core/page_pool.c                            | 215 ++++++++++++++++++++----

> >>  6 files changed, 207 insertions(+), 52 deletions(-)

> >>

> >> diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c

> >> index 88a7550..5a29af2 100644

> >> --- a/drivers/net/ethernet/marvell/mvneta.c

> >> +++ b/drivers/net/ethernet/marvell/mvneta.c

> >> @@ -2327,7 +2327,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

> >>         if (!skb)

> >>                 return ERR_PTR(-ENOMEM);

> >>

> >> -       skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);

> >> +       skb_mark_for_recycle(skb);

> >>

> >>         skb_reserve(skb, xdp->data - xdp->data_hard_start);

> >>         skb_put(skb, xdp->data_end - xdp->data);

> >> @@ -2339,10 +2339,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

> >>                 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,

> >>                                 skb_frag_page(frag), skb_frag_off(frag),

> >>                                 skb_frag_size(frag), PAGE_SIZE);

> >> -               /* We don't need to reset pp_recycle here. It's already set, so

> >> -                * just mark fragments for recycling.

> >> -                */

> >> -               page_pool_store_mem_info(skb_frag_page(frag), pool);

> >>         }

> >>

> >>         return skb;

> >

> > So as I mentioned earlier the problem with recycling is that splitting

> > up the ownership of the page makes it difficult for us to clean it up.

> > Technically speaking if the pages are being allowed to leave while

> > holding references to DMA addresses that we cannot revoke then we

> > should be holding references to the device.

> >

> > That is one of the reasons why the previous code was just clearing the

> > mapping as soon as the refcount was greater than 1. However for this

> > to work out correctly we would have to track how many DMA mappings we

> > have outstanding in addition to the one we are working on currently.

>

> I think page pool has already handled the above case if I understand

> correctly, see page_pool_release().


The problem is pagecnt_bias is not multi-thread safe. You are just
accessing an int which is prone to races. In order to fix it you would
need to add either an atomic count or locks around the access of it
which would pretty much negate the point of it.

Really in terms of the page pool recycling code I think it would have
made more sense to add the page pool release logic as an skb
destructor rather than trying to embed the page pool into the page
itself. At least with that if the device is going to go out of scope
by being orphaned or the like we could unmap the page and avoid
potential races.

> >

> >> diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

> >> index 3135220..540e387 100644

> >> --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

> >> +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

> >> @@ -3997,7 +3997,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,

> >>                 }

> >>

> >>                 if (pp)

> >> -                       skb_mark_for_recycle(skb, page, pp);

> >> +                       skb_mark_for_recycle(skb);

> >>                 else

> >>                         dma_unmap_single_attrs(dev->dev.parent, dma_addr,

> >>                                                bm_pool->buf_size, DMA_FROM_DEVICE,

> >> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

> >> index 862f88a..cf613df 100644

> >> --- a/include/linux/mm_types.h

> >> +++ b/include/linux/mm_types.h

> >> @@ -101,7 +101,7 @@ struct page {

> >>                          * page_pool allocated pages.

> >>                          */

> >>                         unsigned long pp_magic;

> >> -                       struct page_pool *pp;

> >> +                       struct page_pool_info *pp_info;

> >>                         unsigned long _pp_mapping_pad;

> >>                         /**

> >>                          * @dma_addr: might require a 64-bit value on

> >

> > So the problem here is that this is creating a pointer chase, and the

> > need to allocate yet another structure to store it is going to be

> > expensive.

> >

> > As far as storing the pagecnt_bias it might make more sense to

> > repurpose the lower 12 bits of the dma address. A DMA mapping should

> > be page aligned anyway so the lower 12 bits would be reserved 0. When

> > we decrement the value so that the lower 12 bits are 0 we should be

> > unmapping the page anyway, or resetting the pagecnt_bias to PAGE_SIZE

> > - 1 and adding back the bias to the page to effectively reset it for

> > reuse.

>

> Yes, that is a great idea. I like it very much supposing page refcnt

> updating batching for 'PAGE_SIZE - 1" is enough for performance sake.

>

> Will take a look about it.

>

> >

> >> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

> >> index b2db9cd..7795979 100644

> >> --- a/include/linux/skbuff.h

> >> +++ b/include/linux/skbuff.h

> >> @@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)

> >>  }

> >>

> >>  #ifdef CONFIG_PAGE_POOL

> >> -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,

> >> -                                       struct page_pool *pp)

> >> +static inline void skb_mark_for_recycle(struct sk_buff *skb)

> >>  {

> >>         skb->pp_recycle = 1;

> >> -       page_pool_store_mem_info(page, pp);

> >>  }

> >>  #endif

> >

> > I am not a fan of the pp_recycle flag either. We duplicate it via

> > skb_clone and from what I can tell if we call pskb_expand_head

> > afterwards I don't see how we avoid recycling the page frags twice.

>

> Actually skb->pp_recycle is kind of duplicated, as there is

> still page->pp_magic to avoid recycling the page frags twice.

>

> The argument above adding skb->pp_recycle seems to be short

> cut code path for non-page_pool case in the previous discussion,

> see [2].

>

> 2. https://lore.kernel.org/linux-mm/074b0d1d-9531-57f3-8e0e-a447387478d1@huawei.com/


Yes, but that doesn't guarantee atomic protections so you still have
race conditions possible. All it takes is something stalling during
the dma_unmap call. Worse yet from what I can tell it looks like you
clear page->pp before you clear page->pp_magic so you have the
potential for a NULL pointer issue since it is cleared before the
pp_magic value is.

> >

> >> diff --git a/include/net/page_pool.h b/include/net/page_pool.h

> >> index 3dd62dd..44e7545 100644

> >> --- a/include/net/page_pool.h

> >> +++ b/include/net/page_pool.h

> >> @@ -45,7 +45,9 @@

> >>                                         * Please note DMA-sync-for-CPU is still

> >>                                         * device driver responsibility

> >>                                         */

> >> -#define PP_FLAG_ALL            (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)

> >> +#define PP_FLAG_PAGECNT_BIAS   BIT(2)  /* Enable elevated refcnt */

> >> +#define PP_FLAG_ALL            (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV |\

> >> +                                PP_FLAG_PAGECNT_BIAS)

> >>

> >>  /*

> >

> > It might be better to just put each flag on a separate line for

> > PP_FLAG_ALL just to make it easier to read due to the wrapping. Either

> > that or you could look at converting this over to an enum with a MAX

> > value and then define the flags based on those enums, and PP_FLAG_ALL

> > being BIT(MAX) - 1.

>

> Will do the wrapping first:)


Sounds good.

> >

> >>   * Fast allocation side cache array/stack

> >> @@ -77,6 +79,7 @@ struct page_pool_params {

> >>         enum dma_data_direction dma_dir; /* DMA mapping direction */

> >>         unsigned int    max_len; /* max DMA sync memory size */

> >>         unsigned int    offset;  /* DMA addr offset */

> >> +       unsigned int    frag_size;

> >>  };

> >>

> >>  struct page_pool {

> >> @@ -88,6 +91,8 @@ struct page_pool {

> >>         unsigned long defer_warn;

> >>

> >>         u32 pages_state_hold_cnt;

> >> +       unsigned int frag_offset;

> >> +       struct page *frag_page;

> >>

> >>         /*

> >>          * Data structure for allocation side

> >> @@ -128,6 +133,11 @@ struct page_pool {

> >>         u64 destroy_cnt;

> >>  };

> >>

> >> +struct page_pool_info {

> >> +       struct page_pool *pp;

> >> +       int pagecnt_bias;

> >> +};

> >> +

> >

> > Rather than having a top-down structure here it might be better to

> > work bottom up. If you assume you are keeping a pagecnt_bias per page

> > it might make more sense to store this in the driver somewhere rather

> > than having it as a separate allocated buffer. One advantage of the

> > Intel drivers was doing this as we had the pagecnt_bias in a structure

> > that also pointed to the page. That way we were only updating that

> > count if we dropped the page and didn't have to even touch the page.

> > You could use that to batch updates to the pagecnt_bias if we did use

> > the lower 12 bits of the DMA address to store it as well.

>

> I am not sure I understood what "we dropped the page" meant.


For XDP_DROP if we are dropping the buffer we are dropping the page
which in our case means we just need to increment the pagecnt_bias
indicating we are putting it back and don't have to do anything with
the actual page refcount or struct.

> The driver does not really need to call page_pool_put_full_page()

> if the page of a skb is passed to stack, the driver mainly call

> page_pool_put_full_page() when unloading or uniniting when the page

> is not passed to stack yet.


I was thinking mostly of something like XDP_TX cases when combined
with the pagecnt_bias. You will need to have something to return the
page to the pool after the XDP_TX is completed.

> > I'm assuming the idea with this is that you will be having multiple

> > buffers received off of a single page and so doing it that way you

> > should only have one update on allocation, maybe a trickle of updates

> > for XDP_TX, and another large update when the page is fully consumed

> > and you drop the remaining pagecnt_bias for Rx.

>

> I suppose "having multiple buffers received off of a single page" mean:

> use first half of a page for a desc, and the second half of the same page

> for another desc, instead of the ping-pong way of reusing implemented in most

> driver currently?

>

> I am not so familiar with XDP to understand the latter part of comment too.


The alloc_frag logic below is an example of what I am talking about.
Basically taking a page and chopping it up into multiple pieces for
use as multiple receives instead of just one receive.

> >

> >>  struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);

> >>

> >>  static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

> >> @@ -137,6 +147,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

> >>         return page_pool_alloc_pages(pool, gfp);

> >>  }

> >>

> >> +struct page *page_pool_alloc_frag(struct page_pool *pool,

> >> +                                 unsigned int *offset, gfp_t gfp);

> >> +

> >> +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,

> >> +                                                   unsigned int *offset)

> >> +{

> >> +       gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);

> >> +

> >> +       return page_pool_alloc_frag(pool, offset, gfp);

> >> +}

> >> +

> >>  /* get the stored dma direction. A driver might decide to treat this locally and

> >>   * avoid the extra cache line from page_pool to determine the direction

> >>   */

> >> @@ -253,11 +274,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool)

> >>                 spin_unlock_bh(&pool->ring.producer_lock);

> >>  }

> >>

> >> -/* Store mem_info on struct page and use it while recycling skb frags */

> >> -static inline

> >> -void page_pool_store_mem_info(struct page *page, struct page_pool *pp)

> >> -{

> >> -       page->pp = pp;

> >> -}

> >> -

> >>  #endif /* _NET_PAGE_POOL_H */

> >

> > So the issue as I see it with the page_pool recycling patch set is

> > that I don't think we had proper guarantees in place that the page->pp

> > value was flushed in all cases where skb->dev was changed. Basically

> > the logic we need to have in place to address those issues is that

> > when skb->dev is changed we need to invalidate the DMA mappings on the

> > page_pool page.

>

> The DMA mappings invalidating is based on the pool->p.dev, is there

> any reason why the DMA mappings need invalidating when skb->dev is

> changed? As far as I can tell, the tx is not aware of page pool, so

> when the skb is redirected, the page of the skb is always DMA mapped

> according to skb->dev before xmitting.

>

> Or it is about XDP redirected?

>

> Is there something obvious I missed here?


It is about unmapping the page. In order to do so we have to maintain
a pointer to the original DMA device. The page pool is doing that for
us currently.

Most netdevs have a parent device that is used for DMA mapping.
Therefore if skb->dev is valid, then the parent device is still valid
since destroying the parent would destroy the children. If the
skb->dev is dropped or changed, then we cannot guarantee the parent
device is still present. So generally if skb->dev cannot be maintained
then we probably shouldn't be maintaining the DMA mapping or page->pp
across that boundary either.

> >

> > I honestly wonder if it wouldn't be better for the recycling to just

> > make use of the page->lru pointers to keep a list of pages that are

> > outstanding so that it could release them if it is under DMA pressure.

> >

> >> diff --git a/net/core/page_pool.c b/net/core/page_pool.c

> >> index 5e4eb45..95d94a7 100644

> >> --- a/net/core/page_pool.c

> >> +++ b/net/core/page_pool.c

> >> @@ -206,6 +206,49 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)

> >>         return true;

> >>  }

> >>

> >> +static int page_pool_set_pp_info(struct page_pool *pool,

> >> +                                struct page *page, gfp_t gfp)

> >> +{

> >> +       struct page_pool_info *pp_info;

> >> +

> >> +       pp_info = kzalloc_node(sizeof(*pp_info), gfp, pool->p.nid);

> >> +       if (!pp_info)

> >> +               return -ENOMEM;

> >> +

> >> +       if (pool->p.flags & PP_FLAG_PAGECNT_BIAS) {

> >> +               page_ref_add(page, USHRT_MAX);

> >> +               pp_info->pagecnt_bias = USHRT_MAX;

> >> +       } else {

> >> +               pp_info->pagecnt_bias = 0;

> >> +       }

> >> +

> >> +       page->pp_magic |= PP_SIGNATURE;

> >> +       pp_info->pp = pool;

> >> +       page->pp_info = pp_info;

> >> +       return 0;

> >> +}

> >> +

> >

> > Having to perform a kzalloc in this path pretty much ruins the whole

> > point of the page_pool API in my opinion. We would be much better off

> > having a static structure that is to be maintained somewhere rather

> > than doing this dynamically as you would just make a memory hog able

> > to hold that much more memory.

>

> Let's see if repurposing the lower 12 bits of the dma address make sense?


Sounds good.

> >

> >> +static int page_pool_clear_pp_info(struct page *page)

> >> +{

> >> +       struct page_pool_info *pp_info = page->pp_info;

> >> +       int bias;

> >> +

> >> +       bias = pp_info->pagecnt_bias;

> >> +

> >> +       kfree(pp_info);

> >> +       page->pp_info = NULL;

> >> +       page->pp_magic = 0;

> >> +

> >> +       return bias;

> >> +}

> >> +

> >> +static void page_pool_clear_and_drain_page(struct page *page)

> >> +{

> >> +       int bias = page_pool_clear_pp_info(page);

> >> +

> >> +       __page_frag_cache_drain(page, bias + 1);

> >> +}

> >> +

> >>  static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

> >>                                                  gfp_t gfp)

> >>  {

> >> @@ -216,13 +259,16 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

> >>         if (unlikely(!page))

> >>                 return NULL;

> >>

> >> -       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

> >> -           unlikely(!page_pool_dma_map(pool, page))) {

> >> +       if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

> >>                 put_page(page);

> >>                 return NULL;

> >>         }

> >>

> >> -       page->pp_magic |= PP_SIGNATURE;

> >> +       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

> >> +           unlikely(!page_pool_dma_map(pool, page))) {

> >> +               page_pool_clear_and_drain_page(page);

> >> +               return NULL;

> >> +       }

> >>

> >>         /* Track how many pages are held 'in-flight' */

> >>         pool->pages_state_hold_cnt++;

> >> @@ -261,12 +307,17 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

> >>          */

> >>         for (i = 0; i < nr_pages; i++) {

> >>                 page = pool->alloc.cache[i];

> >> +               if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

> >> +                       put_page(page);

> >> +                       continue;

> >> +               }

> >> +

> >>                 if ((pp_flags & PP_FLAG_DMA_MAP) &&

> >>                     unlikely(!page_pool_dma_map(pool, page))) {

> >> -                       put_page(page);

> >> +                       page_pool_clear_and_drain_page(page);

> >>                         continue;

> >>                 }

> >

> > This seems backwards to me. I would have the pp_info populated after

> > you have generated the DMA mapping.

>

> Ok.

>

> >

> >> -               page->pp_magic |= PP_SIGNATURE;

> >> +

> >>                 pool->alloc.cache[pool->alloc.count++] = page;

> >>                 /* Track how many pages are held 'in-flight' */

> >>                 pool->pages_state_hold_cnt++;

> >> @@ -284,6 +335,25 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

> >>         return page;

> >>  }

> >>

> >> +static void page_pool_sub_bias(struct page *page, int nr)

> >> +{

> >> +       struct page_pool_info *pp_info = page->pp_info;

> >> +

> >> +       /* "pp_info->pagecnt_bias == 0" indicates the PAGECNT_BIAS

> >> +        * flags is not set.

> >> +        */

> >> +       if (!pp_info->pagecnt_bias)

> >> +               return;

> >> +

> >> +       /* Make sure pagecnt_bias > 0 for elevated refcnt case */

> >> +       if (unlikely(pp_info->pagecnt_bias <= nr)) {

> >> +               page_ref_add(page, USHRT_MAX);

> >> +               pp_info->pagecnt_bias += USHRT_MAX;

> >> +       }

> >> +

> >> +       pp_info->pagecnt_bias -= nr;

> >

> > So we should never have a case where pagecnt_bias is less than the

> > value we are subtracting. If we have that then it is a bug.

>

> Yes.


Sorry, I was referring to the code above comparing pagecnt_bias to nr.
At most nr should only ever be equal to pagecnt_bias, you should hold
off on recharging pagecnt_bias until you can verify the page_count
indicates we are the only holder of the page. Then we can recharge it
and reset any offsets.

> >

> > The general idea with the pagecnt_bias is that we want to batch the

> > release of the page from the device. So the assumption is we are going

> > to pull multiple references from the page and rather than doing

> > page_ref_inc repeatedly we want to batch it at the start, and we have

> > to perform a __page_frag_cache_drain to remove any unused references

> > when we need to free it.

>

> Yes, it is about batching the page_ref_inc() operation.

>

> >

> > What we should probably be checking for is "pp_info->pagecnt_bias -

> > page_count(page) > 1" when we hit the end of the page. If that is true

> > then we cannot recycle the page and so when we hit PAGE_SIZE for the

> > offset we have to drop the mapping and free the page subtracting any

> > remaining pagecnt_bias we are holding. If I recall I actually ran this

> > the other way and ran toward 0 in my implementation before as that

> > allows for not having to track via a value and instead simply checking

> > for a signed result.

>

>

> When allocating a page for frag, we have decided how many users are using

> the page, that is the "page_pool_sub_bias(frag_page, max_len / frag_size - 1)"

> in page_pool_alloc_frag().

>

> so it is up to the driver or stack to do multi page_pool_put_full_page()

> calling for the same page.


So that is one spot that I think is an issue. We normally only want
this called once per page and ideally after pagecnt_bias is 0. One
issue is that pagecnt_bias is non-atomic so we should really be
restricting this to just the driver calling it in softirq context.

> Or the page pool will call page_pool_put_full_page() in page_pool_empty_frag()

> if some of the page frag is not allocated to the driver yet.

>

> It seems you are suggesting a slightly different way to do frag reusing.


As I mentioned I am not a fan of the current recycling scheme. There
are too many openings for it to end up unmapping the same page
multiple times or other possible issues.

In my mind the driver or page_pool should own the page and just keep
it on a list to either be freed or recycled with the skb destructor
being used to trigger the recycling.

> >

> >> +}

> >> +

> >>  /* For using page_pool replace: alloc_pages() API calls, but provide

> >>   * synchronization guarantee for allocation side.

> >>   */

> >> @@ -293,15 +363,66 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)

> >>

> >>         /* Fast-path: Get a page from cache */

> >>         page = __page_pool_get_cached(pool);

> >> -       if (page)

> >> +       if (page) {

> >> +               page_pool_sub_bias(page, 1);

> >>                 return page;

> >> +       }

> >

> > I'm not sure we should be subtracting from the bias here. Ideally if

> > you are getting a page you are getting the full 4K page. So having a

> > bias other than PAGE_SIZE - 1 wouldn't make much sense here.

>

> It seems we have different understanding about pagecnt_bias here,

> as the pagecnt_bias is hidden in the page pool now, the subtracting

> here mean we give one refcnt to the caller of page_pool_alloc_pages(),

> And in page_pool_alloc_frag(), we give different part of page to the

> driver, so it means more user too, so there is also subtracting in the

> page_pool_alloc_frag() too.


I see what you are getting at, however I think it depends on your use
case. In my mind since you are allocating the full page you should
have the full count available to you. I don't believe pagecnt_bias is
something that should be looked at outside of the driver, or at least
outside of the napi context of the device softirq.

So really in order for this to work correctly you would need to have
some minimum amount of bias reserved for the device to access if you
are going to break up the page into n usable buffers.

> >

> >>

> >>         /* Slow-path: cache empty, do real allocation */

> >>         page = __page_pool_alloc_pages_slow(pool, gfp);

> >> +       if (page)

> >> +               page_pool_sub_bias(page, 1);

> >> +

> >

> > Same here. Really in both cases we should be getting initialized

> > pages, not ones that are already decrementing.

> >

> >>         return page;

> >>  }

> >>  EXPORT_SYMBOL(page_pool_alloc_pages);

> >>

> >> +struct page *page_pool_alloc_frag(struct page_pool *pool,

> >> +                                 unsigned int *offset, gfp_t gfp)

> >> +{

> >> +       unsigned int frag_offset = pool->frag_offset;

> >> +       unsigned int frag_size = pool->p.frag_size;

> >> +       struct page *frag_page = pool->frag_page;

> >> +       unsigned int max_len = pool->p.max_len;

> >> +

> >> +       if (!frag_page || frag_offset + frag_size > max_len) {

> >

> > These are two very different cases. If frag_page is set and just out

> > of space we need to be freeing the unused references.

>

> As mentioned above, we are depending on the last user to do the

> recycling or freeing the unused references.


But you are holding the pagecnt_bias for it aren't you? If so you need
to release it so that the last user knows that they were the last
user.

Once you aren't using the page you need to release the pagecnt_bias
since the page is on the path to being freed.

> >

> >> +               frag_page = page_pool_alloc_pages(pool, gfp);

> >

> > So as per my comment above the page should be coming in with a

> > pagecnt_bias of PAGE_SIZE - 1, and an actual page_ref_count of

> > PAGE_SIZE.

>

> Let's align the understanding of pagecnt_bias first?

>

> pagecnt_bias meant how many refcnt of a page belong to the page

> pool, and (page_ref_count() - pagecnt_bias) means how many refcnt

> of a page belong to user of the page pool.


So my view is a slight variation on that. I view pagecnt_bias as the
count of references reserved by the page_pool, and page_ref_count -
pagecnt_bias is the actual reference count. So if I am going to free a
page I should deduct pagecnt_bias + 1 from the reference count to
account for dropping our bias and the one for the fact that we own the
page.

> >

> >> +               if (unlikely(!frag_page)) {

> >> +                       pool->frag_page = NULL;

> >> +                       return NULL;

> >> +               }

> >> +

> >> +               pool->frag_page = frag_page;

> >> +               frag_offset = 0;

> >> +

> >> +               page_pool_sub_bias(frag_page, max_len / frag_size - 1);

> >

> > Why are you doing division here? We should just be subtracting 1 from

> > the pagecnt_bias since that is the number of buffers that are being

> > used. The general idea is that when pagecnt_bias is 0 we cut the page

> > loose for potential recycling or freeing, otherwise we just subtract

> > our new value from pagecnt_bias until we reach it.

>

> As mentioned above, division is used to find out how many user may be

> using the page.


That doesn't make any sense to me because it won't tell you the actual
users, and from what I can tell it is buggy since if I use this to
allocate a chunk larger than 2K this comes out to 0 doesn't it? It
seems like you should just always use 1 as the count.

> >

> >> +       }

> >> +

> >> +       *offset = frag_offset;

> >> +       pool->frag_offset = frag_offset + frag_size;

> >> +

> >> +       return frag_page;

> >> +}

> >> +EXPORT_SYMBOL(page_pool_alloc_frag);

> >> +

> >> +static void page_pool_empty_frag(struct page_pool *pool)

> >> +{

> >> +       unsigned int frag_offset = pool->frag_offset;

> >> +       unsigned int frag_size = pool->p.frag_size;

> >> +       struct page *frag_page = pool->frag_page;

> >> +       unsigned int max_len = pool->p.max_len;

> >> +

> >> +       if (!frag_page)

> >> +               return;

> >> +

> >> +       while (frag_offset + frag_size <= max_len) {

> >> +               page_pool_put_full_page(pool, frag_page, false);

> >> +               frag_offset += frag_size;

> >> +       }

> >> +

> >> +       pool->frag_page = NULL;

> >> +}

> >> +

> >

> > It would be good to look over the page_frag_alloc_align and

> > __page_frag_cache_drain functions for examples of how to do most of

> > this. The one complication is that we have the dma mappings and

> > page_pool logic to deal with.

>

> Is it ok to rely on the user providing an aligned frag_size, so

> that we do not need to handle it here?


It is probably fine since the page pool should only have one consumer
so the requests just need to be aligned by them.

> >

> >>  /* Calculate distance between two u32 values, valid if distance is below 2^(31)

> >>   *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution

> >>   */

> >> @@ -326,10 +447,11 @@ static s32 page_pool_inflight(struct page_pool *pool)

> >>   * a regular page (that will eventually be returned to the normal

> >>   * page-allocator via put_page).

> >>   */

> >> -void page_pool_release_page(struct page_pool *pool, struct page *page)

> >> +static int __page_pool_release_page(struct page_pool *pool,

> >> +                                   struct page *page)

> >>  {

> >>         dma_addr_t dma;

> >> -       int count;

> >> +       int bias, count;

> >>

> >>         if (!(pool->p.flags & PP_FLAG_DMA_MAP))

> >>                 /* Always account for inflight pages, even if we didn't

> >> @@ -345,22 +467,29 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)

> >>                              DMA_ATTR_SKIP_CPU_SYNC);

> >>         page_pool_set_dma_addr(page, 0);

> >>  skip_dma_unmap:

> >> -       page->pp_magic = 0;

> >> +       bias = page_pool_clear_pp_info(page);

> >>

> >>         /* This may be the last page returned, releasing the pool, so

> >>          * it is not safe to reference pool afterwards.

> >>          */

> >>         count = atomic_inc_return(&pool->pages_state_release_cnt);

> >>         trace_page_pool_state_release(pool, page, count);

> >> +       return bias;

> >> +}

> >> +

> >> +void page_pool_release_page(struct page_pool *pool, struct page *page)

> >> +{

> >> +       int bias = __page_pool_release_page(pool, page);

> >> +

> >> +       WARN_ONCE(bias, "PAGECNT_BIAS is not supposed to be enabled\n");

> >>  }

> >>  EXPORT_SYMBOL(page_pool_release_page);

> >>

> >>  /* Return a page to the page allocator, cleaning up our state */

> >>  static void page_pool_return_page(struct page_pool *pool, struct page *page)

> >>  {

> >> -       page_pool_release_page(pool, page);

> >> +       __page_frag_cache_drain(page, __page_pool_release_page(pool, page) + 1);

> >>

> >> -       put_page(page);

> >>         /* An optimization would be to call __free_pages(page, pool->p.order)

> >>          * knowing page is not part of page-cache (thus avoiding a

> >>          * __page_cache_release() call).

> >> @@ -395,7 +524,16 @@ static bool page_pool_recycle_in_cache(struct page *page,

> >>         return true;

> >>  }

> >>

> >> -/* If the page refcnt == 1, this will try to recycle the page.

> >> +static bool page_pool_bias_page_recyclable(struct page *page, int bias)

> >> +{

> >> +       int ref = page_ref_dec_return(page);

> >> +

> >> +       WARN_ON(ref < bias);

> >> +       return ref == bias + 1;

> >> +}

> >> +

> >> +/* If pagecnt_bias == 0 and the page refcnt == 1, this will try to

> >> + * recycle the page.

> >>   * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for

> >>   * the configured size min(dma_sync_size, pool->max_len).

> >>   * If the page refcnt != 1, then the page will be returned to memory

> >> @@ -405,16 +543,35 @@ static __always_inline struct page *

> >>  __page_pool_put_page(struct page_pool *pool, struct page *page,

> >>                      unsigned int dma_sync_size, bool allow_direct)

> >>  {

> >> -       /* This allocator is optimized for the XDP mode that uses

> >> +       int bias = page->pp_info->pagecnt_bias;

> >> +

> >> +       /* Handle the elevated refcnt case first:

> >> +        * multi-frames-per-page, it is likely from the skb, which

> >> +        * is likely called in non-softirq context, so do not recycle

> >> +        * it in pool->alloc.

> >> +        *

> >> +        * Then handle non-elevated refcnt case:

> >>          * one-frame-per-page, but have fallbacks that act like the

> >>          * regular page allocator APIs.

> >> -        *

> >>          * refcnt == 1 means page_pool owns page, and can recycle it.

> >>          *

> >>          * page is NOT reusable when allocated when system is under

> >>          * some pressure. (page_is_pfmemalloc)

> >>          */

> >> -       if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {

> >> +       if (bias) {

> >> +               /* We have given some refcnt to the stack, so wait for

> >> +                * all refcnt of the stack to be decremented before

> >> +                * enabling recycling.

> >> +                */

> >> +               if (!page_pool_bias_page_recyclable(page, bias))

> >> +                       return NULL;

> >> +

> >> +               /* only enable recycling when it is not pfmemalloced */

> >> +               if (!page_is_pfmemalloc(page))

> >> +                       return page;

> >> +

> >

> > So this would be fine if this was only accessed from the driver. The

> > problem is the recycling code made it so that this is accessed in the

> > generic skb freeing path. As such I think this is prone to races since

> > you have to guarantee the ordering of things between the reference

> > count and pagecnt_bias.

>

> As reference count is handled atomically in page_pool_bias_page_recyclable,

> and pagecnt_bias is changed before any page is handed to the stack (maybe

> some READ_ONCE/WRITE_ONCE or barrier is still needed, will check it again),

> so I suppose the ordering is correct?


The problem is in order to get this working correctly you would likely
need to add a number of barriers so that reads and writes are in a
specific order. You would be much better off just not
reading/modifying the pagecnt_bias outside of the softirq paths.

> >

> >> +       } else if (likely(page_ref_count(page) == 1 &&

> >> +                         !page_is_pfmemalloc(page))) {

> >>                 /* Read barrier done in page_ref_count / READ_ONCE */

> >>

> >>                 if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)

> >> @@ -428,22 +585,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,

> >>                 /* Page found as candidate for recycling */

> >>                 return page;

> >>         }

> >> -       /* Fallback/non-XDP mode: API user have elevated refcnt.

> >> -        *

> >> -        * Many drivers split up the page into fragments, and some

> >> -        * want to keep doing this to save memory and do refcnt based

> >> -        * recycling. Support this use case too, to ease drivers

> >> -        * switching between XDP/non-XDP.

> >> -        *

> >> -        * In-case page_pool maintains the DMA mapping, API user must

> >> -        * call page_pool_put_page once.  In this elevated refcnt

> >> -        * case, the DMA is unmapped/released, as driver is likely

> >> -        * doing refcnt based recycle tricks, meaning another process

> >> -        * will be invoking put_page.

> >> -        */

> >> -       /* Do not replace this with page_pool_return_page() */

> >> +

> >>         page_pool_release_page(pool, page);

> >> -       put_page(page);

> >>

> >>         return NULL;

> >>  }

> >> @@ -452,6 +595,7 @@ void page_pool_put_page(struct page_pool *pool, struct page *page,

> >>                         unsigned int dma_sync_size, bool allow_direct)

> >>  {

> >>         page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);

> >> +

> >>         if (page && !page_pool_recycle_in_ring(pool, page)) {

> >>                 /* Cache full, fallback to free pages */

> >>                 page_pool_return_page(pool, page);

> >> @@ -503,8 +647,11 @@ static void page_pool_empty_ring(struct page_pool *pool)

> >>

> >>         /* Empty recycle ring */

> >>         while ((page = ptr_ring_consume_bh(&pool->ring))) {

> >> -               /* Verify the refcnt invariant of cached pages */

> >> -               if (!(page_ref_count(page) == 1))

> >> +               /* Verify the refcnt invariant of cached pages for

> >> +                * non elevated refcnt case.

> >> +                */

> >> +               if (!(pool->p.flags & PP_FLAG_PAGECNT_BIAS) &&

> >> +                   !(page_ref_count(page) == 1))

> >>                         pr_crit("%s() page_pool refcnt %d violation\n",

> >>                                 __func__, page_ref_count(page));

> >>

> >> @@ -544,6 +691,7 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)

> >>

> >>  static void page_pool_scrub(struct page_pool *pool)

> >>  {

> >> +       page_pool_empty_frag(pool);

> >>         page_pool_empty_alloc_cache_once(pool);

> >>         pool->destroy_cnt++;

> >>

> >> @@ -637,14 +785,13 @@ bool page_pool_return_skb_page(struct page *page)

> >>         if (unlikely(page->pp_magic != PP_SIGNATURE))

> >>                 return false;

> >>

> >> -       pp = page->pp;

> >> +       pp = page->pp_info->pp;

> >>

> >>         /* Driver set this to memory recycling info. Reset it on recycle.

> >>          * This will *not* work for NIC using a split-page memory model.

> >>          * The page will be returned to the pool here regardless of the

> >>          * 'flipped' fragment being in use or not.

> >>          */

> >> -       page->pp = NULL;

> >>         page_pool_put_full_page(pp, page, false);

> >>

> >>         return true;

> >> --

> >> 2.7.4

> >>

> > .

> >
Ilias Apalodimas July 7, 2021, 7:03 p.m. UTC | #9
> > Hi, Alexander

> >

> > Thanks for detailed reviewing.

> >


Likewise!
I'll have a look on the entire conversation in a few days...

> > >

> > > So this isn't going to work with the current recycling logic. The

> > > expectation there is that we can safely unmap the entire page as soon

> > > as the reference count is greater than 1.

> >

> > Yes, the expectation is changed to we can always recycle the page

> > when the last user has dropped the refcnt that has given to it when

> > the page is not pfmemalloced.

> >

> > The above expectation is based on that the last user will always

> > call page_pool_put_full_page() in order to do the recycling or do

> > the resource cleanup(dma unmaping..etc).

> >

> > As the skb_free_head() and skb_release_data() have both checked the

> > skb->pp_recycle to call the page_pool_put_full_page() if needed, I

> > think we are safe for most cases, the one case I am not so sure about

> > is the rx zero copy, which seems to also bump up the refcnt before

> > mapping the page to user space, we might need to ensure rx zero copy

> > is not the last user of the page or if it is the last user, make sure

> > it calls page_pool_put_full_page() too.

> 

> Yes, but the skb->pp_recycle value is per skb, not per page. So my

> concern is that carrying around that value can be problematic as there

> are a number of possible cases where the pages might be

> unintentionally recycled. All it would take is for a packet to get

> cloned a few times and then somebody starts using pskb_expand_head and

> you would have multiple cases, possibly simultaneously, of entities

> trying to free the page. I just worry it opens us up to a number of

> possible races.


Maybe I missed something, but I thought the cloned SKBs would never trigger
the recycling path, since they are protected by the atomic dataref check in
skb_release_data(). What am I missing?

[...]

Thanks
/Ilias
Alexander Duyck July 7, 2021, 9:49 p.m. UTC | #10
On Wed, Jul 7, 2021 at 12:03 PM Ilias Apalodimas
<ilias.apalodimas@linaro.org> wrote:
>

> > > Hi, Alexander

> > >

> > > Thanks for detailed reviewing.

> > >

>

> Likewise!

> I'll have a look on the entire conversation in a few days...

>

> > > >

> > > > So this isn't going to work with the current recycling logic. The

> > > > expectation there is that we can safely unmap the entire page as soon

> > > > as the reference count is greater than 1.

> > >

> > > Yes, the expectation is changed to we can always recycle the page

> > > when the last user has dropped the refcnt that has given to it when

> > > the page is not pfmemalloced.

> > >

> > > The above expectation is based on that the last user will always

> > > call page_pool_put_full_page() in order to do the recycling or do

> > > the resource cleanup(dma unmaping..etc).

> > >

> > > As the skb_free_head() and skb_release_data() have both checked the

> > > skb->pp_recycle to call the page_pool_put_full_page() if needed, I

> > > think we are safe for most cases, the one case I am not so sure about

> > > is the rx zero copy, which seems to also bump up the refcnt before

> > > mapping the page to user space, we might need to ensure rx zero copy

> > > is not the last user of the page or if it is the last user, make sure

> > > it calls page_pool_put_full_page() too.

> >

> > Yes, but the skb->pp_recycle value is per skb, not per page. So my

> > concern is that carrying around that value can be problematic as there

> > are a number of possible cases where the pages might be

> > unintentionally recycled. All it would take is for a packet to get

> > cloned a few times and then somebody starts using pskb_expand_head and

> > you would have multiple cases, possibly simultaneously, of entities

> > trying to free the page. I just worry it opens us up to a number of

> > possible races.

>

> Maybe I missed something, but I thought the cloned SKBs would never trigger

> the recycling path, since they are protected by the atomic dataref check in

> skb_release_data(). What am I missing?


Are you talking about the head frag? So normally a clone wouldn't
cause an issue because the head isn't changed. In the case of the
head_frag we should be safe since pskb_expand_head will just kmalloc
the new head and clears head_frag so it won't trigger
page_pool_return_skb_page on the head_frag since the dataref just goes
from 2 to 1.

The problem is that pskb_expand_head memcopies the page frags over and
takes a reference on the pages. At that point you would have two skbs
both pointing to the same set of pages and each one ready to call
page_pool_return_skb_page on the pages at any time and possibly racing
with the other.

I suspect if they both called it at roughly the same time one of them
would trigger a NULL pointer dereference since they would both check
pp_magic first, and then both set pp to NULL. If run on a system where
dma_unmap_page_attrs takes a while it would be very likely to race
since pp_magic doesn't get cleared until after the page is unmapped.
Yunsheng Lin July 8, 2021, 2:27 a.m. UTC | #11
On 2021/7/7 23:01, Alexander Duyck wrote:
> On Tue, Jul 6, 2021 at 8:05 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:

>>

>> On 2021/7/7 4:45, Alexander Duyck wrote:

>>> On Wed, Jun 30, 2021 at 2:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:

>>>>

>>>> Currently page pool only support page recycling only when

>>>> refcnt of page is one, which means it can not support the

>>>> split page recycling implemented in the most ethernet driver.

>>>>

>>>> So add elevated refcnt support in page pool, and support

>>>> allocating page frag to enable multi-frames-per-page based

>>>> on the elevated refcnt support.

>>>>

>>>> As the elevated refcnt is per page, and there is no space

>>>> for that in "struct page" now, so add a dynamically allocated

>>>> "struct page_pool_info" to record page pool ptr and refcnt

>>>> corrsponding to a page for now. Later, we can recycle the

>>>> "struct page_pool_info" too, or use part of page memory to

>>>> record pp_info.

>>>>

>>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>

>>

>> Hi, Alexander

>>

>> Thanks for detailed reviewing.

>>

>>>

>>> So this isn't going to work with the current recycling logic. The

>>> expectation there is that we can safely unmap the entire page as soon

>>> as the reference count is greater than 1.

>>

>> Yes, the expectation is changed to we can always recycle the page

>> when the last user has dropped the refcnt that has given to it when

>> the page is not pfmemalloced.

>>

>> The above expectation is based on that the last user will always

>> call page_pool_put_full_page() in order to do the recycling or do

>> the resource cleanup(dma unmaping..etc).

>>

>> As the skb_free_head() and skb_release_data() have both checked the

>> skb->pp_recycle to call the page_pool_put_full_page() if needed, I

>> think we are safe for most cases, the one case I am not so sure about

>> is the rx zero copy, which seems to also bump up the refcnt before

>> mapping the page to user space, we might need to ensure rx zero copy

>> is not the last user of the page or if it is the last user, make sure

>> it calls page_pool_put_full_page() too.

> 

> Yes, but the skb->pp_recycle value is per skb, not per page. So my

> concern is that carrying around that value can be problematic as there

> are a number of possible cases where the pages might be

> unintentionally recycled. All it would take is for a packet to get

> cloned a few times and then somebody starts using pskb_expand_head and

> you would have multiple cases, possibly simultaneously, of entities

> trying to free the page. I just worry it opens us up to a number of

> possible races.


I think page_ref_dec_return() in page_pool_bias_page_recyclable() will
prevent the above race from happening.

Once page_pool_bias_page_recyclable() (via its page_ref_dec_return())
returns true, all users of the page are done with p->pp_magic and p->pp_info,
so it should be ok to reset p->pp_magic and p->pp_info in any order?

And page_ref_dec_return() has both __atomic_pre_full_fence() and
__atomic_post_full_fence() to ensure the above ordering.

> 

>>>

>>> In addition I think I need to look over that code better as I am

>>> wondering if there are potential issues assuming a path such as a

>>> skb_clone followed by pskb_expand_head may lead to memory corruptions

>>> since the clone will still have pp_recycle set but none of the pages

>>> will be part of the page pool anymore.

>>

>> There is still page->pp_magic that decides if the page is from

>> page_pool or not.

> 

> The problem with pp_magic is that it doesn't prevent races. The page

> pool code was meant to be protected by NAPI to prevent simultaneous

> access. With us now allowing the stack to be a part of the handling we

> open things up to potential races in the code.


As above.

> 

>>>

>>> For us the pagecnt_bias would really represent the number of

>>> additional mappings beyond the current page that are being held. I

>>> have already been playing around with something similar. However the

>>> general idea is that we want to keep track of how many references to

>>> the page the device is holding onto. When that hits 0 and the actual

>>> page count is 1 we can refill both, however if we hit 0 and there are

>>> multiple references to the page still floating around we should just

>>> unmap the page and turn it over to the stack or free it.

>>

>> I am not sure I understood the above.

> 

> As I have already mentioned, the fundamental problem with sharing a

> page and using the page pool is that the page pool assumes that it can

> unmap if it has a reference count greater than 0. That will no longer

> be the case. It has to wait until all of the pagecnt_bias has been

> cleared before it can unmap the page. Using get_page/put_page is fine

> since it will have no impact on the DMA mappings, but we have to hold

> off on calling things like page_pool_put_full_page or update it so

> that it will not unmap as long as there is still pagecnt_bias in

> place.


Actually pagecnt_bias is never cleared while the page is in use or is still
recyclable, and DMA unmapping is only done when the page is not in use and
the page is not recyclable (the page is pfmemalloced or pool->ring
is full).

page_pool_bias_page_recyclable() is used to decide whether there is still
another user of the page. If there is, the current user calling
page_pool_bias_page_recyclable() just does a ref_dec and returns; only the
last user calling page_pool_bias_page_recyclable() will do the DMA
unmapping, and only if the page is not recyclable.

> 

>> As page reusing in hns3 driver, pagecnt_bias means how many refcnt the

>> driver is holding, and (page_count(cb->priv) - pagecnt_bias) means how

>> many refcnt the stack is holding, see [1].

>>

>> static bool hns3_can_reuse_page(struct hns3_desc_cb *cb)

>> {

>>         return (page_count(cb->priv) - cb->pagecnt_bias) == 1;

>> }

> 

> So one thing we have to be careful of is letting the page_count hit 0.

> My preference is to keep the bias as one less than the total

> page_count so that we always have the 1 around. So if pagecnt_bias

> hits 0 and we have a page_count of 1 it means that the current thread

> owns the only reference to the page.

> 

>> checking (page_count(cb->priv) - cb->pagecnt_bias) again one instead

>> of zero is in hns3_can_reuse_page because there is "pagecnt_bias--"

>> before checking hns3_can_reuse_page() in hns3_nic_reuse_page().

>>

>> "pagecnt_bias--" means the driver gives the one of its refcnt to the

>> stack, it is the stack'job to release the refcnt when the skb is passed

>> to the stack.

>>

>> 1. https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L2870

> 

> It is mostly just a matter of preference. As long as the difference is

> a predictable value it can be worked with.

> 

>>>

>>>> ---

>>>>  drivers/net/ethernet/marvell/mvneta.c           |   6 +-

>>>>  drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c |   2 +-

>>>>  include/linux/mm_types.h                        |   2 +-

>>>>  include/linux/skbuff.h                          |   4 +-

>>>>  include/net/page_pool.h                         |  30 +++-

>>>>  net/core/page_pool.c                            | 215 ++++++++++++++++++++----

>>>>  6 files changed, 207 insertions(+), 52 deletions(-)

>>>>

>>>> diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c

>>>> index 88a7550..5a29af2 100644

>>>> --- a/drivers/net/ethernet/marvell/mvneta.c

>>>> +++ b/drivers/net/ethernet/marvell/mvneta.c

>>>> @@ -2327,7 +2327,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

>>>>         if (!skb)

>>>>                 return ERR_PTR(-ENOMEM);

>>>>

>>>> -       skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);

>>>> +       skb_mark_for_recycle(skb);

>>>>

>>>>         skb_reserve(skb, xdp->data - xdp->data_hard_start);

>>>>         skb_put(skb, xdp->data_end - xdp->data);

>>>> @@ -2339,10 +2339,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

>>>>                 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,

>>>>                                 skb_frag_page(frag), skb_frag_off(frag),

>>>>                                 skb_frag_size(frag), PAGE_SIZE);

>>>> -               /* We don't need to reset pp_recycle here. It's already set, so

>>>> -                * just mark fragments for recycling.

>>>> -                */

>>>> -               page_pool_store_mem_info(skb_frag_page(frag), pool);

>>>>         }

>>>>

>>>>         return skb;

>>>

>>> So as I mentioned earlier the problem with recycling is that splitting

>>> up the ownership of the page makes it difficult for us to clean it up.

>>> Technically speaking if the pages are being allowed to leave while

>>> holding references to DMA addresses that we cannot revoke then we

>>> should be holding references to the device.

>>>

>>> That is one of the reasons why the previous code was just clearing the

>>> mapping as soon as the refcount was greater than 1. However for this

>>> to work out correctly we would have to track how many DMA mappings we

>>> have outstanding in addition to the one we are working on currently.

>>

>> I think page pool has already handled the above case if I understand

>> correctly, see page_pool_release().

> 

> The problem is pagecnt_bias is not multi-thread safe. You are just

> accessing an int which is prone to races. In order to fix it you would

> need to add either an atomic count or locks around the access of it

> which would pretty much negate the point of it.


As for pagecnt_bias not being multi-thread safe, let's get back to it
later.

> 

> Really in terms of the page pool recycling code I think it would have

> made more sense to add the page pool release logic as an skb

> destructor rather than trying to embed the page pool into the page

> itself. At least with that if the device is going to go out of scope

> by being orphaned or the like we could unmap the page and avoid

> potential races.


I suppose it is not the netdev that is relevant here, but the "struct device",
right?

I suppose page_ref_dec_return(), together with the get_device(pool->p.dev) in
page_pool_init(), is able to avoid the above race, as the unmapping
is done after page_ref_dec_return()?

> 

>>>

>>>> diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

>>>> index 3135220..540e387 100644

>>>> --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

>>>> +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

>>>> @@ -3997,7 +3997,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,

>>>>                 }

>>>>

>>>>                 if (pp)

>>>> -                       skb_mark_for_recycle(skb, page, pp);

>>>> +                       skb_mark_for_recycle(skb);

>>>>                 else

>>>>                         dma_unmap_single_attrs(dev->dev.parent, dma_addr,

>>>>                                                bm_pool->buf_size, DMA_FROM_DEVICE,

>>>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

>>>> index 862f88a..cf613df 100644

>>>> --- a/include/linux/mm_types.h

>>>> +++ b/include/linux/mm_types.h

>>>> @@ -101,7 +101,7 @@ struct page {

>>>>                          * page_pool allocated pages.

>>>>                          */

>>>>                         unsigned long pp_magic;

>>>> -                       struct page_pool *pp;

>>>> +                       struct page_pool_info *pp_info;

>>>>                         unsigned long _pp_mapping_pad;

>>>>                         /**

>>>>                          * @dma_addr: might require a 64-bit value on

>>>

>>> So the problem here is that this is creating a pointer chase, and the

>>> need to allocate yet another structure to store it is going to be

>>> expensive.

>>>

>>> As far as storing the pagecnt_bias it might make more sense to

>>> repurpose the lower 12 bits of the dma address. A DMA mapping should

>>> be page aligned anyway so the lower 12 bits would be reserved 0. When

>>> we decrement the value so that the lower 12 bits are 0 we should be

>>> unmapping the page anyway, or resetting the pagecnt_bias to PAGE_SIZE

>>> - 1 and adding back the bias to the page to effectively reset it for

>>> reuse.

>>

>> Yes, that is a great idea. I like it very much supposing page refcnt

>> updating batching for 'PAGE_SIZE - 1" is enough for performance sake.

>>

>> Will take a look about it.

>>

>>>

>>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

>>>> index b2db9cd..7795979 100644

>>>> --- a/include/linux/skbuff.h

>>>> +++ b/include/linux/skbuff.h

>>>> @@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)

>>>>  }

>>>>

>>>>  #ifdef CONFIG_PAGE_POOL

>>>> -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,

>>>> -                                       struct page_pool *pp)

>>>> +static inline void skb_mark_for_recycle(struct sk_buff *skb)

>>>>  {

>>>>         skb->pp_recycle = 1;

>>>> -       page_pool_store_mem_info(page, pp);

>>>>  }

>>>>  #endif

>>>

>>> I am not a fan of the pp_recycle flag either. We duplicate it via

>>> skb_clone and from what I can tell if we call pskb_expand_head

>>> afterwards I don't see how we avoid recycling the page frags twice.

>>

>> Actually skb->pp_recycle is kind of duplicated, as there is

>> still page->pp_magic to avoid recycling the page frags twice.

>>

>> The argument above adding skb->pp_recycle seems to be short

>> cut code path for non-page_pool case in the previous discussion,

>> see [2].

>>

>> 2. https://lore.kernel.org/linux-mm/074b0d1d-9531-57f3-8e0e-a447387478d1@huawei.com/

> 

> Yes, but that doesn't guarantee atomic protections so you still have

> race conditions possible. All it takes is something stalling during

> the dma_unmap call. Worse yet from what I can tell it looks like you

> clear page->pp before you clear page->pp_magic so you have the

> potential for a NULL pointer issue since it is cleared before the

> pp_magic value is.


Hopefully the page_ref_dec_return() in page_pool_bias_page_recyclable()
called by page_pool_put_page() will make the order of page->pp_magic
clearing and page->pp clearing irrelevant?

> 

>>>

>>>> diff --git a/include/net/page_pool.h b/include/net/page_pool.h

>>>> index 3dd62dd..44e7545 100644

>>>> --- a/include/net/page_pool.h

>>>> +++ b/include/net/page_pool.h

>>>> @@ -45,7 +45,9 @@

>>>>                                         * Please note DMA-sync-for-CPU is still

>>>>                                         * device driver responsibility

>>>>                                         */

>>>> -#define PP_FLAG_ALL            (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)

>>>> +#define PP_FLAG_PAGECNT_BIAS   BIT(2)  /* Enable elevated refcnt */

>>>> +#define PP_FLAG_ALL            (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV |\

>>>> +                                PP_FLAG_PAGECNT_BIAS)

>>>>

>>>>  /*

>>>

>>> It might be better to just put each flag on a separate line for

>>> PP_FLAG_ALL just to make it easier to read due to the wrapping. Either

>>> that or you could look at converting this over to an enum with a MAX

>>> value and then define the flags based on those enums, and PP_FLAG_ALL

>>> being BIT(MAX) - 1.

>>

>> Will do the wrapping first:)

> 

> Sounds good.

> 

>>>

>>>>   * Fast allocation side cache array/stack

>>>> @@ -77,6 +79,7 @@ struct page_pool_params {

>>>>         enum dma_data_direction dma_dir; /* DMA mapping direction */

>>>>         unsigned int    max_len; /* max DMA sync memory size */

>>>>         unsigned int    offset;  /* DMA addr offset */

>>>> +       unsigned int    frag_size;

>>>>  };

>>>>

>>>>  struct page_pool {

>>>> @@ -88,6 +91,8 @@ struct page_pool {

>>>>         unsigned long defer_warn;

>>>>

>>>>         u32 pages_state_hold_cnt;

>>>> +       unsigned int frag_offset;

>>>> +       struct page *frag_page;

>>>>

>>>>         /*

>>>>          * Data structure for allocation side

>>>> @@ -128,6 +133,11 @@ struct page_pool {

>>>>         u64 destroy_cnt;

>>>>  };

>>>>

>>>> +struct page_pool_info {

>>>> +       struct page_pool *pp;

>>>> +       int pagecnt_bias;

>>>> +};

>>>> +

>>>

>>> Rather than having a top-down structure here it might be better to

>>> work bottom up. If you assume you are keeping a pagecnt_bias per page

>>> it might make more sense to store this in the driver somewhere rather

>>> than having it as a separate allocated buffer. One advantage of the

>>> Intel drivers was doing this as we had the pagecnt_bias in a structure

>>> that also pointed to the page. That way we were only updating that

>>> count if we dropped the page and didn't have to even touch the page.

>>> You could use that to batch updates to the pagecnt_bias if we did use

>>> the lower 12 bits of the DMA address to store it as well.

>>

>> I am not sure I understood what "we dropped the page" meant.

> 

> For XDP_DROP if we are dropping the buffer we are dropping the page

> which in our case means we just need to increment the pagecnt_bias

> indicating we are putting it back and don't have to do anything with

> the actual page refcount or struct.


In that case, isn't it enough for the driver to simply not call
page_pool_put_page() and reuse the page frag again?

It seems to be like the use case below in the hns3 driver? If all the buffer
has memcpy the head page, just reuse it.

https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L3149

> 

>> The driver does not really need to call page_pool_put_full_page()

>> if the page of a skb is passed to stack, the driver mainly call

>> page_pool_put_full_page() when unloading or uniniting when the page

>> is not passed to stack yet.

> 

> I was thinking mostly of something like XDP_TX cases when combined

> with the pagecnt_bias. You will need to have something to return the

> page to the pool after the XDP_TX is completed.


I suppose XDP_TX is aware of page pool to call page_pool_put_full_page()
when XDP_TX is completed now?

I suppose the above should be handled similarly to the non-elevated refcnt
case?

> 

>>> I'm assuming the idea with this is that you will be having multiple

>>> buffers received off of a single page and so doing it that way you

>>> should only have one update on allocation, maybe a trickle of updates

>>> for XDP_TX, and another large update when the page is fully consumed

>>> and you drop the remaining pagecnt_bias for Rx.

>>

>> I suppose "having multiple buffers received off of a single page" mean:

>> use first half of a page for a desc, and the second half of the same page

>> for another desc, instead of ping-pong way of reusing implemented in most

>> driver currently?

>>

>> I am not so familiar with XDP to understand the latter part of comment too.

> 

> The alloc_frag logic below is an example of what I am talking about.

> Basically taking a page and chopping it up into multiple pieces for

> use as multiple receives instead of just one receive.


Ok, but when multiple receives are passed to the stack and after the stack is
done with all the receives, we should be able to recycle the page, right?

> 

>>>

>>>>  struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);

>>>>

>>>>  static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

>>>> @@ -137,6 +147,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

>>>>         return page_pool_alloc_pages(pool, gfp);

>>>>  }

>>>>

>>>> +struct page *page_pool_alloc_frag(struct page_pool *pool,

>>>> +                                 unsigned int *offset, gfp_t gfp);

>>>> +

>>>> +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,

>>>> +                                                   unsigned int *offset)

>>>> +{

>>>> +       gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);

>>>> +

>>>> +       return page_pool_alloc_frag(pool, offset, gfp);

>>>> +}

>>>> +

>>>>  /* get the stored dma direction. A driver might decide to treat this locally and

>>>>   * avoid the extra cache line from page_pool to determine the direction

>>>>   */

>>>> @@ -253,11 +274,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool)

>>>>                 spin_unlock_bh(&pool->ring.producer_lock);

>>>>  }

>>>>

>>>> -/* Store mem_info on struct page and use it while recycling skb frags */

>>>> -static inline

>>>> -void page_pool_store_mem_info(struct page *page, struct page_pool *pp)

>>>> -{

>>>> -       page->pp = pp;

>>>> -}

>>>> -

>>>>  #endif /* _NET_PAGE_POOL_H */

>>>

>>> So the issue as I see it with the page_pool recycling patch set is

>>> that I don't think we had proper guarantees in place that the page->pp

>>> value was flushed in all cases where skb->dev was changed. Basically

>>> the logic we need to have in place to address those issues is that

>>> skb->dev is changed we need to invalidate the DMA mappings on the

>>> page_pool page.

>>

>> The DMA mappings invalidating is based on the pool->p.dev, is there

>> any reason why the DMA mappings need invalidating when skb->dev is

>> change, as far as I can tell, the tx is not aware of page pool, so

>> when the skb is redirected, the page of the skb is always DMA mapped

>> according to skb->dev before xmitting.

>>

>> Or it is about XDP redirected?

>>

>> Is there something obvious I missed here?

> 

> It is about unmapping the page. In order to do so we have to maintain

> a pointer to the original DMA device. The page pool is doing that for

> us currently.

> 

> Most netdevs have a parent  device that is used for DMA mapping.

> Therefore if skb->dev is valid, then the parent device is still valid

> since destroying the parent would destroy the children. If the

> skb->dev is dropped or changed, then we cannot guarantee the parent

> device is still present. So generally if skb->dev cannot be maintained

> then we probably shouldn't be maintaining the DMA mapping or page->pp

> across that boundary either.


Does the get_device(pool->p.dev) in page_pool_init() not prevent the
above case?

> 

>>>

>>> I honestly wonder if it wouldn't be better for the recycling to just

>>> make use of the page->lru pointers to keep a list of pages that are

>>> outstanding so that it could release them if it is under DMA pressure.

>>>

>>>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c

>>>> index 5e4eb45..95d94a7 100644

>>>> --- a/net/core/page_pool.c

>>>> +++ b/net/core/page_pool.c

>>>> @@ -206,6 +206,49 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)

>>>>         return true;

>>>>  }

>>>>

>>>> +static int page_pool_set_pp_info(struct page_pool *pool,

>>>> +                                struct page *page, gfp_t gfp)

>>>> +{

>>>> +       struct page_pool_info *pp_info;

>>>> +

>>>> +       pp_info = kzalloc_node(sizeof(*pp_info), gfp, pool->p.nid);

>>>> +       if (!pp_info)

>>>> +               return -ENOMEM;

>>>> +

>>>> +       if (pool->p.flags & PP_FLAG_PAGECNT_BIAS) {

>>>> +               page_ref_add(page, USHRT_MAX);

>>>> +               pp_info->pagecnt_bias = USHRT_MAX;

>>>> +       } else {

>>>> +               pp_info->pagecnt_bias = 0;

>>>> +       }

>>>> +

>>>> +       page->pp_magic |= PP_SIGNATURE;

>>>> +       pp_info->pp = pool;

>>>> +       page->pp_info = pp_info;

>>>> +       return 0;

>>>> +}

>>>> +

>>>

>>> Having to perform a kzalloc in this path pretty much ruins the whole

>>> point of the page_pool API in my opinion. We would be much better off

>>> having a static structure that is to be maintained somewhere rather

>>> than doing this dynamically as you would just make a memory hog able

>>> to hold that much more memory.

>>

>> Let's see if repurposing the lower 12 bits of the dma address make sense?

> 

> Sounds good.

> 

>>>

>>>> +static int page_pool_clear_pp_info(struct page *page)

>>>> +{

>>>> +       struct page_pool_info *pp_info = page->pp_info;

>>>> +       int bias;

>>>> +

>>>> +       bias = pp_info->pagecnt_bias;

>>>> +

>>>> +       kfree(pp_info);

>>>> +       page->pp_info = NULL;

>>>> +       page->pp_magic = 0;

>>>> +

>>>> +       return bias;

>>>> +}

>>>> +

>>>> +static void page_pool_clear_and_drain_page(struct page *page)

>>>> +{

>>>> +       int bias = page_pool_clear_pp_info(page);

>>>> +

>>>> +       __page_frag_cache_drain(page, bias + 1);

>>>> +}

>>>> +

>>>>  static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

>>>>                                                  gfp_t gfp)

>>>>  {

>>>> @@ -216,13 +259,16 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

>>>>         if (unlikely(!page))

>>>>                 return NULL;

>>>>

>>>> -       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

>>>> -           unlikely(!page_pool_dma_map(pool, page))) {

>>>> +       if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

>>>>                 put_page(page);

>>>>                 return NULL;

>>>>         }

>>>>

>>>> -       page->pp_magic |= PP_SIGNATURE;

>>>> +       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

>>>> +           unlikely(!page_pool_dma_map(pool, page))) {

>>>> +               page_pool_clear_and_drain_page(page);

>>>> +               return NULL;

>>>> +       }

>>>>

>>>>         /* Track how many pages are held 'in-flight' */

>>>>         pool->pages_state_hold_cnt++;

>>>> @@ -261,12 +307,17 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

>>>>          */

>>>>         for (i = 0; i < nr_pages; i++) {

>>>>                 page = pool->alloc.cache[i];

>>>> +               if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

>>>> +                       put_page(page);

>>>> +                       continue;

>>>> +               }

>>>> +

>>>>                 if ((pp_flags & PP_FLAG_DMA_MAP) &&

>>>>                     unlikely(!page_pool_dma_map(pool, page))) {

>>>> -                       put_page(page);

>>>> +                       page_pool_clear_and_drain_page(page);

>>>>                         continue;

>>>>                 }

>>>

>>> This seems backwards to me. I would have the pp_info populated after

>>> you have generated the DMA mapping.

>>

>> Ok.

>>

>>>

>>>> -               page->pp_magic |= PP_SIGNATURE;

>>>> +

>>>>                 pool->alloc.cache[pool->alloc.count++] = page;

>>>>                 /* Track how many pages are held 'in-flight' */

>>>>                 pool->pages_state_hold_cnt++;

>>>> @@ -284,6 +335,25 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

>>>>         return page;

>>>>  }

>>>>

>>>> +static void page_pool_sub_bias(struct page *page, int nr)

>>>> +{

>>>> +       struct page_pool_info *pp_info = page->pp_info;

>>>> +

>>>> +       /* "pp_info->pagecnt_bias == 0" indicates the PAGECNT_BIAS

>>>> +        * flags is not set.

>>>> +        */

>>>> +       if (!pp_info->pagecnt_bias)

>>>> +               return;

>>>> +

>>>> +       /* Make sure pagecnt_bias > 0 for elevated refcnt case */

>>>> +       if (unlikely(pp_info->pagecnt_bias <= nr)) {

>>>> +               page_ref_add(page, USHRT_MAX);

>>>> +               pp_info->pagecnt_bias += USHRT_MAX;

>>>> +       }

>>>> +

>>>> +       pp_info->pagecnt_bias -= nr;

>>>

>>> So we should never have a case where pagecnt_bias is less than the

>>> value we are subtracting. If we have that then it is a bug.

>>

>> Yes.

> 

> Sorry, I was referring to the code above comparing pagecnt_bias to nr.

> At most nr should only ever be equal to pagecnt_bias, you should hold

> off on recharging pagecnt_bias until you can verify the page_count

> indicates we are the only holder of the page. Then we can recharge it

> and reset any offsets.


Actually the page pool is the only user of the page when the driver is
calling page_pool_alloc_frag(): the page comes from pool->alloc/pool->ring or
the page allocator in page_pool_alloc_pages(). As mentioned above, the
last user will put the page in pool->ring while holding a lock, and when
page_pool_alloc_pages() gets a page (also holding the same lock) from
pool->ring, there should be no user of the page other than the page pool.

And page_pool_sub_bias() is called in page_pool_alloc_frag() and
page_pool_alloc_pages().

> 

>>>

>>> The general idea with the pagecnt_bias is that we want to batch the

>>> release of the page from the device. So the assumption is we are going

>>> to pull multiple references from the page and rather than doing

>>> page_ref_inc repeatedly we want to batch it at the start, and we have

>>> to perform a __page_frag_cache_drain to remove any unused references

>>> when we need to free it.

>>

>> Yes, it is about batching the page_ref_inc() operation.

>>

>>>

>>> What we should probably be checking for is "pp_info->pagecnt_bias -

>>> page_count(page) > 1" when we hit the end of the page. If that is true

>>> then we cannot recycle the page and so when we hit PAGE_SIZE for the

>>> offset we have to drop the mapping and free the page subtracting any

>>> remaining pagecnt_bias we are holding. If I recall I actually ran this

>>> the other way and ran toward 0 in my implementation before as that

>>> allows for not having to track via a value and instead simply checking

>>> for a signed result.

>>

>>

>> When allocating a page for frag, we have decided how many user is using

>> the page, that is the "page_pool_sub_bias(frag_page, max_len / frag_size - 1)"

>> in page_pool_alloc_frag().

>>

>> so it is up to the driver or stack to do multi page_pool_put_full_page()

>> calling for the same page.

> 

> So that is one spot that I think is an issue. We normally only want

> this called once per page and ideally after pagecnt_bias is 0. One

> issue is that pagecnt_bias is non-atomic so we should really be

> restricting this to just the driver calling it in softirq context.


Let's discuss the pagecnt_bias handling at the end.

> 

>> Or the page pool will call page_pool_put_full_page() in page_pool_empty_frag()

>> if some of the page frag is not allocated to the driver yet.

>>

>> It seems you are suggesting a slightly different way to do frag reusing.

> 

> As I mentioned I am not a fan of the current recycling scheme. There

> are too many openings for it to end up unmapping the same page

> multiple times or other possible issues.


Other than the pagecnt_bias handling in non-atomic context, I think
most of the races you mentioned above have been handled, if I understand
it correctly?

> 

> In my mind the driver or page_pool should own the page and just keep

> it on a list to either be freed or recycled with the skb destructor

> being used to trigger the recycling.


The page_pool still owns the page; it is just that the driver also owns
the page after calling page_pool_alloc_pages(), and the page is then not on
a list of the page pool. The driver or stack calling page_pool_put_full_page()
will put the page back on the page pool's list (or do resource cleanup and
put it back to the page allocator) if it is the last user.

I am not familiar enough with the skb destructor to say whether using it
makes any difference here.

> 

>>>

>>>> +}

>>>> +

>>>>  /* For using page_pool replace: alloc_pages() API calls, but provide

>>>>   * synchronization guarantee for allocation side.

>>>>   */

>>>> @@ -293,15 +363,66 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)

>>>>

>>>>         /* Fast-path: Get a page from cache */

>>>>         page = __page_pool_get_cached(pool);

>>>> -       if (page)

>>>> +       if (page) {

>>>> +               page_pool_sub_bias(page, 1);

>>>>                 return page;

>>>> +       }

>>>

>>> I'm not sure we should be subtracting from the bias here. Ideally if

>>> you are getting a page you are getting the full 4K page. So having a

>>> bias other than PAGE_SIZE - 1 wouldn't make much sense here.

>>

>> It seems we have different understanding about pagecnt_bias here,

>> as the pagecnt_bias is hidden in the page pool now, the subtracting

>> here mean we give one refcnt to the caller of page_pool_alloc_pages(),

>> And in page_pool_alloc_frag(), we give different part of page to the

>> driver, so it means more user too, so there is also subtracting in the

>> page_pool_alloc_frag() too.

> 

> I see what you are getting at, however I think it depends on your use

> case. In my mind since you are allocating the full page you should

> have the full count available to you. I don't believe pagecnt_bias is

> something that should be looked at outside of the driver, or at least

> outside of the napi context of the device softirq.

> 

> So really in order for this to work correctly you would need to have

> some minimum amount of bias reserved for the device to access if you

> are going to break up page in to n usable buffers.


Ensuring the pagecnt_bias > 0 in page_pool_sub_bias() seems enough
to make sure the page pool always owns the page?

> 

>>>

>>>>

>>>>         /* Slow-path: cache empty, do real allocation */

>>>>         page = __page_pool_alloc_pages_slow(pool, gfp);

>>>> +       if (page)

>>>> +               page_pool_sub_bias(page, 1);

>>>> +

>>>

>>> Same here. Really in both cases we should be getting initialized

>>> pages, not ones that are already decrementing.

>>>

>>>>         return page;

>>>>  }

>>>>  EXPORT_SYMBOL(page_pool_alloc_pages);

>>>>

>>>> +struct page *page_pool_alloc_frag(struct page_pool *pool,

>>>> +                                 unsigned int *offset, gfp_t gfp)

>>>> +{

>>>> +       unsigned int frag_offset = pool->frag_offset;

>>>> +       unsigned int frag_size = pool->p.frag_size;

>>>> +       struct page *frag_page = pool->frag_page;

>>>> +       unsigned int max_len = pool->p.max_len;

>>>> +

>>>> +       if (!frag_page || frag_offset + frag_size > max_len) {

>>>

>>> These are two very different cases. If frag_page is set and just out

>>> of space we need to be freeing the unused references.

>>

>> As mention above, we are depending on the last user to do the

>> recycling or freeing the unused references.

> 

> But you are holding the pagecnt_bias for it aren't you? If so you need

> to release it so that the last user knows that they were the last

> user.


The user will know it is the last user if page_pool_bias_page_recyclable()
return true.

> 

> Once you aren't using the page you need to release the pagecnt_bias

> since the page is on the path to being freed.

It seems the above is more about what the pagecnt_bias represents?

> 

>>>

>>>> +               frag_page = page_pool_alloc_pages(pool, gfp);

>>>

>>> So as per my comment above the page should be coming in with a

>>> pagecnt_bias of PAGE_SIZE - 1, and an actual page_ref_count of

>>> PAGE_SIZE.

>>

>> Let's align the understanding of pagecnt_bias first?

>>

>> pagecnt_bias meant how many refcnt of a page belong to the page

>> pool, and (page_ref_count() - pagecnt_bias) means how many refcnt


Actually it is (page_ref_count() - (pagecnt_bias + 1))

>> of a page belong to user of the page pool.

> 

> So my view is a slight variation on that. I view pagecnt_bias as the

> count of references reserved by the page_pool, and page_ref_count -

> pagecnt_bias is the actual reference count. So if I am going to free a

> page I should deduct pagecnt_bias + 1 from the reference count to

> account for dropping our bias and the one for the fact that we own the

> page.


So if (page_ref_count() - (pagecnt_bias + 1)) == 0 means only the page
pool hold the page and it means whichever caller having the
page_pool_bias_page_recyclable() returning true is the last user, right?

> 

>>>

>>>> +               if (unlikely(!frag_page)) {

>>>> +                       pool->frag_page = NULL;

>>>> +                       return NULL;

>>>> +               }

>>>> +

>>>> +               pool->frag_page = frag_page;

>>>> +               frag_offset = 0;

>>>> +

>>>> +               page_pool_sub_bias(frag_page, max_len / frag_size - 1);

>>>

>>> Why are you doing division here? We should just be subtracting 1 from

>>> the pagecnt_bias since that is the number of buffers that are being

>>> used. The general idea is that when pagecnt_bias is 0 we cut the page

>>> loose for potential recycling or freeing, otherwise we just subtract

>>> our new value from pagecnt_bias until we reach it.

>>

>> As mentioned above, division is used to find out how many user may be

>> using the page.

> 

> That doesn't make any sense to me because it won't tell you the actual

> users, and from what I can tell it is buggy since if I use this to

> allocate a chunk larger than 2K this comes out to 0 doesn't it? It

> seems like you should just always use 1 as the count.


There is already a page_pool_sub_bias(page, 1) in page_pool_alloc_pages(),
so for a 4K page, there are two users for a page with 2K frag size, and there
are 32 users for a 64K page with 2K frag size.

The reason for doing a page_pool_sub_bias(page, 1) in page_pool_alloc_pages()
is that the caller is expected to use the page as a whole when using
page_pool_alloc_pages() directly, so it means only one user.

> 

>>>

>>>> +       }

>>>> +

>>>> +       *offset = frag_offset;

>>>> +       pool->frag_offset = frag_offset + frag_size;

>>>> +

>>>> +       return frag_page;

>>>> +}

>>>> +EXPORT_SYMBOL(page_pool_alloc_frag);

>>>> +

>>>> +static void page_pool_empty_frag(struct page_pool *pool)

>>>> +{

>>>> +       unsigned int frag_offset = pool->frag_offset;

>>>> +       unsigned int frag_size = pool->p.frag_size;

>>>> +       struct page *frag_page = pool->frag_page;

>>>> +       unsigned int max_len = pool->p.max_len;

>>>> +

>>>> +       if (!frag_page)

>>>> +               return;

>>>> +

>>>> +       while (frag_offset + frag_size <= max_len) {

>>>> +               page_pool_put_full_page(pool, frag_page, false);

>>>> +               frag_offset += frag_size;

>>>> +       }

>>>> +

>>>> +       pool->frag_page = NULL;

>>>> +}

>>>> +

>>>

>>> It would be good to look over the page_frag_alloc_align and

>>> __page_frag_cache_drain functions for examples of how to do most of

>>> this. The one complication is that we have the dma mappings and

>>> page_pool logic to deal with.

>>

>> Is it ok to rely on the user providing a aligning frag_size, so

>> that do not need handling it here?

> 

> It is probably fine since the page pool should only have one consumer

> so the requests just need to be aligned by them.

> 

>>>

>>>>  /* Calculate distance between two u32 values, valid if distance is below 2^(31)

>>>>   *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution

>>>>   */

>>>> @@ -326,10 +447,11 @@ static s32 page_pool_inflight(struct page_pool *pool)

>>>>   * a regular page (that will eventually be returned to the normal

>>>>   * page-allocator via put_page).

>>>>   */

>>>> -void page_pool_release_page(struct page_pool *pool, struct page *page)

>>>> +static int __page_pool_release_page(struct page_pool *pool,

>>>> +                                   struct page *page)

>>>>  {

>>>>         dma_addr_t dma;

>>>> -       int count;

>>>> +       int bias, count;

>>>>

>>>>         if (!(pool->p.flags & PP_FLAG_DMA_MAP))

>>>>                 /* Always account for inflight pages, even if we didn't

>>>> @@ -345,22 +467,29 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)

>>>>                              DMA_ATTR_SKIP_CPU_SYNC);

>>>>         page_pool_set_dma_addr(page, 0);

>>>>  skip_dma_unmap:

>>>> -       page->pp_magic = 0;

>>>> +       bias = page_pool_clear_pp_info(page);

>>>>

>>>>         /* This may be the last page returned, releasing the pool, so

>>>>          * it is not safe to reference pool afterwards.

>>>>          */

>>>>         count = atomic_inc_return(&pool->pages_state_release_cnt);

>>>>         trace_page_pool_state_release(pool, page, count);

>>>> +       return bias;

>>>> +}

>>>> +

>>>> +void page_pool_release_page(struct page_pool *pool, struct page *page)

>>>> +{

>>>> +       int bias = __page_pool_release_page(pool, page);

>>>> +

>>>> +       WARN_ONCE(bias, "PAGECNT_BIAS is not supposed to be enabled\n");

>>>>  }

>>>>  EXPORT_SYMBOL(page_pool_release_page);

>>>>

>>>>  /* Return a page to the page allocator, cleaning up our state */

>>>>  static void page_pool_return_page(struct page_pool *pool, struct page *page)

>>>>  {

>>>> -       page_pool_release_page(pool, page);

>>>> +       __page_frag_cache_drain(page, __page_pool_release_page(pool, page) + 1);

>>>>

>>>> -       put_page(page);

>>>>         /* An optimization would be to call __free_pages(page, pool->p.order)

>>>>          * knowing page is not part of page-cache (thus avoiding a

>>>>          * __page_cache_release() call).

>>>> @@ -395,7 +524,16 @@ static bool page_pool_recycle_in_cache(struct page *page,

>>>>         return true;

>>>>  }

>>>>

>>>> -/* If the page refcnt == 1, this will try to recycle the page.

>>>> +static bool page_pool_bias_page_recyclable(struct page *page, int bias)

>>>> +{

>>>> +       int ref = page_ref_dec_return(page);

>>>> +

>>>> +       WARN_ON(ref < bias);

>>>> +       return ref == bias + 1;

>>>> +}

>>>> +

>>>> +/* If pagecnt_bias == 0 and the page refcnt == 1, this will try to

>>>> + * recycle the page.

>>>>   * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for

>>>>   * the configured size min(dma_sync_size, pool->max_len).

>>>>   * If the page refcnt != 1, then the page will be returned to memory

>>>> @@ -405,16 +543,35 @@ static __always_inline struct page *

>>>>  __page_pool_put_page(struct page_pool *pool, struct page *page,

>>>>                      unsigned int dma_sync_size, bool allow_direct)

>>>>  {

>>>> -       /* This allocator is optimized for the XDP mode that uses

>>>> +       int bias = page->pp_info->pagecnt_bias;

>>>> +

>>>> +       /* Handle the elevated refcnt case first:

>>>> +        * multi-frames-per-page, it is likely from the skb, which

>>>> +        * is likely called in non-sofrirq context, so do not recycle

>>>> +        * it in pool->alloc.

>>>> +        *

>>>> +        * Then handle non-elevated refcnt case:

>>>>          * one-frame-per-page, but have fallbacks that act like the

>>>>          * regular page allocator APIs.

>>>> -        *

>>>>          * refcnt == 1 means page_pool owns page, and can recycle it.

>>>>          *

>>>>          * page is NOT reusable when allocated when system is under

>>>>          * some pressure. (page_is_pfmemalloc)

>>>>          */

>>>> -       if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {

>>>> +       if (bias) {

>>>> +               /* We have gave some refcnt to the stack, so wait for

>>>> +                * all refcnt of the stack to be decremented before

>>>> +                * enabling recycling.

>>>> +                */

>>>> +               if (!page_pool_bias_page_recyclable(page, bias))

>>>> +                       return NULL;

>>>> +

>>>> +               /* only enable recycling when it is not pfmemalloced */

>>>> +               if (!page_is_pfmemalloc(page))

>>>> +                       return page;

>>>> +

>>>

>>> So this would be fine if this was only accessed from the driver. The

>>> problem is the recycling code made it so that this is accessed in the

>>> generic skb freeing path. As such I think this is prone to races since

>>> you have to guarantee the ordering of things between the reference

>>> count and pagecnt_bias.

>>

>> As reference count is handled atomically is page_pool_bias_page_recyclable,

>> and pagecnt_bias is changed before any page is handled to the stack(maybe

>> some READ_ONCE/WRITE_ONCE or barrier is still needed, will check it again),

>> so I suppose the ordering is correct?

> 

> The problem is in order to get this working correctly you would likely

> need to add a number of barriers so that reads and writes are in a

> specific order. You would be much better off just not

> reading/modifying the pagecnt_bias outside of the softirq paths.


Most of the page reuse schemes implemented in drivers today may not be
able to reuse a page when the stack does not process the skb and
decrement the refcnt quickly enough. This patch tries to reuse the page
as much as possible when that happens.

So it seems the pagecnt_bias need to be checked outside of the
softirq to implement that?

Let's break down the step of reusing a page:
1. driver calls page_pool_alloc_frag() to allocate a page frag.
2. page pool subtracts from the pagecnt_bias according to the users of
   the page.
3. driver fills the page info into the desc.
4. driver notifies the hw that the desc is filled with page info.
5. hw writes the packet to page memory according to the info in the desc.
6. driver processes the desc and passes the skb (containing the page
   frag) to the stack.
7. stack processes the skb.
8. stack puts the page to the page pool by calling page_pool_return_page();
   if it is the last user by checking pagecnt_bias, the page is recycled
   in the page pool or is returned to the page allocator after cleaning up
   the resource.

There is usually a barrier in step 4 and step 6, at least in the hns3 driver,
so an additional barrier does not seem to be necessary?

see:
https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L2867
https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L3368


> 

>>>

>>>> +       } else if (likely(page_ref_count(page) == 1 &&

>>>> +                         !page_is_pfmemalloc(page))) {

>>>>                 /* Read barrier done in page_ref_count / READ_ONCE */

>>>>

>>>>                 if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)

>>>> @@ -428,22 +585,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,

>>>>                 /* Page found as candidate for recycling */

>>>>                 return page;

>>>>         }

>>>> -       /* Fallback/non-XDP mode: API user have elevated refcnt.

>>>> -        *

>>>> -        * Many drivers split up the page into fragments, and some

>>>> -        * want to keep doing this to save memory and do refcnt based

>>>> -        * recycling. Support this use case too, to ease drivers

>>>> -        * switching between XDP/non-XDP.

>>>> -        *

>>>> -        * In-case page_pool maintains the DMA mapping, API user must

>>>> -        * call page_pool_put_page once.  In this elevated refcnt

>>>> -        * case, the DMA is unmapped/released, as driver is likely

>>>> -        * doing refcnt based recycle tricks, meaning another process

>>>> -        * will be invoking put_page.

>>>> -        */

>>>> -       /* Do not replace this with page_pool_return_page() */

>>>> +

>>>>         page_pool_release_page(pool, page);

>>>> -       put_page(page);

>>>>

>>>>         return NULL;

>>>>  }

>>>> @@ -452,6 +595,7 @@ void page_pool_put_page(struct page_pool *pool, struct page *page,

>>>>                         unsigned int dma_sync_size, bool allow_direct)

>>>>  {

>>>>         page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);

>>>> +

>>>>         if (page && !page_pool_recycle_in_ring(pool, page)) {

>>>>                 /* Cache full, fallback to free pages */

>>>>                 page_pool_return_page(pool, page);

>>>> @@ -503,8 +647,11 @@ static void page_pool_empty_ring(struct page_pool *pool)

>>>>

>>>>         /* Empty recycle ring */

>>>>         while ((page = ptr_ring_consume_bh(&pool->ring))) {

>>>> -               /* Verify the refcnt invariant of cached pages */

>>>> -               if (!(page_ref_count(page) == 1))

>>>> +               /* Verify the refcnt invariant of cached pages for

>>>> +                * non elevated refcnt case.

>>>> +                */

>>>> +               if (!(pool->p.flags & PP_FLAG_PAGECNT_BIAS) &&

>>>> +                   !(page_ref_count(page) == 1))

>>>>                         pr_crit("%s() page_pool refcnt %d violation\n",

>>>>                                 __func__, page_ref_count(page));

>>>>

>>>> @@ -544,6 +691,7 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)

>>>>

>>>>  static void page_pool_scrub(struct page_pool *pool)

>>>>  {

>>>> +       page_pool_empty_frag(pool);

>>>>         page_pool_empty_alloc_cache_once(pool);

>>>>         pool->destroy_cnt++;

>>>>

>>>> @@ -637,14 +785,13 @@ bool page_pool_return_skb_page(struct page *page)

>>>>         if (unlikely(page->pp_magic != PP_SIGNATURE))

>>>>                 return false;

>>>>

>>>> -       pp = page->pp;

>>>> +       pp = page->pp_info->pp;

>>>>

>>>>         /* Driver set this to memory recycling info. Reset it on recycle.

>>>>          * This will *not* work for NIC using a split-page memory model.

>>>>          * The page will be returned to the pool here regardless of the

>>>>          * 'flipped' fragment being in use or not.

>>>>          */

>>>> -       page->pp = NULL;

>>>>         page_pool_put_full_page(pp, page, false);

>>>>

>>>>         return true;

>>>> --

>>>> 2.7.4

>>>>

>>> .

>>>

> .

>
Ilias Apalodimas July 8, 2021, 2:21 p.m. UTC | #12
> > > >


[...]

> > > > The above expectation is based on that the last user will always

> > > > call page_pool_put_full_page() in order to do the recycling or do

> > > > the resource cleanup(dma unmaping..etc).

> > > >

> > > > As the skb_free_head() and skb_release_data() have both checked the

> > > > skb->pp_recycle to call the page_pool_put_full_page() if needed, I

> > > > think we are safe for most case, the one case I am not so sure above

> > > > is the rx zero copy, which seems to also bump up the refcnt before

> > > > mapping the page to user space, we might need to ensure rx zero copy

> > > > is not the last user of the page or if it is the last user, make sure

> > > > it calls page_pool_put_full_page() too.

> > >

> > > Yes, but the skb->pp_recycle value is per skb, not per page. So my

> > > concern is that carrying around that value can be problematic as there

> > > are a number of possible cases where the pages might be

> > > unintentionally recycled. All it would take is for a packet to get

> > > cloned a few times and then somebody starts using pskb_expand_head and

> > > you would have multiple cases, possibly simultaneously, of entities

> > > trying to free the page. I just worry it opens us up to a number of

> > > possible races.

> >

> > Maybe I missde something, but I thought the cloned SKBs would never trigger

> > the recycling path, since they are protected by the atomic dataref check in

> > skb_release_data(). What am I missing?

> 

> Are you talking about the head frag? So normally a clone wouldn't

> cause an issue because the head isn't changed. In the case of the

> head_frag we should be safe since pskb_expand_head will just kmalloc

> the new head and clears head_frag so it won't trigger

> page_pool_return_skb_page on the head_frag since the dataref just goes

> from 2 to 1.

> 

> The problem is that pskb_expand_head memcopies the page frags over and

> takes a reference on the pages. At that point you would have two skbs

> both pointing to the same set of pages and each one ready to call

> page_pool_return_skb_page on the pages at any time and possibly racing

> with the other.


Ok let me make sure I get the idea properly. 
When pskb_expand_head is called, the new dataref will be 1, but the
head_frag will be set to 0, in which case the recycling code won't be
called for that skb.  
So you are mostly worried about a race within the context of
pskb_expand_head() between copying the frags, releasing the previous head
and preparing the new one (on a cloned skb)?

> 

> I suspect if they both called it at roughly the same time one of them

> would trigger a NULL pointer dereference since they would both check

> pp_magic first, and then both set pp to NULL. If run on a system where

> dma_unmap_page_attrs takes a while it would be very likely to race

> since pp_magic doesn't get cleared until after the page is unmapped.



Thanks!
/Ilias
Alexander Duyck July 8, 2021, 2:24 p.m. UTC | #13
On Thu, Jul 8, 2021 at 7:21 AM Ilias Apalodimas
<ilias.apalodimas@linaro.org> wrote:
>

> > > > >

>

> [...]

>

> > > > > The above expectation is based on that the last user will always

> > > > > call page_pool_put_full_page() in order to do the recycling or do

> > > > > the resource cleanup(dma unmaping..etc).

> > > > >

> > > > > As the skb_free_head() and skb_release_data() have both checked the

> > > > > skb->pp_recycle to call the page_pool_put_full_page() if needed, I

> > > > > think we are safe for most case, the one case I am not so sure above

> > > > > is the rx zero copy, which seems to also bump up the refcnt before

> > > > > mapping the page to user space, we might need to ensure rx zero copy

> > > > > is not the last user of the page or if it is the last user, make sure

> > > > > it calls page_pool_put_full_page() too.

> > > >

> > > > Yes, but the skb->pp_recycle value is per skb, not per page. So my

> > > > concern is that carrying around that value can be problematic as there

> > > > are a number of possible cases where the pages might be

> > > > unintentionally recycled. All it would take is for a packet to get

> > > > cloned a few times and then somebody starts using pskb_expand_head and

> > > > you would have multiple cases, possibly simultaneously, of entities

> > > > trying to free the page. I just worry it opens us up to a number of

> > > > possible races.

> > >

> > > Maybe I missde something, but I thought the cloned SKBs would never trigger

> > > the recycling path, since they are protected by the atomic dataref check in

> > > skb_release_data(). What am I missing?

> >

> > Are you talking about the head frag? So normally a clone wouldn't

> > cause an issue because the head isn't changed. In the case of the

> > head_frag we should be safe since pskb_expand_head will just kmalloc

> > the new head and clears head_frag so it won't trigger

> > page_pool_return_skb_page on the head_frag since the dataref just goes

> > from 2 to 1.

> >

> > The problem is that pskb_expand_head memcopies the page frags over and

> > takes a reference on the pages. At that point you would have two skbs

> > both pointing to the same set of pages and each one ready to call

> > page_pool_return_skb_page on the pages at any time and possibly racing

> > with the other.

>

> Ok let me make sure I get the idea properly.

> When pskb_expand_head is called, the new dataref will be 1, but the

> head_frag will be set to 0, in which case the recycling code won't be

> called for that skb.

> So you are mostly worried about a race within the context of

> pskb_expand_skb() between copying the frags, releasing the previous head

> and preparing the new one (on a cloned skb)?


The race is between freeing the two skbs. So the original and the
clone w/ the expanded head will have separate instances of the page. I
am pretty certain there is a race if the two of them start trying to
free the page frags at the same time.

Thanks,

- Alex
Ilias Apalodimas July 8, 2021, 2:50 p.m. UTC | #14
On Thu, Jul 08, 2021 at 07:24:57AM -0700, Alexander Duyck wrote:
> On Thu, Jul 8, 2021 at 7:21 AM Ilias Apalodimas

> <ilias.apalodimas@linaro.org> wrote:

> >

> > > > > >

> >

> > [...]

> >

> > > > > > The above expectation is based on that the last user will always

> > > > > > call page_pool_put_full_page() in order to do the recycling or do

> > > > > > the resource cleanup(dma unmaping..etc).

> > > > > >

> > > > > > As the skb_free_head() and skb_release_data() have both checked the

> > > > > > skb->pp_recycle to call the page_pool_put_full_page() if needed, I

> > > > > > think we are safe for most case, the one case I am not so sure above

> > > > > > is the rx zero copy, which seems to also bump up the refcnt before

> > > > > > mapping the page to user space, we might need to ensure rx zero copy

> > > > > > is not the last user of the page or if it is the last user, make sure

> > > > > > it calls page_pool_put_full_page() too.

> > > > >

> > > > > Yes, but the skb->pp_recycle value is per skb, not per page. So my

> > > > > concern is that carrying around that value can be problematic as there

> > > > > are a number of possible cases where the pages might be

> > > > > unintentionally recycled. All it would take is for a packet to get

> > > > > cloned a few times and then somebody starts using pskb_expand_head and

> > > > > you would have multiple cases, possibly simultaneously, of entities

> > > > > trying to free the page. I just worry it opens us up to a number of

> > > > > possible races.

> > > >

> > > > Maybe I missde something, but I thought the cloned SKBs would never trigger

> > > > the recycling path, since they are protected by the atomic dataref check in

> > > > skb_release_data(). What am I missing?

> > >

> > > Are you talking about the head frag? So normally a clone wouldn't

> > > cause an issue because the head isn't changed. In the case of the

> > > head_frag we should be safe since pskb_expand_head will just kmalloc

> > > the new head and clears head_frag so it won't trigger

> > > page_pool_return_skb_page on the head_frag since the dataref just goes

> > > from 2 to 1.

> > >

> > > The problem is that pskb_expand_head memcopies the page frags over and

> > > takes a reference on the pages. At that point you would have two skbs

> > > both pointing to the same set of pages and each one ready to call

> > > page_pool_return_skb_page on the pages at any time and possibly racing

> > > with the other.

> >

> > Ok let me make sure I get the idea properly.

> > When pskb_expand_head is called, the new dataref will be 1, but the

> > head_frag will be set to 0, in which case the recycling code won't be

> > called for that skb.

> > So you are mostly worried about a race within the context of

> > pskb_expand_skb() between copying the frags, releasing the previous head

> > and preparing the new one (on a cloned skb)?

> 

> The race is between freeing the two skbs. So the original and the

> clone w/ the expanded head will have separate instances of the page. I

> am pretty certain there is a race if the two of them start trying to

> free the page frags at the same time.

> 


Right, I completely forgot that __skb_frag_unref() is called before
releasing the head ...
You are right, this will be a race.  Let me go back to the original mail
thread and see what we can do.

Thanks!
/Ilias
> Thanks,

> 

> - Alex
Ilias Apalodimas July 8, 2021, 3:17 p.m. UTC | #15
> > >

> > > > > > >

> > >

> > > [...]

> > >

> > > > > > > The above expectation is based on that the last user will always

> > > > > > > call page_pool_put_full_page() in order to do the recycling or do

> > > > > > > the resource cleanup(dma unmaping..etc).

> > > > > > >

> > > > > > > As the skb_free_head() and skb_release_data() have both checked the

> > > > > > > skb->pp_recycle to call the page_pool_put_full_page() if needed, I

> > > > > > > think we are safe for most case, the one case I am not so sure above

> > > > > > > is the rx zero copy, which seems to also bump up the refcnt before

> > > > > > > mapping the page to user space, we might need to ensure rx zero copy

> > > > > > > is not the last user of the page or if it is the last user, make sure

> > > > > > > it calls page_pool_put_full_page() too.

> > > > > >

> > > > > > Yes, but the skb->pp_recycle value is per skb, not per page. So my

> > > > > > concern is that carrying around that value can be problematic as there

> > > > > > are a number of possible cases where the pages might be

> > > > > > unintentionally recycled. All it would take is for a packet to get

> > > > > > cloned a few times and then somebody starts using pskb_expand_head and

> > > > > > you would have multiple cases, possibly simultaneously, of entities

> > > > > > trying to free the page. I just worry it opens us up to a number of

> > > > > > possible races.

> > > > >

> > > > > Maybe I missde something, but I thought the cloned SKBs would never trigger

> > > > > the recycling path, since they are protected by the atomic dataref check in

> > > > > skb_release_data(). What am I missing?

> > > >

> > > > Are you talking about the head frag? So normally a clone wouldn't

> > > > cause an issue because the head isn't changed. In the case of the

> > > > head_frag we should be safe since pskb_expand_head will just kmalloc

> > > > the new head and clears head_frag so it won't trigger

> > > > page_pool_return_skb_page on the head_frag since the dataref just goes

> > > > from 2 to 1.

> > > >

> > > > The problem is that pskb_expand_head memcopies the page frags over and

> > > > takes a reference on the pages. At that point you would have two skbs

> > > > both pointing to the same set of pages and each one ready to call

> > > > page_pool_return_skb_page on the pages at any time and possibly racing

> > > > with the other.

> > >

> > > Ok let me make sure I get the idea properly.

> > > When pskb_expand_head is called, the new dataref will be 1, but the

> > > head_frag will be set to 0, in which case the recycling code won't be

> > > called for that skb.

> > > So you are mostly worried about a race within the context of

> > > pskb_expand_skb() between copying the frags, releasing the previous head

> > > and preparing the new one (on a cloned skb)?

> > 

> > The race is between freeing the two skbs. So the original and the

> > clone w/ the expanded head will have separate instances of the page. I

> > am pretty certain there is a race if the two of them start trying to

> > free the page frags at the same time.

> > 

> 

> Right, I completely forgot calling __skb_frag_unref() before releasing the

> head ...

> You are right, this will be a race.  Let me go back to the original mail

> thread and see what we can do

> 


What do you think about resetting pp_recycle bit on pskb_expand_head()?
If my memory serves me right Eric wanted that from the beginning. Then the
cloned/expanded SKB won't trigger the recycling.  If that skb hits the free
path first, we'll end up recycling the fragments eventually.  If the
original one goes first, we'll just unmap the page(s) and freeing the cloned
one will free all the remaining buffers.

Regards
Ilias
> Thanks!

> /Ilias

> > Thanks,

> > 

> > - Alex
Alexander Duyck July 8, 2021, 3:29 p.m. UTC | #16
On Thu, Jul 8, 2021 at 8:17 AM Ilias Apalodimas
<ilias.apalodimas@linaro.org> wrote:
>

> > > >

> > > > > > > >

> > > >

> > > > [...]

> > > >

> > > > > > > > The above expectation is based on that the last user will always

> > > > > > > > call page_pool_put_full_page() in order to do the recycling or do

> > > > > > > > the resource cleanup(dma unmaping..etc).

> > > > > > > >

> > > > > > > > As the skb_free_head() and skb_release_data() have both checked the

> > > > > > > > skb->pp_recycle to call the page_pool_put_full_page() if needed, I

> > > > > > > > think we are safe for most case, the one case I am not so sure above

> > > > > > > > is the rx zero copy, which seems to also bump up the refcnt before

> > > > > > > > mapping the page to user space, we might need to ensure rx zero copy

> > > > > > > > is not the last user of the page or if it is the last user, make sure

> > > > > > > > it calls page_pool_put_full_page() too.

> > > > > > >

> > > > > > > Yes, but the skb->pp_recycle value is per skb, not per page. So my

> > > > > > > concern is that carrying around that value can be problematic as there

> > > > > > > are a number of possible cases where the pages might be

> > > > > > > unintentionally recycled. All it would take is for a packet to get

> > > > > > > cloned a few times and then somebody starts using pskb_expand_head and

> > > > > > > you would have multiple cases, possibly simultaneously, of entities

> > > > > > > trying to free the page. I just worry it opens us up to a number of

> > > > > > > possible races.

> > > > > >

> > > > > > Maybe I missde something, but I thought the cloned SKBs would never trigger

> > > > > > the recycling path, since they are protected by the atomic dataref check in

> > > > > > skb_release_data(). What am I missing?

> > > > >

> > > > > Are you talking about the head frag? So normally a clone wouldn't

> > > > > cause an issue because the head isn't changed. In the case of the

> > > > > head_frag we should be safe since pskb_expand_head will just kmalloc

> > > > > the new head and clears head_frag so it won't trigger

> > > > > page_pool_return_skb_page on the head_frag since the dataref just goes

> > > > > from 2 to 1.

> > > > >

> > > > > The problem is that pskb_expand_head memcopies the page frags over and

> > > > > takes a reference on the pages. At that point you would have two skbs

> > > > > both pointing to the same set of pages and each one ready to call

> > > > > page_pool_return_skb_page on the pages at any time and possibly racing

> > > > > with the other.

> > > >

> > > > Ok let me make sure I get the idea properly.

> > > > When pskb_expand_head is called, the new dataref will be 1, but the

> > > > head_frag will be set to 0, in which case the recycling code won't be

> > > > called for that skb.

> > > > So you are mostly worried about a race within the context of

> > > > pskb_expand_skb() between copying the frags, releasing the previous head

> > > > and preparing the new one (on a cloned skb)?

> > >

> > > The race is between freeing the two skbs. So the original and the

> > > clone w/ the expanded head will have separate instances of the page. I

> > > am pretty certain there is a race if the two of them start trying to

> > > free the page frags at the same time.

> > >

> >

> > Right, I completely forgot calling __skb_frag_unref() before releasing the

> > head ...

> > You are right, this will be a race.  Let me go back to the original mail

> > thread and see what we can do

> >

>

> What do you think about resetting pp_recycle bit on pskb_expand_head()?


I assume you mean specifically in the cloned case?

> If my memory serves me right Eric wanted that from the beginning. Then the

> cloned/expanded SKB won't trigger the recycling.  If that skb hits the free

> path first, we'll end up recycling the fragments eventually.  If the

> original one goes first, we'll just unmap the page(s) and freeing the cloned

> one will free all the remaining buffers.


I *think* that should be fine. Effectively what we are doing is making
it so that if the original skb is freed first the pages are released,
and if it is released after the clone/expanded skb then it can be
recycled.

The issue is we have to maintain it so that there will be exactly one
caller of the recycling function for the pages. So any spot where we
are updating skb->head we will have to see if there is a clone and if
so we have to clear the pp_recycle flag on our skb so that it doesn't
try to recycle the page frags as well.
Ilias Apalodimas July 8, 2021, 3:36 p.m. UTC | #17
On Thu, Jul 08, 2021 at 08:29:56AM -0700, Alexander Duyck wrote:
> On Thu, Jul 8, 2021 at 8:17 AM Ilias Apalodimas

> <ilias.apalodimas@linaro.org> wrote:

> >

> > > > >

> > > > > > > > >

> > > > >

> > > > > [...]

> > > > >

> > > > > > > > > The above expectation is based on that the last user will always

> > > > > > > > > call page_pool_put_full_page() in order to do the recycling or do

> > > > > > > > > the resource cleanup(dma unmaping..etc).

> > > > > > > > >

> > > > > > > > > As the skb_free_head() and skb_release_data() have both checked the

> > > > > > > > > skb->pp_recycle to call the page_pool_put_full_page() if needed, I

> > > > > > > > > think we are safe for most case, the one case I am not so sure above

> > > > > > > > > is the rx zero copy, which seems to also bump up the refcnt before

> > > > > > > > > mapping the page to user space, we might need to ensure rx zero copy

> > > > > > > > > is not the last user of the page or if it is the last user, make sure

> > > > > > > > > it calls page_pool_put_full_page() too.

> > > > > > > >

> > > > > > > > Yes, but the skb->pp_recycle value is per skb, not per page. So my

> > > > > > > > concern is that carrying around that value can be problematic as there

> > > > > > > > are a number of possible cases where the pages might be

> > > > > > > > unintentionally recycled. All it would take is for a packet to get

> > > > > > > > cloned a few times and then somebody starts using pskb_expand_head and

> > > > > > > > you would have multiple cases, possibly simultaneously, of entities

> > > > > > > > trying to free the page. I just worry it opens us up to a number of

> > > > > > > > possible races.

> > > > > > >

> > > > > > > Maybe I missde something, but I thought the cloned SKBs would never trigger

> > > > > > > the recycling path, since they are protected by the atomic dataref check in

> > > > > > > skb_release_data(). What am I missing?

> > > > > >

> > > > > > Are you talking about the head frag? So normally a clone wouldn't

> > > > > > cause an issue because the head isn't changed. In the case of the

> > > > > > head_frag we should be safe since pskb_expand_head will just kmalloc

> > > > > > the new head and clears head_frag so it won't trigger

> > > > > > page_pool_return_skb_page on the head_frag since the dataref just goes

> > > > > > from 2 to 1.

> > > > > >

> > > > > > The problem is that pskb_expand_head memcopies the page frags over and

> > > > > > takes a reference on the pages. At that point you would have two skbs

> > > > > > both pointing to the same set of pages and each one ready to call

> > > > > > page_pool_return_skb_page on the pages at any time and possibly racing

> > > > > > with the other.

> > > > >

> > > > > Ok let me make sure I get the idea properly.

> > > > > When pskb_expand_head is called, the new dataref will be 1, but the

> > > > > head_frag will be set to 0, in which case the recycling code won't be

> > > > > called for that skb.

> > > > > So you are mostly worried about a race within the context of

> > > > > pskb_expand_skb() between copying the frags, releasing the previous head

> > > > > and preparing the new one (on a cloned skb)?

> > > >

> > > > The race is between freeing the two skbs. So the original and the

> > > > clone w/ the expanded head will have separate instances of the page. I

> > > > am pretty certain there is a race if the two of them start trying to

> > > > free the page frags at the same time.

> > > >

> > >

> > > Right, I completely forgot calling __skb_frag_unref() before releasing the

> > > head ...

> > > You are right, this will be a race.  Let me go back to the original mail

> > > thread and see what we can do

> > >

> >

> > What do you think about resetting pp_recycle bit on pskb_expand_head()?

> 

> I assume you mean specifically in the cloned case?

> 


Yes. Even if we do it unconditionally we'll just lose non-cloned buffers from
the recycling.
I'll send a patch later today.

> > If my memory serves me right Eric wanted that from the beginning. Then the

> > cloned/expanded SKB won't trigger the recycling.  If that skb hits the free

> > path first, we'll end up recycling the fragments eventually.  If the

> > original one goes first, we'll just unmap the page(s) and freeing the cloned

> > one will free all the remaining buffers.

> 

> I *think* that should be fine. Effectively what we are doing is making

> it so that if the original skb is freed first the pages are released,

> and if it is released after the clone/expended skb then it can be

> recycled.


Exactly

> 

> The issue is we have to maintain it so that there will be exactly one

> caller of the recycling function for the pages. So any spot where we

> are updating skb->head we will have to see if there is a clone and if

> so we have to clear the pp_recycle flag on our skb so that it doesn't

> try to recycle the page frags as well.


Correct. I'll keep looking around in case there's something less fragile we
can do 

Thanks
/Ilias
Alexander Duyck July 8, 2021, 3:36 p.m. UTC | #18
On Wed, Jul 7, 2021 at 7:27 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>

> On 2021/7/7 23:01, Alexander Duyck wrote:

> > On Tue, Jul 6, 2021 at 8:05 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:

> >>

> >> On 2021/7/7 4:45, Alexander Duyck wrote:

> >>> On Wed, Jun 30, 2021 at 2:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:

> >>>>

> >>>> Currently page pool only support page recycling only when

> >>>> refcnt of page is one, which means it can not support the

> >>>> split page recycling implemented in the most ethernet driver.

> >>>>

> >>>> So add elevated refcnt support in page pool, and support

> >>>> allocating page frag to enable multi-frames-per-page based

> >>>> on the elevated refcnt support.

> >>>>

> >>>> As the elevated refcnt is per page, and there is no space

> >>>> for that in "struct page" now, so add a dynamically allocated

> >>>> "struct page_pool_info" to record page pool ptr and refcnt

> >>>> corrsponding to a page for now. Later, we can recycle the

> >>>> "struct page_pool_info" too, or use part of page memory to

> >>>> record pp_info.

> >>>>

> >>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>

> >>

> >> Hi, Alexander

> >>

> >> Thanks for detailed reviewing.

> >>

> >>>

> >>> So this isn't going to work with the current recycling logic. The

> >>> expectation there is that we can safely unmap the entire page as soon

> >>> as the reference count is greater than 1.

> >>

> >> Yes, the expectation is changed to we can always recycle the page

> >> when the last user has dropped the refcnt that has given to it when

> >> the page is not pfmemalloced.

> >>

> >> The above expectation is based on that the last user will always

> >> call page_pool_put_full_page() in order to do the recycling or do

> >> the resource cleanup(dma unmaping..etc).

> >>

> >> As the skb_free_head() and skb_release_data() have both checked the

> >> skb->pp_recycle to call the page_pool_put_full_page() if needed, I

> >> think we are safe for most case, the one case I am not so sure above

> >> is the rx zero copy, which seems to also bump up the refcnt before

> >> mapping the page to user space, we might need to ensure rx zero copy

> >> is not the last user of the page or if it is the last user, make sure

> >> it calls page_pool_put_full_page() too.

> >

> > Yes, but the skb->pp_recycle value is per skb, not per page. So my

> > concern is that carrying around that value can be problematic as there

> > are a number of possible cases where the pages might be

> > unintentionally recycled. All it would take is for a packet to get

> > cloned a few times and then somebody starts using pskb_expand_head and

> > you would have multiple cases, possibly simultaneously, of entities

> > trying to free the page. I just worry it opens us up to a number of

> > possible races.

>

> I think page_ref_dec_return() in page_pool_bias_page_recyclable() will

> prevent the above race to happen.

>

> As the page_ref_dec_return() and page_pool_bias_page_recyclable() return

> true, all user of the page have done with the p->pp_magic and p->pp_info,

> so it should be ok to reset the p->pp_magic and p->pp_info in any order?

>

> And page_ref_dec_return() has both __atomic_pre_full_fence() and

> __atomic_post_full_fence() to ensure the above ordering.


So if I understand correctly what you are saying is that because of
the pagecnt_bias check we will not hit the page_pool_release_page.
That may help to address the issue introduced by the recycling patch
but I don't think it completely resolves it. In addition there may be
performance implications to this change since you are requiring the
atomic dec for every page.

The difference between pagecnt_bias and what you have here is that we
freed the page when page_ref_count hit 0. With this approach you are
effectively freeing the page when page_ref_count == pagecnt_bias +
modifier. The two implementations have quite a number of differences
in behavior.

What you have effectively done here is turn the page refcount and
pagecnt_bias into a ticket lock where we cannot call the free
function until page_ref_count == pagecnt_bias + 1. So you need to
keep the pagecnt_bias much lower than the page_ref_count, otherwise you
run the risk of frequent recycling. For the non-shared page_pool pages
this is probably fine, however the frags implementation is horribly
broken.

Also the ticketlock approach is flawed because with something like
that we shouldn't rewind the number we are currently serving like we
do. We would have to wait until we are the only one holding the page
before we could recycle previously used values.

> >

> >>>

> >>> In addition I think I need to look over that code better as I am

> >>> wondering if there are potential issues assuming a path such as a

> >>> skb_clone followed by pskb_expand_head may lead to memory corruptions

> >>> since the clone will still have pp_recycle set but none of the pages

> >>> will be part of the page pool anymore.

> >>

> >> There is still page->pp_magic that decides if the page is from

> >> page_pool or not.

> >

> > The problem with pp_magic is that it doesn't prevent races. The page

> > pool code was meant to be protected by NAPI to prevent simultaneous

> > access. With us now allowing the stack to be a part of the handling we

> > open things up to potential races in the code.

>

> As above.

>

> >

> >>>

> >>> For us the pagecnt_bias would really represent the number of

> >>> additional mappings beyond the current page that are being held. I

> >>> have already been playing around with something similar. However the

> >>> general idea is that we want to keep track of how many references to

> >>> the page the device is holding onto. When that hits 0 and the actual

> >>> page count is 1 we can refill both, however if we hit 0 and there are

> >>> multiple references to the page still floating around we should just

> >>> unmap the page and turn it over to the stack or free it.

> >>

> >> I am not sure I understood the above.

> >

> > As I have already mentioned, the fundamental problem with sharing a

> > page and using the page pool is that the page pool assumes that it can

> > unmap if it has a reference count greater than 0. That will no longer

> > be the case. It has to wait until all of the pagecnt_bias has been

> > cleared before it can unmap the page. Using get_page/put_page is fine

> > since it will have no impact on the DMA mappings, but we have to hold

> > off on calling things like page_pool_put_full_page or update it so

> > that it will not unmap as long as there is still pagecnt_bias in

> > place.

>

> Actually pagecnt_bias is never clear when the page is in use or is still

> recyclable, and DMA unmapping is only done when page is not in use and

> and the page is not recyclable(page is from pf_memealloced or pool->ring

> is full).

>

> The page_pool_bias_page_recyclable() is used to decide whether there is

> user using the page, if there is still other user using the page, current

> user calling the page_pool_bias_page_recyclable() just do a ref_dec and

> return, it is only the last user calling the page_pool_bias_page_recyclable()

> will do the DMA unmapping if the page is not recyclable.


So now that I have the ticketlock model in my mind I think I see where
you and I may be differing in how we have been viewing things. One
thing is that in my mind we would be freeing/recycling the page when
page_ref_count == pagecnt_bias and skip the extra "+1" modifier.

In my mind the driver is needing to hold onto one reference to the
page itself as long as it is processing Rx DMA requests. So we need to
block recycling until the driver is no longer holding onto the page
for possible DMA operations. In my mind we are doing so via the
pagecnt_bias value and keeping it at least 1 lower than the
page_ref_count until the Rx buffer is ready to be unmapped. For the
last buffer we don't bother with decrementing the pagecnt_bias and
instead just hand the page over to the stack. So what we should have
is the page cycling between a pagecnt_bias that is +1-2 of the actual
page_ref_count and when the two are equal we then perform the
unmap/free or recycle of the page.

On the Tx and SKB side of things we are using the page_ref_count to
track which instances can be recycled and should only ever be reading
pagecnt_bias.

At recycle time we will need to verify there are enough tickets to
support another run through the allocator. We may want to look at
adding a value to the page pool to track the maximum number of slices
a page can be broken into in order to avoid having to update the
page_ref_count and pagecnt_bias too often.

> >

> >> As page reusing in hns3 driver, pagecnt_bias means how many refcnt the

> >> driver is holding, and (page_count(cb->priv) - pagecnt_bias) means how

> >> many refcnt the stack is holding, see [1].

> >>

> >> static bool hns3_can_reuse_page(struct hns3_desc_cb *cb)

> >> {

> >>         return (page_count(cb->priv) - cb->pagecnt_bias) == 1;

> >> }

> >

> > So one thing we have to be careful of is letting the page_count hit 0.

> > My preference is to keep the bias as one less than the total

> > page_count so that we always have the 1 around. So if pagecnt_bias

> > hits 0 and we have a page_count of 1 it means that the current thread

> > owns the only reference to the page.

> >

> >> checking (page_count(cb->priv) - cb->pagecnt_bias) against one instead

> >> of zero is in hns3_can_reuse_page because there is "pagecnt_bias--"

> >> before checking hns3_can_reuse_page() in hns3_nic_reuse_page().

> >>

> >> "pagecnt_bias--" means the driver gives the one of its refcnt to the

> >> stack, it is the stack'job to release the refcnt when the skb is passed

> >> to the stack.

> >>

> >> 1. https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L2870

> >

> > It is mostly just a matter of preference. As long as the difference is

> > a predictable value it can be worked with.

> >

> >>>

> >>>> ---

> >>>>  drivers/net/ethernet/marvell/mvneta.c           |   6 +-

> >>>>  drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c |   2 +-

> >>>>  include/linux/mm_types.h                        |   2 +-

> >>>>  include/linux/skbuff.h                          |   4 +-

> >>>>  include/net/page_pool.h                         |  30 +++-

> >>>>  net/core/page_pool.c                            | 215 ++++++++++++++++++++----

> >>>>  6 files changed, 207 insertions(+), 52 deletions(-)

> >>>>

> >>>> diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c

> >>>> index 88a7550..5a29af2 100644

> >>>> --- a/drivers/net/ethernet/marvell/mvneta.c

> >>>> +++ b/drivers/net/ethernet/marvell/mvneta.c

> >>>> @@ -2327,7 +2327,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

> >>>>         if (!skb)

> >>>>                 return ERR_PTR(-ENOMEM);

> >>>>

> >>>> -       skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);

> >>>> +       skb_mark_for_recycle(skb);

> >>>>

> >>>>         skb_reserve(skb, xdp->data - xdp->data_hard_start);

> >>>>         skb_put(skb, xdp->data_end - xdp->data);

> >>>> @@ -2339,10 +2339,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

> >>>>                 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,

> >>>>                                 skb_frag_page(frag), skb_frag_off(frag),

> >>>>                                 skb_frag_size(frag), PAGE_SIZE);

> >>>> -               /* We don't need to reset pp_recycle here. It's already set, so

> >>>> -                * just mark fragments for recycling.

> >>>> -                */

> >>>> -               page_pool_store_mem_info(skb_frag_page(frag), pool);

> >>>>         }

> >>>>

> >>>>         return skb;

> >>>

> >>> So as I mentioned earlier the problem with recycling is that splitting

> >>> up the ownership of the page makes it difficult for us to clean it up.

> >>> Technically speaking if the pages are being allowed to leave while

> >>> holding references to DMA addresses that we cannot revoke then we

> >>> should be holding references to the device.

> >>>

> >>> That is one of the reasons why the previous code was just clearing the

> >>> mapping as soon as the refcount was greater than 1. However for this

> >>> to work out correctly we would have to track how many DMA mappings we

> >>> have outstanding in addition to the one we are working on currently.

> >>

> >> I think page pool has already handled the above case if I understand

> >> correctly, see page_pool_release().

> >

> > The problem is pagecnt_bias is not multi-thread safe. You are just

> > accessing an int which is prone to races. In order to fix it you would

> > need to add either an atomic count or locks around the access of it

> > which would pretty much negate the point of it.

>

> As pagecnt_bias being not multi-thread safe, let's get back to it

> later.


Actually this is kind of core to things for the batch count updates.
We have to guarantee that the pagecnt_bias is only updated in the
softirq handler, and read-only everywhere else. What we have is
effectively a consumer-producer ticket lock.

> >

> > Really in terms of the page pool recycling code I think it would have

> > made more sense to add the page pool release logic as an skb

> > destructor rather than trying to embed the page pool into the page

> > itself. At least with that if the device is going to go out of scope

> > by being orphaned or the like we could unmap the page and avoid

> > potential races.

>

> I suppose it is not the netdev relevant here, it is the "struct device"

> relevant here, right?

>

> I suppose the page_ref_dec_return() and get_device(pool->p.dev) in

> page_pool_init() is able to avoid the above race, as the unmapping

> is done after page_ref_dec_return()?


The problem is the pointer to pool->p.dev could be potentially stale
in the event of something such as a hotplug event. I would like to
avoid that as it could cause some ugly issues.

> >

> >>>

> >>>> diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

> >>>> index 3135220..540e387 100644

> >>>> --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

> >>>> +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

> >>>> @@ -3997,7 +3997,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,

> >>>>                 }

> >>>>

> >>>>                 if (pp)

> >>>> -                       skb_mark_for_recycle(skb, page, pp);

> >>>> +                       skb_mark_for_recycle(skb);

> >>>>                 else

> >>>>                         dma_unmap_single_attrs(dev->dev.parent, dma_addr,

> >>>>                                                bm_pool->buf_size, DMA_FROM_DEVICE,

> >>>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

> >>>> index 862f88a..cf613df 100644

> >>>> --- a/include/linux/mm_types.h

> >>>> +++ b/include/linux/mm_types.h

> >>>> @@ -101,7 +101,7 @@ struct page {

> >>>>                          * page_pool allocated pages.

> >>>>                          */

> >>>>                         unsigned long pp_magic;

> >>>> -                       struct page_pool *pp;

> >>>> +                       struct page_pool_info *pp_info;

> >>>>                         unsigned long _pp_mapping_pad;

> >>>>                         /**

> >>>>                          * @dma_addr: might require a 64-bit value on

> >>>

> >>> So the problem here is that this is creating a pointer chase, and the

> >>> need to allocate yet another structure to store it is going to be

> >>> expensive.

> >>>

> >>> As far as storing the pagecnt_bias it might make more sense to

> >>> repurpose the lower 12 bits of the dma address. A DMA mapping should

> >>> be page aligned anyway so the lower 12 bits would be reserved 0. When

> >>> we decrement the value so that the lower 12 bits are 0 we should be

> >>> unmapping the page anyway, or resetting the pagecnt_bias to PAGE_SIZE

> >>> - 1 and adding back the bias to the page to effectively reset it for

> >>> reuse.

> >>

> >> Yes, that is a great idea. I like it very much supposing page refcnt

> >> updating batching for 'PAGE_SIZE - 1" is enough for performance sake.

> >>

> >> Will take a look about it.

> >>

> >>>

> >>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

> >>>> index b2db9cd..7795979 100644

> >>>> --- a/include/linux/skbuff.h

> >>>> +++ b/include/linux/skbuff.h

> >>>> @@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)

> >>>>  }

> >>>>

> >>>>  #ifdef CONFIG_PAGE_POOL

> >>>> -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,

> >>>> -                                       struct page_pool *pp)

> >>>> +static inline void skb_mark_for_recycle(struct sk_buff *skb)

> >>>>  {

> >>>>         skb->pp_recycle = 1;

> >>>> -       page_pool_store_mem_info(page, pp);

> >>>>  }

> >>>>  #endif

> >>>

> >>> I am not a fan of the pp_recycle flag either. We duplicate it via

> >>> skb_clone and from what I can tell if we call pskb_expand_head

> >>> afterwards I don't see how we avoid recycling the page frags twice.

> >>

> >> Actually skb->pp_recycle is kind of duplicated, as there is

> >> still page->pp_magic to avoid recycling the page frags twice.

> >>

> >> The argument above adding skb->pp_recycle seems to be short

> >> cut code path for non-page_pool case in the previous discussion,

> >> see [2].

> >>

> >> 2. https://lore.kernel.org/linux-mm/074b0d1d-9531-57f3-8e0e-a447387478d1@huawei.com/

> >

> > Yes, but that doesn't guarantee atomic protections so you still have

> > race conditions possible. All it takes is something stalling during

> > the dma_unmap call. Worse yet from what I can tell it looks like you

> > clear page->pp before you clear page->pp_magic so you have the

> > potential for a NULL pointer issue since it is cleared before the

> > pp_magic value is.

>

> Hopefully the page_ref_dec_return() in page_pool_bias_page_recyclable()

> called by page_pool_put_page() will make the order of page->pp_magic

> clearing and page->pp clearing irrelevant?


Really it doesn't address the issue. The problem is the clearing of
pp_magic is after the dec_and_ref while the reading/clearing of
page->pp is before it.

So having code like the following is not safe:
    pp = page->pp;
    page->pp = NULL;

    if (pp->something)
        do_something();

The check for page->pp_magic before this doesn't resolve it because 2
threads can get into the code path before either one has updated
page->pp_magic.

Arguably the pagecnt_bias does something to help, but what it has
effectively done is created a ticket lock where until you can get
page_ref_count to reach the pagecnt_bias value you cannot unmap or
free the page. So the tradeoff is that if anyone takes a reference to
the page you are now stuck and cannot unmap it nor remove the device
while the page is still in use elsewhere.

Also it just occurred to me that this will likely cause leaks because
page_ref_count is also updated outside of page_pool so we would have
to worry about someone calling get_page, then your call to
page_pool_bias_page_recyclable, and then put_page, and at that point
the page is leaked.

<...>
> >>>

> >>>>   * Fast allocation side cache array/stack

> >>>> @@ -77,6 +79,7 @@ struct page_pool_params {

> >>>>         enum dma_data_direction dma_dir; /* DMA mapping direction */

> >>>>         unsigned int    max_len; /* max DMA sync memory size */

> >>>>         unsigned int    offset;  /* DMA addr offset */

> >>>> +       unsigned int    frag_size;

> >>>>  };

> >>>>

> >>>>  struct page_pool {

> >>>> @@ -88,6 +91,8 @@ struct page_pool {

> >>>>         unsigned long defer_warn;

> >>>>

> >>>>         u32 pages_state_hold_cnt;

> >>>> +       unsigned int frag_offset;

> >>>> +       struct page *frag_page;

> >>>>

> >>>>         /*

> >>>>          * Data structure for allocation side

> >>>> @@ -128,6 +133,11 @@ struct page_pool {

> >>>>         u64 destroy_cnt;

> >>>>  };

> >>>>

> >>>> +struct page_pool_info {

> >>>> +       struct page_pool *pp;

> >>>> +       int pagecnt_bias;

> >>>> +};

> >>>> +

> >>>

> >>> Rather than having a top-down structure here it might be better to

> >>> work bottom up. If you assume you are keeping a pagecnt_bias per page

> >>> it might make more sense to store this in the driver somewhere rather

> >>> than having it as a separate allocated buffer. One advantage of the

> >>> Intel drivers was doing this as we had the pagecnt_bias in a structure

> >>> that also pointed to the page. That way we were only updating that

> >>> count if we dropped the page and didn't have to even touch the page.

> >>> You could use that to batch updates to the pagecnt_bias if we did use

> >>> the lower 12 bits of the DMA address to store it as well.

> >>

> >> I am not sure I understood what "we dropped the page" meant.

> >

> > For XDP_DROP if we are dropping the buffer we are dropping the page

> > which in our case means we just need to increment the pagecnt_bias

> > indicating we are putting it back and don't have to do anything with

> > the actual page refcount or struct.

>

> In that case, the driver not doing a page_pool_put_page() seems enough

> and reuse the page frag again?

>

> It seems to be like the use case below in the hns3 driver? If all the buffer

> has memcpy the head page, just reuse it.

>

> https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L3149

>

> >

> >> The driver does not really need to call page_pool_put_full_page()

> >> if the page of a skb is passed to stack, the driver mainly call

> >> page_pool_put_full_page() when unloading or uniniting when the page

> >> is not passed to stack yet.

> >

> > I was thinking mostly of something like XDP_TX cases when combined

> > with the pagecnt_bias. You will need to have something to return the

> > page to the pool after the XDP_TX is completed.

>

> I suppose XDP_TX is aware of page pool to call page_pool_put_full_page()

> when XDP_TX is completed now?

>

> I suppose the above should be handled as similar as the non-elevated refcnt

> case?


This is where including page frags makes this messy. In the frags case
you only want to put back the page once, however if you are using
frags the XDP_TX will have multiple copies of the same page so you
would need to have a way to identify when all the copies have been
consumed before you can recycle the page.

> >

> >>> I'm assuming the idea with this is that you will be having multiple

> >>> buffers received off of a single page and so doing it that way you

> >>> should only have one update on allocation, maybe a trickle of updates

> >>> for XDP_TX, and another large update when the page is fully consumed

> >>> and you drop the remaining pagecnt_bias for Rx.

> >>

> >> I suppose "having multiple buffers received off of a single page" mean:

> >> use first half of a page for a desc, and the second half of the same page

> >> for another desc, instead of ping-pong way of reusing implemented in most

> >> driver currently?

> >>

> >> I am not so familiar with XDP to understand the latter part of comment too.

> >

> > The alloc_frag logic below is an example of what I am talking about.

> > Basically taking a page and chopping it up into multiple pieces for

> > use as multiple receives instead of just one receive.

>

> Ok, but when multiple receives is passed to the stack and after the stack is

> done with all the receives, we should be able to recycle the page, right?


Yes. That is the trick to all this. Identifying when we can safely
recycle the page.

> >

> >>>

> >>>>  struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);

> >>>>

> >>>>  static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

> >>>> @@ -137,6 +147,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

> >>>>         return page_pool_alloc_pages(pool, gfp);

> >>>>  }

> >>>>

> >>>> +struct page *page_pool_alloc_frag(struct page_pool *pool,

> >>>> +                                 unsigned int *offset, gfp_t gfp);

> >>>> +

> >>>> +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,

> >>>> +                                                   unsigned int *offset)

> >>>> +{

> >>>> +       gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);

> >>>> +

> >>>> +       return page_pool_alloc_frag(pool, offset, gfp);

> >>>> +}

> >>>> +

> >>>>  /* get the stored dma direction. A driver might decide to treat this locally and

> >>>>   * avoid the extra cache line from page_pool to determine the direction

> >>>>   */

> >>>> @@ -253,11 +274,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool)

> >>>>                 spin_unlock_bh(&pool->ring.producer_lock);

> >>>>  }

> >>>>

> >>>> -/* Store mem_info on struct page and use it while recycling skb frags */

> >>>> -static inline

> >>>> -void page_pool_store_mem_info(struct page *page, struct page_pool *pp)

> >>>> -{

> >>>> -       page->pp = pp;

> >>>> -}

> >>>> -

> >>>>  #endif /* _NET_PAGE_POOL_H */

> >>>

> >>> So the issue as I see it with the page_pool recycling patch set is

> >>> that I don't think we had proper guarantees in place that the page->pp

> >>> value was flushed in all cases where skb->dev was changed. Basically

> >>> the logic we need to have in place to address those issues is that

> >>> when skb->dev is changed we need to invalidate the DMA mappings on the

> >>> page_pool page.

> >>

> >> The DMA mappings invalidating is based on the pool->p.dev, is there

> >> any reason why the DMA mappings need invalidating when skb->dev is

> >> change, as far as I can tell, the tx is not aware of page pool, so

> >> when the skb is redirected, the page of the skb is always DMA mapped

> >> according to skb->dev before xmitting.

> >>

> >> Or it is about XDP redirected?

> >>

> >> Is there something obvious I missed here?

> >

> > It is about unmapping the page. In order to do so we have to maintain

> > a pointer to the original DMA device. The page pool is doing that for

> > us currently.

> >

> > Most netdevs have a parent  device that is used for DMA mapping.

> > Therefore if skb->dev is valid, then the parent device is still valid

> > since destroying the parent would destroy the children. If the

> > skb->dev is dropped or changed, then we cannot guarantee the parent

> > device is still present. So generally if skb->dev cannot be maintained

> > then we probably shouldn't be maintaining the DMA mapping or page->pp

> > across that boundary either.

>

> Does the get_device(pool->p.dev) in page_pool_init() not prevent the

> above case?


Actually it is the inflight pages that are the important part and it
does look like page_pool_release takes care of that
case.

<...>
> >>>

> >>>> +static int page_pool_clear_pp_info(struct page *page)

> >>>> +{

> >>>> +       struct page_pool_info *pp_info = page->pp_info;

> >>>> +       int bias;

> >>>> +

> >>>> +       bias = pp_info->pagecnt_bias;

> >>>> +

> >>>> +       kfree(pp_info);

> >>>> +       page->pp_info = NULL;

> >>>> +       page->pp_magic = 0;

> >>>> +

> >>>> +       return bias;

> >>>> +}

> >>>> +

> >>>> +static void page_pool_clear_and_drain_page(struct page *page)

> >>>> +{

> >>>> +       int bias = page_pool_clear_pp_info(page);

> >>>> +

> >>>> +       __page_frag_cache_drain(page, bias + 1);

> >>>> +}

> >>>> +

> >>>>  static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

> >>>>                                                  gfp_t gfp)

> >>>>  {

> >>>> @@ -216,13 +259,16 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

> >>>>         if (unlikely(!page))

> >>>>                 return NULL;

> >>>>

> >>>> -       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

> >>>> -           unlikely(!page_pool_dma_map(pool, page))) {

> >>>> +       if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

> >>>>                 put_page(page);

> >>>>                 return NULL;

> >>>>         }

> >>>>

> >>>> -       page->pp_magic |= PP_SIGNATURE;

> >>>> +       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

> >>>> +           unlikely(!page_pool_dma_map(pool, page))) {

> >>>> +               page_pool_clear_and_drain_page(page);

> >>>> +               return NULL;

> >>>> +       }

> >>>>

> >>>>         /* Track how many pages are held 'in-flight' */

> >>>>         pool->pages_state_hold_cnt++;

> >>>> @@ -261,12 +307,17 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

> >>>>          */

> >>>>         for (i = 0; i < nr_pages; i++) {

> >>>>                 page = pool->alloc.cache[i];

> >>>> +               if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

> >>>> +                       put_page(page);

> >>>> +                       continue;

> >>>> +               }

> >>>> +

> >>>>                 if ((pp_flags & PP_FLAG_DMA_MAP) &&

> >>>>                     unlikely(!page_pool_dma_map(pool, page))) {

> >>>> -                       put_page(page);

> >>>> +                       page_pool_clear_and_drain_page(page);

> >>>>                         continue;

> >>>>                 }

> >>>

> >>> This seems backwards to me. I would have the pp_info populated after

> >>> you have generated the DMA mapping.

> >>

> >> Ok.

> >>

> >>>

> >>>> -               page->pp_magic |= PP_SIGNATURE;

> >>>> +

> >>>>                 pool->alloc.cache[pool->alloc.count++] = page;

> >>>>                 /* Track how many pages are held 'in-flight' */

> >>>>                 pool->pages_state_hold_cnt++;

> >>>> @@ -284,6 +335,25 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

> >>>>         return page;

> >>>>  }

> >>>>

> >>>> +static void page_pool_sub_bias(struct page *page, int nr)

> >>>> +{

> >>>> +       struct page_pool_info *pp_info = page->pp_info;

> >>>> +

> >>>> +       /* "pp_info->pagecnt_bias == 0" indicates the PAGECNT_BIAS

> >>>> +        * flags is not set.

> >>>> +        */

> >>>> +       if (!pp_info->pagecnt_bias)

> >>>> +               return;

> >>>> +

> >>>> +       /* Make sure pagecnt_bias > 0 for elevated refcnt case */

> >>>> +       if (unlikely(pp_info->pagecnt_bias <= nr)) {

> >>>> +               page_ref_add(page, USHRT_MAX);

> >>>> +               pp_info->pagecnt_bias += USHRT_MAX;

> >>>> +       }

> >>>> +

> >>>> +       pp_info->pagecnt_bias -= nr;

> >>>

> >>> So we should never have a case where pagecnt_bias is less than the

> >>> value we are subtracting. If we have that then it is a bug.

> >>

> >> Yes.

> >

> > Sorry, I was referring to the code above comparing pagecnt_bias to nr.

> > At most nr should only ever be equal to pagecnt_bias, you should hold

> > off on recharging pagecnt_bias until you can verify the page_count

> > indicates we are the only holder of the page. Then we can recharge it

> > and reset any offsets.

>

> Actually the page pool is the only user of the page when the driver is

> calling page_pool_alloc_frag(), page is from pool->alloc/pool->ring or

> page allocator in page_pool_alloc_pages(), as mentioned above, the

> last user will put the page in pool->ring holding a lock, and when

> page_pool_alloc_pages() get a page (also holding the same lock) from

> pool->ring, there should be no user of the page other than the page pool.

>

> And page_pool_sub_bias() is called in page_pool_alloc_frag() and

> page_pool_alloc_pages().


I think we would need to see a version of this patch without the
alloc_frag calls in order to really be able to do a review. The
problem is I don't see how the page_pool_alloc_frag can expect to have
sole ownership of the page if it is allocating fragments of the page.
The frags call imply multiple users for a single page.

> >

> >>>

> >>> The general idea with the pagecnt_bias is that we want to batch the

> >>> release of the page from the device. So the assumption is we are going

> >>> to pull multiple references from the page and rather than doing

> >>> page_ref_inc repeatedly we want to batch it at the start, and we have

> >>> to perform a __page_frag_cache_drain to remove any unused references

> >>> when we need to free it.

> >>

> >> Yes, it is about batching the page_ref_inc() operation.

> >>

> >>>

> >>> What we should probably be checking for is "pp_info->pagecnt_bias -

> >>> page_count(page) > 1" when we hit the end of the page. If that is true

> >>> then we cannot recycle the page and so when we hit PAGE_SIZE for the

> >>> offset we have to drop the mapping and free the page subtracting any

> >>> remaining pagecnt_bias we are holding. If I recall I actually ran this

> >>> the other way and ran toward 0 in my implementation before as that

> >>> allows for not having to track via a value and instead simply checking

> >>> for a signed result.

> >>

> >>

> >> When allocating a page for frag, we have decided how many user is using

> >> the page, that is the "page_pool_sub_bias(frag_page, max_len / frag_size - 1)"

> >> in page_pool_alloc_frag().

> >>

> >> so it is up to the driver or stack to do multi page_pool_put_full_page()

> >> calling for the same page.

> >

> > So that is one spot that I think is an issue. We normally only want

> > this called once per page and ideally after pagecnt_bias is 0. One

> > issue is that pagecnt_bias is non-atomic so we should really be

> > restricting this to just the driver calling it in softirq context.

>

> Let's discuss the pagecnt_bias handling at the end.

>

> >

> >> Or the page pool will call page_pool_put_full_page() in page_pool_empty_frag()

> >> if some of the page frag is not allocated to the driver yet.

> >>

> >> It seems you are suggesting a slightly different way to do frag reusing.

> >

> > As I mentioned I am not a fan of the current recycling scheme. There

> > are too many openings for it to end up unmapping the same page

> > multiple times or other possible issues.

>

> Other than the pagecnt_bias handling in non-atomic context, I think

> most of the race you mentioned above has been handled if I understand

> it correctly?


The biggest issue is that if we assume this to be more of a ticket
lock model, you have threads outside of this that are using
get_page/put_page that will mess with your tickets and cause leaks
because your unlocker may end up getting a non-matching ticket even
though it is the last call to __page_pool_put_page.

> >

> > In my mind the driver or page_pool should own the page and just keep

> > it on a list to either be freed or recycled with the skb destructor

> > being used to trigger the recycling.

>

> The page_pool still own the page, it is just that when driver also own

> the page by calling page_pool_alloc_pages(), and the page is not on a

> list of page pool, the driver or stack calling the page_pool_put_full_page()

> will put the page back to the list of page pool(or do resource cleaning and

> put it back to page allocator) if it is the last user.

>

> I am not familiar enough with destructors to say if using skb destructor

> has any difference here.

>

> >

> >>>

> >>>> +}

> >>>> +

> >>>>  /* For using page_pool replace: alloc_pages() API calls, but provide

> >>>>   * synchronization guarantee for allocation side.

> >>>>   */

> >>>> @@ -293,15 +363,66 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)

> >>>>

> >>>>         /* Fast-path: Get a page from cache */

> >>>>         page = __page_pool_get_cached(pool);

> >>>> -       if (page)

> >>>> +       if (page) {

> >>>> +               page_pool_sub_bias(page, 1);

> >>>>                 return page;

> >>>> +       }

> >>>

> >>> I'm not sure we should be subtracting from the bias here. Ideally if

> >>> you are getting a page you are getting the full 4K page. So having a

> >>> bias other than PAGE_SIZE - 1 wouldn't make much sense here.

> >>

> >> It seems we have different understanding about pagecnt_bias here,

> >> as the pagecnt_bias is hidden in the page pool now, the subtracting

> >> here mean we give one refcnt to the caller of page_pool_alloc_pages(),

> >> And in page_pool_alloc_frag(), we give different part of page to the

> >> driver, so it means more user too, so there is also subtracting in the

> >> page_pool_alloc_frag() too.

> >

> > I see what you are getting at, however I think it depends on your use

> > case. In my mind since you are allocating the full page you should

> > have the full count available to you. I don't believe pagecnt_bias is

> > something that should be looked at outside of the driver, or at least

> > outside of the napi context of the device softirq.

> >

> > So really in order for this to work correctly you would need to have

> > some minimum amount of bias reserved for the device to access if you

> > are going to break up page in to n usable buffers.

>

> Ensuring the pagecnt_bias > 0 in page_pool_sub_bias() seems enough

> to make sure the page pool always own the page?


Except for the leak issue I pointed out above. If we are going to
enforce pagecnt_bias as a check for unmapping we have to guarantee
that anyone touching the page will use your function to release the
references to it.

Again this is why I think it would be better to just maintain a list
of inflight pages and then unmap them from the driver if they are still
on the list for longer than some fixed period of time.

> >

> >>>

> >>>>

> >>>>         /* Slow-path: cache empty, do real allocation */

> >>>>         page = __page_pool_alloc_pages_slow(pool, gfp);

> >>>> +       if (page)

> >>>> +               page_pool_sub_bias(page, 1);

> >>>> +

> >>>

> >>> Same here. Really in both cases we should be getting initialized

> >>> pages, not ones that are already decrementing.

> >>>

> >>>>         return page;

> >>>>  }

> >>>>  EXPORT_SYMBOL(page_pool_alloc_pages);

> >>>>

> >>>> +struct page *page_pool_alloc_frag(struct page_pool *pool,

> >>>> +                                 unsigned int *offset, gfp_t gfp)

> >>>> +{

> >>>> +       unsigned int frag_offset = pool->frag_offset;

> >>>> +       unsigned int frag_size = pool->p.frag_size;

> >>>> +       struct page *frag_page = pool->frag_page;

> >>>> +       unsigned int max_len = pool->p.max_len;

> >>>> +

> >>>> +       if (!frag_page || frag_offset + frag_size > max_len) {

> >>>

> >>> These are two very different cases. If frag_page is set and just out

> >>> of space we need to be freeing the unused references.

> >>

> >> As mention above, we are depending on the last user to do the

> >> recycling or freeing the unused references.

> >

> > But you are holding the pagecnt_bias for it aren't you? If so you need

> > to release it so that the last user knows that they were the last

> > user.

>

> The user will know it is the last user if page_pool_bias_page_recyclable()

> return true.


Except for the leak issue pointed out above.

> >

> > Once you aren't using the page you need to release the pagecnt_bias

> > since the page is on the path to being freed.

> It seems the above is more above what does the pagecnt_bias represent?

>

> >

> >>>

> >>>> +               frag_page = page_pool_alloc_pages(pool, gfp);

> >>>

> >>> So as per my comment above the page should be coming in with a

> >>> pagecnt_bias of PAGE_SIZE - 1, and an actual page_ref_count of

> >>> PAGE_SIZE.

> >>

> >> Let's align the understanding of pagecnt_bias first?

> >>

> >> pagecnt_bias meant how many refcnt of a page belong to the page

> >> pool, and (page_ref_count() - pagecnt_bias) means how many refcnt

>

> Actually it is (page_ref_count() - (pagecnt_bias + 1))

>

> >> of a page belong to user of the page pool.

> >

> > So my view is a slight variation on that. I view pagecnt_bias as the

> > count of references reserved by the page_pool, and page_ref_count -

> > pagecnt_bias is the actual reference count. So if I am going to free a

> > page I should deduct pagecnt_bias + 1 from the reference count to

> > account for dropping our bias and the one for the fact that we own the

> > page.

>

> So if (page_ref_count() - (pagecnt_bias + 1)) == 0 means only the page

> pool hold the page and it means whichever caller having the

> page_pool_bias_page_recyclable() returning true is the last user, right?

>

> >

> >>>

> >>>> +               if (unlikely(!frag_page)) {

> >>>> +                       pool->frag_page = NULL;

> >>>> +                       return NULL;

> >>>> +               }

> >>>> +

> >>>> +               pool->frag_page = frag_page;

> >>>> +               frag_offset = 0;

> >>>> +

> >>>> +               page_pool_sub_bias(frag_page, max_len / frag_size - 1);

> >>>

> >>> Why are you doing division here? We should just be subtracting 1 from

> >>> the pagecnt_bias since that is the number of buffers that are being

> >>> used. The general idea is that when pagecnt_bias is 0 we cut the page

> >>> loose for potential recycling or freeing, otherwise we just subtract

> >>> our new value from pagecnt_bias until we reach it.

> >>

> >> As mentioned above, division is used to find out how many user may be

> >> using the page.

> >

> > That doesn't make any sense to me because it won't tell you the actual

> > users, and from what I can tell it is buggy since if I use this to

> > allocate a chunk larger than 2K this comes out to 0 doesn't it? It

> > seems like you should just always use 1 as the count.

>

> There is already a page_pool_sub_bias(page, 1) in page_pool_alloc_pages(),

> so for 4K page, there is two users for a page with 2K frag size, and there

> is 32 users for 64K page with 2K frag size.

>

> The reason doing a page_pool_sub_bias(page, 1) in page_pool_alloc_pages()

> is that the caller is expected to use the page as a whole when using the

> page_pool_alloc_pages() directly, so it means only one user.


The logic doesn't make any sense. You shouldn't need to do any
subtraction then. The idea is you subtract 1 per frag pulled from the
page. The logic you have here just doesn't make sense as you are
making smaller frags pull additional bias counts. If I pull a small
fragment I could consume the entire bias in a single call.

> >

> >>>

> >>>> +       }

> >>>> +

> >>>> +       *offset = frag_offset;

> >>>> +       pool->frag_offset = frag_offset + frag_size;

> >>>> +

> >>>> +       return frag_page;

> >>>> +}

> >>>> +EXPORT_SYMBOL(page_pool_alloc_frag);

> >>>> +

> >>>> +static void page_pool_empty_frag(struct page_pool *pool)

> >>>> +{

> >>>> +       unsigned int frag_offset = pool->frag_offset;

> >>>> +       unsigned int frag_size = pool->p.frag_size;

> >>>> +       struct page *frag_page = pool->frag_page;

> >>>> +       unsigned int max_len = pool->p.max_len;

> >>>> +

> >>>> +       if (!frag_page)

> >>>> +               return;

> >>>> +

> >>>> +       while (frag_offset + frag_size <= max_len) {

> >>>> +               page_pool_put_full_page(pool, frag_page, false);

> >>>> +               frag_offset += frag_size;

> >>>> +       }

> >>>> +

> >>>> +       pool->frag_page = NULL;

> >>>> +}

> >>>> +

> >>>

> >>> It would be good to look over the page_frag_alloc_align and

> >>> __page_frag_cache_drain functions for examples of how to do most of

> >>> this. The one complication is that we have the dma mappings and

> >>> page_pool logic to deal with.

> >>

> >> Is it ok to rely on the user providing a aligning frag_size, so

> >> that do not need handling it here?

> >

> > It is probably fine since the page pool should only have one consumer

> > so the requests just need to be aligned by them.

> >

> >>>

> >>>>  /* Calculate distance between two u32 values, valid if distance is below 2^(31)

> >>>>   *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution

> >>>>   */

> >>>> @@ -326,10 +447,11 @@ static s32 page_pool_inflight(struct page_pool *pool)

> >>>>   * a regular page (that will eventually be returned to the normal

> >>>>   * page-allocator via put_page).

> >>>>   */

> >>>> -void page_pool_release_page(struct page_pool *pool, struct page *page)

> >>>> +static int __page_pool_release_page(struct page_pool *pool,

> >>>> +                                   struct page *page)

> >>>>  {

> >>>>         dma_addr_t dma;

> >>>> -       int count;

> >>>> +       int bias, count;

> >>>>

> >>>>         if (!(pool->p.flags & PP_FLAG_DMA_MAP))

> >>>>                 /* Always account for inflight pages, even if we didn't

> >>>> @@ -345,22 +467,29 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)

> >>>>                              DMA_ATTR_SKIP_CPU_SYNC);

> >>>>         page_pool_set_dma_addr(page, 0);

> >>>>  skip_dma_unmap:

> >>>> -       page->pp_magic = 0;

> >>>> +       bias = page_pool_clear_pp_info(page);

> >>>>

> >>>>         /* This may be the last page returned, releasing the pool, so

> >>>>          * it is not safe to reference pool afterwards.

> >>>>          */

> >>>>         count = atomic_inc_return(&pool->pages_state_release_cnt);

> >>>>         trace_page_pool_state_release(pool, page, count);

> >>>> +       return bias;

> >>>> +}

> >>>> +

> >>>> +void page_pool_release_page(struct page_pool *pool, struct page *page)

> >>>> +{

> >>>> +       int bias = __page_pool_release_page(pool, page);

> >>>> +

> >>>> +       WARN_ONCE(bias, "PAGECNT_BIAS is not supposed to be enabled\n");

> >>>>  }

> >>>>  EXPORT_SYMBOL(page_pool_release_page);

> >>>>

> >>>>  /* Return a page to the page allocator, cleaning up our state */

> >>>>  static void page_pool_return_page(struct page_pool *pool, struct page *page)

> >>>>  {

> >>>> -       page_pool_release_page(pool, page);

> >>>> +       __page_frag_cache_drain(page, __page_pool_release_page(pool, page) + 1);

> >>>>

> >>>> -       put_page(page);

> >>>>         /* An optimization would be to call __free_pages(page, pool->p.order)

> >>>>          * knowing page is not part of page-cache (thus avoiding a

> >>>>          * __page_cache_release() call).

> >>>> @@ -395,7 +524,16 @@ static bool page_pool_recycle_in_cache(struct page *page,

> >>>>         return true;

> >>>>  }

> >>>>

> >>>> -/* If the page refcnt == 1, this will try to recycle the page.

> >>>> +static bool page_pool_bias_page_recyclable(struct page *page, int bias)

> >>>> +{

> >>>> +       int ref = page_ref_dec_return(page);

> >>>> +

> >>>> +       WARN_ON(ref < bias);

> >>>> +       return ref == bias + 1;

> >>>> +}

> >>>> +

> >>>> +/* If pagecnt_bias == 0 and the page refcnt == 1, this will try to

> >>>> + * recycle the page.

> >>>>   * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for

> >>>>   * the configured size min(dma_sync_size, pool->max_len).

> >>>>   * If the page refcnt != 1, then the page will be returned to memory

> >>>> @@ -405,16 +543,35 @@ static __always_inline struct page *

> >>>>  __page_pool_put_page(struct page_pool *pool, struct page *page,

> >>>>                      unsigned int dma_sync_size, bool allow_direct)

> >>>>  {

> >>>> -       /* This allocator is optimized for the XDP mode that uses

> >>>> +       int bias = page->pp_info->pagecnt_bias;

> >>>> +

> >>>> +       /* Handle the elevated refcnt case first:

> >>>> +        * multi-frames-per-page, it is likely from the skb, which

> >>>> +        * is likely called in non-sofrirq context, so do not recycle

> >>>> +        * it in pool->alloc.

> >>>> +        *

> >>>> +        * Then handle non-elevated refcnt case:

> >>>>          * one-frame-per-page, but have fallbacks that act like the

> >>>>          * regular page allocator APIs.

> >>>> -        *

> >>>>          * refcnt == 1 means page_pool owns page, and can recycle it.

> >>>>          *

> >>>>          * page is NOT reusable when allocated when system is under

> >>>>          * some pressure. (page_is_pfmemalloc)

> >>>>          */

> >>>> -       if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {

> >>>> +       if (bias) {

> >>>> +               /* We have gave some refcnt to the stack, so wait for

> >>>> +                * all refcnt of the stack to be decremented before

> >>>> +                * enabling recycling.

> >>>> +                */

> >>>> +               if (!page_pool_bias_page_recyclable(page, bias))

> >>>> +                       return NULL;

> >>>> +

> >>>> +               /* only enable recycling when it is not pfmemalloced */

> >>>> +               if (!page_is_pfmemalloc(page))

> >>>> +                       return page;

> >>>> +

> >>>

> >>> So this would be fine if this was only accessed from the driver. The

> >>> problem is the recycling code made it so that this is accessed in the

> >>> generic skb freeing path. As such I think this is prone to races since

> >>> you have to guarantee the ordering of things between the reference

> >>> count and pagecnt_bias.

> >>

> >> As reference count is handled atomically is page_pool_bias_page_recyclable,

> >> and pagecnt_bias is changed before any page is handled to the stack(maybe

> >> some READ_ONCE/WRITE_ONCE or barrier is still needed, will check it again),

> >> so I suppose the ordering is correct?

> >

> > The problem is in order to get this working correctly you would likely

> > need to add a number of barriers so that reads and writes are in a

> > specific order. You would be much better off just not

> > reading/modifying the pagecnt_bias outside of the softirq paths.

>

> Most of the reusing implemented in the driver today may not be

> able to do reusing when the stack does not process the skb and

> dec the refcnt quick enough, this patch try to reuse the page

> as much as possible when above case happens.

>

> So it seems the pagecnt_bias need to be checked outside of the

> softirq to implement that?

>

> Let's break down the step of reusing a page:

> 1. driver call page_pool_alloc_frag() to allocte a page frag.

> 2. page pool sub the pagecnt_bias according to the user using the

>    page.

> 3. driver fill the page info to the desc.

> 4. driver notify the hw that desc is filled with page info.

> 5. hw write the packet to page memory according to info in desc.

> 6. driver process the desc and passed the skb(contianing the page

>    frag) to stack

> 7. stack process the skb

> 8. stack put the page to page pool calling page_pool_return_page(),

>    if it is the last user by checking pagecnt_bias, the page is recycled

>    in page pool or is returned to page allocated after cleaning the

>    resource.

>

> There is usually barrier in step 4 and step 6, at least in hns3 drvier,

> see the barrier does not seems to be necessary?

>

> see:

> https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L2867

> https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L3368


I am not really interested in the page frag case for now. There are
bigger issues with the patch set. I would recommend splitting the page
frags out into a separate patch and just try to work out the bulk
updating of the page count for now.

Ideally this would be broken out into smaller patches so it is easier
to review as there are currently several issues that we are talking
about here in parallel which is making the discussion confusing.
Alexander Duyck July 8, 2021, 3:41 p.m. UTC | #19
On Thu, Jul 8, 2021 at 8:36 AM Ilias Apalodimas
<ilias.apalodimas@linaro.org> wrote:
>

> On Thu, Jul 08, 2021 at 08:29:56AM -0700, Alexander Duyck wrote:

> > On Thu, Jul 8, 2021 at 8:17 AM Ilias Apalodimas

> > <ilias.apalodimas@linaro.org> wrote:


<snip>

> > > What do you think about resetting pp_recycle bit on pskb_expand_head()?

> >

> > I assume you mean specifically in the cloned case?

> >

>

> Yes. Even if we do it unconditionally we'll just loose non-cloned buffers from

> the recycling.

> I'll send a patch later today.


If you do it unconditionally you could leak DMA mappings since in the
non-cloned case we don't bother with releasing the shared info since
we just did a memcpy of it without the reference count tweaks. We have
to be really careful here. The idea is that we have to make exactly
one call to the __page_pool_put_page function for this page.

> > > If my memory serves me right Eric wanted that from the beginning. Then the

> > > cloned/expanded SKB won't trigger the recycling.  If that skb hits the free

> > > path first, we'll end up recycling the fragments eventually.  If the

> > > original one goes first, we'll just unmap the page(s) and freeing the cloned

> > > one will free all the remaining buffers.

> >

> > I *think* that should be fine. Effectively what we are doing is making

> > it so that if the original skb is freed first the pages are released,

> > and if it is released after the clone/expended skb then it can be

> > recycled.

>

> Exactly

>

> >

> > The issue is we have to maintain it so that there will be exactly one

> > caller of the recycling function for the pages. So any spot where we

> > are updating skb->head we will have to see if there is a clone and if

> > so we have to clear the pp_recycle flag on our skb so that it doesn't

> > try to recycle the page frags as well.

>

> Correct. I'll keep looking around in case there's something less fragile we

> can do


That is the risk to this kind of thing. We have to make the call once
and only once and if we either miss it or call it too many times we
can introduce some serious issues.

Thanks.

- Alex
Ilias Apalodimas July 8, 2021, 3:47 p.m. UTC | #20
On Thu, Jul 08, 2021 at 08:41:08AM -0700, Alexander Duyck wrote:
> On Thu, Jul 8, 2021 at 8:36 AM Ilias Apalodimas

> <ilias.apalodimas@linaro.org> wrote:

> >

> > On Thu, Jul 08, 2021 at 08:29:56AM -0700, Alexander Duyck wrote:

> > > On Thu, Jul 8, 2021 at 8:17 AM Ilias Apalodimas

> > > <ilias.apalodimas@linaro.org> wrote:

> 

> <snip>

> 

> > > > What do you think about resetting pp_recycle bit on pskb_expand_head()?

> > >

> > > I assume you mean specifically in the cloned case?

> > >

> >

> > Yes. Even if we do it unconditionally we'll just loose non-cloned buffers from

> > the recycling.

> > I'll send a patch later today.

> 

> If you do it unconditionally you could leak DMA mappings since in the

> non-cloned case we don't bother with releasing the shared info since

> we just did a memcpy of it without the reference count tweaks. We have

> to be really careful here. The idea is that we have to make exactly

> one call to the __page_pool_put_page function for this page.

> 

> > > > If my memory serves me right Eric wanted that from the beginning. Then the

> > > > cloned/expanded SKB won't trigger the recycling.  If that skb hits the free

> > > > path first, we'll end up recycling the fragments eventually.  If the

> > > > original one goes first, we'll just unmap the page(s) and freeing the cloned

> > > > one will free all the remaining buffers.

> > >

> > > I *think* that should be fine. Effectively what we are doing is making

> > > it so that if the original skb is freed first the pages are released,

> > > and if it is released after the clone/expended skb then it can be

> > > recycled.

> >

> > Exactly

> >

> > >

> > > The issue is we have to maintain it so that there will be exactly one

> > > caller of the recycling function for the pages. So any spot where we

> > > are updating skb->head we will have to see if there is a clone and if

> > > so we have to clear the pp_recycle flag on our skb so that it doesn't

> > > try to recycle the page frags as well.

> >

> > Correct. I'll keep looking around in case there's something less fragile we

> > can do

> 

> That is the risk to this kind of thing. We have to make the call once

> and only once and if we either miss it or call it too many times we

> can introduce some serious issues.


And I fully agree. Let me fix the obvious one now and I'll have a closer
look at the recycling function itself. I can probably pick up the
"changed head"/expanded SKB in the generic recycling code and refuse to recycle
these packets. Then we'll just accept the fact that if those kinds of
packets are freed last, we won't recycle.

Thanks, that was a very nice catch
/Ilias
> 

> Thanks.

> 

> - Alex
Yunsheng Lin July 9, 2021, 6:26 a.m. UTC | #21
On 2021/7/8 23:36, Alexander Duyck wrote:
> On Wed, Jul 7, 2021 at 7:27 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:

>>

>> On 2021/7/7 23:01, Alexander Duyck wrote:

>>> On Tue, Jul 6, 2021 at 8:05 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:

>>>>

>>>> On 2021/7/7 4:45, Alexander Duyck wrote:

>>>>> On Wed, Jun 30, 2021 at 2:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:

>>>>>>

>>>>>> Currently page pool only support page recycling only when

>>>>>> refcnt of page is one, which means it can not support the

>>>>>> split page recycling implemented in the most ethernet driver.

>>>>>>

>>>>>> So add elevated refcnt support in page pool, and support

>>>>>> allocating page frag to enable multi-frames-per-page based

>>>>>> on the elevated refcnt support.

>>>>>>

>>>>>> As the elevated refcnt is per page, and there is no space

>>>>>> for that in "struct page" now, so add a dynamically allocated

>>>>>> "struct page_pool_info" to record page pool ptr and refcnt

>>>>>> corrsponding to a page for now. Later, we can recycle the

>>>>>> "struct page_pool_info" too, or use part of page memory to

>>>>>> record pp_info.

>>>>>>

>>>>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>

>>>>

>>>> Hi, Alexander

>>>>

>>>> Thanks for detailed reviewing.

>>>>

>>>>>

>>>>> So this isn't going to work with the current recycling logic. The

>>>>> expectation there is that we can safely unmap the entire page as soon

>>>>> as the reference count is greater than 1.

>>>>

>>>> Yes, the expectation is changed to we can always recycle the page

>>>> when the last user has dropped the refcnt that has given to it when

>>>> the page is not pfmemalloced.

>>>>

>>>> The above expectation is based on that the last user will always

>>>> call page_pool_put_full_page() in order to do the recycling or do

>>>> the resource cleanup(dma unmaping..etc).

>>>>

>>>> As the skb_free_head() and skb_release_data() have both checked the

>>>> skb->pp_recycle to call the page_pool_put_full_page() if needed, I

>>>> think we are safe for most case, the one case I am not so sure above

>>>> is the rx zero copy, which seems to also bump up the refcnt before

>>>> mapping the page to user space, we might need to ensure rx zero copy

>>>> is not the last user of the page or if it is the last user, make sure

>>>> it calls page_pool_put_full_page() too.

>>>

>>> Yes, but the skb->pp_recycle value is per skb, not per page. So my

>>> concern is that carrying around that value can be problematic as there

>>> are a number of possible cases where the pages might be

>>> unintentionally recycled. All it would take is for a packet to get

>>> cloned a few times and then somebody starts using pskb_expand_head and

>>> you would have multiple cases, possibly simultaneously, of entities

>>> trying to free the page. I just worry it opens us up to a number of

>>> possible races.

>>

>> I think page_ref_dec_return() in page_pool_bias_page_recyclable() will

>> prevent the above race to happen.

>>

>> As the page_ref_dec_return() and page_pool_bias_page_recyclable() return

>> true, all user of the page have done with the p->pp_magic and p->pp_info,

>> so it should be ok to reset the p->pp_magic and p->pp_info in any order?

>>

>> And page_ref_dec_return() has both __atomic_pre_full_fence() and

>> __atomic_post_full_fence() to ensure the above ordering.

> 

> So if I understand correctly what you are saying is that because of

> the pagecnt_bias check we will not hit the page_pool_release_page.

> That may help to address the issue introduced by the recycling patch

> but I don't think it completely resolves it. In addition there may be

> performance implications to this change since you are requiring the

> atomic dec for every page.

> 

> The difference between pagecnt_bias and what you have here is that we

> freed the page when page_ref_count hit 0. With this approach you are

> effectively freeing the page when page_ref_count == pagecnt_bias +

> modifier. The two implementations have quite a number of differences

> in behavior.

> 

> What you have effectively done here is make the page refcount and

> pagecnt_bias effectively into a ticket lock where we cannot call the

> free function until page_ref_cnt == pagecnt_bias + 1. So you need to

> keep the pagecnt_bias much lower than the page_ref_cnt otherwise you

> run the risk of frequent recycling. For the non-shared page_pool pages

> this is probably fine, however the frags implementation is horribly

> broken.


Yes, if ticket lock is the name for that.

I suppose "non-shared page_pool pages" mean caller allocates the page by
calling page_pool_alloc_pages() directly for elevated refcnt case, right?

The main difference between page_pool_alloc_pages() and page_pool_alloc_frag()
for elevated refcnt case is how many tickets have been given out, so I
am not sure why giving out one ticket is ok, and giving out more than one
ticket is broken?

> 

> Also the ticketlock approach is flawed because with something like

> that we shouldn't rewind the number we are currently serving like we

> do. We would have to wait until we are the only one holding the page

> before we could recycle previously used values.


I am not sure I understand the above.

I suppose it means we might not be able to clean up the resource(mainly
to do unmapping and drain the page_ref according to pagecnt_bias) while
the stack is still holding the reference to the page, which is possible
for the current reusing implemented in most driver.

But one good thing that comes out of that is we might still be able to reuse
the page when the stack releases the reference to the page later, which
is not possible with the current reuse scheme implemented in most drivers.

> 

>>>

>>>>>

>>>>> In addition I think I need to look over that code better as I am

>>>>> wondering if there are potential issues assuming a path such as a

>>>>> skb_clone followed by pskb_expand_head may lead to memory corruptions

>>>>> since the clone will still have pp_recycle set but none of the pages

>>>>> will be part of the page pool anymore.

>>>>

>>>> There is still page->pp_magic that decides if the page is from

>>>> page_pool or not.

>>>

>>> The problem with pp_magic is that it doesn't prevent races. The page

>>> pool code was meant to be protected by NAPI to prevent simultaneous

>>> access. With us now allowing the stack to be a part of the handling we

>>> open things up to potential races in the code.

>>

>> As above.

>>

>>>

>>>>>

>>>>> For us the pagecnt_bias would really represent the number of

>>>>> additional mappings beyond the current page that are being held. I

>>>>> have already been playing around with something similar. However the

>>>>> general idea is that we want to keep track of how many references to

>>>>> the page the device is holding onto. When that hits 0 and the actual

>>>>> page count is 1 we can refill both, however if we hit 0 and there are

>>>>> multiple references to the page still floating around we should just

>>>>> unmap the page and turn it over to the stack or free it.

>>>>

>>>> I am not sure I understood the above.

>>>

>>> As I have already mentioned, the fundamental problem with sharing a

>>> page and using the page pool is that the page pool assumes that it can

>>> unmap if it has a reference count greater than 0. That will no longer

>>> be the case. It has to wait until all of the pagecnt_bias has been

>>> cleared before it can unmap the page. Using get_page/put_page is fine

>>> since it will have no impact on the DMA mappings, but we have to hold

>>> off on calling things like page_pool_put_full_page or update it so

>>> that it will not unmap as long as there is still pagecnt_bias in

>>> place.

>>

>> Actually pagecnt_bias is never clear when the page is in use or is still

>> recyclable, and DMA unmapping is only done when page is not in use and

>> and the page is not recyclable(page is from pf_memealloced or pool->ring

>> is full).

>>

>> The page_pool_bias_page_recyclable() is used to decide whether there is

>> user using the page, if there is still other user using the page, current

>> user calling the page_pool_bias_page_recyclable() just do a ref_dec and

>> return, it is only the last user calling the page_pool_bias_page_recyclable()

>> will do the DMA unmapping if the page is not recyclable.

> 

> So now that I have the ticketlock model in my mind I think I see where

> you and I may be differing in how we have been viewing things. One

> thing is that in my mind we would be freeing/recycling the page when

> page_ref_count == pagecnt_bias and skip the extra "+1" modifier.

> 

> In my mind the driver is needing to hold onto one reference to the

> page itself as long as it is processing Rx DMA requests. So we need to

> block recycling until the driver is no longer holding onto the page

> for possible DMA operations. In my mind we are doing so via the

> pagecnt_bias value and keeping it at least 1 lower than the

> page_ref_count until the Rx buffer is ready to be unmapped. For the


I agree with "keeping it at least 1 lower than the
page_ref_count until the Rx buffer is ready to be unmapped".

> last buffer we don't bother with decrementing the pagecnt_bias and

> instead just hand the page over to the stack. So what we should have

> is the page cycling between a pagecnt_bias that is +1-2 of the actual

> page_ref_count and when the two are equal we then perform the

> unmap/free or recycle of the page.


What does "last buffer" mean?
The driver does not know whether the buffer is the last one or not as the
pagecnt_bias is hidden inside the page pool.

> 

> On the Tx and SKB side of things we are using the page_ref_count to

> track which instances can be recycled and should only ever be reading

> pagecnt_bias.


pagecnt_bias in this patch *is* indeed only read on the SKB side.
I suppose the Tx side is for XDP?

> 

> At recycle time we will need to verify there are enough tickets to

> support another run through the allocator. We may want to look at

> adding a value to the page pool to track the maximum number of slices

> a page can be broken into in order to avoid having to update the

> page_ref_count and pagecnt_bias too often.


Why are page_ref_count and pagecnt_bias not enough to do the job?
The user has provided the frag_size and we know the page size, so
we should be able to ensure pagecnt_bias is big enough for the maximum
number of slices when allocating the first frag of a page.

> 

>>>

>>>> As page reusing in hns3 driver, pagecnt_bias means how many refcnt the

>>>> driver is holding, and (page_count(cb->priv) - pagecnt_bias) means how

>>>> many refcnt the stack is holding, see [1].

>>>>

>>>> static bool hns3_can_reuse_page(struct hns3_desc_cb *cb)

>>>> {

>>>>         return (page_count(cb->priv) - cb->pagecnt_bias) == 1;

>>>> }

>>>

>>> So one thing we have to be careful of is letting the page_count hit 0.

>>> My preference is to keep the bias as one less than the total

>>> page_count so that we always have the 1 around. So if pagecnt_bias

>>> hits 0 and we have a page_count of 1 it means that the current thread

>>> owns the only reference to the page.

>>>

>>>> checking (page_count(cb->priv) - cb->pagecnt_bias) again one instead

>>>> of zero is in hns3_can_reuse_page because there is "pagecnt_bias--"

>>>> before checking hns3_can_reuse_page() in hns3_nic_reuse_page().

>>>>

>>>> "pagecnt_bias--" means the driver gives the one of its refcnt to the

>>>> stack, it is the stack'job to release the refcnt when the skb is passed

>>>> to the stack.

>>>>

>>>> 1. https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L2870

>>>

>>> It is mostly just a matter of preference. As long as the difference is

>>> a predictable value it can be worked with.

>>>

>>>>>

>>>>>> ---

>>>>>>  drivers/net/ethernet/marvell/mvneta.c           |   6 +-

>>>>>>  drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c |   2 +-

>>>>>>  include/linux/mm_types.h                        |   2 +-

>>>>>>  include/linux/skbuff.h                          |   4 +-

>>>>>>  include/net/page_pool.h                         |  30 +++-

>>>>>>  net/core/page_pool.c                            | 215 ++++++++++++++++++++----

>>>>>>  6 files changed, 207 insertions(+), 52 deletions(-)

>>>>>>

>>>>>> diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c

>>>>>> index 88a7550..5a29af2 100644

>>>>>> --- a/drivers/net/ethernet/marvell/mvneta.c

>>>>>> +++ b/drivers/net/ethernet/marvell/mvneta.c

>>>>>> @@ -2327,7 +2327,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

>>>>>>         if (!skb)

>>>>>>                 return ERR_PTR(-ENOMEM);

>>>>>>

>>>>>> -       skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);

>>>>>> +       skb_mark_for_recycle(skb);

>>>>>>

>>>>>>         skb_reserve(skb, xdp->data - xdp->data_hard_start);

>>>>>>         skb_put(skb, xdp->data_end - xdp->data);

>>>>>> @@ -2339,10 +2339,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,

>>>>>>                 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,

>>>>>>                                 skb_frag_page(frag), skb_frag_off(frag),

>>>>>>                                 skb_frag_size(frag), PAGE_SIZE);

>>>>>> -               /* We don't need to reset pp_recycle here. It's already set, so

>>>>>> -                * just mark fragments for recycling.

>>>>>> -                */

>>>>>> -               page_pool_store_mem_info(skb_frag_page(frag), pool);

>>>>>>         }

>>>>>>

>>>>>>         return skb;

>>>>>

>>>>> So as I mentioned earlier the problem with recycling is that splitting

>>>>> up the ownership of the page makes it difficult for us to clean it up.

>>>>> Technically speaking if the pages are being allowed to leave while

>>>>> holding references to DMA addresses that we cannot revoke then we

>>>>> should be holding references to the device.

>>>>>

>>>>> That is one of the reasons why the previous code was just clearing the

>>>>> mapping as soon as the refcount was greater than 1. However for this

>>>>> to work out correctly we would have to track how many DMA mappings we

>>>>> have outstanding in addition to the one we are working on currently.

>>>>

>>>> I think page pool has already handled the above case if I understand

>>>> correctly, see page_pool_release().

>>>

>>> The problem is pagecnt_bias is not multi-thread safe. You are just

>>> accessing an int which is prone to races. In order to fix it you would

>>> need to add either an atomic count or locks around the access of it

>>> which would pretty much negate the point of it.

>>

>> As pagecnt_bias being not multi-thread safe, let's get back to it

>> later.

> 

> Actually this is kind of core to things for the batch count updates.

> We have to guarantee that the pagecnt_bias is only updated in the

> softirq handler, and read-only everywhere else. What we have is

> effectively a consumer-producer ticket lock.


The pagecnt_bias is already only updated in the softirq handler for
this patch because it is only changed in page_pool_alloc_frag() and
page_pool_alloc_pages(), and both of them are called in the desc refill
process, which runs in softirq context during normal packet processing,
if I understand it correctly.

> 

>>>

>>> Really in terms of the page pool recycling code I think it would have

>>> made more sense to add the page pool release logic as an skb

>>> destructor rather than trying to embed the page pool into the page

>>> itself. At least with that if the device is going to go out of scope

>>> by being orphaned or the like we could unmap the page and avoid

>>> potential races.

>>

>> I suppose it is not the netdev relevant here, it is the "struct device"

>> relevant here, right?

>>

>> I suppose the page_ref_dec_return() and get_device(pool->p.dev) in

>> page_pool_init() is able to avoid the above race, as the unmaping

>> is done after page_ref_dec_return()?

> 

> The problem is the pointer to pool->p.dev could be potentially stale

> in the event of something such as a hotplug event. I would like to

> avoid that as it could cause some ugly issues.


I suppose the hotplug process should ensure that all resources
corresponding to the device being hotplugged are cleaned up, so that the
above stale state does not happen?

> 

>>>

>>>>>

>>>>>> diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

>>>>>> index 3135220..540e387 100644

>>>>>> --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

>>>>>> +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c

>>>>>> @@ -3997,7 +3997,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,

>>>>>>                 }

>>>>>>

>>>>>>                 if (pp)

>>>>>> -                       skb_mark_for_recycle(skb, page, pp);

>>>>>> +                       skb_mark_for_recycle(skb);

>>>>>>                 else

>>>>>>                         dma_unmap_single_attrs(dev->dev.parent, dma_addr,

>>>>>>                                                bm_pool->buf_size, DMA_FROM_DEVICE,

>>>>>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

>>>>>> index 862f88a..cf613df 100644

>>>>>> --- a/include/linux/mm_types.h

>>>>>> +++ b/include/linux/mm_types.h

>>>>>> @@ -101,7 +101,7 @@ struct page {

>>>>>>                          * page_pool allocated pages.

>>>>>>                          */

>>>>>>                         unsigned long pp_magic;

>>>>>> -                       struct page_pool *pp;

>>>>>> +                       struct page_pool_info *pp_info;

>>>>>>                         unsigned long _pp_mapping_pad;

>>>>>>                         /**

>>>>>>                          * @dma_addr: might require a 64-bit value on

>>>>>

>>>>> So the problem here is that this is creating a pointer chase, and the

>>>>> need to allocate yet another structure to store it is going to be

>>>>> expensive.

>>>>>

>>>>> As far as storing the pagecnt_bias it might make more sense to

>>>>> repurpose the lower 12 bits of the dma address. A DMA mapping should

>>>>> be page aligned anyway so the lower 12 bits would be reserved 0. When

>>>>> we decrement the value so that the lower 12 bits are 0 we should be

>>>>> unmapping the page anyway, or resetting the pagecnt_bias to PAGE_SIZE

>>>>> - 1 and adding back the bias to the page to effectively reset it for

>>>>> reuse.

>>>>

>>>> Yes, that is a great idea. I like it very much supposing page refcnt

>>>> updating batching for 'PAGE_SIZE - 1" is enough for performance sake.

>>>>

>>>> Will take a look about it.

>>>>

>>>>>

>>>>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

>>>>>> index b2db9cd..7795979 100644

>>>>>> --- a/include/linux/skbuff.h

>>>>>> +++ b/include/linux/skbuff.h

>>>>>> @@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)

>>>>>>  }

>>>>>>

>>>>>>  #ifdef CONFIG_PAGE_POOL

>>>>>> -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,

>>>>>> -                                       struct page_pool *pp)

>>>>>> +static inline void skb_mark_for_recycle(struct sk_buff *skb)

>>>>>>  {

>>>>>>         skb->pp_recycle = 1;

>>>>>> -       page_pool_store_mem_info(page, pp);

>>>>>>  }

>>>>>>  #endif

>>>>>

>>>>> I am not a fan of the pp_recycle flag either. We duplicate it via

>>>>> skb_clone and from what I can tell if we call pskb_expand_head

>>>>> afterwards I don't see how we avoid recycling the page frags twice.

>>>>

>>>> Acctually skb->pp_recycle is kind of duplicated, as there is

>>>> still page->pp_magic to avoid recycling the page frags twice.

>>>>

>>>> The argument above adding skb->pp_recycle seems to be short

>>>> cut code path for non-page_pool case in the previous disscusion,

>>>> see [2].

>>>>

>>>> 2. https://lore.kernel.org/linux-mm/074b0d1d-9531-57f3-8e0e-a447387478d1@huawei.com/

>>>

>>> Yes, but that doesn't guarantee atomic protections so you still have

>>> race conditions possible. All it takes is something stalling during

>>> the dma_unamp call. Worse yet from what I can tell it looks like you

>>> clear page->pp before you clear page->pp_magic so you have the

>>> potential for a NULL pointer issue since it is cleared before the

>>> pp_magic value is.

>>

>> Hopefully the page_ref_dec_return() in page_pool_bias_page_recyclable()

>> called by page_pool_put_page() will make the order of page->pp_magic

>> clearing and page->pp clearing irrelevant?

> 

> Really it doesn't address the issue. The problem is the clearing of

> pp_magic is after the dec_and_ref while the reading/clearing of

> page->pp is before it.

> 

> So having code like the following is not safe:

>     pp = page->pp;

>     page->pp = NULL;

> 

>     if (pp->something)

>         do_something();

> 

> The check for page->pp_magic before this doens't resolve it because 2

> threads can get into the code path before either one has updated

> page->pp_magic.


I suppose the above issue is the one you and Ilias are discussing?

> 

> Arguably the pagecnt_bias does something to help, but what it has

> effectively done is created a ticket lock where until you can get

> page_ref_count to reach the pagecnt_bias value you cannot unmap or

> free the page. So the tradeoff is that if anyone takes a reference to

> the page you are now stuck and cannot unmap it nor remove the device

> while the page is still in use elsewhere.

> 

> Also it just occurred to me that this will cause likely leaks because

> page_ref_count is also updated outside of page_pool so we would have

> to worry about someone calling get_page, then your call to

> page_pool_bias_page_recyclable, and then put page and at that point

> the page is leaked.


Yes, as mentioned in the previous discussion:

"Yes, the expectation is changed to we can always recycle the page
when the last user has dropped the refcnt that has given to it when
the page is not pfmemalloced.

The above expectation is based on that the last user will always
call page_pool_put_full_page() in order to do the recycling or do
the resource cleanup(dma unmaping..etc).

As the skb_free_head() and skb_release_data() have both checked the
skb->pp_recycle to call the page_pool_put_full_page() if needed, I
think we are safe for most case, the one case I am not so sure above
is the rx zero copy, which seems to also bump up the refcnt before
mapping the page to user space, we might need to ensure rx zero copy
is not the last user of the page or if it is the last user, make sure
it calls page_pool_put_full_page() too."

> 

> <...>

>>>>>

>>>>>>   * Fast allocation side cache array/stack

>>>>>> @@ -77,6 +79,7 @@ struct page_pool_params {

>>>>>>         enum dma_data_direction dma_dir; /* DMA mapping direction */

>>>>>>         unsigned int    max_len; /* max DMA sync memory size */

>>>>>>         unsigned int    offset;  /* DMA addr offset */

>>>>>> +       unsigned int    frag_size;

>>>>>>  };

>>>>>>

>>>>>>  struct page_pool {

>>>>>> @@ -88,6 +91,8 @@ struct page_pool {

>>>>>>         unsigned long defer_warn;

>>>>>>

>>>>>>         u32 pages_state_hold_cnt;

>>>>>> +       unsigned int frag_offset;

>>>>>> +       struct page *frag_page;

>>>>>>

>>>>>>         /*

>>>>>>          * Data structure for allocation side

>>>>>> @@ -128,6 +133,11 @@ struct page_pool {

>>>>>>         u64 destroy_cnt;

>>>>>>  };

>>>>>>

>>>>>> +struct page_pool_info {

>>>>>> +       struct page_pool *pp;

>>>>>> +       int pagecnt_bias;

>>>>>> +};

>>>>>> +

>>>>>

>>>>> Rather than having a top-down structure here it might be better to

>>>>> work bottom up. If you assume you are keeping a pagecnt_bias per page

>>>>> it might make more sense to store this in the driver somewhere rather

>>>>> than having it as a separate allocated buffer. One advantage of the

>>>>> Intel drivers was doing this as we had the pagecnt_bias in a structure

>>>>> that also pointed to the page. That way we were only updating that

>>>>> count if we dropped the page and didn't have to even touch the page.

>>>>> You could use that to batch updates to the pagecnt_bias if we did use

>>>>> the lower 12 bits of the DMA address to store it as well.

>>>>

>>>> I am not sure I understood what "we dropped the page" meant.

>>>

>>> For XDP_DROP if we are dropping the buffer we are dropping the page

>>> which in our case means we just need to increment the pagecnt_bias

>>> indicating we are putting it back and don't have to do anything with

>>> the actual page refcount or struct.

>>

>> In that case, the driver not doing a page_pool_put_page() seems enough

>> and reuse the page frag again?

>>

>> It seems to like the usecase as below in hns3 driver? If all the buffer

>> has memcpy the head page, just reuse it.

>>

>> https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L3149

>>

>>>

>>>> The driver does not really need to call page_pool_put_full_page()

>>>> if the page of a skb is passed to stack, the driver mainly call

>>>> page_pool_put_full_page() when unloading or uniniting when the page

>>>> is not passed to stack yet.

>>>

>>> I was thinking mostly of something like XDP_TX cases when combined

>>> with the pagecnt_bias. You will need to have something to return the

>>> page to the pool after the XDP_TX is completed.

>>

>> I suppose XDP_TX is aware of page pool to call page_pool_put_full_page()

>> when XDP_TX is completed now?

>>

>> I suppose the above should be handled as similar as the non-elevated refcnt

>> case?

> 

> This is where including page frags makes this messy. In the frags case

> you only want to put back the page once, however if you are using

> frags the XDP_TX will have multiple copies of the same page so you

> would need to have a way to identify when all the copies have been

> consumed before you can recycle the page.


As a matter of fact, I do not think the driver needs to be aware of that.
Each desc has its own frag, so it does not really matter if two frags
corresponding to two descs are from the same page; the driver should
treat them as being from different pages and call page_pool_put_page()
for each frag if it is done with the frag and does not pass the frag
to the stack.

In the future we might support frag coalescing; in that case some extra
handling will be needed.

> 

>>>

>>>>> I'm assuming the idea with this is that you will be having multiple

>>>>> buffers received off of a single page and so doing it that way you

>>>>> should only have one update on allocation, maybe a trickle of updates

>>>>> for XDP_TX, and another large update when the page is fully consumed

>>>>> and you drop the remaining pagecnt_bias for Rx.

>>>>

>>>> I suppose "having multiple buffers received off of a single page" mean:

>>>> use first half of a page for a desc, and the second half of the same page

>>>> for another desc, intead of ping-pong way of reusing implemented in most

>>>> driver currently?

>>>>

>>>> I am not so familiar with XDP to understand the latter part of comment too.

>>>

>>> The alloc_frag logic below is an example of what I am talking about.

>>> Basically taking a page and chopping it up into multiple pieces for

>>> use as multiple receives instead of just one receive.

>>

>> Ok, but when multiple receives is passed to the stack and after the stack is

>> done with all the receives, we should be able to recycle the page, right?

> 

> Yes. That is the trick to all this. Identifying when we can safely

> recycle the page.


Yes.

> 

>>>

>>>>>

>>>>>>  struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);

>>>>>>

>>>>>>  static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

>>>>>> @@ -137,6 +147,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)

>>>>>>         return page_pool_alloc_pages(pool, gfp);

>>>>>>  }

>>>>>>

>>>>>> +struct page *page_pool_alloc_frag(struct page_pool *pool,

>>>>>> +                                 unsigned int *offset, gfp_t gfp);

>>>>>> +

>>>>>> +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,

>>>>>> +                                                   unsigned int *offset)

>>>>>> +{

>>>>>> +       gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);

>>>>>> +

>>>>>> +       return page_pool_alloc_frag(pool, offset, gfp);

>>>>>> +}

>>>>>> +

>>>>>>  /* get the stored dma direction. A driver might decide to treat this locally and

>>>>>>   * avoid the extra cache line from page_pool to determine the direction

>>>>>>   */

>>>>>> @@ -253,11 +274,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool)

>>>>>>                 spin_unlock_bh(&pool->ring.producer_lock);

>>>>>>  }

>>>>>>

>>>>>> -/* Store mem_info on struct page and use it while recycling skb frags */

>>>>>> -static inline

>>>>>> -void page_pool_store_mem_info(struct page *page, struct page_pool *pp)

>>>>>> -{

>>>>>> -       page->pp = pp;

>>>>>> -}

>>>>>> -

>>>>>>  #endif /* _NET_PAGE_POOL_H */

>>>>>

>>>>> So the issue as I see it with the page_pool recycling patch set is

>>>>> that I don't think we had proper guarantees in place that the page->pp

>>>>> value was flushed in all cases where skb->dev was changed. Basically

>>>>> the logic we need to have in place to address those issues is that

>>>>> skb->dev is changed we need to invalidate the DMA mappings on the

>>>>> page_pool page.

>>>>

>>>> The DMA mappings invalidating is based on the pool->p.dev, is there

>>>> any reason why the DMA mappings need invalidating when skb->dev is

>>>> change, as fast I can tell, the tx is not aware of page pool, so

>>>> when the skb is redirected, the page of the skb is always DMA mapped

>>>> according to skb->dev before xmitting.

>>>>

>>>> Or it is about XDP redirected?

>>>>

>>>> Is there something obvious I missed here?

>>>

>>> It is about unmapping the page. In order to do so we have to maintain

>>> a pointer to the original DMA device. The page pool is doing that for

>>> us currently.

>>>

>>> Most netdevs have a parent  device that is used for DMA mapping.

>>> Therefore if skb->dev is valid, then the parent device is still valid

>>> since destroying the parent would destroy the children. If the

>>> skb->dev is dropped or changed, then we cannot guarantee the parent

>>> device is still present. So generally if skb->dev cannot be maintained

>>> then we probably shouldn't be maintaining the DMA mapping or page->pp

>>> across that boundary either.

>>

>> Does the get_device(pool->p.dev) in page_pool_init() not prevent the

>> above case?

> 

> Actually it is the inflight pages that are the important part and it

> does look like page_pool_release does appear to take care of that

> case.


Yes.

> 

> <...>

>>>>>

>>>>>> +static int page_pool_clear_pp_info(struct page *page)

>>>>>> +{

>>>>>> +       struct page_pool_info *pp_info = page->pp_info;

>>>>>> +       int bias;

>>>>>> +

>>>>>> +       bias = pp_info->pagecnt_bias;

>>>>>> +

>>>>>> +       kfree(pp_info);

>>>>>> +       page->pp_info = NULL;

>>>>>> +       page->pp_magic = 0;

>>>>>> +

>>>>>> +       return bias;

>>>>>> +}

>>>>>> +

>>>>>> +static void page_pool_clear_and_drain_page(struct page *page)

>>>>>> +{

>>>>>> +       int bias = page_pool_clear_pp_info(page);

>>>>>> +

>>>>>> +       __page_frag_cache_drain(page, bias + 1);

>>>>>> +}

>>>>>> +

>>>>>>  static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

>>>>>>                                                  gfp_t gfp)

>>>>>>  {

>>>>>> @@ -216,13 +259,16 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,

>>>>>>         if (unlikely(!page))

>>>>>>                 return NULL;

>>>>>>

>>>>>> -       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

>>>>>> -           unlikely(!page_pool_dma_map(pool, page))) {

>>>>>> +       if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

>>>>>>                 put_page(page);

>>>>>>                 return NULL;

>>>>>>         }

>>>>>>

>>>>>> -       page->pp_magic |= PP_SIGNATURE;

>>>>>> +       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&

>>>>>> +           unlikely(!page_pool_dma_map(pool, page))) {

>>>>>> +               page_pool_clear_and_drain_page(page);

>>>>>> +               return NULL;

>>>>>> +       }

>>>>>>

>>>>>>         /* Track how many pages are held 'in-flight' */

>>>>>>         pool->pages_state_hold_cnt++;

>>>>>> @@ -261,12 +307,17 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

>>>>>>          */

>>>>>>         for (i = 0; i < nr_pages; i++) {

>>>>>>                 page = pool->alloc.cache[i];

>>>>>> +               if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {

>>>>>> +                       put_page(page);

>>>>>> +                       continue;

>>>>>> +               }

>>>>>> +

>>>>>>                 if ((pp_flags & PP_FLAG_DMA_MAP) &&

>>>>>>                     unlikely(!page_pool_dma_map(pool, page))) {

>>>>>> -                       put_page(page);

>>>>>> +                       page_pool_clear_and_drain_page(page);

>>>>>>                         continue;

>>>>>>                 }

>>>>>

>>>>> This seems backwards to me. I would have the pp_info populated after

>>>>> you have generated the DMA mapping.

>>>>

>>>> Ok.

>>>>

>>>>>

>>>>>> -               page->pp_magic |= PP_SIGNATURE;

>>>>>> +

>>>>>>                 pool->alloc.cache[pool->alloc.count++] = page;

>>>>>>                 /* Track how many pages are held 'in-flight' */

>>>>>>                 pool->pages_state_hold_cnt++;

>>>>>> @@ -284,6 +335,25 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

>>>>>>         return page;

>>>>>>  }

>>>>>>

>>>>>> +static void page_pool_sub_bias(struct page *page, int nr)

>>>>>> +{

>>>>>> +       struct page_pool_info *pp_info = page->pp_info;

>>>>>> +

>>>>>> +       /* "pp_info->pagecnt_bias == 0" indicates the PAGECNT_BIAS

>>>>>> +        * flags is not set.

>>>>>> +        */

>>>>>> +       if (!pp_info->pagecnt_bias)

>>>>>> +               return;

>>>>>> +

>>>>>> +       /* Make sure pagecnt_bias > 0 for elevated refcnt case */

>>>>>> +       if (unlikely(pp_info->pagecnt_bias <= nr)) {

>>>>>> +               page_ref_add(page, USHRT_MAX);

>>>>>> +               pp_info->pagecnt_bias += USHRT_MAX;

>>>>>> +       }

>>>>>> +

>>>>>> +       pp_info->pagecnt_bias -= nr;

>>>>>

>>>>> So we should never have a case where pagecnt_bias is less than the

>>>>> value we are subtracting. If we have that then it is a bug.

>>>>

>>>> Yes.

>>>

>>> Sorry, I was referring to the code above comparing pagecnt_bias to nr.

>>> At most nr should only ever be equal to pagecnt_bias, you should hold

>>> off on recharging pagecnt_bias until you can verify the page_count

>>> indicates we are the only holder of the page. Then we can recharge it

>>> and reset any offsets.

>>

>> Actually the page pool is the only user of the page when the driver is

>> calling page_pool_alloc_frag(), page is from pool->alloc/pool->ring or

>> page allocator in page_pool_alloc_pages(), as memtioned above, the

>> last user will put the page in pool->ring holding a lock, and when

>> page_pool_alloc_pages() get a page (also holding the same lock) from

>> pool->ring, there should be no user of the page other than the page pool.

>>

>> And page_pool_sub_bias() is called in page_pool_alloc_frag() and

>> page_pool_alloc_pages().

> 

> I think we would need to see a version of this patch without the

> alloc_frag calls in order to really be able to do a review. The

> problem is I don't see how the page_pool_alloc_frag can expect to have

> sole ownership of the page if it is allocating fragments of the page.

> The frags call imply multiple users for a single page.


The driver calls page_pool_alloc_frag(), and page_pool_alloc_frag()
will call page_pool_alloc_pages() to allocate a new page if
pool->frag_page is NULL or there is no frag left in pool->frag_page
(using pool->frag_offset and pool->frag_size to decide if there is any
frag left). When the new page is allocated, it decides how many frags
the page has by using PAGE_SIZE and pool->frag_size, which is also how
many users will be using the page. So "page_ref - (pagecnt_bias + 1)"
is the number of users that will be using the page at the time the first
frag is allocated. pagecnt_bias is only updated for the first user of
the page; for subsequent users, pool->frag_offset is used to decide which
frag to allocate if there is still a frag left, and pagecnt_bias does not
need changing for subsequent users of the same page.

> 

>>>

>>>>>

>>>>> The general idea with the pagecnt_bias is that we want to batch the

>>>>> release of the page from the device. So the assumption is we are going

>>>>> to pull multiple references from the page and rather than doing

>>>>> page_ref_inc repeatedly we want to batch it at the start, and we have

>>>>> to perform a __page_frag_cache_drain to remove any unused references

>>>>> when we need to free it.

>>>>

>>>> Yes, it is about batching the page_ref_inc() operation.

>>>>

>>>>>

>>>>> What we should probably be checking for is "pp_info->pagecnt_bias -

>>>>> page_count(page) > 1" when we hit the end of the page. If that is true

>>>>> then we cannot recycle the page and so when we hit PAGE_SIZE for the

>>>>> offset we have to drop the mapping and free the page subtracting any

>>>>> remaining pagecnt_bias we are holding. If I recall I actually ran this

>>>>> the other way and ran toward 0 in my implementation before as that

>>>>> allows for not having to track via a value and instead simply checking

>>>>> for a signed result.

>>>>

>>>>

>>>> When allocating a page for frag, we have decided how many user is using

>>>> the page, that is the "page_pool_sub_bias(frag_page, max_len / frag_size - 1)"

>>>> in page_pool_alloc_frag().

>>>>

>>>> so it is up to the driver or stack to do multi page_pool_put_full_page()

>>>> calling for the same page.

>>>

>>> So that is one spot that I think is an issue. We normally only want

>>> this called once per page and ideally after pagecnt_bias is 0. One

>>> issue is that pagecnt_bias is non-atomic so we should really be

>>> restricting this to just the driver calling it in softirq context.

>>

>> Let's discuss the pagecnt_bias handling at the end.

>>

>>>

>>>> Or the page pool will call page_pool_put_full_page() in page_pool_empty_frag()

>>>> if some of the page frag is not allocated to the driver yet.

>>>>

>>>> It seems you are suggesting a slightly different way to do frag reusing.

>>>

>>> As I mentioned I am not a fan of the current recycling scheme. There

>>> are too many openings for it to end up unmapping the same page

>>> multiple times or other possible issues.

>>

>> Other than the pagecnt_bias handling in non-atomic context, I think

>> most of the race you mentioned above has been handled if I understand

>> it correctly?

> 

> The biggest issue is that if we assume this to be more of a ticket

> lock model, you have threads outside of this that are using

> get_page/put_page that will mess with your tickets and cause leaks

> because your unlocker may end up getting a non-matching ticket even

> though it is the last call to __page_pool_put_page.


Yes, we need to make sure there is no get_page/put_page messing with
this process. Or if there is, make sure there is a __page_pool_put_page()
after get_page/put_page.

> 

>>>

>>> In my mind the driver or page_pool should own the page and just keep

>>> it on a list to either be freed or recycled with the skb destructor

>>> being used to trigger the recycling.

>>

>> The page_pool still own the page, it is just that when driver also own

>> the page by calling page_pool_alloc_pages(), and the page is not on a

>> list of page pool, the driver or stack calling the page_pool_put_full_page()

>> will put the page back to the list of page pool(or do resource cleaning and

>> put it back to page allocator) if it is the last user.

>>

>> I am not similar enough with destructor to say if using skb destructor

>> has any difference here.

>>

>>>

>>>>>

>>>>>> +}

>>>>>> +

>>>>>>  /* For using page_pool replace: alloc_pages() API calls, but provide

>>>>>>   * synchronization guarantee for allocation side.

>>>>>>   */

>>>>>> @@ -293,15 +363,66 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)

>>>>>>

>>>>>>         /* Fast-path: Get a page from cache */

>>>>>>         page = __page_pool_get_cached(pool);

>>>>>> -       if (page)

>>>>>> +       if (page) {

>>>>>> +               page_pool_sub_bias(page, 1);

>>>>>>                 return page;

>>>>>> +       }

>>>>>

>>>>> I'm not sure we should be subtracting from the bias here. Ideally if

>>>>> you are getting a page you are getting the full 4K page. So having a

>>>>> bias other than PAGE_SIZE - 1 wouldn't make much sense here.

>>>>

>>>> It seems we have different understanding about pagecnt_bias here,

>>>> as the pagecnt_bias is hidden in the page pool now, the subtracting

>>>> here mean we give one refcnt to the caller of page_pool_alloc_pages(),

>>>> And in page_pool_alloc_frag(), we give different part of page to the

>>>> driver, so it means more user too, so there is also subtracting in the

>>>> page_pool_alloc_frag() too.

>>>

>>> I see what you are getting at, however I think it depends on your use

>>> case. In my mind since you are allocating the full page you should

>>> have the full count available to you. I don't believe pagecnt_bias is

>>> something that should be looked at outside of the driver, or at least

>>> outside of the napi context of the device softirq.

>>>

>>> So really in order for this to work correctly you would need to have

>>> some minimum amount of bias reserved for the device to access if you

>>> are going to break up page in to n usable buffers.

>>

>> Ensuring the pagecnt_bias > 0 in page_pool_sub_bias() seems enough

>> to make sure the page pool always own the page?

> 

> Except for the leak issue I pointed out above. If we are going to

> enforce pagecnt_bias as a check for unmapping we have to guarantee

> that anyone touching the page will use your function to release the

> references to it.


Yes.

> 

> Again this is why I think it would be better to just maintain a list

> of inflight pages and then unmap them fro the driver if they are still

> on the list greater than some fixed period of time.


I am not sure if adding a list of inflight pages is the proper way
to solve the problem if the page is not returned to the page pool for
a very long time.

Maybe we should find out why the page is not returned to the page pool
and fix it if that happens?

> 

>>>

>>>>>

>>>>>>

>>>>>>         /* Slow-path: cache empty, do real allocation */

>>>>>>         page = __page_pool_alloc_pages_slow(pool, gfp);

>>>>>> +       if (page)

>>>>>> +               page_pool_sub_bias(page, 1);

>>>>>> +

>>>>>

>>>>> Same here. Really in both cases we should be getting initialized

>>>>> pages, not ones that are already decrementing.

>>>>>

>>>>>>         return page;

>>>>>>  }

>>>>>>  EXPORT_SYMBOL(page_pool_alloc_pages);

>>>>>>

>>>>>> +struct page *page_pool_alloc_frag(struct page_pool *pool,

>>>>>> +                                 unsigned int *offset, gfp_t gfp)

>>>>>> +{

>>>>>> +       unsigned int frag_offset = pool->frag_offset;

>>>>>> +       unsigned int frag_size = pool->p.frag_size;

>>>>>> +       struct page *frag_page = pool->frag_page;

>>>>>> +       unsigned int max_len = pool->p.max_len;

>>>>>> +

>>>>>> +       if (!frag_page || frag_offset + frag_size > max_len) {

>>>>>

>>>>> These are two very different cases. If frag_page is set and just out

>>>>> of space we need to be freeing the unused references.

>>>>

>>>> As mention above, we are depending on the last user to do the

>>>> recycling or freeing the unused references.

>>>

>>> But you are holding the pagecnt_bias for it aren't you? If so you need

>>> to release it so that the last user knows that they were the last

>>> user.

>>

>> The user will know it is the last user if page_pool_bias_page_recyclable()

>> return true.

> 

> Except for the leak issue pointed out above.

> 

>>>

>>> Once you aren't using the page you need to release the pagecnt_bias

>>> since the page is on the path to being freed.

>> It seems the above is more above what does the pagecnt_bias represent?

>>

>>>

>>>>>

>>>>>> +               frag_page = page_pool_alloc_pages(pool, gfp);

>>>>>

>>>>> So as per my comment above the page should be coming in with a

>>>>> pagecnt_bias of PAGE_SIZE - 1, and an actual page_ref_count of

>>>>> PAGE_SIZE.

>>>>

>>>> Let's align the understanding of pagecnt_bias first?

>>>>

>>>> pagecnt_bias meant how many refcnt of a page belong to the page

>>>> pool, and (page_ref_count() - pagecnt_bias) means how many refcnt

>>

>> Actually it is (page_ref_count() - (pagecnt_bias + 1))

>>

>>>> of a page belong to user of the page pool.

>>>

>>> So my view is a slight variation on that. I view pagecnt_bias as the

>>> count of references reserved by the page_pool, and page_ref_count -

>>> pagecnt_bias is the actual reference count. So if I am going to free a

>>> page I should deduct pagecnt_bias + 1 from the reference count to

>>> account for dropping our bias and the one for the fact that we own the

>>> page.

>>

>> So if (page_ref_count() - (pagecnt_bias + 1)) == 0 means only the page

>> pool hold the page and it means whichever caller having the

>> page_pool_bias_page_recyclable() returning true is the last user, right?

>>

>>>

>>>>>

>>>>>> +               if (unlikely(!frag_page)) {

>>>>>> +                       pool->frag_page = NULL;

>>>>>> +                       return NULL;

>>>>>> +               }

>>>>>> +

>>>>>> +               pool->frag_page = frag_page;

>>>>>> +               frag_offset = 0;

>>>>>> +

>>>>>> +               page_pool_sub_bias(frag_page, max_len / frag_size - 1);

>>>>>

>>>>> Why are you doing division here? We should just be subtracting 1 from

>>>>> the pagecnt_bias since that is the number of buffers that are being

>>>>> used. The general idea is that when pagecnt_bias is 0 we cut the page

>>>>> loose for potential recycling or freeing, otherwise we just subtract

>>>>> our new value from pagecnt_bias until we reach it.

>>>>

>>>> As mentioned above, division is used to find out how many user may be

>>>> using the page.

>>>

>>> That doesn't make any sense to me because it won't tell you the actual

>>> users, and from what I can tell it is buggy since if I use this to

>>> allocate a chunk larger than 2K this comes out to 0 doesn't it? It

>>> seems like you should just always use 1 as the count.

>>

>> There is already a page_pool_sub_bias(page, 1) in page_pool_alloc_pages(),

>> so for 4K page, there is two users for a page with 2K frag size, and there

>> is 32 users for 64K page with 2K frag size.

>>

>> The reason doing a page_pool_sub_bias(page, 1) in page_pool_alloc_pages()

>> is that the caller is expected to use the page as a whole when using the

>> page_pool_alloc_pages() directly, so it means only one user.

> 

> The logic doesn't make any sense. You shouldn't need to do any

> subtraction then. The idea is you subtract 1 per frag pulled from the

> page. The logic you have here just doesn't make sense as you are

> making smaller frags pull additional bias counts. If I pull a small

> fragment I could consume the entire bias in a single call.


I am not sure I understand the above comment.
Basically, the page returned from page_pool_alloc_pages() is expected
to be used by one user. When page_pool_alloc_frag() uses that page to
serve more users, it computes the total number of users as
"max_len / frag_size". As one user has already been accounted for in
page_pool_alloc_pages(), only "max_len / frag_size - 1" more users need
to be added (users are added by calling page_pool_sub_bias(), which is
kind of confusing because of the "sub" in its name).

> 

>>>

>>>>>

>>>>>> +       }

>>>>>> +

>>>>>> +       *offset = frag_offset;

>>>>>> +       pool->frag_offset = frag_offset + frag_size;

>>>>>> +

>>>>>> +       return frag_page;

>>>>>> +}

>>>>>> +EXPORT_SYMBOL(page_pool_alloc_frag);

>>>>>> +

>>>>>> +static void page_pool_empty_frag(struct page_pool *pool)

>>>>>> +{

>>>>>> +       unsigned int frag_offset = pool->frag_offset;

>>>>>> +       unsigned int frag_size = pool->p.frag_size;

>>>>>> +       struct page *frag_page = pool->frag_page;

>>>>>> +       unsigned int max_len = pool->p.max_len;

>>>>>> +

>>>>>> +       if (!frag_page)

>>>>>> +               return;

>>>>>> +

>>>>>> +       while (frag_offset + frag_size <= max_len) {

>>>>>> +               page_pool_put_full_page(pool, frag_page, false);

>>>>>> +               frag_offset += frag_size;

>>>>>> +       }

>>>>>> +

>>>>>> +       pool->frag_page = NULL;

>>>>>> +}

>>>>>> +

>>>>>

>>>>> It would be good to look over the page_frag_alloc_align and

>>>>> __page_frag_cache_drain functions for examples of how to do most of

>>>>> this. The one complication is that we have the dma mappings and

>>>>> page_pool logic to deal with.

>>>>

>>>> Is it ok to rely on the user providing a aligning frag_size, so

>>>> that do not need handling it here?

>>>

>>> It is probably fine since the page pool should only have one consumer

>>> so the requests just need to be aligned by them.

>>>

>>>>>

>>>>>>  /* Calculate distance between two u32 values, valid if distance is below 2^(31)

>>>>>>   *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution

>>>>>>   */

>>>>>> @@ -326,10 +447,11 @@ static s32 page_pool_inflight(struct page_pool *pool)

>>>>>>   * a regular page (that will eventually be returned to the normal

>>>>>>   * page-allocator via put_page).

>>>>>>   */

>>>>>> -void page_pool_release_page(struct page_pool *pool, struct page *page)

>>>>>> +static int __page_pool_release_page(struct page_pool *pool,

>>>>>> +                                   struct page *page)

>>>>>>  {

>>>>>>         dma_addr_t dma;

>>>>>> -       int count;

>>>>>> +       int bias, count;

>>>>>>

>>>>>>         if (!(pool->p.flags & PP_FLAG_DMA_MAP))

>>>>>>                 /* Always account for inflight pages, even if we didn't

>>>>>> @@ -345,22 +467,29 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)

>>>>>>                              DMA_ATTR_SKIP_CPU_SYNC);

>>>>>>         page_pool_set_dma_addr(page, 0);

>>>>>>  skip_dma_unmap:

>>>>>> -       page->pp_magic = 0;

>>>>>> +       bias = page_pool_clear_pp_info(page);

>>>>>>

>>>>>>         /* This may be the last page returned, releasing the pool, so

>>>>>>          * it is not safe to reference pool afterwards.

>>>>>>          */

>>>>>>         count = atomic_inc_return(&pool->pages_state_release_cnt);

>>>>>>         trace_page_pool_state_release(pool, page, count);

>>>>>> +       return bias;

>>>>>> +}

>>>>>> +

>>>>>> +void page_pool_release_page(struct page_pool *pool, struct page *page)

>>>>>> +{

>>>>>> +       int bias = __page_pool_release_page(pool, page);

>>>>>> +

>>>>>> +       WARN_ONCE(bias, "PAGECNT_BIAS is not supposed to be enabled\n");

>>>>>>  }

>>>>>>  EXPORT_SYMBOL(page_pool_release_page);

>>>>>>

>>>>>>  /* Return a page to the page allocator, cleaning up our state */

>>>>>>  static void page_pool_return_page(struct page_pool *pool, struct page *page)

>>>>>>  {

>>>>>> -       page_pool_release_page(pool, page);

>>>>>> +       __page_frag_cache_drain(page, __page_pool_release_page(pool, page) + 1);

>>>>>>

>>>>>> -       put_page(page);

>>>>>>         /* An optimization would be to call __free_pages(page, pool->p.order)

>>>>>>          * knowing page is not part of page-cache (thus avoiding a

>>>>>>          * __page_cache_release() call).

>>>>>> @@ -395,7 +524,16 @@ static bool page_pool_recycle_in_cache(struct page *page,

>>>>>>         return true;

>>>>>>  }

>>>>>>

>>>>>> -/* If the page refcnt == 1, this will try to recycle the page.

>>>>>> +static bool page_pool_bias_page_recyclable(struct page *page, int bias)

>>>>>> +{

>>>>>> +       int ref = page_ref_dec_return(page);

>>>>>> +

>>>>>> +       WARN_ON(ref < bias);

>>>>>> +       return ref == bias + 1;

>>>>>> +}

>>>>>> +

>>>>>> +/* If pagecnt_bias == 0 and the page refcnt == 1, this will try to

>>>>>> + * recycle the page.

>>>>>>   * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for

>>>>>>   * the configured size min(dma_sync_size, pool->max_len).

>>>>>>   * If the page refcnt != 1, then the page will be returned to memory

>>>>>> @@ -405,16 +543,35 @@ static __always_inline struct page *

>>>>>>  __page_pool_put_page(struct page_pool *pool, struct page *page,

>>>>>>                      unsigned int dma_sync_size, bool allow_direct)

>>>>>>  {

>>>>>> -       /* This allocator is optimized for the XDP mode that uses

>>>>>> +       int bias = page->pp_info->pagecnt_bias;

>>>>>> +

>>>>>> +       /* Handle the elevated refcnt case first:

>>>>>> +        * multi-frames-per-page, it is likely from the skb, which

>>>>>> +        * is likely called in non-sofrirq context, so do not recycle

>>>>>> +        * it in pool->alloc.

>>>>>> +        *

>>>>>> +        * Then handle non-elevated refcnt case:

>>>>>>          * one-frame-per-page, but have fallbacks that act like the

>>>>>>          * regular page allocator APIs.

>>>>>> -        *

>>>>>>          * refcnt == 1 means page_pool owns page, and can recycle it.

>>>>>>          *

>>>>>>          * page is NOT reusable when allocated when system is under

>>>>>>          * some pressure. (page_is_pfmemalloc)

>>>>>>          */

>>>>>> -       if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {

>>>>>> +       if (bias) {

>>>>>> +               /* We have gave some refcnt to the stack, so wait for

>>>>>> +                * all refcnt of the stack to be decremented before

>>>>>> +                * enabling recycling.

>>>>>> +                */

>>>>>> +               if (!page_pool_bias_page_recyclable(page, bias))

>>>>>> +                       return NULL;

>>>>>> +

>>>>>> +               /* only enable recycling when it is not pfmemalloced */

>>>>>> +               if (!page_is_pfmemalloc(page))

>>>>>> +                       return page;

>>>>>> +

>>>>>

>>>>> So this would be fine if this was only accessed from the driver. The

>>>>> problem is the recycling code made it so that this is accessed in the

>>>>> generic skb freeing path. As such I think this is prone to races since

>>>>> you have to guarantee the ordering of things between the reference

>>>>> count and pagecnt_bias.

>>>>

>>>> As reference count is handled atomically is page_pool_bias_page_recyclable,

>>>> and pagecnt_bias is changed before any page is handled to the stack(maybe

>>>> some READ_ONCE/WRITE_ONCE or barrier is still needed, will check it again),

>>>> so I suppose the ordering is correct?

>>>

>>> The problem is in order to get this working correctly you would likely

>>> need to add a number of barriers so that reads and writes are in a

>>> specific order. You would be much better off just not

>>> reading/modifying the pagecnt_bias outside of the softirq paths.

>>

>> Most of the reusing implemented in the driver today may not be

>> able to do reusing when the stack does not process the skb and

>> dec the refcnt quick enough, this patch try to reuse the page

>> as much as possible when above case happens.

>>

>> So it seems the pagecnt_bias need to be checked outside of the

>> softirq to implement that?

>>

>> Let's break down the step of reusing a page:

>> 1. driver call page_pool_alloc_frag() to allocte a page frag.

>> 2. page pool sub the pagecnt_bias according to the user using the

>>    page.

>> 3. driver fill the page info to the desc.

>> 4. driver notify the hw that desc is filled with page info.

>> 5. hw write the packet to page memory according to info in desc.

>> 6. driver process the desc and passed the skb(contianing the page

>>    frag) to stack

>> 7. stack process the skb

>> 8. stack put the page to page pool calling page_pool_return_page(),

>>    if it is the last user by checking pagecnt_bias, the page is recycled

>>    in page pool or is returned to page allocated after cleaning the

>>    resource.

>>

>> There is usually barrier in step 4 and step 6, at least in hns3 drvier,

>> see the barrier does not seems to be necessary?

>>

>> see:

>> https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L2867

>> https://elixir.bootlin.com/linux/latest/source/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c#L3368

> 

> I am not really interested in the page frag case for now. There are

> bigger issues with the patch set. I would recommend splitting the page

> frags out into a separate patch and just try to work out the bulk

> updating of the page count for now.

> 

> Ideally this would be broken out into smaller patches so it is easier

> to review as there are currently several issues that we are talking

> about here in parallel which is making the discussion confusing.


OK, I will split this patch into smaller, more reviewable ones.

> .

>
Alexander Duyck July 9, 2021, 2:15 p.m. UTC | #22
On Thu, Jul 8, 2021 at 11:26 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>

> On 2021/7/8 23:36, Alexander Duyck wrote:

> > On Wed, Jul 7, 2021 at 7:27 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:

> >>

> >> On 2021/7/7 23:01, Alexander Duyck wrote:

> >>> On Tue, Jul 6, 2021 at 8:05 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:

> >>>>

> >>>> On 2021/7/7 4:45, Alexander Duyck wrote:

> >>>>> On Wed, Jun 30, 2021 at 2:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:

> >>>>>>

> >>>>>> Currently page pool only support page recycling only when

> >>>>>> refcnt of page is one, which means it can not support the

> >>>>>> split page recycling implemented in the most ethernet driver.

> >>>>>>

> >>>>>> So add elevated refcnt support in page pool, and support

> >>>>>> allocating page frag to enable multi-frames-per-page based

> >>>>>> on the elevated refcnt support.

> >>>>>>

> >>>>>> As the elevated refcnt is per page, and there is no space

> >>>>>> for that in "struct page" now, so add a dynamically allocated

> >>>>>> "struct page_pool_info" to record page pool ptr and refcnt

> >>>>>> corrsponding to a page for now. Later, we can recycle the

> >>>>>> "struct page_pool_info" too, or use part of page memory to

> >>>>>> record pp_info.

> >>>>>>

> >>>>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>

> >>>>

> >>>> Hi, Alexander

> >>>>

> >>>> Thanks for detailed reviewing.

> >>>>

> >>>>>

> >>>>> So this isn't going to work with the current recycling logic. The

> >>>>> expectation there is that we can safely unmap the entire page as soon

> >>>>> as the reference count is greater than 1.

> >>>>

> >>>> Yes, the expectation is changed to we can always recycle the page

> >>>> when the last user has dropped the refcnt that has given to it when

> >>>> the page is not pfmemalloced.

> >>>>

> >>>> The above expectation is based on that the last user will always

> >>>> call page_pool_put_full_page() in order to do the recycling or do

> >>>> the resource cleanup(dma unmaping..etc).

> >>>>

> >>>> As the skb_free_head() and skb_release_data() have both checked the

> >>>> skb->pp_recycle to call the page_pool_put_full_page() if needed, I

> >>>> think we are safe for most case, the one case I am not so sure above

> >>>> is the rx zero copy, which seems to also bump up the refcnt before

> >>>> mapping the page to user space, we might need to ensure rx zero copy

> >>>> is not the last user of the page or if it is the last user, make sure

> >>>> it calls page_pool_put_full_page() too.

> >>>

> >>> Yes, but the skb->pp_recycle value is per skb, not per page. So my

> >>> concern is that carrying around that value can be problematic as there

> >>> are a number of possible cases where the pages might be

> >>> unintentionally recycled. All it would take is for a packet to get

> >>> cloned a few times and then somebody starts using pskb_expand_head and

> >>> you would have multiple cases, possibly simultaneously, of entities

> >>> trying to free the page. I just worry it opens us up to a number of

> >>> possible races.

> >>

> >> I think page_ref_dec_return() in page_pool_bias_page_recyclable() will

> >> prevent the above race to happen.

> >>

> >> As the page_ref_dec_return() and page_pool_bias_page_recyclable() return

> >> true, all user of the page have done with the p->pp_magic and p->pp_info,

> >> so it should be ok to reset the p->pp_magic and p->pp_info in any order?

> >>

> >> And page_ref_dec_return() has both __atomic_pre_full_fence() and

> >> __atomic_post_full_fence() to ensure the above ordering.

> >

> > So if I understand correctly what you are saying is that because of

> > the pagecnt_bias check we will not hit the page_pool_release_page.

> > That may help to address the issue introduced by the recycling patch

> > but I don't think it completely resolves it. In addition there may be

> > performance implications to this change since you are requiring the

> > atomic dec for every page.

> >

> > The difference between pagecnt_bias and what you have here is that we

> > freed the page when page_ref_count hit 0. With this approach you are

> > effectively freeing the page when page_ref_count == pagecnt_bias +

> > modifier. The two implementations have quite a number of differences

> > in behavior.

> >

> > What you have effectively done here is make the page refcount and

> > pagecnt_bias effectively into a ticket lock where we cannot call the

> > free function until page_ref_cnt == pagecnt_bias + 1. So you need to

> > keep the pagecnt_bias much lower than the page_ref_cnt otherwise you

> > run the risk of frequent recycling. For the non-shared page_pool pages

> > this is probably fine, however the frags implementation is horribly

> > broken.

>

> Yes, if ticket lock is the name for that.

>

> I suppose "non-shared page_pool pages" mean caller allocates the page by

> calling page_pool_alloc_pages() directly for elevated refcnt case, right?

>

> The main difference between page_pool_alloc_pages() and page_pool_alloc_frag()

> for elevated refcnt case is how many tickets have been given out, so I

> am not sure why giving out one ticket is ok, and giving out more than one

> ticket is broken?


The model for page_pool_alloc_frag is that you are giving out one
slice of the page at a time. The general idea is you are allocating
variable sized sections of the page, so normally you cannot predict
exactly how many references will be needed.

In addition division is an extremely expensive operation when you
aren't working with a constant power of 2. As such for any fastpath
thing such as an allocation you want to try to avoid it if at all
possible.

> >

> > Also the ticketlock approach is flawed because with something like

> > that we shouldn't rewind the number we are currently serving like we

> > do. We would have to wait until we are the only one holding the page

> > before we could recycle previously used values.

>

> I am not sure I understand the above.

>

> I suppose it means we might not be able to clean up the resource(mainly

> to do unmapping and drain the page_ref according to pagecnt_bias) while

> the stack is still holding the reference to the page, which is possible

> for the current reusing implemented in most driver.

>

> But one good thing come out of that is we might still be able to reuse

> the page when the stack release the reference to the page later, which

> is not possible for the current reusing implemented in most driver.


There are several flaws with the approach.

1. The fact that external entities can call get_page/put_page, which may
cause __page_pool_put_page to miss the case where it would have
otherwise found that page_ref_count == pagecnt_bias.

2. Rewinding the page without first verifying it owns all references.
Technically that is an exploitable issue as someone would just have to
take 64K references at just the right time to cause page_ref_count ==
pagecnt_bias. Not a likely issue but technically not a correct thing
to do either as it does open a window for exploitation.

3. Generally any sort of count rewind waits until we know we are the
only ones holding onto the page. Basically we have to verify the
page_ref_count == 1 or page_ref_count == pagecnt_bias case before
resetting offsets and assuming we can safely reuse the page.

<...>
> > last buffer we don't bother with decrementing the pagecnt_bias and

> > instead just hand the page over to the stack. So what we should have

> > is the page cycling between a pagecnt_bias that is +1-2 of the actual

> > page_ref_count and when the two are equal we then perform the

> > unmap/free or recycle of the page.

>

> What does "last buffer" mean?

> The driver does not know whether the buffer is the last one or not as the

> pagecnt_bias is hidden inside the page pool.


It depends on use case. If we are doing something like classic
page_pool with one use per page then every buffer would be the last
buffer so we technically wouldn't need to decrement it after the page
has been recycled. If we are doing the page_frag type model it would
be the last fragment of the page being used.

> >

> > On the Tx and SKB side of things we are using the page_ref_count to

> > track which instances can be recycled and should only ever be reading

> > pagecnt_bias.

>

> pagecnt_bias in this patch *does* indeed being only read for SKB side.

> I suppose Tx side is for XDP?


Yes.

> >

> > At recycle time we will need to verify there are enough tickets to

> > support another run through the allocator. We may want to look at

> > adding a value to the page pool to track the maximum number of slices

> > a page can be broken into in order to avoid having to update the

> > page_ref_count and pagecnt_bias too often.

>

> Why is page_ref_count and pagecnt_bias not enough to do the job?

> The user have provided the frag_size and we know about the page size, so

> we should be able to ensure pagecnt_bias is big enough for the maximum

> number of slices when allocating the first frag of page.


As I mentioned before, we shouldn't just be arbitrarily rewinding. And
resetting every time the page is freed would be expensive. So the idea
is to have a check: as long as pagecnt_bias is greater than the
number of fragments we will break out of a single page, we don't need
to update pagecnt_bias or page_ref_count when the page is recycled.

<...>
> >>>>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

> >>>>>> index b2db9cd..7795979 100644

> >>>>>> --- a/include/linux/skbuff.h

> >>>>>> +++ b/include/linux/skbuff.h

> >>>>>> @@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)

> >>>>>>  }

> >>>>>>

> >>>>>>  #ifdef CONFIG_PAGE_POOL

> >>>>>> -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,

> >>>>>> -                                       struct page_pool *pp)

> >>>>>> +static inline void skb_mark_for_recycle(struct sk_buff *skb)

> >>>>>>  {

> >>>>>>         skb->pp_recycle = 1;

> >>>>>> -       page_pool_store_mem_info(page, pp);

> >>>>>>  }

> >>>>>>  #endif

> >>>>>

> >>>>> I am not a fan of the pp_recycle flag either. We duplicate it via

> >>>>> skb_clone and from what I can tell if we call pskb_expand_head

> >>>>> afterwards I don't see how we avoid recycling the page frags twice.

> >>>>

> >>>> Acctually skb->pp_recycle is kind of duplicated, as there is

> >>>> still page->pp_magic to avoid recycling the page frags twice.

> >>>>

> >>>> The argument above adding skb->pp_recycle seems to be short

> >>>> cut code path for non-page_pool case in the previous disscusion,

> >>>> see [2].

> >>>>

> >>>> 2. https://lore.kernel.org/linux-mm/074b0d1d-9531-57f3-8e0e-a447387478d1@huawei.com/

> >>>

> >>> Yes, but that doesn't guarantee atomic protections so you still have

> >>> race conditions possible. All it takes is something stalling during

> >>> the dma_unamp call. Worse yet from what I can tell it looks like you

> >>> clear page->pp before you clear page->pp_magic so you have the

> >>> potential for a NULL pointer issue since it is cleared before the

> >>> pp_magic value is.

> >>

> >> Hopefully the page_ref_dec_return() in page_pool_bias_page_recyclable()

> >> called by page_pool_put_page() will make the order of page->pp_magic

> >> clearing and page->pp clearing irrelevant?

> >

> > Really it doesn't address the issue. The problem is the clearing of

> > pp_magic is after the dec_and_ref while the reading/clearing of

> > page->pp is before it.

> >

> > So having code like the following is not safe:

> >     pp = page->pp;

> >     page->pp = NULL;

> >

> >     if (pp->something)

> >         do_something();

> >

> > The check for page->pp_magic before this doens't resolve it because 2

> > threads can get into the code path before either one has updated

> > page->pp_magic.

>

> I suppose the above issue is the one you and Ilias are discussing?


Yes. I think we are getting that sorted out.

> >

> > Arguably the pagecnt_bias does something to help, but what it has

> > effectively done is created a ticket lock where until you can get

> > page_ref_count to reach the pagecnt_bias value you cannot unmap or

> > free the page. So the tradeoff is that if anyone takes a reference to

> > the page you are now stuck and cannot unmap it nor remove the device

> > while the page is still in use elsewhere.

> >

> > Also it just occurred to me that this will cause likely leaks because

> > page_ref_count is also updated outside of page_pool so we would have

> > to worry about someone calling get_page, then your call to

> > page_pool_bias_page_recyclable, and then put page and at that point

> > the page is leaked.

>

> Yes, as mentioned in the previous discussion:

>

> "Yes, the expectation is changed to we can always recycle the page

> when the last user has dropped the refcnt that has given to it when

> the page is not pfmemalloced.

>

> The above expectation is based on that the last user will always

> call page_pool_put_full_page() in order to do the recycling or do

> the resource cleanup(dma unmaping..etc).


The problem is we cannot make that assumption. The memory management
subsystem has a number of operations that will take a reference on the
page as long as its refcount is not zero, and that are completely
unrelated to networking. So that breaks this whole concept, as do the
fixes needed to deal with the skb_clone/pskb_expand_head issue.

> As the skb_free_head() and skb_release_data() have both checked the

> skb->pp_recycle to call the page_pool_put_full_page() if needed, I

> think we are safe for most case, the one case I am not so sure above

> is the rx zero copy, which seems to also bump up the refcnt before

> mapping the page to user space, we might need to ensure rx zero copy

> is not the last user of the page or if it is the last user, make sure

> it calls page_pool_put_full_page() too."


That isn't going to work. In order for this patch set to work you
would effectively have to somehow modify put_page, since that is
called on the page at a number of points throughout the kernel. That
is the whole reason for the checks against page_ref_count != 1 in the
__page_pool_put_page call: the first call to it has to perform the
unmapping if something else is still holding onto the page.

<...>
> >>>>>> @@ -284,6 +335,25 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

> >>>>>>         return page;

> >>>>>>  }

> >>>>>>

> >>>>>> +static void page_pool_sub_bias(struct page *page, int nr)

> >>>>>> +{

> >>>>>> +       struct page_pool_info *pp_info = page->pp_info;

> >>>>>> +

> >>>>>> +       /* "pp_info->pagecnt_bias == 0" indicates the PAGECNT_BIAS

> >>>>>> +        * flags is not set.

> >>>>>> +        */

> >>>>>> +       if (!pp_info->pagecnt_bias)

> >>>>>> +               return;

> >>>>>> +

> >>>>>> +       /* Make sure pagecnt_bias > 0 for elevated refcnt case */

> >>>>>> +       if (unlikely(pp_info->pagecnt_bias <= nr)) {

> >>>>>> +               page_ref_add(page, USHRT_MAX);

> >>>>>> +               pp_info->pagecnt_bias += USHRT_MAX;

> >>>>>> +       }

> >>>>>> +

> >>>>>> +       pp_info->pagecnt_bias -= nr;

> >>>>>

> >>>>> So we should never have a case where pagecnt_bias is less than the

> >>>>> value we are subtracting. If we have that then it is a bug.

> >>>>

> >>>> Yes.

> >>>

> >>> Sorry, I was referring to the code above comparing pagecnt_bias to nr.

> >>> At most nr should only ever be equal to pagecnt_bias, you should hold

> >>> off on recharging pagecnt_bias until you can verify the page_count

> >>> indicates we are the only holder of the page. Then we can recharge it

> >>> and reset any offsets.

> >>

> >> Actually the page pool is the only user of the page when the driver is

> >> calling page_pool_alloc_frag(), page is from pool->alloc/pool->ring or

> >> page allocator in page_pool_alloc_pages(), as memtioned above, the

> >> last user will put the page in pool->ring holding a lock, and when

> >> page_pool_alloc_pages() get a page (also holding the same lock) from

> >> pool->ring, there should be no user of the page other than the page pool.

> >>

> >> And page_pool_sub_bias() is called in page_pool_alloc_frag() and

> >> page_pool_alloc_pages().

> >

> > I think we would need to see a version of this patch without the

> > alloc_frag calls in order to really be able to do a review. The

> > problem is I don't see how the page_pool_alloc_frag can expect to have

> > sole ownership of the page if it is allocating fragments of the page.

> > The frags call imply multiple users for a single page.

>

> The driver calls page_pool_alloc_frag(), and page_pool_alloc_frag()

> will call page_pool_alloc_pages() to allocate a new page if the

> pool->frag_page is NULL or there is no frag left in the pool->frag_page

> (using pool->frag_offset and pool->frag_size to decide if there is any

> frag left), and when the new page is allocated, it will decide how many

> frag the page has by using PAGE_SIZE and pool->frag_size, which also mean

> how many user will be using the page, so the "page_ref - (pagecnt_bias + 1)"

> is the number of the user will using the page at the time when the first frag

> is allocated, and pagecnt_bias is only updated for the first user of the page,

> for subsequent user, just use the pool->frag_offset to decide which frag to

> allocate if there is still frag left, and pagecnt_bias does not need changing

> for subsequent user of the same page.


The point I am getting at is that the patch is doing too much and we
are essentially trying to discuss 3 to 4 patches worth of content in
one email thread which has us going in circles. It would be better to
break the page_frag case out of this patch and into one of its own for
us to discuss separately as too many issues with the patch set are
being conflated.

<...>
> >>>

> >>>> Or the page pool will call page_pool_put_full_page() in page_pool_empty_frag()

> >>>> if some of the page frag is not allocated to the driver yet.

> >>>>

> >>>> It seems you are suggesting a slightly different way to do frag reusing.

> >>>

> >>> As I mentioned I am not a fan of the current recycling scheme. There

> >>> are too many openings for it to end up unmapping the same page

> >>> multiple times or other possible issues.

> >>

> >> Other than the pagecnt_bias handling in non-atomic context, I think

> >> most of the race you mentioned above has been handled if I understand

> >> it correctly?

> >

> > The biggest issue is that if we assume this to be more of a ticket

> > lock model, you have threads outside of this that are using

> > get_page/put_page that will mess with your tickets and cause leaks

> > because your unlocker may end up getting a non-matching ticket even

> > though it is the last call to __page_pool_put_page.

>

> Yes, we need to make sure there is no get_page/put_page messing with

> this process. Or if there is, make sure there is a __page_pool_put_page()

> after get_page/put_page.


We can't. That is the fundamental problem of this patch set. We won't
be able to enforce that kind of change on the memory management
subsystem.

<...>
> >

> > Again this is why I think it would be better to just maintain a list

> > of inflight pages and then unmap them fro the driver if they are still

> > on the list greater than some fixed period of time.

>

> I am not sure if adding a list of inflight pages is the proper way

> to solve the problem if the page is not returned to the page for a

> very long time.

>

> Maybe we should find out why the page is not returned to page pool

> and fix it if that happen?


There are plenty of reasons for something like that to occur. For all
we know it could be that we are sitting on the one 4K page in a 2M
hugepage that prevents the memory subsystem from compacting it into a
2M page instead of leaving it as a bunch of order 0 pages. In such a
case the correct behavior would be for us to give up the page. We
cannot assume we are safe to sit on a page for eternity.

This is why in my mind it would make sense to just maintain a list of
pages we have returned to the stack and if they aren't freed up in
some given period of time we just unmap them and drop the reference we
are holding to them.

<...>
> >>>>>

> >>>>>> +               if (unlikely(!frag_page)) {

> >>>>>> +                       pool->frag_page = NULL;

> >>>>>> +                       return NULL;

> >>>>>> +               }

> >>>>>> +

> >>>>>> +               pool->frag_page = frag_page;

> >>>>>> +               frag_offset = 0;

> >>>>>> +

> >>>>>> +               page_pool_sub_bias(frag_page, max_len / frag_size - 1);

> >>>>>

> >>>>> Why are you doing division here? We should just be subtracting 1 from

> >>>>> the pagecnt_bias since that is the number of buffers that are being

> >>>>> used. The general idea is that when pagecnt_bias is 0 we cut the page

> >>>>> loose for potential recycling or freeing, otherwise we just subtract

> >>>>> our new value from pagecnt_bias until we reach it.

> >>>>

> >>>> As mentioned above, division is used to find out how many user may be

> >>>> using the page.

> >>>

> >>> That doesn't make any sense to me because it won't tell you the actual

> >>> users, and from what I can tell it is buggy since if I use this to

> >>> allocate a chunk larger than 2K this comes out to 0 doesn't it? It

> >>> seems like you should just always use 1 as the count.

> >>

> >> There is already a page_pool_sub_bias(page, 1) in page_pool_alloc_pages(),

> >> so for 4K page, there is two users for a page with 2K frag size, and there

> >> is 32 users for 64K page with 2K frag size.

> >>

> >> The reason doing a page_pool_sub_bias(page, 1) in page_pool_alloc_pages()

> >> is that the caller is expected to use the page as a whole when using the

> >> page_pool_alloc_pages() directly, so it means only one user.

> >

> > The logic doesn't make any sense. You shouldn't need to do any

> > subtraction then. The idea is you subtract 1 per frag pulled from the

> > page. The logic you have here just doesn't make sense as you are

> > making smaller frags pull additional bias counts. If I pull a small

> > fragment I could consume the entire bias in a single call.

>

> I am not sure I understand the above comment.

> Basically the page returned from page_pool_alloc_pages() is expected

> to be used by one user, when page_pool_alloc_frag() use that page to

> serve more users, it decides the total user using "max_len / frag_size",

> as there is already one user added in page_pool_alloc_pages(), so only

> "max_len / frag_size - 1" more user need adding(adding more user is by

> calling page_pool_sub_bias(), which is kind of confusing as the "sub"

> word).


I see. So effectively you are just batching the pagecnt_bias update.
Still not a huge fan of the idea since that division effectively costs
you about the same as something like a dozen or more decrement
operations. You would be better off just updating once per frag
instead of trying to batch that.

<...>
> > Ideally this would be broken out into smaller patches so it is easier

> > to review as there are currently several issues that we are talking

> > about here in parallel which is making the discussion confusing.

>

> Ok, will split this patch to more reviewable one.


Thanks
Yunsheng Lin July 10, 2021, 9:16 a.m. UTC | #23
On 2021/7/9 22:15, Alexander Duyck wrote:
> On Thu, Jul 8, 2021 at 11:26 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:

>>

>> On 2021/7/8 23:36, Alexander Duyck wrote:

>>> On Wed, Jul 7, 2021 at 7:27 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:

>>>>

>>>> On 2021/7/7 23:01, Alexander Duyck wrote:

>>>>> On Tue, Jul 6, 2021 at 8:05 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:

>>>>>>

>>>>>> On 2021/7/7 4:45, Alexander Duyck wrote:

>>>>>>> On Wed, Jun 30, 2021 at 2:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:

>>>>>>>>

>>>>>>>> Currently page pool only support page recycling only when

>>>>>>>> refcnt of page is one, which means it can not support the

>>>>>>>> split page recycling implemented in the most ethernet driver.

>>>>>>>>

>>>>>>>> So add elevated refcnt support in page pool, and support

>>>>>>>> allocating page frag to enable multi-frames-per-page based

>>>>>>>> on the elevated refcnt support.

>>>>>>>>

>>>>>>>> As the elevated refcnt is per page, and there is no space

>>>>>>>> for that in "struct page" now, so add a dynamically allocated

>>>>>>>> "struct page_pool_info" to record page pool ptr and refcnt

>>>>>>>> corrsponding to a page for now. Later, we can recycle the

>>>>>>>> "struct page_pool_info" too, or use part of page memory to

>>>>>>>> record pp_info.

>>>>>>>>

>>>>>>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>

>>>>>>

>>>>>> Hi, Alexander

>>>>>>

>>>>>> Thanks for detailed reviewing.

>>>>>>

>>>>>>>

>>>>>>> So this isn't going to work with the current recycling logic. The

>>>>>>> expectation there is that we can safely unmap the entire page as soon

>>>>>>> as the reference count is greater than 1.

>>>>>>

>>>>>> Yes, the expectation is changed to we can always recycle the page

>>>>>> when the last user has dropped the refcnt that has given to it when

>>>>>> the page is not pfmemalloced.

>>>>>>

>>>>>> The above expectation is based on that the last user will always

>>>>>> call page_pool_put_full_page() in order to do the recycling or do

>>>>>> the resource cleanup(dma unmaping..etc).

>>>>>>

>>>>>> As the skb_free_head() and skb_release_data() have both checked the

>>>>>> skb->pp_recycle to call the page_pool_put_full_page() if needed, I

>>>>>> think we are safe for most case, the one case I am not so sure above

>>>>>> is the rx zero copy, which seems to also bump up the refcnt before

>>>>>> mapping the page to user space, we might need to ensure rx zero copy

>>>>>> is not the last user of the page or if it is the last user, make sure

>>>>>> it calls page_pool_put_full_page() too.

>>>>>

>>>>> Yes, but the skb->pp_recycle value is per skb, not per page. So my

>>>>> concern is that carrying around that value can be problematic as there

>>>>> are a number of possible cases where the pages might be

>>>>> unintentionally recycled. All it would take is for a packet to get

>>>>> cloned a few times and then somebody starts using pskb_expand_head and

>>>>> you would have multiple cases, possibly simultaneously, of entities

>>>>> trying to free the page. I just worry it opens us up to a number of

>>>>> possible races.

>>>>

>>>> I think page_ref_dec_return() in page_pool_bias_page_recyclable() will

>>>> prevent the above race to happen.

>>>>

>>>> As the page_ref_dec_return() and page_pool_bias_page_recyclable() return

>>>> true, all user of the page have done with the p->pp_magic and p->pp_info,

>>>> so it should be ok to reset the p->pp_magic and p->pp_info in any order?

>>>>

>>>> And page_ref_dec_return() has both __atomic_pre_full_fence() and

>>>> __atomic_post_full_fence() to ensure the above ordering.

>>>

>>> So if I understand correctly what you are saying is that because of

>>> the pagecnt_bias check we will not hit the page_pool_release_page.

>>> That may help to address the issue introduced by the recycling patch

>>> but I don't think it completely resolves it. In addition there may be

>>> performance implications to this change since you are requiring the

>>> atomic dec for every page.

>>>

>>> The difference between pagecnt_bias and what you have here is that we

>>> freed the page when page_ref_count hit 0. With this approach you are

>>> effectively freeing the page when page_ref_count == pagecnt_bias +

>>> modifier. The two implementations have quite a number of differences

>>> in behavior.

>>>

>>> What you have effectively done here is make the page refcount and

>>> pagecnt_bias effectively into a ticket lock where we cannot call the

>>> free function until page_ref_cnt == pagecnt_bias + 1. So you need to

>>> keep the pagecnt_bias much lower than the page_ref_cnt otherwise you

>>> run the risk of frequent recycling. For the non-shared page_pool pages

>>> this is probably fine, however the frags implementation is horribly

>>> broken.

>>

>> Yes, if ticket lock is the name for that.

>>

>> I suppose "non-shared page_pool pages" mean caller allocates the page by

>> calling page_pool_alloc_pages() directly for elevated refcnt case, right?

>>

>> The main difference between page_pool_alloc_pages() and page_pool_alloc_frag()

>> for elevated refcnt case is how many tickets have been given out, so I

>> am not sure why giving out one ticket is ok, and giving out more than one

>> ticket is broken?

> 

> The model for page_pool_alloc_frag is that you are giving out one

> slice of the page at a time. The general idea is you are allocating

> variable sized sections of the page, so normally you cannot predict

> exactly how many references will be needed.


Usually how the driver splits the page is fixed for a given rx
configuration (like MTU), so the driver is able to pass that info to
the page pool, and the page pool can use that info to calculate how many
references will be needed.

> 

> In addition division is an extremely expensive operation when you

> aren't working with a constant power of 2. As such for any fastpath

> thing such as an allocation you want to try to avoid it if at all

> possible.

> 

>>>

>>> Also the ticketlock approach is flawed because with something like

>>> that we shouldn't rewind the number we are currently serving like we

>>> do. We would have to wait until we are the only one holding the page

>>> before we could recycle previously used values.

>>

>> I am not sure I understand the above.

>>

>> I suppose it means we might not be able to clean up the resource(mainly

>> to do unmapping and drain the page_ref according to pagecnt_bias) while

>> the stack is still holding the reference to the page, which is possible

>> for the current reusing implemented in most driver.

>>

>> But one good thing come out of that is we might still be able to reuse

>> the page when the stack release the reference to the page later, which

>> is not possible for the current reusing implemented in most driver.

> 

> There are several flaws with the approach.

> 

> 1. The fact that external entities can all get_page/put_page which may

> cause __page_pool_put_page to miss the case where it would have

> otherwise found that page_ref_count == pagecnt_bias.


Ok, like the rx zero copy one, right?

> 

> 2. Rewinding the page without first verifying it owns all references.

> Technically that is an exploitable issue as someone would just have to

> take 64K references at just the right time to cause page_ref_count ==

> pagecnt_bias. Not a likely issue but technically not a correct thing

> to do either as it does open a window for exploitation.


I am not sure I understand the above; more specifically, what does
"Rewinding" mean?
Does it mean the pagecnt_bias and page_ref_count manipulation in
page_pool_sub_bias()?

When page_pool_sub_bias() is called, we can ensure no one but the
page pool owns the page if the page_ref_dec_return() in
page_pool_bias_page_recyclable() can ensure the correct ordering.

> 

> 3. Generally any sort of count rewind waits until we know we are the

> only ones holding onto the page. Basically we have to verify the

> page_ref_count == 1 or page_ref_count == pagecnt_bias case before

> resetting offsets and assuming we can safely reuse the page.


This one seems like the above one?

> 

> <...>

>>> last buffer we don't bother with decrementing the pagecnt_bias and

>>> instead just hand the page over to the stack. So what we should have

>>> is the page cycling between a pagecnt_bias that is +1-2 of the actual

>>> page_ref_count and when the two are equal we then perform the

>>> unmap/free or recycle of the page.

>>

>> What does "last buffer" mean?

>> The driver does not know whether the buffer is the last one or not as the

>> pagecnt_bias is hidden inside the page pool.

> 

> It depends on use case. If we are doing something like classic

> page_pool with one use per page then every buffer would be the last

> buffer so we technically wouldn't need to decrement it after the page

> has been recycled. If we are doing the page_frag type model it would

> be the last fragment of the page being used.


For the page frag implemented in this patch, it does not really matter
whether it is the last frag.

A ticket is given to every user of each frag; when the users of all the frags
from the same page have returned their tickets to the page pool, the page pool
can recycle the page.

> 

>>>

>>> On the Tx and SKB side of things we are using the page_ref_count to

>>> track which instances can be recycled and should only ever be reading

>>> pagecnt_bias.

>>

>> pagecnt_bias in this patch *does* indeed being only read for SKB side.

>> I suppose Tx side is for XDP?

> 

> Yes.


Is this about XDP_TX and XDP_REDIRECT?
It seems XDP_TX happens in the NAPI polling of rx, so it should be
able to tell which page pool the page is from?
For XDP_REDIRECT, it seems it should be handled in __xdp_return()?

> 

>>>

>>> At recycle time we will need to verify there are enough tickets to

>>> support another run through the allocator. We may want to look at

>>> adding a value to the page pool to track the maximum number of slices

>>> a page can be broken into in order to avoid having to update the

>>> page_ref_count and pagecnt_bias too often.

>>

>> Why is page_ref_count and pagecnt_bias not enough to do the job?

>> The user have provided the frag_size and we know about the page size, so

>> we should be able to ensure pagecnt_bias is big enough for the maximum

>> number of slices when allocating the first frag of page.

> 

> As I mentioned before we shouldn't be just arbitrarily rewinding. And

> resetting every time the page is freed would be expensive. So the idea

> is to have a check and as long as pagecnt_bias is greater than the

> number of fragments we will break out of a single page we don't need

> to update pagecnt_bias or page_ref_count when the page is recycled.


I am not sure I follow the above.
As I have split this patch into more reviewable ones in RFC v2, let's discuss
in RFC v2 whether there is a better way to avoid updating pagecnt_bias or
page_ref_count frequently.

> 

> <...>

>>>>>>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

>>>>>>>> index b2db9cd..7795979 100644

>>>>>>>> --- a/include/linux/skbuff.h

>>>>>>>> +++ b/include/linux/skbuff.h

>>>>>>>> @@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)

>>>>>>>>  }

>>>>>>>>

>>>>>>>>  #ifdef CONFIG_PAGE_POOL

>>>>>>>> -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,

>>>>>>>> -                                       struct page_pool *pp)

>>>>>>>> +static inline void skb_mark_for_recycle(struct sk_buff *skb)

>>>>>>>>  {

>>>>>>>>         skb->pp_recycle = 1;

>>>>>>>> -       page_pool_store_mem_info(page, pp);

>>>>>>>>  }

>>>>>>>>  #endif

>>>>>>>

>>>>>>> I am not a fan of the pp_recycle flag either. We duplicate it via

>>>>>>> skb_clone and from what I can tell if we call pskb_expand_head

>>>>>>> afterwards I don't see how we avoid recycling the page frags twice.

>>>>>>

>>>>>> Acctually skb->pp_recycle is kind of duplicated, as there is

>>>>>> still page->pp_magic to avoid recycling the page frags twice.

>>>>>>

>>>>>> The argument above adding skb->pp_recycle seems to be short

>>>>>> cut code path for non-page_pool case in the previous disscusion,

>>>>>> see [2].

>>>>>>

>>>>>> 2. https://lore.kernel.org/linux-mm/074b0d1d-9531-57f3-8e0e-a447387478d1@huawei.com/

>>>>>

>>>>> Yes, but that doesn't guarantee atomic protections so you still have

>>>>> race conditions possible. All it takes is something stalling during

>>>>> the dma_unamp call. Worse yet from what I can tell it looks like you

>>>>> clear page->pp before you clear page->pp_magic so you have the

>>>>> potential for a NULL pointer issue since it is cleared before the

>>>>> pp_magic value is.

>>>>

>>>> Hopefully the page_ref_dec_return() in page_pool_bias_page_recyclable()

>>>> called by page_pool_put_page() will make the order of page->pp_magic

>>>> clearing and page->pp clearing irrelevant?

>>>

>>> Really it doesn't address the issue. The problem is the clearing of

>>> pp_magic is after the dec_and_ref while the reading/clearing of

>>> page->pp is before it.

>>>

>>> So having code like the following is not safe:

>>>     pp = page->pp;

>>>     page->pp = NULL;

>>>

>>>     if (pp->something)

>>>         do_something();

>>>

>>> The check for page->pp_magic before this doens't resolve it because 2

>>> threads can get into the code path before either one has updated

>>> page->pp_magic.

>>

>> I suppose the above issue is the one you and Ilias are discussing?

> 

> Yes. I think we are getting that sorted out.

> 

>>>

>>> Arguably the pagecnt_bias does something to help, but what it has

>>> effectively done is created a ticket lock where until you can get

>>> page_ref_count to reach the pagecnt_bias value you cannot unmap or

>>> free the page. So the tradeoff is that if anyone takes a reference to

>>> the page you are now stuck and cannot unmap it nor remove the device

>>> while the page is still in use elsewhere.

>>>

>>> Also it just occurred to me that this will cause likely leaks because

>>> page_ref_count is also updated outside of page_pool so we would have

>>> to worry about someone calling get_page, then your call to

>>> page_pool_bias_page_recyclable, and then put page and at that point

>>> the page is leaked.

>>

>> Yes, as mentioned in the previous discussion:

>>

>> "Yes, the expectation is changed to we can always recycle the page

>> when the last user has dropped the refcnt that has given to it when

>> the page is not pfmemalloced.

>>

>> The above expectation is based on that the last user will always

>> call page_pool_put_full_page() in order to do the recycling or do

>> the resource cleanup(dma unmaping..etc).

> 

> The problem is we cannot make that assumption. The memory management

> subsystem has a number of operations that will take a reference on the

> page as long as it is not zero and is completely unrelated to

> networking. So that breaks this whole concept. As does the fixes

> needed to deal with the skb_clone/pskb_expand_head issue.


I suppose the page_ref_dec_return() in this patch can deal with the
skb_clone/pskb_expand_head issue too?

> 

>> As the skb_free_head() and skb_release_data() have both checked the

>> skb->pp_recycle to call the page_pool_put_full_page() if needed, I

>> think we are safe for most case, the one case I am not so sure above

>> is the rx zero copy, which seems to also bump up the refcnt before

>> mapping the page to user space, we might need to ensure rx zero copy

>> is not the last user of the page or if it is the last user, make sure

>> it calls page_pool_put_full_page() too."

> 

> That isn't going to work. In order for this patch set to work you

> would effectively have to somehow modify put_page since that is used

> at a number of given points throughout the kernel on the page. That is

> the whole reason for the checks against page_ref_count != 1 in the

> __page_pool_put_page call since it is the first call to it that will

> have to perform the unmapping if something else is holding onto the

> page.


Let's discuss this below.

> 

> <...>

>>>>>>>> @@ -284,6 +335,25 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,

>>>>>>>>         return page;

>>>>>>>>  }

>>>>>>>>

>>>>>>>> +static void page_pool_sub_bias(struct page *page, int nr)

>>>>>>>> +{

>>>>>>>> +       struct page_pool_info *pp_info = page->pp_info;

>>>>>>>> +

>>>>>>>> +       /* "pp_info->pagecnt_bias == 0" indicates the PAGECNT_BIAS

>>>>>>>> +        * flags is not set.

>>>>>>>> +        */

>>>>>>>> +       if (!pp_info->pagecnt_bias)

>>>>>>>> +               return;

>>>>>>>> +

>>>>>>>> +       /* Make sure pagecnt_bias > 0 for elevated refcnt case */

>>>>>>>> +       if (unlikely(pp_info->pagecnt_bias <= nr)) {

>>>>>>>> +               page_ref_add(page, USHRT_MAX);

>>>>>>>> +               pp_info->pagecnt_bias += USHRT_MAX;

>>>>>>>> +       }

>>>>>>>> +

>>>>>>>> +       pp_info->pagecnt_bias -= nr;

>>>>>>>

>>>>>>> So we should never have a case where pagecnt_bias is less than the

>>>>>>> value we are subtracting. If we have that then it is a bug.

>>>>>>

>>>>>> Yes.

>>>>>

>>>>> Sorry, I was referring to the code above comparing pagecnt_bias to nr.

>>>>> At most nr should only ever be equal to pagecnt_bias, you should hold

>>>>> off on recharging pagecnt_bias until you can verify the page_count

>>>>> indicates we are the only holder of the page. Then we can recharge it

>>>>> and reset any offsets.

>>>>

>>>> Actually the page pool is the only user of the page when the driver is

>>>> calling page_pool_alloc_frag(), page is from pool->alloc/pool->ring or

>>>> page allocator in page_pool_alloc_pages(), as memtioned above, the

>>>> last user will put the page in pool->ring holding a lock, and when

>>>> page_pool_alloc_pages() get a page (also holding the same lock) from

>>>> pool->ring, there should be no user of the page other than the page pool.

>>>>

>>>> And page_pool_sub_bias() is called in page_pool_alloc_frag() and

>>>> page_pool_alloc_pages().

>>>

>>> I think we would need to see a version of this patch without the

>>> alloc_frag calls in order to really be able to do a review. The

>>> problem is I don't see how the page_pool_alloc_frag can expect to have

>>> sole ownership of the page if it is allocating fragments of the page.

>>> The frags call imply multiple users for a single page.

>>

>> The driver calls page_pool_alloc_frag(), and page_pool_alloc_frag()

>> will call page_pool_alloc_pages() to allocate a new page if the

>> pool->frag_page is NULL or there is no frag left in the pool->frag_page

>> (using pool->frag_offset and pool->frag_size to decide if there is any

>> frag left), and when the new page is allocated, it will decide how many

>> frag the page has by using PAGE_SIZE and pool->frag_size, which also mean

>> how many user will be using the page, so the "page_ref - (pagecnt_bias + 1)"

>> is the number of the user will using the page at the time when the first frag

>> is allocated, and pagecnt_bias is only updated for the first user of the page,

>> for subsequent user, just use the pool->frag_offset to decide which frag to

>> allocate if there is still frag left, and pagecnt_bias does not need changing

>> for subsequent user of the same page.

> 

> The point I am getting at is that the patch is doing too much and we

> are essentially trying to discuss 3 to 4 patches worth of content in

> one email thread which has us going in circles. It would be better to

> break the page_frag case out of this patch and into one of its own for

> us to discuss separately as too many issues with the patch set are

> being conflated.

> 

> <...>

>>>>>

>>>>>> Or the page pool will call page_pool_put_full_page() in page_pool_empty_frag()

>>>>>> if some of the page frag is not allocated to the driver yet.

>>>>>>

>>>>>> It seems you are suggesting a slightly different way to do frag reusing.

>>>>>

>>>>> As I mentioned I am not a fan of the current recycling scheme. There

>>>>> are too many openings for it to end up unmapping the same page

>>>>> multiple times or other possible issues.

>>>>

>>>> Other than the pagecnt_bias handling in non-atomic context, I think

>>>> most of the race you mentioned above has been handled if I understand

>>>> it correctly?

>>>

>>> The biggest issue is that if we assume this to be more of a ticket

>>> lock model, you have threads outside of this that are using

>>> get_page/put_page that will mess with your tickets and cause leaks

>>> because your unlocker may end up getting a non-matching ticket even

>>> though it is the last call to __page_pool_put_page.

>>

>> Yes, we need to make sure there is no get_page/put_page messing with

>> this process. Or if there is, make sure there is a __page_pool_put_page()

>> after get_page/put_page.

> 

> We can't. That is the fundamental problem of this patch set. We won't

> be able to enforce that kind of change on the memory management

> subsystem.

> 

> <...>

>>>

>>> Again this is why I think it would be better to just maintain a list

>>> of inflight pages and then unmap them fro the driver if they are still

>>> on the list greater than some fixed period of time.

>>

>> I am not sure if adding a list of inflight pages is the proper way

>> to solve the problem if the page is not returned to the page for a

>> very long time.

>>

>> Maybe we should find out why the page is not returned to page pool

>> and fix it if that happen?

> 

> There are plenty of reasons for something like that to occur. For all

> we know it could be that we are sitting on the one 4K page in a 2M

> hugepage that prevents the memory subsystem from compacting it into a

> 2M page instead of leaving it as a bunch of order 0 pages. In such a

> case the correct behavior would be for us to give up the page. We

> cannot assume we are safe to sit on a page for eternity.


But the reason the page pool cannot give up the page may be that there is
still someone holding the page; if this is the case, the page pool
giving up the page does not seem to solve the problem?

> 

> This is why in my mind it would make sense to just maintain a list of

> pages we have returned to the stack and if they aren't freed up in

> some given period of time we just unmap them and drop the reference we

> are holding to them.


My initial thinking is that maintaining the inflight pages seems complex.
Consider that we might need to add a page to a list when it is given
to a user and remove it from the list when the user returns the page,
and the list pointer needs to be per page too. It might need a background
worker to release a page if it sits on the list too long, and
the worker also needs to deal with concurrent users giving back pages too.

> 

> <...>

>>>>>>>

>>>>>>>> +               if (unlikely(!frag_page)) {

>>>>>>>> +                       pool->frag_page = NULL;

>>>>>>>> +                       return NULL;

>>>>>>>> +               }

>>>>>>>> +

>>>>>>>> +               pool->frag_page = frag_page;

>>>>>>>> +               frag_offset = 0;

>>>>>>>> +

>>>>>>>> +               page_pool_sub_bias(frag_page, max_len / frag_size - 1);

>>>>>>>

>>>>>>> Why are you doing division here? We should just be subtracting 1 from

>>>>>>> the pagecnt_bias since that is the number of buffers that are being

>>>>>>> used. The general idea is that when pagecnt_bias is 0 we cut the page

>>>>>>> loose for potential recycling or freeing, otherwise we just subtract

>>>>>>> our new value from pagecnt_bias until we reach it.

>>>>>>

>>>>>> As mentioned above, division is used to find out how many user may be

>>>>>> using the page.

>>>>>

>>>>> That doesn't make any sense to me because it won't tell you the actual

>>>>> users, and from what I can tell it is buggy since if I use this to

>>>>> allocate a chunk larger than 2K this comes out to 0 doesn't it? It

>>>>> seems like you should just always use 1 as the count.

>>>>

>>>> There is already a page_pool_sub_bias(page, 1) in page_pool_alloc_pages(),

>>>> so for 4K page, there is two users for a page with 2K frag size, and there

>>>> is 32 users for 64K page with 2K frag size.

>>>>

>>>> The reason doing a page_pool_sub_bias(page, 1) in page_pool_alloc_pages()

>>>> is that the caller is expected to use the page as a whole when using the

>>>> page_pool_alloc_pages() directly, so it means only one user.

>>>

>>> The logic doesn't make any sense. You shouldn't need to do any

>>> subtraction then. The idea is you subtract 1 per frag pulled from the

>>> page. The logic you have here just doesn't make sense as you are

>>> making smaller frags pull additional bias counts. If I pull a small

>>> fragment I could consume the entire bias in a single call.

>>

>> I am not sure I understand the above comment.

>> Basically the page returned from page_pool_alloc_pages() is expected

>> to be used by one user, when page_pool_alloc_frag() use that page to

>> serve more users, it decides the total user using "max_len / frag_size",

>> as there is already one user added in page_pool_alloc_pages(), so only

>> "max_len / frag_size - 1" more user need adding(adding more user is by

>> calling page_pool_sub_bias(), which is kind of confusing as the "sub"

>> word).

> 

> I see. So effectively you are just batching the pagecnt_bias update.

> Still not a huge fan of the idea since that division effectively costs

> you about the same as something like a dozen or more decrement

> operations. You would be better off just updating once per frag

> instead of trying to batch that.


Updating once per frag does not work for the implementation in this
patch.

Suppose one user uses the first frag and calls page_pool_put_page()
with the frag page: page_pool_put_page() will recycle the page, but the
page is not recyclable yet as it still sits in pool->frag_page for
users to allocate the rest of the frags.

> 

> <...>

>>> Ideally this would be broken out into smaller patches so it is easier

>>> to review as there are currently several issues that we are talking

>>> about here in parallel which is making the discussion confusing.

>>

>> Ok, will split this patch to more reviewable one.

> 

> Thanks

> _______________________________________________

> Linuxarm mailing list -- linuxarm@openeuler.org

> To unsubscribe send an email to linuxarm-leave@openeuler.org

>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 88a7550..5a29af2 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2327,7 +2327,7 @@  mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
 	if (!skb)
 		return ERR_PTR(-ENOMEM);
 
-	skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);
+	skb_mark_for_recycle(skb);
 
 	skb_reserve(skb, xdp->data - xdp->data_hard_start);
 	skb_put(skb, xdp->data_end - xdp->data);
@@ -2339,10 +2339,6 @@  mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
 		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
 				skb_frag_page(frag), skb_frag_off(frag),
 				skb_frag_size(frag), PAGE_SIZE);
-		/* We don't need to reset pp_recycle here. It's already set, so
-		 * just mark fragments for recycling.
-		 */
-		page_pool_store_mem_info(skb_frag_page(frag), pool);
 	}
 
 	return skb;
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 3135220..540e387 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -3997,7 +3997,7 @@  static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,
 		}
 
 		if (pp)
-			skb_mark_for_recycle(skb, page, pp);
+			skb_mark_for_recycle(skb);
 		else
 			dma_unmap_single_attrs(dev->dev.parent, dma_addr,
 					       bm_pool->buf_size, DMA_FROM_DEVICE,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 862f88a..cf613df 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -101,7 +101,7 @@  struct page {
 			 * page_pool allocated pages.
 			 */
 			unsigned long pp_magic;
-			struct page_pool *pp;
+			struct page_pool_info *pp_info;
 			unsigned long _pp_mapping_pad;
 			/**
 			 * @dma_addr: might require a 64-bit value on
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b2db9cd..7795979 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4711,11 +4711,9 @@  static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
 }
 
 #ifdef CONFIG_PAGE_POOL
-static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,
-					struct page_pool *pp)
+static inline void skb_mark_for_recycle(struct sk_buff *skb)
 {
 	skb->pp_recycle = 1;
-	page_pool_store_mem_info(page, pp);
 }
 #endif
 
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 3dd62dd..44e7545 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -45,7 +45,9 @@ 
 					* Please note DMA-sync-for-CPU is still
 					* device driver responsibility
 					*/
-#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)
+#define PP_FLAG_PAGECNT_BIAS	BIT(2)	/* Enable elevated refcnt */
+#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV |\
+				 PP_FLAG_PAGECNT_BIAS)
 
 /*
  * Fast allocation side cache array/stack
@@ -77,6 +79,7 @@  struct page_pool_params {
 	enum dma_data_direction dma_dir; /* DMA mapping direction */
 	unsigned int	max_len; /* max DMA sync memory size */
 	unsigned int	offset;  /* DMA addr offset */
+	unsigned int	frag_size;
 };
 
 struct page_pool {
@@ -88,6 +91,8 @@  struct page_pool {
 	unsigned long defer_warn;
 
 	u32 pages_state_hold_cnt;
+	unsigned int frag_offset;
+	struct page *frag_page;
 
 	/*
 	 * Data structure for allocation side
@@ -128,6 +133,11 @@  struct page_pool {
 	u64 destroy_cnt;
 };
 
+struct page_pool_info {
+	struct page_pool *pp;
+	int pagecnt_bias;
+};
+
 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
 
 static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
@@ -137,6 +147,17 @@  static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 	return page_pool_alloc_pages(pool, gfp);
 }
 
+struct page *page_pool_alloc_frag(struct page_pool *pool,
+				  unsigned int *offset, gfp_t gfp);
+
+static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
+						    unsigned int *offset)
+{
+	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+	return page_pool_alloc_frag(pool, offset, gfp);
+}
+
 /* get the stored dma direction. A driver might decide to treat this locally and
  * avoid the extra cache line from page_pool to determine the direction
  */
@@ -253,11 +274,4 @@  static inline void page_pool_ring_unlock(struct page_pool *pool)
 		spin_unlock_bh(&pool->ring.producer_lock);
 }
 
-/* Store mem_info on struct page and use it while recycling skb frags */
-static inline
-void page_pool_store_mem_info(struct page *page, struct page_pool *pp)
-{
-	page->pp = pp;
-}
-
 #endif /* _NET_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5e4eb45..95d94a7 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -206,6 +206,49 @@  static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 	return true;
 }
 
+static int page_pool_set_pp_info(struct page_pool *pool,
+				 struct page *page, gfp_t gfp)
+{
+	struct page_pool_info *pp_info;
+
+	pp_info = kzalloc_node(sizeof(*pp_info), gfp, pool->p.nid);
+	if (!pp_info)
+		return -ENOMEM;
+
+	if (pool->p.flags & PP_FLAG_PAGECNT_BIAS) {
+		page_ref_add(page, USHRT_MAX);
+		pp_info->pagecnt_bias = USHRT_MAX;
+	} else {
+		pp_info->pagecnt_bias = 0;
+	}
+
+	page->pp_magic |= PP_SIGNATURE;
+	pp_info->pp = pool;
+	page->pp_info = pp_info;
+	return 0;
+}
+
+static int page_pool_clear_pp_info(struct page *page)
+{
+	struct page_pool_info *pp_info = page->pp_info;
+	int bias;
+
+	bias = pp_info->pagecnt_bias;
+
+	kfree(pp_info);
+	page->pp_info = NULL;
+	page->pp_magic = 0;
+
+	return bias;
+}
+
+static void page_pool_clear_and_drain_page(struct page *page)
+{
+	int bias = page_pool_clear_pp_info(page);
+
+	__page_frag_cache_drain(page, bias + 1);
+}
+
 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 						 gfp_t gfp)
 {
@@ -216,13 +259,16 @@  static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 	if (unlikely(!page))
 		return NULL;
 
-	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
-	    unlikely(!page_pool_dma_map(pool, page))) {
+	if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {
 		put_page(page);
 		return NULL;
 	}
 
-	page->pp_magic |= PP_SIGNATURE;
+	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
+	    unlikely(!page_pool_dma_map(pool, page))) {
+		page_pool_clear_and_drain_page(page);
+		return NULL;
+	}
 
 	/* Track how many pages are held 'in-flight' */
 	pool->pages_state_hold_cnt++;
@@ -261,12 +307,17 @@  static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	 */
 	for (i = 0; i < nr_pages; i++) {
 		page = pool->alloc.cache[i];
+		if (unlikely(page_pool_set_pp_info(pool, page, gfp))) {
+			put_page(page);
+			continue;
+		}
+
 		if ((pp_flags & PP_FLAG_DMA_MAP) &&
 		    unlikely(!page_pool_dma_map(pool, page))) {
-			put_page(page);
+			page_pool_clear_and_drain_page(page);
 			continue;
 		}
-		page->pp_magic |= PP_SIGNATURE;
+
 		pool->alloc.cache[pool->alloc.count++] = page;
 		/* Track how many pages are held 'in-flight' */
 		pool->pages_state_hold_cnt++;
@@ -284,6 +335,25 @@  static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	return page;
 }
 
+static void page_pool_sub_bias(struct page *page, int nr)
+{
+	struct page_pool_info *pp_info = page->pp_info;
+
+	/* "pp_info->pagecnt_bias == 0" indicates the PAGECNT_BIAS
+	 * flag is not set.
+	 */
+	if (!pp_info->pagecnt_bias)
+		return;
+
+	/* Make sure pagecnt_bias > 0 for elevated refcnt case */
+	if (unlikely(pp_info->pagecnt_bias <= nr)) {
+		page_ref_add(page, USHRT_MAX);
+		pp_info->pagecnt_bias += USHRT_MAX;
+	}
+
+	pp_info->pagecnt_bias -= nr;
+}
+
 /* For using page_pool replace: alloc_pages() API calls, but provide
  * synchronization guarantee for allocation side.
  */
@@ -293,15 +363,66 @@  struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 
 	/* Fast-path: Get a page from cache */
 	page = __page_pool_get_cached(pool);
-	if (page)
+	if (page) {
+		page_pool_sub_bias(page, 1);
 		return page;
+	}
 
 	/* Slow-path: cache empty, do real allocation */
 	page = __page_pool_alloc_pages_slow(pool, gfp);
+	if (page)
+		page_pool_sub_bias(page, 1);
+
 	return page;
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
 
+struct page *page_pool_alloc_frag(struct page_pool *pool,
+				  unsigned int *offset, gfp_t gfp)
+{
+	unsigned int frag_offset = pool->frag_offset;
+	unsigned int frag_size = pool->p.frag_size;
+	struct page *frag_page = pool->frag_page;
+	unsigned int max_len = pool->p.max_len;
+
+	if (!frag_page || frag_offset + frag_size > max_len) {
+		frag_page = page_pool_alloc_pages(pool, gfp);
+		if (unlikely(!frag_page)) {
+			pool->frag_page = NULL;
+			return NULL;
+		}
+
+		pool->frag_page = frag_page;
+		frag_offset = 0;
+
+		page_pool_sub_bias(frag_page, max_len / frag_size - 1);
+	}
+
+	*offset = frag_offset;
+	pool->frag_offset = frag_offset + frag_size;
+
+	return frag_page;
+}
+EXPORT_SYMBOL(page_pool_alloc_frag);
+
+static void page_pool_empty_frag(struct page_pool *pool)
+{
+	unsigned int frag_offset = pool->frag_offset;
+	unsigned int frag_size = pool->p.frag_size;
+	struct page *frag_page = pool->frag_page;
+	unsigned int max_len = pool->p.max_len;
+
+	if (!frag_page)
+		return;
+
+	while (frag_offset + frag_size <= max_len) {
+		page_pool_put_full_page(pool, frag_page, false);
+		frag_offset += frag_size;
+	}
+
+	pool->frag_page = NULL;
+}
+
 /* Calculate distance between two u32 values, valid if distance is below 2^(31)
  *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
  */
@@ -326,10 +447,11 @@  static s32 page_pool_inflight(struct page_pool *pool)
  * a regular page (that will eventually be returned to the normal
  * page-allocator via put_page).
  */
-void page_pool_release_page(struct page_pool *pool, struct page *page)
+static int __page_pool_release_page(struct page_pool *pool,
+				    struct page *page)
 {
 	dma_addr_t dma;
-	int count;
+	int bias, count;
 
 	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
 		/* Always account for inflight pages, even if we didn't
@@ -345,22 +467,29 @@  void page_pool_release_page(struct page_pool *pool, struct page *page)
 			     DMA_ATTR_SKIP_CPU_SYNC);
 	page_pool_set_dma_addr(page, 0);
 skip_dma_unmap:
-	page->pp_magic = 0;
+	bias = page_pool_clear_pp_info(page);
 
 	/* This may be the last page returned, releasing the pool, so
 	 * it is not safe to reference pool afterwards.
 	 */
 	count = atomic_inc_return(&pool->pages_state_release_cnt);
 	trace_page_pool_state_release(pool, page, count);
+	return bias;
+}
+
+void page_pool_release_page(struct page_pool *pool, struct page *page)
+{
+	int bias = __page_pool_release_page(pool, page);
+
+	WARN_ONCE(bias, "PAGECNT_BIAS is not supposed to be enabled\n");
 }
 EXPORT_SYMBOL(page_pool_release_page);
 
 /* Return a page to the page allocator, cleaning up our state */
 static void page_pool_return_page(struct page_pool *pool, struct page *page)
 {
-	page_pool_release_page(pool, page);
+	__page_frag_cache_drain(page, __page_pool_release_page(pool, page) + 1);
 
-	put_page(page);
 	/* An optimization would be to call __free_pages(page, pool->p.order)
 	 * knowing page is not part of page-cache (thus avoiding a
 	 * __page_cache_release() call).
@@ -395,7 +524,16 @@  static bool page_pool_recycle_in_cache(struct page *page,
 	return true;
 }
 
-/* If the page refcnt == 1, this will try to recycle the page.
+static bool page_pool_bias_page_recyclable(struct page *page, int bias)
+{
+	int ref = page_ref_dec_return(page);
+
+	WARN_ON(ref < bias);
+	return ref == bias + 1;
+}
+
+/* If pagecnt_bias == 0 and the page refcnt == 1, this will try to
+ * recycle the page.
  * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
  * the configured size min(dma_sync_size, pool->max_len).
  * If the page refcnt != 1, then the page will be returned to memory
@@ -405,16 +543,35 @@  static __always_inline struct page *
 __page_pool_put_page(struct page_pool *pool, struct page *page,
 		     unsigned int dma_sync_size, bool allow_direct)
 {
-	/* This allocator is optimized for the XDP mode that uses
+	int bias = page->pp_info->pagecnt_bias;
+
+	/* Handle the elevated refcnt case first:
+	 * multi-frames-per-page, it is likely from the skb, which
+	 * is likely called in non-softirq context, so do not recycle
+	 * it in pool->alloc.
+	 *
+	 * Then handle non-elevated refcnt case:
 	 * one-frame-per-page, but have fallbacks that act like the
 	 * regular page allocator APIs.
-	 *
 	 * refcnt == 1 means page_pool owns page, and can recycle it.
 	 *
 	 * page is NOT reusable when allocated when system is under
 	 * some pressure. (page_is_pfmemalloc)
 	 */
-	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
+	if (bias) {
+		/* We have given some refcnt to the stack, so wait for
+		 * all refcnt of the stack to be decremented before
+		 * enabling recycling.
+		 */
+		if (!page_pool_bias_page_recyclable(page, bias))
+			return NULL;
+
+		/* only enable recycling when it is not pfmemalloced */
+		if (!page_is_pfmemalloc(page))
+			return page;
+
+	} else if (likely(page_ref_count(page) == 1 &&
+			  !page_is_pfmemalloc(page))) {
 		/* Read barrier done in page_ref_count / READ_ONCE */
 
 		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
@@ -428,22 +585,8 @@  __page_pool_put_page(struct page_pool *pool, struct page *page,
 		/* Page found as candidate for recycling */
 		return page;
 	}
-	/* Fallback/non-XDP mode: API user have elevated refcnt.
-	 *
-	 * Many drivers split up the page into fragments, and some
-	 * want to keep doing this to save memory and do refcnt based
-	 * recycling. Support this use case too, to ease drivers
-	 * switching between XDP/non-XDP.
-	 *
-	 * In-case page_pool maintains the DMA mapping, API user must
-	 * call page_pool_put_page once.  In this elevated refcnt
-	 * case, the DMA is unmapped/released, as driver is likely
-	 * doing refcnt based recycle tricks, meaning another process
-	 * will be invoking put_page.
-	 */
-	/* Do not replace this with page_pool_return_page() */
+
 	page_pool_release_page(pool, page);
-	put_page(page);
 
 	return NULL;
 }
@@ -452,6 +595,7 @@  void page_pool_put_page(struct page_pool *pool, struct page *page,
 			unsigned int dma_sync_size, bool allow_direct)
 {
 	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
+
 	if (page && !page_pool_recycle_in_ring(pool, page)) {
 		/* Cache full, fallback to free pages */
 		page_pool_return_page(pool, page);
@@ -503,8 +647,11 @@  static void page_pool_empty_ring(struct page_pool *pool)
 
 	/* Empty recycle ring */
 	while ((page = ptr_ring_consume_bh(&pool->ring))) {
-		/* Verify the refcnt invariant of cached pages */
-		if (!(page_ref_count(page) == 1))
+		/* Verify the refcnt invariant of cached pages for
+		 * non elevated refcnt case.
+		 */
+		if (!(pool->p.flags & PP_FLAG_PAGECNT_BIAS) &&
+		    !(page_ref_count(page) == 1))
 			pr_crit("%s() page_pool refcnt %d violation\n",
 				__func__, page_ref_count(page));
 
@@ -544,6 +691,7 @@  static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
 
 static void page_pool_scrub(struct page_pool *pool)
 {
+	page_pool_empty_frag(pool);
 	page_pool_empty_alloc_cache_once(pool);
 	pool->destroy_cnt++;
 
@@ -637,14 +785,13 @@  bool page_pool_return_skb_page(struct page *page)
 	if (unlikely(page->pp_magic != PP_SIGNATURE))
 		return false;
 
-	pp = page->pp;
+	pp = page->pp_info->pp;
 
 	/* Driver set this to memory recycling info. Reset it on recycle.
 	 * This will *not* work for NIC using a split-page memory model.
 	 * The page will be returned to the pool here regardless of the
 	 * 'flipped' fragment being in use or not.
 	 */
-	page->pp = NULL;
 	page_pool_put_full_page(pp, page, false);
 
 	return true;