From patchwork Sat Jul 10 07:43:18 2021
X-Patchwork-Submitter: Yunsheng Lin
X-Patchwork-Id: 472624
From: Yunsheng Lin <linyunsheng@huawei.com>
Subject: [PATCH rfc v2 1/5] page_pool: keep pp info as long as page pool owns the page
Date: Sat, 10 Jul 2021 15:43:18 +0800
Message-ID: <1625903002-31619-2-git-send-email-linyunsheng@huawei.com>
In-Reply-To: <1625903002-31619-1-git-send-email-linyunsheng@huawei.com>
References: <1625903002-31619-1-git-send-email-linyunsheng@huawei.com>
X-Mailing-List: netdev@vger.kernel.org

Currently page->pp is cleared and set every time the page is recycled,
which is unnecessary.

So only set page->pp when the page is added to the page pool, and only
clear it when the page is released from the page pool.

This is also a preparation to support elevated refcnt in page pool.
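The intended lifecycle can be modelled in a few lines of userspace C (a
minimal sketch, not the kernel implementation; the PP_SIGNATURE value
below is made up):

  #include <stdio.h>

  #define PP_SIGNATURE 0x40UL	/* illustrative value only */

  struct page_pool { int id; };
  struct page { struct page_pool *pp; unsigned long pp_magic; };

  /* written once, when the pool takes ownership of the page */
  static void set_pp_info(struct page_pool *pool, struct page *page)
  {
  	page->pp = pool;
  	page->pp_magic |= PP_SIGNATURE;
  }

  /* erased once, when the page leaves the pool for good */
  static void clear_pp_info(struct page *page)
  {
  	page->pp_magic = 0;
  	page->pp = NULL;
  }

  int main(void)
  {
  	struct page_pool pool = { 1 };
  	struct page page = { 0 };

  	set_pp_info(&pool, &page);	/* page enters the pool */
  	/* ...any number of recycles: pp/pp_magic stay untouched... */
  	clear_pp_info(&page);		/* page released from the pool */
  	printf("%p %#lx\n", (void *)page.pp, page.pp_magic);
  	return 0;
  }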
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 drivers/net/ethernet/marvell/mvneta.c           |  6 +-----
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c |  2 +-
 drivers/net/ethernet/ti/cpsw.c                  |  2 +-
 drivers/net/ethernet/ti/cpsw_new.c              |  2 +-
 include/linux/skbuff.h                          |  4 +---
 include/net/page_pool.h                         |  7 -------
 net/core/page_pool.c                            | 21 +++++++++++++++++----
 7 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 361bc4f..89bf31fd 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2327,7 +2327,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
 	if (!skb)
 		return ERR_PTR(-ENOMEM);
 
-	skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);
+	skb_mark_for_recycle(skb);
 
 	skb_reserve(skb, xdp->data - xdp->data_hard_start);
 	skb_put(skb, xdp->data_end - xdp->data);
@@ -2339,10 +2339,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool,
 		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
 				skb_frag_page(frag), skb_frag_off(frag),
 				skb_frag_size(frag), PAGE_SIZE);
-		/* We don't need to reset pp_recycle here. It's already set, so
-		 * just mark fragments for recycling.
-		 */
-		page_pool_store_mem_info(skb_frag_page(frag), pool);
 	}
 
 	return skb;
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 3229baf..320eddb 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -3995,7 +3995,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,
 		}
 
 		if (pp)
-			skb_mark_for_recycle(skb, page, pp);
+			skb_mark_for_recycle(skb);
 		else
 			dma_unmap_single_attrs(dev->dev.parent, dma_addr,
 					       bm_pool->buf_size, DMA_FROM_DEVICE,
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index cbbd0f6..9d59143 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -431,7 +431,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
 	skb->protocol = eth_type_trans(skb, ndev);
 
 	/* mark skb for recycling */
-	skb_mark_for_recycle(skb, page, pool);
+	skb_mark_for_recycle(skb);
 	netif_receive_skb(skb);
 
 	ndev->stats.rx_bytes += len;
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 57d279f..a4234a3 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -374,7 +374,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
 	skb->protocol = eth_type_trans(skb, ndev);
 
 	/* mark skb for recycling */
-	skb_mark_for_recycle(skb, page, pool);
+	skb_mark_for_recycle(skb);
 	netif_receive_skb(skb);
 
 	ndev->stats.rx_bytes += len;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b2db9cd..7795979 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4711,11 +4711,9 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
 }
 
 #ifdef CONFIG_PAGE_POOL
-static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,
-					struct page_pool *pp)
+static inline void skb_mark_for_recycle(struct sk_buff *skb)
 {
 	skb->pp_recycle = 1;
-	page_pool_store_mem_info(page, pp);
 }
 #endif
 
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 3dd62dd..8d7744d 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -253,11 +253,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool)
 	spin_unlock_bh(&pool->ring.producer_lock);
 }
 
-/* Store mem_info on struct page and use it while recycling skb frags */
-static inline
-void page_pool_store_mem_info(struct page *page, struct page_pool *pp)
-{
-	page->pp = pp;
-}
-
 #endif /* _NET_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5e4eb45..78838c6 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -206,6 +206,19 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 	return true;
 }
 
+static void page_pool_set_pp_info(struct page_pool *pool,
+				  struct page *page)
+{
+	page->pp = pool;
+	page->pp_magic |= PP_SIGNATURE;
+}
+
+static void page_pool_clear_pp_info(struct page *page)
+{
+	page->pp_magic = 0;
+	page->pp = NULL;
+}
+
 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 						 gfp_t gfp)
 {
@@ -222,7 +235,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 		return NULL;
 	}
 
-	page->pp_magic |= PP_SIGNATURE;
+	page_pool_set_pp_info(pool, page);
 
 	/* Track how many pages are held 'in-flight' */
 	pool->pages_state_hold_cnt++;
@@ -266,7 +279,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 			put_page(page);
 			continue;
 		}
-		page->pp_magic |= PP_SIGNATURE;
+
+		page_pool_set_pp_info(pool, page);
 		pool->alloc.cache[pool->alloc.count++] = page;
 		/* Track how many pages are held 'in-flight' */
 		pool->pages_state_hold_cnt++;
@@ -345,7 +359,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 			     DMA_ATTR_SKIP_CPU_SYNC);
 	page_pool_set_dma_addr(page, 0);
 skip_dma_unmap:
-	page->pp_magic = 0;
+	page_pool_clear_pp_info(page);
 
 	/* This may be the last page returned, releasing the pool, so
 	 * it is not safe to reference pool afterwards.
@@ -644,7 +658,6 @@ bool page_pool_return_skb_page(struct page *page)
 	 * The page will be returned to the pool here regardless of the
 	 * 'flipped' fragment being in use or not.
	 */
-	page->pp = NULL;
 	page_pool_put_full_page(pp, page, false);
 
 	return true;

From patchwork Sat Jul 10 07:43:19 2021
X-Patchwork-Submitter: Yunsheng Lin
X-Patchwork-Id: 472818
From: Yunsheng Lin <linyunsheng@huawei.com>
Subject: [PATCH rfc v2 2/5] page_pool: add interface for getting and setting pagecnt_bias
Date: Sat, 10 Jul 2021 15:43:19 +0800
Message-ID: <1625903002-31619-3-git-send-email-linyunsheng@huawei.com>
In-Reply-To: <1625903002-31619-1-git-send-email-linyunsheng@huawei.com>
References: <1625903002-31619-1-git-send-email-linyunsheng@huawei.com>
X-Mailing-List: netdev@vger.kernel.org

As suggested by Alexander, "A DMA mapping should be page aligned anyway
so the lower 12 bits would be reserved 0", so it might make more sense
to repurpose the lower 12 bits of the dma address to store the
pagecnt_bias for the elevated refcnt case in page pool.

As the newly added page_pool_get_pagecnt_bias() may be called outside
of softirq context, annotate the accesses to page->dma_addr[0] with
READ_ONCE() and WRITE_ONCE(). The other three interfaces using
page->dma_addr[0] are only called in softirq context during normal rx
processing, so hopefully the barriers in rx processing will ensure the
correct ordering between getting and setting pagecnt_bias.
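The packing scheme can be checked standalone (a sketch assuming 4K
pages, so the low 12 bits of the word are free; the values are
arbitrary):

  #include <stdio.h>

  #define PAGE_SHIFT 12
  #define PAGE_MASK (~((1UL << PAGE_SHIFT) - 1))

  int main(void)
  {
  	unsigned long word = 0;
  	unsigned long dma = 0xabcde000UL;	/* page-aligned DMA address */
  	int bias = 0x7ff;			/* pagecnt_bias, fits in 12 bits */

  	/* page_pool_set_dma_addr: keep bias bits, replace address bits */
  	word = (word & ~PAGE_MASK) | (dma & PAGE_MASK);
  	/* page_pool_set_pagecnt_bias: keep address bits, replace bias bits */
  	word = (word & PAGE_MASK) | (bias & ~PAGE_MASK);

  	printf("dma=%#lx bias=%#lx\n", word & PAGE_MASK, word & ~PAGE_MASK);
  	return 0;
  }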
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/net/page_pool.h | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 8d7744d..5746f17 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -200,7 +200,7 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 
 static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 {
-	dma_addr_t ret = page->dma_addr[0];
+	dma_addr_t ret = READ_ONCE(page->dma_addr[0]) & PAGE_MASK;
 
 	if (sizeof(dma_addr_t) > sizeof(unsigned long))
 		ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
 	return ret;
@@ -208,11 +208,31 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 
 static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
 {
-	page->dma_addr[0] = addr;
+	unsigned long dma_addr_0 = READ_ONCE(page->dma_addr[0]);
+
+	dma_addr_0 &= ~PAGE_MASK;
+	dma_addr_0 |= (addr & PAGE_MASK);
+	WRITE_ONCE(page->dma_addr[0], dma_addr_0);
+
 	if (sizeof(dma_addr_t) > sizeof(unsigned long))
 		page->dma_addr[1] = upper_32_bits(addr);
 }
 
+static inline int page_pool_get_pagecnt_bias(struct page *page)
+{
+	return (READ_ONCE(page->dma_addr[0]) & ~PAGE_MASK);
+}
+
+static inline void page_pool_set_pagecnt_bias(struct page *page, int bias)
+{
+	unsigned long dma_addr_0 = READ_ONCE(page->dma_addr[0]);
+
+	dma_addr_0 &= PAGE_MASK;
+	dma_addr_0 |= (bias & ~PAGE_MASK);
+
+	WRITE_ONCE(page->dma_addr[0], dma_addr_0);
+}
+
 static inline bool is_page_pool_compiled_in(void)
 {
 #ifdef CONFIG_PAGE_POOL

From patchwork Sat Jul 10 07:43:20 2021
X-Patchwork-Submitter: Yunsheng Lin
X-Patchwork-Id: 472623
From: Yunsheng Lin <linyunsheng@huawei.com>
Subject: [PATCH rfc v2 3/5] page_pool: add page recycling support based on elevated refcnt
Date: Sat, 10 Jul 2021 15:43:20 +0800
Message-ID: <1625903002-31619-4-git-send-email-linyunsheng@huawei.com>
In-Reply-To: <1625903002-31619-1-git-send-email-linyunsheng@huawei.com>
References: <1625903002-31619-1-git-send-email-linyunsheng@huawei.com>
X-Mailing-List: netdev@vger.kernel.org

Currently page pool supports page recycling only when the refcnt of a
page is one, which means it cannot support the split-page recycling
implemented by most drivers.

The expectation of page recycling based on elevated refcnt is that the
page is only recycled or freed when the last user has dropped the
refcnt given to it.

The above expectation relies on the last user always calling
page_pool_put_full_page() in order to do the recycling or to do the
resource cleanup (DMA unmapping, etc.) and freeing.
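The recyclability test this adds can be modelled as follows (a
userspace sketch; page_ref_dec_return() is stood in for by a plain
decrement, and BIAS_MAX assumes a 4K PAGE_SIZE):

  #include <stdio.h>

  #define BIAS_MAX 4095	/* PAGE_SIZE - 1 for 4K pages */

  struct page { int refcount; int bias; };

  /* a page enters the pool with refcount 1 + BIAS_MAX and a full bias;
   * handing out a reference decrements the bias, each put decrements
   * the refcount, and the page is recyclable once refcount == bias + 1
   */
  static int put_is_last_user(struct page *p)
  {
  	int ref = --p->refcount;	/* page_ref_dec_return() */

  	return ref == p->bias + 1;
  }

  int main(void)
  {
  	struct page p = { .refcount = 1 + BIAS_MAX, .bias = BIAS_MAX };

  	p.bias -= 2;				/* two users take a frag each */
  	printf("%d\n", put_is_last_user(&p));	/* 0: one user still active */
  	printf("%d\n", put_is_last_user(&p));	/* 1: last user, recycle */
  	return 0;
  }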
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/net/page_pool.h |   5 ++-
 net/core/page_pool.c    | 106 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 84 insertions(+), 27 deletions(-)

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 5746f17..f0e708d 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -45,7 +45,10 @@
 					* Please note DMA-sync-for-CPU is still
 					* device driver responsibility
 					*/
-#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)
+#define PP_FLAG_PAGECNT_BIAS	BIT(2)	/* For elevated refcnt feature */
+#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP |\
+				 PP_FLAG_DMA_SYNC_DEV |\
+				 PP_FLAG_PAGECNT_BIAS)
 
 /*
  * Fast allocation side cache array/stack
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 78838c6..a87cbe1 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -24,6 +24,8 @@
 #define DEFER_TIME (msecs_to_jiffies(1000))
 #define DEFER_WARN_INTERVAL (60 * HZ)
 
+#define BIAS_MAX	(PAGE_SIZE - 1)
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -209,14 +211,24 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 static void page_pool_set_pp_info(struct page_pool *pool,
 				  struct page *page)
 {
+	if (pool->p.flags & PP_FLAG_PAGECNT_BIAS) {
+		page_ref_add(page, BIAS_MAX);
+		page_pool_set_pagecnt_bias(page, BIAS_MAX);
+	}
+
 	page->pp = pool;
 	page->pp_magic |= PP_SIGNATURE;
 }
 
-static void page_pool_clear_pp_info(struct page *page)
+static int page_pool_clear_pp_info(struct page *page)
 {
+	int bias = page_pool_get_pagecnt_bias(page);
+
 	page->pp_magic = 0;
 	page->pp = NULL;
+	page_pool_set_pagecnt_bias(page, 0);
+
+	return bias;
 }
 
 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
@@ -298,6 +310,23 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	return page;
 }
 
+static void page_pool_sub_bias(struct page_pool *pool,
+			       struct page *page, int nr)
+{
+	int bias;
+
+	if (!(pool->p.flags & PP_FLAG_PAGECNT_BIAS))
+		return;
+
+	bias = page_pool_get_pagecnt_bias(page);
+	if (unlikely(bias <= nr)) {
+		page_ref_add(page, BIAS_MAX - bias);
+		bias = BIAS_MAX;
+	}
+
+	page_pool_set_pagecnt_bias(page, bias - nr);
+}
+
 /* For using page_pool replace: alloc_pages() API calls, but provide
  * synchronization guarantee for allocation side.
  */
@@ -307,11 +336,16 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 
 	/* Fast-path: Get a page from cache */
 	page = __page_pool_get_cached(pool);
-	if (page)
+	if (page) {
+		page_pool_sub_bias(pool, page, 1);
 		return page;
+	}
 
 	/* Slow-path: cache empty, do real allocation */
 	page = __page_pool_alloc_pages_slow(pool, gfp);
+	if (likely(page))
+		page_pool_sub_bias(pool, page, 1);
+
 	return page;
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
@@ -340,10 +374,11 @@ static s32 page_pool_inflight(struct page_pool *pool)
  * a regular page (that will eventually be returned to the normal
  * page-allocator via put_page).
  */
-void page_pool_release_page(struct page_pool *pool, struct page *page)
+static int __page_pool_release_page(struct page_pool *pool,
+				    struct page *page)
 {
+	int bias, count;
 	dma_addr_t dma;
-	int count;
 
 	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
 		/* Always account for inflight pages, even if we didn't
@@ -359,22 +394,30 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 			     DMA_ATTR_SKIP_CPU_SYNC);
 	page_pool_set_dma_addr(page, 0);
 skip_dma_unmap:
-	page_pool_clear_pp_info(page);
+	bias = page_pool_clear_pp_info(page);
 
 	/* This may be the last page returned, releasing the pool, so
 	 * it is not safe to reference pool afterwards.
 	 */
 	count = atomic_inc_return(&pool->pages_state_release_cnt);
 	trace_page_pool_state_release(pool, page, count);
+
+	return bias;
+}
+
+void page_pool_release_page(struct page_pool *pool, struct page *page)
+{
+	int bias = __page_pool_release_page(pool, page);
+
+	WARN_ONCE(bias, "%s is called from driver with elevated refcnt\n",
+		  __func__);
 }
 EXPORT_SYMBOL(page_pool_release_page);
 
 /* Return a page to the page allocator, cleaning up our state */
 static void page_pool_return_page(struct page_pool *pool, struct page *page)
 {
-	page_pool_release_page(pool, page);
-
-	put_page(page);
+	__page_frag_cache_drain(page, __page_pool_release_page(pool, page) + 1);
 
 	/* An optimization would be to call __free_pages(page, pool->p.order)
 	 * knowing page is not part of page-cache (thus avoiding a
 	 * __page_cache_release() call).
@@ -409,6 +452,15 @@ static bool page_pool_recycle_in_cache(struct page *page,
 	return true;
 }
 
+static bool page_pool_bias_page_recyclable(struct page *page, int bias)
+{
+	int ref = page_ref_dec_return(page);
+
+	WARN_ON(ref <= bias);
+
+	return ref == bias + 1;
+}
+
 /* If the page refcnt == 1, this will try to recycle the page.
  * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
  * the configured size min(dma_sync_size, pool->max_len).
@@ -419,6 +471,20 @@ static __always_inline struct page *
 __page_pool_put_page(struct page_pool *pool, struct page *page,
 		     unsigned int dma_sync_size, bool allow_direct)
 {
+	int bias = page_pool_get_pagecnt_bias(page);
+
+	/* Handle the elevated refcnt case first */
+	if (bias) {
+		/* It is not the last user yet */
+		if (!page_pool_bias_page_recyclable(page, bias))
+			return NULL;
+
+		if (likely(!page_is_pfmemalloc(page)))
+			goto recyclable;
+		else
+			goto unrecyclable;
+	}
+
 	/* This allocator is optimized for the XDP mode that uses
 	 * one-frame-per-page, but have fallbacks that act like the
 	 * regular page allocator APIs.
@@ -430,7 +496,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
 	 */
 	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
 		/* Read barrier done in page_ref_count / READ_ONCE */
-
+recyclable:
 		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 			page_pool_dma_sync_for_device(pool, page,
 						      dma_sync_size);
@@ -442,22 +508,9 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
 		/* Page found as candidate for recycling */
 		return page;
 	}
-	/* Fallback/non-XDP mode: API user have elevated refcnt.
-	 *
-	 * Many drivers split up the page into fragments, and some
-	 * want to keep doing this to save memory and do refcnt based
-	 * recycling. Support this use case too, to ease drivers
-	 * switching between XDP/non-XDP.
-	 *
-	 * In-case page_pool maintains the DMA mapping, API user must
-	 * call page_pool_put_page once. In this elevated refcnt
-	 * case, the DMA is unmapped/released, as driver is likely
-	 * doing refcnt based recycle tricks, meaning another process
-	 * will be invoking put_page.
-	 */
-	/* Do not replace this with page_pool_return_page() */
-	page_pool_release_page(pool, page);
-	put_page(page);
+
+unrecyclable:
+	page_pool_return_page(pool, page);
 
 	return NULL;
 }
@@ -518,7 +571,8 @@ static void page_pool_empty_ring(struct page_pool *pool)
 	/* Empty recycle ring */
 	while ((page = ptr_ring_consume_bh(&pool->ring))) {
 		/* Verify the refcnt invariant of cached pages */
-		if (!(page_ref_count(page) == 1))
+		if (!(page_ref_count(page) ==
+		      (page_pool_get_pagecnt_bias(page) + 1)))
 			pr_crit("%s() page_pool refcnt %d violation\n",
 				__func__, page_ref_count(page));

From patchwork Sat Jul 10 07:43:21 2021
X-Patchwork-Submitter: Yunsheng Lin
X-Patchwork-Id: 472816
From: Yunsheng Lin <linyunsheng@huawei.com>
Subject: [PATCH rfc v2 4/5] page_pool: support page frag API for page pool
Date: Sat, 10 Jul 2021 15:43:21 +0800
Message-ID: <1625903002-31619-5-git-send-email-linyunsheng@huawei.com>
In-Reply-To: <1625903002-31619-1-git-send-email-linyunsheng@huawei.com>
References: <1625903002-31619-1-git-send-email-linyunsheng@huawei.com>
X-Mailing-List: netdev@vger.kernel.org

Currently most drivers use a whole page per rx descriptor and do
ping-pong page reuse on top of that.

As page pool now supports page recycling based on elevated refcnt, it
makes sense to add a page frag API to page pool, splitting a page into
different frags to serve multiple descriptors.

This means a huge memory saving for kernels with a 64K page size: one
page can serve 32 descriptors with a 2K buffer size, compared to one
whole page per descriptor currently.
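The splitting arithmetic is simple enough to model standalone (a
sketch with max_len and frag_size fixed; the real API below also
charges the handed-out frags against the page's pagecnt_bias):

  #include <stdio.h>

  #define MAX_LEN 4096		/* pool->p.max_len */
  #define FRAG_SIZE 2048	/* pool->p.frag_size */

  static unsigned int frag_offset;
  static int frag_page = -1;	/* index of the current backing page */

  /* carve MAX_LEN / FRAG_SIZE frags out of each backing page, moving
   * to a fresh page once the current one cannot fit another frag
   */
  static int alloc_frag(unsigned int *offset)
  {
  	if (frag_page < 0 || frag_offset + FRAG_SIZE > MAX_LEN) {
  		frag_page++;	/* "allocate" a new page */
  		frag_offset = 0;
  	}

  	*offset = frag_offset;
  	frag_offset += FRAG_SIZE;
  	return frag_page;
  }

  int main(void)
  {
  	unsigned int off;
  	int i;

  	for (i = 0; i < 5; i++) {
  		int page = alloc_frag(&off);

  		printf("frag %d: page %d offset %u\n", i, page, off);
  	}
  	return 0;
  }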
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/net/page_pool.h | 14 ++++++++++++++
 net/core/page_pool.c    | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index f0e708d..06a5e43 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -80,6 +80,7 @@ struct page_pool_params {
 	enum dma_data_direction dma_dir; /* DMA mapping direction */
 	unsigned int	max_len; /* max DMA sync memory size */
 	unsigned int	offset;  /* DMA addr offset */
+	unsigned int	frag_size;
 };
 
 struct page_pool {
@@ -91,6 +92,8 @@ struct page_pool {
 	unsigned long defer_warn;
 
 	u32 pages_state_hold_cnt;
+	unsigned int frag_offset;
+	struct page *frag_page;
 
 	/*
 	 * Data structure for allocation side
@@ -140,6 +143,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 	return page_pool_alloc_pages(pool, gfp);
 }
 
+struct page *page_pool_alloc_frag(struct page_pool *pool,
+				  unsigned int *offset, gfp_t gfp);
+
+static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
+						    unsigned int *offset)
+{
+	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+	return page_pool_alloc_frag(pool, offset, gfp);
+}
+
 /* get the stored dma direction. A driver might decide to treat this locally and
  * avoid the extra cache line from page_pool to determine the direction
  */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index a87cbe1..b787033 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -350,6 +350,53 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
 
+struct page *page_pool_alloc_frag(struct page_pool *pool,
+				  unsigned int *offset, gfp_t gfp)
+{
+	unsigned int frag_offset = pool->frag_offset;
+	unsigned int frag_size = pool->p.frag_size;
+	struct page *frag_page = pool->frag_page;
+	unsigned int max_len = pool->p.max_len;
+
+	if (!frag_page || frag_offset + frag_size > max_len) {
+		frag_page = page_pool_alloc_pages(pool, gfp);
+		if (unlikely(!frag_page)) {
+			pool->frag_page = NULL;
+			return NULL;
+		}
+
+		pool->frag_page = frag_page;
+		frag_offset = 0;
+
+		page_pool_sub_bias(pool, frag_page,
+				   max_len / frag_size - 1);
+	}
+
+	*offset = frag_offset;
+	pool->frag_offset = frag_offset + frag_size;
+
+	return frag_page;
+}
+EXPORT_SYMBOL(page_pool_alloc_frag);
+
+static void page_pool_empty_frag(struct page_pool *pool)
+{
+	unsigned int frag_offset = pool->frag_offset;
+	unsigned int frag_size = pool->p.frag_size;
+	struct page *frag_page = pool->frag_page;
+	unsigned int max_len = pool->p.max_len;
+
+	if (!frag_page)
+		return;
+
+	while (frag_offset + frag_size <= max_len) {
+		page_pool_put_full_page(pool, frag_page, false);
+		frag_offset += frag_size;
+	}
+
+	pool->frag_page = NULL;
+}
+
 /* Calculate distance between two u32 values, valid if distance is below 2^(31)
  * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
  */
@@ -670,6 +717,8 @@ void page_pool_destroy(struct page_pool *pool)
 	if (!page_pool_put(pool))
 		return;
 
+	page_pool_empty_frag(pool);
+
 	if (!page_pool_release(pool))
 		return;

From patchwork Sat Jul 10 07:43:22 2021
X-Patchwork-Submitter: Yunsheng Lin
X-Patchwork-Id: 472817
From: Yunsheng Lin <linyunsheng@huawei.com>
Subject: [PATCH rfc v2 5/5] net: hns3: support skb's frag page recycling based on page pool
Date: Sat, 10 Jul 2021 15:43:22 +0800
Message-ID: <1625903002-31619-6-git-send-email-linyunsheng@huawei.com>
In-Reply-To: <1625903002-31619-1-git-send-email-linyunsheng@huawei.com>
References: <1625903002-31619-1-git-send-email-linyunsheng@huawei.com>
X-Mailing-List: netdev@vger.kernel.org

This patch adds skb frag page recycling support based on the elevated
refcnt support in page pool.

Performance improves by 10~20% with IOMMU disabled, and by about 2x
when IOMMU is enabled and the iperf server shares the same cpu with
irq/NAPI.
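The memory-saving math for a 64K-page kernel can be checked standalone
(a back-of-envelope sketch; the 2K buffer size corresponds to
hns3_buf_size() below, and the 1024 descriptors per ring are only an
example value):

  #include <stdio.h>

  int main(void)
  {
  	unsigned int page_size = 64 * 1024;	/* 64K PAGE_SIZE */
  	unsigned int buf_size = 2048;		/* hns3_buf_size(ring) */
  	unsigned int desc_num = 1024;		/* example ring->desc_num */

  	/* one page now serves many descriptors instead of one */
  	printf("descs per page: %u\n", page_size / buf_size);
  	/* .pool_size = ring->desc_num * hns3_buf_size(ring) / PAGE_SIZE */
  	printf("pool_size: %u pages\n", desc_num * buf_size / page_size);
  	return 0;
  }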
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 79 +++++++++++++++++++++++--
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h |  3 +
 2 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index cdb5f14..a76e0f7 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3205,6 +3205,20 @@ static int hns3_alloc_buffer(struct hns3_enet_ring *ring,
 	unsigned int order = hns3_page_order(ring);
 	struct page *p;
 
+	if (ring->page_pool) {
+		p = page_pool_dev_alloc_frag(ring->page_pool,
+					     &cb->page_offset);
+		if (unlikely(!p))
+			return -ENOMEM;
+
+		cb->priv = p;
+		cb->buf = page_address(p);
+		cb->dma = page_pool_get_dma_addr(p);
+		cb->type = DESC_TYPE_FRAG;
+		cb->reuse_flag = 0;
+		return 0;
+	}
+
 	p = dev_alloc_pages(order);
 	if (!p)
 		return -ENOMEM;
@@ -3227,8 +3241,12 @@ static void hns3_free_buffer(struct hns3_enet_ring *ring,
 	if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD |
 			DESC_TYPE_BOUNCE_ALL | DESC_TYPE_SGL_SKB))
 		napi_consume_skb(cb->priv, budget);
-	else if (!HNAE3_IS_TX_RING(ring) && cb->pagecnt_bias)
-		__page_frag_cache_drain(cb->priv, cb->pagecnt_bias);
+	else if (!HNAE3_IS_TX_RING(ring)) {
+		if (cb->type & DESC_TYPE_PAGE && cb->pagecnt_bias)
+			__page_frag_cache_drain(cb->priv, cb->pagecnt_bias);
+		else if (cb->type & DESC_TYPE_FRAG)
+			page_pool_put_full_page(ring->page_pool, cb->priv, false);
+	}
 	memset(cb, 0, sizeof(*cb));
 }
 
@@ -3315,7 +3333,7 @@ static int hns3_alloc_and_map_buffer(struct hns3_enet_ring *ring,
 	int ret;
 
 	ret = hns3_alloc_buffer(ring, cb);
-	if (ret)
+	if (ret || ring->page_pool)
 		goto out;
 
 	ret = hns3_map_buffer(ring, cb);
@@ -3337,7 +3355,8 @@ static int hns3_alloc_and_attach_buffer(struct hns3_enet_ring *ring, int i)
 	if (ret)
 		return ret;
 
-	ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma);
+	ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma +
+					 ring->desc_cb[i].page_offset);
 
 	return 0;
 }
@@ -3367,7 +3386,8 @@ static void hns3_replace_buffer(struct hns3_enet_ring *ring, int i,
 {
 	hns3_unmap_buffer(ring, &ring->desc_cb[i]);
 	ring->desc_cb[i] = *res_cb;
-	ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma);
+	ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma +
+					 ring->desc_cb[i].page_offset);
 	ring->desc[i].rx.bd_base_info = 0;
 }
 
@@ -3539,6 +3559,12 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i,
 	u32 frag_size = size - pull_len;
 	bool reused;
 
+	if (ring->page_pool) {
+		skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset,
+				frag_size, truesize);
+		return;
+	}
+
 	/* Avoid re-using remote or pfmem page */
 	if (unlikely(!dev_page_is_reusable(desc_cb->priv)))
 		goto out;
@@ -3856,6 +3882,9 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length,
 		/* We can reuse buffer as-is, just make sure it is reusable */
 		if (dev_page_is_reusable(desc_cb->priv))
 			desc_cb->reuse_flag = 1;
+		else if (desc_cb->type & DESC_TYPE_FRAG)
+			page_pool_put_full_page(ring->page_pool, desc_cb->priv,
+						false);
 		else /* This page cannot be reused so discard it */
 			__page_frag_cache_drain(desc_cb->priv,
 						desc_cb->pagecnt_bias);
@@ -3863,6 +3892,10 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length,
 		hns3_rx_ring_move_fw(ring);
 		return 0;
 	}
+
+	if (ring->page_pool)
+		skb_mark_for_recycle(skb);
+
 	u64_stats_update_begin(&ring->syncp);
 	ring->stats.seg_pkt_cnt++;
 	u64_stats_update_end(&ring->syncp);
@@ -3901,6 +3934,10 @@ static int hns3_add_frag(struct hns3_enet_ring *ring)
 				    "alloc rx fraglist skb fail\n");
 			return -ENXIO;
 		}
+
+		if (ring->page_pool)
+			skb_mark_for_recycle(new_skb);
+
 		ring->frag_num = 0;
 
 		if (ring->tail_skb) {
@@ -4705,6 +4742,30 @@ static void hns3_put_ring_config(struct hns3_nic_priv *priv)
 	priv->ring = NULL;
 }
 
+static void hns3_alloc_page_pool(struct hns3_enet_ring *ring)
+{
+	struct page_pool_params pp_params = {
+		.flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGECNT_BIAS,
+		.order = hns3_page_order(ring),
+		.pool_size = ring->desc_num * hns3_buf_size(ring) / PAGE_SIZE,
+		.nid = dev_to_node(ring_to_dev(ring)),
+		.dev = ring_to_dev(ring),
+		.dma_dir = DMA_FROM_DEVICE,
+		.offset = 0,
+		.max_len = PAGE_SIZE,
+		.frag_size = hns3_buf_size(ring),
+	};
+
+	ring->page_pool = page_pool_create(&pp_params);
+	if (IS_ERR(ring->page_pool)) {
+		dev_warn(ring_to_dev(ring), "page pool creation failed: %ld\n",
+			 PTR_ERR(ring->page_pool));
+		ring->page_pool = NULL;
+	} else {
+		dev_info(ring_to_dev(ring), "page pool creation succeeded\n");
+	}
+}
+
 static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring)
 {
 	int ret;
@@ -4724,6 +4785,8 @@ static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring)
 		goto out_with_desc_cb;
 
 	if (!HNAE3_IS_TX_RING(ring)) {
+		hns3_alloc_page_pool(ring);
+
 		ret = hns3_alloc_ring_buffers(ring);
 		if (ret)
 			goto out_with_desc;
@@ -4764,6 +4827,12 @@ void hns3_fini_ring(struct hns3_enet_ring *ring)
 		devm_kfree(ring_to_dev(ring), tx_spare);
 		ring->tx_spare = NULL;
 	}
+
+	if (!HNAE3_IS_TX_RING(ring) && ring->page_pool) {
+		page_pool_destroy(ring->page_pool);
+		ring->page_pool = NULL;
+		dev_info(ring_to_dev(ring), "page pool destroyed\n");
+	}
 }
 
 static int hns3_buf_size2type(u32 buf_size)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 15af3d9..115c0ce 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -6,6 +6,7 @@
 
 #include <linux/dim.h>
 #include <linux/if_vlan.h>
+#include <net/page_pool.h>
 
 #include "hnae3.h"
 
@@ -307,6 +308,7 @@ enum hns3_desc_type {
 	DESC_TYPE_BOUNCE_ALL	= 1 << 3,
 	DESC_TYPE_BOUNCE_HEAD	= 1 << 4,
 	DESC_TYPE_SGL_SKB	= 1 << 5,
+	DESC_TYPE_FRAG		= 1 << 6,
 };
 
 struct hns3_desc_cb {
@@ -451,6 +453,7 @@ struct hns3_enet_ring {
 	struct hnae3_queue *tqp;
 	int queue_index;
 	struct device *dev; /* will be used for DMA mapping of descriptors */
+	struct page_pool *page_pool;
 
 	/* statistic */
 	struct ring_stats stats;