diff mbox series

[v6] mm/zswap: move to use crypto_acomp API for hardware acceleration

Message ID 20200818123100.4140-1-song.bao.hua@hisilicon.com
State Superseded
Headers show
Series [v6] mm/zswap: move to use crypto_acomp API for hardware acceleration | expand

Commit Message

Song Bao Hua (Barry Song) Aug. 18, 2020, 12:31 p.m. UTC
Right now, all new ZIP drivers are adapted to crypto_acomp APIs rather
than legacy crypto_comp APIs. Tradiontal ZIP drivers like lz4,lzo etc
have been also wrapped into acomp via scomp backend. But zswap.c is still
using the old APIs. That means zswap won't be able to work on any new
ZIP drivers in kernel.

This patch moves to use cryto_acomp APIs to fix the disconnected bridge
between new ZIP drivers and zswap. It is probably the first real user
to use acomp but perhaps not a good example to demonstrate how multiple
acomp requests can be executed in parallel in one acomp instance.
frontswap is doing page load and store page by page synchronously.
swap_writepage() depends on the completion of frontswap_store() to
decide if it should call __swap_writepage() to swap to disk.

However this patch creates multiple acomp instances, so multiple threads
running on multiple different cpus can actually do (de)compression
parallelly, leveraging the power of multiple ZIP hardware queues. This
is also consistent with frontswap's page management model.

The old zswap code uses atomic context and avoids the race conditions
while shared resources like zswap_dstmem are accessed. Here since acomp
can sleep, per-cpu mutex is used to replace preemption-disable.

While it is possible to make mm/page_io.c and mm/frontswap.c support
async (de)compression in some way, the entire design requires careful
thinking and performance evaluation. For the first step, the base with
fixed connection between ZIP drivers and zswap should be built.

Cc: Luis Claudio R. Goncalves <lgoncalv@redhat.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: David S. Miller <davem@davemloft.net>
Cc: Mahipal Challa <mahipalreddy2006@gmail.com>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Zhou Wang <wangzhou1@hisilicon.com>
Cc: Hao Fang <fanghao11@huawei.com>
Cc: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
 -v6:
 * rebase on top of 5.9-rc1;
 * move to crypto_alloc_acomp_node() API to use local ZIP hardware

 mm/zswap.c | 183 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 138 insertions(+), 45 deletions(-)

Comments

Vitaly Wool Sept. 28, 2020, 12:44 p.m. UTC | #1
On Tue, Aug 18, 2020 at 2:34 PM Barry Song <song.bao.hua@hisilicon.com> wrote:
>
> Right now, all new ZIP drivers are adapted to crypto_acomp APIs rather
> than legacy crypto_comp APIs. Tradiontal ZIP drivers like lz4,lzo etc
> have been also wrapped into acomp via scomp backend. But zswap.c is still
> using the old APIs. That means zswap won't be able to work on any new
> ZIP drivers in kernel.
>
> This patch moves to use cryto_acomp APIs to fix the disconnected bridge
> between new ZIP drivers and zswap. It is probably the first real user
> to use acomp but perhaps not a good example to demonstrate how multiple
> acomp requests can be executed in parallel in one acomp instance.
> frontswap is doing page load and store page by page synchronously.
> swap_writepage() depends on the completion of frontswap_store() to
> decide if it should call __swap_writepage() to swap to disk.
>
> However this patch creates multiple acomp instances, so multiple threads
> running on multiple different cpus can actually do (de)compression
> parallelly, leveraging the power of multiple ZIP hardware queues. This
> is also consistent with frontswap's page management model.
>
> The old zswap code uses atomic context and avoids the race conditions
> while shared resources like zswap_dstmem are accessed. Here since acomp
> can sleep, per-cpu mutex is used to replace preemption-disable.
>
> While it is possible to make mm/page_io.c and mm/frontswap.c support
> async (de)compression in some way, the entire design requires careful
> thinking and performance evaluation. For the first step, the base with
> fixed connection between ZIP drivers and zswap should be built.
>
> Cc: Luis Claudio R. Goncalves <lgoncalv@redhat.com>
> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: Mahipal Challa <mahipalreddy2006@gmail.com>
> Cc: Seth Jennings <sjenning@redhat.com>
> Cc: Dan Streetman <ddstreet@ieee.org>
> Cc: Vitaly Wool <vitaly.wool@konsulko.com>
> Cc: Zhou Wang <wangzhou1@hisilicon.com>
> Cc: Hao Fang <fanghao11@huawei.com>
> Cc: Colin Ian King <colin.king@canonical.com>
> Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>

Acked-by: Vitaly Wool <vitalywool@gmail.com>

> ---
>  -v6:
>  * rebase on top of 5.9-rc1;
>  * move to crypto_alloc_acomp_node() API to use local ZIP hardware
>
>  mm/zswap.c | 183 ++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 138 insertions(+), 45 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index fbb782924ccc..00b5f14a7332 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -24,8 +24,10 @@
>  #include <linux/rbtree.h>
>  #include <linux/swap.h>
>  #include <linux/crypto.h>
> +#include <linux/scatterlist.h>
>  #include <linux/mempool.h>
>  #include <linux/zpool.h>
> +#include <crypto/acompress.h>
>
>  #include <linux/mm_types.h>
>  #include <linux/page-flags.h>
> @@ -127,9 +129,17 @@ module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
>  * data structures
>  **********************************/
>
> +struct crypto_acomp_ctx {
> +       struct crypto_acomp *acomp;
> +       struct acomp_req *req;
> +       struct crypto_wait wait;
> +       u8 *dstmem;
> +       struct mutex *mutex;
> +};
> +
>  struct zswap_pool {
>         struct zpool *zpool;
> -       struct crypto_comp * __percpu *tfm;
> +       struct crypto_acomp_ctx __percpu *acomp_ctx;
>         struct kref kref;
>         struct list_head list;
>         struct work_struct release_work;
> @@ -388,23 +398,43 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
>  * per-cpu code
>  **********************************/
>  static DEFINE_PER_CPU(u8 *, zswap_dstmem);
> +/*
> + * If users dynamically change the zpool type and compressor at runtime, i.e.
> + * zswap is running, zswap can have more than one zpool on one cpu, but they
> + * are sharing dtsmem. So we need this mutex to be per-cpu.
> + */
> +static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
>
>  static int zswap_dstmem_prepare(unsigned int cpu)
>  {
> +       struct mutex *mutex;
>         u8 *dst;
>
>         dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
>         if (!dst)
>                 return -ENOMEM;
>
> +       mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
> +       if (!mutex) {
> +               kfree(dst);
> +               return -ENOMEM;
> +       }
> +
> +       mutex_init(mutex);
>         per_cpu(zswap_dstmem, cpu) = dst;
> +       per_cpu(zswap_mutex, cpu) = mutex;
>         return 0;
>  }
>
>  static int zswap_dstmem_dead(unsigned int cpu)
>  {
> +       struct mutex *mutex;
>         u8 *dst;
>
> +       mutex = per_cpu(zswap_mutex, cpu);
> +       kfree(mutex);
> +       per_cpu(zswap_mutex, cpu) = NULL;
> +
>         dst = per_cpu(zswap_dstmem, cpu);
>         kfree(dst);
>         per_cpu(zswap_dstmem, cpu) = NULL;
> @@ -415,30 +445,54 @@ static int zswap_dstmem_dead(unsigned int cpu)
>  static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>  {
>         struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> -       struct crypto_comp *tfm;
> -
> -       if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
> -               return 0;
> +       struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> +       struct crypto_acomp *acomp;
> +       struct acomp_req *req;
> +
> +       acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
> +       if (IS_ERR(acomp)) {
> +               pr_err("could not alloc crypto acomp %s : %ld\n",
> +                               pool->tfm_name, PTR_ERR(acomp));
> +               return PTR_ERR(acomp);
> +       }
> +       acomp_ctx->acomp = acomp;
>
> -       tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
> -       if (IS_ERR_OR_NULL(tfm)) {
> -               pr_err("could not alloc crypto comp %s : %ld\n",
> -                      pool->tfm_name, PTR_ERR(tfm));
> +       req = acomp_request_alloc(acomp_ctx->acomp);
> +       if (!req) {
> +               pr_err("could not alloc crypto acomp_request %s\n",
> +                      pool->tfm_name);
> +               crypto_free_acomp(acomp_ctx->acomp);
>                 return -ENOMEM;
>         }
> -       *per_cpu_ptr(pool->tfm, cpu) = tfm;
> +       acomp_ctx->req = req;
> +
> +       crypto_init_wait(&acomp_ctx->wait);
> +       /*
> +        * if the backend of acomp is async zip, crypto_req_done() will wakeup
> +        * crypto_wait_req(); if the backend of acomp is scomp, the callback
> +        * won't be called, crypto_wait_req() will return without blocking.
> +        */
> +       acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> +                                  crypto_req_done, &acomp_ctx->wait);
> +
> +       acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
> +       acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
> +
>         return 0;
>  }
>
>  static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
>  {
>         struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> -       struct crypto_comp *tfm;
> +       struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> +
> +       if (!IS_ERR_OR_NULL(acomp_ctx)) {
> +               if (!IS_ERR_OR_NULL(acomp_ctx->req))
> +                       acomp_request_free(acomp_ctx->req);
> +               if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> +                       crypto_free_acomp(acomp_ctx->acomp);
> +       }
>
> -       tfm = *per_cpu_ptr(pool->tfm, cpu);
> -       if (!IS_ERR_OR_NULL(tfm))
> -               crypto_free_comp(tfm);
> -       *per_cpu_ptr(pool->tfm, cpu) = NULL;
>         return 0;
>  }
>
> @@ -561,8 +615,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>         pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
>
>         strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
> -       pool->tfm = alloc_percpu(struct crypto_comp *);
> -       if (!pool->tfm) {
> +
> +       pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
> +       if (!pool->acomp_ctx) {
>                 pr_err("percpu alloc failed\n");
>                 goto error;
>         }
> @@ -585,7 +640,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>         return pool;
>
>  error:
> -       free_percpu(pool->tfm);
> +       if (pool->acomp_ctx)
> +               free_percpu(pool->acomp_ctx);
>         if (pool->zpool)
>                 zpool_destroy_pool(pool->zpool);
>         kfree(pool);
> @@ -596,14 +652,14 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void)
>  {
>         bool has_comp, has_zpool;
>
> -       has_comp = crypto_has_comp(zswap_compressor, 0, 0);
> +       has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
>         if (!has_comp && strcmp(zswap_compressor,
>                                 CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
>                 pr_err("compressor %s not available, using default %s\n",
>                        zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
>                 param_free_charp(&zswap_compressor);
>                 zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
> -               has_comp = crypto_has_comp(zswap_compressor, 0, 0);
> +               has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
>         }
>         if (!has_comp) {
>                 pr_err("default compressor %s not available\n",
> @@ -639,7 +695,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
>         zswap_pool_debug("destroying", pool);
>
>         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> -       free_percpu(pool->tfm);
> +       free_percpu(pool->acomp_ctx);
>         zpool_destroy_pool(pool->zpool);
>         kfree(pool);
>  }
> @@ -723,7 +779,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
>                 }
>                 type = s;
>         } else if (!compressor) {
> -               if (!crypto_has_comp(s, 0, 0)) {
> +               if (!crypto_has_acomp(s, 0, 0)) {
>                         pr_err("compressor %s not available\n", s);
>                         return -ENOENT;
>                 }
> @@ -774,7 +830,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
>                  * failed, maybe both compressor and zpool params were bad.
>                  * Allow changing this param, so pool creation will succeed
>                  * when the other param is changed. We already verified this
> -                * param is ok in the zpool_has_pool() or crypto_has_comp()
> +                * param is ok in the zpool_has_pool() or crypto_has_acomp()
>                  * checks above.
>                  */
>                 ret = param_set_charp(s, kp);
> @@ -876,7 +932,9 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
>         pgoff_t offset;
>         struct zswap_entry *entry;
>         struct page *page;
> -       struct crypto_comp *tfm;
> +       struct scatterlist input, output;
> +       struct crypto_acomp_ctx *acomp_ctx;
> +
>         u8 *src, *dst;
>         unsigned int dlen;
>         int ret;
> @@ -916,14 +974,21 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
>
>         case ZSWAP_SWAPCACHE_NEW: /* page is locked */
>                 /* decompress */
> +               acomp_ctx = this_cpu_ptr(entry->pool->acomp_ctx);
> +
>                 dlen = PAGE_SIZE;
>                 src = (u8 *)zhdr + sizeof(struct zswap_header);
> -               dst = kmap_atomic(page);
> -               tfm = *get_cpu_ptr(entry->pool->tfm);
> -               ret = crypto_comp_decompress(tfm, src, entry->length,
> -                                            dst, &dlen);
> -               put_cpu_ptr(entry->pool->tfm);
> -               kunmap_atomic(dst);
> +               dst = kmap(page);
> +
> +               mutex_lock(acomp_ctx->mutex);
> +               sg_init_one(&input, src, entry->length);
> +               sg_init_one(&output, dst, dlen);
> +               acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
> +               ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
> +               dlen = acomp_ctx->req->dlen;
> +               mutex_unlock(acomp_ctx->mutex);
> +
> +               kunmap(page);
>                 BUG_ON(ret);
>                 BUG_ON(dlen != PAGE_SIZE);
>
> @@ -1004,7 +1069,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>  {
>         struct zswap_tree *tree = zswap_trees[type];
>         struct zswap_entry *entry, *dupentry;
> -       struct crypto_comp *tfm;
> +       struct scatterlist input, output;
> +       struct crypto_acomp_ctx *acomp_ctx;
>         int ret;
>         unsigned int hlen, dlen = PAGE_SIZE;
>         unsigned long handle, value;
> @@ -1074,12 +1140,32 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>         }
>
>         /* compress */
> -       dst = get_cpu_var(zswap_dstmem);
> -       tfm = *get_cpu_ptr(entry->pool->tfm);
> -       src = kmap_atomic(page);
> -       ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
> -       kunmap_atomic(src);
> -       put_cpu_ptr(entry->pool->tfm);
> +       acomp_ctx = this_cpu_ptr(entry->pool->acomp_ctx);
> +
> +       mutex_lock(acomp_ctx->mutex);
> +
> +       src = kmap(page);
> +       dst = acomp_ctx->dstmem;
> +       sg_init_one(&input, src, PAGE_SIZE);
> +       /* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */
> +       sg_init_one(&output, dst, PAGE_SIZE * 2);
> +       acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
> +       /*
> +        * it maybe looks a little bit silly that we send an asynchronous request,
> +        * then wait for its completion synchronously. This makes the process look
> +        * synchronous in fact.
> +        * Theoretically, acomp supports users send multiple acomp requests in one
> +        * acomp instance, then get those requests done simultaneously. but in this
> +        * case, frontswap actually does store and load page by page, there is no
> +        * existing method to send the second page before the first page is done
> +        * in one thread doing frontswap.
> +        * but in different threads running on different cpu, we have different
> +        * acomp instance, so multiple threads can do (de)compression in parallel.
> +        */
> +       ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
> +       dlen = acomp_ctx->req->dlen;
> +       kunmap(page);
> +
>         if (ret) {
>                 ret = -EINVAL;
>                 goto put_dstmem;
> @@ -1103,7 +1189,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>         memcpy(buf, &zhdr, hlen);
>         memcpy(buf + hlen, dst, dlen);
>         zpool_unmap_handle(entry->pool->zpool, handle);
> -       put_cpu_var(zswap_dstmem);
> +       mutex_unlock(acomp_ctx->mutex);
>
>         /* populate entry */
>         entry->offset = offset;
> @@ -1131,7 +1217,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>         return 0;
>
>  put_dstmem:
> -       put_cpu_var(zswap_dstmem);
> +       mutex_unlock(acomp_ctx->mutex);
>         zswap_pool_put(entry->pool);
>  freepage:
>         zswap_entry_cache_free(entry);
> @@ -1148,7 +1234,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>  {
>         struct zswap_tree *tree = zswap_trees[type];
>         struct zswap_entry *entry;
> -       struct crypto_comp *tfm;
> +       struct scatterlist input, output;
> +       struct crypto_acomp_ctx *acomp_ctx;
>         u8 *src, *dst;
>         unsigned int dlen;
>         int ret;
> @@ -1175,11 +1262,17 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>         src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
>         if (zpool_evictable(entry->pool->zpool))
>                 src += sizeof(struct zswap_header);
> -       dst = kmap_atomic(page);
> -       tfm = *get_cpu_ptr(entry->pool->tfm);
> -       ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
> -       put_cpu_ptr(entry->pool->tfm);
> -       kunmap_atomic(dst);
> +       dst = kmap(page);
> +
> +       acomp_ctx = this_cpu_ptr(entry->pool->acomp_ctx);
> +       mutex_lock(acomp_ctx->mutex);
> +       sg_init_one(&input, src, entry->length);
> +       sg_init_one(&output, dst, dlen);
> +       acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
> +       ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
> +       mutex_unlock(acomp_ctx->mutex);
> +
> +       kunmap(page);
>         zpool_unmap_handle(entry->pool->zpool, entry->handle);
>         BUG_ON(ret);
>
> --
> 2.27.0
>
>
Sebastian Andrzej Siewior Sept. 28, 2020, 3:24 p.m. UTC | #2
On 2020-08-19 00:31:00 [+1200], Barry Song wrote:
> diff --git a/mm/zswap.c b/mm/zswap.c

> index fbb782924ccc..00b5f14a7332 100644

> --- a/mm/zswap.c

> +++ b/mm/zswap.c

> @@ -127,9 +129,17 @@ module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,

>  * data structures

>  **********************************/

>  

> +struct crypto_acomp_ctx {

> +	struct crypto_acomp *acomp;

> +	struct acomp_req *req;

> +	struct crypto_wait wait;

> +	u8 *dstmem;

> +	struct mutex *mutex;

> +};

> +

>  struct zswap_pool {

>  	struct zpool *zpool;

> -	struct crypto_comp * __percpu *tfm;

> +	struct crypto_acomp_ctx __percpu *acomp_ctx;

>  	struct kref kref;

>  	struct list_head list;

>  	struct work_struct release_work;

> @@ -388,23 +398,43 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,

>  * per-cpu code

>  **********************************/

>  static DEFINE_PER_CPU(u8 *, zswap_dstmem);

> +/*

> + * If users dynamically change the zpool type and compressor at runtime, i.e.

> + * zswap is running, zswap can have more than one zpool on one cpu, but they

> + * are sharing dtsmem. So we need this mutex to be per-cpu.

> + */

> +static DEFINE_PER_CPU(struct mutex *, zswap_mutex);


There is no need to make it sturct mutex*. You could make it struct
mutex. The following make it more obvious how the relation here is (even
without a comment):

|struct zswap_mem_pool {
|	void		*dst_mem;
|	struct mutex	lock;
|};
|
|static DEFINE_PER_CPU(struct zswap_mem_pool , zswap_mem_pool);

Please access only this, don't add use a pointer in a another struct to
dest_mem or lock which may confuse people.

This resource is per-CPU.  Do you really utilize them all? On 2, 8, 16,
64, 128 core system? More later…

> @@ -916,14 +974,21 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)

>  

>  	case ZSWAP_SWAPCACHE_NEW: /* page is locked */

>  		/* decompress */

> +		acomp_ctx = this_cpu_ptr(entry->pool->acomp_ctx);


My feeling is that this triggers a warning with CONFIG_DEBUG_PREEMPT. I
don't see how it could be avoid it. If you are not preemptible here then
you must not sleep later.

> +

>  		dlen = PAGE_SIZE;

>  		src = (u8 *)zhdr + sizeof(struct zswap_header);

> -		dst = kmap_atomic(page);

> -		tfm = *get_cpu_ptr(entry->pool->tfm);

> -		ret = crypto_comp_decompress(tfm, src, entry->length,

> -					     dst, &dlen);

> -		put_cpu_ptr(entry->pool->tfm);

> -		kunmap_atomic(dst);

> +		dst = kmap(page);

> +

> +		mutex_lock(acomp_ctx->mutex);

> +		sg_init_one(&input, src, entry->length);

> +		sg_init_one(&output, dst, dlen);

> +		acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);

> +		ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);

> +		dlen = acomp_ctx->req->dlen;

> +		mutex_unlock(acomp_ctx->mutex);


Say a request comes in on CPU1. After the mutex_lock() you get migrated to
CPU2. You do crypto_wait_req() on CPU2. In that time another request
gets in on CPU1. It blocks on the very same mutex. No data corruption but
it could have used another buffer instead of blocking and waiting for
the previous one to finish its work.
So it might make sense to have a pool of pages which are shared in the
system globally system instead of having strict per-CPU allocations
while being fully migrate-able while the are in use.

While at it: For dst you could use sg_set_page(). This would avoid the
kmap(). 

> +		kunmap(page);

>  		BUG_ON(ret);

>  		BUG_ON(dlen != PAGE_SIZE);

>  


Sebastian
Song Bao Hua (Barry Song) Sept. 28, 2020, 9:05 p.m. UTC | #3
> -----Original Message-----

> From: Sebastian Andrzej Siewior [mailto:bigeasy@linutronix.de]

> Sent: Tuesday, September 29, 2020 4:25 AM

> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>

> Cc: akpm@linux-foundation.org; herbert@gondor.apana.org.au;

> davem@davemloft.net; linux-crypto@vger.kernel.org; linux-mm@kvack.org;

> linux-kernel@vger.kernel.org; Luis Claudio R . Goncalves

> <lgoncalv@redhat.com>; Mahipal Challa <mahipalreddy2006@gmail.com>;

> Seth Jennings <sjenning@redhat.com>; Dan Streetman <ddstreet@ieee.org>;

> Vitaly Wool <vitaly.wool@konsulko.com>; Wangzhou (B)

> <wangzhou1@hisilicon.com>; fanghao (A) <fanghao11@huawei.com>; Colin

> Ian King <colin.king@canonical.com>

> Subject: Re: [PATCH v6] mm/zswap: move to use crypto_acomp API for

> hardware acceleration

> 

> On 2020-08-19 00:31:00 [+1200], Barry Song wrote:

> > diff --git a/mm/zswap.c b/mm/zswap.c

> > index fbb782924ccc..00b5f14a7332 100644

> > --- a/mm/zswap.c

> > +++ b/mm/zswap.c

> > @@ -127,9 +129,17 @@

> module_param_named(same_filled_pages_enabled,

> zswap_same_filled_pages_enabled,

> >  * data structures

> >  **********************************/

> >

> > +struct crypto_acomp_ctx {

> > +	struct crypto_acomp *acomp;

> > +	struct acomp_req *req;

> > +	struct crypto_wait wait;

> > +	u8 *dstmem;

> > +	struct mutex *mutex;

> > +};

> > +

> >  struct zswap_pool {

> >  	struct zpool *zpool;

> > -	struct crypto_comp * __percpu *tfm;

> > +	struct crypto_acomp_ctx __percpu *acomp_ctx;

> >  	struct kref kref;

> >  	struct list_head list;

> >  	struct work_struct release_work;

> > @@ -388,23 +398,43 @@ static struct zswap_entry

> *zswap_entry_find_get(struct rb_root *root,

> >  * per-cpu code

> >  **********************************/

> >  static DEFINE_PER_CPU(u8 *, zswap_dstmem);

> > +/*

> > + * If users dynamically change the zpool type and compressor at runtime,

> i.e.

> > + * zswap is running, zswap can have more than one zpool on one cpu, but

> they

> > + * are sharing dtsmem. So we need this mutex to be per-cpu.

> > + */

> > +static DEFINE_PER_CPU(struct mutex *, zswap_mutex);

> 

> There is no need to make it sturct mutex*. You could make it struct

> mutex. The following make it more obvious how the relation here is (even

> without a comment):

> 

> |struct zswap_mem_pool {

> |	void		*dst_mem;

> |	struct mutex	lock;

> |};

> |

> |static DEFINE_PER_CPU(struct zswap_mem_pool , zswap_mem_pool);

> 

> Please access only this, don't add use a pointer in a another struct to

> dest_mem or lock which may confuse people.


Well. It seems sensible.

> 

> This resource is per-CPU.  Do you really utilize them all? On 2, 8, 16,

> 64, 128 core system? More later…


It might be a good place to investigate if we are going to do deep performance
tuning of zswap afterwards. But it is obviously out of the scope of this patch.

In my understanding, some bottleneck could exist in the earlier code path of 
memory management before frontswap and zswap if there are too many cores
as the memory and the LRU is globally managed and there might be some
lock issue to block the throughput of frontswap.

> 

> > @@ -916,14 +974,21 @@ static int zswap_writeback_entry(struct zpool

> *pool, unsigned long handle)

> >

> >  	case ZSWAP_SWAPCACHE_NEW: /* page is locked */

> >  		/* decompress */

> > +		acomp_ctx = this_cpu_ptr(entry->pool->acomp_ctx);

> 

> My feeling is that this triggers a warning with CONFIG_DEBUG_PREEMPT. I

> don't see how it could be avoid it. If you are not preemptible here then

> you must not sleep later.


I guess you are talking about some other APIs like get_cpu_ptr().

While APIs like get_cpu_var() and get_cpu_ptr() will prohibit preemption and
disallow sleeping, APIs like this_cpu_ptr() and this_cpu_ptr() won't do that:

#define get_cpu_var(var)						\
(*({									\
	preempt_disable();						\
	this_cpu_ptr(&var);						\
}))

#define put_cpu_var(var)						\
do {									\
	(void)&(var);							\
	preempt_enable();						\
} while (0)

#define get_cpu_ptr(var)						\
({									\
	preempt_disable();						\
	this_cpu_ptr(var);						\
})

#define put_cpu_ptr(var)						\
do {									\
	(void)(var);							\
	preempt_enable();						\
} while (0)

> 

> > +

> >  		dlen = PAGE_SIZE;

> >  		src = (u8 *)zhdr + sizeof(struct zswap_header);

> > -		dst = kmap_atomic(page);

> > -		tfm = *get_cpu_ptr(entry->pool->tfm);

> > -		ret = crypto_comp_decompress(tfm, src, entry->length,

> > -					     dst, &dlen);

> > -		put_cpu_ptr(entry->pool->tfm);

> > -		kunmap_atomic(dst);

> > +		dst = kmap(page);

> > +

> > +		mutex_lock(acomp_ctx->mutex);

> > +		sg_init_one(&input, src, entry->length);

> > +		sg_init_one(&output, dst, dlen);

> > +		acomp_request_set_params(acomp_ctx->req, &input, &output,

> entry->length, dlen);

> > +		ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req),

> &acomp_ctx->wait);

> > +		dlen = acomp_ctx->req->dlen;

> > +		mutex_unlock(acomp_ctx->mutex);

> 

> Say a request comes in on CPU1. After the mutex_lock() you get migrated to

> CPU2. You do crypto_wait_req() on CPU2. In that time another request

> gets in on CPU1. It blocks on the very same mutex. No data corruption but

> it could have used another buffer instead of blocking and waiting for

> the previous one to finish its work.

> So it might make sense to have a pool of pages which are shared in the

> system globally system instead of having strict per-CPU allocations

> while being fully migrate-able while the are in use.


It might be a good place to investigate whether this can help improve the performance of
zswap. It could be in a todo list for optimizing the utilization of the pool. On the other
hand, it is obviously not proper to put that change in this patch.

> 

> While at it: For dst you could use sg_set_page(). This would avoid the

> kmap().


It seems it is true while it needs a call to sg_init_table(). Maybe it is worth to add one
more API similar with sg_init_one() but the parameter is page rather than buffer if sg
with one page(s) is a quite common case.

For this case, I agree it is better to remove this redundant kmap/kunmap.

> 

> > +		kunmap(page);

> >  		BUG_ON(ret);

> >  		BUG_ON(dlen != PAGE_SIZE);

> >

> 

> Sebastian


Thanks
Barry
Song Bao Hua (Barry Song) Sept. 29, 2020, 5:14 a.m. UTC | #4
> -----Original Message-----

> From: Song Bao Hua (Barry Song)

> Sent: Tuesday, September 29, 2020 10:02 AM

> To: 'Sebastian Andrzej Siewior' <bigeasy@linutronix.de>

> Cc: akpm@linux-foundation.org; herbert@gondor.apana.org.au;

> davem@davemloft.net; linux-crypto@vger.kernel.org; linux-mm@kvack.org;

> linux-kernel@vger.kernel.org; Luis Claudio R . Goncalves

> <lgoncalv@redhat.com>; Mahipal Challa <mahipalreddy2006@gmail.com>;

> Seth Jennings <sjenning@redhat.com>; Dan Streetman <ddstreet@ieee.org>;

> Vitaly Wool <vitaly.wool@konsulko.com>; Wangzhou (B)

> <wangzhou1@hisilicon.com>; fanghao (A) <fanghao11@huawei.com>; Colin

> Ian King <colin.king@canonical.com>

> Subject: RE: [PATCH v6] mm/zswap: move to use crypto_acomp API for

> hardware acceleration

> 

> 

> 

> > -----Original Message-----

> > From: Sebastian Andrzej Siewior [mailto:bigeasy@linutronix.de]

> > Sent: Tuesday, September 29, 2020 4:25 AM

> > To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>

> > Cc: akpm@linux-foundation.org; herbert@gondor.apana.org.au;

> > davem@davemloft.net; linux-crypto@vger.kernel.org; linux-mm@kvack.org;

> > linux-kernel@vger.kernel.org; Luis Claudio R . Goncalves

> > <lgoncalv@redhat.com>; Mahipal Challa <mahipalreddy2006@gmail.com>;

> > Seth Jennings <sjenning@redhat.com>; Dan Streetman <ddstreet@ieee.org>;

> > Vitaly Wool <vitaly.wool@konsulko.com>; Wangzhou (B)

> > <wangzhou1@hisilicon.com>; fanghao (A) <fanghao11@huawei.com>; Colin

> > Ian King <colin.king@canonical.com>

> > Subject: Re: [PATCH v6] mm/zswap: move to use crypto_acomp API for

> > hardware acceleration

> >

> > On 2020-08-19 00:31:00 [+1200], Barry Song wrote:

> > > diff --git a/mm/zswap.c b/mm/zswap.c

> > > index fbb782924ccc..00b5f14a7332 100644

> > > --- a/mm/zswap.c

> > > +++ b/mm/zswap.c

> > > @@ -127,9 +129,17 @@

> > module_param_named(same_filled_pages_enabled,

> > zswap_same_filled_pages_enabled,

> > >  * data structures

> > >  **********************************/

> > >

> > > +struct crypto_acomp_ctx {

> > > +	struct crypto_acomp *acomp;

> > > +	struct acomp_req *req;

> > > +	struct crypto_wait wait;

> > > +	u8 *dstmem;

> > > +	struct mutex *mutex;

> > > +};

> > > +

> > >  struct zswap_pool {

> > >  	struct zpool *zpool;

> > > -	struct crypto_comp * __percpu *tfm;

> > > +	struct crypto_acomp_ctx __percpu *acomp_ctx;

> > >  	struct kref kref;

> > >  	struct list_head list;

> > >  	struct work_struct release_work;

> > > @@ -388,23 +398,43 @@ static struct zswap_entry

> > *zswap_entry_find_get(struct rb_root *root,

> > >  * per-cpu code

> > >  **********************************/

> > >  static DEFINE_PER_CPU(u8 *, zswap_dstmem);

> > > +/*

> > > + * If users dynamically change the zpool type and compressor at runtime,

> > i.e.

> > > + * zswap is running, zswap can have more than one zpool on one cpu, but

> > they

> > > + * are sharing dtsmem. So we need this mutex to be per-cpu.

> > > + */

> > > +static DEFINE_PER_CPU(struct mutex *, zswap_mutex);

> >

> > There is no need to make it sturct mutex*. You could make it struct

> > mutex. The following make it more obvious how the relation here is (even

> > without a comment):

> >

> > |struct zswap_mem_pool {

> > |	void		*dst_mem;

> > |	struct mutex	lock;

> > |};

> > |

> > |static DEFINE_PER_CPU(struct zswap_mem_pool , zswap_mem_pool);

> >

> > Please access only this, don't add use a pointer in a another struct to

> > dest_mem or lock which may confuse people.

> 

> Well. It seems sensible.


After second thought and trying to make this change, I would like to change my mind
and disagree with this idea. Two reasons:
1. while using this_cpu_ptr() without preemption lock, people usually put all things bound
with one cpu to one structure, so that once we get the pointer of the whole structure, we get
all its parts belonging to the same cpu. If we move the dstmem and mutex out of the structure
containing them, we will have to do:
	a. get_cpu_ptr() for the acomp_ctx   //lock preemption
	b. this_cpu_ptr() for the dstmem and mutex
	c. put_cpu_ptr() for the acomp_ctx  //unlock preemption
	d. mutex_lock()
	  sg_init_one()
	  compress/decompress etc.
	  ...
	  mutex_unlock

as the get() and put() have a preemption lock/unlock, this will make certain this_cpu_ptr()
in the step "b" will return the right dstmem and mutex which belong to the same cpu with
step "a".

The steps from "a" to "c" are quite silly and confusing. I believe the existing code aligns
with the most similar code in kernel better:
	a. this_cpu_ptr()   //get everything for one cpu
	b. mutex_lock()
	  sg_init_one()
	  compress/decompress etc.
	  ...
	  mutex_unlock

2. while allocating mutex, we can put the mutex into local memory by using kmalloc_node().
If we move to "struct mutex lock" directly, most CPUs in a NUMA server will have to access
remote memory to read/write the mutex, therefore, this will increase the latency dramatically.

Thanks
Barry
Sebastian Andrzej Siewior Sept. 29, 2020, 9:31 a.m. UTC | #5
On 2020-09-29 05:14:31 [+0000], Song Bao Hua (Barry Song) wrote:
> After second thought and trying to make this change, I would like to change my mind

> and disagree with this idea. Two reasons:

> 1. while using this_cpu_ptr() without preemption lock, people usually put all things bound

> with one cpu to one structure, so that once we get the pointer of the whole structure, we get

> all its parts belonging to the same cpu. If we move the dstmem and mutex out of the structure

> containing them, we will have to do:

> 	a. get_cpu_ptr() for the acomp_ctx   //lock preemption

> 	b. this_cpu_ptr() for the dstmem and mutex

> 	c. put_cpu_ptr() for the acomp_ctx  //unlock preemption

> 	d. mutex_lock()

> 	  sg_init_one()

> 	  compress/decompress etc.

> 	  ...

> 	  mutex_unlock

> 

> as the get() and put() have a preemption lock/unlock, this will make certain this_cpu_ptr()

> in the step "b" will return the right dstmem and mutex which belong to the same cpu with

> step "a".

> 

> The steps from "a" to "c" are quite silly and confusing. I believe the existing code aligns

> with the most similar code in kernel better:

> 	a. this_cpu_ptr()   //get everything for one cpu

> 	b. mutex_lock()

> 	  sg_init_one()

> 	  compress/decompress etc.

> 	  ...

> 	  mutex_unlock


My point was that there will be a warning at run-time and you don't want
that. There are raw_ accessors if you know what you are doing. But…

Earlier you had compression/decompression with disabled preemption and
strict per-CPU memory allocation. Now if you keep this per-CPU memory
allocation then you gain a possible bottleneck.
In the previous email you said that there may be a bottleneck in the
upper layer where you can't utilize all that memory you allocate. So you
may want to rethink that strategy before that rework.

> 2. while allocating mutex, we can put the mutex into local memory by using kmalloc_node().

> If we move to "struct mutex lock" directly, most CPUs in a NUMA server will have to access

> remote memory to read/write the mutex, therefore, this will increase the latency dramatically.


If you need something per-CPU then DEFINE_PER_CPU() will give it to you.
It would be very bad for performance if this allocations were not from
CPU-local memory, right? So what makes you think this is worse than
kmalloc_node() based allocations?

> Thanks

> Barry


Sebastian
Song Bao Hua (Barry Song) Sept. 29, 2020, 10:02 a.m. UTC | #6
> -----Original Message-----

> From: Sebastian Andrzej Siewior [mailto:bigeasy@linutronix.de]

> Sent: Tuesday, September 29, 2020 10:31 PM

> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>

> Cc: akpm@linux-foundation.org; herbert@gondor.apana.org.au;

> davem@davemloft.net; linux-crypto@vger.kernel.org; linux-mm@kvack.org;

> linux-kernel@vger.kernel.org; Luis Claudio R . Goncalves

> <lgoncalv@redhat.com>; Mahipal Challa <mahipalreddy2006@gmail.com>;

> Seth Jennings <sjenning@redhat.com>; Dan Streetman <ddstreet@ieee.org>;

> Vitaly Wool <vitaly.wool@konsulko.com>; Wangzhou (B)

> <wangzhou1@hisilicon.com>; fanghao (A) <fanghao11@huawei.com>; Colin

> Ian King <colin.king@canonical.com>

> Subject: Re: [PATCH v6] mm/zswap: move to use crypto_acomp API for

> hardware acceleration

> 

> On 2020-09-29 05:14:31 [+0000], Song Bao Hua (Barry Song) wrote:

> > After second thought and trying to make this change, I would like to change

> my mind

> > and disagree with this idea. Two reasons:

> > 1. while using this_cpu_ptr() without preemption lock, people usually put all

> things bound

> > with one cpu to one structure, so that once we get the pointer of the whole

> structure, we get

> > all its parts belonging to the same cpu. If we move the dstmem and mutex

> out of the structure

> > containing them, we will have to do:

> > 	a. get_cpu_ptr() for the acomp_ctx   //lock preemption

> > 	b. this_cpu_ptr() for the dstmem and mutex

> > 	c. put_cpu_ptr() for the acomp_ctx  //unlock preemption

> > 	d. mutex_lock()

> > 	  sg_init_one()

> > 	  compress/decompress etc.

> > 	  ...

> > 	  mutex_unlock

> >

> > as the get() and put() have a preemption lock/unlock, this will make certain

> this_cpu_ptr()

> > in the step "b" will return the right dstmem and mutex which belong to the

> same cpu with

> > step "a".

> >

> > The steps from "a" to "c" are quite silly and confusing. I believe the existing

> code aligns

> > with the most similar code in kernel better:

> > 	a. this_cpu_ptr()   //get everything for one cpu

> > 	b. mutex_lock()

> > 	  sg_init_one()

> > 	  compress/decompress etc.

> > 	  ...

> > 	  mutex_unlock

> 

> My point was that there will be a warning at run-time and you don't want

> that. There are raw_ accessors if you know what you are doing. But…


I have only seen get_cpu_ptr/var() things will disable preemption. I don't think
we will have a warning as this_cpu_ptr() won't disable preemption.

> 

> Earlier you had compression/decompression with disabled preemption and


No. that is right now done in enabled preemption context with this patch. The code before this patch
was doing (de)compression in preemption-disabled context by using get_cpu_ptr and get_cpu_var.

> strict per-CPU memory allocation. Now if you keep this per-CPU memory

> allocation then you gain a possible bottleneck.

> In the previous email you said that there may be a bottleneck in the

> upper layer where you can't utilize all that memory you allocate. So you

> may want to rethink that strategy before that rework.


we are probably not talking about same thing :-)
I was talking about possible generic swap bottleneck. For example, LRU is global,
while swapping, multiple cores might have some locks on this LRU. for example,
if we have 8 inactive pages to swap out, I am not sure if mm can use 8 cores
to swap them out at the same time.

> 

> > 2. while allocating mutex, we can put the mutex into local memory by using

> kmalloc_node().

> > If we move to "struct mutex lock" directly, most CPUs in a NUMA server will

> have to access

> > remote memory to read/write the mutex, therefore, this will increase the

> latency dramatically.

> 

> If you need something per-CPU then DEFINE_PER_CPU() will give it to you.


Yes. It is true.

> It would be very bad for performance if this allocations were not from

> CPU-local memory, right? So what makes you think this is worse than

> kmalloc_node() based allocations?


Yes. If your read zswap code, it has considered NUMA very carefully by allocating various
memory locally. And in crypto framework, I also added API to allocate local compression.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7bc13b5b60e94
this zswap patch has used the new node-aware API.

Memory access crossing NUMA node, practically crossing packages, can dramatically increase,
like double, triple or more.

Thanks
Barry
Sebastian Andrzej Siewior Sept. 29, 2020, 10:28 a.m. UTC | #7
On 2020-09-29 10:02:15 [+0000], Song Bao Hua (Barry Song) wrote:
> > My point was that there will be a warning at run-time and you don't want

> > that. There are raw_ accessors if you know what you are doing. But…

> 

> I have only seen get_cpu_ptr/var() things will disable preemption. I don't think

> we will have a warning as this_cpu_ptr() won't disable preemption.


Good. Just enable CONFIG_DEBUG_PREEMPT and tell please what happens.

> > Earlier you had compression/decompression with disabled preemption and

> 

> No. that is right now done in enabled preemption context with this patch. The code before this patch

> was doing (de)compression in preemption-disabled context by using get_cpu_ptr and get_cpu_var.


Exactly what I am saying. And within this get_cpu_ptr() section there
was the compression/decompression sitting. So compression/decompression
happend while preemtion was off.

> > strict per-CPU memory allocation. Now if you keep this per-CPU memory

> > allocation then you gain a possible bottleneck.

> > In the previous email you said that there may be a bottleneck in the

> > upper layer where you can't utilize all that memory you allocate. So you

> > may want to rethink that strategy before that rework.

> 

> we are probably not talking about same thing :-)

> I was talking about possible generic swap bottleneck. For example, LRU is global,

> while swapping, multiple cores might have some locks on this LRU. for example,

> if we have 8 inactive pages to swap out, I am not sure if mm can use 8 cores

> to swap them out at the same time.


In that case you probably don't need 8* per-CPU memory for this task.

> > 

> > > 2. while allocating mutex, we can put the mutex into local memory by using

> > kmalloc_node().

> > > If we move to "struct mutex lock" directly, most CPUs in a NUMA server will

> > have to access

> > > remote memory to read/write the mutex, therefore, this will increase the

> > latency dramatically.

> > 

> > If you need something per-CPU then DEFINE_PER_CPU() will give it to you.

> 

> Yes. It is true.

> 

> > It would be very bad for performance if this allocations were not from

> > CPU-local memory, right? So what makes you think this is worse than

> > kmalloc_node() based allocations?

> 

> Yes. If your read zswap code, it has considered NUMA very carefully by allocating various

> memory locally. And in crypto framework, I also added API to allocate local compression.

> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7bc13b5b60e94

> this zswap patch has used the new node-aware API.

> 

> Memory access crossing NUMA node, practically crossing packages, can dramatically increase,

> like double, triple or more.


So you are telling me, DEFINE_PER_CPU() does not allocate the memory for
each CPU to be local but kmalloc_node() does?

> Thanks

> Barry


Sebastian
Song Bao Hua (Barry Song) Sept. 29, 2020, 11:05 a.m. UTC | #8
> -----Original Message-----

> From: Sebastian Andrzej Siewior [mailto:bigeasy@linutronix.de]

> Sent: Tuesday, September 29, 2020 11:29 PM

> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>

> Cc: akpm@linux-foundation.org; herbert@gondor.apana.org.au;

> davem@davemloft.net; linux-crypto@vger.kernel.org; linux-mm@kvack.org;

> linux-kernel@vger.kernel.org; Luis Claudio R . Goncalves

> <lgoncalv@redhat.com>; Mahipal Challa <mahipalreddy2006@gmail.com>;

> Seth Jennings <sjenning@redhat.com>; Dan Streetman <ddstreet@ieee.org>;

> Vitaly Wool <vitaly.wool@konsulko.com>; Wangzhou (B)

> <wangzhou1@hisilicon.com>; fanghao (A) <fanghao11@huawei.com>; Colin

> Ian King <colin.king@canonical.com>

> Subject: Re: [PATCH v6] mm/zswap: move to use crypto_acomp API for

> hardware acceleration

> 

> On 2020-09-29 10:02:15 [+0000], Song Bao Hua (Barry Song) wrote:

> > > My point was that there will be a warning at run-time and you don't

> > > want that. There are raw_ accessors if you know what you are doing.

> > > But…

> >

> > I have only seen get_cpu_ptr/var() things will disable preemption. I

> > don't think we will have a warning as this_cpu_ptr() won't disable

> preemption.

> 

> Good. Just enable CONFIG_DEBUG_PREEMPT and tell please what happens.

> 


Probably got what you mean while this_cpu_ptr should usually be called in preemption-disabled
context. Will do further test to check and verify carefully with DEBUG_PREEMPT.

So your suggestion is actually the below?
a. get_cpu_ptr  // for acomp_ctx and disable preemption
b. raw_cpu_ptr or this_cpu_ptr  // for dstmem+ mutex
c. put_cpu_ptr  //enable preemption

in this way, all steps "a" to "c" are strictly in a preemption-disabled context.

> > > Earlier you had compression/decompression with disabled preemption

> > > and

> >

> > No. that is right now done in enabled preemption context with this

> > patch. The code before this patch was doing (de)compression in

> preemption-disabled context by using get_cpu_ptr and get_cpu_var.

> 

> Exactly what I am saying. And within this get_cpu_ptr() section there was the

> compression/decompression sitting. So compression/decompression happend

> while preemtion was off.

> 

> > > strict per-CPU memory allocation. Now if you keep this per-CPU

> > > memory allocation then you gain a possible bottleneck.

> > > In the previous email you said that there may be a bottleneck in the

> > > upper layer where you can't utilize all that memory you allocate. So

> > > you may want to rethink that strategy before that rework.

> >

> > we are probably not talking about same thing :-) I was talking about

> > possible generic swap bottleneck. For example, LRU is global, while

> > swapping, multiple cores might have some locks on this LRU. for

> > example, if we have 8 inactive pages to swap out, I am not sure if mm

> > can use 8 cores to swap them out at the same time.

> 

> In that case you probably don't need 8* per-CPU memory for this task.


Eventually I got what you mean, it seems you mean we might be able to save some memory
since we have moved to preemption-enabled context for (de)compression, we don’t have to
strictly depend on per-cpu page.

I agree it might be put into todo list to investigate. For the first patch to fix the broken APIs
connection, it seems it is not the proper time to handle this memory saving issue. And it is
actually quite complicated as we need a per-numa pool for dstmem rather than global pool.

> 

> > >

> > > > 2. while allocating mutex, we can put the mutex into local memory

> > > > by using

> > > kmalloc_node().

> > > > If we move to "struct mutex lock" directly, most CPUs in a NUMA

> > > > server will

> > > have to access

> > > > remote memory to read/write the mutex, therefore, this will

> > > > increase the

> > > latency dramatically.

> > >

> > > If you need something per-CPU then DEFINE_PER_CPU() will give it to you.

> >

> > Yes. It is true.

> >

> > > It would be very bad for performance if this allocations were not

> > > from CPU-local memory, right? So what makes you think this is worse

> > > than

> > > kmalloc_node() based allocations?

> >

> > Yes. If your read zswap code, it has considered NUMA very carefully by

> > allocating various memory locally. And in crypto framework, I also added API

> to allocate local compression.

> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/com

> > mit/?id=7bc13b5b60e94 this zswap patch has used the new node-aware

> > API.

> >

> > Memory access crossing NUMA node, practically crossing packages, can

> > dramatically increase, like double, triple or more.

> 

> So you are telling me, DEFINE_PER_CPU() does not allocate the memory for

> each CPU to be local but kmalloc_node() does?


For the first beginning, they are put together. But after setup_per_cpu_areas()
they are copied to a memory accolated by memblock with node information.
I missed the second part. You are right :-)

Thanks
Barry
diff mbox series

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index fbb782924ccc..00b5f14a7332 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -24,8 +24,10 @@ 
 #include <linux/rbtree.h>
 #include <linux/swap.h>
 #include <linux/crypto.h>
+#include <linux/scatterlist.h>
 #include <linux/mempool.h>
 #include <linux/zpool.h>
+#include <crypto/acompress.h>
 
 #include <linux/mm_types.h>
 #include <linux/page-flags.h>
@@ -127,9 +129,17 @@  module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
 * data structures
 **********************************/
 
+struct crypto_acomp_ctx {
+	struct crypto_acomp *acomp;
+	struct acomp_req *req;
+	struct crypto_wait wait;
+	u8 *dstmem;
+	struct mutex *mutex;
+};
+
 struct zswap_pool {
 	struct zpool *zpool;
-	struct crypto_comp * __percpu *tfm;
+	struct crypto_acomp_ctx __percpu *acomp_ctx;
 	struct kref kref;
 	struct list_head list;
 	struct work_struct release_work;
@@ -388,23 +398,43 @@  static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
 * per-cpu code
 **********************************/
 static DEFINE_PER_CPU(u8 *, zswap_dstmem);
+/*
+ * If users dynamically change the zpool type and compressor at runtime, i.e.
+ * zswap is running, zswap can have more than one zpool on one cpu, but they
+ * are sharing dtsmem. So we need this mutex to be per-cpu.
+ */
+static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
 
 static int zswap_dstmem_prepare(unsigned int cpu)
 {
+	struct mutex *mutex;
 	u8 *dst;
 
 	dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
 	if (!dst)
 		return -ENOMEM;
 
+	mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
+	if (!mutex) {
+		kfree(dst);
+		return -ENOMEM;
+	}
+
+	mutex_init(mutex);
 	per_cpu(zswap_dstmem, cpu) = dst;
+	per_cpu(zswap_mutex, cpu) = mutex;
 	return 0;
 }
 
 static int zswap_dstmem_dead(unsigned int cpu)
 {
+	struct mutex *mutex;
 	u8 *dst;
 
+	mutex = per_cpu(zswap_mutex, cpu);
+	kfree(mutex);
+	per_cpu(zswap_mutex, cpu) = NULL;
+
 	dst = per_cpu(zswap_dstmem, cpu);
 	kfree(dst);
 	per_cpu(zswap_dstmem, cpu) = NULL;
@@ -415,30 +445,54 @@  static int zswap_dstmem_dead(unsigned int cpu)
 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 {
 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
-	struct crypto_comp *tfm;
-
-	if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
-		return 0;
+	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+	struct crypto_acomp *acomp;
+	struct acomp_req *req;
+
+	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+	if (IS_ERR(acomp)) {
+		pr_err("could not alloc crypto acomp %s : %ld\n",
+				pool->tfm_name, PTR_ERR(acomp));
+		return PTR_ERR(acomp);
+	}
+	acomp_ctx->acomp = acomp;
 
-	tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
-	if (IS_ERR_OR_NULL(tfm)) {
-		pr_err("could not alloc crypto comp %s : %ld\n",
-		       pool->tfm_name, PTR_ERR(tfm));
+	req = acomp_request_alloc(acomp_ctx->acomp);
+	if (!req) {
+		pr_err("could not alloc crypto acomp_request %s\n",
+		       pool->tfm_name);
+		crypto_free_acomp(acomp_ctx->acomp);
 		return -ENOMEM;
 	}
-	*per_cpu_ptr(pool->tfm, cpu) = tfm;
+	acomp_ctx->req = req;
+
+	crypto_init_wait(&acomp_ctx->wait);
+	/*
+	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
+	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
+	 * won't be called, crypto_wait_req() will return without blocking.
+	 */
+	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				   crypto_req_done, &acomp_ctx->wait);
+
+	acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
+	acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
+
 	return 0;
 }
 
 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
 {
 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
-	struct crypto_comp *tfm;
+	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+
+	if (!IS_ERR_OR_NULL(acomp_ctx)) {
+		if (!IS_ERR_OR_NULL(acomp_ctx->req))
+			acomp_request_free(acomp_ctx->req);
+		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+			crypto_free_acomp(acomp_ctx->acomp);
+	}
 
-	tfm = *per_cpu_ptr(pool->tfm, cpu);
-	if (!IS_ERR_OR_NULL(tfm))
-		crypto_free_comp(tfm);
-	*per_cpu_ptr(pool->tfm, cpu) = NULL;
 	return 0;
 }
 
@@ -561,8 +615,9 @@  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
 
 	strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
-	pool->tfm = alloc_percpu(struct crypto_comp *);
-	if (!pool->tfm) {
+
+	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
+	if (!pool->acomp_ctx) {
 		pr_err("percpu alloc failed\n");
 		goto error;
 	}
@@ -585,7 +640,8 @@  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	return pool;
 
 error:
-	free_percpu(pool->tfm);
+	if (pool->acomp_ctx)
+		free_percpu(pool->acomp_ctx);
 	if (pool->zpool)
 		zpool_destroy_pool(pool->zpool);
 	kfree(pool);
@@ -596,14 +652,14 @@  static __init struct zswap_pool *__zswap_pool_create_fallback(void)
 {
 	bool has_comp, has_zpool;
 
-	has_comp = crypto_has_comp(zswap_compressor, 0, 0);
+	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
 	if (!has_comp && strcmp(zswap_compressor,
 				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
 		pr_err("compressor %s not available, using default %s\n",
 		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
 		param_free_charp(&zswap_compressor);
 		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
-		has_comp = crypto_has_comp(zswap_compressor, 0, 0);
+		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
 	}
 	if (!has_comp) {
 		pr_err("default compressor %s not available\n",
@@ -639,7 +695,7 @@  static void zswap_pool_destroy(struct zswap_pool *pool)
 	zswap_pool_debug("destroying", pool);
 
 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
-	free_percpu(pool->tfm);
+	free_percpu(pool->acomp_ctx);
 	zpool_destroy_pool(pool->zpool);
 	kfree(pool);
 }
@@ -723,7 +779,7 @@  static int __zswap_param_set(const char *val, const struct kernel_param *kp,
 		}
 		type = s;
 	} else if (!compressor) {
-		if (!crypto_has_comp(s, 0, 0)) {
+		if (!crypto_has_acomp(s, 0, 0)) {
 			pr_err("compressor %s not available\n", s);
 			return -ENOENT;
 		}
@@ -774,7 +830,7 @@  static int __zswap_param_set(const char *val, const struct kernel_param *kp,
 		 * failed, maybe both compressor and zpool params were bad.
 		 * Allow changing this param, so pool creation will succeed
 		 * when the other param is changed. We already verified this
-		 * param is ok in the zpool_has_pool() or crypto_has_comp()
+		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
 		 * checks above.
 		 */
 		ret = param_set_charp(s, kp);
@@ -876,7 +932,9 @@  static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 	pgoff_t offset;
 	struct zswap_entry *entry;
 	struct page *page;
-	struct crypto_comp *tfm;
+	struct scatterlist input, output;
+	struct crypto_acomp_ctx *acomp_ctx;
+
 	u8 *src, *dst;
 	unsigned int dlen;
 	int ret;
@@ -916,14 +974,21 @@  static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 
 	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
 		/* decompress */
+		acomp_ctx = this_cpu_ptr(entry->pool->acomp_ctx);
+
 		dlen = PAGE_SIZE;
 		src = (u8 *)zhdr + sizeof(struct zswap_header);
-		dst = kmap_atomic(page);
-		tfm = *get_cpu_ptr(entry->pool->tfm);
-		ret = crypto_comp_decompress(tfm, src, entry->length,
-					     dst, &dlen);
-		put_cpu_ptr(entry->pool->tfm);
-		kunmap_atomic(dst);
+		dst = kmap(page);
+
+		mutex_lock(acomp_ctx->mutex);
+		sg_init_one(&input, src, entry->length);
+		sg_init_one(&output, dst, dlen);
+		acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
+		ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+		dlen = acomp_ctx->req->dlen;
+		mutex_unlock(acomp_ctx->mutex);
+
+		kunmap(page);
 		BUG_ON(ret);
 		BUG_ON(dlen != PAGE_SIZE);
 
@@ -1004,7 +1069,8 @@  static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 {
 	struct zswap_tree *tree = zswap_trees[type];
 	struct zswap_entry *entry, *dupentry;
-	struct crypto_comp *tfm;
+	struct scatterlist input, output;
+	struct crypto_acomp_ctx *acomp_ctx;
 	int ret;
 	unsigned int hlen, dlen = PAGE_SIZE;
 	unsigned long handle, value;
@@ -1074,12 +1140,32 @@  static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	}
 
 	/* compress */
-	dst = get_cpu_var(zswap_dstmem);
-	tfm = *get_cpu_ptr(entry->pool->tfm);
-	src = kmap_atomic(page);
-	ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
-	kunmap_atomic(src);
-	put_cpu_ptr(entry->pool->tfm);
+	acomp_ctx = this_cpu_ptr(entry->pool->acomp_ctx);
+
+	mutex_lock(acomp_ctx->mutex);
+
+	src = kmap(page);
+	dst = acomp_ctx->dstmem;
+	sg_init_one(&input, src, PAGE_SIZE);
+	/* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */
+	sg_init_one(&output, dst, PAGE_SIZE * 2);
+	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+	/*
+	 * it maybe looks a little bit silly that we send an asynchronous request,
+	 * then wait for its completion synchronously. This makes the process look
+	 * synchronous in fact.
+	 * Theoretically, acomp supports users send multiple acomp requests in one
+	 * acomp instance, then get those requests done simultaneously. but in this
+	 * case, frontswap actually does store and load page by page, there is no
+	 * existing method to send the second page before the first page is done
+	 * in one thread doing frontswap.
+	 * but in different threads running on different cpu, we have different
+	 * acomp instance, so multiple threads can do (de)compression in parallel.
+	 */
+	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
+	dlen = acomp_ctx->req->dlen;
+	kunmap(page);
+
 	if (ret) {
 		ret = -EINVAL;
 		goto put_dstmem;
@@ -1103,7 +1189,7 @@  static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	memcpy(buf, &zhdr, hlen);
 	memcpy(buf + hlen, dst, dlen);
 	zpool_unmap_handle(entry->pool->zpool, handle);
-	put_cpu_var(zswap_dstmem);
+	mutex_unlock(acomp_ctx->mutex);
 
 	/* populate entry */
 	entry->offset = offset;
@@ -1131,7 +1217,7 @@  static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	return 0;
 
 put_dstmem:
-	put_cpu_var(zswap_dstmem);
+	mutex_unlock(acomp_ctx->mutex);
 	zswap_pool_put(entry->pool);
 freepage:
 	zswap_entry_cache_free(entry);
@@ -1148,7 +1234,8 @@  static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 {
 	struct zswap_tree *tree = zswap_trees[type];
 	struct zswap_entry *entry;
-	struct crypto_comp *tfm;
+	struct scatterlist input, output;
+	struct crypto_acomp_ctx *acomp_ctx;
 	u8 *src, *dst;
 	unsigned int dlen;
 	int ret;
@@ -1175,11 +1262,17 @@  static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
 	if (zpool_evictable(entry->pool->zpool))
 		src += sizeof(struct zswap_header);
-	dst = kmap_atomic(page);
-	tfm = *get_cpu_ptr(entry->pool->tfm);
-	ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
-	put_cpu_ptr(entry->pool->tfm);
-	kunmap_atomic(dst);
+	dst = kmap(page);
+
+	acomp_ctx = this_cpu_ptr(entry->pool->acomp_ctx);
+	mutex_lock(acomp_ctx->mutex);
+	sg_init_one(&input, src, entry->length);
+	sg_init_one(&output, dst, dlen);
+	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
+	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+	mutex_unlock(acomp_ctx->mutex);
+
+	kunmap(page);
 	zpool_unmap_handle(entry->pool->zpool, entry->handle);
 	BUG_ON(ret);