From patchwork Sat Jul 30 08:32:44 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "yekai \(A\)" X-Patchwork-Id: 594949 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 95017C3F6B0 for ; Sat, 30 Jul 2022 08:39:53 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S230138AbiG3Ijv (ORCPT ); Sat, 30 Jul 2022 04:39:51 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55420 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232009AbiG3Iju (ORCPT ); Sat, 30 Jul 2022 04:39:50 -0400 Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 84779286C5; Sat, 30 Jul 2022 01:39:47 -0700 (PDT) Received: from dggpeml500021.china.huawei.com (unknown [172.30.72.54]) by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4LvyQD6YCFzWfC9; Sat, 30 Jul 2022 16:35:48 +0800 (CST) Received: from dggpeml100012.china.huawei.com (7.185.36.121) by dggpeml500021.china.huawei.com (7.185.36.21) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2375.24; Sat, 30 Jul 2022 16:39:45 +0800 Received: from huawei.com (10.67.165.24) by dggpeml100012.china.huawei.com (7.185.36.121) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2375.24; Sat, 30 Jul 2022 16:39:45 +0800 From: Kai Ye To: , CC: , , , , Subject: [PATCH v6 1/3] uacce: supports device isolation feature Date: Sat, 30 Jul 2022 16:32:44 +0800 Message-ID: <20220730083246.55646-2-yekai13@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20220730083246.55646-1-yekai13@huawei.com> References: <20220730083246.55646-1-yekai13@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.67.165.24] X-ClientProxiedBy: dggems706-chm.china.huawei.com (10.3.19.183) To dggpeml100012.china.huawei.com (7.185.36.121) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-crypto@vger.kernel.org UACCE adds the hardware error isolation API. Users can configure the isolation frequency by this sysfs node. UACCE reports the device isolate state to the user space. If the AER error frequency exceeds the value of setting for a certain period of time, the device will be isolated. Signed-off-by: Kai Ye --- drivers/misc/uacce/uacce.c | 58 ++++++++++++++++++++++++++++++++++++++ include/linux/uacce.h | 11 ++++++++ 2 files changed, 69 insertions(+) diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c index 281c54003edc..41f454c89cd1 100644 --- a/drivers/misc/uacce/uacce.c +++ b/drivers/misc/uacce/uacce.c @@ -7,6 +7,8 @@ #include #include +#define MAX_ERR_ISOLATE_COUNT 65535 + static struct class *uacce_class; static dev_t uacce_devt; static DEFINE_MUTEX(uacce_mutex); @@ -339,12 +341,57 @@ static ssize_t region_dus_size_show(struct device *dev, uacce->qf_pg_num[UACCE_QFRT_DUS] << PAGE_SHIFT); } +static ssize_t isolate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct uacce_device *uacce = to_uacce_device(dev); + + if (!uacce->ops->get_isolate_state) + return -ENODEV; + + return sysfs_emit(buf, "%d\n", uacce->ops->get_isolate_state(uacce)); +} + +static ssize_t isolate_strategy_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct uacce_device *uacce = to_uacce_device(dev); + u32 val; + + val = uacce->ops->isolate_strategy_read(uacce); + if (val > MAX_ERR_ISOLATE_COUNT) + return -EINVAL; + + return sysfs_emit(buf, "%u\n", val); +} + +static ssize_t isolate_strategy_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct uacce_device *uacce = to_uacce_device(dev); + unsigned long val; + int ret; + + if (kstrtoul(buf, 0, &val) < 0) + return -EINVAL; + + if (val > MAX_ERR_ISOLATE_COUNT) + return -EINVAL; + + ret = uacce->ops->isolate_strategy_write(uacce, val); + + return ret ? ret : count; +} + static DEVICE_ATTR_RO(api); static DEVICE_ATTR_RO(flags); static DEVICE_ATTR_RO(available_instances); static DEVICE_ATTR_RO(algorithms); static DEVICE_ATTR_RO(region_mmio_size); static DEVICE_ATTR_RO(region_dus_size); +static DEVICE_ATTR_RO(isolate); +static DEVICE_ATTR_RW(isolate_strategy); static struct attribute *uacce_dev_attrs[] = { &dev_attr_api.attr, @@ -353,6 +400,8 @@ static struct attribute *uacce_dev_attrs[] = { &dev_attr_algorithms.attr, &dev_attr_region_mmio_size.attr, &dev_attr_region_dus_size.attr, + &dev_attr_isolate.attr, + &dev_attr_isolate_strategy.attr, NULL, }; @@ -368,6 +417,15 @@ static umode_t uacce_dev_is_visible(struct kobject *kobj, (!uacce->qf_pg_num[UACCE_QFRT_DUS]))) return 0; + if (attr == &dev_attr_isolate_strategy.attr && + (!uacce->ops->isolate_strategy_read || + !uacce->ops->isolate_strategy_write)) + return 0; + + if (attr == &dev_attr_isolate.attr && + !uacce->ops->get_isolate_state) + return 0; + return attr->mode; } diff --git a/include/linux/uacce.h b/include/linux/uacce.h index 48e319f40275..69e8f238d80c 100644 --- a/include/linux/uacce.h +++ b/include/linux/uacce.h @@ -30,6 +30,9 @@ struct uacce_qfile_region { * @is_q_updated: check whether the task is finished * @mmap: mmap addresses of queue to user space * @ioctl: ioctl for user space users of the queue + * @get_isolate_state: get the device state after set the isolate strategy + * @isolate_strategy_write: stored the isolate strategy to the device + * @isolate_strategy_read: read the isolate strategy value from the device */ struct uacce_ops { int (*get_available_instances)(struct uacce_device *uacce); @@ -43,6 +46,9 @@ struct uacce_ops { struct uacce_qfile_region *qfr); long (*ioctl)(struct uacce_queue *q, unsigned int cmd, unsigned long arg); + enum uacce_dev_state (*get_isolate_state)(struct uacce_device *uacce); + int (*isolate_strategy_write)(struct uacce_device *uacce, u32 freq); + u32 (*isolate_strategy_read)(struct uacce_device *uacce); }; /** @@ -57,6 +63,11 @@ struct uacce_interface { const struct uacce_ops *ops; }; +enum uacce_dev_state { + UACCE_DEV_NORMAL, + UACCE_DEV_ISOLATE, +}; + enum uacce_q_state { UACCE_Q_ZOMBIE = 0, UACCE_Q_INIT, From patchwork Sat Jul 30 08:32:45 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "yekai \(A\)" X-Patchwork-Id: 594551 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 75186C04A68 for ; Sat, 30 Jul 2022 08:39:51 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231819AbiG3Iju (ORCPT ); Sat, 30 Jul 2022 04:39:50 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55416 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S230138AbiG3Ijt (ORCPT ); Sat, 30 Jul 2022 04:39:49 -0400 Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 71AB927B29; Sat, 30 Jul 2022 01:39:47 -0700 (PDT) Received: from dggpeml500024.china.huawei.com (unknown [172.30.72.53]) by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4LvyRm38hYzlW5b; Sat, 30 Jul 2022 16:37:08 +0800 (CST) Received: from dggpeml100012.china.huawei.com (7.185.36.121) by dggpeml500024.china.huawei.com (7.185.36.10) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2375.24; Sat, 30 Jul 2022 16:39:45 +0800 Received: from huawei.com (10.67.165.24) by dggpeml100012.china.huawei.com (7.185.36.121) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2375.24; Sat, 30 Jul 2022 16:39:45 +0800 From: Kai Ye To: , CC: , , , , Subject: [PATCH v6 2/3] Documentation: add a isolation strategy sysfs node for uacce Date: Sat, 30 Jul 2022 16:32:45 +0800 Message-ID: <20220730083246.55646-3-yekai13@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20220730083246.55646-1-yekai13@huawei.com> References: <20220730083246.55646-1-yekai13@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.67.165.24] X-ClientProxiedBy: dggems706-chm.china.huawei.com (10.3.19.183) To dggpeml100012.china.huawei.com (7.185.36.121) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-crypto@vger.kernel.org Update documentation describing sysfs node that could help to configure isolation strategy for users in the user space. And describing sysfs node that could read the device isolated state. Signed-off-by: Kai Ye --- Documentation/ABI/testing/sysfs-driver-uacce | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce index 08f2591138af..1601f9dac29c 100644 --- a/Documentation/ABI/testing/sysfs-driver-uacce +++ b/Documentation/ABI/testing/sysfs-driver-uacce @@ -19,6 +19,23 @@ Contact: linux-accelerators@lists.ozlabs.org Description: Available instances left of the device Return -ENODEV if uacce_ops get_available_instances is not provided +What: /sys/class/uacce//isolate_strategy +Date: Jul 2022 +KernelVersion: 5.20 +Contact: linux-accelerators@lists.ozlabs.org +Description: (RW) Configure the frequency size for the hardware error + isolation strategy. This size is a configured integer value. + The default is 0. The maximum value is 65535. This value is a + threshold based on your driver handling strategy. + +What: /sys/class/uacce//isolate +Date: Jul 2022 +KernelVersion: 5.20 +Contact: linux-accelerators@lists.ozlabs.org +Description: (R) A sysfs node that read the device isolated state. The value 1 + means the device is unavailable. The 0 means the device is + available. + What: /sys/class/uacce//algorithms Date: Feb 2020 KernelVersion: 5.7 From patchwork Sat Jul 30 08:32:46 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "yekai \(A\)" X-Patchwork-Id: 594550 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id D2614C19F2C for ; Sat, 30 Jul 2022 08:39:52 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232080AbiG3Iju (ORCPT ); Sat, 30 Jul 2022 04:39:50 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55422 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231637AbiG3Ijt (ORCPT ); Sat, 30 Jul 2022 04:39:49 -0400 Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 190EF28E06; Sat, 30 Jul 2022 01:39:48 -0700 (PDT) Received: from dggpeml500023.china.huawei.com (unknown [172.30.72.53]) by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4LvyRm4jjczlWC3; Sat, 30 Jul 2022 16:37:08 +0800 (CST) Received: from dggpeml100012.china.huawei.com (7.185.36.121) by dggpeml500023.china.huawei.com (7.185.36.114) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2375.24; Sat, 30 Jul 2022 16:39:45 +0800 Received: from huawei.com (10.67.165.24) by dggpeml100012.china.huawei.com (7.185.36.121) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2375.24; Sat, 30 Jul 2022 16:39:45 +0800 From: Kai Ye To: , CC: , , , , Subject: [PATCH v6 3/3] crypto: hisilicon/qm - define the device isolation strategy Date: Sat, 30 Jul 2022 16:32:46 +0800 Message-ID: <20220730083246.55646-4-yekai13@huawei.com> X-Mailer: git-send-email 2.33.0 In-Reply-To: <20220730083246.55646-1-yekai13@huawei.com> References: <20220730083246.55646-1-yekai13@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.67.165.24] X-ClientProxiedBy: dggems706-chm.china.huawei.com (10.3.19.183) To dggpeml100012.china.huawei.com (7.185.36.121) X-CFilter-Loop: Reflected Precedence: bulk List-ID: X-Mailing-List: linux-crypto@vger.kernel.org Define the device isolation strategy by the device driver. The user configures a frequency value by uacce interface. If the slot reset frequency exceeds the value of setting for a certain period of time, the device will not be available in user space. The time window is one hour. The VF device use the PF device isolation strategy. All the hardware errors are processed by PF driver. This solution can be used for other drivers. Signed-off-by: Kai Ye --- drivers/crypto/hisilicon/qm.c | 163 +++++++++++++++++++++++++++++++--- include/linux/hisi_acc_qm.h | 9 ++ 2 files changed, 160 insertions(+), 12 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index ad83c194d664..a519ddad0af5 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -417,6 +417,16 @@ struct hisi_qm_resource { struct list_head list; }; +/** + * struct qm_hw_err - Structure describing the device errors + * @list: hardware error list + * @timestamp: timestamp when the error occurred + */ +struct qm_hw_err { + struct list_head list; + unsigned long long timestamp; +}; + struct hisi_qm_hw_ops { int (*get_vft)(struct hisi_qm *qm, u32 *base, u32 *number); void (*qm_db)(struct hisi_qm *qm, u16 qn, @@ -3410,6 +3420,111 @@ static long hisi_qm_uacce_ioctl(struct uacce_queue *q, unsigned int cmd, return 0; } +/** + * qm_hw_err_isolate() - Try to isolate the uacce device with its VFs + * according to user's configuration of isolation strategy. Warning: this + * API should be called while there the users on this device are suspended + * by slot resetting preparation of PCI AER. + * @qm: the uacce device + */ +static int qm_hw_err_isolate(struct hisi_qm *qm) +{ + struct qm_hw_err *err, *tmp, *hw_err; + struct qm_err_isolate *isolate; + u32 count = 0; + + isolate = &qm->isolate_data; + +#define SECONDS_PER_HOUR 3600 + + /* All the hw errs are processed by PF driver */ + if (qm->uacce->is_vf || isolate->is_isolate || + !isolate->hw_err_isolate_hz) + return 0; + + hw_err = kzalloc(sizeof(*hw_err), GFP_KERNEL); + if (!hw_err) + return -ENOMEM; + + mutex_lock(&isolate->isolate_lock); + hw_err->timestamp = jiffies; + list_for_each_entry_safe(err, tmp, &isolate->uacce_hw_errs, list) { + if ((hw_err->timestamp - err->timestamp) / HZ > + SECONDS_PER_HOUR) { + list_del(&err->list); + kfree(err); + } else { + count++; + } + } + list_add(&hw_err->list, &isolate->uacce_hw_errs); + mutex_unlock(&isolate->isolate_lock); + + if (count >= isolate->hw_err_isolate_hz) + isolate->is_isolate = true; + + return 0; +} + +static void qm_hw_err_destroy(struct hisi_qm *qm) +{ + struct qm_hw_err *err, *tmp; + + mutex_lock(&qm->isolate_data.isolate_lock); + list_for_each_entry_safe(err, tmp, &qm->isolate_data.uacce_hw_errs, list) { + list_del(&err->list); + kfree(err); + } + mutex_unlock(&qm->isolate_data.isolate_lock); +} + +static enum uacce_dev_state hisi_qm_get_isolate_state(struct uacce_device *uacce) +{ + struct hisi_qm *qm = uacce->priv; + struct hisi_qm *pf_qm; + + if (uacce->is_vf) + pf_qm = pci_get_drvdata(pci_physfn(qm->pdev)); + else + pf_qm = qm; + + return pf_qm->isolate_data.is_isolate ? + UACCE_DEV_ISOLATE : UACCE_DEV_NORMAL; +} + +static int hisi_qm_isolate_strategy_write(struct uacce_device *uacce, + u32 freq) +{ + struct hisi_qm *qm = uacce->priv; + + /* Must be set by PF */ + if (uacce->is_vf) + return -EPERM; + + if (qm->isolate_data.is_isolate) + return -EPERM; + + qm->isolate_data.hw_err_isolate_hz = freq; + + /* After the policy is updated, need to reset the hardware err list */ + qm_hw_err_destroy(qm); + + return 0; +} + +static u32 hisi_qm_isolate_strategy_read(struct uacce_device *uacce) +{ + struct hisi_qm *qm = uacce->priv; + struct hisi_qm *pf_qm; + + if (uacce->is_vf) { + pf_qm = pci_get_drvdata(pci_physfn(qm->pdev)); + return pf_qm->isolate_data.hw_err_isolate_hz; + } else { + return qm->isolate_data.hw_err_isolate_hz; + } +} + static const struct uacce_ops uacce_qm_ops = { .get_available_instances = hisi_qm_get_available_instances, .get_queue = hisi_qm_uacce_get_queue, @@ -3419,8 +3534,22 @@ static const struct uacce_ops uacce_qm_ops = { .mmap = hisi_qm_uacce_mmap, .ioctl = hisi_qm_uacce_ioctl, .is_q_updated = hisi_qm_is_q_updated, + .get_isolate_state = hisi_qm_get_isolate_state, + .isolate_strategy_write = hisi_qm_isolate_strategy_write, + .isolate_strategy_read = hisi_qm_isolate_strategy_read, }; +static void qm_remove_uacce(struct hisi_qm *qm) +{ + struct uacce_device *uacce = qm->uacce; + + if (qm->use_sva) { + qm_hw_err_destroy(qm); + uacce_remove(uacce); + qm->uacce = NULL; + } +} + static int qm_alloc_uacce(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; @@ -3446,8 +3575,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm) qm->use_sva = true; } else { /* only consider sva case */ - uacce_remove(uacce); - qm->uacce = NULL; + qm_remove_uacce(qm); return -EINVAL; } @@ -3479,6 +3607,8 @@ static int qm_alloc_uacce(struct hisi_qm *qm) uacce->qf_pg_num[UACCE_QFRT_DUS] = dus_page_nr; qm->uacce = uacce; + INIT_LIST_HEAD(&qm->isolate_data.uacce_hw_errs); + mutex_init(&qm->isolate_data.isolate_lock); return 0; } @@ -5109,6 +5239,12 @@ static int qm_controller_reset_prepare(struct hisi_qm *qm) return ret; } + if (qm->use_sva) { + ret = qm_hw_err_isolate(qm); + if (ret) + pci_err(pdev, "failed to isolate hw err!\n"); + } + ret = qm_wait_vf_prepare_finish(qm); if (ret) pci_err(pdev, "failed to stop by vfs in soft reset!\n"); @@ -5436,19 +5572,25 @@ static int qm_controller_reset(struct hisi_qm *qm) ret = qm_soft_reset(qm); if (ret) { pci_err(pdev, "Controller reset failed (%d)\n", ret); - qm_reset_bit_clear(qm); - return ret; + goto err_reset; } ret = qm_controller_reset_done(qm); - if (ret) { - qm_reset_bit_clear(qm); - return ret; - } + if (ret) + goto err_reset; pci_info(pdev, "Controller reset complete\n"); return 0; + +err_reset: + pci_err(pdev, "Controller reset failed (%d)\n", ret); + qm_reset_bit_clear(qm); + + /* if resetting fails, isolate the device */ + if (qm->use_sva && !qm->uacce->is_vf) + qm->isolate_data.is_isolate = true; + return ret; } /** @@ -6246,10 +6388,7 @@ int hisi_qm_init(struct hisi_qm *qm) err_free_qm_memory: hisi_qm_memory_uninit(qm); err_alloc_uacce: - if (qm->use_sva) { - uacce_remove(qm->uacce); - qm->uacce = NULL; - } + qm_remove_uacce(qm); err_irq_register: qm_irq_unregister(qm); err_pci_init: diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 116e8bd68c99..e7aa6a451ec9 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -271,6 +271,14 @@ struct hisi_qm_poll_data { u16 *qp_finish_id; }; +struct qm_err_isolate { + struct mutex isolate_lock; + /* user cfg freq which triggers isolation */ + u32 hw_err_isolate_hz; + bool is_isolate; + struct list_head uacce_hw_errs; +}; + struct hisi_qm { enum qm_hw_ver ver; enum qm_fun_type fun_type; @@ -335,6 +343,7 @@ struct hisi_qm { struct qm_shaper_factor *factor; u32 mb_qos; u32 type_rate; + struct qm_err_isolate isolate_data; }; struct hisi_qp_status {