diff mbox series

[7/9] crypto: qat - add auto reset on error

Message ID 20240103040722.14467-8-mun.chun.yep@intel.com
State Superseded
Headers show
Series crypto: qat - improve recovery flows | expand

Commit Message

Mun Chun Yep Jan. 3, 2024, 4:07 a.m. UTC
From: Damian Muszynski <damian.muszynski@intel.com>

Expose the `auto_reset` sysfs attribute to configure the driver to reset
the device when a fatal error is detected.

When auto reset is enabled, the driver resets the device when it detects
either a heartbeat failure or a fatal error through an interrupt.

This patch is based on earlier work done by Shashank Gupta.

Signed-off-by: Damian Muszynski <damian.muszynski@intel.com>
Reviewed-by: Ahsan Atta <ahsan.atta@intel.com>
Reviewed-by: Markas Rapoportas <markas.rapoportas@intel.com>
---
 Documentation/ABI/testing/sysfs-driver-qat    | 20 ++++++++++
 .../intel/qat/qat_common/adf_accel_devices.h  |  1 +
 drivers/crypto/intel/qat/qat_common/adf_aer.c | 11 +++++-
 .../intel/qat/qat_common/adf_common_drv.h     |  1 +
 .../crypto/intel/qat/qat_common/adf_sysfs.c   | 37 +++++++++++++++++++
 5 files changed, 69 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-driver-qat b/Documentation/ABI/testing/sysfs-driver-qat
index bbf329cf0d67..6778f1fea874 100644
--- a/Documentation/ABI/testing/sysfs-driver-qat
+++ b/Documentation/ABI/testing/sysfs-driver-qat
@@ -141,3 +141,23 @@  Description:
 			64
 
 		This attribute is only available for qat_4xxx devices.
+
+What:		/sys/bus/pci/devices/<BDF>/qat/auto_reset
+Date:		March 2024
+KernelVersion:	6.8
+Contact:	qat-linux@intel.com
+Description:	(RW) Reports the current state of the auto reset feature
+		for a QAT device.
+
+		Write to the attribute to enable or disable device auto reset.
+
+		Device auto reset is disabled by default.
+
+		The values are:
+
+		* 1/Yy/on: auto reset enabled. If the device encounters an
+		  unrecoverable error, it will be reset automatically.
+		* 0/Nn/off: auto reset disabled. If the device encounters an
+		  unrecoverable error, it will not be reset.
+
+		This attribute is only available for qat_4xxx devices.
diff --git a/drivers/crypto/intel/qat/qat_common/adf_accel_devices.h b/drivers/crypto/intel/qat/qat_common/adf_accel_devices.h
index 4a3c36aaa7ca..0f26aa976c8c 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_accel_devices.h
+++ b/drivers/crypto/intel/qat/qat_common/adf_accel_devices.h
@@ -402,6 +402,7 @@  struct adf_accel_dev {
 	struct adf_error_counters ras_errors;
 	struct mutex state_lock; /* protect state of the device */
 	bool is_vf;
+	bool autoreset_on_error;
 	u32 accel_id;
 };
 #endif
diff --git a/drivers/crypto/intel/qat/qat_common/adf_aer.c b/drivers/crypto/intel/qat/qat_common/adf_aer.c
index cd273b31db0e..b3d4b6b99c65 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_aer.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_aer.c
@@ -204,6 +204,14 @@  const struct pci_error_handlers adf_err_handler = {
 };
 EXPORT_SYMBOL_GPL(adf_err_handler);
 
+int adf_dev_autoreset(struct adf_accel_dev *accel_dev)
+{
+	if (accel_dev->autoreset_on_error)
+		return adf_dev_aer_schedule_reset(accel_dev, ADF_DEV_RESET_ASYNC);
+
+	return 0;
+}
+
 static void adf_notify_fatal_error_worker(struct work_struct *work)
 {
 	struct adf_fatal_error_data *wq_data =
@@ -215,10 +223,11 @@  static void adf_notify_fatal_error_worker(struct work_struct *work)
 
 	if (!accel_dev->is_vf) {
 		/* Disable arbitration to stop processing of new requests */
-		if (hw_device->exit_arb)
+		if (accel_dev->autoreset_on_error && hw_device->exit_arb)
 			hw_device->exit_arb(accel_dev);
 		if (accel_dev->pf.vf_info)
 			adf_pf2vf_notify_fatal_error(accel_dev);
+		adf_dev_autoreset(accel_dev);
 	}
 
 	kfree(wq_data);
diff --git a/drivers/crypto/intel/qat/qat_common/adf_common_drv.h b/drivers/crypto/intel/qat/qat_common/adf_common_drv.h
index 10891c9da6e7..57328249c89e 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_common_drv.h
+++ b/drivers/crypto/intel/qat/qat_common/adf_common_drv.h
@@ -87,6 +87,7 @@  int adf_ae_stop(struct adf_accel_dev *accel_dev);
 extern const struct pci_error_handlers adf_err_handler;
 void adf_reset_sbr(struct adf_accel_dev *accel_dev);
 void adf_reset_flr(struct adf_accel_dev *accel_dev);
+int adf_dev_autoreset(struct adf_accel_dev *accel_dev);
 void adf_dev_restore(struct adf_accel_dev *accel_dev);
 int adf_init_aer(void);
 void adf_exit_aer(void);
diff --git a/drivers/crypto/intel/qat/qat_common/adf_sysfs.c b/drivers/crypto/intel/qat/qat_common/adf_sysfs.c
index d450dad32c9e..4e7f70d4049d 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_sysfs.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_sysfs.c
@@ -204,6 +204,42 @@  static ssize_t pm_idle_enabled_store(struct device *dev, struct device_attribute
 }
 static DEVICE_ATTR_RW(pm_idle_enabled);
 
+static ssize_t auto_reset_show(struct device *dev, struct device_attribute *attr,
+			       char *buf)
+{
+	char *auto_reset;
+	struct adf_accel_dev *accel_dev;
+
+	accel_dev = adf_devmgr_pci_to_accel_dev(to_pci_dev(dev));
+	if (!accel_dev)
+		return -EINVAL;
+
+	auto_reset = accel_dev->autoreset_on_error ? "on" : "off";
+
+	return sysfs_emit(buf, "%s\n", auto_reset);
+}
+
+static ssize_t auto_reset_store(struct device *dev, struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct adf_accel_dev *accel_dev;
+	bool enabled = false;
+	int ret;
+
+	ret = kstrtobool(buf, &enabled);
+	if (ret)
+		return ret;
+
+	accel_dev = adf_devmgr_pci_to_accel_dev(to_pci_dev(dev));
+	if (!accel_dev)
+		return -EINVAL;
+
+	accel_dev->autoreset_on_error = enabled;
+
+	return count;
+}
+static DEVICE_ATTR_RW(auto_reset);
+
 static DEVICE_ATTR_RW(state);
 static DEVICE_ATTR_RW(cfg_services);
 
@@ -291,6 +327,7 @@  static struct attribute *qat_attrs[] = {
 	&dev_attr_pm_idle_enabled.attr,
 	&dev_attr_rp2srv.attr,
 	&dev_attr_num_rps.attr,
+	&dev_attr_auto_reset.attr,
 	NULL,
 };