diff mbox series

[v3,6/7] acpi/ghes, cper: Recognize and cache CXL Protocol errors

Message ID 20241119003915.174386-7-Smita.KoralahalliChannabasappa@amd.com
State Superseded
Headers show
Series acpi/ghes, cper, cxl: Process CXL CPER Protocol errors | expand

Commit Message

Smita Koralahalli Nov. 19, 2024, 12:39 a.m. UTC
Add support in GHES to detect and process CXL CPER Protocol errors, as
defined in UEFI v2.10, section N.2.13.

Define struct cxl_cper_prot_err_work_data to cache CXL protocol error
information, including RAS capabilities and severity, for further
handling.

These cached CXL CPER records will later be processed by workqueues
within the CXL subsystem.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
 drivers/acpi/apei/ghes.c | 52 ++++++++++++++++++++++++++++++++++++++++
 include/cxl/event.h      |  6 +++++
 2 files changed, 58 insertions(+)

Comments

Jonathan Cameron Nov. 26, 2024, 4:05 p.m. UTC | #1
On Tue, 19 Nov 2024 00:39:14 +0000
Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:

> Add support in GHES to detect and process CXL CPER Protocol errors, as
> defined in UEFI v2.10, section N.2.13.
> 
> Define struct cxl_cper_prot_err_work_data to cache CXL protocol error
> information, including RAS capabilities and severity, for further
> handling.
> 
> These cached CXL CPER records will later be processed by workqueues
> within the CXL subsystem.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Looks fine,
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Ira Weiny Dec. 2, 2024, 6:41 p.m. UTC | #2
Smita Koralahalli wrote:
> Add support in GHES to detect and process CXL CPER Protocol errors, as
> defined in UEFI v2.10, section N.2.13.
> 
> Define struct cxl_cper_prot_err_work_data to cache CXL protocol error
> information, including RAS capabilities and severity, for further
> handling.
> 
> These cached CXL CPER records will later be processed by workqueues
> within the CXL subsystem.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
>  drivers/acpi/apei/ghes.c | 52 ++++++++++++++++++++++++++++++++++++++++
>  include/cxl/event.h      |  6 +++++
>  2 files changed, 58 insertions(+)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 62ffe6eb5503..6cd9d5375d7c 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -676,6 +676,54 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
>  	schedule_work(&entry->work);
>  }
>  
> +static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
> +				   int severity)
> +{
> +	struct cxl_cper_prot_err_work_data wd;
> +	u8 *dvsec_start, *cap_start;
> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) {
> +		pr_err_ratelimited("CXL CPER invalid agent type\n");
> +		return;
> +	}
> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> +		pr_err_ratelimited("CXL CPER invalid protocol error log\n");
> +		return;
> +	}
> +
> +	if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) {
> +		pr_err_ratelimited("CXL CPER invalid RAS Cap size (%u)\n",
> +				   prot_err->err_len);
> +		return;
> +	}
> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
> +		pr_warn(FW_WARN "CXL CPER no device serial number\n");
> +
> +	switch (prot_err->agent_type) {
> +	case RCD:
> +	case DEVICE:
> +	case LD:
> +	case FMLD:
> +	case RP:
> +	case DSP:
> +	case USP:
> +		memcpy(&wd.prot_err, prot_err, sizeof(wd.prot_err));
> +
> +		dvsec_start = (u8 *)(prot_err + 1);
> +		cap_start = dvsec_start + prot_err->dvsec_len;
> +
> +		wd.ras_cap = *(struct cxl_ras_capability_regs *)cap_start;

Why not memcpy()?

Ira

[snip]
Smita Koralahalli Dec. 6, 2024, 4:16 p.m. UTC | #3
Hi Ira,

On 12/2/2024 10:41 AM, Ira Weiny wrote:
> Smita Koralahalli wrote:
>> Add support in GHES to detect and process CXL CPER Protocol errors, as
>> defined in UEFI v2.10, section N.2.13.
>>
>> Define struct cxl_cper_prot_err_work_data to cache CXL protocol error
>> information, including RAS capabilities and severity, for further
>> handling.
>>
>> These cached CXL CPER records will later be processed by workqueues
>> within the CXL subsystem.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> ---
>>   drivers/acpi/apei/ghes.c | 52 ++++++++++++++++++++++++++++++++++++++++
>>   include/cxl/event.h      |  6 +++++
>>   2 files changed, 58 insertions(+)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 62ffe6eb5503..6cd9d5375d7c 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -676,6 +676,54 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
>>   	schedule_work(&entry->work);
>>   }
>>   
>> +static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
>> +				   int severity)
>> +{
>> +	struct cxl_cper_prot_err_work_data wd;
>> +	u8 *dvsec_start, *cap_start;
>> +
>> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) {
>> +		pr_err_ratelimited("CXL CPER invalid agent type\n");
>> +		return;
>> +	}
>> +
>> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
>> +		pr_err_ratelimited("CXL CPER invalid protocol error log\n");
>> +		return;
>> +	}
>> +
>> +	if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) {
>> +		pr_err_ratelimited("CXL CPER invalid RAS Cap size (%u)\n",
>> +				   prot_err->err_len);
>> +		return;
>> +	}
>> +
>> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
>> +		pr_warn(FW_WARN "CXL CPER no device serial number\n");
>> +
>> +	switch (prot_err->agent_type) {
>> +	case RCD:
>> +	case DEVICE:
>> +	case LD:
>> +	case FMLD:
>> +	case RP:
>> +	case DSP:
>> +	case USP:
>> +		memcpy(&wd.prot_err, prot_err, sizeof(wd.prot_err));
>> +
>> +		dvsec_start = (u8 *)(prot_err + 1);
>> +		cap_start = dvsec_start + prot_err->dvsec_len;
>> +
>> +		wd.ras_cap = *(struct cxl_ras_capability_regs *)cap_start;
> Why not memcpy()?

Thanks for pointing this out. Yes, I will change this to memcpy() in the next
revision. I think memcpy() is the better fit here, both for consistency and
because it addresses alignment concerns.


Thanks

Smita

>
> Ira
>
> [snip]
diff mbox series

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 62ffe6eb5503..6cd9d5375d7c 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -676,6 +676,75 @@  static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
 	schedule_work(&entry->work);
 }
 
+/**
+ * cxl_cper_post_prot_err() - validate and cache a CXL CPER protocol error
+ * @prot_err: CXL protocol error section. The DVSEC data (dvsec_len bytes)
+ *	      and the RAS capability registers are read from right behind it.
+ * @severity: CPER severity; stored after cper_severity_to_aer() conversion.
+ *
+ * Sections without a valid agent address or error log, with an error log
+ * whose length differs from sizeof(struct cxl_ras_capability_regs), or with
+ * an unhandled agent type are reported and dropped. The record is only
+ * assembled in a local here; this function does not pass it on.
+ */
+static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
+				   int severity)
+{
+	struct cxl_cper_prot_err_work_data wd;
+	u8 *dvsec_start, *cap_start;
+
+	/*
+	 * NOTE(review): the bit tested is AGENT_ADDRESS while the message
+	 * says "agent type" - confirm which one is intended.
+	 */
+	if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) {
+		pr_err_ratelimited("CXL CPER invalid agent type\n");
+		return;
+	}
+
+	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
+		pr_err_ratelimited("CXL CPER invalid protocol error log\n");
+		return;
+	}
+
+	if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) {
+		pr_err_ratelimited("CXL CPER invalid RAS Cap size (%u)\n",
+				   prot_err->err_len);
+		return;
+	}
+
+	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
+		pr_warn(FW_WARN "CXL CPER no device serial number\n");
+
+	switch (prot_err->agent_type) {
+	case RCD:
+	case DEVICE:
+	case LD:
+	case FMLD:
+	case RP:
+	case DSP:
+	case USP:
+		memcpy(&wd.prot_err, prot_err, sizeof(wd.prot_err));
+
+		/* RAS registers follow the header and the DVSEC data. */
+		dvsec_start = (u8 *)(prot_err + 1);
+		cap_start = dvsec_start + prot_err->dvsec_len;
+
+		/*
+		 * cap_start is a byte offset into the record and may not be
+		 * aligned for struct cxl_ras_capability_regs, so copy the
+		 * bytes instead of dereferencing a cast pointer.
+		 */
+		memcpy(&wd.ras_cap, cap_start, sizeof(wd.ras_cap));
+		wd.severity = cper_severity_to_aer(severity);
+		break;
+	default:
+		pr_err_ratelimited("CXL CPER invalid agent type: %d\n",
+				   prot_err->agent_type);
+		return;
+	}
+}
+
 /* Room for 8 entries for each of the 4 event log queues */
 #define CXL_CPER_FIFO_DEPTH 32
 DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH);
@@ -795,6 +843,10 @@  static bool ghes_do_proc(struct ghes *ghes,
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
 			queued = ghes_handle_arm_hw_error(gdata, sev, sync);
+		} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
+			struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+
+			cxl_cper_post_prot_err(prot_err, gdata->error_severity);
 		} else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
 			struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
 
diff --git a/include/cxl/event.h b/include/cxl/event.h
index 992568b35455..c9a38ebaf207 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -232,6 +232,12 @@  struct cxl_ras_capability_regs {
 	u32 header_log[16];
 };
 
+struct cxl_cper_prot_err_work_data {	/* one cached CXL CPER protocol error */
+	struct cxl_cper_sec_prot_err prot_err;	/* copied CPER section header */
+	struct cxl_ras_capability_regs ras_cap;	/* copied RAS capability regs */
+	int severity;	/* result of cper_severity_to_aer() */
+};
+
 #ifdef CONFIG_ACPI_APEI_GHES
 int cxl_cper_register_event_work(struct work_struct *work);
 int cxl_cper_unregister_event_work(struct work_struct *work);