Message ID | 20240321-b4-arm-ras-error-vendor-info-v5-rc3-v5-2-850f9bfb97a8@os.amperecomputing.com |
---|---|
State | New |
Headers | show |
Series | Adds additional information to ARM RAS errors | expand |
Tested-by: Shiju Jose <shiju.jose@huawei.com> The CPU core isolation feature in rasdaemon depends on this kernel patch. Thanks, Shiju >-----Original Message----- >From: Daniel Ferguson <danielf@os.amperecomputing.com> >Sent: 21 March 2024 22:56 >To: Rafael J. Wysocki <rafael@kernel.org>; Len Brown <lenb@kernel.org>; >James Morse <james.morse@arm.com>; Tony Luck <tony.luck@intel.com>; >Borislav Petkov <bp@alien8.de> >Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux- >edac@vger.kernel.org; Daniel Ferguson <danielf@os.amperecomputing.com>; >luoshengwei <luoshengwei@huawei.com>; Jason Tian ><jason@os.amperecomputing.com> >Subject: [PATCH v5 2/2] RAS: Report ARM processor information to userspace > >From: Shengwei Luo <luoshengwei@huawei.com> > >The original arm_event trace code only traces out ARM processor error >information data. That is not enough for the user to take appropriate action. > >According to the UEFI 2.9 specification, section N.2.4.4, the ARM processor error >section includes several ARM processor error information structures, several ARM >processor context information structures and several vendor specific error information >structures. In addition to this information, there are the error severity and the CPU logical >index of the event. Report all of this information to userspace via the perf interface, >so that the user can do CPU core isolation according to the error severity and other >info. 
> >Signed-off-by: Shengwei Luo <luoshengwei@huawei.com> >Signed-off-by: Jason Tian <jason@os.amperecomputing.com> >Signed-off-by: Daniel Ferguson <danielf@os.amperecomputing.com> >--- > drivers/acpi/apei/ghes.c | 3 +-- > drivers/ras/ras.c | 46 >++++++++++++++++++++++++++++++++++++++++++++-- > include/linux/ras.h | 15 ++++++++++++--- > include/ras/ras_event.h | 48 >+++++++++++++++++++++++++++++++++++++++++++----- > 4 files changed, 100 insertions(+), 12 deletions(-) > >diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index >58014558b8e0..a93c80fe1bab 100644 >--- a/drivers/acpi/apei/ghes.c >+++ b/drivers/acpi/apei/ghes.c >@@ -535,9 +535,8 @@ static bool ghes_handle_arm_hw_error(struct >acpi_hest_generic_data *gdata, > int sec_sev, i; > char *p; > >- log_arm_hw_error(err); >- > sec_sev = ghes_severity(gdata->error_severity); >+ log_arm_hw_error(err, sec_sev); > if (sev != GHES_SEV_RECOVERABLE || sec_sev != >GHES_SEV_RECOVERABLE) > return false; > >diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index >249dce21a738..3e2beed2db07 100644 >--- a/drivers/ras/ras.c >+++ b/drivers/ras/ras.c >@@ -53,9 +53,51 @@ void log_non_standard_event(const guid_t *sec_type, >const guid_t *fru_id, } > > #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) -void >log_arm_hw_error(struct cper_sec_proc_arm *err) >+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) > { >- trace_arm_event(err); >+ u32 pei_len; >+ u32 ctx_len = 0; >+ s32 vsei_len; >+ u8 *pei_err; >+ u8 *ctx_err; >+ u8 *ven_err_data; >+ struct cper_arm_err_info *err_info; >+ struct cper_arm_ctx_info *ctx_info; >+ int n, sz; >+ int cpu; >+ >+ pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num; >+ pei_err = (u8 *)err + sizeof(struct cper_sec_proc_arm); >+ >+ err_info = (struct cper_arm_err_info *)(err + 1); >+ ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num); >+ ctx_err = (u8 *)ctx_info; >+ for (n = 0; n < err->context_info_num; n++) { >+ sz = sizeof(struct 
cper_arm_ctx_info) + ctx_info->size; >+ ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz); >+ ctx_len += sz; >+ } >+ >+ vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) + >+ pei_len + ctx_len); >+ if (vsei_len < 0) { >+ pr_warn(FW_BUG >+ "section length: %d\n", err->section_length); >+ pr_warn(FW_BUG >+ "section length is too small\n"); >+ pr_warn(FW_BUG >+ "firmware-generated error record is incorrect\n"); >+ vsei_len = 0; >+ } >+ ven_err_data = (u8 *)ctx_info; >+ >+ cpu = GET_LOGICAL_INDEX(err->mpidr); >+ /* when return value is invalid, set cpu index to -1 */ >+ if (cpu < 0) >+ cpu = -1; >+ >+ trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len, >+ ven_err_data, (u32)vsei_len, sev, cpu); > } > #endif > >diff --git a/include/linux/ras.h b/include/linux/ras.h index >811feb9d8160..2070e4ae0626 100644 >--- a/include/linux/ras.h >+++ b/include/linux/ras.h >@@ -25,7 +25,7 @@ void log_non_standard_event(const guid_t *sec_type, > const guid_t *fru_id, const char *fru_text, > const u8 sev, const u8 *err, const u32 len); #if >defined(CONFIG_ARM) || defined(CONFIG_ARM64) -void >log_arm_hw_error(struct cper_sec_proc_arm *err); >+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev); > #endif > #else > static inline void >@@ -35,7 +35,7 @@ log_non_standard_event(const guid_t *sec_type, { return; >} #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) static inline void - >log_arm_hw_error(struct cper_sec_proc_arm *err) { return; } >+log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return; >+} > #endif > #endif > >@@ -55,5 +55,14 @@ static inline void amd_retire_dram_row(struct atl_err >*err) { } static inline unsigned long >amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; >} #endif /* CONFIG_AMD_ATL */ >- >+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) #include >+<asm/smp_plat.h> >+/* >+ * Include ARM specific SMP header which provides a function mapping >+mpidr to >+ * cpu logical index. 
>+ */ >+#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr & >+MPIDR_HWID_BITMASK) #else #define GET_LOGICAL_INDEX(mpidr) -EINVAL >+#endif /* CONFIG_ARM || CONFIG_ARM64 */ > #endif /* __RAS_H__ */ >diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index >c011ea236e9b..a7d7b6e717b6 100644 >--- a/include/ras/ras_event.h >+++ b/include/ras/ras_event.h >@@ -168,11 +168,24 @@ TRACE_EVENT(mc_event, > * This event is generated when hardware detects an ARM processor error > * has occurred. UEFI 2.6 spec section N.2.4.4. > */ >+#define APEIL "ARM Processor Err Info data len" >+#define APEID "ARM Processor Err Info raw data" >+#define APECIL "ARM Processor Err Context Info data len" >+#define APECID "ARM Processor Err Context Info raw data" >+#define VSEIL "Vendor Specific Err Info data len" >+#define VSEID "Vendor Specific Err Info raw data" > TRACE_EVENT(arm_event, > >- TP_PROTO(const struct cper_sec_proc_arm *proc), >+ TP_PROTO(const struct cper_sec_proc_arm *proc, const u8 *pei_err, >+ const u32 pei_len, >+ const u8 *ctx_err, >+ const u32 ctx_len, >+ const u8 *oem, >+ const u32 oem_len, >+ u8 sev, >+ int cpu), > >- TP_ARGS(proc), >+ TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev, >+cpu), > > TP_STRUCT__entry( > __field(u64, mpidr) >@@ -180,6 +193,14 @@ TRACE_EVENT(arm_event, > __field(u32, running_state) > __field(u32, psci_state) > __field(u8, affinity) >+ __field(u32, pei_len) >+ __dynamic_array(u8, buf, pei_len) >+ __field(u32, ctx_len) >+ __dynamic_array(u8, buf1, ctx_len) >+ __field(u32, oem_len) >+ __dynamic_array(u8, buf2, oem_len) >+ __field(u8, sev) >+ __field(int, cpu) > ), > > TP_fast_assign( >@@ -199,12 +220,29 @@ TRACE_EVENT(arm_event, > __entry->running_state = ~0; > __entry->psci_state = ~0; > } >+ __entry->pei_len = pei_len; >+ memcpy(__get_dynamic_array(buf), pei_err, pei_len); >+ __entry->ctx_len = ctx_len; >+ memcpy(__get_dynamic_array(buf1), ctx_err, ctx_len); >+ __entry->oem_len = oem_len; >+ 
memcpy(__get_dynamic_array(buf2), oem, oem_len); >+ __entry->sev = sev; >+ __entry->cpu = cpu; > ), > >- TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; " >- "running state: %d; PSCI state: %d", >+ TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR: >%016llx; " >+ "running state: %d; PSCI state: %d; " >+ "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s", >+ __entry->cpu, >+ __entry->sev, > __entry->affinity, __entry->mpidr, __entry->midr, >- __entry->running_state, __entry->psci_state) >+ __entry->running_state, __entry->psci_state, >+ APEIL, __entry->pei_len, APEID, >+ __print_hex(__get_dynamic_array(buf), __entry->pei_len), >+ APECIL, __entry->ctx_len, APECID, >+ __print_hex(__get_dynamic_array(buf1), __entry->ctx_len), >+ VSEIL, __entry->oem_len, VSEID, >+ __print_hex(__get_dynamic_array(buf2), __entry->oem_len)) > ); > > /* > >-- >2.43.0 >
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 58014558b8e0..a93c80fe1bab 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -535,9 +535,8 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sec_sev, i; char *p; - log_arm_hw_error(err); - sec_sev = ghes_severity(gdata->error_severity); + log_arm_hw_error(err, sec_sev); if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE) return false; diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 249dce21a738..3e2beed2db07 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -53,9 +53,51 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, } #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) -void log_arm_hw_error(struct cper_sec_proc_arm *err) +void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { - trace_arm_event(err); + u32 pei_len; + u32 ctx_len = 0; + s32 vsei_len; + u8 *pei_err; + u8 *ctx_err; + u8 *ven_err_data; + struct cper_arm_err_info *err_info; + struct cper_arm_ctx_info *ctx_info; + int n, sz; + int cpu; + + pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num; + pei_err = (u8 *)err + sizeof(struct cper_sec_proc_arm); + + err_info = (struct cper_arm_err_info *)(err + 1); + ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num); + ctx_err = (u8 *)ctx_info; + for (n = 0; n < err->context_info_num; n++) { + sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size; + ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz); + ctx_len += sz; + } + + vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) + + pei_len + ctx_len); + if (vsei_len < 0) { + pr_warn(FW_BUG + "section length: %d\n", err->section_length); + pr_warn(FW_BUG + "section length is too small\n"); + pr_warn(FW_BUG + "firmware-generated error record is incorrect\n"); + vsei_len = 0; + } + ven_err_data = (u8 *)ctx_info; + + cpu = GET_LOGICAL_INDEX(err->mpidr); + /* when return 
value is invalid, set cpu index to -1 */ + if (cpu < 0) + cpu = -1; + + trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len, + ven_err_data, (u32)vsei_len, sev, cpu); } #endif diff --git a/include/linux/ras.h b/include/linux/ras.h index 811feb9d8160..2070e4ae0626 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -25,7 +25,7 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, const u32 len); #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) -void log_arm_hw_error(struct cper_sec_proc_arm *err); +void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev); #endif #else static inline void @@ -35,7 +35,7 @@ log_non_standard_event(const guid_t *sec_type, { return; } #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) static inline void -log_arm_hw_error(struct cper_sec_proc_arm *err) { return; } +log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return; } #endif #endif @@ -55,5 +55,14 @@ static inline void amd_retire_dram_row(struct atl_err *err) { } static inline unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; } #endif /* CONFIG_AMD_ATL */ - +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) +#include <asm/smp_plat.h> +/* + * Include ARM specific SMP header which provides a function mapping mpidr to + * cpu logical index. + */ +#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr & MPIDR_HWID_BITMASK) +#else +#define GET_LOGICAL_INDEX(mpidr) -EINVAL +#endif /* CONFIG_ARM || CONFIG_ARM64 */ #endif /* __RAS_H__ */ diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index c011ea236e9b..a7d7b6e717b6 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -168,11 +168,24 @@ TRACE_EVENT(mc_event, * This event is generated when hardware detects an ARM processor error * has occurred. UEFI 2.6 spec section N.2.4.4. 
*/ +#define APEIL "ARM Processor Err Info data len" +#define APEID "ARM Processor Err Info raw data" +#define APECIL "ARM Processor Err Context Info data len" +#define APECID "ARM Processor Err Context Info raw data" +#define VSEIL "Vendor Specific Err Info data len" +#define VSEID "Vendor Specific Err Info raw data" TRACE_EVENT(arm_event, - TP_PROTO(const struct cper_sec_proc_arm *proc), + TP_PROTO(const struct cper_sec_proc_arm *proc, const u8 *pei_err, + const u32 pei_len, + const u8 *ctx_err, + const u32 ctx_len, + const u8 *oem, + const u32 oem_len, + u8 sev, + int cpu), - TP_ARGS(proc), + TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev, cpu), TP_STRUCT__entry( __field(u64, mpidr) @@ -180,6 +193,14 @@ TRACE_EVENT(arm_event, __field(u32, running_state) __field(u32, psci_state) __field(u8, affinity) + __field(u32, pei_len) + __dynamic_array(u8, buf, pei_len) + __field(u32, ctx_len) + __dynamic_array(u8, buf1, ctx_len) + __field(u32, oem_len) + __dynamic_array(u8, buf2, oem_len) + __field(u8, sev) + __field(int, cpu) ), TP_fast_assign( @@ -199,12 +220,29 @@ TRACE_EVENT(arm_event, __entry->running_state = ~0; __entry->psci_state = ~0; } + __entry->pei_len = pei_len; + memcpy(__get_dynamic_array(buf), pei_err, pei_len); + __entry->ctx_len = ctx_len; + memcpy(__get_dynamic_array(buf1), ctx_err, ctx_len); + __entry->oem_len = oem_len; + memcpy(__get_dynamic_array(buf2), oem, oem_len); + __entry->sev = sev; + __entry->cpu = cpu; ), - TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; " - "running state: %d; PSCI state: %d", + TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR: %016llx; " + "running state: %d; PSCI state: %d; " + "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s", + __entry->cpu, + __entry->sev, __entry->affinity, __entry->mpidr, __entry->midr, - __entry->running_state, __entry->psci_state) + __entry->running_state, __entry->psci_state, + APEIL, __entry->pei_len, APEID, + 
__print_hex(__get_dynamic_array(buf), __entry->pei_len), + APECIL, __entry->ctx_len, APECID, + __print_hex(__get_dynamic_array(buf1), __entry->ctx_len), + VSEIL, __entry->oem_len, VSEID, + __print_hex(__get_dynamic_array(buf2), __entry->oem_len)) ); /*