Message ID | 20240625195624.2565741-5-avadhut.naik@amd.com |
---|---|
State | New |
Headers | show |
Series | MCE wrapper and support for new SMCA syndrome MSRs | expand |
On Tue, Jun 25, 2024 at 02:56:24PM -0500, Avadhut Naik wrote: > From: Yazen Ghannam <yazen.ghannam@amd.com> > > A new "FRU Text in MCA" feature is defined where the Field Replaceable > Unit (FRU) Text for a device is represented by a string in the new > MCA_SYND1 and MCA_SYND2 registers. This feature is supported per MCA > bank, and it is advertised by the McaFruTextInMca bit (MCA_CONFIG[9]). > > The FRU Text is populated dynamically for each individual error state > (MCA_STATUS, MCA_ADDR, et al.). This handles the case where an MCA bank > covers multiple devices, for example, a Unified Memory Controller (UMC) > bank that manages two DIMMs. >
On 6/26/2024 07:04, Borislav Petkov wrote: > On Tue, Jun 25, 2024 at 02:56:24PM -0500, Avadhut Naik wrote: >> From: Yazen Ghannam <yazen.ghannam@amd.com> >> >> A new "FRU Text in MCA" feature is defined where the Field Replaceable >> Unit (FRU) Text for a device is represented by a string in the new >> MCA_SYND1 and MCA_SYND2 registers. This feature is supported per MCA >> bank, and it is advertised by the McaFruTextInMca bit (MCA_CONFIG[9]). >> >> The FRU Text is populated dynamically for each individual error state >> (MCA_STATUS, MCA_ADDR, et al.). This handles the case where an MCA bank >> covers multiple devices, for example, a Unified Memory Controller (UMC) >> bank that manages two DIMMs. >> > > From here... > >> Print the FRU Text string, if available, when decoding an MCA error. >> >> Also, add field for MCA_CONFIG MSR in struct mce_hw_err as vendor specific >> error information and save the value of the MSR. The very value can then be >> exported through tracepoint for userspace tools like rasdaemon to print FRU >> Text, if available. >> >> Note: Checkpatch checks/warnings are ignored to maintain coding style. > > ... to here goes into the trash can except what MCA_CONFIG is for being logged > as part of the error. > Will do. >> [Yazen: Add Avadhut as co-developer for wrapper changes. ] >> >> Co-developed-by: Avadhut Naik <avadhut.naik@amd.com> >> Signed-off-by: Avadhut Naik <avadhut.naik@amd.com> >> Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com> > > Ditto as for patch 3. > Will do. >> --- > >> @@ -853,8 +850,18 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) >> >> if (m->status & MCI_STATUS_SYNDV) { >> pr_cont(", Syndrome: 0x%016llx\n", m->synd); >> - pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx", >> - err->vi.amd.synd1, err->vi.amd.synd2); >> + if (mca_config & MCI_CONFIG_FRUTEXT) { >> + char frutext[17]; >> + >> + memset(frutext, 0, sizeof(frutext)); > > Why are you clearing it if you're overwriting it immediately? > Since its a local variable, wanted to ensure that the memory is zeroed out to prevent any issues with the %s specifier, used later on. Would you recommend removing that and using initializer instead for the string? >> + memcpy(&frutext[0], &err->vi.amd.synd1, 8); >> + memcpy(&frutext[8], &err->vi.amd.synd2, 8); >> + >> + pr_emerg(HW_ERR "FRU Text: %s", frutext); >> + } else { >> + pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx", >> + err->vi.amd.synd1, err->vi.amd.synd2); >> + } >> } >> >> pr_cont("\n"); >> -- >> 2.34.1 >> >
On Wed, Jun 26, 2024 at 01:00:30PM -0500, Naik, Avadhut wrote: > > > > Why are you clearing it if you're overwriting it immediately? > > > Since its a local variable, wanted to ensure that the memory is zeroed out to prevent > any issues with the %s specifier, used later on. What issues? > Would you recommend removing that and using initializer instead for the string? I'd recommend looking at what the code does and then really thinking whether that makes any sense.
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 2b43ba37bbda..c6dea9c12498 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -61,6 +61,7 @@ * - TCC bit is present in MCx_STATUS. */ #define MCI_CONFIG_MCAX 0x1 +#define MCI_CONFIG_FRUTEXT BIT_ULL(9) #define MCI_IPID_MCATYPE 0xFFFF0000 #define MCI_IPID_HWID 0xFFF @@ -199,6 +200,7 @@ struct mce_hw_err { struct { u64 synd1; u64 synd2; + u64 config; } amd; } vi; }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index fc69d244ca7f..f690905aa04f 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -798,6 +798,7 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) if (mce_flags.smca) { rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); + rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(bank), err.vi.amd.config); if (m->status & MCI_STATUS_SYNDV) { rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 7a15f0ca1bd1..ba8947983dc7 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -157,6 +157,8 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) fallthrough; /* MCA_CONFIG */ case 4: + err.vi.amd.config = *(i_mce + 3); + fallthrough; /* MCA_MISC0 */ case 3: m->misc = *(i_mce + 2); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 3bb0f8b39f97..cbd10e499a28 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -195,6 +195,8 @@ static void __print_mce(struct mce_hw_err *err) pr_cont("SYND2 %llx ", err->vi.amd.synd2); if (m->ipid) pr_cont("IPID %llx ", m->ipid); + if (err->vi.amd.config) + pr_cont("CONFIG %llx ", err->vi.amd.config); } pr_cont("\n"); @@ -667,6 +669,7 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) if (mce_flags.smca) { m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + err->vi.amd.config = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(i)); if (m->status & MCI_STATUS_SYNDV) { m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 69e12cb2f0de..6ae6b89b1a1e 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -795,6 +795,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) struct mce_hw_err *err = (struct mce_hw_err *)data; struct mce *m = &err->m; unsigned int fam = x86_family(m->cpuid); + u64 mca_config = err->vi.amd.config; int ecc; if (m->kflags & MCE_HANDLED_CEC) @@ -814,11 +815,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_PCC) ? "PCC" : "-")); if (boot_cpu_has(X86_FEATURE_SMCA)) { - u32 low, high; - u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); - - if (!rdmsr_safe(addr, &low, &high) && - (low & MCI_CONFIG_MCAX)) + if (mca_config & MCI_CONFIG_MCAX) pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); @@ -853,8 +850,18 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_SYNDV) { pr_cont(", Syndrome: 0x%016llx\n", m->synd); - pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx", - err->vi.amd.synd1, err->vi.amd.synd2); + if (mca_config & MCI_CONFIG_FRUTEXT) { + char frutext[17]; + + memset(frutext, 0, sizeof(frutext)); + memcpy(&frutext[0], &err->vi.amd.synd1, 8); + memcpy(&frutext[8], &err->vi.amd.synd2, 8); + + pr_emerg(HW_ERR "FRU Text: %s", frutext); + } else { + pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx", + err->vi.amd.synd1, err->vi.amd.synd2); + } } pr_cont("\n");