Message ID | 1623218580-41912-1-git-send-email-tanxiaofei@huawei.com |
---|---|
State | Superseded |
Headers | show |
Series | [v6] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode | expand |
Hi Rafael, On 2021/6/9 21:22, Rafael J. Wysocki wrote: > On Wed, Jun 9, 2021 at 8:06 AM Xiaofei Tan <tanxiaofei@huawei.com> wrote: >> >> Before commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea() >> synchronise with APEI's irq work"), do_sea() would unconditionally >> signal the affected task from the arch code. Since that change, >> the GHES driver sends the signals. > > Since this fixes a regression apparently introduced by the above > commit, please add a Fixes tag pointing to that commit to it. > OK. >> This exposes a problem as errors the GHES driver doesn't understand >> or doesn't handle effectively are silently ignored. It will cause >> the errors get taken again, and circulate endlessly. User-space task >> get stuck in this loop. >> >> Existing firmware on Kunpeng9xx systems reports cache errors with the >> 'ARM Processor Error' CPER records. >> >> Do memory failure handling for ARM Processor Error Section just like >> for Memory Error Section. > > So why is this the right thing to do? > > I guess it doesn't address the problem entirely, but only in this > particular case, so what if the firmware on some other platform > reports errors with a new type unknown to the GHES driver? Will the > problem show up again? Yes, it will. The GHES driver should give the right feedback to the arch code; that is, apei_claim_sea() or ghes_notify_sea() should not return 0 if the error is unknown. But this seems difficult to achieve with the current architecture of the GHES driver. > >> Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com> >> Reviewed-by: James Morse <james.morse@arm.com> >> >> --- >> Changes since v5: >> - Do some changes following James's suggestions: 1) optimize commit log >> 2) use err_info->length instead of err_info++' 3) some coding style >> advice. >> >> Changes since v4: >> - 1. Change the patch name from " ACPI / APEI: do memory failure on the >> physical address reported by ARM processor error section" to this >> more proper one. >> - 2. 
Add a comment in the code to tell why not filter out corrected >> error in an uncorrected section. >> >> Changes since v3: >> - Print unhandled error following James Morse's advice. >> >> Changes since v2: >> - Updated commit log >> --- >> drivers/acpi/apei/ghes.c | 81 ++++++++++++++++++++++++++++++++++++++---------- >> 1 file changed, 64 insertions(+), 17 deletions(-) >> >> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c >> index fce7ade..0c8330e 100644 >> --- a/drivers/acpi/apei/ghes.c >> +++ b/drivers/acpi/apei/ghes.c >> @@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head) >> gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len); >> } >> >> -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, >> - int sev) >> +static bool ghes_do_memory_failure(u64 physical_addr, int flags) >> { >> unsigned long pfn; >> - int flags = -1; >> - int sec_sev = ghes_severity(gdata->error_severity); >> - struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); >> >> if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE)) >> return false; >> >> - if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) >> - return false; >> - >> - pfn = mem_err->physical_addr >> PAGE_SHIFT; >> + pfn = PHYS_PFN(physical_addr); >> if (!pfn_valid(pfn)) { >> pr_warn_ratelimited(FW_WARN GHES_PFX >> "Invalid address in generic error data: %#llx\n", >> - mem_err->physical_addr); >> + physical_addr); >> return false; >> } >> >> + memory_failure_queue(pfn, flags); >> + return true; >> +} >> + >> +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, >> + int sev) >> +{ >> + int flags = -1; >> + int sec_sev = ghes_severity(gdata->error_severity); >> + struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); >> + >> + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) >> + return false; >> + >> /* iff following two events can be handled properly by now */ >> if (sec_sev == GHES_SEV_CORRECTED && 
>> (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED)) >> @@ -470,14 +477,56 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, >> if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE) >> flags = 0; >> >> - if (flags != -1) { >> - memory_failure_queue(pfn, flags); >> - return true; >> - } >> + if (flags != -1) >> + return ghes_do_memory_failure(mem_err->physical_addr, flags); >> >> return false; >> } >> >> +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev) >> +{ >> + struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata); >> + bool queued = false; >> + int sec_sev, i; >> + char *p; >> + >> + log_arm_hw_error(err); >> + >> + sec_sev = ghes_severity(gdata->error_severity); >> + if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE) >> + return false; >> + >> + p = (char *)(err + 1); >> + for (i = 0; i < err->err_info_num; i++) { >> + struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p; >> + bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR); >> + bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR); >> + const char *error_type = "unknown error"; >> + >> + /* >> + * The field (err_info->error_info & BIT(26)) is fixed to set to >> + * 1 in some old firmware of HiSilicon Kunpeng920. We assume that >> + * firmware won't mix corrected errors in an uncorrected section, >> + * and don't filter out 'corrected' error here. 
>> + */ >> + if (is_cache && has_pa) { >> + queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0); >> + p += err_info->length; >> + continue; >> + } >> + >> + if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs)) >> + error_type = cper_proc_error_type_strs[err_info->type]; >> + >> + pr_warn_ratelimited(FW_WARN GHES_PFX >> + "Unhandled processor error type: %s\n", >> + error_type); >> + p += err_info->length; >> + } >> + >> + return queued; >> +} >> + >> /* >> * PCIe AER errors need to be sent to the AER driver for reporting and >> * recovery. The GHES severities map to the following AER severities and >> @@ -605,9 +654,7 @@ static bool ghes_do_proc(struct ghes *ghes, >> ghes_handle_aer(gdata); >> } >> else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { >> - struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata); >> - >> - log_arm_hw_error(err); >> + queued = ghes_handle_arm_hw_error(gdata, sev); >> } else { >> void *err = acpi_hest_get_payload(gdata); >> >> -- >> 2.8.1 >> > > . >
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index fce7ade..0c8330e 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head) gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len); } -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, - int sev) +static bool ghes_do_memory_failure(u64 physical_addr, int flags) { unsigned long pfn; - int flags = -1; - int sec_sev = ghes_severity(gdata->error_severity); - struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE)) return false; - if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) - return false; - - pfn = mem_err->physical_addr >> PAGE_SHIFT; + pfn = PHYS_PFN(physical_addr); if (!pfn_valid(pfn)) { pr_warn_ratelimited(FW_WARN GHES_PFX "Invalid address in generic error data: %#llx\n", - mem_err->physical_addr); + physical_addr); return false; } + memory_failure_queue(pfn, flags); + return true; +} + +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, + int sev) +{ + int flags = -1; + int sec_sev = ghes_severity(gdata->error_severity); + struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); + + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) + return false; + /* iff following two events can be handled properly by now */ if (sec_sev == GHES_SEV_CORRECTED && (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED)) @@ -470,14 +477,56 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE) flags = 0; - if (flags != -1) { - memory_failure_queue(pfn, flags); - return true; - } + if (flags != -1) + return ghes_do_memory_failure(mem_err->physical_addr, flags); return false; } +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev) +{ + struct cper_sec_proc_arm 
*err = acpi_hest_get_payload(gdata); + bool queued = false; + int sec_sev, i; + char *p; + + log_arm_hw_error(err); + + sec_sev = ghes_severity(gdata->error_severity); + if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE) + return false; + + p = (char *)(err + 1); + for (i = 0; i < err->err_info_num; i++) { + struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p; + bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR); + bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR); + const char *error_type = "unknown error"; + + /* + * The field (err_info->error_info & BIT(26)) is fixed to set to + * 1 in some old firmware of HiSilicon Kunpeng920. We assume that + * firmware won't mix corrected errors in an uncorrected section, + * and don't filter out 'corrected' error here. + */ + if (is_cache && has_pa) { + queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0); + p += err_info->length; + continue; + } + + if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs)) + error_type = cper_proc_error_type_strs[err_info->type]; + + pr_warn_ratelimited(FW_WARN GHES_PFX + "Unhandled processor error type: %s\n", + error_type); + p += err_info->length; + } + + return queued; +} + /* * PCIe AER errors need to be sent to the AER driver for reporting and * recovery. The GHES severities map to the following AER severities and @@ -605,9 +654,7 @@ static bool ghes_do_proc(struct ghes *ghes, ghes_handle_aer(gdata); } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { - struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata); - - log_arm_hw_error(err); + queued = ghes_handle_arm_hw_error(gdata, sev); } else { void *err = acpi_hest_get_payload(gdata);