Message ID | 1439916257-5483-2-git-send-email-fu.wei@linaro.org |
---|---|
State | New |
Headers | show |
Hi Borislav, Thanks for your review. I totally agree all your comment I have made a new patch, and that is in testing, I will post it ASAP On 16 December 2015 at 18:29, Borislav Petkov <bp@alien8.de> wrote: > On Wed, Aug 19, 2015 at 12:44:16AM +0800, fu.wei@linaro.org wrote: >> From: Huang Ying <ying.huang@intel.com> >> >> Under normal circumstances, when a hardware error occurs, kernel will >> be notified via NMI, MCE or some other method, then kernel will >> process the error condition, report it, and recover it if possible. >> But sometime, the situation is so bad, so that firmware may choose to >> reset directly without notifying Linux kernel. >> >> Linux kernel can use the Boot Error Record Table (BERT) to get the >> un-notified hardware errors that occurred in a previous boot. In this >> patch, the error information is reported via printk. >> >> For more information about BERT, please refer to ACPI Specification >> version 6.0, section 18.3.1: >> http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf >> >> [Tony: Applied some cleanups suggested by Fu Wei] >> [Fu Wei: delete EXPORT_SYMBOL_GPL(bert_disable), and do some cleanups] >> >> Signed-off-by: Huang Ying <ying.huang@intel.com> >> Signed-off-by: Tomasz Nowicki <tomasz.nowicki@linaro.org> >> Signed-off-by: Chen, Gong <gong.chen@linux.intel.com> >> Tested-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org> >> Tested-by: Fu Wei <fu.wei@linaro.org> >> Signed-off-by: Tony Luck <tony.luck@intel.com> >> Signed-off-by: Fu Wei <fu.wei@linaro.org> >> --- >> Documentation/kernel-parameters.txt | 3 + >> drivers/acpi/apei/Makefile | 2 +- >> drivers/acpi/apei/bert.c | 162 ++++++++++++++++++++++++++++++++++++ >> include/acpi/apei.h | 1 + >> 4 files changed, 167 insertions(+), 1 deletion(-) >> >> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt >> index 1d6f045..7c6402c 100644 >> --- a/Documentation/kernel-parameters.txt >> +++ b/Documentation/kernel-parameters.txt >> @@ -554,6 +554,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. >> >> bootmem_debug [KNL] Enable bootmem allocator debug messages. >> >> + bert_disable [ACPI] >> + Disable Boot Error Record Table (BERT) support. > > Please document what that new parameter is good for. > >> + >> bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards) >> bttv.radio= Most important insmod options are available as >> kernel args too. >> diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile >> index 5d575a9..e50573d 100644 >> --- a/drivers/acpi/apei/Makefile >> +++ b/drivers/acpi/apei/Makefile >> @@ -3,4 +3,4 @@ obj-$(CONFIG_ACPI_APEI_GHES) += ghes.o >> obj-$(CONFIG_ACPI_APEI_EINJ) += einj.o >> obj-$(CONFIG_ACPI_APEI_ERST_DEBUG) += erst-dbg.o >> >> -apei-y := apei-base.o hest.o erst.o >> +apei-y := apei-base.o hest.o erst.o bert.o >> diff --git a/drivers/acpi/apei/bert.c b/drivers/acpi/apei/bert.c >> new file mode 100644 >> index 0000000..1426227 >> --- /dev/null >> +++ b/drivers/acpi/apei/bert.c >> @@ -0,0 +1,162 @@ >> +/* >> + * APEI Boot Error Record Table (BERT) support >> + * >> + * Copyright 2011 Intel Corp. >> + * Author: Huang Ying <ying.huang@intel.com> >> + * >> + * Under normal circumstances, when a hardware error occurs, kernel >> + * will be notified via NMI, MCE or some other method, then kernel >> + * will process the error condition, report it, and recover it if >> + * possible. But sometime, the situation is so bad, so that firmware >> + * may choose to reset directly without notifying Linux kernel. >> + * >> + * Linux kernel can use the Boot Error Record Table (BERT) to get the >> + * un-notified hardware errors that occurred in a previous boot. >> + * >> + * For more information about BERT, please refer to ACPI Specification >> + * version 4.0, section 17.3.1 >> + * >> + * This file is licensed under GPLv2. >> + * >> + */ >> + >> +#include <linux/kernel.h> >> +#include <linux/module.h> >> +#include <linux/init.h> >> +#include <linux/acpi.h> >> +#include <linux/io.h> >> + >> +#include "apei-internal.h" >> + >> +#define BERT_PFX "BERT: " > > This is done with pr_fmt > >> +static int bert_disable; >> + >> +static void __init bert_print_all(struct acpi_hest_generic_status *region, >> + unsigned int region_len) >> +{ >> + struct acpi_hest_generic_status *estatus = region; >> + int remain = region_len; >> + u32 estatus_len; >> + int first = 1; > > if (!estatus->block_status) > return; > >> + while (remain > sizeof(struct acpi_hest_generic_status)) { >> + /* No more error record */ > > ... records */ > >> + if (!estatus->block_status) >> + break; > > This test should happen when you enter the function (see above) and when > you assign to estatus, i.e. below, at the end of the while loop body: > > estatus = (void *)estatus + estatus_len; > if (!estatus->block_status) > break; > >> + >> + estatus_len = cper_estatus_len(estatus); >> + if (estatus_len < sizeof(struct acpi_hest_generic_status) || >> + remain < estatus_len) { >> + pr_err(FW_BUG BERT_PFX >> + "Invalid error status block with length %u\n", > > "Invalid status block length (%u)" > >> + estatus_len); >> + return; >> + } >> + >> + if (cper_estatus_check(estatus)) { >> + pr_err(FW_BUG BERT_PFX "Invalid error status block\n"); > > "Invalid error record." >> + goto next; >> + } >> + >> + if (first) { >> + pr_info(HW_ERR "Error record from previous boot:\n"); > > Error records<--- plural. > >> + first = 0; >> + } > > We have pr_info_once() for this. > >> + >> + cper_estatus_print(KERN_INFO HW_ERR, estatus); >> +next: >> + estatus = (void *)estatus + estatus_len; >> + remain -= estatus_len; >> + } >> +} >> + >> +static int __init setup_bert_disable(char *str) >> +{ >> + bert_disable = 1; >> + >> + return 0; >> +} >> +__setup("bert_disable", setup_bert_disable); >> + >> +static int __init bert_check_table(struct acpi_table_bert *bert_tab) >> +{ >> + if (bert_tab->header.length < sizeof(struct acpi_table_bert)) >> + return -EINVAL; > > \n here > >> + if (bert_tab->region_length && > > This test looks redundant if you're going to compare it to the size of > the BERT region struct below. > >> + bert_tab->region_length < sizeof(struct acpi_bert_region)) >> + return -EINVAL; >> + >> + return 0; >> +} >> + >> +static int __init bert_init(void) >> +{ >> + struct acpi_hest_generic_status *bert_region; >> + struct acpi_table_bert *bert_tab; >> + unsigned int region_len; >> + acpi_status status; >> + struct resource *r; >> + int rc = -EINVAL; >> + >> + if (acpi_disabled) >> + goto out; >> + >> + if (bert_disable) { >> + pr_info(BERT_PFX "Boot Error Record Table (BERT) support is disabled.\n"); >> + goto out; >> + } >> + >> + status = acpi_get_table(ACPI_SIG_BERT, 0, >> + (struct acpi_table_header **)&bert_tab); > > No need to break that line, just leave it stick out. > >> + if (status == AE_NOT_FOUND) { >> + pr_err(BERT_PFX "Table is not found!\n"); > > This will get issued on *all* machines which don't have BERT - which > is the majority now, I'm afraid - and that information is useless to > people. It's not like they can do anything about it short of overriding > their own firmware. > > So please remove that printk. > >> + goto out; >> + } else if (ACPI_FAILURE(status)) { >> + const char *msg = acpi_format_exception(status); >> + >> + pr_err(BERT_PFX "Failed to get table, %s\n", msg); >> + goto out; >> + } >> + >> + rc = bert_check_table(bert_tab); >> + if (rc) { >> + pr_err(FW_BUG BERT_PFX "BERT table is invalid\n"); > > Prefix already has "BERT". Either write it out: > > "BERT: Boot Error Record Table invalid" > > or make it shorter: > > "BERT: table invalid." > >> + goto out; >> + } >> + >> + region_len = bert_tab->region_length; >> + if (!region_len) { >> + rc = 0; >> + goto out; >> + } > > Huh, we just checked ->region_length in bert_check_table(). This test is > redundant. > >> + >> + r = request_mem_region(bert_tab->address, region_len, "APEI BERT"); >> + if (!r) { >> + pr_err(BERT_PFX "Can't request iomem region <%016llx-%016llx>\n", >> + (unsigned long long)bert_tab->address, >> + (unsigned long long)bert_tab->address + region_len - 1); >> + rc = -EIO; >> + goto out; >> + } >> + >> + bert_region = ioremap_cache(bert_tab->address, region_len); >> + if (!bert_region) { >> + rc = -ENOMEM; >> + goto out_release; >> + } >> + >> + bert_print_all(bert_region, region_len); >> + >> + iounmap(bert_region); >> + >> +out_release: >> + release_mem_region(bert_tab->address, region_len); >> +out: >> + if (rc) >> + bert_disable = 1; > > This assignment is redundant. We're not testing it anywhere else after > bert_init() has run. > >> + >> + return rc; >> +} >> + >> +late_initcall(bert_init); >> diff --git a/include/acpi/apei.h b/include/acpi/apei.h >> index 76284bb..284801a 100644 >> --- a/include/acpi/apei.h >> +++ b/include/acpi/apei.h >> @@ -23,6 +23,7 @@ extern bool ghes_disable; >> #else >> #define ghes_disable 1 >> #endif >> +extern int bert_disable; >> >> #ifdef CONFIG_ACPI_APEI >> void __init acpi_hest_init(void); >> -- >> 2.4.3 >> >> > > -- > Regards/Gruss, > Boris. > > ECO tip #101: Trim your mails when you reply. -- Best regards, Fu Wei Software Engineer Red Hat Software (Beijing) Co.,Ltd.Shanghai Branch Ph: +86 21 61221326(direct) Ph: +86 186 2020 4684 (mobile) Room 1512, Regus One Corporate Avenue,Level 15, One Corporate Avenue,222 Hubin Road,Huangpu District, Shanghai,China 200021 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1d6f045..7c6402c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -554,6 +554,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. bootmem_debug [KNL] Enable bootmem allocator debug messages. + bert_disable [ACPI] + Disable Boot Error Record Table (BERT) support. + bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards) bttv.radio= Most important insmod options are available as kernel args too. diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile index 5d575a9..e50573d 100644 --- a/drivers/acpi/apei/Makefile +++ b/drivers/acpi/apei/Makefile @@ -3,4 +3,4 @@ obj-$(CONFIG_ACPI_APEI_GHES) += ghes.o obj-$(CONFIG_ACPI_APEI_EINJ) += einj.o obj-$(CONFIG_ACPI_APEI_ERST_DEBUG) += erst-dbg.o -apei-y := apei-base.o hest.o erst.o +apei-y := apei-base.o hest.o erst.o bert.o diff --git a/drivers/acpi/apei/bert.c b/drivers/acpi/apei/bert.c new file mode 100644 index 0000000..1426227 --- /dev/null +++ b/drivers/acpi/apei/bert.c @@ -0,0 +1,162 @@ +/* + * APEI Boot Error Record Table (BERT) support + * + * Copyright 2011 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * + * Under normal circumstances, when a hardware error occurs, kernel + * will be notified via NMI, MCE or some other method, then kernel + * will process the error condition, report it, and recover it if + * possible. But sometime, the situation is so bad, so that firmware + * may choose to reset directly without notifying Linux kernel. + * + * Linux kernel can use the Boot Error Record Table (BERT) to get the + * un-notified hardware errors that occurred in a previous boot. + * + * For more information about BERT, please refer to ACPI Specification + * version 4.0, section 17.3.1 + * + * This file is licensed under GPLv2. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/acpi.h> +#include <linux/io.h> + +#include "apei-internal.h" + +#define BERT_PFX "BERT: " + +static int bert_disable; + +static void __init bert_print_all(struct acpi_hest_generic_status *region, + unsigned int region_len) +{ + struct acpi_hest_generic_status *estatus = region; + int remain = region_len; + u32 estatus_len; + int first = 1; + + while (remain > sizeof(struct acpi_hest_generic_status)) { + /* No more error record */ + if (!estatus->block_status) + break; + + estatus_len = cper_estatus_len(estatus); + if (estatus_len < sizeof(struct acpi_hest_generic_status) || + remain < estatus_len) { + pr_err(FW_BUG BERT_PFX + "Invalid error status block with length %u\n", + estatus_len); + return; + } + + if (cper_estatus_check(estatus)) { + pr_err(FW_BUG BERT_PFX "Invalid error status block\n"); + goto next; + } + + if (first) { + pr_info(HW_ERR "Error record from previous boot:\n"); + first = 0; + } + + cper_estatus_print(KERN_INFO HW_ERR, estatus); +next: + estatus = (void *)estatus + estatus_len; + remain -= estatus_len; + } +} + +static int __init setup_bert_disable(char *str) +{ + bert_disable = 1; + + return 0; +} +__setup("bert_disable", setup_bert_disable); + +static int __init bert_check_table(struct acpi_table_bert *bert_tab) +{ + if (bert_tab->header.length < sizeof(struct acpi_table_bert)) + return -EINVAL; + if (bert_tab->region_length && + bert_tab->region_length < sizeof(struct acpi_bert_region)) + return -EINVAL; + + return 0; +} + +static int __init bert_init(void) +{ + struct acpi_hest_generic_status *bert_region; + struct acpi_table_bert *bert_tab; + unsigned int region_len; + acpi_status status; + struct resource *r; + int rc = -EINVAL; + + if (acpi_disabled) + goto out; + + if (bert_disable) { + pr_info(BERT_PFX "Boot Error Record Table (BERT) support is disabled.\n"); + goto out; + } + + status = acpi_get_table(ACPI_SIG_BERT, 0, + (struct acpi_table_header **)&bert_tab); + if (status == AE_NOT_FOUND) { + pr_err(BERT_PFX "Table is not found!\n"); + goto out; + } else if (ACPI_FAILURE(status)) { + const char *msg = acpi_format_exception(status); + + pr_err(BERT_PFX "Failed to get table, %s\n", msg); + goto out; + } + + rc = bert_check_table(bert_tab); + if (rc) { + pr_err(FW_BUG BERT_PFX "BERT table is invalid\n"); + goto out; + } + + region_len = bert_tab->region_length; + if (!region_len) { + rc = 0; + goto out; + } + + r = request_mem_region(bert_tab->address, region_len, "APEI BERT"); + if (!r) { + pr_err(BERT_PFX "Can't request iomem region <%016llx-%016llx>\n", + (unsigned long long)bert_tab->address, + (unsigned long long)bert_tab->address + region_len - 1); + rc = -EIO; + goto out; + } + + bert_region = ioremap_cache(bert_tab->address, region_len); + if (!bert_region) { + rc = -ENOMEM; + goto out_release; + } + + bert_print_all(bert_region, region_len); + + iounmap(bert_region); + +out_release: + release_mem_region(bert_tab->address, region_len); +out: + if (rc) + bert_disable = 1; + + return rc; +} + +late_initcall(bert_init); diff --git a/include/acpi/apei.h b/include/acpi/apei.h index 76284bb..284801a 100644 --- a/include/acpi/apei.h +++ b/include/acpi/apei.h @@ -23,6 +23,7 @@ extern bool ghes_disable; #else #define ghes_disable 1 #endif +extern int bert_disable; #ifdef CONFIG_ACPI_APEI void __init acpi_hest_init(void);