Message ID | 20250226221157.149406-3-Smita.KoralahalliChannabasappa@amd.com |
---|---|
State | New |
Headers | show |
Series | acpi/ghes, cper, cxl: Process CXL CPER Protocol errors | expand |
Smita Koralahalli wrote: > The CXL drivers use kernel trace functions for logging endpoint and > Restricted CXL host (RCH) Downstream Port RAS errors. Similar functionality > is required for CXL Root Ports, CXL Downstream Switch Ports, and CXL > Upstream Switch Ports. > > Introduce trace logging functions for both RAS correctable and > uncorrectable errors specific to CXL PCIe Ports. Use them to trace > FW-First Protocol errors. > > Co-developed-by: Terry Bowman <terry.bowman@amd.com> > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> Reviewed-by: Ira Weiny <ira.weiny@intel.com> [snip]
On Wed, Feb 26, 2025 at 10:11:57PM +0000, Smita Koralahalli wrote: Kind of a nit, but since these are exposed to user space suggest matching these names now. > +++ b/drivers/cxl/core/trace.h > @@ -48,6 +48,34 @@ > { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \ > ) > > +TRACE_EVENT(cxl_port_aer_uncorrectable_error, > + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), > + TP_ARGS(dev, status, fe, hl), > + TP_STRUCT__entry( > + __string(devname, dev_name(dev)) > + __string(parent, dev_name(dev->parent)) Above devname, parent > + __field(u32, status) > + __field(u32, first_error) > + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) > + ), > + TP_fast_assign( > + __assign_str(devname); > + __assign_str(parent); > + __entry->status = status; > + __entry->first_error = fe; > + /* > + * Embed the 512B headerlog data for user app retrieval and > + * parsing, but no need to print this in the trace buffer. > + */ > + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); > + ), > + TP_printk("device=%s host=%s status: '%s' first_error: '%s'", Above device, host > + __get_str(devname), __get_str(parent), > + show_uc_errs(__entry->status), > + show_uc_errs(__entry->first_error) > + ) > +); > + > TRACE_EVENT(cxl_aer_uncorrectable_error, > TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl), > TP_ARGS(cxlmd, status, fe, hl), > @@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error, > { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \ > ) > same thing below - > +TRACE_EVENT(cxl_port_aer_correctable_error, > + TP_PROTO(struct device *dev, u32 status), > + TP_ARGS(dev, status), > + TP_STRUCT__entry( > + __string(devname, dev_name(dev)) > + __string(parent, dev_name(dev->parent)) > + __field(u32, status) > + ), > + TP_fast_assign( > + __assign_str(devname); > + __assign_str(parent); > + __entry->status = status; > + ), > + TP_printk("device=%s host=%s status='%s'", > + __get_str(devname), __get_str(parent), > + show_ce_errs(__entry->status) > + ) > +); > + > TRACE_EVENT(cxl_aer_correctable_error, > TP_PROTO(const struct cxl_memdev *cxlmd, u32 status), > TP_ARGS(cxlmd, status), > -- > 2.17.1 >
Hi Alison, Thanks for the review. On 3/4/2025 10:56 AM, Alison Schofield wrote: > On Wed, Feb 26, 2025 at 10:11:57PM +0000, Smita Koralahalli wrote: > > Kind of a nit, but since these are exposed to user space suggest > matching these names now. > >> +++ b/drivers/cxl/core/trace.h >> @@ -48,6 +48,34 @@ >> { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \ >> ) >> >> +TRACE_EVENT(cxl_port_aer_uncorrectable_error, >> + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), >> + TP_ARGS(dev, status, fe, hl), >> + TP_STRUCT__entry( >> + __string(devname, dev_name(dev)) >> + __string(parent, dev_name(dev->parent)) > > Above devname, parent Ok I'm planning to keep as device and parent. Let me know if wording "host" is preferred over "parent". Thanks Smita > >> + __field(u32, status) >> + __field(u32, first_error) >> + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) >> + ), >> + TP_fast_assign( >> + __assign_str(devname); >> + __assign_str(parent); >> + __entry->status = status; >> + __entry->first_error = fe; >> + /* >> + * Embed the 512B headerlog data for user app retrieval and >> + * parsing, but no need to print this in the trace buffer. >> + */ >> + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); >> + ), >> + TP_printk("device=%s host=%s status: '%s' first_error: '%s'", > > Above device, host > >> + __get_str(devname), __get_str(parent), >> + show_uc_errs(__entry->status), >> + show_uc_errs(__entry->first_error) >> + ) >> +); >> + >> TRACE_EVENT(cxl_aer_uncorrectable_error, >> TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl), >> TP_ARGS(cxlmd, status, fe, hl), >> @@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error, >> { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \ >> ) >> > > same thing below - > > >> +TRACE_EVENT(cxl_port_aer_correctable_error, >> + TP_PROTO(struct device *dev, u32 status), >> + TP_ARGS(dev, status), >> + TP_STRUCT__entry( >> + __string(devname, dev_name(dev)) >> + __string(parent, dev_name(dev->parent)) >> + __field(u32, status) >> + ), >> + TP_fast_assign( >> + __assign_str(devname); >> + __assign_str(parent); >> + __entry->status = status; >> + ), >> + TP_printk("device=%s host=%s status='%s'", >> + __get_str(devname), __get_str(parent), >> + show_ce_errs(__entry->status) >> + ) >> +); >> + >> TRACE_EVENT(cxl_aer_correctable_error, >> TP_PROTO(const struct cxl_memdev *cxlmd, u32 status), >> TP_ARGS(cxlmd, status), >> -- >> 2.17.1 >>
On Tue, Mar 04, 2025 at 12:33:56PM -0800, Koralahalli Channabasappa, Smita wrote: snip > > > +TRACE_EVENT(cxl_port_aer_uncorrectable_error, > > > + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), > > > + TP_ARGS(dev, status, fe, hl), > > > + TP_STRUCT__entry( > > > + __string(devname, dev_name(dev)) > > > + __string(parent, dev_name(dev->parent)) > > > > Above devname, parent > > Ok I'm planning to keep as device and parent. Let me know if wording "host" > is preferred over "parent". Take a look at these in the same file that use memdev, 'host'. Maybe you want to be similar. TRACE_EVENT(cxl_aer_uncorrectable_error, TRACE_EVENT(cxl_aer_correctable_error, snip >
On Tue, Mar 04, 2025 at 01:10:11PM -0800, Alison Schofield wrote: > On Tue, Mar 04, 2025 at 12:33:56PM -0800, Koralahalli Channabasappa, Smita wrote: > snip > > > > +TRACE_EVENT(cxl_port_aer_uncorrectable_error, > > > > + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), > > > > + TP_ARGS(dev, status, fe, hl), > > > > + TP_STRUCT__entry( > > > > + __string(devname, dev_name(dev)) > > > > + __string(parent, dev_name(dev->parent)) > > > > > > Above devname, parent > > > > Ok I'm planning to keep as device and parent. Let me know if wording "host" > > is preferred over "parent". > > Take a look at these in the same file that use memdev, 'host'. > Maybe you want to be similar. > > TRACE_EVENT(cxl_aer_uncorrectable_error, > TRACE_EVENT(cxl_aer_correctable_error, BTW - I wasn't being intentionally vague. I don't know what is the best lingo. Do memdev's have hosts and ports have parents? If that's the right lingo, then go with it. > > snip > > >
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index 8c596f035095..0234645a9eef 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -7,6 +7,30 @@ #include <cxlmem.h> #include "trace.h" +static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev, + struct cxl_ras_capability_regs ras_cap) +{ + u32 status = ras_cap.cor_status & ~ras_cap.cor_mask; + + trace_cxl_port_aer_correctable_error(&pdev->dev, status); +} + +static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev, + struct cxl_ras_capability_regs ras_cap) +{ + u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask; + u32 fe; + + if (hweight32(status) > 1) + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + ras_cap.cap_control)); + else + fe = status; + + trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe, + ras_cap.header_log); +} + static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, struct cxl_ras_capability_regs ras_cap) { @@ -49,11 +73,24 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment, data->prot_err.agent_addr.bus, devfn); + int port_type; guard(device)(&pdev->dev); if (!pdev) return; + port_type = pci_pcie_type(pdev); + if (port_type == PCI_EXP_TYPE_ROOT_PORT || + port_type == PCI_EXP_TYPE_DOWNSTREAM || + port_type == PCI_EXP_TYPE_UPSTREAM) { + if (data->severity == AER_CORRECTABLE) + cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap); + else + cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap); + + return; + } + if (data->severity == AER_CORRECTABLE) cxl_cper_trace_corr_prot_err(pdev, data->ras_cap); else diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index e3f842dcdf1d..220a667ff377 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -48,6 +48,34 @@ { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \ ) +TRACE_EVENT(cxl_port_aer_uncorrectable_error, + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), + TP_ARGS(dev, status, fe, hl), + TP_STRUCT__entry( + __string(devname, dev_name(dev)) + __string(parent, dev_name(dev->parent)) + __field(u32, status) + __field(u32, first_error) + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) + ), + TP_fast_assign( + __assign_str(devname); + __assign_str(parent); + __entry->status = status; + __entry->first_error = fe; + /* + * Embed the 512B headerlog data for user app retrieval and + * parsing, but no need to print this in the trace buffer. + */ + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); + ), + TP_printk("device=%s host=%s status: '%s' first_error: '%s'", + __get_str(devname), __get_str(parent), + show_uc_errs(__entry->status), + show_uc_errs(__entry->first_error) + ) +); + TRACE_EVENT(cxl_aer_uncorrectable_error, TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl), TP_ARGS(cxlmd, status, fe, hl), @@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error, { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \ ) +TRACE_EVENT(cxl_port_aer_correctable_error, + TP_PROTO(struct device *dev, u32 status), + TP_ARGS(dev, status), + TP_STRUCT__entry( + __string(devname, dev_name(dev)) + __string(parent, dev_name(dev->parent)) + __field(u32, status) + ), + TP_fast_assign( + __assign_str(devname); + __assign_str(parent); + __entry->status = status; + ), + TP_printk("device=%s host=%s status='%s'", + __get_str(devname), __get_str(parent), + show_ce_errs(__entry->status) + ) +); + TRACE_EVENT(cxl_aer_correctable_error, TP_PROTO(const struct cxl_memdev *cxlmd, u32 status), TP_ARGS(cxlmd, status),