diff mbox series

[v2,2/2] RAS/AMD/ATL: Translate normalized to system physical addresses using PRM

Message ID 20240506174721.72018-3-john.allen@amd.com
State New
Headers show
Series PRM handler direct call interface | expand

Commit Message

John Allen May 6, 2024, 5:47 p.m. UTC
Future AMD platforms will provide a UEFI PRM module that implements a
number of address translation PRM handlers. This will provide an
interface for the OS to call platform specific code without requiring
the use of SMM or other heavy firmware operations.

AMD Zen-based systems report memory error addresses through Machine
Check banks representing Unified Memory Controllers (UMCs) in the form
of UMC relative "normalized" addresses. A normalized address must be
converted to a system physical address to be usable by the OS.

Add support for the normalized to system physical address translation
PRM handler in the AMD Address Translation Library and prefer it over
native code if available. The GUID and parameter buffer structure are
specific to the normalized to system physical address handler provided
by the address translation PRM module included in future AMD systems.

The address translation PRM module is documented in chapter 22 of the
publicly available "AMD Family 1Ah Models 00h–0Fh and Models 10h–1Fh
ACPI v6.5 Porting Guide":
https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/programmer-references/58088-0.75-pub.pdf

Signed-off-by: John Allen <john.allen@amd.com>
---
v2:
  - Make norm_to_sys_prm_handler_guid static.
  - Change pr_info statements to more appropriate pr_debug and
    pr_info_once statements.
---
 drivers/ras/amd/atl/Makefile   |  1 +
 drivers/ras/amd/atl/internal.h |  2 ++
 drivers/ras/amd/atl/prm.c      | 61 ++++++++++++++++++++++++++++++++++
 drivers/ras/amd/atl/umc.c      |  5 +++
 4 files changed, 69 insertions(+)
 create mode 100644 drivers/ras/amd/atl/prm.c

Comments

Borislav Petkov June 28, 2024, 7:45 a.m. UTC | #1
On Mon, May 06, 2024 at 05:47:21PM +0000, John Allen wrote:
> Future AMD platforms will provide a UEFI PRM module that implements a
> number of address translation PRM handlers. This will provide an
> interface for the OS to call platform specific code without requiring
> the use of SMM or other heavy firmware operations.
> 
> AMD Zen-based systems report memory error addresses through Machine
> Check banks representing Unified Memory Controllers (UMCs) in the form
> of UMC relative "normalized" addresses. A normalized address must be
> converted to a system physical address to be usable by the OS.

This should be your first paragraph.

> Add support for the normalized to system physical address translation
> PRM handler in the AMD Address Translation Library and prefer it over
> native code if available. The GUID and parameter buffer structure are
> specific to the normalized to system physical address handler provided
> by the address translation PRM module included in future AMD systems.
> 
> The address translation PRM module is documented in chapter 22 of the
> publicly available "AMD Family 1Ah Models 00h–0Fh and Models 10h–1Fh
> ACPI v6.5 Porting Guide":
> https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/programmer-references/58088-0.75-pub.pdf

Those URLs are flaky and become invalid over time. When you quote
a document, quote it in such a way so that searching for it on the web,
can find it. The name above works for me so that's good.

> +#include "internal.h"
> +
> +#if defined(CONFIG_ACPI_PRMT)

Instead of that ifdeffery you could do:

config AMD_ATL_PRM
	depends on AMD_ATL && ACPI_PRMT

and it'll get enabled automatically and then you don't need the empty
stub either.

> +#include <linux/prmt.h>
> +
> +struct prm_umc_param_buffer_norm {

What's a prm_umc_param_buffer_norm?

> +	u64 norm_addr;
> +	u8 socket;
> +	u64 umc_bank_inst_id;
> +	void *output_buffer;

Use the usual short versions for such standard names: "out_buf"

> +} __packed;
> +
> +static const guid_t norm_to_sys_prm_handler_guid = GUID_INIT(0xE7180659, 0xA65D,
> +							     0x451D, 0x92, 0xCD,
> +							     0x2B, 0x56, 0xF1, 0x2B,
> +							     0xEB, 0xA6);

When you define such long variable names, your lines stick out
unnecessarily. Shorten pls.

> +unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id, unsigned long addr)

bank_id is fine.

> +{
> +	struct prm_umc_param_buffer_norm param_buffer;

... p_buf;

> +	unsigned long ret_addr;
> +	int ret;
> +
> +	param_buffer.norm_addr        = addr;
> +	param_buffer.socket           = socket_id;
> +	param_buffer.umc_bank_inst_id = umc_bank_inst_id;
> +	param_buffer.output_buffer    = &ret_addr;
> +
> +	ret = acpi_call_prm_handler(norm_to_sys_prm_handler_guid, &param_buffer);
> +	if (!ret)
> +		return ret_addr;
> +
> +	if (ret == -ENODEV)
> +		pr_debug("PRM module/handler not available\n");
> +	else
> +		pr_notice_once("PRM address translation failed\n");
> +
> +	return ret;
> +}
John Allen July 1, 2024, 4:23 p.m. UTC | #2
On Fri, Jun 28, 2024 at 09:45:22AM +0200, Borislav Petkov wrote:
> On Mon, May 06, 2024 at 05:47:21PM +0000, John Allen wrote:
> > Future AMD platforms will provide a UEFI PRM module that implements a
> > number of address translation PRM handlers. This will provide an
> > interface for the OS to call platform specific code without requiring
> > the use of SMM or other heavy firmware operations.
> > 
> > AMD Zen-based systems report memory error addresses through Machine
> > Check banks representing Unified Memory Controllers (UMCs) in the form
> > of UMC relative "normalized" addresses. A normalized address must be
> > converted to a system physical address to be usable by the OS.
> 
> This should be your first paragraph.
> 
> > Add support for the normalized to system physical address translation
> > PRM handler in the AMD Address Translation Library and prefer it over
> > native code if available. The GUID and parameter buffer structure are
> > specific to the normalized to system physical address handler provided
> > by the address translation PRM module included in future AMD systems.
> > 
> > The address translation PRM module is documented in chapter 22 of the
> > publicly available "AMD Family 1Ah Models 00h–0Fh and Models 10h–1Fh
> > ACPI v6.5 Porting Guide":
> > https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/programmer-references/58088-0.75-pub.pdf
> 
> Those URLs are flaky and become invalid over time. When you quote
> a document, quote it in such a way so that searching for it on the web,
> can find it. The name above works for me so that's good.
> 
> > +#include "internal.h"
> > +
> > +#if defined(CONFIG_ACPI_PRMT)
> 
> Instead of that ifdeffery you could do:
> 
> config AMD_ATL_PRM
> 	depends on AMD_ATL && ACPI_PRMT
> 
> and it'll get enabled automatically and then you don't need the empty
> stub either.
> 
> > +#include <linux/prmt.h>
> > +
> > +struct prm_umc_param_buffer_norm {
> 
> What's a prm_umc_param_buffer_norm?

This is the param buffer struct used for norm -> X tranlations. I can
shorten and clarify this along with the others you pointed out. Maybe
"param_buffer_norm" instead and a comment explaining the purpose?

Thanks,
John

> 
> > +	u64 norm_addr;
> > +	u8 socket;
> > +	u64 umc_bank_inst_id;
> > +	void *output_buffer;
> 
> Use the usual short versions for such standard names: "out_buf"
> 
> > +} __packed;
> > +
> > +static const guid_t norm_to_sys_prm_handler_guid = GUID_INIT(0xE7180659, 0xA65D,
> > +							     0x451D, 0x92, 0xCD,
> > +							     0x2B, 0x56, 0xF1, 0x2B,
> > +							     0xEB, 0xA6);
> 
> When you define such long variable names, your lines stick out
> unnecessarily. Shorten pls.
> 
> > +unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id, unsigned long addr)
> 
> bank_id is fine.
> 
> > +{
> > +	struct prm_umc_param_buffer_norm param_buffer;
> 
> ... p_buf;
> 
> > +	unsigned long ret_addr;
> > +	int ret;
> > +
> > +	param_buffer.norm_addr        = addr;
> > +	param_buffer.socket           = socket_id;
> > +	param_buffer.umc_bank_inst_id = umc_bank_inst_id;
> > +	param_buffer.output_buffer    = &ret_addr;
> > +
> > +	ret = acpi_call_prm_handler(norm_to_sys_prm_handler_guid, &param_buffer);
> > +	if (!ret)
> > +		return ret_addr;
> > +
> > +	if (ret == -ENODEV)
> > +		pr_debug("PRM module/handler not available\n");
> > +	else
> > +		pr_notice_once("PRM address translation failed\n");
> > +
> > +	return ret;
> > +}
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> https://people.kernel.org/tglx/notes-about-netiquette
Borislav Petkov July 1, 2024, 4:35 p.m. UTC | #3
On Mon, Jul 01, 2024 at 11:23:36AM -0500, John Allen wrote:
> This is the param buffer struct used for norm -> X tranlations. I can
> shorten and clarify this along with the others you pointed out. Maybe
> "param_buffer_norm" instead and a comment explaining the purpose?

What's wrong with

struct param_buffer

simply?

It is obvious from the code that it is used in the normalized -> physical
address translation.

Btw, pls do me a favor and trim your replies like I just did.

Thx.
John Allen July 1, 2024, 4:43 p.m. UTC | #4
On Mon, Jul 01, 2024 at 06:35:05PM +0200, Borislav Petkov wrote:
> On Mon, Jul 01, 2024 at 11:23:36AM -0500, John Allen wrote:
> > This is the param buffer struct used for norm -> X tranlations. I can
> > shorten and clarify this along with the others you pointed out. Maybe
> > "param_buffer_norm" instead and a comment explaining the purpose?
> 
> What's wrong with
> 
> struct param_buffer
> 
> simply?
> 
> It is obvious from the code that it is used in the normalized -> physical
> address translation.

This is because the spec defines different param buffer structures for
different types of translation. We can call this just param_buffer for
now, but would need to be renamed if/when we add use cases for the other
translation handlers. My preference would be to sort of future-proof
the name now, but I don't have an issue calling it param_buffer now and
changing it later if that's what you'd prefer.

> Btw, pls do me a favor and trim your replies like I just did.

Sure, sorry about that.

Thanks,
John
Borislav Petkov July 1, 2024, 4:59 p.m. UTC | #5
On Mon, Jul 01, 2024 at 11:43:35AM -0500, John Allen wrote:
> This is because the spec defines different param buffer structures for
> different types of translation. We can call this just param_buffer for
> now, but would need to be renamed if/when we add use cases for the other
> translation handlers. My preference would be to sort of future-proof
> the name now, but I don't have an issue calling it param_buffer now and
> changing it later if that's what you'd prefer.

Sure:

/*
 * PRM parameter buffer - normalized to system physical address, as described
 * in the section "PRM Parameter Buffer" in the aforementioned spec.
 */
struct norm_to_spa_param_buf {
	...
} __packed;

and you'll have to mention the spec in the prm.c file, at the top.

This way readers can immediately map it to the place in the spec.

Thx.
John Allen July 3, 2024, 8:14 p.m. UTC | #6
On Fri, Jun 28, 2024 at 09:45:22AM +0200, Borislav Petkov wrote:
> On Mon, May 06, 2024 at 05:47:21PM +0000, John Allen wrote:
> > +#include "internal.h"
> > +
> > +#if defined(CONFIG_ACPI_PRMT)
> 
> Instead of that ifdeffery you could do:
> 
> config AMD_ATL_PRM
> 	depends on AMD_ATL && ACPI_PRMT
> 
> and it'll get enabled automatically and then you don't need the empty
> stub either.

I'm not sure this works the way we need it to. If ACPI_PRMT is not
enabled, then the norm to sys translation function will be referenced by
the base AMD ATL, but will the symbol will not be found since the the
PRM file doesn't get compiled.

I added the AMD_ATL_PRM config and added the following to the ATL
Makefile:
amd_atl-$(CONFIG_AMD_ATL_PRM) += prm.o

instead of:

amd_atl-y		+= prm.o

Is there another way you had in mind to make the additional config
option work as expected?

Thanks,
John
Borislav Petkov July 3, 2024, 10:46 p.m. UTC | #7
On Wed, Jul 03, 2024 at 03:14:53PM -0500, John Allen wrote:
> I'm not sure this works the way we need it to. If ACPI_PRMT is not
> enabled, then the norm to sys translation function will be referenced by
> the base AMD ATL, but will the symbol will not be found since the the
> PRM file doesn't get compiled.

So you don't delete the stub but you put it in the internal.h header:

#ifndef CONFIG_AMD_ATL_PRM
+unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id, unsigned long addr)
+{
+       return -ENODEV;
+}
+
+#endif

Don't be afraid to grep the tree - there are gazillion examples how stuff like
that is usually done.
diff mbox series

Patch

diff --git a/drivers/ras/amd/atl/Makefile b/drivers/ras/amd/atl/Makefile
index 4acd5f05bd9c..8f1afa793e3b 100644
--- a/drivers/ras/amd/atl/Makefile
+++ b/drivers/ras/amd/atl/Makefile
@@ -14,5 +14,6 @@  amd_atl-y		+= denormalize.o
 amd_atl-y		+= map.o
 amd_atl-y		+= system.o
 amd_atl-y		+= umc.o
+amd_atl-y		+= prm.o
 
 obj-$(CONFIG_AMD_ATL)	+= amd_atl.o
diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h
index 5de69e0bb0f9..f739dcada126 100644
--- a/drivers/ras/amd/atl/internal.h
+++ b/drivers/ras/amd/atl/internal.h
@@ -234,6 +234,8 @@  int dehash_address(struct addr_ctx *ctx);
 unsigned long norm_to_sys_addr(u8 socket_id, u8 die_id, u8 coh_st_inst_id, unsigned long addr);
 unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
 
+unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id, unsigned long addr);
+
 /*
  * Make a gap in @data that is @num_bits long starting at @bit_num.
  * e.g. data		= 11111111'b
diff --git a/drivers/ras/amd/atl/prm.c b/drivers/ras/amd/atl/prm.c
new file mode 100644
index 000000000000..8e96a6370ae3
--- /dev/null
+++ b/drivers/ras/amd/atl/prm.c
@@ -0,0 +1,61 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AMD Address Translation Library
+ *
+ * prm.c : Plumbing code to UEFI Platform Runtime Mechanism (PRM)
+ *
+ * Copyright (c) 2024, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: John Allen <john.allen@amd.com>
+ */
+
+#include "internal.h"
+
+#if defined(CONFIG_ACPI_PRMT)
+
+#include <linux/prmt.h>
+
+struct prm_umc_param_buffer_norm {
+	u64 norm_addr;
+	u8 socket;
+	u64 umc_bank_inst_id;
+	void *output_buffer;
+} __packed;
+
+static const guid_t norm_to_sys_prm_handler_guid = GUID_INIT(0xE7180659, 0xA65D,
+							     0x451D, 0x92, 0xCD,
+							     0x2B, 0x56, 0xF1, 0x2B,
+							     0xEB, 0xA6);
+
+unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id, unsigned long addr)
+{
+	struct prm_umc_param_buffer_norm param_buffer;
+	unsigned long ret_addr;
+	int ret;
+
+	param_buffer.norm_addr        = addr;
+	param_buffer.socket           = socket_id;
+	param_buffer.umc_bank_inst_id = umc_bank_inst_id;
+	param_buffer.output_buffer    = &ret_addr;
+
+	ret = acpi_call_prm_handler(norm_to_sys_prm_handler_guid, &param_buffer);
+	if (!ret)
+		return ret_addr;
+
+	if (ret == -ENODEV)
+		pr_debug("PRM module/handler not available\n");
+	else
+		pr_notice_once("PRM address translation failed\n");
+
+	return ret;
+}
+
+#else /* ACPI_PRMT */
+
+unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id, unsigned long addr)
+{
+	return -ENODEV;
+}
+
+#endif
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
index 08c6dbd44c62..3c018870633c 100644
--- a/drivers/ras/amd/atl/umc.c
+++ b/drivers/ras/amd/atl/umc.c
@@ -333,9 +333,14 @@  unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
 	u8 coh_st_inst_id = get_coh_st_inst_id(err);
 	unsigned long addr = get_addr(err->addr);
 	u8 die_id = get_die_id(err);
+	unsigned long ret_addr;
 
 	pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx",
 		 socket_id, die_id, coh_st_inst_id, addr);
 
+	ret_addr = prm_umc_norm_to_sys_addr(socket_id, err->ipid, addr);
+	if (!IS_ERR_VALUE(ret_addr))
+		return ret_addr;
+
 	return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr);
 }