Message ID: 20240528095522.509667-1-kirill.shutemov@linux.intel.com
Series: x86/tdx: Add kexec support
On Tue, May 28, 2024 at 11:55 AM Kirill A. Shutemov
<kirill.shutemov@linux.intel.com> wrote:
>
> The patchset adds bits and pieces to get kexec (and crashkernel) working
> on a TDX guest.
>
> The last patch implements CPU offlining according to the approved ACPI
> spec change proposal[1]. It unlocks kexec with all CPUs visible in the
> target kernel. It requires BIOS-side enabling. If that is missing, we
> fall back to booting the 2nd kernel with a single CPU.
>
> Please review. I would be glad for any feedback.
>
> [1] https://lore.kernel.org/all/13356251.uLZWGnKmhe@kreacher

For the ACPI-related changes in the series:

Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
On Tue, 2024-05-28 at 12:55 +0300, Kirill A. Shutemov wrote:
> TDX guests run with MCA enabled (CR4.MCE=1b) from the very start. If
> that bit is cleared during CR4 register reprogramming during boot or
> kexec flows, a #VE exception will be raised which the guest kernel
> cannot handle it.

Nit: the ending "it" isn't needed.

>
> Therefore, make sure the CR4.MCE setting is preserved over kexec too and
> avoid raising any #VEs.
>
> The change doesn't affect non-TDX-guest environments.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

Reviewed-by: Kai Huang <kai.huang@intel.com>

> ---
>  arch/x86/kernel/relocate_kernel_64.S | 16 ++++++++++------
>  1 file changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
> index 085eef5c3904..b668a6be4f6f 100644
> --- a/arch/x86/kernel/relocate_kernel_64.S
> +++ b/arch/x86/kernel/relocate_kernel_64.S
> @@ -5,6 +5,8 @@
>   */
>
>  #include <linux/linkage.h>
> +#include <linux/stringify.h>
> +#include <asm/alternative.h>
>  #include <asm/page_types.h>
>  #include <asm/kexec.h>
>  #include <asm/processor-flags.h>
> @@ -143,15 +145,17 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>
>  	/*
>  	 * Set cr4 to a known state:
> -	 *  - physical address extension enabled
>  	 *  - 5-level paging, if it was enabled before
> +	 *  - Machine check exception on TDX guest, if it was enabled before.
> +	 *    Clearing MCE might not be allowed in TDX guests, depending on setup.
> +	 *  - physical address extension enabled
>  	 */
> -	movl	$X86_CR4_PAE, %eax
> -	testq	$X86_CR4_LA57, %r13
> -	jz	.Lno_la57
> -	orl	$X86_CR4_LA57, %eax
> -.Lno_la57:
> +	movl	$X86_CR4_LA57, %eax
> +	ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %eax), X86_FEATURE_TDX_GUEST
>
> +	/* R13 contains the original CR4 value, read in relocate_kernel() */
> +	andl	%r13d, %eax
> +	orl	$X86_CR4_PAE, %eax
>  	movq	%rax, %cr4
>
>  	jmp 1f
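For readers unfamiliar with ALTERNATIVE: the "" variant is what gets built
in (NOP-padded), and the orl is patched over it at boot when
X86_FEATURE_TDX_GUEST is set. As an illustration only - not the literal
patched bytes - a TDX guest effectively executes:

	movl	$X86_CR4_LA57, %eax	# CR4 bits to carry over, if set before
	orl	$X86_CR4_MCE, %eax	# patched in on TDX guests only
	andl	%r13d, %eax		# mask against the pre-kexec CR4 in %r13
	orl	$X86_CR4_PAE, %eax	# PAE is set unconditionally
	movq	%rax, %cr4

On anything that is not a TDX guest the orl is absent, so CR4.MCE is still
cleared there, exactly as before the patch.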
On 28.05.24 12:55, Kirill A. Shutemov wrote:
> From: Borislav Petkov <bp@alien8.de>
>
> That identity_mapped() function was loving that "1" label to the point
> of completely confusing its readers.
>
> Use named labels in each place for clarity.
>
> No functional changes.
>
> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  arch/x86/kernel/relocate_kernel_64.S | 13 +++++++------
>  1 file changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
> index 56cab1bb25f5..085eef5c3904 100644
> --- a/arch/x86/kernel/relocate_kernel_64.S
> +++ b/arch/x86/kernel/relocate_kernel_64.S
> @@ -148,9 +148,10 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>  	 */
>  	movl	$X86_CR4_PAE, %eax
>  	testq	$X86_CR4_LA57, %r13
> -	jz	1f
> +	jz	.Lno_la57
>  	orl	$X86_CR4_LA57, %eax
> -1:
> +.Lno_la57:
> +
>  	movq	%rax, %cr4
>
>  	jmp 1f

That jmp 1f becomes redundant now as it simply jumps 1 line below.

> @@ -165,9 +166,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>  	 * used by kexec. Flush the caches before copying the kernel.
>  	 */
>  	testq	%r12, %r12
> -	jz	1f
> +	jz	.Lsme_off
>  	wbinvd
> -1:
> +.Lsme_off:
>
>  	movq	%rcx, %r11
>  	call	swap_pages
> @@ -187,7 +188,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>  	 */
>
>  	testq	%r11, %r11
> -	jnz	1f
> +	jnz	.Lrelocate
>  	xorl	%eax, %eax
>  	xorl	%ebx, %ebx
>  	xorl	%ecx, %ecx
> @@ -208,7 +209,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>  	ret
>  	int3
>
> -1:
> +.Lrelocate:
>  	popq	%rdx
>  	leaq	PAGE_SIZE(%r10), %rsp
>  	ANNOTATE_RETPOLINE_SAFE
On Wed, May 29, 2024 at 01:47:50PM +0300, Nikolay Borisov wrote:
>
>
> On 28.05.24 12:55, Kirill A. Shutemov wrote:
> > From: Borislav Petkov <bp@alien8.de>
> >
> > That identity_mapped() function was loving that "1" label to the point
> > of completely confusing its readers.
> >
> > Use named labels in each place for clarity.
> >
> > No functional changes.
> >
> > Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  arch/x86/kernel/relocate_kernel_64.S | 13 +++++++------
> >  1 file changed, 7 insertions(+), 6 deletions(-)
> >
> > diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
> > index 56cab1bb25f5..085eef5c3904 100644
> > --- a/arch/x86/kernel/relocate_kernel_64.S
> > +++ b/arch/x86/kernel/relocate_kernel_64.S
> > @@ -148,9 +148,10 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
> >  	 */
> >  	movl	$X86_CR4_PAE, %eax
> >  	testq	$X86_CR4_LA57, %r13
> > -	jz	1f
> > +	jz	.Lno_la57
> >  	orl	$X86_CR4_LA57, %eax
> > -1:
> > +.Lno_la57:
> > +
> >  	movq	%rax, %cr4
> >
> >  	jmp 1f
>
> That jmp 1f becomes redundant now as it simply jumps 1 line below.
>

Nothing changed wrt this jump. It dates back to the initial kexec
implementation.

See 5234f5eb04ab ("[PATCH] kexec: x86_64 kexec implementation").

But I don't see a functional need for it.

Anyway, it is outside the scope of this patch.
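For anyone repeating the archaeology: git log -L follows a line range
through a file's history. The regex and range below are illustrative, not
the exact command used:

	git log --oneline -L '/jmp 1f/,+2:arch/x86/kernel/relocate_kernel_64.S'

git blame -w -C on the file is the other common way to get to the
introducing commit.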
On Wed, May 29, 2024 at 02:17:29PM +0300, Kirill A. Shutemov wrote:
> > That jmp 1f becomes redundant now as it simply jumps 1 line below.
> >
>
> Nothing changed wrt this jump. It dates back to the initial kexec
> implementation.
>
> See 5234f5eb04ab ("[PATCH] kexec: x86_64 kexec implementation").
>
> But I don't see a functional need for it.
>
> Anyway, it is outside the scope of this patch.

Yap, Kirill did what Nikolay should've done - git archeology. Please
don't forget to do that next time.

And back in the day they didn't comment non-obvious things because
commenting is for losers. :-\

So that unconditional forward jump either flushes branch prediction on
some old uarch or something else weird, uarch-special.

I doubt we can remove it just like that.

Lemme add Andy - he should know.
On 29/05/2024 12:28 pm, Borislav Petkov wrote:
> On Wed, May 29, 2024 at 02:17:29PM +0300, Kirill A. Shutemov wrote:
>>> That jmp 1f becomes redundant now as it simply jumps 1 line below.
>>>
>> Nothing changed wrt this jump. It dates back to the initial kexec
>> implementation.
>>
>> See 5234f5eb04ab ("[PATCH] kexec: x86_64 kexec implementation").
>>
>> But I don't see a functional need for it.
>>
>> Anyway, it is outside the scope of this patch.
> Yap, Kirill did what Nikolay should've done - git archeology. Please
> don't forget to do that next time.
>
> And back in the day they didn't comment non-obvious things because
> commenting is for losers. :-\
>
> So that unconditional forward jump either flushes branch prediction on
> some old uarch or something else weird, uarch-special.
>
> I doubt we can remove it just like that.
>
> Lemme add Andy - he should know.

Seems I've gained a reputation...

jmp 1f dates back to ye olde 8086, which started the whole trend of the
instruction pointer just being a figment of the ISA's imagination[1].

Hardware maintains the pointer to the next byte to fetch (the prefetch
queue was up to 6 bytes), and there was a micro-op to subtract the
current length of the prefetch queue from the accumulator.

In those days, the prefetch queue was not coherent with main memory, and
jumps (being a discontinuity in the instruction stream) simply flushed
the prefetch queue.

This was necessary after modifying executable code, because otherwise
you could end up executing stale bytes from the prefetch queue and then
non-stale bytes thereafter. (Otherwise known as the way to distinguish
the 8086 from the 8088, because the latter only had a 4-byte prefetch
queue.)

Anyway. It's how you used to spell "serialising operation" before that
term ever entered the architecture. Linux still supports CPUs prior to
the Pentium, so it still needs to care about prefetch queues on the 486.

However, this example appears to be in 64-bit code and follows a write
to CR4, which will be fully serialising, so it's probably copy&paste
from 32-bit code where it would be necessary in principle.

~Andrew

[1] https://www.righto.com/2023/01/inside-8086-processors-instruction.html#fn:pc

In fact, anyone who hasn't should read the entire series on the 8086,
https://www.righto.com/p/index.html
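To illustrate the idiom Andrew describes, here is a sketch of the
real-mode-era pattern - assuming writable code bytes, and not anything
from the kernel tree:

	# Self-modifying code on pre-586 parts: patch an opcode, then
	# jump so the stale byte cannot be run out of the prefetch queue.
	movb	$0x90, patch_site	# overwrite the hlt below with a NOP
	jmp	1f			# discontinuity flushes the queue
1:
patch_site:
	hlt				# stale byte; executes as NOP after the patch

Without the jmp, an 8086 through 486 could still execute the original
hlt straight out of its prefetch queue.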
On Wed, May 29, 2024 at 01:33:35PM +0100, Andrew Cooper wrote:
> Seems I've gained a reputation...

Yes you have. You have this weird interest in very deep uarch details
that I can't share. Not at that detail. :-P

> jmp 1f dates back to ye olde 8086, which started the whole trend of the
> instruction pointer just being a figment of the ISA's imagination[1].
>
> Hardware maintains the pointer to the next byte to fetch (the prefetch
> queue was up to 6 bytes), and there was a micro-op to subtract the
> current length of the prefetch queue from the accumulator.
>
> In those days, the prefetch queue was not coherent with main memory, and
> jumps (being a discontinuity in the instruction stream) simply flushed
> the prefetch queue.
>
> This was necessary after modifying executable code, because otherwise
> you could end up executing stale bytes from the prefetch queue and then
> non-stale bytes thereafter. (Otherwise known as the way to distinguish
> the 8086 from the 8088, because the latter only had a 4-byte prefetch
> queue.)

Thanks - that certainly wakes up a long-asleep neuron in the back of my
mind...

> Anyway. It's how you used to spell "serialising operation" before that
> term ever entered the architecture. Linux still supports CPUs prior to
> the Pentium, so it still needs to care about prefetch queues on the 486.
>
> However, this example appears to be in 64-bit code and follows a write
> to CR4, which will be fully serialising, so it's probably copy&paste
> from 32-bit code where it would be necessary in principle.

Yap, fully agreed. We could try to remove it and see what complains.

Nikolay, wanna do a patch which properly explains the situation?

> https://www.righto.com/2023/01/inside-8086-processors-instruction.html#fn:pc
>
> In fact, anyone who hasn't should read the entire series on the 8086,
> https://www.righto.com/p/index.html

Oh yeah, already bookmarked. Thanks Andy!
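The patch Borislav is asking Nikolay for would, roughly, delete the jump
and document why that is safe. A sketch only, not an actual submission:

	movq	%rax, %cr4

	/*
	 * No "jmp 1f; 1:" is needed after the CR4 write: this is 64-bit
	 * code and the mov to %cr4 above is already fully serializing,
	 * so there is no stale prefetch to flush.
	 */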
On 5/29/24 03:47, Nikolay Borisov wrote:
>>
>> diff --git a/arch/x86/kernel/relocate_kernel_64.S
>> b/arch/x86/kernel/relocate_kernel_64.S
>> index 56cab1bb25f5..085eef5c3904 100644
>> --- a/arch/x86/kernel/relocate_kernel_64.S
>> +++ b/arch/x86/kernel/relocate_kernel_64.S
>> @@ -148,9 +148,10 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>>  	 */
>>  	movl	$X86_CR4_PAE, %eax
>>  	testq	$X86_CR4_LA57, %r13
>> -	jz	1f
>> +	jz	.Lno_la57
>>  	orl	$X86_CR4_LA57, %eax
>> -1:
>> +.Lno_la57:
>> +
>>  	movq	%rax, %cr4
>>  	jmp 1f
>

Sorry if this is a duplicate; something strange happened with my email.

If you are cleaning up this code anyway... this whole piece of code can
be simplified to:

	and	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
	mov	%r13, %cr4

The PAE bit in %r13 is guaranteed to be set, and %r13 is dead after
this.

	-hpa
On Mon, Jun 03, 2024 at 05:24:00PM -0700, H. Peter Anvin wrote:
> Trying one more time; sorry (again) if someone receives this in duplicate.
>
> > > >
> > > > diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
> > > > index 56cab1bb25f5..085eef5c3904 100644
> > > > --- a/arch/x86/kernel/relocate_kernel_64.S
> > > > +++ b/arch/x86/kernel/relocate_kernel_64.S
> > > > @@ -148,9 +148,10 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
> > > >  	 */
> > > >  	movl	$X86_CR4_PAE, %eax
> > > >  	testq	$X86_CR4_LA57, %r13
> > > > -	jz	1f
> > > > +	jz	.Lno_la57
> > > >  	orl	$X86_CR4_LA57, %eax
> > > > -1:
> > > > +.Lno_la57:
> > > > +
> > > >  	movq	%rax, %cr4
>
> If we are cleaning up this code... the above can simply be:
>
> 	andl	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
> 	movq	%r13, %cr4
>
> %r13 is dead afterwards, and the PAE bit *will* be set in %r13 anyway.

Yeah, with a proper comment. The testing of bits is not really needed.

Thx.
On Tue, Jun 04, 2024 at 11:15:03AM +0200, Borislav Petkov wrote:
> On Mon, Jun 03, 2024 at 05:24:00PM -0700, H. Peter Anvin wrote:
> > Trying one more time; sorry (again) if someone receives this in duplicate.
> >
> > > > >
> > > > > diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
> > > > > index 56cab1bb25f5..085eef5c3904 100644
> > > > > --- a/arch/x86/kernel/relocate_kernel_64.S
> > > > > +++ b/arch/x86/kernel/relocate_kernel_64.S
> > > > > @@ -148,9 +148,10 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
> > > > >  	 */
> > > > >  	movl	$X86_CR4_PAE, %eax
> > > > >  	testq	$X86_CR4_LA57, %r13
> > > > > -	jz	1f
> > > > > +	jz	.Lno_la57
> > > > >  	orl	$X86_CR4_LA57, %eax
> > > > > -1:
> > > > > +.Lno_la57:
> > > > > +
> > > > >  	movq	%rax, %cr4
> >
> > If we are cleaning up this code... the above can simply be:
> >
> > 	andl	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
> > 	movq	%r13, %cr4
> >
> > %r13 is dead afterwards, and the PAE bit *will* be set in %r13 anyway.
>
> Yeah, with a proper comment. The testing of bits is not really needed.

I think it is a better fit for the next patch. What about this?
On Tue, Jun 04, 2024 at 06:21:27PM +0300, Kirill A. Shutemov wrote:
> What about this?
Yeah, LGTM.
Thx.
On Tue, Jun 11, 2024 at 11:26:17AM -0700, H. Peter Anvin wrote:
> On 6/4/24 08:21, Kirill A. Shutemov wrote:
> >
> > From b45fe48092abad2612c2bafbb199e4de80c99545 Mon Sep 17 00:00:00 2001
> > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> > Date: Fri, 10 Feb 2023 12:53:11 +0300
> > Subject: [PATCHv11.1 06/19] x86/kexec: Keep CR4.MCE set during kexec for TDX guest
> >
> > TDX guests run with MCA enabled (CR4.MCE=1b) from the very start. If
> > that bit is cleared during CR4 register reprogramming during boot or
> > kexec flows, a #VE exception will be raised which the guest kernel
> > cannot handle it.
> >
> > Therefore, make sure the CR4.MCE setting is preserved over kexec too and
> > avoid raising any #VEs.
> >
> > The change doesn't affect non-TDX-guest environments.
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  arch/x86/kernel/relocate_kernel_64.S | 17 ++++++++++-------
> >  1 file changed, 10 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
> > index 085eef5c3904..9c2cf70c5f54 100644
> > --- a/arch/x86/kernel/relocate_kernel_64.S
> > +++ b/arch/x86/kernel/relocate_kernel_64.S
> > @@ -5,6 +5,8 @@
> >   */
> >
> >  #include <linux/linkage.h>
> > +#include <linux/stringify.h>
> > +#include <asm/alternative.h>
> >  #include <asm/page_types.h>
> >  #include <asm/kexec.h>
> >  #include <asm/processor-flags.h>
> > @@ -145,14 +147,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
> >  	 * Set cr4 to a known state:
> >  	 *  - physical address extension enabled
> >  	 *  - 5-level paging, if it was enabled before
> > +	 *  - Machine check exception on TDX guest, if it was enabled before.
> > +	 *    Clearing MCE might not be allowed in TDX guests, depending on setup.
> > +	 *
> > +	 * Use R13 that contains the original CR4 value, read in relocate_kernel().
> > +	 * PAE is always set in the original CR4.
> >  	 */
> > -	movl	$X86_CR4_PAE, %eax
> > -	testq	$X86_CR4_LA57, %r13
> > -	jz	.Lno_la57
> > -	orl	$X86_CR4_LA57, %eax
> > -.Lno_la57:
> > -
> > -	movq	%rax, %cr4
> > +	andl	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
> > +	ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
> > +	movq	%r13, %cr4
>
> If this is the case, I don't really see a reason to clear MCE per se as I'm
> guessing a machine check here will be fatal anyway? It just changes the
> method of death.

Andrew had a strong opinion on the method of death here.

https://lore.kernel.org/all/1144340e-dd95-ee3b-dabb-579f9a65b3c7@citrix.com

> Also, is there a reason to save %cr4, run code, and *then* clear the
> relevant bits? Wouldn't it be better to sanitize %cr4 as soon as possible?

You mean set the new CR4 directly in relocate_kernel() before switching
CR3? I guess it is possible. But I can't say I see a huge benefit in
changing it. Such a change would carry its own risks.
On 3.06.24 17:43, H. Peter Anvin wrote:
> On 5/29/24 03:47, Nikolay Borisov wrote:
>>>
>>> diff --git a/arch/x86/kernel/relocate_kernel_64.S
>>> b/arch/x86/kernel/relocate_kernel_64.S
>>> index 56cab1bb25f5..085eef5c3904 100644
>>> --- a/arch/x86/kernel/relocate_kernel_64.S
>>> +++ b/arch/x86/kernel/relocate_kernel_64.S
>>> @@ -148,9 +148,10 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>>>  	 */
>>>  	movl	$X86_CR4_PAE, %eax
>>>  	testq	$X86_CR4_LA57, %r13
>>> -	jz	1f
>>> +	jz	.Lno_la57
>>>  	orl	$X86_CR4_LA57, %eax
>>> -1:
>>> +.Lno_la57:
>>> +
>>>  	movq	%rax, %cr4
>>>  	jmp 1f
>>
>> That jmp 1f becomes redundant now as it simply jumps 1 line below.
>>
>
> Uh... am I the only person to notice that ALL that is needed here is:
>
> 	andl	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
> 	movq	%r13, %cr4
>
> ... since %r13 is dead afterwards, and PAE *will* have been set in %r13
> already?
>
> I don't believe that this specific jmp is actually needed -- there are
> several more synchronizing jumps later -- but it doesn't hurt.
>
> However, if the effort is for improving the readability, it might be
> worthwhile to encapsulate the "jmp 1f; 1:" as a macro, e.g. "SYNC_CODE".

The preceding move to CR4 is itself a serializing instruction, no?

>
> 	-hpa
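A sketch of the macro hpa floats - the name and the local label are
illustrative, nothing like this exists in the tree:

	.macro SYNC_CODE
		jmp	8f	# forward jump flushes the prefetch queue
	8:			# on pre-Pentium parts; harmless elsewhere
	.endm

Numeric local labels can be reused in GNU as, so the macro can be
expanded any number of times in one file.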
On 12/06/2024 10:22 am, Kirill A. Shutemov wrote:
> On Tue, Jun 11, 2024 at 11:26:17AM -0700, H. Peter Anvin wrote:
>> On 6/4/24 08:21, Kirill A. Shutemov wrote:
>>> From b45fe48092abad2612c2bafbb199e4de80c99545 Mon Sep 17 00:00:00 2001
>>> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>>> Date: Fri, 10 Feb 2023 12:53:11 +0300
>>> Subject: [PATCHv11.1 06/19] x86/kexec: Keep CR4.MCE set during kexec for TDX guest
>>>
>>> TDX guests run with MCA enabled (CR4.MCE=1b) from the very start. If
>>> that bit is cleared during CR4 register reprogramming during boot or
>>> kexec flows, a #VE exception will be raised which the guest kernel
>>> cannot handle it.
>>>
>>> Therefore, make sure the CR4.MCE setting is preserved over kexec too and
>>> avoid raising any #VEs.
>>>
>>> The change doesn't affect non-TDX-guest environments.
>>>
>>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>>> ---
>>>  arch/x86/kernel/relocate_kernel_64.S | 17 ++++++++++-------
>>>  1 file changed, 10 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
>>> index 085eef5c3904..9c2cf70c5f54 100644
>>> --- a/arch/x86/kernel/relocate_kernel_64.S
>>> +++ b/arch/x86/kernel/relocate_kernel_64.S
>>> @@ -5,6 +5,8 @@
>>>   */
>>>  #include <linux/linkage.h>
>>> +#include <linux/stringify.h>
>>> +#include <asm/alternative.h>
>>>  #include <asm/page_types.h>
>>>  #include <asm/kexec.h>
>>>  #include <asm/processor-flags.h>
>>> @@ -145,14 +147,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>>>  	 * Set cr4 to a known state:
>>>  	 *  - physical address extension enabled
>>>  	 *  - 5-level paging, if it was enabled before
>>> +	 *  - Machine check exception on TDX guest, if it was enabled before.
>>> +	 *    Clearing MCE might not be allowed in TDX guests, depending on setup.
>>> +	 *
>>> +	 * Use R13 that contains the original CR4 value, read in relocate_kernel().
>>> +	 * PAE is always set in the original CR4.
>>>  	 */
>>> -	movl	$X86_CR4_PAE, %eax
>>> -	testq	$X86_CR4_LA57, %r13
>>> -	jz	.Lno_la57
>>> -	orl	$X86_CR4_LA57, %eax
>>> -.Lno_la57:
>>> -
>>> -	movq	%rax, %cr4
>>> +	andl	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
>>> +	ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
>>> +	movq	%r13, %cr4
>> If this is the case, I don't really see a reason to clear MCE per se as I'm
>> guessing a machine check here will be fatal anyway? It just changes the
>> method of death.
> Andrew had a strong opinion on the method of death here.
>
> https://lore.kernel.org/all/1144340e-dd95-ee3b-dabb-579f9a65b3c7@citrix.com

Not sure if I intended it to come across that strongly, but given a
choice, the !CR4.MCE death is cleaner because at least you're not
interpreting garbage and trying to use it as a valid IDT.

~Andrew
On June 12, 2024 4:06:07 PM PDT, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>On 12/06/2024 10:22 am, Kirill A. Shutemov wrote:
>> On Tue, Jun 11, 2024 at 11:26:17AM -0700, H. Peter Anvin wrote:
>>> On 6/4/24 08:21, Kirill A. Shutemov wrote:
>>>> From b45fe48092abad2612c2bafbb199e4de80c99545 Mon Sep 17 00:00:00 2001
>>>> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>>>> Date: Fri, 10 Feb 2023 12:53:11 +0300
>>>> Subject: [PATCHv11.1 06/19] x86/kexec: Keep CR4.MCE set during kexec for TDX guest
>>>>
>>>> TDX guests run with MCA enabled (CR4.MCE=1b) from the very start. If
>>>> that bit is cleared during CR4 register reprogramming during boot or
>>>> kexec flows, a #VE exception will be raised which the guest kernel
>>>> cannot handle it.
>>>>
>>>> Therefore, make sure the CR4.MCE setting is preserved over kexec too and
>>>> avoid raising any #VEs.
>>>>
>>>> The change doesn't affect non-TDX-guest environments.
>>>>
>>>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>>>> ---
>>>>  arch/x86/kernel/relocate_kernel_64.S | 17 ++++++++++-------
>>>>  1 file changed, 10 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
>>>> index 085eef5c3904..9c2cf70c5f54 100644
>>>> --- a/arch/x86/kernel/relocate_kernel_64.S
>>>> +++ b/arch/x86/kernel/relocate_kernel_64.S
>>>> @@ -5,6 +5,8 @@
>>>>   */
>>>>  #include <linux/linkage.h>
>>>> +#include <linux/stringify.h>
>>>> +#include <asm/alternative.h>
>>>>  #include <asm/page_types.h>
>>>>  #include <asm/kexec.h>
>>>>  #include <asm/processor-flags.h>
>>>> @@ -145,14 +147,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>>>>  	 * Set cr4 to a known state:
>>>>  	 *  - physical address extension enabled
>>>>  	 *  - 5-level paging, if it was enabled before
>>>> +	 *  - Machine check exception on TDX guest, if it was enabled before.
>>>> +	 *    Clearing MCE might not be allowed in TDX guests, depending on setup.
>>>> +	 *
>>>> +	 * Use R13 that contains the original CR4 value, read in relocate_kernel().
>>>> +	 * PAE is always set in the original CR4.
>>>>  	 */
>>>> -	movl	$X86_CR4_PAE, %eax
>>>> -	testq	$X86_CR4_LA57, %r13
>>>> -	jz	.Lno_la57
>>>> -	orl	$X86_CR4_LA57, %eax
>>>> -.Lno_la57:
>>>> -
>>>> -	movq	%rax, %cr4
>>>> +	andl	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
>>>> +	ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
>>>> +	movq	%r13, %cr4
>>> If this is the case, I don't really see a reason to clear MCE per se as I'm
>>> guessing a machine check here will be fatal anyway? It just changes the
>>> method of death.
>> Andrew had a strong opinion on the method of death here.
>>
>> https://lore.kernel.org/all/1144340e-dd95-ee3b-dabb-579f9a65b3c7@citrix.com
>
>Not sure if I intended it to come across that strongly, but given a
>choice, the !CR4.MCE death is cleaner because at least you're not
>interpreting garbage and trying to use it as a valid IDT.
>
>~Andrew

Zorch the IDT if it isn't valid?
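"Zorching" the IDT here would mean loading a zero-limit table so that any
further exception escalates straight to a triple fault rather than
vectoring through garbage descriptors. A minimal sketch, with an
illustrative symbol name that is not in the tree:

	SYM_CODE_START_LOCAL_NOALIGN(zorch_idt)
		lidt	.Lzero_idt(%rip)	# limit 0: every vector is out of bounds
		RET
	.Lzero_idt:
		.word	0			# IDT limit
		.quad	0			# IDT base
	SYM_CODE_END(zorch_idt)

The kernel already has an idt_invalidate() helper that does effectively
this from C.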