diff mbox series

[v2,09/25] KVM: VMX: Switch FRED RSP0 between host and guest

Message ID 20240207172646.3981-10-xin3.li@intel.com
State New
Headers show
Series Enable FRED with KVM VMX | expand

Commit Message

Li, Xin3 Feb. 7, 2024, 5:26 p.m. UTC
Switch MSR_IA32_FRED_RSP0 between host and guest in
vmx_prepare_switch_to_{host,guest}().

MSR_IA32_FRED_RSP0 is used during ring 3 event delivery only, thus
KVM, running on ring 0, can run safely with guest FRED RSP0, i.e.,
no need to switch between host/guest FRED RSP0 during VM entry and
exit.

KVM should switch to host FRED RSP0 before returning to user level,
and switch to guest FRED RSP0 before entering guest mode.

Signed-off-by: Xin Li <xin3.li@intel.com>
Tested-by: Shan Kang <shan.kang@intel.com>
---

Changes since v1:
* Don't use guest_cpuid_has() in vmx_prepare_switch_to_{host,guest}(),
  which are called from IRQ-disabled context (Chao Gao).
* Reset msr_guest_fred_rsp0 in __vmx_vcpu_reset() (Chao Gao).
---
 arch/x86/kvm/vmx/vmx.c | 17 +++++++++++++++++
 arch/x86/kvm/vmx/vmx.h |  2 ++
 2 files changed, 19 insertions(+)

Comments

Chao Gao April 19, 2024, 2:23 p.m. UTC | #1
On Wed, Feb 07, 2024 at 09:26:29AM -0800, Xin Li wrote:
>Switch MSR_IA32_FRED_RSP0 between host and guest in
>vmx_prepare_switch_to_{host,guest}().
>
>MSR_IA32_FRED_RSP0 is used during ring 3 event delivery only, thus
>KVM, running on ring 0, can run safely with guest FRED RSP0, i.e.,
>no need to switch between host/guest FRED RSP0 during VM entry and
>exit.
>
>KVM should switch to host FRED RSP0 before returning to user level,
>and switch to guest FRED RSP0 before entering guest mode.
>
>Signed-off-by: Xin Li <xin3.li@intel.com>
>Tested-by: Shan Kang <shan.kang@intel.com>
>---
>
>Changes since v1:
>* Don't use guest_cpuid_has() in vmx_prepare_switch_to_{host,guest}(),
>  which are called from IRQ-disabled context (Chao Gao).
>* Reset msr_guest_fred_rsp0 in __vmx_vcpu_reset() (Chao Gao).
>---
> arch/x86/kvm/vmx/vmx.c | 17 +++++++++++++++++
> arch/x86/kvm/vmx/vmx.h |  2 ++
> 2 files changed, 19 insertions(+)
>
>diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>index b7b772183ee4..264378c3b784 100644
>--- a/arch/x86/kvm/vmx/vmx.c
>+++ b/arch/x86/kvm/vmx/vmx.c
>@@ -1337,6 +1337,16 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
> 	}
> 
> 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
>+
>+	if (guest_can_use(vcpu, X86_FEATURE_FRED)) {
>+		/*
>+		 * MSR_IA32_FRED_RSP0 is top of task stack, which never changes.
>+		 * Thus it should be initialized only once.
>+		 */
>+		if (unlikely(vmx->msr_host_fred_rsp0 == 0))
>+			vmx->msr_host_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);

can we just drop this and use "(unsigned long)task_stack_page(current) + THREAD_SIZE"
as host fred rsp0?

>+		wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_guest_fred_rsp0);

any reason to not use wrmsrns?

>+	}
> #else
> 	savesegment(fs, fs_sel);
> 	savesegment(gs, gs_sel);
>@@ -1381,6 +1391,11 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
> 	invalidate_tss_limit();
> #ifdef CONFIG_X86_64
> 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
>+
>+	if (guest_can_use(&vmx->vcpu, X86_FEATURE_FRED)) {
>+		vmx->msr_guest_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);
>+		wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_host_fred_rsp0);

same question.

>+	}
> #endif
> 	load_fixmap_gdt(raw_smp_processor_id());
> 	vmx->guest_state_loaded = false;
>@@ -4889,6 +4904,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
> 
> #ifdef CONFIG_X86_64
> 	if (kvm_cpu_cap_has(X86_FEATURE_FRED)) {
>+		vmx->msr_guest_fred_rsp0 = 0;
>+
> 		vmcs_write64(GUEST_IA32_FRED_CONFIG, 0);
> 		vmcs_write64(GUEST_IA32_FRED_RSP1, 0);
> 		vmcs_write64(GUEST_IA32_FRED_RSP2, 0);
>diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
>index 3ad52437f426..176ad39be406 100644
>--- a/arch/x86/kvm/vmx/vmx.h
>+++ b/arch/x86/kvm/vmx/vmx.h
>@@ -278,6 +278,8 @@ struct vcpu_vmx {
> #ifdef CONFIG_X86_64
> 	u64		      msr_host_kernel_gs_base;
> 	u64		      msr_guest_kernel_gs_base;
>+	u64		      msr_host_fred_rsp0;
>+	u64		      msr_guest_fred_rsp0;
> #endif
> 
> 	u64		      spec_ctrl;
>-- 
>2.43.0
>
>
Li, Xin3 April 19, 2024, 4:37 p.m. UTC | #2
> >+		if (unlikely(vmx->msr_host_fred_rsp0 == 0))
> >+			vmx->msr_host_fred_rsp0 =
> read_msr(MSR_IA32_FRED_RSP0);
> 
> can we just drop this and use "(unsigned long)task_stack_page(current) +
> THREAD_SIZE"
> as host fred rsp0?

I thought about it; however, I don't see a strong reason that it's better,
i.e., is RDMSR slower than reading 'stack' from the current task_struct?

> 
> >+		wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_guest_fred_rsp0);
> 
> any reason to not use wrmsrns?

Good call!


> >+	}
> > #else
> > 	savesegment(fs, fs_sel);
> > 	savesegment(gs, gs_sel);
> >@@ -1381,6 +1391,11 @@ static void vmx_prepare_switch_to_host(struct
> vcpu_vmx *vmx)
> > 	invalidate_tss_limit();
> > #ifdef CONFIG_X86_64
> > 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
> >+
> >+	if (guest_can_use(&vmx->vcpu, X86_FEATURE_FRED)) {
> >+		vmx->msr_guest_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);
> >+		wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_host_fred_rsp0);
> 
> same question.

Will do!

Thanks!
    Xin
Sean Christopherson June 12, 2024, 9:53 p.m. UTC | #3
On Wed, Feb 07, 2024, Xin Li wrote:
> Switch MSR_IA32_FRED_RSP0 between host and guest in
> vmx_prepare_switch_to_{host,guest}().
> 
> MSR_IA32_FRED_RSP0 is used during ring 3 event delivery only, thus
> KVM, running on ring 0, can run safely with guest FRED RSP0, i.e.,
> no need to switch between host/guest FRED RSP0 during VM entry and
> exit.
> 
> KVM should switch to host FRED RSP0 before returning to user level,
> and switch to guest FRED RSP0 before entering guest mode.

Heh, if only KVM had a framework that was specifically designed for context
switching MSRs on return to userspace.  Translation: please use the user_return_msr()
APIs.

> Signed-off-by: Xin Li <xin3.li@intel.com>
> Tested-by: Shan Kang <shan.kang@intel.com>
> ---
> 
> Changes since v1:
> * Don't use guest_cpuid_has() in vmx_prepare_switch_to_{host,guest}(),
>   which are called from IRQ-disabled context (Chao Gao).
> * Reset msr_guest_fred_rsp0 in __vmx_vcpu_reset() (Chao Gao).
> ---
>  arch/x86/kvm/vmx/vmx.c | 17 +++++++++++++++++
>  arch/x86/kvm/vmx/vmx.h |  2 ++
>  2 files changed, 19 insertions(+)
> 
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index b7b772183ee4..264378c3b784 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -1337,6 +1337,16 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
>  	}
>  
>  	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
> +
> +	if (guest_can_use(vcpu, X86_FEATURE_FRED)) {
> +		/*
> +		 * MSR_IA32_FRED_RSP0 is top of task stack, which never changes.
> +		 * Thus it should be initialized only once.

Then grab the host value during vmx_hardware_setup().  And when you rebase on top
of the latest kvm-x86/next, there's a handy dandy "struct kvm_host_values kvm_host"
to track host MSR values (and similar state).

You could also use that for MSR_IA32_FRED_CONFIG and MSR_IA32_FRED_STKLVLS.

> +		 */
> +		if (unlikely(vmx->msr_host_fred_rsp0 == 0))
> +			vmx->msr_host_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);
> +		wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_guest_fred_rsp0);
> +	}
>  #else
>  	savesegment(fs, fs_sel);
>  	savesegment(gs, gs_sel);
> @@ -1381,6 +1391,11 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
>  	invalidate_tss_limit();
>  #ifdef CONFIG_X86_64
>  	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
> +
> +	if (guest_can_use(&vmx->vcpu, X86_FEATURE_FRED)) {
> +		vmx->msr_guest_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);
> +		wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_host_fred_rsp0);
> +	}
>  #endif
>  	load_fixmap_gdt(raw_smp_processor_id());
>  	vmx->guest_state_loaded = false;
> @@ -4889,6 +4904,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
>  
>  #ifdef CONFIG_X86_64
>  	if (kvm_cpu_cap_has(X86_FEATURE_FRED)) {
> +		vmx->msr_guest_fred_rsp0 = 0;

Eh, I wouldn't bother.  Arguably it's better to use __kvm_set_msr(), and "vmx"
is zero-allocated so this is unnecessary.

The GUEST_IA32_FRED_* VMCS fields need to be explicitly initialized because the
VMCS could (very theoretically) use a non-zero-based encoding scheme.
Li, Xin3 July 10, 2024, 3:51 p.m. UTC | #4
> On Wed, Feb 07, 2024, Xin Li wrote:
> > Switch MSR_IA32_FRED_RSP0 between host and guest in
> > vmx_prepare_switch_to_{host,guest}().
> >
> > MSR_IA32_FRED_RSP0 is used during ring 3 event delivery only, thus
> > KVM, running on ring 0, can run safely with guest FRED RSP0, i.e., no
> > need to switch between host/guest FRED RSP0 during VM entry and exit.
> >
> > KVM should switch to host FRED RSP0 before returning to user level,
> > and switch to guest FRED RSP0 before entering guest mode.
> 
> Heh, if only KVM had a framework that was specifically designed for context
> switching MSRs on return to userspace.  Translation: please use the
> user_return_msr() APIs.

IIUC the user return MSR framework works for MSRs that are per CPU
constants, but like MSR_KERNEL_GS_BASE, MSR_IA32_FRED_RSP0 is a per
*task* constant, thus we can't use it.
Sean Christopherson July 12, 2024, 3:12 p.m. UTC | #5
On Wed, Jul 10, 2024, Xin3 Li wrote:
> > On Wed, Feb 07, 2024, Xin Li wrote:
> > > Switch MSR_IA32_FRED_RSP0 between host and guest in
> > > vmx_prepare_switch_to_{host,guest}().
> > >
> > > MSR_IA32_FRED_RSP0 is used during ring 3 event delivery only, thus
> > > KVM, running on ring 0, can run safely with guest FRED RSP0, i.e., no
> > > need to switch between host/guest FRED RSP0 during VM entry and exit.
> > >
> > > KVM should switch to host FRED RSP0 before returning to user level,
> > > and switch to guest FRED RSP0 before entering guest mode.
> > 
> > Heh, if only KVM had a framework that was specifically designed for context
> > switching MSRs on return to userspace.  Translation: please use the
> > user_return_msr() APIs.
> 
> IIUC the user return MSR framework works for MSRs that are per CPU
> constants, but like MSR_KERNEL_GS_BASE, MSR_IA32_FRED_RSP0 is a per
> *task* constant, thus we can't use it.

Ah, in that case, the changelog is very misleading and needs to be fixed.
Alternatively, is the desired RSP0 value tracked anywhere other than the MSR?
E.g. if it's somewhere in task_struct, then kvm_on_user_return() would restore
the current task's desired RSP0.  Even if we don't get fancy, avoiding the RDMSR
to get the current task's value would be nice.
Li, Xin3 July 12, 2024, 4:15 p.m. UTC | #6
> > > > Switch MSR_IA32_FRED_RSP0 between host and guest in
> > > > vmx_prepare_switch_to_{host,guest}().
> > > >
> > > > MSR_IA32_FRED_RSP0 is used during ring 3 event delivery only, thus
> > > > KVM, running on ring 0, can run safely with guest FRED RSP0, i.e., no
> > > > need to switch between host/guest FRED RSP0 during VM entry and exit.
> > > >
> > > > KVM should switch to host FRED RSP0 before returning to user level,
> > > > and switch to guest FRED RSP0 before entering guest mode.
> > >
> > > Heh, if only KVM had a framework that was specifically designed for context
> > > switching MSRs on return to userspace.  Translation: please use the
> > > user_return_msr() APIs.
> >
> > IIUC the user return MSR framework works for MSRs that are per CPU
> > constants, but like MSR_KERNEL_GS_BASE, MSR_IA32_FRED_RSP0 is a per
> > *task* constant, thus we can't use it.
> 
> Ah, in that case, the changelog is very misleading and needs to be fixed.

I probably should've given more details about how FRED RSPs are used:
RSP0 is a bit of special because it's a per task constant pointing to
its kernel stack top, while other RSPs are per CPU constants.

> Alternatively, is the desired RSP0 value tracked anywhere other than the MSR?

Yes, It's simply "(unsigned long)task_stack_page(task) + THREAD_SIZE".

> E.g. if it's somewhere in task_struct, then kvm_on_user_return() would restore
> the current task's desired RSP0.

So you're suggesting to extend the framework to allow per task constants?

>  Even if we don't get fancy, avoiding the RDMSR
> to get the current task's value would be nice.

TBH, I didn't know RDMSR was NOT a preferred approach.  But it does make
sense because it costs hundreds of cycles, e.g., when reading from CR2.

And of course this can be easily changed to
"(unsigned long)task_stack_page(task) + THREAD_SIZE".
Li, Xin3 July 12, 2024, 5:17 p.m. UTC | #7
> > > E.g. if it's somewhere in task_struct, then kvm_on_user_return()
> > > would restore the current task's desired RSP0.
> >
> > So you're suggesting to extend the framework to allow per task constants?
> 
> Yeah, or more likely, special case MSR_IA32_FRED_RSP0.  If KVM didn't already
> have the user return framework, I wouldn't suggest this as I doubt avoiding
> WRMSR when switching between vCPU tasks will be very meaningful, but it's
> easy to handle FRED_RSP0, so why not.

Great, I will take the patch.

It looks to me that this also works for KERNEL GS BASE MSR, no?

> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index
> 1783986d8626..ebecb205e5de 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -352,6 +352,7 @@ static void kvm_on_user_return(struct
> user_return_notifier *urn)
>                 = container_of(urn, struct kvm_user_return_msrs, urn);
>         struct kvm_user_return_msr_values *values;
>         unsigned long flags;
> +       u64 host_val;
> 
>         /*
>          * Disabling irqs at this point since the following code could be @@ -365,9
> +366,15 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
>         local_irq_restore(flags);
>         for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
>                 values = &msrs->values[slot];
> -               if (values->host != values->curr) {
> -                       wrmsrl(kvm_uret_msrs_list[slot], values->host);
> -                       values->curr = values->host;
> +
> +               if (kvm_uret_msrs_list[slot] == MSR_IA32_FRED_RSP0)
> +                       host_val = get_current_fred_rsp0();
> +               else
> +                       host_val = values->host;
> +
> +               if (host_val != values->curr) {
> +                       wrmsrl(kvm_uret_msrs_list[slot], host_val);
> +                       values->curr = host_val;
>                 }
>         }
>  }
Sean Christopherson July 12, 2024, 7:30 p.m. UTC | #8
On Fri, Jul 12, 2024, Xin3 Li wrote:
> > > > E.g. if it's somewhere in task_struct, then kvm_on_user_return()
> > > > would restore the current task's desired RSP0.
> > >
> > > So you're suggesting to extend the framework to allow per task constants?
> > 
> > Yeah, or more likely, special case MSR_IA32_FRED_RSP0.  If KVM didn't already
> > have the user return framework, I wouldn't suggest this as I doubt avoiding
> > WRMSR when switching between vCPU tasks will be very meaningful, but it's
> > easy to handle FRED_RSP0, so why not.
> 
> Great, I will take the patch.
> 
> It looks to me that this also works for KERNEL GS BASE MSR, no?

I don't think so, because the kernel expects MSR_KERNEL_GS_BASE to be accurate
when querying GS.base for the current task:

  unsigned long x86_gsbase_read_task(struct task_struct *task)
  {
	unsigned long gsbase;

	if (task == current)
		gsbase = x86_gsbase_read_cpu_inactive();
	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
		 (task->thread.gsindex == 0))
		gsbase = task->thread.gsbase;
	else
		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);

	return gsbase;
  }
Li, Xin3 July 17, 2024, 5:31 p.m. UTC | #9
> > > > E.g. if it's somewhere in task_struct, then kvm_on_user_return()
> > > > would restore the current task's desired RSP0.
> > >
> > > So you're suggesting to extend the framework to allow per task constants?
> >
> > Yeah, or more likely, special case MSR_IA32_FRED_RSP0.  If KVM didn't
> > already have the user return framework, I wouldn't suggest this as I
> > doubt avoiding WRMSR when switching between vCPU tasks will be very
> > meaningful, but it's easy to handle FRED_RSP0, so why not.
> 
> Great, I will take the patch.

I tried to make this work, however because FRED RSP0 is per task and
keeps changing during context switch[1], we lose track of FRED RSP0
values from both host and guest, thus we need to:

1) *always* save guest FRED RSP0 in vmx_prepare_switch_to_host().

2) *always* restore guest FRED RSP0 in vmx_prepare_switch_to_guest(),
   because sometimes context switches happen but the CPU does NOT
   return to user mode thus the user return framework detects no change.

So it essentially becomes the same as what the original patch does.

I guess it's probably not worth the change — what do you think?

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9356c4b8886c4f7d3436c3f7fe31715bdcf1c79e


> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index
> > 1783986d8626..ebecb205e5de 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -352,6 +352,7 @@ static void kvm_on_user_return(struct
> > user_return_notifier *urn)
> >                 = container_of(urn, struct kvm_user_return_msrs, urn);
> >         struct kvm_user_return_msr_values *values;
> >         unsigned long flags;
> > +       u64 host_val;
> >
> >         /*
> >          * Disabling irqs at this point since the following code could
> > be @@ -365,9
> > +366,15 @@ static void kvm_on_user_return(struct user_return_notifier
> > +*urn)
> >         local_irq_restore(flags);
> >         for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
> >                 values = &msrs->values[slot];
> > -               if (values->host != values->curr) {
> > -                       wrmsrl(kvm_uret_msrs_list[slot], values->host);
> > -                       values->curr = values->host;
> > +
> > +               if (kvm_uret_msrs_list[slot] == MSR_IA32_FRED_RSP0)
> > +                       host_val = get_current_fred_rsp0();
> > +               else
> > +                       host_val = values->host;
> > +
> > +               if (host_val != values->curr) {
> > +                       wrmsrl(kvm_uret_msrs_list[slot], host_val);
> > +                       values->curr = host_val;
> >                 }
> >         }
> >  }
Sean Christopherson July 18, 2024, 1:59 p.m. UTC | #10
On Wed, Jul 17, 2024, Xin3 Li wrote:
> > > > > E.g. if it's somewhere in task_struct, then kvm_on_user_return()
> > > > > would restore the current task's desired RSP0.
> > > >
> > > > So you're suggesting to extend the framework to allow per task constants?
> > >
> > > Yeah, or more likely, special case MSR_IA32_FRED_RSP0.  If KVM didn't
> > > already have the user return framework, I wouldn't suggest this as I
> > > doubt avoiding WRMSR when switching between vCPU tasks will be very
> > > meaningful, but it's easy to handle FRED_RSP0, so why not.
> > 
> > Great, I will take the patch.
> 
> I tried to make this work, however because FRED RSP0 is per task and
> keeps changing during context switch[1], we lose track of FRED RSP0
> values from both host and guest,

Ah, right, the host value is volatile.  And MSR_IA32_FRED_RSP0 is passed through
to the guest, so the guest value is volatile/unknown too.

> thus we need to:
> 
> 1) *always* save guest FRED RSP0 in vmx_prepare_switch_to_host().
> 
> 2) *always* restore guest FRED RSP0 in vmx_prepare_switch_to_guest(),
>    because sometimes context switches happen but the CPU does NOT
>    return to user mode thus the user return framework detects no change.
> 
> So it essentially becomes the same as what the original patch does.
> 
> I guess It's probably not worth the change, how do you think?

One idea would be to have the kernel write MSR_IA32_FRED_RSP0 on return to
userspace instead of on context switch.  As is, amusingly, there's no need to
restore the host value if a context switch occurs as the kernel will have written
the new task's value.  RSP0 only needs to be manually restored if the kernel
returns to userspace for the vCPU task.  Using a TI flag to track if RSP0 needs
to be loaded would avoid a fair number of WRMSRs in both KVM and the kernel.

E.g.

diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index ce8f50192ae3..76724cc42869 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -57,6 +57,11 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
        if (unlikely(ti_work & _TIF_NEED_FPU_LOAD))
                switch_fpu_return();
 
+       if (cpu_feature_enabled(X86_FEATURE_FRED) &&
+           (ti_work & _TIF_NEED_RSP0_LOAD))
+               wrmsrns(MSR_IA32_FRED_RSP0,
+                       (unsigned long)task_stack_page(current) + THREAD_SIZE);
+
 #ifdef CONFIG_COMPAT
        /*
         * Compat syscalls set TS_COMPAT.  Make sure we clear it before
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index c3bd0c0758c9..1674d98a8850 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,8 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
        this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
        if (cpu_feature_enabled(X86_FEATURE_FRED)) {
-               /* WRMSRNS is a baseline feature for FRED. */
-               wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE);
+               set_thread_flag(TIF_NEED_RSP0_LOAD);
        } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) {
                /* Xen PV enters the kernel on the thread stack. */
                load_sp0(task_top_of_stack(task));
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5c6bb26463e8..cb7e3bcb001f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1338,15 +1338,9 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 
-       if (guest_can_use(vcpu, X86_FEATURE_FRED)) {
-               /*
-                * MSR_IA32_FRED_RSP0 is top of task stack, which never changes.
-                * Thus it should be initialized only once.
-                */
-               if (unlikely(vmx->msr_host_fred_rsp0 == 0))
-                       vmx->msr_host_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);
-               wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_guest_fred_rsp0);
-       }
+       if (cpu_feature_enabled(X86_FEATURE_FRED) &&
+           guest_can_use(vcpu, X86_FEATURE_FRED))
+               wrmsrns(MSR_IA32_FRED_RSP0, vmx->msr_guest_fred_rsp0);
 #else
        savesegment(fs, fs_sel);
        savesegment(gs, gs_sel);
@@ -1392,9 +1386,10 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 
-       if (guest_can_use(&vmx->vcpu, X86_FEATURE_FRED)) {
+       if (cpu_feature_enabled(X86_FEATURE_FRED) &&
+           guest_can_use(&vmx->vcpu, X86_FEATURE_FRED)) {
                vmx->msr_guest_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);
-               wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_host_fred_rsp0);
+               set_thread_flag(TIF_NEED_RSP0_LOAD);
        }
 #endif
        load_fixmap_gdt(raw_smp_processor_id());
Li, Xin3 July 18, 2024, 5:44 p.m. UTC | #11
> > thus we need to:
> >
> > 1) *always* save guest FRED RSP0 in vmx_prepare_switch_to_host().
> >
> > 2) *always* restore guest FRED RSP0 in vmx_prepare_switch_to_guest(),
> >    because sometimes context switches happen but the CPU does NOT
> >    return to user mode thus the user return framework detects no change.
> >
> > So it essentially becomes the same as what the original patch does.
> >
> > I guess It's probably not worth the change, how do you think?
> 
> One idea would be to have the kernel write MSR_IA32_FRED_RSP0 on return to
> userspace instead of on context switch.  As is, amusingly, there's no need to
> restore the host value if a context switch occurs as the kernel will have written
> the new task's value.  RSP0 only needs to be manually restored if the kernel
> returns to userspace for the vCPU task.  Using a TI flag to track if RSP0 needs to
> be loaded would avoid a fair number of WRMSRs in both KVM and the kernel.

I also thought about it (while in the FRED ERETU code path), however
we will need performance data first, even an extra check
(ti_work & _TIF_NEED_RSP0_LOAD) seems negligible, but both return to
user and context switch are hot paths, and I assume return to user is
extremely hot.

So let's revisit it at a later time?

> diff --git a/arch/x86/include/asm/entry-common.h
> b/arch/x86/include/asm/entry-common.h
> index ce8f50192ae3..76724cc42869 100644
> --- a/arch/x86/include/asm/entry-common.h
> +++ b/arch/x86/include/asm/entry-common.h
> @@ -57,6 +57,11 @@ static inline void arch_exit_to_user_mode_prepare(struct
> pt_regs *regs,
>         if (unlikely(ti_work & _TIF_NEED_FPU_LOAD))
>                 switch_fpu_return();
> 
> +       if (cpu_feature_enabled(X86_FEATURE_FRED) &&
> +           (ti_work & _TIF_NEED_RSP0_LOAD))
> +               wrmsrns(MSR_IA32_FRED_RSP0,
> +                       (unsigned long)task_stack_page(current) +
> + THREAD_SIZE);
> +
>  #ifdef CONFIG_COMPAT
>         /*
>          * Compat syscalls set TS_COMPAT.  Make sure we clear it before diff --git
> a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
> index c3bd0c0758c9..1674d98a8850 100644
> --- a/arch/x86/include/asm/switch_to.h
> +++ b/arch/x86/include/asm/switch_to.h
> @@ -71,8 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
>         this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);  #else
>         if (cpu_feature_enabled(X86_FEATURE_FRED)) {
> -               /* WRMSRNS is a baseline feature for FRED. */
> -               wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) +
> THREAD_SIZE);
> +               set_thread_flag(TIF_NEED_RSP0_LOAD);
>         } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) {
>                 /* Xen PV enters the kernel on the thread stack. */
>                 load_sp0(task_top_of_stack(task));
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> 5c6bb26463e8..cb7e3bcb001f 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -1338,15 +1338,9 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu
> *vcpu)
> 
>         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
> 
> -       if (guest_can_use(vcpu, X86_FEATURE_FRED)) {
> -               /*
> -                * MSR_IA32_FRED_RSP0 is top of task stack, which never changes.
> -                * Thus it should be initialized only once.
> -                */
> -               if (unlikely(vmx->msr_host_fred_rsp0 == 0))
> -                       vmx->msr_host_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);
> -               wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_guest_fred_rsp0);
> -       }
> +       if (cpu_feature_enabled(X86_FEATURE_FRED) &&

This check seems unnecessary, I guess you add it to skip the following
code when FRED is not enabled on host?

> +           guest_can_use(vcpu, X86_FEATURE_FRED))
> +               wrmsrns(MSR_IA32_FRED_RSP0, vmx->msr_guest_fred_rsp0);
>  #else
>         savesegment(fs, fs_sel);
>         savesegment(gs, gs_sel);
> @@ -1392,9 +1386,10 @@ static void vmx_prepare_switch_to_host(struct
> vcpu_vmx *vmx)  #ifdef CONFIG_X86_64
>         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
> 
> -       if (guest_can_use(&vmx->vcpu, X86_FEATURE_FRED)) {
> +       if (cpu_feature_enabled(X86_FEATURE_FRED) &&
> +           guest_can_use(&vmx->vcpu, X86_FEATURE_FRED)) {
>                 vmx->msr_guest_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);
> -               wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_host_fred_rsp0);
> +               set_thread_flag(TIF_NEED_RSP0_LOAD);
>         }
>  #endif
>         load_fixmap_gdt(raw_smp_processor_id());
H. Peter Anvin July 18, 2024, 9:04 p.m. UTC | #12
On July 12, 2024 8:12:51 AM PDT, Sean Christopherson <seanjc@google.com> wrote:
>On Wed, Jul 10, 2024, Xin3 Li wrote:
>> > On Wed, Feb 07, 2024, Xin Li wrote:
>> > > Switch MSR_IA32_FRED_RSP0 between host and guest in
>> > > vmx_prepare_switch_to_{host,guest}().
>> > >
>> > > MSR_IA32_FRED_RSP0 is used during ring 3 event delivery only, thus
>> > > KVM, running on ring 0, can run safely with guest FRED RSP0, i.e., no
>> > > need to switch between host/guest FRED RSP0 during VM entry and exit.
>> > >
>> > > KVM should switch to host FRED RSP0 before returning to user level,
>> > > and switch to guest FRED RSP0 before entering guest mode.
>> > 
>> > Heh, if only KVM had a framework that was specifically designed for context
>> > switching MSRs on return to userspace.  Translation: please use the
>> > user_return_msr() APIs.
>> 
>> IIUC the user return MSR framework works for MSRs that are per CPU
>> constants, but like MSR_KERNEL_GS_BASE, MSR_IA32_FRED_RSP0 is a per
>> *task* constant, thus we can't use it.
>
>Ah, in that case, the changelog is very misleading and needs to be fixed.
>Alternatively, is the desired RSP0 value tracked anywhere other than the MSR?
>E.g. if it's somewhere in task_struct, then kvm_on_user_return() would restore
>the current task's desired RSP0.  Even if we don't get fancy, avoiding the RDMSR
>to get the current task's value would be nice.

Hm, perhaps the right thing to do is to always invoke this function before a context switch happens if that happens before return to user space?
Sean Christopherson July 19, 2024, 3:59 p.m. UTC | #13
On Thu, Jul 18, 2024, H. Peter Anvin wrote:
> On July 12, 2024 8:12:51 AM PDT, Sean Christopherson <seanjc@google.com> wrote:
> >On Wed, Jul 10, 2024, Xin3 Li wrote:
> >> > On Wed, Feb 07, 2024, Xin Li wrote:
> >> > > Switch MSR_IA32_FRED_RSP0 between host and guest in
> >> > > vmx_prepare_switch_to_{host,guest}().
> >> > >
> >> > > MSR_IA32_FRED_RSP0 is used during ring 3 event delivery only, thus
> >> > > KVM, running on ring 0, can run safely with guest FRED RSP0, i.e., no
> >> > > need to switch between host/guest FRED RSP0 during VM entry and exit.
> >> > >
> >> > > KVM should switch to host FRED RSP0 before returning to user level,
> >> > > and switch to guest FRED RSP0 before entering guest mode.
> >> > 
> >> > Heh, if only KVM had a framework that was specifically designed for context
> >> > switching MSRs on return to userspace.  Translation: please use the
> >> > user_return_msr() APIs.
> >> 
> >> IIUC the user return MSR framework works for MSRs that are per CPU
> >> constants, but like MSR_KERNEL_GS_BASE, MSR_IA32_FRED_RSP0 is a per
> >> *task* constant, thus we can't use it.
> >
> >Ah, in that case, the changelog is very misleading and needs to be fixed.
> >Alternatively, is the desired RSP0 value tracked anywhere other than the MSR?
> >E.g. if it's somewhere in task_struct, then kvm_on_user_return() would restore
> >the current task's desired RSP0.  Even if we don't get fancy, avoiding the RDMSR
> >to get the current task's value would be nice.
> 
> Hm, perhaps the right thing to do is to always invoke this function before a
> context switch happens if that happens before return to user space?

Actually, if the _TIF_NEED_RSP0_LOAD doesn't provide a meaningful benefit (or
y'all just don't want it :-) ), what KVM could do is restore MSR_IA32_FRED_RSP0
when putting the vCPU and the vCPU is not being scheduled out, i.e. if and only
if KVM can't guarantee a context switch.

If the vCPU/task is being scheduled out, update_task_stack() is guaranteed to
write MSR_IA32_FRED_RSP0 with the new task's value.

On top of kvm/next, which adds the necessary vcpu->scheduled_out:

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5c6bb26463e8..4532ae943f2a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1338,15 +1338,9 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 
-       if (guest_can_use(vcpu, X86_FEATURE_FRED)) {
-               /*
-                * MSR_IA32_FRED_RSP0 is top of task stack, which never changes.
-                * Thus it should be initialized only once.
-                */
-               if (unlikely(vmx->msr_host_fred_rsp0 == 0))
-                       vmx->msr_host_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);
-               wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_guest_fred_rsp0);
-       }
+       if (cpu_feature_enabled(X86_FEATURE_FRED) &&
+           guest_can_use(vcpu, X86_FEATURE_FRED))
+               wrmsrns(MSR_IA32_FRED_RSP0, vmx->msr_guest_fred_rsp0);
 #else
        savesegment(fs, fs_sel);
        savesegment(gs, gs_sel);
@@ -1392,9 +1386,13 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 
-       if (guest_can_use(&vmx->vcpu, X86_FEATURE_FRED)) {
+       if (cpu_feature_enabled(X86_FEATURE_FRED) &&
+           guest_can_use(&vmx->vcpu, X86_FEATURE_FRED)) {
                vmx->msr_guest_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);
-               wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_host_fred_rsp0);
+
+               if (!vcpu->scheduled_out)
+                       wrmsrns(MSR_IA32_FRED_RSP0,
+                                (unsigned long)task_stack_page(task) + THREAD_SIZE);
        }
 #endif
        load_fixmap_gdt(raw_smp_processor_id());
Li, Xin3 July 21, 2024, 6:09 p.m. UTC | #14
> On Thu, Jul 18, 2024, H. Peter Anvin wrote:
> > On July 12, 2024 8:12:51 AM PDT, Sean Christopherson <seanjc@google.com>
> wrote:
> > >On Wed, Jul 10, 2024, Xin3 Li wrote:
> > >> > On Wed, Feb 07, 2024, Xin Li wrote:
> > >> > > Switch MSR_IA32_FRED_RSP0 between host and guest in
> > >> > > vmx_prepare_switch_to_{host,guest}().
> > >> > >
> > >> > > MSR_IA32_FRED_RSP0 is used during ring 3 event delivery only,
> > >> > > thus KVM, running on ring 0, can run safely with guest FRED
> > >> > > RSP0, i.e., no need to switch between host/guest FRED RSP0 during VM
> entry and exit.
> > >> > >
> > >> > > KVM should switch to host FRED RSP0 before returning to user
> > >> > > level, and switch to guest FRED RSP0 before entering guest mode.
> > >> >
> > >> > Heh, if only KVM had a framework that was specifically designed
> > >> > for context switching MSRs on return to userspace.  Translation:
> > >> > please use the
> > >> > user_return_msr() APIs.
> > >>
> > >> IIUC the user return MSR framework works for MSRs that are per CPU
> > >> constants, but like MSR_KERNEL_GS_BASE, MSR_IA32_FRED_RSP0 is a per
> > >> *task* constant, thus we can't use it.
> > >
> > >Ah, in that case, the changelog is very misleading and needs to be fixed.
> > >Alternatively, is the desired RSP0 value tracked anywhere other than the MSR?
> > >E.g. if it's somewhere in task_struct, then kvm_on_user_return()
> > >would restore the current task's desired RSP0.  Even if we don't get
> > >fancy, avoiding the RDMSR to get the current task's value would be nice.
> >
> > Hm, perhaps the right thing to do is to always invoke this function
> > before a context switch happens if that happens before return to user space?
> 
> Actually, if the _TIF_NEED_RSP0_LOAD doesn't provide a meaningful benefit (or
> y'all just don't want it :-) ), 

We want it 😊.

My concern was adding an extra check of (ti_work & _TIF_NEED_RSP0_LOAD)
into the hot function arch_exit_to_user_mode_prepare().  HPA reviewed the
function and suggested testing ti_work for zero first, and only then
processing its individual bits:

diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index fb2809b20b0a..4c78b99060b5 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -47,15 +47,17 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
 static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
                                                  unsigned long ti_work)
 {
-       if (ti_work & _TIF_USER_RETURN_NOTIFY)
-               fire_user_return_notifiers();
+       if (ti_work) {
+               if (ti_work & _TIF_USER_RETURN_NOTIFY)
+                       fire_user_return_notifiers();

-       if (unlikely(ti_work & _TIF_IO_BITMAP))
-               tss_update_io_bitmap();
+               if (unlikely(ti_work & _TIF_IO_BITMAP))
+                       tss_update_io_bitmap();

-       fpregs_assert_state_consistent();
-       if (unlikely(ti_work & _TIF_NEED_FPU_LOAD))
-               switch_fpu_return();
+               fpregs_assert_state_consistent();
+               if (unlikely(ti_work & _TIF_NEED_FPU_LOAD))
+                       switch_fpu_return();
+       }

 #ifdef CONFIG_COMPAT
        /*

Based on that, I measured how many out of every one million ti_work
values are zero during kernel build tests: it's over 99%, i.e., ti_work
being non-zero is indeed unlikely(), matching the patch above.

When booting a KVM guest, it drops to 75%, which is expected.  After the
guest is up and running a kernel build inside it, it's 99% again.

So at least this patch seems a low-hanging fruit, and I have sent it to
Intel 0day for broader perf tests.

As context switches are far less frequent than exits to user mode, I do
NOT expect it to make a measurable difference whether MSR_IA32_FRED_RSP0
is written on exit to user mode instead of on context switch, especially
when done on top of the above patch.
diff mbox series

Patch

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b7b772183ee4..264378c3b784 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1337,6 +1337,16 @@  void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 	}
 
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+
+	if (guest_can_use(vcpu, X86_FEATURE_FRED)) {
+		/*
+		 * MSR_IA32_FRED_RSP0 is top of task stack, which never changes.
+		 * Thus it should be initialized only once.
+		 */
+		if (unlikely(vmx->msr_host_fred_rsp0 == 0))
+			vmx->msr_host_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);
+		wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_guest_fred_rsp0);
+	}
 #else
 	savesegment(fs, fs_sel);
 	savesegment(gs, gs_sel);
@@ -1381,6 +1391,11 @@  static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
 	invalidate_tss_limit();
 #ifdef CONFIG_X86_64
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+
+	if (guest_can_use(&vmx->vcpu, X86_FEATURE_FRED)) {
+		vmx->msr_guest_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0);
+		wrmsrl(MSR_IA32_FRED_RSP0, vmx->msr_host_fred_rsp0);
+	}
 #endif
 	load_fixmap_gdt(raw_smp_processor_id());
 	vmx->guest_state_loaded = false;
@@ -4889,6 +4904,8 @@  static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 #ifdef CONFIG_X86_64
 	if (kvm_cpu_cap_has(X86_FEATURE_FRED)) {
+		vmx->msr_guest_fred_rsp0 = 0;
+
 		vmcs_write64(GUEST_IA32_FRED_CONFIG, 0);
 		vmcs_write64(GUEST_IA32_FRED_RSP1, 0);
 		vmcs_write64(GUEST_IA32_FRED_RSP2, 0);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 3ad52437f426..176ad39be406 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -278,6 +278,8 @@  struct vcpu_vmx {
 #ifdef CONFIG_X86_64
 	u64		      msr_host_kernel_gs_base;
 	u64		      msr_guest_kernel_gs_base;
+	u64		      msr_host_fred_rsp0;
+	u64		      msr_guest_fred_rsp0;
 #endif
 
 	u64		      spec_ctrl;