
[RFC,v2,06/34] x86/msr: Use the alternatives mechanism to read PMC

Message ID 20250422082216.1954310-7-xin@zytor.com
State New
Series MSR refactor with new MSR instructions support

Commit Message

Xin Li April 22, 2025, 8:21 a.m. UTC
To eliminate the indirect call overhead introduced by the pv_ops API,
use the alternatives mechanism to read PMC:

    1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
       disabled feature, preventing the Xen PMC read code from being
       built and ensuring the native code is executed unconditionally.

    2) When built with CONFIG_XEN_PV:

       2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
            the kernel runtime binary is patched to unconditionally
            jump to the native PMC read code.

       2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
            kernel runtime binary is patched to unconditionally jump
            to the Xen PMC read code.

Consequently, remove the pv_ops PMC read API.

Signed-off-by: Xin Li (Intel) <xin@zytor.com>
---
 arch/x86/include/asm/msr.h            | 31 ++++++++++++++++++++-------
 arch/x86/include/asm/paravirt.h       |  5 -----
 arch/x86/include/asm/paravirt_types.h |  2 --
 arch/x86/kernel/paravirt.c            |  1 -
 arch/x86/xen/enlighten_pv.c           |  2 --
 drivers/net/vmxnet3/vmxnet3_drv.c     |  2 +-
 6 files changed, 24 insertions(+), 19 deletions(-)
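
As background on why the X86_FEATURE_XENPV check used by this patch is
free in case 1): cpu_feature_enabled() constant-folds features that are
compiled out. Below is a rough model of the check, a sketch with
simplified names, not the actual <asm/cpufeature.h> definitions:

#include <stdbool.h>

#ifdef CONFIG_XEN_PV
# define XENPV_POSSIBLE	1
#else
# define XENPV_POSSIBLE	0	/* X86_FEATURE_XENPV is a disabled feature */
#endif

bool static_cpu_has_xenpv(void);	/* stand-in for static_cpu_has() */

static inline bool xenpv_enabled(void)
{
	if (!XENPV_POSSIBLE)
		return false;		/* Xen branch compiled out entirely */
	return static_cpu_has_xenpv();	/* jump patched once at boot */
}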

Comments

Jürgen Groß April 22, 2025, 8:38 a.m. UTC | #1
On 22.04.25 10:21, Xin Li (Intel) wrote:
> To eliminate the indirect call overhead introduced by the pv_ops API,
> use the alternatives mechanism to read PMC:

Which indirect call overhead? The indirect call is patched via the
alternative mechanism to a direct one.

> 
>      1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
>         disabled feature, preventing the Xen PMC read code from being
>         built and ensuring the native code is executed unconditionally.

Without CONFIG_XEN_PV, CONFIG_PARAVIRT_XXL is not selected, resulting in
native code anyway.

> 
>      2) When built with CONFIG_XEN_PV:
> 
>         2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
>              the kernel runtime binary is patched to unconditionally
>              jump to the native PMC read code.
> 
>         2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
>              kernel runtime binary is patched to unconditionally jump
>              to the Xen PMC read code.
> 
> Consequently, remove the pv_ops PMC read API.

I don't see the value of this patch.

It adds more #ifdef and code lines without any real gain.

In case the x86 maintainers think it is still worth it, I won't object.


Juergen

> 
> Signed-off-by: Xin Li (Intel) <xin@zytor.com>
> ---
>   arch/x86/include/asm/msr.h            | 31 ++++++++++++++++++++-------
>   arch/x86/include/asm/paravirt.h       |  5 -----
>   arch/x86/include/asm/paravirt_types.h |  2 --
>   arch/x86/kernel/paravirt.c            |  1 -
>   arch/x86/xen/enlighten_pv.c           |  2 --
>   drivers/net/vmxnet3/vmxnet3_drv.c     |  2 +-
>   6 files changed, 24 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
> index 01dc8e61ef97..33cf506e2fd6 100644
> --- a/arch/x86/include/asm/msr.h
> +++ b/arch/x86/include/asm/msr.h
> @@ -8,6 +8,7 @@
>   
>   #include <asm/asm.h>
>   #include <asm/errno.h>
> +#include <asm/cpufeature.h>
>   #include <asm/cpumask.h>
>   #include <uapi/asm/msr.h>
>   #include <asm/shared/msr.h>
> @@ -73,6 +74,10 @@ static inline void do_trace_read_msr(u32 msr, u64 val, int failed) {}
>   static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {}
>   #endif
>   
> +#ifdef CONFIG_XEN_PV
> +extern u64 xen_read_pmc(int counter);
> +#endif
> +
>   /*
>    * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
>    * accessors and should not have any tracing or other functionality piggybacking
> @@ -170,16 +175,32 @@ native_write_msr_safe(u32 msr, u32 low, u32 high)
>   extern int rdmsr_safe_regs(u32 regs[8]);
>   extern int wrmsr_safe_regs(u32 regs[8]);
>   
> -static inline u64 native_read_pmc(int counter)
> +static __always_inline u64 native_rdpmcq(int counter)
>   {
>   	DECLARE_ARGS(val, low, high);
>   
> -	asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
> +	asm_inline volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
> +
>   	if (tracepoint_enabled(rdpmc))
>   		do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
> +
>   	return EAX_EDX_VAL(val, low, high);
>   }
>   
> +static __always_inline u64 rdpmcq(int counter)
> +{
> +#ifdef CONFIG_XEN_PV
> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> +		return xen_read_pmc(counter);
> +#endif
> +
> +	/*
> +	 * 1) When built with !CONFIG_XEN_PV.
> +	 * 2) When built with CONFIG_XEN_PV but not running on Xen hypervisor.
> +	 */
> +	return native_rdpmcq(counter);
> +}
> +
>   #ifdef CONFIG_PARAVIRT_XXL
>   #include <asm/paravirt.h>
>   #else
> @@ -233,12 +254,6 @@ static inline int rdmsrq_safe(u32 msr, u64 *p)
>   	*p = native_read_msr_safe(msr, &err);
>   	return err;
>   }
> -
> -static __always_inline u64 rdpmcq(int counter)
> -{
> -	return native_read_pmc(counter);
> -}
> -
>   #endif	/* !CONFIG_PARAVIRT_XXL */
>   
>   /* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index 590824916394..c7689f5f70d6 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -239,11 +239,6 @@ static inline int rdmsrq_safe(unsigned msr, u64 *p)
>   	return err;
>   }
>   
> -static __always_inline u64 rdpmcq(int counter)
> -{
> -	return PVOP_CALL1(u64, cpu.read_pmc, counter);
> -}
> -
>   static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
>   {
>   	PVOP_VCALL2(cpu.alloc_ldt, ldt, entries);
> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
> index 631c306ce1ff..475f508531d6 100644
> --- a/arch/x86/include/asm/paravirt_types.h
> +++ b/arch/x86/include/asm/paravirt_types.h
> @@ -101,8 +101,6 @@ struct pv_cpu_ops {
>   	u64 (*read_msr_safe)(unsigned int msr, int *err);
>   	int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high);
>   
> -	u64 (*read_pmc)(int counter);
> -
>   	void (*start_context_switch)(struct task_struct *prev);
>   	void (*end_context_switch)(struct task_struct *next);
>   #endif
> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> index 1ccd05d8999f..28d195ad7514 100644
> --- a/arch/x86/kernel/paravirt.c
> +++ b/arch/x86/kernel/paravirt.c
> @@ -132,7 +132,6 @@ struct paravirt_patch_template pv_ops = {
>   	.cpu.write_msr		= native_write_msr,
>   	.cpu.read_msr_safe	= native_read_msr_safe,
>   	.cpu.write_msr_safe	= native_write_msr_safe,
> -	.cpu.read_pmc		= native_read_pmc,
>   	.cpu.load_tr_desc	= native_load_tr_desc,
>   	.cpu.set_ldt		= native_set_ldt,
>   	.cpu.load_gdt		= native_load_gdt,
> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
> index 846b5737d320..9fbe187aff00 100644
> --- a/arch/x86/xen/enlighten_pv.c
> +++ b/arch/x86/xen/enlighten_pv.c
> @@ -1236,8 +1236,6 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = {
>   		.read_msr_safe = xen_read_msr_safe,
>   		.write_msr_safe = xen_write_msr_safe,
>   
> -		.read_pmc = xen_read_pmc,
> -
>   		.load_tr_desc = paravirt_nop,
>   		.set_ldt = xen_set_ldt,
>   		.load_gdt = xen_load_gdt,
> diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
> index 7edd0b5e0e77..8af3b4d7ef4d 100644
> --- a/drivers/net/vmxnet3/vmxnet3_drv.c
> +++ b/drivers/net/vmxnet3/vmxnet3_drv.c
> @@ -151,7 +151,7 @@ static u64
>   vmxnet3_get_cycles(int pmc)
>   {
>   #ifdef CONFIG_X86
> -	return native_read_pmc(pmc);
> +	return native_rdpmcq(pmc);
>   #else
>   	return 0;
>   #endif
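
For readers following the disagreement: at the source level, the path
this patch removes is an indirect call through the pv_ops table. A
simplified model of such a call site (illustrative names and types, not
the kernel's PVOP_CALL machinery):

typedef unsigned long long u64;

struct pv_cpu_ops_model {
	u64 (*read_pmc)(int counter);
};

extern struct pv_cpu_ops_model pv_model;

static inline u64 rdpmcq_model(int counter)
{
	/* Written as an indirect call; whether one actually executes
	 * at run time is the crux of the exchange that follows. */
	return pv_model.read_pmc(counter);
}
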
Xin Li April 22, 2025, 9:12 a.m. UTC | #2
On 4/22/2025 1:38 AM, Jürgen Groß wrote:
> On 22.04.25 10:21, Xin Li (Intel) wrote:
>> To eliminate the indirect call overhead introduced by the pv_ops API,
>> use the alternatives mechanism to read PMC:
> 
> Which indirect call overhead? The indirect call is patched via the
> alternative mechanism to a direct one.
> 

See below.


>>
>>      1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
>>         disabled feature, preventing the Xen PMC read code from being
>>         built and ensuring the native code is executed unconditionally.
> 
> Without CONFIG_XEN_PV, CONFIG_PARAVIRT_XXL is not selected, resulting in
> native code anyway.

Yes, this is kept in this patch, but in a slightly different way.

> 
>>
>>      2) When built with CONFIG_XEN_PV:
>>
>>         2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
>>              the kernel runtime binary is patched to unconditionally
>>              jump to the native PMC read code.
>>
>>         2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
>>              kernel runtime binary is patched to unconditionally jump
>>              to the Xen PMC read code.
>>
>> Consequently, remove the pv_ops PMC read API.
> 
> I don't see the value of this patch.
> 
> It adds more #ifdef and code lines without any real gain.
> 
> In case the x86 maintainers think it is still worth it, I won't object.

I think we want to totally bypass pv_ops in case 2.1).

Do you mean the indirect call is patched to call native code *directly*
for 2.1?  I wasn't aware of that; can you please elaborate?

AFAIK, Xen PV has been the sole user of pv_ops for nearly 20 years. This
raises significant doubts about whether pv_ops provides Linux with the
value of being a well-abstracted "CPU" or "Platform".  And the x86
maintainers have said that it's a maintenance nightmare.

Thanks!
     Xin
Jürgen Groß April 22, 2025, 9:28 a.m. UTC | #3
On 22.04.25 11:12, Xin Li wrote:
> On 4/22/2025 1:38 AM, Jürgen Groß wrote:
>> On 22.04.25 10:21, Xin Li (Intel) wrote:
>>> To eliminate the indirect call overhead introduced by the pv_ops API,
>>> use the alternatives mechanism to read PMC:
>>
>> Which indirect call overhead? The indirect call is patched via the
>> alternative mechanism to a direct one.
>>
> 
> See below.
> 
> 
>>>
>>>      1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
>>>         disabled feature, preventing the Xen PMC read code from being
>>>         built and ensuring the native code is executed unconditionally.
>>
>> Without CONFIG_XEN_PV, CONFIG_PARAVIRT_XXL is not selected, resulting in
>> native code anyway.
> 
> Yes, this is kept in this patch, but in a slightly different way.
> 
>>
>>>
>>>      2) When built with CONFIG_XEN_PV:
>>>
>>>         2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
>>>              the kernel runtime binary is patched to unconditionally
>>>              jump to the native PMC read code.
>>>
>>>         2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
>>>              kernel runtime binary is patched to unconditionally jump
>>>              to the Xen PMC read code.
>>>
>>> Consequently, remove the pv_ops PMC read API.
>>
>> I don't see the value of this patch.
>>
>> It adds more #ifdef and code lines without any real gain.
>>
>> In case the x86 maintainers think it is still worth it, I won't object.
> 
> I think we want to totally bypass pv_ops in case 2.1).
> 
> Do you mean the indirect call is patched to call native code *directly*
> for 2.1?  I wasn't aware of that; can you please elaborate?

All paravirt indirect calls are patched to direct calls via the normal
alternative patch mechanism.

Have a look at alt_replace_call() in arch/x86/kernel/alternative.c

> AFAIK, Xen PV has been the sole user of pv_ops for nearly 20 years. This

Not quite. There was lguest until I ripped it out. :-)

And some use cases are left for KVM and Hyper-V guests (I have kept those
behind CONFIG_PARAVIRT, while the Xen-specific parts are behind
CONFIG_PARAVIRT_XXL now).

> raises significant doubts about whether pv_ops provides Linux with the
> value of being a well-abstracted "CPU" or "Platform".  And the x86
> maintainers have said that it's a maintenance nightmare.

I have worked rather hard to make it less intrusive, especially by removing
the paravirt-specific code patching (now all done via alternative patching)
and by removing 32-bit Xen PV mode.


Juergen
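
To make the mechanism Juergen describes concrete: a pv_ops call site is
emitted as "call *pv_ops.field(%rip)" (ff 15 disp32, 6 bytes), and at
boot alt_replace_call() reads the current pointer out of pv_ops and
rewrites the site into a direct call. Below is a sketch of the core
rewrite; the helper is illustrative, not the kernel function, though the
opcodes are the standard x86 encodings:

#include <stdint.h>
#include <string.h>

#define CALL_INSN_OPCODE	0xe8	/* direct near call, rel32 */
#define CALL_INSN_SIZE		5

/* Rewrite a 6-byte indirect call site at @site into "e8 rel32" plus a
 * one-byte NOP, so no indirect call is left at run time. */
static void patch_to_direct_call(uint8_t *site, uint8_t *target)
{
	int32_t rel32 = (int32_t)(target - (site + CALL_INSN_SIZE));

	site[0] = CALL_INSN_OPCODE;
	memcpy(&site[1], &rel32, sizeof(rel32));
	site[5] = 0x90;			/* pad the leftover byte */
}
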
Xin Li April 23, 2025, 7:40 a.m. UTC | #4
On 4/22/2025 2:28 AM, Juergen Gross wrote:
> 
> I have worked rather hard to make it less intrusive, especially by removing
> the paravirt specific code patching (now all done via alternative patching)
> and by removing 32-bit Xen PV mode.

I looked at the optimization, and it is a nice improvement.
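
The net effect for callers is summarized by the final rdpmcq() in the
patch below; a sketch of a call site, with an illustrative counter index
(not a specific event):

/* On !Xen this compiles down to a bare RDPMC; on a Xen PV guest the
 * boot-time patched jump lands in xen_read_pmc() instead. */
static inline u64 sample_pmc0(void)
{
	return rdpmcq(0);	/* counter 0: illustrative index only */
}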

Patch

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 01dc8e61ef97..33cf506e2fd6 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -8,6 +8,7 @@ 
 
 #include <asm/asm.h>
 #include <asm/errno.h>
+#include <asm/cpufeature.h>
 #include <asm/cpumask.h>
 #include <uapi/asm/msr.h>
 #include <asm/shared/msr.h>
@@ -73,6 +74,10 @@  static inline void do_trace_read_msr(u32 msr, u64 val, int failed) {}
 static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {}
 #endif
 
+#ifdef CONFIG_XEN_PV
+extern u64 xen_read_pmc(int counter);
+#endif
+
 /*
  * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
  * accessors and should not have any tracing or other functionality piggybacking
@@ -170,16 +175,32 @@  native_write_msr_safe(u32 msr, u32 low, u32 high)
 extern int rdmsr_safe_regs(u32 regs[8]);
 extern int wrmsr_safe_regs(u32 regs[8]);
 
-static inline u64 native_read_pmc(int counter)
+static __always_inline u64 native_rdpmcq(int counter)
 {
 	DECLARE_ARGS(val, low, high);
 
-	asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+	asm_inline volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+
 	if (tracepoint_enabled(rdpmc))
 		do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
+
 	return EAX_EDX_VAL(val, low, high);
 }
 
+static __always_inline u64 rdpmcq(int counter)
+{
+#ifdef CONFIG_XEN_PV
+	if (cpu_feature_enabled(X86_FEATURE_XENPV))
+		return xen_read_pmc(counter);
+#endif
+
+	/*
+	 * 1) When built with !CONFIG_XEN_PV.
+	 * 2) When built with CONFIG_XEN_PV but not running on Xen hypervisor.
+	 */
+	return native_rdpmcq(counter);
+}
+
 #ifdef CONFIG_PARAVIRT_XXL
 #include <asm/paravirt.h>
 #else
@@ -233,12 +254,6 @@  static inline int rdmsrq_safe(u32 msr, u64 *p)
 	*p = native_read_msr_safe(msr, &err);
 	return err;
 }
-
-static __always_inline u64 rdpmcq(int counter)
-{
-	return native_read_pmc(counter);
-}
-
 #endif	/* !CONFIG_PARAVIRT_XXL */
 
 /* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 590824916394..c7689f5f70d6 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -239,11 +239,6 @@  static inline int rdmsrq_safe(unsigned msr, u64 *p)
 	return err;
 }
 
-static __always_inline u64 rdpmcq(int counter)
-{
-	return PVOP_CALL1(u64, cpu.read_pmc, counter);
-}
-
 static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
 	PVOP_VCALL2(cpu.alloc_ldt, ldt, entries);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 631c306ce1ff..475f508531d6 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -101,8 +101,6 @@  struct pv_cpu_ops {
 	u64 (*read_msr_safe)(unsigned int msr, int *err);
 	int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high);
 
-	u64 (*read_pmc)(int counter);
-
 	void (*start_context_switch)(struct task_struct *prev);
 	void (*end_context_switch)(struct task_struct *next);
 #endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1ccd05d8999f..28d195ad7514 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -132,7 +132,6 @@  struct paravirt_patch_template pv_ops = {
 	.cpu.write_msr		= native_write_msr,
 	.cpu.read_msr_safe	= native_read_msr_safe,
 	.cpu.write_msr_safe	= native_write_msr_safe,
-	.cpu.read_pmc		= native_read_pmc,
 	.cpu.load_tr_desc	= native_load_tr_desc,
 	.cpu.set_ldt		= native_set_ldt,
 	.cpu.load_gdt		= native_load_gdt,
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 846b5737d320..9fbe187aff00 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1236,8 +1236,6 @@  static const typeof(pv_ops) xen_cpu_ops __initconst = {
 		.read_msr_safe = xen_read_msr_safe,
 		.write_msr_safe = xen_write_msr_safe,
 
-		.read_pmc = xen_read_pmc,
-
 		.load_tr_desc = paravirt_nop,
 		.set_ldt = xen_set_ldt,
 		.load_gdt = xen_load_gdt,
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 7edd0b5e0e77..8af3b4d7ef4d 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -151,7 +151,7 @@  static u64
 vmxnet3_get_cycles(int pmc)
 {
 #ifdef CONFIG_X86
-	return native_read_pmc(pmc);
+	return native_rdpmcq(pmc);
 #else
 	return 0;
 #endif