Message ID | 20250422082216.1954310-7-xin@zytor.com
---|---
State | New
Series | MSR refactor with new MSR instructions support
On 22.04.25 10:21, Xin Li (Intel) wrote:
> To eliminate the indirect call overhead introduced by the pv_ops API,
> use the alternatives mechanism to read PMC:

Which indirect call overhead? The indirect call is patched via the
alternative mechanism to a direct one.

> 1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
>    disabled feature, preventing the Xen PMC read code from being
>    built and ensuring the native code is executed unconditionally.

Without CONFIG_XEN_PV, CONFIG_PARAVIRT_XXL is not selected, resulting
in native code anyway.

> 2) When built with CONFIG_XEN_PV:
>
>    2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
>         the kernel runtime binary is patched to unconditionally
>         jump to the native PMC read code.
>
>    2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
>         kernel runtime binary is patched to unconditionally jump
>         to the Xen PMC read code.
>
> Consequently, remove the pv_ops PMC read API.

I don't see the value of this patch.

It adds more #ifdefs and code lines without any real gain.

In case the x86 maintainers think it is still worth it, I won't object.


Juergen
On 4/22/2025 1:38 AM, Jürgen Groß wrote:
> On 22.04.25 10:21, Xin Li (Intel) wrote:
>> To eliminate the indirect call overhead introduced by the pv_ops API,
>> use the alternatives mechanism to read PMC:
>
> Which indirect call overhead? The indirect call is patched via the
> alternative mechanism to a direct one.

See below.

>> 1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
>>    disabled feature, preventing the Xen PMC read code from being
>>    built and ensuring the native code is executed unconditionally.
>
> Without CONFIG_XEN_PV, CONFIG_PARAVIRT_XXL is not selected, resulting
> in native code anyway.

Yes, this is kept in this patch, but in a slightly different way.

>> 2) When built with CONFIG_XEN_PV:
>>
>>    2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
>>         the kernel runtime binary is patched to unconditionally
>>         jump to the native PMC read code.
>>
>>    2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
>>         kernel runtime binary is patched to unconditionally jump
>>         to the Xen PMC read code.
>>
>> Consequently, remove the pv_ops PMC read API.
>
> I don't see the value of this patch.
>
> It adds more #ifdefs and code lines without any real gain.
>
> In case the x86 maintainers think it is still worth it, I won't object.

I think we want to totally bypass pv_ops in case 2.1).

Do you mean the indirect call is patched to call native code *directly*
for 2.1? I don't know it; can you please elaborate?

AFAIK, Xen PV has been the sole user of pv_ops for nearly 20 years.
This raises significant doubts about whether pv_ops provides Linux with
the value of being a well-abstracted "CPU" or "Platform". And the x86
maintainers have said that it's a maintenance nightmare.

Thanks!
    Xin
On 22.04.25 11:12, Xin Li wrote:
> On 4/22/2025 1:38 AM, Jürgen Groß wrote:
>> On 22.04.25 10:21, Xin Li (Intel) wrote:
>>> To eliminate the indirect call overhead introduced by the pv_ops API,
>>> use the alternatives mechanism to read PMC:
>>
>> Which indirect call overhead? The indirect call is patched via the
>> alternative mechanism to a direct one.
>
> See below.
>
>>> 1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
>>>    disabled feature, preventing the Xen PMC read code from being
>>>    built and ensuring the native code is executed unconditionally.
>>
>> Without CONFIG_XEN_PV, CONFIG_PARAVIRT_XXL is not selected, resulting
>> in native code anyway.
>
> Yes, this is kept in this patch, but in a slightly different way.
>
>>> 2) When built with CONFIG_XEN_PV:
>>>
>>>    2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
>>>         the kernel runtime binary is patched to unconditionally
>>>         jump to the native PMC read code.
>>>
>>>    2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
>>>         kernel runtime binary is patched to unconditionally jump
>>>         to the Xen PMC read code.
>>>
>>> Consequently, remove the pv_ops PMC read API.
>>
>> I don't see the value of this patch.
>>
>> It adds more #ifdefs and code lines without any real gain.
>>
>> In case the x86 maintainers think it is still worth it, I won't object.
>
> I think we want to totally bypass pv_ops in case 2.1).
>
> Do you mean the indirect call is patched to call native code *directly*
> for 2.1? I don't know it; can you please elaborate?

All paravirt indirect calls are patched to direct calls via the normal
alternative patching mechanism. Have a look at alt_replace_call() in
arch/x86/kernel/alternative.c.

> AFAIK, Xen PV has been the sole user of pv_ops for nearly 20 years.

Not quite. There was lguest until I ripped it out. :-)

And some use cases are left for KVM and Hyper-V guests (I have kept
those behind CONFIG_PARAVIRT, while the Xen-specific parts are behind
CONFIG_PARAVIRT_XXL now).

> This raises significant doubts about whether pv_ops provides Linux with
> the value of being a well-abstracted "CPU" or "Platform". And the x86
> maintainers have said that it's a maintenance nightmare.

I have worked rather hard to make it less intrusive, especially by
removing the paravirt-specific code patching (now all done via
alternative patching) and by removing 32-bit Xen PV mode.


Juergen
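To make the rewrite Juergen describes concrete, here is a minimal userspace sketch. The buffer contents, addresses, and helper name are made up for illustration; the kernel's real logic is alt_replace_call() in arch/x86/kernel/alternative.c, which patches live kernel text. The idea is the same: a 6-byte indirect CALL through a RIP-relative pv_ops slot (opcode ff 15 + disp32) is rewritten into a 5-byte direct CALL rel32 (opcode e8) padded with a one-byte NOP.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CALL_INSN_OPCODE	0xE8	/* direct CALL rel32 */
#define CALL_INSN_SIZE		5
#define BYTES_NOP1		0x90	/* 1-byte NOP pad */

/*
 * Rewrite the 6-byte indirect call at 'insn' (virtual address
 * 'insn_vaddr') into a direct call to 'target' plus a padding NOP.
 */
static void replace_call(uint8_t *insn, uint64_t insn_vaddr, uint64_t target)
{
	/* rel32 is relative to the end of the 5-byte direct CALL. */
	int32_t rel = (int32_t)(target - (insn_vaddr + CALL_INSN_SIZE));

	insn[0] = CALL_INSN_OPCODE;
	memcpy(&insn[1], &rel, sizeof(rel));	/* little-endian on x86 */
	insn[5] = BYTES_NOP1;			/* keep total length at 6 */
}

int main(void)
{
	/* ff 15 disp32: CALL *slot(%rip); the displacement here is faked */
	uint8_t code[6] = { 0xff, 0x15, 0x78, 0x56, 0x34, 0x12 };

	/* Pretend the call site sits at 0x1000 and the direct target at 0x2000. */
	replace_call(code, 0x1000, 0x2000);

	for (int i = 0; i < 6; i++)
		printf("%02x ", code[i]);
	printf("\n");
	return 0;
}

Compiled on x86-64 this prints "e8 fb 0f 00 00 90": the direct call followed by the padding NOP, with no indirect branch left behind.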
On 4/22/2025 2:28 AM, Juergen Gross wrote:
>
> I have worked rather hard to make it less intrusive, especially by
> removing the paravirt-specific code patching (now all done via
> alternative patching) and by removing 32-bit Xen PV mode.

I looked at the optimization, and it is a nice improvement.
To eliminate the indirect call overhead introduced by the pv_ops API,
use the alternatives mechanism to read PMC:

1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
   disabled feature, preventing the Xen PMC read code from being
   built and ensuring the native code is executed unconditionally.

2) When built with CONFIG_XEN_PV:

   2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
        the kernel runtime binary is patched to unconditionally
        jump to the native PMC read code.

   2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
        kernel runtime binary is patched to unconditionally jump
        to the Xen PMC read code.

Consequently, remove the pv_ops PMC read API.

Signed-off-by: Xin Li (Intel) <xin@zytor.com>
---
 arch/x86/include/asm/msr.h            | 31 ++++++++++++++++++++-------
 arch/x86/include/asm/paravirt.h       |  5 -----
 arch/x86/include/asm/paravirt_types.h |  2 --
 arch/x86/kernel/paravirt.c            |  1 -
 arch/x86/xen/enlighten_pv.c           |  2 --
 drivers/net/vmxnet3/vmxnet3_drv.c     |  2 +-
 6 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 01dc8e61ef97..33cf506e2fd6 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -8,6 +8,7 @@
 
 #include <asm/asm.h>
 #include <asm/errno.h>
+#include <asm/cpufeature.h>
 #include <asm/cpumask.h>
 #include <uapi/asm/msr.h>
 #include <asm/shared/msr.h>
@@ -73,6 +74,10 @@ static inline void do_trace_read_msr(u32 msr, u64 val, int failed) {}
 static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {}
 #endif
 
+#ifdef CONFIG_XEN_PV
+extern u64 xen_read_pmc(int counter);
+#endif
+
 /*
  * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
  * accessors and should not have any tracing or other functionality piggybacking
@@ -170,16 +175,32 @@ native_write_msr_safe(u32 msr, u32 low, u32 high)
 extern int rdmsr_safe_regs(u32 regs[8]);
 extern int wrmsr_safe_regs(u32 regs[8]);
 
-static inline u64 native_read_pmc(int counter)
+static __always_inline u64 native_rdpmcq(int counter)
 {
 	DECLARE_ARGS(val, low, high);
 
-	asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+	asm_inline volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+
 	if (tracepoint_enabled(rdpmc))
 		do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
+
 	return EAX_EDX_VAL(val, low, high);
 }
 
+static __always_inline u64 rdpmcq(int counter)
+{
+#ifdef CONFIG_XEN_PV
+	if (cpu_feature_enabled(X86_FEATURE_XENPV))
+		return xen_read_pmc(counter);
+#endif
+
+	/*
+	 * 1) When built with !CONFIG_XEN_PV.
+	 * 2) When built with CONFIG_XEN_PV but not running on Xen hypervisor.
+	 */
+	return native_rdpmcq(counter);
+}
+
 #ifdef CONFIG_PARAVIRT_XXL
 #include <asm/paravirt.h>
 #else
@@ -233,12 +254,6 @@ static inline int rdmsrq_safe(u32 msr, u64 *p)
 	*p = native_read_msr_safe(msr, &err);
 	return err;
 }
-
-static __always_inline u64 rdpmcq(int counter)
-{
-	return native_read_pmc(counter);
-}
-
 #endif /* !CONFIG_PARAVIRT_XXL */
 
 /* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 590824916394..c7689f5f70d6 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -239,11 +239,6 @@ static inline int rdmsrq_safe(unsigned msr, u64 *p)
 	return err;
 }
 
-static __always_inline u64 rdpmcq(int counter)
-{
-	return PVOP_CALL1(u64, cpu.read_pmc, counter);
-}
-
 static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
 	PVOP_VCALL2(cpu.alloc_ldt, ldt, entries);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 631c306ce1ff..475f508531d6 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -101,8 +101,6 @@ struct pv_cpu_ops {
 	u64 (*read_msr_safe)(unsigned int msr, int *err);
 	int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high);
 
-	u64 (*read_pmc)(int counter);
-
 	void (*start_context_switch)(struct task_struct *prev);
 	void (*end_context_switch)(struct task_struct *next);
 #endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1ccd05d8999f..28d195ad7514 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -132,7 +132,6 @@ struct paravirt_patch_template pv_ops = {
 	.cpu.write_msr = native_write_msr,
 	.cpu.read_msr_safe = native_read_msr_safe,
 	.cpu.write_msr_safe = native_write_msr_safe,
-	.cpu.read_pmc = native_read_pmc,
 	.cpu.load_tr_desc = native_load_tr_desc,
 	.cpu.set_ldt = native_set_ldt,
 	.cpu.load_gdt = native_load_gdt,
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 846b5737d320..9fbe187aff00 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1236,8 +1236,6 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = {
 	.read_msr_safe = xen_read_msr_safe,
 	.write_msr_safe = xen_write_msr_safe,
 
-	.read_pmc = xen_read_pmc,
-
 	.load_tr_desc = paravirt_nop,
 	.set_ldt = xen_set_ldt,
 	.load_gdt = xen_load_gdt,
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 7edd0b5e0e77..8af3b4d7ef4d 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -151,7 +151,7 @@ static u64
 vmxnet3_get_cycles(int pmc)
 {
 #ifdef CONFIG_X86
-	return native_read_pmc(pmc);
+	return native_rdpmcq(pmc);
 #else
 	return 0;
 #endif
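As a footnote to case 1) of the commit message, here is a toy, self-contained model of why the Xen branch in rdpmcq() vanishes entirely from a !CONFIG_XEN_PV build. All names and the bit number below are made up; the real machinery is cpu_feature_enabled() together with the kernel's build-time disabled-features mask. When the feature bit is in that mask, the test is the compile-time constant 0 and the compiler eliminates the branch as dead code.

#include <stdio.h>

#define X86_FEATURE_XENPV	8	/* made-up bit number for the model */

#ifdef CONFIG_XEN_PV
# define DISABLED_MASK	0ULL				/* decided at runtime */
#else
# define DISABLED_MASK	(1ULL << X86_FEATURE_XENPV)	/* compile-time off */
#endif

static int boot_cpu_has(int bit)	/* stand-in for the runtime test */
{
	(void)bit;
	return 0;			/* pretend we run on bare metal */
}

/* Folds to constant 0 when the bit is in the build-time disabled mask. */
#define feature_enabled(bit) \
	((DISABLED_MASK & (1ULL << (bit))) ? 0 : boot_cpu_has(bit))

static unsigned long long xen_read_pmc(int counter)
{
	(void)counter;
	return 0;			/* stand-in for the Xen PMC path */
}

static unsigned long long native_rdpmcq(int counter)
{
	(void)counter;
	return 42;			/* stand-in for the RDPMC instruction */
}

static unsigned long long rdpmcq(int counter)
{
	if (feature_enabled(X86_FEATURE_XENPV))	/* dead code if !CONFIG_XEN_PV */
		return xen_read_pmc(counter);
	return native_rdpmcq(counter);
}

int main(void)
{
	printf("%llu\n", rdpmcq(0));
	return 0;
}

Built without CONFIG_XEN_PV defined, the generated code for rdpmcq() contains only the native path, which is the effect the commit message describes for case 1).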