
[v3 | v5.12 1/3] x86/kvm: Teardown PV features on boot CPU as well

Message ID 20210601071644.6055-1-krzysztof.kozlowski@canonical.com
State New
Series [v3 | v5.12 1/3] x86/kvm: Teardown PV features on boot CPU as well

Commit Message

Krzysztof Kozlowski June 1, 2021, 7:16 a.m. UTC
From: Vitaly Kuznetsov <vkuznets@redhat.com>

commit 8b79feffeca28c5459458fe78676b081e87c93a4 upstream.

Various PV features (Async PF, PV EOI, steal time) work through memory
shared with the hypervisor, and when we restore from hibernation we must
properly tear down all these features to make sure the hypervisor doesn't
write to stale locations after we jump to the previously hibernated kernel
(which can try to place anything there). For secondary CPUs the job is
already done by kvm_cpu_down_prepare(); register syscore ops to do
the same for the boot CPU.

Krzysztof:
This fixes memory corruption visible after the second resume from
hibernation:

  BUG: Bad page state in process dbus-daemon  pfn:18b01
  page:ffffea000062c040 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 compound_mapcount: -30591
  flags: 0xfffffc0078141(locked|error|workingset|writeback|head|mappedtodisk|reclaim)
  raw: 000fffffc0078141 dead0000000002d0 dead000000000100 0000000000000000
  raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000
  page dumped because: PAGE_FLAGS_CHECK_AT_PREP flag set
  bad because of flags: 0x78141(locked|error|workingset|writeback|head|mappedtodisk|reclaim)

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20210414123544.1060604-3-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
[krzysztof: Extend the commit message, adjust for v5.10 context]
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
---
 arch/x86/kernel/kvm.c | 57 +++++++++++++++++++++++++++++++------------
 1 file changed, 41 insertions(+), 16 deletions(-)
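
For readers less familiar with the mechanism the commit message relies on, here is a
minimal sketch of the syscore pattern (placeholder names, not the actual patch; the
real change is in the diff at the bottom of this page). The .suspend/.resume callbacks
run once on the boot CPU, with interrupts disabled, after the non-boot CPUs have
already gone through their hotplug teardown, which is what makes them a fit for
boot-CPU-only PV teardown and re-init:

#include <linux/init.h>
#include <linux/syscore_ops.h>

/* Placeholder teardown/re-init, standing in for the real PV feature code. */
static void example_pv_teardown(void)
{
	/* stop sharing memory (steal time, PV EOI, async PF) with the host */
}

static void example_pv_setup(void)
{
	/* re-register the shared memory regions with the hypervisor */
}

static int example_pv_suspend(void)
{
	example_pv_teardown();
	return 0;
}

static void example_pv_resume(void)
{
	example_pv_setup();
}

static struct syscore_ops example_pv_syscore_ops = {
	.suspend	= example_pv_suspend,
	.resume		= example_pv_resume,
};

static int __init example_pv_init(void)
{
	register_syscore_ops(&example_pv_syscore_ops);
	return 0;
}
core_initcall(example_pv_init);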

Comments

Paolo Bonzini June 1, 2021, 8:19 a.m. UTC | #1
On 01/06/21 09:16, Krzysztof Kozlowski wrote:
> From: Vitaly Kuznetsov <vkuznets@redhat.com>
> 
> commit c02027b5742b5aa804ef08a4a9db433295533046 upstream.
> 
> Currently, we disable kvmclock from the machine_shutdown() hook and this
> only happens for the boot CPU. We need to disable it for all CPUs to
> guard against memory corruption, e.g. on restore from hibernate.
> 
> Note, writing '0' to the kvmclock MSR doesn't clear the memory location, it
> just prevents the hypervisor from updating the location, so for the short
> while after the write and while the CPU is still alive, the clock remains
> usable and correct, so we don't need to switch to some other clocksource.
> 
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> Message-Id: <20210414123544.1060604-4-vkuznets@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
> Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
> ---
>   arch/x86/include/asm/kvm_para.h | 4 ++--
>   arch/x86/kernel/kvm.c           | 1 +
>   arch/x86/kernel/kvmclock.c      | 5 +----
>   3 files changed, 4 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> index 338119852512..9c56e0defd45 100644
> --- a/arch/x86/include/asm/kvm_para.h
> +++ b/arch/x86/include/asm/kvm_para.h
> @@ -7,8 +7,6 @@
>   #include <linux/interrupt.h>
>   #include <uapi/asm/kvm_para.h>
>   
> -extern void kvmclock_init(void);
> -
>   #ifdef CONFIG_KVM_GUEST
>   bool kvm_check_and_clear_guest_paused(void);
>   #else
> @@ -86,6 +84,8 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
>   }
>   
>   #ifdef CONFIG_KVM_GUEST
> +void kvmclock_init(void);
> +void kvmclock_disable(void);
>   bool kvm_para_available(void);
>   unsigned int kvm_arch_para_features(void);
>   unsigned int kvm_arch_para_hints(void);
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 5ae57be19187..2ebb93c34bc5 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -468,6 +468,7 @@ static void kvm_guest_cpu_offline(void)
>   		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
>   	kvm_pv_disable_apf();
>   	apf_task_wake_all();
> +	kvmclock_disable();
>   }
>   
>   static int kvm_cpu_online(unsigned int cpu)
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 1fc0962c89c0..cf869de98eec 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -220,11 +220,9 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
>   }
>   #endif
>   
> -static void kvm_shutdown(void)
> +void kvmclock_disable(void)
>   {
>   	native_write_msr(msr_kvm_system_time, 0, 0);
> -	kvm_disable_steal_time();
> -	native_machine_shutdown();
>   }
>   
>   static void __init kvmclock_init_mem(void)
> @@ -351,7 +349,6 @@ void __init kvmclock_init(void)
>   #endif
>   	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
>   	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
> -	machine_ops.shutdown  = kvm_shutdown;
>   #ifdef CONFIG_KEXEC_CORE
>   	machine_ops.crash_shutdown  = kvm_crash_shutdown;
>   #endif
> 

Acked-by: Paolo Bonzini <pbonzini@redhat.com>
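
As background for the quoted kvmclock patch, a rough sketch of the registration
protocol it relies on (the helper names below are made up; the real code uses
native_write_msr() on the detected MSR index, as visible in the quoted diff): the
guest writes a guest-physical address with the enable bit set to tell the hypervisor
where to publish clock data, and writing 0 only stops further updates without clearing
what was already published, which is why the clock stays usable for a short while
after the write.

#include <linux/types.h>
#include <asm/kvm_para.h>	/* MSR_KVM_SYSTEM_TIME_NEW */
#include <asm/msr.h>

/* Illustrative helpers, not the real kvmclock code. */
static void kvmclock_example_enable(u64 time_info_gpa)
{
	/* bit 0 set: hypervisor starts publishing pvclock data at this GPA */
	wrmsrl(MSR_KVM_SYSTEM_TIME_NEW, time_info_gpa | 1);
}

static void kvmclock_example_disable(void)
{
	/* 0: hypervisor stops writing; the last published values remain */
	wrmsrl(MSR_KVM_SYSTEM_TIME_NEW, 0);
}
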
Greg KH June 8, 2021, 3:43 p.m. UTC | #2
On Tue, Jun 01, 2021 at 09:16:42AM +0200, Krzysztof Kozlowski wrote:
> From: Vitaly Kuznetsov <vkuznets@redhat.com>
> 
> commit 8b79feffeca28c5459458fe78676b081e87c93a4 upstream.
> 
> Various PV features (Async PF, PV EOI, steal time) work through memory
> shared with the hypervisor, and when we restore from hibernation we must
> properly tear down all these features to make sure the hypervisor doesn't
> write to stale locations after we jump to the previously hibernated kernel
> (which can try to place anything there). For secondary CPUs the job is
> already done by kvm_cpu_down_prepare(); register syscore ops to do
> the same for the boot CPU.
> 
> Krzysztof:
> This fixes memory corruption visible after the second resume from
> hibernation:
> 
>   BUG: Bad page state in process dbus-daemon  pfn:18b01
>   page:ffffea000062c040 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 compound_mapcount: -30591
>   flags: 0xfffffc0078141(locked|error|workingset|writeback|head|mappedtodisk|reclaim)
>   raw: 000fffffc0078141 dead0000000002d0 dead000000000100 0000000000000000
>   raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000
>   page dumped because: PAGE_FLAGS_CHECK_AT_PREP flag set
>   bad because of flags: 0x78141(locked|error|workingset|writeback|head|mappedtodisk|reclaim)
> 
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> Message-Id: <20210414123544.1060604-3-vkuznets@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
> [krzysztof: Extend the commit message, adjust for v5.10 context]
> Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
> ---
>  arch/x86/kernel/kvm.c | 57 +++++++++++++++++++++++++++++++------------
>  1 file changed, 41 insertions(+), 16 deletions(-)

All now queued up, thanks.

greg k-h
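
For completeness on the secondary-CPU side that the commit message says is already
covered by kvm_cpu_down_prepare(): per-CPU setup/teardown of this kind is normally
wired up through the CPU hotplug state machine, roughly as sketched below
(illustrative names; the actual registration call in kvm.c is not part of the hunks
shown in the patch).

#include <linux/cpuhotplug.h>
#include <linux/init.h>

static int example_cpu_online(unsigned int cpu)
{
	/* re-establish per-CPU PV features (steal time, PV EOI, async PF) */
	return 0;
}

static int example_cpu_down_prepare(unsigned int cpu)
{
	/* tear down per-CPU PV features before the CPU is unplugged */
	return 0;
}

static int __init example_register_hotplug_callbacks(void)
{
	int ret;

	/* CPUHP_AP_ONLINE_DYN: a dynamically allocated online state */
	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"x86/kvm-example:online",
					example_cpu_online,
					example_cpu_down_prepare);
	return ret < 0 ? ret : 0;
}
core_initcall(example_register_hotplug_callbacks);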

Patch

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 78bb0fae3982..5ae57be19187 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -26,6 +26,7 @@ 
 #include <linux/kprobes.h>
 #include <linux/nmi.h>
 #include <linux/swait.h>
+#include <linux/syscore_ops.h>
 #include <asm/timer.h>
 #include <asm/cpu.h>
 #include <asm/traps.h>
@@ -460,6 +461,25 @@  static bool pv_tlb_flush_supported(void)
 
 static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
 
+static void kvm_guest_cpu_offline(void)
+{
+	kvm_disable_steal_time();
+	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+	kvm_pv_disable_apf();
+	apf_task_wake_all();
+}
+
+static int kvm_cpu_online(unsigned int cpu)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	kvm_guest_cpu_init();
+	local_irq_restore(flags);
+	return 0;
+}
+
 #ifdef CONFIG_SMP
 
 static bool pv_ipi_supported(void)
@@ -587,31 +607,34 @@  static void __init kvm_smp_prepare_boot_cpu(void)
 	kvm_spinlock_init();
 }
 
-static void kvm_guest_cpu_offline(void)
+static int kvm_cpu_down_prepare(unsigned int cpu)
 {
-	kvm_disable_steal_time();
-	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
-		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
-	kvm_pv_disable_apf();
-	apf_task_wake_all();
-}
+	unsigned long flags;
 
-static int kvm_cpu_online(unsigned int cpu)
-{
-	local_irq_disable();
-	kvm_guest_cpu_init();
-	local_irq_enable();
+	local_irq_save(flags);
+	kvm_guest_cpu_offline();
+	local_irq_restore(flags);
 	return 0;
 }
 
-static int kvm_cpu_down_prepare(unsigned int cpu)
+#endif
+
+static int kvm_suspend(void)
 {
-	local_irq_disable();
 	kvm_guest_cpu_offline();
-	local_irq_enable();
+
 	return 0;
 }
-#endif
+
+static void kvm_resume(void)
+{
+	kvm_cpu_online(raw_smp_processor_id());
+}
+
+static struct syscore_ops kvm_syscore_ops = {
+	.suspend	= kvm_suspend,
+	.resume		= kvm_resume,
+};
 
 static void kvm_flush_tlb_others(const struct cpumask *cpumask,
 			const struct flush_tlb_info *info)
@@ -681,6 +704,8 @@  static void __init kvm_guest_init(void)
 	kvm_guest_cpu_init();
 #endif
 
+	register_syscore_ops(&kvm_syscore_ops);
+
 	/*
 	 * Hard lockup detection is enabled by default. Disable it, as guests
 	 * can get false positives too easily, for example if the host is