diff mbox series

[v7,12/14] KVM: Handle page fault for private memory

Message ID 20220706082016.2603916-13-chao.p.peng@linux.intel.com
State New
Headers show
Series KVM: mm: fd-based approach for supporting KVM guest private memory | expand

Commit Message

Chao Peng July 6, 2022, 8:20 a.m. UTC
A page fault can carry the private/shared information for
KVM_MEM_PRIVATE memslot, this can be filled by architecture code(like
TDX code). To handle page fault for such access, KVM maps the page only
when this private property matches the host's view on the page.

For a successful match, private pfn is obtained with memfile_notifier
callbacks from private fd and shared pfn is obtained with existing
get_user_pages.

For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to
userspace. Userspace then can convert memory between private/shared from
host's view then retry the access.

Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 arch/x86/kvm/mmu/mmu.c          | 60 ++++++++++++++++++++++++++++++++-
 arch/x86/kvm/mmu/mmu_internal.h | 18 ++++++++++
 arch/x86/kvm/mmu/mmutrace.h     |  1 +
 include/linux/kvm_host.h        | 35 ++++++++++++++++++-
 4 files changed, 112 insertions(+), 2 deletions(-)

Comments

Sean Christopherson July 29, 2022, 8:58 p.m. UTC | #1
On Wed, Jul 06, 2022, Chao Peng wrote:
> A page fault can carry the private/shared information for
> KVM_MEM_PRIVATE memslot, this can be filled by architecture code(like
> TDX code). To handle page fault for such access, KVM maps the page only
> when this private property matches the host's view on the page.
> 
> For a successful match, private pfn is obtained with memfile_notifier
> callbacks from private fd and shared pfn is obtained with existing
> get_user_pages.
> 
> For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to
> userspace. Userspace then can convert memory between private/shared from
> host's view then retry the access.
> 
> Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
>  arch/x86/kvm/mmu/mmu.c          | 60 ++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/mmu/mmu_internal.h | 18 ++++++++++
>  arch/x86/kvm/mmu/mmutrace.h     |  1 +
>  include/linux/kvm_host.h        | 35 ++++++++++++++++++-
>  4 files changed, 112 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 545eb74305fe..27dbdd4fe8d1 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3004,6 +3004,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
>  	if (max_level == PG_LEVEL_4K)
>  		return PG_LEVEL_4K;
>  
> +	if (kvm_mem_is_private(kvm, gfn))
> +		return max_level;
> +
>  	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
>  	return min(host_level, max_level);
>  }
> @@ -4101,10 +4104,52 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
>  	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
>  }
>  
> +static inline u8 order_to_level(int order)
> +{
> +	enum pg_level level;
> +
> +	for (level = KVM_MAX_HUGEPAGE_LEVEL; level > PG_LEVEL_4K; level--)

Curly braces needed for the for-loop.

And I think it makes sense to take in the fault->max_level, that way this is
slightly more performant when the guest mapping is smaller than the host, e.g.

	for (level = max_level; level > PG_LEVEL_4K; level--)
		...

	return level;

Though I think I'd vote to avoid a loop entirely and do:

	BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);

	if (order > ???)
		return PG_LEVEL_1G;
	
	if (order > ???)
		return PG_LEVEL_2M;

	return PG_LEVEL_4K;


> +		if (order >= page_level_shift(level) - PAGE_SHIFT)
> +			return level;
> +	return level;
> +}
> +
> +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
> +				   struct kvm_page_fault *fault)
> +{
> +	int order;
> +	struct kvm_memory_slot *slot = fault->slot;
> +	bool private_exist = kvm_mem_is_private(vcpu->kvm, fault->gfn);
> +
> +	if (fault->is_private != private_exist) {
> +		vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> +		if (fault->is_private)
> +			vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE;
> +		else
> +			vcpu->run->memory.flags = 0;
> +		vcpu->run->memory.padding = 0;
> +		vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT;
> +		vcpu->run->memory.size = PAGE_SIZE;
> +		return RET_PF_USER;
> +	}
> +
> +	if (fault->is_private) {
> +		if (kvm_private_mem_get_pfn(slot, fault->gfn, &fault->pfn, &order))
> +			return RET_PF_RETRY;
> +		fault->max_level = min(order_to_level(order), fault->max_level);
> +		fault->map_writable = !(slot->flags & KVM_MEM_READONLY);
> +		return RET_PF_FIXED;
> +	}
> +
> +	/* Fault is shared, fallthrough. */
> +	return RET_PF_CONTINUE;
> +}
> +
>  static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>  {
>  	struct kvm_memory_slot *slot = fault->slot;
>  	bool async;
> +	int r;
>  
>  	/*
>  	 * Retry the page fault if the gfn hit a memslot that is being deleted
> @@ -4133,6 +4178,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>  			return RET_PF_EMULATE;
>  	}
>  
> +	if (kvm_slot_can_be_private(slot)) {
> +		r = kvm_faultin_pfn_private(vcpu, fault);
> +		if (r != RET_PF_CONTINUE)
> +			return r == RET_PF_FIXED ? RET_PF_CONTINUE : r;

I apologize if I've given you conflicting feedback in the past.  Now that this
returns RET_PF_* directly, I definitely think it makes sense to do:

	if (kvm_slot_can_be_private(slot) &&
	    fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
		vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
		if (fault->is_private)
			vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE;
		else
			vcpu->run->memory.flags = 0;
		vcpu->run->memory.padding = 0;
		vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT;
		vcpu->run->memory.size = PAGE_SIZE;
		return RET_PF_USER;
	}

	if (fault->is_private)
		return kvm_faultin_pfn_private(vcpu, fault);

That way kvm_faultin_pfn_private() only handles private faults, and this doesn't
need to play games with RET_PF_FIXED.


> +	}
> +
>  	async = false;
>  	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
>  					  fault->write, &fault->map_writable,
> @@ -4241,7 +4292,11 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  		read_unlock(&vcpu->kvm->mmu_lock);
>  	else
>  		write_unlock(&vcpu->kvm->mmu_lock);
> -	kvm_release_pfn_clean(fault->pfn);
> +
> +	if (fault->is_private)
> +		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
> +	else
> +		kvm_release_pfn_clean(fault->pfn);

AFAIK, we never bottomed out on whether or not this is needed[*].  Can you follow
up with Kirill to get an answer before posting v8?

[*] https://lore.kernel.org/all/20220620141647.GC2016793@chaop.bj.intel.com
diff mbox series

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 545eb74305fe..27dbdd4fe8d1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3004,6 +3004,9 @@  int kvm_mmu_max_mapping_level(struct kvm *kvm,
 	if (max_level == PG_LEVEL_4K)
 		return PG_LEVEL_4K;
 
+	if (kvm_mem_is_private(kvm, gfn))
+		return max_level;
+
 	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
 	return min(host_level, max_level);
 }
@@ -4101,10 +4104,52 @@  void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
 }
 
+static inline u8 order_to_level(int order)
+{
+	enum pg_level level;
+
+	for (level = KVM_MAX_HUGEPAGE_LEVEL; level > PG_LEVEL_4K; level--)
+		if (order >= page_level_shift(level) - PAGE_SHIFT)
+			return level;
+	return level;
+}
+
+static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
+				   struct kvm_page_fault *fault)
+{
+	int order;
+	struct kvm_memory_slot *slot = fault->slot;
+	bool private_exist = kvm_mem_is_private(vcpu->kvm, fault->gfn);
+
+	if (fault->is_private != private_exist) {
+		vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
+		if (fault->is_private)
+			vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE;
+		else
+			vcpu->run->memory.flags = 0;
+		vcpu->run->memory.padding = 0;
+		vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT;
+		vcpu->run->memory.size = PAGE_SIZE;
+		return RET_PF_USER;
+	}
+
+	if (fault->is_private) {
+		if (kvm_private_mem_get_pfn(slot, fault->gfn, &fault->pfn, &order))
+			return RET_PF_RETRY;
+		fault->max_level = min(order_to_level(order), fault->max_level);
+		fault->map_writable = !(slot->flags & KVM_MEM_READONLY);
+		return RET_PF_FIXED;
+	}
+
+	/* Fault is shared, fallthrough. */
+	return RET_PF_CONTINUE;
+}
+
 static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	struct kvm_memory_slot *slot = fault->slot;
 	bool async;
+	int r;
 
 	/*
 	 * Retry the page fault if the gfn hit a memslot that is being deleted
@@ -4133,6 +4178,12 @@  static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 			return RET_PF_EMULATE;
 	}
 
+	if (kvm_slot_can_be_private(slot)) {
+		r = kvm_faultin_pfn_private(vcpu, fault);
+		if (r != RET_PF_CONTINUE)
+			return r == RET_PF_FIXED ? RET_PF_CONTINUE : r;
+	}
+
 	async = false;
 	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
 					  fault->write, &fault->map_writable,
@@ -4241,7 +4292,11 @@  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		read_unlock(&vcpu->kvm->mmu_lock);
 	else
 		write_unlock(&vcpu->kvm->mmu_lock);
-	kvm_release_pfn_clean(fault->pfn);
+
+	if (fault->is_private)
+		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
+	else
+		kvm_release_pfn_clean(fault->pfn);
 	return r;
 }
 
@@ -5518,6 +5573,9 @@  int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
 			return -EIO;
 	}
 
+	if (r == RET_PF_USER)
+		return 0;
+
 	if (r < 0)
 		return r;
 	if (r != RET_PF_EMULATE)
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index ae2d660e2dab..fb9c298abcf0 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -188,6 +188,7 @@  struct kvm_page_fault {
 
 	/* Derived from mmu and global state.  */
 	const bool is_tdp;
+	const bool is_private;
 	const bool nx_huge_page_workaround_enabled;
 
 	/*
@@ -236,6 +237,7 @@  int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
  * RET_PF_RETRY: let CPU fault again on the address.
  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ * RET_PF_USER: need to exit to userspace to handle this fault.
  * RET_PF_FIXED: The faulting entry has been fixed.
  * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
  *
@@ -252,6 +254,7 @@  enum {
 	RET_PF_RETRY,
 	RET_PF_EMULATE,
 	RET_PF_INVALID,
+	RET_PF_USER,
 	RET_PF_FIXED,
 	RET_PF_SPURIOUS,
 };
@@ -318,4 +321,19 @@  void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
+#ifndef CONFIG_HAVE_KVM_PRIVATE_MEM
+static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot,
+					  gfn_t gfn, kvm_pfn_t *pfn, int *order)
+{
+	WARN_ON_ONCE(1);
+	return -EOPNOTSUPP;
+}
+
+static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot,
+					   kvm_pfn_t pfn)
+{
+	WARN_ON_ONCE(1);
+}
+#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
+
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index ae86820cef69..2d7555381955 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -58,6 +58,7 @@  TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
 TRACE_DEFINE_ENUM(RET_PF_RETRY);
 TRACE_DEFINE_ENUM(RET_PF_EMULATE);
 TRACE_DEFINE_ENUM(RET_PF_INVALID);
+TRACE_DEFINE_ENUM(RET_PF_USER);
 TRACE_DEFINE_ENUM(RET_PF_FIXED);
 TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index da33f8828456..8f56426aa1e3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -778,6 +778,10 @@  struct kvm {
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	struct mmu_notifier mmu_notifier;
+#endif
+
+#if (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) || \
+	defined(CONFIG_MEMFILE_NOTIFIER)
 	unsigned long mmu_updating_seq;
 	long mmu_updating_count;
 	gfn_t mmu_updating_range_start;
@@ -1917,7 +1921,8 @@  extern const struct _kvm_stats_desc kvm_vm_stats_desc[];
 extern const struct kvm_stats_header kvm_vcpu_stats_header;
 extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[];
 
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#if (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) || \
+	defined(CONFIG_MEMFILE_NOTIFIER)
 static inline int mmu_updating_retry(struct kvm *kvm, unsigned long mmu_seq)
 {
 	if (unlikely(kvm->mmu_updating_count))
@@ -2266,4 +2271,32 @@  static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 /* Max number of entries allowed for each kvm dirty ring */
 #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
 
+#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
+static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot,
+					  gfn_t gfn, kvm_pfn_t *pfn, int *order)
+{
+	int ret;
+	pfn_t pfnt;
+	pgoff_t index = gfn - slot->base_gfn +
+			(slot->private_offset >> PAGE_SHIFT);
+
+	ret = slot->notifier.bs->get_pfn(slot->private_file, index, &pfnt,
+					 order);
+	*pfn = pfn_t_to_pfn(pfnt);
+	return ret;
+}
+
+static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot,
+					   kvm_pfn_t pfn)
+{
+	slot->notifier.bs->put_pfn(pfn_to_pfn_t(pfn));
+}
+
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+	return !!xa_load(&kvm->mem_attr_array, gfn);
+}
+
+#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
+
 #endif