Message ID | 20220706082016.2603916-14-chao.p.peng@linux.intel.com |
---|---|
State | New |
Headers | show |
Series | KVM: mm: fd-based approach for supporting KVM guest private memory | expand |
> Register private memslots with the fd-based memory backing store and handle the > memfile notifiers to zap the existing mappings. > > Currently the registration happens at memslot creation time and the > initial support does not include page migration/swap. > > KVM_MEM_PRIVATE is not exposed by default; architecture code can turn > it on by implementing kvm_arch_private_mem_supported(). > > A 'kvm' reference is added to the memslot structure since in > memfile_notifier callbacks we can only obtain a memslot reference, while > kvm is needed to do the zapping. > > Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com> > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com> > --- > include/linux/kvm_host.h | 1 + > virt/kvm/kvm_main.c | 117 ++++++++++++++++++++++++++++++++++++--- > 2 files changed, 109 insertions(+), 9 deletions(-) > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 8f56426aa1e3..4e5a0db68799 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -584,6 +584,7 @@ struct kvm_memory_slot { > struct file *private_file; > loff_t private_offset; > struct memfile_notifier notifier; > + struct kvm *kvm; > }; > > static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index bb714c2a4b06..d6f7e074cab2 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -941,6 +941,63 @@ static int kvm_vm_ioctl_set_encrypted_region(struct kvm *kvm, unsigned int ioctl > > return r; > } > + > +static void kvm_memfile_notifier_invalidate(struct memfile_notifier *notifier, > + pgoff_t start, pgoff_t end) > +{ > + struct kvm_memory_slot *slot = container_of(notifier, > + struct kvm_memory_slot, > + notifier); > + unsigned long base_pgoff = slot->private_offset >> PAGE_SHIFT; > + gfn_t start_gfn = slot->base_gfn; > + gfn_t end_gfn = slot->base_gfn + slot->npages; > + > + > + if (start > base_pgoff) 
> + start_gfn = slot->base_gfn + start - base_pgoff; > + > + if (end < base_pgoff + slot->npages) > + end_gfn = slot->base_gfn + end - base_pgoff; > + > + if (start_gfn >= end_gfn) > + return; > + > + kvm_zap_gfn_range(slot->kvm, start_gfn, end_gfn); > +} > + > +static struct memfile_notifier_ops kvm_memfile_notifier_ops = { > + .invalidate = kvm_memfile_notifier_invalidate, > +}; > + > +#define KVM_MEMFILE_FLAGS (MEMFILE_F_USER_INACCESSIBLE | \ > + MEMFILE_F_UNMOVABLE | \ > + MEMFILE_F_UNRECLAIMABLE) > + > +static inline int kvm_private_mem_register(struct kvm_memory_slot *slot) > +{ > + slot->notifier.ops = &kvm_memfile_notifier_ops; > + return memfile_register_notifier(slot->private_file, KVM_MEMFILE_FLAGS, > + &slot->notifier); > +} > + > +static inline void kvm_private_mem_unregister(struct kvm_memory_slot *slot) > +{ > + memfile_unregister_notifier(&slot->notifier); > +} > + > +#else /* !CONFIG_HAVE_KVM_PRIVATE_MEM */ > + > +static inline int kvm_private_mem_register(struct kvm_memory_slot *slot) > +{ > + WARN_ON_ONCE(1); > + return -EOPNOTSUPP; > +} > + > +static inline void kvm_private_mem_unregister(struct kvm_memory_slot *slot) > +{ > + WARN_ON_ONCE(1); > +} > + > #endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */ > > #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER > @@ -987,6 +1044,11 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) > /* This does not remove the slot from struct kvm_memslots data structures */ > static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) > { > + if (slot->flags & KVM_MEM_PRIVATE) { > + kvm_private_mem_unregister(slot); > + fput(slot->private_file); > + } > + > kvm_destroy_dirty_bitmap(slot); > > kvm_arch_free_memslot(kvm, slot); > @@ -1548,10 +1610,16 @@ bool __weak kvm_arch_private_mem_supported(struct kvm *kvm) > return false; > } > > -static int check_memory_region_flags(const struct kvm_user_mem_region *mem) > +static int check_memory_region_flags(struct kvm *kvm, > + const struct kvm_user_mem_region 
*mem) > { > u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; > > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM > + if (kvm_arch_private_mem_supported(kvm)) > + valid_flags |= KVM_MEM_PRIVATE; > +#endif > + > #ifdef __KVM_HAVE_READONLY_MEM > valid_flags |= KVM_MEM_READONLY; > #endif > @@ -1627,6 +1695,12 @@ static int kvm_prepare_memory_region(struct kvm *kvm, > { > int r; > > + if (change == KVM_MR_CREATE && new->flags & KVM_MEM_PRIVATE) { > + r = kvm_private_mem_register(new); > + if (r) > + return r; > + } > + > /* > * If dirty logging is disabled, nullify the bitmap; the old bitmap > * will be freed on "commit". If logging is enabled in both old and > @@ -1655,6 +1729,9 @@ static int kvm_prepare_memory_region(struct kvm *kvm, > if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap)) > kvm_destroy_dirty_bitmap(new); > > + if (r && change == KVM_MR_CREATE && new->flags & KVM_MEM_PRIVATE) > + kvm_private_mem_unregister(new); > + > return r; > } > > @@ -1952,7 +2029,7 @@ int __kvm_set_memory_region(struct kvm *kvm, > int as_id, id; > int r; > > - r = check_memory_region_flags(mem); > + r = check_memory_region_flags(kvm, mem); > if (r) > return r; > > @@ -1971,6 +2048,10 @@ int __kvm_set_memory_region(struct kvm *kvm, > !access_ok((void __user *)(unsigned long)mem->userspace_addr, > mem->memory_size)) > return -EINVAL; > + if (mem->flags & KVM_MEM_PRIVATE && > + (mem->private_offset & (PAGE_SIZE - 1) || > + mem->private_offset > U64_MAX - mem->memory_size)) > + return -EINVAL; > if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) > return -EINVAL; > if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) > @@ -2009,6 +2090,9 @@ int __kvm_set_memory_region(struct kvm *kvm, > if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages) > return -EINVAL; > } else { /* Modify an existing slot. */ > + /* Private memslots are immutable, they can only be deleted. 
*/ > + if (mem->flags & KVM_MEM_PRIVATE) > + return -EINVAL; > if ((mem->userspace_addr != old->userspace_addr) || > (npages != old->npages) || > ((mem->flags ^ old->flags) & KVM_MEM_READONLY)) > @@ -2037,10 +2121,27 @@ int __kvm_set_memory_region(struct kvm *kvm, > new->npages = npages; > new->flags = mem->flags; > new->userspace_addr = mem->userspace_addr; > + if (mem->flags & KVM_MEM_PRIVATE) { > + new->private_file = fget(mem->private_fd); > + if (!new->private_file) { > + r = -EINVAL; > + goto out; > + } > + new->private_offset = mem->private_offset; > + } > + > + new->kvm = kvm; > > r = kvm_set_memslot(kvm, old, new, change); > if (r) > - kfree(new); > + goto out; > + > + return 0; > + > +out: > + if (new->private_file) > + fput(new->private_file); > + kfree(new); > return r; > } > EXPORT_SYMBOL_GPL(__kvm_set_memory_region); > @@ -4712,12 +4813,10 @@ static long kvm_vm_ioctl(struct file *filp, > (u32 __user *)(argp + offsetof(typeof(mem), flags)))) > goto out; > > - if (flags & KVM_MEM_PRIVATE) { > - r = -EINVAL; > - goto out; > - } > - > - size = sizeof(struct kvm_userspace_memory_region); > + if (flags & KVM_MEM_PRIVATE) > + size = sizeof(struct kvm_userspace_memory_region_ext); Not sure whether we should use kvm_userspace_memory_region_ext or kvm_user_mem_region here; asking just for readability. > + else > + size = sizeof(struct kvm_userspace_memory_region); > > if (copy_from_user(&mem, argp, size)) > goto out;
On Tue, Jul 19, 2022 at 11:55:24AM +0200, Gupta, Pankaj wrote: ... > > @@ -4712,12 +4813,10 @@ static long kvm_vm_ioctl(struct file *filp, > > (u32 __user *)(argp + offsetof(typeof(mem), flags)))) > > goto out; > > - if (flags & KVM_MEM_PRIVATE) { > > - r = -EINVAL; > > - goto out; > > - } > > - > > - size = sizeof(struct kvm_userspace_memory_region); > > + if (flags & KVM_MEM_PRIVATE) > > + size = sizeof(struct kvm_userspace_memory_region_ext); > > Not sure if we use kvm_userspace_memory_region_ext or kvm_user_mem_region, > just for readability. Partly for readability, but mainly for code maintainability: kvm_user_mem_region is designed to be an alias of kvm_userspace_memory_region_ext, so that in the code we can access the 'unpacked' fields using something like 'mem.userspace_addr' instead of 'mem.region.userspace_addr'. Chao > > > + else > > + size = sizeof(struct kvm_userspace_memory_region); > > if (copy_from_user(&mem, argp, size)) > > goto out;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8f56426aa1e3..4e5a0db68799 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -584,6 +584,7 @@ struct kvm_memory_slot { struct file *private_file; loff_t private_offset; struct memfile_notifier notifier; + struct kvm *kvm; }; static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bb714c2a4b06..d6f7e074cab2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -941,6 +941,63 @@ static int kvm_vm_ioctl_set_encrypted_region(struct kvm *kvm, unsigned int ioctl return r; } + +static void kvm_memfile_notifier_invalidate(struct memfile_notifier *notifier, + pgoff_t start, pgoff_t end) +{ + struct kvm_memory_slot *slot = container_of(notifier, + struct kvm_memory_slot, + notifier); + unsigned long base_pgoff = slot->private_offset >> PAGE_SHIFT; + gfn_t start_gfn = slot->base_gfn; + gfn_t end_gfn = slot->base_gfn + slot->npages; + + + if (start > base_pgoff) + start_gfn = slot->base_gfn + start - base_pgoff; + + if (end < base_pgoff + slot->npages) + end_gfn = slot->base_gfn + end - base_pgoff; + + if (start_gfn >= end_gfn) + return; + + kvm_zap_gfn_range(slot->kvm, start_gfn, end_gfn); +} + +static struct memfile_notifier_ops kvm_memfile_notifier_ops = { + .invalidate = kvm_memfile_notifier_invalidate, +}; + +#define KVM_MEMFILE_FLAGS (MEMFILE_F_USER_INACCESSIBLE | \ + MEMFILE_F_UNMOVABLE | \ + MEMFILE_F_UNRECLAIMABLE) + +static inline int kvm_private_mem_register(struct kvm_memory_slot *slot) +{ + slot->notifier.ops = &kvm_memfile_notifier_ops; + return memfile_register_notifier(slot->private_file, KVM_MEMFILE_FLAGS, + &slot->notifier); +} + +static inline void kvm_private_mem_unregister(struct kvm_memory_slot *slot) +{ + memfile_unregister_notifier(&slot->notifier); +} + +#else /* !CONFIG_HAVE_KVM_PRIVATE_MEM */ + +static inline int kvm_private_mem_register(struct kvm_memory_slot *slot) 
+{ + WARN_ON_ONCE(1); + return -EOPNOTSUPP; +} + +static inline void kvm_private_mem_unregister(struct kvm_memory_slot *slot) +{ + WARN_ON_ONCE(1); +} + #endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */ #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER @@ -987,6 +1044,11 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) /* This does not remove the slot from struct kvm_memslots data structures */ static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { + if (slot->flags & KVM_MEM_PRIVATE) { + kvm_private_mem_unregister(slot); + fput(slot->private_file); + } + kvm_destroy_dirty_bitmap(slot); kvm_arch_free_memslot(kvm, slot); @@ -1548,10 +1610,16 @@ bool __weak kvm_arch_private_mem_supported(struct kvm *kvm) return false; } -static int check_memory_region_flags(const struct kvm_user_mem_region *mem) +static int check_memory_region_flags(struct kvm *kvm, + const struct kvm_user_mem_region *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM + if (kvm_arch_private_mem_supported(kvm)) + valid_flags |= KVM_MEM_PRIVATE; +#endif + #ifdef __KVM_HAVE_READONLY_MEM valid_flags |= KVM_MEM_READONLY; #endif @@ -1627,6 +1695,12 @@ static int kvm_prepare_memory_region(struct kvm *kvm, { int r; + if (change == KVM_MR_CREATE && new->flags & KVM_MEM_PRIVATE) { + r = kvm_private_mem_register(new); + if (r) + return r; + } + /* * If dirty logging is disabled, nullify the bitmap; the old bitmap * will be freed on "commit". 
If logging is enabled in both old and @@ -1655,6 +1729,9 @@ static int kvm_prepare_memory_region(struct kvm *kvm, if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap)) kvm_destroy_dirty_bitmap(new); + if (r && change == KVM_MR_CREATE && new->flags & KVM_MEM_PRIVATE) + kvm_private_mem_unregister(new); + return r; } @@ -1952,7 +2029,7 @@ int __kvm_set_memory_region(struct kvm *kvm, int as_id, id; int r; - r = check_memory_region_flags(mem); + r = check_memory_region_flags(kvm, mem); if (r) return r; @@ -1971,6 +2048,10 @@ int __kvm_set_memory_region(struct kvm *kvm, !access_ok((void __user *)(unsigned long)mem->userspace_addr, mem->memory_size)) return -EINVAL; + if (mem->flags & KVM_MEM_PRIVATE && + (mem->private_offset & (PAGE_SIZE - 1) || + mem->private_offset > U64_MAX - mem->memory_size)) + return -EINVAL; if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) @@ -2009,6 +2090,9 @@ int __kvm_set_memory_region(struct kvm *kvm, if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages) return -EINVAL; } else { /* Modify an existing slot. */ + /* Private memslots are immutable, they can only be deleted. 
*/ + if (mem->flags & KVM_MEM_PRIVATE) + return -EINVAL; if ((mem->userspace_addr != old->userspace_addr) || (npages != old->npages) || ((mem->flags ^ old->flags) & KVM_MEM_READONLY)) @@ -2037,10 +2121,27 @@ int __kvm_set_memory_region(struct kvm *kvm, new->npages = npages; new->flags = mem->flags; new->userspace_addr = mem->userspace_addr; + if (mem->flags & KVM_MEM_PRIVATE) { + new->private_file = fget(mem->private_fd); + if (!new->private_file) { + r = -EINVAL; + goto out; + } + new->private_offset = mem->private_offset; + } + + new->kvm = kvm; r = kvm_set_memslot(kvm, old, new, change); if (r) - kfree(new); + goto out; + + return 0; + +out: + if (new->private_file) + fput(new->private_file); + kfree(new); return r; } EXPORT_SYMBOL_GPL(__kvm_set_memory_region); @@ -4712,12 +4813,10 @@ static long kvm_vm_ioctl(struct file *filp, (u32 __user *)(argp + offsetof(typeof(mem), flags)))) goto out; - if (flags & KVM_MEM_PRIVATE) { - r = -EINVAL; - goto out; - } - - size = sizeof(struct kvm_userspace_memory_region); + if (flags & KVM_MEM_PRIVATE) + size = sizeof(struct kvm_userspace_memory_region_ext); + else + size = sizeof(struct kvm_userspace_memory_region); if (copy_from_user(&mem, argp, size)) goto out;