diff mbox series

[v1,2/9] KVM: s390: Extend MEM_OP ioctl by storage key checked cmpxchg

Message ID 20220930210751.225873-3-scgl@linux.ibm.com
State New
Headers show
Series KVM: s390: Extend MEM_OP ioctl by storage key checked cmpxchg | expand

Commit Message

Janis Schoetterl-Glausch Sept. 30, 2022, 9:07 p.m. UTC
User space can use the MEM_OP ioctl to make storage key checked reads
and writes to the guest, however, it has no way of performing atomic,
key checked, accesses to the guest.
Extend the MEM_OP ioctl in order to allow for this, by adding a cmpxchg
mode. For now, support this mode for absolute accesses only.

This mode can be use, for example, to set the device-state-change
indicator and the adapter-local-summary indicator atomically.

Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
---


The return value of MEM_OP is:
  0 on success,
  < 0 on generic error (e.g. -EFAULT or -ENOMEM),
  > 0 if an exception occurred while walking the page tables
A cmpxchg failing because the old value doesn't match is neither an
error nor an exception, so the question is how best to signal that
condition. This is not strictly necessary since user space can compare
the value of old after the MEM_OP with the value it set. If they're
different the cmpxchg failed. It might be a better user interface if
there is an easier way to see if the cmpxchg failed.
This patch sets the cmpxchg flag bit to 0 on a successful cmpxchg.
This way you can compare against a constant instead of the old old
value.
This has the disadvantage of being a bit weird, other suggestions
welcome.


 include/uapi/linux/kvm.h |  5 ++++
 arch/s390/kvm/gaccess.h  |  4 +++
 arch/s390/kvm/gaccess.c  | 56 ++++++++++++++++++++++++++++++++++++++++
 arch/s390/kvm/kvm-s390.c | 50 ++++++++++++++++++++++++++++++-----
 4 files changed, 109 insertions(+), 6 deletions(-)

Comments

kernel test robot Oct. 1, 2022, 5:03 a.m. UTC | #1
Hi Janis,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on f76349cf41451c5c42a99f18a9163377e4b364ff]

url:    https://github.com/intel-lab-lkp/linux/commits/Janis-Schoetterl-Glausch/KVM-s390-Extend-MEM_OP-ioctl-by-storage-key-checked-cmpxchg/20221001-050945
base:   f76349cf41451c5c42a99f18a9163377e4b364ff
config: s390-randconfig-m031-20220925
compiler: s390-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/b53d129604de03147fce1d353698f961b256f895
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Janis-Schoetterl-Glausch/KVM-s390-Extend-MEM_OP-ioctl-by-storage-key-checked-cmpxchg/20221001-050945
        git checkout b53d129604de03147fce1d353698f961b256f895
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   arch/s390/include/asm/uaccess.h: Assembler messages:
>> arch/s390/include/asm/uaccess.h:430: Error: Unrecognized opcode: `xrk'
   arch/s390/include/asm/uaccess.h:434: Error: Unrecognized opcode: `xrk'


vim +430 arch/s390/include/asm/uaccess.h

110a6dbb2eca6b Heiko Carstens           2020-09-14  394  
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  395  static __always_inline int __cmpxchg_user_key_small(int size, u64 address,
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  396  						    unsigned __int128 *old_p,
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  397  						    unsigned __int128 new, u8 access_key)
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  398  {
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  399  	u32 shift, mask, old_word, new_word, align_mask, tmp, diff;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  400  	u64 aligned;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  401  	int ret = -EFAULT;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  402  
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  403  	switch (size) {
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  404  	case 2:
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  405  		align_mask = 2;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  406  		aligned = (address ^ (address & align_mask));
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  407  		shift = (sizeof(u32) - (address & align_mask) - size) * 8;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  408  		mask = 0xffff << shift;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  409  		old_word = ((u16)*old_p) << shift;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  410  		new_word = ((u16)new) << shift;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  411  		break;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  412  	case 1:
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  413  		align_mask = 3;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  414  		aligned = (address ^ (address & align_mask));
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  415  		shift = (sizeof(u32) - (address & align_mask) - size) * 8;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  416  		mask = 0xff << shift;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  417  		old_word = ((u8)*old_p) << shift;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  418  		new_word = ((u8)new) << shift;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  419  		break;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  420  	}
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  421  	asm volatile(
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  422  		       "spka	0(%[access_key])\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  423  		"	sacf	256\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  424  		"0:	l	%[tmp],%[aligned]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  425  		"1:	nr	%[tmp],%[hole_mask]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  426  		"	or	%[new_word],%[tmp]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  427  		"	or	%[old_word],%[tmp]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  428  		"	lr	%[tmp],%[old_word]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  429  		"2:	cs	%[tmp],%[new_word],%[aligned]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30 @430  		"3:	jnl	4f\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  431  		"	xrk	%[diff],%[tmp],%[old_word]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  432  		"	nr	%[diff],%[hole_mask]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  433  		"	xr	%[new_word],%[diff]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  434  		"	xr	%[old_word],%[diff]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  435  		"	xrk	%[diff],%[tmp],%[old_word]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  436  		"	jz	2b\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  437  		"4:	ipm	%[ret]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  438  		"	srl	%[ret],28\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  439  		"5:	sacf	768\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  440  		"	spka	%[default_key]\n"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  441  		EX_TABLE(0b, 5b) EX_TABLE(1b, 5b)
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  442  		EX_TABLE(2b, 5b) EX_TABLE(3b, 5b)
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  443  		: [old_word] "+&d" (old_word),
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  444  		  [new_word] "+&d" (new_word),
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  445  		  [tmp] "=&d" (tmp),
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  446  		  [aligned] "+Q" (*(u32 *)aligned),
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  447  		  [diff] "=&d" (diff),
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  448  		  [ret] "+d" (ret)
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  449  		: [access_key] "a" (access_key << 4),
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  450  		  [hole_mask] "d" (~mask),
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  451  		  [default_key] "J" (PAGE_DEFAULT_KEY)
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  452  		: "cc"
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  453  	);
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  454  	*old_p = (tmp & mask) >> shift;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  455  	return ret;
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  456  }
db6f2eb3910899 Janis Schoetterl-Glausch 2022-09-30  457
Thomas Huth Oct. 4, 2022, 8:13 a.m. UTC | #2
On 30/09/2022 23.07, Janis Schoetterl-Glausch wrote:
> User space can use the MEM_OP ioctl to make storage key checked reads
> and writes to the guest, however, it has no way of performing atomic,
> key checked, accesses to the guest.
> Extend the MEM_OP ioctl in order to allow for this, by adding a cmpxchg
> mode. For now, support this mode for absolute accesses only.
> 
> This mode can be use, for example, to set the device-state-change
> indicator and the adapter-local-summary indicator atomically.
> 
> Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
> ---
...
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index eed0315a77a6..b856705f3f6b 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -580,7 +580,9 @@ struct kvm_translation {
>   struct kvm_s390_mem_op {
>   	/* in */
>   	__u64 gaddr;		/* the guest address */
> +	/* in & out */
>   	__u64 flags;		/* flags */
> +	/* in */
>   	__u32 size;		/* amount of bytes */
>   	__u32 op;		/* type of operation */
>   	__u64 buf;		/* buffer in userspace */
> @@ -588,6 +590,8 @@ struct kvm_s390_mem_op {
>   		struct {
>   			__u8 ar;	/* the access register number */
>   			__u8 key;	/* access key, ignored if flag unset */
> +			/* in & out */
> +			__u64 old[2];	/* ignored if flag unset */

The alignment looks very unfortunate now ... could you please add a "__u8 
pad[6]" or "__u8 pad[14]" in front of the new field?

>   		};
>   		__u32 sida_offset; /* offset into the sida */
>   		__u8 reserved[32]; /* ignored */
> @@ -604,6 +608,7 @@ struct kvm_s390_mem_op {
>   #define KVM_S390_MEMOP_F_CHECK_ONLY		(1ULL << 0)
>   #define KVM_S390_MEMOP_F_INJECT_EXCEPTION	(1ULL << 1)
>   #define KVM_S390_MEMOP_F_SKEY_PROTECTION	(1ULL << 2)
> +#define KVM_S390_MEMOP_F_CMPXCHG		(1ULL << 3)
>   
>   /* for KVM_INTERRUPT */
>   struct kvm_interrupt {
> diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
> index 9408d6cc8e2c..a1cb66ae0995 100644
> --- a/arch/s390/kvm/gaccess.h
> +++ b/arch/s390/kvm/gaccess.h
> @@ -206,6 +206,10 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
>   int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
>   		      void *data, unsigned long len, enum gacc_mode mode);
>   
> +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
> +			       unsigned __int128 *old,
> +			       unsigned __int128 new, u8 access_key);
> +
>   /**
>    * write_guest_with_key - copy data from kernel space to guest space
>    * @vcpu: virtual cpu
> diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
> index 0243b6e38d36..c0e490ecc372 100644
> --- a/arch/s390/kvm/gaccess.c
> +++ b/arch/s390/kvm/gaccess.c
> @@ -1161,6 +1161,62 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
>   	return rc;
>   }
>   
> +/**
> + * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address.
> + * @kvm: Virtual machine instance.
> + * @gpa: Absolute guest address of the location to be changed.
> + * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a
> + *       non power of two will result in failure.
> + * @old_p: Pointer to old value. If the location at @gpa contains this value, the
> + *         exchange will succeed. After calling cmpxchg_guest_abs_with_key() *@old
> + *         contains the value at @gpa before the attempt to exchange the value.
> + * @new: The value to place at @gpa.
> + * @access_key: The access key to use for the guest access.
> + *
> + * Atomically exchange the value at @gpa by @new, if it contains *@old.
> + * Honors storage keys.
> + *
> + * Return: * 0: successful exchange
> + *         * 1: exchange unsuccessful
> + *         * a program interruption code indicating the reason cmpxchg could
> + *           not be attempted
> + *         * -EINVAL: address misaligned or len not power of two
> + */
> +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
> +			       unsigned __int128 *old_p, unsigned __int128 new,
> +			       u8 access_key)
> +{
> +	gfn_t gfn = gpa >> PAGE_SHIFT;
> +	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
> +	bool writable;
> +	hva_t hva;
> +	int ret;
> +
> +	if (!IS_ALIGNED(gpa, len))
> +		return -EINVAL;
> +
> +	hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
> +	if (kvm_is_error_hva(hva))
> +		return PGM_ADDRESSING;
> +	/*
> +	 * Check if it's a ro memslot, even tho that can't occur (they're unsupported).

Not everybody is used to read such abbreviated English ... I'd recommend to 
spell it rather properly (ro ==> read-only, tho ==> though)

> +	 * Don't try to actually handle that case.
> +	 */
> +	if (!writable)
> +		return -EOPNOTSUPP;
> +
> +	hva += offset_in_page(gpa);
> +	ret = cmpxchg_user_key_size(len, (void __user *)hva, old_p, new, access_key);
> +	mark_page_dirty_in_slot(kvm, slot, gfn);
> +	/*
> +	 * Assume that the fault is caused by key protection, the alternative
> +	 * is that the user page is write protected.
> +	 */
> +	if (ret == -EFAULT)
> +		ret = PGM_PROTECTION;
> +	return ret;
> +}
> +
>   /**
>    * guest_translate_address_with_key - translate guest logical into guest absolute address
>    * @vcpu: virtual cpu
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index b7ef0b71014d..d594d1318d2a 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -576,7 +576,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>   	case KVM_CAP_S390_VCPU_RESETS:
>   	case KVM_CAP_SET_GUEST_DEBUG:
>   	case KVM_CAP_S390_DIAG318:
> -	case KVM_CAP_S390_MEM_OP_EXTENSION:
>   		r = 1;
>   		break;
>   	case KVM_CAP_SET_GUEST_DEBUG2:
> @@ -590,6 +589,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>   	case KVM_CAP_S390_MEM_OP:
>   		r = MEM_OP_MAX_SIZE;
>   		break;
> +	case KVM_CAP_S390_MEM_OP_EXTENSION:
> +		r = 0x3;

Add a comment to explain that magic value 0x3 ?

> +		break;
>   	case KVM_CAP_NR_VCPUS:
>   	case KVM_CAP_MAX_VCPUS:
>   	case KVM_CAP_MAX_VCPU_ID:

  Thomas
Thomas Huth Oct. 5, 2022, 6:32 a.m. UTC | #3
On 30/09/2022 23.07, Janis Schoetterl-Glausch wrote:
> User space can use the MEM_OP ioctl to make storage key checked reads
> and writes to the guest, however, it has no way of performing atomic,
> key checked, accesses to the guest.
> Extend the MEM_OP ioctl in order to allow for this, by adding a cmpxchg
> mode. For now, support this mode for absolute accesses only.
> 
> This mode can be use, for example, to set the device-state-change
> indicator and the adapter-local-summary indicator atomically.
> 
> Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
> ---
> 
> 
> The return value of MEM_OP is:
>    0 on success,
>    < 0 on generic error (e.g. -EFAULT or -ENOMEM),
>    > 0 if an exception occurred while walking the page tables
> A cmpxchg failing because the old value doesn't match is neither an
> error nor an exception, so the question is how best to signal that
> condition. This is not strictly necessary since user space can compare
> the value of old after the MEM_OP with the value it set. If they're
> different the cmpxchg failed. It might be a better user interface if
> there is an easier way to see if the cmpxchg failed.
> This patch sets the cmpxchg flag bit to 0 on a successful cmpxchg.
> This way you can compare against a constant instead of the old old
> value.
> This has the disadvantage of being a bit weird, other suggestions
> welcome.

This also breaks the old API of defining the ioctl as _IOW only ... with 
your change to the flags field, it effectively gets IOWR instead.

Maybe it would be better to put all the new logic into a new struct and only 
pass a pointer to that struct in kvm_s390_mem_op, so that the ioctl stays 
IOW ? ... or maybe even introduce a completely new ioctl for this 
functionality instead?

  Thomas
Janis Schoetterl-Glausch Oct. 5, 2022, 7:16 p.m. UTC | #4
On Wed, 2022-10-05 at 08:32 +0200, Thomas Huth wrote:
> On 30/09/2022 23.07, Janis Schoetterl-Glausch wrote:
> > User space can use the MEM_OP ioctl to make storage key checked reads
> > and writes to the guest, however, it has no way of performing atomic,
> > key checked, accesses to the guest.
> > Extend the MEM_OP ioctl in order to allow for this, by adding a cmpxchg
> > mode. For now, support this mode for absolute accesses only.
> > 
> > This mode can be use, for example, to set the device-state-change
> > indicator and the adapter-local-summary indicator atomically.
> > 
> > Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
> > ---
> > 
> > 
> > The return value of MEM_OP is:
> >    0 on success,
> >    < 0 on generic error (e.g. -EFAULT or -ENOMEM),
> >    > 0 if an exception occurred while walking the page tables
> > A cmpxchg failing because the old value doesn't match is neither an
> > error nor an exception, so the question is how best to signal that
> > condition. This is not strictly necessary since user space can compare
> > the value of old after the MEM_OP with the value it set. If they're
> > different the cmpxchg failed. It might be a better user interface if
> > there is an easier way to see if the cmpxchg failed.
> > This patch sets the cmpxchg flag bit to 0 on a successful cmpxchg.
> > This way you can compare against a constant instead of the old old
> > value.
> > This has the disadvantage of being a bit weird, other suggestions
> > welcome.
> 
> This also breaks the old API of defining the ioctl as _IOW only ... with 
> your change to the flags field, it effectively gets IOWR instead.

Oh, right.
> 
> Maybe it would be better to put all the new logic into a new struct and only 
> pass a pointer to that struct in kvm_s390_mem_op, so that the ioctl stays 
> IOW ? ... or maybe even introduce a completely new ioctl for this 
> functionality instead?

Hmmm, the latter seems a bit ugly since there is so much commonality
with the existing memop. 
> 
>   Thomas
>
diff mbox series

Patch

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index eed0315a77a6..b856705f3f6b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -580,7 +580,9 @@  struct kvm_translation {
 struct kvm_s390_mem_op {
 	/* in */
 	__u64 gaddr;		/* the guest address */
+	/* in & out */
 	__u64 flags;		/* flags */
+	/* in */
 	__u32 size;		/* amount of bytes */
 	__u32 op;		/* type of operation */
 	__u64 buf;		/* buffer in userspace */
@@ -588,6 +590,8 @@  struct kvm_s390_mem_op {
 		struct {
 			__u8 ar;	/* the access register number */
 			__u8 key;	/* access key, ignored if flag unset */
+			/* in & out */
+			__u64 old[2];	/* ignored if flag unset */
 		};
 		__u32 sida_offset; /* offset into the sida */
 		__u8 reserved[32]; /* ignored */
@@ -604,6 +608,7 @@  struct kvm_s390_mem_op {
 #define KVM_S390_MEMOP_F_CHECK_ONLY		(1ULL << 0)
 #define KVM_S390_MEMOP_F_INJECT_EXCEPTION	(1ULL << 1)
 #define KVM_S390_MEMOP_F_SKEY_PROTECTION	(1ULL << 2)
+#define KVM_S390_MEMOP_F_CMPXCHG		(1ULL << 3)
 
 /* for KVM_INTERRUPT */
 struct kvm_interrupt {
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 9408d6cc8e2c..a1cb66ae0995 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -206,6 +206,10 @@  int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
 		      void *data, unsigned long len, enum gacc_mode mode);
 
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
+			       unsigned __int128 *old,
+			       unsigned __int128 new, u8 access_key);
+
 /**
  * write_guest_with_key - copy data from kernel space to guest space
  * @vcpu: virtual cpu
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 0243b6e38d36..c0e490ecc372 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1161,6 +1161,62 @@  int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
 	return rc;
 }
 
+/**
+ * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address.
+ * @kvm: Virtual machine instance.
+ * @gpa: Absolute guest address of the location to be changed.
+ * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a
+ *       non power of two will result in failure.
+ * @old_p: Pointer to old value. If the location at @gpa contains this value, the
+ *         exchange will succeed. After calling cmpxchg_guest_abs_with_key() *@old
+ *         contains the value at @gpa before the attempt to exchange the value.
+ * @new: The value to place at @gpa.
+ * @access_key: The access key to use for the guest access.
+ *
+ * Atomically exchange the value at @gpa by @new, if it contains *@old.
+ * Honors storage keys.
+ *
+ * Return: * 0: successful exchange
+ *         * 1: exchange unsuccessful
+ *         * a program interruption code indicating the reason cmpxchg could
+ *           not be attempted
+ *         * -EINVAL: address misaligned or len not power of two
+ */
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
+			       unsigned __int128 *old_p, unsigned __int128 new,
+			       u8 access_key)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+	bool writable;
+	hva_t hva;
+	int ret;
+
+	if (!IS_ALIGNED(gpa, len))
+		return -EINVAL;
+
+	hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+	if (kvm_is_error_hva(hva))
+		return PGM_ADDRESSING;
+	/*
+	 * Check if it's a ro memslot, even tho that can't occur (they're unsupported).
+	 * Don't try to actually handle that case.
+	 */
+	if (!writable)
+		return -EOPNOTSUPP;
+
+	hva += offset_in_page(gpa);
+	ret = cmpxchg_user_key_size(len, (void __user *)hva, old_p, new, access_key);
+	mark_page_dirty_in_slot(kvm, slot, gfn);
+	/*
+	 * Assume that the fault is caused by key protection, the alternative
+	 * is that the user page is write protected.
+	 */
+	if (ret == -EFAULT)
+		ret = PGM_PROTECTION;
+	return ret;
+}
+
 /**
  * guest_translate_address_with_key - translate guest logical into guest absolute address
  * @vcpu: virtual cpu
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index b7ef0b71014d..d594d1318d2a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -576,7 +576,6 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_VCPU_RESETS:
 	case KVM_CAP_SET_GUEST_DEBUG:
 	case KVM_CAP_S390_DIAG318:
-	case KVM_CAP_S390_MEM_OP_EXTENSION:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
@@ -590,6 +589,9 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_MEM_OP:
 		r = MEM_OP_MAX_SIZE;
 		break;
+	case KVM_CAP_S390_MEM_OP_EXTENSION:
+		r = 0x3;
+		break;
 	case KVM_CAP_NR_VCPUS:
 	case KVM_CAP_MAX_VCPUS:
 	case KVM_CAP_MAX_VCPU_ID:
@@ -2711,15 +2713,22 @@  static bool access_key_invalid(u8 access_key)
 	return access_key > 0xf;
 }
 
-static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop, bool *modified)
 {
 	void __user *uaddr = (void __user *)mop->buf;
+	unsigned __int128 old;
+	union {
+		unsigned __int128 quad;
+		char raw[sizeof(unsigned __int128)];
+	} new = { .quad = 0 };
 	u64 supported_flags;
 	void *tmpbuf = NULL;
 	int r, srcu_idx;
 
+	*modified = false;
 	supported_flags = KVM_S390_MEMOP_F_SKEY_PROTECTION
-			  | KVM_S390_MEMOP_F_CHECK_ONLY;
+			  | KVM_S390_MEMOP_F_CHECK_ONLY
+			  | KVM_S390_MEMOP_F_CMPXCHG;
 	if (mop->flags & ~supported_flags || !mop->size)
 		return -EINVAL;
 	if (mop->size > MEM_OP_MAX_SIZE)
@@ -2741,6 +2750,13 @@  static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
 	} else {
 		mop->key = 0;
 	}
+	if (mop->flags & KVM_S390_MEMOP_F_CMPXCHG) {
+		if (mop->size > sizeof(new))
+			return -EINVAL;
+		if (copy_from_user(&new.raw[sizeof(new) - mop->size], uaddr, mop->size))
+			return -EFAULT;
+		memcpy(&old, mop->old, sizeof(old));
+	}
 	if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
 		tmpbuf = vmalloc(mop->size);
 		if (!tmpbuf)
@@ -2771,6 +2787,16 @@  static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
 	case KVM_S390_MEMOP_ABSOLUTE_WRITE: {
 		if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
 			r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_STORE, mop->key);
+		} else if (mop->flags & KVM_S390_MEMOP_F_CMPXCHG) {
+			r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size,
+						       &old, new.quad, mop->key);
+			if (!r) {
+				mop->flags &= ~KVM_S390_MEMOP_F_CMPXCHG;
+			} else if (r == 1) {
+				memcpy(mop->old, &old, sizeof(old));
+				r = 0;
+			}
+			*modified = true;
 		} else {
 			if (copy_from_user(tmpbuf, uaddr, mop->size)) {
 				r = -EFAULT;
@@ -2918,11 +2944,23 @@  long kvm_arch_vm_ioctl(struct file *filp,
 	}
 	case KVM_S390_MEM_OP: {
 		struct kvm_s390_mem_op mem_op;
+		bool modified;
 
-		if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0)
-			r = kvm_s390_vm_mem_op(kvm, &mem_op);
-		else
+		r = copy_from_user(&mem_op, argp, sizeof(mem_op));
+		if (r) {
 			r = -EFAULT;
+			break;
+		}
+		r = kvm_s390_vm_mem_op(kvm, &mem_op, &modified);
+		if (r)
+			break;
+		if (modified) {
+			r = copy_to_user(argp, &mem_op, sizeof(mem_op));
+			if (r) {
+				r = -EFAULT;
+				break;
+			}
+		}
 		break;
 	}
 	case KVM_S390_ZPCI_OP: {