Message ID | 20221111080820.2132412-13-richard.henderson@linaro.org |
---|---|
State | New |
Headers | show |
Series | target/s390x: Use TCGv_i128 | expand |
On Fri, Nov 11, 2022 at 06:08:19PM +1000, Richard Henderson wrote:
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/s390x/helper.h            |  2 --
>  target/s390x/tcg/mem_helper.c    | 52 ---------------------------
>  target/s390x/tcg/translate.c     | 60 ++++++++++++++++++++------------
>  target/s390x/tcg/insn-data.h.inc |  2 +-
>  4 files changed, 38 insertions(+), 78 deletions(-)

Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>

I was wondering what assembly this would generate in parallel mode and
wrote a small test. On my x86_64 machine it ended up being
helper_atomic_cmpxchgo_be() -> cpu_atomic_cmpxchgo_be_mmu() ->
lock cmpxchg16b, nothing surprising.

On an s390x host we fall back to cpu_exec_step_atomic(), because in the
configure test:

    int main(void)
    {
      unsigned __int128 x = 0, y = 0;
      __sync_val_compare_and_swap_16(&x, y, x);
      return 0;
    }

x and y are not aligned. I guess that's working as intended as well,
even though it would be nice to eventually make use of cdsg there.

I will post the test shortly.
On 11/28/22 15:40, Ilya Leoshkevich wrote:
> On an s390x host we fall back to cpu_exec_step_atomic(), because in the
> configure test:
>
>     int main(void)
>     {
>       unsigned __int128 x = 0, y = 0;
>       __sync_val_compare_and_swap_16(&x, y, x);
>       return 0;
>     }
>
> x and y are not aligned. I guess that's working as intended as well,
> even though it would be nice to eventually make use of cdsg there.

I have a fix for that as a part of
'[PATCH for-8.0 00/29] tcg: Improve atomicity support':

https://lore.kernel.org/qemu-devel/20221118094754.242910-14-richard.henderson@linaro.org/

r~
On 29.11.22 00:48, Ilya Leoshkevich wrote:
> Add a simple test to prevent regressions.
>
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> ---
>  tests/tcg/s390x/Makefile.target |  4 ++
>  tests/tcg/s390x/cdsg.c          | 73 +++++++++++++++++++++++++++++++++
>  2 files changed, 77 insertions(+)
>  create mode 100644 tests/tcg/s390x/cdsg.c
>
> diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
> index 1d454270c0e..523214dac33 100644
> --- a/tests/tcg/s390x/Makefile.target
> +++ b/tests/tcg/s390x/Makefile.target
> @@ -27,6 +27,7 @@ TESTS+=noexec
>  TESTS+=div
>  TESTS+=clst
>  TESTS+=long-double
> +TESTS+=cdsg
>
>  Z13_TESTS=vistr
>  $(Z13_TESTS): CFLAGS+=-march=z13 -O2
> @@ -66,3 +67,6 @@ sha512-mvx: sha512.c
>  	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
>
>  TESTS+=sha512-mvx
> +
> +cdsg: CFLAGS+=-pthread
> +cdsg: LDFLAGS+=-pthread
> diff --git a/tests/tcg/s390x/cdsg.c b/tests/tcg/s390x/cdsg.c
> new file mode 100644
> index 00000000000..83313699f7d
> --- /dev/null
> +++ b/tests/tcg/s390x/cdsg.c
> @@ -0,0 +1,73 @@
> +#include <assert.h>
> +#include <pthread.h>
> +#include <stdbool.h>
> +#include <stdlib.h>
> +
> +static volatile bool start;
> +static unsigned long val[2] __attribute__((__aligned__(16)));
> +
> +void *cdsg_loop(void *arg)
> +{
> +    unsigned long orig0, orig1, new0, new1;
> +    register unsigned long r0 asm("r0");
> +    register unsigned long r1 asm("r1");
> +    register unsigned long r2 asm("r2");
> +    register unsigned long r3 asm("r3");
> +    int cc;
> +    int i;
> +
> +    while (!start) {
> +    }
> +
> +    orig0 = val[0];
> +    orig1 = val[1];
> +    for (i = 0; i < 1000;) {

Are 1000 iterations sufficient to catch the race window reliably?

> +        new0 = orig0 + 1;
> +        new1 = orig1 + 2;
> +
> +        r0 = orig0;
> +        r1 = orig1;
> +        r2 = new0;
> +        r3 = new1;
> +        asm("cdsg %[r0],%[r2],%[db2]\n"
> +            "ipm %[cc]"
> +            : [r0] "+r" (r0)
> +            , [r1] "+r" (r1)
> +            , [db2] "=m" (val)
> +            , [cc] "=r" (cc)
> +            : [r2] "r" (r2)
> +            , [r3] "r" (r3)
> +            : "cc");

Nit: I'd suggest a simple cdsg helper function that makes this code easier
to digest.

> +        orig0 = r0;
> +        orig1 = r1;
> +        cc = (cc >> 28) & 3;
> +
> +        if (cc == 0) {
> +            orig0 = new0;
> +            orig1 = new1;
> +            i++;
> +        } else {
> +            assert(cc == 1);
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +int main(void)
> +{
> +    pthread_t thread;
> +    int ret;
> +
> +    ret = pthread_create(&thread, NULL, cdsg_loop, NULL);
> +    assert(ret == 0);
> +    start = true;
> +    cdsg_loop(NULL);
> +    ret = pthread_join(thread, NULL);
> +    assert(ret == 0);
> +
> +    assert(val[0] == 2000);
> +    assert(val[1] == 4000);
> +
> +    return EXIT_SUCCESS;
> +}
On Tue, Nov 29, 2022 at 09:54:13AM +0100, David Hildenbrand wrote:
> On 29.11.22 00:48, Ilya Leoshkevich wrote:
> > Add a simple test to prevent regressions.
> >
> > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> > ---
> >  tests/tcg/s390x/Makefile.target |  4 ++
> >  tests/tcg/s390x/cdsg.c          | 73 +++++++++++++++++++++++++++++++++
> >  2 files changed, 77 insertions(+)
> >  create mode 100644 tests/tcg/s390x/cdsg.c
> >
> > diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
> > index 1d454270c0e..523214dac33 100644
> > --- a/tests/tcg/s390x/Makefile.target
> > +++ b/tests/tcg/s390x/Makefile.target
> > @@ -27,6 +27,7 @@ TESTS+=noexec
> >  TESTS+=div
> >  TESTS+=clst
> >  TESTS+=long-double
> > +TESTS+=cdsg
> >  Z13_TESTS=vistr
> >  $(Z13_TESTS): CFLAGS+=-march=z13 -O2
> > @@ -66,3 +67,6 @@ sha512-mvx: sha512.c
> >  	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
> >  TESTS+=sha512-mvx
> > +
> > +cdsg: CFLAGS+=-pthread
> > +cdsg: LDFLAGS+=-pthread
> > diff --git a/tests/tcg/s390x/cdsg.c b/tests/tcg/s390x/cdsg.c
> > new file mode 100644
> > index 00000000000..83313699f7d
> > --- /dev/null
> > +++ b/tests/tcg/s390x/cdsg.c
> > @@ -0,0 +1,73 @@
> > +#include <assert.h>
> > +#include <pthread.h>
> > +#include <stdbool.h>
> > +#include <stdlib.h>
> > +
> > +static volatile bool start;
> > +static unsigned long val[2] __attribute__((__aligned__(16)));
> > +
> > +void *cdsg_loop(void *arg)
> > +{
> > +    unsigned long orig0, orig1, new0, new1;
> > +    register unsigned long r0 asm("r0");
> > +    register unsigned long r1 asm("r1");
> > +    register unsigned long r2 asm("r2");
> > +    register unsigned long r3 asm("r3");
> > +    int cc;
> > +    int i;
> > +
> > +    while (!start) {
> > +    }
> > +
> > +    orig0 = val[0];
> > +    orig1 = val[1];
> > +    for (i = 0; i < 1000;) {
>
> Are 1000 iterations sufficient to catch the race window reliably?

Good point, I had to raise it to 10k.
If I break the code like this:

--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -3509,7 +3509,7 @@ void tcg_gen_atomic_cmpxchg_i128(TCGv_i128 retv, TCGv addr, TCGv_i128 cmpv,
 {
     gen_atomic_cx_i128 gen;

-    if (!(tcg_ctx->tb_cflags & CF_PARALLEL)) {
+    if (true) {
         tcg_gen_nonatomic_cmpxchg_i128(retv, addr, cmpv, newv, idx, memop);
         return;
     }

the test with 10k iterations fails consistently.

And it's still fast:

$ time -p ./qemu-s390x ./tests/tcg/s390x-linux-user/cdsg
real 0.01

> > +        new0 = orig0 + 1;
> > +        new1 = orig1 + 2;
> > +
> > +        r0 = orig0;
> > +        r1 = orig1;
> > +        r2 = new0;
> > +        r3 = new1;
> > +        asm("cdsg %[r0],%[r2],%[db2]\n"
> > +            "ipm %[cc]"
> > +            : [r0] "+r" (r0)
> > +            , [r1] "+r" (r1)
> > +            , [db2] "=m" (val)
> > +            , [cc] "=r" (cc)
> > +            : [r2] "r" (r2)
> > +            , [r3] "r" (r3)
> > +            : "cc");
>
> Nit: I'd suggest a simple cdsg helper function that makes this code easier
> to digest.

Ok.

>
> > +        orig0 = r0;
> > +        orig1 = r1;
> > +        cc = (cc >> 28) & 3;
> > +
> > +        if (cc == 0) {
> > +            orig0 = new0;
> > +            orig1 = new1;
> > +            i++;
> > +        } else {
> > +            assert(cc == 1);
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +int main(void)
> > +{
> > +    pthread_t thread;
> > +    int ret;
> > +
> > +    ret = pthread_create(&thread, NULL, cdsg_loop, NULL);
> > +    assert(ret == 0);
> > +    start = true;
> > +    cdsg_loop(NULL);
> > +    ret = pthread_join(thread, NULL);
> > +    assert(ret == 0);
> > +
> > +    assert(val[0] == 2000);
> > +    assert(val[1] == 4000);
> > +
> > +    return EXIT_SUCCESS;
> > +}
>
> --
> Thanks,
>
> David / dhildenb
>
>
diff --git a/target/s390x/helper.h b/target/s390x/helper.h index 481b9019f9..e5001ffddc 100644 --- a/target/s390x/helper.h +++ b/target/s390x/helper.h @@ -35,8 +35,6 @@ DEF_HELPER_3(cxgb, i128, env, s64, i32) DEF_HELPER_3(celgb, i64, env, i64, i32) DEF_HELPER_3(cdlgb, i64, env, i64, i32) DEF_HELPER_3(cxlgb, i128, env, i64, i32) -DEF_HELPER_4(cdsg, void, env, i64, i32, i32) -DEF_HELPER_4(cdsg_parallel, void, env, i64, i32, i32) DEF_HELPER_4(csst, i32, env, i32, i64, i64) DEF_HELPER_4(csst_parallel, i32, env, i32, i64, i64) DEF_HELPER_FLAGS_3(aeb, TCG_CALL_NO_WG, i64, env, i64, i64) diff --git a/target/s390x/tcg/mem_helper.c b/target/s390x/tcg/mem_helper.c index caf8c408ef..ae4df8df3d 100644 --- a/target/s390x/tcg/mem_helper.c +++ b/target/s390x/tcg/mem_helper.c @@ -1771,58 +1771,6 @@ uint32_t HELPER(trXX)(CPUS390XState *env, uint32_t r1, uint32_t r2, return cc; } -void HELPER(cdsg)(CPUS390XState *env, uint64_t addr, - uint32_t r1, uint32_t r3) -{ - uintptr_t ra = GETPC(); - Int128 cmpv = int128_make128(env->regs[r1 + 1], env->regs[r1]); - Int128 newv = int128_make128(env->regs[r3 + 1], env->regs[r3]); - Int128 oldv; - uint64_t oldh, oldl; - bool fail; - - check_alignment(env, addr, 16, ra); - - oldh = cpu_ldq_data_ra(env, addr + 0, ra); - oldl = cpu_ldq_data_ra(env, addr + 8, ra); - - oldv = int128_make128(oldl, oldh); - fail = !int128_eq(oldv, cmpv); - if (fail) { - newv = oldv; - } - - cpu_stq_data_ra(env, addr + 0, int128_gethi(newv), ra); - cpu_stq_data_ra(env, addr + 8, int128_getlo(newv), ra); - - env->cc_op = fail; - env->regs[r1] = int128_gethi(oldv); - env->regs[r1 + 1] = int128_getlo(oldv); -} - -void HELPER(cdsg_parallel)(CPUS390XState *env, uint64_t addr, - uint32_t r1, uint32_t r3) -{ - uintptr_t ra = GETPC(); - Int128 cmpv = int128_make128(env->regs[r1 + 1], env->regs[r1]); - Int128 newv = int128_make128(env->regs[r3 + 1], env->regs[r3]); - int mem_idx; - MemOpIdx oi; - Int128 oldv; - bool fail; - - assert(HAVE_CMPXCHG128); - - mem_idx = 
cpu_mmu_index(env, false); - oi = make_memop_idx(MO_TE | MO_128 | MO_ALIGN, mem_idx); - oldv = cpu_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra); - fail = !int128_eq(oldv, cmpv); - - env->cc_op = fail; - env->regs[r1] = int128_gethi(oldv); - env->regs[r1 + 1] = int128_getlo(oldv); -} - static uint32_t do_csst(CPUS390XState *env, uint32_t r3, uint64_t a1, uint64_t a2, bool parallel) { diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c index 6a351a5245..480c89dae3 100644 --- a/target/s390x/tcg/translate.c +++ b/target/s390x/tcg/translate.c @@ -2224,31 +2224,22 @@ static DisasJumpType op_cs(DisasContext *s, DisasOps *o) static DisasJumpType op_cdsg(DisasContext *s, DisasOps *o) { int r1 = get_field(s, r1); - int r3 = get_field(s, r3); - int d2 = get_field(s, d2); - int b2 = get_field(s, b2); - DisasJumpType ret = DISAS_NEXT; - TCGv_i64 addr; - TCGv_i32 t_r1, t_r3; - /* Note that R1:R1+1 = expected value and R3:R3+1 = new value. */ - addr = get_address(s, 0, b2, d2); - t_r1 = tcg_const_i32(r1); - t_r3 = tcg_const_i32(r3); - if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { - gen_helper_cdsg(cpu_env, addr, t_r1, t_r3); - } else if (HAVE_CMPXCHG128) { - gen_helper_cdsg_parallel(cpu_env, addr, t_r1, t_r3); - } else { - gen_helper_exit_atomic(cpu_env); - ret = DISAS_NORETURN; - } - tcg_temp_free_i64(addr); - tcg_temp_free_i32(t_r1); - tcg_temp_free_i32(t_r3); + /* Note out (R1:R1+1) = expected value and in2 (R3:R3+1) = new value. */ + tcg_gen_atomic_cmpxchg_i128(o->out_128, o->addr1, o->out_128, o->in2_128, + get_mem_index(s), MO_BE | MO_128 | MO_ALIGN); - set_cc_static(s); - return ret; + /* + * Extract result into cc_dst:cc_src, compare vs the expected value + * in the as yet unmodified input registers, then update CC_OP. 
+ */ + tcg_gen_extr_i128_i64(cc_src, cc_dst, o->out_128); + tcg_gen_xor_i64(cc_dst, cc_dst, regs[r1]); + tcg_gen_xor_i64(cc_src, cc_src, regs[r1 + 1]); + tcg_gen_or_i64(cc_dst, cc_dst, cc_src); + set_cc_nz_u64(s, cc_dst); + + return DISAS_NEXT; } static DisasJumpType op_csst(DisasContext *s, DisasOps *o) @@ -5417,6 +5408,14 @@ static void prep_r1_P(DisasContext *s, DisasOps *o) } #define SPEC_prep_r1_P SPEC_r1_even +static void prep_r1_D64(DisasContext *s, DisasOps *o) +{ + int r1 = get_field(s, r1); + o->out_128 = tcg_temp_new_i128(); + tcg_gen_concat_i64_i128(o->out_128, regs[r1 + 1], regs[r1]); +} +#define SPEC_prep_r1_D64 SPEC_r1_even + static void prep_x1(DisasContext *s, DisasOps *o) { o->out_128 = load_freg_128(get_field(s, r1)); @@ -5486,6 +5485,13 @@ static void wout_r1_D32(DisasContext *s, DisasOps *o) } #define SPEC_wout_r1_D32 SPEC_r1_even +static void wout_r1_D64(DisasContext *s, DisasOps *o) +{ + int r1 = get_field(s, r1); + tcg_gen_extr_i128_i64(regs[r1 + 1], regs[r1], o->out_128); +} +#define SPEC_wout_r1_D64 SPEC_r1_even + static void wout_r3_P32(DisasContext *s, DisasOps *o) { int r3 = get_field(s, r3); @@ -5933,6 +5939,14 @@ static void in2_r3(DisasContext *s, DisasOps *o) } #define SPEC_in2_r3 0 +static void in2_r3_D64(DisasContext *s, DisasOps *o) +{ + int r3 = get_field(s, r3); + o->in2_128 = tcg_temp_new_i128(); + tcg_gen_concat_i64_i128(o->in2_128, regs[r3 + 1], regs[r3]); +} +#define SPEC_in2_r3_D64 SPEC_r3_even + static void in2_r3_sr32(DisasContext *s, DisasOps *o) { o->in2 = tcg_temp_new_i64(); diff --git a/target/s390x/tcg/insn-data.h.inc b/target/s390x/tcg/insn-data.h.inc index 1a2a55bf5e..7dfcbdd980 100644 --- a/target/s390x/tcg/insn-data.h.inc +++ b/target/s390x/tcg/insn-data.h.inc @@ -276,7 +276,7 @@ /* COMPARE DOUBLE AND SWAP */ D(0xbb00, CDS, RS_a, Z, r3_D32, r1_D32, new, r1_D32, cs, 0, MO_TEUQ) D(0xeb31, CDSY, RSY_a, LD, r3_D32, r1_D32, new, r1_D32, cs, 0, MO_TEUQ) - C(0xeb3e, CDSG, RSY_a, Z, 0, 0, 0, 0, cdsg, 0) + C(0xeb3e, 
CDSG, RSY_a, Z, la2, r3_D64, r1_D64, r1_D64, cdsg, 0) /* COMPARE AND SWAP AND STORE */ C(0xc802, CSST, SSF, CASS, la1, a2, 0, 0, csst, 0)
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/helper.h            |  2 --
 target/s390x/tcg/mem_helper.c    | 52 ---------------------------
 target/s390x/tcg/translate.c     | 60 ++++++++++++++++++++------------
 target/s390x/tcg/insn-data.h.inc |  2 +-
 4 files changed, 38 insertions(+), 78 deletions(-)