diff mbox series

[1/2] tcg: Add gvec compare with immediate and scalar operand

Message ID 20230831030904.1194667-2-richard.henderson@linaro.org
State New
Headers show
Series tcg: Add gvec compare with immediate and scalar operand | expand

Commit Message

Richard Henderson Aug. 31, 2023, 3:09 a.m. UTC
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/tcg-runtime.h          |  25 ++++++
 include/tcg/tcg-op-gvec-common.h |   6 ++
 accel/tcg/tcg-runtime-gvec.c     |  26 ++++++
 tcg/tcg-op-gvec.c                | 150 +++++++++++++++++++++++++++++++
 4 files changed, 207 insertions(+)

Comments

Song Gao Sept. 7, 2023, 7:39 a.m. UTC | #1
Hi, Richard
在 2023/8/31 上午11:09, Richard Henderson 写道:
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   accel/tcg/tcg-runtime.h          |  25 ++++++
>   include/tcg/tcg-op-gvec-common.h |   6 ++
>   accel/tcg/tcg-runtime-gvec.c     |  26 ++++++
>   tcg/tcg-op-gvec.c                | 150 +++++++++++++++++++++++++++++++
>   4 files changed, 207 insertions(+)
> 

I use tcg_gen_gvec_cmps for LoongArch vector cmp instructions.  but I 
got an Aborted error from temp_load().  I'll fixes this later.

And I'll send LASX V5 series. this series will not use tcg_gen_gvec_cmps.

Thanks.
Song Gao

> diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
> index 186899a2c7..c23b5e66c4 100644
> --- a/accel/tcg/tcg-runtime.h
> +++ b/accel/tcg/tcg-runtime.h
> @@ -297,4 +297,29 @@ DEF_HELPER_FLAGS_4(gvec_leu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
>   DEF_HELPER_FLAGS_4(gvec_leu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
>   DEF_HELPER_FLAGS_4(gvec_leu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
>   
> +DEF_HELPER_FLAGS_4(gvec_eqs8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_eqs16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_eqs32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_eqs64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +
> +DEF_HELPER_FLAGS_4(gvec_lts8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_lts16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_lts32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_lts64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +
> +DEF_HELPER_FLAGS_4(gvec_les8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_les16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_les32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_les64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +
> +DEF_HELPER_FLAGS_4(gvec_ltus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_ltus16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_ltus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_ltus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +
> +DEF_HELPER_FLAGS_4(gvec_leus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_leus16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_leus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_leus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
> +
>   DEF_HELPER_FLAGS_5(gvec_bitsel, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
> diff --git a/include/tcg/tcg-op-gvec-common.h b/include/tcg/tcg-op-gvec-common.h
> index e2683d487f..4db8a58c14 100644
> --- a/include/tcg/tcg-op-gvec-common.h
> +++ b/include/tcg/tcg-op-gvec-common.h
> @@ -374,6 +374,12 @@ void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
>   void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
>                         uint32_t aofs, uint32_t bofs,
>                         uint32_t oprsz, uint32_t maxsz);
> +void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
> +                       uint32_t aofs, int64_t c,
> +                       uint32_t oprsz, uint32_t maxsz);
> +void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
> +                       uint32_t aofs, TCGv_i64 c,
> +                       uint32_t oprsz, uint32_t maxsz);
>   
>   /*
>    * Perform vector bit select: d = (b & a) | (c & ~a).
> diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
> index 6c99f952ca..afca89baa1 100644
> --- a/accel/tcg/tcg-runtime-gvec.c
> +++ b/accel/tcg/tcg-runtime-gvec.c
> @@ -1042,6 +1042,32 @@ DO_CMP2(64)
>   #undef DO_CMP1
>   #undef DO_CMP2
>   
> +#define DO_CMP1(NAME, TYPE, OP)                                            \
> +void HELPER(NAME)(void *d, void *a, uint64_t b64, uint32_t desc)           \
> +{                                                                          \
> +    intptr_t oprsz = simd_oprsz(desc);                                     \
> +    TYPE inv = simd_data(desc), b = b64;                                   \
> +    for (intptr_t i = 0; i < oprsz; i += sizeof(TYPE)) {                   \
> +        *(TYPE *)(d + i) = -((*(TYPE *)(a + i) OP b) ^ inv);               \
> +    }                                                                      \
> +    clear_high(d, oprsz, desc);                                            \
> +}
> +
> +#define DO_CMP2(SZ) \
> +    DO_CMP1(gvec_eqs##SZ, uint##SZ##_t, ==)    \
> +    DO_CMP1(gvec_lts##SZ, int##SZ##_t, <)      \
> +    DO_CMP1(gvec_les##SZ, int##SZ##_t, <=)     \
> +    DO_CMP1(gvec_ltus##SZ, uint##SZ##_t, <)    \
> +    DO_CMP1(gvec_leus##SZ, uint##SZ##_t, <=)
> +
> +DO_CMP2(8)
> +DO_CMP2(16)
> +DO_CMP2(32)
> +DO_CMP2(64)
> +
> +#undef DO_CMP1
> +#undef DO_CMP2
> +
>   void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
>   {
>       intptr_t oprsz = simd_oprsz(desc);
> diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
> index f5cfd9bf99..f7ca9e1051 100644
> --- a/tcg/tcg-op-gvec.c
> +++ b/tcg/tcg-op-gvec.c
> @@ -3819,6 +3819,156 @@ void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
>       }
>   }
>   
> +void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
> +                       uint32_t aofs, TCGv_i64 c,
> +                       uint32_t oprsz, uint32_t maxsz)
> +{
> +    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
> +    static gen_helper_gvec_2i * const eq_fn[4] = {
> +        gen_helper_gvec_eqs8, gen_helper_gvec_eqs16,
> +        gen_helper_gvec_eqs32, gen_helper_gvec_eqs64
> +    };
> +    static gen_helper_gvec_2i * const lt_fn[4] = {
> +        gen_helper_gvec_lts8, gen_helper_gvec_lts16,
> +        gen_helper_gvec_lts32, gen_helper_gvec_lts64
> +    };
> +    static gen_helper_gvec_2i * const le_fn[4] = {
> +        gen_helper_gvec_les8, gen_helper_gvec_les16,
> +        gen_helper_gvec_les32, gen_helper_gvec_les64
> +    };
> +    static gen_helper_gvec_2i * const ltu_fn[4] = {
> +        gen_helper_gvec_ltus8, gen_helper_gvec_ltus16,
> +        gen_helper_gvec_ltus32, gen_helper_gvec_ltus64
> +    };
> +    static gen_helper_gvec_2i * const leu_fn[4] = {
> +        gen_helper_gvec_leus8, gen_helper_gvec_leus16,
> +        gen_helper_gvec_leus32, gen_helper_gvec_leus64
> +    };
> +    static gen_helper_gvec_2i * const * const fns[16] = {
> +        [TCG_COND_EQ] = eq_fn,
> +        [TCG_COND_LT] = lt_fn,
> +        [TCG_COND_LE] = le_fn,
> +        [TCG_COND_LTU] = ltu_fn,
> +        [TCG_COND_LEU] = leu_fn,
> +    };
> +
> +    TCGType type;
> +
> +    check_size_align(oprsz, maxsz, dofs | aofs);
> +    check_overlap_2(dofs, aofs, maxsz);
> +
> +    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
> +        do_dup(MO_8, dofs, oprsz, maxsz,
> +               NULL, NULL, -(cond == TCG_COND_ALWAYS));
> +        return;
> +    }
> +
> +    /*
> +     * Implement inline with a vector type, if possible.
> +     * Prefer integer when 64-bit host and 64-bit comparison.
> +     */
> +    type = choose_vector_type(cmp_list, vece, oprsz,
> +                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
> +    if (type != 0) {
> +        const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list);
> +        TCGv_vec t_vec = tcg_temp_new_vec(type);
> +        uint32_t some, i;
> +
> +        tcg_gen_dup_i64_vec(vece, t_vec, c);
> +
> +        switch (type) {
> +        case TCG_TYPE_V256:
> +            some = QEMU_ALIGN_DOWN(oprsz, 32);
> +            for (i = 0; i < some; i += 32) {
> +                TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V256);
> +                TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V256);
> +                tcg_gen_ld_vec(t0, cpu_env, aofs);
> +                tcg_gen_cmp_vec(cond, vece, t0, t1, t_vec);
> +                tcg_gen_st_vec(t0, cpu_env, dofs);
> +                aofs += 32;
> +                dofs += 32;
> +            }
> +            oprsz -= some;
> +            maxsz -= some;
> +            /* fallthru */
> +
> +        case TCG_TYPE_V128:
> +            some = QEMU_ALIGN_DOWN(oprsz, 16);
> +            for (i = 0; i < some; i += 16) {
> +                TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V128);
> +                TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V128);
> +                tcg_gen_ld_vec(t0, cpu_env, aofs + i);
> +                tcg_gen_cmp_vec(cond, vece, t0, t1, t_vec);
> +                tcg_gen_st_vec(t0, cpu_env, dofs + i);
> +            }
> +            break;
> +
> +        case TCG_TYPE_V64:
> +            some = QEMU_ALIGN_DOWN(oprsz, 8);
> +            for (i = 0; i < some; i += 8) {
> +                TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V64);
> +                TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V64);
> +                tcg_gen_ld_vec(t0, cpu_env, aofs + i);
> +                tcg_gen_cmp_vec(cond, vece, t0, t1, t_vec);
> +                tcg_gen_st_vec(t0, cpu_env, dofs + i);
> +            }
> +            break;
> +
> +        default:
> +            g_assert_not_reached();
> +        }
> +        tcg_temp_free_vec(t_vec);
> +        tcg_swap_vecop_list(hold_list);
> +    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
> +        TCGv_i64 t0 = tcg_temp_ebb_new_i64();
> +        uint32_t i;
> +
> +        for (i = 0; i < oprsz; i += 8) {
> +            tcg_gen_ld_i64(t0, cpu_env, aofs + i);
> +            tcg_gen_negsetcond_i64(cond, t0, t0, c);
> +            tcg_gen_st_i64(t0, cpu_env, dofs + i);
> +        }
> +        tcg_temp_free_i64(t0);
> +    } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
> +        TCGv_i32 t0 = tcg_temp_ebb_new_i32();
> +        TCGv_i32 t1 = tcg_temp_ebb_new_i32();
> +        uint32_t i;
> +
> +        tcg_gen_extrl_i64_i32(t1, c);
> +        for (i = 0; i < oprsz; i += 8) {
> +            tcg_gen_ld_i32(t0, cpu_env, aofs + i);
> +            tcg_gen_negsetcond_i32(cond, t0, t0, t1);
> +            tcg_gen_st_i32(t0, cpu_env, dofs + i);
> +        }
> +        tcg_temp_free_i32(t0);
> +        tcg_temp_free_i32(t1);
> +    } else {
> +        gen_helper_gvec_2i * const *fn = fns[cond];
> +        bool inv = false;
> +
> +        if (fn == NULL) {
> +            cond = tcg_invert_cond(cond);
> +            fn = fns[cond];
> +            assert(fn != NULL);
> +            inv = true;
> +        }
> +        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]);
> +        return;
> +    }
> +
> +    if (oprsz < maxsz) {
> +        expand_clr(dofs + oprsz, maxsz - oprsz);
> +    }
> +}
> +
> +void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
> +                       uint32_t aofs, int64_t c,
> +                       uint32_t oprsz, uint32_t maxsz)
> +{
> +    TCGv_i64 tmp = tcg_constant_i64(c);
> +    tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz);
> +}
> +
>   static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
>   {
>       TCGv_i64 t = tcg_temp_ebb_new_i64();
>
Song Gao Sept. 7, 2023, 12:12 p.m. UTC | #2
在 2023/9/7 下午3:39, gaosong 写道:
> Hi, Richard
> 在 2023/8/31 上午11:09, Richard Henderson 写道:
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>   accel/tcg/tcg-runtime.h          |  25 ++++++
>>   include/tcg/tcg-op-gvec-common.h |   6 ++
>>   accel/tcg/tcg-runtime-gvec.c     |  26 ++++++
>>   tcg/tcg-op-gvec.c                | 150 +++++++++++++++++++++++++++++++
>>   4 files changed, 207 insertions(+)
>>
> 
> I use tcg_gen_gvec_cmps for LoongArch vector cmp instructions.  but I 
> got an Aborted error from temp_load().  I'll fixes this later.
> 
My mistaken,  It's work well.
I will use tcg_gen_gvec_cmps on LoongArch's LASX series.

For this patch:
Tested-by: Song Gao <gaosong@loongson.cn>
Reviewed-by: Song Gao <gaosong@loongson.cn>

Thanks.
Song Gao

> And I'll send LASX V5 series. this series will not use tcg_gen_gvec_cmps.
> 
> Thanks.
> Song Gao
> 
>> diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
>> index 186899a2c7..c23b5e66c4 100644
>> --- a/accel/tcg/tcg-runtime.h
>> +++ b/accel/tcg/tcg-runtime.h
>> @@ -297,4 +297,29 @@ DEF_HELPER_FLAGS_4(gvec_leu16, TCG_CALL_NO_RWG, 
>> void, ptr, ptr, ptr, i32)
>>   DEF_HELPER_FLAGS_4(gvec_leu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, 
>> i32)
>>   DEF_HELPER_FLAGS_4(gvec_leu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_eqs8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
>> +DEF_HELPER_FLAGS_4(gvec_eqs16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_eqs32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_eqs64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +
>> +DEF_HELPER_FLAGS_4(gvec_lts8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
>> +DEF_HELPER_FLAGS_4(gvec_lts16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_lts32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_lts64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +
>> +DEF_HELPER_FLAGS_4(gvec_les8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
>> +DEF_HELPER_FLAGS_4(gvec_les16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_les32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_les64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +
>> +DEF_HELPER_FLAGS_4(gvec_ltus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_ltus16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_ltus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_ltus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +
>> +DEF_HELPER_FLAGS_4(gvec_leus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_leus16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_leus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +DEF_HELPER_FLAGS_4(gvec_leus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>> i32)
>> +
>>   DEF_HELPER_FLAGS_5(gvec_bitsel, TCG_CALL_NO_RWG, void, ptr, ptr, 
>> ptr, ptr, i32)
>> diff --git a/include/tcg/tcg-op-gvec-common.h 
>> b/include/tcg/tcg-op-gvec-common.h
>> index e2683d487f..4db8a58c14 100644
>> --- a/include/tcg/tcg-op-gvec-common.h
>> +++ b/include/tcg/tcg-op-gvec-common.h
>> @@ -374,6 +374,12 @@ void tcg_gen_gvec_rotrv(unsigned vece, uint32_t 
>> dofs, uint32_t aofs,
>>   void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
>>                         uint32_t aofs, uint32_t bofs,
>>                         uint32_t oprsz, uint32_t maxsz);
>> +void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
>> +                       uint32_t aofs, int64_t c,
>> +                       uint32_t oprsz, uint32_t maxsz);
>> +void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
>> +                       uint32_t aofs, TCGv_i64 c,
>> +                       uint32_t oprsz, uint32_t maxsz);
>>   /*
>>    * Perform vector bit select: d = (b & a) | (c & ~a).
>> diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
>> index 6c99f952ca..afca89baa1 100644
>> --- a/accel/tcg/tcg-runtime-gvec.c
>> +++ b/accel/tcg/tcg-runtime-gvec.c
>> @@ -1042,6 +1042,32 @@ DO_CMP2(64)
>>   #undef DO_CMP1
>>   #undef DO_CMP2
>> +#define DO_CMP1(NAME, TYPE, 
>> OP)                                            \
>> +void HELPER(NAME)(void *d, void *a, uint64_t b64, uint32_t 
>> desc)           \
>> +{                                                                          
>> \
>> +    intptr_t oprsz = 
>> simd_oprsz(desc);                                     \
>> +    TYPE inv = simd_data(desc), b = 
>> b64;                                   \
>> +    for (intptr_t i = 0; i < oprsz; i += sizeof(TYPE)) 
>> {                   \
>> +        *(TYPE *)(d + i) = -((*(TYPE *)(a + i) OP b) ^ 
>> inv);               \
>> +    
>> }                                                                      \
>> +    clear_high(d, oprsz, 
>> desc);                                            \
>> +}
>> +
>> +#define DO_CMP2(SZ) \
>> +    DO_CMP1(gvec_eqs##SZ, uint##SZ##_t, ==)    \
>> +    DO_CMP1(gvec_lts##SZ, int##SZ##_t, <)      \
>> +    DO_CMP1(gvec_les##SZ, int##SZ##_t, <=)     \
>> +    DO_CMP1(gvec_ltus##SZ, uint##SZ##_t, <)    \
>> +    DO_CMP1(gvec_leus##SZ, uint##SZ##_t, <=)
>> +
>> +DO_CMP2(8)
>> +DO_CMP2(16)
>> +DO_CMP2(32)
>> +DO_CMP2(64)
>> +
>> +#undef DO_CMP1
>> +#undef DO_CMP2
>> +
>>   void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
>>   {
>>       intptr_t oprsz = simd_oprsz(desc);
>> diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
>> index f5cfd9bf99..f7ca9e1051 100644
>> --- a/tcg/tcg-op-gvec.c
>> +++ b/tcg/tcg-op-gvec.c
>> @@ -3819,6 +3819,156 @@ void tcg_gen_gvec_cmp(TCGCond cond, unsigned 
>> vece, uint32_t dofs,
>>       }
>>   }
>> +void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
>> +                       uint32_t aofs, TCGv_i64 c,
>> +                       uint32_t oprsz, uint32_t maxsz)
>> +{
>> +    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
>> +    static gen_helper_gvec_2i * const eq_fn[4] = {
>> +        gen_helper_gvec_eqs8, gen_helper_gvec_eqs16,
>> +        gen_helper_gvec_eqs32, gen_helper_gvec_eqs64
>> +    };
>> +    static gen_helper_gvec_2i * const lt_fn[4] = {
>> +        gen_helper_gvec_lts8, gen_helper_gvec_lts16,
>> +        gen_helper_gvec_lts32, gen_helper_gvec_lts64
>> +    };
>> +    static gen_helper_gvec_2i * const le_fn[4] = {
>> +        gen_helper_gvec_les8, gen_helper_gvec_les16,
>> +        gen_helper_gvec_les32, gen_helper_gvec_les64
>> +    };
>> +    static gen_helper_gvec_2i * const ltu_fn[4] = {
>> +        gen_helper_gvec_ltus8, gen_helper_gvec_ltus16,
>> +        gen_helper_gvec_ltus32, gen_helper_gvec_ltus64
>> +    };
>> +    static gen_helper_gvec_2i * const leu_fn[4] = {
>> +        gen_helper_gvec_leus8, gen_helper_gvec_leus16,
>> +        gen_helper_gvec_leus32, gen_helper_gvec_leus64
>> +    };
>> +    static gen_helper_gvec_2i * const * const fns[16] = {
>> +        [TCG_COND_EQ] = eq_fn,
>> +        [TCG_COND_LT] = lt_fn,
>> +        [TCG_COND_LE] = le_fn,
>> +        [TCG_COND_LTU] = ltu_fn,
>> +        [TCG_COND_LEU] = leu_fn,
>> +    };
>> +
>> +    TCGType type;
>> +
>> +    check_size_align(oprsz, maxsz, dofs | aofs);
>> +    check_overlap_2(dofs, aofs, maxsz);
>> +
>> +    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
>> +        do_dup(MO_8, dofs, oprsz, maxsz,
>> +               NULL, NULL, -(cond == TCG_COND_ALWAYS));
>> +        return;
>> +    }
>> +
>> +    /*
>> +     * Implement inline with a vector type, if possible.
>> +     * Prefer integer when 64-bit host and 64-bit comparison.
>> +     */
>> +    type = choose_vector_type(cmp_list, vece, oprsz,
>> +                              TCG_TARGET_REG_BITS == 64 && vece == 
>> MO_64);
>> +    if (type != 0) {
>> +        const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list);
>> +        TCGv_vec t_vec = tcg_temp_new_vec(type);
>> +        uint32_t some, i;
>> +
>> +        tcg_gen_dup_i64_vec(vece, t_vec, c);
>> +
>> +        switch (type) {
>> +        case TCG_TYPE_V256:
>> +            some = QEMU_ALIGN_DOWN(oprsz, 32);
>> +            for (i = 0; i < some; i += 32) {
>> +                TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V256);
>> +                TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V256);
>> +                tcg_gen_ld_vec(t0, cpu_env, aofs);
>> +                tcg_gen_cmp_vec(cond, vece, t0, t1, t_vec);
>> +                tcg_gen_st_vec(t0, cpu_env, dofs);
>> +                aofs += 32;
>> +                dofs += 32;
>> +            }
>> +            oprsz -= some;
>> +            maxsz -= some;
>> +            /* fallthru */
>> +
>> +        case TCG_TYPE_V128:
>> +            some = QEMU_ALIGN_DOWN(oprsz, 16);
>> +            for (i = 0; i < some; i += 16) {
>> +                TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V128);
>> +                TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V128);
>> +                tcg_gen_ld_vec(t0, cpu_env, aofs + i);
>> +                tcg_gen_cmp_vec(cond, vece, t0, t1, t_vec);
>> +                tcg_gen_st_vec(t0, cpu_env, dofs + i);
>> +            }
>> +            break;
>> +
>> +        case TCG_TYPE_V64:
>> +            some = QEMU_ALIGN_DOWN(oprsz, 8);
>> +            for (i = 0; i < some; i += 8) {
>> +                TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V64);
>> +                TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V64);
>> +                tcg_gen_ld_vec(t0, cpu_env, aofs + i);
>> +                tcg_gen_cmp_vec(cond, vece, t0, t1, t_vec);
>> +                tcg_gen_st_vec(t0, cpu_env, dofs + i);
>> +            }
>> +            break;
>> +
>> +        default:
>> +            g_assert_not_reached();
>> +        }
>> +        tcg_temp_free_vec(t_vec);
>> +        tcg_swap_vecop_list(hold_list);
>> +    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
>> +        TCGv_i64 t0 = tcg_temp_ebb_new_i64();
>> +        uint32_t i;
>> +
>> +        for (i = 0; i < oprsz; i += 8) {
>> +            tcg_gen_ld_i64(t0, cpu_env, aofs + i);
>> +            tcg_gen_negsetcond_i64(cond, t0, t0, c);
>> +            tcg_gen_st_i64(t0, cpu_env, dofs + i);
>> +        }
>> +        tcg_temp_free_i64(t0);
>> +    } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
>> +        TCGv_i32 t0 = tcg_temp_ebb_new_i32();
>> +        TCGv_i32 t1 = tcg_temp_ebb_new_i32();
>> +        uint32_t i;
>> +
>> +        tcg_gen_extrl_i64_i32(t1, c);
>> +        for (i = 0; i < oprsz; i += 8) {
>> +            tcg_gen_ld_i32(t0, cpu_env, aofs + i);
>> +            tcg_gen_negsetcond_i32(cond, t0, t0, t1);
>> +            tcg_gen_st_i32(t0, cpu_env, dofs + i);
>> +        }
>> +        tcg_temp_free_i32(t0);
>> +        tcg_temp_free_i32(t1);
>> +    } else {
>> +        gen_helper_gvec_2i * const *fn = fns[cond];
>> +        bool inv = false;
>> +
>> +        if (fn == NULL) {
>> +            cond = tcg_invert_cond(cond);
>> +            fn = fns[cond];
>> +            assert(fn != NULL);
>> +            inv = true;
>> +        }
>> +        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]);
>> +        return;
>> +    }
>> +
>> +    if (oprsz < maxsz) {
>> +        expand_clr(dofs + oprsz, maxsz - oprsz);
>> +    }
>> +}
>> +
>> +void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
>> +                       uint32_t aofs, int64_t c,
>> +                       uint32_t oprsz, uint32_t maxsz)
>> +{
>> +    TCGv_i64 tmp = tcg_constant_i64(c);
>> +    tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz);
>> +}
>> +
>>   static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, 
>> TCGv_i64 c)
>>   {
>>       TCGv_i64 t = tcg_temp_ebb_new_i64();
>>
>
Song Gao Sept. 11, 2023, 12:38 p.m. UTC | #3
在 2023/9/7 下午8:12, gaosong 写道:
> 在 2023/9/7 下午3:39, gaosong 写道:
>> Hi, Richard
>> 在 2023/8/31 上午11:09, Richard Henderson 写道:
>>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>>> ---
>>>   accel/tcg/tcg-runtime.h          |  25 ++++++
>>>   include/tcg/tcg-op-gvec-common.h |   6 ++
>>>   accel/tcg/tcg-runtime-gvec.c     |  26 ++++++
>>>   tcg/tcg-op-gvec.c                | 150 +++++++++++++++++++++++++++++++
>>>   4 files changed, 207 insertions(+)
>>>
>>
>> I use tcg_gen_gvec_cmps for LoongArch vector cmp instructions.  but I 
>> got an Aborted error from temp_load().  I'll fixes this later.
>>
> My mistaken,  It's work well.
> I will use tcg_gen_gvec_cmps on LoongArch's LASX series.
> 
> For this patch:
> Tested-by: Song Gao <gaosong@loongson.cn>
> Reviewed-by: Song Gao <gaosong@loongson.cn>
> 
> Thanks.
> Song Gao
> 
>> And I'll send LASX V5 series. this series will not use tcg_gen_gvec_cmps.
>>
>> Thanks.
>> Song Gao
>>
Oh,  It's my tested not enough,  I got an temp_load Aborted  again.
So I Look this patch more carefully,  And find a typo.

>>> diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
>>> index 186899a2c7..c23b5e66c4 100644
>>> --- a/accel/tcg/tcg-runtime.h
>>> +++ b/accel/tcg/tcg-runtime.h
>>> @@ -297,4 +297,29 @@ DEF_HELPER_FLAGS_4(gvec_leu16, TCG_CALL_NO_RWG, 
>>> void, ptr, ptr, ptr, i32)
>>>   DEF_HELPER_FLAGS_4(gvec_leu32, TCG_CALL_NO_RWG, void, ptr, ptr, 
>>> ptr, i32)
>>>   DEF_HELPER_FLAGS_4(gvec_leu64, TCG_CALL_NO_RWG, void, ptr, ptr, 
>>> ptr, i32)
>>> +DEF_HELPER_FLAGS_4(gvec_eqs8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +DEF_HELPER_FLAGS_4(gvec_eqs16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +DEF_HELPER_FLAGS_4(gvec_eqs32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +DEF_HELPER_FLAGS_4(gvec_eqs64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +
>>> +DEF_HELPER_FLAGS_4(gvec_lts8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +DEF_HELPER_FLAGS_4(gvec_lts16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +DEF_HELPER_FLAGS_4(gvec_lts32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +DEF_HELPER_FLAGS_4(gvec_lts64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +
>>> +DEF_HELPER_FLAGS_4(gvec_les8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +DEF_HELPER_FLAGS_4(gvec_les16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +DEF_HELPER_FLAGS_4(gvec_les32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +DEF_HELPER_FLAGS_4(gvec_les64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +
>>> +DEF_HELPER_FLAGS_4(gvec_ltus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +DEF_HELPER_FLAGS_4(gvec_ltus16, TCG_CALL_NO_RWG, void, ptr, ptr, 
>>> i64, i32)
>>> +DEF_HELPER_FLAGS_4(gvec_ltus32, TCG_CALL_NO_RWG, void, ptr, ptr, 
>>> i64, i32)
>>> +DEF_HELPER_FLAGS_4(gvec_ltus64, TCG_CALL_NO_RWG, void, ptr, ptr, 
>>> i64, i32)
>>> +
>>> +DEF_HELPER_FLAGS_4(gvec_leus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, 
>>> i32)
>>> +DEF_HELPER_FLAGS_4(gvec_leus16, TCG_CALL_NO_RWG, void, ptr, ptr, 
>>> i64, i32)
>>> +DEF_HELPER_FLAGS_4(gvec_leus32, TCG_CALL_NO_RWG, void, ptr, ptr, 
>>> i64, i32)
>>> +DEF_HELPER_FLAGS_4(gvec_leus64, TCG_CALL_NO_RWG, void, ptr, ptr, 
>>> i64, i32)
>>> +
>>>   DEF_HELPER_FLAGS_5(gvec_bitsel, TCG_CALL_NO_RWG, void, ptr, ptr, 
>>> ptr, ptr, i32)
>>> diff --git a/include/tcg/tcg-op-gvec-common.h 
>>> b/include/tcg/tcg-op-gvec-common.h
>>> index e2683d487f..4db8a58c14 100644
>>> --- a/include/tcg/tcg-op-gvec-common.h
>>> +++ b/include/tcg/tcg-op-gvec-common.h
>>> @@ -374,6 +374,12 @@ void tcg_gen_gvec_rotrv(unsigned vece, uint32_t 
>>> dofs, uint32_t aofs,
>>>   void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
>>>                         uint32_t aofs, uint32_t bofs,
>>>                         uint32_t oprsz, uint32_t maxsz);
>>> +void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
>>> +                       uint32_t aofs, int64_t c,
>>> +                       uint32_t oprsz, uint32_t maxsz);
>>> +void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
>>> +                       uint32_t aofs, TCGv_i64 c,
>>> +                       uint32_t oprsz, uint32_t maxsz);
>>>   /*
>>>    * Perform vector bit select: d = (b & a) | (c & ~a).
>>> diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
>>> index 6c99f952ca..afca89baa1 100644
>>> --- a/accel/tcg/tcg-runtime-gvec.c
>>> +++ b/accel/tcg/tcg-runtime-gvec.c
>>> @@ -1042,6 +1042,32 @@ DO_CMP2(64)
>>>   #undef DO_CMP1
>>>   #undef DO_CMP2
>>> +#define DO_CMP1(NAME, TYPE, 
>>> OP)                                            \
>>> +void HELPER(NAME)(void *d, void *a, uint64_t b64, uint32_t 
>>> desc)           \
>>> +{ \
>>> +    intptr_t oprsz = 
>>> simd_oprsz(desc);                                     \
>>> +    TYPE inv = simd_data(desc), b = 
>>> b64;                                   \
>>> +    for (intptr_t i = 0; i < oprsz; i += sizeof(TYPE)) 
>>> {                   \
>>> +        *(TYPE *)(d + i) = -((*(TYPE *)(a + i) OP b) ^ 
>>> inv);               \
>>> + 
>>> }                                                                      \
>>> +    clear_high(d, oprsz, 
>>> desc);                                            \
>>> +}
>>> +
>>> +#define DO_CMP2(SZ) \
>>> +    DO_CMP1(gvec_eqs##SZ, uint##SZ##_t, ==)    \
>>> +    DO_CMP1(gvec_lts##SZ, int##SZ##_t, <)      \
>>> +    DO_CMP1(gvec_les##SZ, int##SZ##_t, <=)     \
>>> +    DO_CMP1(gvec_ltus##SZ, uint##SZ##_t, <)    \
>>> +    DO_CMP1(gvec_leus##SZ, uint##SZ##_t, <=)
>>> +
>>> +DO_CMP2(8)
>>> +DO_CMP2(16)
>>> +DO_CMP2(32)
>>> +DO_CMP2(64)
>>> +
>>> +#undef DO_CMP1
>>> +#undef DO_CMP2
>>> +
>>>   void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
>>>   {
>>>       intptr_t oprsz = simd_oprsz(desc);
>>> diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
>>> index f5cfd9bf99..f7ca9e1051 100644
>>> --- a/tcg/tcg-op-gvec.c
>>> +++ b/tcg/tcg-op-gvec.c
>>> @@ -3819,6 +3819,156 @@ void tcg_gen_gvec_cmp(TCGCond cond, unsigned 
>>> vece, uint32_t dofs,
>>>       }
>>>   }
>>> +void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
>>> +                       uint32_t aofs, TCGv_i64 c,
>>> +                       uint32_t oprsz, uint32_t maxsz)
>>> +{
>>> +    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
>>> +    static gen_helper_gvec_2i * const eq_fn[4] = {
>>> +        gen_helper_gvec_eqs8, gen_helper_gvec_eqs16,
>>> +        gen_helper_gvec_eqs32, gen_helper_gvec_eqs64
>>> +    };
>>> +    static gen_helper_gvec_2i * const lt_fn[4] = {
>>> +        gen_helper_gvec_lts8, gen_helper_gvec_lts16,
>>> +        gen_helper_gvec_lts32, gen_helper_gvec_lts64
>>> +    };
>>> +    static gen_helper_gvec_2i * const le_fn[4] = {
>>> +        gen_helper_gvec_les8, gen_helper_gvec_les16,
>>> +        gen_helper_gvec_les32, gen_helper_gvec_les64
>>> +    };
>>> +    static gen_helper_gvec_2i * const ltu_fn[4] = {
>>> +        gen_helper_gvec_ltus8, gen_helper_gvec_ltus16,
>>> +        gen_helper_gvec_ltus32, gen_helper_gvec_ltus64
>>> +    };
>>> +    static gen_helper_gvec_2i * const leu_fn[4] = {
>>> +        gen_helper_gvec_leus8, gen_helper_gvec_leus16,
>>> +        gen_helper_gvec_leus32, gen_helper_gvec_leus64
>>> +    };
>>> +    static gen_helper_gvec_2i * const * const fns[16] = {
>>> +        [TCG_COND_EQ] = eq_fn,
>>> +        [TCG_COND_LT] = lt_fn,
>>> +        [TCG_COND_LE] = le_fn,
>>> +        [TCG_COND_LTU] = ltu_fn,
>>> +        [TCG_COND_LEU] = leu_fn,
>>> +    };
>>> +
>>> +    TCGType type;
>>> +
>>> +    check_size_align(oprsz, maxsz, dofs | aofs);
>>> +    check_overlap_2(dofs, aofs, maxsz);
>>> +
>>> +    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
>>> +        do_dup(MO_8, dofs, oprsz, maxsz,
>>> +               NULL, NULL, -(cond == TCG_COND_ALWAYS));
>>> +        return;
>>> +    }
>>> +
>>> +    /*
>>> +     * Implement inline with a vector type, if possible.
>>> +     * Prefer integer when 64-bit host and 64-bit comparison.
>>> +     */
>>> +    type = choose_vector_type(cmp_list, vece, oprsz,
>>> +                              TCG_TARGET_REG_BITS == 64 && vece == 
>>> MO_64);
>>> +    if (type != 0) {
>>> +        const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list);
>>> +        TCGv_vec t_vec = tcg_temp_new_vec(type);
>>> +        uint32_t some, i;
>>> +
>>> +        tcg_gen_dup_i64_vec(vece, t_vec, c);
>>> +
>>> +        switch (type) {
>>> +        case TCG_TYPE_V256:
>>> +            some = QEMU_ALIGN_DOWN(oprsz, 32);
>>> +            for (i = 0; i < some; i += 32) {
>>> +                TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V256);
>>> +                TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V256);
>>> +                tcg_gen_ld_vec(t0, cpu_env, aofs);
Typo, This should be t1.
>>> +                tcg_gen_cmp_vec(cond, vece, t0, t1, t_vec);
>>> +                tcg_gen_st_vec(t0, cpu_env, dofs);
>>> +                aofs += 32;
>>> +                dofs += 32;
>>> +            }
>>> +            oprsz -= some;
>>> +            maxsz -= some;
>>> +            /* fallthru */
>>> +
>>> +        case TCG_TYPE_V128:
>>> +            some = QEMU_ALIGN_DOWN(oprsz, 16);
>>> +            for (i = 0; i < some; i += 16) {
>>> +                TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V128);
>>> +                TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V128);
>>> +                tcg_gen_ld_vec(t0, cpu_env, aofs + i);
Likewise.
>>> +                tcg_gen_cmp_vec(cond, vece, t0, t1, t_vec);
>>> +                tcg_gen_st_vec(t0, cpu_env, dofs + i);
>>> +            }
>>> +            break;
>>> +
>>> +        case TCG_TYPE_V64:
>>> +            some = QEMU_ALIGN_DOWN(oprsz, 8);
>>> +            for (i = 0; i < some; i += 8) {
>>> +                TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V64);
>>> +                TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V64);
>>> +                tcg_gen_ld_vec(t0, cpu_env, aofs + i);
Likewise.
>>> +                tcg_gen_cmp_vec(cond, vece, t0, t1, t_vec);
>>> +                tcg_gen_st_vec(t0, cpu_env, dofs + i);
>>> +            }

How about create expand_cmpi_vec() like expand_cmp_vec()?

Anyway, this patch is very useful.

Thanks.
Song Gao
>>> +            break;
>>> +
>>> +        default:
>>> +            g_assert_not_reached();
>>> +        }
>>> +        tcg_temp_free_vec(t_vec);
>>> +        tcg_swap_vecop_list(hold_list);
>>> +    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
>>> +        TCGv_i64 t0 = tcg_temp_ebb_new_i64();
>>> +        uint32_t i;
>>> +
>>> +        for (i = 0; i < oprsz; i += 8) {
>>> +            tcg_gen_ld_i64(t0, cpu_env, aofs + i);
>>> +            tcg_gen_negsetcond_i64(cond, t0, t0, c);
>>> +            tcg_gen_st_i64(t0, cpu_env, dofs + i);
>>> +        }
>>> +        tcg_temp_free_i64(t0);
>>> +    } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
>>> +        TCGv_i32 t0 = tcg_temp_ebb_new_i32();
>>> +        TCGv_i32 t1 = tcg_temp_ebb_new_i32();
>>> +        uint32_t i;
>>> +
>>> +        tcg_gen_extrl_i64_i32(t1, c);
>>> +        for (i = 0; i < oprsz; i += 8) {
>>> +            tcg_gen_ld_i32(t0, cpu_env, aofs + i);
>>> +            tcg_gen_negsetcond_i32(cond, t0, t0, t1);
>>> +            tcg_gen_st_i32(t0, cpu_env, dofs + i);
>>> +        }
>>> +        tcg_temp_free_i32(t0);
>>> +        tcg_temp_free_i32(t1);
>>> +    } else {
>>> +        gen_helper_gvec_2i * const *fn = fns[cond];
>>> +        bool inv = false;
>>> +
>>> +        if (fn == NULL) {
>>> +            cond = tcg_invert_cond(cond);
>>> +            fn = fns[cond];
>>> +            assert(fn != NULL);
>>> +            inv = true;
>>> +        }
>>> +        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, 
>>> fn[vece]);
>>> +        return;
>>> +    }
>>> +
>>> +    if (oprsz < maxsz) {
>>> +        expand_clr(dofs + oprsz, maxsz - oprsz);
>>> +    }
>>> +}
>>> +
>>> +void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
>>> +                       uint32_t aofs, int64_t c,
>>> +                       uint32_t oprsz, uint32_t maxsz)
>>> +{
>>> +    TCGv_i64 tmp = tcg_constant_i64(c);
>>> +    tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz);
>>> +}
>>> +
>>>   static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, 
>>> TCGv_i64 c)
>>>   {
>>>       TCGv_i64 t = tcg_temp_ebb_new_i64();
>>>
>>
>
diff mbox series

Patch

diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index 186899a2c7..c23b5e66c4 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -297,4 +297,29 @@  DEF_HELPER_FLAGS_4(gvec_leu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_leu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_leu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(gvec_eqs8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_eqs16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_eqs32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_eqs64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_lts8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_lts16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_lts32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_lts64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_les8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_les16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_les32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_les64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_ltus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_ltus16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_ltus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_ltus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_leus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_leus16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_leus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_leus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
 DEF_HELPER_FLAGS_5(gvec_bitsel, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
diff --git a/include/tcg/tcg-op-gvec-common.h b/include/tcg/tcg-op-gvec-common.h
index e2683d487f..4db8a58c14 100644
--- a/include/tcg/tcg-op-gvec-common.h
+++ b/include/tcg/tcg-op-gvec-common.h
@@ -374,6 +374,12 @@  void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
                       uint32_t aofs, uint32_t bofs,
                       uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
+                       uint32_t aofs, int64_t c,
+                       uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
+                       uint32_t aofs, TCGv_i64 c,
+                       uint32_t oprsz, uint32_t maxsz);
 
 /*
  * Perform vector bit select: d = (b & a) | (c & ~a).
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index 6c99f952ca..afca89baa1 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -1042,6 +1042,32 @@  DO_CMP2(64)
 #undef DO_CMP1
 #undef DO_CMP2
 
+#define DO_CMP1(NAME, TYPE, OP)                                            \
+void HELPER(NAME)(void *d, void *a, uint64_t b64, uint32_t desc)           \
+{                                                                          \
+    intptr_t oprsz = simd_oprsz(desc);                                     \
+    TYPE inv = simd_data(desc), b = b64;                                   \
+    for (intptr_t i = 0; i < oprsz; i += sizeof(TYPE)) {                   \
+        *(TYPE *)(d + i) = -((*(TYPE *)(a + i) OP b) ^ inv);               \
+    }                                                                      \
+    clear_high(d, oprsz, desc);                                            \
+}
+
+#define DO_CMP2(SZ) \
+    DO_CMP1(gvec_eqs##SZ, uint##SZ##_t, ==)    \
+    DO_CMP1(gvec_lts##SZ, int##SZ##_t, <)      \
+    DO_CMP1(gvec_les##SZ, int##SZ##_t, <=)     \
+    DO_CMP1(gvec_ltus##SZ, uint##SZ##_t, <)    \
+    DO_CMP1(gvec_leus##SZ, uint##SZ##_t, <=)
+
+DO_CMP2(8)
+DO_CMP2(16)
+DO_CMP2(32)
+DO_CMP2(64)
+
+#undef DO_CMP1
+#undef DO_CMP2
+
 void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
 {
     intptr_t oprsz = simd_oprsz(desc);
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index f5cfd9bf99..f7ca9e1051 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -3819,6 +3819,156 @@  void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
     }
 }
 
+void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
+                       uint32_t aofs, TCGv_i64 c,
+                       uint32_t oprsz, uint32_t maxsz)
+{
+    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
+    static gen_helper_gvec_2i * const eq_fn[4] = {
+        gen_helper_gvec_eqs8, gen_helper_gvec_eqs16,
+        gen_helper_gvec_eqs32, gen_helper_gvec_eqs64
+    };
+    static gen_helper_gvec_2i * const lt_fn[4] = {
+        gen_helper_gvec_lts8, gen_helper_gvec_lts16,
+        gen_helper_gvec_lts32, gen_helper_gvec_lts64
+    };
+    static gen_helper_gvec_2i * const le_fn[4] = {
+        gen_helper_gvec_les8, gen_helper_gvec_les16,
+        gen_helper_gvec_les32, gen_helper_gvec_les64
+    };
+    static gen_helper_gvec_2i * const ltu_fn[4] = {
+        gen_helper_gvec_ltus8, gen_helper_gvec_ltus16,
+        gen_helper_gvec_ltus32, gen_helper_gvec_ltus64
+    };
+    static gen_helper_gvec_2i * const leu_fn[4] = {
+        gen_helper_gvec_leus8, gen_helper_gvec_leus16,
+        gen_helper_gvec_leus32, gen_helper_gvec_leus64
+    };
+    static gen_helper_gvec_2i * const * const fns[16] = {
+        [TCG_COND_EQ] = eq_fn,
+        [TCG_COND_LT] = lt_fn,
+        [TCG_COND_LE] = le_fn,
+        [TCG_COND_LTU] = ltu_fn,
+        [TCG_COND_LEU] = leu_fn,
+    };
+
+    TCGType type;
+
+    check_size_align(oprsz, maxsz, dofs | aofs);
+    check_overlap_2(dofs, aofs, maxsz);
+
+    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
+        do_dup(MO_8, dofs, oprsz, maxsz,
+               NULL, NULL, -(cond == TCG_COND_ALWAYS));
+        return;
+    }
+
+    /*
+     * Implement inline with a vector type, if possible.
+     * Prefer integer when 64-bit host and 64-bit comparison.
+     */
+    type = choose_vector_type(cmp_list, vece, oprsz,
+                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
+    if (type != 0) {
+        const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list);
+        TCGv_vec t_vec = tcg_temp_new_vec(type);
+        uint32_t some, i;
+
+        tcg_gen_dup_i64_vec(vece, t_vec, c);
+
+        switch (type) {
+        case TCG_TYPE_V256:
+            some = QEMU_ALIGN_DOWN(oprsz, 32);
+            for (i = 0; i < some; i += 32) {
+                TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V256);
+                TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V256);
+                tcg_gen_ld_vec(t0, cpu_env, aofs);
+                tcg_gen_cmp_vec(cond, vece, t0, t1, t_vec);
+                tcg_gen_st_vec(t0, cpu_env, dofs);
+                aofs += 32;
+                dofs += 32;
+            }
+            oprsz -= some;
+            maxsz -= some;
+            /* fallthru */
+
+        case TCG_TYPE_V128:
+            some = QEMU_ALIGN_DOWN(oprsz, 16);
+            for (i = 0; i < some; i += 16) {
+                TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V128);
+                TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V128);
+                tcg_gen_ld_vec(t0, cpu_env, aofs + i);
+                tcg_gen_cmp_vec(cond, vece, t0, t1, t_vec);
+                tcg_gen_st_vec(t0, cpu_env, dofs + i);
+            }
+            break;
+
+        case TCG_TYPE_V64:
+            some = QEMU_ALIGN_DOWN(oprsz, 8);
+            for (i = 0; i < some; i += 8) {
+                TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V64);
+                TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V64);
+                tcg_gen_ld_vec(t0, cpu_env, aofs + i);
+                tcg_gen_cmp_vec(cond, vece, t0, t1, t_vec);
+                tcg_gen_st_vec(t0, cpu_env, dofs + i);
+            }
+            break;
+
+        default:
+            g_assert_not_reached();
+        }
+        tcg_temp_free_vec(t_vec);
+        tcg_swap_vecop_list(hold_list);
+    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
+        TCGv_i64 t0 = tcg_temp_ebb_new_i64();
+        uint32_t i;
+
+        for (i = 0; i < oprsz; i += 8) {
+            tcg_gen_ld_i64(t0, cpu_env, aofs + i);
+            tcg_gen_negsetcond_i64(cond, t0, t0, c);
+            tcg_gen_st_i64(t0, cpu_env, dofs + i);
+        }
+        tcg_temp_free_i64(t0);
+    } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
+        TCGv_i32 t0 = tcg_temp_ebb_new_i32();
+        TCGv_i32 t1 = tcg_temp_ebb_new_i32();
+        uint32_t i;
+
+        tcg_gen_extrl_i64_i32(t1, c);
+        for (i = 0; i < oprsz; i += 8) {
+            tcg_gen_ld_i32(t0, cpu_env, aofs + i);
+            tcg_gen_negsetcond_i32(cond, t0, t0, t1);
+            tcg_gen_st_i32(t0, cpu_env, dofs + i);
+        }
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    } else {
+        gen_helper_gvec_2i * const *fn = fns[cond];
+        bool inv = false;
+
+        if (fn == NULL) {
+            cond = tcg_invert_cond(cond);
+            fn = fns[cond];
+            assert(fn != NULL);
+            inv = true;
+        }
+        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]);
+        return;
+    }
+
+    if (oprsz < maxsz) {
+        expand_clr(dofs + oprsz, maxsz - oprsz);
+    }
+}
+
+void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
+                       uint32_t aofs, int64_t c,
+                       uint32_t oprsz, uint32_t maxsz)
+{
+    TCGv_i64 tmp = tcg_constant_i64(c);
+    tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz);
+}
+
 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
 {
     TCGv_i64 t = tcg_temp_ebb_new_i64();