@@ -139,6 +139,11 @@ static bool have_bmi2;
#else
# define have_bmi2 0
#endif
+#if defined(CONFIG_CPUID_H) && defined(bit_LZCNT)
+static bool have_lzcnt;
+#else
+# define have_lzcnt 0
+#endif
static tcg_insn_unit *tb_ret_addr;
@@ -1148,6 +1153,76 @@ static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
}
#endif
+static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
+ TCGArg arg2, bool a2const)
+{
+ if (a2const) {
+ tcg_debug_assert(have_bmi1);
+ tcg_debug_assert(arg2 == (rexw ? 64 : 32));
+ tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
+ } else {
+ /* ??? The manual says that the output is undefined when the
+ input is zero, but real hardware leaves it unchanged. As
+ noted in target-i386/translate.c, real programs depend on
+ this -- now we are one more of those. */
+ /* ??? We could avoid this if TCG had an early clobber marking
+ for the output. */
+ tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
+ if (dest != arg2) {
+ tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
+ }
+ }
+}
+
+static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
+ TCGArg arg2, bool a2const)
+{
+ TCGLabel *over;
+ TCGType type;
+ unsigned rev;
+
+ /* ??? All this would be easier (and would avoid the semi-undefined
+ behaviour) if TCG had an early clobber marking for the output. */
+
+ if (have_lzcnt) {
+ if (a2const && arg2 == (rexw ? 64 : 32)) {
+ tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
+ return;
+ }
+ if (!a2const && dest != arg2) {
+ tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
+ tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
+ return;
+ }
+ }
+
+ over = gen_new_label();
+ type = rexw ? TCG_TYPE_I64: TCG_TYPE_I32;
+ rev = rexw ? 63 : 31;
+
+ tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
+
+ /* Recall that the output of BSR is the index not the count.
+ Therefore we must adjust the result by ^ (SIZE-1). In some
+ cases below, we prefer an extra XOR to an extra JMP. */
+ if (!a2const && dest == arg2) {
+ /* ??? See the comment in tcg_out_ctz re BSF. */
+ tcg_out_jxx(s, tcg_cond_to_jcc[TCG_COND_EQ], over, 1);
+ tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
+ tcg_out_label(s, over, s->code_ptr);
+ } else {
+ tcg_out_jxx(s, tcg_cond_to_jcc[TCG_COND_NE], over, 1);
+ if (a2const) {
+ tcg_out_movi(s, type, dest, arg2 ^ rev);
+ } else {
+ tcg_out_mov(s, type, dest, arg2);
+ tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
+ }
+ tcg_out_label(s, over, s->code_ptr);
+ tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
+ }
+}
+
static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
{
intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
@@ -2024,34 +2099,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
break;
OP_32_64(ctz):
- if (const_args[2]) {
- tcg_debug_assert(have_bmi1);
- tcg_debug_assert(args[2] == (rexw ? 64 : 32));
- tcg_out_modrm(s, OPC_TZCNT + rexw, args[0], args[1]);
- } else {
- /* ??? The manual says that the output is undefined when the
- input is zero, but real hardware leaves it unchanged. As
- noted in target-i386/translate.c, real programs depend on
- this -- now we are one more of those. */
- tcg_out_modrm(s, OPC_BSF + rexw, args[0], args[1]);
- if (args[0] != args[2]) {
- tcg_out_cmov(s, TCG_COND_EQ, rexw, args[0], args[2]);
- }
- }
+ tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
break;
-
OP_32_64(clz):
- if (const_args[2]) {
- tcg_debug_assert(have_bmi1);
- tcg_debug_assert(args[2] == (rexw ? 64 : 32));
- tcg_out_modrm(s, OPC_LZCNT + rexw, args[0], args[1]);
- } else {
- /* ??? See above. */
- tcg_out_modrm(s, OPC_BSR + rexw, args[0], args[1]);
- if (args[0] != args[2]) {
- tcg_out_cmov(s, TCG_COND_EQ, rexw, args[0], args[2]);
- }
- }
+ tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
break;
case INDEX_op_brcond_i32:
@@ -2281,7 +2332,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
{ INDEX_op_sar_i32, { "r", "0", "Ci" } },
{ INDEX_op_rotl_i32, { "r", "0", "ci" } },
{ INDEX_op_rotr_i32, { "r", "0", "ci" } },
- { INDEX_op_clz_i32, { "r", "r", "rW" } },
+ { INDEX_op_clz_i32, { "r", "r", "ri" } },
{ INDEX_op_ctz_i32, { "r", "r", "rW" } },
{ INDEX_op_brcond_i32, { "r", "ri" } },
@@ -2344,7 +2395,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
{ INDEX_op_sar_i64, { "r", "0", "Ci" } },
{ INDEX_op_rotl_i64, { "r", "0", "ci" } },
{ INDEX_op_rotr_i64, { "r", "0", "ci" } },
- { INDEX_op_clz_i64, { "r", "r", "rW" } },
+ { INDEX_op_clz_i64, { "r", "r", "re" } },
{ INDEX_op_ctz_i64, { "r", "r", "rW" } },
{ INDEX_op_brcond_i64, { "r", "re" } },
@@ -2498,6 +2549,10 @@ static void tcg_target_init(TCGContext *s)
need to probe for it. */
have_movbe = (c & bit_MOVBE) != 0;
#endif
+#ifndef have_lzcnt
+ /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
+ have_lzcnt = (c & bit_LZCNT) != 0;
+#endif
}
if (max >= 7) {