From patchwork Mon Feb 14 10:22:48 2011
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Peter Maydell <peter.maydell@linaro.org>
X-Patchwork-Id: 137
Return-Path: <pm215@archaic.org.uk>
Delivered-To: unknown
Received: from imap.gmail.com (74.125.159.109) by localhost6.localdomain6
 with IMAP4-SSL; 08 Jun 2011 14:40:17 -0000
Delivered-To: patches@linaro.org
Received: by 10.146.83.12 with SMTP id g12cs289935yab;
 Mon, 14 Feb 2011 02:22:56 -0800 (PST)
Received: by 10.213.15.135 with SMTP id k7mr3734445eba.26.1297678973608;
 Mon, 14 Feb 2011 02:22:53 -0800 (PST)
Received: from mnementh.archaic.org.uk (mnementh.archaic.org.uk
 [81.2.115.146]) by mx.google.com with ESMTPS id
 a20si4983939eei.75.2011.02.14.02.22.51
 (version=TLSv1/SSLv3 cipher=OTHER);
 Mon, 14 Feb 2011 02:22:53 -0800 (PST)
Received-SPF: pass (google.com: best guess record for domain of
 pm215@archaic.org.uk designates 81.2.115.146 as permitted
 sender) client-ip=81.2.115.146; 
Authentication-Results: mx.google.com;
 spf=pass (google.com: best guess record for domain
 of pm215@archaic.org.uk designates 81.2.115.146 as permitted
 sender) smtp.mail=pm215@archaic.org.uk
Received: from pm215 by mnementh.archaic.org.uk with local (Exim 4.72)
 (envelope-from <pm215@archaic.org.uk>)
 id 1PovZx-0000tq-OB; Mon, 14 Feb 2011 10:22:49 +0000
From: Peter Maydell <peter.maydell@linaro.org>
To: qemu-devel@nongnu.org
Cc: patches@linaro.org
Subject: [PATCH v2 1/2] target-arm: Move Neon VUZP to helper functions
Date: Mon, 14 Feb 2011 10:22:48 +0000
Message-Id: <1297678969-3433-2-git-send-email-peter.maydell@linaro.org>
X-Mailer: git-send-email 1.7.2.3
In-Reply-To: <1297678969-3433-1-git-send-email-peter.maydell@linaro.org>
References: <1297678969-3433-1-git-send-email-peter.maydell@linaro.org>

Move the implementation of the Neon VUZP unzip instruction from inline
code to helper functions. (At 50+ TCG ops it was well over the
recommended limit for coding inline.) The helper implementations also
fix the handling of the quadword version of the instruction.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target-arm/helpers.h     |    6 +++
 target-arm/neon_helper.c |   94 ++++++++++++++++++++++++++++++++++++++
 target-arm/translate.c   |  112 +++++++++++++++-------------------------------
 3 files changed, 137 insertions(+), 75 deletions(-)

diff --git a/target-arm/helpers.h b/target-arm/helpers.h
index 77f1635..88c2fce 100644
--- a/target-arm/helpers.h
+++ b/target-arm/helpers.h
@@ -460,4 +460,10 @@ DEF_HELPER_3(iwmmxt_muladdswl, i64, i64, i32, i32)
 
 DEF_HELPER_2(set_teecr, void, env, i32)
 
+DEF_HELPER_3(neon_unzip8, void, env, i32, i32)
+DEF_HELPER_3(neon_unzip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qunzip8, void, env, i32, i32)
+DEF_HELPER_3(neon_qunzip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qunzip32, void, env, i32, i32)
+
 #include "def-helper.h"
diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index dc09968..6e2a2fd 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -1663,3 +1663,97 @@ uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
     float32 f1 = float32_abs(vfp_itos(b));
     return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0;
 }
+
+#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
+
+void HELPER(neon_qunzip8)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
+        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
+        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
+        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
+    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
+        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
+        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
+        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
+    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
+        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
+        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
+        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
+    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
+        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
+        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
+        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rm + 1] = make_float64(m1);
+    env->vfp.regs[rd] = make_float64(d0);
+    env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_qunzip16)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
+        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
+    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
+        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
+    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
+        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
+    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
+        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rm + 1] = make_float64(m1);
+    env->vfp.regs[rd] = make_float64(d0);
+    env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_qunzip32)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
+    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
+    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
+    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rm + 1] = make_float64(m1);
+    env->vfp.regs[rd] = make_float64(d0);
+    env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_unzip8)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm = float64_val(env->vfp.regs[rm]);
+    uint64_t zd = float64_val(env->vfp.regs[rd]);
+    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
+        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
+        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
+        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
+    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
+        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
+        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
+        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rd] = make_float64(d0);
+}
+
+void HELPER(neon_unzip16)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+    uint64_t zm = float64_val(env->vfp.regs[rm]);
+    uint64_t zd = float64_val(env->vfp.regs[rd]);
+    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
+        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
+    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
+        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
+    env->vfp.regs[rm] = make_float64(m0);
+    env->vfp.regs[rd] = make_float64(d0);
+}
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 362d1d0..6d94223 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -3614,40 +3614,43 @@ static inline TCGv neon_get_scalar(int size, int reg)
     return tmp;
 }
 
-static void gen_neon_unzip_u8(TCGv t0, TCGv t1)
+static int gen_neon_unzip(int rd, int rm, int size, int q)
 {
-    TCGv rd, rm, tmp;
-
-    rd = new_tmp();
-    rm = new_tmp();
-    tmp = new_tmp();
-
-    tcg_gen_andi_i32(rd, t0, 0xff);
-    tcg_gen_shri_i32(tmp, t0, 8);
-    tcg_gen_andi_i32(tmp, tmp, 0xff00);
-    tcg_gen_or_i32(rd, rd, tmp);
-    tcg_gen_shli_i32(tmp, t1, 16);
-    tcg_gen_andi_i32(tmp, tmp, 0xff0000);
-    tcg_gen_or_i32(rd, rd, tmp);
-    tcg_gen_shli_i32(tmp, t1, 8);
-    tcg_gen_andi_i32(tmp, tmp, 0xff000000);
-    tcg_gen_or_i32(rd, rd, tmp);
-
-    tcg_gen_shri_i32(rm, t0, 8);
-    tcg_gen_andi_i32(rm, rm, 0xff);
-    tcg_gen_shri_i32(tmp, t0, 16);
-    tcg_gen_andi_i32(tmp, tmp, 0xff00);
-    tcg_gen_or_i32(rm, rm, tmp);
-    tcg_gen_shli_i32(tmp, t1, 8);
-    tcg_gen_andi_i32(tmp, tmp, 0xff0000);
-    tcg_gen_or_i32(rm, rm, tmp);
-    tcg_gen_andi_i32(tmp, t1, 0xff000000);
-    tcg_gen_or_i32(t1, rm, tmp);
-    tcg_gen_mov_i32(t0, rd);
-
-    dead_tmp(tmp);
-    dead_tmp(rm);
-    dead_tmp(rd);
+    TCGv tmp, tmp2;
+    if (size == 3 || (!q && size == 2)) {
+        return 1;
+    }
+    tmp = tcg_const_i32(rd);
+    tmp2 = tcg_const_i32(rm);
+    if (q) {
+        switch (size) {
+        case 0:
+            gen_helper_neon_qunzip8(cpu_env, tmp, tmp2);
+            break;
+        case 1:
+            gen_helper_neon_qunzip16(cpu_env, tmp, tmp2);
+            break;
+        case 2:
+            gen_helper_neon_qunzip32(cpu_env, tmp, tmp2);
+            break;
+        default:
+            abort();
+        }
+    } else {
+        switch (size) {
+        case 0:
+            gen_helper_neon_unzip8(cpu_env, tmp, tmp2);
+            break;
+        case 1:
+            gen_helper_neon_unzip16(cpu_env, tmp, tmp2);
+            break;
+        default:
+            abort();
+        }
+    }
+    tcg_temp_free_i32(tmp);
+    tcg_temp_free_i32(tmp2);
+    return 0;
 }
 
 static void gen_neon_zip_u8(TCGv t0, TCGv t1)
@@ -3705,25 +3708,6 @@ static void gen_neon_zip_u16(TCGv t0, TCGv t1)
     dead_tmp(tmp);
 }
 
-static void gen_neon_unzip(int reg, int q, int tmp, int size)
-{
-    int n;
-    TCGv t0, t1;
-
-    for (n = 0; n < q + 1; n += 2) {
-        t0 = neon_load_reg(reg, n);
-        t1 = neon_load_reg(reg, n + 1);
-        switch (size) {
-        case 0: gen_neon_unzip_u8(t0, t1); break;
-        case 1: gen_neon_zip_u16(t0, t1); break; /* zip and unzip are the same.  */
-        case 2: /* no-op */; break;
-        default: abort();
-        }
-        neon_store_scratch(tmp + n, t0);
-        neon_store_scratch(tmp + n + 1, t1);
-    }
-}
-
 static void gen_neon_trn_u8(TCGv t0, TCGv t1)
 {
     TCGv rd, tmp;
@@ -5436,30 +5420,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                     }
                     break;
                 case 34: /* VUZP */
-                    /* Reg  Before       After
-                       Rd   A3 A2 A1 A0  B2 B0 A2 A0
-                       Rm   B3 B2 B1 B0  B3 B1 A3 A1
-                     */
-                    if (size == 3)
+                    if (gen_neon_unzip(rd, rm, size, q)) {
                         return 1;
-                    gen_neon_unzip(rd, q, 0, size);
-                    gen_neon_unzip(rm, q, 4, size);
-                    if (q) {
-                        static int unzip_order_q[8] =
-                            {0, 2, 4, 6, 1, 3, 5, 7};
-                        for (n = 0; n < 8; n++) {
-                            int reg = (n < 4) ? rd : rm;
-                            tmp = neon_load_scratch(unzip_order_q[n]);
-                            neon_store_reg(reg, n % 4, tmp);
-                        }
-                    } else {
-                        static int unzip_order[4] =
-                            {0, 4, 1, 5};
-                        for (n = 0; n < 4; n++) {
-                            int reg = (n < 2) ? rd : rm;
-                            tmp = neon_load_scratch(unzip_order[n]);
-                            neon_store_reg(reg, n % 2, tmp);
-                        }
                     }
                     break;
                 case 35: /* VZIP */