From patchwork Tue Dec  6 13:53:33 2011
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Mans Rullgard <mans.rullgard@linaro.org>
X-Patchwork-Id: 5516
Return-Path: <patch+caf_=linaro-patchwork=canonical.com@linaro.org>
X-Original-To: patchwork@peony.canonical.com
Delivered-To: patchwork@peony.canonical.com
Received: from fiordland.canonical.com (fiordland.canonical.com
 [91.189.94.145])
 by peony.canonical.com (Postfix) with ESMTP id 7D4192400C
 for <patchwork@peony.canonical.com>;
 Tue,  6 Dec 2011 13:53:53 +0000 (UTC)
Received: from mail-ee0-f52.google.com (mail-ee0-f52.google.com [74.125.83.52])
 by fiordland.canonical.com (Postfix) with ESMTP id 722BEA189D7
 for <linaro-patchwork@canonical.com>;
 Tue,  6 Dec 2011 13:53:53 +0000 (UTC)
Received: by mail-ee0-f52.google.com with SMTP id c14so818959eek.11
 for <linaro-patchwork@canonical.com>;
 Tue, 06 Dec 2011 05:53:53 -0800 (PST)
Received: by 10.14.17.211 with SMTP id j59mr2619353eej.138.1323179633293;
 Tue, 06 Dec 2011 05:53:53 -0800 (PST)
X-Forwarded-To: linaro-patchwork@canonical.com
X-Forwarded-For: patch@linaro.org linaro-patchwork@canonical.com
Delivered-To: patches@linaro.org
Received: by 10.205.129.2 with SMTP id hg2cs44130bkc;
 Tue, 6 Dec 2011 05:53:53 -0800 (PST)
Received: by 10.14.18.22 with SMTP id k22mr2457047eek.53.1323179630091;
 Tue, 06 Dec 2011 05:53:50 -0800 (PST)
Received: from unicorn.mansr.com (unicorn.mansr.com. [78.86.181.103])
 by mx.google.com with ESMTP id
 af11si21934352bkc.93.2011.12.06.05.53.49; 
 Tue, 06 Dec 2011 05:53:49 -0800 (PST)
Received-SPF: pass (google.com: best guess record for domain of
 mru@mansr.com designates 78.86.181.103 as permitted sender)
 client-ip=78.86.181.103; 
Authentication-Results: mx.google.com;
 spf=pass (google.com: best guess record for domain
 of mru@mansr.com designates 78.86.181.103 as permitted
 sender) smtp.mail=mru@mansr.com
Received: by unicorn.mansr.com (Postfix, from userid 51770)
 id D3CB513262; Tue,  6 Dec 2011 13:53:48 +0000 (GMT)
From: Mans Rullgard <mans.rullgard@linaro.org>
To: libav-devel@libav.org
Cc: patches@linaro.org
Subject: [PATCH 3/4] rv40: NEON optimised chroma MC
Date: Tue,  6 Dec 2011 13:53:33 +0000
Message-Id: <1323179614-32610-3-git-send-email-mans.rullgard@linaro.org>
X-Mailer: git-send-email 1.7.8
In-Reply-To: <1323179614-32610-1-git-send-email-mans.rullgard@linaro.org>
References: <1323179614-32610-1-git-send-email-mans.rullgard@linaro.org>

From: Janne Grunau <janne-libav@jannau.net>

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/arm/Makefile            |    2 +
 libavcodec/arm/h264cmc_neon.S      |   80 +++++++++++++++++++++++++++++++++--
 libavcodec/arm/rv40dsp_init_neon.c |   38 +++++++++++++++++
 libavcodec/rv34dsp.h               |    1 +
 libavcodec/rv40dsp.c               |    2 +
 5 files changed, 118 insertions(+), 5 deletions(-)
 create mode 100644 libavcodec/arm/rv40dsp_init_neon.c

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index c125a59..a948e6d 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -68,6 +68,8 @@ NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_init_neon.o       \
 
 NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_init_neon.o       \
                                           arm/rv34dsp_neon.o            \
+                                          arm/rv40dsp_init_neon.o       \
+                                          arm/h264cmc_neon.o            \
 
 NEON-OBJS-$(CONFIG_VP3_DECODER)        += arm/vp3dsp_neon.o
 
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
index e10adac..a6feadd 100644
--- a/libavcodec/arm/h264cmc_neon.S
+++ b/libavcodec/arm/h264cmc_neon.S
@@ -21,8 +21,8 @@
 #include "asm.S"
 
 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
-.macro  h264_chroma_mc8 type
-function ff_\type\()_h264_chroma_mc8_neon, export=1
+.macro  h264_chroma_mc8 type, codec=h264
+function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
         push            {r4-r7, lr}
         ldrd            r4,  [sp, #20]
   .ifc \type,avg
@@ -31,6 +31,15 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
         pld             [r1]
         pld             [r1, r2]
 
+  .ifc \codec,rv40
+        movrel          r6,  rv40bias
+        lsr             r7,  r5,  #1
+        add             r6,  r6,  r7,  lsl #3
+        lsr             r7,  r4,  #1
+        add             r6,  r6,  r7,  lsl #1
+        vld1.16         {d22[],d23[]}, [r6,:16]
+  .endif
+
 A       muls            r7,  r4,  r5
 T       mul             r7,  r4,  r5
 T       cmp             r7,  #0
@@ -67,10 +76,17 @@ T       cmp             r7,  #0
         vmlal.u8        q9,  d7,  d1
         vmlal.u8        q9,  d4,  d2
         vmlal.u8        q9,  d5,  d3
-        vrshrn.u16      d16, q8,  #6
         vld1.8          {d6, d7}, [r5], r4
         pld             [r1]
+  .ifc \codec,h264
+        vrshrn.u16      d16, q8,  #6
         vrshrn.u16      d17, q9,  #6
+  .else
+        vadd.u16        q8,  q8,  q11
+        vadd.u16        q9,  q9,  q11
+        vshrn.u16       d16, q8,  #6
+        vshrn.u16       d17, q9,  #6
+  .endif
   .ifc \type,avg
         vld1.8          {d20}, [lr,:64], r2
         vld1.8          {d21}, [lr,:64], r2
@@ -102,8 +118,15 @@ T       cmp             r7,  #0
         vmull.u8        q9,  d6,  d0
         vmlal.u8        q9,  d4,  d1
         vld1.8          {d6}, [r5], r4
+  .ifc \codec,h264
         vrshrn.u16      d16, q8,  #6
         vrshrn.u16      d17, q9,  #6
+  .else
+        vadd.u16        q8,  q8,  q11
+        vadd.u16        q9,  q9,  q11
+        vshrn.u16       d16, q8,  #6
+        vshrn.u16       d17, q9,  #6
+  .endif
   .ifc \type,avg
         vld1.8          {d20}, [lr,:64], r2
         vld1.8          {d21}, [lr,:64], r2
@@ -131,8 +154,15 @@ T       cmp             r7,  #0
         vmlal.u8        q9,  d7,  d1
         pld             [r1]
         vext.8          d5,  d4,  d5,  #1
+  .ifc \codec,h264
         vrshrn.u16      d16, q8,  #6
         vrshrn.u16      d17, q9,  #6
+  .else
+        vadd.u16        q8,  q8,  q11
+        vadd.u16        q9,  q9,  q11
+        vshrn.u16       d16, q8,  #6
+        vshrn.u16       d17, q9,  #6
+  .endif
   .ifc \type,avg
         vld1.8          {d20}, [lr,:64], r2
         vld1.8          {d21}, [lr,:64], r2
@@ -149,8 +179,8 @@ endfunc
 .endm
 
 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
-.macro  h264_chroma_mc4 type
-function ff_\type\()_h264_chroma_mc4_neon, export=1
+.macro  h264_chroma_mc4 type, codec=h264
+function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
         push            {r4-r7, lr}
         ldrd            r4,  [sp, #20]
   .ifc \type,avg
@@ -159,6 +189,15 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
         pld             [r1]
         pld             [r1, r2]
 
+  .ifc \codec,rv40
+        movrel          r6,  rv40bias
+        lsr             r7,  r5,  #1
+        add             r6,  r6,  r7,  lsl #3
+        lsr             r7,  r4,  #1
+        add             r6,  r6,  r7,  lsl #1
+        vld1.16         {d22[],d23[]}, [r6,:16]
+  .endif
+
 A       muls            r7,  r4,  r5
 T       mul             r7,  r4,  r5
 T       cmp             r7,  #0
@@ -199,7 +238,12 @@ T       cmp             r7,  #0
         vld1.8          {d6},     [r5], r4
         vadd.i16        d16, d16, d17
         vadd.i16        d17, d18, d19
+  .ifc \codec,h264
         vrshrn.u16      d16, q8,  #6
+  .else
+        vadd.u16        q8,  q8,  q11
+        vshrn.u16       d16, q8,  #6
+  .endif
         subs            r3,  r3,  #2
         pld             [r1]
   .ifc \type,avg
@@ -236,7 +280,12 @@ T       cmp             r7,  #0
         vld1.32         {d4[1]},  [r5], r4
         vadd.i16        d16, d16, d17
         vadd.i16        d17, d18, d19
+  .ifc \codec,h264
         vrshrn.u16      d16, q8,  #6
+  .else
+        vadd.u16        q8,  q8,  q11
+        vshrn.u16       d16, q8,  #6
+  .endif
   .ifc \type,avg
         vld1.32         {d20[0]}, [lr,:32], r2
         vld1.32         {d20[1]}, [lr,:32], r2
@@ -266,7 +315,12 @@ T       cmp             r7,  #0
         vadd.i16        d16, d16, d17
         vadd.i16        d17, d18, d19
         pld             [r1]
+  .ifc \codec,h264
         vrshrn.u16      d16, q8,  #6
+  .else
+        vadd.u16        q8,  q8,  q11
+        vshrn.u16       d16, q8,  #6
+  .endif
   .ifc \type,avg
         vld1.32         {d20[0]}, [lr,:32], r2
         vld1.32         {d20[1]}, [lr,:32], r2
@@ -352,9 +406,25 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
 endfunc
 .endm
 
+#if CONFIG_H264_DECODER /* h264 variants (codec defaults to h264) are assembled only when this is non-zero */
         h264_chroma_mc8 put
         h264_chroma_mc8 avg
         h264_chroma_mc4 put
         h264_chroma_mc4 avg
         h264_chroma_mc2 put
         h264_chroma_mc2 avg
+#endif /* CONFIG_H264_DECODER */
+
+#if CONFIG_RV40_DECODER /* rv40 bias table and entry points are assembled only when this is non-zero */
+const   rv40bias                        /* 4 rows of 4 halfwords; read at byte offset (y >> 1) * 8 + (x >> 1) * 2 */
+        .short           0, 16, 32, 16  /* row y >> 1 == 0; columns are x >> 1 == 0, 1, 2, 3 */
+        .short          32, 28, 32, 28  /* row y >> 1 == 1 */
+        .short           0, 32, 16, 32  /* row y >> 1 == 2 */
+        .short          32, 28, 32, 28  /* row y >> 1 == 3; assumes x, y stay in 0..7 - TODO confirm with callers */
+endconst
+                                        /* the selected value is added to each 16-bit sum before a truncating >> 6 */
+        h264_chroma_mc8 put, rv40       /* defines ff_put_rv40_chroma_mc8_neon */
+        h264_chroma_mc8 avg, rv40       /* defines ff_avg_rv40_chroma_mc8_neon */
+        h264_chroma_mc4 put, rv40       /* defines ff_put_rv40_chroma_mc4_neon */
+        h264_chroma_mc4 avg, rv40       /* defines ff_avg_rv40_chroma_mc4_neon; no mc2 variant is built for rv40 */
+#endif /* CONFIG_RV40_DECODER */
diff --git a/libavcodec/arm/rv40dsp_init_neon.c b/libavcodec/arm/rv40dsp_init_neon.c
new file mode 100644
index 0000000..aa4a88d
--- /dev/null
+++ b/libavcodec/arm/rv40dsp_init_neon.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/rv34dsp.h"
+
+void ff_put_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
+{   /* Assigns the NEON chroma MC routines to the put/avg tables of c; dsp is not used here. */
+    c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon; /* index 0 holds the mc8 routine */
+    c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon; /* index 1 holds the mc4 routine */
+    c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon; /* avg table uses the same index layout */
+    c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
+}   /* only indices 0 and 1 are set: just mc8 and mc4 are declared for rv40 above */
diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h
index a1636e6..695af06 100644
--- a/libavcodec/rv34dsp.h
+++ b/libavcodec/rv34dsp.h
@@ -59,5 +59,6 @@ void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp);
 void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext *dsp);
 
 void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp);
+void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext *dsp);
 
 #endif /* AVCODEC_RV34DSP_H */
diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c
index f193b60..06bdf18 100644
--- a/libavcodec/rv40dsp.c
+++ b/libavcodec/rv40dsp.c
@@ -534,4 +534,6 @@ av_cold void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp) {
 
     if (HAVE_MMX)
         ff_rv40dsp_init_x86(c, dsp);
+    if (HAVE_NEON)
+        ff_rv40dsp_init_neon(c, dsp);
 }