From patchwork Tue Dec 6 13:53:31 2011 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Mans Rullgard X-Patchwork-Id: 5515 Return-Path: X-Original-To: patchwork@peony.canonical.com Delivered-To: patchwork@peony.canonical.com Received: from fiordland.canonical.com (fiordland.canonical.com [91.189.94.145]) by peony.canonical.com (Postfix) with ESMTP id EC65A23E2A for ; Tue, 6 Dec 2011 13:53:52 +0000 (UTC) Received: from mail-ee0-f52.google.com (mail-ee0-f52.google.com [74.125.83.52]) by fiordland.canonical.com (Postfix) with ESMTP id D004DA189D7 for ; Tue, 6 Dec 2011 13:53:52 +0000 (UTC) Received: by eekc14 with SMTP id c14so818959eek.11 for ; Tue, 06 Dec 2011 05:53:52 -0800 (PST) Received: by 10.14.9.163 with SMTP id 35mr2648493eet.234.1323179632537; Tue, 06 Dec 2011 05:53:52 -0800 (PST) X-Forwarded-To: linaro-patchwork@canonical.com X-Forwarded-For: patch@linaro.org linaro-patchwork@canonical.com Delivered-To: patches@linaro.org Received: by 10.205.129.2 with SMTP id hg2cs44128bkc; Tue, 6 Dec 2011 05:53:51 -0800 (PST) Received: by 10.213.8.21 with SMTP id f21mr2244556ebf.20.1323179629912; Tue, 06 Dec 2011 05:53:49 -0800 (PST) Received: from unicorn.mansr.com (unicorn.mansr.com. 
[78.86.181.103]) by mx.google.com with ESMTP id uq8si21981690bkb.6.2011.12.06.05.53.49; Tue, 06 Dec 2011 05:53:49 -0800 (PST) Received-SPF: pass (google.com: best guess record for domain of mru@mansr.com designates 78.86.181.103 as permitted sender) client-ip=78.86.181.103; Authentication-Results: mx.google.com; spf=pass (google.com: best guess record for domain of mru@mansr.com designates 78.86.181.103 as permitted sender) smtp.mail=mru@mansr.com Received: by unicorn.mansr.com (Postfix, from userid 51770) id D09C21538C; Tue, 6 Dec 2011 13:53:48 +0000 (GMT) From: Mans Rullgard To: libav-devel@libav.org Cc: patches@linaro.org Subject: [PATCH 1/4] rv34: NEON optimised inverse transform functions Date: Tue, 6 Dec 2011 13:53:31 +0000 Message-Id: <1323179614-32610-1-git-send-email-mans.rullgard@linaro.org> X-Mailer: git-send-email 1.7.8 From: Janne Grunau Signed-off-by: Mans Rullgard --- libavcodec/arm/Makefile | 6 ++ libavcodec/arm/rv34dsp_init_neon.c | 33 +++++++++++ libavcodec/arm/rv34dsp_neon.S | 109 ++++++++++++++++++++++++++++++++++++ libavcodec/rv34dsp.c | 3 + libavcodec/rv34dsp.h | 2 + 5 files changed, 153 insertions(+), 0 deletions(-) create mode 100644 libavcodec/arm/rv34dsp_init_neon.c create mode 100644 libavcodec/arm/rv34dsp_neon.S diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 3374f0e..9199fae 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -62,6 +62,12 @@ NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \ arm/synth_filter_neon.o \ +NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_neon.o \ + arm/rv34dsp_neon.o \ + +NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_neon.o \ + arm/rv34dsp_neon.o \ + NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o NEON-OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_neon.o \ diff --git a/libavcodec/arm/rv34dsp_init_neon.c b/libavcodec/arm/rv34dsp_init_neon.c new file mode 100644 index 0000000..9a09fde --- /dev/null +++ 
b/libavcodec/arm/rv34dsp_init_neon.c @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2011 Janne Grunau + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavcodec/avcodec.h" +#include "libavcodec/rv34dsp.h" + +void ff_rv34_inv_transform_neon(DCTELEM *block); +void ff_rv34_inv_transform_noround_neon(DCTELEM *block); + +void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) +{ + c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon; + c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon; +} diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S new file mode 100644 index 0000000..f700f5c --- /dev/null +++ b/libavcodec/arm/rv34dsp_neon.S @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2011 Janne Grunau + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +.macro rv34_inv_transform + mov r1, #16 + vld1.16 {d28}, [r0,:64], r1 @ block[i+8*0] + vld1.16 {d29}, [r0,:64], r1 @ block[i+8*1] + vld1.16 {d30}, [r0,:64], r1 @ block[i+8*2] + vld1.16 {d31}, [r0,:64], r1 @ block[i+8*3] + vmov.s16 d0, #13 + vshll.s16 q12, d29, #3 + vshll.s16 q13, d29, #4 + vshll.s16 q9, d31, #3 + vshll.s16 q1, d31, #4 + vmull.s16 q10, d28, d0 + vmlal.s16 q10, d30, d0 + vmull.s16 q11, d28, d0 + vmlsl.s16 q11, d30, d0 + vsubw.s16 q12, q12, d29 @ z2 = block[i+8*1]*7 + vaddw.s16 q13, q13, d29 @ z3 = block[i+8*1]*17 + vsubw.s16 q9, q9, d31 + vaddw.s16 q1, q1, d31 + vadd.s32 q13, q13, q9 @ z3 = 17*block[i+8*1] + 7*block[i+8*3] + vsub.s32 q12, q12, q1 @ z2 = 7*block[i+8*1] - 17*block[i+8*3] + vadd.s32 q1, q10, q13 @ z0 + z3 + vadd.s32 q2, q11, q12 @ z1 + z2 + vsub.s32 q8, q10, q13 @ z0 - z3 + vsub.s32 q3, q11, q12 @ z1 - z2 + vtrn.32 q1, q2 + vtrn.32 q3, q8 + vswp d3, d6 + vswp d5, d16 + vmov.s32 d0, #13 + vadd.s32 q10, q1, q3 + vsub.s32 q11, q1, q3 + vshl.s32 q12, q2, #3 + vshl.s32 q9, q2, #4 + vmul.s32 q13, q11, d0[0] + vshl.s32 q11, q8, #4 + vadd.s32 q9, q9, q2 + vshl.s32 q15, q8, #3 + vsub.s32 q12, q12, q2 + vadd.s32 q11, q11, q8 + vmul.s32 q14, q10, d0[0] + vsub.s32 q8, q15, q8 + vsub.s32 q12, q12, q11 + vadd.s32 q9, q9, q8 + vadd.s32 q2, q13, q12 @ z1 + z2 + vadd.s32 q1, q14, q9 @ z0 + z3 + vsub.s32 q3, q13, q12 @ z1 - z2 + vsub.s32 q15, q14, q9 @ z0 - z3 +.endm + +/* void ff_rv34_inv_transform_neon(DCTELEM *block); */ +function ff_rv34_inv_transform_neon, export=1 + mov r2, r0 + rv34_inv_transform + vrshrn.s32 d1, q2, #10 @ (z1 + z2) >> 10 + vrshrn.s32 d0, q1, #10 @ (z0 + z3) >> 10 + vrshrn.s32 d2, q3, #10 @ (z1 - z2) >> 10 + vrshrn.s32 d3, 
q15, #10 @ (z0 - z3) >> 10 + vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r2,:64], r1 + vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r1 + vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r1 + vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1 + bx lr +endfunc + +/* void ff_rv34_inv_transform_noround_neon(DCTELEM *block); */ +function ff_rv34_inv_transform_noround_neon, export=1 + mov r2, r0 + rv34_inv_transform + vshl.s32 q11, q2, #1 + vshl.s32 q10, q1, #1 + vshl.s32 q12, q3, #1 + vshl.s32 q13, q15, #1 + vadd.s32 q11, q11, q2 + vadd.s32 q10, q10, q1 + vadd.s32 q12, q12, q3 + vadd.s32 q13, q13, q15 + vshrn.s32 d0, q10, #11 @ (z0 + z3)*3 >> 11 + vshrn.s32 d1, q11, #11 @ (z1 + z2)*3 >> 11 + vshrn.s32 d2, q12, #11 @ (z1 - z2)*3 >> 11 + vshrn.s32 d3, q13, #11 @ (z0 - z3)*3 >> 11 + vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r2,:64], r1 + vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r1 + vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r1 + vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1 + bx lr +endfunc diff --git a/libavcodec/rv34dsp.c b/libavcodec/rv34dsp.c index 59038a7..1f4cea8 100644 --- a/libavcodec/rv34dsp.c +++ b/libavcodec/rv34dsp.c @@ -103,4 +103,7 @@ static void rv34_inv_transform_noround_c(DCTELEM *block){ av_cold void ff_rv34dsp_init(RV34DSPContext *c, DSPContext* dsp) { c->rv34_inv_transform_tab[0] = rv34_inv_transform_c; c->rv34_inv_transform_tab[1] = rv34_inv_transform_noround_c; + + if (HAVE_NEON) + ff_rv34dsp_init_neon(c, dsp); } diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h index 4ade050..a1636e6 100644 --- a/libavcodec/rv34dsp.h +++ b/libavcodec/rv34dsp.h @@ -56,6 +56,8 @@ void ff_rv30dsp_init(RV34DSPContext *c, DSPContext* dsp); void ff_rv34dsp_init(RV34DSPContext *c, DSPContext* dsp); void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp); +void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext *dsp); + void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp); #endif /* AVCODEC_RV34DSP_H */