@@ -30,6 +30,7 @@ PIXELFLINGER_SRC_FILES:= \
ifeq ($(TARGET_ARCH),arm)
ifeq ($(TARGET_ARCH_VARIANT),armv7-a-neon)
+PIXELFLINGER_SRC_FILES += t32cb16_neon.S
PIXELFLINGER_SRC_FILES += t32cb16blend.S
PIXELFLINGER_SRC_FILES += col32cb16blend_neon.S
else
@@ -93,6 +93,7 @@ static void scanline_clear(context_t* c);
static void rect_generic(context_t* c, size_t yc);
static void rect_memcpy(context_t* c, size_t yc);
+extern "C" void scanline_t32cb16_neon(uint16_t *dst, uint32_t *src, size_t ct);
extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t);
extern "C" void scanline_t32cb16_arm(uint16_t *dst, uint32_t *src, size_t ct);
extern "C" void scanline_col32cb16blend_neon(uint16_t *dst, uint32_t *col, size_t ct);
@@ -1320,6 +1321,10 @@ void scanline_t32cb16(context_t* c)
int sR, sG, sB;
uint32_t s, d;
+#if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__arm__)) && \
+ defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
+ scanline_t32cb16_neon(dst, src, ct);
+#else
if (ct==1 || uint32_t(dst)&2) {
last_one:
s = GGL_RGBA_TO_HOST( *src++ );
@@ -1354,6 +1359,7 @@ last_one:
if (ct > 0) {
goto last_one;
}
+#endif
}
void scanline_t32cb16blend(context_t* c)
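The scalar path kept under the #else above converts each 32-bit texel to 565 one pixel at a time; its body falls between the two hunks and is not shown here. As a reading aid, a minimal C sketch of that per-pixel packing, assuming the texel has already been brought into host byte order by GGL_RGBA_TO_HOST with red in the low byte (the function name and field order are illustrative, not taken from this patch):

    #include <stdint.h>

    /* illustrative sketch only: pack one host-order 8888 texel into RGB565 */
    static inline uint16_t pack_8888_to_565(uint32_t s)
    {
        uint32_t r = (s >> 3)  & 0x1F;   /* top 5 bits of the low byte   */
        uint32_t g = (s >> 10) & 0x3F;   /* top 6 bits of the next byte  */
        uint32_t b = (s >> 19) & 0x1F;   /* top 5 bits of the third byte */
        return (uint16_t)((r << 11) | (g << 5) | b);
    }

On NEON builds the #if branch hands the whole run to scanline_t32cb16_neon, so this scalar loop is compiled out entirely.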
new file mode 100644
@@ -0,0 +1,180 @@
+/* libs/pixelflinger/t32cb16_neon.S
+ *
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ .text
+ .align
+
+ .global scanline_t32cb16_neon
+
+// r0: dst ptr
+// r1: src ptr
+// r2: count
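+// (matches the C declaration added to scanline.cpp:
+//  extern "C" void scanline_t32cb16_neon(uint16_t *dst, uint32_t *src, size_t ct))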
+
+scanline_t32cb16_neon:
+ cmp r2, #7
+ bhi count_greater_than_8
+
+ // handle count < 8
+ mov r3, #0
+ vmov.u8 d31, #1<<7
+ mov r3, r0
+
+ tst r2, #4
+ beq 14f
+ vld1.16 {d25}, [r0]!
+ vld1.32 {q1}, [r1]!
+
+14:
+ tst r2, #2
+ beq 12f
+ vld1.32 {d24[1]}, [r0]!
+ vld1.32 {d1}, [r1]!
+
+12:
+ tst r2, #1
+ beq 11f
+ vld1.16 {d24[1]}, [r0]!
+ vld1.32 {d0[1]}, [r1]!
+
+11:
+ // the unzips achieve the same layout as a vld4 operation
+ vuzp.u16 q0, q1
+ vuzp.u8 d0, d1
+ vuzp.u8 d2, d3
+ // expand 0565 q12 to 8888 {d4-d7}
+ vmovn.u16 d4, q12
+ vshr.u16 q11, q12, #5
+ vshr.u16 q10, q12, #6+5
+ vmovn.u16 d5, q11
+ vmovn.u16 d6, q10
+ vshl.u8 d4, d4, #3
+ vshl.u8 d5, d5, #2
+ vshl.u8 d6, d6, #3
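+ // (the vshl #3 / #2 also drop the neighbouring field's bits that the
+ //  vmovn narrowing brought along in d4 and d5)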
+
+ vmovl.u8 q14, d31
+ vmovl.u8 q13, d31
+ vmovl.u8 q12, d31
+
+ // duplicated in both the 4/2/1-pixel and 8-pixel versions
+ vmvn.8 d30, d3
+ vmlal.u8 q14, d30, d6
+ vmlal.u8 q13, d30, d5
+ vmlal.u8 q12, d30, d4
+ vshr.u16 q8, q14, #5
+ vshr.u16 q9, q13, #6
+ vaddhn.u16 d6, q14, q8
+ vshr.u16 q8, q12, #5
+ vaddhn.u16 d5, q13, q9
+ vqadd.u8 d6, d6, d0 // moved up
+ vaddhn.u16 d4, q12, q8
+ // intentionally, no alpha result is calculated alongside d4-d6
+ // (the 565 destination has no alpha channel)
+
+ vqadd.u8 d5, d5, d1
+ vqadd.u8 d4, d4, d2
+
+ // pack 8888 {d4-d6} to 0565 q10
+ vshll.u8 q10, d6, #8
+ vshll.u8 q3, d5, #8
+ vshll.u8 q2, d4, #8
+ vsri.u16 q10, q3, #5
+ vsri.u16 q10, q2, #11
+
+ // store
+ tst r2, #4
+ beq 24f
+ vst1.16 {d21}, [r3]!
+
+24:
+ tst r2, #2
+ beq 22f
+ vst1.32 {d20[1]}, [r3]!
+
+22:
+ tst r2, #1
+ beq 21f
+ vst1.16 {d20[1]}, [r3]!
+
+21:
+ bx lr
+
+// count >= 8
+count_greater_than_8:
+ mov r3, #0
+ ands ip, r2, #7
+ vmov.u8 d31, #1<<7
+ vld1.16 {q12}, [r0]
+ vld4.8 {d0-d3}, [r1]
+ moveq ip, #8
+ mov r3, r0
+
+ add r1, r1, ip, LSL#2
+ add r0, r0, ip, LSL#1
+ subs r2, r2, ip
+ b 9f
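+ // note: this first pass handles count%8 pixels (8 when count is a
+ // multiple of 8); a full group of 8 was loaded above but the pointers
+ // only advance by ip, so the next group overlaps it and rewrites the
+ // overlapping pixels with the same values (their inputs are loaded
+ // before the previous result is stored)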
+
+// LOOP
+2:
+ vld1.16 {q12}, [r0]!
+ vld4.8 {d0-d3}, [r1]!
+ vst1.16 {q10}, [r3]
+ sub r3, r0, #8*2
+ subs r2, r2, #8
+9:
+ pld [r0,#32]
+ // expand 0565 q12 to 8888 {d4-d7}
+ vmovn.u16 d4, q12
+ vshr.u16 q11, q12, #5
+ vshr.u16 q10, q12, #6+5
+ vmovn.u16 d5, q11
+ vmovn.u16 d6, q10
+ vshl.u8 d4, d4, #3
+ vshl.u8 d5, d5, #2
+ vshl.u8 d6, d6, #3
+
+ // duplicated in both the 4/2/1-pixel and 8-pixel versions
+ vmovl.u8 q14, d31
+ vmovl.u8 q13, d31
+ vmovl.u8 q12, d31
+ vmvn.8 d30, d3
+ vmlal.u8 q14, d30, d6
+ vmlal.u8 q13, d30, d5
+ vmlal.u8 q12, d30, d4
+ vshr.u16 q8, q14, #5
+ vshr.u16 q9, q13, #6
+ vaddhn.u16 d6, q14, q8 // moved up
+ vshr.u16 q8, q12, #5
+ vaddhn.u16 d5, q13, q9
+ // intentionally, no alpha result is calculated alongside d4-d6
+ // (the 565 destination has no alpha channel)
+
+ vqadd.u8 d6, d6, d0
+ vaddhn.u16 d4, q12, q8
+
+ // pack 8888 {d4-d6} to 0565 q10
+ vqadd.u8 d5, d5, d1
+ vqadd.u8 d4, d4, d2
+ vshll.u8 q10, d6, #8
+ vshll.u8 q3, d5, #8
+ vshll.u8 q2, d4, #8
+ vsri.u16 q10, q3, #5
+ vsri.u16 q10, q2, #11
+
+ bne 2b
+
+1:
+ vst1.16 {q10}, [r3]
+
+ bx lr
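As a reading aid for the bit manipulation above, here is a scalar C model of the two conversion steps named in the comments, "expand 0565 to 8888" and "pack 8888 to 0565". It is an illustrative sketch only, one pixel at a time with channel names chosen for clarity, not code from the patch:

    #include <stdint.h>

    /* expand one 565 pixel into three bytes, mirroring the
       vmovn/vshr/vshl sequence (the vacated low bits stay zero) */
    static inline void expand_565(uint16_t p,
                                  uint8_t *hi, uint8_t *mid, uint8_t *lo)
    {
        *lo  = (uint8_t)(p << 3);          /* low 5-bit field  -> xxxxx000 */
        *mid = (uint8_t)((p >> 5) << 2);   /* 6-bit field      -> xxxxxx00 */
        *hi  = (uint8_t)((p >> 11) << 3);  /* high 5-bit field -> xxxxx000 */
    }

    /* repack three bytes into 565, mirroring the vshll #8 + vsri #5/#11
       sequence: keep only the top 5/6/5 bits of each byte */
    static inline uint16_t pack_565(uint8_t hi, uint8_t mid, uint8_t lo)
    {
        return (uint16_t)(((hi >> 3) << 11) | ((mid >> 2) << 5) | (lo >> 3));
    }

The arithmetic between those two steps (vmvn/vmlal and the vaddhn rounding) operates on these expanded bytes for eight pixels at a time, one channel per d register.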
Code Review: https://review.source.android.com//#change,16358

From 40a2184b1b3f34179fd50138105daede56a62f7f Mon Sep 17 00:00:00 2001
From: Jim Huang <jim.huang@linaro.org>
Date: Sat, 9 Apr 2011 01:15:28 +0800
Subject: [PATCH] libpixelflinger: Add ARM NEON optimized scanline_t32cb16

Reference benchmark results on Beagleboard (TI OMAP353x) at 500 MHz:
  scanline_t32cb16_c    memory bandwidth:  31.63 MB/s
  scanline_t32cb16_neon memory bandwidth: 147.69 MB/s

It can dramatically improve the performance of boot animation.

Change-Id: Iff6a0fea330fc82d1570909dd57155913ef056ab
---
 libpixelflinger/Android.mk     |    1 +
 libpixelflinger/scanline.cpp   |    6 ++
 libpixelflinger/t32cb16_neon.S |  180 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 187 insertions(+), 0 deletions(-)
 create mode 100644 libpixelflinger/t32cb16_neon.S