summaryrefslogtreecommitdiff
path: root/apps/plugins
diff options
context:
space:
mode:
authorJens Arnold <amiconn@rockbox.org>2009-06-20 14:05:15 +0000
committerJens Arnold <amiconn@rockbox.org>2009-06-20 14:05:15 +0000
commitf289b9f591746e5ff85835194bf7eae8ff088e6b (patch)
treefb6bb36f7167a71b6bf486fd45c0d30ec6ee6a9c /apps/plugins
parente7c4cd90768cadcdc2f5202378b77c55513f4eef (diff)
downloadrockbox-f289b9f591746e5ff85835194bf7eae8ff088e6b.zip
rockbox-f289b9f591746e5ff85835194bf7eae8ff088e6b.tar.gz
rockbox-f289b9f591746e5ff85835194bf7eae8ff088e6b.tar.bz2
rockbox-f289b9f591746e5ff85835194bf7eae8ff088e6b.tar.xz
Faster idct for ARMv6. Overall mpegplayer speedup is quite minimal though.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21392 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/plugins')
-rw-r--r--apps/plugins/mpegplayer/SOURCES4
-rw-r--r--apps/plugins/mpegplayer/decode.c2
-rw-r--r--apps/plugins/mpegplayer/idct_armv6.S337
3 files changed, 342 insertions, 1 deletions
diff --git a/apps/plugins/mpegplayer/SOURCES b/apps/plugins/mpegplayer/SOURCES
index 5b3360c..5ca0fcd 100644
--- a/apps/plugins/mpegplayer/SOURCES
+++ b/apps/plugins/mpegplayer/SOURCES
@@ -9,7 +9,11 @@ idct_coldfire.S
motion_comp_coldfire_c.c
motion_comp_coldfire_s.S
#elif defined CPU_ARM
+#if ARM_ARCH >= 6
+idct_armv6.S
+#else
idct_arm.S
+#endif
motion_comp_arm_c.c
motion_comp_arm_s.S
#else /* other CPU or SIM */
diff --git a/apps/plugins/mpegplayer/decode.c b/apps/plugins/mpegplayer/decode.c
index a19b929..9125120 100644
--- a/apps/plugins/mpegplayer/decode.c
+++ b/apps/plugins/mpegplayer/decode.c
@@ -35,7 +35,7 @@
#define BUFFER_SIZE (1194 * 1024)
-#ifdef CPU_COLDFIRE
+#if defined(CPU_COLDFIRE) || (defined(CPU_ARM) && ARM_ARCH >= 6)
/* twice as large as on other targets because coldfire and ARMv6 use
 * a secondary, transposed buffer for optimisation */
static int16_t static_dct_block[128] IBSS_ATTR ATTR_ALIGN(16);
diff --git a/apps/plugins/mpegplayer/idct_armv6.S b/apps/plugins/mpegplayer/idct_armv6.S
new file mode 100644
index 0000000..73feed4
--- /dev/null
+++ b/apps/plugins/mpegplayer/idct_armv6.S
@@ -0,0 +1,337 @@
+/***************************************************************************
+ * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___
+ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+ * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+ * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+ * \/ \/ \/ \/ \/
+ * $Id$
+ *
+ * Copyright (C) 2009 by Jens Arnold
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+
+ .global mpeg2_idct_copy
+ .type mpeg2_idct_copy, %function
+ .global mpeg2_idct_add
+ .type mpeg2_idct_add, %function
+
+/* Custom calling convention:
+ * r0 contains block pointer and is non-volatile
+ * all non-volatile c context saved and restored on its behalf
+ */
+.idct:
+ str lr, [sp, #-4]! @ lr is used
+ add r1, r0, #128 @ secondary, transposed temp buffer
+ mov r14, #8 @ loop counter
+
+.row_loop:
+ ldmia r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
+ ldrd r4, L_W1357 @ load W1, W3, W5, W7
+
+ smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3
+ smlad r6, r5, r11, r6 @ + W5 * f5 + W7 * f7
+
+ smultt r7, r5, r10 @ b1 = -W7 * f3
+ smlabb r7, r4, r11, r7 @ + -W1 * f5
+ smlabt r7, r5, r11, r7 @ + -W5 * f7
+ rsb r7, r7, #0
+ smlatb r7, r4, r10, r7 @ + W3 * f1
+
+ smulbt r8, r4, r10 @ b2 = -W1 * f3
+ rsb r8, r8, #0
+ smlabb r8, r5, r10, r8 @ + W5 * f1
+ smlatb r8, r5, r11, r8 @ + W7 * f5
+ smlatt r8, r4, r11, r8 @ + W3 * f7
+
+ smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5
+ smlsdx r9, r11, r4, r9 @ + f5 * W3 - f1 * W1
+
+ ldrd r4, L_W0246 @ load W0, W2, W4, W6
+ add r2, r2, #1 @ f0 += 1
+
+ smulbb r10, r4, r2 @ a0' = W0 * f0
+ smlabb r10, r5, r3, r10 @ + W4 * f4
+ smultt r12, r4, r2 @ a3' = W2 * f2
+ smlatt r12, r5, r3, r12 @ + W6 * f6
+ add r10, r10, r12 @ a0 = a0' + a3'
+ sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3'
+
+ smulbb r11, r5, r3 @ a1' = -W4 * f4
+ rsb r11, r11, #0
+ smlabb r11, r4, r2, r11 @ + W0 * f0
+ smultt r3, r4, r3 @ a2' = -W2 * f6
+ rsb r3, r3, #0
+ smlatt r3, r5, r2, r3 @ + W6 * f2
+ add r11, r11, r3 @ a1 = a1' + a2'
+ sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2'
+
+ sub r2, r10, r6 @ block[7] = (a0 - b0)
+ mov r2, r2, asr #12 @ >> 12
+ strh r2, [r1, #7*16]
+ sub r2, r11, r7 @ block[6] = (a1 - b1)
+ mov r2, r2, asr #12 @ >> 12
+ strh r2, [r1, #6*16]
+ sub r2, r3, r8 @ block[5] = (a2 - b2)
+ mov r2, r2, asr #12 @ >> 12
+ strh r2, [r1, #5*16]
+ sub r2, r12, r9 @ block[4] = (a3 - b3)
+ mov r2, r2, asr #12 @ >> 12
+ strh r2, [r1, #4*16]
+ add r2, r12, r9 @ block[3] = (a3 + b3)
+ mov r2, r2, asr #12 @ >> 12
+ strh r2, [r1, #3*16]
+ add r2, r3, r8 @ block[2] = (a2 + b2)
+ mov r2, r2, asr #12 @ >> 12
+ strh r2, [r1, #2*16]
+ add r2, r11, r7 @ block[1] = (a1 + b1)
+ mov r2, r2, asr #12 @ >> 12
+ strh r2, [r1, #1*16]
+ add r2, r10, r6 @ block[0] = (a0 + b0)
+ mov r2, r2, asr #12 @ >> 12
+ strh r2, [r1], #2 @ advance to next temp column
+
+ subs r14, r14, #1
+ bne .row_loop
+ b .col_start
+
+ @placed here because of ldrd's offset limit
+L_W1357:
+ .short 2841
+ .short 2408
+ .short 1609
+ .short 565
+
+L_W0246:
+ .short 2048
+ .short 2676
+ .short 2048
+ .short 1108
+
+.col_start:
+ @ r0 now points to the temp buffer, where we need it.
+ sub r1, r1, #128+16 @ point r1 back to the input block
+ mov r14, #8 @ loop counter
+
+.col_loop:
+ ldmia r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
+ ldrd r4, L_W1357 @ load W1, W3, W5, W7
+
+ smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3
+ smlad r6, r5, r11, r6 @ + W5 * f5 + W7 * f7
+
+ smultt r7, r5, r10 @ b1 = -W7 * f3
+ smlabb r7, r4, r11, r7 @ + -W1 * f5
+ smlabt r7, r5, r11, r7 @ + -W5 * f7
+ rsb r7, r7, #0
+ smlatb r7, r4, r10, r7 @ + W3 * f1
+
+ smulbt r8, r4, r10 @ b2 = -W1 * f3
+ rsb r8, r8, #0
+ smlabb r8, r5, r10, r8 @ + W5 * f1
+ smlatb r8, r5, r11, r8 @ + W7 * f5
+ smlatt r8, r4, r11, r8 @ + W3 * f7
+
+ smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5
+ smlsdx r9, r11, r4, r9 @ + f5 * W3 - f1 * W1
+
+ ldrd r4, L_W0246 @ load W0, W2, W4, W6
+ add r2, r2, #32 @ DC offset: 0.5
+
+ smulbb r10, r4, r2 @ a0' = W0 * f0
+ smlabb r10, r5, r3, r10 @ + W4 * f4
+ smultt r12, r4, r2 @ a3' = W2 * f2
+ smlatt r12, r5, r3, r12 @ + W6 * f6
+ add r10, r10, r12 @ a0 = a0' + a3'
+ sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3'
+
+ smulbb r11, r5, r3 @ a1' = -W4 * f4
+ rsb r11, r11, #0
+ smlabb r11, r4, r2, r11 @ + W0 * f0
+ smultt r3, r4, r3 @ a2' = -W2 * f6
+ rsb r3, r3, #0
+ smlatt r3, r5, r2, r3 @ + W6 * f2
+ add r11, r11, r3 @ a1 = a1' + a2'
+ sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2'
+
+ sub r2, r10, r6 @ block[7] = (a0 - b0)
+ mov r2, r2, asr #17 @ >> 17
+ strh r2, [r1, #7*16]
+ sub r2, r11, r7 @ block[6] = (a1 - b1)
+ mov r2, r2, asr #17 @ >> 17
+ strh r2, [r1, #6*16]
+ sub r2, r3, r8 @ block[5] = (a2 - b2)
+ mov r2, r2, asr #17 @ >> 17
+ strh r2, [r1, #5*16]
+ sub r2, r12, r9 @ block[4] = (a3 - b3)
+ mov r2, r2, asr #17 @ >> 17
+ strh r2, [r1, #4*16]
+ add r2, r12, r9 @ block[3] = (a3 + b3)
+ mov r2, r2, asr #17 @ >> 17
+ strh r2, [r1, #3*16]
+ add r2, r3, r8 @ block[2] = (a2 + b2)
+ mov r2, r2, asr #17 @ >> 17
+ strh r2, [r1, #2*16]
+ add r2, r11, r7 @ block[1] = (a1 + b1)
+ mov r2, r2, asr #17 @ >> 17
+ strh r2, [r1, #1*16]
+ add r2, r10, r6 @ block[0] = (a0 + b0)
+ mov r2, r2, asr #17 @ >> 17
+ strh r2, [r1], #2 @ advance to next column
+
+ subs r14, r14, #1
+ bne .col_loop
+
+ sub r0, r0, #256 @ point r0 back to the input block
+ ldr pc, [sp], #4
+
+
+ @ mpeg2_idct_copy: run the IDCT on the block, clamp each result to
+ @ 0..255 (usat16) and store it to the destination bitmap row by row,
+ @ clearing the 128-byte coefficient block to zero behind itself.
+ @ In: r0 = block (int16_t[64]), r1 = dest (u8*), r2 = row stride.
+ @ (Presumably matches libmpeg2's
+ @ mpeg2_idct_copy(int16_t *block, uint8_t *dest, int stride)
+ @ -- TODO confirm against mpeg2dec headers. r1/r2 are preserved
+ @ across .idct via the stack since .idct clobbers them.)
+mpeg2_idct_copy:
+ stmfd sp!, {r1-r2, r4-r12, lr}
+ bl .idct
+ ldmfd sp!, {r1-r2} @ restore dest and stride
+
+ add r12, r0, #128 @ r12 = end of block (loop limit)
+ ldrd r4, [r0] @ preload first 4 coefficients
+ mov r8, #0 @ r8-r11 = zero block for clearing
+ mov r9, #0
+ mov r10, #0
+ mov r11, #0
+1:
+ ldrd r6, [r0, #8] @ next 4 coeffs (overlaps the stores below)
+ usat16 r4, #8, r4 @ clamp two pixels to 0..255
+ strb r4, [r1, #0]
+ mov r4, r4, lsr #16
+ strb r4, [r1, #1]
+ usat16 r5, #8, r5
+ strb r5, [r1, #2]
+ mov r5, r5, lsr #16
+ strb r5, [r1, #3]
+ ldrd r4, [r0, #16] @ preload next row's first half
+ usat16 r6, #8, r6
+ strb r6, [r1, #4]
+ mov r6, r6, lsr #16
+ strb r6, [r1, #5]
+ usat16 r7, #8, r7
+ strb r7, [r1, #6]
+ mov r7, r7, lsr #16
+ strb r7, [r1, #7]
+ stmia r0!, {r8-r11} @ zero this row of the block, advance r0
+ add r1, r1, r2 @ dest += stride
+ cmp r0, r12
+ blo 1b
+
+ ldmfd sp!, {r4-r12, pc}
+
+ @ mpeg2_idct_add: add the IDCT of the block to the destination
+ @ bitmap with saturation, clearing the block to zero behind itself.
+ @ In: r0 = last, r1 = block (int16_t[64]), r2 = dest (u8*),
+ @ r3 = row stride. (Presumably matches libmpeg2's
+ @ mpeg2_idct_add(int last, int16_t *block, uint8_t *dest, int stride)
+ @ -- TODO confirm against mpeg2dec headers.)
+ @ Full path (1:/2:): runs .idct, then per row adds pixel pairs with
+ @ sadd16, clamps with usat16 and stores, zeroing the block as it goes.
+ @ DC shortcut (3:/4:): taken when last == 129 and
+ @ (block[0] & 0x70) != 0x40 -- adds the rounded DC term to every
+ @ pixel without a full transform.
+mpeg2_idct_add:
+ cmp r0, #129
+ mov r0, r1 @ r0 = block (frees r1; .idct wants block in r0)
+ ldreqsh r1, [r0, #0] @ if last == 129: r1 = block[0]
+ bne 1f @ last != 129 -> full IDCT
+ and r1, r1, #0x70
+ cmp r1, #0x40
+ bne 3f @ DC-only shortcut
+1:
+ stmfd sp!, {r2-r12, lr}
+ bl .idct
+ ldmfd sp!, {r1-r2} @ r1 = dest, r2 = stride (saved r2/r3)
+ mov r11, #0 @ zero word for clearing the block
+ add r12, r0, #128 @ loop limit
+2:
+ ldmia r0, {r3-r6} @ 8 coefficients of this row
+ ldrb r7, [r1, #0]
+ ldrb r8, [r1, #1]
+ ldrb r9, [r1, #2]
+ ldrb r10, [r1, #3]
+ str r11, [r0], #4 @ clear coeffs just consumed
+ orr r7, r7, r8, lsl #16 @ pack two pixels into halfword lanes
+ sadd16 r3, r3, r7 @ pixel += coefficient (per lane)
+ usat16 r3, #8, r3 @ clamp to 0..255
+ strb r3, [r1, #0]
+ mov r3, r3, lsr #16
+ strb r3, [r1, #1]
+ str r11, [r0], #4
+ orr r9, r9, r10, lsl #16
+ sadd16 r4, r4, r9
+ usat16 r4, #8, r4
+ strb r4, [r1, #2]
+ mov r4, r4, lsr #16
+ strb r4, [r1, #3]
+ ldrb r7, [r1, #4]
+ ldrb r8, [r1, #5]
+ ldrb r9, [r1, #6]
+ ldrb r10, [r1, #7]
+ str r11, [r0], #4
+ orr r7, r7, r8, lsl #16
+ sadd16 r5, r5, r7
+ usat16 r5, #8, r5
+ strb r5, [r1, #4]
+ mov r5, r5, lsr #16
+ strb r5, [r1, #5]
+ str r11, [r0], #4
+ orr r9, r9, r10, lsl #16
+ sadd16 r6, r6, r9
+ usat16 r6, #8, r6
+ strb r6, [r1, #6]
+ mov r6, r6, lsr #16
+ strb r6, [r1, #7]
+ add r1, r1, r2 @ dest += stride
+ cmp r0, r12
+ blo 2b
+ ldmfd sp!, {r4-r12, pc}
+
+ @ DC-only path: r1 already holds block[0] (sign-extended)
+3:
+ stmfd sp!, {r4-r7}
+ ldrsh r1, [r0, #0] /* r1 = block[0] */
+ mov r11, #0 @ NOTE(review): r11 is clobbered here but only
+ @ r4-r7 were saved -- r11 is callee-saved under
+ @ AAPCS; confirm this is intentional/safe here.
+ strh r11, [r0, #0] /* block[0] = 0 */
+ strh r11, [r0, #126] /* block[63] = 0 */
+ add r1, r1, #64 /* r1 = DC << 7 */
+ add r0, r2, r3, asl #3 @ r0 = dest + 8*stride (loop limit)
+4:
+ ldrb r4, [r2, #0]
+ ldrb r5, [r2, #1]
+ ldrb r6, [r2, #2]
+ ldrb r7, [r2, #3]
+ add r4, r4, r1, asr #7 @ pixel += rounded DC
+ usat r4, #8, r4 @ clamp to 0..255
+ strb r4, [r2, #0]
+ add r5, r5, r1, asr #7
+ usat r5, #8, r5
+ strb r5, [r2, #1]
+ add r6, r6, r1, asr #7
+ usat r6, #8, r6
+ strb r6, [r2, #2]
+ add r7, r7, r1, asr #7
+ usat r7, #8, r7
+ strb r7, [r2, #3]
+ ldrb r4, [r2, #4]
+ ldrb r5, [r2, #5]
+ ldrb r6, [r2, #6]
+ ldrb r7, [r2, #7]
+ add r4, r4, r1, asr #7
+ usat r4, #8, r4
+ strb r4, [r2, #4]
+ add r5, r5, r1, asr #7
+ usat r5, #8, r5
+ strb r5, [r2, #5]
+ add r6, r6, r1, asr #7
+ usat r6, #8, r6
+ strb r6, [r2, #6]
+ add r7, r7, r1, asr #7
+ usat r7, #8, r7
+ strb r7, [r2, #7]
+ add r2, r2, r3 @ dest += stride
+ cmp r2, r0
+ blo 4b
+ ldmfd sp!, {r4-r7}
+ bx lr