summary | refs | log | tree | commit | diff
path: root/apps/codecs
diff options
context:
space:
mode:
Diffstat (limited to 'apps/codecs')
-rw-r--r-- apps/codecs/libmusepack/synth_filter.c 13
-rwxr-xr-x apps/codecs/libmusepack/synth_filter_arm.S 123
2 files changed, 73 insertions, 63 deletions
diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c
index ae94741..9c8d27e 100644
--- a/apps/codecs/libmusepack/synth_filter.c
+++ b/apps/codecs/libmusepack/synth_filter.c
@@ -54,9 +54,16 @@
// in this configuration a post-shift by >>1 is needed after synthesis
#else
- // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17
- #define D(value) (value << (14))
-
+ #if defined(CPU_ARM)
+ // do not up-scale D-values to achieve higher speed in smull/smlal
+ // operations. saves ~14/8 = 1.75 cycles per multiplication
+ #define D(value) (value)
+
+ // in this configuration a post-shift by >>16 is needed after synthesis
+ #else
+ // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17
+ #define D(value) (value << (14))
+ #endif
// do not perform pre-shift
#define MPC_V_PRESHIFT(X) (X)
#endif
diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S
index 7b2d2df..8bc6bd3 100755
--- a/apps/codecs/libmusepack/synth_filter_arm.S
+++ b/apps/codecs/libmusepack/synth_filter_arm.S
@@ -26,7 +26,8 @@
*
* 2nd step within synthesis filter. Does the dewindowing.
* 32=32x32 multiplies (OPTIMIZE_FOR_SPEED)
- * Uses pre-shifted V[] and D[] values.
+ * Uses pre-shifted V[] and D[] values. D[] is always the last operand (Rs)
+ * of mul/mla to achieve higher speed, as D[] has lower amplitude than V[].
****************************************************************************/
#if defined(OPTIMIZE_FOR_SPEED)
.align 2
@@ -42,40 +43,40 @@ mpc_decoder_windowing_D:
mov lr, #32
.loop32:
- ldmia r2!, { r3-r10 } /* load first 8 window coefficients */
+ ldmia r2!, { r3-r10 } /* load D[00..07] */
ldr r11, [r1] /* 0 */
- mul r12, r3, r11
+ mul r12, r11, r3
ldr r11, [r1, #96*4] /* 1 */
- mla r12, r4, r11, r12
+ mla r12, r11, r4, r12
ldr r11, [r1, #128*4] /* 2 */
- mla r12, r5, r11, r12
+ mla r12, r11, r5, r12
ldr r11, [r1, #224*4] /* 3 */
- mla r12, r6, r11, r12
+ mla r12, r11, r6, r12
ldr r11, [r1, #256*4] /* 4 */
- mla r12, r7, r11, r12
+ mla r12, r11, r7, r12
ldr r11, [r1, #352*4] /* 5 */
- mla r12, r8, r11, r12
+ mla r12, r11, r8, r12
ldr r11, [r1, #384*4] /* 6 */
- mla r12, r9, r11, r12
+ mla r12, r11, r9, r12
ldr r11, [r1, #480*4] /* 7 */
- mla r12, r10, r11, r12
- ldmia r2!, { r3-r10 } /* load last 8 window coefficients */
+ mla r12, r11, r10, r12
+ ldmia r2!, { r3-r10 } /* load D[08..15] */
ldr r11, [r1, #512*4] /* 8 */
- mla r12, r3, r11, r12
+ mla r12, r11, r3, r12
ldr r11, [r1, #608*4] /* 9 */
- mla r12, r4, r11, r12
+ mla r12, r11, r4, r12
ldr r11, [r1, #640*4] /* 10 */
- mla r12, r5, r11, r12
+ mla r12, r11, r5, r12
ldr r11, [r1, #736*4] /* 11 */
- mla r12, r6, r11, r12
+ mla r12, r11, r6, r12
ldr r11, [r1, #768*4] /* 12 */
- mla r12, r7, r11, r12
+ mla r12, r11, r7, r12
ldr r11, [r1, #864*4] /* 13 */
- mla r12, r8, r11, r12
+ mla r12, r11, r8, r12
ldr r11, [r1, #896*4] /* 14 */
- mla r12, r9, r11, r12
+ mla r12, r11, r9, r12
ldr r11, [r1, #992*4] /* 15 */
- mla r12, r10, r11, r12
+ mla r12, r11, r10, r12
mov r12, r12, asr #1 /* post shift to compensate for pre-shifting */
str r12, [r0], #4 /* store Data */
add r1, r1, #4 /* V++ */
@@ -92,9 +93,8 @@ mpc_decoder_windowing_D:
*
* 2nd step within synthesis filter. Does the dewindowing.
* 64=32x32 multiplies
- * Drops lo-part of 64bit multiply results and will therefor loose 1 bit
- * accuracy. The decoder output is binary identical as this imprecision is
- * far below the output's 16bit resolution.
+ * Uses un-shifted D[]-values. D[] will always be the second operand of
+ * smull/smlal to achieve higher speed as D[] has lower amplitude than V[].
****************************************************************************/
.align 2
.global mpc_decoder_windowing_D
@@ -105,52 +105,55 @@ mpc_decoder_windowing_D:
/* r2 = D[] */
/* lr = counter */
- stmfd sp!, {r4-r12, lr}
+ stmfd sp!, {r4-r9, lr}
mov lr, #32
.loop32:
- ldmia r2!, { r3-r10 } /* load first 8 window coefficients */
- ldr r11, [r1] /* 0 */
- smull r11, r12, r3, r11
- ldr r11, [r1, #96*4] /* 1 */
- smlal r11, r12, r4, r11
- ldr r11, [r1, #128*4] /* 2 */
- smlal r11, r12, r5, r11
- ldr r11, [r1, #224*4] /* 3 */
- smlal r11, r12, r6, r11
- ldr r11, [r1, #256*4] /* 4 */
- smlal r11, r12, r7, r11
- ldr r11, [r1, #352*4] /* 5 */
- smlal r11, r12, r8, r11
- ldr r11, [r1, #384*4] /* 6 */
- smlal r11, r12, r9, r11
- ldr r11, [r1, #480*4] /* 7 */
- smlal r11, r12, r10, r11
- ldmia r2!, { r3-r10 } /* load last 8 window coefficients */
- ldr r11, [r1, #512*4] /* 8 */
- smlal r11, r12, r3, r11
- ldr r11, [r1, #608*4] /* 9 */
- smlal r11, r12, r4, r11
- ldr r11, [r1, #640*4] /* 10 */
- smlal r11, r12, r5, r11
- ldr r11, [r1, #736*4] /* 11 */
- smlal r11, r12, r6, r11
- ldr r11, [r1, #768*4] /* 12 */
- smlal r11, r12, r7, r11
- ldr r11, [r1, #864*4] /* 13 */
- smlal r11, r12, r8, r11
- ldr r11, [r1, #896*4] /* 14 */
- smlal r11, r12, r9, r11
- ldr r11, [r1, #992*4] /* 15 */
- smlal r11, r12, r10, r11
- mov r4, r12, lsl #2 /* get result from hi-part, loose 2 bits */
- str r4, [r0], #4 /* store Data */
+ ldmia r2!, { r3-r6 } /* load D[00..03] */
+ ldr r7, [r1] /* 0 */
+ smull r8, r9, r7, r3
+ ldr r7, [r1, #96*4] /* 1 */
+ smlal r8, r9, r7, r4
+ ldr r7, [r1, #128*4] /* 2 */
+ smlal r8, r9, r7, r5
+ ldr r7, [r1, #224*4] /* 3 */
+ smlal r8, r9, r7, r6
+ ldmia r2!, { r3-r6 } /* load D[04..07] */
+ ldr r7, [r1, #256*4] /* 4 */
+ smlal r8, r9, r7, r3
+ ldr r7, [r1, #352*4] /* 5 */
+ smlal r8, r9, r7, r4
+ ldr r7, [r1, #384*4] /* 6 */
+ smlal r8, r9, r7, r5
+ ldr r7, [r1, #480*4] /* 7 */
+ smlal r8, r9, r7, r6
+ ldmia r2!, { r3-r6 } /* load D[08..11] */
+ ldr r7, [r1, #512*4] /* 8 */
+ smlal r8, r9, r7, r3
+ ldr r7, [r1, #608*4] /* 9 */
+ smlal r8, r9, r7, r4
+ ldr r7, [r1, #640*4] /* 10 */
+ smlal r8, r9, r7, r5
+ ldr r7, [r1, #736*4] /* 11 */
+ smlal r8, r9, r7, r6
+ ldmia r2!, { r3-r6 } /* load D[12..15] */
+ ldr r7, [r1, #768*4] /* 12 */
+ smlal r8, r9, r7, r3
+ ldr r7, [r1, #864*4] /* 13 */
+ smlal r8, r9, r7, r4
+ ldr r7, [r1, #896*4] /* 14 */
+ smlal r8, r9, r7, r5
+ ldr r7, [r1, #992*4] /* 15 */
+ smlal r8, r9, r7, r6
+ mov r8, r8, lsr #16
+ orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */
+ str r8, [r0], #4 /* store Data */
add r1, r1, #4 /* V++ */
subs lr, lr, #1
bgt .loop32
- ldmfd sp!, {r4-r12, pc}
+ ldmfd sp!, {r4-r9, pc}
.mpc_dewindowing_end:
.size mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D
#endif