summaryrefslogtreecommitdiff
path: root/apps/codecs
diff options
context:
space:
mode:
authorMichael Giacomelli <giac2000@hotmail.com>2008-08-10 20:19:38 +0000
committerMichael Giacomelli <giac2000@hotmail.com>2008-08-10 20:19:38 +0000
commit54903eb7fa4db8146f3412a4efd883d932545b75 (patch)
treec9ff7a0e77aa864930a96bed4dbf58939637cf29 /apps/codecs
parent32694a7040f6aa4ff7160d8ba85c36593ef49e71 (diff)
downloadrockbox-54903eb7fa4db8146f3412a4efd883d932545b75.zip
rockbox-54903eb7fa4db8146f3412a4efd883d932545b75.tar.gz
rockbox-54903eb7fa4db8146f3412a4efd883d932545b75.tar.bz2
rockbox-54903eb7fa4db8146f3412a4efd883d932545b75.tar.xz
Use WMA windowing optimizations for AAC. Saves about 3.5MHz on Coldfire, and about 2 MHz on ARM. Thanks to amiconn for help with Coldfire ASM improvements. Next step: dump faad IMDCT.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@18238 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs')
-rw-r--r--apps/codecs/libfaad/filtbank.c184
1 files changed, 146 insertions, 38 deletions
diff --git a/apps/codecs/libfaad/filtbank.c b/apps/codecs/libfaad/filtbank.c
index 9b1bc85..333bad6 100644
--- a/apps/codecs/libfaad/filtbank.c
+++ b/apps/codecs/libfaad/filtbank.c
@@ -43,6 +43,131 @@
#include "sine_win.h"
#include "mdct.h"
+/*Windowing functions borrowed from libwmai*/
+
+#ifdef CPU_ARM
+static inline
+void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len)
+{
+ /* ARM version: for each element,
+ * dst[i] = src2[i] + ((high 32 bits of src0[i] * src1[i]) << 1).
+ * The low word of each 64-bit product (r8) is not used.
+ * Two elements are handled per loop iteration and all four pointers
+ * are advanced by the asm itself.
+ * Block sizes are always power of two; the loop only terminates when
+ * len, decremented by 2 each pass, reaches exactly zero, so len must be
+ * a non-zero multiple of 2. */
+ asm volatile (
+ "0:"
+ "ldmia %[d]!, {r0, r1};" /* r0, r1 = next two src0 values */
+ "ldmia %[w]!, {r4, r5};" /* r4, r5 = next two src1 (window) values */
+ /* consume the first data and window value so we can use those
+ * registers again */
+ "smull r8, r9, r0, r4;" /* r9:r8 = r0 * r4, signed 64-bit */
+ "ldmia %[src2]!, {r0, r4};" /* r0, r4 = next two src2 values */
+ "add r0, r0, r9, lsl #1;" /* r0 = first src2 value + (r9 << 1) */
+ "smull r8, r9, r1, r5;" /* r9:r8 = r1 * r5, signed 64-bit */
+ "add r1, r4, r9, lsl #1;" /* r1 = second src2 value + (r9 << 1) */
+ "stmia %[dst]!, {r0, r1};" /* store both results and advance dst */
+ "subs %[n], %[n], #2;" /* two elements done; updates the flags */
+ "bne 0b;" /* repeat until the count reaches zero */
+ : [d] "+r" (src0), [w] "+r" (src1), [src2] "+r" (src2), [dst] "+r" (dst), [n] "+r" (len)
+ :
+ : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
+}
+static inline
+void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1,
+ int len)
+{
+ /* ARM version: src0 is read forwards and src1 backwards, i.e.
+ * dst[i] = (high 32 bits of src0[i] * src1[len-1-i]) << 1.
+ * The low word of each 64-bit product (r8) is not used.
+ * Two elements are handled per loop iteration.
+ * Block sizes are always power of two; the loop only terminates when
+ * len, decremented by 2 each pass, reaches exactly zero, so len must be
+ * a non-zero multiple of 2. */
+ asm volatile (
+ "add %[s1], %[s1], %[n], lsl #2;" /* s1 += len * 4 bytes: one past the last word of src1 */
+ "0:"
+ "ldmia %[s0]!, {r0, r1};" /* r0, r1 = next two src0 values */
+ "ldmdb %[s1]!, {r4, r5};" /* step s1 back two words; r5 holds the later (higher-address) one */
+ "smull r8, r9, r0, r5;" /* r9:r8 = r0 * r5, signed 64-bit */
+ "mov r0, r9, lsl #1;" /* r0 = high word << 1 */
+ "smull r8, r9, r1, r4;" /* r9:r8 = r1 * r4, signed 64-bit */
+ "mov r1, r9, lsl #1;" /* r1 = high word << 1 */
+ "stmia %[dst]!, {r0, r1};" /* store both results and advance dst */
+ "subs %[n], %[n], #2;" /* two elements done; updates the flags */
+ "bne 0b;" /* repeat until the count reaches zero */
+ : [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len)
+ :
+ : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc");
+}
+
+#elif defined(CPU_COLDFIRE)
+static inline
+void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len)
+{
+ /* Coldfire version: four products src0[i] * src1[i] are formed per pass
+ * in the MAC accumulators acc0-acc3, read back into d0-d3, added to
+ * src2[i] and stored to dst[i].
+ * NOTE(review): relies on the accumulators being clear on entry and on
+ * the MAC unit mode having been configured elsewhere; neither is
+ * visible here -- confirm against the codec setup code.
+ * Block sizes are always power of two. Smallest block is always way bigger
+ * than four too. The loop counts len down by 4 and only exits at exactly
+ * zero, so len must be a non-zero multiple of 4. */
+ asm volatile (
+ "0:"
+ "movem.l (%[src0]), %%d0-%%d3;" /* d0-d3 = next four src0 values */
+ "movem.l (%[src1]), %%d4-%%d5/%%a0-%%a1;" /* d4, d5, a0, a1 = next four src1 values */
+ "mac.l %%d0, %%d4, %%acc0;" /* one product per accumulator */
+ "mac.l %%d1, %%d5, %%acc1;"
+ "mac.l %%d2, %%a0, %%acc2;"
+ "mac.l %%d3, %%a1, %%acc3;"
+ "lea.l (16, %[src0]), %[src0];" /* advance src0 by four longs */
+ "lea.l (16, %[src1]), %[src1];" /* advance src1 by four longs */
+ "movclr.l %%acc0, %%d0;" /* fetch each product and clear its accumulator */
+ "movclr.l %%acc1, %%d1;"
+ "movclr.l %%acc2, %%d2;"
+ "movclr.l %%acc3, %%d3;"
+ "movem.l (%[src2]), %%d4-%%d5/%%a0-%%a1;" /* d4, d5, a0, a1 = next four src2 values */
+ "lea.l (16, %[src2]), %[src2];" /* advance src2 by four longs */
+ "add.l %%d4, %%d0;" /* product + src2 value */
+ "add.l %%d5, %%d1;"
+ "add.l %%a0, %%d2;"
+ "add.l %%a1, %%d3;"
+ "movem.l %%d0-%%d3, (%[dst]);" /* store the four results */
+ "lea.l (16, %[dst]), %[dst];" /* advance dst by four longs */
+ "subq.l #4, %[n];" /* four elements done */
+ "jne 0b;" /* repeat until the count reaches zero */
+ : [src0] "+a" (src0), [src1] "+a" (src1), [src2] "+a" (src2), [dst] "+a" (dst), [n] "+d" (len)
+ :
+ : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
+}
+
+static inline
+void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1,
+ int len)
+{
+ /* Coldfire version: src0 is read forwards and src1 backwards, four
+ * elements per pass, so dst[i] receives the MAC product of src0[i] and
+ * src1[len-1-i].
+ * NOTE(review): relies on the accumulators being clear on entry and on
+ * the MAC unit mode having been configured elsewhere; neither is
+ * visible here -- confirm against the codec setup code.
+ * Block sizes are always power of two. Smallest block is always way bigger
+ * than four too. The loop counts len down by 4 and only exits at exactly
+ * zero, so len must be a non-zero multiple of 4. */
+ asm volatile (
+ "lea.l (-16, %[s1], %[n]*4), %[s1];" /* s1 = s1 + len*4 - 16: the last four longs of src1 */
+ "0:"
+ "movem.l (%[s0]), %%d0-%%d3;" /* d0-d3 = next four src0 values */
+ "movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;" /* d4, d5, a0, a1 = four src1 values, ascending addresses */
+ "mac.l %%d0, %%a1, %%acc0;" /* pair src0 with src1 in reversed order */
+ "mac.l %%d1, %%a0, %%acc1;"
+ "mac.l %%d2, %%d5, %%acc2;"
+ "mac.l %%d3, %%d4, %%acc3;"
+ "lea.l (16, %[s0]), %[s0];" /* advance src0 by four longs */
+ "lea.l (-16, %[s1]), %[s1];" /* step src1 back by four longs */
+ "movclr.l %%acc0, %%d0;" /* fetch each product and clear its accumulator */
+ "movclr.l %%acc1, %%d1;"
+ "movclr.l %%acc2, %%d2;"
+ "movclr.l %%acc3, %%d3;"
+ "movem.l %%d0-%%d3, (%[dst]);" /* store the four results */
+ "lea.l (16, %[dst]), %[dst];" /* advance dst by four longs */
+ "subq.l #4, %[n];" /* four elements done */
+ "jne 0b;" /* repeat until the count reaches zero */
+ : [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len)
+ : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc");
+}
+
+#else
+/* Portable fallback: dst[k] = MUL_F(src0[k], src1[k]) + src2[k] for every
+ * k in [0, len). Nothing is written when len <= 0. */
+static inline void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len)
+{
+    int k = 0;
+
+    while (k < len) {
+        dst[k] = MUL_F(src0[k], src1[k]) + src2[k];
+        k++;
+    }
+}
+
+/* Portable fallback: multiplies src0, read forwards, by src1, read backwards
+ * from its last element: dst[k] = MUL_F(src0[k], src1[len-1-k]) for every
+ * k in [0, len). */
+static inline void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1, int len)
+{
+    const real_t *tail = src1 + (len - 1); /* last element of src1 */
+    int k;
+
+    for (k = 0; k < len; ++k) {
+        dst[k] = MUL_F(src0[k], tail[-k]);
+    }
+}
+#endif
fb_info *filter_bank_init(uint16_t frame_len)
{
@@ -213,7 +338,6 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
#if 0
printf("%d %d\n", window_sequence, window_shape);
#endif
-
switch (window_sequence)
{
case ONLY_LONG_SEQUENCE:
@@ -221,22 +345,11 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
imdct_long(fb, freq_in, transf_buf, 2*nlong);
/* add second half output of previous frame to windowed output of current frame */
- for (i = 0; i < nlong; i+=4)
- {
- time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]);
- time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]);
- time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]);
- time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]);
- }
+ vector_fmul_add_add(time_out, transf_buf, window_long_prev, overlap, nlong);
/* window the second half and save as overlap for next frame */
- for (i = 0; i < nlong; i+=4)
- {
- overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]);
- overlap[i+1] = MUL_F(transf_buf[nlong+i+1],window_long[nlong-2-i]);
- overlap[i+2] = MUL_F(transf_buf[nlong+i+2],window_long[nlong-3-i]);
- overlap[i+3] = MUL_F(transf_buf[nlong+i+3],window_long[nlong-4-i]);
- }
+ vector_fmul_reverse(overlap, transf_buf+nlong, window_long, nlong);
+
break;
case LONG_START_SEQUENCE:
@@ -244,25 +357,21 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
imdct_long(fb, freq_in, transf_buf, 2*nlong);
/* add second half output of previous frame to windowed output of current frame */
- for (i = 0; i < nlong; i+=4)
- {
- time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]);
- time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]);
- time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]);
- time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]);
- }
+ vector_fmul_add_add(time_out, transf_buf, window_long_prev, overlap, nlong);
/* window the second half and save as overlap for next frame */
/* construct second half window using padding with 1's and 0's */
- for (i = 0; i < nflat_ls; i++)
- overlap[i] = transf_buf[nlong+i];
- for (i = 0; i < nshort; i++)
- overlap[nflat_ls+i] = MUL_F(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]);
- for (i = 0; i < nflat_ls; i++)
- overlap[nflat_ls+nshort+i] = 0;
+
+ memcpy(overlap, transf_buf+nlong, nflat_ls*sizeof(real_t));
+
+ vector_fmul_reverse(overlap+nflat_ls, transf_buf+nlong+nflat_ls, window_short, nshort);
+
+ memset(overlap+nflat_ls+nshort, 0, nflat_ls*sizeof(real_t));
break;
case EIGHT_SHORT_SEQUENCE:
+ /*this could be assemblerized too, but this case is extremely uncommon*/
+
/* perform iMDCT for each short block */
faad_imdct(fb->mdct256, freq_in+0*nshort, transf_buf+2*nshort*0);
faad_imdct(fb->mdct256, freq_in+1*nshort, transf_buf+2*nshort*1);
@@ -275,7 +384,7 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
/* add second half output of previous frame to windowed output of current frame */
for (i = 0; i < nflat_ls; i++)
- time_out[i] = overlap[i];
+ time_out[i] = overlap[i];
for(i = 0; i < nshort; i++)
{
time_out[nflat_ls+ i] = overlap[nflat_ls+ i] + MUL_F(transf_buf[nshort*0+i],window_short_prev[i]);
@@ -296,8 +405,8 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
overlap[nflat_ls+7*nshort+i-nlong] = MUL_F(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*14+i],window_short[i]);
overlap[nflat_ls+8*nshort+i-nlong] = MUL_F(transf_buf[nshort*15+i],window_short[nshort-1-i]);
}
- for (i = 0; i < nflat_ls; i++)
- overlap[nflat_ls+nshort+i] = 0;
+ memset(overlap+nflat_ls+nshort, 0, nflat_ls*sizeof(real_t));
+
break;
case LONG_STOP_SEQUENCE:
@@ -306,17 +415,16 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
/* add second half output of previous frame to windowed output of current frame */
/* construct first half window using padding with 1's and 0's */
- for (i = 0; i < nflat_ls; i++)
- time_out[i] = overlap[i];
- for (i = 0; i < nshort; i++)
- time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]);
+ memcpy(time_out, overlap, nflat_ls*sizeof(real_t));
+
+ vector_fmul_add_add(time_out+nflat_ls, transf_buf+nflat_ls, window_short_prev, overlap+nflat_ls, nshort);
+
for (i = 0; i < nflat_ls; i++)
time_out[nflat_ls+nshort+i] = overlap[nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i];
/* window the second half and save as overlap for next frame */
- for (i = 0; i < nlong; i++)
- overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]);
- break;
+ vector_fmul_reverse(overlap, transf_buf+nlong, window_long, nlong);
+ break;
}
#if 0