From 54903eb7fa4db8146f3412a4efd883d932545b75 Mon Sep 17 00:00:00 2001 From: Michael Giacomelli Date: Sun, 10 Aug 2008 20:19:38 +0000 Subject: Use WMA windowing optimizations for AAC. Saves about 3.5MHz on Coldfire, and about 2 MHz on ARM. Thanks to amiconn for help with Coldfire ASM improvements. Next step: dump faad IMDCT. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@18238 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libfaad/filtbank.c | 184 ++++++++++++++++++++++++++++++++--------- 1 file changed, 146 insertions(+), 38 deletions(-) (limited to 'apps/codecs/libfaad') diff --git a/apps/codecs/libfaad/filtbank.c b/apps/codecs/libfaad/filtbank.c index 9b1bc85..333bad6 100644 --- a/apps/codecs/libfaad/filtbank.c +++ b/apps/codecs/libfaad/filtbank.c @@ -43,6 +43,131 @@ #include "sine_win.h" #include "mdct.h" +/* Windowing functions borrowed from libwmai. vector_fmul_add_add computes dst[i] = src2[i] + src0[i]*src1[i]; vector_fmul_reverse computes dst[i] = src0[i]*src1[len-1-i], i.e. src1 is read back to front. The ARM versions keep the high word of the 64-bit product shifted left by one; the Coldfire versions use the EMAC accumulators, whose mode is presumably configured elsewhere (TODO confirm). The asm loops test the counter only after the first pass and step by 2 (ARM) or 4 (Coldfire) elements, so len must be a positive multiple of that step. */ + +#ifdef CPU_ARM +static inline +void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len) +{ + /* Block sizes are always power of two */ + asm volatile ( + "0:" + "ldmia %[d]!, {r0, r1};" + "ldmia %[w]!, {r4, r5};" + /* consume the first data and window value so we can use those + * registers again */ + "smull r8, r9, r0, r4;" + "ldmia %[src2]!, {r0, r4};" + "add r0, r0, r9, lsl #1;" /* r0 = src2[0] + (r9<<1); dst itself is never read, only written by the stmia below */ + "smull r8, r9, r1, r5;" + "add r1, r4, r9, lsl #1;" + "stmia %[dst]!, {r0, r1};" + "subs %[n], %[n], #2;" + "bne 0b;" + : [d] "+r" (src0), [w] "+r" (src1), [src2] "+r" (src2), [dst] "+r" (dst), [n] "+r" (len) + : + : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc"); +} +static inline +void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1, + int len) +{ + /* Block sizes are always power of two */ + asm volatile ( + "add %[s1], %[s1], %[n], lsl #2;" + "0:" + "ldmia %[s0]!, {r0, r1};" + "ldmdb %[s1]!, {r4, r5};" + "smull r8, r9, r0, r5;" + "mov r0, r9, lsl #1;" + "smull r8, r9, r1, r4;" + "mov r1, r9, lsl #1;" + "stmia %[dst]!, 
{r0, r1};" + "subs %[n], %[n], #2;" + "bne 0b;" + : [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len) + : + : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc"); +} + +#elif defined(CPU_COLDFIRE) +static inline +void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len) +{ + /* Block sizes are always power of two. Smallest block is always way bigger + * than four too.*/ + asm volatile ( + "0:" + "movem.l (%[src0]), %%d0-%%d3;" + "movem.l (%[src1]), %%d4-%%d5/%%a0-%%a1;" + "mac.l %%d0, %%d4, %%acc0;" + "mac.l %%d1, %%d5, %%acc1;" + "mac.l %%d2, %%a0, %%acc2;" + "mac.l %%d3, %%a1, %%acc3;" + "lea.l (16, %[src0]), %[src0];" + "lea.l (16, %[src1]), %[src1];" + "movclr.l %%acc0, %%d0;" + "movclr.l %%acc1, %%d1;" + "movclr.l %%acc2, %%d2;" + "movclr.l %%acc3, %%d3;" + "movem.l (%[src2]), %%d4-%%d5/%%a0-%%a1;" + "lea.l (16, %[src2]), %[src2];" + "add.l %%d4, %%d0;" + "add.l %%d5, %%d1;" + "add.l %%a0, %%d2;" + "add.l %%a1, %%d3;" + "movem.l %%d0-%%d3, (%[dst]);" + "lea.l (16, %[dst]), %[dst];" + "subq.l #4, %[n];" + "jne 0b;" + : [src0] "+a" (src0), [src1] "+a" (src1), [src2] "+a" (src2), [dst] "+a" (dst), [n] "+d" (len) + : + : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc"); +} + +static inline +void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1, + int len) +{ + /* Block sizes are always power of two. 
Smallest block is always way bigger + * than four too.*/ + asm volatile ( + "lea.l (-16, %[s1], %[n]*4), %[s1];" + "0:" + "movem.l (%[s0]), %%d0-%%d3;" + "movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;" + "mac.l %%d0, %%a1, %%acc0;" + "mac.l %%d1, %%a0, %%acc1;" + "mac.l %%d2, %%d5, %%acc2;" + "mac.l %%d3, %%d4, %%acc3;" + "lea.l (16, %[s0]), %[s0];" + "lea.l (-16, %[s1]), %[s1];" + "movclr.l %%acc0, %%d0;" + "movclr.l %%acc1, %%d1;" + "movclr.l %%acc2, %%d2;" + "movclr.l %%acc3, %%d3;" + "movem.l %%d0-%%d3, (%[dst]);" + "lea.l (16, %[dst]), %[dst];" + "subq.l #4, %[n];" + "jne 0b;" + : [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len) + : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc"); +} + +#else /* NOTE(review): the portable C fallback that follows is truncated in this copy of the patch; the text after the start of its first for loop is missing up to the faad_imdct calls of a later hunk. Restore it from the upstream commit before applying. */ +static inline void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len){ + int i; + for(i=0; imdct256, freq_in+0*nshort, transf_buf+2*nshort*0); faad_imdct(fb->mdct256, freq_in+1*nshort, transf_buf+2*nshort*1); @@ -275,7 +384,7 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, /* add second half output of previous frame to windowed output of current frame */ for (i = 0; i < nflat_ls; i++) - time_out[i] = overlap[i]; + time_out[i] = overlap[i]; for(i = 0; i < nshort; i++) { time_out[nflat_ls+ i] = overlap[nflat_ls+ i] + MUL_F(transf_buf[nshort*0+i],window_short_prev[i]); @@ -296,8 +405,8 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, overlap[nflat_ls+7*nshort+i-nlong] = MUL_F(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*14+i],window_short[i]); overlap[nflat_ls+8*nshort+i-nlong] = MUL_F(transf_buf[nshort*15+i],window_short[nshort-1-i]); } - for (i = 0; i < nflat_ls; i++) - overlap[nflat_ls+nshort+i] = 0; + memset(overlap+nflat_ls+nshort, 0, nflat_ls*sizeof(real_t)); + break; case LONG_STOP_SEQUENCE: @@ -306,17 +415,16 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, /* 
add second half output of previous frame to windowed output of current frame */ /* construct first half window using padding with 1's and 0's */ - for (i = 0; i < nflat_ls; i++) - time_out[i] = overlap[i]; - for (i = 0; i < nshort; i++) - time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]); + memcpy(time_out, overlap, nflat_ls*sizeof(real_t)); + + vector_fmul_add_add(time_out+nflat_ls, transf_buf+nflat_ls, window_short_prev, overlap+nflat_ls, nshort); + for (i = 0; i < nflat_ls; i++) time_out[nflat_ls+nshort+i] = overlap[nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i]; /* window the second half and save as overlap for next frame */ - for (i = 0; i < nlong; i++) - overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]); - break; + vector_fmul_reverse(overlap, transf_buf+nlong, window_long, nlong); + break; } #if 0 -- cgit v1.1