diff options
| author | Michael Giacomelli <giac2000@hotmail.com> | 2008-08-10 20:19:38 +0000 |
|---|---|---|
| committer | Michael Giacomelli <giac2000@hotmail.com> | 2008-08-10 20:19:38 +0000 |
| commit | 54903eb7fa4db8146f3412a4efd883d932545b75 (patch) | |
| tree | c9ff7a0e77aa864930a96bed4dbf58939637cf29 /apps/codecs | |
| parent | 32694a7040f6aa4ff7160d8ba85c36593ef49e71 (diff) | |
| download | rockbox-54903eb7fa4db8146f3412a4efd883d932545b75.zip rockbox-54903eb7fa4db8146f3412a4efd883d932545b75.tar.gz rockbox-54903eb7fa4db8146f3412a4efd883d932545b75.tar.bz2 rockbox-54903eb7fa4db8146f3412a4efd883d932545b75.tar.xz | |
Use WMA windowing optimizations for AAC. Saves about 3.5 MHz on Coldfire, and about 2 MHz on ARM. Thanks to amiconn for help with Coldfire ASM improvements. Next step: dump faad IMDCT.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@18238 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs')
| -rw-r--r-- | apps/codecs/libfaad/filtbank.c | 184 |
1 files changed, 146 insertions, 38 deletions
diff --git a/apps/codecs/libfaad/filtbank.c b/apps/codecs/libfaad/filtbank.c index 9b1bc85..333bad6 100644 --- a/apps/codecs/libfaad/filtbank.c +++ b/apps/codecs/libfaad/filtbank.c @@ -43,6 +43,131 @@ #include "sine_win.h" #include "mdct.h" +/*Windowing functions borrowed from libwmai*/ + +#ifdef CPU_ARM +static inline +void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len) +{ + /* Block sizes are always power of two */ + asm volatile ( + "0:" + "ldmia %[d]!, {r0, r1};" + "ldmia %[w]!, {r4, r5};" + /* consume the first data and window value so we can use those + * registers again */ + "smull r8, r9, r0, r4;" + "ldmia %[src2]!, {r0, r4};" + "add r0, r0, r9, lsl #1;" /* *dst=*dst+(r9<<1)*/ + "smull r8, r9, r1, r5;" + "add r1, r4, r9, lsl #1;" + "stmia %[dst]!, {r0, r1};" + "subs %[n], %[n], #2;" + "bne 0b;" + : [d] "+r" (src0), [w] "+r" (src1), [src2] "+r" (src2), [dst] "+r" (dst), [n] "+r" (len) + : + : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc"); +} +static inline +void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1, + int len) +{ + /* Block sizes are always power of two */ + asm volatile ( + "add %[s1], %[s1], %[n], lsl #2;" + "0:" + "ldmia %[s0]!, {r0, r1};" + "ldmdb %[s1]!, {r4, r5};" + "smull r8, r9, r0, r5;" + "mov r0, r9, lsl #1;" + "smull r8, r9, r1, r4;" + "mov r1, r9, lsl #1;" + "stmia %[dst]!, {r0, r1};" + "subs %[n], %[n], #2;" + "bne 0b;" + : [s0] "+r" (src0), [s1] "+r" (src1), [dst] "+r" (dst), [n] "+r" (len) + : + : "r0", "r1", "r4", "r5", "r8", "r9", "memory", "cc"); +} + +#elif defined(CPU_COLDFIRE) +static inline +void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len) +{ + /* Block sizes are always power of two. 
Smallest block is always way bigger + * than four too.*/ + asm volatile ( + "0:" + "movem.l (%[src0]), %%d0-%%d3;" + "movem.l (%[src1]), %%d4-%%d5/%%a0-%%a1;" + "mac.l %%d0, %%d4, %%acc0;" + "mac.l %%d1, %%d5, %%acc1;" + "mac.l %%d2, %%a0, %%acc2;" + "mac.l %%d3, %%a1, %%acc3;" + "lea.l (16, %[src0]), %[src0];" + "lea.l (16, %[src1]), %[src1];" + "movclr.l %%acc0, %%d0;" + "movclr.l %%acc1, %%d1;" + "movclr.l %%acc2, %%d2;" + "movclr.l %%acc3, %%d3;" + "movem.l (%[src2]), %%d4-%%d5/%%a0-%%a1;" + "lea.l (16, %[src2]), %[src2];" + "add.l %%d4, %%d0;" + "add.l %%d5, %%d1;" + "add.l %%a0, %%d2;" + "add.l %%a1, %%d3;" + "movem.l %%d0-%%d3, (%[dst]);" + "lea.l (16, %[dst]), %[dst];" + "subq.l #4, %[n];" + "jne 0b;" + : [src0] "+a" (src0), [src1] "+a" (src1), [src2] "+a" (src2), [dst] "+a" (dst), [n] "+d" (len) + : + : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc"); +} + +static inline +void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1, + int len) +{ + /* Block sizes are always power of two. 
Smallest block is always way bigger + * than four too.*/ + asm volatile ( + "lea.l (-16, %[s1], %[n]*4), %[s1];" + "0:" + "movem.l (%[s0]), %%d0-%%d3;" + "movem.l (%[s1]), %%d4-%%d5/%%a0-%%a1;" + "mac.l %%d0, %%a1, %%acc0;" + "mac.l %%d1, %%a0, %%acc1;" + "mac.l %%d2, %%d5, %%acc2;" + "mac.l %%d3, %%d4, %%acc3;" + "lea.l (16, %[s0]), %[s0];" + "lea.l (-16, %[s1]), %[s1];" + "movclr.l %%acc0, %%d0;" + "movclr.l %%acc1, %%d1;" + "movclr.l %%acc2, %%d2;" + "movclr.l %%acc3, %%d3;" + "movem.l %%d0-%%d3, (%[dst]);" + "lea.l (16, %[dst]), %[dst];" + "subq.l #4, %[n];" + "jne 0b;" + : [s0] "+a" (src0), [s1] "+a" (src1), [dst] "+a" (dst), [n] "+d" (len) + : : "d0", "d1", "d2", "d3", "d4", "d5", "a0", "a1", "memory", "cc"); +} + +#else +static inline void vector_fmul_add_add(real_t *dst, const real_t *src0, const real_t *src1, const real_t *src2, int len){ + int i; + for(i=0; i<len; i++) + dst[i] = MUL_F(src0[i], src1[i]) + src2[i]; +} + +static inline void vector_fmul_reverse(real_t *dst, const real_t *src0, const real_t *src1, int len){ + int i; + src1 += len-1; + for(i=0; i<len; i++) + dst[i] = MUL_F(src0[i], src1[-i]); +} +#endif fb_info *filter_bank_init(uint16_t frame_len) { @@ -213,7 +338,6 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, #if 0 printf("%d %d\n", window_sequence, window_shape); #endif - switch (window_sequence) { case ONLY_LONG_SEQUENCE: @@ -221,22 +345,11 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, imdct_long(fb, freq_in, transf_buf, 2*nlong); /* add second half output of previous frame to windowed output of current frame */ - for (i = 0; i < nlong; i+=4) - { - time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]); - time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]); - time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]); - time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]); - } + 
vector_fmul_add_add(time_out, transf_buf, window_long_prev, overlap, nlong); /* window the second half and save as overlap for next frame */ - for (i = 0; i < nlong; i+=4) - { - overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]); - overlap[i+1] = MUL_F(transf_buf[nlong+i+1],window_long[nlong-2-i]); - overlap[i+2] = MUL_F(transf_buf[nlong+i+2],window_long[nlong-3-i]); - overlap[i+3] = MUL_F(transf_buf[nlong+i+3],window_long[nlong-4-i]); - } + vector_fmul_reverse(overlap, transf_buf+nlong, window_long, nlong); + break; case LONG_START_SEQUENCE: @@ -244,25 +357,21 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, imdct_long(fb, freq_in, transf_buf, 2*nlong); /* add second half output of previous frame to windowed output of current frame */ - for (i = 0; i < nlong; i+=4) - { - time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]); - time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]); - time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]); - time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]); - } + vector_fmul_add_add(time_out, transf_buf, window_long_prev, overlap, nlong); /* window the second half and save as overlap for next frame */ /* construct second half window using padding with 1's and 0's */ - for (i = 0; i < nflat_ls; i++) - overlap[i] = transf_buf[nlong+i]; - for (i = 0; i < nshort; i++) - overlap[nflat_ls+i] = MUL_F(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]); - for (i = 0; i < nflat_ls; i++) - overlap[nflat_ls+nshort+i] = 0; + + memcpy(overlap, transf_buf+nlong, nflat_ls*sizeof(real_t)); + + vector_fmul_reverse(overlap+nflat_ls, transf_buf+nlong+nflat_ls, window_short, nshort); + + memset(overlap+nflat_ls+nshort, 0, nflat_ls*sizeof(real_t)); break; case EIGHT_SHORT_SEQUENCE: + /*this could be assemblerized too, but this case is extremely uncommon*/ + /* perform iMDCT for each short block */ faad_imdct(fb->mdct256, 
freq_in+0*nshort, transf_buf+2*nshort*0); faad_imdct(fb->mdct256, freq_in+1*nshort, transf_buf+2*nshort*1); @@ -275,7 +384,7 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, /* add second half output of previous frame to windowed output of current frame */ for (i = 0; i < nflat_ls; i++) - time_out[i] = overlap[i]; + time_out[i] = overlap[i]; for(i = 0; i < nshort; i++) { time_out[nflat_ls+ i] = overlap[nflat_ls+ i] + MUL_F(transf_buf[nshort*0+i],window_short_prev[i]); @@ -296,8 +405,8 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, overlap[nflat_ls+7*nshort+i-nlong] = MUL_F(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*14+i],window_short[i]); overlap[nflat_ls+8*nshort+i-nlong] = MUL_F(transf_buf[nshort*15+i],window_short[nshort-1-i]); } - for (i = 0; i < nflat_ls; i++) - overlap[nflat_ls+nshort+i] = 0; + memset(overlap+nflat_ls+nshort, 0, nflat_ls*sizeof(real_t)); + break; case LONG_STOP_SEQUENCE: @@ -306,17 +415,16 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, /* add second half output of previous frame to windowed output of current frame */ /* construct first half window using padding with 1's and 0's */ - for (i = 0; i < nflat_ls; i++) - time_out[i] = overlap[i]; - for (i = 0; i < nshort; i++) - time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]); + memcpy(time_out, overlap, nflat_ls*sizeof(real_t)); + + vector_fmul_add_add(time_out+nflat_ls, transf_buf+nflat_ls, window_short_prev, overlap+nflat_ls, nshort); + for (i = 0; i < nflat_ls; i++) time_out[nflat_ls+nshort+i] = overlap[nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i]; /* window the second half and save as overlap for next frame */ - for (i = 0; i < nlong; i++) - overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]); - break; + vector_fmul_reverse(overlap, transf_buf+nlong, window_long, nlong); + break; } #if 0 |