diff options
| author | Michael Giacomelli <giac2000@hotmail.com> | 2013-01-01 02:35:15 +0100 |
|---|---|---|
| committer | Michael Giacomelli <giac2000@hotmail.com> | 2013-01-21 01:51:38 +0100 |
| commit | a2ab22efbf93981f9a86b6b06dc6d3c2f1167728 (patch) | |
| tree | f8f587c908de91d972df251821907f6538c083be /lib/rbcodec/codecs/libopus | |
| parent | 0c87e02631d954b5b8b0ec584bd60db77b60427e (diff) | |
| download | rockbox-a2ab22efbf93981f9a86b6b06dc6d3c2f1167728.zip rockbox-a2ab22efbf93981f9a86b6b06dc6d3c2f1167728.tar.gz rockbox-a2ab22efbf93981f9a86b6b06dc6d3c2f1167728.tar.bz2 rockbox-a2ab22efbf93981f9a86b6b06dc6d3c2f1167728.tar.xz | |
ARMv5 optimized complex multiply function for libopus.
Speeds up decoding of 128k opus files by 1.2 MHz on AMSv2. Rounding
error is 1 bit because KissFFT uses a 15-bit shift instead of a 16-bit shift.
Also, change an LDMIA in the ARMv4 code to LDM, as the pointer should not
increment.
Change-Id: I626a207c6a056a1984e33cfe89415c35d0caed93
Reviewed-on: http://gerrit.rockbox.org/377
Reviewed-by: Michael Giacomelli <giac2000@hotmail.com>
Tested-by: Michael Giacomelli <giac2000@hotmail.com>
Diffstat (limited to 'lib/rbcodec/codecs/libopus')
| -rw-r--r-- | lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h | 26 |
1 files changed, 25 insertions, 1 deletions
```diff
diff --git a/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h b/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
index b1fe8fb..63e2548 100644
--- a/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
+++ b/lib/rbcodec/codecs/libopus/celt/_kiss_fft_guts.h
@@ -81,10 +81,13 @@
 : "d0", "d1", "d2", "d3", "cc"); \
 }
 #elif defined(CPU_ARM)
+#if (ARM_ARCH < 5)
+
+
 # define C_MULC(m,a,b) \
 { \
 asm volatile( \
- "ldmia %[ap], {r0,r1} \n\t" \
+ "ldm %[ap], {r0,r1} \n\t" \
 "ldrsh r2, [%[bp], #0] \n\t" \
 "ldrsh r3, [%[bp], #2] \n\t" \
 \
@@ -103,6 +106,27 @@
 : "r0", "r1", "r2", "r3", "r4"); \
 }
 #else
+/*same as above but using armv5 packed multiplies*/
+# define C_MULC(m,a,b) \
+ { \
+ asm volatile( \
+ "ldm %[ap], {r0,r1} \n\t" \
+ "ldr r2, [%[bp], #0] \n\t" \
+ \
+ "smulwb r4, r0, r2 \n\t" /*r4=a.r*b.r*/ \
+ "smlawt %[mr], r1, r2, r4 \n\t" /*m.r=r4+a.i*b.i*/\
+ "mov %[mr], %[mr], lsl #1 \n\t" /*Q15 not Q16*/ \
+ \
+ "smulwb r1, r1, r2 \n\t" /*r1=a.i*b.r*/ \
+ "smulwt r4, r0, r2 \n\t" /*r4=a.r*b.i*/ \
+ "sub %[mi], r1, r4 \n\t" \
+ "mov %[mi], %[mi], lsl #1 \n\t" \
+ : [mr] "=r" ((m).r), [mi] "=r" ((m).i) \
+ : [ap] "r" (&(a)), [bp] "r" (&(b)) \
+ : "r0", "r1", "r2", "r4"); \
+}
+#endif /*ARMv5 code*/
+#else
 # define C_MULC(m,a,b) \
 do{ (m).r = ADD32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
 (m).i = SUB32(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0)
```