diff options
Diffstat (limited to 'apps/codecs')
| -rw-r--r-- | apps/codecs/demac/libdemac/decoder.c | 13 | ||||
| -rw-r--r-- | apps/codecs/demac/libdemac/demac_config.h | 27 | ||||
| -rw-r--r-- | apps/codecs/demac/libdemac/filter.c | 41 | ||||
| -rw-r--r-- | apps/codecs/demac/libdemac/filter.h | 12 | ||||
| -rw-r--r-- | apps/codecs/demac/libdemac/vector_math16_arm7.h | 293 | ||||
| -rw-r--r-- | apps/codecs/demac/libdemac/vector_math32_armv4.h | 210 | ||||
| -rw-r--r-- | apps/codecs/demac/libdemac/vector_math_generic.h (renamed from apps/codecs/demac/libdemac/vector_math16.h) | 10 |
7 files changed, 280 insertions, 326 deletions
diff --git a/apps/codecs/demac/libdemac/decoder.c b/apps/codecs/demac/libdemac/decoder.c index 540db47..31bcb28 100644 --- a/apps/codecs/demac/libdemac/decoder.c +++ b/apps/codecs/demac/libdemac/decoder.c @@ -33,15 +33,16 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA /* Statically allocate the filter buffers */ -static int16_t filterbuf32[(32*3 + FILTER_HISTORY_SIZE) * 2] /* 2432 bytes */ - IBSS_ATTR __attribute__((aligned(16))); -static int16_t filterbuf256[(256*3 + FILTER_HISTORY_SIZE) * 2] /* 5120 bytes */ - IBSS_ATTR __attribute__((aligned(16))); +static filter_int filterbuf32[(32*3 + FILTER_HISTORY_SIZE) * 2] + IBSS_ATTR __attribute__((aligned(16))); /* 2432/4864 bytes */ +static filter_int filterbuf256[(256*3 + FILTER_HISTORY_SIZE) * 2] + IBSS_ATTR __attribute__((aligned(16))); /* 5120/10240 bytes */ /* This is only needed for "insane" files, and no current Rockbox targets can hope to decode them in realtime, although the Gigabeat S comes close. */ -static int16_t filterbuf1280[(1280*3 + FILTER_HISTORY_SIZE) * 2] /* 17408 bytes */ - IBSS_ATTR_DEMAC_INSANEBUF __attribute__((aligned(16))); +static filter_int filterbuf1280[(1280*3 + FILTER_HISTORY_SIZE) * 2] + IBSS_ATTR_DEMAC_INSANEBUF __attribute__((aligned(16))); + /* 17408 or 34816 bytes */ void init_frame_decoder(struct ape_ctx_t* ape_ctx, unsigned char* inbuffer, int* firstbyte, diff --git a/apps/codecs/demac/libdemac/demac_config.h b/apps/codecs/demac/libdemac/demac_config.h index 93fda76..86c2d24 100644 --- a/apps/codecs/demac/libdemac/demac_config.h +++ b/apps/codecs/demac/libdemac/demac_config.h @@ -39,12 +39,21 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA #define APE_OUTPUT_DEPTH 29 -/* On PP5002 code should go into IRAM. Otherwise put the insane - * filter buffer into IRAM as long as there is no better use. */ +/* On ARMv4, using 32 bit ints for the filters is faster. */ +#if defined(CPU_ARM) && (ARM_ARCH == 4) +#define FILTER_BITS 32 +#endif + #if CONFIG_CPU == PP5002 +/* Code in IRAM for speed, not enough IRAM for the insane filter buffer. */ #define ICODE_SECTION_DEMAC_ARM .icode #define ICODE_ATTR_DEMAC ICODE_ATTR #define IBSS_ATTR_DEMAC_INSANEBUF +#elif CONFIG_CPU == PP5020 +/* Not enough IRAM for the insane filter buffer. */ +#define ICODE_SECTION_DEMAC_ARM .text +#define ICODE_ATTR_DEMAC +#define IBSS_ATTR_DEMAC_INSANEBUF #else #define ICODE_SECTION_DEMAC_ARM .text #define ICODE_ATTR_DEMAC @@ -75,6 +84,20 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA #ifndef PREDICTOR_HISTORY_SIZE #define PREDICTOR_HISTORY_SIZE 512 +#endif + +#ifndef FILTER_BITS +#define FILTER_BITS 16 +#endif + + +#ifndef __ASSEMBLER__ +#include <inttypes.h> +#if FILTER_BITS == 32 +typedef int32_t filter_int; +#elif FILTER_BITS == 16 +typedef int16_t filter_int; +#endif #endif #endif /* _DEMAC_CONFIG_H */ diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c index b47a37a..5601fff 100644 --- a/apps/codecs/demac/libdemac/filter.c +++ b/apps/codecs/demac/libdemac/filter.c @@ -28,27 +28,38 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA #include "demac.h" #include "filter.h" #include "demac_config.h" + +#if FILTER_BITS == 32 + +#if defined(CPU_ARM) && (ARM_ARCH == 4) +#include "vector_math32_armv4.h" +#else +#include "vector_math_generic.h" +#endif + +#else /* FILTER_BITS == 16 */ #ifdef CPU_COLDFIRE #include "vector_math16_cf.h" -#elif ARM_ARCH >= 6 +#elif defined(CPU_ARM) && (ARM_ARCH >= 6) #include "vector_math16_armv6.h" -#elif ARM_ARCH >= 5 /* Assume all our ARMv5 targets are ARMv5te(j) */ +#elif defined(CPU_ARM) && (ARM_ARCH >= 5) +/* Assume all our ARMv5 targets are ARMv5te(j) */ #include "vector_math16_armv5te.h" -#elif defined CPU_ARM7TDMI -#include "vector_math16_arm7.h" #else -#include "vector_math16.h" +#include "vector_math_generic.h" #endif +#endif /* FILTER_BITS */ + struct filter_t { - int16_t* coeffs; /* ORDER entries */ + filter_int* coeffs; /* ORDER entries */ /* We store all the filter delays in a single buffer */ - int16_t* history_end; + filter_int* history_end; - int16_t* delay; - int16_t* adaptcoeffs; + filter_int* delay; + filter_int* adaptcoeffs; int avg; }; @@ -89,7 +100,7 @@ struct filter_t { #if defined(CPU_ARM) && (ARM_ARCH >= 6) #define SATURATE(x) ({int __res; asm("ssat %0, #16, %1" : "=r"(__res) : "r"(x)); __res; }) #else -#define SATURATE(x) (int16_t)(((x) == (int16_t)(x)) ? (x) : ((x) >> 31) ^ 0x7FFF); +#define SATURATE(x) (((x) == (int16_t)(x)) ? (x) : ((x) >> 31) ^ 0x7FFF); #endif /* Apply the filter with state f to count entries in data[] */ @@ -145,7 +156,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f, /* Have we filled the history buffer? */ if (f->delay == f->history_end) { memmove(f->coeffs + ORDER, f->delay - (ORDER*2), - (ORDER*2) * sizeof(int16_t)); + (ORDER*2) * sizeof(filter_int)); f->adaptcoeffs = f->coeffs + ORDER*2; f->delay = f->coeffs + ORDER*3; } @@ -190,7 +201,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f, /* Have we filled the history buffer? */ if (f->delay == f->history_end) { memmove(f->coeffs + ORDER, f->delay - (ORDER*2), - (ORDER*2) * sizeof(int16_t)); + (ORDER*2) * sizeof(filter_int)); f->adaptcoeffs = f->coeffs + ORDER*2; f->delay = f->coeffs + ORDER*3; } @@ -200,7 +211,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f, static struct filter_t filter0 IBSS_ATTR; static struct filter_t filter1 IBSS_ATTR; -static void do_init_filter(struct filter_t* f, int16_t* buf) +static void do_init_filter(struct filter_t* f, filter_int* buf) { f->coeffs = buf; f->history_end = buf + ORDER*3 + FILTER_HISTORY_SIZE; @@ -210,13 +221,13 @@ static void do_init_filter(struct filter_t* f, int16_t* buf) f->delay = f->coeffs + ORDER*3; /* Zero coefficients and history buffer */ - memset(f->coeffs, 0, ORDER*3 * sizeof(int16_t)); + memset(f->coeffs, 0, ORDER*3 * sizeof(filter_int)); /* Zero the running average */ f->avg = 0; } -void INIT_FILTER(int16_t* buf) +void INIT_FILTER(filter_int* buf) { do_init_filter(&filter0, buf); do_init_filter(&filter1, buf + ORDER*3 + FILTER_HISTORY_SIZE); diff --git a/apps/codecs/demac/libdemac/filter.h b/apps/codecs/demac/libdemac/filter.h index acbb155..bbe51d4 100644 --- a/apps/codecs/demac/libdemac/filter.h +++ b/apps/codecs/demac/libdemac/filter.h @@ -25,21 +25,21 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA #ifndef _APE_FILTER_H #define _APE_FILTER_H -#include <inttypes.h> +#include "demac_config.h" -void init_filter_16_11(int16_t* buf); +void init_filter_16_11(filter_int* buf); int apply_filter_16_11(int fileversion, int32_t* decoded0, int32_t* decoded1, int count); -void init_filter_64_11(int16_t* buf); +void init_filter_64_11(filter_int* buf); int apply_filter_64_11(int fileversion, int32_t* decoded0, int32_t* decoded1, int count); -void init_filter_32_10(int16_t* buf); +void init_filter_32_10(filter_int* buf); int apply_filter_32_10(int fileversion, int32_t* decoded0, int32_t* decoded1, int count); -void init_filter_256_13(int16_t* buf); +void init_filter_256_13(filter_int* buf); int apply_filter_256_13(int fileversion, int32_t* decoded0, int32_t* decoded1, int count); -void init_filter_1280_15(int16_t* buf); +void init_filter_1280_15(filter_int* buf); int apply_filter_1280_15(int fileversion, int32_t* decoded0, int32_t* decoded1, int count); #endif diff --git a/apps/codecs/demac/libdemac/vector_math16_arm7.h b/apps/codecs/demac/libdemac/vector_math16_arm7.h deleted file mode 100644 index 653bb1f..0000000 --- a/apps/codecs/demac/libdemac/vector_math16_arm7.h +++ /dev/null @@ -1,293 +0,0 @@ -/* - -libdemac - A Monkey's Audio decoder - -$Id$ - -Copyright (C) Dave Chapman 2007 - -ARM7 vector math copyright (C) 2007 Jens Arnold - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA - -*/ - -/* This version fetches data as 32 bit words, and *requires* v1 to be - * 32 bit aligned, otherwise it will result either in a data abort, or - * incorrect results (if ARM aligncheck is disabled). */ -static inline void vector_add(int16_t* v1, int16_t* v2) -{ -#if ORDER > 16 - int cnt = ORDER>>4; -#endif - -#define ADDHALFREGS(sum, s1) /* Adds register */ \ - "mov " #s1 ", " #s1 ", ror #16 \n" /* halves straight. */ \ - "add r8 , " #s1 ", " #sum ", lsl #16 \n" /* Clobbers 's1' */ \ - "add " #sum ", " #s1 ", " #sum ", lsr #16 \n" /* and r8. */ \ - "mov " #sum ", " #sum ", lsl #16 \n" \ - "orr " #sum ", " #sum ", r8 , lsr #16 \n" - -#define ADDHALFXREGS(sum, s1, s2) /* Adds register */ \ - "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n" /* halves across. */ \ - "add " #sum ", " #s2 ", " #sum ", lsr #16 \n" /* Clobbers 's1'. */ \ - "mov " #sum ", " #sum ", lsl #16 \n" \ - "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n" - - asm volatile ( - "tst %[v2], #2 \n" - "beq 20f \n" - - "10: \n" - "ldrh r4, [%[v2]], #2 \n" - "mov r4, r4, lsl #16 \n" - "1: \n" - "ldmia %[v1], {r0-r3} \n" - "ldmia %[v2]!, {r5-r8} \n" - ADDHALFXREGS(r0, r4, r5) - ADDHALFXREGS(r1, r5, r6) - ADDHALFXREGS(r2, r6, r7) - ADDHALFXREGS(r3, r7, r8) - "stmia %[v1]!, {r0-r3} \n" - "mov r4, r8 \n" - "ldmia %[v1], {r0-r3} \n" - "ldmia %[v2]!, {r5-r8} \n" - ADDHALFXREGS(r0, r4, r5) - ADDHALFXREGS(r1, r5, r6) - ADDHALFXREGS(r2, r6, r7) - ADDHALFXREGS(r3, r7, r8) - "stmia %[v1]!, {r0-r3} \n" -#if ORDER > 16 - "mov r4, r8 \n" - "subs %[cnt], %[cnt], #1 \n" - "bne 1b \n" -#endif - "b 99f \n" - - "20: \n" - "1: \n" - "ldmia %[v1], {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" - ADDHALFREGS(r0, r4) - ADDHALFREGS(r1, r5) - ADDHALFREGS(r2, r6) - ADDHALFREGS(r3, r7) - "stmia %[v1]!, {r0-r3} \n" - "ldmia %[v1], {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" - ADDHALFREGS(r0, r4) - ADDHALFREGS(r1, r5) - ADDHALFREGS(r2, r6) - ADDHALFREGS(r3, r7) - "stmia %[v1]!, {r0-r3} \n" -#if ORDER > 16 - "subs %[cnt], %[cnt], #1 \n" - "bne 1b \n" -#endif - - "99: \n" - : /* outputs */ -#if ORDER > 16 - [cnt]"+r"(cnt), -#endif - [v1] "+r"(v1), - [v2] "+r"(v2) - : /* inputs */ - : /* clobbers */ - "r0", "r1", "r2", "r3", "r4", - "r5", "r6", "r7", "r8", "memory" - ); -} - -/* This version fetches data as 32 bit words, and *requires* v1 to be - * 32 bit aligned, otherwise it will result either in a data abort, or - * incorrect results (if ARM aligncheck is disabled). */ -static inline void vector_sub(int16_t* v1, int16_t* v2) -{ -#if ORDER > 16 - int cnt = ORDER>>4; -#endif - -#define SUBHALFREGS(dif, s1) /* Subtracts register */ \ - "sub r8 , " #dif ", " #s1 "\n" /* halves straight. */ \ - "and r8 , r8 , r9 \n" /* Needs r9 = 0x0000ffff, */ \ - "mov " #dif ", " #dif ", lsr #16 \n" /* clobbers r8. */ \ - "sub " #dif ", " #dif ", " #s1 ", lsr #16 \n" \ - "orr " #dif ", r8 , " #dif ", lsl #16 \n" - -#define SUBHALFXREGS(dif, s1, s2) /* Subtracts register */ \ - "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n" /* halves across. */ \ - "and " #s1 ", " #s1 ", r9 \n" /* Needs r9 = 0x0000ffff, */ \ - "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n" /* clobbers 's1'. */ \ - "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n" - - asm volatile ( - "mov r9, #0xff \n" - "orr r9, r9, #0xff00 \n" - "tst %[v2], #2 \n" - "beq 20f \n" - - "10: \n" - "ldrh r4, [%[v2]], #2 \n" - "mov r4, r4, lsl #16 \n" - "1: \n" - "ldmia %[v1], {r0-r3} \n" - "ldmia %[v2]!, {r5-r8} \n" - SUBHALFXREGS(r0, r4, r5) - SUBHALFXREGS(r1, r5, r6) - SUBHALFXREGS(r2, r6, r7) - SUBHALFXREGS(r3, r7, r8) - "stmia %[v1]!, {r0-r3} \n" - "mov r4, r8 \n" - "ldmia %[v1], {r0-r3} \n" - "ldmia %[v2]!, {r5-r8} \n" - SUBHALFXREGS(r0, r4, r5) - SUBHALFXREGS(r1, r5, r6) - SUBHALFXREGS(r2, r6, r7) - SUBHALFXREGS(r3, r7, r8) - "stmia %[v1]!, {r0-r3} \n" -#if ORDER > 16 - "mov r4, r8 \n" - "subs %[cnt], %[cnt], #1 \n" - "bne 1b \n" -#endif - "b 99f \n" - - "20: \n" - "1: \n" - "ldmia %[v1], {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" - SUBHALFREGS(r0, r4) - SUBHALFREGS(r1, r5) - SUBHALFREGS(r2, r6) - SUBHALFREGS(r3, r7) - "stmia %[v1]!, {r0-r3} \n" - "ldmia %[v1], {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" - SUBHALFREGS(r0, r4) - SUBHALFREGS(r1, r5) - SUBHALFREGS(r2, r6) - SUBHALFREGS(r3, r7) - "stmia %[v1]!, {r0-r3} \n" -#if ORDER > 16 - "subs %[cnt], %[cnt], #1 \n" - "bne 1b \n" -#endif - - "99: \n" - : /* outputs */ -#if ORDER > 16 - [cnt]"+r"(cnt), -#endif - [v1] "+r"(v1), - [v2] "+r"(v2) - : /* inputs */ - : /* clobbers */ - "r0", "r1", "r2", "r3", "r4", "r5", - "r6", "r7", "r8", "r9", "memory" - ); -} - -/* This version fetches data as 32 bit words, and *requires* v1 to be - * 32 bit aligned, otherwise it will result either in a data abort, or - * incorrect results (if ARM aligncheck is disabled). It is optimised - * for ARM7TDMI. Using it for ARM9 or higher results in worse performance - * than the C version. */ -static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) -{ - int res = 0; -#if ORDER > 16 - int cnt = ORDER>>4; -#endif - -#define MLABLOCK2(f1, f2) \ - "mov r8, " #f1 ", lsl #16 \n" \ - "mov r8, r8 , asr #16 \n" \ - "mov r9, " #f2 ", lsl #16 \n" \ - "mov r9, r9 , asr #16 \n" \ - "mla %[res], r9, r8, %[res] \n" \ - "mov r8, " #f1 ", asr #16 \n" \ - "mov r9, " #f2 ", asr #16 \n" \ - "mla %[res], r9, r8, %[res] \n" - -#define MLABLOCK2_U2(f1, f2) \ - "mov r8, " #f1 ", lsl #16 \n" \ - "mov r8, r8 , asr #16 \n" \ - "mla %[res], r9, r8, %[res] \n" \ - "mov r8, " #f1 ", asr #16 \n" \ - "mov r9, " #f2 ", lsl #16 \n" \ - "mov r9, r9 , asr #16 \n" \ - "mla %[res], r9, r8, %[res] \n" \ - "mov r9, " #f2 ", asr #16 \n" - - asm volatile ( - "tst %[v2], #2 \n" - "beq 20f \n" - - "10: \n" - "ldrsh r9, [%[v2]], #2 \n" - "1: \n" - "ldmia %[v1]!, {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" - MLABLOCK2_U2(r0, r4) - MLABLOCK2_U2(r1, r5) - MLABLOCK2_U2(r2, r6) - MLABLOCK2_U2(r3, r7) - "ldmia %[v1]!, {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" - MLABLOCK2_U2(r0, r4) - MLABLOCK2_U2(r1, r5) - MLABLOCK2_U2(r2, r6) - MLABLOCK2_U2(r3, r7) -#if ORDER > 16 - "subs %[cnt], %[cnt], #1 \n" - "bne 1b \n" -#endif - "b 99f \n" - - "20: \n" - "1: \n" - "ldmia %[v1]!, {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" - MLABLOCK2(r0, r4) - MLABLOCK2(r1, r5) - MLABLOCK2(r2, r6) - MLABLOCK2(r3, r7) - "ldmia %[v1]!, {r0-r3} \n" - "ldmia %[v2]!, {r4-r7} \n" - MLABLOCK2(r0, r4) - MLABLOCK2(r1, r5) - MLABLOCK2(r2, r6) - MLABLOCK2(r3, r7) -#if ORDER > 16 - "subs %[cnt], %[cnt], #1 \n" - "bne 1b \n" -#endif - - "99: \n" - : /* outputs */ -#if ORDER > 16 - [cnt]"+r"(cnt), -#endif - [v1] "+r"(v1), - [v2] "+r"(v2), - [res]"+r"(res) - : /* inputs */ - : /* clobbers */ - "r0", "r1", "r2", "r3", "r4", - "r5", "r6", "r7", "r8", "r9" - ); - return res; -} diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h new file mode 100644 index 0000000..b729bd3 --- /dev/null +++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h @@ -0,0 +1,210 @@ +/* + +libdemac - A Monkey's Audio decoder + +$Id$ + +Copyright (C) Dave Chapman 2007 + +ARMv4 vector math copyright (C) 2008 Jens Arnold + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA + +*/ + +static inline void vector_add(int32_t* v1, int32_t* v2) +{ +#if ORDER > 32 + int cnt = ORDER>>5; +#endif + +#define ADDBLOCK4 \ + "ldmia %[v1], {r0-r3} \n" \ + "ldmia %[v2]!, {r4-r7} \n" \ + "add r0, r0, r4 \n" \ + "add r1, r1, r5 \n" \ + "add r2, r2, r6 \n" \ + "add r3, r3, r7 \n" \ + "stmia %[v1]!, {r0-r3} \n" + + asm volatile ( + "1: \n" + ADDBLOCK4 + ADDBLOCK4 + ADDBLOCK4 + ADDBLOCK4 +#if ORDER > 16 + ADDBLOCK4 + ADDBLOCK4 + ADDBLOCK4 + ADDBLOCK4 +#endif +#if ORDER > 32 + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" +#endif + : /* outputs */ +#if ORDER > 32 + [cnt]"+r"(cnt), +#endif + [v1] "+r"(v1), + [v2] "+r"(v2) + : /* inputs */ + : /* clobbers */ + "r0", "r1", "r2", "r3", "r4", + "r5", "r6", "r7", "memory" + ); +} + +static inline void vector_sub(int32_t* v1, int32_t* v2) +{ +#if ORDER > 32 + int cnt = ORDER>>5; +#endif + +#define SUBBLOCK4 \ + "ldmia %[v1], {r0-r3} \n" \ + "ldmia %[v2]!, {r4-r7} \n" \ + "sub r0, r0, r4 \n" \ + "sub r1, r1, r5 \n" \ + "sub r2, r2, r6 \n" \ + "sub r3, r3, r7 \n" \ + "stmia %[v1]!, {r0-r3} \n" + + asm volatile ( + "1: \n" + SUBBLOCK4 + SUBBLOCK4 + SUBBLOCK4 + SUBBLOCK4 +#if ORDER > 16 + SUBBLOCK4 + SUBBLOCK4 + SUBBLOCK4 + SUBBLOCK4 +#endif +#if ORDER > 32 + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" +#endif + : /* outputs */ +#if ORDER > 32 + [cnt]"+r"(cnt), +#endif + [v1] "+r"(v1), + [v2] "+r"(v2) + : /* inputs */ + : /* clobbers */ + "r0", "r1", "r2", "r3", "r4", + "r5", "r6", "r7", "memory" + ); +} + +static inline int32_t scalarproduct(int32_t* v1, int32_t* v2) +{ + int res = 0; +#if ORDER > 32 + int cnt = ORDER>>5; +#endif + + asm volatile ( +#if ORDER > 16 + "ldmia %[v2]!, {r6-r7} \n" + "1: \n" + "ldmia %[v1]!, {r0,r1,r3-r5} \n" + "mla %[res], r6, r0, %[res] \n" + "mla %[res], r7, r1, %[res] \n" + "ldmia %[v2]!, {r0-r2,r6-r8} \n" + "mla %[res], r0, r3, %[res] \n" + "mla %[res], r1, r4, %[res] \n" + "mla %[res], r2, r5, %[res] \n" + "ldmia %[v1]!, {r0-r4} \n" + "mla %[res], r6, r0, %[res] \n" + "mla %[res], r7, r1, %[res] \n" + "mla %[res], r8, r2, %[res] \n" + "ldmia %[v2]!, {r0,r1,r6-r8} \n" + "mla %[res], r0, r3, %[res] \n" + "mla %[res], r1, r4, %[res] \n" + "ldmia %[v1]!, {r0-r5} \n" + "mla %[res], r6, r0, %[res] \n" + "mla %[res], r7, r1, %[res] \n" + "mla %[res], r8, r2, %[res] \n" + "ldmia %[v2]!, {r0-r2,r6,r7} \n" + "mla %[res], r0, r3, %[res] \n" + "mla %[res], r1, r4, %[res] \n" + "mla %[res], r2, r5, %[res] \n" + "ldmia %[v1]!, {r0,r1,r3-r5} \n" + "mla %[res], r6, r0, %[res] \n" + "mla %[res], r7, r1, %[res] \n" + "ldmia %[v2]!, {r0-r2,r6-r8} \n" + "mla %[res], r0, r3, %[res] \n" + "mla %[res], r1, r4, %[res] \n" + "mla %[res], r2, r5, %[res] \n" + "ldmia %[v1]!, {r0-r4} \n" + "mla %[res], r6, r0, %[res] \n" + "mla %[res], r7, r1, %[res] \n" + "mla %[res], r8, r2, %[res] \n" + "ldmia %[v2]!, {r0,r1,r6-r8} \n" + "mla %[res], r0, r3, %[res] \n" + "mla %[res], r1, r4, %[res] \n" + "ldmia %[v1]!, {r0-r5} \n" + "mla %[res], r6, r0, %[res] \n" + "mla %[res], r7, r1, %[res] \n" + "mla %[res], r8, r2, %[res] \n" +#if ORDER > 32 + "ldmia %[v2]!, {r0-r2,r6,r7} \n" +#else + "ldmia %[v2]!, {r0-r2} \n" +#endif + "mla %[res], r0, r3, %[res] \n" + "mla %[res], r1, r4, %[res] \n" + "mla %[res], r2, r5, %[res] \n" +#if ORDER > 32 + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" +#endif + +#else /* ORDER <= 16 */ + +#define MLABLOCK4 \ + "ldmia %[v1]!, {r0-r3} \n" \ + "ldmia %[v2]!, {r4-r7} \n" \ + "mla %[res], r4, r0, %[res] \n" \ + "mla %[res], r5, r1, %[res] \n" \ + "mla %[res], r6, r2, %[res] \n" \ + "mla %[res], r7, r3, %[res] \n" + + MLABLOCK4 + MLABLOCK4 + MLABLOCK4 + MLABLOCK4 +#endif /* ORDER <= 16 */ + : /* outputs */ +#if ORDER > 32 + [cnt]"+r"(cnt), +#endif + [v1] "+r"(v1), + [v2] "+r"(v2), + [res]"+r"(res) + : /* inputs */ + : /* clobbers */ + "r0", "r1", "r2", "r3", + "r4", "r5", "r6", "r7" +#if ORDER > 16 + ,"r8" +#endif + ); + return res; +} diff --git a/apps/codecs/demac/libdemac/vector_math16.h b/apps/codecs/demac/libdemac/vector_math_generic.h index 5d82abe..7b61db7 100644 --- a/apps/codecs/demac/libdemac/vector_math16.h +++ b/apps/codecs/demac/libdemac/vector_math_generic.h @@ -2,7 +2,7 @@ libdemac - A Monkey's Audio decoder -$Id:$ +$Id$ Copyright (C) Dave Chapman 2007 @@ -22,7 +22,9 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA */ -static inline void vector_add(int16_t* v1, int16_t* v2) +#include "demac_config.h" + +static inline void vector_add(filter_int* v1, filter_int* v2) { #if ORDER > 32 int order = (ORDER >> 5); @@ -66,7 +68,7 @@ static inline void vector_add(int16_t* v1, int16_t* v2) } } -static inline void vector_sub(int16_t* v1, int16_t* v2) +static inline void vector_sub(filter_int* v1, filter_int* v2) { #if ORDER > 32 int order = (ORDER >> 5); @@ -110,7 +112,7 @@ static inline void vector_sub(int16_t* v1, int16_t* v2) } } -static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) +static inline int32_t scalarproduct(filter_int* v1, filter_int* v2) { int res = 0; |