Diffstat (limited to 'apps/codecs')
-rw-r--r--  apps/codecs/demac/libdemac/filter.c              |  28
-rw-r--r--  apps/codecs/demac/libdemac/vector_math16_cf.h    | 388
-rw-r--r--  apps/codecs/demac/libdemac/vector_math32_armv4.h | 194
-rw-r--r--  apps/codecs/lib/udiv32_arm.S                     |   2
4 files changed, 335 insertions, 277 deletions
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index 93edf39..ed6f3c8 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -134,6 +134,19 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
while(LIKELY(count--))
{
+#ifdef FUSED_VECTOR_MATH
+ if (LIKELY(*data != 0)) {
+ if (*data < 0)
+ res = vector_sp_add(f->coeffs, f->delay - ORDER,
+ f->adaptcoeffs - ORDER);
+ else
+ res = vector_sp_sub(f->coeffs, f->delay - ORDER,
+ f->adaptcoeffs - ORDER);
+ } else {
+ res = scalarproduct(f->coeffs, f->delay - ORDER);
+ }
+ res = FP_TO_INT(res);
+#else
res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER));
if (LIKELY(*data != 0)) {
@@ -142,6 +155,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
else
vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
}
+#endif
res += *data;
@@ -193,6 +207,19 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
while(LIKELY(count--))
{
+#ifdef FUSED_VECTOR_MATH
+ if (LIKELY(*data != 0)) {
+ if (*data < 0)
+ res = vector_sp_add(f->coeffs, f->delay - ORDER,
+ f->adaptcoeffs - ORDER);
+ else
+ res = vector_sp_sub(f->coeffs, f->delay - ORDER,
+ f->adaptcoeffs - ORDER);
+ } else {
+ res = scalarproduct(f->coeffs, f->delay - ORDER);
+ }
+ res = FP_TO_INT(res);
+#else
res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER));
if (LIKELY(*data != 0)) {
@@ -201,6 +228,7 @@ static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
else
vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
}
+#endif
/* Convert res from (32-FRACBITS).FRACBITS fixed-point format to an
integer (rounding to nearest) and add the input value to
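The semantics of the new fused primitives, common to both asm flavours below,
amount to this portable C reference (a hypothetical sketch, not part of the
patch; on ColdFire the element type is int16_t, on ARMv4 it is int32_t):

    #include <stdint.h>

    /* Reference behaviour of vector_sp_add (vector_sp_sub is identical
     * with '-=' in the update): one pass over v1 computes the scalar
     * product against f2 and applies the adaptive update from s2. */
    static inline int32_t vector_sp_add_ref(int16_t* v1, int16_t* f2,
                                            int16_t* s2)
    {
        int32_t res = 0;
        int i;
        for (i = 0; i < ORDER; i++) {
            res += (int32_t)v1[i] * f2[i]; /* scalarproduct term */
            v1[i] += s2[i];                /* fused vector add   */
        }
        return res;
    }

Fusing the two operations saves one full traversal of f->coeffs per sample
compared to the old scalarproduct-then-vector_add/vector_sub sequence.
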
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h
index 11e7f07..6e8216c 100644
--- a/apps/codecs/demac/libdemac/vector_math16_cf.h
+++ b/apps/codecs/demac/libdemac/vector_math16_cf.h
@@ -24,19 +24,27 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
*/
-/* This version fetches data as 32 bit words, and *recommends* v1 to be
- * 32 bit aligned, otherwise performance will suffer. */
-static inline void vector_add(int16_t* v1, int16_t* v2)
+#define FUSED_VECTOR_MATH
+
+#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
+
+/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
+ * This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
+ * aligned or both unaligned. Performance will suffer if either condition
+ * isn't met. It also needs EMAC in signed integer mode. */
+static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
{
+ int res;
#if ORDER > 16
int cnt = ORDER>>4;
#endif
-#define ADDHALFREGS(s1, sum) /* Add register halves straight. */ \
- "move.l " #s1 ", %%d4 \n" /* 's1' can be an A or D reg. */ \
- "add.l " #sum ", " #s1 "\n" /* 'sum' must be a D reg. */ \
- "clr.w %%d4 \n" /* 's1' and %%d4 are clobbered! */ \
- "add.l %%d4 , " #sum "\n" \
+#define ADDHALFREGS(s1, s2, sum) /* Add register halves straight. */ \
+ "move.l " #s1 ", " #sum "\n" /* 's1' and 's2' can be A or D */ \
+ "add.l " #s2 ", " #s1 "\n" /* regs, 'sum' must be a D reg. */ \
+ "clr.w " #sum " \n" /* 's1' is clobbered! */ \
+ "add.l " #s2 ", " #sum "\n" \
"move.w " #s1 ", " #sum "\n"
#define ADDHALFXREGS(s1, s2, sum) /* Add register halves across. */ \
@@ -47,94 +55,115 @@ static inline void vector_add(int16_t* v1, int16_t* v2)
"move.w " #s1 ", " #sum "\n"
asm volatile (
- "move.l %[v2], %%d0 \n"
- "and.l #2, %%d0 \n"
- "jeq 20f \n"
-
- "10: \n"
- "move.w (%[v2])+, %%d0 \n"
- "swap %%d0 \n"
- "1: \n"
- "movem.l (%[v1]), %%a0-%%a3 \n"
- "movem.l (%[v2]), %%d1-%%d4 \n"
- ADDHALFXREGS(%%a0, %%d1, %%d0)
- "move.l %%d0, (%[v1])+ \n"
- ADDHALFXREGS(%%a1, %%d2, %%d1)
- "move.l %%d1, (%[v1])+ \n"
- ADDHALFXREGS(%%a2, %%d3, %%d2)
- "move.l %%d2, (%[v1])+ \n"
- ADDHALFXREGS(%%a3, %%d4, %%d3)
- "move.l %%d3, (%[v1])+ \n"
- "lea.l (16, %[v2]), %[v2] \n"
- "move.l %%d4, %%d0 \n"
-
- "movem.l (%[v1]), %%a0-%%a3 \n"
- "movem.l (%[v2]), %%d1-%%d4 \n"
- ADDHALFXREGS(%%a0, %%d1, %%d0)
- "move.l %%d0, (%[v1])+ \n"
- ADDHALFXREGS(%%a1, %%d2, %%d1)
- "move.l %%d1, (%[v1])+ \n"
- ADDHALFXREGS(%%a2, %%d3, %%d2)
- "move.l %%d2, (%[v1])+ \n"
- ADDHALFXREGS(%%a3, %%d4, %%d3)
- "move.l %%d3, (%[v1])+ \n"
+ "move.l %[f2], %%d0 \n"
+ "and.l #2, %%d0 \n"
+ "jeq 20f \n"
+
+ "10: \n"
+ "move.w (%[f2])+, %%d0 \n"
+ "move.w (%[s2])+, %%d1 \n"
+ "swap %%d1 \n"
+ "1: \n"
+ ".rept 2 \n"
+ "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
+ "mac.w %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n"
+ "mac.w %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
+ ADDHALFXREGS(%%d6, %%d2, %%d1)
+ "mac.w %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
+ "mac.w %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
+ "move.l %%d1, (%[v1])+ \n"
+ ADDHALFXREGS(%%d7, %%d6, %%d2)
+ "mac.w %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
+ "mac.w %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
+ "move.l %%d2, (%[v1])+ \n"
+ ADDHALFXREGS(%%a0, %%d7, %%d6)
+ "mac.w %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
+ "mac.w %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
+ "move.l %%d6, (%[v1])+ \n"
+ ADDHALFXREGS(%%a1, %%d1, %%d7)
+ "move.l %%d7, (%[v1])+ \n"
+ ".endr \n"
+
#if ORDER > 16
- "lea.l (16, %[v2]), %[v2] \n"
- "move.l %%d4, %%d0 \n"
+ "subq.l #1, %[res] \n"
+ "bne.w 1b \n"
+#endif
+ "jra 99f \n"
- "subq.l #1, %[cnt] \n"
- "jne 1b \n"
+ "20: \n"
+ "move.l (%[f2])+, %%d0 \n"
+ "1: \n"
+ "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
+ "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
+ ADDHALFREGS(%%d6, %%d1, %%d2)
+ "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
+ "move.l %%d2, (%[v1])+ \n"
+ ADDHALFREGS(%%d7, %%d1, %%d2)
+ "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
+ "move.l %%d2, (%[v1])+ \n"
+ ADDHALFREGS(%%a0, %%d1, %%d2)
+ "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
+ "move.l %%d2, (%[v1])+ \n"
+ ADDHALFREGS(%%a1, %%d1, %%d2)
+ "move.l %%d2, (%[v1])+ \n"
+
+ "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
+ "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
+ ADDHALFREGS(%%d6, %%d1, %%d2)
+ "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
+ "move.l %%d2, (%[v1])+ \n"
+ ADDHALFREGS(%%d7, %%d1, %%d2)
+ "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
+ "move.l %%d2, (%[v1])+ \n"
+ ADDHALFREGS(%%a0, %%d1, %%d2)
+ "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
+#if ORDER > 16
+ "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
+#else
+ "mac.w %%d0l, %%a1l, %%acc0 \n"
#endif
- "jra 99f \n"
-
- "20: \n"
- "1: \n"
- "movem.l (%[v2]), %%a0-%%a3 \n"
- "movem.l (%[v1]), %%d0-%%d3 \n"
- ADDHALFREGS(%%a0, %%d0)
- "move.l %%d0, (%[v1])+ \n"
- ADDHALFREGS(%%a1, %%d1)
- "move.l %%d1, (%[v1])+ \n"
- ADDHALFREGS(%%a2, %%d2)
- "move.l %%d2, (%[v1])+ \n"
- ADDHALFREGS(%%a3, %%d3)
- "move.l %%d3, (%[v1])+ \n"
- "lea.l (16, %[v2]), %[v2] \n"
-
- "movem.l (%[v2]), %%a0-%%a3 \n"
- "movem.l (%[v1]), %%d0-%%d3 \n"
- ADDHALFREGS(%%a0, %%d0)
- "move.l %%d0, (%[v1])+ \n"
- ADDHALFREGS(%%a1, %%d1)
- "move.l %%d1, (%[v1])+ \n"
- ADDHALFREGS(%%a2, %%d2)
- "move.l %%d2, (%[v1])+ \n"
- ADDHALFREGS(%%a3, %%d3)
- "move.l %%d3, (%[v1])+ \n"
+ "move.l %%d2, (%[v1])+ \n"
+ ADDHALFREGS(%%a1, %%d1, %%d2)
+ "move.l %%d2, (%[v1])+ \n"
#if ORDER > 16
- "lea.l (16, %[v2]), %[v2] \n"
-
- "subq.l #1, %[cnt] \n"
- "jne 1b \n"
+ "subq.l #1, %[res] \n"
+ "bne.w 1b \n"
#endif
- "99: \n"
+
+ "99: \n"
+ "movclr.l %%acc0, %[res] \n"
: /* outputs */
+ [v1]"+a"(v1),
+ [f2]"+a"(f2),
+ [s2]"+a"(s2),
+ [res]"=d"(res)
+ : /* inputs */
#if ORDER > 16
- [cnt]"+d"(cnt),
+ [cnt]"[res]"(cnt)
#endif
- [v1] "+a"(v1),
- [v2] "+a"(v2)
- : /* inputs */
: /* clobbers */
- "d0", "d1", "d2", "d3", "d4",
- "a0", "a1", "a2", "a3", "memory"
+ "d0", "d1", "d2", "d6", "d7",
+ "a0", "a1", "memory"
+
);
+ return res;
}
-/* This version fetches data as 32 bit words, and *recommends* v1 to be
- * 32 bit aligned, otherwise performance will suffer. */
-static inline void vector_sub(int16_t* v1, int16_t* v2)
+/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
+ * This version fetches data as 32 bit words, and *recommends* v1 to be
+ * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
+ * aligned or both unaligned. Performance will suffer if either condition
+ * isn't met. It also needs EMAC in signed integer mode. */
+static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
{
+ int res;
#if ORDER > 16
int cnt = ORDER>>4;
#endif
@@ -155,107 +184,116 @@ static inline void vector_sub(int16_t* v1, int16_t* v2)
"move.w " #min ", " #s1d "\n"
asm volatile (
- "move.l %[v2], %%d0 \n"
- "and.l #2, %%d0 \n"
- "jeq 20f \n"
-
- "10: \n"
- "move.w (%[v2])+, %%d0 \n"
- "swap %%d0 \n"
- "1: \n"
- "movem.l (%[v2]), %%d1-%%d4 \n"
- "movem.l (%[v1]), %%a0-%%a3 \n"
- SUBHALFXREGS(%%a0, %%d1, %%d0)
- "move.l %%d0, (%[v1])+ \n"
- SUBHALFXREGS(%%a1, %%d2, %%d1)
- "move.l %%d1, (%[v1])+ \n"
- SUBHALFXREGS(%%a2, %%d3, %%d2)
- "move.l %%d2, (%[v1])+ \n"
- SUBHALFXREGS(%%a3, %%d4, %%d3)
- "move.l %%d3, (%[v1])+ \n"
- "lea.l (16, %[v2]), %[v2] \n"
- "move.l %%d4, %%d0 \n"
-
- "movem.l (%[v2]), %%d1-%%d4 \n"
- "movem.l (%[v1]), %%a0-%%a3 \n"
- SUBHALFXREGS(%%a0, %%d1, %%d0)
- "move.l %%d0, (%[v1])+ \n"
- SUBHALFXREGS(%%a1, %%d2, %%d1)
- "move.l %%d1, (%[v1])+ \n"
- SUBHALFXREGS(%%a2, %%d3, %%d2)
- "move.l %%d2, (%[v1])+ \n"
- SUBHALFXREGS(%%a3, %%d4, %%d3)
- "move.l %%d3, (%[v1])+ \n"
+ "move.l %[f2], %%d0 \n"
+ "and.l #2, %%d0 \n"
+ "jeq 20f \n"
+
+ "10: \n"
+ "move.w (%[f2])+, %%d0 \n"
+ "move.w (%[s2])+, %%d1 \n"
+ "swap %%d1 \n"
+ "1: \n"
+ ".rept 2 \n"
+ "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
+ "mac.w %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n"
+ "mac.w %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
+ SUBHALFXREGS(%%d6, %%d2, %%d1)
+ "mac.w %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
+ "mac.w %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
+ "move.l %%d1, (%[v1])+ \n"
+ SUBHALFXREGS(%%d7, %%d6, %%d2)
+ "mac.w %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
+ "mac.w %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
+ "move.l %%d2, (%[v1])+ \n"
+ SUBHALFXREGS(%%a0, %%d7, %%d6)
+ "mac.w %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
+ "mac.w %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
+ "move.l %%d6, (%[v1])+ \n"
+ SUBHALFXREGS(%%a1, %%d1, %%d7)
+ "move.l %%d7, (%[v1])+ \n"
+ ".endr \n"
+
#if ORDER > 16
- "lea.l (16, %[v2]), %[v2] \n"
- "move.l %%d4, %%d0 \n"
-
- "subq.l #1, %[cnt] \n"
- "bne.w 1b \n"
+ "subq.l #1, %[res] \n"
+ "bne.w 1b \n"
#endif
- "jra 99f \n"
-
- "20: \n"
- "1: \n"
- "movem.l (%[v2]), %%d1-%%d4 \n"
- "movem.l (%[v1]), %%a0-%%a3 \n"
- SUBHALFREGS(%%a0, %%d1, %%d0)
- "move.l %%d0, (%[v1])+ \n"
- SUBHALFREGS(%%a1, %%d2, %%d1)
- "move.l %%d1, (%[v1])+ \n"
- SUBHALFREGS(%%a2, %%d3, %%d2)
- "move.l %%d2, (%[v1])+ \n"
- SUBHALFREGS(%%a3, %%d4, %%d3)
- "move.l %%d3, (%[v1])+ \n"
- "lea.l (16, %[v2]), %[v2] \n"
-
- "movem.l (%[v2]), %%d1-%%d4 \n"
- "movem.l (%[v1]), %%a0-%%a3 \n"
- SUBHALFREGS(%%a0, %%d1, %%d0)
- "move.l %%d0, (%[v1])+ \n"
- SUBHALFREGS(%%a1, %%d2, %%d1)
- "move.l %%d1, (%[v1])+ \n"
- SUBHALFREGS(%%a2, %%d3, %%d2)
- "move.l %%d2, (%[v1])+ \n"
- SUBHALFREGS(%%a3, %%d4, %%d3)
- "move.l %%d3, (%[v1])+ \n"
-#if ORDER > 16
- "lea.l (16, %[v2]), %[v2] \n"
- "subq.l #1, %[cnt] \n"
- "bne.w 1b \n"
+ "jra 99f \n"
+
+ "20: \n"
+ "move.l (%[f2])+, %%d0 \n"
+ "1: \n"
+ "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
+ "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
+ SUBHALFREGS(%%d6, %%d1, %%d2)
+ "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
+ "move.l %%d2, (%[v1])+ \n"
+ SUBHALFREGS(%%d7, %%d1, %%d2)
+ "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
+ "move.l %%d2, (%[v1])+ \n"
+ SUBHALFREGS(%%a0, %%d1, %%d2)
+ "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
+ "move.l %%d2, (%[v1])+ \n"
+ SUBHALFREGS(%%a1, %%d1, %%d2)
+ "move.l %%d2, (%[v1])+ \n"
+
+ "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
+ "mac.w %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
+ SUBHALFREGS(%%d6, %%d1, %%d2)
+ "mac.w %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
+ "move.l %%d2, (%[v1])+ \n"
+ SUBHALFREGS(%%d7, %%d1, %%d2)
+ "mac.w %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
+ "mac.w %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
+ "move.l %%d2, (%[v1])+ \n"
+ SUBHALFREGS(%%a0, %%d1, %%d2)
+ "mac.w %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
+#if ORDER > 16
+ "mac.w %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
+#else
+ "mac.w %%d0l, %%a1l, %%acc0 \n"
+#endif
+ "move.l %%d2, (%[v1])+ \n"
+ SUBHALFREGS(%%a1, %%d1, %%d2)
+ "move.l %%d2, (%[v1])+ \n"
+#if ORDER > 16
+ "subq.l #1, %[res] \n"
+ "bne.w 1b \n"
#endif
- "99: \n"
+ "99: \n"
+ "movclr.l %%acc0, %[res] \n"
: /* outputs */
+ [v1]"+a"(v1),
+ [f2]"+a"(f2),
+ [s2]"+a"(s2),
+ [res]"=d"(res)
+ : /* inputs */
#if ORDER > 16
- [cnt]"+d"(cnt),
+ [cnt]"[res]"(cnt)
#endif
- [v1] "+a"(v1),
- [v2] "+a"(v2)
- : /* inputs */
: /* clobbers */
- "d0", "d1", "d2", "d3", "d4",
- "a0", "a1", "a2", "a3", "memory"
+ "d0", "d1", "d2", "d6", "d7",
+ "a0", "a1", "memory"
+
);
+ return res;
}
-#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
-
/* This version fetches data as 32 bit words, and *recommends* v1 to be
* 32 bit aligned, otherwise performance will suffer. It also needs EMAC
- * in signed integer mode - call above macro before use. */
+ * in signed integer mode. */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
int res;
-#if ORDER > 32
- int cnt = ORDER>>5;
-#endif
-
#if ORDER > 16
-#define MAC_BLOCKS "7"
-#else
-#define MAC_BLOCKS "3"
+ int cnt = ORDER>>4;
#endif
asm volatile (
@@ -267,20 +305,16 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
"move.l (%[v1])+, %%d0 \n"
"move.w (%[v2])+, %%d1 \n"
"1: \n"
- ".rept " MAC_BLOCKS "\n"
- "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
- "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
+ ".rept 7 \n"
"mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
"mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
".endr \n"
"mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
- "mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
- "mac.w %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
-#if ORDER > 32
+#if ORDER > 16
"mac.w %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
"subq.l #1, %[res] \n"
- "bne.w 1b \n"
+ "bne.b 1b \n"
#else
"mac.w %%d0l, %%d1u, %%acc0 \n"
#endif
@@ -290,7 +324,7 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
"move.l (%[v1])+, %%d0 \n"
"move.l (%[v2])+, %%d1 \n"
"1: \n"
- ".rept " MAC_BLOCKS "\n"
+ ".rept 3 \n"
"mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
"mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
"mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
@@ -299,11 +333,11 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
"mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
"mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
-#if ORDER > 32
+#if ORDER > 16
"mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
"mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
"subq.l #1, %[res] \n"
- "bne.w 1b \n"
+ "bne.b 1b \n"
#else
"mac.w %%d2u, %%d1u, %%acc0 \n"
"mac.w %%d2l, %%d1l, %%acc0 \n"
@@ -316,7 +350,7 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
[v2]"+a"(v2),
[res]"=d"(res)
: /* inputs */
-#if ORDER > 32
+#if ORDER > 16
[cnt]"[res]"(cnt)
#endif
: /* clobbers */
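
Two tricks carry the ColdFire version: each mac.w multiplies a pair of 16-bit
register halves into %acc0 while fetching the next operand in the same
instruction, and the ADDHALFREGS/ADDHALFXREGS macros perform a packed,
lane-wise 16-bit add on ordinary 32-bit registers so two coefficients are
updated per move. The straight variant is equivalent to this C sketch
(illustrative only, not part of the patch):

    #include <stdint.h>

    /* Lane-wise 16-bit add of two packed 32-bit words, as ADDHALFREGS
     * does it: the carry out of the low lane must not reach the high
     * lane.  (Hypothetical reference, not in the patch.) */
    static inline uint32_t add_half_regs(uint32_t a, uint32_t b)
    {
        uint32_t lo = (a + b) & 0xFFFFu;             /* low lane, carry discarded  */
        uint32_t hi = ((a >> 16) + (b >> 16)) << 16; /* high lane, added separately */
        return hi | lo;
    }

ADDHALFXREGS does the same across word boundaries, which is what makes the
half-word-misaligned path (label 10:) possible without realigning the data.
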
diff --git a/apps/codecs/demac/libdemac/vector_math32_armv4.h b/apps/codecs/demac/libdemac/vector_math32_armv4.h
index 89b24f2..207fca3 100644
--- a/apps/codecs/demac/libdemac/vector_math32_armv4.h
+++ b/apps/codecs/demac/libdemac/vector_math32_armv4.h
@@ -24,78 +24,134 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
*/
-static inline void vector_add(int32_t* v1, int32_t* v2)
+#define FUSED_VECTOR_MATH
+
+#if ORDER > 32
+#define BLOCK_REPEAT "8"
+#elif ORDER > 16
+#define BLOCK_REPEAT "7"
+#else
+#define BLOCK_REPEAT "3"
+#endif
+
+/* Calculate scalarproduct, then add a 2nd vector (fused for performance) */
+static inline int32_t vector_sp_add(int32_t* v1, int32_t* f2, int32_t* s2)
{
+ int res;
#if ORDER > 32
int cnt = ORDER>>5;
#endif
-#if ORDER > 16
-#define ADD_SUB_BLOCKS "8"
+ asm volatile (
+#if ORDER > 32
+ "mov %[res], #0 \n"
+ "1: \n"
#else
-#define ADD_SUB_BLOCKS "4"
+ "ldmia %[v1], {r0-r3} \n"
+ "ldmia %[f2]!, {r4-r7} \n"
+ "mul %[res], r4, r0 \n"
+ "mla %[res], r5, r1, %[res] \n"
+ "mla %[res], r6, r2, %[res] \n"
+ "mla %[res], r7, r3, %[res] \n"
+ "ldmia %[s2]!, {r4-r7} \n"
+ "add r0, r0, r4 \n"
+ "add r1, r1, r5 \n"
+ "add r2, r2, r6 \n"
+ "add r3, r3, r7 \n"
+ "stmia %[v1]!, {r0-r3} \n"
#endif
-
- asm volatile (
- "1: \n"
- ".rept " ADD_SUB_BLOCKS "\n"
- "ldmia %[v1], {r0-r3} \n"
- "ldmia %[v2]!, {r4-r7} \n"
- "add r0, r0, r4 \n"
- "add r1, r1, r5 \n"
- "add r2, r2, r6 \n"
- "add r3, r3, r7 \n"
- "stmia %[v1]!, {r0-r3} \n"
- ".endr \n"
+ ".rept " BLOCK_REPEAT "\n"
+ "ldmia %[v1], {r0-r3} \n"
+ "ldmia %[f2]!, {r4-r7} \n"
+ "mla %[res], r4, r0, %[res] \n"
+ "mla %[res], r5, r1, %[res] \n"
+ "mla %[res], r6, r2, %[res] \n"
+ "mla %[res], r7, r3, %[res] \n"
+ "ldmia %[s2]!, {r4-r7} \n"
+ "add r0, r0, r4 \n"
+ "add r1, r1, r5 \n"
+ "add r2, r2, r6 \n"
+ "add r3, r3, r7 \n"
+ "stmia %[v1]!, {r0-r3} \n"
+ ".endr \n"
#if ORDER > 32
- "subs %[cnt], %[cnt], #1 \n"
- "bne 1b \n"
+ "subs %[cnt], %[cnt], #1 \n"
+ "bne 1b \n"
#endif
: /* outputs */
#if ORDER > 32
[cnt]"+r"(cnt),
#endif
[v1] "+r"(v1),
- [v2] "+r"(v2)
+ [f2] "+r"(f2),
+ [s2] "+r"(s2),
+ [res]"=r"(res)
: /* inputs */
: /* clobbers */
"r0", "r1", "r2", "r3", "r4",
"r5", "r6", "r7", "memory"
);
+ return res;
}
-static inline void vector_sub(int32_t* v1, int32_t* v2)
+/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) */
+static inline int32_t vector_sp_sub(int32_t* v1, int32_t* f2, int32_t* s2)
{
+ int res;
#if ORDER > 32
int cnt = ORDER>>5;
#endif
asm volatile (
- "1: \n"
- ".rept " ADD_SUB_BLOCKS "\n"
- "ldmia %[v1], {r0-r3} \n"
- "ldmia %[v2]!, {r4-r7} \n"
- "sub r0, r0, r4 \n"
- "sub r1, r1, r5 \n"
- "sub r2, r2, r6 \n"
- "sub r3, r3, r7 \n"
- "stmia %[v1]!, {r0-r3} \n"
- ".endr \n"
#if ORDER > 32
- "subs %[cnt], %[cnt], #1 \n"
- "bne 1b \n"
+ "mov %[res], #0 \n"
+ "1: \n"
+#else
+ "ldmia %[v1], {r0-r3} \n"
+ "ldmia %[f2]!, {r4-r7} \n"
+ "mul %[res], r4, r0 \n"
+ "mla %[res], r5, r1, %[res] \n"
+ "mla %[res], r6, r2, %[res] \n"
+ "mla %[res], r7, r3, %[res] \n"
+ "ldmia %[s2]!, {r4-r7} \n"
+ "sub r0, r0, r4 \n"
+ "sub r1, r1, r5 \n"
+ "sub r2, r2, r6 \n"
+ "sub r3, r3, r7 \n"
+ "stmia %[v1]!, {r0-r3} \n"
+#endif
+ ".rept " BLOCK_REPEAT "\n"
+ "ldmia %[v1], {r0-r3} \n"
+ "ldmia %[f2]!, {r4-r7} \n"
+ "mla %[res], r4, r0, %[res] \n"
+ "mla %[res], r5, r1, %[res] \n"
+ "mla %[res], r6, r2, %[res] \n"
+ "mla %[res], r7, r3, %[res] \n"
+ "ldmia %[s2]!, {r4-r7} \n"
+ "sub r0, r0, r4 \n"
+ "sub r1, r1, r5 \n"
+ "sub r2, r2, r6 \n"
+ "sub r3, r3, r7 \n"
+ "stmia %[v1]!, {r0-r3} \n"
+ ".endr \n"
+#if ORDER > 32
+ "subs %[cnt], %[cnt], #1 \n"
+ "bne 1b \n"
#endif
: /* outputs */
#if ORDER > 32
[cnt]"+r"(cnt),
#endif
[v1] "+r"(v1),
- [v2] "+r"(v2)
+ [f2] "+r"(f2),
+ [s2] "+r"(s2),
+ [res]"=r"(res)
: /* inputs */
: /* clobbers */
"r0", "r1", "r2", "r3", "r4",
"r5", "r6", "r7", "memory"
);
+ return res;
}
static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
@@ -106,78 +162,18 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
#endif
asm volatile (
-#if ORDER > 16
#if ORDER > 32
"mov %[res], #0 \n"
-#endif
- "ldmia %[v2]!, {r6-r7} \n"
"1: \n"
- "ldmia %[v1]!, {r0,r1,r3-r5} \n"
-#if ORDER > 32
- "mla %[res], r6, r0, %[res] \n"
#else
- "mul %[res], r6, r0 \n"
-#endif
- "mla %[res], r7, r1, %[res] \n"
- "ldmia %[v2]!, {r0-r2,r6-r8} \n"
- "mla %[res], r0, r3, %[res] \n"
- "mla %[res], r1, r4, %[res] \n"
- "mla %[res], r2, r5, %[res] \n"
- "ldmia %[v1]!, {r0-r4} \n"
- "mla %[res], r6, r0, %[res] \n"
- "mla %[res], r7, r1, %[res] \n"
- "mla %[res], r8, r2, %[res] \n"
- "ldmia %[v2]!, {r0,r1,r6-r8} \n"
- "mla %[res], r0, r3, %[res] \n"
- "mla %[res], r1, r4, %[res] \n"
- "ldmia %[v1]!, {r0-r5} \n"
- "mla %[res], r6, r0, %[res] \n"
- "mla %[res], r7, r1, %[res] \n"
- "mla %[res], r8, r2, %[res] \n"
- "ldmia %[v2]!, {r0-r2,r6,r7} \n"
- "mla %[res], r0, r3, %[res] \n"
- "mla %[res], r1, r4, %[res] \n"
- "mla %[res], r2, r5, %[res] \n"
- "ldmia %[v1]!, {r0,r1,r3-r5} \n"
- "mla %[res], r6, r0, %[res] \n"
- "mla %[res], r7, r1, %[res] \n"
- "ldmia %[v2]!, {r0-r2,r6-r8} \n"
- "mla %[res], r0, r3, %[res] \n"
- "mla %[res], r1, r4, %[res] \n"
- "mla %[res], r2, r5, %[res] \n"
- "ldmia %[v1]!, {r0-r4} \n"
- "mla %[res], r6, r0, %[res] \n"
- "mla %[res], r7, r1, %[res] \n"
- "mla %[res], r8, r2, %[res] \n"
- "ldmia %[v2]!, {r0,r1,r6-r8} \n"
- "mla %[res], r0, r3, %[res] \n"
- "mla %[res], r1, r4, %[res] \n"
- "ldmia %[v1]!, {r0-r5} \n"
- "mla %[res], r6, r0, %[res] \n"
- "mla %[res], r7, r1, %[res] \n"
- "mla %[res], r8, r2, %[res] \n"
-#if ORDER > 32
- "ldmia %[v2]!, {r0-r2,r6,r7} \n"
-#else
- "ldmia %[v2]!, {r0-r2} \n"
-#endif
- "mla %[res], r0, r3, %[res] \n"
- "mla %[res], r1, r4, %[res] \n"
- "mla %[res], r2, r5, %[res] \n"
-#if ORDER > 32
- "subs %[cnt], %[cnt], #1 \n"
- "bne 1b \n"
-#endif
-
-#else /* ORDER <= 16 */
"ldmia %[v1]!, {r0-r3} \n"
"ldmia %[v2]!, {r4-r7} \n"
"mul %[res], r4, r0 \n"
"mla %[res], r5, r1, %[res] \n"
"mla %[res], r6, r2, %[res] \n"
"mla %[res], r7, r3, %[res] \n"
-
- ".rept 3 \n"
+#endif
+ ".rept " BLOCK_REPEAT "\n"
"ldmia %[v1]!, {r0-r3} \n"
"ldmia %[v2]!, {r4-r7} \n"
"mla %[res], r4, r0, %[res] \n"
@@ -185,7 +181,10 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
"mla %[res], r6, r2, %[res] \n"
"mla %[res], r7, r3, %[res] \n"
".endr \n"
-#endif /* ORDER <= 16 */
+#if ORDER > 32
+ "subs %[cnt], %[cnt], #1 \n"
+ "bne 1b \n"
+#endif
: /* outputs */
#if ORDER > 32
[cnt]"+r"(cnt),
@@ -197,9 +196,6 @@ static inline int32_t scalarproduct(int32_t* v1, int32_t* v2)
: /* clobbers */
"r0", "r1", "r2", "r3",
"r4", "r5", "r6", "r7"
-#if ORDER > 16
- ,"r8"
-#endif
);
return res;
}
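
The new shared BLOCK_REPEAT constant keeps all three ARMv4 routines on one
unrolling scheme: every ldmia/mla/stmia block consumes four int32_t elements,
and outside the ORDER > 32 loop one extra block is emitted ahead of the .rept
(using mul to start the accumulator). The accounting, per the values in the
patch:

    /* Elements covered per pass = 4 * (number of blocks):
     *   ORDER <= 16:       1 leading block + 3 repeats = 4 blocks -> 16 elements,
     *                      no loop
     *   16 < ORDER <= 32:  1 leading block + 7 repeats = 8 blocks -> 32 elements,
     *                      no loop
     *   ORDER > 32:        8 repeats = 8 blocks -> 32 elements per iteration,
     *                      looped cnt = ORDER >> 5 times ("mov %[res], #0"
     *                      zeroes the accumulator so every block can use mla)
     */
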
diff --git a/apps/codecs/lib/udiv32_arm.S b/apps/codecs/lib/udiv32_arm.S
index 8efc92c..117b178 100644
--- a/apps/codecs/lib/udiv32_arm.S
+++ b/apps/codecs/lib/udiv32_arm.S
@@ -92,7 +92,7 @@
#if CONFIG_CPU == PP5020
.set recip_max, 8384
#elif CONFIG_CPU == PP5002
-.set recip_max, 4992
+.set recip_max, 4608
#else
.set recip_max, 16384
#endif