summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Arnold <amiconn@rockbox.org>2008-12-02 02:26:04 +0000
committerJens Arnold <amiconn@rockbox.org>2008-12-02 02:26:04 +0000
commitc1cd0469ca9f084b39d747ccca5d64442c3833ca (patch)
tree976efee8d7131013414583e5bd2ad2fe323c8063
parent6c65b357bca384a3d65a6795edc2928b889254ee (diff)
downloadrockbox-c1cd0469ca9f084b39d747ccca5d64442c3833ca.zip
rockbox-c1cd0469ca9f084b39d747ccca5d64442c3833ca.tar.gz
rockbox-c1cd0469ca9f084b39d747ccca5d64442c3833ca.tar.bz2
rockbox-c1cd0469ca9f084b39d747ccca5d64442c3833ca.tar.xz
Implement mono predictor in assembler for coldfire, yielding a ~6% speedup for mono -c1000. Apply ideas gained from it back to the stereo predictor, saving 4 instructions. No speed increase for stereo, probably due to cache aliasing effects. * 80-column police.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19296 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/demac/libdemac/predictor-cf.S435
-rw-r--r--apps/codecs/demac/libdemac/predictor.c2
2 files changed, 291 insertions, 146 deletions
diff --git a/apps/codecs/demac/libdemac/predictor-cf.S b/apps/codecs/demac/libdemac/predictor-cf.S
index cd2e07f..c76d7f6 100644
--- a/apps/codecs/demac/libdemac/predictor-cf.S
+++ b/apps/codecs/demac/libdemac/predictor-cf.S
@@ -25,13 +25,6 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
*/
#include "demac_config.h"
- .text
-
- .align 2
-
- .global predictor_decode_stereo
- .type predictor_decode_stereo,@function
-
/* NOTE: The following need to be kept in sync with parser.h */
#define YDELAYA 200
@@ -63,6 +56,13 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
#define historybuffer 100 /* int32_t historybuffer[] */
+ .text
+
+ .align 2
+
+ .global predictor_decode_stereo
+ .type predictor_decode_stereo,@function
+
| void predictor_decode_stereo(struct predictor_t* p,
| int32_t* decoded0,
| int32_t* decoded1,
@@ -92,6 +92,8 @@ predictor_decode_stereo:
| %d1 = p->buf[YDELAYA-2]
| %d2 = p->buf[YDELAYA-1]
+ move.l %d3, (YDELAYA,%a5) | p->buf[YDELAYA] = %d3
+
sub.l %d3, %d2
neg.l %d2 | %d2 = %d3 - %d2
@@ -102,12 +104,10 @@ predictor_decode_stereo:
| %d6 = p->YcoeffsA[2]
| %d7 = p->YcoeffsA[3]
- mac.l %d3, %d4, %acc0 | %acc0 = p->buf[YDELAYA] * p->YcoeffsA[0]
- mac.l %d2, %d5, %acc0 | %acc0 += p->buf[YDELAYA-1] * p->YcoeffsA[1]
- mac.l %d1, %d6, %acc0 | %acc0 += p->buf[YDELAYA-2] * p->YcoeffsA[2]
- mac.l %d0, %d7, %acc0 | %acc0 += p->buf[YDELAYA-3] * p->YcoeffsA[3]
-
- move.l %d3, (YDELAYA,%a5) | p->buf[YDELAYA] = %d3
+ mac.l %d3, %d4, %acc0 | %acc0 = p->buf[YDELAYA] * p->YcoeffsA[0]
+ mac.l %d2, %d5, %acc0 | %acc0 += p->buf[YDELAYA-1] * p->YcoeffsA[1]
+ mac.l %d1, %d6, %acc0 | %acc0 += p->buf[YDELAYA-2] * p->YcoeffsA[2]
+ mac.l %d0, %d7, %acc0 | %acc0 += p->buf[YDELAYA-3] * p->YcoeffsA[3]
tst.l %d2
beq.s 1f
@@ -125,10 +125,6 @@ predictor_decode_stereo:
1: | %d3 = SIGN(%d3)
move.l %d3, (YADAPTCOEFFSA,%a5) | p->buf[YADAPTCOEFFSA] = %d3
- movclr.l %acc0, %d0
-
- | NOTE: %d0 now contains predictionA - don't overwrite.
-
| Predictor Y, Filter B
movem.l (YfilterB,%a6), %d2-%d3 | %d2 = p->YfilterB
@@ -156,11 +152,11 @@ predictor_decode_stereo:
| %a1 = p->YcoeffsB[3]
| %a2 = p->YcoeffsB[4]
- mac.l %d3, %d1, %acc0 | %acc0 = p->buf[YDELAYB] * p->YcoeffsB[0]
- mac.l %d7, %d2, %acc0 | %acc0 += p->buf[YDELAYB-1] * p->YcoeffsB[1]
- mac.l %d6, %a0, %acc0 | %acc0 += p->buf[YDELAYB-2] * p->YcoeffsB[2]
- mac.l %d5, %a1, %acc0 | %acc0 += p->buf[YDELAYB-3] * p->YcoeffsB[3]
- mac.l %d4, %a2, %acc0 | %acc0 += p->buf[YDELAYB-4] * p->YcoeffsB[4]
+ mac.l %d3, %d1, %acc1 | %acc1 = p->buf[YDELAYB] * p->YcoeffsB[0]
+ mac.l %d7, %d2, %acc1 | %acc1 += p->buf[YDELAYB-1] * p->YcoeffsB[1]
+ mac.l %d6, %a0, %acc1 | %acc1 += p->buf[YDELAYB-2] * p->YcoeffsB[2]
+ mac.l %d5, %a1, %acc1 | %acc1 += p->buf[YDELAYB-3] * p->YcoeffsB[3]
+ mac.l %d4, %a2, %acc1 | %acc1 += p->buf[YDELAYB-4] * p->YcoeffsB[4]
move.l %d3, (YDELAYB, %a5) | p->buf[YDELAYB] = %d3
@@ -179,38 +175,10 @@ predictor_decode_stereo:
1: | %d3 = SIGN(%d3)
move.l %d3, (YADAPTCOEFFSB, %a5) | p->buf[YADAPTCOEFFSB] = %d3
- movclr.l %acc0, %d4
-
- | %d0 still contains predictionA
- | %d4 contains predictionB
-
- | Finish Predictor Y
-
- asr.l #1, %d4
- add.l %d4, %d0 | %d0 += (%d1 >> 1)
- move.l (%a3), %d5 | %d5 = *decoded0
- move.l %d5, %d4 | %d4 = %d5
- asr.l #8, %d0
- asr.l #2, %d0 | %d0 >>= 10
- add.l %d0, %d4 | %d4 += %d0
- move.l %d4, (YlastA,%a6) | p->YlastA = %d4
-
- move.l (YfilterA,%a6), %d6 | %d6 = p->YfilterA
- move.l %d6, %d0
- lsl.l #5, %d6
- sub.l %d0, %d6 | %d6 = 31 * %d6
- asr.l #5, %d6 | %d6 >>= 5
- add.l %d6, %d4
- move.l %d4, (YfilterA,%a6) | p->YfilterA = %d4
-
- | %d4 contains p->YfilterA
- | %d5 contains *decoded0
-
| %d1, %d2, %a0, %a1, %a2 contain p->YcoeffsB[0..4]
| %d7, %d3 contain p->buf[YADAPTCOEFFSB-1] and p->buf[YADAPTCOEFFSB]
- move.l %d4, (%a3)+ | *(decoded0++) = %d1 (p->YfilterA)
- tst.l %d5
+ move.l (%a3), %d0 | %d0 = *decoded0
beq.s 3f
movem.l (YADAPTCOEFFSB-16,%a5), %d4-%d6 | %d4 = p->buf[YADAPTCOEFFSB-4]
@@ -221,11 +189,11 @@ predictor_decode_stereo:
| *decoded0 > 0
- sub.l %d3, %d1 | %d1 = p->YcoeffsB[0] - p->buf[YADAPTCOEFFSB]
- sub.l %d7, %d2 | %d2 = p->YcoeffsB[1] - p->buf[YADAPTCOEFFSB-1]
- sub.l %d6, %a0 | %a0 = p->YcoeffsB[2] - p->buf[YADAPTCOEFFSB-2]
- sub.l %d5, %a1 | %a1 = p->YcoeffsB[3] - p->buf[YADAPTCOEFFSB-3]
- sub.l %d4, %a2 | %a2 = p->YcoeffsB[4] - p->buf[YADAPTCOEFFSB-4]
+ sub.l %d3, %d1 | %d1 = p->YcoeffsB[0] - p->buf[YADAPTCOEFFSB]
+ sub.l %d7, %d2 | %d2 = p->YcoeffsB[1] - p->buf[YADAPTCOEFFSB-1]
+ sub.l %d6, %a0 | %a0 = p->YcoeffsB[2] - p->buf[YADAPTCOEFFSB-2]
+ sub.l %d5, %a1 | %a1 = p->YcoeffsB[3] - p->buf[YADAPTCOEFFSB-3]
+ sub.l %d4, %a2 | %a2 = p->YcoeffsB[4] - p->buf[YADAPTCOEFFSB-4]
movem.l %d1-%d2/%a0-%a2, (YcoeffsB,%a6) | Save p->YcoeffsB[]
@@ -234,47 +202,69 @@ predictor_decode_stereo:
| %d6 = p->YcoeffsA[2]
| %d7 = p->YcoeffsA[3]
- movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 | %d2 = p->buf[YADAPTCOEFFSA-3]
+ movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2
+ | %d2 = p->buf[YADAPTCOEFFSA-3]
| %a0 = p->buf[YADAPTCOEFFSA-2]
| %a1 = p->buf[YADAPTCOEFFSA-1]
| %a2 = p->buf[YADAPTCOEFFSA]
- sub.l %a2, %d4 | %d4 = p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA]
- sub.l %a1, %d5 | %d5 = p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1]
- sub.l %a0, %d6 | %d6 = p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2]
- sub.l %d2, %d7 | %d7 = p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3]
+ sub.l %a2, %d4 | %d4 = p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA]
+ sub.l %a1, %d5 | %d5 = p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1]
+ sub.l %a0, %d6 | %d6 = p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2]
+ sub.l %d2, %d7 | %d7 = p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3]
bra.s 2f
1: | *decoded0 < 0
- add.l %d3, %d1 | %d1 = p->YcoeffsB[0] + p->buf[YADAPTCOEFFSB]
- add.l %d7, %d2 | %d2 = p->YcoeffsB[1] + p->buf[YADAPTCOEFFSB-1]
- add.l %d6, %a0 | %a0 = p->YcoeffsB[2] + p->buf[YADAPTCOEFFSB-2]
- add.l %d5, %a1 | %a1 = p->YcoeffsB[3] + p->buf[YADAPTCOEFFSB-3]
- add.l %d4, %a2 | %a2 = p->YcoeffsB[4] + p->buf[YADAPTCOEFFSB-4]
+ add.l %d3, %d1 | %d1 = p->YcoeffsB[0] + p->buf[YADAPTCOEFFSB]
+ add.l %d7, %d2 | %d2 = p->YcoeffsB[1] + p->buf[YADAPTCOEFFSB-1]
+ add.l %d6, %a0 | %a0 = p->YcoeffsB[2] + p->buf[YADAPTCOEFFSB-2]
+ add.l %d5, %a1 | %a1 = p->YcoeffsB[3] + p->buf[YADAPTCOEFFSB-3]
+ add.l %d4, %a2 | %a2 = p->YcoeffsB[4] + p->buf[YADAPTCOEFFSB-4]
movem.l %d1-%d2/%a0-%a2, (YcoeffsB,%a6) | Save p->YcoeffsB[]
-
+
movem.l (YcoeffsA,%a6), %d4-%d7 | %d4 = p->YcoeffsA[0]
| %d5 = p->YcoeffsA[1]
| %d6 = p->YcoeffsA[2]
| %d7 = p->YcoeffsA[3]
-
- movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 | %d2 = p->buf[YADAPTCOEFFSA-3]
+
+ movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2
+ | %d2 = p->buf[YADAPTCOEFFSA-3]
| %a0 = p->buf[YADAPTCOEFFSA-2]
| %a1 = p->buf[YADAPTCOEFFSA-1]
| %a2 = p->buf[YADAPTCOEFFSA]
-
- add.l %a2, %d4 | %d4 = p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA]
- add.l %a1, %d5 | %d5 = p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1]
- add.l %a0, %d6 | %d6 = p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2]
- add.l %d2, %d7 | %d7 = p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3]
+
+ add.l %a2, %d4 | %d4 = p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA]
+ add.l %a1, %d5 | %d5 = p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1]
+ add.l %a0, %d6 | %d6 = p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2]
+ add.l %d2, %d7 | %d7 = p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3]
2:
movem.l %d4-%d7, (YcoeffsA,%a6) | Save p->YcoeffsA[]
3:
+ | Finish Predictor Y
+
+ movclr.l %acc0, %d1 | %d1 = predictionA
+ movclr.l %acc1, %d2 | %d2 = predictionB
+ asr.l #1, %d2
+ add.l %d2, %d1 | %d1 += (%d2 >> 1)
+ asr.l #8, %d1
+ asr.l #2, %d1 | %d1 >>= 10
+ add.l %d0, %d1 | %d1 += %d0
+ move.l %d1, (YlastA,%a6) | p->YlastA = %d1
+
+ move.l (YfilterA,%a6), %d2 | %d2 = p->YfilterA
+ move.l %d2, %d0
+ lsl.l #5, %d2
+ sub.l %d0, %d2 | %d2 = 31 * %d2
+ asr.l #5, %d2 | %d2 >>= 5
+ add.l %d1, %d2
+ move.l %d2, (YfilterA,%a6) | p->YfilterA = %d2
+
+ | *decoded0 stored 2 instructions down, avoiding pipeline stall
| ***** PREDICTOR X *****
@@ -282,11 +272,15 @@ predictor_decode_stereo:
move.l (XlastA,%a6), %d3 | %d3 = p->XlastA
+ move.l %d2, (%a3)+ | *(decoded0++) = %d2 (p->YfilterA)
+
movem.l (XDELAYA-12,%a5), %d0-%d2 | %d0 = p->buf[XDELAYA-3]
| %d1 = p->buf[XDELAYA-2]
| %d2 = p->buf[XDELAYA-1]
- sub.l %d3, %d2
+ move.l %d3, (XDELAYA,%a5) | p->buf[XDELAYA] = %d3
+
+ sub.l %d3, %d2
neg.l %d2 | %d2 = %d3 -%d2
move.l %d2, (XDELAYA-4,%a5) | p->buf[XDELAYA-1] = %d2
@@ -296,13 +290,11 @@ predictor_decode_stereo:
| %d6 = p->XcoeffsA[2]
| %d7 = p->XcoeffsA[3]
- mac.l %d3, %d4, %acc0 | %acc0 = p->buf[XDELAYA] * p->XcoeffsA[0]
- mac.l %d2, %d5, %acc0 | %acc0 += p->buf[XDELAYA-1] * p->XcoeffsA[1]
- mac.l %d1, %d6, %acc0 | %acc0 += p->buf[XDELAYA-2] * p->XcoeffsA[2]
- mac.l %d0, %d7, %acc0 | %acc0 += p->buf[XDELAYA-3] * p->XcoeffsA[3]
+ mac.l %d3, %d4, %acc0 | %acc0 = p->buf[XDELAYA] * p->XcoeffsA[0]
+ mac.l %d2, %d5, %acc0 | %acc0 += p->buf[XDELAYA-1] * p->XcoeffsA[1]
+ mac.l %d1, %d6, %acc0 | %acc0 += p->buf[XDELAYA-2] * p->XcoeffsA[2]
+ mac.l %d0, %d7, %acc0 | %acc0 += p->buf[XDELAYA-3] * p->XcoeffsA[3]
- move.l %d3, (XDELAYA,%a5) | p->buf[XDELAYA] = %d3
-
tst.l %d2
beq.s 1f
spl.b %d2 | pos: 0x??????ff, neg: 0x??????00
@@ -319,10 +311,6 @@ predictor_decode_stereo:
1: | %d3 = SIGN(%d3)
move.l %d3, (XADAPTCOEFFSA,%a5) | p->buf[XADAPTCOEFFSA] = %d3
- movclr.l %acc0, %d0
-
- | NOTE: %d0 now contains predictionA - don't overwrite.
-
| Predictor X, Filter B
movem.l (XfilterB,%a6), %d2-%d3 | %d2 = p->XfilterB
@@ -350,11 +338,11 @@ predictor_decode_stereo:
| %a1 = p->XcoeffsB[3]
| %a2 = p->XcoeffsB[4]
- mac.l %d3, %d1, %acc0 | %acc0 = p->buf[XDELAYB] * p->XcoeffsB[0]
- mac.l %d7, %d2, %acc0 | %acc0 += p->buf[XDELAYB-1] * p->XcoeffsB[1]
- mac.l %d6, %a0, %acc0 | %acc0 += p->buf[XDELAYB-2] * p->XcoeffsB[2]
- mac.l %d5, %a1, %acc0 | %acc0 += p->buf[XDELAYB-3] * p->XcoeffsB[3]
- mac.l %d4, %a2, %acc0 | %acc0 += p->buf[XDELAYB-4] * p->XcoeffsB[4]
+ mac.l %d3, %d1, %acc1 | %acc1 = p->buf[XDELAYB] * p->XcoeffsB[0]
+ mac.l %d7, %d2, %acc1 | %acc1 += p->buf[XDELAYB-1] * p->XcoeffsB[1]
+ mac.l %d6, %a0, %acc1 | %acc1 += p->buf[XDELAYB-2] * p->XcoeffsB[2]
+ mac.l %d5, %a1, %acc1 | %acc1 += p->buf[XDELAYB-3] * p->XcoeffsB[3]
+ mac.l %d4, %a2, %acc1 | %acc1 += p->buf[XDELAYB-4] * p->XcoeffsB[4]
move.l %d3, (XDELAYB, %a5) | p->buf[XDELAYB] = %d3
@@ -374,38 +362,10 @@ predictor_decode_stereo:
1: | %d3 = SIGN(%d3)
move.l %d3, (XADAPTCOEFFSB, %a5) | p->buf[XADAPTCOEFFSB] = %d3
- movclr.l %acc0, %d4
-
- | %d0 still contains predictionA
- | %d4 contains predictionB
-
- | Finish Predictor X
-
- asr.l #1, %d4
- add.l %d4, %d0 | %d0 += (%d1 >> 1)
- move.l (%a4), %d5 | %d5 = *decoded1
- move.l %d5, %d4 | %d4 = %d5
- asr.l #8, %d0
- asr.l #2, %d0 | %d0 >>= 10
- add.l %d0, %d4 | %d4 += %d0
- move.l %d4, (XlastA,%a6) | p->XlastA = %d1
-
- move.l (XfilterA,%a6), %d6 | %d6 = p->XfilterA
- move.l %d6, %d0
- lsl.l #5, %d6
- sub.l %d0, %d6 | %d6 = 31 * %d6
- asr.l #5, %d6 | %d6 >>= 5
- add.l %d6, %d4
- move.l %d4, (XfilterA,%a6) | p->XfilterA = %d6
-
- | %d4 contains p->XfilterA
- | %d5 contains *decoded1
-
| %d1, %d2, %a0, %a1, %a2 contain p->XcoeffsB[0..4]
| %d7, %d3 contain p->buf[XADAPTCOEFFSB-1] and p->buf[XADAPTCOEFFSB]
- move.l %d4, (%a4)+ | *(decoded1++) = %d1 (p->XfilterA)
- tst.l %d5
+ move.l (%a4), %d0 | %d0 = *decoded1
beq.s 3f
movem.l (XADAPTCOEFFSB-16,%a5), %d4-%d6 | %d4 = p->buf[XADAPTCOEFFSB-4]
@@ -416,38 +376,39 @@ predictor_decode_stereo:
| *decoded1 > 0
- sub.l %d3, %d1 | %d1 = p->XcoeffsB[0] - p->buf[XADAPTCOEFFSB]
- sub.l %d7, %d2 | %d2 = p->XcoeffsB[1] - p->buf[XADAPTCOEFFSB-1]
- sub.l %d6, %a0 | %a0 = p->XcoeffsB[2] - p->buf[XADAPTCOEFFSB-2]
- sub.l %d5, %a1 | %a1 = p->XcoeffsB[3] - p->buf[XADAPTCOEFFSB-3]
- sub.l %d4, %a2 | %a2 = p->XcoeffsB[4] - p->buf[XADAPTCOEFFSB-4]
-
+ sub.l %d3, %d1 | %d1 = p->XcoeffsB[0] - p->buf[XADAPTCOEFFSB]
+ sub.l %d7, %d2 | %d2 = p->XcoeffsB[1] - p->buf[XADAPTCOEFFSB-1]
+ sub.l %d6, %a0 | %a0 = p->XcoeffsB[2] - p->buf[XADAPTCOEFFSB-2]
+ sub.l %d5, %a1 | %a1 = p->XcoeffsB[3] - p->buf[XADAPTCOEFFSB-3]
+ sub.l %d4, %a2 | %a2 = p->XcoeffsB[4] - p->buf[XADAPTCOEFFSB-4]
+
movem.l %d1-%d2/%a0-%a2, (XcoeffsB,%a6) | Save p->XcoeffsB[]
-
+
movem.l (XcoeffsA,%a6), %d4-%d7 | %d4 = p->XcoeffsA[0]
| %d5 = p->XcoeffsA[1]
| %d6 = p->XcoeffsA[2]
| %d7 = p->XcoeffsA[3]
- movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 | %d2 = p->buf[XADAPTCOEFFSA-3]
+ movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2
+ | %d2 = p->buf[XADAPTCOEFFSA-3]
| %a0 = p->buf[XADAPTCOEFFSA-2]
| %a1 = p->buf[XADAPTCOEFFSA-1]
| %a2 = p->buf[XADAPTCOEFFSA]
- sub.l %a2, %d4 | %d4 = p->XcoeffsA[0] - p->buf[XADAPTCOEFFSA]
- sub.l %a1, %d5 | %d5 = p->XcoeffsA[1] - p->buf[XADAPTCOEFFSA-1]
- sub.l %a0, %d6 | %d6 = p->XcoeffsA[2] - p->buf[XADAPTCOEFFSA-2]
- sub.l %d2, %d7 | %d7 = p->XcoeffsA[3] - p->buf[XADAPTCOEFFSA-3]
-
+ sub.l %a2, %d4 | %d4 = p->XcoeffsA[0] - p->buf[XADAPTCOEFFSA]
+ sub.l %a1, %d5 | %d5 = p->XcoeffsA[1] - p->buf[XADAPTCOEFFSA-1]
+ sub.l %a0, %d6 | %d6 = p->XcoeffsA[2] - p->buf[XADAPTCOEFFSA-2]
+ sub.l %d2, %d7 | %d7 = p->XcoeffsA[3] - p->buf[XADAPTCOEFFSA-3]
+
bra.s 2f
1: | *decoded1 < 0
- add.l %d3, %d1 | %d1 = p->XcoeffsB[0] + p->buf[XADAPTCOEFFSB]
- add.l %d7, %d2 | %d2 = p->XcoeffsB[1] + p->buf[XADAPTCOEFFSB-1]
- add.l %d6, %a0 | %a0 = p->XcoeffsB[2] + p->buf[XADAPTCOEFFSB-2]
- add.l %d5, %a1 | %a1 = p->XcoeffsB[3] + p->buf[XADAPTCOEFFSB-3]
- add.l %d4, %a2 | %a2 = p->XcoeffsB[4] + p->buf[XADAPTCOEFFSB-4]
+ add.l %d3, %d1 | %d1 = p->XcoeffsB[0] + p->buf[XADAPTCOEFFSB]
+ add.l %d7, %d2 | %d2 = p->XcoeffsB[1] + p->buf[XADAPTCOEFFSB-1]
+ add.l %d6, %a0 | %a0 = p->XcoeffsB[2] + p->buf[XADAPTCOEFFSB-2]
+ add.l %d5, %a1 | %a1 = p->XcoeffsB[3] + p->buf[XADAPTCOEFFSB-3]
+ add.l %d4, %a2 | %a2 = p->XcoeffsB[4] + p->buf[XADAPTCOEFFSB-4]
movem.l %d1-%d2/%a0-%a2, (XcoeffsB,%a6) | Save p->XcoeffsB[]
@@ -456,31 +417,53 @@ predictor_decode_stereo:
| %d6 = p->XcoeffsA[2]
| %d7 = p->XcoeffsA[3]
- movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 | %d2 = p->buf[XADAPTCOEFFSA-3]
+ movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2
+ | %d2 = p->buf[XADAPTCOEFFSA-3]
| %a0 = p->buf[XADAPTCOEFFSA-2]
| %a1 = p->buf[XADAPTCOEFFSA-1]
| %a2 = p->buf[XADAPTCOEFFSA]
- add.l %a2, %d4 | %d4 = p->XcoeffsA[0] + p->buf[XADAPTCOEFFSA]
- add.l %a1, %d5 | %d5 = p->XcoeffsA[1] + p->buf[XADAPTCOEFFSA-1]
- add.l %a0, %d6 | %d6 = p->XcoeffsA[2] + p->buf[XADAPTCOEFFSA-2]
- add.l %d2, %d7 | %d7 = p->XcoeffsA[3] + p->buf[XADAPTCOEFFSA-3]
+ add.l %a2, %d4 | %d4 = p->XcoeffsA[0] + p->buf[XADAPTCOEFFSA]
+ add.l %a1, %d5 | %d5 = p->XcoeffsA[1] + p->buf[XADAPTCOEFFSA-1]
+ add.l %a0, %d6 | %d6 = p->XcoeffsA[2] + p->buf[XADAPTCOEFFSA-2]
+ add.l %d2, %d7 | %d7 = p->XcoeffsA[3] + p->buf[XADAPTCOEFFSA-3]
2:
movem.l %d4-%d7, (XcoeffsA,%a6) | Save p->XcoeffsA[]
3:
+ | Finish Predictor X
+
+ movclr.l %acc0, %d1 | %d1 = predictionA
+ movclr.l %acc1, %d2 | %d2 = predictionB
+ asr.l #1, %d2
+ add.l %d2, %d1 | %d1 += (%d2 >> 1)
+ asr.l #8, %d1
+ asr.l #2, %d1 | %d1 >>= 10
+ add.l %d0, %d1 | %d1 += %d0
+ move.l %d1, (XlastA,%a6) | p->XlastA = %d1
+
+ move.l (XfilterA,%a6), %d2 | %d2 = p->XfilterA
+ move.l %d2, %d0
+ lsl.l #5, %d2
+ sub.l %d0, %d2 | %d2 = 31 * %d2
+ asr.l #5, %d2 | %d2 >>= 5
+ add.l %d1, %d2
+ move.l %d2, (XfilterA,%a6) | p->XfilterA = %d2
+
+ | *decoded1 stored 3 instructions down, avoiding pipeline stall
| ***** COMMON *****
addq.l #4, %a5 | p->buf++
-
lea.l (historybuffer+PREDICTOR_HISTORY_SIZE*4,%a6), %a2
- | %a2 = &p->historybuffer[PREDICTOR_HISTORY_SIZE]
+ | %a2 = &p->historybuffer[PREDICTOR_HISTORY_SIZE]
+ move.l %d2, (%a4)+ | *(decoded1++) = %d2 (p->XfilterA)
+
cmp.l %a2, %a5
- beq.s .move_hist | The history buffer is full, we need to do a memmove
-
+ beq.s .move_hist | History buffer is full, we need to do a memmove
+
subq.l #1, (%sp) | decrease loop count
bne.w .loop
@@ -514,3 +497,163 @@ predictor_decode_stereo:
bne.w .loop
bra.s .done
+ .size predictor_decode_stereo, .-predictor_decode_stereo
+
+
+ .global predictor_decode_mono
+ .type predictor_decode_mono,@function
+| ColdFire asm replacement for the C predictor_decode_mono() in predictor.c.
+| void predictor_decode_mono(struct predictor_t* p,
+| int32_t* decoded0,
+| int count)
+| Filters count samples of decoded0[] in place, adapting p->YcoeffsA.
+predictor_decode_mono:
+ lea.l (-11*4,%sp), %sp | make room for 11 registers
+ movem.l %d2-%d7/%a2-%a6, (%sp) | save %d2-%d7/%a2-%a6
+
+ move.l #0, %macsr | signed integer mode
+
+ move.l (11*4+4,%sp), %a6 | %a6 = p
+ move.l (11*4+8,%sp), %a4 | %a4 = decoded0
+ move.l (11*4+12,%sp), %d7 | %d7 = count
+ move.l (%a6), %a5 | %a5 = p->buf
+
+ move.l (YlastA,%a6), %d3 | %d3 = p->YlastA
+
+.loopm:
+
+ | ***** PREDICTOR *****
+
+ movem.l (YDELAYA-12,%a5), %d0-%d2 | %d0 = p->buf[YDELAYA-3]
+ | %d1 = p->buf[YDELAYA-2]
+ | %d2 = p->buf[YDELAYA-1]
+
+ move.l %d3, (YDELAYA,%a5) | p->buf[YDELAYA] = %d3
+
+ sub.l %d3, %d2
+ neg.l %d2 | %d2 = %d3 - %d2
+
+ move.l %d2, (YDELAYA-4,%a5) | p->buf[YDELAYA-1] = %d2
+
+ movem.l (YcoeffsA,%a6), %a0-%a3 | %a0 = p->YcoeffsA[0]
+ | %a1 = p->YcoeffsA[1]
+ | %a2 = p->YcoeffsA[2]
+ | %a3 = p->YcoeffsA[3]
+
+ mac.l %d3, %a0, %acc0 | %acc0 = p->buf[YDELAYA] * p->YcoeffsA[0]
+ mac.l %d2, %a1, %acc0 | %acc0 += p->buf[YDELAYA-1] * p->YcoeffsA[1]
+ mac.l %d1, %a2, %acc0 | %acc0 += p->buf[YDELAYA-2] * p->YcoeffsA[2]
+ mac.l %d0, %a3, %acc0 | %acc0 += p->buf[YDELAYA-3] * p->YcoeffsA[3]
+
+ tst.l %d2
+ beq.s 1f
+ spl.b %d2 | pos: 0x??????ff, neg: 0x??????00
+ extb.l %d2 | pos: 0xffffffff, neg: 0x00000000
+ or.l #1, %d2 | pos: 0xffffffff, neg: 0x00000001
+1: | %d2 = SIGN(%d2)
+ move.l %d2, (YADAPTCOEFFSA-4,%a5) | p->buf[YADAPTCOEFFSA-1] = %d2
+
+ tst.l %d3
+ beq.s 1f
+ spl.b %d3 | same sequence as for %d2 above
+ extb.l %d3
+ or.l #1, %d3
+1: | %d3 = SIGN(%d3)
+ move.l %d3, (YADAPTCOEFFSA,%a5) | p->buf[YADAPTCOEFFSA] = %d3
+
+ move.l (%a4), %d0 | %d0 = *decoded0
+ beq.s 3f | *decoded0 == 0: coefficients unchanged
+
+ movem.l (YADAPTCOEFFSA-12,%a5),%d4-%d5 | %d4 = p->buf[YADAPTCOEFFSA-3]
+ | %d5 = p->buf[YADAPTCOEFFSA-2]
+
+ bmi.s 1f | flags still valid here
+
+ | *decoded0 > 0
+
+ sub.l %d3, %a0 | %a0 = p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA]
+ sub.l %d2, %a1 | %a1 = p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1]
+ sub.l %d5, %a2 | %a2 = p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2]
+ sub.l %d4, %a3 | %a3 = p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3]
+
+ bra.s 2f
+
+1: | *decoded0 < 0
+
+ add.l %d3, %a0 | %a0 = p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA]
+ add.l %d2, %a1 | %a1 = p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1]
+ add.l %d5, %a2 | %a2 = p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2]
+ add.l %d4, %a3 | %a3 = p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3]
+
+2:
+ movem.l %a0-%a3, (YcoeffsA,%a6) | save p->YcoeffsA[]
+
+3:
+ | Finish Predictor
+
+ movclr.l %acc0, %d3 | %d3 = predictionA
+ asr.l #8, %d3
+ asr.l #2, %d3 | %d3 >>= 10
+ add.l %d0, %d3 | %d3 += %d0 (becomes the next p->YlastA)
+
+ move.l (YfilterA,%a6), %d2 | %d2 = p->YfilterA
+ move.l %d2, %d0
+ lsl.l #5, %d2
+ sub.l %d0, %d2 | %d2 = 31 * %d2
+ asr.l #5, %d2 | %d2 >>= 5
+ add.l %d3, %d2
+ move.l %d2, (YfilterA,%a6) | p->YfilterA = %d2
+
+ | *decoded0 stored 3 instructions down, avoiding pipeline stall
+
+ | ***** COMMON *****
+
+ addq.l #4, %a5 | p->buf++
+ lea.l (historybuffer+PREDICTOR_HISTORY_SIZE*4,%a6), %a3
+ | %a3 = &p->historybuffer[PREDICTOR_HISTORY_SIZE]
+
+ move.l %d2, (%a4)+ | *(decoded0++) = %d2 (p->YfilterA)
+
+ cmp.l %a3, %a5
+ beq.s .move_histm | History buffer is full, we need to do a memmove
+
+ subq.l #1, %d7 | decrease loop count
+ bne.w .loopm
+
+ move.l %d3, (YlastA,%a6) | p->YlastA = %d3
+
+.donem:
+ move.l %a5, (%a6) | Save value of p->buf
+ movem.l (%sp), %d2-%d7/%a2-%a6 | restore %d2-%d7/%a2-%a6
+ lea.l (11*4,%sp), %sp
+ rts
+
+.move_histm:
+ move.l %d3, (YlastA,%a6) | p->YlastA = %d3 (copy below clobbers %d3)
+
+ lea.l (historybuffer,%a6), %a3
+
+ | dest = %a3 (p->historybuffer)
+ | src = %a5 (p->buf)
+ | n = 200 bytes (5 x 40)
+
+ movem.l (%a5), %d0-%d6/%a0-%a2 | 40 bytes
+ movem.l %d0-%d6/%a0-%a2, (%a3)
+ movem.l (40,%a5), %d0-%d6/%a0-%a2 | 40 bytes
+ movem.l %d0-%d6/%a0-%a2, (40,%a3)
+ movem.l (80,%a5), %d0-%d6/%a0-%a2 | 40 bytes
+ movem.l %d0-%d6/%a0-%a2, (80,%a3)
+ movem.l (120,%a5), %d0-%d6/%a0-%a2 | 40 bytes
+ movem.l %d0-%d6/%a0-%a2, (120,%a3)
+ movem.l (160,%a5), %d0-%d6/%a0-%a2 | 40 bytes
+ movem.l %d0-%d6/%a0-%a2, (160,%a3)
+
+ move.l %a3, %a5 | p->buf = &p->historybuffer[0]
+
+ move.l (YlastA,%a6), %d3 | %d3 = p->YlastA
+
+ subq.l #1, %d7 | decrease loop count
+ bne.w .loopm
+
+ bra.s .donem
+ .size predictor_decode_mono, .-predictor_decode_mono
diff --git a/apps/codecs/demac/libdemac/predictor.c b/apps/codecs/demac/libdemac/predictor.c
index d4f886f..0d03d1d 100644
--- a/apps/codecs/demac/libdemac/predictor.c
+++ b/apps/codecs/demac/libdemac/predictor.c
@@ -211,6 +211,7 @@ void ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
}
#endif
+#if !defined(CPU_COLDFIRE)
void ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p,
int32_t* decoded0,
int count)
@@ -269,3 +270,4 @@ void ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p,
p->YlastA = currentA;
}
+#endif