SWCODEC & Coldfire: Do some more DSP straightening out. Do as much Coldfire optimizing as seems reasonably possible by jumping through some hoops to avoid stalls. Further boost reduction will just be fractional points if taken to extremes-- not worth it. Wrap up the ASM for a while.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@12905 a1c6a512-1295-4272-9138-f99709370657
author: Michael Sevakis <jethead71@rockbox.org> 2007-03-25 04:03:44 +0000
committer: Michael Sevakis <jethead71@rockbox.org> 2007-03-25 04:03:44 +0000
commit: 369c2a37b7176e4f9c44f00a31b3b74e62b0b5d7 (patch)
tree: 7620c7da1d611d0d9a339487b6b264e44c6201bd /apps/dsp_cf.S
parent: cd630c9e0a2e0aa259a6e53a5af1369f36984b1c (diff)
download: rockbox-369c2a37b7176e4f9c44f00a31b3b74e62b0b5d7.zip
rockbox-369c2a37b7176e4f9c44f00a31b3b74e62b0b5d7.tar.gz
rockbox-369c2a37b7176e4f9c44f00a31b3b74e62b0b5d7.tar.bz2
rockbox-369c2a37b7176e4f9c44f00a31b3b74e62b0b5d7.tar.xz
1 files changed, 254 insertions, 170 deletions
diff --git a/apps/dsp_cf.S b/apps/dsp_cf.S
index af9ac1f..e5d3ee8 100644
--- a/apps/dsp_cf.S
+++ b/apps/dsp_cf.S
@@ -19,68 +19,117 @@
  ****************************************************************************/
 
 /****************************************************************************
- * void apply_crossfeed(int count, int32_t *src[])
+ * void dsp_apply_gain(int count, struct dsp_data *data, int32_t *buf[])
  */
     .section    .text
+	.align      2
+    .global     dsp_apply_gain
+dsp_apply_gain:
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  | ...5 longwords = the 20 bytes reserved
+    movem.l     28(%sp), %a0-%a1        | %a0 = data,
+                                        | %a1 = buf
+	move.l      4(%a0), %d1             | %d1 = data->num_channels
+    move.l      32(%a0), %a0            | %a0 = data->gain (in s8.23)
+10: | channel loop                      |
+	move.l      24(%sp), %d0            | %d0 = count
+    move.l      -4(%a1, %d1.l*4), %a2   | %a2 = s = buf[ch-1]
+    move.l      %a2, %a3                | %a3 = d = s
+    move.l      (%a2)+, %d2             | %d2 = *s++,
+    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
+    subq.l      #1, %d0                 | --count > 0 ? : effectively n++
+    ble.b       30f | loop done         | no? finish up
+20: | loop                              |
+    move.l      %accext01, %d4          | fetch S(n-1)[7:0]
+    movclr.l    %acc0, %d3              | fetch S(n-1)[40:8] in %d3[31:0]
+    asl.l       #8, %d3                 | *d++ = (S(n-1)[40:8] << 8) | S(n-1)[7:0]
+    mac.l       %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1)
+    move.b      %d4, %d3                | ...merge the low byte into the result
+    move.l      %d3, (%a3)+             | ...store through d, one behind the read of s
+    subq.l      #1, %d0                 | --count > 0 ? : effectively n++
+    bgt.b       20b | loop              | yes? do more samples
+30: | loop done                         |
+    move.l      %accext01, %d4          | fetch S(n-1)[7:0]
+    movclr.l    %acc0, %d3              | fetch S(n-1)[40:8] in %d3[31:0]
+    asl.l       #8, %d3                 | *d = (S(n-1)[40:8] << 8) | S(n-1)[7:0]
+    move.b      %d4, %d3                | ...merge the low byte into the result
+    move.l      %d3, (%a3)              | ...store the last sample, no increment
+	subq.l      #1, %d1                 | next channel
+	bgt.b       10b | channel loop      |
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup stack
+    rts                                 |
+    .size       dsp_apply_gain,.-dsp_apply_gain
+
+/****************************************************************************
+ * void apply_crossfeed(int count, int32_t *buf[])
+ */
     .section    .text
+        .align      2
     .global     apply_crossfeed 
 apply_crossfeed:
-    lea.l       -44(%sp), %sp
+    lea.l       -44(%sp), %sp           | room for the 11 registers saved below
     movem.l     %d2-%d7/%a2-%a6, (%sp)  | save all regs
     movem.l     48(%sp), %d7/%a4        | %d7 = count, %a4 = src
     movem.l     (%a4), %a4-%a5          | %a4 = src[0], %a5 = src[1]
-    lea.l       crossfeed_data, %a1
-    move.l      (%a1)+, %a6             | a6 = direct gain
+    lea.l       crossfeed_data, %a1     | %a1 = &crossfeed_data
+    move.l      (%a1)+, %d6             | %d6 = direct gain
     movem.l     12(%a1), %d0-%d3        | fetch filter history samples
     move.l      132(%a1), %a0           | fetch delay line address
     movem.l     (%a1), %a1-%a3          | load filter coefs
+    lea.l       crossfeed_data+136, %a6 | %a6 = delay line wrap limit
+    bra.b       20f | loop start        | go to loop start point
     /* Register usage in loop:
      * %a0 = delay_p, %a1..%a3 = b0, b1, a1 (filter coefs),
-     * %a4 = src[0], %a5 = src[1], %a6 = direct gain,
+     * %a4 = buf[0], %a5 = buf[1],
+     * %a6 = delay line pointer wrap limit,
      * %d0..%d3 = history
-     * %d4..%d6 = temp.
+     * %d4..%d5 = temp.
+     * %d6 = direct gain,
      * %d7 = count
      */
-.cfloop:
-    mac.l       %a2, %d0, 4(%a0), %d0, %acc0 | acc  = b1*dr[n - 1] d0 = dr[n]
-    mac.l       %a1, %d0             , %acc0 | acc += b0*dr[n]
-    mac.l       %a3, %d1,  (%a4), %d4, %acc0 | acc += a1*y_l[n - 1], load L
-    move.l      %acc0, %d1              | get filtered delayed sample
-    mac.l       %a6, %d4, %acc0         | acc += gain*x_l[n]
-    movclr.l    %acc0, %d6              |
-    move.l      %d6, (%a4)+             | write result
-
-    mac.l       %a2, %d2, (%a0), %d2, %acc0 | acc  = b1*dl[n - 1], d2 = dl[n]
-    mac.l       %a1, %d2            , %acc0 | acc += b0*dl[n]
-    mac.l       %a3, %d3, (%a5), %d5, %acc0 | acc += a1*y_r[n - 1], load R
-    movem.l     %d4-%d5, (%a0)          | save left & right inputs to delay line
-    move.l      %acc0, %d3              | get filtered delayed sample
-    mac.l       %a6, %d5, %acc0         | acc += gain*x_r[n]
-    lea.l       8(%a0), %a0             | increment delay pointer
-    movclr.l    %acc0, %d6              |
-    move.l      %d6, (%a5)+             | write result
-
-    cmpa.l      #crossfeed_data+136, %a0| wrap a0 if passed end
-    bge.b       .cfwrap                 |
-    .word       0x51fb                  | tpf.l - trap the buffer wrap
-.cfwrap:
-    lea.l       -104(%a0), %a0          | wrap
-    subq.l      #1, %d7                 | --count < 0 ?
-    bgt.b       .cfloop                 |
+10: | loop                              |
+    movclr.l    %acc0, %d4              | write outputs
+    move.l      %d4, (%a4)+             | .
+    movclr.l    %acc1, %d5              | .
+    move.l      %d5, (%a5)+             | .
+20: | loop start                        |
+    mac.l       %a2, %d0, (%a0)+, %d0, %acc0 | %acc0  = b1*dl[n - 1], %d0 = dl[n]
+    mac.l       %a1, %d0             , %acc0 | %acc0 += b0*dl[n]
+    mac.l       %a3, %d1, (%a5),  %d5, %acc0 | %acc0 += a1*y_r[n - 1], load R
+    mac.l       %a2, %d2, (%a0)+, %d2, %acc1 | %acc1  = b1*dr[n - 1], %d2 = dr[n]
+    mac.l       %a1, %d2             , %acc1 | %acc1 += b0*dr[n]
+    mac.l       %a3, %d3, (%a4),  %d4, %acc1 | %acc1 += a1*y_l[n - 1], load L
+    movem.l     %d4-%d5, -8(%a0)        | save left & right inputs to delay line
+    move.l      %acc0, %d3              | get filtered delayed left sample (y_l[n]); NOTE(review): %acc0's a1 term read %d1, not %d3 - confirm intended
+    move.l      %acc1, %d1              | get filtered delayed right sample (y_r[n])
+    mac.l       %d6, %d4, %acc0         | %acc0 += gain*x_l[n]
+    mac.l       %d6, %d5, %acc1         | %acc1 += gain*x_r[n]
+    cmp.l       %a6, %a0                | wrap %a0 if passed end
+    bhs.b       30f | wrap buffer       |
+    .word       0x51fb | tpf.l          | no wrap: tpf.l swallows the 4-byte lea below
+30: | wrap buffer                       | ...fwd taken branches more costly
+    lea.l       -104(%a0), %a0          | wrap: rewind %a0 by 104 bytes
+    subq.l      #1, %d7                 | --count > 0 ?
+    bgt.b       10b | loop              | yes? do more
+    movclr.l    %acc0, %d4              | write last outputs
+    move.l      %d4, (%a4)              | .
+    movclr.l    %acc1, %d5              | .
+    move.l      %d5, (%a5)              | .
     lea.l       crossfeed_data+16, %a1  | save data back to struct
     movem.l     %d0-%d3, (%a1)          | ...history
     move.l      %a0, 120(%a1)           | ...delay_p
     movem.l     (%sp), %d2-%d7/%a2-%a6  | restore all regs
-    lea.l       44(%sp), %sp
-    rts
-.cfend:
-    .size       apply_crossfeed,.cfend-apply_crossfeed
-
+    lea.l       44(%sp), %sp            | release the register save area
+    rts                                 |
+    .size       apply_crossfeed,.-apply_crossfeed 
 
 /****************************************************************************
  * int dsp_downsample(int count, struct dsp_data *data,
  *                    in32_t *src[], int32_t *dst[])
  */
     .section    .text
+	.align      2
     .global     dsp_downsample
 dsp_downsample:
     lea.l       -40(%sp), %sp           | save non-clobberables
@@ -92,7 +141,7 @@ dsp_downsample:
     movem.l     4(%a0), %d3-%d4         | %d3 = ch = data->num_channels
                                         | %d4 = delta = data->resample_data.delta
     moveq.l     #16, %d7                | %d7 = shift
-.dschannel_loop:
+10: | channel loop                      |
     move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
     move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
     move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
@@ -102,15 +151,15 @@ dsp_downsample:
     move.l      %d5, %d6                | %d6 = pos = phase >> 16
     lsr.l       %d7, %d6                |
     cmp.l       %d2, %d6                | past end of samples?
-    bge.b       .dsloop_skip            | yes? skip loop
+    bge.b       40f | skip resample loop| yes? skip loop
     tst.l       %d6                     | need last sample of prev. frame?
-    bne.b       .dsloop                 | no? start main loop
+    bne.b       20f | resample loop     | no? start main loop
     move.l      (%a3, %d6.l*4), %d1     | %d1 = s[pos]
-    bra.b       .dsuse_last_start       | start with last (last in %d0)
-.dsloop:
+    bra.b       30f | resample start last | start with last (last in %d0)
+20: | resample loop                     |
     lea.l       -4(%a3, %d6.l*4), %a5   | load s[pos-1] and s[pos]
     movem.l     (%a5), %d0-%d1          |
-.dsuse_last_start:
+30: | resample start last               |
     sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
     move.l      %d0, %acc0              | %acc0 = previous sample
     move.l      %d5, %d0                | frac = (phase << 16) >> 1
@@ -123,11 +172,11 @@ dsp_downsample:
     movclr.l    %acc0, %d0              |
     move.l      %d0, (%a4)+             | *d++ = %d0
     cmp.l       %d2, %d6                | pos < count?
-    blt.b       .dsloop                 | yes? continue resampling
-.dsloop_skip:
+    blt.b       20b | resample loop     | yes? continue resampling
+40: | skip resample loop                |
     subq.l      #1, %d3                 | ch > 0?
-    bgt.b       .dschannel_loop         | yes? process next channel
-    asl.l       %d7, %d2                | wrap phase to start of next frame
+    bgt.b       10b | channel loop      | yes? process next channel
+    lsl.l       %d7, %d2                | wrap phase to start of next frame
     sub.l       %d2, %d5                | data->resample_data.phase =
     move.l      %d5, 12(%a0)            | ... phase - (count << 16)
     move.l      %a4, %d0                | return d - d[0]
@@ -136,14 +185,14 @@ dsp_downsample:
     movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
     lea.l       40(%sp), %sp            | cleanup stack
     rts                                 | buh-bye
-.dsend:
-    .size       dsp_downsample,.dsend-dsp_downsample
+    .size       dsp_downsample,.-dsp_downsample
 
 /****************************************************************************
  * int dsp_upsample(int count, struct dsp_data *dsp,
- *                  in32_t *src[], int32_t *dst[])
+ *                  int32_t *src[], int32_t *dst[])
  */
     .section    .text
+	.align      2
     .global     dsp_upsample
 dsp_upsample:
     lea.l       -40(%sp), %sp           | save non-clobberables
@@ -154,47 +203,55 @@ dsp_upsample:
                                         | %a2 = dst
     movem.l      4(%a0), %d3-%d4        | %d3 = ch = channels
                                         | %d4 = delta = data->resample_data.delta
-    swap        %d4                     | swap delta to high word to use
-                                        | carries to increment position
-.uschannel_loop:
+    swap        %d4                     | swap delta to high word to use...
+                                        | ...carries to increment position
+10: | channel loop                      |
     move.l      12(%a0), %d5            | %d5 = phase = data->resample_data.phase
     move.l      -4(%a1, %d3.l*4), %a3   | %a3 = s = src[ch-1]
     lea.l       12(%a0, %d3.l*4), %a4   | %a4 = &data->resample_data.last_sample[ch-1]
-    lea.l       (%a3, %d2.l*4), %a5     | %a5 = src_end = &src[count]
+    lea.l       -4(%a3, %d2.l*4), %a5   | %a5 = src_end = &src[count-1]
     move.l      (%a4), %d0              | %d0 = last = data->resample_data.last_sample[ch-1]
-    move.l      -(%a5), (%a4)           | data->resample_data.last_sample[ch-1] = s[count-1]
+    move.l      (%a5), (%a4)            | data->resample_data.last_sample[ch-1] = s[count-1]
     move.l      -4(%a2, %d3.l*4), %a4   | %a4 = d = dst[ch-1]
+    move.l      (%a3)+, %d1             | fetch first sample - might throw this...
+                                        | ...away later but we'll be preincremented
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[0] - last
     swap        %d5                     | swap phase to high word to use
                                         | carries to increment position
-    move.l      %d5, %d6                | %d6 = pos = phase >> 16
+    move.l      %d5, %d7                | %d7 = pos = phase >> 16
     clr.w       %d5                     |
-    eor.l       %d5, %d6                | pos == 0?
-    beq.b       .usstart_0              | no? transistion from down
-    cmp.l       %d2, %d6                | past end of samples?
-    bge.b       .usloop_skip            | yes? skip loop
-    lea.l       -4(%a3, %d6.l*4), %a3   | %a3 = s = &s[pos-1] (previous)
-    move.l      (%a3)+, %d0             | %d0 = *s++
-    .word       0x51fa                  | tpf.w - trap next instruction
-.usloop_1:
+    eor.l       %d5, %d7                | pos == 0?
+    beq.b       40f | loop start        | yes? start loop
+    cmp.l       %d2, %d7                | past end of samples?
+    bge.b       50f | skip resample loop| yes? go to next channel and collect info
+    lea.l       (%a3, %d7.l*4), %a3     | %a3 = s = &s[pos+1]
+	movem.l     -8(%a3), %d0-%d1        | %d0 = s[pos-1], %d1 = s[pos]
+    move.l      %d1, %d6                | save sample value
+    sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
+	bra.b       40f | loop start        |
+20: | next sample loop                  |
     move.l      %d6, %d0                | move previous sample to %d0
-.usstart_0:
     move.l      (%a3)+, %d1             | fetch next sample
     move.l      %d1, %d6                | save sample value
     sub.l       %d0, %d1                | %d1 = diff = s[pos] - s[pos-1]
-.usloop_0:
+30: | same sample loop                  |
+    movclr.l    %acc0, %d7              | %d7 = result
+    move.l      %d7, (%a4)+             | *d++ = %d7
+40: | loop start                        |
     lsr.l       #1, %d5                 | make phase into frac
+    move.l      %d0, %acc0              | %acc0 = s[pos-1]
     mac.l       %d1, %d5, %acc0         | %acc0 = diff * frac
     lsl.l       #1, %d5                 | restore frac to phase
-    movclr.l    %acc0, %d7              | %d7 = product
-    add.l       %d0, %d7                | %d7 = last + product
-    move.l      %d7, (%a4)+             | *d++ = %d7
     add.l       %d4, %d5                | phase += delta
-    bcc.b       .usloop_0               | load next values?
+    bcc.b       30b | same sample loop  | load next values?
     cmp.l       %a5, %a3                | src <= src_end?
-    ble.b       .usloop_1               | yes? continue resampling
-.usloop_skip:
+    bls.b       20b | next sample loop  | yes? continue resampling
+    movclr.l    %acc0, %d7              | %d7 = result
+    move.l      %d7, (%a4)+             | *d++ = %d7
+50: | skip resample loop                |
     subq.l      #1, %d3                 | ch > 0?
-    bgt.b       .uschannel_loop         | yes? process next channel
+    bgt.b       10b | channel loop      | yes? process next channel
     swap        %d5                     | wrap phase to start of next frame
     move.l      %d5, 12(%a0)            | ...and save in data->resample_data.phase
     move.l      %a4, %d0                | return d - d[0]
@@ -203,12 +260,7 @@ dsp_upsample:
     asr.l       #2, %d0                 | convert bytes->samples
     lea.l       40(%sp), %sp            | cleanup stack
     rts                                 | buh-bye
-.usend:
-    .size       dsp_upsample,.usend-dsp_upsample
-
-/* These routines might benefit from burst transfers but we'll keep them
- * small for now since they're rather light weight
- */
+    .size       dsp_upsample,.-dsp_upsample
 
 /****************************************************************************
  * void channels_process_sound_chan_mono(int count, int32_t *buf[])
@@ -216,31 +268,39 @@ dsp_upsample:
  * Mix left and right channels 50/50 into a center channel.
  */
     .section    .text
+	.align      2
     .global     channels_process_sound_chan_mono
 channels_process_sound_chan_mono:
     movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
-    lea.l       -12(%sp), %sp           | save registers
-    move.l      %macsr, %d1             |
-    movem.l     %d1-%d3, (%sp)          |
-    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  | NOTE(review): %macsr no longer set here - presumably preset by caller, confirm
     movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
     move.l      #0x40000000, %d3        | %d3 = 0.5
-1:
-    move.l     (%a0), %d1               | L = R = l/2 + r/2
-    mac.l      %d1, %d3, (%a1), %d2, %acc0 |
-    mac.l      %d2, %d3, %acc0          |
-    movclr.l   %acc0, %d1               |
-    move.l     %d1, (%a0)+              | output to original buffer
-    move.l     %d1, (%a1)+              |
-    subq.l     #1, %d0                  |
-    bgt.s      1b                       |
-    movem.l    (%sp), %d1-%d3           | restore registers
-    move.l     %d1, %macsr              |
-    lea.l      12(%sp), %sp             | cleanup
-    rts
-.cpmono_end:
-    .size       channels_process_sound_chan_mono, .cpmono_end-channels_process_sound_chan_mono
-
+    move.l      (%a0)+, %d1             | prime the input registers
+    move.l      (%a1)+, %d2             | ...%d1 = l[0], %d2 = r[0]
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | %acc0  = l/2, load next l
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc0 | %acc0 += r/2, load next r
+    subq.l      #1, %d0                 | --count > 0 ?
+    ble.s       20f | loop done         | no? only the pending sample is left
+10: | loop                              |
+    movclr.l    %acc0, %d4              | L = R = l/2 + r/2
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | start next sample: %acc0  = l/2, load next l
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc0 | ...%acc0 += r/2, load next r
+    move.l      %d4, (%a2)+             | output to original buffer
+    move.l      %d4, (%a3)+             | ...same value to both channels
+    subq.l      #1, %d0                 | --count > 0 ?
+    bgt.s       10b | loop              | yes? do more samples
+20: | loop done                         |
+    movclr.l    %acc0, %d4              | output last sample
+    move.l      %d4, (%a2)              | ...to the first channel buffer
+    move.l      %d4, (%a3)              | ...and to the second
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_mono, \
+                .-channels_process_sound_chan_mono
 
 /****************************************************************************
  * void channels_process_sound_chan_custom(int count, int32_t *buf[])
@@ -248,34 +308,47 @@ channels_process_sound_chan_mono:
  * Apply stereo width (narrowing/expanding) effect.
  */
     .section    .text
+	.align      2
     .global     channels_process_sound_chan_custom
 channels_process_sound_chan_custom:
     movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
-    lea.l       -16(%sp), %sp           | save registers
-    move.l      %macsr, %d1             |
-    movem.l     %d1-%d4, (%sp)          |
-    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
+    lea.l       -28(%sp), %sp           | save registers
+    movem.l     %d2-%d6/%a2-%a3, (%sp)  | ...7 longwords = the 28 bytes reserved
     movem.l     (%a0), %a0-%a1          | get channel pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
     move.l      dsp_sw_gain, %d3        | load straight (mid) gain
     move.l      dsp_sw_cross, %d4       | load cross (side) gain
-1:
-    move.l      (%a0), %d1              |
-    mac.l       %d1, %d3, (%a1), %d2, %acc0 |  L = l*gain + r*cross
-    mac.l       %d1, %d4            , %acc1 |  R = r*gain + l*cross
-    mac.l       %d2, %d4            , %acc0 |
-    mac.l       %d2, %d3            , %acc1 |
-    movclr.l    %acc0, %d1              |
-    movclr.l    %acc1, %d2              |
-    move.l      %d1, (%a0)+             |
-    move.l      %d2, (%a1)+             |
+    move.l      (%a0)+, %d1             | prime the input registers
+    move.l      (%a1)+, %d2             | ...%d1 = l[0], %d2 = r[0]
+    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d4, (%a0)+, %d1, %acc1 |  R = r*gain + l*cross
+    mac.l       %d2, %d4             , %acc0 |  ...%acc0 += r*cross
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc1 |  ...%acc1 += r*gain, load next r
     subq.l      #1, %d0                 |
-    bgt.s       1b                      |
-    movem.l     (%sp), %d1-%d4          | restore registers
-    move.l      %d1, %macsr             |
-    lea.l       16(%sp), %sp            | cleanup
-    rts
-.cpcustom_end:
-    .size       channels_process_sound_chan_custom, .cpcustom_end-channels_process_sound_chan_custom
+    ble.b       20f | loop done         | count exhausted? only the pending sample is left
+10: | loop                              |
+    movclr.l    %acc0, %d5              | %d5 = finished L
+    movclr.l    %acc1, %d6              | %d6 = finished R
+15: | loop start                        | (label not referenced in this function)
+    mac.l       %d1, %d3             , %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d4, (%a0)+, %d1, %acc1 |  R = r*gain + l*cross
+    mac.l       %d2, %d4             , %acc0 |  ...%acc0 += r*cross
+    mac.l       %d2, %d3, (%a1)+, %d2, %acc1 |  ...%acc1 += r*gain, load next r
+    move.l      %d5, (%a2)+             | store L one sample behind the reads
+    move.l      %d6, (%a3)+             | store R likewise
+    subq.l      #1, %d0                 | --count > 0 ?
+    bgt.s       10b | loop              | yes? do more samples
+20: | loop done                         |
+    movclr.l    %acc0, %d5              | output last sample
+    movclr.l    %acc1, %d6              |
+    move.l      %d5, (%a2)              | ...last L
+    move.l      %d6, (%a3)              | ...last R
+    movem.l     (%sp), %d2-%d6/%a2-%a3  | restore registers
+    lea.l       28(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_custom, \
+                .-channels_process_sound_chan_custom
 
 /****************************************************************************
  *  void channels_process_sound_chan_karaoke(int count, int32_t *buf[])
@@ -283,31 +356,42 @@ channels_process_sound_chan_custom:
  *  Separate channels into side channels.
  */
     .section    .text
+	.align      2
     .global     channels_process_sound_chan_karaoke
 channels_process_sound_chan_karaoke:
     movem.l     4(%sp), %d0/%a0         | %d0 = count, %a0 = buf
-    lea.l       -16(%sp), %sp           | save registers
-    move.l      %macsr, %d1             |
-    movem.l     %d1-%d4, (%sp)          |
-    move.l      #0xb0, %macsr           | put emac in rounding fractional mode
-    movem.l     (%a0), %a0-%a1          | get channel pointers
-    move.l      #0x40000000, %d4        | %d3 = 0.5
-1:
-    move.l     (%a0), %d1               |
-    msac.l     %d1, %d4, (%a1), %d2, %acc0 | R = r/2 - l/2
-    mac.l      %d2, %d4            , %acc0 |
-    movclr.l   %acc0, %d1               |
-    move.l     %d1, (%a1)+              |
-    neg.l      %d1                      | L = -R = -(r/2 - l/2) = l/2 - r/2
-    move.l     %d1, (%a0)+              |
-    subq.l     #1, %d0                  |
-    bgt.s      1b                       |
-    movem.l    (%sp), %d1-%d4           | restore registers
-    move.l     %d1, %macsr              |
-    lea.l      16(%sp), %sp             | cleanup
-    rts
-.cpkaraoke_end:
-    .size       channels_process_sound_chan_karaoke, .cpkaraoke_end-channels_process_sound_chan_karaoke
+    lea.l       -20(%sp), %sp           | save registers
+    movem.l     %d2-%d4/%a2-%a3, (%sp)  | ...5 longwords = the 20 bytes reserved
+    movem.l     (%a0), %a0-%a1          | get channel src pointers
+    move.l      %a0, %a2                | use separate dst pointers since read
+    move.l      %a1, %a3                | pointers run one ahead of write
+    move.l      #0x40000000, %d3        | %d3 = 0.5
+    move.l      (%a0)+, %d1             | prime the input registers
+    move.l      (%a1)+, %d2             | ...%d1 = l[0], %d2 = r[0]
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2
+    msac.l      %d2, %d3, (%a1)+, %d2, %acc0 | ...%acc0 -= r/2, load next r
+    subq.l      #1, %d0                 | --count > 0 ?
+    ble.b       20f | loop done         | no? only the pending sample is left
+10: | loop                              |
+    movclr.l    %acc0, %d4              | %d4 = finished L
+    mac.l       %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2
+    msac.l      %d2, %d3, (%a1)+, %d2, %acc0 | ...%acc0 -= r/2, load next r
+    move.l      %d4, (%a2)+             | store L one sample behind the reads
+    neg.l       %d4                     | R = -L = -(l/2 - r/2) = r/2 - l/2
+    move.l      %d4, (%a3)+             | store R
+    subq.l      #1, %d0                 | --count > 0 ?
+    bgt.s       10b | loop              | yes? do more samples
+20: | loop done                         |
+    movclr.l    %acc0, %d4              | output last sample
+    move.l      %d4, (%a2)              | ...last L
+    neg.l       %d4                     | R = -L = -(l/2 - r/2) = r/2 - l/2
+    move.l      %d4, (%a3)              | ...last R
+    movem.l     (%sp), %d2-%d4/%a2-%a3  | restore registers
+    lea.l       20(%sp), %sp            | cleanup
+    rts                                 |
+    .size       channels_process_sound_chan_karaoke, \
+                .-channels_process_sound_chan_karaoke
+
 /****************************************************************************
  * void sample_output_stereo(int count, struct dsp_data *data,
  *                               int32_t *src[], int16_t *dst)
@@ -329,6 +413,7 @@ channels_process_sound_chan_karaoke:
  *
  */
     .section   .text
+	.align      2
     .global    sample_output_stereo
 sample_output_stereo:
     lea.l       -44(%sp), %sp             | save registers
@@ -348,11 +433,11 @@ sample_output_stereo:
     add.l       %a4, %d0                  |
     and.l       #0xfffffff0, %d0          |
     cmp.l       %a0, %d0                  | at least a full line?
-    bhi.w       .sos_longloop_1_start     | no? jump to trailing longword
+    bhi.w       40f | long loop 1 start   | no? do as trailing longwords
     sub.l       #16, %d0                  | %d1 = first line bound
     cmp.l       %a4, %d0                  | any leading longwords?
-    bls.b       .sos_lineloop_start       | no? jump to line loop
-.sos_longloop_0:
+    bls.b       20f | line loop start     | no? start line loop
+10: | long loop 0                         |
     move.l      (%a2)+, %d1               | read longword from L and R
     mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word
     mac.l       %d2, %a1, %acc1           | shift R to high word
@@ -362,10 +447,10 @@ sample_output_stereo:
     move.w      %d2, %d1                  | interleave MS 16 bits of each 
     move.l      %d1, (%a4)+               | ...and write both
     cmp.l       %a4, %d0                  |
-    bhi.b       .sos_longloop_0           |
-.sos_lineloop_start:
+    bhi.b       10b | long loop 0         |
+20: | line loop start                     |
     lea.l       -12(%a0), %a5             | %a5 = at or just before last line bound
-.sos_lineloop:
+30: | line loop                           |
     move.l      (%a3)+, %d4               | get next 4 R samples and scale
     mac.l       %d4, %a1, (%a3)+, %d5, %acc0 | with saturation
     mac.l       %d5, %a1, (%a3)+, %d6, %acc1 |
@@ -394,11 +479,11 @@ sample_output_stereo:
     move.w      %d7, %d3                  |
     movem.l     %d0-%d3, -16(%a4)         | write four stereo samples
     cmp.l       %a4, %a5                  |
-    bhi.b       .sos_lineloop             |
-.sos_longloop_1_start:
+    bhi.b       30b | line loop           |
+40: | long loop 1 start                   |
     cmp.l       %a4, %a0                  | any longwords left?
-    bls.b       .sos_done                 | no? finished.
-.sos_longloop_1:
+    bls.b       60f | output end          | no? stop
+50: | long loop 1                         |
     move.l      (%a2)+, %d1               | handle trailing longwords
     mac.l       %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones
     mac.l       %d2, %a1, %acc1           |
@@ -408,14 +493,13 @@ sample_output_stereo:
     move.w      %d2, %d1                  |
     move.l      %d1, (%a4)+               |
     cmp.l       %a4, %a0                  |
-    bhi.b       .sos_longloop_1           |
-.sos_done:
+    bhi.b       50b                       | long loop 1
+60: | output end                          |
     movem.l     (%sp), %d1-%d7/%a2-%a5    | restore registers
     move.l      %d1, %macsr               |
     lea.l       44(%sp), %sp              | cleanup
     rts                                   |
-.sos_end:
-    .size      sample_output_stereo, .sos_end-sample_output_stereo
+    .size      sample_output_stereo, .-sample_output_stereo
 
 /****************************************************************************
  * void sample_output_mono(int count, struct dsp_data *data,
@@ -424,6 +508,7 @@ sample_output_stereo:
  * Same treatment as sample_output_stereo but for one channel.
  */
     .section   .text
+	.align      2
     .global    sample_output_mono
 sample_output_mono:
     lea.l       -28(%sp), %sp             | save registers
@@ -442,11 +527,11 @@ sample_output_mono:
     add.l       %a3, %d0                  |
     and.l       #0xfffffff0, %d0          |
     cmp.l       %a0, %d0                  | at least a full line?
-    bhi.w       .som_longloop_1_start     | no? jump to trailing longword
+    bhi.w       40f | long loop 1 start   | no? do as trailing longwords
     sub.l       #16, %d0                  | %d1 = first line bound
     cmp.l       %a3, %d0                  | any leading longwords?
-    bls.b       .som_lineloop_start       | no? jump to line loop
-.som_longloop_0:
+    bls.b       20f | line loop start     | no? start line loop
+10: | long loop 0                         |
     move.l      (%a2)+, %d1               | read longword from L and R
     mac.l       %d1, %d5, %acc0           | shift L to high word
     movclr.l    %acc0, %d1                | get possibly saturated results
@@ -455,10 +540,10 @@ sample_output_mono:
     move.w      %d2, %d1                  | duplicate single channel into
     move.l      %d1, (%a3)+               | L and R
     cmp.l       %a3, %d0                  |
-    bhi.b       .som_longloop_0           |
-.som_lineloop_start:
+    bhi.b       10b | long loop 0         |
+20: | line loop start                     |
     lea.l       -12(%a0), %a1             | %a1 = at or just before last line bound
-.som_lineloop:
+30: | line loop                           |
     move.l      (%a2)+, %d0               | get next 4 L samples and scale
     mac.l       %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
     mac.l       %d1, %d5, (%a2)+, %d2, %acc1 |
@@ -483,11 +568,11 @@ sample_output_mono:
     move.w      %d4, %d3                  |
     movem.l     %d0-%d3, -16(%a3)         | write four stereo samples
     cmp.l       %a3, %a1                  |
-    bhi.b       .som_lineloop             |
-.som_longloop_1_start:
+    bhi.b       30b | line loop           |
+40: | long loop 1 start                   |
     cmp.l       %a3, %a0                  | any longwords left?
-    bls.b       .som_done                 | no? finished.
-.som_longloop_1:
+    bls.b       60f | output end          | no? stop
+50: | long loop 1                         |
     move.l      (%a2)+, %d1               | handle trailing longwords
     mac.l       %d1, %d5, %acc0           | the same way as leading ones
     movclr.l    %acc0, %d1                |
@@ -496,11 +581,10 @@ sample_output_mono:
     move.w      %d2, %d1                  |
     move.l      %d1, (%a3)+               |
     cmp.l       %a3, %a0                  |
-    bhi.b       .som_longloop_1           |
-.som_done:
+    bhi.b       50b | long loop 1         |
+60: | output end                          |
     movem.l     (%sp), %d1-%d5/%a2-%a3    | restore registers
     move.l      %d1, %macsr               |
     lea.l       28(%sp), %sp              | cleanup
     rts                                   |
-.som_end:
-    .size      sample_output_mono, .som_end-sample_output_mono
+    .size      sample_output_mono, .-sample_output_mono
author	Michael Sevakis <jethead71@rockbox.org>	2007-03-25 04:03:44 +0000
committer	Michael Sevakis <jethead71@rockbox.org>	2007-03-25 04:03:44 +0000
commit	369c2a37b7176e4f9c44f00a31b3b74e62b0b5d7 (patch)
tree	7620c7da1d611d0d9a339487b6b264e44c6201bd /apps/dsp_cf.S
parent	cd630c9e0a2e0aa259a6e53a5af1369f36984b1c (diff)
download	rockbox-369c2a37b7176e4f9c44f00a31b3b74e62b0b5d7.zip rockbox-369c2a37b7176e4f9c44f00a31b3b74e62b0b5d7.tar.gz rockbox-369c2a37b7176e4f9c44f00a31b3b74e62b0b5d7.tar.bz2 rockbox-369c2a37b7176e4f9c44f00a31b3b74e62b0b5d7.tar.xz