diff options
Diffstat (limited to 'apps/dsp_cf.S')
| -rw-r--r-- | apps/dsp_cf.S | 380 |
1 files changed, 341 insertions, 39 deletions
diff --git a/apps/dsp_cf.S b/apps/dsp_cf.S index 295ef05..1f8dd48 100644 --- a/apps/dsp_cf.S +++ b/apps/dsp_cf.S @@ -18,7 +18,7 @@ ****************************************************************************/ /**************************************************************************** - * apply_crossfeed(int32_t* src[], int count) + * void apply_crossfeed(int32_t *src[], int count) */ .section .text .global apply_crossfeed @@ -88,32 +88,31 @@ apply_crossfeed: .size apply_crossfeed,.cfend-apply_crossfeed /**************************************************************************** - * dsp_downsample(int channels, int count, struct resample_data *r, - * in32_t **src, int32_t **dst) + * int dsp_downsample(int count, struct dsp_data *data, + * in32_t *src[], int32_t *dst[]) */ .section .text .global dsp_downsample dsp_downsample: lea.l -40(%sp), %sp | save non-clobberables movem.l %d2-%d7/%a2-%a5, (%sp) | - movem.l 44(%sp), %d2-%d3/%a0-%a2| %d2 = ch = channels - | %d3 = count - | %a0 = r + movem.l 44(%sp), %d2/%a0-%a2 | %d2 = count + | %a0 = data | %a1 = src | %a2 = dst - move.l 4(%a0), %d4 | %d4 = delta = r->delta - move.l #16, %d7 | %d7 = shift + movem.l 4(%a0), %d3-%d4 | %d3 = ch = data->num_channels + | %d4 = delta = data->resample_data.delta + moveq.l #16, %d7 | %d7 = shift .dschannel_loop: - move.l (%a0), %d5 | %d5 = phase = r->phase - move.l -4(%a1, %d2.l*4), %a3 | %a3 = s = src[ch-1] - move.l -4(%a2, %d2.l*4), %a4 | %a4 = d = dst[ch-1] - lea.l 4(%a0, %d2.l*4), %a5 | %a5 = &r->last_sample[ch-1] - move.l (%a5), %d0 | %d0 = last = r->last_sample[ch-1] - move.l -4(%a3, %d3.l*4), %d1 | r->last_sample[ch-1] = s[count-1] - move.l %d1, (%a5) | + move.l 12(%a0), %d5 | %d5 = phase = data->resample_data.phase + move.l -4(%a1, %d3.l*4), %a3 | %a3 = s = src[ch-1] + move.l -4(%a2, %d3.l*4), %a4 | %a4 = d = dst[ch-1] + lea.l 12(%a0, %d3.l*4), %a5 | %a5 = &data->resample_data.ast_sample[ch-1] + move.l (%a5), %d0 | %d0 = last = data->resample_data.last_sample[ch-1] + move.l 
-4(%a3, %d2.l*4), (%a5) | data->resample_data.last_sample[ch-1] = s[count-1] move.l %d5, %d6 | %d6 = pos = phase >> 16 lsr.l %d7, %d6 | - cmp.l %d3, %d6 | past end of samples? + cmp.l %d2, %d6 | past end of samples? bge.b .dsloop_skip | yes? skip loop tst.l %d6 | need last sample of prev. frame? bne.b .dsloop | no? start main loop @@ -134,14 +133,14 @@ dsp_downsample: move.l %d5, %d6 | pos = phase >> 16 lsr.l %d7, %d6 | move.l %d0, (%a4)+ | *d++ = %d0 - cmp.l %d3, %d6 | pos < count? + cmp.l %d2, %d6 | pos < count? blt.b .dsloop | yes? continue resampling .dsloop_skip: - subq.l #1, %d2 | ch > 0? + subq.l #1, %d3 | ch > 0? bgt.b .dschannel_loop | yes? process next channel - asl.l %d7, %d3 | wrap phase to start of next frame - sub.l %d3, %d5 | r->phase = phase - (count << 16) - move.l %d5, (%a0) | + asl.l %d7, %d2 | wrap phase to start of next frame + sub.l %d2, %d5 | data->resample_data.phase = + move.l %d5, 12(%a0) | ... phase - (count << 16) move.l %a4, %d0 | return d - d[0] sub.l (%a2), %d0 | asr.l #2, %d0 | convert bytes->samples @@ -153,31 +152,30 @@ dsp_downsample: .size dsp_downsample,.dsend-dsp_downsample /**************************************************************************** - * dsp_upsample(int channels, int count, struct resample_data *r, - * in32_t **src, int32_t **dst) + * int dsp_upsample(int count, struct dsp_data *dsp, + * in32_t *src[], int32_t *dst[]) */ .section .text .global dsp_upsample dsp_upsample: lea.l -40(%sp), %sp | save non-clobberables movem.l %d2-%d7/%a2-%a5, (%sp) | - movem.l 44(%sp), %d2-%d3/%a0-%a2| %d2 = ch = channels - | %d3 = count - | %a0 = r + movem.l 44(%sp), %d2/%a0-%a2 | %d2 = count + | %a0 = data | %a1 = src | %a2 = dst - move.l 4(%a0), %d4 | %d4 = delta = r->delta + movem.l 4(%a0), %d3-%d4 | %d3 = ch = channels + | %d4 = delta = data->resample_data.delta swap %d4 | swap delta to high word to use | carries to increment position .uschannel_loop: - move.l (%a0), %d5 | %d5 = phase = r->phase - move.l -4(%a1, %d2.l*4), %a3 
| %a3 = s = src[ch-1] - lea.l 4(%a0, %d2.l*4), %a4 | %a4 = &r->last_sample[ch-1] - lea.l (%a3, %d3.l*4), %a5 | %a5 = src_end = &src[count] - move.l (%a4), %d0 | %d0 = last = r->last_sample[ch-1] - move.l -4(%a5), %d1 | r->last_sample[ch-1] = s[count-1] - move.l %d1, (%a4) | - move.l -4(%a2, %d2.l*4), %a4 | %a4 = d = dst[ch-1] + move.l 12(%a0), %d5 | %d5 = phase = data->resample_data.phase + move.l -4(%a1, %d3.l*4), %a3 | %a3 = s = src[ch-1] + lea.l 12(%a0, %d3.l*4), %a4 | %a4 = &data->resample_data.last_sample[ch-1] + lea.l (%a3, %d2.l*4), %a5 | %a5 = src_end = &src[count] + move.l (%a4), %d0 | %d0 = last = data->resample_data.last_sample[ch-1] + move.l -(%a5), (%a4) | data->resample_data.last_sample[ch-1] = s[count-1] + move.l -4(%a2, %d3.l*4), %a4 | %a4 = d = dst[ch-1] swap %d5 | swap phase to high word to use | carries to increment position move.l %d5, %d6 | %d6 = pos = phase >> 16 @@ -204,13 +202,13 @@ dsp_upsample: move.l %d7, (%a4)+ | *d++ = %d7 add.l %d4, %d5 | phase += delta bcc.b .usloop_0 | load next values? - cmp.l %a5, %a3 | src < src_end? - blt.b .usloop_1 | yes? continue resampling + cmp.l %a5, %a3 | src <= src_end? + ble.b .usloop_1 | yes? continue resampling .usloop_skip: - subq.l #1, %d2 | ch > 0? + subq.l #1, %d3 | ch > 0? bgt.b .uschannel_loop | yes? 
process next channel swap %d5 | wrap phase to start of next frame - move.l %d5, (%a0) | ...and save in r->phase + move.l %d5, 12(%a0) | ...and save in data->resample_data.phase move.l %a4, %d0 | return d - d[0] sub.l (%a2), %d0 | movem.l (%sp), %d2-%d7/%a2-%a5 | restore non-clobberables @@ -219,3 +217,307 @@ dsp_upsample: rts | buh-bye .usend: .size dsp_upsample,.usend-dsp_upsample + +/* These routines might benefit from burst transfers but we'll keep them + * small for now since they're rather lightweight + */ + +/**************************************************************************** + * void channels_process_sound_chan_mono(int count, int32_t *buf[]) + * + * Mix left and right channels 50/50 into a center channel. + */ + .section .text + .global channels_process_sound_chan_mono +channels_process_sound_chan_mono: + movem.l 4(%sp), %d0/%a0 | %d0 = count, %a0 = buf + lea.l -12(%sp), %sp | save registers + move.l %macsr, %d1 | %d1 = caller's macsr + movem.l %d1-%d3, (%sp) | save macsr, %d2, %d3 + move.l #0xb0, %macsr | put emac in rounding fractional mode + movem.l (%a0), %a0-%a1 | get channel pointers + move.l #0x40000000, %d3 | %d3 = 0.5 +1: + move.l (%a0), %d1 | L = R = l/2 + r/2 + mac.l %d1, %d3, (%a1), %d2, %acc0 | + mac.l %d2, %d3, %acc0 | + movclr.l %acc0, %d1 | + move.l %d1, (%a0)+ | output to original buffer + move.l %d1, (%a1)+ | + subq.l #1, %d0 | + bgt.s 1b | + movem.l (%sp), %d1-%d3 | restore registers + move.l %d1, %macsr | + lea.l 12(%sp), %sp | cleanup + rts +.cpmono_end: + .size channels_process_sound_chan_mono, .cpmono_end-channels_process_sound_chan_mono + + +/**************************************************************************** + * void channels_process_sound_chan_custom(int count, int32_t *buf[]) + * + * Apply stereo width (narrowing/expanding) effect. 
+ */ + .section .text + .global channels_process_sound_chan_custom +channels_process_sound_chan_custom: + movem.l 4(%sp), %d0/%a0 | %d0 = count, %a0 = buf + lea.l -16(%sp), %sp | save registers + move.l %macsr, %d1 | %d1 = caller's macsr + movem.l %d1-%d4, (%sp) | save macsr, %d2-%d4 + move.l #0xb0, %macsr | put emac in rounding fractional mode + movem.l (%a0), %a0-%a1 | get channel pointers + move.l dsp_sw_gain, %d3 | load straight (mid) gain + move.l dsp_sw_cross, %d4 | load cross (side) gain +1: + move.l (%a0), %d1 | + mac.l %d1, %d3 , (%a1), %d2, %acc0 | L = l*gain + r*cross + mac.l %d1, %d4 , %acc1 | R = r*gain + l*cross + mac.l %d2, %d4 , %acc0 | + mac.l %d2, %d3 , %acc1 | + movclr.l %acc0, %d1 | + movclr.l %acc1, %d2 | + move.l %d1, (%a0)+ | + move.l %d2, (%a1)+ | + subq.l #1, %d0 | + bgt.s 1b | + movem.l (%sp), %d1-%d4 | restore registers + move.l %d1, %macsr | + lea.l 16(%sp), %sp | cleanup + rts +.cpcustom_end: + .size channels_process_sound_chan_custom, .cpcustom_end-channels_process_sound_chan_custom + +/**************************************************************************** + * void channels_process_sound_chan_karaoke(int count, int32_t *buf[]) + * + * Separate channels into side channels. 
+ */ + .section .text + .global channels_process_sound_chan_karaoke +channels_process_sound_chan_karaoke: + movem.l 4(%sp), %d0/%a0 | %d0 = count, %a0 = buf + lea.l -16(%sp), %sp | save registers + move.l %macsr, %d1 | %d1 = caller's macsr + movem.l %d1-%d4, (%sp) | save macsr, %d2-%d4 + move.l #0xb0, %macsr | put emac in rounding fractional mode + movem.l (%a0), %a0-%a1 | get channel pointers + move.l #0x40000000, %d4 | %d4 = 0.5 +1: + move.l (%a0), %d1 | + mac.l %d1, %d4, (%a1), %d2, %acc0 | L = l/2 - r/2 + mac.l %d2, %d4, %acc1 | R = r/2 - l/2 + movclr.l %acc0, %d1 | %d1 = l/2 + movclr.l %acc1, %d2 | %d2 = r/2 + move.l %d1, %d3 | %d3 = l/2 + sub.l %d2, %d1 | %d1 = l/2 - r/2 + sub.l %d3, %d2 | %d2 = r/2 - l/2 + move.l %d1, (%a0)+ | + move.l %d2, (%a1)+ | + subq.l #1, %d0 | + bgt.s 1b | + movem.l (%sp), %d1-%d4 | restore registers + move.l %d1, %macsr | + lea.l 16(%sp), %sp | cleanup + rts +.cpkaraoke_end: + .size channels_process_sound_chan_karaoke, .cpkaraoke_end-channels_process_sound_chan_karaoke + +/**************************************************************************** + * void sample_output_stereo(int count, struct dsp_data *data, + * int32_t *src[], int16_t *dst) + * + * Framework based on the ubiquitous Rockbox line transfer logic for + * Coldfire CPUs. + * + * Does emac clamping and scaling (which proved faster than the usual + * checks and branches - even single test clamping) and writes using + * line burst transfers. Also better than writing a single L-R pair per + * loop but a good deal more code. + * + * Attempting bursting during reads is rather futile since the source and + * destination alignments rarely agree and too much complication will + * slow us up. The parallel loads seem to do a bit better at least until + * a pcm buffer can always give line aligned chunks and then aligning the + * dest can then imply the source is aligned if the source buffers are. + * For now longword alignment is assumed of both the source and dest. 
+ *
+ */
+    .section .text
+    .global sample_output_stereo
+sample_output_stereo:
+    lea.l    -44(%sp), %sp              | save registers: all of them up front,
+    move.l   %macsr, %d1                | since the line loop needs them and
+    movem.l  %d1-%d7/%a2-%a5, (%sp)     | will be the far more common path
+    move.l   #0x80, %macsr              | put emac unit in signed int mode
+    movem.l  48(%sp), %a0-%a2/%a4       | %a0 = count, %a1 = data, %a2 = src, %a4 = dst
+    lea.l    (%a4, %a0.l*4), %a0        | %a0 = end address = dst + count*4
+    move.l   (%a1), %d1                 | %d1 = scale (first longword of *data)
+    sub.l    #16, %d1                   | %d1 = scale - 16
+    neg.l    %d1                        | %d1 = 16 - scale
+    moveq.l  #1, %d0                    |
+    asl.l    %d1, %d0                   | %d0 = 1 << (16 - scale)
+    move.l   %d0, %a1                   | %a1 = multiplier
+    movem.l  (%a2), %a2-%a3             | get L/R channel pointers
+    moveq.l  #28, %d0                   | %d0 = second line bound
+    add.l    %a4, %d0                   |     = (dst + 28) & ~15
+    and.l    #0xfffffff0, %d0           |
+    cmp.l    %a0, %d0                   | second line bound past the end address?
+    bhi.w    .sos_longloop_1_start      | yes? no full line: longword writes only
+    sub.l    #16, %d0                   | %d0 = first line bound
+    cmp.l    %a4, %d0                   | any leading longwords?
+    bls.b    .sos_lineloop_start        | no? jump to line loop
+.sos_longloop_0:
+    move.l   (%a2)+, %d1                | read longword from L and R
+    mac.l    %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word
+    mac.l    %d2, %a1, %acc1            | shift R to high word
+    movclr.l %acc0, %d1                 | get possibly saturated results
+    movclr.l %acc1, %d2                 |
+    swap     %d2                        | move R to low word
+    move.w   %d2, %d1                   | interleave MS 16 bits of each
+    move.l   %d1, (%a4)+                | ...and write both
+    cmp.l    %a4, %d0                   | reached first line bound?
+    bhi.b    .sos_longloop_0            | no? next leading longword
+.sos_lineloop_start:
+    lea.l    -12(%a0), %a5              | %a5 = at or just before last line bound
+.sos_lineloop:
+    move.l   (%a2)+, %d0                | get next 4 L samples and scale
+    mac.l    %d0, %a1, (%a2)+, %d1, %acc0 | with saturation
+    mac.l    %d1, %a1, (%a2)+, %d2, %acc1 |
+    mac.l    %d2, %a1, (%a2)+, %d3, %acc2 |
+    mac.l    %d3, %a1, %acc3            |
+    movclr.l %acc0, %d0                 | obtain results
+    movclr.l %acc1, %d1                 |
+    movclr.l %acc2, %d2                 |
+    movclr.l %acc3, %d3                 |
+    move.l   (%a3)+, %d4                | get next 4 R samples and scale
+    mac.l    %d4, %a1, (%a3)+, %d5, %acc0 | with saturation
+    mac.l    %d5, %a1, (%a3)+, %d6, %acc1 |
+    mac.l    %d6, %a1, (%a3)+, %d7, %acc2 |
+    mac.l    %d7, %a1, %acc3            |
+    movclr.l %acc0, %d4                 | obtain results
+    movclr.l %acc1, %d5                 |
+    movclr.l %acc2, %d6                 |
+    movclr.l %acc3, %d7                 |
+    swap     %d4                        | interleave most significant
+    move.w   %d4, %d0                   | 16 bits of L and R
+    swap     %d5                        |
+    move.w   %d5, %d1                   |
+    swap     %d6                        |
+    move.w   %d6, %d2                   |
+    swap     %d7                        |
+    move.w   %d7, %d3                   |
+    movem.l  %d0-%d3, (%a4)             | write four stereo samples
+    lea.l    16(%a4), %a4               | advance dst by one line
+    cmp.l    %a4, %a5                   | another full line before the end?
+    bhi.b    .sos_lineloop              | yes? keep writing lines
+.sos_longloop_1_start:
+    cmp.l    %a4, %a0                   | any longwords left?
+    bls.b    .sos_done                  | no? finished.
+.sos_longloop_1:
+    move.l   (%a2)+, %d1                | handle trailing longwords
+    mac.l    %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones
+    mac.l    %d2, %a1, %acc1            |
+    movclr.l %acc0, %d1                 |
+    movclr.l %acc1, %d2                 |
+    swap     %d2                        |
+    move.w   %d2, %d1                   |
+    move.l   %d1, (%a4)+                |
+    cmp.l    %a4, %a0                   | reached end address?
+    bhi.b    .sos_longloop_1            | no? next trailing longword
+.sos_done:
+    movem.l  (%sp), %d1-%d7/%a2-%a5     | restore registers
+    move.l   %d1, %macsr                | ...and the caller's macsr
+    lea.l    44(%sp), %sp               | cleanup
+    rts                                 |
+.sos_end:
+    .size    sample_output_stereo, .sos_end-sample_output_stereo
+
+/****************************************************************************
+ * void sample_output_mono(int count, struct dsp_data *data,
+ *                         int32_t *src[], int16_t *dst)
+ *
+ * Same treatment as sample_output_stereo but for one channel.
+ */
+    .section .text
+    .global sample_output_mono
+sample_output_mono:
+    lea.l    -28(%sp), %sp              | save registers: all of them up front,
+    move.l   %macsr, %d1                | since the line loop needs them and
+    movem.l  %d1-%d5/%a2-%a3, (%sp)     | will be the far more common path
+    move.l   #0x80, %macsr              | put emac unit in signed int mode
+    movem.l  32(%sp), %a0-%a3           | %a0 = count, %a1 = data, %a2 = src, %a3 = dst
+    lea.l    (%a3, %a0.l*4), %a0        | %a0 = end address = dst + count*4
+    move.l   (%a1), %d1                 | %d1 = scale (first longword of *data)
+    sub.l    #16, %d1                   | %d1 = scale - 16
+    neg.l    %d1                        | %d1 = 16 - scale
+    moveq.l  #1, %d5                    |
+    asl.l    %d1, %d5                   | %d5 = multiplier: (1 << (16 - scale))
+    movem.l  (%a2), %a2                 | get source channel pointer
+    moveq.l  #28, %d0                   | %d0 = second line bound
+    add.l    %a3, %d0                   |     = (dst + 28) & ~15
+    and.l    #0xfffffff0, %d0           |
+    cmp.l    %a0, %d0                   | second line bound past the end address?
+    bhi.w    .som_longloop_1_start      | yes? no full line: longword writes only
+    sub.l    #16, %d0                   | %d0 = first line bound
+    cmp.l    %a3, %d0                   | any leading longwords?
+    bls.b    .som_lineloop_start        | no? jump to line loop
+.som_longloop_0:
+    move.l   (%a2)+, %d1                | read longword from source channel
+    mac.l    %d1, %d5, %acc0            | shift sample to high word
+    movclr.l %acc0, %d1                 | get possibly saturated result
+    move.l   %d1, %d2                   |
+    swap     %d2                        | copy sample to low word
+    move.w   %d2, %d1                   | duplicate single channel into
+    move.l   %d1, (%a3)+                | L and R
+    cmp.l    %a3, %d0                   | reached first line bound?
+    bhi.b    .som_longloop_0            | no? next leading longword
+.som_lineloop_start:
+    lea.l    -12(%a0), %a1              | %a1 = at or just before last line bound
+.som_lineloop:
+    move.l   (%a2)+, %d0                | get next 4 samples and scale
+    mac.l    %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
+    mac.l    %d1, %d5, (%a2)+, %d2, %acc1 |
+    mac.l    %d2, %d5, (%a2)+, %d3, %acc2 |
+    mac.l    %d3, %d5, %acc3            |
+    movclr.l %acc0, %d0                 | obtain results
+    movclr.l %acc1, %d1                 |
+    movclr.l %acc2, %d2                 |
+    movclr.l %acc3, %d3                 |
+    move.l   %d0, %d4                   | duplicate single channel
+    swap     %d4                        | into L and R
+    move.w   %d4, %d0                   |
+    move.l   %d1, %d4                   |
+    swap     %d4                        |
+    move.w   %d4, %d1                   |
+    move.l   %d2, %d4                   |
+    swap     %d4                        |
+    move.w   %d4, %d2                   |
+    move.l   %d3, %d4                   |
+    swap     %d4                        |
+    move.w   %d4, %d3                   |
+    movem.l  %d0-%d3, (%a3)             | write four stereo samples
+    lea.l    16(%a3), %a3               | advance dst by one line
+    cmp.l    %a3, %a1                   | another full line before the end?
+    bhi.b    .som_lineloop              | yes? keep writing lines
+.som_longloop_1_start:
+    cmp.l    %a3, %a0                   | any longwords left?
+    bls.b    .som_done                  | no? finished.
+.som_longloop_1:
+    move.l   (%a2)+, %d1                | handle trailing longwords
+    mac.l    %d1, %d5, %acc0            | the same way as leading ones
+    movclr.l %acc0, %d1                 |
+    move.l   %d1, %d2                   |
+    swap     %d2                        |
+    move.w   %d2, %d1                   |
+    move.l   %d1, (%a3)+                |
+    cmp.l    %a3, %a0                   | reached end address?
+    bhi.b    .som_longloop_1            | no? next trailing longword
+.som_done:
+    movem.l  (%sp), %d1-%d5/%a2-%a3     | restore registers
+    move.l   %d1, %macsr                | ...and the caller's macsr
+    lea.l    28(%sp), %sp               | cleanup
+    rts                                 |
+.som_end:
+    .size    sample_output_mono, .som_end-sample_output_mono |