diff options
Diffstat (limited to 'apps/codecs/libffmpegFLAC/coldfire.S')
| -rw-r--r-- | apps/codecs/libffmpegFLAC/coldfire.S | 535 |
1 files changed, 0 insertions, 535 deletions
diff --git a/apps/codecs/libffmpegFLAC/coldfire.S b/apps/codecs/libffmpegFLAC/coldfire.S
deleted file mode 100644
index efbb907..0000000
--- a/apps/codecs/libffmpegFLAC/coldfire.S
+++ /dev/null
@@ -1,535 +0,0 @@
-/***************************************************************************
- *             __________               __   ___.
- *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
- *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
- *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
- *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
- *                     \/            \/     \/    \/            \/
- * $Id$
- *
- * Copyright (C) 2005 by Thom Johansen
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
- * KIND, either express or implied.
- *
- ****************************************************************************/
-
-/* The following are assembler optimised versions of the LPC filtering
-   routines needed for FLAC decoding. They are optimised for use with the
-   MCF5249 processor, or any other similar ColdFire core with the EMAC unit.
- */
-
-/* Conventions shared by both routines in this file:
-    - The five arguments are read from the stack in the order
-      blocksize, qlevel, pred_order, data, coeffs.
-    - d2-d7/a2-a6 (11 registers, 44 bytes) are saved on entry and restored at
-      the common .exit label at the very end of the file.
-    - Samples and coefficients are accessed as 32 bit words.
-    - Decoding is done in place: every data slot initially holds a residual
-      and is overwritten with (prediction + residual).
-    - The last coefficient of the table is multiplied with the lowest-address
-      (oldest) history sample, coeffs[0] with the sample directly before the
-      output slot.
-    - In the unrolled loops the coefficient movem.l may overwrite a1; the
-      coeffs pointer is not needed after that load.
-    - NOTE(review): a negative pred_order is not rejected before the jump
-      tables; callers are presumably expected to pass pred_order >= 0.
- */
-
-/* This routine deals with sample widths 16 and lower. All LPC filtering up to
-   order 10 is done in specially optimised unrolled loops, while every order
-   above this is handled by a slower default routine.
-   Only the low 32 bits of the accumulator are used for the sum here.
- */
-    .section .icode,"ax",@progbits
-    .global lpc_decode_emac
-    .align 2
-lpc_decode_emac:
-    lea.l (-44, %sp), %sp
-    movem.l %d2-%d7/%a2-%a6, (%sp)
-    movem.l (44+4, %sp), %d0-%d2/%a0-%a1
-    /* d0 = blocksize, d1 = qlevel, d2 = pred_order
-       a0 = data, a1 = coeffs
-     */
-
-    /* the history pointer always lags behind the data pointer by 'pred_order'
-       samples. since we have one loop for each order, we can hard code this
-       and free a register by not saving data pointer.
-     */
-    move.l %d2, %d3
-    neg.l %d3
-    lea.l (%a0, %d3.l*4), %a0   | history
-    clr.l %d3
-    move.l %d3, %macsr          | we'll need integer mode for this
-    tst.l %d0
-    jeq .exit                   | zero samples to process, exit
-    moveq.l #10, %d3
-    cmp.l %d3, %d2
-    jgt .default                | order is over 10, jump to default case
-    /* every bra.w entry below is 4 bytes, hence the *4 index scale */
-    jmp.l (2, %pc, %d2.l*4)     | jump to loop corresponding to pred_order
-| jumptable:
-    bra.w .exit                 | zero order filter isn't possible, exit function
-    bra.w .order1
-    bra.w .order2
-    bra.w .order3
-    bra.w .order4
-    bra.w .order5
-    bra.w .order6
-    bra.w .order7
-    bra.w .order8
-    bra.w .order9
-
-| last jump table entry coincides with target, so leave it out
-.order10:
-    /* Loop shape used by .order10 down to .order3: a6 holds the current
-       history sample and each mac.l fetches the following one. After the last
-       post-increment a0 points at the output slot. The final mac.l reloads
-       the second history sample for the next iteration, and a0 is then
-       rewound to point just behind it.
-     */
-    movem.l (%a1), %d3-%d7/%a1-%a5 | load lpc coefs
-    move.l (%a0)+, %a6          | load first history sample
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (%a0)+, %a6, %acc0
-    mac.l %a6, %a2, (%a0)+, %a6, %acc0
-    mac.l %a6, %a1, (%a0)+, %a6, %acc0
-    mac.l %a6, %d7, (%a0)+, %a6, %acc0
-    mac.l %a6, %d6, (%a0)+, %a6, %acc0
-    mac.l %a6, %d5, (%a0)+, %a6, %acc0
-    mac.l %a6, %d4, (%a0)+, %a6, %acc0
-    mac.l %a6, %d3, (-9*4, %a0), %a6, %acc0 | load for the next iteration
-    movclr.l %acc0, %d2         | get sum
-    asr.l %d1, %d2              | shift sum by qlevel bits
-    add.l %d2, (%a0)            | add residual and save
-    lea.l (-8*4, %a0), %a0      | point history back at second element
-    subq.l #1, %d0              | decrement sample count
-    jne 1b                      | are we done?
-    jra .exit
-
-.order9:
-    movem.l (%a1), %d4-%d7/%a1-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (%a0)+, %a6, %acc0
-    mac.l %a6, %a2, (%a0)+, %a6, %acc0
-    mac.l %a6, %a1, (%a0)+, %a6, %acc0
-    mac.l %a6, %d7, (%a0)+, %a6, %acc0
-    mac.l %a6, %d6, (%a0)+, %a6, %acc0
-    mac.l %a6, %d5, (%a0)+, %a6, %acc0
-    mac.l %a6, %d4, (-8*4, %a0), %a6, %acc0
-    movclr.l %acc0, %d2
-    asr.l %d1, %d2
-    add.l %d2, (%a0)
-    lea.l (-7*4, %a0), %a0
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.order8:
-    movem.l (%a1), %d5-%d7/%a1-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (%a0)+, %a6, %acc0
-    mac.l %a6, %a2, (%a0)+, %a6, %acc0
-    mac.l %a6, %a1, (%a0)+, %a6, %acc0
-    mac.l %a6, %d7, (%a0)+, %a6, %acc0
-    mac.l %a6, %d6, (%a0)+, %a6, %acc0
-    mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0
-    movclr.l %acc0, %d2
-    asr.l %d1, %d2
-    add.l %d2, (%a0)
-    lea.l (-6*4, %a0), %a0
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.order7:
-    movem.l (%a1), %d6-%d7/%a1-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (%a0)+, %a6, %acc0
-    mac.l %a6, %a2, (%a0)+, %a6, %acc0
-    mac.l %a6, %a1, (%a0)+, %a6, %acc0
-    mac.l %a6, %d7, (%a0)+, %a6, %acc0
-    mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0
-    movclr.l %acc0, %d2
-    asr.l %d1, %d2
-    add.l %d2, (%a0)
-    lea.l (-5*4, %a0), %a0
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.order6:
-    movem.l (%a1), %d7/%a1-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (%a0)+, %a6, %acc0
-    mac.l %a6, %a2, (%a0)+, %a6, %acc0
-    mac.l %a6, %a1, (%a0)+, %a6, %acc0
-    mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0
-    movclr.l %acc0, %d2
-    asr.l %d1, %d2
-    add.l %d2, (%a0)
-    lea.l (-4*4, %a0), %a0
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.order5:
-    movem.l (%a1), %a1-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (%a0)+, %a6, %acc0
-    mac.l %a6, %a2, (%a0)+, %a6, %acc0
-    mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0
-    movclr.l %acc0, %d2
-    asr.l %d1, %d2
-    add.l %d2, (%a0)
-    lea.l (-3*4, %a0), %a0
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.order4:
-    movem.l (%a1), %a2-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (%a0)+, %a6, %acc0
-    mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0
-    movclr.l %acc0, %d2
-    asr.l %d1, %d2
-    add.l %d2, (%a0)
-    subq.l #8, %a0              /* same rewind as the lea.l above: 2*4 bytes */
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.order3:
-    movem.l (%a1), %a3-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0
-    movclr.l %acc0, %d2
-    asr.l %d1, %d2
-    add.l %d2, (%a0)
-    subq.l #4, %a0
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.order2:
-    /* no rewind needed: a0 already points at the next output slot's
-       predecessor and a6 keeps the sample loaded by the first mac.l */
-    movem.l (%a1), %a4-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, %acc0       | data for next iteration is already loaded
-    movclr.l %acc0, %d2
-    asr.l %d1, %d2
-    add.l %d2, (%a0)
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.order1:
-    | no point in using mac here
-    move.l (%a1), %a5
-1:
-    move.l %a5, %d2
-    muls.l (%a0)+, %d2          /* previous sample is re-read from memory */
-    asr.l %d1, %d2
-    add.l %d2, (%a0)
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.default:
-    /* we do the filtering in an unrolled by 4 loop as far as we can, and then
-       do the rest by jump table. The whole setup below is redone for every
-       output sample, since the outer loop branches back to .default. */
-    lea.l (%a1, %d2.l*4), %a2   | need to start in the other end of coefs
-    move.l %a0, %a3             | working copy of history pointer
-    move.l %d2, %d3
-    lsr.l #2, %d3               | coefs/4, num of iterations needed in next loop
-    move.l (%a3)+, %a5          | preload data for loop
-1:
-    lea.l (-4*4, %a2), %a2      | move lpc coef pointer four samples backwards
-    movem.l (%a2), %d4-%d7      | load four coefs
-    mac.l %a5, %d7, (%a3)+, %a5, %acc0
-    mac.l %a5, %d6, (%a3)+, %a5, %acc0
-    mac.l %a5, %d5, (%a3)+, %a5, %acc0
-    mac.l %a5, %d4, (%a3)+, %a5, %acc0
-    subq.l #1, %d3              | any more unrolled loop operations left?
-    jne 1b
-
-    moveq.l #3, %d3             | mask 0x00000003
-    and.l %d2, %d3              | get the remaining samples to be filtered
-    /* every bra.b entry below is 2 bytes, hence the *2 index scale */
-    jmp.l (2, %pc, %d3*2)       | then jump into mac.l chain
-| jumptable:
-    bra.b 3f                    | none left
-    bra.b 2f                    | one left
-    bra.b 1f                    | two left
-| three left
-    move.l -(%a2), %d4
-    mac.l %a5, %d4, (%a3)+, %a5, %acc0
-1:
-    move.l -(%a2), %d4
-    mac.l %a5, %d4, (%a3)+, %a5, %acc0
-2:
-    move.l -(%a2), %d4
-    mac.l %a5, %d4, (%a3)+, %a5, %acc0
-3:
-    movclr.l %acc0, %d3         | get result
-    asr.l %d1, %d3              | shift qlevel bits right
-    add.l %a5, %d3              | add residual, which is in a5 by now
-    move.l %d3, -(%a3)          | save, a3 is also one past save location
-    addq.l #4, %a0              | increment history pointer
-    subq.l #1, %d0              | decrement sample count
-    jne .default                | are we done?
-    jra .exit                   | if so, jump to the shared exit
-
-
-/* This routine deals with sample widths 24 and lower. All LPC filtering up to
-   order 8 is done in specially optimised unrolled loops, while every order
-   above this is handled by a slower default routine.
-   The sum is assembled from two reads of the accumulator:
-   (acc0 >> qlevel) | (accext01 << (32 - qlevel)).
- */
-    .global lpc_decode_emac_wide
-    .align 2
-lpc_decode_emac_wide:
-    lea.l (-44, %sp), %sp
-    movem.l %d2-%d7/%a2-%a6, (%sp)
-    movem.l (44+4, %sp), %d0-%d1/%d3/%a0-%a1
-    /* d0 = blocksize, d1 = qlevel, d3 = pred_order
-       a0 = data, a1 = coeffs
-     */
-
-    /* the history pointer always lags behind the data pointer by 'pred_order'
-       samples. since we have one loop for each order, we can hard code this
-       and free a register by not saving data pointer.
-     */
-    move.l %d3, %d2
-    neg.l %d2
-    lea.l (%a0, %d2.l*4), %a0   | history
-    clr.l %d2
-    move.l %d2, %macsr          | we'll need integer mode for this
-    tst.l %d0
-    jeq .exit                   | zero samples to process, exit
-    moveq.l #32, %d2
-    sub.l %d1, %d2              | calculate shift amount for extension byte
-    moveq.l #8, %d4
-    cmp.l %d4, %d3
-    jgt .wdefault               | order is over 8, jump to default case
-    /* every bra.w entry below is 4 bytes, hence the *4 index scale */
-    jmp.l (2, %pc, %d3.l*4)     | jump to loop corresponding to pred_order
-| jumptable:
-    bra.w .exit                 | zero order filter isn't possible, exit function
-    bra.w .worder1
-    bra.w .worder2
-    bra.w .worder3
-    bra.w .worder4
-    bra.w .worder5
-    bra.w .worder6
-    bra.w .worder7
-
-| last jump table entry coincides with target, so leave it out
-.worder8:
-    /* same loop shape as the unrolled loops of lpc_decode_emac; d2 holds
-       32 - qlevel for the whole routine and must not be clobbered */
-    movem.l (%a1), %d5-%d7/%a1-%a5 | load lpc coefs
-    move.l (%a0)+, %a6          | load first history sample
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (%a0)+, %a6, %acc0
-    mac.l %a6, %a2, (%a0)+, %a6, %acc0
-    mac.l %a6, %a1, (%a0)+, %a6, %acc0
-    mac.l %a6, %d7, (%a0)+, %a6, %acc0
-    mac.l %a6, %d6, (%a0)+, %a6, %acc0
-    mac.l %a6, %d5, (-7*4, %a0), %a6, %acc0 | load for the next iteration
-    move.l %accext01, %d4       | get top 8 bits of sum
-    movclr.l %acc0, %d3         | then bottom 32 bits
-    lsr.l %d1, %d3              | shift bottom bits qlevel bits right
-    asl.l %d2, %d4              | shift top bits 32 - qlevel bits left
-    or.l %d4, %d3               | now combine results
-    add.l %d3, (%a0)            | add residual and save
-    lea.l (-6*4, %a0), %a0      | point history back at second element
-    subq.l #1, %d0              | decrement sample count
-    jne 1b                      | are we done?
-    jra .exit
-
-.worder7:
-    movem.l (%a1), %d6-%d7/%a1-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (%a0)+, %a6, %acc0
-    mac.l %a6, %a2, (%a0)+, %a6, %acc0
-    mac.l %a6, %a1, (%a0)+, %a6, %acc0
-    mac.l %a6, %d7, (%a0)+, %a6, %acc0
-    mac.l %a6, %d6, (-6*4, %a0), %a6, %acc0
-    move.l %accext01, %d4
-    movclr.l %acc0, %d3
-    lsr.l %d1, %d3
-    asl.l %d2, %d4
-    or.l %d4, %d3
-    add.l %d3, (%a0)
-    lea.l (-5*4, %a0), %a0
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.worder6:
-    movem.l (%a1), %d7/%a1-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (%a0)+, %a6, %acc0
-    mac.l %a6, %a2, (%a0)+, %a6, %acc0
-    mac.l %a6, %a1, (%a0)+, %a6, %acc0
-    mac.l %a6, %d7, (-5*4, %a0), %a6, %acc0
-    move.l %accext01, %d4
-    movclr.l %acc0, %d3
-    lsr.l %d1, %d3
-    asl.l %d2, %d4
-    or.l %d4, %d3
-    add.l %d3, (%a0)
-    lea.l (-4*4, %a0), %a0
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.worder5:
-    movem.l (%a1), %a1-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (%a0)+, %a6, %acc0
-    mac.l %a6, %a2, (%a0)+, %a6, %acc0
-    mac.l %a6, %a1, (-4*4, %a0), %a6, %acc0
-    move.l %accext01, %d4
-    movclr.l %acc0, %d3
-    lsr.l %d1, %d3
-    asl.l %d2, %d4
-    or.l %d4, %d3
-    add.l %d3, (%a0)
-    lea.l (-3*4, %a0), %a0
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.worder4:
-    movem.l (%a1), %a2-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (%a0)+, %a6, %acc0
-    mac.l %a6, %a2, (-3*4, %a0), %a6, %acc0
-    move.l %accext01, %d4
-    movclr.l %acc0, %d3
-    lsr.l %d1, %d3
-    asl.l %d2, %d4
-    or.l %d4, %d3
-    add.l %d3, (%a0)
-    subq.l #8, %a0
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.worder3:
-    movem.l (%a1), %a3-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, (%a0)+, %a6, %acc0
-    mac.l %a6, %a3, (-2*4, %a0), %a6, %acc0
-    move.l %accext01, %d4
-    movclr.l %acc0, %d3
-    lsr.l %d1, %d3
-    asl.l %d2, %d4
-    or.l %d4, %d3
-    add.l %d3, (%a0)
-    subq.l #4, %a0
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.worder2:
-    movem.l (%a1), %a4-%a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0)+, %a6, %acc0
-    mac.l %a6, %a4, %acc0       | data for next iteration is already loaded
-    move.l %accext01, %d4
-    movclr.l %acc0, %d3
-    lsr.l %d1, %d3
-    asl.l %d2, %d4
-    or.l %d4, %d3
-    add.l %d3, (%a0)
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.worder1:
-    /* a6 carries the previous decoded sample from one iteration to the next.
-       The mac.l replaces a6 with the residual it loads, so a6 has to be
-       refreshed with the decoded sample once that has been computed. */
-    move.l (%a1), %a5
-    move.l (%a0)+, %a6
-1:
-    mac.l %a6, %a5, (%a0), %a6, %acc0
-    move.l %accext01, %d4
-    movclr.l %acc0, %d3
-    lsr.l %d1, %d3
-    asl.l %d2, %d4
-    or.l %d4, %d3
-    add.l %a6, %d3              | residual is already in a6
-    move.l %d3, (%a0)+
-    move.l %d3, %a6             | predict the next sample from the decoded value, not the residual
-    subq.l #1, %d0
-    jne 1b
-    jra .exit
-
-.wdefault:
-    /* we do the filtering in an unrolled by 4 loop as far as we can, and then
-       do the rest by jump table. The whole setup below is redone for every
-       output sample, since the outer loop branches back to .wdefault. */
-    lea.l (%a1, %d3.l*4), %a2   | need to start in the other end of coefs
-    move.l %a0, %a3             | working copy of history pointer
-    move.l %d3, %d4
-    lsr.l #2, %d4               | coefs/4, num of iterations needed in next loop
-    move.l (%a3)+, %a5          | preload data for loop
-1:
-    lea.l (-4*4, %a2), %a2      | move lpc coef pointer four samples backwards
-    movem.l (%a2), %d5-%d7/%a4  | load four coefs
-    mac.l %a5, %a4, (%a3)+, %a5, %acc0
-    mac.l %a5, %d7, (%a3)+, %a5, %acc0
-    mac.l %a5, %d6, (%a3)+, %a5, %acc0
-    mac.l %a5, %d5, (%a3)+, %a5, %acc0
-    subq.l #1, %d4              | any more unrolled loop operations left?
-    jne 1b
-
-    moveq.l #3, %d4             | mask 0x00000003
-    and.l %d3, %d4              | get the remaining samples to be filtered
-    /* every bra.b entry below is 2 bytes, hence the *2 index scale */
-    jmp.l (2, %pc, %d4*2)       | then jump into mac.l chain
-| jumptable:
-    bra.b 3f                    | none left
-    bra.b 2f                    | one left
-    bra.b 1f                    | two left
-| three left
-    move.l -(%a2), %d4
-    mac.l %a5, %d4, (%a3)+, %a5, %acc0
-1:
-    move.l -(%a2), %d4
-    mac.l %a5, %d4, (%a3)+, %a5, %acc0
-2:
-    move.l -(%a2), %d4
-    mac.l %a5, %d4, (%a3)+, %a5, %acc0
-3:
-    move.l %accext01, %d5       | get high 32 bits of result
-    movclr.l %acc0, %d4         | get low 32 bits of result
-    lsr.l %d1, %d4              | shift qlevel bits right
-    asl.l %d2, %d5              | shift 32 - qlevel bits left
-    or.l %d5, %d4               | combine top and low bits after shift
-    add.l %a5, %d4              | add residual, which is in a5 by now
-    move.l %d4, -(%a3)          | save, a3 is also one past save location
-    addq.l #4, %a0              | increment history pointer
-    subq.l #1, %d0              | decrement sample count
-    jne .wdefault               | are we done?
-                                | if so, fall through to exit
-
-/* common epilogue for both routines: restore the registers saved on entry */
-.exit:
-    movem.l (%sp), %d2-%d7/%a2-%a6
-    lea.l (44, %sp), %sp
-    rts |