summaryrefslogtreecommitdiff
path: root/apps/codecs
diff options
context:
space:
mode:
authorThom Johansen <thomj@rockbox.org>2005-05-31 07:56:28 +0000
committerThom Johansen <thomj@rockbox.org>2005-05-31 07:56:28 +0000
commit9985caf3f96df691fad9332986b7af4d0f66676d (patch)
tree835adf7c966dcc50f0a4a58da1c9726a01835c12 /apps/codecs
parentff40e4cc6a0a66e0eecaceae784203298c8c408d (diff)
downloadrockbox-9985caf3f96df691fad9332986b7af4d0f66676d.zip
rockbox-9985caf3f96df691fad9332986b7af4d0f66676d.tar.gz
rockbox-9985caf3f96df691fad9332986b7af4d0f66676d.tar.bz2
rockbox-9985caf3f96df691fad9332986b7af4d0f66676d.tar.xz
ASM optimisation by David Bryant.
Placed various important arrays in IRAM. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6540 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/codecs')
-rw-r--r--apps/codecs/libwavpack/Makefile2
-rw-r--r--apps/codecs/libwavpack/SOURCES3
-rw-r--r--apps/codecs/libwavpack/coldfire.S535
-rw-r--r--apps/codecs/libwavpack/unpack.c45
-rw-r--r--apps/codecs/libwavpack/wputils.c2
5 files changed, 565 insertions, 22 deletions
diff --git a/apps/codecs/libwavpack/Makefile b/apps/codecs/libwavpack/Makefile
index df26559..75b9060 100644
--- a/apps/codecs/libwavpack/Makefile
+++ b/apps/codecs/libwavpack/Makefile
@@ -15,7 +15,7 @@ INCLUDES += -I$(APPSDIR)/$(APPEXTRA)
endif
CFLAGS = $(GCCOPTS) \
-$(INCLUDES) $(TARGET) $(EXTRA_DEFINES) -DMEM=${MEMORYSIZE}
+$(INCLUDES) $(TARGET) $(EXTRA_DEFINES) -DMEM=${MEMORYSIZE} -O2 \
# This sets up 'SRC' based on the files mentioned in SOURCES
include $(TOOLSDIR)/makesrc.inc
diff --git a/apps/codecs/libwavpack/SOURCES b/apps/codecs/libwavpack/SOURCES
index def57b7..a4f0f2f 100644
--- a/apps/codecs/libwavpack/SOURCES
+++ b/apps/codecs/libwavpack/SOURCES
@@ -4,4 +4,7 @@ metadata.c
unpack.c
words.c
wputils.c
+#if CONFIG_CPU==MCF5249 && !defined(SIMULATOR)
+coldfire.S
+#endif
diff --git a/apps/codecs/libwavpack/coldfire.S b/apps/codecs/libwavpack/coldfire.S
new file mode 100644
index 0000000..9c7e098
--- /dev/null
+++ b/apps/codecs/libwavpack/coldfire.S
@@ -0,0 +1,535 @@
+/***************************************************************************
+ * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___
+ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+ * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+ * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+ * \/ \/ \/ \/ \/
+ * $Id$
+ *
+ * Copyright (C) 2005 by David Bryant
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/* This is an assembly optimized version of the following WavPack function:
+ *
+ * void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp,
+ * long *buffer, long sample_count);
+ *
+ * It performs a single pass of stereo decorrelation on the provided buffer.
+ * Note that this version of the function requires that the 8 previous stereo
+ * samples are visible and correct. In other words, it ignores the "samples_*"
+ * fields in the decorr_pass structure and gets the history data directly
+ * from the buffer. It does, however, return the appropriate history samples
+ * to the decorr_pass structure before returning.
+ *
+ * This is written to work on a MCF5249 processor, or any processor based on
+ * the ColdFire V2 core with an EMAC unit. The EMAC is perfectly suited for
+ * the "apply_weight" function of WavPack decorrelation because it provides
+ * the required 40-bit product. The fractional rounding mode of the EMAC is not
+ * configurable and uses "round to even" while WavPack uses "round to larger",
+ * so the rounding has to be done manually.
+ */
+
+ .text
+ .align 2
+ .global decorr_stereo_pass_cont_mcf5249
+
+decorr_stereo_pass_cont_mcf5249:
+
+ lea (-44, %sp), %sp | reserve 44 bytes (11 registers * 4)
+ movem.l %d2-%d7/%a2-%a6, (%sp) | save d2-d7 and a2-a6
+ move.l 44+4(%sp), %a2 | a2 = dpp->
+ move.l 44+8(%sp), %a1 | a1 = bptr
+ move.w 2(%a2), %a3 | a3 = dpp->delta
+ move.w 4(%a2), %d3 | d3 = dpp->weight_A (sign extended)
+ ext.l %d3
+ move.w 6(%a2), %d4 | d4 = dpp->weight_B (sign extended)
+ ext.l %d4
+ move.l 44+12(%sp), %d0 | d0 = sample_count
+ jbeq return_only | if zero, nothing to do
+
+ lsl.l #3, %d0 | d5 = bptr + (sample_count * 8)
+ move.l %d0, %d5
+ add.l %a1, %d5 | d5 is now eptr (loop end address)
+
+ moveq.l #17, %d0 | left shift weights & delta 17 places
+ asl.l %d0, %d3
+ asl.l %d0, %d4
+ move.l %a3, %d1 | delta is shifted in d1, then put back in a3
+ asl.l %d0, %d1
+ move.l %d1, %a3
+
+ move.l #0x20, %macsr | set fractional mode for MAC
+ move.l #0, %acc1 | acc1 = 0x00 0000 80 (for rounding)
+ move.l #0x800000, %accext01
+
+ move.l #1024<<17, %d6 | d6 & d7 are weight clipping limits
+ move.l #-1024<<17, %d7 | (only used by negative terms)
+
+ move.w (%a2), %d0 | d0 = term
+ ext.l %d0
+ cmp.l #17, %d0
+ jbeq term_17 | term = 17
+ cmp.l #18, %d0
+ jbeq term_18 | term = 18
+ addq.l #1, %d0
+ jbeq term_minus_1 | term = -1
+ addq.l #1, %d0
+ jbeq term_minus_2 | term = -2
+ addq.l #1, %d0
+ jbeq term_minus_3 | term = -3
+ jbra term_default | default term = 1 - 8
+
+|------------------------------------------------------------------------------
+| Loop to handle term = 17 condition
+|
+| a0 = d0 = (2 * bptr [-1]) - bptr [-2]
+| a1 = bptr d1 = initial bptr [0]
+| a2 = dpp-> d2 = updated bptr [0]
+| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
+| a4 = d4 = dpp->weight_B << 17
+| a5 = d5 = eptr
+| macsr = 0x20 acc1 = 0x00 0000 80
+|------------------------------------------------------------------------------
+
+term_17:
+ move.l -8(%a1), %d0 | d0 = 2 * bptr [-1] - bptr [-2]
+ add.l %d0, %d0
+ sub.l -16(%a1), %d0
+ beq .L251 | if zero, skip calculation
+ move.l %acc1, %acc0
+ asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_A
+ mac.l %d0, %d3, %acc0
+ move.l (%a1), %d1
+ beq .L255 | if bptr [0] is zero, skip weight update
+ eor.l %d1, %d0 | else compare signs
+ bge .L256 | if same, add delta to weight
+ sub.l %a3, %d3 | else subtract delta from weight
+ sub.l %a3, %d3 | subtract again instead of branch
+.L256: add.l %a3, %d3 | add delta to weight
+
+.L255: move.l %acc0, %d2 | d2 = rounded product
+ add.l %d1, %d2 | update bptr [0] and store
+ move.l %d2, (%a1)+
+
+.L253: move.l -8(%a1), %d0 | d0 = 2 * bptr [-1] - bptr [-2]
+ add.l %d0, %d0
+ sub.l -16(%a1), %d0
+ beq .L257 | if zero, skip calculations
+ move.l %acc1, %acc0
+ asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_B
+ mac.l %d0, %d4, %acc0
+ move.l (%a1), %d1
+ beq .L254 | if bptr [0] is zero, skip weight update
+ eor.l %d1, %d0 | else compare signs
+ bge .L259 | if same, add delta to weight
+ sub.l %a3, %d4 | else subtract delta from weight
+ sub.l %a3, %d4 | subtract again instead of branch
+.L259: add.l %a3, %d4 | add delta to weight
+
+.L254: move.l %acc0, %d2 | d2 = rounded product
+ add.l %d1, %d2 | update bptr [0] and store
+ move.l %d2, (%a1)+
+
+.L252: cmp.l %a1, %d5 | loop if bptr < eptr
+ jbhi term_17
+ bra term_17_18_finish | exit through common path
+
+.L251: addq.l #4, %a1 | bump pointer and jump back into loop
+ bra .L253
+
+.L257: addq.l #4, %a1 | bump pointer and jump back into loop
+ bra .L252
+
+|------------------------------------------------------------------------------
+| Loop to handle term = 18 condition
+|
+| a0 = d0 = ((3 * bptr [-1]) - bptr [-2]) >> 1
+| a1 = bptr d1 = initial bptr [0]
+| a2 = dpp-> d2 = updated bptr [0]
+| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
+| a4 = d4 = dpp->weight_B << 17
+| a5 = d5 = eptr
+| macsr = 0x20 acc1 = 0x00 0000 80
+|------------------------------------------------------------------------------
+
+term_18:
+ move.l -8(%a1), %a0 | d0 = (3 * bptr [-1] - bptr [-2]) >> 1
+ lea (%a0,%a0.l*2), %a0
+ move.l %a0, %d0
+ sub.l -16(%a1), %d0
+ asr.l #1, %d0
+ beq .L260 | if zero, skip calculation
+ move.l %acc1, %acc0
+ asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_A
+ mac.l %d0, %d3, %acc0
+ move.l (%a1), %d1
+ beq .L266 | if bptr [0] is zero, skip weight update
+ eor.l %d1, %d0 | else compare signs
+ bge .L267 | if same, add delta to weight
+ sub.l %a3, %d3 | else subtract delta from weight
+ sub.l %a3, %d3 | subtract again instead of branch
+.L267: add.l %a3, %d3 | add delta to weight
+
+.L266: move.l %acc0, %d2 | d2 = rounded product
+ add.l %d1, %d2 | add applied weight to bptr [0], store
+ move.l %d2, (%a1)+
+
+.L268: move.l -8(%a1), %a0 | d0 = (3 * bptr [-1] - bptr [-2]) >> 1
+ lea (%a0,%a0.l*2), %a0
+ move.l %a0, %d0
+ sub.l -16(%a1), %d0
+ asr.l #1, %d0
+ beq .L261 | if zero, skip calculation
+ move.l %acc1, %acc0
+ asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_B
+ mac.l %d0, %d4, %acc0
+ move.l (%a1), %d1
+ beq .L265 | if bptr [0] is zero, skip weight update
+ eor.l %d1, %d0 | else compare signs
+ bge .L270 | if same, add delta to weight
+ sub.l %a3, %d4 | else subtract delta from weight
+ sub.l %a3, %d4 | subtract again instead of branch
+.L270: add.l %a3, %d4 | add delta to weight
+
+.L265: move.l %acc0, %d2 | d2 = rounded product
+ add.l %d1, %d2 | add applied weight to bptr [0], store
+ move.l %d2, (%a1)+
+
+.L269: cmp.l %a1, %d5 | loop if bptr < eptr
+ jbhi term_18
+ bra term_17_18_finish | exit through common path
+
+.L260: addq.l #4, %a1 | bump pointer and jump back into loop
+ bra .L268
+
+.L261: addq.l #4, %a1 | bump pointer and jump back into loop
+ bra .L269
+
+term_17_18_finish:
+ move.l -4(%a1), 40(%a2) | dpp->samples_B [0] = bptr [-1]
+ move.l -8(%a1), 8(%a2) | dpp->samples_A [0] = bptr [-2]
+ move.l -12(%a1), 44(%a2) | dpp->samples_B [1] = bptr [-3]
+ move.l -16(%a1), 12(%a2) | dpp->samples_A [1] = bptr [-4]
+ jbra finish_up
+
+|------------------------------------------------------------------------------
+| Loop to handle default terms (i.e. 1 - 8)
+|
+| a0 = tptr d0 = tptr [0]
+| a1 = bptr d1 = initial bptr [0]
+| a2 = dpp-> d2 = updated bptr [0]
+| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
+| a4 = d4 = dpp->weight_B << 17
+| a5 = d5 = eptr
+| macsr = 0x20 acc1 = 0x00 0000 80
+|------------------------------------------------------------------------------
+
+term_default:
+ move.w (%a2), %d0 | a0 = a1 - (dpp->term * 8)
+ ext.l %d0
+ lsl.l #3, %d0
+ move.l %a1, %a0
+ sub.l %d0, %a0
+
+term_default_loop:
+ move.l (%a0)+, %d0 | d0 = tptr [0], skip ahead if zero
+ beq .L271
+ move.l %acc1, %acc0
+ asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_A
+ mac.l %d0, %d3, %acc0
+ move.l (%a1), %d1
+ beq .L277 | if bptr [0] is zero, skip weight update
+ eor.l %d1, %d0 | else compare signs
+ bge .L278 | if same, add delta to weight
+ sub.l %a3, %d3 | else subtract delta from weight
+ sub.l %a3, %d3 | subtract again instead of branch
+.L278: add.l %a3, %d3 | add delta to weight
+
+.L277: move.l %acc0, %d2 | d2 = rounded product
+ add.l %d1, %d2 | add applied weight to bptr [0], store
+ move.l %d2, (%a1)+
+
+.L275: move.l (%a0)+, %d0 | d0 = tptr [0], skip ahead if zero
+ beq .L272
+ move.l %acc1, %acc0
+ asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_B
+ mac.l %d0, %d4, %acc0
+ move.l (%a1), %d1
+ beq .L276 | if bptr [0] is zero, skip weight update
+ eor.l %d1, %d0 | else compare signs
+ bge .L281 | if same, add delta to weight
+ sub.l %a3, %d4 | else subtract delta from weight
+ sub.l %a3, %d4 | subtract again instead of branch
+.L281: add.l %a3, %d4 | add delta to weight
+
+.L276: move.l %acc0, %d2 | d2 = rounded product
+ add.l %d1, %d2 | add applied weight to bptr [0], store
+ move.l %d2, (%a1)+
+
+.L274: cmp.l %a1, %d5 | loop back if bptr < eptr
+ jbhi term_default_loop
+ move.w (%a2), %d0 | d0 = term (low word; decremented & masked below)
+ moveq.l #8, %d1 | d1 = loop counter
+
+.L323: subq.l #1, %d0 | back up & mask index
+ and.l #7, %d0
+ move.l -(%a1), 40(%a2,%d0.l*4) | store dpp->samples_B [d0]
+ move.l -(%a1), 8(%a2,%d0.l*4) | store dpp->samples_A [d0]
+ subq.l #1, %d1 | loop on count
+ jbne .L323
+ jbra finish_up
+
+.L271: addq.l #4, %a1 | bump pointer and jump back into loop
+ bra .L275
+
+.L272: addq.l #4, %a1 | bump pointer and jump back into loop
+ bra .L274
+
+
+|------------------------------------------------------------------------------
+| Loop to handle term = -1 condition
+|
+| a0 = d0 = decorrelation sample
+| a1 = bptr d1 = initial bptr [0]
+| a2 = dpp-> d2 = updated bptr [0]
+| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
+| a4 = d4 = dpp->weight_B << 17
+| a5 = d5 = eptr
+| a6 = d6 = 1024 << 17
+| a7 = d7 = -1024 << 17
+| macsr = 0x20 acc1 = 0x00 0000 80
+|------------------------------------------------------------------------------
+
+term_minus_1:
+ move.l -4(%a1), %d0 | d0 = bptr [-1]
+ beq .L402 | if zero, skip the weight_A step
+ move.l %acc1, %acc0
+ asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_A)
+ mac.l %d0, %d3, %acc0
+ move.l (%a1), %d1
+ beq .L405 | if bptr [0] is zero, skip weight update
+ eor.l %d1, %d0 | else compare signs
+ bge .L404 | if same, add delta to weight
+ sub.l %a3, %d3 | else subtract delta from weight
+ cmp.l %d7, %d3 | check for negative clip limit
+ bge .L405
+ move.l %d7, %d3
+ bra .L405
+
+.L404: add.l %a3, %d3 | add delta to weight
+ cmp.l %d6, %d3 | check for positive clip limit
+ ble .L405
+ move.l %d6, %d3
+
+.L405: move.l %acc0, %d0 | d0 = rounded product
+ add.l %d1, %d0 | add applied weight to bptr [0], store
+ move.l %d0, (%a1)+
+ beq .L401 | if new bptr [0] is zero, skip the weight_B step
+
+.L410: move.l %acc1, %acc0
+ asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_B)
+ mac.l %d0, %d4, %acc0
+ move.l (%a1), %d1
+ beq .L403 | if bptr [1] is zero, skip weight update
+ eor.l %d1, %d0 | else compare signs
+ bge .L407 | if same, add delta to weight
+ sub.l %a3, %d4 | else subtract delta from weight
+ cmp.l %d7, %d4 | check for negative clip limit
+ bge .L403
+ move.l %d7, %d4
+ bra .L403
+
+.L407: add.l %a3, %d4 | add delta to weight
+ cmp.l %d6, %d4 | check for positive clip limit
+ ble .L403
+ move.l %d6, %d4
+
+.L403: move.l %acc0, %d2 | d2 = rounded product
+ add.l %d1, %d2 | add applied weight to bptr [1], store
+ move.l %d2, (%a1)+
+
+.L411: cmp.l %a1, %d5 | loop back if bptr < eptr
+ jbhi term_minus_1
+ move.l -4(%a1), 8(%a2) | dpp->samples_A [0] = bptr [-1]
+ jbra finish_up
+
+.L402: move.l (%a1)+, %d0 | d0 = unchanged bptr [0], bump pointer
+ bne .L410 | if non-zero, do the weight_B step
+
+.L401: addq.l #4, %a1 | bump pointer past bptr [1], rejoin loop
+ bra .L411
+
+
+|------------------------------------------------------------------------------
+| Loop to handle term = -2 condition
+|
+| a0 = d0 = decorrelation sample
+| a1 = bptr d1 = initial bptr [0]
+| a2 = dpp-> d2 = updated bptr [0]
+| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
+| a4 = d4 = dpp->weight_B << 17
+| a5 = d5 = eptr
+| a6 = d6 = 1024 << 17
+| a7 = d7 = -1024 << 17
+| macsr = 0x20 acc1 = 0x00 0000 80
+|------------------------------------------------------------------------------
+
+term_minus_2:
+ move.l -8(%a1), %d0 | d0 = bptr [-2]
+ beq .L511 | if zero, skip the weight_B step
+ move.l %acc1, %acc0
+ asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_B)
+ mac.l %d0, %d4, %acc0
+ move.l 4(%a1), %d1 | d1 = initial bptr [1]
+ beq .L505 | if zero, skip weight update
+ eor.l %d1, %d0 | else compare signs
+ bge .L504 | if same, add delta to weight
+ sub.l %a3, %d4 | else subtract delta from weight
+ cmp.l %d7, %d4 | check for negative clip limit
+ bge .L505
+ move.l %d7, %d4
+ bra .L505
+
+.L504: add.l %a3, %d4 | add delta to weight
+ cmp.l %d6, %d4 | check for positive clip limit
+ ble .L505
+ move.l %d6, %d4
+
+.L505: move.l %acc0, %d0 | d0 = rounded product
+ add.l %d1, %d0 | add applied weight to bptr [1], store
+ move.l %d0, 4(%a1)
+ beq .L512 | if new bptr [1] is zero, skip the weight_A step
+
+.L510: move.l %acc1, %acc0
+ asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_A)
+ mac.l %d0, %d3, %acc0
+ move.l (%a1), %d1 | d1 = initial bptr [0]
+ beq .L503 | if zero, skip weight update
+ eor.l %d1, %d0 | else compare signs
+ bge .L507 | if same, add delta to weight
+ sub.l %a3, %d3 | else subtract delta from weight
+ cmp.l %d7, %d3 | check for negative clip limit
+ bge .L503
+ move.l %d7, %d3
+ bra .L503
+
+.L507: add.l %a3, %d3 | add delta to weight
+ cmp.l %d6, %d3 | check for positive clip limit
+ ble .L503
+ move.l %d6, %d3
+
+.L503: move.l %acc0, %d2 | d2 = rounded product
+ add.l %d1, %d2 | add applied weight to bptr [0], store
+ move.l %d2, (%a1)
+
+.L512: addq.l #8, %a1
+ cmp.l %a1, %d5 | loop if bptr < eptr
+ jbhi term_minus_2
+ move.l -8(%a1), 40(%a2) | dpp->samples_B [0] = bptr [-2]
+ jbra finish_up
+
+.L511: move.l 4(%a1), %d0 | d0 = unchanged bptr [1]
+ beq .L512 | if that is zero too, nothing to do
+ bra .L510
+
+
+|------------------------------------------------------------------------------
+| Loop to handle term = -3 condition
+|
+| a0 = d0 = decorrelation sample
+| a1 = bptr d1 = initial bptr [0]
+| a2 = dpp-> d2 = updated bptr [0]
+| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
+| a4 = d4 = dpp->weight_B << 17
+| a5 = d5 = eptr
+| a6 = d6 = 1024 << 17
+| a7 = d7 = -1024 << 17
+| macsr = 0x20 acc1 = 0x00 0000 80
+|------------------------------------------------------------------------------
+
+term_minus_3:
+ move.l -4(%a1), %d0 | d0 = bptr [-1]
+ beq .L301 | if zero, skip calculation
+ move.l %acc1, %acc0
+ asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_A)
+ mac.l %d0, %d3, %acc0
+ move.l (%a1), %d1
+ beq .L320 | if bptr [0] is zero, skip weight update
+ eor.l %d1, %d0 | else compare signs
+ bge .L319 | if same, add delta to weight
+ sub.l %a3, %d3 | else subtract delta from weight
+ cmp.l %d7, %d3 | check for negative clip limit
+ bge .L320
+ move.l %d7, %d3
+ bra .L320
+
+.L319: add.l %a3, %d3 | add delta to weight
+ cmp.l %d6, %d3 | check for positive clip limit
+ ble .L320
+ move.l %d6, %d3
+
+.L320: move.l %acc0, %d2 | d2 = rounded product
+ add.l %d1, %d2 | add applied weight to bptr [0], store
+ move.l %d2, (%a1)+
+
+.L330: move.l -12(%a1), %d0 | d0 = bptr [-2]
+ beq .L302 | if zero, skip calculation
+ move.l %acc1, %acc0
+ asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_B)
+ mac.l %d0, %d4, %acc0
+ move.l (%a1), %d1
+ beq .L318 | if bptr [1] is zero, skip weight update
+ eor.l %d1, %d0 | else compare signs
+ bge .L322 | if same, add delta to weight
+ sub.l %a3, %d4 | else subtract delta from weight
+ cmp.l %d7, %d4 | check for negative clip limit
+ bge .L318
+ move.l %d7, %d4
+ bra .L318
+
+.L322: add.l %a3, %d4 | add delta to weight
+ cmp.l %d6, %d4 | check for positive clip limit
+ ble .L318
+ move.l %d6, %d4
+
+.L318: move.l %acc0, %d2 | d2 = rounded product
+ add.l %d1, %d2 | add applied weight to bptr [1], store
+ move.l %d2, (%a1)+
+
+.L331: cmp.l %a1, %d5 | loop back if bptr < eptr
+ jbhi term_minus_3
+ move.l -4(%a1), 8(%a2) | dpp->samples_A [0] = bptr [-1]
+ move.l -8(%a1), 40(%a2) | dpp->samples_B [0] = bptr [-2]
+ jbra finish_up
+
+.L301: addq.l #4, %a1 | bump pointer and jump back into loop
+ bra .L330
+
+.L302: addq.l #4, %a1 | bump pointer and jump back into loop
+ bra .L331
+
+| finish and return
+
+finish_up:
+ moveq.l #17, %d0 | shift weights back down 17 places
+ asr.l %d0, %d3
+ asr.l %d0, %d4
+ move.w %d3, 4(%a2) | weight_A, dpp->weight_A
+ move.w %d4, 6(%a2) | weight_B, dpp->weight_B
+
+ clr.l %d0 | clear up EMAC
+ move.l %d0, %acc0
+ move.l %d0, %acc1
+
+return_only:
+ movem.l (%sp), %d2-%d7/%a2-%a6 | restore d2-d7 and a2-a6
+ lea (44,%sp), %sp | release the 44-byte save area
+ rts
diff --git a/apps/codecs/libwavpack/unpack.c b/apps/codecs/libwavpack/unpack.c
index ae47378..5afaac3 100644
--- a/apps/codecs/libwavpack/unpack.c
+++ b/apps/codecs/libwavpack/unpack.c
@@ -27,7 +27,11 @@ static void strcpy_loc (char *dst, char *src) { while (*src) *dst++ = *src++; *d
// these macros implement the weight application and update operations
// that are at the heart of the decorrelation loops
+#if 0 // PERFCOND
#define apply_weight_i(weight, sample) ((weight * sample + 512) >> 10)
+#else
+#define apply_weight_i(weight, sample) ((((weight * sample) >> 8) + 2) >> 2)
+#endif
#define apply_weight_f(weight, sample) (((((sample & 0xffff) * weight) >> 9) + \
(((sample & ~0xffff) >> 9) * weight) + 1) >> 1)
@@ -39,7 +43,7 @@ static void strcpy_loc (char *dst, char *src) { while (*src) *dst++ = *src++; *d
#define apply_weight(weight, sample) ((int32_t)((weight * (int64_t) sample + 512) >> 10))
#endif
-#if 1 // PERFCOND
+#if 0 // PERFCOND
#define update_weight(weight, delta, source, result) \
if (source && result) weight -= ((((source ^ result) >> 30) & 2) - 1) * delta;
#else
@@ -315,9 +319,14 @@ int read_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
// samples unpacked, which can be less than the number requested if an error
// occurs or the end of the block is reached.
+#if CONFIG_CPU==MCF5249 && !defined(SIMULATOR)
+extern void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp, long *buffer, long sample_count);
+#else
+static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count);
+#endif
+
static void decorr_mono_pass (struct decorr_pass *dpp, long *buffer, long sample_count);
static void decorr_stereo_pass (struct decorr_pass *dpp, long *buffer, long sample_count);
-static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count);
static void fixup_samples (WavpackStream *wps, long *buffer, ulong sample_count);
long unpack_samples (WavpackContext *wpc, long *buffer, ulong sample_count)
@@ -372,7 +381,11 @@ long unpack_samples (WavpackContext *wpc, long *buffer, ulong sample_count)
else
for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++) {
decorr_stereo_pass (dpp, buffer, 8);
+#if CONFIG_CPU==MCF5249 && !defined(SIMULATOR)
+ decorr_stereo_pass_cont_mcf5249 (dpp, buffer + 16, sample_count - 8);
+#else
decorr_stereo_pass_cont (dpp, buffer + 16, sample_count - 8);
+#endif
}
if (flags & JOINT_STEREO)
@@ -530,11 +543,13 @@ static void decorr_stereo_pass (struct decorr_pass *dpp, long *buffer, long samp
dpp->weight_B = weight_B;
}
+#if CONFIG_CPU != MCF5249 || defined(SIMULATOR)
+
static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count)
{
long delta = dpp->delta, weight_A = dpp->weight_A, weight_B = dpp->weight_B;
long *bptr, *tptr, *eptr = buffer + (sample_count * 2), sam_A, sam_B;
- int k;
+ int k, i;
switch (dpp->term) {
@@ -581,23 +596,11 @@ static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long
update_weight (weight_B, delta, tptr [1], sam_A);
}
- k = dpp->term;
- dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-1];
- dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-2];
- dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-3];
- dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-4];
- dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-5];
- dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-6];
- dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-7];
- dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-8];
- dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-9];
- dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-10];
- dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-11];
- dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-12];
- dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-13];
- dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-14];
- dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-15];
- dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-16];
+ for (k = dpp->term - 1, i = 8; i--; k--) {
+ dpp->samples_B [k & (MAX_TERM - 1)] = *--bptr;
+ dpp->samples_A [k & (MAX_TERM - 1)] = *--bptr;
+ }
+
break;
case -1:
@@ -639,6 +642,8 @@ static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long
dpp->weight_B = weight_B;
}
+#endif
+
static void decorr_mono_pass (struct decorr_pass *dpp, long *buffer, long sample_count)
{
long delta = dpp->delta, weight_A = dpp->weight_A;
diff --git a/apps/codecs/libwavpack/wputils.c b/apps/codecs/libwavpack/wputils.c
index 9227b66..8d58b3b 100644
--- a/apps/codecs/libwavpack/wputils.c
+++ b/apps/codecs/libwavpack/wputils.c
@@ -45,7 +45,7 @@ static ulong read_next_header (read_stream infile, WavpackHeader *wphdr);
// large integer or floating point files (but always provides at least 24 bits
// of resolution).
-static WavpackContext wpc;
+static WavpackContext wpc IDATA_ATTR;
WavpackContext *WavpackOpenFileInput (read_stream infile, char *error)
{