summary refs log tree commit diff
diff options
context:
space:
mode:
author Jens Arnold <amiconn@rockbox.org> 2005-06-21 18:15:35 +0000
committer Jens Arnold <amiconn@rockbox.org> 2005-06-21 18:15:35 +0000
commit 8caf175c7fbbe60bc799c908fdd0b861791f5da0 (patch)
tree 735233b62593655db06b8e17f665b971080af527
parent ac0bc83777738fed73d0f2e6129feb4687a19ed4 (diff)
download rockbox-8caf175c7fbbe60bc799c908fdd0b861791f5da0.zip
rockbox-8caf175c7fbbe60bc799c908fdd0b861791f5da0.tar.gz
rockbox-8caf175c7fbbe60bc799c908fdd0b861791f5da0.tar.bz2
rockbox-8caf175c7fbbe60bc799c908fdd0b861791f5da0.tar.xz
memset() on coldfire now exploits burst mode whenever possible, giving another speed increase of up to 2.4 times for large blocks. Added a slight optimisation for small blocks as well.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6790 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- firmware/common/memset_a.S 69
1 files changed, 60 insertions, 9 deletions
diff --git a/firmware/common/memset_a.S b/firmware/common/memset_a.S
index a35fcb1..c3da668 100644
--- a/firmware/common/memset_a.S
+++ b/firmware/common/memset_a.S
@@ -123,21 +123,27 @@ _memset:
*
* register usage:
* %d0 - data (spread to all 4 bytes when using long stores)
- * %d1 - temporary
+ * %d1 - temporary / data (for burst transfer)
+ * %d2 - data (for burst transfer)
+ * %d3 - data (for burst transfer)
* %a0 - start address
* %a1 - current address (runs down from end to start)
+ * %a2 - end address (for burst transfer)
+ *
+ * For maximum speed this routine uses both long stores and burst mode,
+ * storing whole lines with movem.l. The routine fills memory from end
+ * to start in order to ease returning the start address.
*/
memset:
move.l (4,%sp),%a0 /* start address */
move.l (8,%sp),%d0 /* data */
move.l (12,%sp),%a1 /* length */
+ add.l %a0,%a1 /* %a1 = end address */
move.l %a0,%d1
- neg.l %d1
- and.l #3,%d1 /* %d1 = (4 - align_offset) % 4 */
- addq.l #4,%d1
+ addq.l #7,%d1
+ and.l #0xFFFFFFFC,%d1 /* %d1 = first long bound + 4 */
cmp.l %d1,%a1 /* at least one aligned longword to fill? */
- add.l %a0,%a1 /* %a1 = end address; doesn't change flags */
blo.b .no_longs /* no, jump directly to byte loop */
and.l #0xFF,%d0 /* start: spread data to all 4 bytes */
@@ -148,7 +154,7 @@ memset:
swap %d0
or.l %d1,%d0 /* data now in all 4 bytes of %d0 */
- mov.l %a1,%d1
+ move.l %a1,%d1
and.l #0xFFFFFFFC,%d1 /* %d1 = last long bound */
cmp.l %d1,%a1 /* any bytes to set? */
bls.b .end_b1 /* no: skip byte loop */
@@ -160,14 +166,59 @@ memset:
bhi.b .loop_b1
.end_b1:
+ move.l %a0,%d1
+ add.l #31,%d1
+ and.l #0xFFFFFFF0,%d1 /* %d1 = first line bound + 16 */
+ cmp.l %d1,%a1 /* at least one full line to fill? */
+ blo.b .no_lines /* no, jump to longword loop */
+
+ mov.l %a1,%d1
+ and.l #0xFFFFFFF0,%d1 /* %d1 = last line bound */
+ cmp.l %d1,%a1 /* any longwords to set? */
+ bls.b .end_l1 /* no: skip longword loop */
+
+ /* leading longword loop: sets 0..3 longwords */
+.loop_l1:
+ move.l %d0,-(%a1) /* store longword */
+ cmp.l %d1,%a1 /* runs %a1 down to last line bound */
+ bhi.b .loop_l1
+
+.end_l1:
+ move.l %d2,-(%sp) /* free some registers */
+ move.l %d3,-(%sp)
+ move.l %a2,-(%sp)
+
+ move.l %d0,%d1 /* spread data to 4 data registers */
+ move.l %d0,%d2
+ move.l %d0,%d3
+ lea.l (15,%a0),%a2 /* %a2 = start address + 15, acct. for trl. data */
+
+ /* main loop: set whole lines utilising burst mode */
+.loop_line:
+ lea.l (-16,%a1),%a1 /* pre-decrement */
+ movem.l %d0-%d3,(%a1) /* store line */
+ cmp.l %a2,%a1 /* runs %a1 down to first line bound */
+ bhi.b .loop_line
+
+ move.l (%sp)+,%a2 /* restore registers */
+ move.l (%sp)+,%d3
+ move.l (%sp)+,%d2
+
+ move.l %a0,%d1 /* %d1 = start address ... */
+ addq.l #3,%d1 /* ... +3, account for possible trailing bytes */
+ bra.b .start_l2 /* there might be no longwords left when coming
+ * out of the main loop */
+
+.no_lines:
move.l %a0,%d1 /* %d1 = start address ... */
addq.l #3,%d1 /* ... +3, account for possible trailing bytes */
- /* main loop: set longs */
-.loop_l:
+ /* trailing longword loop */
+.loop_l2:
move.l %d0,-(%a1) /* store longword */
+.start_l2:
cmp.l %d1,%a1 /* runs %a1 down to first long bound */
- bhi.b .loop_l
+ bhi.b .loop_l2
.no_longs:
cmp.l %a0,%a1 /* any bytes left? */