diff options
Diffstat (limited to 'firmware/common/memcpy_a.S')
| -rw-r--r-- | firmware/common/memcpy_a.S | 25 |
1 files changed, 15 insertions, 10 deletions
diff --git a/firmware/common/memcpy_a.S b/firmware/common/memcpy_a.S index 7264c96..9f6c813 100644 --- a/firmware/common/memcpy_a.S +++ b/firmware/common/memcpy_a.S @@ -23,6 +23,7 @@ #if CONFIG_CPU == SH7034 .align 2 .global _memcpy + .global ___memcpy_fwd_entry .type _memcpy,@function /* Copies <length> bytes of data in memory from <source> to <dest> @@ -46,12 +47,13 @@ * r6 - source end address * r7 - stored dest start address * - * The instruction order below is devised in a way to utilize the pipelining + * The instruction order is devised in a way to utilize the pipelining * of the SH1 to the max. The routine also tries to utilize fast page mode. */ _memcpy: mov r4,r7 /* store dest for returning */ +___memcpy_fwd_entry: add #-8,r4 /* offset for early increment (max. 2 longs) */ mov #11,r0 cmp/hs r0,r6 /* at least 11 bytes to copy? (ensures 2 aligned longs) */ @@ -99,7 +101,7 @@ _memcpy: mov.l r0,@-r4 /* store second long */ mov.l r1,@-r4 /* store first long; NOT ALIGNED - no speed loss here! */ bt .loop_do0 - + add #4,r3 /* readjust end address */ cmp/hi r5,r3 /* one long left? */ bf .start_b2 /* no, jump to trailing byte loop */ @@ -148,20 +150,20 @@ _memcpy: mov.l @r5+,r1 /* load first long & increment source addr */ add #16,r4 /* increment dest addr */ mov.l @r5+,r0 /* load second long & increment source addr */ - mov r1,r2 /* copy first long */ + cmp/hi r5,r3 /* runs r5 up to last or second last long bound */ mov.b r0,@-r4 /* store low byte of second long */ shlr8 r0 /* get upper 3 bytes */ + mov r1,r2 /* copy first long */ shll16 r2 /* move low byte of first long all the way up, .. */ shll8 r2 or r2,r0 /* ..combine with the 3 bytes of second long.. 
*/ - cmp/hi r5,r3 /* runs r5 up to last or second last long bound */ mov.l r0,@-r4 /* ..and store as long */ shlr8 r1 /* get middle 2 bytes */ mov.w r1,@-r4 /* store as word */ shlr16 r1 /* get upper byte */ mov.b r1,@-r4 /* and store */ bt .loop_do1 - + add #4,r3 /* readjust end address */ .last_do13: cmp/hi r5,r3 /* one long left? */ @@ -218,6 +220,7 @@ _memcpy: #define FULLSPEED /* use burst writing for word aligned destinations */ .align 2 .global memcpy + .global __memcpy_fwd_entry .type memcpy,@function /* Copies <length> bytes of data in memory from <source> to <dest> @@ -249,7 +252,9 @@ memcpy: move.l (4,%sp),%a1 /* Destination */ move.l (8,%sp),%a0 /* Source */ move.l (12,%sp),%d1 /* Length */ - add.l %a0,%d1 /* %d1 = end address */ + +__memcpy_fwd_entry: + add.l %a0,%d1 /* %d1 = source end */ move.l %a0,%d0 addq.l #7,%d0 @@ -278,7 +283,7 @@ memcpy: movem.l %d2-%d7/%a2,(%sp) moveq.l #16,%d2 - sub.l %d2,%d0 /* %d0 = first source long bound */ + sub.l %d2,%d0 /* %d0 = first source line bound */ move.l %d1,%a2 /* %a2 = end address */ lea.l (-15,%a2),%a2 /* adjust end address for loops doing 16 bytes/ pass */ move.l %a1,%d1 @@ -507,7 +512,7 @@ memcpy: lea.l (12,%a2),%a2 /* readjust end address for doing longwords */ cmp.l %a0,%a2 /* any trailing longwords? */ jls .lines_end /* no: get outta here */ - + .lines_do0_tail_loop: move.l (%a0)+,(%a1)+ /* copy longword */ cmp.l %a0,%a2 /* runs %a0 up to last long bound */ @@ -610,7 +615,7 @@ memcpy: /* word aligned destination (line + 14): use line bursts in the loop */ .lines_lo14_start: movem.l (%a0),%d4-%d7 /* load first line */ - lea.l (16,%a0),%a0 + lea.l (16,%a0),%a0 /* advance source past the 16-byte line just loaded */ swap %d4 /* swap words of 1st long */ move.w %d4,(%a1)+ /* store word */ jra .lines_lo14_entry /* jump into main loop */ @@ -784,7 +789,7 @@ memcpy: move.l (%a0)+,%d7 /* load first longword */ swap %d7 /* swap words */ move.w %d7,(%a1)+ /* store high word */ - cmp.l %a0,%d0 /* any full lnogword? */ + cmp.l %a0,%d0 /* any full longword? 
*/ jls .lines_do2_loop /* no: skip head loop */ .lines_do2_head_loop: |