summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- firmware/Makefile 7
-rw-r--r-- firmware/common/memcpy.S 171
-rw-r--r-- firmware/common/memcpy.c 117
-rw-r--r-- firmware/common/memset.S 108
-rw-r--r-- firmware/common/memset.c 109
5 files changed, 283 insertions, 229 deletions
diff --git a/firmware/Makefile b/firmware/Makefile
index 93ee38a..38bcd4c 100644
--- a/firmware/Makefile
+++ b/firmware/Makefile
@@ -25,16 +25,17 @@ endif
ifdef DEBUG
CFLAGS += -g -DDEBUG
else
-CFLAGS += -fomit-frame-pointer -fschedule-insns
+CFLAGS += -fomit-frame-pointer -fschedule-insns
endif
SRC := $(wildcard drivers/*.c common/*.c *.c)
+SRC_S := $(wildcard drivers/*.S common/*.S *.S)
-OBJS := $(SRC:%.c=$(OBJDIR)/%.o) $(OBJDIR)/crt0.o $(OBJDIR)/bitswap.o $(OBJDIR)/descramble.o
+OBJS := $(SRC:%.c=$(OBJDIR)/%.o) $(SRC_S:%.S=$(OBJDIR)/%.o)
DEPS:=.deps
DEPDIRS:=$(DEPS) $(DEPS)/drivers $(DEPS)/common $(DEPS)/malloc
-DIRS = $(subst $(DEPS),".",$(DEPDIRS))
+DIRS = $(subst $(DEPS),".",$(DEPDIRS))
OUTPUT = $(OBJDIR)/librockbox.a
diff --git a/firmware/common/memcpy.S b/firmware/common/memcpy.S
new file mode 100644
index 0000000..2fb9f6a
--- /dev/null
+++ b/firmware/common/memcpy.S
@@ -0,0 +1,171 @@
+/***************************************************************************
+ * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___
+ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+ * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+ * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+ * \/ \/ \/ \/ \/
+ * $Id$
+ *
+ * Copyright (C) 2004 by Jens Arnold
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+ .section .icode,"ax",@progbits
+
+ .align 2
+ .global _memcpy
+ .type _memcpy,@function
+
+/* Copies <length> bytes of data in memory from <source> to <dest>
+ * (forward copy: source is read in ascending order). Optimized for speed.
+ *
+ * arguments:
+ * r4 - destination address
+ * r5 - source address
+ * r6 - length in bytes
+ *
+ * return value:
+ * r0 - destination address (like ANSI version)
+ *
+ * register usage:
+ * r0 - data / temporary
+ * r1 - bit mask for rounding to long bounds / 2nd data
+ * r2 - first source long bound (only if >= 12 bytes)
+ * r3 - last source long bound, minus 4 while in a 2-longs loop (only if >= 12 bytes)
+ * r4 - current dest address, biased downwards so stores can use @(disp,r4)
+ * r5 - current source address
+ * r6 - source end address
+ * r7 - stored dest start address
+ *
+ * The instruction order below is devised in a way to utilize the pipelining
+ * of the SH1 to the max. The routine also tries to utilize fast page mode.
+ */
+
+_memcpy:
+ add r5,r6 /* r6 = source_end */
+ mov r4,r7 /* store for returning */
+ add #-8,r4 /* bias dest for early increments (max. 2 longs per pass) */
+
+ mov r6,r0 /* r0 = source_end */
+ add #-12,r0 /* r0 = source_end - 12 (threshold; don't go below 12 here!) */
+ cmp/hs r5,r0 /* unsigned r0 >= source, i.e. >= 12 bytes to copy? */
+ bf .start_b2 /* no, jump into byte loop */
+
+ mov #-4,r1 /* r1 = 0xFFFFFFFC */
+
+ mov r5,r2
+ add #3,r2
+ and r1,r2 /* r2 = first source long bound */
+ mov r6,r3
+ add #-4,r3 /* end offset for copying 2 longs per pass */
+ bra .start_b1 /* jump into leading byte loop */
+ and r1,r3 /* (bra delay slot) r3 = last source long bound - 4 */
+
+ /* leading byte loop: copies 0..3 bytes */
+ .align 2
+.loop_b1:
+ mov.b @r5+,r0 /* load byte & increment source addr */
+ add #1,r4 /* increment dest addr */
+ mov.b r0,@(7,r4) /* store byte; -8 bias + 1 + 7 = byte just passed */
+.start_b1:
+ cmp/hi r5,r2 /* runs r5 up to first long bound */
+ bt .loop_b1
+ /* now r5 is always at a long boundary */
+ /* -> memory reading is done in longs for all dest alignments */
+
+ /* selector for main copy loop */
+ mov r4,r0 /* the -8 bias does not change the low 2 address bits */
+ tst #3,r0 /* dest now also at long bound? */
+ bt .loop2_l /* yes, do long copy */
+ tst #1,r0 /* dest now at least at word bound? */
+ bt .start4_w /* yes, do word copy */
+
+ /* main loop for byte aligned destination (fast) */
+ /* copies 1 long per pass */
+ add #4,r3 /* reset end offset: r3 = last source long bound */
+ add #-1,r4 /* adjust to word alignment for word write+ */
+
+.loop4_b:
+ mov.l @r5+,r0 /* load a long & increment source addr */
+ add #4,r4 /* increment dest addr */
+ mov.b r0,@(8,r4) /* store low byte (last of the 4 dest bytes) */
+ shlr8 r0 /* get middle 2 bytes */
+ mov.w r0,@(6,r4) /* store as word+ (2nd and 3rd dest byte) */
+ shlr16 r0 /* get upper byte */
+ mov.b r0,@(5,r4) /* and store (first of the 4 dest bytes) */
+ cmp/hi r5,r3 /* runs r5 up to last long bound */
+ bt .loop4_b
+
+ bra .start_b2 /* jump to trailing byte loop */
+ add #1,r4 /* (bra delay slot) readjust: undo the -1 from above */
+
+ /* main loop for word aligned destination (faster) */
+ /* copies 2 longs per pass, utilizing fast page mode */
+.start4_w:
+ add #-2,r4 /* adjust to long alignment for long write+ */
+
+.loop4_w:
+ mov.l @r5+,r1 /* load first long & increment source addr */
+ add #8,r4 /* increment dest addr */
+ mov.l @r5+,r0 /* load second long & increment source addr */
+ cmp/hi r5,r3 /* runs r5 up to last or second last long bound; T is used by bt below */
+ mov.w r0,@(8,r4) /* store low word of second long */
+ xtrct r1,r0 /* extract low word of first long & high word of second long */
+ mov.l r0,@(4,r4) /* and store as long+ */
+ swap.w r1,r0 /* get high word of first long */
+ mov.w r0,@(2,r4) /* and store it */
+ bt .loop4_w
+
+ add #2,r4 /* readjust destination: undo the -2 from .start4_w */
+ add #4,r3 /* reset end offset: r3 = last source long bound */
+ cmp/hi r5,r3 /* one long left? */
+ bf .start_b2 /* no, jump to trailing byte loop */
+
+ mov.l @r5+,r0 /* load last long & increment source addr */
+ add #4,r4 /* increment dest addr */
+ mov.w r0,@(6,r4) /* store low word */
+ shlr16 r0 /* get high word */
+ bra .start_b2 /* jump to trailing byte loop */
+ mov.w r0,@(4,r4) /* (bra delay slot) and store it */
+
+ /* main loop for long aligned destination (fastest) */
+ /* copies 2 longs per pass, utilizing fast page mode */
+.loop2_l:
+ mov.l @r5+,r1 /* load first long & increment source addr */
+ add #8,r4 /* increment dest addr */
+ mov.l @r5+,r0 /* load second long & increment source addr */
+ cmp/hi r5,r3 /* runs r5 up to last or second last long bound; T is used by bt below */
+ mov.l r1,@r4 /* store first long */
+ mov.l r0,@(4,r4) /* store second long; NOT ALIGNED - no speed loss here! */
+ bt .loop2_l
+
+ add #4,r3 /* reset end offset: r3 = last source long bound */
+ cmp/hi r5,r3 /* one long left? */
+ bf .start_b2 /* no, jump to trailing byte loop */
+
+ mov.l @r5+,r0 /* load last long & increment source addr */
+ add #4,r4 /* increment dest addr */
+ bra .start_b2 /* jump to trailing byte loop */
+ mov.l r0,@(4,r4) /* (bra delay slot) store last long */
+
+ /* trailing byte loop: copies 0..3 bytes (or all for < 12 in total) */
+.loop_b2:
+ mov.b @r5+,r0 /* load byte & increment source addr */
+ add #1,r4 /* increment dest addr */
+ mov.b r0,@(7,r4) /* store byte; -8 bias + 1 + 7 = byte just passed */
+.start_b2:
+ cmp/hi r5,r6 /* runs r5 up to end address */
+ bt .loop_b2
+
+ rts
+ mov r7,r0 /* (rts delay slot) return dest start address */
+.end:
+ .size _memcpy,.end-_memcpy
+
diff --git a/firmware/common/memcpy.c b/firmware/common/memcpy.c
deleted file mode 100644
index 4967892..0000000
--- a/firmware/common/memcpy.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
-FUNCTION
- <<memcpy>>---copy memory regions
-
-ANSI_SYNOPSIS
- #include <string.h>
- void* memcpy(void *<[out]>, const void *<[in]>, size_t <[n]>);
-
-TRAD_SYNOPSIS
- void *memcpy(<[out]>, <[in]>, <[n]>
- void *<[out]>;
- void *<[in]>;
- size_t <[n]>;
-
-DESCRIPTION
- This function copies <[n]> bytes from the memory region
- pointed to by <[in]> to the memory region pointed to by
- <[out]>.
-
- If the regions overlap, the behavior is undefined.
-
-RETURNS
- <<memcpy>> returns a pointer to the first byte of the <[out]>
- region.
-
-PORTABILITY
-<<memcpy>> is ANSI C.
-
-<<memcpy>> requires no supporting OS subroutines.
-
-QUICKREF
- memcpy ansi pure
- */
-
-#include <_ansi.h>
-#include <stddef.h>
-#include <limits.h>
-
-/* Nonzero if either X or Y is not aligned on a "long" boundary. */
-#define UNALIGNED(X, Y) \
- (((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1)))
-
-/* How many bytes are copied each iteration of the 4X unrolled loop. */
-#define BIGBLOCKSIZE (sizeof (long) << 2)
-
-/* How many bytes are copied each iteration of the word copy loop. */
-#define LITTLEBLOCKSIZE (sizeof (long))
-
-/* Threshhold for punting to the byte copier. */
-#define TOO_SMALL(LEN) ((LEN) < BIGBLOCKSIZE)
-
-_PTR
-_DEFUN (memcpy, (dst0, src0, len0),
- _PTR dst0 _AND
- _CONST _PTR src0 _AND
- size_t len0) __attribute__ ((section (".icode")));
-
-_PTR
-_DEFUN (memcpy, (dst0, src0, len0),
- _PTR dst0 _AND
- _CONST _PTR src0 _AND
- size_t len0)
-{
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
- char *dst = (char *) dst0;
- char *src = (char *) src0;
-
- _PTR save = dst0;
-
- while (len0--)
- {
- *dst++ = *src++;
- }
-
- return save;
-#else
- char *dst = dst0;
- _CONST char *src = src0;
- long *aligned_dst;
- _CONST long *aligned_src;
- unsigned int len = len0;
-
- /* If the size is small, or either SRC or DST is unaligned,
- then punt into the byte copy loop. This should be rare. */
- if (!TOO_SMALL(len) && !UNALIGNED (src, dst))
- {
- aligned_dst = (long*)dst;
- aligned_src = (long*)src;
-
- /* Copy 4X long words at a time if possible. */
- while (len >= BIGBLOCKSIZE)
- {
- *aligned_dst++ = *aligned_src++;
- *aligned_dst++ = *aligned_src++;
- *aligned_dst++ = *aligned_src++;
- *aligned_dst++ = *aligned_src++;
- len -= BIGBLOCKSIZE;
- }
-
- /* Copy one long word at a time if possible. */
- while (len >= LITTLEBLOCKSIZE)
- {
- *aligned_dst++ = *aligned_src++;
- len -= LITTLEBLOCKSIZE;
- }
-
- /* Pick up any residual with a byte copier. */
- dst = (char*)aligned_dst;
- src = (char*)aligned_src;
- }
-
- while (len--)
- *dst++ = *src++;
-
- return dst0;
-#endif /* not PREFER_SIZE_OVER_SPEED */
-}
diff --git a/firmware/common/memset.S b/firmware/common/memset.S
new file mode 100644
index 0000000..038915c
--- /dev/null
+++ b/firmware/common/memset.S
@@ -0,0 +1,108 @@
+/***************************************************************************
+ * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___
+ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+ * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+ * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+ * \/ \/ \/ \/ \/
+ * $Id$
+ *
+ * Copyright (C) 2004 by Jens Arnold
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+ .section .icode,"ax",@progbits
+
+ .align 2
+ .global _memset
+ .type _memset,@function
+
+/* Fills a memory region with specified byte value
+ * This version is optimized for speed
+ *
+ * arguments:
+ * r4 - start address
+ * r5 - data (fill value; only the low byte is stored)
+ * r6 - length in bytes
+ *
+ * return value:
+ * r0 - start address (like ANSI version)
+ *
+ * register usage:
+ * r0 - temporary
+ * r1 - bit mask for rounding to long bounds
+ * r2 - last long bound, then first long bound + 8 / + 0 (only if >= 12 bytes)
+ * r4 - start address (never modified, returned in r0)
+ * r5 - data (spread to all 4 bytes if >= 12 bytes)
+ * r6 - current address (runs down from end to start)
+ *
+ * The instruction order below is devised in a way to utilize the pipelining
+ * of the SH1 to the max. The routine fills memory from end to start in
+ * order to utilize the auto-decrementing store instructions.
+ */
+
+_memset:
+ add r4,r6 /* r6 = end_address */
+
+ mov r6,r0 /* r0 = end_address */
+ add #-12,r0 /* r0 = end_address - 12 (threshold; don't go below 12 here!) */
+ cmp/hs r4,r0 /* unsigned r0 >= start, i.e. >= 12 bytes to fill? */
+ bf .start_b2 /* no, jump directly to byte loop */
+
+ extu.b r5,r5 /* start: spread data to all 4 bytes */
+ swap.b r5,r0
+ or r0,r5 /* data now in 2 lower bytes of r5 */
+ swap.w r5,r0
+ or r0,r5 /* data now in all 4 bytes of r5 */
+
+ mov #-4,r1 /* r1 = 0xFFFFFFFC */
+
+ mov r6,r2
+ bra .start_b1
+ and r1,r2 /* (bra delay slot) r2 = last long bound */
+
+ /* leading byte loop: sets 0..3 bytes (at the end of the region) */
+.loop_b1:
+ mov.b r5,@-r6 /* store byte */
+.start_b1:
+ cmp/hi r2,r6 /* runs r6 down to last long bound */
+ bt .loop_b1
+
+ mov r4,r2
+ add #11,r2 /* combined for rounding and offset */
+ and r1,r2 /* r2 = first long bound + 8 */
+
+ /* main loop: set 2 longs per pass */
+.loop2_l:
+ mov.l r5,@-r6 /* store first long */
+ cmp/hi r2,r6 /* runs r6 down to first or second long bound; T is used by bt below */
+ mov.l r5,@-r6 /* store second long */
+ bt .loop2_l
+
+ add #-8,r2 /* correct offset: r2 = first long bound */
+ cmp/hi r2,r6 /* 1 long left? */
+ bf .start_b2 /* no, jump to trailing byte loop */
+
+ bra .start_b2 /* jump to trailing byte loop */
+ mov.l r5,@-r6 /* (bra delay slot) store last long */
+
+ /* trailing byte loop: sets 0..3 bytes (or all for < 12 in total) */
+ .align 2
+.loop_b2:
+ mov.b r5,@-r6 /* store byte */
+.start_b2:
+ cmp/hi r4,r6 /* runs r6 down to the start address */
+ bt .loop_b2
+
+ rts
+ mov r4,r0 /* (rts delay slot) return start address */
+
+.end:
+ .size _memset,.end-_memset
+
diff --git a/firmware/common/memset.c b/firmware/common/memset.c
deleted file mode 100644
index c370191..0000000
--- a/firmware/common/memset.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
-FUNCTION
- <<memset>>---set an area of memory
-
-INDEX
- memset
-
-ANSI_SYNOPSIS
- #include <string.h>
- void *memset(const void *<[dst]>, int <[c]>, size_t <[length]>);
-
-TRAD_SYNOPSIS
- #include <string.h>
- void *memset(<[dst]>, <[c]>, <[length]>)
- void *<[dst]>;
- int <[c]>;
- size_t <[length]>;
-
-DESCRIPTION
- This function converts the argument <[c]> into an unsigned
- char and fills the first <[length]> characters of the array
- pointed to by <[dst]> to the value.
-
-RETURNS
- <<memset>> returns the value of <[m]>.
-
-PORTABILITY
-<<memset>> is ANSI C.
-
- <<memset>> requires no supporting OS subroutines.
-
-QUICKREF
- memset ansi pure
-*/
-
-#include <string.h>
-
-#define LBLOCKSIZE (sizeof(long))
-#define UNALIGNED(X) ((long)X & (LBLOCKSIZE - 1))
-#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
-
-_PTR
-_DEFUN (memset, (m, c, n),
- _PTR m _AND
- int c _AND
- size_t n)
-{
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
- char *s = (char *) m;
-
- while (n-- != 0)
- {
- *s++ = (char) c;
- }
-
- return m;
-#else
- char *s = (char *) m;
- unsigned int i;
- unsigned long buffer;
- unsigned long *aligned_addr;
-
- if (!TOO_SMALL (n) && !UNALIGNED (m))
- {
- /* If we get this far, we know that n is large and m is word-aligned. */
-
- aligned_addr = (unsigned long*)m;
-
- /* Store C into each char sized location in BUFFER so that
- we can set large blocks quickly. */
- c &= 0xff;
- if (LBLOCKSIZE == 4)
- {
- buffer = (c << 8) | c;
- buffer |= (buffer << 16);
- }
- else
- {
- buffer = 0;
- for (i = 0; i < LBLOCKSIZE; i++)
- buffer = (buffer << 8) | c;
- }
-
- while (n >= LBLOCKSIZE*4)
- {
- *aligned_addr++ = buffer;
- *aligned_addr++ = buffer;
- *aligned_addr++ = buffer;
- *aligned_addr++ = buffer;
- n -= 4*LBLOCKSIZE;
- }
-
- while (n >= LBLOCKSIZE)
- {
- *aligned_addr++ = buffer;
- n -= LBLOCKSIZE;
- }
- /* Pick up the remainder with a bytewise loop. */
- s = (char*)aligned_addr;
- }
-
- while (n--)
- {
- *s++ = (char)c;
- }
-
- return m;
-#endif /* not PREFER_SIZE_OVER_SPEED */
-}