[PATCH] sh4: use optimized asm version of memcpy - add config option to support backward copying
Carmelo Amoroso
carmelo73 at gmail.com
Mon May 28 12:44:47 UTC 2007
Carmelo AMOROSO wrote:
> Paul Mundt wrote:
>> On Sun, Mar 25, 2007 at 09:18:33AM -0400, Mike Frysinger wrote:
>>
>>> On Wednesday 21 March 2007, Carmelo AMOROSO wrote:
>>>
>>>> I'm currently using on uClibc-nptl for sh4 an optimized version
>>>> of the memcpy function (from Stuart Menefy @STMicroelectronics).
>>>> This implementation is based on 'backward copying'
> >>>> and breaks the current implementation of 'memmove'
>>>> (libc/string/generic/memmove.c)
>>>> that, as clearly stated, assumes memcpy does a forward copying.
>>>>
>>>> The attached patch provides a solution for this adding a config option
>>>> to specify what kind of memcpy implementation the architecture
>>>> provides.
> >>>> In this way the memmove works with both implementations.
>>>>
>>> if anything, this option should not be exported for the user to try
>>> and figure out ... either the architecture provides it or it doesnt
>>> which means it'd be a hardcoded selection in the arch-specific
>>> config.in files ...
>>>
>>> wouldnt it be simpler to provide a superh optimized memmove/memcpy ?
>>> then it wouldnt matter what the generic implementations assume ...
>>>
>>
>> It has to be split out separately for sh4, given the movca.l usage.
>>
> Hi All,
> I've updated the previous patch to keep into account both suggestions
> made by Mike and Paul.
> A brief explanation of the changes follows:
>
> extra/Configs/Config.in -> set the TARGET_SUBARCH for the sh4
> architecture
> extra/Configs/Config.in.sh -> set on the ARCH_HAS_BWD_MEMCPY for
> sh4 architecture only
>
> libc/string/sh/sh4 -> new file memcpy.S (sh4 specific)
> libc/string/generic/memmove.c -> use the new macro
> __ARCH_HAS_BWD_MEMCPY__ instead of #if 1
> libc/string/generic/memcpy.c -> move static function from C source
> to common header file with some reorder
> libc/string/generic/memcopy.h -> ""
> libc/string/Makefile.in -> add code to manage the subarch
> specific code in addition to the arch specific one.
>
> Any comments are welcome.
>
> Cheers,
> Carmelo
>
>
Hi Mike, Paul,
did you have time to look at this ?
If accepted, it may reduce a bit the diff between the sh4 port and trunk.
This code is currently used in the nptl/sh4 port.
Carmelo
> ------------------------------------------------------------------------
>
> diff -Naupr uClibc-trunk/extra/Configs/Config.in uClibc-trunk-st/extra/Configs/Config.in
> --- uClibc-trunk/extra/Configs/Config.in 2007-04-24 15:19:31.000000000 +0200
> +++ uClibc-trunk-st/extra/Configs/Config.in 2007-05-07 10:24:22.045984000 +0200
> @@ -180,6 +180,7 @@ config TARGET_SUBARCH
> string
> default "e500" if CONFIG_E500
> default "classic" if CONFIG_CLASSIC
> + default "sh4" if CONFIG_SH4
> default ""
>
> source "extra/Configs/Config.in.arch"
> diff -Naupr uClibc-trunk/extra/Configs/Config.sh uClibc-trunk-st/extra/Configs/Config.sh
> --- uClibc-trunk/extra/Configs/Config.sh 2007-03-16 21:38:22.000000000 +0100
> +++ uClibc-trunk-st/extra/Configs/Config.sh 2007-05-07 14:02:04.426778000 +0200
> @@ -48,3 +48,8 @@ config CONFIG_SH4
> bool "SH4"
>
> endchoice
> +
> +config ARCH_HAS_BWD_MEMCPY
> + bool
> + default y
> + depends CONFIG_SH4
> diff -Naupr uClibc-trunk/libc/string/Makefile.in uClibc-trunk-st/libc/string/Makefile.in
> --- uClibc-trunk/libc/string/Makefile.in 2006-09-19 09:43:04.000000000 +0200
> +++ uClibc-trunk-st/libc/string/Makefile.in 2007-05-07 10:27:07.516749000 +0200
> @@ -8,6 +8,18 @@
> #
> # Arch specific fun
> #
> +# Collect the subarch specific implementation (asm files)
> +ifneq ($(strip $(TARGET_SUBARCH)),)
> +STRING_SUBARCH_DIR := $(top_srcdir)libc/string/$(TARGET_ARCH)/$(TARGET_SUBARCH)
> +STRING_SUBARCH_OUT := $(top_builddir)libc/string/$(TARGET_ARCH)/$(TARGET_SUBARCH)
> +
> +STRING_SUBARCH_SSRC := $(wildcard $(STRING_SUBARCH_OUT)/*.S)
> +STRING_SUBARCH_SOBJ := $(patsubst $(STRING_SUBARCH_DIR)/%.S,$(STRING_SUBARCH_OUT)/%.o,$(STRING_SUBARCH_SSRC))
> +
> +STRING_SUBARCH_OBJS := $(STRING_SUBARCH_SOBJ)
> +endif
> +
> +# Collect the arch specific implementation (asm, c files)
> STRING_ARCH_DIR := $(top_srcdir)libc/string/$(TARGET_ARCH)
> STRING_ARCH_OUT := $(top_builddir)libc/string/$(TARGET_ARCH)
>
> @@ -15,13 +27,18 @@ STRING_ARCH_SRC := $(wildcard $(STRING_A
> STRING_ARCH_OBJ := $(patsubst $(STRING_ARCH_DIR)/%.c,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SRC))
>
> STRING_ARCH_SSRC := $(wildcard $(STRING_ARCH_DIR)/*.S)
> -STRING_ARCH_SOBJ := $(patsubst $(STRING_ARCH_DIR)/%.S,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SSRC))
>
> +# Exclude the subarch implementation from the arch ones
> +ifneq ($(strip $(STRING_SUBARCH_OBJS)),)
> +STRING_ARCH_SSRC := $(filter-out $(patsubst %.o,$(STRING_ARCH_DIR)/%.S,$(notdir $(STRING_SUBARCH_OBJS))),$(STRING_ARCH_SSRC))
> +endif
> +
> +STRING_ARCH_SOBJ := $(patsubst $(STRING_ARCH_DIR)/%.S,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SSRC))
> STRING_ARCH_OBJS := $(STRING_ARCH_OBJ) $(STRING_ARCH_SOBJ)
>
> -libc-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_OBJS)
> +libc-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_OBJS) $(STRING_SUBARCH_OBJS)
>
> -libc-nomulti-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_SOBJ)
> +libc-nomulti-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_SOBJ) $(STRING_SUBARCH_OBJS)
>
> #
> # Generic stuff
> @@ -35,6 +52,9 @@ ifeq ($(UCLIBC_HAS_STRING_ARCH_OPT),y)
> ifneq ($(strip $(STRING_ARCH_OBJS)),)
> STRING_GENERIC_SRC := $(filter-out $(patsubst %.o,$(STRING_GENERIC_DIR)/%.c,$(notdir $(STRING_ARCH_OBJS))),$(STRING_GENERIC_SRC))
> endif
> +ifneq ($(strip $(STRING_SUBARCH_OBJS)),)
> +STRING_GENERIC_SRC := $(filter-out $(patsubst %.o,$(STRING_GENERIC_DIR)/%.c,$(notdir $(STRING_SUBARCH_OBJS))),$(STRING_GENERIC_SRC))
> +endif
> endif
>
> STRING_GENERIC_OBJS := $(patsubst $(STRING_GENERIC_DIR)/%.c,$(STRING_GENERIC_OUT)/%.o,$(STRING_GENERIC_SRC))
> @@ -93,6 +113,9 @@ ifeq ($(UCLIBC_HAS_STRING_ARCH_OPT),y)
> ifneq ($(strip $(STRING_ARCH_OBJS)),)
> STRING_CSRC := $(filter-out $(patsubst %.o,$(STRING_DIR)/%.c,$(notdir $(STRING_ARCH_OBJS))),$(STRING_CSRC))
> endif
> +ifneq ($(strip $(STRING_SUBARCH_OBJS)),)
> +STRING_CSRC := $(filter-out $(patsubst %.o,$(STRING_DIR)/%.c,$(notdir $(STRING_SUBARCH_OBJS))),$(STRING_CSRC))
> +endif
> endif
>
> ifeq ($(UCLIBC_HAS_STRING_GENERIC_OPT),y)
> diff -Naupr uClibc-trunk/libc/string/generic/memcopy.h uClibc-trunk-st/libc/string/generic/memcopy.h
> --- uClibc-trunk/libc/string/generic/memcopy.h 2006-09-19 09:43:00.000000000 +0200
> +++ uClibc-trunk-st/libc/string/generic/memcopy.h 2007-05-07 10:27:55.056971000 +0200
> @@ -107,24 +107,6 @@ typedef unsigned char byte;
> } \
> } while (0)
>
> -/* Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with
> - the assumption that DST_BP is aligned on an OPSIZ multiple. If
> - not all bytes could be easily copied, store remaining number of bytes
> - in NBYTES_LEFT, otherwise store 0. */
> -/* extern void _wordcopy_fwd_aligned __P ((long int, long int, size_t)); */
> -/* extern void _wordcopy_fwd_dest_aligned __P ((long int, long int, size_t)); */
> -#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \
> - do \
> - { \
> - if (src_bp % OPSIZ == 0) \
> - _wordcopy_fwd_aligned (dst_bp, src_bp, (nbytes) / OPSIZ); \
> - else \
> - _wordcopy_fwd_dest_aligned (dst_bp, src_bp, (nbytes) / OPSIZ); \
> - src_bp += (nbytes) & -OPSIZ; \
> - dst_bp += (nbytes) & -OPSIZ; \
> - (nbytes_left) = (nbytes) % OPSIZ; \
> - } while (0)
> -
> /* Copy *up to* NBYTES_TO_COPY bytes from SRC_END_PTR to DST_END_PTR,
> beginning at the words (of type op_t) right before the pointers and
> continuing towards smaller addresses. May take advantage of that
> @@ -148,3 +130,213 @@ typedef unsigned char byte;
>
> /* Threshold value for when to enter the unrolled loops. */
> #define OP_T_THRES 16
> +
> +#ifdef __ARCH_HAS_BWD_MEMCPY__
> +
> +/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
> + block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
> + Both SRCP and DSTP should be aligned for memory operations on `op_t's. */
> +
> +static void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len)
> +{
> + op_t a0, a1;
> + a0 = a1 = 0L;
> + switch (len % 8)
> + {
> + case 2:
> + a0 = ((op_t *) srcp)[0];
> + srcp -= 6 * OPSIZ;
> + dstp -= 7 * OPSIZ;
> + len += 6;
> + goto do1;
> + case 3:
> + a1 = ((op_t *) srcp)[0];
> + srcp -= 5 * OPSIZ;
> + dstp -= 6 * OPSIZ;
> + len += 5;
> + goto do2;
> + case 4:
> + a0 = ((op_t *) srcp)[0];
> + srcp -= 4 * OPSIZ;
> + dstp -= 5 * OPSIZ;
> + len += 4;
> + goto do3;
> + case 5:
> + a1 = ((op_t *) srcp)[0];
> + srcp -= 3 * OPSIZ;
> + dstp -= 4 * OPSIZ;
> + len += 3;
> + goto do4;
> + case 6:
> + a0 = ((op_t *) srcp)[0];
> + srcp -= 2 * OPSIZ;
> + dstp -= 3 * OPSIZ;
> + len += 2;
> + goto do5;
> + case 7:
> + a1 = ((op_t *) srcp)[0];
> + srcp -= 1 * OPSIZ;
> + dstp -= 2 * OPSIZ;
> + len += 1;
> + goto do6;
> +
> + case 0:
> + if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> + return;
> + a0 = ((op_t *) srcp)[0];
> + srcp -= 0 * OPSIZ;
> + dstp -= 1 * OPSIZ;
> + goto do7;
> + case 1:
> + a1 = ((op_t *) srcp)[0];
> + srcp -=-1 * OPSIZ;
> + dstp -= 0 * OPSIZ;
> + len -= 1;
> + if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> + goto do0;
> + goto do8; /* No-op. */
> + }
> +
> + do
> + {
> + do8:
> + a0 = ((op_t *) srcp)[0];
> + ((op_t *) dstp)[0] = a1;
> + do7:
> + a1 = ((op_t *) srcp)[1];
> + ((op_t *) dstp)[1] = a0;
> + do6:
> + a0 = ((op_t *) srcp)[2];
> + ((op_t *) dstp)[2] = a1;
> + do5:
> + a1 = ((op_t *) srcp)[3];
> + ((op_t *) dstp)[3] = a0;
> + do4:
> + a0 = ((op_t *) srcp)[4];
> + ((op_t *) dstp)[4] = a1;
> + do3:
> + a1 = ((op_t *) srcp)[5];
> + ((op_t *) dstp)[5] = a0;
> + do2:
> + a0 = ((op_t *) srcp)[6];
> + ((op_t *) dstp)[6] = a1;
> + do1:
> + a1 = ((op_t *) srcp)[7];
> + ((op_t *) dstp)[7] = a0;
> +
> + srcp += 8 * OPSIZ;
> + dstp += 8 * OPSIZ;
> + len -= 8;
> + }
> + while (len != 0);
> +
> + /* This is the right position for do0. Please don't move
> + it into the loop. */
> + do0:
> + ((op_t *) dstp)[0] = a1;
> +}
> +
> +/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
> + block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
> + DSTP should be aligned for memory operations on `op_t's, but SRCP must
> + *not* be aligned. */
> +
> +static void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len)
> +{
> + op_t a0, a1, a2, a3;
> + int sh_1, sh_2;
> +
> + /* Calculate how to shift a word read at the memory operation
> + aligned srcp to make it aligned for copy. */
> + a0 = a1 = a2 = a3 = 0L;
> + sh_1 = 8 * (srcp % OPSIZ);
> + sh_2 = 8 * OPSIZ - sh_1;
> +
> + /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
> + it points in the middle of. */
> + srcp &= -OPSIZ;
> +
> + switch (len % 4)
> + {
> + case 2:
> + a1 = ((op_t *) srcp)[0];
> + a2 = ((op_t *) srcp)[1];
> + srcp -= 1 * OPSIZ;
> + dstp -= 3 * OPSIZ;
> + len += 2;
> + goto do1;
> + case 3:
> + a0 = ((op_t *) srcp)[0];
> + a1 = ((op_t *) srcp)[1];
> + srcp -= 0 * OPSIZ;
> + dstp -= 2 * OPSIZ;
> + len += 1;
> + goto do2;
> + case 0:
> + if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> + return;
> + a3 = ((op_t *) srcp)[0];
> + a0 = ((op_t *) srcp)[1];
> + srcp -=-1 * OPSIZ;
> + dstp -= 1 * OPSIZ;
> + len += 0;
> + goto do3;
> + case 1:
> + a2 = ((op_t *) srcp)[0];
> + a3 = ((op_t *) srcp)[1];
> + srcp -=-2 * OPSIZ;
> + dstp -= 0 * OPSIZ;
> + len -= 1;
> + if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> + goto do0;
> + goto do4; /* No-op. */
> + }
> +
> + do
> + {
> + do4:
> + a0 = ((op_t *) srcp)[0];
> + ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
> + do3:
> + a1 = ((op_t *) srcp)[1];
> + ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2);
> + do2:
> + a2 = ((op_t *) srcp)[2];
> + ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2);
> + do1:
> + a3 = ((op_t *) srcp)[3];
> + ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2);
> +
> + srcp += 4 * OPSIZ;
> + dstp += 4 * OPSIZ;
> + len -= 4;
> + }
> + while (len != 0);
> +
> + /* This is the right position for do0. Please don't move
> + it into the loop. */
> + do0:
> + ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
> +}
> +
> +
> +/* Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with
> + the assumption that DST_BP is aligned on an OPSIZ multiple. If
> + not all bytes could be easily copied, store remaining number of bytes
> + in NBYTES_LEFT, otherwise store 0. */
> +/* extern void _wordcopy_fwd_aligned __P ((long int, long int, size_t)); */
> +/* extern void _wordcopy_fwd_dest_aligned __P ((long int, long int, size_t)); */
> +#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \
> + do \
> + { \
> + if (src_bp % OPSIZ == 0) \
> + _wordcopy_fwd_aligned (dst_bp, src_bp, (nbytes) / OPSIZ); \
> + else \
> + _wordcopy_fwd_dest_aligned (dst_bp, src_bp, (nbytes) / OPSIZ); \
> + src_bp += (nbytes) & -OPSIZ; \
> + dst_bp += (nbytes) & -OPSIZ; \
> + (nbytes_left) = (nbytes) % OPSIZ; \
> + } while (0)
> +
> +#endif /* __ARCH_HAS_BWD_MEMCPY__ */
> +
> diff -Naupr uClibc-trunk/libc/string/generic/memcpy.c uClibc-trunk-st/libc/string/generic/memcpy.c
> --- uClibc-trunk/libc/string/generic/memcpy.c 2006-09-19 09:43:00.000000000 +0200
> +++ uClibc-trunk-st/libc/string/generic/memcpy.c 2007-05-07 10:28:20.217087000 +0200
> @@ -25,192 +25,6 @@
>
> libc_hidden_proto(memcpy)
>
> -/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
> - block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
> - Both SRCP and DSTP should be aligned for memory operations on `op_t's. */
> -
> -static void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len)
> -{
> - op_t a0, a1;
> -
> - switch (len % 8)
> - {
> - case 2:
> - a0 = ((op_t *) srcp)[0];
> - srcp -= 6 * OPSIZ;
> - dstp -= 7 * OPSIZ;
> - len += 6;
> - goto do1;
> - case 3:
> - a1 = ((op_t *) srcp)[0];
> - srcp -= 5 * OPSIZ;
> - dstp -= 6 * OPSIZ;
> - len += 5;
> - goto do2;
> - case 4:
> - a0 = ((op_t *) srcp)[0];
> - srcp -= 4 * OPSIZ;
> - dstp -= 5 * OPSIZ;
> - len += 4;
> - goto do3;
> - case 5:
> - a1 = ((op_t *) srcp)[0];
> - srcp -= 3 * OPSIZ;
> - dstp -= 4 * OPSIZ;
> - len += 3;
> - goto do4;
> - case 6:
> - a0 = ((op_t *) srcp)[0];
> - srcp -= 2 * OPSIZ;
> - dstp -= 3 * OPSIZ;
> - len += 2;
> - goto do5;
> - case 7:
> - a1 = ((op_t *) srcp)[0];
> - srcp -= 1 * OPSIZ;
> - dstp -= 2 * OPSIZ;
> - len += 1;
> - goto do6;
> -
> - case 0:
> - if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> - return;
> - a0 = ((op_t *) srcp)[0];
> - srcp -= 0 * OPSIZ;
> - dstp -= 1 * OPSIZ;
> - goto do7;
> - case 1:
> - a1 = ((op_t *) srcp)[0];
> - srcp -=-1 * OPSIZ;
> - dstp -= 0 * OPSIZ;
> - len -= 1;
> - if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> - goto do0;
> - goto do8; /* No-op. */
> - }
> -
> - do
> - {
> - do8:
> - a0 = ((op_t *) srcp)[0];
> - ((op_t *) dstp)[0] = a1;
> - do7:
> - a1 = ((op_t *) srcp)[1];
> - ((op_t *) dstp)[1] = a0;
> - do6:
> - a0 = ((op_t *) srcp)[2];
> - ((op_t *) dstp)[2] = a1;
> - do5:
> - a1 = ((op_t *) srcp)[3];
> - ((op_t *) dstp)[3] = a0;
> - do4:
> - a0 = ((op_t *) srcp)[4];
> - ((op_t *) dstp)[4] = a1;
> - do3:
> - a1 = ((op_t *) srcp)[5];
> - ((op_t *) dstp)[5] = a0;
> - do2:
> - a0 = ((op_t *) srcp)[6];
> - ((op_t *) dstp)[6] = a1;
> - do1:
> - a1 = ((op_t *) srcp)[7];
> - ((op_t *) dstp)[7] = a0;
> -
> - srcp += 8 * OPSIZ;
> - dstp += 8 * OPSIZ;
> - len -= 8;
> - }
> - while (len != 0);
> -
> - /* This is the right position for do0. Please don't move
> - it into the loop. */
> - do0:
> - ((op_t *) dstp)[0] = a1;
> -}
> -
> -/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
> - block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
> - DSTP should be aligned for memory operations on `op_t's, but SRCP must
> - *not* be aligned. */
> -
> -static void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len)
> -{
> - op_t a0, a1, a2, a3;
> - int sh_1, sh_2;
> -
> - /* Calculate how to shift a word read at the memory operation
> - aligned srcp to make it aligned for copy. */
> -
> - sh_1 = 8 * (srcp % OPSIZ);
> - sh_2 = 8 * OPSIZ - sh_1;
> -
> - /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
> - it points in the middle of. */
> - srcp &= -OPSIZ;
> -
> - switch (len % 4)
> - {
> - case 2:
> - a1 = ((op_t *) srcp)[0];
> - a2 = ((op_t *) srcp)[1];
> - srcp -= 1 * OPSIZ;
> - dstp -= 3 * OPSIZ;
> - len += 2;
> - goto do1;
> - case 3:
> - a0 = ((op_t *) srcp)[0];
> - a1 = ((op_t *) srcp)[1];
> - srcp -= 0 * OPSIZ;
> - dstp -= 2 * OPSIZ;
> - len += 1;
> - goto do2;
> - case 0:
> - if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> - return;
> - a3 = ((op_t *) srcp)[0];
> - a0 = ((op_t *) srcp)[1];
> - srcp -=-1 * OPSIZ;
> - dstp -= 1 * OPSIZ;
> - len += 0;
> - goto do3;
> - case 1:
> - a2 = ((op_t *) srcp)[0];
> - a3 = ((op_t *) srcp)[1];
> - srcp -=-2 * OPSIZ;
> - dstp -= 0 * OPSIZ;
> - len -= 1;
> - if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> - goto do0;
> - goto do4; /* No-op. */
> - }
> -
> - do
> - {
> - do4:
> - a0 = ((op_t *) srcp)[0];
> - ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
> - do3:
> - a1 = ((op_t *) srcp)[1];
> - ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2);
> - do2:
> - a2 = ((op_t *) srcp)[2];
> - ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2);
> - do1:
> - a3 = ((op_t *) srcp)[3];
> - ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2);
> -
> - srcp += 4 * OPSIZ;
> - dstp += 4 * OPSIZ;
> - len -= 4;
> - }
> - while (len != 0);
> -
> - /* This is the right position for do0. Please don't move
> - it into the loop. */
> - do0:
> - ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
> -}
> -
> void *memcpy (void *dstpp, const void *srcpp, size_t len)
> {
> unsigned long int dstp = (long int) dstpp;
> diff -Naupr uClibc-trunk/libc/string/generic/memmove.c uClibc-trunk-st/libc/string/generic/memmove.c
> --- uClibc-trunk/libc/string/generic/memmove.c 2006-09-19 09:43:00.000000000 +0200
> +++ uClibc-trunk-st/libc/string/generic/memmove.c 2007-05-07 10:29:26.717396000 +0200
> @@ -29,7 +29,8 @@ libc_hidden_proto(memcpy)
>
> static void _wordcopy_bwd_aligned (long int dstp, long int srcp, size_t len)
> {
> - op_t a0, a1;
> + op_t a0 = 0;
> + op_t a1 = 0;
>
> switch (len % 8)
> {
> @@ -133,7 +134,10 @@ static void _wordcopy_bwd_aligned (long
>
> static void _wordcopy_bwd_dest_aligned (long int dstp, long int srcp, size_t len)
> {
> - op_t a0, a1, a2, a3;
> + op_t a0 = 0;
> + op_t a1 = 0;
> + op_t a2 = 0;
> + op_t a3 = 0;
> int sh_1, sh_2;
>
> /* Calculate how to shift a word read at the memory operation
> @@ -218,8 +222,8 @@ void *memmove (void *dest, const void *s
> Reduces the working set. */
> if (dstp - srcp >= len) /* *Unsigned* compare! */
> {
> -#if 1
> -#warning REMINDER: generic-opt memmove assumes memcpy does forward copying!
> +#ifndef __ARCH_HAS_BWD_MEMCPY__
> + /* generic-opt memmove assumes memcpy does forward copying! */
> memcpy(dest, src, len);
> #else
> /* Copy from the beginning to the end. */
> diff -Naupr uClibc-trunk/libc/string/sh/sh4/memcpy.S uClibc-trunk-st/libc/string/sh/sh4/memcpy.S
> --- uClibc-trunk/libc/string/sh/sh4/memcpy.S 1970-01-01 01:00:00.000000000 +0100
> +++ uClibc-trunk-st/libc/string/sh/sh4/memcpy.S 2007-05-07 13:43:16.291529000 +0200
> @@ -0,0 +1,807 @@
> +/*
> + * "memcpy" implementation of SuperH
> + *
> + * Copyright (C) 1999 Niibe Yutaka
> + * Copyright (c) 2002 STMicroelectronics Ltd
> + * Modified from memcpy.S and micro-optimised for SH4
> + * Stuart Menefy (stuart.menefy at st.com)
> + *
> + */
> +
> +/*
> + * void *memcpy(void *dst, const void *src, size_t n);
> + *
> + * It is assumed that there is no overlap between src and dst.
> + * If there is an overlap, then the results are undefined.
> + */
> +
> +#include <endian.h>
> +
> + !
> + ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
> + !
> +
> + ! Size is 16 or greater, and may have trailing bytes
> +
> + .balign 32
> +.Lcase1:
> + ! Read a long word and write a long word at once
> + ! At the start of each iteration, r7 contains last long load
> + add #-1,r5 ! 79 EX
> + mov r4,r2 ! 5 MT (0 cycles latency)
> +
> + mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
> + add #-4,r5 ! 50 EX
> +
> + add #7,r2 ! 79 EX
> + !
> +#ifdef __LITTLE_ENDIAN__
> + ! 6 cycles, 4 bytes per iteration
> +3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
> + mov r7, r3 ! 5 MT (latency=0) ! RQPO
> +
> + cmp/hi r2,r0 ! 57 MT
> + shll16 r3 ! 103 EX
> +
> + mov r1,r6 ! 5 MT (latency=0)
> + shll8 r3 ! 102 EX ! Oxxx
> +
> + shlr8 r6 ! 106 EX ! xNML
> + mov r1, r7 ! 5 MT (latency=0)
> +
> + or r6,r3 ! 82 EX ! ONML
> + bt/s 3b ! 109 BR
> +
> + mov.l r3,@-r0 ! 30 LS
> +#else
> +3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN
> + mov r7,r3 ! 5 MT (latency=0) ! OPQR
> +
> + cmp/hi r2,r0 ! 57 MT
> + shlr16 r3 ! 107 EX
> +
> + shlr8 r3 ! 106 EX ! xxxO
> + mov r1,r6 ! 5 MT (latency=0)
> +
> + shll8 r6 ! 102 EX ! LMNx
> + mov r1,r7 ! 5 MT (latency=0)
> +
> + or r6,r3 ! 82 EX ! LMNO
> + bt/s 3b ! 109 BR
> +
> + mov.l r3,@-r0 ! 30 LS
> +#endif
> + ! Finally, copy a byte at once, if necessary
> +
> + add #4,r5 ! 50 EX
> + cmp/eq r4,r0 ! 54 MT
> +
> + add #-6,r2 ! 50 EX
> + bt 9f ! 109 BR
> +
> +8: cmp/hi r2,r0 ! 57 MT
> + mov.b @(r0,r5),r1 ! 20 LS (latency=2)
> +
> + bt/s 8b ! 109 BR
> +
> + mov.b r1,@-r0 ! 29 LS
> +
> +9: rts
> + nop
> +
> +
> + !
> + ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...
> + !
> +
> + ! Size is 16 or greater, and may have trailing bytes
> +
> + .balign 32
> +.Lcase3:
> + ! Read a long word and write a long word at once
> + ! At the start of each iteration, r7 contains last long load
> + add #-3,r5 ! 79 EX
> + mov r4,r2 ! 5 MT (0 cycles latency)
> +
> + mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
> + add #-4,r5 ! 50 EX
> +
> + add #7,r2 ! 79 EX
> + !
> +#ifdef __LITTLE_ENDIAN__
> + ! 6 cycles, 4 bytes per iteration
> +3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
> + mov r7, r3 ! 5 MT (latency=0) ! RQPO
> +
> + cmp/hi r2,r0 ! 57 MT
> + shll8 r3 ! 102 EX ! QPOx
> +
> + mov r1,r6 ! 5 MT (latency=0)
> + shlr16 r6 ! 107 EX
> +
> + shlr8 r6 ! 106 EX ! xxxN
> + mov r1, r7 ! 5 MT (latency=0)
> +
> + or r6,r3 ! 82 EX ! QPON
> + bt/s 3b ! 109 BR
> +
> + mov.l r3,@-r0 ! 30 LS
> +#else
> +3: mov r1,r3 ! OPQR
> + shlr8 r3 ! xOPQ
> + mov.l @(r0,r5),r1 ! KLMN
> + mov r1,r6
> + shll16 r6
> + shll8 r6 ! Nxxx
> + or r6,r3 ! NOPQ
> + cmp/hi r2,r0
> + bt/s 3b
> + mov.l r3,@-r0
> +#endif
> +
> + ! Finally, copy a byte at once, if necessary
> +
> + add #6,r5 ! 50 EX
> + cmp/eq r4,r0 ! 54 MT
> +
> + add #-6,r2 ! 50 EX
> + bt 9f ! 109 BR
> +
> +8: cmp/hi r2,r0 ! 57 MT
> + mov.b @(r0,r5),r1 ! 20 LS (latency=2)
> +
> + bt/s 8b ! 109 BR
> +
> + mov.b r1,@-r0 ! 29 LS
> +
> +9: rts
> + nop
> +
> +/* void *memcpy(void *dst, const void *src, size_t len) */
> +.text
> +.align 5
> +.type memcpy,@function
> +.globl memcpy;
> +
> +memcpy:
> + ! Calculate the invariants which will be used in the remainder
> + ! of the code:
> + !
> + ! r4 --> [ ... ] DST [ ... ] SRC
> + ! [ ... ] [ ... ]
> + ! : :
> + ! r0 --> [ ... ] r0+r5 --> [ ... ]
> + !
> + !
> +
> + ! Short circuit the common case of src, dst and len being 32 bit aligned
> + ! and test for zero length move
> +
> + mov r6, r0 ! 5 MT (0 cycle latency)
> + or r4, r0 ! 82 EX
> +
> + or r5, r0 ! 82 EX
> + tst r6, r6 ! 86 MT
> +
> + bt/s 99f ! 111 BR (zero len)
> + tst #3, r0 ! 87 MT
> +
> + mov r4, r0 ! 5 MT (0 cycle latency)
> + add r6, r0 ! 49 EX
> +
> + mov #16, r1 ! 6 EX
> + bt/s .Lcase00 ! 111 BR (aligned)
> +
> + sub r4, r5 ! 75 EX
> +
> + ! Arguments are not nicely long word aligned or zero len.
> + ! Check for small copies, and if so do a simple byte at a time copy.
> + !
> + ! Deciding on an exact value of 'small' is not easy, as the point at which
> + ! using the optimised routines become worthwhile varies (these are the
> + ! cycle counts for different sizes using byte-at-a-time vs. optimised):
> + ! size byte-at-time long word byte
> + ! 16 42 39-40 46-50 50-55
> + ! 24 58 43-44 54-58 62-67
> + ! 36 82 49-50 66-70 80-85
> + ! However the penalty for getting it 'wrong' is much higher for long word
> + ! aligned data (and this is more common), so use a value of 16.
> +
> + cmp/gt r6,r1 ! 56 MT
> +
> + add #-1,r5 ! 50 EX
> + bf/s 6f ! 108 BR (not small)
> +
> + mov r5, r3 ! 5 MT (latency=0)
> + shlr r6 ! 104 EX
> +
> + mov.b @(r0,r5),r1 ! 20 LS (latency=2)
> + bf/s 4f ! 111 BR
> +
> + add #-1,r3 ! 50 EX
> + tst r6, r6 ! 86 MT
> +
> + bt/s 98f ! 110 BR
> + mov.b r1,@-r0 ! 29 LS
> +
> + ! 4 cycles, 2 bytes per iteration
> +3: mov.b @(r0,r5),r1 ! 20 LS (latency=2)
> +
> +4: mov.b @(r0,r3),r2 ! 20 LS (latency=2)
> + dt r6 ! 67 EX
> +
> + mov.b r1,@-r0 ! 29 LS
> + bf/s 3b ! 111 BR
> +
> + mov.b r2,@-r0 ! 29 LS
> +98:
> + rts
> + nop
> +
> +99: rts
> + mov r4, r0
> +
> + ! Size is not small, so its worthwhile looking for optimisations.
> + ! First align destination to a long word boundary.
> + !
> + ! r5 = normal value -1
> +
> +6: tst #3, r0 ! 87 MT
> + mov #3, r3 ! 6 EX
> +
> + bt/s 2f ! 111 BR
> + and r0,r3 ! 78 EX
> +
> + ! 3 cycles, 1 byte per iteration
> +1: dt r3 ! 67 EX
> + mov.b @(r0,r5),r1 ! 19 LS (latency=2)
> +
> + add #-1, r6 ! 79 EX
> + bf/s 1b ! 109 BR
> +
> + mov.b r1,@-r0 ! 28 LS
> +
> +2: add #1, r5 ! 79 EX
> +
> + ! Now select the appropriate bulk transfer code based on relative
> + ! alignment of src and dst.
> +
> + mov r0, r3 ! 5 MT (latency=0)
> +
> + mov r5, r0 ! 5 MT (latency=0)
> + tst #1, r0 ! 87 MT
> +
> + bf/s 1f ! 111 BR
> + mov #64, r7 ! 6 EX
> +
> + ! bit 0 clear
> +
> + cmp/ge r7, r6 ! 55 MT
> +
> + bt/s 2f ! 111 BR
> + tst #2, r0 ! 87 MT
> +
> + ! small
> + bt/s .Lcase0
> + mov r3, r0
> +
> + bra .Lcase2
> + nop
> +
> + ! big
> +2: bt/s .Lcase0b
> + mov r3, r0
> +
> + bra .Lcase2b
> + nop
> +
> + ! bit 0 set
> +1: tst #2, r0 ! 87 MT
> +
> + bt/s .Lcase1
> + mov r3, r0
> +
> + bra .Lcase3
> + nop
> +
> +
> + !
> + ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR
> + !
> +
> + ! src, dst and size are all long word aligned
> + ! size is non-zero
> +
> + .balign 32
> +.Lcase00:
> + mov #64, r1 ! 6 EX
> + mov r5, r3 ! 5 MT (latency=0)
> +
> + cmp/gt r6, r1 ! 56 MT
> + add #-4, r5 ! 50 EX
> +
> + bf .Lcase00b ! 108 BR (big loop)
> + shlr2 r6 ! 105 EX
> +
> + shlr r6 ! 104 EX
> + mov.l @(r0, r5), r1 ! 21 LS (latency=2)
> +
> + bf/s 4f ! 111 BR
> + add #-8, r3 ! 50 EX
> +
> + tst r6, r6 ! 86 MT
> + bt/s 5f ! 110 BR
> +
> + mov.l r1,@-r0 ! 30 LS
> +
> + ! 4 cycles, 2 long words per iteration
> +3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
> +
> +4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
> + dt r6 ! 67 EX
> +
> + mov.l r1, @-r0 ! 30 LS
> + bf/s 3b ! 109 BR
> +
> + mov.l r2, @-r0 ! 30 LS
> +
> +5: rts
> + nop
> +
> +
> + ! Size is 16 or greater and less than 64, but may have trailing bytes
> +
> + .balign 32
> +.Lcase0:
> + add #-4, r5 ! 50 EX
> + mov r4, r7 ! 5 MT (latency=0)
> +
> + mov.l @(r0, r5), r1 ! 21 LS (latency=2)
> + mov #4, r2 ! 6 EX
> +
> + add #11, r7 ! 50 EX
> + tst r2, r6 ! 86 MT
> +
> + mov r5, r3 ! 5 MT (latency=0)
> + bt/s 4f ! 111 BR
> +
> + add #-4, r3 ! 50 EX
> + mov.l r1,@-r0 ! 30 LS
> +
> + ! 4 cycles, 2 long words per iteration
> +3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
> +
> +4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
> + cmp/hi r7, r0
> +
> + mov.l r1, @-r0 ! 30 LS
> + bt/s 3b ! 109 BR
> +
> + mov.l r2, @-r0 ! 30 LS
> +
> + ! Copy the final 0-3 bytes
> +
> + add #3,r5 ! 50 EX
> +
> + cmp/eq r0, r4 ! 54 MT
> + add #-10, r7 ! 50 EX
> +
> + bt 9f ! 110 BR
> +
> + ! 3 cycles, 1 byte per iteration
> +1: mov.b @(r0,r5),r1 ! 19 LS
> + cmp/hi r7,r0 ! 57 MT
> +
> + bt/s 1b ! 111 BR
> + mov.b r1,@-r0 ! 28 LS
> +
> +9: rts
> + nop
> +
> + ! Size is at least 64 bytes, so will be going round the big loop at least once.
> + !
> + ! r2 = rounded up r4
> + ! r3 = rounded down r0
> +
> + .balign 32
> +.Lcase0b:
> + add #-4, r5 ! 50 EX
> +
> +.Lcase00b:
> + mov r0, r3 ! 5 MT (latency=0)
> + mov #(~0x1f), r1 ! 6 EX
> +
> + and r1, r3 ! 78 EX
> + mov r4, r2 ! 5 MT (latency=0)
> +
> + cmp/eq r3, r0 ! 54 MT
> + add #0x1f, r2 ! 50 EX
> +
> + bt/s 1f ! 110 BR
> + and r1, r2 ! 78 EX
> +
> + ! copy initial words until cache line aligned
> +
> + mov.l @(r0, r5), r1 ! 21 LS (latency=2)
> + tst #4, r0 ! 87 MT
> +
> + mov r5, r6 ! 5 MT (latency=0)
> + add #-4, r6 ! 50 EX
> +
> + bt/s 4f ! 111 BR
> + add #8, r3 ! 50 EX
> +
> + tst #0x18, r0 ! 87 MT
> +
> + bt/s 1f ! 109 BR
> + mov.l r1,@-r0 ! 30 LS
> +
> + ! 4 cycles, 2 long words per iteration
> +3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
> +
> +4: mov.l @(r0, r6), r7 ! 21 LS (latency=2)
> + cmp/eq r3, r0 ! 54 MT
> +
> + mov.l r1, @-r0 ! 30 LS
> + bf/s 3b ! 109 BR
> +
> + mov.l r7, @-r0 ! 30 LS
> +
> + ! Copy the cache line aligned blocks
> + !
> + ! In use: r0, r2, r4, r5
> + ! Scratch: r1, r3, r6, r7
> + !
> + ! We could do this with the four scratch registers, but if src
> + ! and dest hit the same cache line, this will thrash, so make
> + ! use of additional registers.
> + !
> + ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
> + ! r5: src (was r0+r5)
> + ! r1: dest (was r0)
> + ! this can be reversed at the end, so we don't need to save any extra
> + ! state.
> + !
> +1: mov.l r8, @-r15 ! 30 LS
> + add r0, r5 ! 49 EX
> +
> + mov.l r9, @-r15 ! 30 LS
> + mov r0, r1 ! 5 MT (latency=0)
> +
> + mov.l r10, @-r15 ! 30 LS
> + add #-0x1c, r5 ! 50 EX
> +
> + mov.l r11, @-r15 ! 30 LS
> +
> + ! 16 cycles, 32 bytes per iteration
> +2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2)
> + add #-0x20, r1 ! 50 EX
> + mov.l @(0x04,r5),r3 ! 18 LS (latency=2)
> + mov.l @(0x08,r5),r6 ! 18 LS (latency=2)
> + mov.l @(0x0c,r5),r7 ! 18 LS (latency=2)
> + mov.l @(0x10,r5),r8 ! 18 LS (latency=2)
> + mov.l @(0x14,r5),r9 ! 18 LS (latency=2)
> + mov.l @(0x18,r5),r10 ! 18 LS (latency=2)
> + mov.l @(0x1c,r5),r11 ! 18 LS (latency=2)
> + movca.l r0,@r1 ! 40 LS (latency=3-7)
> + mov.l r3,@(0x04,r1) ! 33 LS
> + mov.l r6,@(0x08,r1) ! 33 LS
> + mov.l r7,@(0x0c,r1) ! 33 LS
> +
> + mov.l r8,@(0x10,r1) ! 33 LS
> + add #-0x20, r5 ! 50 EX
> +
> + mov.l r9,@(0x14,r1) ! 33 LS
> + cmp/eq r2,r1 ! 54 MT
> +
> + mov.l r10,@(0x18,r1) ! 33 LS
> + bf/s 2b ! 109 BR
> +
> + mov.l r11,@(0x1c,r1) ! 33 LS
> +
> + mov r1, r0 ! 5 MT (latency=0)
> +
> + mov.l @r15+, r11 ! 15 LS
> + sub r1, r5 ! 75 EX
> +
> + mov.l @r15+, r10 ! 15 LS
> + cmp/eq r4, r0 ! 54 MT
> +
> + bf/s 1f ! 109 BR
> + mov.l @r15+, r9 ! 15 LS
> +
> + rts
> +1: mov.l @r15+, r8 ! 15 LS
> + sub r4, r1 ! 75 EX (len remaining)
> +
> + ! number of trailing bytes is non-zero
> + !
> + ! invariants restored (r5 already decremented by 4)
> + ! also r1=num bytes remaining
> +
> + mov #4, r2 ! 6 EX
> + mov r4, r7 ! 5 MT (latency=0)
> +
> + add #0x1c, r5 ! 50 EX (back to -4)
> + cmp/hs r2, r1 ! 58 MT
> +
> + bf/s 5f ! 108 BR
> + add #11, r7 ! 50 EX
> +
> + mov.l @(r0, r5), r6 ! 21 LS (latency=2)
> + tst r2, r1 ! 86 MT
> +
> + mov r5, r3 ! 5 MT (latency=0)
> + bt/s 4f ! 111 BR
> +
> + add #-4, r3 ! 50 EX
> + cmp/hs r2, r1 ! 58 MT
> +
> + bt/s 5f ! 111 BR
> + mov.l r6,@-r0 ! 30 LS
> +
> + ! 4 cycles, 2 long words per iteration
> +3: mov.l @(r0, r5), r6 ! 21 LS (latency=2)
> +
> +4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
> + cmp/hi r7, r0
> +
> + mov.l r6, @-r0 ! 30 LS
> + bt/s 3b ! 109 BR
> +
> + mov.l r2, @-r0 ! 30 LS
> +
> + ! Copy the final 0-3 bytes
> +
> +5: cmp/eq r0, r4 ! 54 MT
> + add #-10, r7 ! 50 EX
> +
> + bt 9f ! 110 BR
> + add #3,r5 ! 50 EX
> +
> + ! 3 cycles, 1 byte per iteration
> +1: mov.b @(r0,r5),r1 ! 19 LS
> + cmp/hi r7,r0 ! 57 MT
> +
> + bt/s 1b ! 111 BR
> + mov.b r1,@-r0 ! 28 LS
> +
> +9: rts
> + nop
> +
> + !
> + ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..
> + !
> +
> + .balign 32
> +.Lcase2:
> + ! Size is 16 or greater and less than 64, but may have trailing bytes
> +
> +2: mov r5, r6 ! 5 MT (latency=0)
> + add #-2,r5 ! 50 EX
> +
> + mov r4,r2 ! 5 MT (latency=0)
> + add #-4,r6 ! 50 EX
> +
> + add #7,r2 ! 50 EX
> +3: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
> +
> + mov.w @(r0,r6),r3 ! 20 LS (latency=2)
> + cmp/hi r2,r0 ! 57 MT
> +
> + mov.w r1,@-r0 ! 29 LS
> + bt/s 3b ! 111 BR
> +
> + mov.w r3,@-r0 ! 29 LS
> +
> + bra 10f
> + nop
> +
> +
> + .balign 32
> +.Lcase2b:
> + ! Size is at least 64 bytes, so will be going round the big loop at least once.
> + !
> + ! r2 = rounded up r4
> + ! r3 = rounded down r0
> +
> + mov r0, r3 ! 5 MT (latency=0)
> + mov #(~0x1f), r1 ! 6 EX
> +
> + and r1, r3 ! 78 EX
> + mov r4, r2 ! 5 MT (latency=0)
> +
> + cmp/eq r3, r0 ! 54 MT
> + add #0x1f, r2 ! 50 EX
> +
> + add #-2, r5 ! 50 EX
> + bt/s 1f ! 110 BR
> + and r1, r2 ! 78 EX
> +
> + ! Copy a short word one at a time until we are cache line aligned
> + ! Normal values: r0, r2, r3, r4
> + ! Unused: r1, r6, r7
> + ! Mod: r5 (=r5-2)
> + !
> + add #2, r3 ! 50 EX
> +
> +2: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
> + cmp/eq r3,r0 ! 54 MT
> +
> + bf/s 2b ! 111 BR
> +
> +	 mov.w	r1, @-r0	!  29 LS
> +
> + ! Copy the cache line aligned blocks
> + !
> + ! In use: r0, r2, r4, r5 (=r5-2)
> + ! Scratch: r1, r3, r6, r7
> + !
> + ! We could do this with the four scratch registers, but if src
> + ! and dest hit the same cache line, this will thrash, so make
> + ! use of additional registers.
> + !
> + ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
> + ! r5: src (was r0+r5)
> + ! r1: dest (was r0)
> + ! this can be reversed at the end, so we don't need to save any extra
> + ! state.
> + !
> +1: mov.l r8, @-r15 ! 30 LS
> + add r0, r5 ! 49 EX
> +
> + mov.l r9, @-r15 ! 30 LS
> + mov r0, r1 ! 5 MT (latency=0)
> +
> + mov.l r10, @-r15 ! 30 LS
> + add #-0x1e, r5 ! 50 EX
> +
> + mov.l r11, @-r15 ! 30 LS
> +
> + mov.l r12, @-r15 ! 30 LS
> +
> + ! 17 cycles, 32 bytes per iteration
> +#ifdef __LITTLE_ENDIAN__
> +2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI
> + add #-0x20, r1 ! 50 EX
> +
> + mov.l @r5+, r3 ! 15 LS (latency=2) NMLK
> +
> + mov.l @r5+, r6 ! 15 LS (latency=2) RQPO
> + shll16 r0 ! 103 EX JI..
> +
> + mov.l @r5+, r7 ! 15 LS (latency=2)
> + xtrct r3, r0 ! 48 EX LKJI
> +
> + mov.l @r5+, r8 ! 15 LS (latency=2)
> + xtrct r6, r3 ! 48 EX PONM
> +
> + mov.l @r5+, r9 ! 15 LS (latency=2)
> + xtrct r7, r6 ! 48 EX
> +
> + mov.l @r5+, r10 ! 15 LS (latency=2)
> + xtrct r8, r7 ! 48 EX
> +
> + mov.l @r5+, r11 ! 15 LS (latency=2)
> + xtrct r9, r8 ! 48 EX
> +
> + mov.w @r5+, r12 ! 15 LS (latency=2)
> + xtrct r10, r9 ! 48 EX
> +
> +	movca.l	r0, @r1		!  40 LS (latency=3-7)
> + xtrct r11, r10 ! 48 EX
> +
> + mov.l r3, @(0x04,r1) ! 33 LS
> + xtrct r12, r11 ! 48 EX
> +
> + mov.l r6, @(0x08,r1) ! 33 LS
> +
> + mov.l r7, @(0x0c,r1) ! 33 LS
> +
> + mov.l r8, @(0x10,r1) ! 33 LS
> + add #-0x40, r5 ! 50 EX
> +
> + mov.l r9, @(0x14,r1) ! 33 LS
> + cmp/eq r2,r1 ! 54 MT
> +
> + mov.l r10, @(0x18,r1) ! 33 LS
> + bf/s 2b ! 109 BR
> +
> + mov.l r11, @(0x1c,r1) ! 33 LS
> +#else
> +2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2)
> + add #-2, r5 ! 50 EX
> +
> + mov.l @(0x1c,r5), r3 ! 18 LS (latency=2)
> + add #-4, r1 ! 50 EX
> +
> + mov.l @(0x18,r5), r6 ! 18 LS (latency=2)
> + shll16 r0 ! 103 EX
> +
> + mov.l @(0x14,r5), r7 ! 18 LS (latency=2)
> + xtrct r3, r0 ! 48 EX
> +
> + mov.l @(0x10,r5), r8 ! 18 LS (latency=2)
> + xtrct r6, r3 ! 48 EX
> +
> + mov.l @(0x0c,r5), r9 ! 18 LS (latency=2)
> + xtrct r7, r6 ! 48 EX
> +
> + mov.l @(0x08,r5), r10 ! 18 LS (latency=2)
> + xtrct r8, r7 ! 48 EX
> +
> + mov.l @(0x04,r5), r11 ! 18 LS (latency=2)
> + xtrct r9, r8 ! 48 EX
> +
> + mov.w @(0x02,r5), r12 ! 18 LS (latency=2)
> + xtrct r10, r9 ! 48 EX
> +
> +	movca.l	r0, @r1		!  40 LS (latency=3-7)
> + add #-0x1c, r1 ! 50 EX
> +
> + mov.l r3, @(0x1c,r1) ! 33 LS
> + xtrct r11, r10 ! 48 EX
> +
> + mov.l r6, @(0x18,r1) ! 33 LS
> + xtrct r12, r11 ! 48 EX
> +
> + mov.l r7, @(0x14,r1) ! 33 LS
> +
> + mov.l r8, @(0x10,r1) ! 33 LS
> + add #-0x3e, r5 ! 50 EX
> +
> + mov.l r9, @(0x0c,r1) ! 33 LS
> + cmp/eq r2,r1 ! 54 MT
> +
> + mov.l r10, @(0x08,r1) ! 33 LS
> + bf/s 2b ! 109 BR
> +
> + mov.l r11, @(0x04,r1) ! 33 LS
> +#endif
> +
> + mov.l @r15+, r12
> + mov r1, r0 ! 5 MT (latency=0)
> +
> + mov.l @r15+, r11 ! 15 LS
> + sub r1, r5 ! 75 EX
> +
> + mov.l @r15+, r10 ! 15 LS
> + cmp/eq r4, r0 ! 54 MT
> +
> + bf/s 1f ! 109 BR
> + mov.l @r15+, r9 ! 15 LS
> +
> + rts
> +1: mov.l @r15+, r8 ! 15 LS
> +
> + add #0x1e, r5 ! 50 EX
> +
> + ! Finish off a short word at a time
> + ! r5 must be invariant - 2
> +10: mov r4,r2 ! 5 MT (latency=0)
> + add #1,r2 ! 50 EX
> +
> + cmp/hi r2, r0 ! 57 MT
> + bf/s 1f ! 109 BR
> +
> + add #2, r2 ! 50 EX
> +
> +3: mov.w @(r0,r5),r1 ! 20 LS
> + cmp/hi r2,r0 ! 57 MT
> +
> + bt/s 3b ! 109 BR
> +
> +	 mov.w	r1, @-r0	!  29 LS
> +1:
> +
> + !
> + ! Finally, copy the last byte if necessary
> + cmp/eq r4,r0 ! 54 MT
> + bt/s 9b
> + add #1,r5
> + mov.b @(r0,r5),r1
> + rts
> +	 mov.b	r1, @-r0
> +
> +.size memcpy,.-memcpy;
> +libc_hidden_def (memcpy)
>
>
> ------------------------------------------------------------------------
>
> _______________________________________________
> uClibc mailing list
uClibc@uclibc.org
> http://busybox.net/cgi-bin/mailman/listinfo/uclibc
More information about the uClibc
mailing list