[PATCH] sh4: use optimized asm version of memcpy - add config option to support backward copying

Carmelo Amoroso carmelo73 at gmail.com
Mon May 28 12:44:47 UTC 2007


Carmelo AMOROSO wrote:
> Paul Mundt wrote:
>> On Sun, Mar 25, 2007 at 09:18:33AM -0400, Mike Frysinger wrote:
>>  
>>> On Wednesday 21 March 2007, Carmelo AMOROSO wrote:
>>>    
>>>> On the uClibc-nptl port for sh4, I'm currently using an optimized
>>>> version of the memcpy function (from Stuart Menefy @STMicroelectronics).
>>>> This implementation is based on 'backward copying'
>>>> and breaks the current implementation of 'memmove'
>>>> (libc/string/generic/memmove.c),
>>>> which, as clearly stated there, assumes memcpy does a forward copy.
>>>>
>>>> The attached patch provides a solution for this, adding a config
>>>> option to specify what kind of memcpy implementation the
>>>> architecture provides.
>>>> In this way memmove works with both implementations.
>>>>
>>> if anything, this option should not be exported for the user to try
>>> and figure out ... either the architecture provides it or it doesn't,
>>> which means it'd be a hardcoded selection in the arch-specific
>>> config.in files ...
>>>
>>> wouldn't it be simpler to provide a superh optimized memmove/memcpy?
>>> then it wouldn't matter what the generic implementations assume ...
>>>     
>>
>> It has to be split out separately for sh4, given the movca.l usage.
>>   
> Hi All,
> I've updated the previous patch to take into account both suggestions
> made by Mike and Paul.
> A brief explanation of the changes follows:
> 
> extra/Configs/Config.in         -> set TARGET_SUBARCH for the sh4
>                                    architecture
> extra/Configs/Config.sh         -> enable ARCH_HAS_BWD_MEMCPY for the
>                                    sh4 architecture only
> 
> libc/string/sh/sh4/memcpy.S     -> new file (sh4-specific memcpy)
> libc/string/generic/memmove.c   -> use the new macro
>                                    __ARCH_HAS_BWD_MEMCPY__ instead of
>                                    #if 1 (see the sketch below)
> libc/string/generic/memcpy.c    -> move static functions from the C
>                                    source to the common header file,
>                                    with some reordering
> libc/string/generic/memcopy.h   -> ditto
> libc/string/Makefile.in         -> add code to manage subarch-specific
>                                    code in addition to the arch-specific
>                                    code.
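> 
> For reference, a minimal sketch (not part of the patch) of why the
> memmove shortcut is only valid for a forward-copying memcpy; the
> buffer contents are illustrative:
> 
>     #include <stdio.h>
>     #include <string.h>
> 
>     int main(void)
>     {
>         char buf[16] = "abcdefgh";
> 
>         /* dst < src with overlap: the generic memmove treats this as
>            "forward-safe" and delegates to memcpy.  A forward copy
>            reads each byte before it is overwritten; a backward
>            memcpy would re-read bytes it has already clobbered. */
>         memmove(buf, buf + 2, 6);
>         printf("%s\n", buf);   /* expect "cdefghgh" */
>         return 0;
>     }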
> 
> Any comments are welcome.
> 
> Cheers,
> Carmelo
> 
> 
Hi Mike, Paul,
did you have time to look at this?

If accepted, it would reduce the diff between the sh4 port and trunk a bit.
This code is currently used on the nptl/sh4 port.

Carmelo

> ------------------------------------------------------------------------
> 
> diff -Naupr uClibc-trunk/extra/Configs/Config.in uClibc-trunk-st/extra/Configs/Config.in
> --- uClibc-trunk/extra/Configs/Config.in	2007-04-24 15:19:31.000000000 +0200
> +++ uClibc-trunk-st/extra/Configs/Config.in	2007-05-07 10:24:22.045984000 +0200
> @@ -180,6 +180,7 @@ config TARGET_SUBARCH
>  	string
>  	default "e500" if CONFIG_E500
>  	default "classic" if CONFIG_CLASSIC
> +	default "sh4" if CONFIG_SH4
>  	default ""
>  
>  source "extra/Configs/Config.in.arch"
> diff -Naupr uClibc-trunk/extra/Configs/Config.sh uClibc-trunk-st/extra/Configs/Config.sh
> --- uClibc-trunk/extra/Configs/Config.sh	2007-03-16 21:38:22.000000000 +0100
> +++ uClibc-trunk-st/extra/Configs/Config.sh	2007-05-07 14:02:04.426778000 +0200
> @@ -48,3 +48,8 @@ config CONFIG_SH4
>  	bool "SH4"
>  
>  endchoice
> +
> +config ARCH_HAS_BWD_MEMCPY
> +       bool
> +       default y
> +       depends CONFIG_SH4
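> 
> A sketch of how the option is expected to surface on the C side (the
> exact plumbing is assumed here: uClibc turns .config symbols into
> __NAME__ macros in its generated configuration header, pulled in via
> features.h), so the string code can simply test:
> 
>     #include <features.h>
> 
>     #ifdef __ARCH_HAS_BWD_MEMCPY__
>     /* memcpy may copy backwards: memmove must not delegate
>        overlapping forward copies to it. */
>     #endif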
> diff -Naupr uClibc-trunk/libc/string/Makefile.in uClibc-trunk-st/libc/string/Makefile.in
> --- uClibc-trunk/libc/string/Makefile.in	2006-09-19 09:43:04.000000000 +0200
> +++ uClibc-trunk-st/libc/string/Makefile.in	2007-05-07 10:27:07.516749000 +0200
> @@ -8,6 +8,18 @@
>  #
>  # Arch specific fun
>  #
> +# Collect the subarch specific implementation (asm files)
> +ifneq ($(strip $(TARGET_SUBARCH)),)
> +STRING_SUBARCH_DIR := $(top_srcdir)libc/string/$(TARGET_ARCH)/$(TARGET_SUBARCH)
> +STRING_SUBARCH_OUT := $(top_builddir)libc/string/$(TARGET_ARCH)/$(TARGET_SUBARCH)
> +
> +STRING_SUBARCH_SSRC := $(wildcard $(STRING_SUBARCH_DIR)/*.S)
> +STRING_SUBARCH_SOBJ := $(patsubst $(STRING_SUBARCH_DIR)/%.S,$(STRING_SUBARCH_OUT)/%.o,$(STRING_SUBARCH_SSRC))
> +
> +STRING_SUBARCH_OBJS := $(STRING_SUBARCH_SOBJ)
> +endif
> +
> +# Collect the arch specific implementation (asm, c files)
>  STRING_ARCH_DIR := $(top_srcdir)libc/string/$(TARGET_ARCH)
>  STRING_ARCH_OUT := $(top_builddir)libc/string/$(TARGET_ARCH)
>  
> @@ -15,13 +27,18 @@ STRING_ARCH_SRC := $(wildcard $(STRING_A
>  STRING_ARCH_OBJ := $(patsubst $(STRING_ARCH_DIR)/%.c,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SRC))
>  
>  STRING_ARCH_SSRC := $(wildcard $(STRING_ARCH_DIR)/*.S)
> -STRING_ARCH_SOBJ := $(patsubst $(STRING_ARCH_DIR)/%.S,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SSRC))
>  
> +# Exclude the subarch implementation from the arch ones
> +ifneq ($(strip $(STRING_SUBARCH_OBJS)),)
> +STRING_ARCH_SSRC := $(filter-out $(patsubst %.o,$(STRING_ARCH_DIR)/%.S,$(notdir $(STRING_SUBARCH_OBJS))),$(STRING_ARCH_SSRC))
> +endif
> +
> +STRING_ARCH_SOBJ := $(patsubst $(STRING_ARCH_DIR)/%.S,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SSRC))
>  STRING_ARCH_OBJS := $(STRING_ARCH_OBJ) $(STRING_ARCH_SOBJ)
>  
> -libc-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_OBJS)
> +libc-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_OBJS) $(STRING_SUBARCH_OBJS)
>  
> -libc-nomulti-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_SOBJ)
> +libc-nomulti-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_SOBJ) $(STRING_SUBARCH_OBJS)
>  
>  #
>  # Generic stuff
> @@ -35,6 +52,9 @@ ifeq ($(UCLIBC_HAS_STRING_ARCH_OPT),y)
>  ifneq ($(strip $(STRING_ARCH_OBJS)),)
>  STRING_GENERIC_SRC := $(filter-out $(patsubst %.o,$(STRING_GENERIC_DIR)/%.c,$(notdir $(STRING_ARCH_OBJS))),$(STRING_GENERIC_SRC))
>  endif
> +ifneq ($(strip $(STRING_SUBARCH_OBJS)),)
> +STRING_GENERIC_SRC := $(filter-out $(patsubst %.o,$(STRING_GENERIC_DIR)/%.c,$(notdir $(STRING_SUBARCH_OBJS))),$(STRING_GENERIC_SRC))
> +endif
>  endif
>  
>  STRING_GENERIC_OBJS := $(patsubst $(STRING_GENERIC_DIR)/%.c,$(STRING_GENERIC_OUT)/%.o,$(STRING_GENERIC_SRC))
> @@ -93,6 +113,9 @@ ifeq ($(UCLIBC_HAS_STRING_ARCH_OPT),y)
>  ifneq ($(strip $(STRING_ARCH_OBJS)),)
>  STRING_CSRC := $(filter-out $(patsubst %.o,$(STRING_DIR)/%.c,$(notdir $(STRING_ARCH_OBJS))),$(STRING_CSRC))
>  endif
> +ifneq ($(strip $(STRING_SUBARCH_OBJS)),)
> +STRING_CSRC := $(filter-out $(patsubst %.o,$(STRING_DIR)/%.c,$(notdir $(STRING_SUBARCH_OBJS))),$(STRING_CSRC))
> +endif
>  endif
>  
>  ifeq ($(UCLIBC_HAS_STRING_GENERIC_OPT),y)
> diff -Naupr uClibc-trunk/libc/string/generic/memcopy.h uClibc-trunk-st/libc/string/generic/memcopy.h
> --- uClibc-trunk/libc/string/generic/memcopy.h	2006-09-19 09:43:00.000000000 +0200
> +++ uClibc-trunk-st/libc/string/generic/memcopy.h	2007-05-07 10:27:55.056971000 +0200
> @@ -107,24 +107,6 @@ typedef unsigned char byte;
>  	}								      \
>      } while (0)
>  
> -/* Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with
> -   the assumption that DST_BP is aligned on an OPSIZ multiple.  If
> -   not all bytes could be easily copied, store remaining number of bytes
> -   in NBYTES_LEFT, otherwise store 0.  */
> -/* extern void _wordcopy_fwd_aligned __P ((long int, long int, size_t)); */
> -/* extern void _wordcopy_fwd_dest_aligned __P ((long int, long int, size_t)); */
> -#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)		      \
> -  do									      \
> -    {									      \
> -      if (src_bp % OPSIZ == 0)						      \
> -	_wordcopy_fwd_aligned (dst_bp, src_bp, (nbytes) / OPSIZ);	      \
> -      else								      \
> -	_wordcopy_fwd_dest_aligned (dst_bp, src_bp, (nbytes) / OPSIZ);	      \
> -      src_bp += (nbytes) & -OPSIZ;					      \
> -      dst_bp += (nbytes) & -OPSIZ;					      \
> -      (nbytes_left) = (nbytes) % OPSIZ;					      \
> -    } while (0)
> -
>  /* Copy *up to* NBYTES_TO_COPY bytes from SRC_END_PTR to DST_END_PTR,
>     beginning at the words (of type op_t) right before the pointers and
>     continuing towards smaller addresses.  May take advantage of that
> @@ -148,3 +130,213 @@ typedef unsigned char byte;
>  
>  /* Threshold value for when to enter the unrolled loops.  */
>  #define	OP_T_THRES	16
> +
> +#ifdef __ARCH_HAS_BWD_MEMCPY__
> +
> +/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
> +   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
> +   Both SRCP and DSTP should be aligned for memory operations on `op_t's.  */
> +
> +static void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len)
> +{
> +  op_t a0, a1;
> +  a0 = a1 = 0L;
> +  switch (len % 8)
> +    {
> +    case 2:
> +      a0 = ((op_t *) srcp)[0];
> +      srcp -= 6 * OPSIZ;
> +      dstp -= 7 * OPSIZ;
> +      len += 6;
> +      goto do1;
> +    case 3:
> +      a1 = ((op_t *) srcp)[0];
> +      srcp -= 5 * OPSIZ;
> +      dstp -= 6 * OPSIZ;
> +      len += 5;
> +      goto do2;
> +    case 4:
> +      a0 = ((op_t *) srcp)[0];
> +      srcp -= 4 * OPSIZ;
> +      dstp -= 5 * OPSIZ;
> +      len += 4;
> +      goto do3;
> +    case 5:
> +      a1 = ((op_t *) srcp)[0];
> +      srcp -= 3 * OPSIZ;
> +      dstp -= 4 * OPSIZ;
> +      len += 3;
> +      goto do4;
> +    case 6:
> +      a0 = ((op_t *) srcp)[0];
> +      srcp -= 2 * OPSIZ;
> +      dstp -= 3 * OPSIZ;
> +      len += 2;
> +      goto do5;
> +    case 7:
> +      a1 = ((op_t *) srcp)[0];
> +      srcp -= 1 * OPSIZ;
> +      dstp -= 2 * OPSIZ;
> +      len += 1;
> +      goto do6;
> +
> +    case 0:
> +      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> +	return;
> +      a0 = ((op_t *) srcp)[0];
> +      srcp -= 0 * OPSIZ;
> +      dstp -= 1 * OPSIZ;
> +      goto do7;
> +    case 1:
> +      a1 = ((op_t *) srcp)[0];
> +      srcp -=-1 * OPSIZ;
> +      dstp -= 0 * OPSIZ;
> +      len -= 1;
> +      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> +	goto do0;
> +      goto do8;			/* No-op.  */
> +    }
> +
> +  do
> +    {
> +    do8:
> +      a0 = ((op_t *) srcp)[0];
> +      ((op_t *) dstp)[0] = a1;
> +    do7:
> +      a1 = ((op_t *) srcp)[1];
> +      ((op_t *) dstp)[1] = a0;
> +    do6:
> +      a0 = ((op_t *) srcp)[2];
> +      ((op_t *) dstp)[2] = a1;
> +    do5:
> +      a1 = ((op_t *) srcp)[3];
> +      ((op_t *) dstp)[3] = a0;
> +    do4:
> +      a0 = ((op_t *) srcp)[4];
> +      ((op_t *) dstp)[4] = a1;
> +    do3:
> +      a1 = ((op_t *) srcp)[5];
> +      ((op_t *) dstp)[5] = a0;
> +    do2:
> +      a0 = ((op_t *) srcp)[6];
> +      ((op_t *) dstp)[6] = a1;
> +    do1:
> +      a1 = ((op_t *) srcp)[7];
> +      ((op_t *) dstp)[7] = a0;
> +
> +      srcp += 8 * OPSIZ;
> +      dstp += 8 * OPSIZ;
> +      len -= 8;
> +    }
> +  while (len != 0);
> +
> +  /* This is the right position for do0.  Please don't move
> +     it into the loop.  */
> + do0:
> +  ((op_t *) dstp)[0] = a1;
> +}
> +
> +/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
> +   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
> +   DSTP should be aligned for memory operations on `op_t's, but SRCP must
> +   *not* be aligned.  */
> +
> +static void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len)
> +{
> +  op_t a0, a1, a2, a3;
> +  int sh_1, sh_2;
> +
> +  /* Calculate how to shift a word read at the memory operation
> +     aligned srcp to make it aligned for copy.  */
> +  a0 = a1 = a2 = a3 = 0L;
> +  sh_1 = 8 * (srcp % OPSIZ);
> +  sh_2 = 8 * OPSIZ - sh_1;
> + 
> +  /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
> +     it points in the middle of.  */
> +  srcp &= -OPSIZ;
> +
> +  switch (len % 4)
> +    {
> +    case 2:
> +      a1 = ((op_t *) srcp)[0];
> +      a2 = ((op_t *) srcp)[1];
> +      srcp -= 1 * OPSIZ;
> +      dstp -= 3 * OPSIZ;
> +      len += 2;
> +      goto do1;
> +    case 3:
> +      a0 = ((op_t *) srcp)[0];
> +      a1 = ((op_t *) srcp)[1];
> +      srcp -= 0 * OPSIZ;
> +      dstp -= 2 * OPSIZ;
> +      len += 1;
> +      goto do2;
> +    case 0:
> +      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> +	return;
> +      a3 = ((op_t *) srcp)[0];
> +      a0 = ((op_t *) srcp)[1];
> +      srcp -=-1 * OPSIZ;
> +      dstp -= 1 * OPSIZ;
> +      len += 0;
> +      goto do3;
> +    case 1:
> +      a2 = ((op_t *) srcp)[0];
> +      a3 = ((op_t *) srcp)[1];
> +      srcp -=-2 * OPSIZ;
> +      dstp -= 0 * OPSIZ;
> +      len -= 1;
> +      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> +	goto do0;
> +      goto do4;			/* No-op.  */
> +    }
> +
> +  do
> +    {
> +    do4:
> +      a0 = ((op_t *) srcp)[0];
> +      ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
> +    do3:
> +      a1 = ((op_t *) srcp)[1];
> +      ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2);
> +    do2:
> +      a2 = ((op_t *) srcp)[2];
> +      ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2);
> +    do1:
> +      a3 = ((op_t *) srcp)[3];
> +      ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2);
> +
> +      srcp += 4 * OPSIZ;
> +      dstp += 4 * OPSIZ;
> +      len -= 4;
> +    }
> +  while (len != 0);
> +
> +  /* This is the right position for do0.  Please don't move
> +     it into the loop.  */
> + do0:
> +  ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
> +}
> +
> +
> +/* Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with
> +   the assumption that DST_BP is aligned on an OPSIZ multiple.  If
> +   not all bytes could be easily copied, store remaining number of bytes
> +   in NBYTES_LEFT, otherwise store 0.  */
> +/* extern void _wordcopy_fwd_aligned __P ((long int, long int, size_t)); */
> +/* extern void _wordcopy_fwd_dest_aligned __P ((long int, long int, size_t)); */
> +#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)		      \
> +  do									      \
> +    {									      \
> +      if (src_bp % OPSIZ == 0)						      \
> +	_wordcopy_fwd_aligned (dst_bp, src_bp, (nbytes) / OPSIZ);	      \
> +      else								      \
> +	_wordcopy_fwd_dest_aligned (dst_bp, src_bp, (nbytes) / OPSIZ);	      \
> +      src_bp += (nbytes) & -OPSIZ;					      \
> +      dst_bp += (nbytes) & -OPSIZ;					      \
> +      (nbytes_left) = (nbytes) % OPSIZ;					      \
> +    } while (0)
> +
> +#endif /* __ARCH_HAS_BWD_MEMCPY__ */
> +
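> 
> For readers without the full header at hand: MERGE, used heavily above,
> glues together the halves of two partially needed words.  In this
> glibc-derived code it is defined earlier in memcopy.h along these lines
> (little-endian shown; the big-endian variant swaps the shift
> directions):
> 
>     /* w0 is the earlier word, w1 the later one; sh_1 + sh_2 == 8*OPSIZ. */
>     #define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))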
> diff -Naupr uClibc-trunk/libc/string/generic/memcpy.c uClibc-trunk-st/libc/string/generic/memcpy.c
> --- uClibc-trunk/libc/string/generic/memcpy.c	2006-09-19 09:43:00.000000000 +0200
> +++ uClibc-trunk-st/libc/string/generic/memcpy.c	2007-05-07 10:28:20.217087000 +0200
> @@ -25,192 +25,6 @@
>  
>  libc_hidden_proto(memcpy)
>  
> -/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
> -   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
> -   Both SRCP and DSTP should be aligned for memory operations on `op_t's.  */
> -
> -static void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len)
> -{
> -  op_t a0, a1;
> -
> -  switch (len % 8)
> -    {
> -    case 2:
> -      a0 = ((op_t *) srcp)[0];
> -      srcp -= 6 * OPSIZ;
> -      dstp -= 7 * OPSIZ;
> -      len += 6;
> -      goto do1;
> -    case 3:
> -      a1 = ((op_t *) srcp)[0];
> -      srcp -= 5 * OPSIZ;
> -      dstp -= 6 * OPSIZ;
> -      len += 5;
> -      goto do2;
> -    case 4:
> -      a0 = ((op_t *) srcp)[0];
> -      srcp -= 4 * OPSIZ;
> -      dstp -= 5 * OPSIZ;
> -      len += 4;
> -      goto do3;
> -    case 5:
> -      a1 = ((op_t *) srcp)[0];
> -      srcp -= 3 * OPSIZ;
> -      dstp -= 4 * OPSIZ;
> -      len += 3;
> -      goto do4;
> -    case 6:
> -      a0 = ((op_t *) srcp)[0];
> -      srcp -= 2 * OPSIZ;
> -      dstp -= 3 * OPSIZ;
> -      len += 2;
> -      goto do5;
> -    case 7:
> -      a1 = ((op_t *) srcp)[0];
> -      srcp -= 1 * OPSIZ;
> -      dstp -= 2 * OPSIZ;
> -      len += 1;
> -      goto do6;
> -
> -    case 0:
> -      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> -	return;
> -      a0 = ((op_t *) srcp)[0];
> -      srcp -= 0 * OPSIZ;
> -      dstp -= 1 * OPSIZ;
> -      goto do7;
> -    case 1:
> -      a1 = ((op_t *) srcp)[0];
> -      srcp -=-1 * OPSIZ;
> -      dstp -= 0 * OPSIZ;
> -      len -= 1;
> -      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> -	goto do0;
> -      goto do8;			/* No-op.  */
> -    }
> -
> -  do
> -    {
> -    do8:
> -      a0 = ((op_t *) srcp)[0];
> -      ((op_t *) dstp)[0] = a1;
> -    do7:
> -      a1 = ((op_t *) srcp)[1];
> -      ((op_t *) dstp)[1] = a0;
> -    do6:
> -      a0 = ((op_t *) srcp)[2];
> -      ((op_t *) dstp)[2] = a1;
> -    do5:
> -      a1 = ((op_t *) srcp)[3];
> -      ((op_t *) dstp)[3] = a0;
> -    do4:
> -      a0 = ((op_t *) srcp)[4];
> -      ((op_t *) dstp)[4] = a1;
> -    do3:
> -      a1 = ((op_t *) srcp)[5];
> -      ((op_t *) dstp)[5] = a0;
> -    do2:
> -      a0 = ((op_t *) srcp)[6];
> -      ((op_t *) dstp)[6] = a1;
> -    do1:
> -      a1 = ((op_t *) srcp)[7];
> -      ((op_t *) dstp)[7] = a0;
> -
> -      srcp += 8 * OPSIZ;
> -      dstp += 8 * OPSIZ;
> -      len -= 8;
> -    }
> -  while (len != 0);
> -
> -  /* This is the right position for do0.  Please don't move
> -     it into the loop.  */
> - do0:
> -  ((op_t *) dstp)[0] = a1;
> -}
> -
> -/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
> -   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
> -   DSTP should be aligned for memory operations on `op_t's, but SRCP must
> -   *not* be aligned.  */
> -
> -static void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len)
> -{
> -  op_t a0, a1, a2, a3;
> -  int sh_1, sh_2;
> -
> -  /* Calculate how to shift a word read at the memory operation
> -     aligned srcp to make it aligned for copy.  */
> -
> -  sh_1 = 8 * (srcp % OPSIZ);
> -  sh_2 = 8 * OPSIZ - sh_1;
> -
> -  /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
> -     it points in the middle of.  */
> -  srcp &= -OPSIZ;
> -
> -  switch (len % 4)
> -    {
> -    case 2:
> -      a1 = ((op_t *) srcp)[0];
> -      a2 = ((op_t *) srcp)[1];
> -      srcp -= 1 * OPSIZ;
> -      dstp -= 3 * OPSIZ;
> -      len += 2;
> -      goto do1;
> -    case 3:
> -      a0 = ((op_t *) srcp)[0];
> -      a1 = ((op_t *) srcp)[1];
> -      srcp -= 0 * OPSIZ;
> -      dstp -= 2 * OPSIZ;
> -      len += 1;
> -      goto do2;
> -    case 0:
> -      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> -	return;
> -      a3 = ((op_t *) srcp)[0];
> -      a0 = ((op_t *) srcp)[1];
> -      srcp -=-1 * OPSIZ;
> -      dstp -= 1 * OPSIZ;
> -      len += 0;
> -      goto do3;
> -    case 1:
> -      a2 = ((op_t *) srcp)[0];
> -      a3 = ((op_t *) srcp)[1];
> -      srcp -=-2 * OPSIZ;
> -      dstp -= 0 * OPSIZ;
> -      len -= 1;
> -      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> -	goto do0;
> -      goto do4;			/* No-op.  */
> -    }
> -
> -  do
> -    {
> -    do4:
> -      a0 = ((op_t *) srcp)[0];
> -      ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
> -    do3:
> -      a1 = ((op_t *) srcp)[1];
> -      ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2);
> -    do2:
> -      a2 = ((op_t *) srcp)[2];
> -      ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2);
> -    do1:
> -      a3 = ((op_t *) srcp)[3];
> -      ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2);
> -
> -      srcp += 4 * OPSIZ;
> -      dstp += 4 * OPSIZ;
> -      len -= 4;
> -    }
> -  while (len != 0);
> -
> -  /* This is the right position for do0.  Please don't move
> -     it into the loop.  */
> - do0:
> -  ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
> -}
> -
>  void *memcpy (void *dstpp, const void *srcpp, size_t len)
>  {
>    unsigned long int dstp = (long int) dstpp;
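> 
> The memcpy body itself (unchanged; its opening is shown above) drives
> these helpers roughly as follows -- a compressed sketch of the existing
> generic flow, with BYTE_COPY_FWD and OP_T_THRES as defined in
> memcopy.h:
> 
>     if (len >= OP_T_THRES)
>       {
>         /* Copy just a few bytes to make dstp word-aligned. */
>         len -= (-dstp) % OPSIZ;
>         BYTE_COPY_FWD (dstp, srcp, (-dstp) % OPSIZ);
> 
>         /* Copy whole op_t words; the leftover count lands in len. */
>         WORD_COPY_FWD (dstp, srcp, len, len);
>       }
> 
>     /* Copy the remaining 0..OPSIZ-1 bytes. */
>     BYTE_COPY_FWD (dstp, srcp, len);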
> diff -Naupr uClibc-trunk/libc/string/generic/memmove.c uClibc-trunk-st/libc/string/generic/memmove.c
> --- uClibc-trunk/libc/string/generic/memmove.c	2006-09-19 09:43:00.000000000 +0200
> +++ uClibc-trunk-st/libc/string/generic/memmove.c	2007-05-07 10:29:26.717396000 +0200
> @@ -29,7 +29,8 @@ libc_hidden_proto(memcpy)
>  
>  static void _wordcopy_bwd_aligned (long int dstp, long int srcp, size_t len)
>  {
> -  op_t a0, a1;
> +  op_t a0 = 0;
> +  op_t a1 = 0;
>  
>    switch (len % 8)
>      {
> @@ -133,7 +134,10 @@ static void _wordcopy_bwd_aligned (long 
>  
>  static void _wordcopy_bwd_dest_aligned (long int dstp, long int srcp, size_t len)
>  {
> -  op_t a0, a1, a2, a3;
> +  op_t a0 = 0;
> +  op_t a1 = 0;
> +  op_t a2 = 0;
> +  op_t a3 = 0;
>    int sh_1, sh_2;
>  
>    /* Calculate how to shift a word read at the memory operation
> @@ -218,8 +222,8 @@ void *memmove (void *dest, const void *s
>       Reduces the working set.  */
>    if (dstp - srcp >= len)	/* *Unsigned* compare!  */
>      {
> -#if 1
> -#warning REMINDER: generic-opt memmove assumes memcpy does forward copying!
> +#ifndef __ARCH_HAS_BWD_MEMCPY__
> +      /* generic-opt memmove assumes memcpy does forward copying! */
>        memcpy(dest, src, len);
>  #else
>        /* Copy from the beginning to the end.  */
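> 
> The unsigned compare guarding this branch does double duty; a worked
> example (values illustrative):
> 
>     /* `dstp - srcp >= len' (unsigned) selects the forward path when:
>        - dst >= src + len (disjoint, dst above src): difference >= len;
>        - dst <  src (even with overlap): the difference wraps around to
>          a huge unsigned value, which is also >= len.
>        Only src < dst < src + len falls through to the backward loop.
>        E.g. srcp = 100, dstp = 96, len = 8: 96 - 100 wraps to 2^32 - 4
>        on a 32-bit target, so the forward path is taken -- safely,
>        since each byte is read before it is written.  This is exactly
>        the case a backward-copying memcpy gets wrong, hence the new
>        __ARCH_HAS_BWD_MEMCPY__ guard. */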
> diff -Naupr uClibc-trunk/libc/string/sh/sh4/memcpy.S uClibc-trunk-st/libc/string/sh/sh4/memcpy.S
> --- uClibc-trunk/libc/string/sh/sh4/memcpy.S	1970-01-01 01:00:00.000000000 +0100
> +++ uClibc-trunk-st/libc/string/sh/sh4/memcpy.S	2007-05-07 13:43:16.291529000 +0200
> @@ -0,0 +1,807 @@
> +/*
> + * "memcpy" implementation of SuperH
> + *
> + * Copyright (C) 1999  Niibe Yutaka
> + * Copyright (c) 2002  STMicroelectronics Ltd
> + *   Modified from memcpy.S and micro-optimised for SH4
> + *   Stuart Menefy (stuart.menefy@st.com)
> + *
> + */
> +
> +/*
> + * void *memcpy(void *dst, const void *src, size_t n);
> + *
> + * It is assumed that there is no overlap between src and dst.
> + * If there is an overlap, then the results are undefined.
> + */
> +
> +#include <endian.h>
> +
> +	!
> +	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
> +	!
> +	
> +	! Size is 16 or greater, and may have trailing bytes
> +
> +	.balign	32
> +.Lcase1:
> +	! Read a long word and write a long word at once
> +	! At the start of each iteration, r7 contains last long load
> +	add	#-1,r5		!  79 EX
> +	mov	r4,r2		!   5 MT (0 cycles latency)
> +
> +	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
> +	add	#-4,r5		!  50 EX
> +
> +	add	#7,r2		!  79 EX
> +	!
> +#ifdef __LITTLE_ENDIAN__
> +	! 6 cycles, 4 bytes per iteration
> +3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
> +	mov	r7, r3		!   5 MT (latency=0)	! RQPO
> +	
> +	cmp/hi	r2,r0		!  57 MT
> +	shll16	r3		! 103 EX
> +
> +	mov	r1,r6		!   5 MT (latency=0)
> +	shll8	r3		! 102 EX		! Oxxx
> +
> +	shlr8	r6		! 106 EX		! xNML
> +	mov	r1, r7		!   5 MT (latency=0)
> +	
> +	or	r6,r3		!  82 EX		! ONML
> +	bt/s	3b		! 109 BR
> +
> +	 mov.l	r3,@-r0		!  30 LS
> +#else
> +3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
> +	mov	r7,r3		!   5 MT (latency=0)	! OPQR
> +
> +	cmp/hi	r2,r0		!  57 MT
> +	shlr16	r3		! 107 EX
> +
> +	shlr8	r3		! 106 EX		! xxxO
> +	mov	r1,r6		!   5 MT (latency=0)
> +
> +	shll8	r6		! 102 EX		! LMNx
> +	mov	r1,r7		!   5 MT (latency=0)
> +
> +	or	r6,r3		!  82 EX		! LMNO
> +	bt/s	3b		! 109 BR
> +
> +	 mov.l	r3,@-r0		!  30 LS
> +#endif
> +	! Finally, copy a byte at once, if necessary
> +
> +	add	#4,r5		!  50 EX
> +	cmp/eq	r4,r0		!  54 MT
> +
> +	add	#-6,r2		!  50 EX
> +	bt	9f		! 109 BR
> +
> +8:	cmp/hi	r2,r0		!  57 MT
> +	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
> +	
> +	bt/s	8b		! 109 BR
> +
> +	 mov.b	r1,@-r0		!  29 LS
> +
> +9:	rts
> +	 nop
> +
> +	
> +	!
> +	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
> +	!
> +	
> +	! Size is 16 or greater, and may have trailing bytes
> +
> +	.balign	32
> +.Lcase3:
> +	! Read a long word and write a long word at once
> +	! At the start of each iteration, r7 contains last long load
> +	add	#-3,r5		! 79 EX
> +	mov	r4,r2		!  5 MT (0 cycles latency)
> +
> +	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
> +	add	#-4,r5		! 50 EX
> +
> +	add	#7,r2		!  79 EX
> +	!
> +#ifdef __LITTLE_ENDIAN__
> +	! 6 cycles, 4 bytes per iteration
> +3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
> +	mov	r7, r3		!   5 MT (latency=0)	! RQPO
> +	
> +	cmp/hi	r2,r0		!  57 MT
> +	shll8	r3		! 102 EX		! QPOx
> +
> +	mov	r1,r6		!   5 MT (latency=0)
> +	shlr16	r6		! 107 EX
> +
> +	shlr8	r6		! 106 EX		! xxxN
> +	mov	r1, r7		!   5 MT (latency=0)
> +	
> +	or	r6,r3		!  82 EX		! QPON
> +	bt/s	3b		! 109 BR
> +
> +	 mov.l	r3,@-r0		!  30 LS
> +#else
> +3:	mov	r1,r3		! OPQR
> +	shlr8	r3		! xOPQ
> +	mov.l	@(r0,r5),r1	! KLMN
> +	mov	r1,r6
> +	shll16	r6
> +	shll8	r6		! Nxxx
> +	or	r6,r3		! NOPQ
> +	cmp/hi	r2,r0
> +	bt/s	3b
> +	 mov.l	r3,@-r0
> +#endif
> +
> +	! Finally, copy a byte at once, if necessary
> +
> +	add	#6,r5		!  50 EX
> +	cmp/eq	r4,r0		!  54 MT
> +
> +	add	#-6,r2		!  50 EX
> +	bt	9f		! 109 BR
> +
> +8:	cmp/hi	r2,r0		!  57 MT
> +	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
> +	
> +	bt/s	8b		! 109 BR
> +
> +	 mov.b	r1,@-r0		!  29 LS
> +
> +9:	rts
> +	 nop
> +	
> +/* void *memcpy(void *dst, const void *src, size_t len) */
> +.text
> +.align 5
> +.type memcpy,@function
> +.globl memcpy;
> +
> +memcpy:
> +	! Calculate the invariants which will be used in the remainder
> +	! of the code:
> +	!
> +	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
> +	!	         [ ...  ]                 [ ...  ]
> +	!	           :                        :
> +	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
> +	!
> +	!
> +	
> +	! Short circuit the common case of src, dst and len being 32 bit aligned
> +	! and test for zero length move
> +
> +	mov	r6, r0		!   5 MT (0 cycle latency)
> +	or	r4, r0		!  82 EX
> +
> +	or	r5, r0		!  82 EX
> +	tst	r6, r6		!  86 MT
> +
> +	bt/s	99f		! 111 BR		(zero len)
> +	 tst	#3, r0		!  87 MT
> +
> +	mov	r4, r0		!   5 MT (0 cycle latency)
> +	add	r6, r0		!  49 EX
> +
> +	mov	#16, r1		!   6 EX
> +	bt/s	.Lcase00	! 111 BR		(aligned)
> +
> +	 sub	r4, r5		!  75 EX
> +
> +	! Arguments are not nicely long word aligned or zero len.
> +	! Check for small copies, and if so do a simple byte at a time copy.
> +	!
> +	! Deciding on an exact value of 'small' is not easy, as the point at which
> +	! using the optimised routines becomes worthwhile varies (these are the
> +	! cycle counts for different sizes using byte-at-a-time vs. optimised):
> +	!	size	byte-at-time	long	word	byte
> +	!	16	42		39-40	46-50	50-55
> +	!	24	58		43-44	54-58	62-67
> +	!	36	82		49-50	66-70	80-85
> +	! However the penalty for getting it 'wrong' is much higher for long word
> +	! aligned data (and this is more common), so use a value of 16.
> +	
> +	cmp/gt	r6,r1		!  56 MT
> +
> +	add	#-1,r5		!  50 EX
> +	bf/s	6f		! 108 BR		(not small)
> +
> +	 mov	r5, r3		!   5 MT (latency=0)
> +	shlr	r6		! 104 EX
> +
> +	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
> +	bf/s	4f		! 111 BR
> +	
> +	 add	#-1,r3		!  50 EX
> +	tst	r6, r6		!  86 MT
> +
> +	bt/s	98f		! 110 BR
> +	 mov.b	r1,@-r0		!  29 LS
> +
> +	! 4 cycles, 2 bytes per iteration
> +3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
> +
> +4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
> +	dt	r6		!  67 EX
> +
> +	mov.b	r1,@-r0		!  29 LS
> +	bf/s	3b		! 111 BR
> +
> +	 mov.b	r2,@-r0		!  29 LS
> +98:
> +	rts
> +	 nop
> +
> +99:	rts
> +	 mov	r4, r0
> +
> +	! Size is not small, so it's worthwhile looking for optimisations.
> +	! First align destination to a long word boundary.
> +	!
> +	! r5 = normal value -1
> +
> +6:	tst	#3, r0		!  87 MT
> +        mov	#3, r3		!   6 EX
> +
> +	bt/s	2f		! 111 BR
> +	 and	r0,r3		!  78 EX
> +
> +	! 3 cycles, 1 byte per iteration	
> +1:	dt	r3		!  67 EX
> +	mov.b	@(r0,r5),r1	!  19 LS (latency=2)
> +
> +	add	#-1, r6		!  79 EX
> +	bf/s	1b		! 109 BR
> +
> +	 mov.b	r1,@-r0		!  28 LS
> +
> +2:	add	#1, r5		!  79 EX
> +	
> +	! Now select the appropriate bulk transfer code based on relative
> +	! alignment of src and dst.
> +	
> +	mov	r0, r3		!   5 MT (latency=0)
> +
> +	mov	r5, r0		!   5 MT (latency=0)
> +	tst	#1, r0		!  87 MT
> +
> +	bf/s	1f		! 111 BR
> +	 mov	#64, r7		!   6 EX
> +
> +	! bit 0 clear
> +		
> +	cmp/ge	r7, r6		!  55 MT
> +
> +	bt/s	2f		! 111 BR
> +	 tst	#2, r0		!  87 MT
> +
> +	! small
> +	bt/s	.Lcase0
> +	 mov	r3, r0
> +
> +	bra	.Lcase2
> +	 nop
> +
> +	! big
> +2:	bt/s	.Lcase0b
> +	 mov	r3, r0
> +
> +	bra	.Lcase2b
> +	 nop
> +	
> +	! bit 0 set
> +1:	tst	#2, r0		! 87 MT
> +
> +	bt/s	.Lcase1
> +	 mov	r3, r0
> +
> +	bra	.Lcase3
> +	 nop
> +	
> +
> +	!
> +	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
> +	!
> +
> +	! src, dst and size are all long word aligned
> +	! size is non-zero
> +	
> +	.balign	32
> +.Lcase00:
> +	mov	#64, r1		!   6 EX
> +	mov	r5, r3		!   5 MT (latency=0)
> +
> +	cmp/gt	r6, r1		!  56 MT
> +	add	#-4, r5		!  50 EX
> +
> +	bf	.Lcase00b	! 108 BR		(big loop)
> +	shlr2	r6		! 105 EX
> +
> +	shlr	r6		! 104 EX
> +	mov.l	@(r0, r5), r1	!  21 LS (latency=2)	
> +
> +	bf/s	4f		! 111 BR
> +	 add	#-8, r3		!  50 EX
> +
> +	tst	r6, r6		!  86 MT
> +	bt/s	5f		! 110 BR
> +
> +	 mov.l	r1,@-r0		!  30 LS
> +
> +	! 4 cycles, 2 long words per iteration
> +3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
> +
> +4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
> +	dt	r6		!  67 EX
> +
> +	mov.l	r1, @-r0	!  30 LS
> +	bf/s	3b		! 109 BR
> +
> +	 mov.l	r2, @-r0	!  30 LS
> +
> +5:	rts
> +	 nop
> +
> +	
> +	! Size is 16 or greater and less than 64, but may have trailing bytes
> +
> +	.balign	32
> +.Lcase0:
> +	add	#-4, r5		!  50 EX
> +	mov	r4, r7		!   5 MT (latency=0)
> +
> +	mov.l	@(r0, r5), r1	!  21 LS (latency=2)	
> +	mov	#4, r2		!   6 EX
> +
> +	add	#11, r7		!  50 EX
> +	tst	r2, r6		!  86 MT
> +
> +	mov	r5, r3		!   5 MT (latency=0)
> +	bt/s	4f		! 111 BR
> +
> +	 add	#-4, r3		!  50 EX
> +	mov.l	r1,@-r0		!  30 LS
> +
> +	! 4 cycles, 2 long words per iteration
> +3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
> +
> +4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
> +	cmp/hi	r7, r0
> +
> +	mov.l	r1, @-r0	!  30 LS
> +	bt/s	3b		! 109 BR
> +
> +	 mov.l	r2, @-r0	!  30 LS
> +
> +	! Copy the final 0-3 bytes
> +
> +	add	#3,r5		!  50 EX
> +	
> +	cmp/eq	r0, r4		!  54 MT
> +	add	#-10, r7	!  50 EX
> +
> +	bt	9f		! 110 BR
> +
> +	! 3 cycles, 1 byte per iteration
> +1:	mov.b	@(r0,r5),r1	!  19 LS
> +	cmp/hi	r7,r0		!  57 MT
> +	
> +	bt/s	1b		! 111 BR
> +	 mov.b	r1,@-r0		!  28 LS
> +
> +9:	rts
> +	 nop
> +
> +	! Size is at least 64 bytes, so will be going round the big loop at least once.
> +	!
> +	!   r2 = rounded up r4
> +	!   r3 = rounded down r0
> +
> +	.balign	32
> +.Lcase0b:
> +	add	#-4, r5		!  50 EX
> +
> +.Lcase00b:
> +	mov	r0, r3		!   5 MT (latency=0)
> +	mov	#(~0x1f), r1	!   6 EX
> +
> +	and	r1, r3		!  78 EX
> +	mov	r4, r2		!   5 MT (latency=0)
> +
> +	cmp/eq	r3, r0		!  54 MT
> +	add	#0x1f, r2	!  50 EX
> +
> +	bt/s	1f		! 110 BR
> +	 and	r1, r2		!  78 EX
> +
> +	! copy initial words until cache line aligned
> +
> +	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
> +	tst	#4, r0		!  87 MT
> +
> +	mov	r5, r6		!   5 MT (latency=0)
> +	add	#-4, r6		!  50 EX
> +
> +	bt/s	4f		! 111 BR
> +	 add	#8, r3		!  50 EX
> +
> +	tst	#0x18, r0	!  87 MT
> +	
> +	bt/s	1f		! 109 BR
> +	 mov.l	r1,@-r0		!  30 LS
> +	
> +	! 4 cycles, 2 long words per iteration
> +3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
> +
> +4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
> +	cmp/eq	r3, r0		!  54 MT
> +
> +	mov.l	r1, @-r0	!  30 LS
> +	bf/s	3b		! 109 BR
> +
> +	 mov.l	r7, @-r0	!  30 LS
> +
> +	! Copy the cache line aligned blocks
> +	!
> +	! In use: r0, r2, r4, r5
> +	! Scratch: r1, r3, r6, r7
> +	!
> +	! We could do this with the four scratch registers, but if src
> +	! and dest hit the same cache line, this will thrash, so make
> +	! use of additional registers.
> +	! 
> +	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
> +	!   r5:	 src (was r0+r5)
> +	!   r1:	 dest (was r0)
> +	! this can be reversed at the end, so we don't need to save any extra
> +	! state.
> +	!
> +1:	mov.l	r8, @-r15	!  30 LS
> +	add	r0, r5		!  49 EX
> +	
> +	mov.l	r9, @-r15	!  30 LS
> +	mov	r0, r1		!   5 MT (latency=0)
> +	
> +	mov.l	r10, @-r15	!  30 LS
> +	add	#-0x1c, r5	!  50 EX
> +	
> +	mov.l	r11, @-r15	!  30 LS			
> +
> +	! 16 cycles, 32 bytes per iteration
> +2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
> +	add	#-0x20, r1	! 50 EX
> +	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
> +	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
> +	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
> +	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
> +	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
> +	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
> +	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
> +	movca.l	r0,@r1		! 40 LS (latency=3-7)
> +	mov.l	r3,@(0x04,r1)	! 33 LS
> +	mov.l	r6,@(0x08,r1)	! 33 LS
> +	mov.l	r7,@(0x0c,r1)	! 33 LS
> +
> +	mov.l	r8,@(0x10,r1)	! 33 LS
> +	add	#-0x20, r5	! 50 EX
> +
> +	mov.l	r9,@(0x14,r1)	! 33 LS
> +	cmp/eq	r2,r1		! 54 MT
> +
> +	mov.l	r10,@(0x18,r1)	!  33 LS
> +	bf/s	2b		! 109 BR
> +
> +	 mov.l	r11,@(0x1c,r1)	!  33 LS
> +
> +	mov	r1, r0		!   5 MT (latency=0)
> +
> +	mov.l	@r15+, r11	!  15 LS
> +	sub	r1, r5		!  75 EX
> +
> +	mov.l	@r15+, r10	!  15 LS
> +	cmp/eq	r4, r0		!  54 MT
> +
> +	bf/s	1f		! 109 BR
> +	 mov.l	 @r15+, r9	!  15 LS
> +
> +	rts
> +1:	 mov.l	@r15+, r8	!  15 LS
> +	sub	r4, r1		!  75 EX		(len remaining)
> +
> +	! number of trailing bytes is non-zero
> +	!	
> +	! invariants restored (r5 already decremented by 4)
> +	! also r1=num bytes remaining
> +	
> +	mov	#4, r2		!   6 EX
> +	mov	r4, r7		!   5 MT (latency=0)
> +
> +	add	#0x1c, r5	!  50 EX		(back to -4)
> +	cmp/hs	r2, r1		!  58 MT
> +
> +	bf/s	5f		! 108 BR
> +	 add	 #11, r7	!  50 EX
> +
> +	mov.l	@(r0, r5), r6	!  21 LS (latency=2)	
> +	tst	r2, r1		!  86 MT
> +
> +	mov	r5, r3		!   5 MT (latency=0)
> +	bt/s	4f		! 111 BR
> +
> +	 add	#-4, r3		!  50 EX
> +	cmp/hs	r2, r1		!  58 MT
> +
> +	bt/s	5f		! 111 BR
> +	 mov.l	r6,@-r0		!  30 LS
> +
> +	! 4 cycles, 2 long words per iteration
> +3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
> +
> +4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
> +	cmp/hi	r7, r0
> +
> +	mov.l	r6, @-r0	!  30 LS
> +	bt/s	3b		! 109 BR
> +
> +	 mov.l	r2, @-r0	!  30 LS
> +
> +	! Copy the final 0-3 bytes
> +
> +5:	cmp/eq	r0, r4		!  54 MT
> +	add	#-10, r7	!  50 EX
> +
> +	bt	9f		! 110 BR
> +	add	#3,r5		!  50 EX
> +	
> +	! 3 cycles, 1 byte per iteration
> +1:	mov.b	@(r0,r5),r1	!  19 LS
> +	cmp/hi	r7,r0		!  57 MT
> +	
> +	bt/s	1b		! 111 BR
> +	 mov.b	r1,@-r0		!  28 LS
> +
> +9:	rts
> +	 nop
> +
> +	!
> +	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
> +	!
> +		
> +	.balign	32
> +.Lcase2:
> +	! Size is 16 or greater and less than 64, but may have trailing bytes
> +
> +2:	mov	r5, r6		!   5 MT (latency=0)
> +	add	#-2,r5		!  50 EX
> +
> +	mov	r4,r2		!   5 MT (latency=0)
> +	add	#-4,r6		!  50 EX
> +
> +	add	#7,r2		!  50 EX
> +3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
> +
> +	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
> +	cmp/hi	r2,r0		!  57 MT
> +
> +	mov.w	r1,@-r0		!  29 LS
> +	bt/s	3b		! 111 BR
> +
> +	 mov.w	r3,@-r0		!  29 LS
> +
> +	bra	10f
> +	 nop
> +
> +
> +	.balign	32
> +.Lcase2b:
> +	! Size is at least 64 bytes, so will be going round the big loop at least once.
> +	!
> +	!   r2 = rounded up r4
> +	!   r3 = rounded down r0
> +
> +	mov	r0, r3		!   5 MT (latency=0)
> +	mov	#(~0x1f), r1	!   6 EX
> +
> +	and	r1, r3		!  78 EX
> +	mov	r4, r2		!   5 MT (latency=0)
> +
> +	cmp/eq	r3, r0		!  54 MT
> +	add	#0x1f, r2	!  50 EX
> +	
> +	add	#-2, r5		!  50 EX
> +	bt/s	1f		! 110 BR
> +	 and	r1, r2		!  78 EX
> +	
> +	! Copy a short word one at a time until we are cache line aligned
> +	!   Normal values: r0, r2, r3, r4
> +	!   Unused: r1, r6, r7
> +	!   Mod: r5 (=r5-2)
> +	!
> +	add	#2, r3		!  50 EX
> +	
> +2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
> +	cmp/eq	r3,r0		!  54 MT
> +		
> +	bf/s	2b		! 111 BR
> +
> +	 mov.w	r1,@-r0		!  29 LS
> +
> +	! Copy the cache line aligned blocks
> +	!
> +	! In use: r0, r2, r4, r5 (=r5-2)
> +	! Scratch: r1, r3, r6, r7
> +	!
> +	! We could do this with the four scratch registers, but if src
> +	! and dest hit the same cache line, this will thrash, so make
> +	! use of additional registers.
> +	! 
> +	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
> +	!   r5:	 src (was r0+r5)
> +	!   r1:	 dest (was r0)
> +	! this can be reversed at the end, so we don't need to save any extra
> +	! state.
> +	!
> +1:	mov.l	r8, @-r15	!  30 LS
> +	add	r0, r5		!  49 EX
> +	
> +	mov.l	r9, @-r15	!  30 LS
> +	mov	r0, r1		!   5 MT (latency=0)
> +	
> +	mov.l	r10, @-r15	!  30 LS
> +	add	#-0x1e, r5	!  50 EX
> +	
> +	mov.l	r11, @-r15	!  30 LS			
> +	
> +	mov.l	r12, @-r15	!  30 LS			
> +
> +	! 17 cycles, 32 bytes per iteration
> +#ifdef __LITTLE_ENDIAN__
> +2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
> +	add	#-0x20, r1	!  50 EX
> +
> +	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK
> +
> +	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
> +	shll16	r0		! 103 EX			JI..
> +
> +	mov.l	@r5+, r7	!  15 LS (latency=2)
> +	xtrct	r3, r0		!  48 EX			LKJI
> +
> +	mov.l	@r5+, r8	!  15 LS (latency=2)
> +	xtrct	r6, r3		!  48 EX			PONM
> +
> +	mov.l	@r5+, r9	!  15 LS (latency=2)
> +	xtrct	r7, r6		!  48 EX
> +
> +	mov.l	@r5+, r10	!  15 LS (latency=2)
> +	xtrct	r8, r7		!  48 EX
> +
> +	mov.l	@r5+, r11	!  15 LS (latency=2)
> +	xtrct	r9, r8		!  48 EX
> +
> +	mov.w	@r5+, r12	!  15 LS (latency=2)
> +	xtrct	r10, r9		!  48 EX
> +
> +	movca.l	r0,@r1		!  40 LS (latency=3-7)
> +	xtrct	r11, r10	!  48 EX
> +
> +	mov.l	r3, @(0x04,r1)	!  33 LS
> +	xtrct	r12, r11	!  48 EX
> +
> +	mov.l	r6, @(0x08,r1)	!  33 LS
> +	
> +	mov.l	r7, @(0x0c,r1)	!  33 LS
> +
> +	mov.l	r8, @(0x10,r1)	!  33 LS
> +	add	#-0x40, r5	!  50 EX
> +
> +	mov.l	r9, @(0x14,r1)	!  33 LS
> +	cmp/eq	r2,r1		!  54 MT
> +
> +	mov.l	r10, @(0x18,r1)	!  33 LS
> +	bf/s	2b		! 109 BR
> +
> +	 mov.l	r11, @(0x1c,r1)	!  33 LS
> +#else
> +2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
> +	add	#-2, r5		!  50 EX
> +
> +	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
> +	add	#-4, r1		!  50 EX
> +
> +	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
> +	shll16	r0		! 103 EX
> +
> +	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
> +	xtrct	r3, r0		!  48 EX
> +
> +	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
> +	xtrct	r6, r3		!  48 EX
> +
> +	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
> +	xtrct	r7, r6		!  48 EX
> +
> +	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
> +	xtrct	r8, r7		!  48 EX
> +
> +	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
> +	xtrct	r9, r8		!  48 EX
> +
> +	mov.w	@(0x02,r5), r12	!  18 LS (latency=2)
> +	xtrct	r10, r9		!  48 EX
> +
> +	movca.l	r0,@r1		!  40 LS (latency=3-7)
> +	add	#-0x1c, r1	!  50 EX
> +
> +	mov.l	r3, @(0x1c,r1)	!  33 LS
> +	xtrct	r11, r10	!  48 EX
> +
> +	mov.l	r6, @(0x18,r1)	!  33 LS
> +	xtrct	r12, r11	!  48 EX
> +	
> +	mov.l	r7, @(0x14,r1)	!  33 LS
> +
> +	mov.l	r8, @(0x10,r1)	!  33 LS
> +	add	#-0x3e, r5	!  50 EX
> +
> +	mov.l	r9, @(0x0c,r1)	!  33 LS
> +	cmp/eq	r2,r1		!  54 MT
> +
> +	mov.l	r10, @(0x08,r1)	!  33 LS
> +	bf/s	2b		! 109 BR
> +
> +	 mov.l	r11, @(0x04,r1)	!  33 LS
> +#endif
> +
> +	mov.l	@r15+, r12
> +	mov	r1, r0		!   5 MT (latency=0)
> +
> +	mov.l	@r15+, r11	!  15 LS
> +	sub	r1, r5		!  75 EX
> +
> +	mov.l	@r15+, r10	!  15 LS
> +	cmp/eq	r4, r0		!  54 MT
> +
> +	bf/s	1f		! 109 BR
> +	 mov.l	 @r15+, r9	!  15 LS
> +
> +	rts
> +1:	 mov.l	@r15+, r8	!  15 LS
> +
> +	add	#0x1e, r5	!  50 EX
> +	
> +	! Finish off a short word at a time
> +	! r5 must be invariant - 2
> +10:	mov	r4,r2		!   5 MT (latency=0)
> +	add	#1,r2		!  50 EX
> +
> +	cmp/hi	r2, r0		!  57 MT
> +	bf/s	1f		! 109 BR
> +
> +	 add	#2, r2		!  50 EX
> +	
> +3:	mov.w	@(r0,r5),r1	!  20 LS
> +	cmp/hi	r2,r0		!  57 MT
> +
> +	bt/s	3b		! 109 BR
> +
> +	 mov.w	r1,@-r0		!  29 LS
> +1:
> +		
> +	!
> +	! Finally, copy the last byte if necessary
> +	cmp/eq	r4,r0		!  54 MT
> +	bt/s	9b
> +	 add	#1,r5
> +	mov.b	@(r0,r5),r1
> +	rts
> +	 mov.b	r1,@-r0
> +
> +.size memcpy,.-memcpy;
> +libc_hidden_def (memcpy)
> 
> 
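> 
> For orientation, the dispatch structure documented in the comments of
> memcpy.S, restated as a C sketch (the function name is made up; every
> branch below just byte-copies, whereas in the assembly each branch is
> a separately tuned loop, and all of them copy backwards, from the ends
> of the buffers towards the start):
> 
>     #include <stddef.h>
> 
>     static void *sh4_memcpy_shape(void *dstpp, const void *srcpp, size_t n)
>     {
>         unsigned char *d = dstpp;
>         const unsigned char *s = srcpp;
> 
>         if (n == 0)
>             return dstpp;
> 
>         if ((((unsigned long)d | (unsigned long)s | n) & 3) == 0) {
>             /* src, dst and len all long-word aligned: long-word loop,
>                or the movca.l cache-line loop once n >= 64. */
>         } else if (n < 16) {
>             /* Below the measured break-even point: plain byte loop. */
>         } else {
>             /* Byte-copy until dst is long-word aligned, then pick a
>                bulk loop from the relative src/dst alignment: word
>                copies for even offsets, shift-and-merge loops for odd
>                ones, again with a movca.l cache-line loop for n >= 64. */
>         }
> 
>         while (n--)            /* stand-in for all of the above paths */
>             d[n] = s[n];       /* note: backward, like the real code  */
>         return dstpp;
>     }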



