[git commit nptl] sh: Add new optimisation to the SH4 memcpy

Giuseppe Cavallaro peppe.cavallaro at st.com
Sat Sep 26 16:37:45 UTC 2009


commit: http://git.uclibc.org/uClibc/commit/?id=2e84309879728a97265a9be3da7de98ab8395c8b
branch: http://git.uclibc.org/uClibc/commit/?id=refs/heads/nptl

This optimization is based on prefetching and 64-bit data transfer via the FPU
(little-endian mode only).

Tests show that:

  ----------------------------------------
  Memory bandwidth    |        Gain
                      | sh4-300 | sh4-200
  ----------------------------------------
  512 bytes to 16KiB  | ~20%    | ~25%
  from 32KiB to 16MiB | ~190%   | ~5%
  ----------------------------------------

Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro at st.com>
Signed-off-by: Carmelo Amoroso <carmelo.amoroso at st.com>
---
 libc/string/sh/sh4/memcpy.S |  109 ++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/libc/string/sh/sh4/memcpy.S b/libc/string/sh/sh4/memcpy.S
index 4a26d4e..efdaf8b 100644
--- a/libc/string/sh/sh4/memcpy.S
+++ b/libc/string/sh/sh4/memcpy.S
@@ -6,6 +6,9 @@
  *   Modified from memcpy.S and micro-optimised for SH4
  *   Stuart Menefy (stuart.menefy at st.com)
  *
+ * Copyright (c) 2009  STMicroelectronics Ltd
+ *   Optimised using prefetching and 64bit data transfer via FPU
+ *   Author: Giuseppe Cavallaro <peppe.cavallaro at st.com>
  */
 
 /*
@@ -18,6 +21,22 @@
 #include <sysdep.h>
 #include <endian.h>
 
+#ifdef __LITTLE_ENDIAN__
+#define	MEMCPY_USES_FPU
+/* Use paired single precision load or store mode for 64-bit transfers.
+ * FPSCR.SZ=1,FPSCR.PR=0 is well defined on both SH4-200 and SH4-300.
+ * Currently it has been only implemented and tested for little endian mode. */
+.macro FPU_SET_PAIRED_PREC
+	sts	fpscr, r7	! Save the current FPSCR in r7 (used by RESTORE_FPSCR)
+	mov	#0x10, r6	! PR=0 SZ=1
+	shll16	r6		! r6 = 0x10 << 16 = 0x00100000
+	lds	r6, fpscr	! Install the new mode word; clobbers r6 and r7
+.endm
+.macro RESTORE_FPSCR
+	lds	r7, fpscr	! Reload the FPSCR value that was saved in r7
+.endm
+#endif
+
 	!
 	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
 	!
@@ -185,9 +204,7 @@ ENTRY(memcpy)
 	mov	r4, r0		!   5 MT (0 cycle latency)
 	add	r6, r0		!  49 EX
 
-	mov	#16, r1		!   6 EX
 	bt/s	.Lcase00	! 111 BR		(aligned)
-
 	 sub	r4, r5		!  75 EX
 
 	! Arguments are not nicely long word aligned or zero len.
@@ -203,6 +220,7 @@ ENTRY(memcpy)
 	! However the penalty for getting it 'wrong' is much higher for long word
 	! aligned data (and this is more common), so use a value of 16.
 
+	mov	#16, r1		!   6 EX
 	cmp/gt	r6,r1		!  56 MT
 
 	add	#-1,r5		!  50 EX
@@ -443,6 +461,92 @@ ENTRY(memcpy)
 
 	 mov.l	r7, @-r0	!  30 LS
 
+#ifdef MEMCPY_USES_FPU
+	! Copy the cache line aligned blocks by using the FPU registers.
+	! If src and dst are well aligned adopt 64-bit data transfer.
+	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+	!   r5:	 src (was r0+r5)
+	!   r1:	 dest (was r0)
+1:
+	add	r0, r5		! r5 = r0 + r5 (src side pointer)
+	mov	r0, r1		! r1 = dest, r0 is now free as a temporary

+	add	#-0x1c, r5	! r5 -= 0x1c; presumably the src block base - TODO confirm
+	mov	r5, r0		! tst with an immediate only operates on r0

+	tst	#7, r0		! T=1 if src is 8byte aligned
+	mov	r5, r3

+	add	#-64, r3	! Prefetch address: 64 bytes below r5
+	bt/s	3f		! src 8byte aligned: take the 64-bit loop

+	 pref	@r3		! (delay slot) issue the first prefetch

+2:	fmov.s	@r5+, fr0	! 32-bit loop: load 8 single words, r5 += 32
+	mov	r1, r6
+	fmov.s	@r5+, fr1
+	add	#-32, r6	! r6 = r1 - 32, base of the block to be written
+	fmov.s	@r5+, fr2
+	fmov.s	@r5+, fr3
+	fmov.s	@r5+, fr4
+	fmov.s	@r5+, fr5
+	fmov.s	@r5+, fr6
+	fmov.s	@r5+, fr7
+	add	#-0x40, r5	! Net effect r5 -= 32: the copy walks downwards

+	movca.l	r0, @r6		! Cache allocate + store on dst-32.

+	fmov.s	fr7, @-r1	! Store the 8 words in descending order, r1 -= 32
+	fmov.s	fr6, @-r1
+	fmov.s	fr5, @-r1
+	fmov.s	fr4, @-r1
+	fmov.s	fr3, @-r1
+	fmov.s	fr2, @-r1
+	fmov.s	fr1, @-r1
+	fmov.s	fr0, @-r1

+	add	#-32, r3	! Move the prefetch address down by one block
+	cmp/eq	r2,r1		! T=1 once r1 has come down to r2

+	bf/s	2b		! Loop while r1 != r2
+	 pref	@r3		! Prefetch the next cache line.

+	bra	5f		! FPSCR was not changed here: skip the 64-bit loop
+	 nop			! Fill the delay slot: keep the macro at 3: out of it
+3:	FPU_SET_PAIRED_PREC

+4:	fmov	@r5+, dr0	! 64-bit loop: load 4 register pairs, r5 += 32
+	mov	r1, r6
+	fmov	@r5+, dr2
+	add	#-32, r6	! r6 = r1 - 32, base of the block to be written
+	fmov	@r5+, dr4
+	fmov	@r5+, dr6
+	add	#-0x40, r5	! Net effect r5 -= 32: the copy walks downwards

+	movca.l	r0, @r6		! Cache allocate + store on dst-32.

+	fmov	dr6, @-r1	! Store the 4 pairs in descending order, r1 -= 32
+	fmov	dr4, @-r1
+	fmov	dr2, @-r1
+	fmov	dr0, @-r1
+	add	#-32, r3	! Move the prefetch address down by one block
+	cmp/eq	r2,r1		! T=1 once r1 has come down to r2

+	bf/s	4b		! Loop while r1 != r2
+	 pref	@r3		! Prefetch the next cache line.

+	RESTORE_FPSCR

+5:	mov	r1, r0		! r0 = dest again

+	cmp/eq	r4, r0		!  54 MT
+	bf/s	1f		! 109 BR
+	 sub	r1, r5		!  75 EX

+	rts
+	 nop
+1:
+#else
 	! Copy the cache line aligned blocks
 	!
 	! In use: r0, r2, r4, r5
@@ -508,6 +612,7 @@ ENTRY(memcpy)
 
 	rts
 1:	 mov.l	@r15+, r8	!  15 LS
+#endif
 	sub	r4, r1		!  75 EX		(len remaining)
 
 	! number of trailing bytes is non-zero
-- 
1.6.3.3



More information about the uClibc-cvs mailing list