[git commit master 1/1] sh: move data without fetching cache block within the memset

Wed Sep 15 11:08:50 UTC 2010

commit: http://git.uclibc.org/uClibc/commit/?id=599c74a4d7e9bbe68b946d65aef2725821ea3fe9
branch: http://git.uclibc.org/uClibc/commit/?id=refs/heads/master

With this patch the movca.l instruction is used within memset.
The current memset implementation only uses the FPU, and there is
a real gain for all sizes.
With the movca.l instruction added, the numbers are always better than the generic code.
There is a big gain for sizes greater than 64 KiB, but the numbers are worse for the
4-32 KiB sizes compared with the implementation without movca.l.

	Time Memory Bandwidth (Mbytes)
-------------------------------------------------
	    Generic         SH4          SH4
	                   (FPU)     (FPU+movca.l)
-------------------------------------------------
512         1143	 1998          1596
1 KiB       1273	 2567          1915
2 KiB       1350	 2993          2128
4-32KiB     1391	 3262          2252
64KiB-16MiB 170		 186	      *830*

Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro at st.com>
Signed-off-by: Carmelo Amoroso <carmelo.amoroso at st.com>
---
 libc/string/sh/sh4/memset.S |   62 +++++++++++++++++++++++-------------------
 1 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/libc/string/sh/sh4/memset.S b/libc/string/sh/sh4/memset.S
index 83f8746..eb83355 100644
--- a/libc/string/sh/sh4/memset.S
+++ b/libc/string/sh/sh4/memset.S
@@ -5,7 +5,7 @@
  * Copyright (C) 1999  Niibe Yutaka
  *
  * Copyright (c) 2009  STMicroelectronics Ltd
- *   Optimised using 64bit data transfer via FPU
+ *   Optimised using 64bit data transfer (via FPU) and the movca.l inst.
  *   Author: Giuseppe Cavallaro <peppe.cavallaro at st.com>
  *
  * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
@@ -24,9 +24,9 @@
  * Currenlty it has been only implemented and tested for little endian mode. */
 .macro FPU_SET_PAIRED_PREC
 	sts	fpscr, r3
-	mov	#0x10, r0	! PR=0 SZ=1
-	shll16  r0
-	lds	r0, fpscr
+	mov	#0x10, r1	! PR=0 SZ=1
+	shll16  r1
+	lds	r1, fpscr
 .endm
 .macro RESTORE_FPSCR
 	lds	r3, fpscr
@@ -34,12 +34,10 @@
 #endif
 
 ENTRY(memset)
-	tst	r6,r6
-	bt/s	5f		! if n=0, do nothing
-	 add	r6,r4
 	mov	#12,r0
+	add	r6,r4
 	cmp/gt	r6,r0
-	bt/s	4f		! if it's too small, set a byte at once
+	bt/s	40f		! if it's too small, set a byte at once
 	 mov	r4,r0
 	and	#3,r0
 	cmp/eq	#0,r0
@@ -56,7 +54,7 @@ ENTRY(memset)
 	swap.w	r5,r0		! VV00
 	or	r0,r5		! VVVV
 
-	! Enough bytes need to be copied
+	! Check if enough bytes need to be copied to be worth the big loop
 	mov	#0x40, r0	! (MT)
 	cmp/gt	r6,r0		! (MT)  64 > len => slow loop
 
@@ -84,6 +82,9 @@ ENTRY(memset)
 	mov	#-5,r0
 	shld	r0,r2		! number of loops
 
+	add	#-32, r4
+	mov	r5, r0
+
 #ifdef MEMSET_USES_FPU
 	lds	r5, fpul	! (CO)
 	fsts	fpul, fr0	! Dr0 will be 'VVVVVVVV'
@@ -91,36 +92,40 @@ ENTRY(memset)
 
 	FPU_SET_PAIRED_PREC
 12:
-	add	#-0x20, r6	!(MT)
+	movca.l	r0, @r4
+	mov.l	r5, @(4, r4)
+	add	#32, r4
 	fmov	dr0, @-r4
 	fmov	dr0, @-r4
+	add	#-0x20, r6
 	fmov	dr0, @-r4
 	dt	r2
-	bf/s	12b		!(BR)
-	 fmov	dr0, @-r4
+	bf/s	12b
+	 add	#-40, r4
 
 	RESTORE_FPSCR
 #else
 12:
-	mov.l	r5,@-r4
-	mov.l	r5,@-r4
-	mov.l	r5,@-r4
-	mov.l	r5,@-r4
-	mov.l	r5,@-r4
-	mov.l	r5,@-r4
+	movca.l	r0,@r4
+	mov.l	r5,@(4, r4)
+	mov.l	r5,@(8, r4)
+	mov.l	r5,@(12,r4)
+	mov.l	r5,@(16,r4)
+	mov.l	r5,@(20,r4)
 	add	#-0x20, r6
-	mov.l	r5,@-r4
+	mov.l	r5,@(24,r4)
 	dt	r2
+	mov.l	r5,@(28,r4)
 	bf/s	12b
-	 mov.l	r5,@-r4
-#endif
-	tst	r6,r6
-	bt/s	5f
-	 mov	#8, r0
+	 add	#-32, r4
 
+#endif
+	add	#32, r4
+	mov	#8, r0
 	cmp/ge	r0, r6
-	bf/s	4f
-	 mov	r6,r0
+	bf	40f
+
+	mov	r6,r0
 22:
 	shlr2	r0
 	shlr	r0		! r0 = r6 >> 3
@@ -132,9 +137,10 @@ ENTRY(memset)
 	!
 	mov	#7,r0
 	and	r0,r6
-	tst	r6,r6
+
+	! fill bytes (length may be zero)
+40:	tst	r6,r6
 	bt	5f
-	! fill bytes
 4:
 	dt	r6
 	bf/s	4b
-- 
1.7.1