svn commit: trunk/uClibc/libc/string/x86_64
vda at uclibc.org
Tue Apr 15 08:27:24 UTC 2008
Author: vda
Date: 2008-04-15 01:27:24 -0700 (Tue, 15 Apr 2008)
New Revision: 21738
Log:
amd64 string ops: use alignment more carefully, and comment it.
By capping the maximum padding so that it is never bigger than the next
three insns, we avoid having ridiculously big NOPs like this one:
  53:  66 66 66 66 2e 0f 1f    nopw   %cs:0x0(%rax,%rax,1)
  5a:  84 00 00 00 00 00
which was bigger than the next three insns combined!
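For reference, the GAS form used throughout this commit is
".p2align pow2, fill, max-skip": align to 2^pow2 bytes, but emit no
padding at all if it would take more than max-skip bytes.  A minimal
sketch, not taken from the tree (the symbol, loop body and byte counts
are only illustrative; the first three insns after the label total
10 bytes, which is where the ",,10" cap comes from):

        .text
        .globl  copy_loop_demo          /* hypothetical symbol */
copy_loop_demo:
        /* Pad label 1 to a 16-byte boundary only if that takes at
           most 10 bytes of NOPs; otherwise fall through unaligned. */
        .p2align 4,,10
1:
        movq    (%rsi), %rax            /* 3 bytes */
        movq    %rax, (%rdi)            /* 3 bytes */
        addq    $8, %rsi                /* 4 bytes */
        addq    $8, %rdi                /* 4 bytes */
        subq    $8, %rcx                /* 4 bytes */
        jnz     1b                      /* 2 bytes */
        retq

A plain ".p2align 4" may insert up to 15 bytes of NOPs at that spot,
which is how the 13-byte nopw above could appear.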
Size changes:
   text    data     bss     dec     hex filename
    102       0       0     102      66 x86_64/memcpy.o
    102       0       0     102      66 x86_64.old/memcpy.o
     90       0       0      90      5a x86_64/mempcpy.o
    102       0       0     102      66 x86_64.old/mempcpy.o
    210       0       0     210      d2 x86_64/memset.o
    242       0       0     242      f2 x86_64.old/memset.o
    213       0       0     213      d5 x86_64/stpcpy.o
    220       0       0     220      dc x86_64.old/stpcpy.o
    428       0       0     428     1ac x86_64/strcat.o
    444       0       0     444     1bc x86_64.old/strcat.o
    417       0       0     417     1a1 x86_64/strchr.o
    418       0       0     418     1a2 x86_64.old/strchr.o
     33       0       0      33      21 x86_64/strcmp.o
     33       0       0      33      21 x86_64.old/strcmp.o
    213       0       0     213      d5 x86_64/strcpy.o
    220       0       0     220      dc x86_64.old/strcpy.o
    135       0       0     135      87 x86_64/strcspn.o
    151       0       0     151      97 x86_64.old/strcspn.o
    225       0       0     225      e1 x86_64/strlen.o
    233       0       0     233      e9 x86_64.old/strlen.o
    140       0       0     140      8c x86_64/strpbrk.o
    156       0       0     156      9c x86_64.old/strpbrk.o
    135       0       0     135      87 x86_64/strspn.o
    151       0       0     151      97 x86_64.old/strspn.o
Also, a few files got their .text alignment relaxed from 16 to 8 bytes,
which reduces padding at link time.
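The relaxation works because the largest alignment request in a file
also sets that object's .text section alignment (sh_addralign): a
single ".p2align 4" forces 16, so the linker may add up to 15 bytes of
padding when placing the section, while keeping everything at
".p2align 3" caps that at 7 bytes.  A hypothetical fragment (the symbol
name is made up) showing the relaxed form; the effect is visible in the
Align column of "readelf -S" on the resulting object:

        .text
        /* Largest alignment request in this file is 2^3 = 8, so the
           object's .text only demands 8-byte alignment from the
           linker. */
        .p2align 3,,8
        .globl  demo_func               /* hypothetical symbol */
demo_func:
        retq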
Modified:
trunk/uClibc/libc/string/x86_64/memcpy.S
trunk/uClibc/libc/string/x86_64/memset.S
trunk/uClibc/libc/string/x86_64/strcat.S
trunk/uClibc/libc/string/x86_64/strchr.S
trunk/uClibc/libc/string/x86_64/strcpy.S
trunk/uClibc/libc/string/x86_64/strcspn.S
trunk/uClibc/libc/string/x86_64/strlen.S
trunk/uClibc/libc/string/x86_64/strspn.S
Changeset:
Modified: trunk/uClibc/libc/string/x86_64/memcpy.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/memcpy.S 2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/memcpy.S 2008-04-15 08:27:24 UTC (rev 21738)
@@ -59,9 +59,9 @@
subq $32, %rcx
js 2f
- .p2align 4
+ /* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+ .p2align 4,,11
3:
-
/* Now correct the loop counter. Please note that in the following
code the flags are not changed anymore. */
subq $32, %rcx
Modified: trunk/uClibc/libc/string/x86_64/memset.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/memset.S 2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/memset.S 2008-04-15 08:27:24 UTC (rev 21738)
@@ -55,8 +55,10 @@
test $0x7,%edi /* Check for alignment. */
jz 2f
- .p2align 4
-1: /* Align ptr to 8 byte. */
+ /* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+ .p2align 4,,9
+1:
+ /* Align ptr to 8 byte. */
mov %sil,(%rcx)
dec %rdx
inc %rcx
@@ -70,8 +72,10 @@
cmp LARGE, %rdx
jae 11f
- .p2align 4
-3: /* Fill 64 bytes. */
+ /* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+ .p2align 4,,11
+3:
+ /* Fill 64 bytes. */
mov %r8,(%rcx)
mov %r8,0x8(%rcx)
mov %r8,0x10(%rcx)
@@ -114,9 +118,11 @@
#endif
retq
- .p2align 4
-11: /* Fill 64 bytes without polluting the cache. */
- /* We could use movntdq %xmm0,(%rcx) here to further
+ /* Next 3 insns are 14 bytes total, make sure we decode them in one go */
+ .p2align 4,,14
+11:
+ /* Fill 64 bytes without polluting the cache. */
+ /* We could use movntdq %xmm0,(%rcx) here to further
speed up for large cases but let's not use XMM registers. */
movnti %r8,(%rcx)
movnti %r8,0x8(%rcx)
Modified: trunk/uClibc/libc/string/x86_64/strcat.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/strcat.S 2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/strcat.S 2008-04-15 08:27:24 UTC (rev 21738)
@@ -45,7 +45,9 @@
/* Now the source is aligned. Scan for NUL byte. */
- .p2align 4
+
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
4:
/* First unroll. */
movq (%rax), %rcx /* get double word (= 8 bytes) in question */
@@ -103,8 +105,11 @@
the addition will not result in 0. */
jz 4b /* no NUL found => continue loop */
- .p2align 4 /* Align, it is a jump target. */
-3: subq $8,%rax /* correct pointer increment. */
+ /* Align, it is a jump target. */
+ /* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+ .p2align 3,,8
+3:
+ subq $8,%rax /* correct pointer increment. */
testb %cl, %cl /* is first byte NUL? */
jz 2f /* yes => return */
@@ -160,7 +165,9 @@
/* Now the sources is aligned. Unfortunatly we cannot force
to have both source and destination aligned, so ignore the
alignment of the destination. */
- .p2align 4
+
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
22:
/* 1st unroll. */
movq (%rsi), %rax /* Read double word (8 bytes). */
@@ -237,7 +244,9 @@
/* Do the last few bytes. %rax contains the value to write.
The loop is unrolled twice. */
- .p2align 4
+
+ /* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+ .p2align 3,,6
23:
movb %al, (%rdx) /* 1st byte. */
testb %al, %al /* Is it NUL. */
Modified: trunk/uClibc/libc/string/x86_64/strchr.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/strchr.S 2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/strchr.S 2008-04-15 08:27:24 UTC (rev 21738)
@@ -92,7 +92,8 @@
each of whose bytes is C. This turns each byte that is C
into a zero. */
- .p2align 4
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
4:
/* Main Loop is unrolled 4 times. */
/* First unroll. */
@@ -230,8 +231,11 @@
reversed. */
- .p2align 4 /* Align, it's a jump target. */
-3: movq %r9,%rdx /* move to %rdx so that we can access bytes */
+ /* Align, it's a jump target. */
+ /* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+ .p2align 4,,9
+3:
+ movq %r9,%rdx /* move to %rdx so that we can access bytes */
subq $8,%rax /* correct pointer increment. */
testb %cl, %cl /* is first byte C? */
jz 6f /* yes => return pointer */
@@ -281,7 +285,7 @@
incq %rax
6:
- nop
+ /* nop - huh?? */
retq
END (BP_SYM (strchr))
Modified: trunk/uClibc/libc/string/x86_64/strcpy.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/strcpy.S 2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/strcpy.S 2008-04-15 08:27:24 UTC (rev 21738)
@@ -53,7 +53,9 @@
/* Now the sources is aligned. Unfortunatly we cannot force
to have both source and destination aligned, so ignore the
alignment of the destination. */
- .p2align 4
+
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
1:
/* 1st unroll. */
movq (%rsi), %rax /* Read double word (8 bytes). */
@@ -130,7 +132,9 @@
/* Do the last few bytes. %rax contains the value to write.
The loop is unrolled twice. */
- .p2align 4
+
+ /* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+ .p2align 3,,6
3:
/* Note that stpcpy needs to return with the value of the NUL
byte. */
Modified: trunk/uClibc/libc/string/x86_64/strcspn.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/strcspn.S 2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/strcspn.S 2008-04-15 08:27:24 UTC (rev 21738)
@@ -55,7 +55,9 @@
Although all the following instruction only modify %cl we always
have a correct zero-extended 64-bit value in %rcx. */
- .p2align 4
+ /* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+ .p2align 3,,6
+
L(2): movb (%rax), %cl /* get byte from skipset */
testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
@@ -88,7 +90,13 @@
value in the table. But the value of NUL is NUL so the loop
terminates for NUL in every case. */
- .p2align 4
+ /* Next 3 insns are 9 bytes total. */
+ /* .p2align 4,,9 would make sure we decode them in one go, */
+ /* but it will also align entire function to 16 bytes, */
+ /* potentially creating largish padding at link time. */
+ /* We are aligning to 8 bytes instead: */
+ .p2align 3,,8
+
L(3): addq $4, %rax /* adjust pointer for full loop round */
movb (%rax), %cl /* get byte from string */
Modified: trunk/uClibc/libc/string/x86_64/strlen.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/strlen.S 2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/strlen.S 2008-04-15 08:27:24 UTC (rev 21738)
@@ -40,8 +40,11 @@
1: movq $0xfefefefefefefeff,%r8 /* Save magic. */
- .p2align 4 /* Align loop. */
-4: /* Main Loop is unrolled 4 times. */
+ /* Align loop. */
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
+4:
+ /* Main Loop is unrolled 4 times. */
/* First unroll. */
movq (%rax), %rcx /* get double word (= 8 bytes) in question */
addq $8,%rax /* adjust pointer for next word */
@@ -98,8 +101,11 @@
the addition will not result in 0. */
jz 4b /* no NUL found => continue loop */
- .p2align 4 /* Align, it is a jump target. */
-3: subq $8,%rax /* correct pointer increment. */
+ /* Align, it is a jump target. */
+ /* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+ .p2align 3,,8
+3:
+ subq $8,%rax /* correct pointer increment. */
testb %cl, %cl /* is first byte NUL? */
jz 2f /* yes => return */
Modified: trunk/uClibc/libc/string/x86_64/strspn.S
===================================================================
--- trunk/uClibc/libc/string/x86_64/strspn.S 2008-04-15 08:25:32 UTC (rev 21737)
+++ trunk/uClibc/libc/string/x86_64/strspn.S 2008-04-15 08:27:24 UTC (rev 21738)
@@ -50,8 +50,10 @@
Although all the following instruction only modify %cl we always
have a correct zero-extended 64-bit value in %rcx. */
- .p2align 4
-L(2): movb (%rax), %cl /* get byte from stopset */
+ /* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+ .p2align 3,,6
+L(2):
+ movb (%rax), %cl /* get byte from stopset */
testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
@@ -83,8 +85,14 @@
value in the table. But the value of NUL is NUL so the loop
terminates for NUL in every case. */
- .p2align 4
-L(3): addq $4, %rax /* adjust pointer for full loop round */
+ /* Next 3 insns are 9 bytes total. */
+ /* .p2align 4,,9 would make sure we decode them in one go, */
+ /* but it will also align entire function to 16 bytes, */
+ /* potentially creating largish padding at link time. */
+ /* We are aligning to 8 bytes instead: */
+ .p2align 3,,8
+L(3):
+ addq $4, %rax /* adjust pointer for full loop round */
movb (%rax), %cl /* get byte from string */
testb %cl, (%rsp,%rcx) /* is it contained in skipset? */