[git commit] libbb: SHA-NI code shrink

Denys Vlasenko <vda.linux at googlemail.com>
Mon Jul 21 22:12:54 UTC 2025


commit: https://git.busybox.net/busybox/commit/?id=ed22c5bd4a537f016321a888743a8b12f6bb15a9
branch: https://git.busybox.net/busybox/commit/?id=refs/heads/master

function                                             old     new   delta
sha256_process_block64_shaNI                         673     657     -16

Signed-off-by: Denys Vlasenko <vda.linux at googlemail.com>
---
 libbb/hash_sha256_hwaccel_x86-32.S | 50 +++++++++++++++++++++++++-------------
 libbb/hash_sha256_hwaccel_x86-64.S | 50 +++++++++++++++++++++++++-------------
 2 files changed, 66 insertions(+), 34 deletions(-)
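
Note (not part of the patch): the comment block added below lists five
single-instruction ways to move the upper qword of MSG into the lower
qword. Going by its byte counts, movhlps is the shortest: 3 bytes against
the 4-byte shuf128_32 it replaces, so one byte is saved at each of the
16 call sites in the compiled file, matching the -16 delta above.
The lane behavior of all five alternatives can be sanity-checked from C
with SSE intrinsics; the sketch below is illustrative only (the file
name, build line, and the use of pshufd as a stand-in for the
shuf128_32 macro are assumptions, not taken from the patch):

/*
 * Standalone sanity check, NOT part of the patch: verifies that all
 * five single-instruction variants listed in the new comment block
 * leave the same value in the low 64 bits of the register; they only
 * differ in the upper 64 bits, which sha256rnds2 ignores.
 * Hypothetical build line: gcc -O2 -mssse3 -o check check.c
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <tmmintrin.h>	/* SSSE3 (palignr); pulls in SSE2/SSE3 headers too */

static uint64_t low64(__m128i v)
{
	uint64_t lo;
	memcpy(&lo, &v, sizeof(lo));	/* low lanes sit at low addresses on x86 */
	return lo;
}

int main(void)
{
	/* "abcd" as in the comment: a = lowest 32-bit lane, d = highest */
	const __m128i msg = _mm_set_epi32(0x64646464, 0x63636363,
					  0x62626262, 0x61616161);
	__m128i r[5];

	/* movhlps MSG, MSG: abcd -> cdcd */
	r[0] = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(msg),
					      _mm_castsi128_ps(msg)));
	/* shuf128_32 SHUF(2,3,n,n), MSG, MSG: abcd -> cdXX (n = don't care) */
	r[1] = _mm_shuffle_epi32(msg, _MM_SHUFFLE(0, 0, 3, 2));
	/* punpckhqdq MSG, MSG: abcd -> cdcd */
	r[2] = _mm_unpackhi_epi64(msg, msg);
	/* psrldq $8, MSG: abcd -> cd00 */
	r[3] = _mm_srli_si128(msg, 8);
	/* palignr $8, MSG, MSG: abcd -> cdab */
	r[4] = _mm_alignr_epi8(msg, msg, 8);

	/* All five must agree on the low 64 bits: c in bits 0-31, d above */
	for (int i = 0; i < 5; i++)
		printf("variant %d: low64 = 0x%016llx\n",
		       i, (unsigned long long)low64(r[i]));
	return 0;
}
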

diff --git a/libbb/hash_sha256_hwaccel_x86-32.S b/libbb/hash_sha256_hwaccel_x86-32.S
index 332b7513f..6362ae382 100644
--- a/libbb/hash_sha256_hwaccel_x86-32.S
+++ b/libbb/hash_sha256_hwaccel_x86-32.S
@@ -41,7 +41,7 @@
 
 #define XMMTMP		%xmm7
 
-#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))
+#define SHUF(a,b,c,d) $((a)+((b)<<2)+((c)<<4)+((d)<<6))
 
 	.balign	8	# allow decoders to fetch at least 2 first insns
 sha256_process_block64_shaNI:
@@ -58,13 +58,29 @@ sha256_process_block64_shaNI:
 	mova128		PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
 	movl		$K256+8*16, SHA256CONSTANTS
 
+// sha256rnds2 instruction uses only lower 64 bits of MSG.
+// The code below needs to move upper 64 bits to lower 64 bits
+// for the second sha256rnds2 invocation
+// (what remains in upper bits does not matter).
+// There are several ways to do it:
+// movhlps    MSG, MSG                // abcd -> cdcd (3 bytes of code)
+// shuf128_32 SHUF(2,3,n,n), MSG, MSG // abcd -> cdXX (4 bytes)
+// punpckhqdq MSG, MSG                // abcd -> cdcd (4 bytes)
+// psrldq     $8, MSG                 // abcd -> cd00 (5 bytes)
+// palignr    $8, MSG, MSG            // abcd -> cdab (6 bytes, SSSE3 insn)
+#define MOVE_UPPER64_DOWN(reg) movhlps reg, reg
+//#define MOVE_UPPER64_DOWN(reg) shuf128_32 SHUF(2,3,0,0), reg, reg
+//#define MOVE_UPPER64_DOWN(reg) punpckhqdq reg, reg
+//#define MOVE_UPPER64_DOWN(reg) psrldq $8, reg
+//#define MOVE_UPPER64_DOWN(reg) palignr $8, reg, reg
+
 	/* Rounds 0-3 */
 	movu128		0*16(DATA_PTR), MSG
 	pshufb		XMMTMP, MSG
 	mova128		MSG, MSG0
 		paddd		0*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	MSG, STATE0, STATE1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 
 	/* Rounds 4-7 */
@@ -73,7 +89,7 @@ sha256_process_block64_shaNI:
 	mova128		MSG, MSG1
 		paddd		1*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	MSG, STATE0, STATE1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG1, MSG0
 
@@ -83,7 +99,7 @@ sha256_process_block64_shaNI:
 	mova128		MSG, MSG2
 		paddd		2*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	MSG, STATE0, STATE1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG2, MSG1
 
@@ -98,7 +114,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG2, XMMTMP
 	paddd		XMMTMP, MSG0
 	sha256msg2	MSG3, MSG0
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG3, MSG2
 
@@ -110,7 +126,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG3, XMMTMP
 	paddd		XMMTMP, MSG1
 	sha256msg2	MSG0, MSG1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG0, MSG3
 
@@ -122,7 +138,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG0, XMMTMP
 	paddd		XMMTMP, MSG2
 	sha256msg2	MSG1, MSG2
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG1, MSG0
 
@@ -134,7 +150,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG1, XMMTMP
 	paddd		XMMTMP, MSG3
 	sha256msg2	MSG2, MSG3
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG2, MSG1
 
@@ -146,7 +162,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG2, XMMTMP
 	paddd		XMMTMP, MSG0
 	sha256msg2	MSG3, MSG0
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG3, MSG2
 
@@ -158,7 +174,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG3, XMMTMP
 	paddd		XMMTMP, MSG1
 	sha256msg2	MSG0, MSG1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG0, MSG3
 
@@ -170,7 +186,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG0, XMMTMP
 	paddd		XMMTMP, MSG2
 	sha256msg2	MSG1, MSG2
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG1, MSG0
 
@@ -182,7 +198,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG1, XMMTMP
 	paddd		XMMTMP, MSG3
 	sha256msg2	MSG2, MSG3
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG2, MSG1
 
@@ -194,7 +210,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG2, XMMTMP
 	paddd		XMMTMP, MSG0
 	sha256msg2	MSG3, MSG0
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG3, MSG2
 
@@ -206,7 +222,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG3, XMMTMP
 	paddd		XMMTMP, MSG1
 	sha256msg2	MSG0, MSG1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG0, MSG3
 
@@ -218,7 +234,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG0, XMMTMP
 	paddd		XMMTMP, MSG2
 	sha256msg2	MSG1, MSG2
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 
 	/* Rounds 56-59 */
@@ -229,14 +245,14 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG1, XMMTMP
 	paddd		XMMTMP, MSG3
 	sha256msg2	MSG2, MSG3
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 
 	/* Rounds 60-63 */
 	mova128		MSG3, MSG
 		paddd		15*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	MSG, STATE0, STATE1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 
 	/* Write hash values back in the correct order */
diff --git a/libbb/hash_sha256_hwaccel_x86-64.S b/libbb/hash_sha256_hwaccel_x86-64.S
index f8911968b..92f00ebcd 100644
--- a/libbb/hash_sha256_hwaccel_x86-64.S
+++ b/libbb/hash_sha256_hwaccel_x86-64.S
@@ -44,7 +44,7 @@
 #define SAVE0		%xmm8
 #define SAVE1		%xmm9
 
-#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))
+#define SHUF(a,b,c,d) $((a)+((b)<<2)+((c)<<4)+((d)<<6))
 
 	.balign	8	# allow decoders to fetch at least 2 first insns
 sha256_process_block64_shaNI:
@@ -65,13 +65,29 @@ sha256_process_block64_shaNI:
 	mova128		STATE0, SAVE0
 	mova128		STATE1, SAVE1
 
+// sha256rnds2 instruction uses only lower 64 bits of MSG.
+// The code below needs to move upper 64 bits to lower 64 bits
+// for the second sha256rnds2 invocation
+// (what remains in upper bits does not matter).
+// There are several ways to do it:
+// movhlps    MSG, MSG                // abcd -> cdcd (3 bytes of code)
+// shuf128_32 SHUF(2,3,n,n), MSG, MSG // abcd -> cdXX (4 bytes)
+// punpckhqdq MSG, MSG                // abcd -> cdcd (4 bytes)
+// psrldq     $8, MSG                 // abcd -> cd00 (5 bytes)
+// palignr    $8, MSG, MSG            // abcd -> cdab (6 bytes, SSSE3 insn)
+#define MOVE_UPPER64_DOWN(reg) movhlps reg, reg
+//#define MOVE_UPPER64_DOWN(reg) shuf128_32 SHUF(2,3,0,0), reg, reg
+//#define MOVE_UPPER64_DOWN(reg) punpckhqdq reg, reg
+//#define MOVE_UPPER64_DOWN(reg) psrldq $8, reg
+//#define MOVE_UPPER64_DOWN(reg) palignr $8, reg, reg
+
 	/* Rounds 0-3 */
 	movu128		0*16(DATA_PTR), MSG
 	pshufb		XMMTMP, MSG
 	mova128		MSG, MSG0
 		paddd		0*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	MSG, STATE0, STATE1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 
 	/* Rounds 4-7 */
@@ -80,7 +96,7 @@ sha256_process_block64_shaNI:
 	mova128		MSG, MSG1
 		paddd		1*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	MSG, STATE0, STATE1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG1, MSG0
 
@@ -90,7 +106,7 @@ sha256_process_block64_shaNI:
 	mova128		MSG, MSG2
 		paddd		2*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	MSG, STATE0, STATE1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG2, MSG1
 
@@ -105,7 +121,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG2, XMMTMP
 	paddd		XMMTMP, MSG0
 	sha256msg2	MSG3, MSG0
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG3, MSG2
 
@@ -117,7 +133,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG3, XMMTMP
 	paddd		XMMTMP, MSG1
 	sha256msg2	MSG0, MSG1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG0, MSG3
 
@@ -129,7 +145,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG0, XMMTMP
 	paddd		XMMTMP, MSG2
 	sha256msg2	MSG1, MSG2
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG1, MSG0
 
@@ -141,7 +157,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG1, XMMTMP
 	paddd		XMMTMP, MSG3
 	sha256msg2	MSG2, MSG3
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG2, MSG1
 
@@ -153,7 +169,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG2, XMMTMP
 	paddd		XMMTMP, MSG0
 	sha256msg2	MSG3, MSG0
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG3, MSG2
 
@@ -165,7 +181,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG3, XMMTMP
 	paddd		XMMTMP, MSG1
 	sha256msg2	MSG0, MSG1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG0, MSG3
 
@@ -177,7 +193,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG0, XMMTMP
 	paddd		XMMTMP, MSG2
 	sha256msg2	MSG1, MSG2
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG1, MSG0
 
@@ -189,7 +205,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG1, XMMTMP
 	paddd		XMMTMP, MSG3
 	sha256msg2	MSG2, MSG3
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG2, MSG1
 
@@ -201,7 +217,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG2, XMMTMP
 	paddd		XMMTMP, MSG0
 	sha256msg2	MSG3, MSG0
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG3, MSG2
 
@@ -213,7 +229,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG3, XMMTMP
 	paddd		XMMTMP, MSG1
 	sha256msg2	MSG0, MSG1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 	sha256msg1	MSG0, MSG3
 
@@ -225,7 +241,7 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG0, XMMTMP
 	paddd		XMMTMP, MSG2
 	sha256msg2	MSG1, MSG2
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 
 	/* Rounds 56-59 */
@@ -236,14 +252,14 @@ sha256_process_block64_shaNI:
 	palignr		$4, MSG1, XMMTMP
 	paddd		XMMTMP, MSG3
 	sha256msg2	MSG2, MSG3
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 
 	/* Rounds 60-63 */
 	mova128		MSG3, MSG
 		paddd		15*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	MSG, STATE0, STATE1
-		shuf128_32	$0x0E, MSG, MSG
+		MOVE_UPPER64_DOWN(MSG)
 		sha256rnds2	MSG, STATE1, STATE0
 
 	/* Add current hash values with previously saved */

