[git commit] libbb: introduce and use block-XOR functions

Denys Vlasenko vda.linux at googlemail.com
Wed Jul 9 05:00:59 UTC 2025


commit: https://git.busybox.net/busybox/commit/?id=c305c81c94a086fb09444b1ea6f31fb911c25ec0
branch: https://git.busybox.net/busybox/commit/?id=refs/heads/master

On x86_64, the block XORs can be done in 16-byte blocks (using SSE registers)

64-bit:
function                                             old     new   delta
xorbuf_3                                               -      84     +84
xorbuf64_3_aligned64                                   -      58     +58
smix1                                                687     712     +25
xwrite_encrypted                                     520     534     +14
xorbuf16_aligned_long                                  -      13     +13
tls_xread_record                                     733     742      +9
xorbuf                                                21      13      -8
xorbuf_aligned_AES_BLOCK_SIZE                         15       -     -15
blockmix                                             814     762     -52
blockmix_salsa8                                      317     198    -119
blockmix_xor_save                                   1620    1499    -121
blockmix_xor                                        1543    1322    -221
------------------------------------------------------------------------------
(add/remove: 4/1 grow/shrink: 3/5 up/down: 203/-536)         Total: -333 bytes

32-bit:
function                                             old     new   delta
xorbuf_3                                               -      76     +76
xorbuf64_3_aligned64                                   -      36     +36
xorbuf16_aligned_long                                  -      23     +23
xwrite_encrypted                                     499     507      +8
tls_xread_record                                     646     650      +4
xorbuf                                                22      11     -11
xorbuf_aligned_AES_BLOCK_SIZE                         23       -     -23
blockmix                                            1083     938    -145
blockmix_salsa8                                      415     210    -205
blockmix_salsa8_xor                                  601     163    -438
blockmix_xor                                        2103    1533    -570
blockmix_xor_save                                   2614    1859    -755
------------------------------------------------------------------------------
(add/remove: 4/1 grow/shrink: 2/6 up/down: 147/-2147)       Total: -2000 bytes

Signed-off-by: Denys Vlasenko <vda.linux at googlemail.com>
---
 include/libbb.h                   |  10 ++++
 libbb/bitops.c                    | 108 ++++++++++++++++++++++++++++++++++++++
 libbb/yescrypt/alg-sha256.c       |   1 +
 libbb/yescrypt/alg-yescrypt-kdf.c |   7 +++
 networking/tls.c                  |  39 +++-----------
 networking/tls.h                  |   5 +-
 networking/tls_aesgcm.c           |   5 +-
 7 files changed, 137 insertions(+), 38 deletions(-)

diff --git a/include/libbb.h b/include/libbb.h
index 544ca3155..79427fb31 100644
--- a/include/libbb.h
+++ b/include/libbb.h
@@ -1113,6 +1113,16 @@ char *bin2hex(char *dst, const char *src, int count) FAST_FUNC;
 /* Reverse */
 char* hex2bin(char *dst, const char *src, int count) FAST_FUNC;
 
+void FAST_FUNC xorbuf_3(void *dst, const void *src1, const void *src2, unsigned count); /* dst = src1 ^ src2, count bytes */
+void FAST_FUNC xorbuf(void* buf, const void* mask, unsigned count); /* buf ^= mask, count bytes */
+void FAST_FUNC xorbuf16_aligned_long(void* buf, const void* mask); /* buf ^= mask, fixed 16 bytes */
+void FAST_FUNC xorbuf64_3_aligned64(void *dst, const void *src1, const void *src2); /* dst = src1 ^ src2, fixed 64 bytes */
+#if BB_UNALIGNED_MEMACCESS_OK
+# define xorbuf16(buf,mask) xorbuf16_aligned_long(buf,mask) /* no alignment test in this configuration */
+#else
+void FAST_FUNC xorbuf16(void* buf, const void* mask); /* buf ^= mask, fixed 16 bytes; out-of-line, tests pointer alignment first */
+#endif
+
 /* Generate a UUID */
 void generate_uuid(uint8_t *buf) FAST_FUNC;
 
diff --git a/libbb/bitops.c b/libbb/bitops.c
new file mode 100644
index 000000000..5f239676c
--- /dev/null
+++ b/libbb/bitops.c
@@ -0,0 +1,108 @@
+/*
+ * Utility routines.
+ *
+ * Copyright (C) 2025 by Denys Vlasenko <vda.linux at googlemail.com>
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+//kbuild:lib-y += bitops.o
+
+#include "libbb.h"
+
+void FAST_FUNC xorbuf_3(void *dst, const void *src1, const void *src2, unsigned count) /* dst[i] = src1[i] ^ src2[i] for i in [0, count) */
+{
+	uint8_t *d = dst;
+	const uint8_t *s1 = src1;
+	const uint8_t *s2 = src2;
+#if BB_UNALIGNED_MEMACCESS_OK /* fast path: pointers are dereferenced as long with no alignment check */
+	while (count >= sizeof(long)) { /* one long-sized chunk per iteration */
+		*(long*)d = *(long*)s1 ^ *(long*)s2; /* both sources are loaded before the store; dst may equal src1 (xorbuf relies on this) */
+		count -= sizeof(long);
+		d += sizeof(long);
+		s1 += sizeof(long);
+		s2 += sizeof(long);
+	}
+#endif
+	while (count--) /* leftover tail - or the whole buffer when the fast path is compiled out */
+		*d++ = *s1++ ^ *s2++;
+}
+
+void FAST_FUNC xorbuf(void *dst, const void *src, unsigned count) /* in-place variant: dst[i] ^= src[i] for count bytes */
+{
+	xorbuf_3(dst, dst, src, count); /* dst doubles as the first source operand */
+}
+
+void FAST_FUNC xorbuf16_aligned_long(void *dst, const void *src) /* XOR the 16 bytes at src into the 16 bytes at dst */
+{
+#if defined(__SSE__) /* any x86_64 has it */
+	asm volatile( /* load both 16-byte blocks with unaligned moves, XOR in registers, store the result back to dst */
+"\n		movups	(%0),%%xmm0"
+"\n		movups	(%1),%%xmm1"   // can't just xorps(%1),%%xmm0:
+"\n		xorps	%%xmm1,%%xmm0" // SSE requires 16-byte alignment
+"\n		movups	%%xmm0,(%0)"
+"\n"
+		: "=r" (dst), "=r" (src)
+		: "0" (dst), "1" (src) /* inputs tied to the same registers as outputs %0 and %1 */
+		: "xmm0", "xmm1", "memory" /* both scratch registers, plus memory because the asm stores through %0 */
+	);
+#else
+	unsigned long *d = dst; /* portable path: word-sized XORs; NOTE(review): presumably callers guarantee long alignment, as the name says - confirm */
+	const unsigned long *s = src;
+	d[0] ^= s[0];
+# if LONG_MAX <= 0x7fffffffffffffff /* long is at most 64 bits wide: a second word is needed */
+	d[1] ^= s[1];
+#  if LONG_MAX == 0x7fffffff /* long is 32 bits wide: four words make up the 16 bytes */
+	d[2] ^= s[2];
+	d[3] ^= s[3];
+#  endif
+# endif
+#endif
+}
+
+void FAST_FUNC xorbuf64_3_aligned64(void *dst, const void *src1, const void *src2) /* 64-byte block: dst = src1 ^ src2 */
+{
+#if defined(__SSE__) /* any x86_64 has it */
+	asm volatile( /* four rounds, 16 bytes each: load from both sources, XOR, store to dst at the same offset */
+"\n		movups	0*16(%1),%%xmm0"
+"\n		movups	0*16(%2),%%xmm1" // can't just xorps(%2),%%xmm0:
+"\n		xorps	%%xmm1,%%xmm0"   // SSE requires 16-byte alignment, we have only 8-byte
+"\n		movups	%%xmm0,0*16(%0)"
+"\n		movups	1*16(%1),%%xmm0"
+"\n		movups	1*16(%2),%%xmm1"
+"\n		xorps	%%xmm1,%%xmm0"
+"\n		movups	%%xmm0,1*16(%0)"
+"\n		movups	2*16(%1),%%xmm0"
+"\n		movups	2*16(%2),%%xmm1"
+"\n		xorps	%%xmm1,%%xmm0"
+"\n		movups	%%xmm0,2*16(%0)"
+"\n		movups	3*16(%1),%%xmm0"
+"\n		movups	3*16(%2),%%xmm1"
+"\n		xorps	%%xmm1,%%xmm0"
+"\n		movups	%%xmm0,3*16(%0)"
+"\n"
+		: "=r" (dst), "=r" (src1), "=r" (src2)
+		: "0" (dst), "1" (src1), "2" (src2) /* inputs tied to the same registers as outputs %0, %1 and %2 */
+		: "xmm0", "xmm1", "memory" /* both scratch registers, plus memory because the asm stores through %0 */
+	);
+#else
+	long *d = dst; /* portable path: one long per loop iteration */
+	const long *s1 = src1;
+	const long *s2 = src2;
+	unsigned count = 64 / sizeof(long); /* number of longs that make up 64 bytes */
+	do {
+		*d++ = *s1++ ^ *s2++;
+	} while (--count != 0);
+#endif
+}
+
+#if !BB_UNALIGNED_MEMACCESS_OK /* in the other configuration libbb.h defines xorbuf16 as a macro for xorbuf16_aligned_long */
+void FAST_FUNC xorbuf16(void *dst, const void *src) /* XOR 16 bytes of src into dst; pointers may have any alignment */
+{
+#define p_aligned(a) (((uintptr_t)(a) & (sizeof(long)-1)) == 0) /* nonzero when the low address bits below sizeof(long) are all clear */
+	if (p_aligned(src) && p_aligned(dst)) { /* both pointers pass the test: use the word-wise routine */
+		xorbuf16_aligned_long(dst, src);
+		return;
+	}
+	xorbuf_3(dst, dst, src, 16); /* otherwise fall back to the generic in-place XOR */
+}
+#endif
diff --git a/libbb/yescrypt/alg-sha256.c b/libbb/yescrypt/alg-sha256.c
index 25446406b..20e8d1ee4 100644
--- a/libbb/yescrypt/alg-sha256.c
+++ b/libbb/yescrypt/alg-sha256.c
@@ -72,6 +72,7 @@ PBKDF2_SHA256(const uint8_t *passwd, size_t passwdlen,
 				/* ... xor U_j ... */
 				for (k = 0; k < 32 / 8; k++)
 					T[k] ^= U[k];
+				//TODO: xorbuf32_aligned_long(T, U);
 			}
 		}
 
diff --git a/libbb/yescrypt/alg-yescrypt-kdf.c b/libbb/yescrypt/alg-yescrypt-kdf.c
index f421db111..112862ec9 100644
--- a/libbb/yescrypt/alg-yescrypt-kdf.c
+++ b/libbb/yescrypt/alg-yescrypt-kdf.c
@@ -180,6 +180,7 @@ static void salsa20(salsa20_blk_t *restrict B,
 #define SALSA20_2(out) \
 	salsa20(&X, &out, 1)
 
+#if 0
 #define XOR(out, in1, in2) \
 do { \
 	(out).d[0] = (in1).d[0] ^ (in2).d[0]; \
@@ -191,6 +192,12 @@ do { \
 	(out).d[6] = (in1).d[6] ^ (in2).d[6]; \
 	(out).d[7] = (in1).d[7] ^ (in2).d[7]; \
 } while (0)
+#else
+#define XOR(out, in1, in2) \
+do { \
+	xorbuf64_3_aligned64(&(out).d, &(in1).d, &(in2).d); \
+} while (0)
+#endif
 
 #define XOR_X(in)         XOR(X, X, in)
 #define XOR_X_2(in1, in2) XOR(X, in1, in2)
diff --git a/networking/tls.c b/networking/tls.c
index 098cf7cac..ac6f0767f 100644
--- a/networking/tls.c
+++ b/networking/tls.c
@@ -333,34 +333,6 @@ void FAST_FUNC tls_get_random(void *buf, unsigned len)
 		xfunc_die();
 }
 
-static void xorbuf3(void *dst, const void *src1, const void *src2, unsigned count)
-{
-	uint8_t *d = dst;
-	const uint8_t *s1 = src1;
-	const uint8_t* s2 = src2;
-	while (count--)
-		*d++ = *s1++ ^ *s2++;
-}
-
-void FAST_FUNC xorbuf(void *dst, const void *src, unsigned count)
-{
-	xorbuf3(dst, dst, src, count);
-}
-
-void FAST_FUNC xorbuf_aligned_AES_BLOCK_SIZE(void *dst, const void *src)
-{
-	unsigned long *d = dst;
-	const unsigned long *s = src;
-	d[0] ^= s[0];
-#if ULONG_MAX <= 0xffffffffffffffff
-	d[1] ^= s[1];
- #if ULONG_MAX == 0xffffffff
-	d[2] ^= s[2];
-	d[3] ^= s[3];
- #endif
-#endif
-}
-
 #if !TLS_DEBUG_HASH
 # define hash_handshake(tls, fmt, buffer, len) \
          hash_handshake(tls, buffer, len)
@@ -764,8 +736,13 @@ static void xwrite_encrypted_aesgcm(tls_state_t *tls, unsigned size, unsigned ty
 		cnt++;
 		COUNTER(nonce) = htonl(cnt); /* yes, first cnt here is 2 (!) */
 		aes_encrypt_one_block(&tls->aes_encrypt, nonce, scratch);
-		n = remaining > AES_BLOCK_SIZE ? AES_BLOCK_SIZE : remaining;
-		xorbuf(buf, scratch, n);
+		if (remaining >= AES_BLOCK_SIZE) {
+			n = AES_BLOCK_SIZE;
+			xorbuf_AES_BLOCK_SIZE(buf, scratch);
+		} else {
+			n = remaining;
+			xorbuf(buf, scratch, n);
+		}
 		buf += n;
 		remaining -= n;
 	}
@@ -923,7 +900,7 @@ static void tls_aesgcm_decrypt(tls_state_t *tls, uint8_t *buf, int size)
 		COUNTER(nonce) = htonl(cnt); /* yes, first cnt here is 2 (!) */
 		aes_encrypt_one_block(&tls->aes_decrypt, nonce, scratch);
 		n = remaining > AES_BLOCK_SIZE ? AES_BLOCK_SIZE : remaining;
-		xorbuf3(buf, scratch, buf + 8, n);
+		xorbuf_3(buf, scratch, buf + 8, n);
 		buf += n;
 		remaining -= n;
 	}
diff --git a/networking/tls.h b/networking/tls.h
index 0173b87b2..9751d30ff 100644
--- a/networking/tls.h
+++ b/networking/tls.h
@@ -82,10 +82,9 @@ typedef  int16_t  int16;
 
 void tls_get_random(void *buf, unsigned len) FAST_FUNC;
 
-void xorbuf(void* buf, const void* mask, unsigned count) FAST_FUNC;
-
 #define ALIGNED_long ALIGNED(sizeof(long))
-void xorbuf_aligned_AES_BLOCK_SIZE(void* buf, const void* mask) FAST_FUNC;
+#define xorbuf_aligned_AES_BLOCK_SIZE(dst,src) xorbuf16_aligned_long(dst,src)
+#define xorbuf_AES_BLOCK_SIZE(dst,src)         xorbuf16(dst,src)
 
 #define matrixCryptoGetPrngData(buf, len, userPtr) (tls_get_random(buf, len), PS_SUCCESS)
 
diff --git a/networking/tls_aesgcm.c b/networking/tls_aesgcm.c
index 5ddcdd2ad..9c2381a57 100644
--- a/networking/tls_aesgcm.c
+++ b/networking/tls_aesgcm.c
@@ -167,10 +167,7 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
         blocks = cSz / AES_BLOCK_SIZE;
         partial = cSz % AES_BLOCK_SIZE;
         while (blocks--) {
-            if (BB_UNALIGNED_MEMACCESS_OK) // c is not guaranteed to be aligned
-                xorbuf_aligned_AES_BLOCK_SIZE(x, c);
-            else
-                xorbuf(x, c, AES_BLOCK_SIZE);
+            xorbuf_AES_BLOCK_SIZE(x, c);
             GMULT(x, h);
             c += AES_BLOCK_SIZE;
         }


More information about the busybox-cvs mailing list