[git commit] libbb: introduce and use block-XOR functions
Denys Vlasenko
vda.linux at googlemail.com
Wed Jul 9 05:00:59 UTC 2025
commit: https://git.busybox.net/busybox/commit/?id=c305c81c94a086fb09444b1ea6f31fb911c25ec0
branch: https://git.busybox.net/busybox/commit/?id=refs/heads/master
On x86_64, they can be done in 16-byte blocks
64-bit:
function old new delta
xorbuf_3 - 84 +84
xorbuf64_3_aligned64 - 58 +58
smix1 687 712 +25
xwrite_encrypted 520 534 +14
xorbuf16_aligned_long - 13 +13
tls_xread_record 733 742 +9
xorbuf 21 13 -8
xorbuf_aligned_AES_BLOCK_SIZE 15 - -15
blockmix 814 762 -52
blockmix_salsa8 317 198 -119
blockmix_xor_save 1620 1499 -121
blockmix_xor 1543 1322 -221
------------------------------------------------------------------------------
(add/remove: 4/1 grow/shrink: 3/5 up/down: 203/-536) Total: -333 bytes
32-bit:
function old new delta
xorbuf_3 - 76 +76
xorbuf64_3_aligned64 - 36 +36
xorbuf16_aligned_long - 23 +23
xwrite_encrypted 499 507 +8
tls_xread_record 646 650 +4
xorbuf 22 11 -11
xorbuf_aligned_AES_BLOCK_SIZE 23 - -23
blockmix 1083 938 -145
blockmix_salsa8 415 210 -205
blockmix_salsa8_xor 601 163 -438
blockmix_xor 2103 1533 -570
blockmix_xor_save 2614 1859 -755
------------------------------------------------------------------------------
(add/remove: 4/1 grow/shrink: 2/6 up/down: 147/-2147) Total: -2000 bytes
Signed-off-by: Denys Vlasenko <vda.linux at googlemail.com>
---
include/libbb.h | 10 ++++
libbb/bitops.c | 108 ++++++++++++++++++++++++++++++++++++++
libbb/yescrypt/alg-sha256.c | 1 +
libbb/yescrypt/alg-yescrypt-kdf.c | 7 +++
networking/tls.c | 39 +++-----------
networking/tls.h | 5 +-
networking/tls_aesgcm.c | 5 +-
7 files changed, 137 insertions(+), 38 deletions(-)
diff --git a/include/libbb.h b/include/libbb.h
index 544ca3155..79427fb31 100644
--- a/include/libbb.h
+++ b/include/libbb.h
@@ -1113,6 +1113,16 @@ char *bin2hex(char *dst, const char *src, int count) FAST_FUNC;
/* Reverse */
char* hex2bin(char *dst, const char *src, int count) FAST_FUNC;
+void FAST_FUNC xorbuf_3(void *dst, const void *src1, const void *src2, unsigned count);
+void FAST_FUNC xorbuf(void* buf, const void* mask, unsigned count);
+void FAST_FUNC xorbuf16_aligned_long(void* buf, const void* mask);
+void FAST_FUNC xorbuf64_3_aligned64(void *dst, const void *src1, const void *src2);
+#if BB_UNALIGNED_MEMACCESS_OK
+# define xorbuf16(buf,mask) xorbuf16_aligned_long(buf,mask)
+#else
+void FAST_FUNC xorbuf16(void* buf, const void* mask);
+#endif
+
/* Generate a UUID */
void generate_uuid(uint8_t *buf) FAST_FUNC;
diff --git a/libbb/bitops.c b/libbb/bitops.c
new file mode 100644
index 000000000..5f239676c
--- /dev/null
+++ b/libbb/bitops.c
@@ -0,0 +1,108 @@
+/*
+ * Utility routines.
+ *
+ * Copyright (C) 2025 by Denys Vlasenko <vda.linux at googlemail.com>
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+//kbuild:lib-y += bitops.o
+
+#include "libbb.h"
+
/* FAST_FUNC is provided by libbb.h in-tree; make the file self-contained otherwise */
#ifndef FAST_FUNC
# define FAST_FUNC
#endif

/*
 * Three-operand XOR: dst[i] = src1[i] ^ src2[i] for i in [0, count).
 * dst may alias src1 and/or src2 (xorbuf() relies on dst == src1).
 * On targets where unaligned loads are cheap, works a long at a time,
 * then finishes the tail bytewise.
 */
void FAST_FUNC xorbuf_3(void *dst, const void *src1, const void *src2, unsigned count)
{
	uint8_t *d = dst;
	const uint8_t *s1 = src1;
	const uint8_t *s2 = src2;
#if BB_UNALIGNED_MEMACCESS_OK
	/* Word-at-a-time main loop; count keeps the number of bytes left */
	for (; count >= sizeof(long); count -= sizeof(long)) {
		*(long*)d = *(long*)s1 ^ *(long*)s2;
		d += sizeof(long);
		s1 += sizeof(long);
		s2 += sizeof(long);
	}
#endif
	/* Bytewise tail (or the whole buffer when unaligned access is not ok) */
	while (count != 0) {
		count--;
		*d++ = *s1++ ^ *s2++;
	}
}
+
/* FAST_FUNC is provided by libbb.h in-tree; make the file self-contained otherwise */
#ifndef FAST_FUNC
# define FAST_FUNC
#endif

/*
 * In-place XOR: dst[i] ^= src[i] for i in [0, count).
 * Equivalent to xorbuf_3(dst, dst, src, count), written out inline.
 */
void FAST_FUNC xorbuf(void *dst, const void *src, unsigned count)
{
	uint8_t *d = dst;
	const uint8_t *s = src;
#if BB_UNALIGNED_MEMACCESS_OK
	/* Long-at-a-time while a full word of input remains */
	while (count >= sizeof(long)) {
		*(long*)d ^= *(long*)s;
		d += sizeof(long);
		s += sizeof(long);
		count -= sizeof(long);
	}
#endif
	while (count != 0) {
		*d++ ^= *s++;
		count--;
	}
}
+
+void FAST_FUNC xorbuf16_aligned_long(void *dst, const void *src)
+{
+#if defined(__SSE__) /* any x86_64 has it */
+ asm volatile(
+"\n movups (%0),%%xmm0"
+"\n movups (%1),%%xmm1" // can't just xorps(%1),%%xmm0:
+"\n xorps %%xmm1,%%xmm0" // SSE requires 16-byte alignment
+"\n movups %%xmm0,(%0)"
+"\n"
+ : "=r" (dst), "=r" (src)
+ : "0" (dst), "1" (src)
+ : "xmm0", "xmm1", "memory"
+ );
+#else
+ unsigned long *d = dst;
+ const unsigned long *s = src;
+ d[0] ^= s[0];
+# if LONG_MAX <= 0x7fffffffffffffff
+ d[1] ^= s[1];
+# if LONG_MAX == 0x7fffffff
+ d[2] ^= s[2];
+ d[3] ^= s[3];
+# endif
+# endif
+#endif
+}
+
+void FAST_FUNC xorbuf64_3_aligned64(void *dst, const void *src1, const void *src2)
+{
+#if defined(__SSE__) /* any x86_64 has it */
+ asm volatile(
+"\n movups 0*16(%1),%%xmm0"
+"\n movups 0*16(%2),%%xmm1" // can't just xorps(%2),%%xmm0:
+"\n xorps %%xmm1,%%xmm0" // SSE requires 16-byte alignment, we have only 8-byte
+"\n movups %%xmm0,0*16(%0)"
+"\n movups 1*16(%1),%%xmm0"
+"\n movups 1*16(%2),%%xmm1"
+"\n xorps %%xmm1,%%xmm0"
+"\n movups %%xmm0,1*16(%0)"
+"\n movups 2*16(%1),%%xmm0"
+"\n movups 2*16(%2),%%xmm1"
+"\n xorps %%xmm1,%%xmm0"
+"\n movups %%xmm0,2*16(%0)"
+"\n movups 3*16(%1),%%xmm0"
+"\n movups 3*16(%2),%%xmm1"
+"\n xorps %%xmm1,%%xmm0"
+"\n movups %%xmm0,3*16(%0)"
+"\n"
+ : "=r" (dst), "=r" (src1), "=r" (src2)
+ : "0" (dst), "1" (src1), "2" (src2)
+ : "xmm0", "xmm1", "memory"
+ );
+#else
+ long *d = dst;
+ const long *s1 = src1;
+ const long *s2 = src2;
+ unsigned count = 64 / sizeof(long);
+ do {
+ *d++ = *s1++ ^ *s2++;
+ } while (--count != 0);
+#endif
+}
+
#if !BB_UNALIGNED_MEMACCESS_OK
/* Prototypes normally come from libbb.h; repeated here for standalone builds */
#ifndef FAST_FUNC
# define FAST_FUNC
#endif
void FAST_FUNC xorbuf16_aligned_long(void *dst, const void *src);
void FAST_FUNC xorbuf_3(void *dst, const void *src1, const void *src2, unsigned count);

/*
 * XOR one 16-byte block with no alignment guarantee on either pointer.
 * Dispatches to the word-wide routine when both pointers happen to be
 * long-aligned, otherwise falls back to the bytewise path.
 */
void FAST_FUNC xorbuf16(void *dst, const void *src)
{
	/* OR the two addresses: any misaligned bit in either shows up here */
	if ((((uintptr_t)dst | (uintptr_t)src) & (sizeof(long) - 1)) == 0) {
		xorbuf16_aligned_long(dst, src);
		return;
	}
	xorbuf_3(dst, dst, src, 16);
}
#endif
diff --git a/libbb/yescrypt/alg-sha256.c b/libbb/yescrypt/alg-sha256.c
index 25446406b..20e8d1ee4 100644
--- a/libbb/yescrypt/alg-sha256.c
+++ b/libbb/yescrypt/alg-sha256.c
@@ -72,6 +72,7 @@ PBKDF2_SHA256(const uint8_t *passwd, size_t passwdlen,
/* ... xor U_j ... */
for (k = 0; k < 32 / 8; k++)
T[k] ^= U[k];
+ //TODO: xorbuf32_aligned_long(T, U);
}
}
diff --git a/libbb/yescrypt/alg-yescrypt-kdf.c b/libbb/yescrypt/alg-yescrypt-kdf.c
index f421db111..112862ec9 100644
--- a/libbb/yescrypt/alg-yescrypt-kdf.c
+++ b/libbb/yescrypt/alg-yescrypt-kdf.c
@@ -180,6 +180,7 @@ static void salsa20(salsa20_blk_t *restrict B,
#define SALSA20_2(out) \
salsa20(&X, &out, 1)
+#if 0
#define XOR(out, in1, in2) \
do { \
(out).d[0] = (in1).d[0] ^ (in2).d[0]; \
@@ -191,6 +192,12 @@ do { \
(out).d[6] = (in1).d[6] ^ (in2).d[6]; \
(out).d[7] = (in1).d[7] ^ (in2).d[7]; \
} while (0)
+#else
+#define XOR(out, in1, in2) \
+do { \
+ xorbuf64_3_aligned64(&(out).d, &(in1).d, &(in2).d); \
+} while (0)
+#endif
#define XOR_X(in) XOR(X, X, in)
#define XOR_X_2(in1, in2) XOR(X, in1, in2)
diff --git a/networking/tls.c b/networking/tls.c
index 098cf7cac..ac6f0767f 100644
--- a/networking/tls.c
+++ b/networking/tls.c
@@ -333,34 +333,6 @@ void FAST_FUNC tls_get_random(void *buf, unsigned len)
xfunc_die();
}
-static void xorbuf3(void *dst, const void *src1, const void *src2, unsigned count)
-{
- uint8_t *d = dst;
- const uint8_t *s1 = src1;
- const uint8_t* s2 = src2;
- while (count--)
- *d++ = *s1++ ^ *s2++;
-}
-
-void FAST_FUNC xorbuf(void *dst, const void *src, unsigned count)
-{
- xorbuf3(dst, dst, src, count);
-}
-
-void FAST_FUNC xorbuf_aligned_AES_BLOCK_SIZE(void *dst, const void *src)
-{
- unsigned long *d = dst;
- const unsigned long *s = src;
- d[0] ^= s[0];
-#if ULONG_MAX <= 0xffffffffffffffff
- d[1] ^= s[1];
- #if ULONG_MAX == 0xffffffff
- d[2] ^= s[2];
- d[3] ^= s[3];
- #endif
-#endif
-}
-
#if !TLS_DEBUG_HASH
# define hash_handshake(tls, fmt, buffer, len) \
hash_handshake(tls, buffer, len)
@@ -764,8 +736,13 @@ static void xwrite_encrypted_aesgcm(tls_state_t *tls, unsigned size, unsigned ty
cnt++;
COUNTER(nonce) = htonl(cnt); /* yes, first cnt here is 2 (!) */
aes_encrypt_one_block(&tls->aes_encrypt, nonce, scratch);
- n = remaining > AES_BLOCK_SIZE ? AES_BLOCK_SIZE : remaining;
- xorbuf(buf, scratch, n);
+ if (remaining >= AES_BLOCK_SIZE) {
+ n = AES_BLOCK_SIZE;
+ xorbuf_AES_BLOCK_SIZE(buf, scratch);
+ } else {
+ n = remaining;
+ xorbuf(buf, scratch, n);
+ }
buf += n;
remaining -= n;
}
@@ -923,7 +900,7 @@ static void tls_aesgcm_decrypt(tls_state_t *tls, uint8_t *buf, int size)
COUNTER(nonce) = htonl(cnt); /* yes, first cnt here is 2 (!) */
aes_encrypt_one_block(&tls->aes_decrypt, nonce, scratch);
n = remaining > AES_BLOCK_SIZE ? AES_BLOCK_SIZE : remaining;
- xorbuf3(buf, scratch, buf + 8, n);
+ xorbuf_3(buf, scratch, buf + 8, n);
buf += n;
remaining -= n;
}
diff --git a/networking/tls.h b/networking/tls.h
index 0173b87b2..9751d30ff 100644
--- a/networking/tls.h
+++ b/networking/tls.h
@@ -82,10 +82,9 @@ typedef int16_t int16;
void tls_get_random(void *buf, unsigned len) FAST_FUNC;
-void xorbuf(void* buf, const void* mask, unsigned count) FAST_FUNC;
-
#define ALIGNED_long ALIGNED(sizeof(long))
-void xorbuf_aligned_AES_BLOCK_SIZE(void* buf, const void* mask) FAST_FUNC;
+#define xorbuf_aligned_AES_BLOCK_SIZE(dst,src) xorbuf16_aligned_long(dst,src)
+#define xorbuf_AES_BLOCK_SIZE(dst,src) xorbuf16(dst,src)
#define matrixCryptoGetPrngData(buf, len, userPtr) (tls_get_random(buf, len), PS_SUCCESS)
diff --git a/networking/tls_aesgcm.c b/networking/tls_aesgcm.c
index 5ddcdd2ad..9c2381a57 100644
--- a/networking/tls_aesgcm.c
+++ b/networking/tls_aesgcm.c
@@ -167,10 +167,7 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
blocks = cSz / AES_BLOCK_SIZE;
partial = cSz % AES_BLOCK_SIZE;
while (blocks--) {
- if (BB_UNALIGNED_MEMACCESS_OK) // c is not guaranteed to be aligned
- xorbuf_aligned_AES_BLOCK_SIZE(x, c);
- else
- xorbuf(x, c, AES_BLOCK_SIZE);
+ xorbuf_AES_BLOCK_SIZE(x, c);
GMULT(x, h);
c += AES_BLOCK_SIZE;
}
More information about the busybox-cvs mailing list