[git commit] libbb/yescrypt: disable unrolling in two places
Denys Vlasenko
vda.linux at googlemail.com
Mon Jul 7 20:34:31 UTC 2025
commit: https://git.busybox.net/busybox/commit/?id=f8e9bd30d73f2acf6818da71a2ba44748151b716
branch: https://git.busybox.net/busybox/commit/?id=refs/heads/master
Also, make many define macros safer
function old new delta
blockmix 2300 814 -1486
blockmix_xor 4606 1543 -3063
blockmix_xor_save 4737 1620 -3117
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-7666) Total: -7666 bytes
Signed-off-by: Denys Vlasenko <vda.linux at googlemail.com>
---
libbb/yescrypt/alg-yescrypt-kdf.c | 255 ++++++++++++++++++++++++--------------
1 file changed, 159 insertions(+), 96 deletions(-)
diff --git a/libbb/yescrypt/alg-yescrypt-kdf.c b/libbb/yescrypt/alg-yescrypt-kdf.c
index d24b05150..ab095eae1 100644
--- a/libbb/yescrypt/alg-yescrypt-kdf.c
+++ b/libbb/yescrypt/alg-yescrypt-kdf.c
@@ -42,6 +42,15 @@
#define unlikely(exp) (exp)
#endif
+// Not a size win if 0
+#define UNROLL_COPY 1
+
+// -5324 bytes if 0:
+#define UNROLL_PWXFORM_ROUND 0
+// -4864 bytes if 0:
+#define UNROLL_PWXFORM 0
+// both 0: -7666 bytes
+
typedef union {
uint32_t w[16];
uint64_t d[8];
@@ -52,15 +61,17 @@ static void salsa20_simd_shuffle(
salsa20_blk_t *Bout)
{
#define COMBINE(out, in1, in2) \
- Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32);
- COMBINE(0, 0, 2)
- COMBINE(1, 5, 7)
- COMBINE(2, 2, 4)
- COMBINE(3, 7, 1)
- COMBINE(4, 4, 6)
- COMBINE(5, 1, 3)
- COMBINE(6, 6, 0)
- COMBINE(7, 3, 5)
+do { \
+ Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); \
+} while (0)
+ COMBINE(0, 0, 2);
+ COMBINE(1, 5, 7);
+ COMBINE(2, 2, 4);
+ COMBINE(3, 7, 1);
+ COMBINE(4, 4, 6);
+ COMBINE(5, 1, 3);
+ COMBINE(6, 6, 0);
+ COMBINE(7, 3, 5);
#undef COMBINE
}
@@ -69,25 +80,29 @@ static void salsa20_simd_unshuffle(
salsa20_blk_t *Bout)
{
#define UNCOMBINE(out, in1, in2) \
+do { \
Bout->w[out * 2] = Bin->d[in1]; \
- Bout->w[out * 2 + 1] = Bin->d[in2] >> 32;
- UNCOMBINE(0, 0, 6)
- UNCOMBINE(1, 5, 3)
- UNCOMBINE(2, 2, 0)
- UNCOMBINE(3, 7, 5)
- UNCOMBINE(4, 4, 2)
- UNCOMBINE(5, 1, 7)
- UNCOMBINE(6, 6, 4)
- UNCOMBINE(7, 3, 1)
+ Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; \
+} while (0)
+ UNCOMBINE(0, 0, 6);
+ UNCOMBINE(1, 5, 3);
+ UNCOMBINE(2, 2, 0);
+ UNCOMBINE(3, 7, 5);
+ UNCOMBINE(4, 4, 2);
+ UNCOMBINE(5, 1, 7);
+ UNCOMBINE(6, 6, 4);
+ UNCOMBINE(7, 3, 1);
#undef UNCOMBINE
}
#define DECL_X \
- salsa20_blk_t X;
+ salsa20_blk_t X
#define DECL_Y \
- salsa20_blk_t Y;
+ salsa20_blk_t Y
+#if UNROLL_COPY
#define COPY(out, in) \
+do { \
(out).d[0] = (in).d[0]; \
(out).d[1] = (in).d[1]; \
(out).d[2] = (in).d[2]; \
@@ -95,9 +110,17 @@ static void salsa20_simd_unshuffle(
(out).d[4] = (in).d[4]; \
(out).d[5] = (in).d[5]; \
(out).d[6] = (in).d[6]; \
- (out).d[7] = (in).d[7];
+ (out).d[7] = (in).d[7]; \
+} while (0)
+#else
+#define COPY(out, in) \
+do { \
+ for (int copyi=0; copyi<8; copyi++) \
+ (out).d[copyi] = (in).d[copyi]; \
+} while (0)
+#endif
-#define READ_X(in) COPY(X, in)
+#define READ_X(in) COPY(X, in)
#define WRITE_X(out) COPY(out, X)
/**
@@ -154,7 +177,6 @@ static void salsa20(salsa20_blk_t *restrict B,
B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3];
}
}
-
#if 0
/* Too expensive */
explicit_bzero(&X, sizeof(X));
@@ -165,9 +187,10 @@ static void salsa20(salsa20_blk_t *restrict B,
* Apply the Salsa20/2 core to the block provided in X.
*/
#define SALSA20_2(out) \
- salsa20(&X, &out, 1);
+ salsa20(&X, &out, 1)
#define XOR(out, in1, in2) \
+do { \
(out).d[0] = (in1).d[0] ^ (in2).d[0]; \
(out).d[1] = (in1).d[1] ^ (in2).d[1]; \
(out).d[2] = (in1).d[2] ^ (in2).d[2]; \
@@ -175,23 +198,28 @@ static void salsa20(salsa20_blk_t *restrict B,
(out).d[4] = (in1).d[4] ^ (in2).d[4]; \
(out).d[5] = (in1).d[5] ^ (in2).d[5]; \
(out).d[6] = (in1).d[6] ^ (in2).d[6]; \
- (out).d[7] = (in1).d[7] ^ (in2).d[7];
+ (out).d[7] = (in1).d[7] ^ (in2).d[7]; \
+} while (0)
-#define XOR_X(in) XOR(X, X, in)
+#define XOR_X(in) XOR(X, X, in)
#define XOR_X_2(in1, in2) XOR(X, in1, in2)
#define XOR_X_WRITE_XOR_Y_2(out, in) \
- XOR(Y, out, in) \
- COPY(out, Y) \
- XOR(X, X, Y)
+do { \
+ XOR(Y, out, in); \
+ COPY(out, Y); \
+ XOR(X, X, Y); \
+} while (0)
/**
* Apply the Salsa20/8 core to the block provided in X ^ in.
*/
#define SALSA20_8_XOR_MEM(in, out) \
+do { \
XOR_X(in); \
- salsa20(&X, &out, 4);
+ salsa20(&X, &out, 4); \
+} while (0)
-#define INTEGERIFY (uint32_t)X.d[0]
+#define INTEGERIFY ((uint32_t)X.d[0])
/**
* blockmix_salsa8(Bin, Bout, r):
@@ -204,12 +232,12 @@ static void blockmix_salsa8(
size_t r)
{
size_t i;
- DECL_X
+ DECL_X;
- READ_X(Bin[r * 2 - 1])
+ READ_X(Bin[r * 2 - 1]);
for (i = 0; i < r; i++) {
- SALSA20_8_XOR_MEM(Bin[i * 2], Bout[i])
- SALSA20_8_XOR_MEM(Bin[i * 2 + 1], Bout[r + i])
+ SALSA20_8_XOR_MEM(Bin[i * 2], Bout[i]);
+ SALSA20_8_XOR_MEM(Bin[i * 2 + 1], Bout[r + i]);
}
}
@@ -220,14 +248,14 @@ static uint32_t blockmix_salsa8_xor(
size_t r)
{
size_t i;
- DECL_X
+ DECL_X;
- XOR_X_2(Bin1[r * 2 - 1], Bin2[r * 2 - 1])
+ XOR_X_2(Bin1[r * 2 - 1], Bin2[r * 2 - 1]);
for (i = 0; i < r; i++) {
- XOR_X(Bin1[i * 2])
- SALSA20_8_XOR_MEM(Bin2[i * 2], Bout[i])
- XOR_X(Bin1[i * 2 + 1])
- SALSA20_8_XOR_MEM(Bin2[i * 2 + 1], Bout[r + i])
+ XOR_X(Bin1[i * 2]);
+ SALSA20_8_XOR_MEM(Bin2[i * 2], Bout[i]);
+ XOR_X(Bin1[i * 2 + 1]);
+ SALSA20_8_XOR_MEM(Bin2[i * 2 + 1], Bout[r + i]);
}
return INTEGERIFY;
@@ -242,27 +270,38 @@ static uint32_t blockmix_salsa8_xor(
/* Derived values. Not tunable except via Swidth above. */
#define PWXbytes (PWXgather * PWXsimple * 8)
-#define Sbytes (3 * (1 << Swidth) * PWXsimple * 8)
-#define Smask (((1 << Swidth) - 1) * PWXsimple * 8)
-#define Smask2 (((uint64_t)Smask << 32) | Smask)
+#define Sbytes (3 * (1 << Swidth) * PWXsimple * 8)
+#define Smask (((1 << Swidth) - 1) * PWXsimple * 8)
+#define Smask2 (((uint64_t)Smask << 32) | Smask)
-#define DECL_SMASK2REG /* empty */
-#define FORCE_REGALLOC_3 /* empty */
-#define MAYBE_MEMORY_BARRIER /* empty */
+#define DECL_SMASK2REG do {} while (0)
+#define FORCE_REGALLOC_3 do {} while (0)
+#define MAYBE_MEMORY_BARRIER do {} while (0)
-#define PWXFORM_SIMD(x0, x1) { \
+#define PWXFORM_SIMD(x0, x1) \
+do { \
uint64_t x = x0 & Smask2; \
uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \
uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \
x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \
x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \
-}
+} while (0)
+#if UNROLL_PWXFORM_ROUND
+#define PWXFORM_ROUND \
+do { \
+ PWXFORM_SIMD(X.d[0], X.d[1]); \
+ PWXFORM_SIMD(X.d[2], X.d[3]); \
+ PWXFORM_SIMD(X.d[4], X.d[5]); \
+ PWXFORM_SIMD(X.d[6], X.d[7]); \
+} while (0)
+#else
#define PWXFORM_ROUND \
- PWXFORM_SIMD(X.d[0], X.d[1]) \
- PWXFORM_SIMD(X.d[2], X.d[3]) \
- PWXFORM_SIMD(X.d[4], X.d[5]) \
- PWXFORM_SIMD(X.d[6], X.d[7])
+do { \
+ for (int pwxi=0; pwxi<8; pwxi+=2) \
+ PWXFORM_SIMD(X.d[pwxi], X.d[pwxi + 1]); \
+} while (0)
+#endif
/*
* This offset helps address the 256-byte write block via the single-byte
@@ -275,19 +314,23 @@ static uint32_t blockmix_salsa8_xor(
#define PWXFORM_WRITE_OFFSET 0x7c
#define PWXFORM_WRITE \
- WRITE_X(*(salsa20_blk_t *)(Sw - PWXFORM_WRITE_OFFSET)) \
- Sw += 64;
-
-#define PWXFORM { \
+do { \
+ WRITE_X(*(salsa20_blk_t *)(Sw - PWXFORM_WRITE_OFFSET)); \
+ Sw += 64; \
+} while (0)
+
+#if UNROLL_PWXFORM
+#define PWXFORM \
+do { \
uint8_t *Sw = S2 + w + PWXFORM_WRITE_OFFSET; \
- FORCE_REGALLOC_3 \
- MAYBE_MEMORY_BARRIER \
- PWXFORM_ROUND \
- PWXFORM_ROUND PWXFORM_WRITE \
- PWXFORM_ROUND PWXFORM_WRITE \
- PWXFORM_ROUND PWXFORM_WRITE \
- PWXFORM_ROUND PWXFORM_WRITE \
- PWXFORM_ROUND \
+ FORCE_REGALLOC_3; \
+ MAYBE_MEMORY_BARRIER; \
+ PWXFORM_ROUND; \
+ PWXFORM_ROUND; PWXFORM_WRITE; \
+ PWXFORM_ROUND; PWXFORM_WRITE; \
+ PWXFORM_ROUND; PWXFORM_WRITE; \
+ PWXFORM_ROUND; PWXFORM_WRITE; \
+ PWXFORM_ROUND; \
w = (w + 64 * 4) & Smask2; \
{ \
uint8_t *Stmp = S2; \
@@ -295,7 +338,27 @@ static uint32_t blockmix_salsa8_xor(
S1 = S0; \
S0 = Stmp; \
} \
-}
+} while (0)
+#else
+#define PWXFORM \
+do { \
+ uint8_t *Sw = S2 + w + PWXFORM_WRITE_OFFSET; \
+ FORCE_REGALLOC_3; \
+ MAYBE_MEMORY_BARRIER; \
+ PWXFORM_ROUND; \
+ for (int pwxj=0; pwxj<4; pwxj++) {\
+ PWXFORM_ROUND; PWXFORM_WRITE; \
+ } \
+ PWXFORM_ROUND; \
+ w = (w + 64 * 4) & Smask2; \
+ { \
+ uint8_t *Stmp = S2; \
+ S2 = S1; \
+ S1 = S0; \
+ S0 = Stmp; \
+ } \
+} while (0)
+#endif
typedef struct {
uint8_t *S0, *S1, *S2;
@@ -318,29 +381,29 @@ static void blockmix(
uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2;
size_t w = ctx->w;
size_t i;
- DECL_X
+ DECL_X;
/* Convert count of 128-byte blocks to max index of 64-byte block */
r = r * 2 - 1;
- READ_X(Bin[r])
+ READ_X(Bin[r]);
- DECL_SMASK2REG
+ DECL_SMASK2REG;
i = 0;
do {
- XOR_X(Bin[i])
- PWXFORM
+ XOR_X(Bin[i]);
+ PWXFORM;
if (unlikely(i >= r))
break;
- WRITE_X(Bout[i])
+ WRITE_X(Bout[i]);
i++;
} while (1);
ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2;
ctx->w = w;
- SALSA20_2(Bout[i])
+ SALSA20_2(Bout[i]);
}
static uint32_t blockmix_xor(const salsa20_blk_t *Bin1,
@@ -352,31 +415,31 @@ static uint32_t blockmix_xor(const salsa20_blk_t *Bin1,
uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2;
size_t w = ctx->w;
size_t i;
- DECL_X
+ DECL_X;
/* Convert count of 128-byte blocks to max index of 64-byte block */
r = r * 2 - 1;
- XOR_X_2(Bin1[r], Bin2[r])
+ XOR_X_2(Bin1[r], Bin2[r]);
- DECL_SMASK2REG
+ DECL_SMASK2REG;
i = 0;
r--;
do {
- XOR_X(Bin1[i])
- XOR_X(Bin2[i])
- PWXFORM
- WRITE_X(Bout[i])
+ XOR_X(Bin1[i]);
+ XOR_X(Bin2[i]);
+ PWXFORM;
+ WRITE_X(Bout[i]);
- XOR_X(Bin1[i + 1])
- XOR_X(Bin2[i + 1])
- PWXFORM
+ XOR_X(Bin1[i + 1]);
+ XOR_X(Bin2[i + 1]);
+ PWXFORM;
if (unlikely(i >= r))
break;
- WRITE_X(Bout[i + 1])
+ WRITE_X(Bout[i + 1]);
i += 2;
} while (1);
@@ -385,7 +448,7 @@ static uint32_t blockmix_xor(const salsa20_blk_t *Bin1,
ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2;
ctx->w = w;
- SALSA20_2(Bout[i])
+ SALSA20_2(Bout[i]);
return INTEGERIFY;
}
@@ -399,30 +462,30 @@ static uint32_t blockmix_xor_save(
uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2;
size_t w = ctx->w;
size_t i;
- DECL_X
- DECL_Y
+ DECL_X;
+ DECL_Y;
/* Convert count of 128-byte blocks to max index of 64-byte block */
r = r * 2 - 1;
- XOR_X_2(Bin1out[r], Bin2[r])
+ XOR_X_2(Bin1out[r], Bin2[r]);
- DECL_SMASK2REG
+ DECL_SMASK2REG;
i = 0;
r--;
do {
- XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i])
- PWXFORM
- WRITE_X(Bin1out[i])
+ XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i]);
+ PWXFORM;
+ WRITE_X(Bin1out[i]);
- XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1])
- PWXFORM
+ XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1]);
+ PWXFORM;
if (unlikely(i >= r))
break;
- WRITE_X(Bin1out[i + 1])
+ WRITE_X(Bin1out[i + 1]);
i += 2;
} while (1);
@@ -431,7 +494,7 @@ static uint32_t blockmix_xor_save(
ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2;
ctx->w = w;
- SALSA20_2(Bin1out[i])
+ SALSA20_2(Bin1out[i]);
return INTEGERIFY;
}
More information about the busybox-cvs
mailing list