[PATCH] shuf: speed up when limited output is requested
Ron Yorston
rmy at pobox.com
Wed Mar 10 08:37:15 UTC 2021
A user noted that the following command was slower than they
expected:
busybox shuf -i "1500000000-$(date +%s)" -n 5
At time of writing the range contains 115 million values. On my
system this takes 6.9s whereas 'shuf' from coreutils takes a
handful of milliseconds.
Optimise BusyBox 'shuf' for cases where -n is specified: stop
shuffling once the required number of lines has been processed.
On my system the time for the example is reduced to 0.4s.
function                                             old     new   delta
shuf_main                                            521     560     +39
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 1/0 up/down: 39/0)               Total: 39 bytes
Signed-off-by: Ron Yorston <rmy at pobox.com>
---
coreutils/shuf.c | 28 +++++++++++++++-------------
1 file changed, 15 insertions(+), 13 deletions(-)
diff --git a/coreutils/shuf.c b/coreutils/shuf.c
index fdbd3e9b2..839d1b80f 100644
--- a/coreutils/shuf.c
+++ b/coreutils/shuf.c
@@ -39,8 +39,10 @@
/*
* Use the Fisher-Yates shuffle algorithm on an array of lines.
+ * If the required number of output lines is less than the total
+ * we can stop shuffling early.
*/
-static void shuffle_lines(char **lines, unsigned numlines)
+static void shuffle_lines(char **lines, unsigned numlines, unsigned outlines)
{
unsigned i;
unsigned r;
@@ -48,7 +50,7 @@ static void shuffle_lines(char **lines, unsigned numlines)
srand(monotonic_us());
- for (i = numlines-1; i > 0; i--) {
+ for (i = numlines-1; i > 0 && outlines > 0; i--, outlines--) {
r = rand();
/* RAND_MAX can be as small as 32767 */
if (i > RAND_MAX)
@@ -67,7 +69,7 @@ int shuf_main(int argc, char **argv)
char *opt_i_str, *opt_n_str, *opt_o_str;
unsigned i;
char **lines;
- unsigned numlines;
+ unsigned numlines, outlines;
char eol;
opts = getopt32(argv, "^"
@@ -128,24 +130,24 @@ int shuf_main(int argc, char **argv)
fclose_if_not_stdin(fp);
}
- if (numlines != 0)
- shuffle_lines(lines, numlines);
+ outlines = numlines;
+ if (opts & OPT_n) {
+ outlines = xatou(opt_n_str);
+ if (outlines > numlines)
+ outlines = numlines;
+ }
+
+ if (numlines != 0 && outlines != 0)
+ shuffle_lines(lines, numlines, outlines);
if (opts & OPT_o)
xmove_fd(xopen(opt_o_str, O_WRONLY|O_CREAT|O_TRUNC), STDOUT_FILENO);
- if (opts & OPT_n) {
- unsigned maxlines;
- maxlines = xatou(opt_n_str);
- if (numlines > maxlines)
- numlines = maxlines;
- }
-
eol = '\n';
if (opts & OPT_z)
eol = '\0';
- for (i = 0; i < numlines; i++) {
+ for (i = numlines-outlines; i < numlines; i++) {
if (opts & OPT_i)
printf("%u%c", (unsigned)(uintptr_t)lines[i], eol);
else
--
2.29.2
More information about the busybox mailing list