[PATCH] Compress common substrings in applet names to save space

Tue Apr 19 18:47:33 UTC 2016

On 04/19/2016 02:15 PM, Ron Yorston wrote:
> The string of applet names seems ripe for compression:  it has a limited
> range of characters and many common substrings.  If the compression is
> too aggressive, though, the code required to handle it causes bloat.
>
> Recursively replace common pairs of characters with a character having
> its top bit set; the lower 7 bits being an index into an array of
> character pairs.  This is only done if it results in a net saving,
> otherwise the current code is used.
>
> Provide an implementation of find_applet_by_name() which uses binary
> search:  this reduces the number of string comparisons from an average
> of 27.9 per lookup to 7.7.  This is important because of the added cost
> of expanding strings before comparison.  (Compressing the input string
> is uncompetitive.)
>
> The new version of find_applet_by_name() is about 8% faster than the
> old one when strings aren't abbreviated.  It isn't used in that case,
> though, as it's larger than the current implementation.
>
> When strings are abbreviated the new find_applet_by_name() is about
> 34% slower than currently.  (All comparisons are for the default
> configuration.)
>
> function                                             old     new   delta
> applet_common                                          -     256    +256
> find_applet_by_name                                  112     175     +63
> expand_char                                            -      40     +40
> expand_name                                            -      31     +31
> run_applet_and_exit                                  675     697     +22
> applet_nameofs                                        14      30     +16
> applet_names                                        2536    1630    -906
> ------------------------------------------------------------------------------
> (add/remove: 3/0 grow/shrink: 3/1 up/down: 428/-906)         Total: -478 bytes
>     text	   data	    bss	    dec	    hex	filename
>   764477	   2059	   8808	 775344	  bd4b0	busybox_old
>   763999	   2059	   8832	 774890	  bd2ea	busybox_unstripped
>
> Signed-off-by: Ron Yorston <rmy at pobox.com>
> ---
>   applets/applet_tables.c | 260 +++++++++++++++++++++++++++++++++++++++++-------
>   include/libbb.h         |   1 +
>   libbb/appletlib.c       | 113 +++++++++++++++++++--
>   libbb/lineedit.c        |   8 +-
>   shell/ash.c             |   2 +-
>   5 files changed, 340 insertions(+), 44 deletions(-)
>
> diff --git a/applets/applet_tables.c b/applets/applet_tables.c
> index 843f2ec..b8dd991 100644
> --- a/applets/applet_tables.c
> +++ b/applets/applet_tables.c
> @@ -34,6 +34,7 @@ struct bb_applet {
>   	/* true if instead of fork(); exec("applet"); waitpid();
>   	 * one can simply call applet_main(argc,argv); */
>   	unsigned char nofork;
> +	char *abbrev;
>   };
>
>   /* Define struct bb_applet applets[] */
> @@ -48,6 +49,21 @@ static int cmp_name(const void *a, const void *b)
>   	return strcmp(aa->name, bb->name);
>   }
>
> +#define MAXSUB 1000
> +#define SUBLEN 2
> +
> +struct bb_substr {
> +	char str[SUBLEN+1];
> +	int save;
> +};
> +
> +static int cmp_substr(const void *a, const void *b)
> +{
> +	const struct bb_substr *aa = a;
> +	const struct bb_substr *bb = b;
> +	return bb->save - aa->save;
> +}
> +
>   static int str_isalnum_(const char *s)
>   {
>   	while (*s) {
> @@ -58,25 +74,31 @@ static int str_isalnum_(const char *s)
>   	return 1;
>   }
>
> +void print_char(char c)
> +{
> +	if (c & 0x80) {
> +		printf("\\%o", (unsigned int)(c) & 0xff);
> +	}
> +	else {
> +		putchar(c);
> +	}
> +}
> +
>   int main(int argc, char **argv)
>   {
> -	int i, j;
> -
> -	// In find_applet_by_name(), before linear search, narrow it down
> -	// by looking at N "equidistant" names. With ~350 applets:
> -	// KNOWN_APPNAME_OFFSETS  cycles
> -	//                     0    9057
> -	//                     2    4604 + ~100 bytes of code
> -	//                     4    2407 + 4 bytes
> -	//                     8    1342 + 8 bytes
> -	//                    16     908 + 16 bytes
> -	//                    32     884 + 32 bytes
> -	// With 8, int16_t applet_nameofs[] table has 7 elements.
> +	int i, j, k, n;
> +	struct bb_substr substr[MAXSUB];
> +	int minsub, numsub;
> +	int minidx, maxidx, index;
> +	char *s;
> +	int MAX_APPLET_NAME_LEN = 0;
>   	int KNOWN_APPNAME_OFFSETS = 8;
> -	// With 128 applets we do two linear searches, with 1..7 strcmp's in the first one
> -	// and 1..16 strcmp's in the second. With 256 apps, second search does 1..32 strcmp's.
> -	if (NUM_APPLETS < 128)
> +	int ABBREV_APPLET_NAMES = 1;
> +
> +	if (NUM_APPLETS < 128) {
>   		KNOWN_APPNAME_OFFSETS = 4;
> +		ABBREV_APPLET_NAMES = 0;
> +	}
>   	if (NUM_APPLETS < 32)
>   		KNOWN_APPNAME_OFFSETS = 0;
>
> @@ -99,6 +121,174 @@ int main(int argc, char **argv)
>   		printf("#define SINGLE_APPLET_MAIN %s_main\n", applets[0].main);
>   	}
>
> +	for (i = 0; i < NUM_APPLETS; i++) {
> +		const char *t;
> +
> +		applets[i].abbrev = strdup(applets[i].name);
> +		for (t = applets[i].name; ABBREV_APPLET_NAMES && *t; t++) {
> +			/* if any char has its top bit set we can't use substrings */
> +			if (*t & 0x80)
> +				ABBREV_APPLET_NAMES = 0;
> +		}
> +		if (MAX_APPLET_NAME_LEN < strlen(applets[i].name))
> +			MAX_APPLET_NAME_LEN = strlen(applets[i].name);
> +	}
> +
> +	/* determine if applet names can be compressed */
> +	minsub = numsub = 0;
> +	minidx = index = 0;
> +	maxidx = 96;	/* leave a few slots free initially */
> +	if (ABBREV_APPLET_NAMES) {
> +		int count, oldlen, newlen, newtot;
> +
> + again:
> +		/* make a first pass looking for common substrings */
> +		for (i = 0; i < NUM_APPLETS; i++) {
> +			int len;
> +			char sub[SUBLEN+1];
> +
> +			if ((len=strlen(applets[i].abbrev)) < SUBLEN)
> +				continue;
> +
> +			for (j = 0; j < len-SUBLEN+1; j++) {
> +				strncpy(sub, applets[i].abbrev+j, SUBLEN);
> +				sub[SUBLEN] = '\0';
> +
> +				/* skip substrings we've seen before */
> +				for (n = minsub; n < numsub; n++) {
> +					if (strcmp(sub, substr[n].str) == 0) {
> +						break;
> +					}
> +				}
> +				if (n != numsub) {
> +					continue;
> +				}
> +
> +				count = 1;
> +				for (k = i + 1; k < NUM_APPLETS; k++) {
> +					if (strstr(applets[k].abbrev, sub)) {
> +						++count;
> +					}
> +				}
> +
> +				/* only record substrings that result in a saving */
> +				if (count > 1) {
> +					int save = count * SUBLEN;
> +					int cost = count + SUBLEN;
> +					if ( save > cost && numsub < MAXSUB ) {
> +						strcpy(substr[numsub].str, sub);
> +						substr[numsub++].save = save - cost;
> +					}
> +				}
> +			}
> +		}
> +
> +		/* sort by number of bytes saved */
> +		qsort(substr+minsub, numsub-minsub, sizeof(substr[0]), cmp_substr);
> +
> +		for (j = minsub, index = minidx; j < numsub; j++) {
> +			/* check that each substring still gives a saving */
> +			count = 0;
> +			for (i = 0; i < NUM_APPLETS && index < maxidx; i++) {
> +				if (strstr(applets[i].abbrev, substr[j].str) != NULL) {
> +					++count;
> +				}
> +			}
> +
> +			if ( count*SUBLEN > count+SUBLEN ) {
> +				/* replace each instance of substring with a character with */
> +				/* its top bit set, the remaining 7 bits are its index */
> +				for (i = 0; i < NUM_APPLETS; i++) {
> +					while ((s=strstr(applets[i].abbrev, substr[j].str))) {
> +						*s++ = 0x80 + index;
> +						strcpy(s, s+SUBLEN-1);
> +					}
> +				}
> +				++index;
> +			}
> +			else {
> +				/* this substring no longer gives a saving */
> +				substr[j].save = 0;
> +			}
> +		}
> +
> +		/* recurse a couple of times */
> +		if (maxidx == 96) {
> +			minsub = numsub;
> +			minidx = index;
> +			maxidx = 120;
> +			goto again;
> +		}
> +		else if (maxidx == 120) {
> +			minsub = numsub;
> +			minidx = index;
> +			maxidx = 128;
> +			goto again;
> +		}
> +
> +		/* calculate lengths of new and old strings */
> +		oldlen = newlen = 1;
> +		for (i = 0; i < NUM_APPLETS; i++) {
> +			oldlen += strlen(applets[i].name) + 1;
> +			newlen += strlen(applets[i].abbrev) + 1;
> +		}
> +
> +		/* only use substrings if the saving outweighs the larger code */
> +		newtot = newlen + (index*SUBLEN) + 160 + 2*KNOWN_APPNAME_OFFSETS;
> +		ABBREV_APPLET_NAMES = (newtot < oldlen);
> +
> +		if (ABBREV_APPLET_NAMES) {
> +			/* use some of our savings to speed up the linear search */
> +			KNOWN_APPNAME_OFFSETS *= 2;
> +			printf("/* strings reduced from %d to %d = (%d + %d) bytes */\n",
> +				oldlen, newlen+(index*SUBLEN), newlen, index*SUBLEN);
> +		}
> +	}
> +
> +	printf("#define ABBREV_APPLET_NAMES %d\n\n", ABBREV_APPLET_NAMES);
> +
> +	//printf("#ifndef SKIP_definitions\n");
> +	if (ABBREV_APPLET_NAMES) {
> +		char c = 0x80;
> +
> +		printf("static const char applet_common[][2] ALIGN1 = {\n");
> +		for (n = 0; n < numsub; n++) {
> +			if (substr[n].save != 0) {
> +				printf("/* ");
> +				print_char(c++);
> +				printf(" */ ");
> +				printf("{'");
> +				print_char(substr[n].str[0]);
> +				printf("', '");
> +				print_char(substr[n].str[1]);
> +				printf("'},\n");
> +			}
> +		}
> +		printf("};\n\n");
> +
> +		printf("const char applet_names[] ALIGN1 = \"\"\n");
> +		for (i = 0; i < NUM_APPLETS; i++) {
> +			putchar('"');
> +			for (s = applets[i].abbrev; *s; s++) {
> +				if (*s & 0x80) {
> +					printf("\\%o", (unsigned int)(*s) & 0xff);
> +				}
> +				else {
> +					putchar(*s);
> +				}
> +			}
> +			printf("\" \"\\0\"\n");
> +		}
> +		printf(";\n\n");
> +	}
> +	else {
> +		printf("const char applet_names[] ALIGN1 = \"\"\n");
> +		for (i = 0; i < NUM_APPLETS; i++) {
> +			printf("\"%s\" \"\\0\"\n", applets[i].name);
> +		}
> +		printf(";\n\n");
> +	}
> +
>   	printf("#define KNOWN_APPNAME_OFFSETS %u\n\n", KNOWN_APPNAME_OFFSETS);
>   	if (KNOWN_APPNAME_OFFSETS > 0) {
>   		int ofs, offset[KNOWN_APPNAME_OFFSETS], index[KNOWN_APPNAME_OFFSETS];
> @@ -109,26 +299,20 @@ int main(int argc, char **argv)
>   			for (j = 0; j < KNOWN_APPNAME_OFFSETS; j++)
>   				if (i == index[j])
>   					offset[j] = ofs;
> -			ofs += strlen(applets[i].name) + 1;
> +			if (i == NUM_APPLETS-1)
> +				printf("#define LAST_APPLET_NAME_OFFSET %d\n\n", ofs);
> +			ofs += strlen(ABBREV_APPLET_NAMES ?
> +					applets[i].abbrev : applets[i].name) + 1;
>   		}
>   		/* If the list of names is too long refuse to proceed */
>   		if (ofs > 0xffff)
>   			return 1;
> -		printf("const uint16_t applet_nameofs[] ALIGN2 = {\n");
> +		printf("static const uint16_t applet_nameofs[] ALIGN2 = {\n");
>   		for (i = 1; i < KNOWN_APPNAME_OFFSETS; i++)
>   			printf("%d,\n", offset[i]);
>   		printf("};\n\n");
>   	}
>
> -	//printf("#ifndef SKIP_definitions\n");
> -	printf("const char applet_names[] ALIGN1 = \"\"\n");
> -	for (i = 0; i < NUM_APPLETS; i++) {
> -		printf("\"%s\" \"\\0\"\n", applets[i].name);
> -//		if (MAX_APPLET_NAME_LEN < strlen(applets[i].name))
> -//			MAX_APPLET_NAME_LEN = strlen(applets[i].name);
> -	}
> -	printf(";\n\n");
> -
>   	for (i = 0; i < NUM_APPLETS; i++) {
>   		if (str_isalnum_(applets[i].name))
>   			printf("#define APPLET_NO_%s %d\n", applets[i].name, i);
> @@ -190,26 +374,32 @@ int main(int argc, char **argv)
>   	printf("};\n");
>   #endif
>   	//printf("#endif /* SKIP_definitions */\n");
> -//	printf("\n");
> -//	printf("#define MAX_APPLET_NAME_LEN %u\n", MAX_APPLET_NAME_LEN);
> +	printf("\n");
> +	printf("#define MAX_APPLET_NAME_LEN %u\n", MAX_APPLET_NAME_LEN);
>
>   	if (argv[2]) {
> -		char line_old[80];
> -		char line_new[80];
> +		char line_old[2][80];
> +		char line_new[2][80];
>   		FILE *fp;
>
> -		line_old[0] = 0;
> +		line_old[0][0] = 0;
> +		line_old[1][0] = 0;
>   		fp = fopen(argv[2], "r");
>   		if (fp) {
> -			fgets(line_old, sizeof(line_old), fp);
> +			fgets(line_old[0], sizeof(line_old[0]), fp);
> +			fgets(line_old[1], sizeof(line_old[1]), fp);
>   			fclose(fp);
>   		}
> -		sprintf(line_new, "#define NUM_APPLETS %u\n", NUM_APPLETS);
> -		if (strcmp(line_old, line_new) != 0) {
> +		sprintf(line_new[0], "#define NUM_APPLETS %u\n", NUM_APPLETS);
> +		sprintf(line_new[1], ABBREV_APPLET_NAMES ?
> +				"\n" : "#define expand_name(n) (n)\n");
> +		if (strcmp(line_old[0], line_new[0]) != 0 ||
> +				strcmp(line_old[1], line_new[1]) != 0) {
>   			fp = fopen(argv[2], "w");
>   			if (!fp)
>   				return 1;
> -			fputs(line_new, fp);
> +			fputs(line_new[0], fp);
> +			fputs(line_new[1], fp);
>   		}
>   	}
>
> diff --git a/include/libbb.h b/include/libbb.h
> index 111dd66..714283b 100644
> --- a/include/libbb.h
> +++ b/include/libbb.h
> @@ -1229,6 +1229,7 @@ const struct hwtype *get_hwntype(int type) FAST_FUNC;
>
>   #ifndef BUILD_INDIVIDUAL
>   extern int find_applet_by_name(const char *name) FAST_FUNC;
> +extern const char *expand_name(const char *name) FAST_FUNC;
>   /* Returns only if applet is not found. */
>   extern void run_applet_and_exit(const char *name, char **argv) FAST_FUNC;
>   extern void run_applet_no_and_exit(int a, char **argv) NORETURN FAST_FUNC;
> diff --git a/libbb/appletlib.c b/libbb/appletlib.c
> index b682e6b..aef98cc 100644
> --- a/libbb/appletlib.c
> +++ b/libbb/appletlib.c
> @@ -139,6 +139,7 @@ void FAST_FUNC bb_show_usage(void)
>   	xfunc_die();
>   }
>
> +#if !ABBREV_APPLET_NAMES
>   int FAST_FUNC find_applet_by_name(const char *name)
>   {
>   	unsigned i, max;
> @@ -265,6 +266,104 @@ int FAST_FUNC find_applet_by_name(const char *name)
>   	return -1;
>   #endif
>   }
> +#define expand_name(n) (n)
> +#else /* ABBREV_APPLET_NAMES */
> +static char *expand_char(char *t, int c)
> +{
> +	if (c & 0x80) {
> +		/* substitute common substring for char with top bit set */
> +		const char *u = applet_common[c & 0x7f];
> +		t = expand_char(t, u[0]);
> +		t = expand_char(t, u[1]);
> +	}
> +	else {
> +		*t++ = c;
> +	}
> +
> +	return t;
> +}
> +
> +const char * FAST_FUNC expand_name(const char *name)
> +{
> +	static char expand[MAX_APPLET_NAME_LEN+1];
> +	char *t = expand;
> +
> +	while (*name)
> +		t = expand_char(t, *name++);
> +	*t = '\0';
> +
> +	return expand;
> +}
> +
> +int FAST_FUNC find_applet_by_name(const char *name)
> +{
> +	int i, ret, lower, upper, middle;
> +
> +	/* Do a binary search to find the applet entry given the name. */
> +
> +	lower = 0;
> +	upper = LAST_APPLET_NAME_OFFSET;
> +	do {
> +		middle = (upper + lower) >> 1;
> +		/* the new index is probably not at the start of a name */
> +		/* first move forward to the start of the next name... */
> +		while (applet_names[middle++])
> +			continue;
> +		if (middle == upper) {
> +			/* ... if that didn't work go back to start of previous name, */
> +			/* taking care not to fall off the start of the string */
> +			do {
> +				middle--;
> +			} while (middle && applet_names[middle-1]);
> +			if (middle == lower) {
> +				/*
> +				 * End of binary search.  The bounds are either values
> +				 * that were previously 'middle' and have been checked
> +				 * already or are 0 or LAST_APPLET_NAME_OFFSET, which
> +				 * still need to be checked.
> +				 */
> +				if (lower == 0)
> +					/* middle = lower */;
> +				else if (upper == LAST_APPLET_NAME_OFFSET)
> +					middle = upper;
> +				else
> +					break;
> +			}
> +		}
> +		ret = strcmp(name, expand_name(applet_names+middle));
> +		if (ret < 0)
> +			upper = middle;
> +		else if (ret > 0)
> +			lower = middle;
> +		else
> +			goto found;
> +	} while (lower != upper);
> +
> +	return -1;
> +
> + found:
> +	/*
> +	 * Find index of name by linear search, using the known offsets to
> +	 * limit the range we need to search.
> +	 */
> +	lower = 0;
> +	ret = NUM_APPLETS * (KNOWN_APPNAME_OFFSETS-1);
> +	for (i = KNOWN_APPNAME_OFFSETS-2; i >= 0; i--) {
> +		if (middle >= applet_nameofs[i]) {
> +			lower = applet_nameofs[i];
> +			ret /= KNOWN_APPNAME_OFFSETS;
> +			break;
> +		}
> +		ret -= NUM_APPLETS;
> +	}
> +	while (lower < middle) {
> +		lower += strlen(applet_names+lower) + 1;
> +		++ret;
> +	}
> +
> +	return ret;
> +}
> +#endif
>
>
>   void lbb_prepare(const char *applet
> @@ -678,7 +777,7 @@ static void install_links(const char *busybox, int use_symbolic_links,
>   	 * busybox.h::bb_install_loc_t, or else... */
>   	int (*lf)(const char *, const char *);
>   	char *fpc;
> -        const char *appname = applet_names;
> +	const char *appname = applet_names;
>   	unsigned i;
>   	int rc;
>
> @@ -689,7 +788,7 @@ static void install_links(const char *busybox, int use_symbolic_links,
>   	for (i = 0; i < ARRAY_SIZE(applet_main); i++) {
>   		fpc = concat_path_file(
>   				custom_install_dir ? custom_install_dir : install_dir[APPLET_INSTALL_LOC(i)],
> -				appname);
> +				expand_name(appname));
>   		// debug: bb_error_msg("%slinking %s to busybox",
>   		//		use_symbolic_links ? "sym" : "", fpc);
>   		rc = lf(busybox, fpc);
> @@ -760,7 +859,8 @@ static int busybox_main(char **argv)
>   		/* prevent last comma to be in the very last pos */
>   		output_width--;
>   		while (*a) {
> -			int len2 = strlen(a) + 2;
> +			const char *name = expand_name(a);
> +			int len2 = strlen(name) + 2;
>   			if (col >= (int)output_width - len2) {
>   				full_write2_str(",\n");
>   				col = 0;
> @@ -771,9 +871,10 @@ static int busybox_main(char **argv)
>   			} else {
>   				full_write2_str(", ");
>   			}
> -			full_write2_str(a);
> +			full_write2_str(name);
>   			col += len2;
> -			a += len2 - 1;
> +			while (*a++ != '\0')
> +				continue;
>   		}
>   		full_write2_str("\n\n");
>   		return 0;
> @@ -788,7 +889,7 @@ static int busybox_main(char **argv)
>   			if (argv[1][6]) /* --list-full? */
>   				full_write2_str(install_dir[APPLET_INSTALL_LOC(i)] + 1);
>   # endif
> -			full_write2_str(a);
> +			full_write2_str(expand_name(a));
>   			full_write2_str("\n");
>   			i++;
>   			while (*a++ != '\0')
> diff --git a/libbb/lineedit.c b/libbb/lineedit.c
> index 3e62f46..83c13ae 100644
> --- a/libbb/lineedit.c
> +++ b/libbb/lineedit.c
> @@ -780,8 +780,12 @@ static NOINLINE unsigned complete_cmd_dir_file(const char *command, int type)
>   		const char *p = applet_names;
>
>   		while (*p) {
> -			if (strncmp(pfind, p, pf_len) == 0)
> -				add_match(xstrdup(p));
> +			const char *name = expand_name(p);
> +			int ret = strncmp(pfind, name, pf_len);
> +			if (ret < 0)
> +				break;
> +			else if (ret == 0)
> +				add_match(xstrdup(name));
>   			while (*p++ != '\0')
>   				continue;
>   		}
> diff --git a/shell/ash.c b/shell/ash.c
> index da9c950..7800f32 100644
> --- a/shell/ash.c
> +++ b/shell/ash.c
> @@ -12592,7 +12592,7 @@ helpcmd(int argc UNUSED_PARAM, char **argv UNUSED_PARAM)
>   	{
>   		const char *a = applet_names;
>   		while (*a) {
> -			col += out1fmt("%c%s", ((col == 0) ? '\t' : ' '), a);
> +			col += out1fmt("%c%s", ((col == 0) ? '\t' : ' '), expand_name(a));
>   			if (col > 60) {
>   				out1fmt("\n");
>   				col = 0;
>

Hi,
it seems to me that it goes against the KISS principle and even if
some bytes are saved and speed gained it adds a lot of complexity
for a trivial task like looking up a string ...or it is just to
smart for a self-taught programmer like me.

Just my 0,2 cents.

Ciao,
Tito