svn commit: trunk/busybox: findutils libbb testsuite

vda at busybox.net vda at busybox.net
Sat Aug 9 16:15:14 UTC 2008


Author: vda
Date: 2008-08-09 09:15:14 -0700 (Sat, 09 Aug 2008)
New Revision: 23072

Log:
grep: add an option to use GNU regex matching instead of the POSIX one.
 This fixes problems with NULs in files being scanned, but
 costs +800 bytes. The same can be done for sed (TODO).



Modified:
   trunk/busybox/Config.in
   trunk/busybox/findutils/grep.c
   trunk/busybox/libbb/get_line_from_file.c
   trunk/busybox/libbb/xregcomp.c
   trunk/busybox/testsuite/grep.tests


Changeset:
Modified: trunk/busybox/Config.in
===================================================================
--- trunk/busybox/Config.in	2008-08-08 18:15:29 UTC (rev 23071)
+++ trunk/busybox/Config.in	2008-08-09 16:15:14 UTC (rev 23072)
@@ -21,6 +21,15 @@
 	  Select this only if you plan to use busybox on full-blown
 	  desktop machine with common Linux distro, not on an embedded box.
 
+config EXTRA_COMPAT
+	bool "Provide compatible behavior for rare corner cases (bigger code)"
+	default n
+	help
+	  This option makes grep, sed etc handle rare corner cases
+	  (embedded NUL bytes and such). This makes code bigger and uses
+	  some GNU extensions in libc. You probably only need this option
+	  if you plan to run busybox on desktop.
+
 config FEATURE_ASSUME_UNICODE
 	bool "Assume that 1:1 char/glyph correspondence is not true"
 	default n

Modified: trunk/busybox/findutils/grep.c
===================================================================
--- trunk/busybox/findutils/grep.c	2008-08-08 18:15:29 UTC (rev 23071)
+++ trunk/busybox/findutils/grep.c	2008-08-09 16:15:14 UTC (rev 23072)
@@ -96,6 +96,7 @@
 	int lines_before;
 	int lines_after;
 	char **before_buf;
+	USE_EXTRA_COMPAT(size_t *before_buf_size;)
 	int last_line_printed;
 #endif
 	/* globals used internally */
@@ -117,6 +118,7 @@
 #define lines_before      (G.lines_before        )
 #define lines_after       (G.lines_after         )
 #define before_buf        (G.before_buf          )
+#define before_buf_size   (G.before_buf_size     )
 #define last_line_printed (G.last_line_printed   )
 #define pattern_head      (G.pattern_head        )
 #define cur_file          (G.cur_file            )
@@ -124,14 +126,24 @@
 
 typedef struct grep_list_data_t {
 	char *pattern;
-	regex_t preg;
+/* for GNU regex, matched_range must be persistent across grep_file() calls */
+#if !ENABLE_EXTRA_COMPAT
+	regex_t compiled_regex;
+	regmatch_t matched_range;
+#else
+	struct re_pattern_buffer compiled_regex;
+	struct re_registers matched_range;
+#endif
 #define ALLOCATED 1
 #define COMPILED 2
 	int flg_mem_alocated_compiled;
 } grep_list_data_t;
 
-
-static void print_line(const char *line, int linenum, char decoration)
+#if !ENABLE_EXTRA_COMPAT
+#define print_line(line, line_len, linenum, decoration) \
+	print_line(line, linenum, decoration)
+#endif
+static void print_line(const char *line, size_t line_len, int linenum, char decoration)
 {
 #if ENABLE_FEATURE_GREP_CONTEXT
 	/* Happens when we go to next file, immediately hit match
@@ -139,8 +151,9 @@
 	if (linenum < 1)
 		return;
 	/* possibly print the little '--' separator */
-	if ((lines_before || lines_after) && did_print_line &&
-			last_line_printed != linenum - 1) {
+	if ((lines_before || lines_after) && did_print_line
+	 && last_line_printed != linenum - 1
+	) {
 		puts("--");
 	}
 	/* guard against printing "--" before first line of first file */
@@ -152,17 +165,50 @@
 	if (PRINT_LINE_NUM)
 		printf("%i%c", linenum, decoration);
 	/* Emulate weird GNU grep behavior with -ov */
-	if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o))
+	if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o)) {
+#if !ENABLE_EXTRA_COMPAT
 		puts(line);
+#else
+		fwrite(line, 1, line_len, stdout);
+		putchar('\n');
+#endif
+	}
 }
 
+#if ENABLE_EXTRA_COMPAT
+/* Unlike getline, this one removes trailing '\n' */
+static ssize_t FAST_FUNC bb_getline(char **line_ptr, size_t *line_alloc_len, FILE *file)
+{
+	ssize_t res_sz;
+	char *line;
+
+	res_sz = getline(line_ptr, line_alloc_len, file);
+	line = *line_ptr;
+
+	if (res_sz > 0) {
+		if (line[res_sz - 1] == '\n')
+			line[--res_sz] = '\0';
+	} else {
+		free(line); /* uclibc allocates a buffer even on EOF. WTF? */
+	}
+	return res_sz;
+}
+#endif
+
 static int grep_file(FILE *file)
 {
-	char *line;
 	smalluint found;
 	int linenum = 0;
 	int nmatches = 0;
-	regmatch_t regmatch;
+#if !ENABLE_EXTRA_COMPAT
+	char *line;
+#else
+	char *line = NULL;
+	ssize_t line_len;
+	size_t line_alloc_len;
+#define rm_so start[0]
+#define rm_eo end[0]
+#endif
 #if ENABLE_FEATURE_GREP_CONTEXT
 	int print_n_lines_after = 0;
 	int curpos = 0; /* track where we are in the circular 'before' buffer */
@@ -171,7 +217,13 @@
 	enum { print_n_lines_after = 0 };
 #endif /* ENABLE_FEATURE_GREP_CONTEXT */
 
-	while ((line = xmalloc_fgetline(file)) != NULL) {
+	while (
+#if !ENABLE_EXTRA_COMPAT
+		(line = xmalloc_fgetline(file)) != NULL
+#else
+		(line_len = bb_getline(&line, &line_alloc_len, file)) >= 0
+#endif
+	) {
 		llist_t *pattern_ptr = pattern_head;
 		grep_list_data_t *gl = gl; /* for gcc */
 
@@ -184,19 +236,35 @@
 			} else {
 				if (!(gl->flg_mem_alocated_compiled & COMPILED)) {
 					gl->flg_mem_alocated_compiled |= COMPILED;
-					xregcomp(&(gl->preg), gl->pattern, reflags);
+#if !ENABLE_EXTRA_COMPAT
+					xregcomp(&gl->compiled_regex, gl->pattern, reflags);
+#else
+					memset(&gl->compiled_regex, 0, sizeof(gl->compiled_regex));
+					if (re_compile_pattern(gl->pattern, strlen(gl->pattern), &gl->compiled_regex))
+						bb_error_msg_and_die("bad regex '%s'", gl->pattern);
+#endif
 				}
-				regmatch.rm_so = 0;
-				regmatch.rm_eo = 0;
-				if (regexec(&(gl->preg), line, 1, &regmatch, 0) == 0) {
+#if !ENABLE_EXTRA_COMPAT
+				gl->matched_range.rm_so = 0;
+				gl->matched_range.rm_eo = 0;
+#endif
+				if (
+#if !ENABLE_EXTRA_COMPAT
+					regexec(&gl->compiled_regex, line, 1, &gl->matched_range, 0) == 0
+#else
+					re_search(&gl->compiled_regex, line, line_len,
+							/*start:*/ 0, /*range:*/ line_len,
+							&gl->matched_range) >= 0
+#endif
+				) {
 					if (!(option_mask32 & OPT_w))
 						found = 1;
 					else {
 						char c = ' ';
-						if (regmatch.rm_so)
-							c = line[regmatch.rm_so - 1];
+						if (gl->matched_range.rm_so)
+							c = line[gl->matched_range.rm_so - 1];
 						if (!isalnum(c) && c != '_') {
-							c = line[regmatch.rm_eo];
+							c = line[gl->matched_range.rm_eo];
 							if (!c || (!isalnum(c) && c != '_'))
 								found = 1;
 						}
@@ -261,7 +329,7 @@
 
 					/* now print each line in the buffer, clearing them as we go */
 					while (before_buf[idx] != NULL) {
-						print_line(before_buf[idx], first_buf_entry_line_num, '-');
+						print_line(before_buf[idx], before_buf_size[idx], first_buf_entry_line_num, '-');
 						free(before_buf[idx]);
 						before_buf[idx] = NULL;
 						idx = (idx + 1) % lines_before;
@@ -277,13 +345,15 @@
 						/* -Fo just prints the pattern
 						 * (unless -v: -Fov doesnt print anything at all) */
 						if (found)
-							print_line(gl->pattern, linenum, ':');
+							print_line(gl->pattern, strlen(gl->pattern), linenum, ':');
 					} else {
-						line[regmatch.rm_eo] = '\0';
-						print_line(line + regmatch.rm_so, linenum, ':');
+						line[gl->matched_range.rm_eo] = '\0';
+						print_line(line + gl->matched_range.rm_so,
+								gl->matched_range.rm_eo - gl->matched_range.rm_so,
+								linenum, ':');
 					}
 				} else {
-					print_line(line, linenum, ':');
+					print_line(line, line_len, linenum, ':');
 				}
 			}
 		}
@@ -291,12 +361,13 @@
 		else { /* no match */
 			/* if we need to print some context lines after the last match, do so */
 			if (print_n_lines_after) {
-				print_line(line, linenum, '-');
+				print_line(line, strlen(line), linenum, '-');
 				print_n_lines_after--;
 			} else if (lines_before) {
 				/* Add the line to the circular 'before' buffer */
 				free(before_buf[curpos]);
 				before_buf[curpos] = line;
+				USE_EXTRA_COMPAT(before_buf_size[curpos] = line_len;)
 				curpos = (curpos + 1) % lines_before;
 				/* avoid free(line) - we took the line */
 				line = NULL;
@@ -304,13 +375,14 @@
 		}
 
 #endif /* ENABLE_FEATURE_GREP_CONTEXT */
+#if !ENABLE_EXTRA_COMPAT
 		free(line);
-
+#endif
 		/* Did we print all context after last requested match? */
 		if ((option_mask32 & OPT_m)
 		 && !print_n_lines_after && nmatches == max_matches)
 			break;
-	}
+	} /* while (read line) */
 
 	/* special-case file post-processing for options where we don't print line
 	 * matches, just filenames and possibly match counts */
@@ -428,15 +500,16 @@
 			lines_after = Copt;
 		if (!(option_mask32 & OPT_B)) /* not overridden */
 			lines_before = Copt;
-		//option_mask32 |= OPT_A|OPT_B; /* for parser */
 	}
 	/* sanity checks */
 	if (option_mask32 & (OPT_c|OPT_q|OPT_l|OPT_L)) {
 		option_mask32 &= ~OPT_n;
 		lines_before = 0;
 		lines_after = 0;
-	} else if (lines_before > 0)
-		before_buf = xzalloc(lines_before * sizeof(char *));
+	} else if (lines_before > 0) {
+		before_buf = xzalloc(lines_before * sizeof(before_buf[0]));
+		USE_EXTRA_COMPAT(before_buf_size = xzalloc(lines_before * sizeof(before_buf_size[0]));)
+	}
 #else
 	/* with auto sanity checks */
 	/* -H unsets -h; -c,-q or -l unset -n; -e,-f are lists; -m N */
@@ -537,7 +610,7 @@
 			if (gl->flg_mem_alocated_compiled & ALLOCATED)
 				free(gl->pattern);
 			if (gl->flg_mem_alocated_compiled & COMPILED)
-				regfree(&(gl->preg));
+				regfree(&gl->compiled_regex);
 			free(gl);
 			free(pattern_head_ptr);
 		}

Modified: trunk/busybox/libbb/get_line_from_file.c
===================================================================
--- trunk/busybox/libbb/get_line_from_file.c	2008-08-08 18:15:29 UTC (rev 23071)
+++ trunk/busybox/libbb/get_line_from_file.c	2008-08-09 16:15:14 UTC (rev 23072)
@@ -9,6 +9,10 @@
  * Licensed under GPLv2 or later, see file LICENSE in this tarball for details.
  */
 
+/* for getline() [GNUism] */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
 #include "libbb.h"
 
 /* This function reads an entire line from a text file, up to a newline
@@ -55,7 +59,6 @@
 
 	return bb_get_chunk_from_file(file, &i);
 }
-
 /* Get line.  Remove trailing \n */
 char* FAST_FUNC xmalloc_fgetline(FILE *file)
 {
@@ -69,6 +72,44 @@
 }
 
 #if 0
+
+/* GNUism getline() should be faster (not tested) than a loop with fgetc */
+
+/* Get line, including trailing \n if any */
+char* FAST_FUNC xmalloc_fgets(FILE *file)
+{
+	char *res_buf = NULL;
+	size_t res_sz;
+
+	if (getline(&res_buf, &res_sz, file) == -1) {
+		free(res_buf); /* uclibc allocates a buffer even on EOF. WTF? */
+		res_buf = NULL;
+	}
+//TODO: trimming to res_sz?
+	return res_buf;
+}
+/* Get line.  Remove trailing \n */
+char* FAST_FUNC xmalloc_fgetline(FILE *file)
+{
+	char *res_buf = NULL;
+	size_t res_sz;
+
+	res_sz = getline(&res_buf, &res_sz, file);
+
+	if ((ssize_t)res_sz != -1) {
+		if (res_buf[res_sz - 1] == '\n')
+			res_buf[--res_sz] = '\0';
+//TODO: trimming to res_sz?
+	} else {
+		free(res_buf); /* uclibc allocates a buffer even on EOF. WTF? */
+		res_buf = NULL;
+	}
+	return res_buf;
+}
+
+#endif
+
+#if 0
 /* Faster routines (~twice as fast). +170 bytes. Unused as of 2008-07.
  *
  * NB: they stop at NUL byte too.

Modified: trunk/busybox/libbb/xregcomp.c
===================================================================
--- trunk/busybox/libbb/xregcomp.c	2008-08-08 18:15:29 UTC (rev 23071)
+++ trunk/busybox/libbb/xregcomp.c	2008-08-09 16:15:14 UTC (rev 23072)
@@ -27,6 +27,6 @@
 {
 	char *errmsg = regcomp_or_errmsg(preg, regex, cflags);
 	if (errmsg) {
-		bb_error_msg_and_die("xregcomp: %s", errmsg);
+		bb_error_msg_and_die("bad regex '%s': %s", regex, errmsg);
 	}
 }

Modified: trunk/busybox/testsuite/grep.tests
===================================================================
--- trunk/busybox/testsuite/grep.tests	2008-08-08 18:15:29 UTC (rev 23071)
+++ trunk/busybox/testsuite/grep.tests	2008-08-09 16:15:14 UTC (rev 23072)
@@ -62,12 +62,8 @@
 	"grep -s domatch nonexistent - ; echo \$?" \
 	"(standard input):domatch\n2\n" "" "nomatch\ndomatch\nend\n"
 
-# This doesn't match GNU behaviour (Binary file input matches)
-# acts like GNU grep -a
-testing "grep handles binary files" "grep foo input" "foo\n" "\0foo\n\n" ""
-# This doesn't match GNU behaviour (Binary file (standard input) matches)
-# acts like GNU grep -a
-testing "grep handles binary stdin" "grep foo" "foo\n" "" "\0foo\n\n"
+testing "grep handles NUL in files" "grep -a foo input" "\0foo\n" "\0foo\n\n" ""
+testing "grep handles NUL on stdin" "grep -a foo" "\0foo\n" "" "\0foo\n\n"
 
 testing "grep matches NUL" "grep . input > /dev/null 2>&1 ; echo \$?" \
 	"0\n" "\0\n" ""




More information about the busybox-cvs mailing list