[git commit] sed: fix zero chars match/replace

Denys Vlasenko vda.linux at googlemail.com
Mon Jun 4 12:44:47 UTC 2012


commit: http://git.busybox.net/busybox/commit/?id=21f6fbf545e7fa58f0eaa444001a9d25bc37c4eb
branch: http://git.busybox.net/busybox/commit/?id=refs/heads/master

function                                             old     new   delta
process_files                                       2099    2181     +82

Signed-off-by: Denys Vlasenko <vda.linux at googlemail.com>
---
 editors/sed.c       |   64 +++++++++++++++++++++++++++++++++-----------------
 testsuite/sed.tests |   10 ++++++-
 2 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/editors/sed.c b/editors/sed.c
index a2df931..87fc755 100644
--- a/editors/sed.c
+++ b/editors/sed.c
@@ -673,7 +673,7 @@ static void do_subst_w_backrefs(char *line, char *replace)
 
 	/* go through the replacement string */
 	for (i = 0; replace[i]; i++) {
-		/* if we find a backreference (\1, \2, etc.) print the backref'ed * text */
+		/* if we find a backreference (\1, \2, etc.) print the backref'ed text */
 		if (replace[i] == '\\') {
 			unsigned backref = replace[++i] - '0';
 			if (backref <= 9) {
@@ -707,8 +707,10 @@ static void do_subst_w_backrefs(char *line, char *replace)
 static int do_subst_command(sed_cmd_t *sed_cmd, char **line_p)
 {
 	char *line = *line_p;
-	int altered = 0;
 	unsigned match_count = 0;
+	bool altered = 0;
+	bool prev_match_empty = 1;
+	bool tried_at_eol = 0;
 	regex_t *current_regex;
 
 	current_regex = sed_cmd->sub_match;
@@ -737,46 +739,64 @@ static int do_subst_command(sed_cmd_t *sed_cmd, char **line_p)
 	do {
 		int i;
 
-		/* Work around bug in glibc regexec, demonstrated by:
-		 * echo " a.b" | busybox sed 's [^ .]* x g'
-		 * The match_count check is so not to break
-		 * echo "hi" | busybox sed 's/^/!/g'
-		 */
-		if (!G.regmatch[0].rm_so && !G.regmatch[0].rm_eo && match_count) {
-			pipe_putc(*line++);
-			goto next;
-		}
-
 		match_count++;
 
 		/* If we aren't interested in this match, output old line to
-		   end of match and continue */
+		 * end of match and continue */
 		if (sed_cmd->which_match
 		 && (sed_cmd->which_match != match_count)
 		) {
 			for (i = 0; i < G.regmatch[0].rm_eo; i++)
 				pipe_putc(*line++);
+			/* Null match? Print one more char */
+			if (G.regmatch[0].rm_so == i && *line)
+				pipe_putc(*line++);
 			goto next;
 		}
 
-		/* print everything before the match */
+		/* Print everything before the match */
 		for (i = 0; i < G.regmatch[0].rm_so; i++)
 			pipe_putc(line[i]);
 
-		/* then print the substitution string */
-		do_subst_w_backrefs(line, sed_cmd->string);
+		/* Then print the substitution string,
+		 * unless we just matched empty string after non-empty one.
+		 * Example: string "cccd", pattern "c*", repl "R":
+		 * result is "RdR", not "RRdR": first match "ccc",
+		 * second is "" before "d", third is "" after "d".
+		 * Second match is NOT replaced!
+		 */
+		if (prev_match_empty || i != 0) {
+			dbg("inserting replacement at %d in '%s'", i, line);
+			do_subst_w_backrefs(line, sed_cmd->string);
+		} else {
+			dbg("NOT inserting replacement at %d in '%s'", i, line);
+		}
+
+		/* If matched string is empty (f.e. "c*" pattern),
+		 * copy verbatim one char after it before attempting more matches
+		 */
+		prev_match_empty = (G.regmatch[0].rm_eo == i);
+		if (prev_match_empty && line[i]) {
+			pipe_putc(line[i]);
+			G.regmatch[0].rm_eo++;
+		}
 
-		/* advance past the match */
+		/* Advance past the match */
+		dbg("line += %d", G.regmatch[0].rm_eo);
 		line += G.regmatch[0].rm_eo;
-		/* flag that something has changed */
-		altered++;
+		/* Flag that something has changed */
+		altered = 1;
 
 		/* if we're not doing this globally, get out now */
 		if (sed_cmd->which_match != 0)
 			break;
  next:
-		if (*line == '\0')
-			break;
+		/* Exit if we are at EOL and already tried matching at it */
+		if (*line == '\0') {
+			if (tried_at_eol)
+				break;
+			tried_at_eol = 1;
+		}
 
 //maybe (G.regmatch[0].rm_eo ? REG_NOTBOL : 0) instead of unconditional REG_NOTBOL?
 	} while (regexec(current_regex, line, 10, G.regmatch, REG_NOTBOL) != REG_NOMATCH);
@@ -1127,7 +1147,7 @@ static void process_files(void)
 		case 's':
 			if (!do_subst_command(sed_cmd, &pattern_space))
 				break;
-			dbg("do_subst_command succeeeded:'%s'", pattern_space);
+			dbg("do_subst_command succeeded:'%s'", pattern_space);
 			substituted |= 1;
 
 			/* handle p option */
diff --git a/testsuite/sed.tests b/testsuite/sed.tests
index 9fa8e19..375beb5 100755
--- a/testsuite/sed.tests
+++ b/testsuite/sed.tests
@@ -52,10 +52,8 @@ testing "sed with empty match" "sed 's/z*//g'" "string\n" "" "string\n"
 testing "sed s//p" "sed -e s/foo/bar/p -e s/bar/baz/p" "bar\nbaz\nbaz\n" \
 	"" "foo\n"
 testing "sed -n s//p" "sed -ne s/abc/def/p" "def\n" "" "abc\n"
-test x"$SKIP_KNOWN_BUGS" = x"" && {
 testing "sed s//g (exhaustive)" "sed -e 's/[[:space:]]*/,/g'" ",1,2,3,4,5,\n" \
 	"" "12345\n"
-}
 testing "sed s arbitrary delimiter" "sed -e 's woo boing '" "boing\n" "" "woo\n"
 testing "sed s chains" "sed -e s/foo/bar/ -e s/bar/baz/" "baz\n" "" "foo\n"
 testing "sed s chains2" "sed -e s/foo/bar/ -e s/baz/nee/" "bar\n" "" "foo\n"
@@ -296,6 +294,14 @@ testing "sed -i finishes ranges correctly" \
 	"sed '1,2d' -i input; echo \$?; cat input" \
 	"0\n3\n4\n" "1\n2\n3\n4\n" ""
 
+testing "sed zero chars match/replace advances correctly 1" \
+	"sed 's/l*/@/g'" \
+	"@h@e@o@\n" "" "helllo\n"
+
+testing "sed zero chars match/replace advances correctly 2" \
+	"sed 's [^ .]* x g'" \
+	"x x.x\n" "" " a.b\n"
+
 # testing "description" "commands" "result" "infile" "stdin"
 
 exit $FAILCOUNT


More information about the busybox-cvs mailing list