[RESEND PATCH v2] sed: parse delimiters in regular expression correctly
Yao Zi
ziyao at disroot.org
Thu Nov 14 11:10:30 UTC 2024
As specified in POSIX standard[1], delimiters in bracket expression
should not terminate the regex and always have their original meaning,
thus 's/[\/]//' matches either '\' or '/' and 's/[[:alpha:]/]//' matches
any alphabetic character or '/'. But with busybox sed,
$ echo a | sed 's/[[:alpha:]/]/b/'
sed: bad option in substitution expression
$ echo '\/' | sed 's/[\/]//'
\
This commit implements a state machine to determine whether a character
is in a bracket expression, in order to parse escape sequences and
command delimiters correctly, following the specification and other
implementations' behavior (GNU and NetBSD). Corresponding tests are added
as well.
[1]: "Regular Expressions in sed" https://pubs.opengroup.org/onlinepubs/9799919799/utilities/sed.html
Closes: http://lists.busybox.net/pipermail/busybox/2024-July/090844.html
Fixes: e998c7c03 ("sed: fix handling of escaped delimiters in s/// search pattern, closes 14541")
Signed-off-by: Yao Zi <ziyao at disroot.org>
---
editors/sed.c | 145 ++++++++++++++++++++++++++++++++++----------
testsuite/sed.tests | 5 ++
2 files changed, 119 insertions(+), 31 deletions(-)
diff --git a/editors/sed.c b/editors/sed.c
index 6179c5e80..72397bf37 100644
--- a/editors/sed.c
+++ b/editors/sed.c
@@ -254,17 +254,102 @@ static void cleanup_outname(void)
if (G.outname) unlink(G.outname);
}
-/* strcpy, replacing "\from" with 'to'. If to is NUL, replacing "\any" with 'any' */
-static unsigned parse_escapes(char *dest, const char *string, int len, char from, char to)
+/*
+ * Track bracket-expression state for c. status must be the value returned
+ * by the previous call, or 0 on the first call and after a -1 return.
+ * Returns 0 if c is outside a bracket expression or closes one, -1 if c is
+ * a backslash outside brackets, or a positive state value otherwise.
+ */
+static int is_in_bracket_expr(int status, char c)
+{
+ enum {
+ ESCAPE_SEQ_BACKSLASH = -1,
+ OUT_OF_BRACKET_EXPR = 0,
+ COMPLEMENT,
+ FIRST_LITERAL_CHAR,
+ IN_BRACKET_EXPR,
+ BRACKET_IN_BRACKET_EXPR,
+ COLLATING_SEQ = '.', // value is the marker char, so "c == status" finds the closer
+ COLLATING_SEQ_END,
+ EQU_CLASS = '=',
+ EQU_CLASS_END,
+ WORD_CLASS = ':',
+ WORD_CLASS_END
+ };
+
+ switch (status) {
+ case OUT_OF_BRACKET_EXPR:
+ status = c == '\\' ? ESCAPE_SEQ_BACKSLASH :
+ c == '[' ? COMPLEMENT :
+ status;
+ break;
+ case COMPLEMENT:
+ if (c == '^') {
+ status = FIRST_LITERAL_CHAR;
+ break;
+ }
+ // fallthrough
+ case FIRST_LITERAL_CHAR:
+ // ']' and '-' as the first character (maybe after '^') are
+ // literal. We don't care about the latter.
+ if (c == ']') {
+ status = IN_BRACKET_EXPR;
+ break;
+ }
+
+ // avoid the beginning '[' of a collating element being ignored
+ // fallthrough
+ case IN_BRACKET_EXPR:
+ status = c == '[' ? BRACKET_IN_BRACKET_EXPR :
+ c == ']' ? OUT_OF_BRACKET_EXPR :
+ IN_BRACKET_EXPR;
+ break;
+ case BRACKET_IN_BRACKET_EXPR:
+ status = c == '.' ? COLLATING_SEQ :
+ c == '=' ? EQU_CLASS :
+ c == ':' ? WORD_CLASS :
+ IN_BRACKET_EXPR;
+ break;
+ case COLLATING_SEQ:
+ case EQU_CLASS:
+ case WORD_CLASS:
+ if (c == status) // same marker char again: move to the matching *_END state
+ status++;
+ break;
+ case COLLATING_SEQ_END:
+ case EQU_CLASS_END:
+ case WORD_CLASS_END:
+ status = c == ']' ? IN_BRACKET_EXPR : status - 1; // no ']': back to the class body state
+ break;
+ default: // taken for -1 or any other value without a case above
+ bb_error_msg_and_die("Unreachable code path");
+ break;
+ }
+
+ return status;
+}
+
+/* strcpy, replacing "\from" with 'to'.
+ * If to is NUL, replacing "\any" with 'any'.
+ * If re is 1, '\from' in bracket expression is not treated as escape sequence.
+ * to must be non-NUL in this case.
+ */
+static unsigned parse_escapes(char *dest, const char *string, int len,
+ char from, char to, int re)
{
char *d = dest;
int i = 0;
+ int status = re - 1;
if (len == -1)
len = strlen(string);
while (i < len) {
- if (string[i] == '\\') {
+ if (re)
+ status = is_in_bracket_expr(status, string[i]);
+
+ if (status < 0 && string[i] == '\\') {
+ status = re - 1;
if (!to || string[i+1] == from) {
if ((*d = to ? to : string[i+1]) == '\0')
return d - dest;
@@ -276,6 +361,7 @@ static unsigned parse_escapes(char *dest, const char *string, int len, char from
*d++ = '\\';
/* fall through: copy next char verbatim */
}
+
if ((*d = string[i++]) == '\0')
return d - dest;
d++;
@@ -284,7 +370,8 @@ static unsigned parse_escapes(char *dest, const char *string, int len, char from
return d - dest;
}
-static char *copy_parsing_escapes(const char *string, int len, char delim)
+static char *copy_parsing_escapes(const char *string, int len, char delim,
+ int re)
{
const char *s;
char *dest = xmalloc(len + 1);
@@ -292,14 +379,15 @@ static char *copy_parsing_escapes(const char *string, int len, char delim)
/* sed recognizes \n */
/* GNU sed also recognizes \t and \r */
for (s = "\nn\tt\rr"; *s; s += 2) {
- len = parse_escapes(dest, string, len, s[1], s[0]);
+ len = parse_escapes(dest, string, len, s[1], s[0],
+ re && delim == s[1]);
string = dest;
}
if (delim) {
/* we additionally unescape any instances of escaped delimiter.
* For example, in 's+9\++X+' the pattern is "9+", not "9\+".
*/
- len = parse_escapes(dest, string, len, delim, delim);
+ len = parse_escapes(dest, string, len, delim, delim, re);
}
return dest;
}
@@ -312,31 +400,25 @@ static char *copy_parsing_escapes(const char *string, int len, char delim)
*/
static int index_of_next_unescaped_regexp_delim(int delimiter, const char *str)
{
- int bracket = -1;
- int escaped = 0;
+ int status = 0, bracket = 1;
int idx = 0;
- char ch;
if (delimiter < 0) {
- bracket--;
+ bracket = 0;
delimiter = -delimiter;
}
- for (; (ch = str[idx]) != '\0'; idx++) {
- if (bracket >= 0) {
- if (ch == ']'
- && !(bracket == idx - 1 || (bracket == idx - 2 && str[idx - 1] == '^'))
- ) {
- bracket = -1;
- }
- } else if (escaped)
- escaped = 0;
- else if (ch == '\\')
- escaped = 1;
- else if (bracket == -1 && ch == '[')
- bracket = idx;
- else if (ch == delimiter)
+ for (; str[idx]; idx++) {
+ if (bracket)
+ status = is_in_bracket_expr(status, str[idx]);
+
+ if (status < 0 || (!bracket && str[idx] == '\\')) {
+ status = 0;
+ if (str[idx + 1])
+ idx++;
+ } else if (status == 0 && str[idx] == delimiter) {
return idx;
+ }
}
/* if we make it to here, we've hit the end of the string */
@@ -360,14 +442,14 @@ static int parse_regex_delim(const char *cmdstr, char **match, char **replace)
/* save the match string */
idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr);
- *match = copy_parsing_escapes(cmdstr_ptr, idx, delimiter);
+ *match = copy_parsing_escapes(cmdstr_ptr, idx, delimiter, 1);
/* save the replacement string */
cmdstr_ptr += idx + 1;
idx = index_of_next_unescaped_regexp_delim(- (int)delimiter, cmdstr_ptr);
//GNU sed 4.8:
// echo 789 | sed 's&8&\&&' - 7&9 ("\&" remained "\&")
// echo 789 | sed 's1\(8\)1\1\11' - 7119 ("\1\1" become "11")
- *replace = copy_parsing_escapes(cmdstr_ptr, idx, delimiter != '&' ? delimiter : 0);
+ *replace = copy_parsing_escapes(cmdstr_ptr, idx, delimiter != '&' ? delimiter : 0, 0);
return ((cmdstr_ptr - cmdstr) + idx);
}
@@ -395,7 +477,7 @@ static int get_address(const char *my_str, int *linenum, regex_t ** regex)
delimiter = *++pos;
next = index_of_next_unescaped_regexp_delim(delimiter, ++pos);
if (next != 0) {
- temp = copy_parsing_escapes(pos, next, 0);
+ temp = copy_parsing_escapes(pos, next, 0, 0);
G.previous_regex_ptr = *regex = xzalloc(sizeof(regex_t));
xregcomp(*regex, temp, G.regex_type);
free(temp);
@@ -590,10 +672,11 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr)
cmdstr++;
}
len = strlen(cmdstr);
- sed_cmd->string = copy_parsing_escapes(cmdstr, len, 0);
+ sed_cmd->string = copy_parsing_escapes(cmdstr, len, 0, 0);
cmdstr += len;
/* "\anychar" -> "anychar" */
- parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0');
+ parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0',
+ 0);
}
/* handle file cmds: (r)ead */
else if (idx <= IDX_w) { /* r,w */
@@ -625,8 +708,8 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr)
cmdstr += parse_regex_delim(cmdstr, &match, &replace)+1;
/* \n already parsed, but \delimiter needs unescaping. */
- parse_escapes(match, match, -1, i, i);
- parse_escapes(replace, replace, -1, i, i);
+ parse_escapes(match, match, -1, i, i, 1);
+ parse_escapes(replace, replace, -1, i, i, 0);
sed_cmd->string = xzalloc((strlen(match) + 1) * 2);
for (i = 0; match[i] && replace[i]; i++) {
diff --git a/testsuite/sed.tests b/testsuite/sed.tests
index 626542e33..0656e3bda 100755
--- a/testsuite/sed.tests
+++ b/testsuite/sed.tests
@@ -428,6 +428,11 @@ testing "sed understands duplicate file name" \
"" \
"a\nb\nc\n"
+testing "sed doesn't escape delimiter in bracket expressions" \
+ "sed 's/[\/]//'" '/' "" '\/'
+
+testing "sed delimiter in bracket expression doesn't abort the regex" \
+ "sed 's/[[:alpha:]/]/b/'" 'b' "" 'z'
# testing "description" "commands" "result" "infile" "stdin"
--
2.46.0
More information about the busybox
mailing list