[git commit] awk: fix handling of empty fields

Denys Vlasenko vda.linux at googlemail.com
Sun Dec 31 14:49:54 UTC 2023


commit: https://git.busybox.net/busybox/commit/?id=789ccac7d9d1a9e433570ac9628992a01f946643
branch: https://git.busybox.net/busybox/commit/?id=refs/heads/master

Patch by M Rubon <rubonmtz at gmail.com>:
Busybox awk handles references to empty (not provided in the input)
fields differently during the first line of input, as compared to
subsequent lines.

$ (echo a ; echo b) | awk '$2 != 0'    #wrong
b

No field $2 value is provided in the input.  When awk references field
$2 for the "a" line, it behaves differently than when it references
field $2 for the "b" line.

Problem in BusyBox v1.36.1 embedded in OpenWrt 23.05.0
Same problem also in 21.02 versions of OpenWrt
Same problem in BusyBox v1.37.0.git

I get the correct expected output from Ubuntu gawk and Debian mawk,
and from my fix.
will at dev:~$ (echo a ; echo b) | awk '$2 != 0'  #correct
a
b
will at dev:~/busybox$ (echo a ; echo b ) | ./busybox awk '$2 != 0'  #fixed
a
b

I built and poked into the source code at editors/awk.c.  The function
fsrealloc(int size) is core to allocating, initializing, reallocating,
and reinitializing fields, both real input line fields and imaginary
fields that the script references but do not exist in the input.

When fsrealloc() needs more field space than it has previously
allocated, it initializes those new fields differently than how they
are later reinitialized for the next input line.  This works fine for
fields defined in the input, like $1, but does not work the first time
when there is no input for that field (e.g. field $99).

My one-line fix simply makes the initialization and clrvar()
reinitialization use the same value for .type.  I am not sure whether
there are regression tests to run; I have not run any.

I'm not sure if I understand why clrvar() is not setting .type to a
default constant value, but in any case I have left that untouched.

function                                             old     new   delta
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 0/0 up/down: 0/0)                 Total: 0 bytes

Signed-off-by: Denys Vlasenko <vda.linux at googlemail.com>
---
 editors/awk.c       | 33 +++++++++++++++++----------------
 testsuite/awk.tests |  7 +++++++
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/editors/awk.c b/editors/awk.c
index bc95c4155..aa485c782 100644
--- a/editors/awk.c
+++ b/editors/awk.c
@@ -555,8 +555,9 @@ struct globals {
 	//we are reusing ahash as fdhash, via define (see later)
 	const char *g_progname;
 	int g_lineno;
-	int nfields;
-	unsigned maxfields;
+	int num_fields;             /* number of existing $N's */
+	unsigned num_alloc_fields;  /* current size of Fields[] */
+	/* NB: Fields[0] corresponds to $1, not to $0 */
 	var *Fields;
 	char *g_pos;
 	char g_saved_ch;
@@ -631,8 +632,8 @@ struct globals2 {
 // for fdhash in execution stage.
 #define g_progname   (G1.g_progname  )
 #define g_lineno     (G1.g_lineno    )
-#define nfields      (G1.nfields     )
-#define maxfields    (G1.maxfields   )
+#define num_fields   (G1.num_fields  )
+#define num_alloc_fields (G1.num_alloc_fields)
 #define Fields       (G1.Fields      )
 #define g_pos        (G1.g_pos       )
 #define g_saved_ch   (G1.g_saved_ch  )
@@ -1966,30 +1967,30 @@ static void fsrealloc(int size)
 {
 	int i, newsize;
 
-	if ((unsigned)size >= maxfields) {
+	if ((unsigned)size >= num_alloc_fields) {
 		/* Sanity cap, easier than catering for over/underflows */
 		if ((unsigned)size > 0xffffff)
 			bb_die_memory_exhausted();
 
-		i = maxfields;
-		maxfields = size + 16;
+		i = num_alloc_fields;
+		num_alloc_fields = size + 16;
 
-		newsize = maxfields * sizeof(Fields[0]);
+		newsize = num_alloc_fields * sizeof(Fields[0]);
 		debug_printf_eval("fsrealloc: xrealloc(%p, %u)\n", Fields, newsize);
 		Fields = xrealloc(Fields, newsize);
 		debug_printf_eval("fsrealloc: Fields=%p..%p\n", Fields, (char*)Fields + newsize - 1);
 		/* ^^^ did Fields[] move? debug aid for L.v getting "upstaged" by R.v in evaluate() */
 
-		for (; i < maxfields; i++) {
-			Fields[i].type = VF_SPECIAL;
+		for (; i < num_alloc_fields; i++) {
+			Fields[i].type = VF_SPECIAL | VF_DIRTY;
 			Fields[i].string = NULL;
 		}
 	}
-	/* if size < nfields, clear extra field variables */
-	for (i = size; i < nfields; i++) {
+	/* if size < num_fields, clear extra field variables */
+	for (i = size; i < num_fields; i++) {
 		clrvar(Fields + i);
 	}
-	nfields = size;
+	num_fields = size;
 }
 
 static int regexec1_nonempty(const regex_t *preg, const char *s, regmatch_t pmatch[])
@@ -2126,7 +2127,7 @@ static void split_f0(void)
 	/* set NF manually to avoid side effects */
 	clrvar(intvar[NF]);
 	intvar[NF]->type = VF_NUMBER | VF_SPECIAL;
-	intvar[NF]->number = nfields;
+	intvar[NF]->number = num_fields;
 #undef fstrings
 }
 
@@ -2976,7 +2977,7 @@ static var *evaluate(node *op, var *res)
 				syntax_error(EMSG_TOO_FEW_ARGS);
 			L.v = evaluate(op1, TMPVAR0);
 			/* Does L.v point to $n variable? */
-			if ((size_t)(L.v - Fields) < maxfields) {
+			if ((size_t)(L.v - Fields) < num_alloc_fields) {
 				/* yes, remember where Fields[] is */
 				old_Fields_ptr = Fields;
 			}
@@ -3517,7 +3518,7 @@ static var *evaluate(node *op, var *res)
 				res = intvar[F0];
 			} else {
 				split_f0();
-				if (i > nfields)
+				if (i > num_fields)
 					fsrealloc(i);
 				res = &Fields[i - 1];
 			}
diff --git a/testsuite/awk.tests b/testsuite/awk.tests
index 5a792c241..063084a1c 100755
--- a/testsuite/awk.tests
+++ b/testsuite/awk.tests
@@ -592,6 +592,13 @@ testing 'awk gensub backslashes \\0' \
 \\0|\\0
 ' '' ''
 
+# References to empty (not provided in the input) fields in first versus subsequent lines
+testing 'awk references to empty fields' \
+	'awk '$sq'$2 != 0'$sq \
+	'a
+b
+' '' 'a\nb\n'
+
 # The "b" in "abc" should not match <b* pattern.
 # Currently we use REG_STARTEND ("This flag is a BSD extension, not present in POSIX")
 # to implement the code to handle this correctly, but if your libc has no REG_STARTEND,


More information about the busybox-cvs mailing list