[PATCH] major performance improvement for get_line_from_file.c

Rich Felker dalias at aerifal.cx
Wed Jul 5 08:03:20 UTC 2006


the attached patch greatly improves performance by using fgets rather
than individual getc calls in get_line_from_file.c, which is used by
most utilities that process text files. it can easily be modified (see
the commented lines) to use the also-attached function for handling
embedded nul bytes, but as this is slower, larger, and most/all of the
calling applets would not support it anyway, i doubt it's useful at
this time, but i'm sending it to the list for future reference anyway.

oh, one other thing included in this patch: it modifies
bb_get_chunk_from_file to use size_t rather than int for the size of
the string read. imo using int here is dangerous and could lead to
unexpected behavior on platforms with 64bit size_t but 32bit int and
maliciously-crafted input files.

rich


p.s. also apologies that the diff is not minimal-size. i got confused
trying to translate the logic from gets to fgets so i just rewrote the
function from scratch.

-------------- next part --------------
Index: libbb/get_line_from_file.c
===================================================================
--- libbb/get_line_from_file.c	(revision 15108)
+++ libbb/get_line_from_file.c	(working copy)
@@ -11,6 +11,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include "libbb.h"
 
 /* get_line_from_file() - This function reads an entire line from a text file,
@@ -18,43 +19,43 @@
  * stored and free'ed  by the caller.  If end is null '\n' isn't considered
  * and of line.  If end isn't null, length of the chunk read is stored in it. */
 
-char *bb_get_chunk_from_file(FILE *file, int *end)
+char *bb_get_chunk_from_file(FILE *f, size_t *n)
 {
-	int ch;
-	int idx = 0;
-	char *linebuf = NULL;
-	int linebufsz = 0;
+	size_t len = 0, size = 127, l;
+	char *buf = xmalloc(size);
 
-	while ((ch = getc(file)) != EOF) {
-		/* grow the line buffer as necessary */
-		if (idx > linebufsz - 2) {
-			linebuf = xrealloc(linebuf, linebufsz += 80);
-		}
-		linebuf[idx++] = (char)ch;
-		if (!ch || (end && ch == '\n')) break;
+	for (;;) {
+		//if (!(l = fgets_nul(buf+len, size-len, f))) break;
+		if (!fgets(buf+len, size-len, f) || !(l = strlen(buf+len))) break;
+		if (l < size-len-1 || (n && buf[size-2] == '\n')) {
+			len += l;
+			break;
+		} else len = size - 1;
+		// safe: SIZE_MAX allocation will fail before overflow
+		size = 2*size+1;
+		buf = xrealloc(buf, size);
 	}
-	if (end) *end = idx;
-	if (linebuf) {
-		if (ferror(file)) {
-			free(linebuf);
-			return NULL;
-		}
-		linebuf[idx] = 0;
+
+	if (n) *n = len;
+	if (!len || ferror(f)) {
+		free(buf);
+		return NULL;
 	}
-	return linebuf;
+
+	return buf; // slower: xrealloc(buf, len+1);
 }
 
 /* Get line, including trailing /n if any */
 char *bb_get_line_from_file(FILE *file)
 {
-	int i;
+	size_t i;
 	return bb_get_chunk_from_file(file, &i);
 }
 
 /* Get line.  Remove trailing /n */
 char *bb_get_chomped_line_from_file(FILE *file)
 {
-	int i;
+	size_t i;
 	char *c=bb_get_chunk_from_file(file, &i);
 	if(i) c[--i]=0;
 	
Index: include/libbb.h
===================================================================
--- include/libbb.h	(revision 15108)
+++ include/libbb.h	(working copy)
@@ -130,7 +130,7 @@
 extern char *find_block_device(char *path);
 extern char *bb_get_line_from_file(FILE *file);
 extern char *bb_get_chomped_line_from_file(FILE *file);
-extern char *bb_get_chunk_from_file(FILE *file, int *end);
+extern char *bb_get_chunk_from_file(FILE *file, size_t *end);
 extern int bb_copyfd_size(int fd1, int fd2, const off_t size);
 extern int bb_copyfd_eof(int fd1, int fd2);
 extern void  bb_xprint_and_close_file(FILE *file);
-------------- next part --------------
#include <stdio.h>
#include <string.h>

/* This function implements a version of fgets than can handle embedded nul
 * bytes on top of the existing stdio fgets function, with reasonable
 * performance, assuming the string operations memset/memchr are a good bit
 * faster than loops full of conditionals (like a getc loop).
 *
 * The only case in which performance may be significantly slower than
 * fgets is when the supplied buffer is significantly oversized. This issue
 * could be worked around by working in smaller fgets-chunks and only
 * filling the "current chunk" with newline bytes.
 *
 * GNU users may want to just use getline(). However this code demonstrates
 * that equivalent functionality may be built on top of the standard C
 * library with minimal impact to performance.
 *
 * - Rich Felker, July 2006
 */

size_t fgets_nul(char *s, size_t n, FILE *f)
{
	char *p;

	/* Newline is the (only) safe inband signal character because fgets
	 * will never write a newline except when it has finished reading. */
	memset(s, '\n', n);

	/* If fgets fails completely, nothing to do. */
	if (!fgets(s, n, f)) return 0;

	/* We have two cases here. Either... */
	if ((p = memchr(s, '\n', n))) {
		/* ...a null byte follows the newline, in which case the
		 * newline must actually have been read from the stream... */
		if (p < s+n-1 && !p[1]) return p-s+1;

		/* ...or else the newline is one of the sentinels we wrote
		 * with memset, in which case the read must have ended
		 * prematurely with eof and the location of this sentinel
		 * newline byte tells us how many bytes were read. */
		return p-s-1;
	}

	/* If the buffer contains no newline bytes, then fgets must have used
	 * all available (n-1) bytes. */
	return n-1;
}


More information about the busybox mailing list