[PATCH] major performance improvement for get_line_from_file.c
Rich Felker
dalias at aerifal.cx
Wed Jul 5 08:03:20 UTC 2006
the attached patch greatly improves performance by using fgets rather
than individual getc calls in get_line_from_file.c, which is used by
most utilities that process text files. it can easily be modified (see
the commented lines) to use the also-attached function for handling
embedded nul bytes, but as this is slower, larger, and most/all of the
calling applets would not support it anyway, i doubt it's useful at
this time, but i'm sending it to the list for future reference anyway.
oh, one other thing included in this patch: it modifies
bb_get_chunk_from_file to use size_t rather than int for the size of
the string read. imo using int here is dangerous and could lead to
unexpected behavior on platforms with 64bit size_t but 32bit int and
maliciously-crafted input files.
rich
p.s. also apologies that the diff is not minimal-size. i got confused
trying to translate the logic from gets to fgets so i just rewrote the
function from scratch.
-------------- next part --------------
Index: libbb/get_line_from_file.c
===================================================================
--- libbb/get_line_from_file.c (revision 15108)
+++ libbb/get_line_from_file.c (working copy)
@@ -11,6 +11,7 @@
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include "libbb.h"
/* get_line_from_file() - This function reads an entire line from a text file,
@@ -18,43 +19,43 @@
* stored and free'ed by the caller. If end is null '\n' isn't considered
* and of line. If end isn't null, length of the chunk read is stored in it. */
-char *bb_get_chunk_from_file(FILE *file, int *end)
+char *bb_get_chunk_from_file(FILE *f, size_t *n)
{
- int ch;
- int idx = 0;
- char *linebuf = NULL;
- int linebufsz = 0;
+ size_t len = 0, size = 127, l;
+ char *buf = xmalloc(size);
- while ((ch = getc(file)) != EOF) {
- /* grow the line buffer as necessary */
- if (idx > linebufsz - 2) {
- linebuf = xrealloc(linebuf, linebufsz += 80);
- }
- linebuf[idx++] = (char)ch;
- if (!ch || (end && ch == '\n')) break;
+ for (;;) {
+ //if (!(l = fgets_nul(buf+len, size-len, f))) break;
+ if (!fgets(buf+len, size-len, f) || !(l = strlen(buf+len))) break;
+ if (l < size-len-1 || (n && buf[size-2] == '\n')) {
+ len += l;
+ break;
+ } else len = size - 1;
+ // safe: SIZE_MAX allocation will fail before overflow
+ size = 2*size+1;
+ buf = xrealloc(buf, size);
}
- if (end) *end = idx;
- if (linebuf) {
- if (ferror(file)) {
- free(linebuf);
- return NULL;
- }
- linebuf[idx] = 0;
+
+ if (n) *n = len;
+ if (!len || ferror(f)) {
+ free(buf);
+ return NULL;
}
- return linebuf;
+
+ return buf; // slower: xrealloc(buf, len+1);
}
/* Get line, including trailing /n if any */
char *bb_get_line_from_file(FILE *file)
{
- int i;
+ size_t i;
return bb_get_chunk_from_file(file, &i);
}
/* Get line. Remove trailing /n */
char *bb_get_chomped_line_from_file(FILE *file)
{
- int i;
+ size_t i;
char *c=bb_get_chunk_from_file(file, &i);
if(i) c[--i]=0;
Index: include/libbb.h
===================================================================
--- include/libbb.h (revision 15108)
+++ include/libbb.h (working copy)
@@ -130,7 +130,7 @@
extern char *find_block_device(char *path);
extern char *bb_get_line_from_file(FILE *file);
extern char *bb_get_chomped_line_from_file(FILE *file);
-extern char *bb_get_chunk_from_file(FILE *file, int *end);
+extern char *bb_get_chunk_from_file(FILE *file, size_t *end);
extern int bb_copyfd_size(int fd1, int fd2, const off_t size);
extern int bb_copyfd_eof(int fd1, int fd2);
extern void bb_xprint_and_close_file(FILE *file);
-------------- next part --------------
#include <stdio.h>
#include <string.h>
/* This function implements a version of fgets than can handle embedded nul
* bytes on top of the existing stdio fgets function, with reasonable
* performance, assuming the string operations memset/memchr are a good bit
* faster than loops full of conditionals (like a getc loop).
*
* The only case in which performance may be significantly slower than
* fgets is when the supplied buffer is significantly oversized. This issue
* could be worked around by working in smaller fgets-chunks and only
* filling the "current chunk" with newline bytes.
*
* GNU users may want to just use getline(). However this code demonstrates
* that equivalent functionality may be built on top of the standard C
* library with minimal impact to performance.
*
* - Rich Felker, July 2006
*/
size_t fgets_nul(char *s, size_t n, FILE *f)
{
char *p;
/* Newline is the (only) safe inband signal character because fgets
* will never write a newline except when it has finished reading. */
memset(s, '\n', n);
/* If fgets fails completely, nothing to do. */
if (!fgets(s, n, f)) return 0;
/* We have two cases here. Either... */
if ((p = memchr(s, '\n', n))) {
/* ...a null byte follows the newline, in which case the
* newline must actually have been read from the stream... */
if (p < s+n-1 && !p[1]) return p-s+1;
/* ...or else the newline is one of the sentinels we wrote
* with memset, in which case the read must have ended
* prematurely with eof and the location of this sentinel
* newline byte tells us how many bytes were read. */
return p-s-1;
}
/* If the buffer contains no newline bytes, then fgets must have used
* all available (n-1) bytes. */
return n-1;
}
More information about the busybox
mailing list