[PATCH] diff: properly implement diff of binary files. -14 bytes

Matheus Izvekov mizvekov at gmail.com
Tue Jan 12 21:59:15 UTC 2010


>From 30a8e4968767978a791057b35f718498704f200e Mon Sep 17 00:00:00 2001
From: Matheus Izvekov <mizvekov at gmail.com>
Date: Tue, 12 Jan 2010 19:43:44 -0200
Subject: [PATCH] diff: properly implement diff of binary files. -14 bytes

This applet always handled binary files incorrectly, by always returning
they were different. This is fixed now, by removing asciifile() and
reusing the diff machinery for comparing the files. The good upside of
this change is that for the normal case, now only 3 passes over the
input files are done, instead of 4. Another good upside is that more
code is reused, and we end up saving some bytes. The downside is that
comparing binary files is more expensive in terms of cpu and memory.
But if the user wants that, he should be using `cmp` anyway.

This commit also removes FEATURE_DIFF_MINIMAL and FEATURE_DIFF_BINARY,
having them now always on.
The former because it saves only one branch that is executed once per
diff, the latter because with the code size reduction achieved in this
patch, it is not worth the hassle IMO.
---
 editors/Config.in |   16 --------
 editors/diff.c    |  110 ++++++++++++++++++++--------------------------------
 2 files changed, 42 insertions(+), 84 deletions(-)

diff --git a/editors/Config.in b/editors/Config.in
index 7dbc9b6..e1285f4 100644
--- a/editors/Config.in
+++ b/editors/Config.in
@@ -35,14 +35,6 @@ config DIFF
 	  differences between them in a form that can be given to
 	  the patch command.
 
-config FEATURE_DIFF_BINARY
-	bool "Enable checks for binary files"
-	default y
-	depends on DIFF
-	help
-	  This option enables support for checking for binary files
-	  before a comparison is carried out.
-
 config FEATURE_DIFF_DIR
 	bool "Enable directory support"
 	default y
@@ -51,14 +43,6 @@ config FEATURE_DIFF_DIR
 	  This option enables support for directory and subdirectory
 	  comparison.
 
-config FEATURE_DIFF_MINIMAL
-	bool "Enable -d option to find smaller sets of changes"
-	default n
-	depends on DIFF
-	help
-	  Enabling this option allows the use of -d to make diff
-	  try hard to find the smallest possible set of changes.
-
 config ED
 	bool "ed"
 	default n
diff --git a/editors/diff.c b/editors/diff.c
index 2087aaa..405a3da 100644
--- a/editors/diff.c
+++ b/editors/diff.c
@@ -119,8 +119,6 @@ enum flags {               /* Commandline flags */
 };
 #define FLAG(x) (1 << FLAG_##x)
 
-#define g_read_buf bb_common_bufsiz1
-
 struct globals {
 	smallint exit_status;
 	int opt_U_context;
@@ -148,7 +146,7 @@ static int read_char(int fd)
 }
 
 struct token {
-	bool eol, eof, empty, space;
+	bool eol, eof, empty, space, print;
 	int c;
 };
 
@@ -161,7 +159,8 @@ static void read_token(int fd, struct token *tok)
 	while (!tok->eol) {
 		int t = read_char(fd);
 		bool is_space = isspace(t) || t == EOF;
-		
+		tok->print = is_space || isprint_asciionly(t);
+
 		switch(t) {
 		case EOF:  tok->eof = true;
 		case '\n': tok->eol = true;
@@ -212,15 +211,10 @@ static int search(const int *c, const int k, const int y, const struct cand *lis
 
 static void stone(const int *a, const int n, const int *b, int *J, int pref)
 {
-	int k = 0;
 	const int isq = sqrt(n);
-#if ENABLE_FEATURE_DIFF_MINIMAL
 	const unsigned bound =
 		(option_mask32 & FLAG(d)) ? UINT_MAX : MAX(256, isq);
-#else
-	const unsigned bound = MAX(256, isq);
-#endif
-	int clen = 1, clistlen = 100;
+	int clen = 1, clistlen = 100, k = 0;
 	struct cand *clist = xmalloc(clistlen * sizeof(struct cand));
 	int *klist = xmalloc((n + 2) * sizeof(int));
 	clist[0] = (struct cand){0};
@@ -376,7 +370,7 @@ static void fetch(int fd, const off_t *ix, int a, int b, int ch)
  * * 1: if EOF token was not read
  * * 2: opposite of the above
  */
-static int hash_line(int fd, int *hash)
+static int hash_line(int fd, int *hash, bool *binary)
 {
 	struct token tok = {0};
 	bool begin = false;
@@ -384,6 +378,7 @@ static int hash_line(int fd, int *hash)
 
 	for (out = 0; !tok.empty; out = out * 127 + tok.c) {
 		read_token(fd, &tok);
+		*binary |= !tok.print;
 		if (tok.eof && !begin)
 			return 0;
 		begin = true;
@@ -406,7 +401,7 @@ static int hash_line(int fd, int *hash)
  *   assigned dynamically allocated vectors of the offsets of the lines
  *   of the old and new file respectively. These must be freed by the caller
  */
-static int *create_J(int fd[2], int nlen[2], off_t *ix[2])
+static int *create_J(int fd[2], int nlen[2], off_t *ix[2], bool *binary)
 {
 	int *J, slen[2], *class, *member;
 	struct line *nfile[2], *sfile[2];
@@ -421,9 +416,9 @@ static int *create_J(int fd[2], int nlen[2], off_t *ix[2])
 		size_t sz = 100;
 		nfile[i] = xmalloc((sz + 3) * sizeof(nfile[i][0]));
 		xlseek(fd[i], 0, SEEK_SET);
-		
+
 		nlen[i] = 0;
-		for (int hash, ret; (ret = hash_line(fd[i], &hash)); ) {
+		for (int hash, ret; (ret = hash_line(fd[i], &hash, binary)); ) {
 			if (nlen[i]++ == sz) {
 				sz = sz * 3 / 2;
 				nfile[i] = xrealloc(nfile[i], (sz + 3) * sizeof(nfile[i][0]));
@@ -534,23 +529,27 @@ struct context_vec {
 	int d;          /* end line in new file */
 };
 
-static bool NOINLINE diff(char *file[2], int fd[2])
+static unsigned NOINLINE diff(char *file[2], int fd[2])
 {
 	int nlen[2];
 	off_t *ix[2];
-	int *J = create_J(fd, nlen, ix);
-	
+	bool binary = false;
+	int *J = create_J(fd, nlen, ix, &binary);
+
 	bool anychange = false;
 	struct context_vec *vec = NULL;
 	int idx = -1, i = 1;
 
+	if (option_mask32 & FLAG(a))
+		binary = false;
+
 	do {
 		while (1) {
 			struct context_vec v;
-			
+
 			for (v.a = i; v.a <= nlen[0] && J[v.a] == J[v.a - 1] + 1; v.a++);
 			v.c = J[v.a - 1] + 1;
-			
+
 			for (v.b = v.a - 1; v.b < nlen[0] && !J[v.b + 1]; v.b++);
 			v.d = J[v.b + 1] - 1;
 			/*
@@ -580,7 +579,7 @@ static bool NOINLINE diff(char *file[2], int fd[2])
 		}
 		if (idx < 0)
 			continue;
-		if (!(option_mask32 & FLAG(q))) {
+		if (!(option_mask32 & FLAG(q)) && !binary) {
 			struct context_vec *cvp = vec;
 			int lowa = MAX(1, cvp->a - opt_U_context),
 			    upb  = MIN(nlen[0], vec[idx].b + opt_U_context),
@@ -630,27 +629,13 @@ static bool NOINLINE diff(char *file[2], int fd[2])
 	free(ix[0]);
 	free(ix[1]);
 	free(J);
-	return anychange;
-}
-
-static bool asciifile(int fd)
-{
-#if ENABLE_FEATURE_DIFF_BINARY
-	if (option_mask32 & FLAG(a))
-		return true;
-	for (int i = 0, cnt = full_read(fd, g_read_buf, COMMON_BUFSIZE);
-	     i < cnt; i++)
-		if (!isprint_asciionly(g_read_buf[i]) && !isspace(g_read_buf[i]))
-			return false;
-#endif
-	return true;
+	return !anychange ? STATUS_SAME : (binary ? STATUS_BINARY : STATUS_DIFFER);
 }
 
-
 static unsigned diffreg(char *file[2], int flags)
 {
 	int fd[2] = { -1, -1 };
-	unsigned rval = STATUS_SAME; /* None of them are directories */
+	unsigned rval;
 
 	/* Is any of them a directory? Then it's simple */
 	if (S_ISDIR(stb[0].st_mode) != S_ISDIR(stb[1].st_mode))
@@ -679,21 +664,12 @@ static unsigned diffreg(char *file[2], int flags)
 				close(fd[i]);
 			fd[i] = fd_tmp;
 		}
-		if (!asciifile(fd[i])) {
-			exit_status |= 1;
-			rval = STATUS_BINARY;
-			goto closem;
-		}
-	}
-	
-	if (diff(file, fd)) {
-		exit_status |= 1;
-		dbg_error_msg("exit_status|=1 = %d", exit_status);
-		if (rval == STATUS_SAME)
-			rval = STATUS_DIFFER;
 	}
 
- closem:
+	rval = diff(file, fd);
+
+	if (rval != STATUS_SAME)
+		exit_status |= 1;
 	if (fd[0])
 		close(fd[0]);
 	if (fd[1])
@@ -708,10 +684,8 @@ static void print_status(int val, char *_path[2])
 		printf("Common subdirectories: %s and %s\n", _path[0], _path[1]);
 		break;
 	case STATUS_BINARY:
-		printf("Binary files %s and %s differ\n", _path[0], _path[1]);
-		break;
 	case STATUS_DIFFER:
-		if (option_mask32 & FLAG(q))
+		if ((option_mask32 & FLAG(q)) || val == STATUS_BINARY)
 			printf("Files %s and %s differ\n", _path[0], _path[1]);
 		break;
 	case STATUS_SAME:
@@ -797,7 +771,9 @@ static int FAST_FUNC add_to_dirlist(const char *filename,
 	return true;
 }
 
-/* This function adds a filename to dl, the directory listing. */
+/* If recursion is not set, this function adds the directory
+ * to the list and prevents recursive_action from recursing into it.
+ */
 static int FAST_FUNC skip_dir(const char *filename UNUSED_PARAM,
 		struct stat *sb UNUSED_PARAM,
 		void *userdata UNUSED_PARAM,
@@ -872,7 +848,6 @@ int diff_main(int argc UNUSED_PARAM, char **argv)
 	int gotstdin = 0;
 	char *file[2], *s_start = NULL;
 	llist_t *L_arg = NULL;
-	bool dir, dirfile;
 
 	INIT_G();
 
@@ -908,25 +883,24 @@ int diff_main(int argc UNUSED_PARAM, char **argv)
 	if (S_ISDIR(stb[0].st_mode) && S_ISDIR(stb[1].st_mode)) {
 #if ENABLE_FEATURE_DIFF_DIR
 		diffdir(file, s_start);
-		return exit_status;
 #else
 		bb_error_msg_and_die("no support for directory comparison");
 #endif
-	}
+	} else {
+		bool dirfile = S_ISDIR(stb[0].st_mode) || S_ISDIR(stb[1].st_mode);
+		bool dir = S_ISDIR(stb[1].st_mode);
+		if (dirfile) {
+			const char *slash = strrchr(file[!dir], '/');
+			file[dir] = concat_path_file(file[dir], slash ? slash + 1 : file[!dir]);
+			xstat(file[dir], &stb[dir]);
+		}
+		/* diffreg can get non-regular files here,
+		 * they are not both DIRestories */
+		print_status((gotstdin > 1 ? STATUS_SAME : diffreg(file, 0)), file);
 
-	dirfile = S_ISDIR(stb[0].st_mode) || S_ISDIR(stb[1].st_mode);
-	dir = S_ISDIR(stb[1].st_mode);
-	if (dirfile) {
-		const char *slash = strrchr(file[!dir], '/');
-		file[dir] = concat_path_file(file[dir], slash ? slash + 1 : file[!dir]);
-		xstat(file[dir], &stb[dir]);
+		if (dirfile)
+			free(file[dir]);
 	}
-	/* diffreg can get non-regular files here,
-	 * they are not both DIRestories */
-	print_status((gotstdin > 1 ? STATUS_SAME : diffreg(file, 0)), file);
-
-	if (dirfile)
-		free(file[dir]);
 
 	free(label[0]);
 	free(label[1]);
-- 
1.6.6



More information about the busybox mailing list