[git commit] unzip: properly use CDF to find compressed files. Closes 9536

Denys Vlasenko vda.linux at googlemail.com
Thu Jan 5 10:43:53 UTC 2017


commit: https://git.busybox.net/busybox/commit/?id=e3c4db8b396e955ca01648ba51c0fd093f3cab56
branch: https://git.busybox.net/busybox/commit/?id=refs/heads/master

function                                             old     new   delta
unzip_main                                          2437    2350     -87

Signed-off-by: Denys Vlasenko <vda.linux at googlemail.com>
---
 archival/unzip.c      | 285 +++++++++++++++++++++++++++++---------------------
 testsuite/unzip.tests |   6 +-
 2 files changed, 168 insertions(+), 123 deletions(-)

diff --git a/archival/unzip.c b/archival/unzip.c
index c540485..edef22f 100644
--- a/archival/unzip.c
+++ b/archival/unzip.c
@@ -16,7 +16,6 @@
  * TODO
  * Zip64 + other methods
  */
-
 //config:config UNZIP
 //config:	bool "unzip"
 //config:	default y
@@ -24,8 +23,17 @@
 //config:	  unzip will list or extract files from a ZIP archive,
 //config:	  commonly found on DOS/WIN systems. The default behavior
 //config:	  (with no options) is to extract the archive into the
-//config:	  current directory. Use the `-d' option to extract to a
-//config:	  directory of your choice.
+//config:	  current directory.
+//config:
+//config:config FEATURE_UNZIP_CDF
+//config:	bool "Read and use Central Directory data"
+//config:	default y
+//config:	depends on UNZIP
+//config:	help
+//config:	  If you know that you only need to deal with simple
+//config:	  ZIP files without deleted/updated files, SFX archves etc,
+//config:	  you can reduce code size by unselecting this option.
+//config:	  To support less trivial ZIPs, say Y.
 
 //applet:IF_UNZIP(APPLET(unzip, BB_DIR_USR_BIN, BB_SUID_DROP))
 //kbuild:lib-$(CONFIG_UNZIP) += unzip.o
@@ -80,30 +88,20 @@ typedef union {
 		uint32_t ucmpsize PACKED;       /* 18-21 */
 		uint16_t filename_len;          /* 22-23 */
 		uint16_t extra_len;             /* 24-25 */
+		/* filename follows (not NUL terminated) */
+		/* extra field follows */
+		/* data follows */
 	} formatted PACKED;
 } zip_header_t; /* PACKED - gcc 4.2.1 doesn't like it (spews warning) */
 
-/* Check the offset of the last element, not the length.  This leniency
- * allows for poor packing, whereby the overall struct may be too long,
- * even though the elements are all in the right place.
- */
-struct BUG_zip_header_must_be_26_bytes {
-	char BUG_zip_header_must_be_26_bytes[
-		offsetof(zip_header_t, formatted.extra_len) + 2
-			== ZIP_HEADER_LEN ? 1 : -1];
-};
-
-#define FIX_ENDIANNESS_ZIP(zip_header) do { \
-	(zip_header).formatted.version      = SWAP_LE16((zip_header).formatted.version     ); \
-	(zip_header).formatted.method       = SWAP_LE16((zip_header).formatted.method      ); \
-	(zip_header).formatted.modtime      = SWAP_LE16((zip_header).formatted.modtime     ); \
-	(zip_header).formatted.moddate      = SWAP_LE16((zip_header).formatted.moddate     ); \
+#define FIX_ENDIANNESS_ZIP(zip_header) \
+do { if (BB_BIG_ENDIAN) { \
 	(zip_header).formatted.crc32        = SWAP_LE32((zip_header).formatted.crc32       ); \
 	(zip_header).formatted.cmpsize      = SWAP_LE32((zip_header).formatted.cmpsize     ); \
 	(zip_header).formatted.ucmpsize     = SWAP_LE32((zip_header).formatted.ucmpsize    ); \
 	(zip_header).formatted.filename_len = SWAP_LE16((zip_header).formatted.filename_len); \
 	(zip_header).formatted.extra_len    = SWAP_LE16((zip_header).formatted.extra_len   ); \
-} while (0)
+}} while (0)
 
 #define CDF_HEADER_LEN 42
 
@@ -115,8 +113,8 @@ typedef union {
 		uint16_t version_needed;        /* 2-3 */
 		uint16_t cdf_flags;             /* 4-5 */
 		uint16_t method;                /* 6-7 */
-		uint16_t mtime;                 /* 8-9 */
-		uint16_t mdate;                 /* 10-11 */
+		uint16_t modtime;               /* 8-9 */
+		uint16_t moddate;               /* 10-11 */
 		uint32_t crc32;                 /* 12-15 */
 		uint32_t cmpsize;               /* 16-19 */
 		uint32_t ucmpsize;              /* 20-23 */
@@ -127,27 +125,27 @@ typedef union {
 		uint16_t internal_file_attributes; /* 32-33 */
 		uint32_t external_file_attributes PACKED; /* 34-37 */
 		uint32_t relative_offset_of_local_header PACKED; /* 38-41 */
+		/* filename follows (not NUL terminated) */
+		/* extra field follows */
+		/* comment follows */
 	} formatted PACKED;
 } cdf_header_t;
 
-struct BUG_cdf_header_must_be_42_bytes {
-	char BUG_cdf_header_must_be_42_bytes[
-		offsetof(cdf_header_t, formatted.relative_offset_of_local_header) + 4
-			== CDF_HEADER_LEN ? 1 : -1];
-};
-
-#define FIX_ENDIANNESS_CDF(cdf_header) do { \
+#define FIX_ENDIANNESS_CDF(cdf_header) \
+do { if (BB_BIG_ENDIAN) { \
+	(cdf_header).formatted.version_made_by = SWAP_LE16((cdf_header).formatted.version_made_by); \
+	(cdf_header).formatted.version_needed = SWAP_LE16((cdf_header).formatted.version_needed); \
+	(cdf_header).formatted.method       = SWAP_LE16((cdf_header).formatted.method      ); \
+	(cdf_header).formatted.modtime      = SWAP_LE16((cdf_header).formatted.modtime     ); \
+	(cdf_header).formatted.moddate      = SWAP_LE16((cdf_header).formatted.moddate     ); \
 	(cdf_header).formatted.crc32        = SWAP_LE32((cdf_header).formatted.crc32       ); \
 	(cdf_header).formatted.cmpsize      = SWAP_LE32((cdf_header).formatted.cmpsize     ); \
 	(cdf_header).formatted.ucmpsize     = SWAP_LE32((cdf_header).formatted.ucmpsize    ); \
 	(cdf_header).formatted.file_name_length = SWAP_LE16((cdf_header).formatted.file_name_length); \
 	(cdf_header).formatted.extra_field_length = SWAP_LE16((cdf_header).formatted.extra_field_length); \
 	(cdf_header).formatted.file_comment_length = SWAP_LE16((cdf_header).formatted.file_comment_length); \
-	IF_DESKTOP( \
-	(cdf_header).formatted.version_made_by = SWAP_LE16((cdf_header).formatted.version_made_by); \
 	(cdf_header).formatted.external_file_attributes = SWAP_LE32((cdf_header).formatted.external_file_attributes); \
-	) \
-} while (0)
+}} while (0)
 
 #define CDE_HEADER_LEN 16
 
@@ -166,20 +164,38 @@ typedef union {
 	} formatted PACKED;
 } cde_header_t;
 
-struct BUG_cde_header_must_be_16_bytes {
+#define FIX_ENDIANNESS_CDE(cde_header) \
+do { if (BB_BIG_ENDIAN) { \
+	(cde_header).formatted.cdf_offset = SWAP_LE32((cde_header).formatted.cdf_offset); \
+}} while (0)
+
+struct BUG {
+	/* Check the offset of the last element, not the length.  This leniency
+	 * allows for poor packing, whereby the overall struct may be too long,
+	 * even though the elements are all in the right place.
+	 */
+	char BUG_zip_header_must_be_26_bytes[
+		offsetof(zip_header_t, formatted.extra_len) + 2
+			== ZIP_HEADER_LEN ? 1 : -1];
+	char BUG_cdf_header_must_be_42_bytes[
+		offsetof(cdf_header_t, formatted.relative_offset_of_local_header) + 4
+			== CDF_HEADER_LEN ? 1 : -1];
 	char BUG_cde_header_must_be_16_bytes[
 		sizeof(cde_header_t) == CDE_HEADER_LEN ? 1 : -1];
 };
 
-#define FIX_ENDIANNESS_CDE(cde_header) do { \
-	(cde_header).formatted.cdf_offset = SWAP_LE32((cde_header).formatted.cdf_offset); \
-} while (0)
 
 enum { zip_fd = 3 };
 
 
-#if ENABLE_DESKTOP
+/* This value means that we failed to find CDF */
+#define BAD_CDF_OFFSET ((uint32_t)0xffffffff)
+
+#if !ENABLE_FEATURE_UNZIP_CDF
 
+# define find_cdf_offset() BAD_CDF_OFFSET
+
+#else
 /* Seen in the wild:
  * Self-extracting PRO2K3XP_32.exe contains 19078464 byte zip archive,
  * where CDE was nearly 48 kbytes before EOF.
@@ -188,25 +204,26 @@ enum { zip_fd = 3 };
  * To make extraction work, bumped PEEK_FROM_END from 16k to 64k.
  */
 #define PEEK_FROM_END (64*1024)
-
-/* This value means that we failed to find CDF */
-#define BAD_CDF_OFFSET ((uint32_t)0xffffffff)
-
 /* NB: does not preserve file position! */
 static uint32_t find_cdf_offset(void)
 {
 	cde_header_t cde_header;
+	unsigned char *buf;
 	unsigned char *p;
 	off_t end;
-	unsigned char *buf = xzalloc(PEEK_FROM_END);
 	uint32_t found;
 
-	end = xlseek(zip_fd, 0, SEEK_END);
+	end = lseek(zip_fd, 0, SEEK_END);
+	if (end == (off_t) -1)
+		return BAD_CDF_OFFSET;
+
 	end -= PEEK_FROM_END;
 	if (end < 0)
 		end = 0;
+
 	dbg("Looking for cdf_offset starting from 0x%"OFF_FMT"x", end);
  	xlseek(zip_fd, end, SEEK_SET);
+	buf = xzalloc(PEEK_FROM_END);
 	full_read(zip_fd, buf, PEEK_FROM_END);
 
 	found = BAD_CDF_OFFSET;
@@ -252,30 +269,36 @@ static uint32_t find_cdf_offset(void)
 static uint32_t read_next_cdf(uint32_t cdf_offset, cdf_header_t *cdf_ptr)
 {
 	off_t org;
+	uint32_t magic;
 
-	org = xlseek(zip_fd, 0, SEEK_CUR);
+	if (cdf_offset == BAD_CDF_OFFSET)
+		return cdf_offset;
 
-	if (!cdf_offset)
-		cdf_offset = find_cdf_offset();
-
-	if (cdf_offset != BAD_CDF_OFFSET) {
-		dbg("Reading CDF at 0x%x", (unsigned)cdf_offset);
-		xlseek(zip_fd, cdf_offset + 4, SEEK_SET);
-		xread(zip_fd, cdf_ptr->raw, CDF_HEADER_LEN);
-		FIX_ENDIANNESS_CDF(*cdf_ptr);
-		dbg("  file_name_length:%u extra_field_length:%u file_comment_length:%u",
-			(unsigned)cdf_ptr->formatted.file_name_length,
-			(unsigned)cdf_ptr->formatted.extra_field_length,
-			(unsigned)cdf_ptr->formatted.file_comment_length
-		);
-		cdf_offset += 4 + CDF_HEADER_LEN
-			+ cdf_ptr->formatted.file_name_length
-			+ cdf_ptr->formatted.extra_field_length
-			+ cdf_ptr->formatted.file_comment_length;
+	org = xlseek(zip_fd, 0, SEEK_CUR);
+	dbg("Reading CDF at 0x%x", (unsigned)cdf_offset);
+	xlseek(zip_fd, cdf_offset, SEEK_SET);
+	xread(zip_fd, &magic, 4);
+	/* Central Directory End? */
+	if (magic == ZIP_CDE_MAGIC) {
+		dbg("got ZIP_CDE_MAGIC");
+		return 0; /* EOF */
 	}
+	xread(zip_fd, cdf_ptr->raw, CDF_HEADER_LEN);
+	/* Caller doesn't need this: */
+	/* dbg("Returning file position to 0x%"OFF_FMT"x", org); */
+	/* xlseek(zip_fd, org, SEEK_SET); */
+
+	FIX_ENDIANNESS_CDF(*cdf_ptr);
+	dbg("  file_name_length:%u extra_field_length:%u file_comment_length:%u",
+		(unsigned)cdf_ptr->formatted.file_name_length,
+		(unsigned)cdf_ptr->formatted.extra_field_length,
+		(unsigned)cdf_ptr->formatted.file_comment_length
+	);
+	cdf_offset += 4 + CDF_HEADER_LEN
+		+ cdf_ptr->formatted.file_name_length
+		+ cdf_ptr->formatted.extra_field_length
+		+ cdf_ptr->formatted.file_comment_length;
 
-	dbg("Returning file position to 0x%"OFF_FMT"x", org);
-	xlseek(zip_fd, org, SEEK_SET);
 	return cdf_offset;
 };
 #endif
@@ -324,6 +347,7 @@ static void unzip_extract(zip_header_t *zip_header, int dst_fd)
 			bb_error_msg("bad length");
 		}
 	}
+	/* TODO? method 12: bzip2, method 14: LZMA */
 }
 
 static void my_fgets80(char *buf80)
@@ -339,15 +363,12 @@ int unzip_main(int argc, char **argv)
 {
 	enum { O_PROMPT, O_NEVER, O_ALWAYS };
 
-	zip_header_t zip_header;
 	smallint quiet = 0;
-	IF_NOT_DESKTOP(const) smallint verbose = 0;
+	IF_NOT_FEATURE_UNZIP_CDF(const) smallint verbose = 0;
 	smallint listing = 0;
 	smallint overwrite = O_PROMPT;
 	smallint x_opt_seen;
-#if ENABLE_DESKTOP
 	uint32_t cdf_offset;
-#endif
 	unsigned long total_usize;
 	unsigned long total_size;
 	unsigned total_entries;
@@ -430,7 +451,7 @@ int unzip_main(int argc, char **argv)
 			break;
 
 		case 'v': /* Verbose list */
-			IF_DESKTOP(verbose++;)
+			IF_FEATURE_UNZIP_CDF(verbose++;)
 			listing = 1;
 			break;
 
@@ -545,78 +566,102 @@ int unzip_main(int argc, char **argv)
 	total_usize = 0;
 	total_size = 0;
 	total_entries = 0;
-#if ENABLE_DESKTOP
-	cdf_offset = 0;
-#endif
+	cdf_offset = find_cdf_offset();	/* try to seek to the end, find CDE and CDF start */
 	while (1) {
-		uint32_t magic;
+		zip_header_t zip_header;
 		mode_t dir_mode = 0777;
-#if ENABLE_DESKTOP
+#if ENABLE_FEATURE_UNZIP_CDF
 		mode_t file_mode = 0666;
 #endif
 
-		/* Check magic number */
-		xread(zip_fd, &magic, 4);
-		/* Central directory? It's at the end, so exit */
-		if (magic == ZIP_CDF_MAGIC) {
-			dbg("got ZIP_CDF_MAGIC");
-			break;
-		}
-#if ENABLE_DESKTOP
-		/* Data descriptor? It was a streaming file, go on */
-		if (magic == ZIP_DD_MAGIC) {
-			dbg("got ZIP_DD_MAGIC");
-			/* skip over duplicate crc32, cmpsize and ucmpsize */
-			unzip_skip(3 * 4);
-			continue;
-		}
-#endif
-		if (magic != ZIP_FILEHEADER_MAGIC)
-			bb_error_msg_and_die("invalid zip magic %08X", (int)magic);
-		dbg("got ZIP_FILEHEADER_MAGIC");
-
-		/* Read the file header */
-		xread(zip_fd, zip_header.raw, ZIP_HEADER_LEN);
-		FIX_ENDIANNESS_ZIP(zip_header);
-		if ((zip_header.formatted.method != 0) && (zip_header.formatted.method != 8)) {
-			bb_error_msg_and_die("unsupported method %d", zip_header.formatted.method);
-		}
-#if !ENABLE_DESKTOP
-		if (zip_header.formatted.zip_flags & SWAP_LE16(0x0009)) {
-			bb_error_msg_and_die("zip flags 1 and 8 are not supported");
-		}
-#else
-		if (zip_header.formatted.zip_flags & SWAP_LE16(0x0001)) {
-			/* 0x0001 - encrypted */
-			bb_error_msg_and_die("zip flag 1 (encryption) is not supported");
-		}
+		if (!ENABLE_FEATURE_UNZIP_CDF || cdf_offset == BAD_CDF_OFFSET) {
+			/* Normally happens when input is unseekable.
+			 *
+			 * Valid ZIP file has Central Directory at the end
+			 * with central directory file headers (CDFs).
+			 * After it, there is a Central Directory End structure.
+			 * CDFs identify what files are in the ZIP and where
+			 * they are located. This allows ZIP readers to load
+			 * the list of files without reading the entire ZIP archive.
+			 * ZIP files may be appended to, only files specified in
+			 * the CD are valid. Scanning for local file headers is
+			 * not a correct algorithm.
+			 *
+			 * We try to do the above, and resort to "linear" reading
+			 * of ZIP file only if seek failed or CDE wasn't found.
+			 */
+			uint32_t magic;
 
-		if (cdf_offset != BAD_CDF_OFFSET) {
+			/* Check magic number */
+			xread(zip_fd, &magic, 4);
+			/* Central directory? It's at the end, so exit */
+			if (magic == ZIP_CDF_MAGIC) {
+				dbg("got ZIP_CDF_MAGIC");
+				break;
+			}
+			/* Data descriptor? It was a streaming file, go on */
+			if (magic == ZIP_DD_MAGIC) {
+				dbg("got ZIP_DD_MAGIC");
+				/* skip over duplicate crc32, cmpsize and ucmpsize */
+				unzip_skip(3 * 4);
+				continue;
+			}
+			if (magic != ZIP_FILEHEADER_MAGIC)
+				bb_error_msg_and_die("invalid zip magic %08X", (int)magic);
+			dbg("got ZIP_FILEHEADER_MAGIC");
+
+			xread(zip_fd, zip_header.raw, ZIP_HEADER_LEN);
+			FIX_ENDIANNESS_ZIP(zip_header);
+			if ((zip_header.formatted.method != 0)
+			 && (zip_header.formatted.method != 8)
+			) {
+				/* TODO? method 12: bzip2, method 14: LZMA */
+				bb_error_msg_and_die("unsupported method %d", zip_header.formatted.method);
+			}
+			if (zip_header.formatted.zip_flags & SWAP_LE16(0x0009)) {
+				bb_error_msg_and_die("zip flags 1 and 8 are not supported");
+			}
+		}
+#if ENABLE_FEATURE_UNZIP_CDF
+		else {
+			/* cdf_offset is valid (and we know the file is seekable) */
 			cdf_header_t cdf_header;
 			cdf_offset = read_next_cdf(cdf_offset, &cdf_header);
-			/*
-			 * Note: cdf_offset can become BAD_CDF_OFFSET after the above call.
-			 */
+			if (cdf_offset == 0) /* EOF? */
+				break;
+# if 0
+			xlseek(zip_fd,
+				SWAP_LE32(cdf_header.formatted.relative_offset_of_local_header) + 4,
+				SEEK_SET);
+			xread(zip_fd, zip_header.raw, ZIP_HEADER_LEN);
+			FIX_ENDIANNESS_ZIP(zip_header);
 			if (zip_header.formatted.zip_flags & SWAP_LE16(0x0008)) {
 				/* 0x0008 - streaming. [u]cmpsize can be reliably gotten
-				 * only from Central Directory. See unzip_doc.txt
+				 * only from Central Directory.
 				 */
 				zip_header.formatted.crc32    = cdf_header.formatted.crc32;
 				zip_header.formatted.cmpsize  = cdf_header.formatted.cmpsize;
 				zip_header.formatted.ucmpsize = cdf_header.formatted.ucmpsize;
 			}
+# else
+			/* CDF has the same data as local header, no need to read the latter */
+			memcpy(&zip_header.formatted.version,
+				&cdf_header.formatted.version_needed, ZIP_HEADER_LEN);
+			xlseek(zip_fd,
+				SWAP_LE32(cdf_header.formatted.relative_offset_of_local_header) + 4 + ZIP_HEADER_LEN,
+				SEEK_SET);
+# endif
 			if ((cdf_header.formatted.version_made_by >> 8) == 3) {
 				/* This archive is created on Unix */
 				dir_mode = file_mode = (cdf_header.formatted.external_file_attributes >> 16);
 			}
 		}
-		if (cdf_offset == BAD_CDF_OFFSET
-		 && (zip_header.formatted.zip_flags & SWAP_LE16(0x0008))
-		) {
-			/* If it's a streaming zip, we _require_ CDF */
-			bb_error_msg_and_die("can't find file table");
-		}
 #endif
+
+		if (zip_header.formatted.zip_flags & SWAP_LE16(0x0001)) {
+			/* 0x0001 - encrypted */
+			bb_error_msg_and_die("zip flag 1 (encryption) is not supported");
+		}
 		dbg("File cmpsize:0x%x extra_len:0x%x ucmpsize:0x%x",
 			(unsigned)zip_header.formatted.cmpsize,
 			(unsigned)zip_header.formatted.extra_len,
@@ -751,7 +796,7 @@ int unzip_main(int argc, char **argv)
 			overwrite = O_ALWAYS;
 		case 'y': /* Open file and fall into unzip */
 			unzip_create_leading_dirs(dst_fn);
-#if ENABLE_DESKTOP
+#if ENABLE_FEATURE_UNZIP_CDF
 			dst_fd = xopen3(dst_fn, O_WRONLY | O_CREAT | O_TRUNC, file_mode);
 #else
 			dst_fd = xopen(dst_fn, O_WRONLY | O_CREAT | O_TRUNC);
diff --git a/testsuite/unzip.tests b/testsuite/unzip.tests
index d8738a3..d9c4524 100755
--- a/testsuite/unzip.tests
+++ b/testsuite/unzip.tests
@@ -31,11 +31,10 @@ rmdir foo
 rm foo.zip
 
 # File containing some damaged encrypted stream
+optional FEATURE_UNZIP_CDF
 testing "unzip (bad archive)" "uudecode; unzip bad.zip 2>&1; echo \$?" \
 "Archive:  bad.zip
-  inflating: ]3j½r«IK-%Ix
-unzip: corrupted data
-unzip: inflate error
+unzip: short read
 1
 " \
 "" "\
@@ -49,6 +48,7 @@ BDYAAAAMAAEADQAAADIADQAAAEEAAAASw73Ct1DKokohPXQiNzA+FAI1HCcW
 NzITNFBLBQUKAC4JAA04Cw0EOhZQSwUGAQAABAIAAgCZAAAAeQAAAAIALhM=
 ====
 "
+SKIP=
 
 rm *
 


More information about the busybox-cvs mailing list