Patches to make GNU gzip and BusyBox gzip produce identical compression results

Denys Vlasenko vda.linux at googlemail.com
Thu Sep 5 11:41:04 UTC 2019


Applied, thanks

On Mon, Sep 2, 2019 at 11:58 PM Daniel Edgecumbe
<email at esotericnonsense.com> wrote:
>
> Let's try once more, patches inline (new email client, gah!)
>
> 0001-gzip-default-level-with-ENABLE_FEATURE_GZIP_LEVELS-s.patch
>
> From 9d06f01e2805a5d6f1d775ceb651ae18ae2e1808 Mon Sep 17 00:00:00 2001
> From: Daniel Edgecumbe <git at esotericnonsense.com>
> Date: Mon, 2 Sep 2019 22:03:14 +0100
> Subject: [PATCH 1/3] gzip: default level with ENABLE_FEATURE_GZIP_LEVELS
>  should be 6
>
> Fixes an off-by-one that actually resulted in level 7 being used
> ---
>  archival/gzip.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/archival/gzip.c b/archival/gzip.c
> index 17341de45..37db347b8 100644
> --- a/archival/gzip.c
> +++ b/archival/gzip.c
> @@ -2222,7 +2222,7 @@ int gzip_main(int argc UNUSED_PARAM, char **argv)
>  #if ENABLE_FEATURE_GZIP_LEVELS
>         opt >>= (BBUNPK_OPTSTRLEN IF_FEATURE_GZIP_DECOMPRESS(+ 2) + 1); /* drop cfkvq[dt]n bits */
>         if (opt == 0)
> -               opt = 1 << 6; /* default: 6 */
> +               opt = 1 << 5; /* default: 6 */
>         opt = ffs(opt >> 4); /* Maps -1..-4 to [0], -5 to [1] ... -9 to [5] */
>         max_chain_length = 1 << gzip_level_config[opt].chain_shift;
>         good_match       = gzip_level_config[opt].good;
> --
> 2.23.0
>
> 0002-gzip-set-compression-flags-correctly-as-per-standard.patch
>
> From 4280c9633b359dcbf2ddadcf33790b8690f81c82 Mon Sep 17 00:00:00 2001
> From: Daniel Edgecumbe <git at esotericnonsense.com>
> Date: Mon, 2 Sep 2019 22:05:26 +0100
> Subject: [PATCH 2/3] gzip: set compression flags correctly as per standard
>
> With this change and CONFIG_GZIP_FAST=2, CONFIG_FEATURE_GZIP_LEVELS=y,
> GNU gzip and BusyBox gzip now produce identical output at each compression
> level (excluding 1..3, as BusyBox does not implement these levels).
> ---
>  archival/gzip.c | 22 ++++++++++++++++------
>  1 file changed, 16 insertions(+), 6 deletions(-)
>
> diff --git a/archival/gzip.c b/archival/gzip.c
> index 37db347b8..f13748aa1 100644
> --- a/archival/gzip.c
> +++ b/archival/gzip.c
> @@ -259,6 +259,7 @@ enum {
>
>  #if !ENABLE_FEATURE_GZIP_LEVELS
>
> +       comp_level = 9,
>         max_chain_length = 4096,
>  /* To speed up deflation, hash chains are never searched beyond this length.
>   * A higher limit improves compression ratio but degrades the speed.
> @@ -334,10 +335,12 @@ struct globals {
>  #define head (G1.prev + WSIZE) /* hash head (see deflate.c) */
>
>  #if ENABLE_FEATURE_GZIP_LEVELS
> +       unsigned comp_level;
>         unsigned max_chain_length;
>         unsigned max_lazy_match;
>         unsigned good_match;
>         unsigned nice_match;
> +#define comp_level (G1.comp_level)
>  #define max_chain_length (G1.max_chain_length)
>  #define max_lazy_match   (G1.max_lazy_match)
>  #define good_match      (G1.good_match)
> @@ -1919,7 +1922,7 @@ static void bi_init(void)
>  /* ===========================================================================
>   * Initialize the "longest match" routines for a new file
>   */
> -static void lm_init(unsigned *flags16p)
> +static void lm_init(void)
>  {
>         unsigned j;
>
> @@ -1927,8 +1930,6 @@ static void lm_init(unsigned *flags16p)
>         memset(head, 0, HASH_SIZE * sizeof(*head));
>         /* prev will be initialized on the fly */
>
> -       /* speed options for the general purpose bit flag */
> -       *flags16p |= 2; /* FAST 4, SLOW 2 */
>         /* ??? reduce max_chain_length for binary files */
>
>         //G1.strstart = 0; // globals are zeroed in pack_gzip()
> @@ -2076,10 +2077,16 @@ static void zip(void)
>
>         bi_init();
>         ct_init();
> -       deflate_flags = 0;  /* pkzip -es, -en or -ex equivalent */
> -       lm_init(&deflate_flags);
> +       lm_init();
>
> -       put_16bit(deflate_flags | 0x300); /* extra flags. OS id = 3 (Unix) */
> +       deflate_flags = 0x300; /* extra flags. OS id = 3 (Unix) */
> +#if ENABLE_FEATURE_GZIP_LEVELS
> +       /* Note that comp_levels < 4 do not exist in this version of gzip */
> +       if (comp_level == 9) {
> +               deflate_flags |= 0x02; /* SLOW flag */
> +       }
> +#endif
> +       put_16bit(deflate_flags);
>
>         /* The above 32-bit misaligns outbuf (10 bytes are stored), flush it */
>         flush_outbuf_if_32bit_optimized();
> @@ -2224,6 +2231,9 @@ int gzip_main(int argc UNUSED_PARAM, char **argv)
>         if (opt == 0)
>                 opt = 1 << 5; /* default: 6 */
>         opt = ffs(opt >> 4); /* Maps -1..-4 to [0], -5 to [1] ... -9 to [5] */
> +
> +       comp_level = opt + 4;
> +
>         max_chain_length = 1 << gzip_level_config[opt].chain_shift;
>         good_match       = gzip_level_config[opt].good;
>         max_lazy_match   = gzip_level_config[opt].lazy2 * 2;
> --
> 2.23.0
>
> 0003-gzip-set-default-compression-level-to-6-when-CONFIG_.patch
>
> From 12d30559486502feec4e2821b3ab45ae6139e7aa Mon Sep 17 00:00:00 2001
> From: Daniel Edgecumbe <git at esotericnonsense.com>
> Date: Mon, 2 Sep 2019 22:09:15 +0100
> Subject: [PATCH 3/3] gzip: set default compression level to 6 when
>  CONFIG_FEATURE_GZIP_LEVELS=n
>
> With this change, GNU gzip -n and BusyBox gzip now produce identical output
> assuming that CONFIG_GZIP_FAST=2.
> ---
>  archival/gzip.c | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/archival/gzip.c b/archival/gzip.c
> index f13748aa1..779df5c19 100644
> --- a/archival/gzip.c
> +++ b/archival/gzip.c
> @@ -52,7 +52,7 @@ aa:      85.1% -- replaced with aa.gz
>  //config:      help
>  //config:      Enable support for compression levels 4-9. The default level
>  //config:      is 6. If levels 1-3 are specified, 4 is used.
> -//config:      If this option is not selected, -N options are ignored and -9
> +//config:      If this option is not selected, -N options are ignored and -6
>  //config:      is used.
>  //config:
>  //config:config FEATURE_GZIP_DECOMPRESS
> @@ -259,13 +259,13 @@ enum {
>
>  #if !ENABLE_FEATURE_GZIP_LEVELS
>
> -       comp_level = 9,
> -       max_chain_length = 4096,
> +       comp_level = 6,
> +       max_chain_length = 128,
>  /* To speed up deflation, hash chains are never searched beyond this length.
>   * A higher limit improves compression ratio but degrades the speed.
>   */
>
> -       max_lazy_match = 258,
> +       max_lazy_match = 16,
>  /* Attempt to find a better match only when the current match is strictly
>   * smaller than this value. This mechanism is used only for compression
>   * levels >= 4.
> @@ -277,7 +277,7 @@ enum {
>   * max_insert_length is used only for compression levels <= 3.
>   */
>
> -       good_match = 32,
> +       good_match = 8,
>  /* Use a faster search when the previous match is longer than this */
>
>  /* Values for max_lazy_match, good_match and max_chain_length, depending on
> @@ -286,7 +286,7 @@ enum {
>   * found for specific files.
>   */
>
> -       nice_match = 258,       /* Stop searching when current match exceeds this */
> +       nice_match = 128,       /* Stop searching when current match exceeds this */
>  /* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4
>   * For deflate_fast() (levels <= 3) good is ignored and lazy has a different
>   * meaning.
> --
> 2.23.0
>
> On 02/09/2019 23.43, Daniel Edgecumbe wrote:
> > A discussion with eschwartz on the Arch Linux freenode IRC channel led to the discovery of some minor implementation details lacking in the BusyBox gzip applet, which can cause output to differ both between GNU gzip and BusyBox and across different versions of BusyBox.
> >
> > Please find attached three separate patches for the solution of these issues.
> >
> > I've also pushed the branch at
> > https://git.esotericnonsense.com/busybox.git/
> >
> > This is a re-submission as my original e-mail was bounced due to not being an ML member.
> >
> >
> > _______________________________________________
> > busybox mailing list
> > busybox at busybox.net
> > http://lists.busybox.net/mailman/listinfo/busybox
> >
>
> --
> Daniel Edgecumbe | esotericnonsense
> Kalix NO, Sverige | +358 46 584 2810
> email at esotericnonsense.com | https://esotericnonsense.com
> _______________________________________________
> busybox mailing list
> busybox at busybox.net
> http://lists.busybox.net/mailman/listinfo/busybox


More information about the busybox mailing list