/* * deflate_compress.c - a compressor for DEFLATE * * Copyright 2016 Eric Biggers * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include "deflate_compress.h" #include "deflate_constants.h" /******************************************************************************/ /* * The following parameters can be changed at build time to customize the * compression algorithms slightly: * * (Note, not all customizable parameters are here. Some others can be found in * libdeflate_alloc_compressor() and in *_matchfinder.h.) */ /* * If this parameter is defined to 1, then the near-optimal parsing algorithm * will be included, and compression levels 10-12 will use it. This algorithm * usually produces a compression ratio significantly better than the other * algorithms. However, it is slow. If this parameter is defined to 0, then * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm. */ #define SUPPORT_NEAR_OPTIMAL_PARSING 1 /* * This is the minimum block length that the compressor will use, in * uncompressed bytes. This should be a value below which using shorter blocks * is unlikely to be worthwhile, due to the per-block overhead. This value does * not apply to the final block, which may be shorter than this (if the input is * shorter, it will have to be), or to the final uncompressed block in a series * of uncompressed blocks that cover more than UINT16_MAX bytes. * * This value is also approximately the amount by which what would otherwise be * the second-to-last block is allowed to grow past the soft maximum length in * order to avoid having to use a very short final block. * * Defining a fixed minimum block length is needed in order to guarantee a * reasonable upper bound on the compressed size. It's also needed because our * block splitting algorithm doesn't work well on very short blocks. */ #define MIN_BLOCK_LENGTH 5000 /* * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft * maximum block length, in uncompressed bytes. The compressor will try to end * blocks at this length, but it may go slightly past it if there is a match * that straddles this limit or if the input data ends soon after this limit. * This parameter doesn't apply to uncompressed blocks, which the DEFLATE format * limits to 65535 bytes. * * This should be a value above which it is very likely that splitting the block * would produce a better compression ratio. 
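 * For example (an illustrative scenario, not a measurement): if the first
 * half of a buffer is ASCII text and the second half is binary, ending a
 * block at the boundary lets each half use Huffman codes matched to its own
 * symbol statistics, which normally more than repays the cost of one extra
 * block header.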
For the near-optimal compressor, * increasing/decreasing this parameter will increase/decrease per-compressor * memory usage linearly. */ #define SOFT_MAX_BLOCK_LENGTH 300000 /* * For the greedy, lazy, and lazy2 compressors: this is the length of the * sequence store, which is an array where the compressor temporarily stores * matches that it's going to use in the current block. This value is the * maximum number of matches that can be used in a block. If the sequence store * fills up, then the compressor will be forced to end the block early. This * value should be large enough so that this rarely happens, due to the block * being ended normally before then. Increasing/decreasing this value will * increase/decrease per-compressor memory usage linearly. */ #define SEQ_STORE_LENGTH 50000 /* * For deflate_compress_fastest(): This is the soft maximum block length. * deflate_compress_fastest() doesn't use the regular block splitting algorithm; * it only ends blocks when they reach FAST_SOFT_MAX_BLOCK_LENGTH bytes or * FAST_SEQ_STORE_LENGTH matches. Therefore, this value should be lower than * the regular SOFT_MAX_BLOCK_LENGTH. */ #define FAST_SOFT_MAX_BLOCK_LENGTH 65535 /* * For deflate_compress_fastest(): this is the length of the sequence store. * This is like SEQ_STORE_LENGTH, but this should be a lower value. */ #define FAST_SEQ_STORE_LENGTH 8192 /* * These are the maximum codeword lengths, in bits, the compressor will use for * each Huffman code. The DEFLATE format defines limits for these. However, * further limiting litlen codewords to 14 bits is beneficial, since it has * negligible effect on compression ratio but allows some optimizations when * outputting bits. (It allows 4 literals to be written at once rather than 3.) */ #define MAX_LITLEN_CODEWORD_LEN 14 #define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN #define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN #if SUPPORT_NEAR_OPTIMAL_PARSING /* Parameters specific to the near-optimal parsing algorithm */ /* * BIT_COST is a scaling factor that allows the near-optimal compressor to * consider fractional bit costs when deciding which literal/match sequence to * use. This is useful when the true symbol costs are unknown. For example, if * the compressor thinks that a symbol has 6.5 bits of entropy, it can set its * cost to 6.5 bits rather than have to use 6 or 7 bits. Although in the end * each symbol will use a whole number of bits due to the Huffman coding, * considering fractional bits can be helpful due to the limited information. * * BIT_COST should be a power of 2. A value of 8 or 16 works well. A higher * value isn't very useful since the calculations are approximate anyway. * * BIT_COST doesn't apply to deflate_flush_block() and * deflate_compute_true_cost(), which consider whole bits. */ #define BIT_COST 16 /* * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to * be needed to output a symbol that was unused in the previous optimization * pass. Assigning a default cost allows the symbol to be used in the next * optimization pass. However, the cost should be relatively high because the * symbol probably won't be used very many times (if at all). */ #define LITERAL_NOSTAT_BITS 13 #define LENGTH_NOSTAT_BITS 13 #define OFFSET_NOSTAT_BITS 10 /* * This is (slightly less than) the maximum number of matches that the * near-optimal compressor will cache per block. This behaves similarly to * SEQ_STORE_LENGTH for the other compressors. 
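 * With the value defined below (SOFT_MAX_BLOCK_LENGTH * 5), the cache
 * budgets an average of five 'struct lz_match' slots per position in the
 * block; each position consumes one slot for its match-count entry plus one
 * slot per cached match, so positions with four or fewer matches stay
 * within the average.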
*/ #define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ /******************************************************************************/ /* Include the needed matchfinders. */ #define MATCHFINDER_WINDOW_ORDER DEFLATE_WINDOW_ORDER #include "hc_matchfinder.h" #include "ht_matchfinder.h" #if SUPPORT_NEAR_OPTIMAL_PARSING # include "bt_matchfinder.h" /* * This is the maximum number of matches the binary trees matchfinder can find * at a single position. Since the matchfinder never finds more than one match * for the same length, presuming one of each possible length is sufficient for * an upper bound. (This says nothing about whether it is worthwhile to * consider so many matches; this is just defining the worst case.) */ #define MAX_MATCHES_PER_POS \ (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) #endif /* * The largest block length we will ever use is when the final block is of * length SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when any block is of * length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN. The latter case * occurs when the lazy2 compressor chooses two literals and a maximum-length * match, starting at SOFT_MAX_BLOCK_LENGTH - 1. */ #define MAX_BLOCK_LENGTH \ MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \ SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN) static forceinline void check_buildtime_parameters(void) { /* * Verify that MIN_BLOCK_LENGTH is being honored, as * libdeflate_deflate_compress_bound() depends on it. */ STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >= MIN_BLOCK_LENGTH); STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >= MIN_BLOCK_LENGTH); #if SUPPORT_NEAR_OPTIMAL_PARSING STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <= MATCH_CACHE_LENGTH); #endif /* The definition of MAX_BLOCK_LENGTH assumes this. */ STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH); /* Verify that the sequence stores aren't uselessly large. */ STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <= SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <= FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); /* Verify that the maximum codeword lengths are valid. 
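 * A length-limited code whose maximum codeword length is L can distinguish
 * at most 1 << L symbols (the case where every codeword has length exactly
 * L), so each alphabet must satisfy num_syms <= 1 << max_len; e.g. the 288
 * litlen symbols easily fit under 1 << 14.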
*/ STATIC_ASSERT( MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); STATIC_ASSERT( MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); STATIC_ASSERT( MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); STATIC_ASSERT( (1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS); STATIC_ASSERT( (1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS); STATIC_ASSERT( (1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS); } /******************************************************************************/ /* Table: length slot => length slot base value */ static const unsigned deflate_length_slot_base[] = { 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, }; /* Table: length slot => number of extra length bits */ static const u8 deflate_extra_length_bits[] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, }; /* Table: offset slot => offset slot base value */ static const unsigned deflate_offset_slot_base[] = { 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, }; /* Table: offset slot => number of extra offset bits */ static const u8 deflate_extra_offset_bits[] = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, }; /* Table: length => length slot */ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, }; /* * Table: 'offset - 1 => offset_slot' for offset <= 256. * This was generated by scripts/gen_offset_slot_map.py. 
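 * For example, offset 100 falls in slot 13, whose base is 97 and which has
 * 5 extra offset bits: deflate_offset_slot[100 - 1] == 13, and the extra
 * bits value sent is 100 - 97 == 3.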
*/ static const u8 deflate_offset_slot[256] = { 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, }; /* The order in which precode codeword lengths are stored */ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 }; /* Table: precode symbol => number of extra bits */ static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7 }; /* Codewords for the DEFLATE Huffman codes */ struct deflate_codewords { u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; u32 offset[DEFLATE_NUM_OFFSET_SYMS]; }; /* * Codeword lengths (in bits) for the DEFLATE Huffman codes. * A zero length means the corresponding symbol had zero frequency. */ struct deflate_lens { u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; u8 offset[DEFLATE_NUM_OFFSET_SYMS]; }; /* Codewords and lengths for the DEFLATE Huffman codes */ struct deflate_codes { struct deflate_codewords codewords; struct deflate_lens lens; }; /* Symbol frequency counters for the DEFLATE Huffman codes */ struct deflate_freqs { u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; u32 offset[DEFLATE_NUM_OFFSET_SYMS]; }; /* * Represents a run of literals followed by a match or end-of-block. This * struct is needed to temporarily store items chosen by the parser, since items * cannot be written until all items for the block have been chosen and the * block's Huffman codes have been computed. */ struct deflate_sequence { /* * Bits 0..22: the number of literals in this run. This may be 0 and * can be at most MAX_BLOCK_LENGTH. The literals are not stored * explicitly in this structure; instead, they are read directly from * the uncompressed data. * * Bits 23..31: the length of the match which follows the literals, or 0 * if this literal run was the last in the block, so there is no match * which follows it. */ #define SEQ_LENGTH_SHIFT 23 #define SEQ_LITRUNLEN_MASK (((u32)1 << SEQ_LENGTH_SHIFT) - 1) u32 litrunlen_and_length; /* * If 'length' doesn't indicate end-of-block, then this is the offset of * the match which follows the literals. */ u16 offset; /* * If 'length' doesn't indicate end-of-block, then this is the offset * slot of the match which follows the literals. 
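 * As an illustrative example of the packing: a run of 10 literals followed
 * by a match of length 50 would be stored with litrunlen_and_length ==
 * ((u32)50 << SEQ_LENGTH_SHIFT) | 10, along with the match's offset and
 * offset slot in the fields above and below.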
*/ u16 offset_slot; }; #if SUPPORT_NEAR_OPTIMAL_PARSING /* Costs for the near-optimal parsing algorithm */ struct deflate_costs { /* The cost to output each possible literal */ u32 literal[DEFLATE_NUM_LITERALS]; /* The cost to output each possible match length */ u32 length[DEFLATE_MAX_MATCH_LEN + 1]; /* The cost to output a match offset of each possible offset slot */ u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; }; /* * This structure represents a byte position in the input data and a node in the * graph of possible match/literal choices for the current block. * * Logically, each incoming edge to this node is labeled with a literal or a * match that can be taken to reach this position from an earlier position; and * each outgoing edge from this node is labeled with a literal or a match that * can be taken to advance from this position to a later position. * * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we * associate with each node just two pieces of information: * * 'cost_to_end' is the minimum cost to reach the end of the block from * this position. * * 'item' represents the literal or match that must be chosen from here to * reach the end of the block with the minimum cost. Equivalently, this * can be interpreted as the label of the outgoing edge on the minimum-cost * path to the "end of block" node from this node. */ struct deflate_optimum_node { u32 cost_to_end; /* * Notes on the match/literal representation used here: * * The low bits of 'item' are the length: 1 if this is a literal, * or the match length if this is a match. * * The high bits of 'item' are the actual literal byte if this is a * literal, or the match offset if this is a match. */ #define OPTIMUM_OFFSET_SHIFT 9 #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) u32 item; }; #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ /* Block split statistics. See "Block splitting algorithm" below. */ #define NUM_LITERAL_OBSERVATION_TYPES 8 #define NUM_MATCH_OBSERVATION_TYPES 2 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \ NUM_MATCH_OBSERVATION_TYPES) #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 struct block_split_stats { u32 new_observations[NUM_OBSERVATION_TYPES]; u32 observations[NUM_OBSERVATION_TYPES]; u32 num_new_observations; u32 num_observations; }; struct deflate_output_bitstream; /* The main DEFLATE compressor structure */ struct libdeflate_compressor { /* Pointer to the compress() implementation chosen at allocation time */ void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, size_t in_nbytes, struct deflate_output_bitstream *os); /* The free() function for this struct, chosen at allocation time */ free_func_t free_func; /* The compression level with which this compressor was created */ unsigned compression_level; /* Anything of this size or less we won't bother trying to compress. 
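 * (Very small inputs rarely benefit anyway: even a stored block costs about
 * 5 bytes of header, and a dynamic Huffman header costs much more.)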
*/ size_t max_passthrough_size; /* * The maximum search depth: consider at most this many potential * matches at each position */ unsigned max_search_depth; /* * The "nice" match length: if a match of this length is found, choose * it immediately without further consideration */ unsigned nice_match_length; /* Frequency counters for the current block */ struct deflate_freqs freqs; /* Block split statistics for the current block */ struct block_split_stats split_stats; /* Dynamic Huffman codes for the current block */ struct deflate_codes codes; /* The static Huffman codes defined by the DEFLATE format */ struct deflate_codes static_codes; /* Temporary space for block flushing */ union { /* Information about the precode */ struct { u32 freqs[DEFLATE_NUM_PRECODE_SYMS]; u32 codewords[DEFLATE_NUM_PRECODE_SYMS]; u8 lens[DEFLATE_NUM_PRECODE_SYMS]; unsigned items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS]; unsigned num_litlen_syms; unsigned num_offset_syms; unsigned num_explicit_lens; unsigned num_items; } precode; /* * The "full" length codewords. Used only after the information * in 'precode' is no longer needed. */ struct { u32 codewords[DEFLATE_MAX_MATCH_LEN + 1]; u8 lens[DEFLATE_MAX_MATCH_LEN + 1]; } length; } o; union { /* Data for greedy or lazy parsing */ struct { /* Hash chains matchfinder */ struct hc_matchfinder hc_mf; /* Matches and literals chosen for the current block */ struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1]; } g; /* (g)reedy */ /* Data for fastest parsing */ struct { /* Hash table matchfinder */ struct ht_matchfinder ht_mf; /* Matches and literals chosen for the current block */ struct deflate_sequence sequences[ FAST_SEQ_STORE_LENGTH + 1]; } f; /* (f)astest */ #if SUPPORT_NEAR_OPTIMAL_PARSING /* Data for near-optimal parsing */ struct { /* Binary tree matchfinder */ struct bt_matchfinder bt_mf; /* * Cached matches for the current block. This array * contains the matches that were found at each position * in the block. Specifically, for each position, there * is a list of matches found at that position, if any, * sorted by strictly increasing length. In addition, * following the matches for each position, there is a * special 'struct lz_match' whose 'length' member * contains the number of matches found at that * position, and whose 'offset' member contains the * literal at that position. * * Note: in rare cases, there will be a very high number * of matches in the block and this array will overflow. * If this happens, we force the end of the current * block. MATCH_CACHE_LENGTH is the length at which we * actually check for overflow. The extra slots beyond * this are enough to absorb the worst case overflow, * which occurs if starting at * &match_cache[MATCH_CACHE_LENGTH - 1], we write * MAX_MATCHES_PER_POS matches and a match count header, * then skip searching for matches at * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the * match count header for each. */ struct lz_match match_cache[MATCH_CACHE_LENGTH + MAX_MATCHES_PER_POS + DEFLATE_MAX_MATCH_LEN - 1]; /* * Array of nodes, one per position, for running the * minimum-cost path algorithm. * * This array must be large enough to accommodate the * worst-case number of nodes, which is MAX_BLOCK_LENGTH * plus 1 for the end-of-block node. */ struct deflate_optimum_node optimum_nodes[ MAX_BLOCK_LENGTH + 1]; /* The current cost model being used */ struct deflate_costs costs; /* Saved cost model */ struct deflate_costs costs_saved; /* * A table that maps match offset to offset slot. 
This * differs from deflate_offset_slot[] in that this is a * full map, not a condensed one. The full map is more * appropriate for the near-optimal parser, since the * near-optimal parser does more offset => offset_slot * translations, it doesn't intersperse them with * matchfinding (so cache evictions are less of a * concern), and it uses more memory anyway. */ u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1]; /* Literal/match statistics saved from previous block */ u32 prev_observations[NUM_OBSERVATION_TYPES]; u32 prev_num_observations; /* * Approximate match length frequencies based on a * greedy parse, gathered during matchfinding. This is * used for setting the initial symbol costs. */ u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; /* * The maximum number of optimization passes * (min-cost path searches) per block. * Larger values = more compression. */ unsigned max_optim_passes; /* * If an optimization pass improves the cost by fewer * than this number of bits, then optimization will stop * early, before max_optim_passes has been reached. * Smaller values = more compression. */ unsigned min_improvement_to_continue; /* * The minimum number of bits that would need to be * saved for it to be considered worth the time to * regenerate and use the min-cost path from a previous * optimization pass, in the case where the final * optimization pass actually increased the cost. * Smaller values = more compression. */ unsigned min_bits_to_use_nonfinal_path; /* * The maximum block length, in uncompressed bytes, at * which to find and consider the optimal match/literal * list for the static Huffman codes. This strategy * improves the compression ratio produced by static * Huffman blocks and can discover more cases in which * static blocks are worthwhile. This helps mostly with * small blocks, hence why this parameter is a max_len. * * Above this block length, static Huffman blocks are * only used opportunistically. I.e. a static Huffman * block is only used if a static block using the same * match/literal list as the optimized dynamic block * happens to be cheaper than the dynamic block itself. */ unsigned max_len_to_optimize_static_block; } n; /* (n)ear-optimal */ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ } p; /* (p)arser */ }; /* * The type for the bitbuffer variable, which temporarily holds bits that are * being packed into bytes and written to the output buffer. For best * performance, this should have size equal to a machine word. */ typedef machine_word_t bitbuf_t; /* * The capacity of the bitbuffer, in bits. This is 1 less than the real size, * in order to avoid undefined behavior when doing bitbuf >>= bitcount & ~7. */ #define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) /* * Can the specified number of bits always be added to 'bitbuf' after any * pending bytes have been flushed? There can be up to 7 bits remaining after a * flush, so the count must not exceed BITBUF_NBITS after adding 'n' more bits. */ #define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS) /* * Structure to keep track of the current state of sending bits to the * compressed output buffer */ struct deflate_output_bitstream { /* Bits that haven't yet been written to the output buffer */ bitbuf_t bitbuf; /* * Number of bits currently held in @bitbuf. This can be between 0 and * BITBUF_NBITS in general, or between 0 and 7 after a flush. 
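 * On a typical 64-bit build, for example, bitbuf_t is 64 bits, so
 * BITBUF_NBITS is 63 and CAN_BUFFER(n) holds for n <= 56; thus up to 56
 * bits may be added between flushes.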
*/ unsigned bitcount; /* * Pointer to the position in the output buffer at which the next byte * should be written */ u8 *next; /* Pointer to the end of the output buffer */ u8 *end; /* true if the output buffer ran out of space */ bool overflow; }; /* * Add some bits to the bitbuffer variable of the output bitstream. The caller * must ensure that 'bitcount + n <= BITBUF_NBITS', by calling FLUSH_BITS() * frequently enough. */ #define ADD_BITS(bits, n) \ do { \ bitbuf |= (bitbuf_t)(bits) << bitcount; \ bitcount += (n); \ ASSERT(bitcount <= BITBUF_NBITS); \ } while (0) /* * Flush bits from the bitbuffer variable to the output buffer. After this, the * bitbuffer will contain at most 7 bits (a partial byte). * * Since deflate_flush_block() verified ahead of time that there is enough space * remaining before actually writing the block, it's guaranteed that out_next * won't exceed os->end. However, there might not be enough space remaining to * flush a whole word, even though that's fastest. Therefore, flush a whole * word if there is space for it, otherwise flush a byte at a time. */ #define FLUSH_BITS() \ do { \ if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \ /* Flush a whole word (branchlessly). */ \ put_unaligned_leword(bitbuf, out_next); \ bitbuf >>= bitcount & ~7; \ out_next += bitcount >> 3; \ bitcount &= 7; \ } else { \ /* Flush a byte at a time. */ \ while (bitcount >= 8) { \ ASSERT(out_next < os->end); \ *out_next++ = bitbuf; \ bitcount -= 8; \ bitbuf >>= 8; \ } \ } \ } while (0) /* * Given the binary tree node A[subtree_idx] whose children already satisfy the * maxheap property, swap the node with its greater child until it is greater * than or equal to both of its children, so that the maxheap property is * satisfied in the subtree rooted at A[subtree_idx]. 'A' uses 1-based indices. */ static void heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) { unsigned parent_idx; unsigned child_idx; u32 v; v = A[subtree_idx]; parent_idx = subtree_idx; while ((child_idx = parent_idx * 2) <= length) { if (child_idx < length && A[child_idx + 1] > A[child_idx]) child_idx++; if (v >= A[child_idx]) break; A[parent_idx] = A[child_idx]; parent_idx = child_idx; } A[parent_idx] = v; } /* * Rearrange the array 'A' so that it satisfies the maxheap property. * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. */ static void heapify_array(u32 A[], unsigned length) { unsigned subtree_idx; for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) heapify_subtree(A, length, subtree_idx); } /* * Sort the array 'A', which contains 'length' unsigned 32-bit integers. * * Note: name this function heap_sort() instead of heapsort() to avoid colliding * with heapsort() from stdlib.h on BSD-derived systems. */ static void heap_sort(u32 A[], unsigned length) { A--; /* Use 1-based indices */ heapify_array(A, length); while (length >= 2) { u32 tmp = A[length]; A[length] = A[1]; A[1] = tmp; length--; heapify_subtree(A, length, 1); } } #define NUM_SYMBOL_BITS 10 #define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) #define FREQ_MASK (~SYMBOL_MASK) #define GET_NUM_COUNTERS(num_syms) (num_syms) /* * Sort the symbols primarily by frequency and secondarily by symbol value. * Discard symbols with zero frequency and fill in an array with the remaining * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each * array entry will contain the symbol value, and the remaining bits will * contain the frequency. 
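 * For example, with NUM_SYMBOL_BITS == 10, symbol 5 occurring 100 times
 * would be stored as the entry (100 << 10) | 5.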
* * @num_syms * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. * * @freqs[num_syms] * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. * * @lens[num_syms] * An array that eventually will hold the length of each codeword. This * function only fills in the codeword lengths for symbols that have zero * frequency, which are not well defined per se but will be set to 0. * * @symout[num_syms] * The output array, described above. * * Returns the number of entries in 'symout' that were filled. This is the * number of symbols that have nonzero frequency. */ static unsigned sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[]) { unsigned sym; unsigned i; unsigned num_used_syms; unsigned num_counters; unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; /* * We use heapsort, but with an added optimization. Since often most * symbol frequencies are low, we first do a count sort using a limited * number of counters. High frequencies are counted in the last * counter, and only they will be sorted with heapsort. * * Note: with more symbols, it is generally beneficial to have more * counters. About 1 counter per symbol seems fastest. */ num_counters = GET_NUM_COUNTERS(num_syms); memset(counters, 0, num_counters * sizeof(counters[0])); /* Count the frequencies. */ for (sym = 0; sym < num_syms; sym++) counters[MIN(freqs[sym], num_counters - 1)]++; /* * Make the counters cumulative, ignoring the zero-th, which counted * symbols with zero frequency. As a side effect, this calculates the * number of symbols with nonzero frequency. */ num_used_syms = 0; for (i = 1; i < num_counters; i++) { unsigned count = counters[i]; counters[i] = num_used_syms; num_used_syms += count; } /* * Sort nonzero-frequency symbols using the counters. At the same time, * set the codeword lengths of zero-frequency symbols to 0. */ for (sym = 0; sym < num_syms; sym++) { u32 freq = freqs[sym]; if (freq != 0) { symout[counters[MIN(freq, num_counters - 1)]++] = sym | (freq << NUM_SYMBOL_BITS); } else { lens[sym] = 0; } } /* Sort the symbols counted in the last counter. */ heap_sort(symout + counters[num_counters - 2], counters[num_counters - 1] - counters[num_counters - 2]); return num_used_syms; } /* * Build a Huffman tree. * * This is an optimized implementation that * (a) takes advantage of the frequencies being already sorted; * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman * tree are sufficient to generate a canonical code; * (c) Only stores parent pointers, not child pointers; * (d) Produces the nodes in the same memory used for input frequency * information. * * Array 'A', which contains 'sym_count' entries, is used for both input and * output. For this function, 'sym_count' must be at least 2. * * For input, the array must contain the frequencies of the symbols, sorted in * increasing order. Specifically, each entry must contain a frequency left * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of * the entries will be ignored by this function. Although these bits will, in * fact, contain the symbols that correspond to the frequencies, this function * is concerned with frequencies only and keeps the symbols as-is. * * For output, this function will produce the non-leaf nodes of the Huffman * tree. These nodes will be stored in the first (sym_count - 1) entries of the * array. Entry A[sym_count - 2] will represent the root node. 
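 * (For instance, with sym_count == 4 leaves, the three non-leaf nodes are
 * built in A[0..2], and A[2] is the root.)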
Each other node * will contain the zero-based index of its parent node in 'A', left shifted by * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will * be kept as-is. Again, note that although these low bits will, in fact, * contain a symbol value, this symbol will have *no relationship* with the * Huffman tree node that happens to occupy the same slot. This is because this * implementation only generates the non-leaf nodes of the tree. */ static void build_tree(u32 A[], unsigned sym_count) { const unsigned last_idx = sym_count - 1; /* Index of the next lowest frequency leaf that still needs a parent */ unsigned i = 0; /* * Index of the next lowest frequency non-leaf that still needs a * parent, or 'e' if there is currently no such node */ unsigned b = 0; /* Index of the next spot for a non-leaf (will overwrite a leaf) */ unsigned e = 0; do { u32 new_freq; /* * Select the next two lowest frequency nodes among the leaves * A[i] and non-leaves A[b], and create a new node A[e] to be * their parent. Set the new node's frequency to the sum of the * frequencies of its two children. * * Usually the next two lowest frequency nodes are of the same * type (leaf or non-leaf), so check those cases first. */ if (i + 1 <= last_idx && (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { /* Two leaves */ new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); i += 2; } else if (b + 2 <= e && (i > last_idx || (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { /* Two non-leaves */ new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); A[b + 1] = (e << NUM_SYMBOL_BITS) | (A[b + 1] & SYMBOL_MASK); b += 2; } else { /* One leaf and one non-leaf */ new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); i++; b++; } A[e] = new_freq | (A[e] & SYMBOL_MASK); /* * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the * tree is complete once we've created 'n - 1' non-leaves. */ } while (++e < last_idx); } /* * Given the stripped-down Huffman tree constructed by build_tree(), determine * the number of codewords that should be assigned each possible length, taking * into account the length-limited constraint. * * @A * The array produced by build_tree(), containing parent index information * for the non-leaf nodes of the Huffman tree. Each entry in this array is * a node; a node's parent always has a greater index than that node * itself. This function will overwrite the parent index information in * this array, so essentially it will destroy the tree. However, the data * in the low NUM_SYMBOL_BITS of each entry will be preserved. * * @root_idx * The 0-based index of the root node in 'A', and consequently one less * than the number of tree node entries in 'A'. (Or, really 2 less than * the actual length of 'A'.) * * @len_counts * An array of length ('max_codeword_len' + 1) in which the number of * codewords having each length <= max_codeword_len will be returned. * * @max_codeword_len * The maximum permissible codeword length. */ static void compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[], unsigned max_codeword_len) { unsigned len; int node; /* * The key observations are: * * (1) We can traverse the non-leaf nodes of the tree, always visiting a * parent before its children, by simply iterating through the array * in reverse order. Consequently, we can compute the depth of each * node in one pass, overwriting the parent indices with depths. 
* * (2) We can initially assume that in the real Huffman tree, both * children of the root are leaves. This corresponds to two * codewords of length 1. Then, whenever we visit a (non-leaf) node * during the traversal, we modify this assumption to account for * the current node *not* being a leaf, but rather its two children * being leaves. This causes the loss of one codeword for the * current depth and the addition of two codewords for the current * depth plus one. * * (3) We can handle the length-limited constraint fairly easily by * simply using the largest length available when a depth exceeds * max_codeword_len. */ for (len = 0; len <= max_codeword_len; len++) len_counts[len] = 0; len_counts[1] = 2; /* Set the root node's depth to 0. */ A[root_idx] &= SYMBOL_MASK; for (node = root_idx - 1; node >= 0; node--) { /* Calculate the depth of this node. */ unsigned parent = A[node] >> NUM_SYMBOL_BITS; unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; unsigned depth = parent_depth + 1; /* * Set the depth of this node so that it is available when its * children (if any) are processed. */ A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); /* * If needed, decrease the length to meet the length-limited * constraint. This is not the optimal method for generating * length-limited Huffman codes! But it should be good enough. */ if (depth >= max_codeword_len) { depth = max_codeword_len; do { depth--; } while (len_counts[depth] == 0); } /* * Account for the fact that we have a non-leaf node at the * current depth. */ len_counts[depth]--; len_counts[depth + 1] += 2; } } /* * DEFLATE uses bit-reversed codewords, so we must bit-reverse the codewords * after generating them. All codewords have length <= 16 bits. If the CPU has * a bit-reversal instruction, then that is the fastest method. Otherwise the * fastest method is to reverse the bits in each of the two bytes using a table. * The table method is slightly faster than using bitwise operations to flip * adjacent 1, 2, 4, and then 8-bit fields, even if 2 to 4 codewords are packed * into a machine word and processed together using that method. 
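 * As a quick sanity check of the table method: reversing the 3-bit codeword
 * 0b110 gives 0b011, and indeed reverse_codeword(0x6, 3) computes
 * ((bitreverse_tab[0x06] << 8) | bitreverse_tab[0x00]) >> 13 ==
 * 0x6000 >> 13 == 0x3.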
*/ #ifdef rbit32 static forceinline u32 reverse_codeword(u32 codeword, u8 len) { return rbit32(codeword) >> ((32 - len) & 31); } #else /* Generated by scripts/gen_bitreverse_tab.py */ static const u8 bitreverse_tab[256] = { 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, }; static forceinline u32 reverse_codeword(u32 codeword, u8 len) { STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) | bitreverse_tab[codeword >> 8]; return codeword >> (16 - len); } #endif /* !rbit32 */ /* * Generate the codewords for a canonical Huffman code. * * @A * The output array for codewords. In addition, initially this * array must contain the symbols, sorted primarily by frequency and * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of * each entry. * * @len * Output array for codeword lengths. * * @len_counts * An array that provides the number of codewords that will have * each possible length <= max_codeword_len. * * @max_codeword_len * Maximum length, in bits, of each codeword. * * @num_syms * Number of symbols in the alphabet, including symbols with zero * frequency. This is the length of the 'A' and 'len' arrays. */ static void gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], unsigned max_codeword_len, unsigned num_syms) { u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; unsigned i; unsigned len; unsigned sym; /* * Given the number of codewords that will have each length, assign * codeword lengths to symbols. We do this by assigning the lengths in * decreasing order to the symbols sorted primarily by increasing * frequency and secondarily by increasing symbol value. */ for (i = 0, len = max_codeword_len; len >= 1; len--) { unsigned count = len_counts[len]; while (count--) lens[A[i++] & SYMBOL_MASK] = len; } /* * Generate the codewords themselves. We initialize the * 'next_codewords' array to provide the lexicographically first * codeword of each length, then assign codewords in symbol order. This * produces a canonical code. 
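 * For example, if the code has one codeword of length 1 and two of length
 * 2, then next_codewords[1] == 0 and next_codewords[2] == (0 + 1) << 1 ==
 * 0b10, so the codewords assigned are 0, 10, and 11 (before bit-reversal).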
*/ next_codewords[0] = 0; next_codewords[1] = 0; for (len = 2; len <= max_codeword_len; len++) next_codewords[len] = (next_codewords[len - 1] + len_counts[len - 1]) << 1; for (sym = 0; sym < num_syms; sym++) { /* DEFLATE requires bit-reversed codewords. */ A[sym] = reverse_codeword(next_codewords[lens[sym]]++, lens[sym]); } } /* * --------------------------------------------------------------------- * deflate_make_huffman_code() * --------------------------------------------------------------------- * * Given an alphabet and the frequency of each symbol in it, construct a * length-limited canonical Huffman code. * * @num_syms * The number of symbols in the alphabet. The symbols are the integers in * the range [0, num_syms - 1]. This parameter must be at least 2 and * must not exceed (1 << NUM_SYMBOL_BITS). * * @max_codeword_len * The maximum permissible codeword length. * * @freqs * An array of length @num_syms that gives the frequency of each symbol. * It is valid for some, none, or all of the frequencies to be 0. The sum * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. * * @lens * An array of @num_syms entries in which this function will return the * length, in bits, of the codeword assigned to each symbol. Symbols with * 0 frequency will not have codewords per se, but their entries in this * array will be set to 0. No lengths greater than @max_codeword_len will * be assigned. * * @codewords * An array of @num_syms entries in which this function will return the * codeword for each symbol, right-justified and padded on the left with * zeroes. Codewords for symbols with 0 frequency will be undefined. * * --------------------------------------------------------------------- * * This function builds a length-limited canonical Huffman code. * * A length-limited Huffman code contains no codewords longer than some * specified length, and has exactly (with some algorithms) or approximately * (with the algorithm used here) the minimum weighted path length from the * root, given this constraint. * * A canonical Huffman code satisfies the properties that a longer codeword * never lexicographically precedes a shorter codeword, and the lexicographic * ordering of codewords of the same length is the same as the lexicographic * ordering of the corresponding symbols. A canonical Huffman code, or more * generally a canonical prefix code, can be reconstructed from only a list * containing the codeword length of each symbol. * * The classic algorithm to generate a Huffman code creates a node for each * symbol, then inserts these nodes into a min-heap keyed by symbol frequency. * Then, repeatedly, the two lowest-frequency nodes are removed from the * min-heap and added as the children of a new node having frequency equal to * the sum of its two children, which is then inserted into the min-heap. When * only a single node remains in the min-heap, it is the root of the Huffman * tree. The codeword for each symbol is determined by the path needed to reach * the corresponding node from the root. Descending to the left child appends a * 0 bit, whereas descending to the right child appends a 1 bit. * * The classic algorithm is relatively easy to understand, but it is subject to * a number of inefficiencies. In practice, it is fastest to first sort the * symbols by frequency. (This itself can be subject to an optimization based * on the fact that most frequencies tend to be low.) At the same time, we sort * secondarily by symbol value, which aids the process of generating a canonical * code. 
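 * (If symbols 3 and 7 both have frequency 10, for instance, symbol 3 sorts
 * first, so equal-length codewords come out in increasing symbol order,
 * exactly as a canonical code requires.)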
Then, during tree construction, no heap is necessary because both the * leaf nodes and the unparented non-leaf nodes can be easily maintained in * sorted order. Consequently, there can never be more than two possibilities * for the next-lowest-frequency node. * * In addition, because we're generating a canonical code, we actually don't * need the leaf nodes of the tree at all, only the non-leaf nodes. This is * because for canonical code generation we don't need to know where the symbols * are in the tree. Rather, we only need to know how many leaf nodes have each * depth (codeword length). And this information can, in fact, be quickly * generated from the tree of non-leaves only. * * Furthermore, we can build this stripped-down Huffman tree directly in the * array in which the codewords are to be generated, provided that these array * slots are large enough to hold a symbol and frequency value. * * Still furthermore, we don't even need to maintain explicit child pointers. * We only need the parent pointers, and even those can be overwritten in-place * with depth information as part of the process of extracting codeword lengths * from the tree. So in summary, we do NOT need a big structure like: * * struct huffman_tree_node { * unsigned int symbol; * unsigned int frequency; * unsigned int depth; * struct huffman_tree_node *left_child; * struct huffman_tree_node *right_child; * }; * * * ... which often gets used in "naive" implementations of Huffman code * generation. * * Many of these optimizations are based on the implementation in 7-Zip (source * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov. */ static void deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, const u32 freqs[], u8 lens[], u32 codewords[]) { u32 *A = codewords; unsigned num_used_syms; STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1); /* * We begin by sorting the symbols primarily by frequency and * secondarily by symbol value. As an optimization, the array used for * this purpose ('A') shares storage with the space in which we will * eventually return the codewords. */ num_used_syms = sort_symbols(num_syms, freqs, lens, A); /* * 'num_used_syms' is the number of symbols with nonzero frequency. * This may be less than @num_syms. 'num_used_syms' is also the number * of entries in 'A' that are valid. Each entry consists of a distinct * symbol and a nonzero frequency packed into a 32-bit integer. */ /* * A complete Huffman code must contain at least 2 codewords. Yet, it's * possible that fewer than 2 symbols were used. When this happens, * it's usually for the offset code (0-1 symbols used). But it's also * theoretically possible for the litlen and pre codes (1 symbol used). * * The DEFLATE RFC explicitly allows the offset code to contain just 1 * codeword, or even be completely empty. But it's silent about the * other codes. It also doesn't say whether, in the 1-codeword case, * the codeword (which it says must be 1 bit) is '0' or '1'. * * In any case, some DEFLATE decompressors reject these cases. zlib * generally allows them, but it does reject precodes that have just 1 * codeword. More problematically, zlib v1.2.1 and earlier rejected * empty offset codes, and this behavior can also be seen in Windows * Explorer's ZIP unpacker (supposedly even still in Windows 11). * * Other DEFLATE compressors, including zlib, always send at least 2 * codewords in order to make a complete Huffman code. 
Therefore, this * is a case where practice does not entirely match the specification. * We follow practice by generating 2 codewords of length 1: codeword * '0' for symbol 0, and codeword '1' for another symbol -- the used * symbol if it exists and is not symbol 0, otherwise symbol 1. This * does worsen the compression ratio by having to send an unnecessary * offset codeword length. But this only affects rare cases such as * blocks containing all literals, and it only makes a tiny difference. */ if (unlikely(num_used_syms < 2)) { unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0; unsigned nonzero_idx = sym ? sym : 1; codewords[0] = 0; lens[0] = 1; codewords[nonzero_idx] = 1; lens[nonzero_idx] = 1; return; } /* * Build a stripped-down version of the Huffman tree, sharing the array * 'A' with the symbol values. Then extract length counts from the tree * and use them to generate the final codewords. */ build_tree(A, num_used_syms); { unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; compute_length_counts(A, num_used_syms - 2, len_counts, max_codeword_len); gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); } } /* * Clear the Huffman symbol frequency counters. This must be called when * starting a new DEFLATE block. */ static void deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) { memset(&c->freqs, 0, sizeof(c->freqs)); } /* * Build the literal/length and offset Huffman codes for a DEFLATE block. * * This takes as input the frequency tables for each alphabet and produces as * output a set of tables that map symbols to codewords and codeword lengths. */ static void deflate_make_huffman_codes(const struct deflate_freqs *freqs, struct deflate_codes *codes) { deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, MAX_LITLEN_CODEWORD_LEN, freqs->litlen, codes->lens.litlen, codes->codewords.litlen); deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, MAX_OFFSET_CODEWORD_LEN, freqs->offset, codes->lens.offset, codes->codewords.offset); } /* Initialize c->static_codes. */ static void deflate_init_static_codes(struct libdeflate_compressor *c) { unsigned i; for (i = 0; i < 144; i++) c->freqs.litlen[i] = 1 << (9 - 8); for (; i < 256; i++) c->freqs.litlen[i] = 1 << (9 - 9); for (; i < 280; i++) c->freqs.litlen[i] = 1 << (9 - 7); for (; i < 288; i++) c->freqs.litlen[i] = 1 << (9 - 8); for (i = 0; i < 32; i++) c->freqs.offset[i] = 1 << (5 - 5); deflate_make_huffman_codes(&c->freqs, &c->static_codes); } /* Return the offset slot for the given match offset, using the small map. */ static forceinline unsigned deflate_get_offset_slot(u32 offset) { /* * 1 <= offset <= 32768 here. For 1 <= offset <= 256, * deflate_offset_slot[offset - 1] gives the slot. * * For 257 <= offset <= 32768, we take advantage of the fact that 257 is * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 == * 128 times larger than each slot [2..16) (since the number of extra * bits increases by 1 every 2 slots). Thus, the slot is: * * deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2) * == deflate_offset_slot[((offset - 1) >> 7)] + 14 * * Define 'n = (offset <= 256) ? 0 : 7'. Then any offset is handled by: * * deflate_offset_slot[(offset - 1) >> n] + (n << 1) * * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'. 
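 * Worked example: for offset == 300, 256 - 300 wraps to 0xFFFFFFD4, so
 * n == 7; then deflate_offset_slot[(300 - 1) >> 7] + (7 << 1) ==
 * deflate_offset_slot[2] + 14 == 2 + 14 == 16, and slot 16 indeed covers
 * offsets 257..384.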
*/ unsigned n = (256 - offset) >> 29; ASSERT(offset >= 1 && offset <= 32768); return deflate_offset_slot[(offset - 1) >> n] + (n << 1); } static unsigned deflate_compute_precode_items(const u8 lens[], const unsigned num_lens, u32 precode_freqs[], unsigned precode_items[]) { unsigned *itemptr; unsigned run_start; unsigned run_end; unsigned extra_bits; u8 len; memset(precode_freqs, 0, DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); itemptr = precode_items; run_start = 0; do { /* Find the next run of codeword lengths. */ /* len = the length being repeated */ len = lens[run_start]; /* Extend the run. */ run_end = run_start; do { run_end++; } while (run_end != num_lens && len == lens[run_end]); if (len == 0) { /* Run of zeroes. */ /* Symbol 18: RLE 11 to 138 zeroes at a time. */ while ((run_end - run_start) >= 11) { extra_bits = MIN((run_end - run_start) - 11, 0x7F); precode_freqs[18]++; *itemptr++ = 18 | (extra_bits << 5); run_start += 11 + extra_bits; } /* Symbol 17: RLE 3 to 10 zeroes at a time. */ if ((run_end - run_start) >= 3) { extra_bits = MIN((run_end - run_start) - 3, 0x7); precode_freqs[17]++; *itemptr++ = 17 | (extra_bits << 5); run_start += 3 + extra_bits; } } else { /* A run of nonzero lengths. */ /* Symbol 16: RLE 3 to 6 of the previous length. */ if ((run_end - run_start) >= 4) { precode_freqs[len]++; *itemptr++ = len; run_start++; do { extra_bits = MIN((run_end - run_start) - 3, 0x3); precode_freqs[16]++; *itemptr++ = 16 | (extra_bits << 5); run_start += 3 + extra_bits; } while ((run_end - run_start) >= 3); } } /* Output any remaining lengths without RLE. */ while (run_start != run_end) { precode_freqs[len]++; *itemptr++ = len; run_start++; } } while (run_start != num_lens); return itemptr - precode_items; } /* * Huffman codeword lengths for dynamic Huffman blocks are compressed using a * separate Huffman code, the "precode", which contains a symbol for each * possible codeword length in the larger code as well as several special * symbols to represent repeated codeword lengths (a form of run-length * encoding). The precode is itself constructed in canonical form, and its * codeword lengths are represented literally in 19 3-bit fields that * immediately precede the compressed codeword lengths of the larger code. */ /* Precompute the information needed to output dynamic Huffman codes. */ static void deflate_precompute_huffman_header(struct libdeflate_compressor *c) { /* Compute how many litlen and offset symbols are needed. */ for (c->o.precode.num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; c->o.precode.num_litlen_syms > 257; c->o.precode.num_litlen_syms--) if (c->codes.lens.litlen[c->o.precode.num_litlen_syms - 1] != 0) break; for (c->o.precode.num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; c->o.precode.num_offset_syms > 1; c->o.precode.num_offset_syms--) if (c->codes.lens.offset[c->o.precode.num_offset_syms - 1] != 0) break; /* * If we're not using the full set of literal/length codeword lengths, * then temporarily move the offset codeword lengths over so that the * literal/length and offset codeword lengths are contiguous. */ STATIC_ASSERT(offsetof(struct deflate_lens, offset) == DEFLATE_NUM_LITLEN_SYMS); if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { memmove((u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, c->o.precode.num_offset_syms); } /* * Compute the "items" (RLE / literal tokens and extra bits) with which * the codeword lengths in the larger code will be output. 
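 * For example, a run of 20 zero lengths becomes one item for precode symbol
 * 18 with an extra-bits value of 9 (since 11 + 9 == 20), encoded as
 * 18 | (9 << 5).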
*/ c->o.precode.num_items = deflate_compute_precode_items((u8 *)&c->codes.lens, c->o.precode.num_litlen_syms + c->o.precode.num_offset_syms, c->o.precode.freqs, c->o.precode.items); /* Build the precode. */ deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, MAX_PRE_CODEWORD_LEN, c->o.precode.freqs, c->o.precode.lens, c->o.precode.codewords); /* Count how many precode lengths we actually need to output. */ for (c->o.precode.num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; c->o.precode.num_explicit_lens > 4; c->o.precode.num_explicit_lens--) if (c->o.precode.lens[deflate_precode_lens_permutation[ c->o.precode.num_explicit_lens - 1]] != 0) break; /* Restore the offset codeword lengths if needed. */ if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, (u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, c->o.precode.num_offset_syms); } } /* * To make it faster to output matches, compute the "full" match length * codewords, i.e. the concatenation of the litlen codeword and the extra bits * for each possible match length. */ static void deflate_compute_full_len_codewords(struct libdeflate_compressor *c, const struct deflate_codes *codes) { unsigned len; STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN + DEFLATE_MAX_EXTRA_LENGTH_BITS <= 32); for (len = DEFLATE_MIN_MATCH_LEN; len <= DEFLATE_MAX_MATCH_LEN; len++) { unsigned slot = deflate_length_slot[len]; unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + slot; u32 extra_bits = len - deflate_length_slot_base[slot]; c->o.length.codewords[len] = codes->codewords.litlen[litlen_sym] | (extra_bits << codes->lens.litlen[litlen_sym]); c->o.length.lens[len] = codes->lens.litlen[litlen_sym] + deflate_extra_length_bits[slot]; } } /* Write a match to the output buffer. */ #define WRITE_MATCH(c_, codes_, length_, offset_, offset_slot_) \ do { \ const struct libdeflate_compressor *c__ = (c_); \ const struct deflate_codes *codes__ = (codes_); \ unsigned length__ = (length_); \ unsigned offset__ = (offset_); \ unsigned offset_slot__ = (offset_slot_); \ \ /* Litlen symbol and extra length bits */ \ STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ DEFLATE_MAX_EXTRA_LENGTH_BITS)); \ ADD_BITS(c__->o.length.codewords[length__], \ c__->o.length.lens[length__]); \ \ if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ DEFLATE_MAX_EXTRA_LENGTH_BITS + \ MAX_OFFSET_CODEWORD_LEN + \ DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ FLUSH_BITS(); \ \ /* Offset symbol */ \ ADD_BITS(codes__->codewords.offset[offset_slot__], \ codes__->lens.offset[offset_slot__]); \ \ if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + \ DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ FLUSH_BITS(); \ \ /* Extra offset bits */ \ ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__], \ deflate_extra_offset_bits[offset_slot__]); \ \ FLUSH_BITS(); \ } while (0) /* * Choose the best type of block to use (dynamic Huffman, static Huffman, or * uncompressed), then output it. * * The uncompressed data of the block is @block_begin[0..@block_length-1]. The * sequence of literals and matches that will be used to compress the block (if * a compressed block is chosen) is given by @sequences if it's non-NULL, or * else @c->p.n.optimum_nodes. @c->freqs and @c->codes must be already set * according to the literals, matches, and end-of-block symbol. 
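 * (As a reminder of the format: every block starts with a 1-bit BFINAL flag
 * and a 2-bit BTYPE field, where BTYPE 0 means uncompressed, 1 means static
 * Huffman, and 2 means dynamic Huffman. The 3-bit header cost below comes
 * from these fields.)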
*/ static void deflate_flush_block(struct libdeflate_compressor *c, struct deflate_output_bitstream *os, const u8 *block_begin, u32 block_length, const struct deflate_sequence *sequences, bool is_final_block) { /* * It is hard to get compilers to understand that writes to 'os->next' * don't alias 'os'. That hurts performance significantly, as * everything in 'os' would keep getting re-loaded. ('restrict' * *should* do the trick, but it's unreliable.) Therefore, we keep all * the output bitstream state in local variables, and output bits using * macros. This is similar to what the decompressor does. */ const u8 *in_next = block_begin; const u8 * const in_end = block_begin + block_length; bitbuf_t bitbuf = os->bitbuf; unsigned bitcount = os->bitcount; u8 *out_next = os->next; u8 * const out_fast_end = os->end - MIN(WORDBYTES - 1, os->end - out_next); /* * The cost for each block type, in bits. Start with the cost of the * block header which is 3 bits. */ u32 dynamic_cost = 3; u32 static_cost = 3; u32 uncompressed_cost = 3; u32 best_cost; struct deflate_codes *codes; unsigned sym; ASSERT(block_length >= MIN_BLOCK_LENGTH || (is_final_block && block_length > 0)); ASSERT(block_length <= MAX_BLOCK_LENGTH); ASSERT(bitcount <= 7); ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); ASSERT(out_next <= os->end); ASSERT(!os->overflow); /* Precompute the precode items and build the precode. */ deflate_precompute_huffman_header(c); /* Account for the cost of encoding dynamic Huffman codes. */ dynamic_cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { u32 extra = deflate_extra_precode_bits[sym]; dynamic_cost += c->o.precode.freqs[sym] * (extra + c->o.precode.lens[sym]); } /* Account for the cost of encoding literals. */ for (sym = 0; sym < 144; sym++) { dynamic_cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; static_cost += c->freqs.litlen[sym] * 8; } for (; sym < 256; sym++) { dynamic_cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; static_cost += c->freqs.litlen[sym] * 9; } /* Account for the cost of encoding the end-of-block symbol. */ dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK]; static_cost += 7; /* Account for the cost of encoding lengths. */ for (sym = DEFLATE_FIRST_LEN_SYM; sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits); sym++) { u32 extra = deflate_extra_length_bits[ sym - DEFLATE_FIRST_LEN_SYM]; dynamic_cost += c->freqs.litlen[sym] * (extra + c->codes.lens.litlen[sym]); static_cost += c->freqs.litlen[sym] * (extra + c->static_codes.lens.litlen[sym]); } /* Account for the cost of encoding offsets. */ for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { u32 extra = deflate_extra_offset_bits[sym]; dynamic_cost += c->freqs.offset[sym] * (extra + c->codes.lens.offset[sym]); static_cost += c->freqs.offset[sym] * (extra + 5); } /* Compute the cost of using uncompressed blocks. */ uncompressed_cost += (-(bitcount + 3) & 7) + 32 + (40 * (DIV_ROUND_UP(block_length, UINT16_MAX) - 1)) + (8 * block_length); /* * Choose and output the cheapest type of block. If there is a tie, * prefer uncompressed, then static, then dynamic. */ best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost)); /* If the block isn't going to fit, then stop early. */ if (DIV_ROUND_UP(bitcount + best_cost, 8) > os->end - out_next) { os->overflow = true; return; } /* * Else, now we know that the block fits, so no further bounds checks on * the output buffer are required until the next block. 
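 *
 * (Illustration with made-up numbers: with bitcount == 5 and
 * best_cost == 8003 bits, DIV_ROUND_UP(5 + 8003, 8) == 1001, so at least
 * 1001 output bytes must remain for the block to fit.)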
*/ if (best_cost == uncompressed_cost) { /* * Uncompressed block(s). DEFLATE limits the length of * uncompressed blocks to UINT16_MAX bytes, so if the length of * the "block" we're flushing is over UINT16_MAX, we actually * output multiple blocks. */ do { u8 bfinal = 0; size_t len = UINT16_MAX; if (in_end - in_next <= UINT16_MAX) { bfinal = is_final_block; len = in_end - in_next; } /* It was already checked that there is enough space. */ ASSERT(os->end - out_next >= DIV_ROUND_UP(bitcount + 3, 8) + 4 + len); /* * Output BFINAL (1 bit) and BTYPE (2 bits), then align * to a byte boundary. */ STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); *out_next++ = (bfinal << bitcount) | bitbuf; if (bitcount > 5) *out_next++ = 0; bitbuf = 0; bitcount = 0; /* Output LEN and NLEN, then the data itself. */ put_unaligned_le16(len, out_next); out_next += 2; put_unaligned_le16(~len, out_next); out_next += 2; memcpy(out_next, in_next, len); out_next += len; in_next += len; } while (in_next != in_end); /* Done outputting uncompressed block(s) */ goto out; } if (best_cost == static_cost) { /* Static Huffman block */ codes = &c->static_codes; ADD_BITS(is_final_block, 1); ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); FLUSH_BITS(); } else { const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; const unsigned num_precode_items = c->o.precode.num_items; unsigned precode_sym, precode_item; unsigned i; /* Dynamic Huffman block */ codes = &c->codes; STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); ADD_BITS(is_final_block, 1); ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2); ADD_BITS(c->o.precode.num_litlen_syms - 257, 5); ADD_BITS(c->o.precode.num_offset_syms - 1, 5); ADD_BITS(num_explicit_lens - 4, 4); /* Output the lengths of the codewords in the precode. */ if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { /* * A 64-bit bitbuffer is just one bit too small to hold * the maximum number of precode lens, so to minimize * flushes we merge one len with the previous fields. */ precode_sym = deflate_precode_lens_permutation[0]; ADD_BITS(c->o.precode.lens[precode_sym], 3); FLUSH_BITS(); i = 1; /* num_explicit_lens >= 4 */ do { precode_sym = deflate_precode_lens_permutation[i]; ADD_BITS(c->o.precode.lens[precode_sym], 3); } while (++i < num_explicit_lens); FLUSH_BITS(); } else { FLUSH_BITS(); i = 0; do { precode_sym = deflate_precode_lens_permutation[i]; ADD_BITS(c->o.precode.lens[precode_sym], 3); FLUSH_BITS(); } while (++i < num_explicit_lens); } /* * Output the lengths of the codewords in the litlen and offset * codes, encoded by the precode. */ i = 0; do { precode_item = c->o.precode.items[i]; precode_sym = precode_item & 0x1F; STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7)); ADD_BITS(c->o.precode.codewords[precode_sym], c->o.precode.lens[precode_sym]); ADD_BITS(precode_item >> 5, deflate_extra_precode_bits[precode_sym]); FLUSH_BITS(); } while (++i < num_precode_items); } /* Output the literals and matches for a dynamic or static block. 
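 *
 * Literal codewords are at most MAX_LITLEN_CODEWORD_LEN == 14 bits, so when
 * the bit buffer can hold 4 * 14 == 56 bits, the fast path below emits
 * literals four at a time between flushes; otherwise they are emitted one
 * at a time with a flush after each. Matches are emitted via WRITE_MATCH().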
*/ ASSERT(bitcount <= 7); deflate_compute_full_len_codewords(c, codes); #if SUPPORT_NEAR_OPTIMAL_PARSING if (sequences == NULL) { /* Output the literals and matches from the minimum-cost path */ struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length]; do { unsigned length = cur_node->item & OPTIMUM_LEN_MASK; unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; if (length == 1) { /* Literal */ ADD_BITS(codes->codewords.litlen[offset], codes->lens.litlen[offset]); FLUSH_BITS(); } else { /* Match */ WRITE_MATCH(c, codes, length, offset, c->p.n.offset_slot_full[offset]); } cur_node += length; } while (cur_node != end_node); } else #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ { /* Output the literals and matches from the sequences list. */ const struct deflate_sequence *seq; for (seq = sequences; ; seq++) { u32 litrunlen = seq->litrunlen_and_length & SEQ_LITRUNLEN_MASK; unsigned length = seq->litrunlen_and_length >> SEQ_LENGTH_SHIFT; unsigned lit; /* Output a run of literals. */ if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) { for (; litrunlen >= 4; litrunlen -= 4) { lit = *in_next++; ADD_BITS(codes->codewords.litlen[lit], codes->lens.litlen[lit]); lit = *in_next++; ADD_BITS(codes->codewords.litlen[lit], codes->lens.litlen[lit]); lit = *in_next++; ADD_BITS(codes->codewords.litlen[lit], codes->lens.litlen[lit]); lit = *in_next++; ADD_BITS(codes->codewords.litlen[lit], codes->lens.litlen[lit]); FLUSH_BITS(); } if (litrunlen-- != 0) { lit = *in_next++; ADD_BITS(codes->codewords.litlen[lit], codes->lens.litlen[lit]); if (litrunlen-- != 0) { lit = *in_next++; ADD_BITS(codes->codewords.litlen[lit], codes->lens.litlen[lit]); if (litrunlen-- != 0) { lit = *in_next++; ADD_BITS(codes->codewords.litlen[lit], codes->lens.litlen[lit]); } } FLUSH_BITS(); } } else { while (litrunlen--) { lit = *in_next++; ADD_BITS(codes->codewords.litlen[lit], codes->lens.litlen[lit]); FLUSH_BITS(); } } if (length == 0) { /* Last sequence? */ ASSERT(in_next == in_end); break; } /* Output a match. */ WRITE_MATCH(c, codes, length, seq->offset, seq->offset_slot); in_next += length; } } /* Output the end-of-block symbol. */ ASSERT(bitcount <= 7); ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK], codes->lens.litlen[DEFLATE_END_OF_BLOCK]); FLUSH_BITS(); out: ASSERT(bitcount <= 7); /* * Assert that the block cost was computed correctly. This is relied on * above for the bounds check on the output buffer. Also, * libdeflate_deflate_compress_bound() relies on this via the assumption * that uncompressed blocks will always be used when cheapest. */ ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost); os->bitbuf = bitbuf; os->bitcount = bitcount; os->next = out_next; } static void deflate_finish_block(struct libdeflate_compressor *c, struct deflate_output_bitstream *os, const u8 *block_begin, u32 block_length, const struct deflate_sequence *sequences, bool is_final_block) { c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; deflate_make_huffman_codes(&c->freqs, &c->codes); deflate_flush_block(c, os, block_begin, block_length, sequences, is_final_block); } /******************************************************************************/ /* * Block splitting algorithm. The problem is to decide when it is worthwhile to * start a new block with new Huffman codes. 
There is a theoretically optimal * solution: recursively consider every possible block split, considering the * exact cost of each block, and choose the minimum cost approach. But this is * far too slow. Instead, as an approximation, we can count symbols and after * every N symbols, compare the expected distribution of symbols based on the * previous data with the actual distribution. If they differ "by enough", then * start a new block. * * As an optimization and heuristic, we don't distinguish between every symbol * but rather we combine many symbols into a single "observation type". For * literals we only look at the high bits and low bits, and for matches we only * look at whether the match is long or not. The assumption is that for typical * "real" data, places that are good block boundaries will tend to be noticeable * based only on changes in these aggregate probabilities, without looking for * subtle differences in individual symbols. For example, a change from ASCII * bytes to non-ASCII bytes, or from few matches (generally less compressible) * to many matches (generally more compressible), would be easily noticed based * on the aggregates. * * For determining whether the probability distributions are "different enough" * to start a new block, the simple heuristic of splitting when the sum of * absolute differences exceeds a constant seems to be good enough. We also add * a number proportional to the block length so that the algorithm is more * likely to end long blocks than short blocks. This reflects the general * expectation that it will become increasingly beneficial to start a new block * as the current block grows longer. * * Finally, for an approximation, it is not strictly necessary that the exact * symbols being used are considered. With "near-optimal parsing", for example, * the actual symbols that will be used are unknown until after the block * boundary is chosen and the block has been optimized. Since the final choices * cannot be used, we can use preliminary "greedy" choices instead. */ /* Initialize the block split statistics when starting a new block. */ static void init_block_split_stats(struct block_split_stats *stats) { int i; for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { stats->new_observations[i] = 0; stats->observations[i] = 0; } stats->num_new_observations = 0; stats->num_observations = 0; } /* * Literal observation. Heuristic: use the top 2 bits and low 1 bits of the * literal, for 8 possible literal observation types. */ static forceinline void observe_literal(struct block_split_stats *stats, u8 lit) { stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; stats->num_new_observations++; } /* * Match observation. Heuristic: use one observation type for "short match" and * one observation type for "long match". */ static forceinline void observe_match(struct block_split_stats *stats, unsigned length) { stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++; stats->num_new_observations++; } static void merge_new_observations(struct block_split_stats *stats) { int i; for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { stats->observations[i] += stats->new_observations[i]; stats->new_observations[i] = 0; } stats->num_observations += stats->num_new_observations; stats->num_new_observations = 0; } static bool do_end_block_check(struct block_split_stats *stats, u32 block_length) { if (stats->num_observations > 0) { /* * Compute the sum of absolute differences of probabilities. 
To * avoid needing to use floating point arithmetic or do slow * divisions, we do all arithmetic with the probabilities * multiplied by num_observations * num_new_observations. E.g., * for the "old" observations the probabilities would be * (double)observations[i] / num_observations, but since we * multiply by both num_observations and num_new_observations we * really do observations[i] * num_new_observations. */ u32 total_delta = 0; u32 num_items; u32 cutoff; int i; for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { u32 expected = stats->observations[i] * stats->num_new_observations; u32 actual = stats->new_observations[i] * stats->num_observations; u32 delta = (actual > expected) ? actual - expected : expected - actual; total_delta += delta; } num_items = stats->num_observations + stats->num_new_observations; /* * Heuristic: the cutoff is when the sum of absolute differences * of probabilities becomes at least 200/512. As above, the * probability is multiplied by both num_new_observations and * num_observations. Be careful to avoid integer overflow. */ cutoff = stats->num_new_observations * 200 / 512 * stats->num_observations; /* * Very short blocks have a lot of overhead for the Huffman * codes, so only use them if it clearly seems worthwhile. * (This is an additional penalty, which adds to the smaller * penalty below which scales more slowly.) */ if (block_length < 10000 && num_items < 8192) cutoff += (u64)cutoff * (8192 - num_items) / 8192; /* Ready to end the block? */ if (total_delta + (block_length / 4096) * stats->num_observations >= cutoff) return true; } merge_new_observations(stats); return false; } static forceinline bool ready_to_check_block(const struct block_split_stats *stats, const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) { return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK && in_next - in_block_begin >= MIN_BLOCK_LENGTH && in_end - in_next >= MIN_BLOCK_LENGTH; } static forceinline bool should_end_block(struct block_split_stats *stats, const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) { /* Ready to try to end the block (again)? 
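 * A check is attempted only after NUM_OBSERVATIONS_PER_BLOCK_CHECK new
 * observations have accumulated, and only while at least MIN_BLOCK_LENGTH
 * bytes lie on each side of the candidate split point. (A worked
 * illustration of do_end_block_check() with made-up counts: with 512 old
 * and 512 new observations, the cutoff starts at
 * 512 * 200 / 512 * 512 == 102400 before the short-block penalty;
 * identical distributions give total_delta == 0, so the block continues.)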
*/ if (!ready_to_check_block(stats, in_block_begin, in_next, in_end)) return false; return do_end_block_check(stats, in_next - in_block_begin); } /******************************************************************************/ static void deflate_begin_sequences(struct libdeflate_compressor *c, struct deflate_sequence *first_seq) { deflate_reset_symbol_frequencies(c); first_seq->litrunlen_and_length = 0; } static forceinline void deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, bool gather_split_stats, struct deflate_sequence *seq) { c->freqs.litlen[literal]++; if (gather_split_stats) observe_literal(&c->split_stats, literal); STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK); seq->litrunlen_and_length++; } static forceinline void deflate_choose_match(struct libdeflate_compressor *c, unsigned length, unsigned offset, bool gather_split_stats, struct deflate_sequence **seq_p) { struct deflate_sequence *seq = *seq_p; unsigned length_slot = deflate_length_slot[length]; unsigned offset_slot = deflate_get_offset_slot(offset); c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++; c->freqs.offset[offset_slot]++; if (gather_split_stats) observe_match(&c->split_stats, length); seq->litrunlen_and_length |= (u32)length << SEQ_LENGTH_SHIFT; seq->offset = offset; seq->offset_slot = offset_slot; seq++; seq->litrunlen_and_length = 0; *seq_p = seq; } /* * Decrease the maximum and nice match lengths if we're approaching the end of * the input buffer. */ static forceinline void adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining) { if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { *max_len = remaining; *nice_len = MIN(*nice_len, *max_len); } } /* * Choose the minimum match length for the greedy and lazy parsers. * * By default the minimum match length is 3, which is the smallest length the * DEFLATE format allows. However, with greedy and lazy parsing, some data * (e.g. DNA sequencing data) benefits greatly from a longer minimum length. * Typically, this is because literals are very cheap. In general, the * near-optimal parser handles this case naturally, but the greedy and lazy * parsers need a heuristic to decide when to use short matches. * * The heuristic we use is to make the minimum match length depend on the number * of different literals that exist in the data. If there are many different * literals, then literals will probably be expensive, so short matches will * probably be worthwhile. Conversely, if not many literals are used, then * probably literals will be cheap and short matches won't be worthwhile. */ static unsigned choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth) { /* map from num_used_literals to min_len */ static const u8 min_lens[] = { 9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* The rest is implicitly 3. */ }; unsigned min_len; STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3); STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1); if (num_used_literals >= ARRAY_LEN(min_lens)) return 3; min_len = min_lens[num_used_literals]; /* * With a low max_search_depth, it may be too hard to find long matches. 
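 * In that case a long minimum would reject most of the matches that can
 * actually be found, so cap min_len at a smaller value. (The exact
 * thresholds below are tuning choices.)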
*/ if (max_search_depth < 16) { if (max_search_depth < 5) min_len = MIN(min_len, 4); else if (max_search_depth < 10) min_len = MIN(min_len, 5); else min_len = MIN(min_len, 7); } return min_len; } static unsigned calculate_min_match_len(const u8 *data, size_t data_len, unsigned max_search_depth) { u8 used[256] = { 0 }; unsigned num_used_literals = 0; size_t i; /* * For very short inputs, the static Huffman code has a good chance of * being best, in which case there is no reason to avoid short matches. */ if (data_len < 512) return DEFLATE_MIN_MATCH_LEN; /* * For an initial approximation, scan the first 4 KiB of data. The * caller may use recalculate_min_match_len() to update min_len later. */ data_len = MIN(data_len, 4096); for (i = 0; i < data_len; i++) used[data[i]] = 1; for (i = 0; i < 256; i++) num_used_literals += used[i]; return choose_min_match_len(num_used_literals, max_search_depth); } /* * Recalculate the minimum match length for a block, now that we know the * distribution of literals that are actually being used (freqs->litlen). */ static unsigned recalculate_min_match_len(const struct deflate_freqs *freqs, unsigned max_search_depth) { u32 literal_freq = 0; u32 cutoff; unsigned num_used_literals = 0; int i; for (i = 0; i < DEFLATE_NUM_LITERALS; i++) literal_freq += freqs->litlen[i]; cutoff = literal_freq >> 10; /* Ignore literals used very rarely. */ for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { if (freqs->litlen[i] > cutoff) num_used_literals++; } return choose_min_match_len(num_used_literals, max_search_depth); } static forceinline const u8 * choose_max_block_end(const u8 *in_block_begin, const u8 *in_end, size_t soft_max_len) { if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH) return in_end; return in_block_begin + soft_max_len; } /* * This is the level 0 "compressor". It always outputs uncompressed blocks. */ static size_t deflate_compress_none(const u8 *in, size_t in_nbytes, u8 *out, size_t out_nbytes_avail) { const u8 *in_next = in; const u8 * const in_end = in + in_nbytes; u8 *out_next = out; u8 * const out_end = out + out_nbytes_avail; /* * If the input is zero-length, we still must output a block in order * for the output to be a valid DEFLATE stream. Handle this case * specially to avoid potentially passing NULL to memcpy() below. */ if (unlikely(in_nbytes == 0)) { if (out_nbytes_avail < 5) return 0; /* BFINAL and BTYPE */ *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); /* LEN and NLEN */ put_unaligned_le32(0xFFFF0000, out_next); return 5; } do { u8 bfinal = 0; size_t len = UINT16_MAX; if (in_end - in_next <= UINT16_MAX) { bfinal = 1; len = in_end - in_next; } if (out_end - out_next < 5 + len) return 0; /* * Output BFINAL and BTYPE. The stream is already byte-aligned * here, so this step always requires outputting exactly 1 byte. */ *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); /* Output LEN and NLEN, then the data itself. */ put_unaligned_le16(len, out_next); out_next += 2; put_unaligned_le16(~len, out_next); out_next += 2; memcpy(out_next, in_next, len); out_next += len; in_next += len; } while (in_next != in_end); return out_next - out; } /* * This is a faster variant of deflate_compress_greedy(). It uses the * ht_matchfinder rather than the hc_matchfinder. It also skips the block * splitting algorithm and just uses fixed length blocks. c->max_search_depth * has no effect with this algorithm, as it is hardcoded in ht_matchfinder.h. 
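 * Instead, a block is ended as soon as FAST_SOFT_MAX_BLOCK_LENGTH input
 * bytes have been consumed or FAST_SEQ_STORE_LENGTH sequences have been
 * stored, whichever comes first.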
*/ static void deflate_compress_fastest(struct libdeflate_compressor * restrict c, const u8 *in, size_t in_nbytes, struct deflate_output_bitstream *os) { const u8 *in_next = in; const u8 *in_end = in_next + in_nbytes; const u8 *in_cur_base = in_next; unsigned max_len = DEFLATE_MAX_MATCH_LEN; unsigned nice_len = MIN(c->nice_match_length, max_len); u32 next_hash = 0; ht_matchfinder_init(&c->p.f.ht_mf); do { /* Starting a new DEFLATE block */ const u8 * const in_block_begin = in_next; const u8 * const in_max_block_end = choose_max_block_end( in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH); struct deflate_sequence *seq = c->p.f.sequences; deflate_begin_sequences(c, seq); do { u32 length; u32 offset; size_t remaining = in_end - in_next; if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { max_len = remaining; if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) { do { deflate_choose_literal(c, *in_next++, false, seq); } while (--max_len); break; } nice_len = MIN(nice_len, max_len); } length = ht_matchfinder_longest_match(&c->p.f.ht_mf, &in_cur_base, in_next, max_len, nice_len, &next_hash, &offset); if (length) { /* Match found */ deflate_choose_match(c, length, offset, false, &seq); ht_matchfinder_skip_bytes(&c->p.f.ht_mf, &in_cur_base, in_next + 1, in_end, length - 1, &next_hash); in_next += length; } else { /* No match found */ deflate_choose_literal(c, *in_next++, false, seq); } /* Check if it's time to output another block. */ } while (in_next < in_max_block_end && seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]); deflate_finish_block(c, os, in_block_begin, in_next - in_block_begin, c->p.f.sequences, in_next == in_end); } while (in_next != in_end && !os->overflow); } /* * This is the "greedy" DEFLATE compressor. It always chooses the longest match. */ static void deflate_compress_greedy(struct libdeflate_compressor * restrict c, const u8 *in, size_t in_nbytes, struct deflate_output_bitstream *os) { const u8 *in_next = in; const u8 *in_end = in_next + in_nbytes; const u8 *in_cur_base = in_next; unsigned max_len = DEFLATE_MAX_MATCH_LEN; unsigned nice_len = MIN(c->nice_match_length, max_len); u32 next_hashes[2] = {0, 0}; hc_matchfinder_init(&c->p.g.hc_mf); do { /* Starting a new DEFLATE block */ const u8 * const in_block_begin = in_next; const u8 * const in_max_block_end = choose_max_block_end( in_next, in_end, SOFT_MAX_BLOCK_LENGTH); struct deflate_sequence *seq = c->p.g.sequences; unsigned min_len; init_block_split_stats(&c->split_stats); deflate_begin_sequences(c, seq); min_len = calculate_min_match_len(in_next, in_max_block_end - in_next, c->max_search_depth); do { u32 length; u32 offset; adjust_max_and_nice_len(&max_len, &nice_len, in_end - in_next); length = hc_matchfinder_longest_match( &c->p.g.hc_mf, &in_cur_base, in_next, min_len - 1, max_len, nice_len, c->max_search_depth, next_hashes, &offset); if (length >= min_len && (length > DEFLATE_MIN_MATCH_LEN || offset <= 4096)) { /* Match found */ deflate_choose_match(c, length, offset, true, &seq); hc_matchfinder_skip_bytes(&c->p.g.hc_mf, &in_cur_base, in_next + 1, in_end, length - 1, next_hashes); in_next += length; } else { /* No match found */ deflate_choose_literal(c, *in_next++, true, seq); } /* Check if it's time to output another block. 
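 * (The loop continues only while the soft maximum block end hasn't been
 * reached, the sequence store still has room, and the block splitting
 * heuristic hasn't fired.)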
*/ } while (in_next < in_max_block_end && seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); deflate_finish_block(c, os, in_block_begin, in_next - in_block_begin, c->p.g.sequences, in_next == in_end); } while (in_next != in_end && !os->overflow); } static forceinline void deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c, const u8 *in, size_t in_nbytes, struct deflate_output_bitstream *os, bool lazy2) { const u8 *in_next = in; const u8 *in_end = in_next + in_nbytes; const u8 *in_cur_base = in_next; unsigned max_len = DEFLATE_MAX_MATCH_LEN; unsigned nice_len = MIN(c->nice_match_length, max_len); u32 next_hashes[2] = {0, 0}; hc_matchfinder_init(&c->p.g.hc_mf); do { /* Starting a new DEFLATE block */ const u8 * const in_block_begin = in_next; const u8 * const in_max_block_end = choose_max_block_end( in_next, in_end, SOFT_MAX_BLOCK_LENGTH); const u8 *next_recalc_min_len = in_next + MIN(in_end - in_next, 10000); struct deflate_sequence *seq = c->p.g.sequences; unsigned min_len; init_block_split_stats(&c->split_stats); deflate_begin_sequences(c, seq); min_len = calculate_min_match_len(in_next, in_max_block_end - in_next, c->max_search_depth); do { unsigned cur_len; unsigned cur_offset; unsigned next_len; unsigned next_offset; /* * Recalculate the minimum match length if it hasn't * been done recently. */ if (in_next >= next_recalc_min_len) { min_len = recalculate_min_match_len( &c->freqs, c->max_search_depth); next_recalc_min_len += MIN(in_end - next_recalc_min_len, in_next - in_block_begin); } /* Find the longest match at the current position. */ adjust_max_and_nice_len(&max_len, &nice_len, in_end - in_next); cur_len = hc_matchfinder_longest_match( &c->p.g.hc_mf, &in_cur_base, in_next, min_len - 1, max_len, nice_len, c->max_search_depth, next_hashes, &cur_offset); if (cur_len < min_len || (cur_len == DEFLATE_MIN_MATCH_LEN && cur_offset > 8192)) { /* No match found. Choose a literal. */ deflate_choose_literal(c, *in_next++, true, seq); continue; } in_next++; have_cur_match: /* * We have a match at the current position. * If it's very long, choose it immediately. */ if (cur_len >= nice_len) { deflate_choose_match(c, cur_len, cur_offset, true, &seq); hc_matchfinder_skip_bytes(&c->p.g.hc_mf, &in_cur_base, in_next, in_end, cur_len - 1, next_hashes); in_next += cur_len - 1; continue; } /* * Try to find a better match at the next position. * * Note: since we already have a match at the *current* * position, we use only half the 'max_search_depth' * when checking the *next* position. This is a useful * trade-off because it's more worthwhile to use a * greater search depth on the initial match. * * Note: it's possible to structure the code such that * there's only one call to longest_match(), which * handles both the "find the initial match" and "try to * find a better match" cases. However, it is faster to * have two call sites, with longest_match() inlined at * each. */ adjust_max_and_nice_len(&max_len, &nice_len, in_end - in_next); next_len = hc_matchfinder_longest_match( &c->p.g.hc_mf, &in_cur_base, in_next++, cur_len - 1, max_len, nice_len, c->max_search_depth >> 1, next_hashes, &next_offset); if (next_len >= cur_len && 4 * (int)(next_len - cur_len) + ((int)bsr32(cur_offset) - (int)bsr32(next_offset)) > 2) { /* * Found a better match at the next position. * Output a literal. Then the next match * becomes the current match. 
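 * (Illustration with made-up numbers: cur_len == 5 at offset 4000 versus
 * next_len == 6 at offset 300 gives
 * 4*(6-5) + (bsr32(4000) - bsr32(300)) == 4 + (11 - 8) == 7 > 2,
 * so the next match would be preferred.)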
*/ deflate_choose_literal(c, *(in_next - 2), true, seq); cur_len = next_len; cur_offset = next_offset; goto have_cur_match; } if (lazy2) { /* In lazy2 mode, look ahead another position */ adjust_max_and_nice_len(&max_len, &nice_len, in_end - in_next); next_len = hc_matchfinder_longest_match( &c->p.g.hc_mf, &in_cur_base, in_next++, cur_len - 1, max_len, nice_len, c->max_search_depth >> 2, next_hashes, &next_offset); if (next_len >= cur_len && 4 * (int)(next_len - cur_len) + ((int)bsr32(cur_offset) - (int)bsr32(next_offset)) > 6) { /* * There's a much better match two * positions ahead, so use two literals. */ deflate_choose_literal( c, *(in_next - 3), true, seq); deflate_choose_literal( c, *(in_next - 2), true, seq); cur_len = next_len; cur_offset = next_offset; goto have_cur_match; } /* * No better match at either of the next 2 * positions. Output the current match. */ deflate_choose_match(c, cur_len, cur_offset, true, &seq); if (cur_len > 3) { hc_matchfinder_skip_bytes(&c->p.g.hc_mf, &in_cur_base, in_next, in_end, cur_len - 3, next_hashes); in_next += cur_len - 3; } } else { /* !lazy2 */ /* * No better match at the next position. Output * the current match. */ deflate_choose_match(c, cur_len, cur_offset, true, &seq); hc_matchfinder_skip_bytes(&c->p.g.hc_mf, &in_cur_base, in_next, in_end, cur_len - 2, next_hashes); in_next += cur_len - 2; } /* Check if it's time to output another block. */ } while (in_next < in_max_block_end && seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); deflate_finish_block(c, os, in_block_begin, in_next - in_block_begin, c->p.g.sequences, in_next == in_end); } while (in_next != in_end && !os->overflow); } /* * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to * see if there's a better match at the next position. If yes, it outputs a * literal and continues to the next position. If no, it outputs the match. */ static void deflate_compress_lazy(struct libdeflate_compressor * restrict c, const u8 *in, size_t in_nbytes, struct deflate_output_bitstream *os) { deflate_compress_lazy_generic(c, in, in_nbytes, os, false); } /* * The lazy2 compressor. This is similar to the regular lazy one, but it looks * for a better match at the next 2 positions rather than the next 1. This * makes it take slightly more time, but compress some inputs slightly more. */ static void deflate_compress_lazy2(struct libdeflate_compressor * restrict c, const u8 *in, size_t in_nbytes, struct deflate_output_bitstream *os) { deflate_compress_lazy_generic(c, in, in_nbytes, os, true); } #if SUPPORT_NEAR_OPTIMAL_PARSING /* * Follow the minimum-cost path in the graph of possible match/literal choices * for the current block and compute the frequencies of the Huffman symbols that * would be needed to output those matches and literals. */ static void deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) { struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; do { unsigned length = cur_node->item & OPTIMUM_LEN_MASK; unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; if (length == 1) { /* Literal */ c->freqs.litlen[offset]++; } else { /* Match */ c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + deflate_length_slot[length]]++; c->freqs.offset[c->p.n.offset_slot_full[offset]]++; } cur_node += length; } while (cur_node != end_node); /* Tally the end-of-block symbol. 
*/ c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; } static void deflate_choose_all_literals(struct libdeflate_compressor *c, const u8 *block, u32 block_length) { u32 i; deflate_reset_symbol_frequencies(c); for (i = 0; i < block_length; i++) c->freqs.litlen[block[i]]++; c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; deflate_make_huffman_codes(&c->freqs, &c->codes); } /* * Compute the exact cost, in bits, that would be required to output the matches * and literals described by @c->freqs as a dynamic Huffman block. The litlen * and offset codes are assumed to have already been built in @c->codes. */ static u32 deflate_compute_true_cost(struct libdeflate_compressor *c) { u32 cost = 0; unsigned sym; deflate_precompute_huffman_header(c); memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0, DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms); cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { cost += c->o.precode.freqs[sym] * (c->o.precode.lens[sym] + deflate_extra_precode_bits[sym]); } for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++) cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; for (; sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits); sym++) cost += c->freqs.litlen[sym] * (c->codes.lens.litlen[sym] + deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]); for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) cost += c->freqs.offset[sym] * (c->codes.lens.offset[sym] + deflate_extra_offset_bits[sym]); return cost; } /* Set the current cost model from the codeword lengths specified in @lens. */ static void deflate_set_costs_from_codes(struct libdeflate_compressor *c, const struct deflate_lens *lens) { unsigned i; /* Literals */ for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS); c->p.n.costs.literal[i] = bits * BIT_COST; } /* Lengths */ for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { unsigned length_slot = deflate_length_slot[i]; unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot; u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); bits += deflate_extra_length_bits[length_slot]; c->p.n.costs.length[i] = bits * BIT_COST; } /* Offset slots */ for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS); bits += deflate_extra_offset_bits[i]; c->p.n.costs.offset_slot[i] = bits * BIT_COST; } } /* * This lookup table gives the default cost of a literal symbol and of a length * symbol, depending on the characteristics of the input data. It was generated * by scripts/gen_default_litlen_costs.py. * * This table is indexed first by the estimated match probability: * * i=0: data doesn't contain many matches [match_prob=0.25] * i=1: neutral [match_prob=0.50] * i=2: data contains lots of matches [match_prob=0.75] * * This lookup produces a subtable which maps the number of distinct used * literals to the default cost of a literal symbol, i.e.: * * int(-log2((1 - match_prob) / num_used_literals) * BIT_COST) * * ... for num_used_literals in [1, 256] (and 0, which is copied from 1). This * accounts for literals usually getting cheaper as the number of distinct * literals decreases, and as the proportion of literals to matches increases. 
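 *
 * For example, with match_prob = 0.5 and 64 used literals, the formula
 * gives int(-log2(0.5 / 64) * 16) == 7 * 16 == 112, which is the value
 * stored at index 64 of the middle subtable below.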
* * The lookup also produces the cost of a length symbol, which is: * * int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST) * * Note: we don't currently assign different costs to different literal symbols, * or to different length symbols, as this is hard to do in a useful way. */ static const struct { u8 used_lits_to_lit_cost[257]; u8 len_sym_cost; } default_litlen_costs[] = { { /* match_prob = 0.25 */ .used_lits_to_lit_cost = { 6, 6, 22, 32, 38, 43, 48, 51, 54, 57, 59, 61, 64, 65, 67, 69, 70, 72, 73, 74, 75, 76, 77, 79, 80, 80, 81, 82, 83, 84, 85, 85, 86, 87, 88, 88, 89, 89, 90, 91, 91, 92, 92, 93, 93, 94, 95, 95, 96, 96, 96, 97, 97, 98, 98, 99, 99, 99, 100, 100, 101, 101, 101, 102, 102, 102, 103, 103, 104, 104, 104, 105, 105, 105, 105, 106, 106, 106, 107, 107, 107, 108, 108, 108, 108, 109, 109, 109, 109, 110, 110, 110, 111, 111, 111, 111, 112, 112, 112, 112, 112, 113, 113, 113, 113, 114, 114, 114, 114, 114, 115, 115, 115, 115, 115, 116, 116, 116, 116, 116, 117, 117, 117, 117, 117, 118, 118, 118, 118, 118, 118, 119, 119, 119, 119, 119, 120, 120, 120, 120, 120, 120, 121, 121, 121, 121, 121, 121, 121, 122, 122, 122, 122, 122, 122, 123, 123, 123, 123, 123, 123, 123, 124, 124, 124, 124, 124, 124, 124, 125, 125, 125, 125, 125, 125, 125, 125, 126, 126, 126, 126, 126, 126, 126, 127, 127, 127, 127, 127, 127, 127, 127, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 130, 130, 130, 130, 130, 130, 130, 130, 130, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 134, 134, 134, 134, 134, 134, 134, 134, }, .len_sym_cost = 109, }, { /* match_prob = 0.5 */ .used_lits_to_lit_cost = { 16, 16, 32, 41, 48, 53, 57, 60, 64, 66, 69, 71, 73, 75, 76, 78, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, 92, 92, 93, 94, 95, 96, 96, 97, 98, 98, 99, 99, 100, 101, 101, 102, 102, 103, 103, 104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 108, 109, 109, 110, 110, 110, 111, 111, 112, 112, 112, 113, 113, 113, 114, 114, 114, 115, 115, 115, 115, 116, 116, 116, 117, 117, 117, 118, 118, 118, 118, 119, 119, 119, 119, 120, 120, 120, 120, 121, 121, 121, 121, 122, 122, 122, 122, 122, 123, 123, 123, 123, 124, 124, 124, 124, 124, 125, 125, 125, 125, 125, 126, 126, 126, 126, 126, 127, 127, 127, 127, 127, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 130, 130, 130, 130, 130, 130, 131, 131, 131, 131, 131, 131, 131, 132, 132, 132, 132, 132, 132, 133, 133, 133, 133, 133, 133, 133, 134, 134, 134, 134, 134, 134, 134, 134, 135, 135, 135, 135, 135, 135, 135, 135, 136, 136, 136, 136, 136, 136, 136, 136, 137, 137, 137, 137, 137, 137, 137, 137, 138, 138, 138, 138, 138, 138, 138, 138, 138, 139, 139, 139, 139, 139, 139, 139, 139, 139, 140, 140, 140, 140, 140, 140, 140, 140, 140, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 143, 143, 143, 143, 143, 143, 143, 143, 143, 143, 144, }, .len_sym_cost = 93, }, { /* match_prob = 0.75 */ .used_lits_to_lit_cost = { 32, 32, 48, 57, 64, 69, 73, 76, 80, 82, 85, 87, 89, 91, 92, 94, 96, 97, 98, 99, 101, 102, 103, 104, 105, 106, 107, 108, 108, 109, 110, 111, 112, 112, 113, 114, 114, 115, 115, 116, 117, 117, 118, 118, 119, 119, 120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 124, 125, 125, 126, 126, 126, 127, 127, 128, 128, 128, 129, 129, 129, 130, 130, 130, 131, 131, 131, 131, 132, 132, 132, 133, 133, 133, 134, 134, 134, 134, 135, 135, 135, 135, 136, 136, 136, 136, 137, 137, 137, 137, 138, 138, 138, 138, 
138, 139, 139, 139, 139, 140, 140, 140, 140, 140, 141, 141, 141, 141, 141, 142, 142, 142, 142, 142, 143, 143, 143, 143, 143, 144, 144, 144, 144, 144, 144, 145, 145, 145, 145, 145, 145, 146, 146, 146, 146, 146, 146, 147, 147, 147, 147, 147, 147, 147, 148, 148, 148, 148, 148, 148, 149, 149, 149, 149, 149, 149, 149, 150, 150, 150, 150, 150, 150, 150, 150, 151, 151, 151, 151, 151, 151, 151, 151, 152, 152, 152, 152, 152, 152, 152, 152, 153, 153, 153, 153, 153, 153, 153, 153, 154, 154, 154, 154, 154, 154, 154, 154, 154, 155, 155, 155, 155, 155, 155, 155, 155, 155, 156, 156, 156, 156, 156, 156, 156, 156, 156, 157, 157, 157, 157, 157, 157, 157, 157, 157, 157, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 159, 159, 159, 159, 159, 159, 159, 159, 159, 159, 160, }, .len_sym_cost = 84, }, }; /* * Choose the default costs for literal and length symbols. These symbols are * both part of the litlen alphabet. */ static void deflate_choose_default_litlen_costs(struct libdeflate_compressor *c, const u8 *block_begin, u32 block_length, u32 *lit_cost, u32 *len_sym_cost) { unsigned num_used_literals = 0; u32 literal_freq = block_length; u32 match_freq = 0; u32 cutoff; u32 i; /* Calculate the number of distinct literals that exist in the data. */ memset(c->freqs.litlen, 0, DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0])); cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */ for (i = 0; i < block_length; i++) c->freqs.litlen[block_begin[i]]++; for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { if (c->freqs.litlen[i] > cutoff) num_used_literals++; } if (num_used_literals == 0) num_used_literals = 1; /* * Estimate the relative frequency of literals and matches in the * optimal parsing solution. We don't know the optimal solution, so * this can only be a very rough estimate. Therefore, we basically use * the match frequency from a greedy parse. We also apply the min_len * heuristic used by the greedy and lazy parsers, to avoid counting too * many matches when literals are cheaper than short matches. */ match_freq = 0; i = choose_min_match_len(num_used_literals, c->max_search_depth); for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { match_freq += c->p.n.match_len_freqs[i]; literal_freq -= i * c->p.n.match_len_freqs[i]; } if ((s32)literal_freq < 0) /* shouldn't happen */ literal_freq = 0; if (match_freq > literal_freq) i = 2; /* many matches */ else if (match_freq * 4 > literal_freq) i = 1; /* neutral */ else i = 0; /* few matches */ STATIC_ASSERT(BIT_COST == 16); *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[ num_used_literals]; *len_sym_cost = default_litlen_costs[i].len_sym_cost; } static forceinline u32 deflate_default_length_cost(unsigned len, u32 len_sym_cost) { unsigned slot = deflate_length_slot[len]; u32 num_extra_bits = deflate_extra_length_bits[slot]; return len_sym_cost + (num_extra_bits * BIT_COST); } static forceinline u32 deflate_default_offset_slot_cost(unsigned slot) { u32 num_extra_bits = deflate_extra_offset_bits[slot]; /* * Assume that all offset symbols are equally probable. * The resulting cost is 'int(-log2(1/30) * BIT_COST)', * where 30 is the number of potentially-used offset symbols. */ u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000; return offset_sym_cost + (num_extra_bits * BIT_COST); } /* Set default symbol costs for the first block's first optimization pass. 
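 * (With BIT_COST == 16, e.g., the default offset symbol cost computed
 * above works out to 4*16 + (907*16)/1000 == 78 units, about 4.9 bits,
 * plus any extra offset bits.)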
*/ static void deflate_set_default_costs(struct libdeflate_compressor *c, u32 lit_cost, u32 len_sym_cost) { unsigned i; /* Literals */ for (i = 0; i < DEFLATE_NUM_LITERALS; i++) c->p.n.costs.literal[i] = lit_cost; /* Lengths */ for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) c->p.n.costs.length[i] = deflate_default_length_cost(i, len_sym_cost); /* Offset slots */ for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i); } static forceinline void deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount) { if (change_amount == 0) /* Block is very similar to previous; prefer previous costs. */ *cost_p = (default_cost + 3 * *cost_p) / 4; else if (change_amount == 1) *cost_p = (default_cost + *cost_p) / 2; else if (change_amount == 2) *cost_p = (5 * default_cost + 3 * *cost_p) / 8; else /* Block differs greatly from previous; prefer default costs. */ *cost_p = (3 * default_cost + *cost_p) / 4; } static forceinline void deflate_adjust_costs_impl(struct libdeflate_compressor *c, u32 lit_cost, u32 len_sym_cost, int change_amount) { unsigned i; /* Literals */ for (i = 0; i < DEFLATE_NUM_LITERALS; i++) deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost, change_amount); /* Lengths */ for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) deflate_adjust_cost(&c->p.n.costs.length[i], deflate_default_length_cost(i, len_sym_cost), change_amount); /* Offset slots */ for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) deflate_adjust_cost(&c->p.n.costs.offset_slot[i], deflate_default_offset_slot_cost(i), change_amount); } /* * Adjust the costs when beginning a new block. * * Since the current costs are optimized for the data already, it can be helpful * to reuse them instead of starting over with the default costs. However, this * depends on how similar the new block is to the previous block. Therefore, * use a heuristic to decide how similar the blocks are, and mix together the * current costs and the default costs accordingly. */ static void deflate_adjust_costs(struct libdeflate_compressor *c, u32 lit_cost, u32 len_sym_cost) { u64 total_delta = 0; u64 cutoff; int i; /* * Decide how different the current block is from the previous block, * using the block splitting statistics from the current and previous * blocks. The more different the current block is, the more we prefer * the default costs rather than the previous block's costs. * * The algorithm here is similar to the end-of-block check one, but here * we compare two entire blocks rather than a partial block with a small * extra part, and therefore we need 64-bit numbers in some places. */ for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { u64 prev = (u64)c->p.n.prev_observations[i] * c->split_stats.num_observations; u64 cur = (u64)c->split_stats.observations[i] * c->p.n.prev_num_observations; total_delta += prev > cur ? prev - cur : cur - prev; } cutoff = ((u64)c->p.n.prev_num_observations * c->split_stats.num_observations * 200) / 512; if (total_delta > 3 * cutoff) /* Big change in the data; just use the default costs. 
*/ deflate_set_default_costs(c, lit_cost, len_sym_cost); else if (4 * total_delta > 9 * cutoff) deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3); else if (2 * total_delta > 3 * cutoff) deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2); else if (2 * total_delta > cutoff) deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1); else deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0); } static void deflate_set_initial_costs(struct libdeflate_compressor *c, const u8 *block_begin, u32 block_length, bool is_first_block) { u32 lit_cost, len_sym_cost; deflate_choose_default_litlen_costs(c, block_begin, block_length, &lit_cost, &len_sym_cost); if (is_first_block) deflate_set_default_costs(c, lit_cost, len_sym_cost); else deflate_adjust_costs(c, lit_cost, len_sym_cost); } /* * Find the minimum-cost path through the graph of possible match/literal * choices for this block. * * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which * represents the node at the beginning of the block, to * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'. * * The algorithm works backwards, starting at the end node and proceeding * backwards one node at a time. At each node, the minimum cost to reach the * end node is computed and the match/literal choice that begins that path is * saved. */ static void deflate_find_min_cost_path(struct libdeflate_compressor *c, const u32 block_length, const struct lz_match *cache_ptr) { struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; struct deflate_optimum_node *cur_node = end_node; cur_node->cost_to_end = 0; do { unsigned num_matches; unsigned literal; u32 best_cost_to_end; cur_node--; cache_ptr--; num_matches = cache_ptr->length; literal = cache_ptr->offset; /* It's always possible to choose a literal. */ best_cost_to_end = c->p.n.costs.literal[literal] + (cur_node + 1)->cost_to_end; cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; /* Also consider matches if there are any. */ if (num_matches) { const struct lz_match *match; unsigned len; unsigned offset; unsigned offset_slot; u32 offset_cost; u32 cost_to_end; /* * Consider each length from the minimum * (DEFLATE_MIN_MATCH_LEN) to the length of the longest * match found at this position. For each length, we * consider only the smallest offset for which that * length is available. Although this is not guaranteed * to be optimal due to the possibility of a larger * offset costing less than a smaller offset to code, * this is a very useful heuristic. */ match = cache_ptr - num_matches; len = DEFLATE_MIN_MATCH_LEN; do { offset = match->offset; offset_slot = c->p.n.offset_slot_full[offset]; offset_cost = c->p.n.costs.offset_slot[offset_slot]; do { cost_to_end = offset_cost + c->p.n.costs.length[len] + (cur_node + len)->cost_to_end; if (cost_to_end < best_cost_to_end) { best_cost_to_end = cost_to_end; cur_node->item = len | ((u32)offset << OPTIMUM_OFFSET_SHIFT); } } while (++len <= match->length); } while (++match != cache_ptr); cache_ptr -= num_matches; } cur_node->cost_to_end = best_cost_to_end; } while (cur_node != &c->p.n.optimum_nodes[0]); deflate_reset_symbol_frequencies(c); deflate_tally_item_list(c, block_length); deflate_make_huffman_codes(&c->freqs, &c->codes); } /* * Choose the literals and matches for the current block, then output the block. 
* * To choose the literal/match sequence, we find the minimum-cost path through * the block's graph of literal/match choices, given a cost model. However, the * true cost of each symbol is unknown until the Huffman codes have been built, * but at the same time the Huffman codes depend on the frequencies of chosen * symbols. Consequently, multiple passes must be used to try to approximate an * optimal solution. The first pass uses default costs, mixed with the costs * from the previous block when it seems appropriate. Later passes use the * Huffman codeword lengths from the previous pass as the costs. * * As an alternate strategy, also consider using only literals. The boolean * returned in *used_only_literals indicates whether that strategy was best. */ static void deflate_optimize_and_flush_block(struct libdeflate_compressor *c, struct deflate_output_bitstream *os, const u8 *block_begin, u32 block_length, const struct lz_match *cache_ptr, bool is_first_block, bool is_final_block, bool *used_only_literals) { unsigned num_passes_remaining = c->p.n.max_optim_passes; u32 best_true_cost = UINT32_MAX; u32 true_cost; u32 only_lits_cost; u32 static_cost = UINT32_MAX; struct deflate_sequence seq_; struct deflate_sequence *seq = NULL; u32 i; /* * On some data, using only literals (no matches) ends up being better * than what the iterative optimization algorithm produces. Therefore, * consider using only literals. */ deflate_choose_all_literals(c, block_begin, block_length); only_lits_cost = deflate_compute_true_cost(c); /* * Force the block to really end at the desired length, even if some * matches extend beyond it. */ for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; /* * Sometimes a static Huffman block ends up being cheapest, particularly * if the block is small. So, if the block is sufficiently small, find * the optimal static block solution and remember its cost. */ if (block_length <= c->p.n.max_len_to_optimize_static_block) { /* Save c->p.n.costs temporarily. */ c->p.n.costs_saved = c->p.n.costs; deflate_set_costs_from_codes(c, &c->static_codes.lens); deflate_find_min_cost_path(c, block_length, cache_ptr); static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; static_cost += 7; /* for the end-of-block symbol */ /* Restore c->p.n.costs. */ c->p.n.costs = c->p.n.costs_saved; } /* Initialize c->p.n.costs with default costs. */ deflate_set_initial_costs(c, block_begin, block_length, is_first_block); do { /* * Find the minimum-cost path for this pass. * Also set c->freqs and c->codes to match the path. */ deflate_find_min_cost_path(c, block_length, cache_ptr); /* * Compute the exact cost of the block if the path were to be * used. Note that this differs from * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses * the actual Huffman codes instead of c->p.n.costs. */ true_cost = deflate_compute_true_cost(c); /* * If the cost didn't improve much from the previous pass, then * doing more passes probably won't be helpful, so stop early. */ if (true_cost + c->p.n.min_improvement_to_continue > best_true_cost) break; best_true_cost = true_cost; /* Save the cost model that gave 'best_true_cost'. */ c->p.n.costs_saved = c->p.n.costs; /* Update the cost model from the Huffman codes. 
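 * (deflate_set_costs_from_codes() converts each codeword length from this
 * pass into the cost, in BIT_COST units, used by the next pass; symbols
 * that went unused get the *_NOSTAT_BITS defaults so they can still be
 * chosen later.)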
*/ deflate_set_costs_from_codes(c, &c->codes.lens); } while (--num_passes_remaining); *used_only_literals = false; if (MIN(only_lits_cost, static_cost) < best_true_cost) { if (only_lits_cost < static_cost) { /* Using only literals ended up being best! */ deflate_choose_all_literals(c, block_begin, block_length); deflate_set_costs_from_codes(c, &c->codes.lens); seq_.litrunlen_and_length = block_length; seq = &seq_; *used_only_literals = true; } else { /* Static block ended up being best! */ deflate_set_costs_from_codes(c, &c->static_codes.lens); deflate_find_min_cost_path(c, block_length, cache_ptr); } } else if (true_cost >= best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { /* * The best solution was actually from a non-final optimization * pass, so recover and use the min-cost path from that pass. */ c->p.n.costs = c->p.n.costs_saved; deflate_find_min_cost_path(c, block_length, cache_ptr); deflate_set_costs_from_codes(c, &c->codes.lens); } deflate_flush_block(c, os, block_begin, block_length, seq, is_final_block); } static void deflate_near_optimal_init_stats(struct libdeflate_compressor *c) { init_block_split_stats(&c->split_stats); memset(c->p.n.new_match_len_freqs, 0, sizeof(c->p.n.new_match_len_freqs)); memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); } static void deflate_near_optimal_merge_stats(struct libdeflate_compressor *c) { unsigned i; merge_new_observations(&c->split_stats); for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i]; c->p.n.new_match_len_freqs[i] = 0; } } /* * Save some literal/match statistics from the previous block so that * deflate_adjust_costs() will be able to decide how much the current block * differs from the previous one. */ static void deflate_near_optimal_save_stats(struct libdeflate_compressor *c) { int i; for (i = 0; i < NUM_OBSERVATION_TYPES; i++) c->p.n.prev_observations[i] = c->split_stats.observations[i]; c->p.n.prev_num_observations = c->split_stats.num_observations; } static void deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c) { int i; for (i = 0; i < NUM_OBSERVATION_TYPES; i++) c->split_stats.observations[i] = 0; c->split_stats.num_observations = 0; memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); } /* * This is the "near-optimal" DEFLATE compressor. It computes the optimal * representation of each DEFLATE block using a minimum-cost path search over * the graph of possible match/literal choices for that block, assuming a * certain cost for each Huffman symbol. 
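 *
 * Each block is handled in two phases: first, matches are found with the
 * bt_matchfinder and saved in the match cache; then
 * deflate_optimize_and_flush_block() iteratively refines a cost model and
 * re-runs the minimum-cost path search over the cached matches.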
* * For several reasons, the end result is not guaranteed to be optimal: * * - Nonoptimal choice of blocks * - Heuristic limitations on which matches are actually considered * - Symbol costs are unknown until the symbols have already been chosen * (so iterative optimization must be used) */ static void deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, const u8 *in, size_t in_nbytes, struct deflate_output_bitstream *os) { const u8 *in_next = in; const u8 *in_block_begin = in_next; const u8 *in_end = in_next + in_nbytes; const u8 *in_cur_base = in_next; const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); unsigned max_len = DEFLATE_MAX_MATCH_LEN; unsigned nice_len = MIN(c->nice_match_length, max_len); struct lz_match *cache_ptr = c->p.n.match_cache; u32 next_hashes[2] = {0, 0}; bool prev_block_used_only_literals = false; bt_matchfinder_init(&c->p.n.bt_mf); deflate_near_optimal_init_stats(c); do { /* Starting a new DEFLATE block */ const u8 * const in_max_block_end = choose_max_block_end( in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH); const u8 *prev_end_block_check = NULL; bool change_detected = false; const u8 *next_observation = in_next; unsigned min_len; /* * Use the minimum match length heuristic to improve the * literal/match statistics gathered during matchfinding. * However, the actual near-optimal parse won't respect min_len, * as it can accurately assess the costs of different matches. * * If the "use only literals" strategy happened to be the best * strategy on the previous block, then probably the * min_match_len heuristic is still not aggressive enough for * the data, so force gathering literal stats only. */ if (prev_block_used_only_literals) min_len = DEFLATE_MAX_MATCH_LEN + 1; else min_len = calculate_min_match_len( in_block_begin, in_max_block_end - in_block_begin, c->max_search_depth); /* * Find matches until we decide to end the block. We end the * block if any of the following is true: * * (1) Maximum block length has been reached. * (2) Match cache may overflow. * (3) Block split heuristic says to split now. */ for (;;) { struct lz_match *matches; unsigned best_len; size_t remaining = in_end - in_next; /* Slide the window forward if needed. */ if (in_next == in_next_slide) { bt_matchfinder_slide_window(&c->p.n.bt_mf); in_cur_base = in_next; in_next_slide = in_next + MIN(remaining, MATCHFINDER_WINDOW_SIZE); } /* * Find matches at the current position using the * binary tree matchfinder and save them in match_cache. * * Note: the binary tree matchfinder is more suited for * optimal parsing than the hash chain matchfinder. The * reasons for this include: * * - The binary tree matchfinder can find more matches * in the same number of steps. * - One of the major advantages of hash chains is that * skipping positions (not searching for matches at * them) is faster; however, with optimal parsing we * search for matches at almost all positions, so this * advantage of hash chains is negated.
			matches = cache_ptr;
			best_len = 0;
			adjust_max_and_nice_len(&max_len, &nice_len, remaining);
			if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
				cache_ptr = bt_matchfinder_get_matches(
						&c->p.n.bt_mf,
						in_cur_base,
						in_next - in_cur_base,
						max_len,
						nice_len,
						c->max_search_depth,
						next_hashes,
						matches);
				if (cache_ptr > matches)
					best_len = cache_ptr[-1].length;
			}
			if (in_next >= next_observation) {
				if (best_len >= min_len) {
					observe_match(&c->split_stats,
						      best_len);
					next_observation = in_next + best_len;
					c->p.n.new_match_len_freqs[best_len]++;
				} else {
					observe_literal(&c->split_stats,
							*in_next);
					next_observation = in_next + 1;
				}
			}

			cache_ptr->length = cache_ptr - matches;
			cache_ptr->offset = *in_next;
			in_next++;
			cache_ptr++;

			/*
			 * If there was a very long match found, don't cache
			 * any matches for the bytes covered by that match.
			 * This avoids degenerate behavior when compressing
			 * highly redundant data, where the number of matches
			 * can be very large.
			 *
			 * This heuristic doesn't actually hurt the compression
			 * ratio very much.  If there's a long match, then the
			 * data must be highly compressible, so it doesn't
			 * matter much what we do.
			 */
			if (best_len >= DEFLATE_MIN_MATCH_LEN &&
			    best_len >= nice_len) {
				--best_len;
				do {
					remaining = in_end - in_next;
					if (in_next == in_next_slide) {
						bt_matchfinder_slide_window(
							&c->p.n.bt_mf);
						in_cur_base = in_next;
						in_next_slide = in_next +
							MIN(remaining,
							    MATCHFINDER_WINDOW_SIZE);
					}
					adjust_max_and_nice_len(&max_len,
								&nice_len,
								remaining);
					if (max_len >=
					    BT_MATCHFINDER_REQUIRED_NBYTES) {
						bt_matchfinder_skip_byte(
							&c->p.n.bt_mf,
							in_cur_base,
							in_next - in_cur_base,
							nice_len,
							c->max_search_depth,
							next_hashes);
					}
					cache_ptr->length = 0;
					cache_ptr->offset = *in_next;
					in_next++;
					cache_ptr++;
				} while (--best_len);
			}

			/* Maximum block length or end of input reached? */
			if (in_next >= in_max_block_end)
				break;
			/* Match cache overflowed? */
			if (cache_ptr >=
			    &c->p.n.match_cache[MATCH_CACHE_LENGTH])
				break;
			/* Not ready to try to end the block (again)? */
			if (!ready_to_check_block(&c->split_stats,
						  in_block_begin, in_next,
						  in_end))
				continue;
			/* Check if it would be worthwhile to end the block. */
			if (do_end_block_check(&c->split_stats,
					       in_next - in_block_begin)) {
				change_detected = true;
				break;
			}
			/* Ending the block doesn't seem worthwhile here. */
			deflate_near_optimal_merge_stats(c);
			prev_end_block_check = in_next;
		}

		/*
		 * All the matches for this block have been cached.  Now choose
		 * the precise end of the block and the sequence of items to
		 * output to represent it, then flush the block.
		 */
		if (change_detected && prev_end_block_check != NULL) {
			/*
			 * The block is being ended because a recent chunk of
			 * data differs from the rest of the block.  We could
			 * end the block at 'in_next' like the greedy and lazy
			 * compressors do, but that's not ideal since it would
			 * include the differing chunk in the block.  The
			 * near-optimal compressor has time to do a better job.
			 * Therefore, we rewind to just before the chunk, and
			 * output a block that only goes up to there.
			 *
			 * We then set things up to correctly start the next
			 * block, considering that some work has already been
			 * done on it (some matches found and stats gathered).
			 */
			struct lz_match *orig_cache_ptr = cache_ptr;
			const u8 *in_block_end = prev_end_block_check;
			u32 block_length = in_block_end - in_block_begin;
			bool is_first = (in_block_begin == in);
			bool is_final = false;
			u32 num_bytes_to_rewind = in_next - in_block_end;
			size_t cache_len_rewound;

			/* Rewind the match cache. */
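			/*
			 * Illustration of the rewind step below (made-up
			 * numbers): if the last cached position had 3 matches,
			 * its trailer entry has length == 3, so one iteration
			 * of "cache_ptr--; cache_ptr -= cache_ptr->length;"
			 * steps back over 4 entries to the previous position's
			 * trailer.  One iteration per rewound byte leaves
			 * cache_ptr at the first entry for 'in_block_end'.
			 */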
			do {
				cache_ptr--;
				cache_ptr -= cache_ptr->length;
			} while (--num_bytes_to_rewind);
			cache_len_rewound = orig_cache_ptr - cache_ptr;

			deflate_optimize_and_flush_block(
						c, os, in_block_begin,
						block_length, cache_ptr,
						is_first, is_final,
						&prev_block_used_only_literals);
			memmove(c->p.n.match_cache, cache_ptr,
				cache_len_rewound * sizeof(*cache_ptr));
			cache_ptr = &c->p.n.match_cache[cache_len_rewound];

			deflate_near_optimal_save_stats(c);
			/*
			 * Clear the stats for the just-flushed block, leaving
			 * just the stats for the beginning of the next block.
			 */
			deflate_near_optimal_clear_old_stats(c);
			in_block_begin = in_block_end;
		} else {
			/*
			 * The block is being ended for a reason other than a
			 * differing data chunk being detected.  Don't rewind
			 * at all; just end the block at the current position.
			 */
			u32 block_length = in_next - in_block_begin;
			bool is_first = (in_block_begin == in);
			bool is_final = (in_next == in_end);

			deflate_near_optimal_merge_stats(c);
			deflate_optimize_and_flush_block(
						c, os, in_block_begin,
						block_length, cache_ptr,
						is_first, is_final,
						&prev_block_used_only_literals);
			cache_ptr = &c->p.n.match_cache[0];
			deflate_near_optimal_save_stats(c);
			deflate_near_optimal_init_stats(c);
			in_block_begin = in_next;
		}
	} while (in_next != in_end && !os->overflow);
}

/* Initialize c->p.n.offset_slot_full. */
static void
deflate_init_offset_slot_full(struct libdeflate_compressor *c)
{
	unsigned offset_slot;
	unsigned offset;
	unsigned offset_end;

	for (offset_slot = 0;
	     offset_slot < ARRAY_LEN(deflate_offset_slot_base);
	     offset_slot++) {
		offset = deflate_offset_slot_base[offset_slot];
		offset_end = offset +
			     (1 << deflate_extra_offset_bits[offset_slot]);
		do {
			c->p.n.offset_slot_full[offset] = offset_slot;
		} while (++offset != offset_end);
	}
}

#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */

LIBDEFLATEAPI struct libdeflate_compressor *
libdeflate_alloc_compressor_ex(int compression_level,
			       const struct libdeflate_options *options)
{
	struct libdeflate_compressor *c;
	size_t size = offsetof(struct libdeflate_compressor, p);

	check_buildtime_parameters();

	/*
	 * Note: if more fields are added to libdeflate_options, this code will
	 * need to be updated to support both the old and new structs.
	 */
	if (options->sizeof_options != sizeof(*options))
		return NULL;

	if (compression_level < 0 || compression_level > 12)
		return NULL;

#if SUPPORT_NEAR_OPTIMAL_PARSING
	if (compression_level >= 10)
		size += sizeof(c->p.n);
	else
#endif
	{
		if (compression_level >= 2)
			size += sizeof(c->p.g);
		else if (compression_level == 1)
			size += sizeof(c->p.f);
	}

	c = libdeflate_aligned_malloc(options->malloc_func ?
				      options->malloc_func :
				      libdeflate_default_malloc_func,
				      MATCHFINDER_MEM_ALIGNMENT, size);
	if (!c)
		return NULL;
	c->free_func = options->free_func ?
		       options->free_func : libdeflate_default_free_func;

	c->compression_level = compression_level;

	/*
	 * The higher the compression level, the more we should bother trying
	 * to compress very small inputs.
	 */
	c->max_passthrough_size = 55 - (compression_level * 4);
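	/*
	 * (For example, with the formula above, level 6 gives a passthrough
	 * threshold of 55 - 24 = 31 bytes and level 12 gives 55 - 48 = 7
	 * bytes: the higher the level, the smaller an input has to be before
	 * compression is skipped entirely.)
	 */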
	switch (compression_level) {
	case 0:
		c->max_passthrough_size = SIZE_MAX;
		c->impl = NULL; /* not used */
		break;
	case 1:
		c->impl = deflate_compress_fastest;
		/* max_search_depth is unused. */
		c->nice_match_length = 32;
		break;
	case 2:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 6;
		c->nice_match_length = 10;
		break;
	case 3:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 12;
		c->nice_match_length = 14;
		break;
	case 4:
		c->impl = deflate_compress_greedy;
		c->max_search_depth = 16;
		c->nice_match_length = 30;
		break;
	case 5:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 16;
		c->nice_match_length = 30;
		break;
	case 6:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 35;
		c->nice_match_length = 65;
		break;
	case 7:
		c->impl = deflate_compress_lazy;
		c->max_search_depth = 100;
		c->nice_match_length = 130;
		break;
	case 8:
		c->impl = deflate_compress_lazy2;
		c->max_search_depth = 300;
		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
		break;
	case 9:
#if !SUPPORT_NEAR_OPTIMAL_PARSING
	default:
#endif
		c->impl = deflate_compress_lazy2;
		c->max_search_depth = 600;
		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
		break;
#if SUPPORT_NEAR_OPTIMAL_PARSING
	case 10:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 35;
		c->nice_match_length = 75;
		c->p.n.max_optim_passes = 2;
		c->p.n.min_improvement_to_continue = 32;
		c->p.n.min_bits_to_use_nonfinal_path = 32;
		c->p.n.max_len_to_optimize_static_block = 0;
		deflate_init_offset_slot_full(c);
		break;
	case 11:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 100;
		c->nice_match_length = 150;
		c->p.n.max_optim_passes = 4;
		c->p.n.min_improvement_to_continue = 16;
		c->p.n.min_bits_to_use_nonfinal_path = 16;
		c->p.n.max_len_to_optimize_static_block = 1000;
		deflate_init_offset_slot_full(c);
		break;
	case 12:
	default:
		c->impl = deflate_compress_near_optimal;
		c->max_search_depth = 300;
		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
		c->p.n.max_optim_passes = 10;
		c->p.n.min_improvement_to_continue = 1;
		c->p.n.min_bits_to_use_nonfinal_path = 1;
		c->p.n.max_len_to_optimize_static_block = 10000;
		deflate_init_offset_slot_full(c);
		break;
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
	}

	deflate_init_static_codes(c);

	return c;
}

LIBDEFLATEAPI struct libdeflate_compressor *
libdeflate_alloc_compressor(int compression_level)
{
	static const struct libdeflate_options defaults = {
		.sizeof_options = sizeof(defaults),
	};
	return libdeflate_alloc_compressor_ex(compression_level, &defaults);
}
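/*
 * Illustrative usage sketch (not part of the library itself): allocating a
 * level-9 compressor with custom allocator callbacks via the extended API.
 * The my_malloc/my_free names are hypothetical placeholders supplied by the
 * caller.
 *
 *	struct libdeflate_options opts = {
 *		.sizeof_options = sizeof(opts),
 *		.malloc_func = my_malloc,
 *		.free_func = my_free,
 *	};
 *	struct libdeflate_compressor *c =
 *		libdeflate_alloc_compressor_ex(9, &opts);
 *
 *	if (c == NULL)
 *		... handle allocation failure or an invalid level ...
 */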
LIBDEFLATEAPI size_t
libdeflate_deflate_compress(struct libdeflate_compressor *c,
			    const void *in, size_t in_nbytes,
			    void *out, size_t out_nbytes_avail)
{
	struct deflate_output_bitstream os;

	/*
	 * For extremely short inputs, or for compression level 0, just output
	 * uncompressed blocks.
	 */
	if (unlikely(in_nbytes <= c->max_passthrough_size))
		return deflate_compress_none(in, in_nbytes,
					     out, out_nbytes_avail);

	/* Initialize the output bitstream structure. */
	os.bitbuf = 0;
	os.bitcount = 0;
	os.next = out;
	os.end = os.next + out_nbytes_avail;
	os.overflow = false;

	/* Call the actual compression function. */
	(*c->impl)(c, in, in_nbytes, &os);

	/* Return 0 if the output buffer is too small. */
	if (os.overflow)
		return 0;

	/*
	 * Write the final byte if needed.  This can't overflow the output
	 * buffer because deflate_flush_block() would have set the overflow
	 * flag if there wasn't enough space remaining for the full final
	 * block.
	 */
	ASSERT(os.bitcount <= 7);
	if (os.bitcount) {
		ASSERT(os.next < os.end);
		*os.next++ = os.bitbuf;
	}

	/* Return the compressed size in bytes. */
	return os.next - (u8 *)out;
}

LIBDEFLATEAPI void
libdeflate_free_compressor(struct libdeflate_compressor *c)
{
	if (c)
		libdeflate_aligned_free(c->free_func, c);
}

unsigned int
libdeflate_get_compression_level(struct libdeflate_compressor *c)
{
	return c->compression_level;
}

LIBDEFLATEAPI size_t
libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
				  size_t in_nbytes)
{
	size_t max_blocks;

	/*
	 * Since the compressor never uses a compressed block when an
	 * uncompressed block is cheaper, the worst case can be no worse than
	 * the case where only uncompressed blocks are used.
	 *
	 * This is true even though up to 7 bits are "wasted" to byte-align
	 * the bitstream when a compressed block is followed by an
	 * uncompressed block.  This is because a compressed block wouldn't
	 * have been used if it wasn't cheaper than an uncompressed block, and
	 * uncompressed blocks always end on a byte boundary.  So the
	 * alignment bits will, at worst, go up to the place where the
	 * uncompressed block would have ended.
	 */

	/*
	 * Calculate the maximum number of uncompressed blocks that the
	 * compressor can use for 'in_nbytes' of data.
	 *
	 * The minimum length that is passed to deflate_flush_block() is
	 * MIN_BLOCK_LENGTH bytes, except for the final block if needed.  If
	 * deflate_flush_block() decides to use an uncompressed block, it
	 * actually will (in general) output a series of uncompressed blocks
	 * in order to stay within the UINT16_MAX limit of DEFLATE.  But this
	 * can be disregarded here as long as
	 * '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', as in that case this behavior
	 * can't result in more blocks than the case where
	 * deflate_flush_block() is called with min-length inputs.
	 *
	 * So the number of uncompressed blocks needed would be bounded by
	 * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH).  However, empty inputs
	 * need 1 (empty) block, which gives the final expression below.
	 */
	STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX);
	max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);

	/*
	 * Each uncompressed block has 5 bytes of overhead, for the BFINAL,
	 * BTYPE, LEN, and NLEN fields.  (For the reason explained earlier,
	 * the alignment bits at the very start of the block can be
	 * disregarded; they would otherwise increase the overhead to 6 bytes
	 * per block.)  Therefore, the maximum number of overhead bytes is
	 * '5 * max_blocks'.  To get the final bound, add the number of
	 * uncompressed bytes.
	 */
	return (5 * max_blocks) + in_nbytes;
}
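/*
 * Illustrative usage sketch (not part of the library itself): sizing an
 * output buffer with the bound above and then compressing into it.  As a
 * worked example of the formula, with MIN_BLOCK_LENGTH == 5000 an input of
 * 100000 bytes gives max_blocks == DIV_ROUND_UP(100000, 5000) == 20, so the
 * bound is 5 * 20 + 100000 == 100100 bytes.
 *
 *	size_t bound = libdeflate_deflate_compress_bound(c, in_nbytes);
 *	void *out = malloc(bound);
 *	size_t out_nbytes = out ?
 *		libdeflate_deflate_compress(c, in, in_nbytes, out, bound) : 0;
 *
 *	... with a buffer of at least 'bound' bytes, a return value of 0 can
 *	only mean the passthrough/overflow paths were impossible to hit, so
 *	the caller need not handle "output too small" separately ...
 */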