1 /* Lzip - LZMA lossless data compressor
2    Copyright (C) 2008-2021 Antonio Diaz Diaz.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 2 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17 
18 enum { price_shift_bits = 6,
19        price_step_bits = 2,
20        price_step = 1 << price_step_bits };
21 
22 class Dis_slots
23   {
24   uint8_t data[1<<10];
25 
26 public:
init()27   void init()
28     {
29     for( int slot = 0; slot < 4; ++slot ) data[slot] = slot;
30     for( int i = 4, size = 2, slot = 4; slot < 20; slot += 2 )
31       {
32       std::memset( &data[i], slot, size );
33       std::memset( &data[i+size], slot + 1, size );
34       size <<= 1;
35       i += size;
36       }
37     }
38 
39   uint8_t operator[]( const int dis ) const { return data[dis]; }
40   };
41 
42 extern Dis_slots dis_slots;
43 
get_slot(const unsigned dis)44 inline uint8_t get_slot( const unsigned dis )
45   {
46   if( dis < (1 << 10) ) return dis_slots[dis];
47   if( dis < (1 << 19) ) return dis_slots[dis>> 9] + 18;
48   if( dis < (1 << 28) ) return dis_slots[dis>>18] + 36;
49   return dis_slots[dis>>27] + 54;
50   }
51 
52 
53 class Prob_prices
54   {
55   short data[bit_model_total >> price_step_bits];
56 
57 public:
init()58   void init()
59     {
60     for( int i = 0; i < bit_model_total >> price_step_bits; ++i )
61       {
62       unsigned val = ( i * price_step ) + ( price_step / 2 );
63       int bits = 0;				// base 2 logarithm of val
64       for( int j = 0; j < price_shift_bits; ++j )
65         {
66         val = val * val;
67         bits <<= 1;
68         while( val >= 1 << 16 ) { val >>= 1; ++bits; }
69         }
70       bits += 15;				// remaining bits in val
71       data[i] = ( bit_model_total_bits << price_shift_bits ) - bits;
72       }
73     }
74 
75   int operator[]( const int probability ) const
76     { return data[probability >> price_step_bits]; }
77   };
78 
79 extern Prob_prices prob_prices;
80 
81 
price0(const Bit_model bm)82 inline int price0( const Bit_model bm )
83   { return prob_prices[bm.probability]; }
84 
price1(const Bit_model bm)85 inline int price1( const Bit_model bm )
86   { return prob_prices[bit_model_total - bm.probability]; }
87 
price_bit(const Bit_model bm,const bool bit)88 inline int price_bit( const Bit_model bm, const bool bit )
89   { return ( bit ? price1( bm ) : price0( bm ) ); }
90 
91 
price_symbol3(const Bit_model bm[],int symbol)92 inline int price_symbol3( const Bit_model bm[], int symbol )
93   {
94   bool bit = symbol & 1;
95   symbol |= 8; symbol >>= 1;
96   int price = price_bit( bm[symbol], bit );
97   bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
98   return price + price_bit( bm[1], symbol & 1 );
99   }
100 
101 
price_symbol6(const Bit_model bm[],unsigned symbol)102 inline int price_symbol6( const Bit_model bm[], unsigned symbol )
103   {
104   bool bit = symbol & 1;
105   symbol |= 64; symbol >>= 1;
106   int price = price_bit( bm[symbol], bit );
107   bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
108   bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
109   bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
110   bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
111   return price + price_bit( bm[1], symbol & 1 );
112   }
113 
114 
price_symbol8(const Bit_model bm[],int symbol)115 inline int price_symbol8( const Bit_model bm[], int symbol )
116   {
117   bool bit = symbol & 1;
118   symbol |= 0x100; symbol >>= 1;
119   int price = price_bit( bm[symbol], bit );
120   bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
121   bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
122   bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
123   bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
124   bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
125   bit = symbol & 1; symbol >>= 1; price += price_bit( bm[symbol], bit );
126   return price + price_bit( bm[1], symbol & 1 );
127   }
128 
129 
price_symbol_reversed(const Bit_model bm[],int symbol,const int num_bits)130 inline int price_symbol_reversed( const Bit_model bm[], int symbol,
131                                   const int num_bits )
132   {
133   int price = 0;
134   int model = 1;
135   for( int i = num_bits; i > 0; --i )
136     {
137     const bool bit = symbol & 1;
138     symbol >>= 1;
139     price += price_bit( bm[model], bit );
140     model <<= 1; model |= bit;
141     }
142   return price;
143   }
144 
145 
price_matched(const Bit_model bm[],unsigned symbol,unsigned match_byte)146 inline int price_matched( const Bit_model bm[], unsigned symbol,
147                           unsigned match_byte )
148   {
149   int price = 0;
150   unsigned mask = 0x100;
151   symbol |= mask;
152   while( true )
153     {
154     const unsigned match_bit = ( match_byte <<= 1 ) & mask;
155     const bool bit = ( symbol <<= 1 ) & 0x100;
156     price += price_bit( bm[(symbol>>9)+match_bit+mask], bit );
157     if( symbol >= 0x10000 ) return price;
158     mask &= ~(match_bit ^ symbol);	// if( match_bit != bit ) mask = 0;
159     }
160   }
161 
162 
163 class Matchfinder_base
164   {
165   bool read_block();
166   void normalize_pos();
167 
168   Matchfinder_base( const Matchfinder_base & );	// declared as private
169   void operator=( const Matchfinder_base & );	// declared as private
170 
171 protected:
172   unsigned long long partial_data_pos;
173   uint8_t * buffer;		// input buffer
174   int32_t * prev_positions;	// 1 + last seen position of key. else 0
175   int32_t * pos_array;		// may be tree or chain
176   const int before_size;	// bytes to keep in buffer before dictionary
177   int buffer_size;
178   int dictionary_size;		// bytes to keep in buffer before pos
179   int pos;			// current pos in buffer
180   int cyclic_pos;		// cycles through [0, dictionary_size]
181   int stream_pos;		// first byte not yet read from file
182   int pos_limit;		// when reached, a new block must be read
183   int key4_mask;
184   const int num_prev_positions23;
185   int num_prev_positions;	// size of prev_positions
186   int pos_array_size;
187   const int infd;		// input file descriptor
188   bool at_stream_end;		// stream_pos shows real end of file
189 
190   Matchfinder_base( const int before_size_,
191                     const int dict_size, const int after_size,
192                     const int dict_factor, const int num_prev_positions23_,
193                     const int pos_array_factor, const int ifd );
194 
~Matchfinder_base()195   ~Matchfinder_base()
196     { delete[] prev_positions; std::free( buffer ); }
197 
198 public:
peek(const int distance)199   uint8_t peek( const int distance ) const { return buffer[pos-distance]; }
available_bytes()200   int available_bytes() const { return stream_pos - pos; }
data_position()201   unsigned long long data_position() const { return partial_data_pos + pos; }
data_finished()202   bool data_finished() const { return at_stream_end && pos >= stream_pos; }
ptr_to_current_pos()203   const uint8_t * ptr_to_current_pos() const { return buffer + pos; }
204 
true_match_len(const int index,const int distance)205   int true_match_len( const int index, const int distance ) const
206     {
207     const uint8_t * const data = buffer + pos;
208     int i = index;
209     const int len_limit = std::min( available_bytes(), (int)max_match_len );
210     while( i < len_limit && data[i-distance] == data[i] ) ++i;
211     return i;
212     }
213 
move_pos()214   void move_pos()
215     {
216     if( ++cyclic_pos > dictionary_size ) cyclic_pos = 0;
217     if( ++pos >= pos_limit ) normalize_pos();
218     }
219 
220   void reset();
221   };
222 
223 
224 class Range_encoder
225   {
226   enum { buffer_size = 65536 };
227   uint64_t low;
228   unsigned long long partial_member_pos;
229   uint8_t * const buffer;	// output buffer
230   int pos;			// current pos in buffer
231   uint32_t range;
232   unsigned ff_count;
233   const int outfd;		// output file descriptor
234   uint8_t cache;
235   Lzip_header header;
236 
shift_low()237   void shift_low()
238     {
239     if( low >> 24 != 0xFF )
240       {
241       const bool carry = ( low > 0xFFFFFFFFU );
242       put_byte( cache + carry );
243       for( ; ff_count > 0; --ff_count ) put_byte( 0xFF + carry );
244       cache = low >> 24;
245       }
246     else ++ff_count;
247     low = ( low & 0x00FFFFFFU ) << 8;
248     }
249 
250   Range_encoder( const Range_encoder & );	// declared as private
251   void operator=( const Range_encoder & );	// declared as private
252 
253 public:
reset(const unsigned dictionary_size)254   void reset( const unsigned dictionary_size )
255     {
256     low = 0;
257     partial_member_pos = 0;
258     pos = 0;
259     range = 0xFFFFFFFFU;
260     ff_count = 0;
261     cache = 0;
262     header.dictionary_size( dictionary_size );
263     for( int i = 0; i < Lzip_header::size; ++i )
264       put_byte( header.data[i] );
265     }
266 
Range_encoder(const unsigned dictionary_size,const int ofd)267   Range_encoder( const unsigned dictionary_size, const int ofd )
268     :
269     buffer( new uint8_t[buffer_size] ), outfd( ofd )
270     {
271     header.set_magic();
272     reset( dictionary_size );
273     }
274 
~Range_encoder()275   ~Range_encoder() { delete[] buffer; }
276 
member_position()277   unsigned long long member_position() const
278     { return partial_member_pos + pos + ff_count; }
279 
flush()280   void flush() { for( int i = 0; i < 5; ++i ) shift_low(); }
281   void flush_data();
282 
put_byte(const uint8_t b)283   void put_byte( const uint8_t b )
284     {
285     buffer[pos] = b;
286     if( ++pos >= buffer_size ) flush_data();
287     }
288 
encode(const int symbol,const int num_bits)289   void encode( const int symbol, const int num_bits )
290     {
291     for( unsigned mask = 1 << ( num_bits - 1 ); mask > 0; mask >>= 1 )
292       {
293       range >>= 1;
294       if( symbol & mask ) low += range;
295       if( range <= 0x00FFFFFFU ) { range <<= 8; shift_low(); }
296       }
297     }
298 
encode_bit(Bit_model & bm,const bool bit)299   void encode_bit( Bit_model & bm, const bool bit )
300     {
301     const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability;
302     if( !bit )
303       {
304       range = bound;
305       bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits;
306       }
307     else
308       {
309       low += bound;
310       range -= bound;
311       bm.probability -= bm.probability >> bit_model_move_bits;
312       }
313     if( range <= 0x00FFFFFFU ) { range <<= 8; shift_low(); }
314     }
315 
encode_tree3(Bit_model bm[],const int symbol)316   void encode_tree3( Bit_model bm[], const int symbol )
317     {
318     bool bit = ( symbol >> 2 ) & 1;
319     encode_bit( bm[1], bit );
320     int model = 2 | bit;
321     bit = ( symbol >> 1 ) & 1;
322     encode_bit( bm[model], bit ); model <<= 1; model |= bit;
323     encode_bit( bm[model], symbol & 1 );
324     }
325 
encode_tree6(Bit_model bm[],const unsigned symbol)326   void encode_tree6( Bit_model bm[], const unsigned symbol )
327     {
328     bool bit = ( symbol >> 5 ) & 1;
329     encode_bit( bm[1], bit );
330     int model = 2 | bit;
331     bit = ( symbol >> 4 ) & 1;
332     encode_bit( bm[model], bit ); model <<= 1; model |= bit;
333     bit = ( symbol >> 3 ) & 1;
334     encode_bit( bm[model], bit ); model <<= 1; model |= bit;
335     bit = ( symbol >> 2 ) & 1;
336     encode_bit( bm[model], bit ); model <<= 1; model |= bit;
337     bit = ( symbol >> 1 ) & 1;
338     encode_bit( bm[model], bit ); model <<= 1; model |= bit;
339     encode_bit( bm[model], symbol & 1 );
340     }
341 
encode_tree8(Bit_model bm[],const int symbol)342   void encode_tree8( Bit_model bm[], const int symbol )
343     {
344     int model = 1;
345     for( int i = 7; i >= 0; --i )
346       {
347       const bool bit = ( symbol >> i ) & 1;
348       encode_bit( bm[model], bit );
349       model <<= 1; model |= bit;
350       }
351     }
352 
encode_tree_reversed(Bit_model bm[],int symbol,const int num_bits)353   void encode_tree_reversed( Bit_model bm[], int symbol, const int num_bits )
354     {
355     int model = 1;
356     for( int i = num_bits; i > 0; --i )
357       {
358       const bool bit = symbol & 1;
359       symbol >>= 1;
360       encode_bit( bm[model], bit );
361       model <<= 1; model |= bit;
362       }
363     }
364 
encode_matched(Bit_model bm[],unsigned symbol,unsigned match_byte)365   void encode_matched( Bit_model bm[], unsigned symbol, unsigned match_byte )
366     {
367     unsigned mask = 0x100;
368     symbol |= mask;
369     while( true )
370       {
371       const unsigned match_bit = ( match_byte <<= 1 ) & mask;
372       const bool bit = ( symbol <<= 1 ) & 0x100;
373       encode_bit( bm[(symbol>>9)+match_bit+mask], bit );
374       if( symbol >= 0x10000 ) break;
375       mask &= ~(match_bit ^ symbol);	// if( match_bit != bit ) mask = 0;
376       }
377     }
378 
encode_len(Len_model & lm,int symbol,const int pos_state)379   void encode_len( Len_model & lm, int symbol, const int pos_state )
380     {
381     bool bit = ( ( symbol -= min_match_len ) >= len_low_symbols );
382     encode_bit( lm.choice1, bit );
383     if( !bit )
384       encode_tree3( lm.bm_low[pos_state], symbol );
385     else
386       {
387       bit = ( ( symbol -= len_low_symbols ) >= len_mid_symbols );
388       encode_bit( lm.choice2, bit );
389       if( !bit )
390         encode_tree3( lm.bm_mid[pos_state], symbol );
391       else
392         encode_tree8( lm.bm_high, symbol - len_mid_symbols );
393       }
394     }
395   };
396 
397 
398 class LZ_encoder_base : public Matchfinder_base
399   {
400 protected:
401   enum { max_marker_size = 16,
402          num_rep_distances = 4 };	// must be 4
403 
404   uint32_t crc_;
405 
406   Bit_model bm_literal[1<<literal_context_bits][0x300];
407   Bit_model bm_match[State::states][pos_states];
408   Bit_model bm_rep[State::states];
409   Bit_model bm_rep0[State::states];
410   Bit_model bm_rep1[State::states];
411   Bit_model bm_rep2[State::states];
412   Bit_model bm_len[State::states][pos_states];
413   Bit_model bm_dis_slot[len_states][1<<dis_slot_bits];
414   Bit_model bm_dis[modeled_distances-end_dis_model+1];
415   Bit_model bm_align[dis_align_size];
416   Len_model match_len_model;
417   Len_model rep_len_model;
418   Range_encoder renc;
419 
LZ_encoder_base(const int before_size,const int dict_size,const int after_size,const int dict_factor,const int num_prev_positions23,const int pos_array_factor,const int ifd,const int outfd)420   LZ_encoder_base( const int before_size, const int dict_size,
421                    const int after_size, const int dict_factor,
422                    const int num_prev_positions23,
423                    const int pos_array_factor,
424                    const int ifd, const int outfd )
425     :
426     Matchfinder_base( before_size, dict_size, after_size, dict_factor,
427                       num_prev_positions23, pos_array_factor, ifd ),
428     crc_( 0xFFFFFFFFU ),
429     renc( dictionary_size, outfd )
430     {}
431 
crc()432   unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; }
433 
price_literal(const uint8_t prev_byte,const uint8_t symbol)434   int price_literal( const uint8_t prev_byte, const uint8_t symbol ) const
435     { return price_symbol8( bm_literal[get_lit_state(prev_byte)], symbol ); }
436 
price_matched(const uint8_t prev_byte,const uint8_t symbol,const uint8_t match_byte)437   int price_matched( const uint8_t prev_byte, const uint8_t symbol,
438                      const uint8_t match_byte ) const
439     { return ::price_matched( bm_literal[get_lit_state(prev_byte)], symbol,
440                               match_byte ); }
441 
encode_literal(const uint8_t prev_byte,const uint8_t symbol)442   void encode_literal( const uint8_t prev_byte, const uint8_t symbol )
443     { renc.encode_tree8( bm_literal[get_lit_state(prev_byte)], symbol ); }
444 
encode_matched(const uint8_t prev_byte,const uint8_t symbol,const uint8_t match_byte)445   void encode_matched( const uint8_t prev_byte, const uint8_t symbol,
446                        const uint8_t match_byte )
447     { renc.encode_matched( bm_literal[get_lit_state(prev_byte)], symbol,
448                            match_byte ); }
449 
encode_pair(const unsigned dis,const int len,const int pos_state)450   void encode_pair( const unsigned dis, const int len, const int pos_state )
451     {
452     renc.encode_len( match_len_model, len, pos_state );
453     const unsigned dis_slot = get_slot( dis );
454     renc.encode_tree6( bm_dis_slot[get_len_state(len)], dis_slot );
455 
456     if( dis_slot >= start_dis_model )
457       {
458       const int direct_bits = ( dis_slot >> 1 ) - 1;
459       const unsigned base = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
460       const unsigned direct_dis = dis - base;
461 
462       if( dis_slot < end_dis_model )
463         renc.encode_tree_reversed( bm_dis + ( base - dis_slot ),
464                                    direct_dis, direct_bits );
465       else
466         {
467         renc.encode( direct_dis >> dis_align_bits, direct_bits - dis_align_bits );
468         renc.encode_tree_reversed( bm_align, direct_dis, dis_align_bits );
469         }
470       }
471     }
472 
473   void full_flush( const State state );
474 
475 public:
~LZ_encoder_base()476   virtual ~LZ_encoder_base() {}
477 
member_position()478   unsigned long long member_position() const { return renc.member_position(); }
479   virtual void reset();
480 
481   virtual bool encode_member( const unsigned long long member_size ) = 0;
482   };
483