1 /* Lzip - LZMA lossless data compressor
2    Copyright (C) 2008-2021 Antonio Diaz Diaz.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 2 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17 
18 #define _FILE_OFFSET_BITS 64
19 
20 #include <algorithm>
21 #include <cerrno>
22 #include <cstdlib>
23 #include <cstring>
24 #include <string>
25 #include <vector>
26 #include <stdint.h>
27 
28 #include "lzip.h"
29 #include "encoder_base.h"
30 #include "encoder.h"
31 
32 
33 const CRC32 crc32;
34 
35 
get_match_pairs(Pair * pairs)36 int LZ_encoder::get_match_pairs( Pair * pairs )
37   {
38   int len_limit = match_len_limit;
39   if( len_limit > available_bytes() )
40     {
41     len_limit = available_bytes();
42     if( len_limit < 4 ) return 0;
43     }
44 
45   int maxlen = 3;			// only used if pairs != 0
46   int num_pairs = 0;
47   const int min_pos = ( pos > dictionary_size ) ? pos - dictionary_size : 0;
48   const uint8_t * const data = ptr_to_current_pos();
49 
50   unsigned tmp = crc32[data[0]] ^ data[1];
51   const int key2 = tmp & ( num_prev_positions2 - 1 );
52   tmp ^= (unsigned)data[2] << 8;
53   const int key3 = num_prev_positions2 + ( tmp & ( num_prev_positions3 - 1 ) );
54   const int key4 = num_prev_positions23 +
55                    ( ( tmp ^ ( crc32[data[3]] << 5 ) ) & key4_mask );
56 
57   if( pairs )
58     {
59     const int np2 = prev_positions[key2];
60     const int np3 = prev_positions[key3];
61     if( np2 > min_pos && buffer[np2-1] == data[0] )
62       {
63       pairs[0].dis = pos - np2;
64       pairs[0].len = maxlen = 2 + ( np2 == np3 );
65       num_pairs = 1;
66       }
67     if( np2 != np3 && np3 > min_pos && buffer[np3-1] == data[0] )
68       {
69       maxlen = 3;
70       pairs[num_pairs++].dis = pos - np3;
71       }
72     if( num_pairs > 0 )
73       {
74       const int delta = pairs[num_pairs-1].dis + 1;
75       while( maxlen < len_limit && data[maxlen-delta] == data[maxlen] )
76         ++maxlen;
77       pairs[num_pairs-1].len = maxlen;
78       if( maxlen < 3 ) maxlen = 3;
79       if( maxlen >= len_limit ) pairs = 0;	// done. now just skip
80       }
81     }
82 
83   const int pos1 = pos + 1;
84   prev_positions[key2] = pos1;
85   prev_positions[key3] = pos1;
86   int newpos1 = prev_positions[key4];
87   prev_positions[key4] = pos1;
88 
89   int32_t * ptr0 = pos_array + ( cyclic_pos << 1 );
90   int32_t * ptr1 = ptr0 + 1;
91   int len = 0, len0 = 0, len1 = 0;
92 
93   for( int count = cycles; ; )
94     {
95     if( newpos1 <= min_pos || --count < 0 ) { *ptr0 = *ptr1 = 0; break; }
96 
97     const int delta = pos1 - newpos1;
98     int32_t * const newptr = pos_array +
99       ( ( cyclic_pos - delta +
100           ( ( cyclic_pos >= delta ) ? 0 : dictionary_size + 1 ) ) << 1 );
101     if( data[len-delta] == data[len] )
102       {
103       while( ++len < len_limit && data[len-delta] == data[len] ) {}
104       if( pairs && maxlen < len )
105         {
106         pairs[num_pairs].dis = delta - 1;
107         pairs[num_pairs].len = maxlen = len;
108         ++num_pairs;
109         }
110       if( len >= len_limit )
111         {
112         *ptr0 = newptr[0];
113         *ptr1 = newptr[1];
114         break;
115         }
116       }
117     if( data[len-delta] < data[len] )
118       {
119       *ptr0 = newpos1;
120       ptr0 = newptr + 1;
121       newpos1 = *ptr0;
122       len0 = len; if( len1 < len ) len = len1;
123       }
124     else
125       {
126       *ptr1 = newpos1;
127       ptr1 = newptr;
128       newpos1 = *ptr1;
129       len1 = len; if( len0 < len ) len = len0;
130       }
131     }
132   return num_pairs;
133   }
134 
135 
update_distance_prices()136 void LZ_encoder::update_distance_prices()
137   {
138   for( int dis = start_dis_model; dis < modeled_distances; ++dis )
139     {
140     const int dis_slot = dis_slots[dis];
141     const int direct_bits = ( dis_slot >> 1 ) - 1;
142     const int base = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
143     const int price = price_symbol_reversed( bm_dis + ( base - dis_slot ),
144                                              dis - base, direct_bits );
145     for( int len_state = 0; len_state < len_states; ++len_state )
146       dis_prices[len_state][dis] = price;
147     }
148 
149   for( int len_state = 0; len_state < len_states; ++len_state )
150     {
151     int * const dsp = dis_slot_prices[len_state];
152     const Bit_model * const bmds = bm_dis_slot[len_state];
153     int slot = 0;
154     for( ; slot < end_dis_model; ++slot )
155       dsp[slot] = price_symbol6( bmds, slot );
156     for( ; slot < num_dis_slots; ++slot )
157       dsp[slot] = price_symbol6( bmds, slot ) +
158                   (((( slot >> 1 ) - 1 ) - dis_align_bits ) << price_shift_bits );
159 
160     int * const dp = dis_prices[len_state];
161     int dis = 0;
162     for( ; dis < start_dis_model; ++dis )
163       dp[dis] = dsp[dis];
164     for( ; dis < modeled_distances; ++dis )
165       dp[dis] += dsp[dis_slots[dis]];
166     }
167   }
168 
169 
170 /* Returns the number of bytes advanced (ahead).
171    trials[0]..trials[ahead-1] contain the steps to encode.
172    ( trials[0].dis4 == -1 ) means literal.
173    A match/rep longer or equal than match_len_limit finishes the sequence.
174 */
sequence_optimizer(const int reps[num_rep_distances],const State state)175 int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances],
176                                     const State state )
177   {
178   int num_pairs, num_trials;
179 
180   if( pending_num_pairs > 0 )			// from previous call
181     {
182     num_pairs = pending_num_pairs;
183     pending_num_pairs = 0;
184     }
185   else
186     num_pairs = read_match_distances();
187   const int main_len = ( num_pairs > 0 ) ? pairs[num_pairs-1].len : 0;
188 
189   int replens[num_rep_distances];
190   int rep_index = 0;
191   for( int i = 0; i < num_rep_distances; ++i )
192     {
193     replens[i] = true_match_len( 0, reps[i] + 1 );
194     if( replens[i] > replens[rep_index] ) rep_index = i;
195     }
196   if( replens[rep_index] >= match_len_limit )
197     {
198     trials[0].price = replens[rep_index];
199     trials[0].dis4 = rep_index;
200     move_and_update( replens[rep_index] );
201     return replens[rep_index];
202     }
203 
204   if( main_len >= match_len_limit )
205     {
206     trials[0].price = main_len;
207     trials[0].dis4 = pairs[num_pairs-1].dis + num_rep_distances;
208     move_and_update( main_len );
209     return main_len;
210     }
211 
212   const int pos_state = data_position() & pos_state_mask;
213   const uint8_t prev_byte = peek( 1 );
214   const uint8_t cur_byte = peek( 0 );
215   const uint8_t match_byte = peek( reps[0] + 1 );
216 
217   trials[1].price = price0( bm_match[state()][pos_state] );
218   if( state.is_char() )
219     trials[1].price += price_literal( prev_byte, cur_byte );
220   else
221     trials[1].price += price_matched( prev_byte, cur_byte, match_byte );
222   trials[1].dis4 = -1;					// literal
223 
224   const int match_price = price1( bm_match[state()][pos_state] );
225   const int rep_match_price = match_price + price1( bm_rep[state()] );
226 
227   if( match_byte == cur_byte )
228     trials[1].update( rep_match_price + price_shortrep( state, pos_state ), 0, 0 );
229 
230   num_trials = std::max( main_len, replens[rep_index] );
231 
232   if( num_trials < min_match_len )
233     {
234     trials[0].price = 1;
235     trials[0].dis4 = trials[1].dis4;
236     move_pos();
237     return 1;
238     }
239 
240   trials[0].state = state;
241   for( int i = 0; i < num_rep_distances; ++i )
242     trials[0].reps[i] = reps[i];
243 
244   for( int len = min_match_len; len <= num_trials; ++len )
245     trials[len].price = infinite_price;
246 
247   for( int rep = 0; rep < num_rep_distances; ++rep )
248     {
249     if( replens[rep] < min_match_len ) continue;
250     const int price = rep_match_price + price_rep( rep, state, pos_state );
251     for( int len = min_match_len; len <= replens[rep]; ++len )
252       trials[len].update( price + rep_len_prices.price( len, pos_state ),
253                           rep, 0 );
254     }
255 
256   if( main_len > replens[0] )
257     {
258     const int normal_match_price = match_price + price0( bm_rep[state()] );
259     int i = 0, len = std::max( replens[0] + 1, (int)min_match_len );
260     while( len > pairs[i].len ) ++i;
261     while( true )
262       {
263       const int dis = pairs[i].dis;
264       trials[len].update( normal_match_price + price_pair( dis, len, pos_state ),
265                           dis + num_rep_distances, 0 );
266       if( ++len > pairs[i].len && ++i >= num_pairs ) break;
267       }
268     }
269 
270   int cur = 0;
271   while( true )				// price optimization loop
272     {
273     move_pos();
274     if( ++cur >= num_trials )		// no more initialized trials
275       {
276       backward( cur );
277       return cur;
278       }
279 
280     const int num_pairs = read_match_distances();
281     const int newlen = ( num_pairs > 0 ) ? pairs[num_pairs-1].len : 0;
282     if( newlen >= match_len_limit )
283       {
284       pending_num_pairs = num_pairs;
285       backward( cur );
286       return cur;
287       }
288 
289     // give final values to current trial
290     Trial & cur_trial = trials[cur];
291     State cur_state;
292     {
293     const int dis4 = cur_trial.dis4;
294     int prev_index = cur_trial.prev_index;
295     const int prev_index2 = cur_trial.prev_index2;
296 
297     if( prev_index2 == single_step_trial )
298       {
299       cur_state = trials[prev_index].state;
300       if( prev_index + 1 == cur )			// len == 1
301         {
302         if( dis4 == 0 ) cur_state.set_short_rep();
303         else cur_state.set_char();			// literal
304         }
305       else if( dis4 < num_rep_distances ) cur_state.set_rep();
306       else cur_state.set_match();
307       }
308     else
309       {
310       if( prev_index2 == dual_step_trial )	// dis4 == 0 (rep0)
311         --prev_index;
312       else					// prev_index2 >= 0
313         prev_index = prev_index2;
314       cur_state.set_char_rep();
315       }
316     cur_trial.state = cur_state;
317     for( int i = 0; i < num_rep_distances; ++i )
318       cur_trial.reps[i] = trials[prev_index].reps[i];
319     mtf_reps( dis4, cur_trial.reps );		// literal is ignored
320     }
321 
322     const int pos_state = data_position() & pos_state_mask;
323     const uint8_t prev_byte = peek( 1 );
324     const uint8_t cur_byte = peek( 0 );
325     const uint8_t match_byte = peek( cur_trial.reps[0] + 1 );
326 
327     int next_price = cur_trial.price +
328                      price0( bm_match[cur_state()][pos_state] );
329     if( cur_state.is_char() )
330       next_price += price_literal( prev_byte, cur_byte );
331     else
332       next_price += price_matched( prev_byte, cur_byte, match_byte );
333 
334     // try last updates to next trial
335     Trial & next_trial = trials[cur+1];
336 
337     next_trial.update( next_price, -1, cur );		// literal
338 
339     const int match_price = cur_trial.price + price1( bm_match[cur_state()][pos_state] );
340     const int rep_match_price = match_price + price1( bm_rep[cur_state()] );
341 
342     if( match_byte == cur_byte && next_trial.dis4 != 0 &&
343         next_trial.prev_index2 == single_step_trial )
344       {
345       const int price = rep_match_price + price_shortrep( cur_state, pos_state );
346       if( price <= next_trial.price )
347         {
348         next_trial.price = price;
349         next_trial.dis4 = 0;				// rep0
350         next_trial.prev_index = cur;
351         }
352       }
353 
354     const int triable_bytes =
355       std::min( available_bytes(), max_num_trials - 1 - cur );
356     if( triable_bytes < min_match_len ) continue;
357 
358     const int len_limit = std::min( match_len_limit, triable_bytes );
359 
360     // try literal + rep0
361     if( match_byte != cur_byte && next_trial.prev_index != cur )
362       {
363       const uint8_t * const data = ptr_to_current_pos();
364       const int dis = cur_trial.reps[0] + 1;
365       const int limit = std::min( match_len_limit + 1, triable_bytes );
366       int len = 1;
367       while( len < limit && data[len-dis] == data[len] ) ++len;
368       if( --len >= min_match_len )
369         {
370         const int pos_state2 = ( pos_state + 1 ) & pos_state_mask;
371         State state2 = cur_state; state2.set_char();
372         const int price = next_price +
373                           price1( bm_match[state2()][pos_state2] ) +
374                           price1( bm_rep[state2()] ) +
375                           price_rep0_len( len, state2, pos_state2 );
376         while( num_trials < cur + 1 + len )
377           trials[++num_trials].price = infinite_price;
378         trials[cur+1+len].update2( price, cur + 1 );
379         }
380       }
381 
382     int start_len = min_match_len;
383 
384     // try rep distances
385     for( int rep = 0; rep < num_rep_distances; ++rep )
386       {
387       const uint8_t * const data = ptr_to_current_pos();
388       const int dis = cur_trial.reps[rep] + 1;
389       int len;
390 
391       if( data[0-dis] != data[0] || data[1-dis] != data[1] ) continue;
392       for( len = min_match_len; len < len_limit; ++len )
393         if( data[len-dis] != data[len] ) break;
394       while( num_trials < cur + len )
395         trials[++num_trials].price = infinite_price;
396       int price = rep_match_price + price_rep( rep, cur_state, pos_state );
397       for( int i = min_match_len; i <= len; ++i )
398         trials[cur+i].update( price + rep_len_prices.price( i, pos_state ),
399                               rep, cur );
400 
401       if( rep == 0 ) start_len = len + 1;	// discard shorter matches
402 
403       // try rep + literal + rep0
404       int len2 = len + 1;
405       const int limit = std::min( match_len_limit + len2, triable_bytes );
406       while( len2 < limit && data[len2-dis] == data[len2] ) ++len2;
407       len2 -= len + 1;
408       if( len2 < min_match_len ) continue;
409 
410       int pos_state2 = ( pos_state + len ) & pos_state_mask;
411       State state2 = cur_state; state2.set_rep();
412       price += rep_len_prices.price( len, pos_state ) +
413                price0( bm_match[state2()][pos_state2] ) +
414                price_matched( data[len-1], data[len], data[len-dis] );
415       pos_state2 = ( pos_state2 + 1 ) & pos_state_mask;
416       state2.set_char();
417       price += price1( bm_match[state2()][pos_state2] ) +
418                price1( bm_rep[state2()] ) +
419                price_rep0_len( len2, state2, pos_state2 );
420       while( num_trials < cur + len + 1 + len2 )
421         trials[++num_trials].price = infinite_price;
422       trials[cur+len+1+len2].update3( price, rep, cur + len + 1, cur );
423       }
424 
425     // try matches
426     if( newlen >= start_len && newlen <= len_limit )
427       {
428       const int normal_match_price = match_price +
429                                      price0( bm_rep[cur_state()] );
430 
431       while( num_trials < cur + newlen )
432         trials[++num_trials].price = infinite_price;
433 
434       int i = 0;
435       while( pairs[i].len < start_len ) ++i;
436       int dis = pairs[i].dis;
437       for( int len = start_len; ; ++len )
438         {
439         int price = normal_match_price + price_pair( dis, len, pos_state );
440         trials[cur+len].update( price, dis + num_rep_distances, cur );
441 
442         // try match + literal + rep0
443         if( len == pairs[i].len )
444           {
445           const uint8_t * const data = ptr_to_current_pos();
446           const int dis2 = dis + 1;
447           int len2 = len + 1;
448           const int limit = std::min( match_len_limit + len2, triable_bytes );
449           while( len2 < limit && data[len2-dis2] == data[len2] ) ++len2;
450           len2 -= len + 1;
451           if( len2 >= min_match_len )
452             {
453             int pos_state2 = ( pos_state + len ) & pos_state_mask;
454             State state2 = cur_state; state2.set_match();
455             price += price0( bm_match[state2()][pos_state2] ) +
456                      price_matched( data[len-1], data[len], data[len-dis2] );
457             pos_state2 = ( pos_state2 + 1 ) & pos_state_mask;
458             state2.set_char();
459             price += price1( bm_match[state2()][pos_state2] ) +
460                      price1( bm_rep[state2()] ) +
461                      price_rep0_len( len2, state2, pos_state2 );
462 
463             while( num_trials < cur + len + 1 + len2 )
464               trials[++num_trials].price = infinite_price;
465             trials[cur+len+1+len2].update3( price, dis + num_rep_distances,
466                                             cur + len + 1, cur );
467             }
468           if( ++i >= num_pairs ) break;
469           dis = pairs[i].dis;
470           }
471         }
472       }
473     }
474   }
475 
476 
encode_member(const unsigned long long member_size)477 bool LZ_encoder::encode_member( const unsigned long long member_size )
478   {
479   const unsigned long long member_size_limit =
480     member_size - Lzip_trailer::size - max_marker_size;
481   const bool best = ( match_len_limit > 12 );
482   const int dis_price_count = best ? 1 : 512;
483   const int align_price_count = best ? 1 : dis_align_size;
484   const int price_count = ( match_len_limit > 36 ) ? 1013 : 4093;
485   int price_counter = 0;		// counters may decrement below 0
486   int dis_price_counter = 0;
487   int align_price_counter = 0;
488   int reps[num_rep_distances];
489   State state;
490   for( int i = 0; i < num_rep_distances; ++i ) reps[i] = 0;
491 
492   if( data_position() != 0 || renc.member_position() != Lzip_header::size )
493     return false;				// can be called only once
494 
495   if( !data_finished() )			// encode first byte
496     {
497     const uint8_t prev_byte = 0;
498     const uint8_t cur_byte = peek( 0 );
499     renc.encode_bit( bm_match[state()][0], 0 );
500     encode_literal( prev_byte, cur_byte );
501     crc32.update_byte( crc_, cur_byte );
502     get_match_pairs();
503     move_pos();
504     }
505 
506   while( !data_finished() )
507     {
508     if( price_counter <= 0 && pending_num_pairs == 0 )
509       {
510       price_counter = price_count;	// recalculate prices every these bytes
511       if( dis_price_counter <= 0 )
512         { dis_price_counter = dis_price_count; update_distance_prices(); }
513       if( align_price_counter <= 0 )
514         {
515         align_price_counter = align_price_count;
516         for( int i = 0; i < dis_align_size; ++i )
517           align_prices[i] = price_symbol_reversed( bm_align, i, dis_align_bits );
518         }
519       match_len_prices.update_prices();
520       rep_len_prices.update_prices();
521       }
522 
523     int ahead = sequence_optimizer( reps, state );
524     price_counter -= ahead;
525 
526     for( int i = 0; ahead > 0; )
527       {
528       const int pos_state = ( data_position() - ahead ) & pos_state_mask;
529       const int len = trials[i].price;
530       int dis = trials[i].dis4;
531 
532       bool bit = ( dis < 0 );
533       renc.encode_bit( bm_match[state()][pos_state], !bit );
534       if( bit )					// literal byte
535         {
536         const uint8_t prev_byte = peek( ahead + 1 );
537         const uint8_t cur_byte = peek( ahead );
538         crc32.update_byte( crc_, cur_byte );
539         if( state.is_char_set_char() )
540           encode_literal( prev_byte, cur_byte );
541         else
542           {
543           const uint8_t match_byte = peek( ahead + reps[0] + 1 );
544           encode_matched( prev_byte, cur_byte, match_byte );
545           }
546         }
547       else					// match or repeated match
548         {
549         crc32.update_buf( crc_, ptr_to_current_pos() - ahead, len );
550         mtf_reps( dis, reps );
551         bit = ( dis < num_rep_distances );
552         renc.encode_bit( bm_rep[state()], bit );
553         if( bit )				// repeated match
554           {
555           bit = ( dis == 0 );
556           renc.encode_bit( bm_rep0[state()], !bit );
557           if( bit )
558             renc.encode_bit( bm_len[state()][pos_state], len > 1 );
559           else
560             {
561             renc.encode_bit( bm_rep1[state()], dis > 1 );
562             if( dis > 1 )
563               renc.encode_bit( bm_rep2[state()], dis > 2 );
564             }
565           if( len == 1 ) state.set_short_rep();
566           else
567             {
568             renc.encode_len( rep_len_model, len, pos_state );
569             rep_len_prices.decrement_counter( pos_state );
570             state.set_rep();
571             }
572           }
573         else					// match
574           {
575           dis -= num_rep_distances;
576           encode_pair( dis, len, pos_state );
577           if( dis >= modeled_distances ) --align_price_counter;
578           --dis_price_counter;
579           match_len_prices.decrement_counter( pos_state );
580           state.set_match();
581           }
582         }
583       ahead -= len; i += len;
584       if( renc.member_position() >= member_size_limit )
585         {
586         if( !dec_pos( ahead ) ) return false;
587         full_flush( state );
588         return true;
589         }
590       }
591     }
592   full_flush( state );
593   return true;
594   }
595