1 /* Lzip - LZMA lossless data compressor
2 Copyright (C) 2008-2021 Antonio Diaz Diaz.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #define _FILE_OFFSET_BITS 64
19
20 #include <algorithm>
21 #include <cerrno>
22 #include <cstdlib>
23 #include <cstring>
24 #include <string>
25 #include <vector>
26 #include <stdint.h>
27
28 #include "lzip.h"
29 #include "encoder_base.h"
30 #include "encoder.h"
31
32
33 const CRC32 crc32;
34
35
get_match_pairs(Pair * pairs)36 int LZ_encoder::get_match_pairs( Pair * pairs )
37 {
38 int len_limit = match_len_limit;
39 if( len_limit > available_bytes() )
40 {
41 len_limit = available_bytes();
42 if( len_limit < 4 ) return 0;
43 }
44
45 int maxlen = 3; // only used if pairs != 0
46 int num_pairs = 0;
47 const int min_pos = ( pos > dictionary_size ) ? pos - dictionary_size : 0;
48 const uint8_t * const data = ptr_to_current_pos();
49
50 unsigned tmp = crc32[data[0]] ^ data[1];
51 const int key2 = tmp & ( num_prev_positions2 - 1 );
52 tmp ^= (unsigned)data[2] << 8;
53 const int key3 = num_prev_positions2 + ( tmp & ( num_prev_positions3 - 1 ) );
54 const int key4 = num_prev_positions23 +
55 ( ( tmp ^ ( crc32[data[3]] << 5 ) ) & key4_mask );
56
57 if( pairs )
58 {
59 const int np2 = prev_positions[key2];
60 const int np3 = prev_positions[key3];
61 if( np2 > min_pos && buffer[np2-1] == data[0] )
62 {
63 pairs[0].dis = pos - np2;
64 pairs[0].len = maxlen = 2 + ( np2 == np3 );
65 num_pairs = 1;
66 }
67 if( np2 != np3 && np3 > min_pos && buffer[np3-1] == data[0] )
68 {
69 maxlen = 3;
70 pairs[num_pairs++].dis = pos - np3;
71 }
72 if( num_pairs > 0 )
73 {
74 const int delta = pairs[num_pairs-1].dis + 1;
75 while( maxlen < len_limit && data[maxlen-delta] == data[maxlen] )
76 ++maxlen;
77 pairs[num_pairs-1].len = maxlen;
78 if( maxlen < 3 ) maxlen = 3;
79 if( maxlen >= len_limit ) pairs = 0; // done. now just skip
80 }
81 }
82
83 const int pos1 = pos + 1;
84 prev_positions[key2] = pos1;
85 prev_positions[key3] = pos1;
86 int newpos1 = prev_positions[key4];
87 prev_positions[key4] = pos1;
88
89 int32_t * ptr0 = pos_array + ( cyclic_pos << 1 );
90 int32_t * ptr1 = ptr0 + 1;
91 int len = 0, len0 = 0, len1 = 0;
92
93 for( int count = cycles; ; )
94 {
95 if( newpos1 <= min_pos || --count < 0 ) { *ptr0 = *ptr1 = 0; break; }
96
97 const int delta = pos1 - newpos1;
98 int32_t * const newptr = pos_array +
99 ( ( cyclic_pos - delta +
100 ( ( cyclic_pos >= delta ) ? 0 : dictionary_size + 1 ) ) << 1 );
101 if( data[len-delta] == data[len] )
102 {
103 while( ++len < len_limit && data[len-delta] == data[len] ) {}
104 if( pairs && maxlen < len )
105 {
106 pairs[num_pairs].dis = delta - 1;
107 pairs[num_pairs].len = maxlen = len;
108 ++num_pairs;
109 }
110 if( len >= len_limit )
111 {
112 *ptr0 = newptr[0];
113 *ptr1 = newptr[1];
114 break;
115 }
116 }
117 if( data[len-delta] < data[len] )
118 {
119 *ptr0 = newpos1;
120 ptr0 = newptr + 1;
121 newpos1 = *ptr0;
122 len0 = len; if( len1 < len ) len = len1;
123 }
124 else
125 {
126 *ptr1 = newpos1;
127 ptr1 = newptr;
128 newpos1 = *ptr1;
129 len1 = len; if( len0 < len ) len = len0;
130 }
131 }
132 return num_pairs;
133 }
134
135
update_distance_prices()136 void LZ_encoder::update_distance_prices()
137 {
138 for( int dis = start_dis_model; dis < modeled_distances; ++dis )
139 {
140 const int dis_slot = dis_slots[dis];
141 const int direct_bits = ( dis_slot >> 1 ) - 1;
142 const int base = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
143 const int price = price_symbol_reversed( bm_dis + ( base - dis_slot ),
144 dis - base, direct_bits );
145 for( int len_state = 0; len_state < len_states; ++len_state )
146 dis_prices[len_state][dis] = price;
147 }
148
149 for( int len_state = 0; len_state < len_states; ++len_state )
150 {
151 int * const dsp = dis_slot_prices[len_state];
152 const Bit_model * const bmds = bm_dis_slot[len_state];
153 int slot = 0;
154 for( ; slot < end_dis_model; ++slot )
155 dsp[slot] = price_symbol6( bmds, slot );
156 for( ; slot < num_dis_slots; ++slot )
157 dsp[slot] = price_symbol6( bmds, slot ) +
158 (((( slot >> 1 ) - 1 ) - dis_align_bits ) << price_shift_bits );
159
160 int * const dp = dis_prices[len_state];
161 int dis = 0;
162 for( ; dis < start_dis_model; ++dis )
163 dp[dis] = dsp[dis];
164 for( ; dis < modeled_distances; ++dis )
165 dp[dis] += dsp[dis_slots[dis]];
166 }
167 }
168
169
170 /* Returns the number of bytes advanced (ahead).
171 trials[0]..trials[ahead-1] contain the steps to encode.
172 ( trials[0].dis4 == -1 ) means literal.
173 A match/rep longer or equal than match_len_limit finishes the sequence.
174 */
sequence_optimizer(const int reps[num_rep_distances],const State state)175 int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances],
176 const State state )
177 {
178 int num_pairs, num_trials;
179
180 if( pending_num_pairs > 0 ) // from previous call
181 {
182 num_pairs = pending_num_pairs;
183 pending_num_pairs = 0;
184 }
185 else
186 num_pairs = read_match_distances();
187 const int main_len = ( num_pairs > 0 ) ? pairs[num_pairs-1].len : 0;
188
189 int replens[num_rep_distances];
190 int rep_index = 0;
191 for( int i = 0; i < num_rep_distances; ++i )
192 {
193 replens[i] = true_match_len( 0, reps[i] + 1 );
194 if( replens[i] > replens[rep_index] ) rep_index = i;
195 }
196 if( replens[rep_index] >= match_len_limit )
197 {
198 trials[0].price = replens[rep_index];
199 trials[0].dis4 = rep_index;
200 move_and_update( replens[rep_index] );
201 return replens[rep_index];
202 }
203
204 if( main_len >= match_len_limit )
205 {
206 trials[0].price = main_len;
207 trials[0].dis4 = pairs[num_pairs-1].dis + num_rep_distances;
208 move_and_update( main_len );
209 return main_len;
210 }
211
212 const int pos_state = data_position() & pos_state_mask;
213 const uint8_t prev_byte = peek( 1 );
214 const uint8_t cur_byte = peek( 0 );
215 const uint8_t match_byte = peek( reps[0] + 1 );
216
217 trials[1].price = price0( bm_match[state()][pos_state] );
218 if( state.is_char() )
219 trials[1].price += price_literal( prev_byte, cur_byte );
220 else
221 trials[1].price += price_matched( prev_byte, cur_byte, match_byte );
222 trials[1].dis4 = -1; // literal
223
224 const int match_price = price1( bm_match[state()][pos_state] );
225 const int rep_match_price = match_price + price1( bm_rep[state()] );
226
227 if( match_byte == cur_byte )
228 trials[1].update( rep_match_price + price_shortrep( state, pos_state ), 0, 0 );
229
230 num_trials = std::max( main_len, replens[rep_index] );
231
232 if( num_trials < min_match_len )
233 {
234 trials[0].price = 1;
235 trials[0].dis4 = trials[1].dis4;
236 move_pos();
237 return 1;
238 }
239
240 trials[0].state = state;
241 for( int i = 0; i < num_rep_distances; ++i )
242 trials[0].reps[i] = reps[i];
243
244 for( int len = min_match_len; len <= num_trials; ++len )
245 trials[len].price = infinite_price;
246
247 for( int rep = 0; rep < num_rep_distances; ++rep )
248 {
249 if( replens[rep] < min_match_len ) continue;
250 const int price = rep_match_price + price_rep( rep, state, pos_state );
251 for( int len = min_match_len; len <= replens[rep]; ++len )
252 trials[len].update( price + rep_len_prices.price( len, pos_state ),
253 rep, 0 );
254 }
255
256 if( main_len > replens[0] )
257 {
258 const int normal_match_price = match_price + price0( bm_rep[state()] );
259 int i = 0, len = std::max( replens[0] + 1, (int)min_match_len );
260 while( len > pairs[i].len ) ++i;
261 while( true )
262 {
263 const int dis = pairs[i].dis;
264 trials[len].update( normal_match_price + price_pair( dis, len, pos_state ),
265 dis + num_rep_distances, 0 );
266 if( ++len > pairs[i].len && ++i >= num_pairs ) break;
267 }
268 }
269
270 int cur = 0;
271 while( true ) // price optimization loop
272 {
273 move_pos();
274 if( ++cur >= num_trials ) // no more initialized trials
275 {
276 backward( cur );
277 return cur;
278 }
279
280 const int num_pairs = read_match_distances();
281 const int newlen = ( num_pairs > 0 ) ? pairs[num_pairs-1].len : 0;
282 if( newlen >= match_len_limit )
283 {
284 pending_num_pairs = num_pairs;
285 backward( cur );
286 return cur;
287 }
288
289 // give final values to current trial
290 Trial & cur_trial = trials[cur];
291 State cur_state;
292 {
293 const int dis4 = cur_trial.dis4;
294 int prev_index = cur_trial.prev_index;
295 const int prev_index2 = cur_trial.prev_index2;
296
297 if( prev_index2 == single_step_trial )
298 {
299 cur_state = trials[prev_index].state;
300 if( prev_index + 1 == cur ) // len == 1
301 {
302 if( dis4 == 0 ) cur_state.set_short_rep();
303 else cur_state.set_char(); // literal
304 }
305 else if( dis4 < num_rep_distances ) cur_state.set_rep();
306 else cur_state.set_match();
307 }
308 else
309 {
310 if( prev_index2 == dual_step_trial ) // dis4 == 0 (rep0)
311 --prev_index;
312 else // prev_index2 >= 0
313 prev_index = prev_index2;
314 cur_state.set_char_rep();
315 }
316 cur_trial.state = cur_state;
317 for( int i = 0; i < num_rep_distances; ++i )
318 cur_trial.reps[i] = trials[prev_index].reps[i];
319 mtf_reps( dis4, cur_trial.reps ); // literal is ignored
320 }
321
322 const int pos_state = data_position() & pos_state_mask;
323 const uint8_t prev_byte = peek( 1 );
324 const uint8_t cur_byte = peek( 0 );
325 const uint8_t match_byte = peek( cur_trial.reps[0] + 1 );
326
327 int next_price = cur_trial.price +
328 price0( bm_match[cur_state()][pos_state] );
329 if( cur_state.is_char() )
330 next_price += price_literal( prev_byte, cur_byte );
331 else
332 next_price += price_matched( prev_byte, cur_byte, match_byte );
333
334 // try last updates to next trial
335 Trial & next_trial = trials[cur+1];
336
337 next_trial.update( next_price, -1, cur ); // literal
338
339 const int match_price = cur_trial.price + price1( bm_match[cur_state()][pos_state] );
340 const int rep_match_price = match_price + price1( bm_rep[cur_state()] );
341
342 if( match_byte == cur_byte && next_trial.dis4 != 0 &&
343 next_trial.prev_index2 == single_step_trial )
344 {
345 const int price = rep_match_price + price_shortrep( cur_state, pos_state );
346 if( price <= next_trial.price )
347 {
348 next_trial.price = price;
349 next_trial.dis4 = 0; // rep0
350 next_trial.prev_index = cur;
351 }
352 }
353
354 const int triable_bytes =
355 std::min( available_bytes(), max_num_trials - 1 - cur );
356 if( triable_bytes < min_match_len ) continue;
357
358 const int len_limit = std::min( match_len_limit, triable_bytes );
359
360 // try literal + rep0
361 if( match_byte != cur_byte && next_trial.prev_index != cur )
362 {
363 const uint8_t * const data = ptr_to_current_pos();
364 const int dis = cur_trial.reps[0] + 1;
365 const int limit = std::min( match_len_limit + 1, triable_bytes );
366 int len = 1;
367 while( len < limit && data[len-dis] == data[len] ) ++len;
368 if( --len >= min_match_len )
369 {
370 const int pos_state2 = ( pos_state + 1 ) & pos_state_mask;
371 State state2 = cur_state; state2.set_char();
372 const int price = next_price +
373 price1( bm_match[state2()][pos_state2] ) +
374 price1( bm_rep[state2()] ) +
375 price_rep0_len( len, state2, pos_state2 );
376 while( num_trials < cur + 1 + len )
377 trials[++num_trials].price = infinite_price;
378 trials[cur+1+len].update2( price, cur + 1 );
379 }
380 }
381
382 int start_len = min_match_len;
383
384 // try rep distances
385 for( int rep = 0; rep < num_rep_distances; ++rep )
386 {
387 const uint8_t * const data = ptr_to_current_pos();
388 const int dis = cur_trial.reps[rep] + 1;
389 int len;
390
391 if( data[0-dis] != data[0] || data[1-dis] != data[1] ) continue;
392 for( len = min_match_len; len < len_limit; ++len )
393 if( data[len-dis] != data[len] ) break;
394 while( num_trials < cur + len )
395 trials[++num_trials].price = infinite_price;
396 int price = rep_match_price + price_rep( rep, cur_state, pos_state );
397 for( int i = min_match_len; i <= len; ++i )
398 trials[cur+i].update( price + rep_len_prices.price( i, pos_state ),
399 rep, cur );
400
401 if( rep == 0 ) start_len = len + 1; // discard shorter matches
402
403 // try rep + literal + rep0
404 int len2 = len + 1;
405 const int limit = std::min( match_len_limit + len2, triable_bytes );
406 while( len2 < limit && data[len2-dis] == data[len2] ) ++len2;
407 len2 -= len + 1;
408 if( len2 < min_match_len ) continue;
409
410 int pos_state2 = ( pos_state + len ) & pos_state_mask;
411 State state2 = cur_state; state2.set_rep();
412 price += rep_len_prices.price( len, pos_state ) +
413 price0( bm_match[state2()][pos_state2] ) +
414 price_matched( data[len-1], data[len], data[len-dis] );
415 pos_state2 = ( pos_state2 + 1 ) & pos_state_mask;
416 state2.set_char();
417 price += price1( bm_match[state2()][pos_state2] ) +
418 price1( bm_rep[state2()] ) +
419 price_rep0_len( len2, state2, pos_state2 );
420 while( num_trials < cur + len + 1 + len2 )
421 trials[++num_trials].price = infinite_price;
422 trials[cur+len+1+len2].update3( price, rep, cur + len + 1, cur );
423 }
424
425 // try matches
426 if( newlen >= start_len && newlen <= len_limit )
427 {
428 const int normal_match_price = match_price +
429 price0( bm_rep[cur_state()] );
430
431 while( num_trials < cur + newlen )
432 trials[++num_trials].price = infinite_price;
433
434 int i = 0;
435 while( pairs[i].len < start_len ) ++i;
436 int dis = pairs[i].dis;
437 for( int len = start_len; ; ++len )
438 {
439 int price = normal_match_price + price_pair( dis, len, pos_state );
440 trials[cur+len].update( price, dis + num_rep_distances, cur );
441
442 // try match + literal + rep0
443 if( len == pairs[i].len )
444 {
445 const uint8_t * const data = ptr_to_current_pos();
446 const int dis2 = dis + 1;
447 int len2 = len + 1;
448 const int limit = std::min( match_len_limit + len2, triable_bytes );
449 while( len2 < limit && data[len2-dis2] == data[len2] ) ++len2;
450 len2 -= len + 1;
451 if( len2 >= min_match_len )
452 {
453 int pos_state2 = ( pos_state + len ) & pos_state_mask;
454 State state2 = cur_state; state2.set_match();
455 price += price0( bm_match[state2()][pos_state2] ) +
456 price_matched( data[len-1], data[len], data[len-dis2] );
457 pos_state2 = ( pos_state2 + 1 ) & pos_state_mask;
458 state2.set_char();
459 price += price1( bm_match[state2()][pos_state2] ) +
460 price1( bm_rep[state2()] ) +
461 price_rep0_len( len2, state2, pos_state2 );
462
463 while( num_trials < cur + len + 1 + len2 )
464 trials[++num_trials].price = infinite_price;
465 trials[cur+len+1+len2].update3( price, dis + num_rep_distances,
466 cur + len + 1, cur );
467 }
468 if( ++i >= num_pairs ) break;
469 dis = pairs[i].dis;
470 }
471 }
472 }
473 }
474 }
475
476
encode_member(const unsigned long long member_size)477 bool LZ_encoder::encode_member( const unsigned long long member_size )
478 {
479 const unsigned long long member_size_limit =
480 member_size - Lzip_trailer::size - max_marker_size;
481 const bool best = ( match_len_limit > 12 );
482 const int dis_price_count = best ? 1 : 512;
483 const int align_price_count = best ? 1 : dis_align_size;
484 const int price_count = ( match_len_limit > 36 ) ? 1013 : 4093;
485 int price_counter = 0; // counters may decrement below 0
486 int dis_price_counter = 0;
487 int align_price_counter = 0;
488 int reps[num_rep_distances];
489 State state;
490 for( int i = 0; i < num_rep_distances; ++i ) reps[i] = 0;
491
492 if( data_position() != 0 || renc.member_position() != Lzip_header::size )
493 return false; // can be called only once
494
495 if( !data_finished() ) // encode first byte
496 {
497 const uint8_t prev_byte = 0;
498 const uint8_t cur_byte = peek( 0 );
499 renc.encode_bit( bm_match[state()][0], 0 );
500 encode_literal( prev_byte, cur_byte );
501 crc32.update_byte( crc_, cur_byte );
502 get_match_pairs();
503 move_pos();
504 }
505
506 while( !data_finished() )
507 {
508 if( price_counter <= 0 && pending_num_pairs == 0 )
509 {
510 price_counter = price_count; // recalculate prices every these bytes
511 if( dis_price_counter <= 0 )
512 { dis_price_counter = dis_price_count; update_distance_prices(); }
513 if( align_price_counter <= 0 )
514 {
515 align_price_counter = align_price_count;
516 for( int i = 0; i < dis_align_size; ++i )
517 align_prices[i] = price_symbol_reversed( bm_align, i, dis_align_bits );
518 }
519 match_len_prices.update_prices();
520 rep_len_prices.update_prices();
521 }
522
523 int ahead = sequence_optimizer( reps, state );
524 price_counter -= ahead;
525
526 for( int i = 0; ahead > 0; )
527 {
528 const int pos_state = ( data_position() - ahead ) & pos_state_mask;
529 const int len = trials[i].price;
530 int dis = trials[i].dis4;
531
532 bool bit = ( dis < 0 );
533 renc.encode_bit( bm_match[state()][pos_state], !bit );
534 if( bit ) // literal byte
535 {
536 const uint8_t prev_byte = peek( ahead + 1 );
537 const uint8_t cur_byte = peek( ahead );
538 crc32.update_byte( crc_, cur_byte );
539 if( state.is_char_set_char() )
540 encode_literal( prev_byte, cur_byte );
541 else
542 {
543 const uint8_t match_byte = peek( ahead + reps[0] + 1 );
544 encode_matched( prev_byte, cur_byte, match_byte );
545 }
546 }
547 else // match or repeated match
548 {
549 crc32.update_buf( crc_, ptr_to_current_pos() - ahead, len );
550 mtf_reps( dis, reps );
551 bit = ( dis < num_rep_distances );
552 renc.encode_bit( bm_rep[state()], bit );
553 if( bit ) // repeated match
554 {
555 bit = ( dis == 0 );
556 renc.encode_bit( bm_rep0[state()], !bit );
557 if( bit )
558 renc.encode_bit( bm_len[state()][pos_state], len > 1 );
559 else
560 {
561 renc.encode_bit( bm_rep1[state()], dis > 1 );
562 if( dis > 1 )
563 renc.encode_bit( bm_rep2[state()], dis > 2 );
564 }
565 if( len == 1 ) state.set_short_rep();
566 else
567 {
568 renc.encode_len( rep_len_model, len, pos_state );
569 rep_len_prices.decrement_counter( pos_state );
570 state.set_rep();
571 }
572 }
573 else // match
574 {
575 dis -= num_rep_distances;
576 encode_pair( dis, len, pos_state );
577 if( dis >= modeled_distances ) --align_price_counter;
578 --dis_price_counter;
579 match_len_prices.decrement_counter( pos_state );
580 state.set_match();
581 }
582 }
583 ahead -= len; i += len;
584 if( renc.member_position() >= member_size_limit )
585 {
586 if( !dec_pos( ahead ) ) return false;
587 full_flush( state );
588 return true;
589 }
590 }
591 }
592 full_flush( state );
593 return true;
594 }
595