1 /* Lziprecover - Data recovery tool for the lzip format
2 Copyright (C) 2009-2021 Antonio Diaz Diaz.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 class Range_mtester
19 {
20 const uint8_t * const buffer; // input buffer
21 const long long buffer_size;
22 long long pos; // current pos in buffer
23 uint32_t code;
24 uint32_t range;
25 bool at_stream_end;
26
27 public:
Range_mtester(const uint8_t * const buf,const long long buf_size)28 Range_mtester( const uint8_t * const buf, const long long buf_size )
29 :
30 buffer( buf ),
31 buffer_size( buf_size ),
32 pos( Lzip_header::size ),
33 code( 0 ),
34 range( 0xFFFFFFFFU ),
35 at_stream_end( false )
36 {}
37
finished()38 bool finished() { return pos >= buffer_size; }
member_position()39 unsigned long long member_position() const { return pos; }
40
get_byte()41 uint8_t get_byte()
42 {
43 // 0xFF avoids decoder error if member is truncated at EOS marker
44 if( finished() ) return 0xFF;
45 return buffer[pos++];
46 }
47
get_trailer()48 const Lzip_trailer * get_trailer()
49 {
50 if( buffer_size - pos < Lzip_trailer::size ) return 0;
51 const Lzip_trailer * const p = (const Lzip_trailer *)( buffer + pos );
52 pos += Lzip_trailer::size;
53 return p;
54 }
55
load()56 void load()
57 {
58 code = 0;
59 for( int i = 0; i < 5; ++i ) code = ( code << 8 ) | get_byte();
60 range = 0xFFFFFFFFU;
61 code &= range; // make sure that first byte is discarded
62 }
63
normalize()64 void normalize()
65 {
66 if( range <= 0x00FFFFFFU )
67 { range <<= 8; code = ( code << 8 ) | get_byte(); }
68 }
69
decode(const int num_bits)70 unsigned decode( const int num_bits )
71 {
72 unsigned symbol = 0;
73 for( int i = num_bits; i > 0; --i )
74 {
75 normalize();
76 range >>= 1;
77 // symbol <<= 1;
78 // if( code >= range ) { code -= range; symbol |= 1; }
79 const bool bit = ( code >= range );
80 symbol <<= 1; symbol += bit;
81 code -= range & ( 0U - bit );
82 }
83 return symbol;
84 }
85
decode_bit(Bit_model & bm)86 unsigned decode_bit( Bit_model & bm )
87 {
88 normalize();
89 const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability;
90 if( code < bound )
91 {
92 range = bound;
93 bm.probability +=
94 ( bit_model_total - bm.probability ) >> bit_model_move_bits;
95 return 0;
96 }
97 else
98 {
99 range -= bound;
100 code -= bound;
101 bm.probability -= bm.probability >> bit_model_move_bits;
102 return 1;
103 }
104 }
105
decode_tree3(Bit_model bm[])106 unsigned decode_tree3( Bit_model bm[] )
107 {
108 unsigned symbol = 2 | decode_bit( bm[1] );
109 symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
110 symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
111 return symbol & 7;
112 }
113
decode_tree6(Bit_model bm[])114 unsigned decode_tree6( Bit_model bm[] )
115 {
116 unsigned symbol = 2 | decode_bit( bm[1] );
117 symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
118 symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
119 symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
120 symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
121 symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
122 return symbol & 0x3F;
123 }
124
decode_tree8(Bit_model bm[])125 unsigned decode_tree8( Bit_model bm[] )
126 {
127 unsigned symbol = 1;
128 for( int i = 0; i < 8; ++i )
129 symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
130 return symbol & 0xFF;
131 }
132
decode_tree_reversed(Bit_model bm[],const int num_bits)133 unsigned decode_tree_reversed( Bit_model bm[], const int num_bits )
134 {
135 unsigned model = 1;
136 unsigned symbol = 0;
137 for( int i = 0; i < num_bits; ++i )
138 {
139 const unsigned bit = decode_bit( bm[model] );
140 model <<= 1; model += bit;
141 symbol |= ( bit << i );
142 }
143 return symbol;
144 }
145
decode_tree_reversed4(Bit_model bm[])146 unsigned decode_tree_reversed4( Bit_model bm[] )
147 {
148 unsigned symbol = decode_bit( bm[1] );
149 symbol += decode_bit( bm[2+symbol] ) << 1;
150 symbol += decode_bit( bm[4+symbol] ) << 2;
151 symbol += decode_bit( bm[8+symbol] ) << 3;
152 return symbol;
153 }
154
decode_matched(Bit_model bm[],unsigned match_byte)155 unsigned decode_matched( Bit_model bm[], unsigned match_byte )
156 {
157 Bit_model * const bm1 = bm + 0x100;
158 unsigned symbol = 1;
159 while( symbol < 0x100 )
160 {
161 const unsigned match_bit = ( match_byte <<= 1 ) & 0x100;
162 const bool bit = decode_bit( bm1[symbol+match_bit] );
163 symbol <<= 1; symbol |= bit;
164 if( match_bit >> 8 != bit )
165 {
166 while( symbol < 0x100 )
167 symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
168 break;
169 }
170 }
171 return symbol & 0xFF;
172 }
173
decode_len(Len_model & lm,const int pos_state)174 unsigned decode_len( Len_model & lm, const int pos_state )
175 {
176 if( decode_bit( lm.choice1 ) == 0 )
177 return decode_tree3( lm.bm_low[pos_state] );
178 if( decode_bit( lm.choice2 ) == 0 )
179 return len_low_symbols + decode_tree3( lm.bm_mid[pos_state] );
180 return len_low_symbols + len_mid_symbols + decode_tree8( lm.bm_high );
181 }
182 };
183
184 class MD5SUM; // forward declaration
185
186 class LZ_mtester
187 {
188 unsigned long long partial_data_pos;
189 Range_mtester rdec;
190 const unsigned dictionary_size;
191 uint8_t * buffer; // output buffer
192 unsigned pos; // current pos in buffer
193 unsigned stream_pos; // first byte not yet written to file
194 uint32_t crc_;
195 const int outfd; // output file descriptor
196 unsigned rep0; // rep[0-3] latest four distances
197 unsigned rep1; // used for efficient coding of
198 unsigned rep2; // repeated distances
199 unsigned rep3;
200 State state;
201 MD5SUM * const md5sum;
202 unsigned long long total_packets_; // total number of packets in member
203 unsigned long long max_rep0_pos; // file position of maximum distance
204 unsigned max_rep0; // maximum distance found
205 std::vector< unsigned long long > max_packet_posv_; // file pos of large packets
206 unsigned max_packet_size_; // maximum packet size found
207 unsigned max_marker_size_; // maximum marker size found
208 bool pos_wrapped;
209
210 Bit_model bm_literal[1<<literal_context_bits][0x300];
211 Bit_model bm_match[State::states][pos_states];
212 Bit_model bm_rep[State::states];
213 Bit_model bm_rep0[State::states];
214 Bit_model bm_rep1[State::states];
215 Bit_model bm_rep2[State::states];
216 Bit_model bm_len[State::states][pos_states];
217 Bit_model bm_dis_slot[len_states][1<<dis_slot_bits];
218 Bit_model bm_dis[modeled_distances-end_dis_model+1];
219 Bit_model bm_align[dis_align_size];
220
221 Len_model match_len_model;
222 Len_model rep_len_model;
223
224 void print_block( const int len );
225 void flush_data();
226 bool verify_trailer( FILE * const f = 0, unsigned long long byte_pos = 0 );
227
peek_prev()228 uint8_t peek_prev() const
229 { return buffer[((pos > 0) ? pos : dictionary_size)-1]; }
230
peek(const unsigned distance)231 uint8_t peek( const unsigned distance ) const
232 {
233 const unsigned i = ( ( pos > distance ) ? 0 : dictionary_size ) +
234 pos - distance - 1;
235 return buffer[i];
236 }
237
put_byte(const uint8_t b)238 void put_byte( const uint8_t b )
239 {
240 buffer[pos] = b;
241 if( ++pos >= dictionary_size ) flush_data();
242 }
243
copy_block(const unsigned distance,unsigned len)244 void copy_block( const unsigned distance, unsigned len )
245 {
246 unsigned lpos = pos, i = lpos - distance - 1;
247 bool fast, fast2;
248 if( lpos > distance )
249 {
250 fast = ( len < dictionary_size - lpos );
251 fast2 = ( fast && len <= lpos - i );
252 }
253 else
254 {
255 i += dictionary_size;
256 fast = ( len < dictionary_size - i ); // (i == pos) may happen
257 fast2 = ( fast && len <= i - lpos );
258 }
259 if( fast ) // no wrap
260 {
261 pos += len;
262 if( fast2 ) // no wrap, no overlap
263 std::memcpy( buffer + lpos, buffer + i, len );
264 else
265 for( ; len > 0; --len ) buffer[lpos++] = buffer[i++];
266 }
267 else for( ; len > 0; --len )
268 {
269 buffer[pos] = buffer[i];
270 if( ++pos >= dictionary_size ) flush_data();
271 if( ++i >= dictionary_size ) i = 0;
272 }
273 }
274
set_max_packet(const unsigned new_size,const unsigned long long pos)275 void set_max_packet( const unsigned new_size, const unsigned long long pos )
276 {
277 if( max_packet_size_ > new_size || new_size == 0 ) return;
278 if( max_packet_size_ < new_size ) // new max size
279 { max_packet_size_ = new_size; max_packet_posv_.clear(); }
280 max_packet_posv_.push_back( pos - new_size ); // pos of first byte
281 }
282
set_max_marker(const unsigned new_size)283 void set_max_marker( const unsigned new_size )
284 { if( max_marker_size_ < new_size ) max_marker_size_ = new_size; }
285
286 public:
287 LZ_mtester( const uint8_t * const ibuf, const long long ibuf_size,
288 const unsigned dict_size, const int ofd = -1,
289 MD5SUM * const md5sum_ = 0 )
290 :
291 partial_data_pos( 0 ),
292 rdec( ibuf, ibuf_size ),
293 dictionary_size( dict_size ),
294 buffer( new uint8_t[dictionary_size] ),
295 pos( 0 ),
296 stream_pos( 0 ),
297 crc_( 0xFFFFFFFFU ),
298 outfd( ofd ),
299 rep0( 0 ),
300 rep1( 0 ),
301 rep2( 0 ),
302 rep3( 0 ),
303 md5sum( md5sum_ ),
304 total_packets_( -1ULL ), // don't count EOS marker
305 max_rep0_pos( 0 ),
306 max_rep0( 0 ),
307 max_packet_size_( 0 ),
308 max_marker_size_( 0 ),
309 pos_wrapped( false )
310 // prev_byte of first byte; also for peek( 0 ) on corrupt file
311 { buffer[dictionary_size-1] = 0; }
312
~LZ_mtester()313 ~LZ_mtester() { delete[] buffer; }
314
crc()315 unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; }
data_position()316 unsigned long long data_position() const { return partial_data_pos + pos; }
finished()317 bool finished() { return rdec.finished(); }
member_position()318 unsigned long long member_position() const { return rdec.member_position(); }
total_packets()319 unsigned long long total_packets() const { return total_packets_; }
max_distance_pos()320 unsigned long long max_distance_pos() const { return max_rep0_pos; }
max_distance()321 unsigned max_distance() const { return max_rep0 + 1; }
max_packet_posv()322 const std::vector< unsigned long long > & max_packet_posv() const
323 { return max_packet_posv_; }
max_packet_size()324 unsigned max_packet_size() const { return max_packet_size_; }
max_marker_size()325 unsigned max_marker_size() const { return max_marker_size_; }
326
get_buffers(const uint8_t ** prev_bufferp,int * sizep,int * prev_sizep)327 const uint8_t * get_buffers( const uint8_t ** prev_bufferp,
328 int * sizep, int * prev_sizep ) const
329 { *sizep = ( pos_wrapped && pos == 0 ) ? dictionary_size : pos;
330 *prev_sizep = ( pos_wrapped && pos > 0 ) ? dictionary_size - pos : 0;
331 *prev_bufferp = buffer + pos; return buffer; }
332
333 void duplicate_buffer();
334 // these two functions set max_rep0
335 int test_member( const unsigned long long mpos_limit = LLONG_MAX,
336 const unsigned long long dpos_limit = LLONG_MAX,
337 FILE * const f = 0, const unsigned long long byte_pos = 0 );
338 /* this function also sets max_rep0_pos, total_packets_, max_packet_size_,
339 max_packet_posv_, and max_marker_size_ */
340 int debug_decode_member( const long long dpos, const long long mpos,
341 const bool show_packets );
342 };
343