1 /* Lziprecover - Data recovery tool for the lzip format
2    Copyright (C) 2009-2021 Antonio Diaz Diaz.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 2 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17 
18 #define _FILE_OFFSET_BITS 64
19 
20 #include <algorithm>
21 #include <cerrno>
22 #include <cstdio>
23 #include <cstring>
24 #include <string>
25 #include <vector>
26 #include <stdint.h>
27 #include <unistd.h>
28 #include <sys/mman.h>
29 #include <sys/stat.h>
30 
31 #include "lzip.h"
32 #include "lzip_index.h"
33 
34 
35 /* Show how well the frequency of sequences of N repeated bytes in LZMA data
36    matches the value expected for random data. ( 1 / 2^( 8 * N ) )
37    Print cumulative data for all files followed by the name of the first
38    file with the longest sequence.
39 */
print_nrep_stats(const std::vector<std::string> & filenames,const int repeated_byte,const bool ignore_errors,const bool ignore_trailing,const bool loose_trailing)40 int print_nrep_stats( const std::vector< std::string > & filenames,
41                       const int repeated_byte, const bool ignore_errors,
42                       const bool ignore_trailing, const bool loose_trailing )
43   {
44   std::vector< unsigned long > len_vector;
45   unsigned long long best_pos = 0, lzma_size = 0;
46   int best_name = -1, retval = 0;
47   const bool count_all = ( repeated_byte < 0 || repeated_byte >= 256 );
48   bool stdin_used = false;
49   for( unsigned i = 0; i < filenames.size(); ++i )
50     {
51     const bool from_stdin = ( filenames[i] == "-" );
52     if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; }
53     const char * const input_filename =
54       from_stdin ? "(stdin)" : filenames[i].c_str();
55     struct stat in_stats;				// not used
56     const int infd = from_stdin ? STDIN_FILENO :
57       open_instream( input_filename, &in_stats, false, true );
58     if( infd < 0 ) { set_retval( retval, 1 ); continue; }
59 
60     const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing,
61                                  ignore_errors, ignore_errors );
62     if( lzip_index.retval() != 0 )
63       {
64       show_file_error( input_filename, lzip_index.error().c_str() );
65       set_retval( retval, lzip_index.retval() );
66       close( infd );
67       continue;
68       }
69     const unsigned long long cdata_size = lzip_index.cdata_size();
70     const uint8_t * const buffer =
71       (const uint8_t *)mmap( 0, cdata_size, PROT_READ, MAP_PRIVATE, infd, 0 );
72     close( infd );
73     if( buffer == MAP_FAILED )
74       { show_file_error( input_filename, "Can't mmap", errno );
75         set_retval( retval, 1 ); continue; }
76     for( long j = 0; j < lzip_index.members(); ++j )
77       {
78       const Block & mb = lzip_index.mblock( j );
79       long long pos = mb.pos() + 7;		// skip header (+1 byte) and
80       const long long end = mb.end() - 20;	// trailer of each member
81       lzma_size += end - pos;
82       while( pos < end )
83         {
84         const uint8_t byte = buffer[pos++];
85         if( buffer[pos] == byte )
86           {
87           unsigned len = 2;
88           ++pos;
89           while( pos < end && buffer[pos] == byte ) { ++pos; ++len; }
90           if( !count_all && repeated_byte != (int)byte ) continue;
91           if( len >= len_vector.size() ) { len_vector.resize( len + 1 );
92             best_name = i; best_pos = pos - len; }
93           ++len_vector[len];
94           }
95         }
96       }
97     munmap( (void *)buffer, cdata_size );
98     }
99 
100   if( count_all )
101     std::fputs( "\nShowing repeated sequences of any byte value.\n", stdout );
102   else
103     std::printf( "\nShowing repeated sequences of the byte value 0x%02X\n",
104                  repeated_byte );
105   std::printf( "Total size of LZMA data: %llu bytes (%sBytes)\n",
106                lzma_size, format_num( lzma_size, 999 ) );
107   for( unsigned len = 2; len < len_vector.size(); ++len )
108     if( len_vector[len] > 0 )
109       std::printf( "len %u found %lu times, 1 every %llu bytes "
110                    "(expected 1 every %sB)\n",
111                    len, len_vector[len], lzma_size / len_vector[len],
112                    format_num( 1ULL << ( 8 * ( len - count_all ) ), -1ULL, -1 ) );
113   if( best_name >= 0 )
114     std::printf( "Longest sequence found at position %llu of '%s'\n",
115                  best_pos, filenames[best_name].c_str() );
116   return retval;
117   }
118