1 /*
2  * JDiff.cpp
3  *
4  * Jojo's diff on binary files: main class.
5  *
6  * Copyright (C) 2002-2011 Joris Heirbaut
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
20  *
21  * ----------------------------------------------------------------------------
22  *
23  * Class JDiff takes two JFiles as input and outputs to a JOut instance.
24  *
25  * If you want to reuse JojoDiff, you need following files:
26  * - JDiff.h/cpp        The main JojoDiff class
27  * - JHashPos.h/cpp     The hash table collection of (sample-key, position)
28  * - JMatchTable.h/cpp  The matching table logic
29  * - JDefs.h            Global definitions
30  * - JDebug.h/cpp       Debugging definitions
31  * - JOut.h             Abstract output class
32  * - JFile.h            Abstract input class
33  * + at least one descendant of JOut and JFile (may be of your own).
34  *
35  * Method jdiff performs the actual diffing:
36  * 1) first, we create a hashtable collection of (hkey, position) pairs, where
37  *      hkey = hash key of a 32 byte sample in the original input file
38  *      position = position of this sample in the file
39  * 2) compares both files byte by byte
40  * 3) when a difference is found, looks ahead using ufFndAhd to find the nearest
41  *    equal region between both files
42  * 4) output skip/delete/backtrace instructions to reach the found regions
43  * 5) repeat steps 2-4 each time the end of an equal region is reached.
44  *
45  * Method ufFndAhd looks ahead on the input files to find the nearest equals regions:
46  * - for every 32-byte sample in the new file, look in the hastable whether a
47  *   similar sample exists in the original file.
48  * - creates a table of matching regions using this catalog
49  * - returns the nearest match from the table of matches
50  *
51  * There are two reasons for creating a matching table on top of the hashtable:
52  * - The hashtable may only contain a (small) percentage of all samples, hence
53  *   the nearest equal region is not always detected first.
54  * - Samples are considered equal when their hash-keys are equal (in fact there's only
55  *   1/256 chance that samples are really equal of their hash-keys match).
56  *
57  * Method ufFndAhdGet gets one byte of a file and counts the number of subsequent
58  * equal bytes (because samples containing too many equal bytes are usually bad to
59  * compare with between files).
60  *
61  * Method ufFndhdScn scans the left file and creates the hash table.
62  *
63  * TODO: allow org and new files to be the same file
64  * TODO: allow sequential files as input
65  *
66  * Author                Version Date       Modification
67  * --------------------- ------- -------    -----------------------
68  * Joris Heirbaut        v0.0    10-06-2002 hashed compare
69  * Joris Heirbaut                14-06-2002 full compare
70  * Joris Heirbaut                17-06-2002 global positions
71  * Joris Heirbaut        v0.1    18-06-2002 first well-working runs!!!
72  * Joris Heirbaut                19-06-2002 compare in buffer before read position
73  * Joris Heirbaut        v0.1    20-06-2002 optimized esc-sequences & lengths
74  * Joris Heirbaut        v0.2    24-06-2002 running okay again
75  * Joris Heirbaut        v0.2b   01-07-2002 bugfix on length=252
76  * Joris Heirbaut        v0.2c   09-07-2002 bugfix on divide by zero in statistics
77  * Joris Heirbaut        v0.3a   09-07-2002 hashtable hint only on samplerate
78  * Joris Heirbaut          |     09-07-2002 exit code 1 if files are equal
79  * Joris Heirbaut          |     12-07-2002 bugfix using ufFabPos in function call
80  * Joris Heirbaut        v0.3a   16-07-2002 backtrack on original file
81  * Joris Heirbaut        v0.4a   19-07-2002 prescan sourcefile
82  * Joris Heirbaut          |     30-08-2002 bugfix in ufFabRst and ufFabPos
83  * Joris Heirbaut          |     03-08-2002 bugfix for backtrack before start-of-file
84  * Joris Heirbaut          |     09-09-2002 reimplemented filebuffer
85  * Joris Heirbaut        v0.4a   10-09-2002 take best of multiple possibilities
86  * Joris Heirbaut        v0.4b   11-09-2002 soft-reading from files
87  * Joris Heirbaut          |     18-09-2002 moved ufFabCmp from ufFndAhdChk to ufFndAhdAdd/Bst
88  * Joris Heirbaut          |     18-09-2002 ufFabOpt - optimize a found solution
89  * Joris Heirbaut          |     10-10-2002 added Fab->izPosEof to correctly handle EOF condition
90  * Joris Heirbaut        v0.4b   16-10-2002 replaces ufFabCmpBck and ufFabCmpFwd with ufFabFnd
91  * Joris Heirbaut        v0.4c   04-11-2002 use ufHshFnd after prescanning
92  * Joris Heirbaut          |     04-11-2002 no reset of matching table
93  * Joris Heirbaut          |     21-12-2002 rewrite of matching table logic
94  * Joris Heirbaut          |     24-12-2002 no compare in ufFndAhdAdd
95  * Joris Heirbaut          |     02-01-2003 restart finding matches at regular intervals when no matches are found
96  * Joris Heirbaut          |     09-01-2003 renamed ufFabBkt to ufFabSek, use it for DEL and BKT instructions
97  * Joris Heirbaut        v0.4c   23-01-2003 distinguish between EOF en EOB
98  * Joris Heirbaut        v0.5    27-02-2003 dropped "fast" hash method (it was slow!)
99  * Joris Heirbaut          |     22-05-2003 started    rewrite of FAB-abstraction
100  * Joris Heirbaut          |     30-06-2003 terminated rewrite ...
101  * Joris Heirbaut          |     08-07-2003 correction in ufMchBst (llTstNxt = *alBstNew + 1 iso -1)
102  * Joris Heirbaut        v0.5    02-09-2003 production
103  * Joris Heirbaut        v0.6    29-04-2005 large-file support
104  * Joris Heirbaut        v0.7    23-06-2009 differentiate between position 0 and notfound in ufMchBst
105  * Joris Heirbaut          |     23-06-2009 optimize collission strategy using sample quality
106  * Joris Heirbaut          |     24-06-2009 introduce quality of samples
107  * Joris Heirbaut          |     26-06-2009 protect first samples
108  * Joris Heirbaut        v0.7g   24-09-2009 use fseeko for cygwin largefiles
109  * Joris Heirbaut          |     24-09-2009 removed casts to int from ufFndAhd
110  * Joris Heirbaut        v0.7h   24-09-2009 faster ufFabGetNxt
111  * Joris Heirbaut          |     24-09-2009 faster ufHshAdd: remove quality of hashes
112  * Joris Heirbaut        v0.7i   25-09-2009 drop use of ufHshBitCnt
113  * Joris Heirbaut        v0.7l   04-10-2009 increment glMchMaxDst as hashtable overloading grows
114  * Joris Heirbaut          |     16-10-2009 finalization
115  * Joris Heirbaut        v0.7m   17-10-2009 gprof optimization ufHshAdd
116  * Joris Heirbaut        v0.7n   17-10-2009 gprof optimization ufFabGet
117  * Joris Heirbaut        v0.7o   18-10-2009 gprof optimization asFab->iiRedSze
118  * Joris Heirbaut        v0.7p   19-10-2009 ufHshAdd: check uniform distribution
119  * Joris Heirbaut        v0.7q   23-10-2009 ufFabGet: scroll back on buffer
120  * Joris Heirbaut          |     19-10-2009 ufFndAhd: liMax = giBufSze
121  * Joris Heirbaut          |     25-10-2009 ufMchAdd: gliding matches
122  * Joris Heirbaut          |     25-10-2009 ufOut: return true for faster EQL sequences in jdiff function
123  * Joris Heirbaut        v0.7r   27-10-2009 ufMchBst: test position for gliding matches
124  * Joris Heirbaut          |     27-10-2009 ufMchBst: remove double loop
125  * Joris Heirbaut          |     27-10-2009 ufMchBst: double linked list ordered on azPosOrg
126  * Joris Heirbaut          |     27-10-2009 ufFndAhd: look back on reset (liBck)
127  * Joris Heirbaut          |     27-10-2009 ufFndAhd: reduce lookahead after giMchMin (speed optimization)
128  * Joris Heirbaut        v0.7x   05-11-2009 ufMchAdd: hashed method
129  * Joris Heirbaut        v0.7y   13-11-2009 ufMchBst: store unmatched samples too (reduce use of ufFabFnd)
130  * Joris Heirbaut        v0.8    30-06-2011 C++ version
131  * Joris Heirbaut        v0.8a   08-07-2011 Optimize position 0
132  * Joris Heirbaut        v0.8b   02-09-2011 Switch order of ahead/backtrack/skip logic
133  * Joris Heirbaut        v0.8.1  30-11-2011 Revert to use of fread/fseek (MinGW ifstream.gseek does not correctly handle files >2GB)
134  * Joris Heirbaut        v0.8.1  30-11-2011 Throuw out exception handling for MinGW (trying to reduce exe size)
135  *
136  *******************************************************************************/
137 
138 #ifndef JDIFF_H_
139 #define JDIFF_H_
140 #include "JDefs.h"
141 #include "JFile.h"
142 #include "JHashPos.h"
143 #include "JMatchTable.h"
144 #include "JOut.h"
145 
146 namespace JojoDiff {
147 
148 /**
149  * Central JDiff and FindAhead routines for the JDiff algorithm.
150  */
151 class JDiff {
152 public:
153     /**
154      * Create JDiff for working on specified files.
155      * @param apFilOrg  Original file.
156      * @param apFilNew  New file.
157      * @param aiHshSze  Hastable max size in number of elements (default = 8388608)
158      * @param aiVerbse  Verbose level 0=no, 1=normal, 2=high (default = 0)
159      * @param abSrcBkt  Backtrace on sourcefile allowed? (default = yes)
160      * @param aiSrcScn  Prescan source file: 0=no, 1=yes (default = yes)
161      * @param aiMchMax  Maximum entries in matching table (default = 8)
162      * @param aiMchMin  Minimum entries in matching table (default = 4)
163      * @param aiAhdMax  Maximum bytes to find ahead (default = 256kB)
164      */
165     JDiff(JFile * const apFilOrg, JFile * const apFilNew, JOut * const apOut,
166         const int aiHshSze = 8388608, const
167         int aiVerbse=0,
168         const int abSrcBkt=true,
169         const int aiSrcScn=true,
170         const int aiMchMax=8,
171         const int aiMchMin=4,
172         const int aiAhdMax=256*1024,
173         const bool abCmpAll = true);
174 
175 	/**
176 	 * Destroys JDiff object.
177 	 */
178 	virtual ~JDiff();
179 
180 	/*******************************************************************************
181 	* Difference function
182 	*
183 	* Writes out the differences between the two files passed via the constructor.
184 	*
185 	* Throws a bad_alloc exception in case of memory error.
186 	* Throws an io_base::failure in case of i/o error.
187 	*
188 	* @return 0             ok
189 	* @return EXI_SEK       Error seeking file
190     * @return EXI_LRG  7    Error on 64-bit number
191     * @return EXI_RED  8    Error reading file
192     * @return EXI_WRI  9    Error writing file
193     * @return EXI_MEM  10   Error allocating memory
194     * @return EXI_ERR  20   Spurious error occured
195 	*******************************************************************************/
196 	int jdiff ();
197 
198 	/* getters */
getHsh()199 	JHashPos * getHsh(){return gpHsh;};
getHshErr()200 	int getHshErr(){return giHshErr;};
201 
202 private:
203 	/* Context */
204 	JFile * const mpFilOrg ;    // Original file to read
205 	JFile * const mpFilNew ;    // New file to read
206 	JOut  * const mpOut ;       // Output handler
207 	JHashPos * gpHsh ;          // Hashtable containing hashes from mpFilOrg.
208 	JMatchTable * gpMch ;       // Table of matches
209 
210 	/* Settings */
211 	const int miVerbse;     /* Vebosity level */
212 	const int mbSrcBkt;     /* Allow bactrace on original file? */
213 	const int miMchMax;     /* Max number of matches to find */
214 	const int miMchMin;     /* Min number oif matches to find */
215 	const int miAhdMax ;    /* Max number of bytes to look ahead */
216     const bool mbCmpAll ;   /* Compare all matches, even if data not in buffer? */
217     int  miSrcScn;          /* Prescan original file: 0=no, 1=yes, 2=done */
218 
219     /* State */
220 	off_t mzAhdOrg;        // Current ahead position on original file
221 	off_t mzAhdNew;        // Current ahead position on new file
222 	hkey mlHshOrg;         // Current hash value for original file
223 	hkey mlHshNew;         // Current hash value for new file
224 	int miValOrg;          // Current file value
225 	int miValNew;          // Current file value
226 	int miEqlOrg;          // Indicator for equal bytes in current sample
227 	int miEqlNew;          // Indicator for equal bytes in current sample
228 
229 	/**
230 	 * Flush pending output
231 	 */
232 	void ufPutEql(const off_t &lzPosOrg, const off_t &lzPosNew, off_t &lzEql, bool &lbEql) ;
233 
234 	/**
235 	 * Finds the nearest equal regions between the two files
236 	 *
237 	 * @param azRedOrg  in:  read position in original file to start looking from
238 	 * @param azRedNew  in:  read position in new file to start looking from
239 	 * @param azSkpOrg  out: number of bytes to skip (delete) in original file to reach equal region
240 	 * @param azSkpNew  out: number of bytes to skip (insert) in new file to reach equal region
241 	 * @param azAhd     out: number of bytes to go ahead on both files to reach equal regions
242 	 */
243 	int ufFndAhd (
244 	  off_t const &azRedOrg,        /* read position in original file                     */
245 	  off_t const &azRedNew,        /* read position in new file                          */
246 	  off_t &azSkpOrg,              /* number of bytes to skip (delete) in original file  */
247 	  off_t &azSkpNew,              /* number of bytes to skip (insert) in new file       */
248 	  off_t &azAhd                  /* number of bytes to go before similarity is reached */
249 	);
250 
251     /** Scans the original file and fills up the hashtable. */
252     int ufFndAhdScn () ;
253 
254     /** Hashes the next byte from specified file. */
255     void ufFndAhdGet(JFile *apFil, const off_t &azPos, int &aiVal, int &aiEql, int aiSft) ;
256 
257 public:
258     /*
259      * Statistics about operations
260      */
261     int giHshErr ;         /* Number of false hash hits                         */
262 }; // class JDiff
263 
264 } // namespace JojoDiff
265 #endif /* JDIFF_H_ */
266