1 /* 2 * JDiff.cpp 3 * 4 * Jojo's diff on binary files: main class. 5 * 6 * Copyright (C) 2002-2011 Joris Heirbaut 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 * You should have received a copy of the GNU General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 * 21 * ---------------------------------------------------------------------------- 22 * 23 * Class JDiff takes two JFiles as input and outputs to a JOut instance. 24 * 25 * If you want to reuse JojoDiff, you need following files: 26 * - JDiff.h/cpp The main JojoDiff class 27 * - JHashPos.h/cpp The hash table collection of (sample-key, position) 28 * - JMatchTable.h/cpp The matching table logic 29 * - JDefs.h Global definitions 30 * - JDebug.h/cpp Debugging definitions 31 * - JOut.h Abstract output class 32 * - JFile.h Abstract input class 33 * + at least one descendant of JOut and JFile (may be of your own). 34 * 35 * Method jdiff performs the actual diffing: 36 * 1) first, we create a hashtable collection of (hkey, position) pairs, where 37 * hkey = hash key of a 32 byte sample in the original input file 38 * position = position of this sample in the file 39 * 2) compares both files byte by byte 40 * 3) when a difference is found, looks ahead using ufFndAhd to find the nearest 41 * equal region between both files 42 * 4) output skip/delete/backtrace instructions to reach the found regions 43 * 5) repeat steps 2-4 each time the end of an equal region is reached. 44 * 45 * Method ufFndAhd looks ahead on the input files to find the nearest equals regions: 46 * - for every 32-byte sample in the new file, look in the hastable whether a 47 * similar sample exists in the original file. 48 * - creates a table of matching regions using this catalog 49 * - returns the nearest match from the table of matches 50 * 51 * There are two reasons for creating a matching table on top of the hashtable: 52 * - The hashtable may only contain a (small) percentage of all samples, hence 53 * the nearest equal region is not always detected first. 54 * - Samples are considered equal when their hash-keys are equal (in fact there's only 55 * 1/256 chance that samples are really equal of their hash-keys match). 56 * 57 * Method ufFndAhdGet gets one byte of a file and counts the number of subsequent 58 * equal bytes (because samples containing too many equal bytes are usually bad to 59 * compare with between files). 60 * 61 * Method ufFndhdScn scans the left file and creates the hash table. 62 * 63 * TODO: allow org and new files to be the same file 64 * TODO: allow sequential files as input 65 * 66 * Author Version Date Modification 67 * --------------------- ------- ------- ----------------------- 68 * Joris Heirbaut v0.0 10-06-2002 hashed compare 69 * Joris Heirbaut 14-06-2002 full compare 70 * Joris Heirbaut 17-06-2002 global positions 71 * Joris Heirbaut v0.1 18-06-2002 first well-working runs!!! 72 * Joris Heirbaut 19-06-2002 compare in buffer before read position 73 * Joris Heirbaut v0.1 20-06-2002 optimized esc-sequences & lengths 74 * Joris Heirbaut v0.2 24-06-2002 running okay again 75 * Joris Heirbaut v0.2b 01-07-2002 bugfix on length=252 76 * Joris Heirbaut v0.2c 09-07-2002 bugfix on divide by zero in statistics 77 * Joris Heirbaut v0.3a 09-07-2002 hashtable hint only on samplerate 78 * Joris Heirbaut | 09-07-2002 exit code 1 if files are equal 79 * Joris Heirbaut | 12-07-2002 bugfix using ufFabPos in function call 80 * Joris Heirbaut v0.3a 16-07-2002 backtrack on original file 81 * Joris Heirbaut v0.4a 19-07-2002 prescan sourcefile 82 * Joris Heirbaut | 30-08-2002 bugfix in ufFabRst and ufFabPos 83 * Joris Heirbaut | 03-08-2002 bugfix for backtrack before start-of-file 84 * Joris Heirbaut | 09-09-2002 reimplemented filebuffer 85 * Joris Heirbaut v0.4a 10-09-2002 take best of multiple possibilities 86 * Joris Heirbaut v0.4b 11-09-2002 soft-reading from files 87 * Joris Heirbaut | 18-09-2002 moved ufFabCmp from ufFndAhdChk to ufFndAhdAdd/Bst 88 * Joris Heirbaut | 18-09-2002 ufFabOpt - optimize a found solution 89 * Joris Heirbaut | 10-10-2002 added Fab->izPosEof to correctly handle EOF condition 90 * Joris Heirbaut v0.4b 16-10-2002 replaces ufFabCmpBck and ufFabCmpFwd with ufFabFnd 91 * Joris Heirbaut v0.4c 04-11-2002 use ufHshFnd after prescanning 92 * Joris Heirbaut | 04-11-2002 no reset of matching table 93 * Joris Heirbaut | 21-12-2002 rewrite of matching table logic 94 * Joris Heirbaut | 24-12-2002 no compare in ufFndAhdAdd 95 * Joris Heirbaut | 02-01-2003 restart finding matches at regular intervals when no matches are found 96 * Joris Heirbaut | 09-01-2003 renamed ufFabBkt to ufFabSek, use it for DEL and BKT instructions 97 * Joris Heirbaut v0.4c 23-01-2003 distinguish between EOF en EOB 98 * Joris Heirbaut v0.5 27-02-2003 dropped "fast" hash method (it was slow!) 99 * Joris Heirbaut | 22-05-2003 started rewrite of FAB-abstraction 100 * Joris Heirbaut | 30-06-2003 terminated rewrite ... 101 * Joris Heirbaut | 08-07-2003 correction in ufMchBst (llTstNxt = *alBstNew + 1 iso -1) 102 * Joris Heirbaut v0.5 02-09-2003 production 103 * Joris Heirbaut v0.6 29-04-2005 large-file support 104 * Joris Heirbaut v0.7 23-06-2009 differentiate between position 0 and notfound in ufMchBst 105 * Joris Heirbaut | 23-06-2009 optimize collission strategy using sample quality 106 * Joris Heirbaut | 24-06-2009 introduce quality of samples 107 * Joris Heirbaut | 26-06-2009 protect first samples 108 * Joris Heirbaut v0.7g 24-09-2009 use fseeko for cygwin largefiles 109 * Joris Heirbaut | 24-09-2009 removed casts to int from ufFndAhd 110 * Joris Heirbaut v0.7h 24-09-2009 faster ufFabGetNxt 111 * Joris Heirbaut | 24-09-2009 faster ufHshAdd: remove quality of hashes 112 * Joris Heirbaut v0.7i 25-09-2009 drop use of ufHshBitCnt 113 * Joris Heirbaut v0.7l 04-10-2009 increment glMchMaxDst as hashtable overloading grows 114 * Joris Heirbaut | 16-10-2009 finalization 115 * Joris Heirbaut v0.7m 17-10-2009 gprof optimization ufHshAdd 116 * Joris Heirbaut v0.7n 17-10-2009 gprof optimization ufFabGet 117 * Joris Heirbaut v0.7o 18-10-2009 gprof optimization asFab->iiRedSze 118 * Joris Heirbaut v0.7p 19-10-2009 ufHshAdd: check uniform distribution 119 * Joris Heirbaut v0.7q 23-10-2009 ufFabGet: scroll back on buffer 120 * Joris Heirbaut | 19-10-2009 ufFndAhd: liMax = giBufSze 121 * Joris Heirbaut | 25-10-2009 ufMchAdd: gliding matches 122 * Joris Heirbaut | 25-10-2009 ufOut: return true for faster EQL sequences in jdiff function 123 * Joris Heirbaut v0.7r 27-10-2009 ufMchBst: test position for gliding matches 124 * Joris Heirbaut | 27-10-2009 ufMchBst: remove double loop 125 * Joris Heirbaut | 27-10-2009 ufMchBst: double linked list ordered on azPosOrg 126 * Joris Heirbaut | 27-10-2009 ufFndAhd: look back on reset (liBck) 127 * Joris Heirbaut | 27-10-2009 ufFndAhd: reduce lookahead after giMchMin (speed optimization) 128 * Joris Heirbaut v0.7x 05-11-2009 ufMchAdd: hashed method 129 * Joris Heirbaut v0.7y 13-11-2009 ufMchBst: store unmatched samples too (reduce use of ufFabFnd) 130 * Joris Heirbaut v0.8 30-06-2011 C++ version 131 * Joris Heirbaut v0.8a 08-07-2011 Optimize position 0 132 * Joris Heirbaut v0.8b 02-09-2011 Switch order of ahead/backtrack/skip logic 133 * Joris Heirbaut v0.8.1 30-11-2011 Revert to use of fread/fseek (MinGW ifstream.gseek does not correctly handle files >2GB) 134 * Joris Heirbaut v0.8.1 30-11-2011 Throuw out exception handling for MinGW (trying to reduce exe size) 135 * 136 *******************************************************************************/ 137 138 #ifndef JDIFF_H_ 139 #define JDIFF_H_ 140 #include "JDefs.h" 141 #include "JFile.h" 142 #include "JHashPos.h" 143 #include "JMatchTable.h" 144 #include "JOut.h" 145 146 namespace JojoDiff { 147 148 /** 149 * Central JDiff and FindAhead routines for the JDiff algorithm. 150 */ 151 class JDiff { 152 public: 153 /** 154 * Create JDiff for working on specified files. 155 * @param apFilOrg Original file. 156 * @param apFilNew New file. 157 * @param aiHshSze Hastable max size in number of elements (default = 8388608) 158 * @param aiVerbse Verbose level 0=no, 1=normal, 2=high (default = 0) 159 * @param abSrcBkt Backtrace on sourcefile allowed? (default = yes) 160 * @param aiSrcScn Prescan source file: 0=no, 1=yes (default = yes) 161 * @param aiMchMax Maximum entries in matching table (default = 8) 162 * @param aiMchMin Minimum entries in matching table (default = 4) 163 * @param aiAhdMax Maximum bytes to find ahead (default = 256kB) 164 */ 165 JDiff(JFile * const apFilOrg, JFile * const apFilNew, JOut * const apOut, 166 const int aiHshSze = 8388608, const 167 int aiVerbse=0, 168 const int abSrcBkt=true, 169 const int aiSrcScn=true, 170 const int aiMchMax=8, 171 const int aiMchMin=4, 172 const int aiAhdMax=256*1024, 173 const bool abCmpAll = true); 174 175 /** 176 * Destroys JDiff object. 177 */ 178 virtual ~JDiff(); 179 180 /******************************************************************************* 181 * Difference function 182 * 183 * Writes out the differences between the two files passed via the constructor. 184 * 185 * Throws a bad_alloc exception in case of memory error. 186 * Throws an io_base::failure in case of i/o error. 187 * 188 * @return 0 ok 189 * @return EXI_SEK Error seeking file 190 * @return EXI_LRG 7 Error on 64-bit number 191 * @return EXI_RED 8 Error reading file 192 * @return EXI_WRI 9 Error writing file 193 * @return EXI_MEM 10 Error allocating memory 194 * @return EXI_ERR 20 Spurious error occured 195 *******************************************************************************/ 196 int jdiff (); 197 198 /* getters */ getHsh()199 JHashPos * getHsh(){return gpHsh;}; getHshErr()200 int getHshErr(){return giHshErr;}; 201 202 private: 203 /* Context */ 204 JFile * const mpFilOrg ; // Original file to read 205 JFile * const mpFilNew ; // New file to read 206 JOut * const mpOut ; // Output handler 207 JHashPos * gpHsh ; // Hashtable containing hashes from mpFilOrg. 208 JMatchTable * gpMch ; // Table of matches 209 210 /* Settings */ 211 const int miVerbse; /* Vebosity level */ 212 const int mbSrcBkt; /* Allow bactrace on original file? */ 213 const int miMchMax; /* Max number of matches to find */ 214 const int miMchMin; /* Min number oif matches to find */ 215 const int miAhdMax ; /* Max number of bytes to look ahead */ 216 const bool mbCmpAll ; /* Compare all matches, even if data not in buffer? */ 217 int miSrcScn; /* Prescan original file: 0=no, 1=yes, 2=done */ 218 219 /* State */ 220 off_t mzAhdOrg; // Current ahead position on original file 221 off_t mzAhdNew; // Current ahead position on new file 222 hkey mlHshOrg; // Current hash value for original file 223 hkey mlHshNew; // Current hash value for new file 224 int miValOrg; // Current file value 225 int miValNew; // Current file value 226 int miEqlOrg; // Indicator for equal bytes in current sample 227 int miEqlNew; // Indicator for equal bytes in current sample 228 229 /** 230 * Flush pending output 231 */ 232 void ufPutEql(const off_t &lzPosOrg, const off_t &lzPosNew, off_t &lzEql, bool &lbEql) ; 233 234 /** 235 * Finds the nearest equal regions between the two files 236 * 237 * @param azRedOrg in: read position in original file to start looking from 238 * @param azRedNew in: read position in new file to start looking from 239 * @param azSkpOrg out: number of bytes to skip (delete) in original file to reach equal region 240 * @param azSkpNew out: number of bytes to skip (insert) in new file to reach equal region 241 * @param azAhd out: number of bytes to go ahead on both files to reach equal regions 242 */ 243 int ufFndAhd ( 244 off_t const &azRedOrg, /* read position in original file */ 245 off_t const &azRedNew, /* read position in new file */ 246 off_t &azSkpOrg, /* number of bytes to skip (delete) in original file */ 247 off_t &azSkpNew, /* number of bytes to skip (insert) in new file */ 248 off_t &azAhd /* number of bytes to go before similarity is reached */ 249 ); 250 251 /** Scans the original file and fills up the hashtable. */ 252 int ufFndAhdScn () ; 253 254 /** Hashes the next byte from specified file. */ 255 void ufFndAhdGet(JFile *apFil, const off_t &azPos, int &aiVal, int &aiEql, int aiSft) ; 256 257 public: 258 /* 259 * Statistics about operations 260 */ 261 int giHshErr ; /* Number of false hash hits */ 262 }; // class JDiff 263 264 } // namespace JojoDiff 265 #endif /* JDIFF_H_ */ 266