1 /*
2 * JDiff.cpp
3 *
4 * Jojo's diff on binary files: main class.
5 *
6 * Copyright (C) 2002-2011 Joris Heirbaut
7 *
8 * This program is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program. If not, see <http://www.gnu.org/licenses/>.
20 *
21 * ----------------------------------------------------------------------------
22 *
23 * Class JDiff takes two JFiles as input and outputs to a JOut instance.
24 *
25 * If you want to reuse JojoDiff, you need following files:
26 * - JDiff.h/cpp The main JojoDiff class
27 * - JHashPos.h/cpp The hash table collection of (sample-key, position)
28 * - JMatchTable.h/cpp The matching table logic
29 * - JDefs.h Global definitions
30 * - JDebug.h/cpp Debugging definitions
31 * - JOut.h Abstract output class
32 * - JFile.h Abstract input class
33 * + at least one descendant of JOut and JFile (may be of your own).
34 *
35 * Method jdiff performs the actual diffing:
36 * 1) first, we create a hashtable collection of (hkey, position) pairs, where
37 * hkey = hash key of a 32 byte sample in the left input file
38 * position = position of this sample in the file
39 * 2) compares both files byte by byte
40 * 3) when a difference is found, looks ahead using ufFndAhd to find the nearest
41 * equal region between both files
42 * 4) output skip/delete/backtrace instructions to reach the found regions
43 * 5) repeat steps 2-4 each time the end of an equal region is reached.
44 *
45 * Method ufFndAhd looks ahead on the input files to find the nearest equals regions:
46 * - for every 32-byte sample in the right file, look in the hastable whether a
47 * similar sample exists in the left file.
48 * - creates a table of matching regions using this catalog
49 * - returns the nearest match from the table of matches
50 *
51 * There are two reasons for creating a matching table on top of the hashtable:
52 * - The hashtable may only contain a (small) percentage of all samples, hence
53 * the nearest equal region is not always detected first.
54 * - Samples are considered equal when their hash-keys are equal (in fact there's only
55 * 1/256 chance that samples are really equal of their hash-keys match).
56 *
57 * Method ufFndAhdGet gets one byte of a file and counts the number of subsequent
58 * equal bytes (because samples containing too many equal bytes are usually bad to
59 * compare with between files).
60 *
61 * Method ufFndhdScn scans the left file and creates the hash table.
62 *
63 * Author Version Date Modification
64 * --------------------- ------- ------- -----------------------
65 * Joris Heirbaut v0.0 10-06-2002 hashed compare
66 * Joris Heirbaut 14-06-2002 full compare
67 * Joris Heirbaut 17-06-2002 global positions
68 * Joris Heirbaut v0.1 18-06-2002 first well-working runs!!!
69 * Joris Heirbaut 19-06-2002 compare in buffer before read position
70 * Joris Heirbaut v0.1 20-06-2002 optimized esc-sequences & lengths
71 * Joris Heirbaut v0.2 24-06-2002 running okay again
72 * Joris Heirbaut v0.2b 01-07-2002 bugfix on length=252
73 * Joris Heirbaut v0.2c 09-07-2002 bugfix on divide by zero in statistics
74 * Joris Heirbaut v0.3a 09-07-2002 hashtable hint only on samplerate
75 * Joris Heirbaut | 09-07-2002 exit code 1 if files are equal
76 * Joris Heirbaut | 12-07-2002 bugfix using ufFabPos in function call
77 * Joris Heirbaut v0.3a 16-07-2002 backtrack on original file
78 * Joris Heirbaut v0.4a 19-07-2002 prescan sourcefile
79 * Joris Heirbaut | 30-08-2002 bugfix in ufFabRst and ufFabPos
80 * Joris Heirbaut | 03-08-2002 bugfix for backtrack before start-of-file
81 * Joris Heirbaut | 09-09-2002 reimplemented filebuffer
82 * Joris Heirbaut v0.4a 10-09-2002 take best of multiple possibilities
83 * Joris Heirbaut v0.4b 11-09-2002 soft-reading from files
84 * Joris Heirbaut | 18-09-2002 moved ufFabCmp from ufFndAhdChk to ufFndAhdAdd/Bst
85 * Joris Heirbaut | 18-09-2002 ufFabOpt - optimize a found solution
86 * Joris Heirbaut | 10-10-2002 added Fab->izPosEof to correctly handle EOF condition
87 * Joris Heirbaut v0.4b 16-10-2002 replaces ufFabCmpBck and ufFabCmpFwd with ufFabFnd
88 * Joris Heirbaut v0.4c 04-11-2002 use ufHshFnd after prescanning
89 * Joris Heirbaut | 04-11-2002 no reset of matching table
90 * Joris Heirbaut | 21-12-2002 rewrite of matching table logic
91 * Joris Heirbaut | 24-12-2002 no compare in ufFndAhdAdd
92 * Joris Heirbaut | 02-01-2003 restart finding matches at regular intervals when no matches are found
93 * Joris Heirbaut | 09-01-2003 renamed ufFabBkt to ufFabSek, use it for DEL and BKT instructions
94 * Joris Heirbaut v0.4c 23-01-2003 distinguish between EOF en EOB
95 * Joris Heirbaut v0.5 27-02-2003 dropped "fast" hash method (it was slow!)
96 * Joris Heirbaut | 22-05-2003 started rewrite of FAB-abstraction
97 * Joris Heirbaut | 30-06-2003 terminated rewrite ...
98 * Joris Heirbaut | 08-07-2003 correction in ufMchBst (llTstNxt = *alBstNew + 1 iso -1)
99 * Joris Heirbaut v0.5 02-09-2003 production
100 * Joris Heirbaut v0.6 29-04-2005 large-file support
101 * Joris Heirbaut v0.7 23-06-2009 differentiate between position 0 and notfound in ufMchBst
102 * Joris Heirbaut | 23-06-2009 optimize collission strategy using sample quality
103 * Joris Heirbaut | 24-06-2009 introduce quality of samples
104 * Joris Heirbaut | 26-06-2009 protect first samples
105 * Joris Heirbaut v0.7g 24-09-2009 use fseeko for cygwin largefiles
106 * Joris Heirbaut | 24-09-2009 removed casts to int from ufFndAhd
107 * Joris Heirbaut v0.7h 24-09-2009 faster ufFabGetNxt
108 * Joris Heirbaut | 24-09-2009 faster ufHshAdd: remove quality of hashes
109 * Joris Heirbaut v0.7i 25-09-2009 drop use of ufHshBitCnt
110 * Joris Heirbaut v0.7l 04-10-2009 increment glMchMaxDst as hashtable overloading grows
111 * Joris Heirbaut | 16-10-2009 finalization
112 * Joris Heirbaut v0.7m 17-10-2009 gprof optimization ufHshAdd
113 * Joris Heirbaut v0.7n 17-10-2009 gprof optimization ufFabGet
114 * Joris Heirbaut v0.7o 18-10-2009 gprof optimization asFab->iiRedSze
115 * Joris Heirbaut v0.7p 19-10-2009 ufHshAdd: check uniform distribution
116 * Joris Heirbaut v0.7q 23-10-2009 ufFabGet: scroll back on buffer
117 * Joris Heirbaut | 19-10-2009 ufFndAhd: liMax = giBufSze
118 * Joris Heirbaut | 25-10-2009 ufMchAdd: gliding matches
119 * Joris Heirbaut | 25-10-2009 ufOut: return true for faster EQL sequences in jdiff function
120 * Joris Heirbaut v0.7r 27-10-2009 ufMchBst: test position for gliding matches
121 * Joris Heirbaut | 27-10-2009 ufMchBst: remove double loop
122 * Joris Heirbaut | 27-10-2009 ufMchBst: double linked list ordered on azPosOrg
123 * Joris Heirbaut | 27-10-2009 ufFndAhd: look back on reset (liBck)
124 * Joris Heirbaut | 27-10-2009 ufFndAhd: reduce lookahead after giMchMin (speed optimization)
125 * Joris Heirbaut v0.7x 05-11-2009 ufMchAdd: hashed method
126 * Joris Heirbaut v0.7y 13-11-2009 ufMchBst: store unmatched samples too (reduce use of ufFabFnd)
127 * Joris Heirbaut v0.8 30-06-2011 C++ version
128 * Joris Heirbaut v0.8a 08-07-2011 Optimize position 0
129 * Joris Heirbaut v0.8b 02-09-2011 Switch order of ahead/backtrack/skip logic
130 * Joris Heirbaut v0.8.1 30-11-2011 Revert to use of fread/fseek (MinGW ifstream.gseek does not correctly handle files >2GB)
131 * Joris Heirbaut v0.8.1 30-11-2011 Throuw out exception handling for MinGW (trying to reduce exe size)
132 *
133 *******************************************************************************/
134 #include "JDefs.h"
135 #include "JDiff.h"
136 #include <limits.h>
137
138 #ifdef _FILE_OFFSET_BITS
139 #warning FILE OFFSET BITS set
140 #endif
141 #ifdef _LARGEFILE64_SOURCE
142 #warning _LARGEFILE64_SOURCE set
143 #endif
144
145 namespace JojoDiff {
146
147 /*
148 * Constructor
149 */
JDiff(JFile * const apFilOrg,JFile * const apFilNew,JOut * const apOut,const int aiHshSze,const int aiVerbse,const int abSrcBkt,const int aiSrcScn,const int aiMchMax,const int aiMchMin,const int aiAhdMax,const bool abCmpAll)150 JDiff::JDiff(
151 JFile * const apFilOrg, JFile * const apFilNew,
152 JOut * const apOut,
153 const int aiHshSze, const int aiVerbse,
154 const int abSrcBkt, const int aiSrcScn,
155 const int aiMchMax, const int aiMchMin,
156 const int aiAhdMax, const bool abCmpAll
157 ) : mpFilOrg(apFilOrg), mpFilNew(apFilNew), mpOut(apOut),
158 miVerbse(aiVerbse), mbSrcBkt(abSrcBkt),
159 miMchMax(aiMchMax), miMchMin(aiMchMin),
160 miAhdMax(aiAhdMax<1024?1024:aiAhdMax),
161 mbCmpAll(abCmpAll), miSrcScn(aiSrcScn),
162 mzAhdOrg(0), mzAhdNew(0), mlHshOrg(0), mlHshNew(0), giHshErr(0)
163 {
164 gpHsh = new JHashPos(aiHshSze) ;
165 gpMch = new JMatchTable(gpHsh, mpFilOrg, mpFilNew, abCmpAll);
166 }
167
168 /*
169 * Destructor
170 */
~JDiff()171 JDiff::~JDiff() {
172 delete gpHsh ;
173 delete gpMch ;
174 }
175
176 /*******************************************************************************
177 * Difference function
178 *
179 * Takes two files as arguments and writes out the differences
180 *
181 * Principle:
182 * Take one byte from each file and compare. If they are equal, then continue.
183 * If they are different, start lookahead to find next equal blocks within file.
184 * If equal blocks are found,
185 * - first insert or delete the specified number of bytes,
186 * - then continue reading on both files until equal blocks are reached,
187 *
188 *******************************************************************************/
jdiff()189 int JDiff::jdiff()
190 {
191 int lcOrg; /* byte from original file */
192 int lcNew; /* byte from new file */
193 off_t lzPosOrg = 0 ;
194 off_t lzPosNew = 0 ;
195
196 bool lbEql = false; /* accumulate equal bytes? */
197 off_t lzEql = 0; /* accumulated equal bytes */
198
199 bool lbFnd = false; /* offsets are pointing to a valid solution? */
200 off_t lzAhd=0;
201 off_t lzSkpOrg=0;
202 off_t lzSkpNew=0;
203
204 #if debug
205 int liErr=0 ; /* check for malfunction: 0=not checking, 1=checking, 2=error */
206 #endif
207
208 /* Take one byte from each file ... */
209 lcOrg = mpFilOrg->get(lzPosOrg, 0);
210 lcNew = mpFilNew->get(lzPosNew, 0);
211 while (lcNew >= 0) {
212 #if debug
213 if (JDebug::gbDbg[DBGPRG])
214 fprintf(JDebug::stddbg, "Input "P8zd"->%2x "P8zd"->%2x.\n",
215 lzPosOrg - 1, lcOrg, lzPosNew - 1, lcNew) ;
216 #endif
217
218 if (lcOrg == lcNew){
219 /* Output or count equals */
220 if (lbEql){
221 lzEql ++ ;
222 } else {
223 lbEql = mpOut->put(EQL, 1, lcOrg, lcNew, lzPosOrg, lzPosNew);
224 }
225
226 /* Take next byte from each file ... */
227 lcOrg = mpFilOrg->get(++ lzPosOrg, 0) ;
228 lcNew = mpFilNew->get(++ lzPosNew, 0) ;
229
230 /* decrease ahead counter */
231 lzAhd -- ;
232 } else if (lzAhd > 0) {
233 /* Output accumulated equals */
234 ufPutEql(lzPosOrg, lzPosNew, lzEql, lbEql);
235
236 /* Output difference */
237 if (lcOrg < 0) {
238 mpOut->put(INS, 1, lcOrg, lcNew, lzPosOrg, lzPosNew);
239
240 /* Take next byte from each file ... */
241 lcNew = mpFilNew->get(++ lzPosNew, 0) ;
242 } else {
243 mpOut->put(MOD, 1, lcOrg, lcNew, lzPosOrg, lzPosNew);
244
245 /* Take next byte from each file ... */
246 lcOrg = mpFilOrg->get(++ lzPosOrg, 0) ;
247 lcNew = mpFilNew->get(++ lzPosNew, 0) ;
248 }
249
250 /* decrease ahead counter */
251 lzAhd-- ;
252
253 } else if (lbFnd && lzAhd == 0) {
254 lzAhd = SMPSZE ; // to avoid infine loop (when ufFabFnd persists with lzSkpOrg, lzSkpNew, lzAhd all zero)
255 lbFnd = false ;
256 #if debug
257 liErr = 2 ; // ufFabFnd did not point to an equal region !
258 #endif
259 } else {
260 #if debug
261 /* Flush output buffer in debug */
262 if (JDebug::gbDbg[DBGAHD] || JDebug::gbDbg[DBGMCH]){
263 ufPutEql(lzPosOrg, lzPosNew, lzEql, lbEql);
264 mpOut->put(ESC, 0, 0, 0, lzPosOrg, lzPosNew);
265 }
266
267 /* If lbChk is unset, then an expected equal block has not been reached.
268 * In fast mode, when doing non-compared jumps, this may happen.
269 * In normal mode (mbCmpAll), this should never happen.
270 */
271 if (liErr==2 && (miVerbse>2 || mbCmpAll)){
272 fprintf(JDebug::stddbg, "Hash miss!\n");
273 giHshErr++ ;
274 }
275 liErr=0; // clear error state
276 #endif
277
278 /* Find a new equals-reqion */
279 lbFnd = ufFndAhd(lzPosOrg, lzPosNew, lzSkpOrg, lzSkpNew, lzAhd) ;
280 if (lbFnd < 0)
281 return lbFnd ;
282
283 #if debug
284 if (JDebug::gbDbg[DBGAHD])
285 fprintf(JDebug::stddbg, "Findahead on %"PRIzd" %"PRIzd" skip %"PRIzd" %"PRIzd" ahead %"PRIzd"\n",
286 lzPosOrg, lzPosNew, lzSkpOrg, lzSkpNew, lzAhd) ;
287 if (JDebug::gbDbg[DBGPRG])
288 fprintf(JDebug::stddbg, "Current position in new file= %"PRIzd"\n", lzPosNew) ;
289 #endif
290
291 /* Output accumulated equals */
292 ufPutEql(lzPosOrg, lzPosNew, lzEql, lbEql);
293
294 /* Execute offsets */
295 if (lzSkpOrg > 0) {
296 mpOut->put(DEL, lzSkpOrg, 0, 0, lzPosOrg, lzPosNew) ;
297 lzPosOrg += lzSkpOrg ;
298 lcOrg = mpFilOrg->get(lzPosOrg, 0);
299 } else if (lzSkpOrg < 0) {
300 mpOut->put(BKT, - lzSkpOrg, 0, 0, lzPosOrg, lzPosNew) ;
301 lzPosOrg += lzSkpOrg ;
302 lcOrg = mpFilOrg->get(lzPosOrg, 0);
303 }
304 if (lzSkpNew > 0) {
305 while (lzSkpNew > 0 && lcNew > EOF) {
306 mpOut->put(INS, 1, 0, lcNew, lzPosOrg, lzPosNew);
307 lzSkpNew-- ;
308 lcNew = mpFilNew->get(++ lzPosNew, 0);
309 }
310 }
311 } /* if lcOrg == lcNew */
312 } /* while lcNew >= 0 */
313
314 /* Flush output buffer */
315 ufPutEql(lzPosOrg, lzPosNew, lzEql, lbEql);
316 mpOut->put(ESC, 0, 0, 0, lzPosOrg, lzPosNew);
317
318 /* Return code */
319 if (lcNew < EOB || lcOrg < EOB){
320 return (lcNew < lcOrg)?lcNew:lcOrg;
321 }
322 return 0;
323 } /* jdiff */
324
325 /**
326 * Flush output
327 */
ufPutEql(const off_t & lzPosOrg,const off_t & lzPosNew,off_t & lzEql,bool & lbEql)328 void JDiff::ufPutEql(const off_t &lzPosOrg, const off_t &lzPosNew, off_t &lzEql, bool &lbEql){
329 /* Output accumulated equals */
330 if (lzEql > 0){
331 mpOut->put(EQL, lzEql, 0, 0, lzPosOrg - lzEql, lzPosNew - lzEql);
332 lzEql = 0;
333 }
334 lbEql=false;
335 }
336
337 /**
338 * @brief Find Ahead function
339 * Read ahead on both files until we possibly found an equal series of 32 bytes
340 * in both files. Then calculate the deplacement vector between two files:
341 * - positive if characters need to be inserted in the original file,
342 * - negative if characters need to be removed from the original file.
343 * @param azRedOrg read position in left file
344 * @param azRedNew read position in right file
345 * @param azSkpOrg out: number of bytes to skip (delete) in left file
346 * @param azSkpNew out: number of bytes to skip (insert) in right file
347 * @param azAhd out: number of bytes to skip on bith files before similarity is reached
348 * @return 0 no solution found
349 * @return 1 solution found
350 * @return < 0 error: see EXI-codes
351 */
ufFndAhd(off_t const & azRedOrg,off_t const & azRedNew,off_t & azSkpOrg,off_t & azSkpNew,off_t & azAhd)352 int JDiff::ufFndAhd (
353 off_t const &azRedOrg,
354 off_t const &azRedNew,
355 off_t &azSkpOrg,
356 off_t &azSkpNew,
357 off_t &azAhd
358 )
359 { off_t lzFndOrg=0; /* Found position within original file */
360 off_t lzFndNew=0; /* Found position within new file */
361 off_t lzBseOrg; /* Base position on original file: gbSrcBkt?0:alRedOrg */
362
363 int liIdx; /* Index for initializing */
364 int liFnd=0; /* Number of matches found */
365 int liSft; /* 1 = hard look-ahead, 2 = soft look-ahead */
366
367 /* Start with hard lookahead, till we've found at least one match */
368 liSft = 1 ;
369
370 /* Prescan the original file? */
371 if (miSrcScn == 1) {
372 int liRet = ufFndAhdScn() ;
373 if (liRet < 0) return liRet ;
374 miSrcScn = 2 ;
375 }
376
377 /*
378 * How many bytes to look ahead ?
379 */
380 int liMax; /* Max number of bytes to read */
381 if (miSrcScn == 2){
382 if (mzAhdNew == 0 || mzAhdNew < azRedNew) {
383 liMax = miAhdMax ;
384 } else if (mzAhdNew > azRedNew + miAhdMax) {
385 liMax = miAhdMax ;
386 } else {
387 liMax = miAhdMax - (mzAhdNew - azRedNew) ;
388 }
389 } else {
390 liMax = INT_MAX / 2 ;
391 }
392
393 /*
394 * How many bytes to look back on reset ?
395 */
396 int liBck; /* Number of bytes to look back */
397 if (gpHsh->get_reliability() < miAhdMax)
398 liBck = gpHsh->get_reliability() / 2 ;
399 else
400 liBck = miAhdMax / 2 ;
401
402 /*
403 * Re-Initialize hash function (read 31 bytes) if
404 * - ahead position has been reset, or
405 * - read position has passed the ahead position
406 */
407 if (miSrcScn == 0 && (mzAhdOrg == 0 || mzAhdOrg + liBck < azRedOrg)) {
408 mzAhdOrg = azRedOrg - liBck ;
409 if (mzAhdOrg < 0) mzAhdOrg = 0 ;
410 miEqlOrg = 0 ;
411 mlHshOrg = 0 ;
412
413 miEqlOrg = 0 ;
414 miValOrg = mpFilOrg->get(mzAhdOrg, liSft) ;
415 for (liIdx=0;(liIdx < SMPSZE - 1) && (miValOrg > EOF); liIdx++){
416 gpHsh->hash(miValOrg, mlHshOrg) ;
417 ufFndAhdGet(mpFilOrg, ++ mzAhdOrg, miValOrg, miEqlOrg, liSft) ;
418 }
419 }
420 if (mzAhdNew == 0 || mzAhdNew + liBck < azRedNew) {
421 mzAhdNew = azRedNew - liBck ;
422 if (mzAhdNew < 0) mzAhdNew = 0 ;
423 miEqlNew = 0 ;
424 mlHshNew = 0 ;
425 liMax += liBck ;
426
427 miEqlNew = 0 ;
428 miValNew = mpFilNew->get(mzAhdNew, liSft) ;
429 liMax -- ;
430 for (liIdx=0;(liIdx < SMPSZE - 1) && (miValNew > EOF); liIdx++){
431 gpHsh->hash(miValNew, mlHshNew) ;
432 ufFndAhdGet(mpFilNew, ++ mzAhdNew, miValNew, miEqlNew, liSft) ;
433 liMax -- ;
434 }
435 }
436
437 /*
438 * Build the table of matches
439 */
440 if (gpMch->cleanup(azRedNew - gpHsh->get_reliability())){
441 /* Do not backtrace before lzBseOrg */
442 lzBseOrg = (mbSrcBkt?0:azRedOrg) ;
443
444 /* Do not read from original file if it has been prescanned */
445 if (miSrcScn > 0) miValOrg = EOB ;
446
447 /* Scroll through both files until an equal hash value has been found */
448 while ((liMax > 0) && ((miValNew > EOF ) || (miValOrg > EOF))) {
449 /* insert original file's value into hashtable (if no prescanning has been done) */
450 if (miValOrg > EOF){
451 /* hash the new value and add to hashtable */
452 gpHsh->hash(miValOrg, mlHshOrg) ;
453 gpHsh->add(mlHshOrg, mzAhdOrg, miEqlOrg) ;
454
455 #if debug
456 if (JDebug::gbDbg[DBGAHH])
457 fprintf(JDebug::stddbg, "ufHshAdd(%2x -> %8"PRIhkey", "P8zd", "P8zd")\n",
458 miValOrg, mlHshOrg, mzAhdOrg, lzBseOrg);
459 #endif
460
461 /* get next value from file */
462 ufFndAhdGet(mpFilOrg, ++ mzAhdOrg, miValOrg, miEqlOrg, liSft) ;
463 }
464
465 /* check new file against original file */
466 if (miValNew > EOF){
467 /* hash the new value and lookup in hashtable */
468 gpHsh->hash(miValNew, mlHshNew) ;
469 if (gpHsh->get(mlHshNew, lzFndOrg)) {
470 /* add found position into table of matches */
471 if (lzFndOrg > lzBseOrg) {
472 /* add solution to the table of matches */
473 switch (gpMch->add(lzFndOrg, mzAhdNew, azRedNew, miEqlNew))
474 { case 0: /* table is full */
475 if (liBck > 0 && gpMch->cleanup(azRedNew)){
476 // made more room
477 } else {
478 liMax = 0 ; // stop lookahead
479 continue;
480 }
481 case 1: /* alternative added */
482 if (mzAhdNew > azRedNew) {
483 liFnd ++ ;
484
485 if (liFnd == miMchMax) {
486 liMax = 0 ; // stop lookahead
487 continue;
488 } else if ((liFnd == miMchMin) && (liMax > gpHsh->get_reliability())) {
489 liMax = gpHsh->get_reliability() ; // reduce lookahead
490 }
491 }
492 break ;
493 case 2: ; /* alternative collided */
494 case -1: ;/* compare failed */
495 }
496 }
497 }
498
499 /* get next value from file */
500 ufFndAhdGet(mpFilNew, ++ mzAhdNew, miValNew, miEqlNew, liSft) ;
501 liMax -- ;
502 } /* if siValNew > EOF */
503 } /* while */
504 } /* if ufMchFre(..) */
505
506 /*
507 * Check for errors
508 */
509 if (miValNew < EOB || miValOrg < EOB){
510 return (miValNew < miValOrg) ? miValNew : miValOrg;
511 }
512
513 /*
514 * Get the best match and calculate the offsets
515 */
516 if (! gpMch->get(azRedOrg, azRedNew, /* out */ lzFndOrg, lzFndNew))
517 { azSkpOrg = 0 ;
518 azSkpNew = 0 ;
519 azAhd = (mzAhdNew - azRedNew) - gpHsh->get_reliability() ;
520 if (azAhd < SMPSZE) azAhd = SMPSZE ;
521 return 0 ;
522 } else {
523 if (lzFndOrg >= azRedOrg)
524 { if (lzFndOrg - azRedOrg >= lzFndNew - azRedNew)
525 { /* go forward on original file */
526 azSkpOrg = lzFndOrg - azRedOrg + azRedNew - lzFndNew ;
527 azSkpNew = 0 ;
528 azAhd = lzFndNew - azRedNew ;
529 } else {
530 /* go forward on new file */
531 azSkpOrg = 0;
532 azSkpNew = lzFndNew - azRedNew + azRedOrg - lzFndOrg ;
533 azAhd = lzFndOrg - azRedOrg ;
534 }
535 } else {
536 /* backtrack on original file */
537 azSkpOrg = azRedOrg - lzFndOrg + lzFndNew - azRedNew ;
538 if (azSkpOrg < azRedOrg)
539 { azSkpNew = 0 ;
540 azSkpOrg = - azSkpOrg ;
541 azAhd = lzFndNew - azRedNew ;
542 }
543 else /* do not bactrace before beginning of file */
544 { azSkpNew = azSkpOrg - azRedOrg ;
545 azSkpOrg = - azRedOrg ;
546 azAhd = (lzFndNew - azRedNew) - azSkpNew ;
547 }
548
549 /* reset ahead position when backtracking */
550 mzAhdOrg = 0 ; // TODO reset matching table too?
551 }
552
553 return 1 ;
554 }
555 }
556
557 /* -----------------------------------------------------------------------------
558 * Auxiliary function:
559 * Get next character from file (lookahead) and count number of equal chars
560 * in current cluster
561 * ---------------------------------------------------------------------------*/
562 /**
563 * @brief Get next character from file (lookahead) and count number of equal chars
564 * in current sample.
565 * @param apFil File to read
566 * @param azPos Position to read (will be incremented by one)
567 * @param aiVal in: previous byte read, out: new byte read
568 * @param aiEql Incremented by one if previous == new byte
569 * @param aiSft Soft or hard read-ahead (see JFile.get)
570 */
ufFndAhdGet(JFile * apFil,const off_t & azPos,int & acVal,int & aiEql,int aiSft)571 void JDiff::ufFndAhdGet(JFile *apFil, const off_t &azPos, int &acVal, int &aiEql, int aiSft)
572 {
573 int lcPrv = acVal ;
574 acVal = apFil->get(azPos, aiSft) ;
575 if (acVal != lcPrv) {
576 if (aiEql > 0) aiEql -= 2 ;
577 } else {
578 if (aiEql < SMPSZE) aiEql += 1 ;
579 }
580 }
581
582 /**
583 * Prescan the original file: calculates a hash-key for every 32-byte sample
584 * in the left file and stores them with their position in a hash-table.
585 */
ufFndAhdScn()586 int JDiff::ufFndAhdScn ()
587 {
588 hkey lkHshOrg=0; // Current hash value for original file
589 int liEqlOrg=0; // Number of times current value occurs in hash value
590 int lcValOrg; // Current file value
591 off_t lzPosOrg=0; // Position within original file
592
593 int liIdx ;
594
595 if (miVerbse > 0) {
596 fprintf(JDebug::stddbg, "Prescanning:\n");
597 }
598
599 /* Initialize hash function */
600 lcValOrg = mpFilOrg->get(lzPosOrg, 1) ;
601 for (liIdx=0;(liIdx < SMPSZE - 1) && (lcValOrg > EOF); liIdx++) {
602 gpHsh->hash(lcValOrg, lkHshOrg) ;
603 ufFndAhdGet(mpFilOrg, ++ lzPosOrg, lcValOrg, liEqlOrg, 1) ;
604 }
605
606 /* Build hashtable */
607 liIdx = 0;
608 while (lcValOrg > EOF) {
609 gpHsh->hash(lcValOrg, lkHshOrg) ;
610 gpHsh->add(lkHshOrg, lzPosOrg, liEqlOrg) ;
611 #if debug
612 if (JDebug::gbDbg[DBGAHH])
613 fprintf(JDebug::stddbg, "ufHshAdd(%2x -> %8"PRIhkey", "P8zd", %8d)\n",
614 lcValOrg, lkHshOrg, lzPosOrg, 0);
615 #endif
616
617 ufFndAhdGet(mpFilOrg, ++ lzPosOrg, lcValOrg, liEqlOrg, 1) ;
618
619 if (miVerbse > 0) {
620 /* output a dot every 16MB */
621 liIdx ++ ;
622 if ((liIdx & 0xffffff) == 0) {
623 if (liIdx == 0x40000000) {
624 liIdx = 0 ;
625 fprintf(JDebug::stddbg, ".\n"); /* output a newline every 1024MB */
626 } else {
627 fprintf(JDebug::stddbg, "."); /* output a dot every 16 MB */
628 }
629 }
630 }
631 }
632
633 if (miVerbse > 0) fprintf(JDebug::stddbg, ".\n");
634
635 #if debug
636 if (JDebug::gbDbg[DBGDST])
637 gpHsh->dist(lzPosOrg, 128);
638 #endif
639
640 if (lcValOrg < EOB)
641 return lcValOrg ;
642 else
643 return 0 ;
644 } /* ufFndAhdScn */
645 } /* namespace */
646