1 /*
2  * JDiff.cpp
3  *
4  * Jojo's diff on binary files: main class.
5  *
6  * Copyright (C) 2002-2011 Joris Heirbaut
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
20  *
21  * ----------------------------------------------------------------------------
22  *
23  * Class JDiff takes two JFiles as input and outputs to a JOut instance.
24  *
25  * If you want to reuse JojoDiff, you need following files:
26  * - JDiff.h/cpp        The main JojoDiff class
27  * - JHashPos.h/cpp     The hash table collection of (sample-key, position)
28  * - JMatchTable.h/cpp  The matching table logic
29  * - JDefs.h            Global definitions
30  * - JDebug.h/cpp       Debugging definitions
31  * - JOut.h             Abstract output class
32  * - JFile.h            Abstract input class
33  * + at least one descendant of JOut and JFile (may be of your own).
34  *
35  * Method jdiff performs the actual diffing:
36  * 1) first, we create a hashtable collection of (hkey, position) pairs, where
37  *      hkey = hash key of a 32 byte sample in the left input file
38  *      position = position of this sample in the file
39  * 2) compares both files byte by byte
40  * 3) when a difference is found, looks ahead using ufFndAhd to find the nearest
41  *    equal region between both files
42  * 4) output skip/delete/backtrace instructions to reach the found regions
43  * 5) repeat steps 2-4 each time the end of an equal region is reached.
44  *
45  * Method ufFndAhd looks ahead on the input files to find the nearest equals regions:
46  * - for every 32-byte sample in the right file, look in the hastable whether a
47  *   similar sample exists in the left file.
48  * - creates a table of matching regions using this catalog
49  * - returns the nearest match from the table of matches
50  *
51  * There are two reasons for creating a matching table on top of the hashtable:
52  * - The hashtable may only contain a (small) percentage of all samples, hence
53  *   the nearest equal region is not always detected first.
54  * - Samples are considered equal when their hash-keys are equal (in fact there's only
55  *   1/256 chance that samples are really equal of their hash-keys match).
56  *
57  * Method ufFndAhdGet gets one byte of a file and counts the number of subsequent
58  * equal bytes (because samples containing too many equal bytes are usually bad to
59  * compare with between files).
60  *
61  * Method ufFndhdScn scans the left file and creates the hash table.
62  *
63  * Author                Version Date       Modification
64  * --------------------- ------- -------    -----------------------
65  * Joris Heirbaut        v0.0    10-06-2002 hashed compare
66  * Joris Heirbaut                14-06-2002 full compare
67  * Joris Heirbaut                17-06-2002 global positions
68  * Joris Heirbaut        v0.1    18-06-2002 first well-working runs!!!
69  * Joris Heirbaut                19-06-2002 compare in buffer before read position
70  * Joris Heirbaut        v0.1    20-06-2002 optimized esc-sequences & lengths
71  * Joris Heirbaut        v0.2    24-06-2002 running okay again
72  * Joris Heirbaut        v0.2b   01-07-2002 bugfix on length=252
73  * Joris Heirbaut        v0.2c   09-07-2002 bugfix on divide by zero in statistics
74  * Joris Heirbaut        v0.3a   09-07-2002 hashtable hint only on samplerate
75  * Joris Heirbaut          |     09-07-2002 exit code 1 if files are equal
76  * Joris Heirbaut          |     12-07-2002 bugfix using ufFabPos in function call
77  * Joris Heirbaut        v0.3a   16-07-2002 backtrack on original file
78  * Joris Heirbaut        v0.4a   19-07-2002 prescan sourcefile
79  * Joris Heirbaut          |     30-08-2002 bugfix in ufFabRst and ufFabPos
80  * Joris Heirbaut          |     03-08-2002 bugfix for backtrack before start-of-file
81  * Joris Heirbaut          |     09-09-2002 reimplemented filebuffer
82  * Joris Heirbaut        v0.4a   10-09-2002 take best of multiple possibilities
83  * Joris Heirbaut        v0.4b   11-09-2002 soft-reading from files
84  * Joris Heirbaut          |     18-09-2002 moved ufFabCmp from ufFndAhdChk to ufFndAhdAdd/Bst
85  * Joris Heirbaut          |     18-09-2002 ufFabOpt - optimize a found solution
86  * Joris Heirbaut          |     10-10-2002 added Fab->izPosEof to correctly handle EOF condition
87  * Joris Heirbaut        v0.4b   16-10-2002 replaces ufFabCmpBck and ufFabCmpFwd with ufFabFnd
88  * Joris Heirbaut        v0.4c   04-11-2002 use ufHshFnd after prescanning
89  * Joris Heirbaut          |     04-11-2002 no reset of matching table
90  * Joris Heirbaut          |     21-12-2002 rewrite of matching table logic
91  * Joris Heirbaut          |     24-12-2002 no compare in ufFndAhdAdd
92  * Joris Heirbaut          |     02-01-2003 restart finding matches at regular intervals when no matches are found
93  * Joris Heirbaut          |     09-01-2003 renamed ufFabBkt to ufFabSek, use it for DEL and BKT instructions
94  * Joris Heirbaut        v0.4c   23-01-2003 distinguish between EOF en EOB
95  * Joris Heirbaut        v0.5    27-02-2003 dropped "fast" hash method (it was slow!)
96  * Joris Heirbaut          |     22-05-2003 started    rewrite of FAB-abstraction
97  * Joris Heirbaut          |     30-06-2003 terminated rewrite ...
98  * Joris Heirbaut          |     08-07-2003 correction in ufMchBst (llTstNxt = *alBstNew + 1 iso -1)
99  * Joris Heirbaut        v0.5    02-09-2003 production
100  * Joris Heirbaut        v0.6    29-04-2005 large-file support
101  * Joris Heirbaut        v0.7    23-06-2009 differentiate between position 0 and notfound in ufMchBst
102  * Joris Heirbaut          |     23-06-2009 optimize collission strategy using sample quality
103  * Joris Heirbaut          |     24-06-2009 introduce quality of samples
104  * Joris Heirbaut          |     26-06-2009 protect first samples
105  * Joris Heirbaut        v0.7g   24-09-2009 use fseeko for cygwin largefiles
106  * Joris Heirbaut          |     24-09-2009 removed casts to int from ufFndAhd
107  * Joris Heirbaut        v0.7h   24-09-2009 faster ufFabGetNxt
108  * Joris Heirbaut          |     24-09-2009 faster ufHshAdd: remove quality of hashes
109  * Joris Heirbaut        v0.7i   25-09-2009 drop use of ufHshBitCnt
110  * Joris Heirbaut        v0.7l   04-10-2009 increment glMchMaxDst as hashtable overloading grows
111  * Joris Heirbaut          |     16-10-2009 finalization
112  * Joris Heirbaut        v0.7m   17-10-2009 gprof optimization ufHshAdd
113  * Joris Heirbaut        v0.7n   17-10-2009 gprof optimization ufFabGet
114  * Joris Heirbaut        v0.7o   18-10-2009 gprof optimization asFab->iiRedSze
115  * Joris Heirbaut        v0.7p   19-10-2009 ufHshAdd: check uniform distribution
116  * Joris Heirbaut        v0.7q   23-10-2009 ufFabGet: scroll back on buffer
117  * Joris Heirbaut          |     19-10-2009 ufFndAhd: liMax = giBufSze
118  * Joris Heirbaut          |     25-10-2009 ufMchAdd: gliding matches
119  * Joris Heirbaut          |     25-10-2009 ufOut: return true for faster EQL sequences in jdiff function
120  * Joris Heirbaut        v0.7r   27-10-2009 ufMchBst: test position for gliding matches
121  * Joris Heirbaut          |     27-10-2009 ufMchBst: remove double loop
122  * Joris Heirbaut          |     27-10-2009 ufMchBst: double linked list ordered on azPosOrg
123  * Joris Heirbaut          |     27-10-2009 ufFndAhd: look back on reset (liBck)
124  * Joris Heirbaut          |     27-10-2009 ufFndAhd: reduce lookahead after giMchMin (speed optimization)
125  * Joris Heirbaut        v0.7x   05-11-2009 ufMchAdd: hashed method
126  * Joris Heirbaut        v0.7y   13-11-2009 ufMchBst: store unmatched samples too (reduce use of ufFabFnd)
127  * Joris Heirbaut        v0.8    30-06-2011 C++ version
128  * Joris Heirbaut        v0.8a   08-07-2011 Optimize position 0
129  * Joris Heirbaut        v0.8b   02-09-2011 Switch order of ahead/backtrack/skip logic
130  * Joris Heirbaut        v0.8.1  30-11-2011 Revert to use of fread/fseek (MinGW ifstream.gseek does not correctly handle files >2GB)
131  * Joris Heirbaut        v0.8.1  30-11-2011 Throuw out exception handling for MinGW (trying to reduce exe size)
132  *
133  *******************************************************************************/
134 #include "JDefs.h"
135 #include "JDiff.h"
136 #include <limits.h>
137 
138 #ifdef _FILE_OFFSET_BITS
139 #warning FILE OFFSET BITS set
140 #endif
141 #ifdef _LARGEFILE64_SOURCE
142 #warning _LARGEFILE64_SOURCE set
143 #endif
144 
145 namespace JojoDiff {
146 
147 /*
148  * Constructor
149  */
JDiff(JFile * const apFilOrg,JFile * const apFilNew,JOut * const apOut,const int aiHshSze,const int aiVerbse,const int abSrcBkt,const int aiSrcScn,const int aiMchMax,const int aiMchMin,const int aiAhdMax,const bool abCmpAll)150 JDiff::JDiff(
151     JFile * const apFilOrg, JFile * const apFilNew,
152     JOut  * const apOut,
153     const int aiHshSze, const int aiVerbse,
154     const int abSrcBkt, const int aiSrcScn,
155     const int aiMchMax, const int aiMchMin,
156     const int aiAhdMax, const bool abCmpAll
157 ) : mpFilOrg(apFilOrg), mpFilNew(apFilNew), mpOut(apOut),
158     miVerbse(aiVerbse), mbSrcBkt(abSrcBkt),
159     miMchMax(aiMchMax), miMchMin(aiMchMin),
160     miAhdMax(aiAhdMax<1024?1024:aiAhdMax),
161     mbCmpAll(abCmpAll), miSrcScn(aiSrcScn),
162     mzAhdOrg(0), mzAhdNew(0), mlHshOrg(0), mlHshNew(0), giHshErr(0)
163 {
164 	gpHsh = new JHashPos(aiHshSze) ;
165 	gpMch = new JMatchTable(gpHsh, mpFilOrg, mpFilNew, abCmpAll);
166 }
167 
168 /*
169  * Destructor
170  */
~JDiff()171 JDiff::~JDiff() {
172 	delete gpHsh ;
173 	delete gpMch ;
174 }
175 
176 /*******************************************************************************
177 * Difference function
178 *
179 * Takes two files as arguments and writes out the differences
180 *
181 * Principle:
182 *   Take one byte from each file and compare. If they are equal, then continue.
183 *   If they are different, start lookahead to find next equal blocks within file.
184 *   If equal blocks are found,
185 *   - first insert or delete the specified number of bytes,
186 *   - then continue reading on both files until equal blocks are reached,
187 *
188 *******************************************************************************/
jdiff()189 int JDiff::jdiff()
190 {
191     int lcOrg;              /* byte from original file */
192     int lcNew;              /* byte from new file */
193     off_t lzPosOrg = 0 ;
194     off_t lzPosNew = 0 ;
195 
196     bool  lbEql = false;    /* accumulate equal bytes? */
197     off_t lzEql = 0;        /* accumulated equal bytes */
198 
199     bool lbFnd = false;     /* offsets are pointing to a valid solution? */
200     off_t lzAhd=0;
201     off_t lzSkpOrg=0;
202     off_t lzSkpNew=0;
203 
204 #if debug
205     int liErr=0 ;		/* check for malfunction: 0=not checking, 1=checking, 2=error */
206 #endif
207 
208     /* Take one byte from each file ... */
209     lcOrg = mpFilOrg->get(lzPosOrg, 0);
210     lcNew = mpFilNew->get(lzPosNew, 0);
211     while (lcNew >= 0) {
212         #if debug
213         if (JDebug::gbDbg[DBGPRG])
214             fprintf(JDebug::stddbg, "Input "P8zd"->%2x "P8zd"->%2x.\n",
215                     lzPosOrg - 1, lcOrg, lzPosNew - 1, lcNew)  ;
216         #endif
217 
218         if (lcOrg == lcNew){
219             /* Output or count equals */
220             if (lbEql){
221                 lzEql ++ ;
222             } else {
223                 lbEql = mpOut->put(EQL, 1, lcOrg, lcNew, lzPosOrg, lzPosNew);
224             }
225 
226             /* Take next byte from each file ... */
227             lcOrg = mpFilOrg->get(++ lzPosOrg, 0) ;
228             lcNew = mpFilNew->get(++ lzPosNew, 0) ;
229 
230             /* decrease ahead counter */
231             lzAhd -- ;
232         } else if (lzAhd > 0) {
233             /* Output accumulated equals */
234         	ufPutEql(lzPosOrg, lzPosNew, lzEql, lbEql);
235 
236             /* Output difference */
237             if (lcOrg < 0) {
238                 mpOut->put(INS, 1, lcOrg, lcNew, lzPosOrg, lzPosNew);
239 
240                 /* Take next byte from each file ... */
241                 lcNew = mpFilNew->get(++ lzPosNew, 0) ;
242             } else {
243                 mpOut->put(MOD, 1, lcOrg, lcNew, lzPosOrg, lzPosNew);
244 
245                 /* Take next byte from each file ... */
246                 lcOrg = mpFilOrg->get(++ lzPosOrg, 0) ;
247                 lcNew = mpFilNew->get(++ lzPosNew, 0) ;
248             }
249 
250             /* decrease ahead counter */
251             lzAhd-- ;
252 
253         } else if (lbFnd && lzAhd == 0) {
254         	lzAhd = SMPSZE ; // to avoid infine loop (when ufFabFnd persists with lzSkpOrg, lzSkpNew, lzAhd all zero)
255         	lbFnd = false ;
256 			#if debug
257 			liErr = 2 ;	// ufFabFnd did not point to an equal region !
258 			#endif
259         } else {
260 			#if debug
261         	/* Flush output buffer in debug */
262         	if (JDebug::gbDbg[DBGAHD] || JDebug::gbDbg[DBGMCH]){
263         		ufPutEql(lzPosOrg, lzPosNew, lzEql, lbEql);
264         		mpOut->put(ESC, 0, 0, 0, lzPosOrg, lzPosNew);
265         	}
266 
267             /* If lbChk is unset, then an expected equal block has not been reached.
268              * In fast mode, when doing non-compared jumps, this may happen.
269              * In normal mode (mbCmpAll), this should never happen.
270              */
271         	if (liErr==2 && (miVerbse>2 || mbCmpAll)){
272 				fprintf(JDebug::stddbg, "Hash miss!\n");
273 				giHshErr++ ;
274         	}
275         	liErr=0; // clear error state
276             #endif
277 
278             /* Find a new equals-reqion */
279             lbFnd = ufFndAhd(lzPosOrg, lzPosNew, lzSkpOrg, lzSkpNew, lzAhd) ;
280             if (lbFnd < 0)
281                 return lbFnd ;
282 
283             #if debug
284             if (JDebug::gbDbg[DBGAHD])
285                 fprintf(JDebug::stddbg, "Findahead on %"PRIzd" %"PRIzd" skip %"PRIzd" %"PRIzd" ahead %"PRIzd"\n",
286                         lzPosOrg, lzPosNew, lzSkpOrg, lzSkpNew, lzAhd)  ;
287             if (JDebug::gbDbg[DBGPRG])
288                 fprintf(JDebug::stddbg, "Current position in new file= %"PRIzd"\n", lzPosNew) ;
289             #endif
290 
291             /* Output accumulated equals */
292         	ufPutEql(lzPosOrg, lzPosNew, lzEql, lbEql);
293 
294             /* Execute offsets */
295 			if (lzSkpOrg > 0) {
296 				mpOut->put(DEL, lzSkpOrg, 0, 0, lzPosOrg, lzPosNew) ;
297 				lzPosOrg += lzSkpOrg ;
298 				lcOrg = mpFilOrg->get(lzPosOrg, 0);
299 			} else if (lzSkpOrg < 0) {
300 				mpOut->put(BKT, - lzSkpOrg, 0, 0, lzPosOrg, lzPosNew) ;
301 				lzPosOrg += lzSkpOrg ;
302 				lcOrg = mpFilOrg->get(lzPosOrg, 0);
303 			}
304 			if (lzSkpNew > 0) {
305 				while (lzSkpNew > 0 && lcNew > EOF) {
306 					mpOut->put(INS, 1, 0, lcNew, lzPosOrg, lzPosNew);
307 					lzSkpNew-- ;
308 					lcNew = mpFilNew->get(++ lzPosNew, 0);
309 				}
310 			}
311         } /* if lcOrg == lcNew */
312     } /* while lcNew >= 0 */
313 
314     /* Flush output buffer */
315     ufPutEql(lzPosOrg, lzPosNew, lzEql, lbEql);
316     mpOut->put(ESC, 0, 0, 0, lzPosOrg, lzPosNew);
317 
318     /* Return code */
319     if (lcNew < EOB || lcOrg < EOB){
320         return (lcNew < lcOrg)?lcNew:lcOrg;
321     }
322     return 0;
323 } /* jdiff */
324 
325 /**
326  * Flush output
327  */
ufPutEql(const off_t & lzPosOrg,const off_t & lzPosNew,off_t & lzEql,bool & lbEql)328 void JDiff::ufPutEql(const off_t &lzPosOrg, const off_t &lzPosNew, off_t &lzEql, bool &lbEql){
329     /* Output accumulated equals */
330     if (lzEql > 0){
331         mpOut->put(EQL, lzEql, 0, 0, lzPosOrg - lzEql, lzPosNew - lzEql);
332         lzEql = 0;
333     }
334     lbEql=false;
335 }
336 
337 /**
338  * @brief Find Ahead function
339  *        Read ahead on both files until we possibly found an equal series of 32 bytes
340  *        in both files. Then calculate the deplacement vector between two files:
341  *          - positive if characters need to be inserted in the original file,
342  *          - negative if characters need to be removed from the original file.
343  * @param azRedOrg  read position in left file
344  * @param azRedNew  read position in right file
345  * @param azSkpOrg  out: number of bytes to skip (delete) in left file
346  * @param azSkpNew  out: number of bytes to skip (insert) in right file
347  * @param azAhd     out: number of bytes to skip on bith files before similarity is reached
348  * @return 0    no solution found
349  * @return 1    solution found
350  * @return < 0  error: see EXI-codes
351  */
ufFndAhd(off_t const & azRedOrg,off_t const & azRedNew,off_t & azSkpOrg,off_t & azSkpNew,off_t & azAhd)352 int JDiff::ufFndAhd (
353   off_t const &azRedOrg,
354   off_t const &azRedNew,
355   off_t &azSkpOrg,
356   off_t &azSkpNew,
357   off_t &azAhd
358 )
359 { off_t lzFndOrg=0;   /* Found position within original file                 */
360   off_t lzFndNew=0;   /* Found position within new file                      */
361   off_t lzBseOrg;     /* Base position on original file: gbSrcBkt?0:alRedOrg */
362 
363   int liIdx;          /* Index for initializing                         */
364   int liFnd=0;        /* Number of matches found                        */
365   int liSft;          /* 1 = hard look-ahead, 2 = soft look-ahead       */
366 
367   /* Start with hard lookahead, till we've found at least one match */
368   liSft = 1 ;
369 
370   /* Prescan the original file? */
371   if (miSrcScn == 1) {
372     int liRet = ufFndAhdScn() ;
373     if (liRet < 0) return liRet ;
374     miSrcScn = 2 ;
375   }
376 
377   /*
378    * How many bytes to look ahead ?
379    */
380   int liMax; /* Max number of bytes to read */
381   if (miSrcScn == 2){
382       if (mzAhdNew == 0 || mzAhdNew < azRedNew) {
383           liMax = miAhdMax  ;
384       } else if (mzAhdNew > azRedNew + miAhdMax) {
385           liMax = miAhdMax  ;
386       } else {
387           liMax = miAhdMax - (mzAhdNew - azRedNew)  ;
388       }
389   } else {
390       liMax = INT_MAX / 2 ;
391   }
392 
393   /*
394    * How many bytes to look back on reset ?
395    */
396   int liBck; /* Number of bytes to look back */
397   if (gpHsh->get_reliability() < miAhdMax)
398       liBck = gpHsh->get_reliability() / 2 ;
399   else
400       liBck = miAhdMax / 2 ;
401 
402   /*
403    * Re-Initialize hash function (read 31 bytes) if
404    * - ahead position has been reset, or
405    * - read position has passed the ahead position
406    */
407   if (miSrcScn == 0 && (mzAhdOrg == 0 || mzAhdOrg + liBck < azRedOrg)) {
408     mzAhdOrg = azRedOrg - liBck ;
409     if (mzAhdOrg < 0) mzAhdOrg = 0 ;
410     miEqlOrg = 0 ;
411     mlHshOrg = 0 ;
412 
413     miEqlOrg = 0 ;
414     miValOrg = mpFilOrg->get(mzAhdOrg, liSft) ;
415     for (liIdx=0;(liIdx < SMPSZE - 1) && (miValOrg > EOF); liIdx++){
416       gpHsh->hash(miValOrg, mlHshOrg) ;
417       ufFndAhdGet(mpFilOrg, ++ mzAhdOrg, miValOrg, miEqlOrg, liSft) ;
418     }
419   }
420   if (mzAhdNew == 0 || mzAhdNew + liBck < azRedNew)  {
421     mzAhdNew = azRedNew - liBck ;
422     if (mzAhdNew < 0) mzAhdNew = 0 ;
423     miEqlNew = 0 ;
424     mlHshNew = 0 ;
425     liMax += liBck ;
426 
427     miEqlNew = 0 ;
428     miValNew = mpFilNew->get(mzAhdNew, liSft) ;
429     liMax -- ;
430     for (liIdx=0;(liIdx < SMPSZE - 1) && (miValNew > EOF); liIdx++){
431       gpHsh->hash(miValNew, mlHshNew) ;
432       ufFndAhdGet(mpFilNew, ++ mzAhdNew, miValNew, miEqlNew, liSft) ;
433       liMax -- ;
434     }
435   }
436 
437   /*
438    * Build the table of matches
439    */
440   if (gpMch->cleanup(azRedNew - gpHsh->get_reliability())){
441       /* Do not backtrace before lzBseOrg */
442       lzBseOrg = (mbSrcBkt?0:azRedOrg) ;
443 
444       /* Do not read from original file if it has been prescanned */
445       if (miSrcScn > 0) miValOrg = EOB ;
446 
447       /* Scroll through both files until an equal hash value has been found */
448       while ((liMax > 0) && ((miValNew > EOF ) || (miValOrg > EOF))) {
449           /* insert original file's value into hashtable (if no prescanning has been done) */
450           if (miValOrg > EOF){
451               /* hash the new value and add to hashtable */
452               gpHsh->hash(miValOrg, mlHshOrg) ;
453               gpHsh->add(mlHshOrg, mzAhdOrg, miEqlOrg) ;
454 
455               #if debug
456               if (JDebug::gbDbg[DBGAHH])
457                   fprintf(JDebug::stddbg, "ufHshAdd(%2x -> %8"PRIhkey", "P8zd", "P8zd")\n",
458                           miValOrg, mlHshOrg, mzAhdOrg, lzBseOrg);
459               #endif
460 
461               /* get next value from file */
462               ufFndAhdGet(mpFilOrg, ++ mzAhdOrg, miValOrg, miEqlOrg, liSft) ;
463           }
464 
465           /* check new file against original file */
466           if (miValNew > EOF){
467               /* hash the new value and lookup in hashtable */
468               gpHsh->hash(miValNew, mlHshNew) ;
469               if (gpHsh->get(mlHshNew, lzFndOrg)) {
470                   /* add found position into table of matches */
471                   if (lzFndOrg > lzBseOrg) {
472                       /* add solution to the table of matches */
473                       switch (gpMch->add(lzFndOrg, mzAhdNew, azRedNew, miEqlNew))
474                       { case 0: /* table is full */
475                           if (liBck > 0 && gpMch->cleanup(azRedNew)){
476                               // made more room
477                           } else {
478                               liMax = 0 ; // stop lookahead
479                               continue;
480                           }
481                       case 1: /* alternative added */
482                           if (mzAhdNew > azRedNew) {
483                               liFnd ++ ;
484 
485                               if (liFnd == miMchMax) {
486                                   liMax = 0 ; // stop lookahead
487                                   continue;
488                               } else if ((liFnd == miMchMin) && (liMax > gpHsh->get_reliability())) {
489                                   liMax = gpHsh->get_reliability() ; // reduce lookahead
490                               }
491                           }
492                           break ;
493                       case 2:  ; /* alternative collided */
494                       case -1: ;/* compare failed      */
495                       }
496                   }
497               }
498 
499               /* get next value from file */
500               ufFndAhdGet(mpFilNew, ++ mzAhdNew, miValNew, miEqlNew, liSft) ;
501               liMax -- ;
502           } /* if siValNew > EOF */
503       } /* while */
504   } /* if ufMchFre(..) */
505 
506   /*
507    * Check for errors
508    */
509   if (miValNew < EOB || miValOrg < EOB){
510       return (miValNew < miValOrg) ? miValNew : miValOrg;
511   }
512 
513   /*
514    * Get the best match and calculate the offsets
515    */
516   if (! gpMch->get(azRedOrg, azRedNew, /* out */ lzFndOrg, lzFndNew))
517   { azSkpOrg = 0 ;
518     azSkpNew = 0 ;
519     azAhd    = (mzAhdNew - azRedNew) - gpHsh->get_reliability() ;
520     if (azAhd < SMPSZE) azAhd = SMPSZE ;
521     return 0 ;
522   }  else  {
523     if (lzFndOrg >= azRedOrg)
524     { if (lzFndOrg - azRedOrg >= lzFndNew - azRedNew)
525       { /* go forward on original file */
526         azSkpOrg = lzFndOrg - azRedOrg + azRedNew - lzFndNew ;
527         azSkpNew = 0 ;
528         azAhd    = lzFndNew - azRedNew ;
529       } else {
530         /* go forward on new file */
531         azSkpOrg = 0;
532         azSkpNew = lzFndNew - azRedNew + azRedOrg - lzFndOrg ;
533         azAhd    = lzFndOrg - azRedOrg ;
534       }
535     } else {
536       /* backtrack on original file */
537       azSkpOrg = azRedOrg - lzFndOrg + lzFndNew - azRedNew ;
538       if (azSkpOrg < azRedOrg)
539       { azSkpNew = 0 ;
540         azSkpOrg = - azSkpOrg ;
541         azAhd = lzFndNew - azRedNew ;
542       }
543       else /* do not bactrace before beginning of file */
544       { azSkpNew = azSkpOrg - azRedOrg ;
545         azSkpOrg = - azRedOrg ;
546         azAhd = (lzFndNew - azRedNew) - azSkpNew ;
547       }
548 
549       /* reset ahead position when backtracking */
550       mzAhdOrg = 0 ; // TODO reset matching table too?
551     }
552 
553     return 1 ;
554   }
555 }
556 
557 /* -----------------------------------------------------------------------------
558  * Auxiliary function:
559  * Get next character from file (lookahead) and count number of equal chars
560  * in current cluster
561  * ---------------------------------------------------------------------------*/
562 /**
563  * @brief Get next character from file (lookahead) and count number of equal chars
564  *        in current sample.
565  * @param apFil     File to read
566  * @param azPos     Position to read (will be incremented by one)
567  * @param aiVal     in: previous byte read, out: new byte read
568  * @param aiEql     Incremented by one if previous == new byte
569  * @param aiSft     Soft or hard read-ahead (see JFile.get)
570  */
ufFndAhdGet(JFile * apFil,const off_t & azPos,int & acVal,int & aiEql,int aiSft)571 void JDiff::ufFndAhdGet(JFile *apFil, const off_t &azPos, int &acVal, int &aiEql, int aiSft)
572 {
573   int lcPrv = acVal ;
574   acVal = apFil->get(azPos, aiSft) ;
575   if (acVal != lcPrv) {
576       if (aiEql > 0) aiEql -= 2 ;
577   } else {
578       if (aiEql < SMPSZE) aiEql += 1 ;
579   }
580 }
581 
582 /**
583  * Prescan the original file: calculates a hash-key for every 32-byte sample
584  * in the left file and stores them with their position in a hash-table.
585  */
ufFndAhdScn()586 int JDiff::ufFndAhdScn ()
587 {
588   hkey  lkHshOrg=0;     // Current hash value for original file
589   int   liEqlOrg=0;     // Number of times current value occurs in hash value
590   int   lcValOrg;       // Current file value
591   off_t lzPosOrg=0;     // Position within original file
592 
593   int   liIdx ;
594 
595   if (miVerbse > 0) {
596     fprintf(JDebug::stddbg, "Prescanning:\n");
597   }
598 
599   /* Initialize hash function */
600   lcValOrg = mpFilOrg->get(lzPosOrg, 1) ;
601   for (liIdx=0;(liIdx < SMPSZE - 1) && (lcValOrg > EOF); liIdx++) {
602     gpHsh->hash(lcValOrg, lkHshOrg) ;
603     ufFndAhdGet(mpFilOrg, ++ lzPosOrg, lcValOrg, liEqlOrg, 1) ;
604   }
605 
606   /* Build hashtable */
607   liIdx = 0;
608   while (lcValOrg > EOF) {
609     gpHsh->hash(lcValOrg, lkHshOrg) ;
610     gpHsh->add(lkHshOrg, lzPosOrg, liEqlOrg) ;
611     #if debug
612         if (JDebug::gbDbg[DBGAHH])
613             fprintf(JDebug::stddbg, "ufHshAdd(%2x -> %8"PRIhkey", "P8zd", %8d)\n",
614                     lcValOrg, lkHshOrg, lzPosOrg, 0);
615     #endif
616 
617     ufFndAhdGet(mpFilOrg, ++ lzPosOrg, lcValOrg, liEqlOrg, 1) ;
618 
619     if (miVerbse > 0) {
620         /* output a dot every 16MB */
621         liIdx ++ ;
622         if ((liIdx & 0xffffff) == 0) {
623             if (liIdx == 0x40000000) {
624                 liIdx = 0 ;
625                 fprintf(JDebug::stddbg, ".\n"); /* output a newline every 1024MB */
626             } else {
627                 fprintf(JDebug::stddbg, "."); /* output a dot every 16 MB */
628             }
629         }
630     }
631   }
632 
633   if (miVerbse > 0) fprintf(JDebug::stddbg, ".\n");
634 
635 #if debug
636   if (JDebug::gbDbg[DBGDST])
637 	  gpHsh->dist(lzPosOrg, 128);
638 #endif
639 
640   if (lcValOrg < EOB)
641       return lcValOrg ;
642   else
643       return 0 ;
644 } /* ufFndAhdScn */
645 } /* namespace */
646