1 /*
2  *  LibXDiff by Davide Libenzi ( File Differential Library )
3  *  Copyright (C) 2003	Davide Libenzi
4  *
5  *  This library is free software; you can redistribute it and/or
6  *  modify it under the terms of the GNU Lesser General Public
7  *  License as published by the Free Software Foundation; either
8  *  version 2.1 of the License, or (at your option) any later version.
9  *
10  *  This library is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  *  Lesser General Public License for more details.
14  *
15  *  You should have received a copy of the GNU Lesser General Public
16  *  License along with this library; if not, see
17  *  <http://www.gnu.org/licenses/>.
18  *
19  *  Davide Libenzi <davidel@xmailserver.org>
20  *
21  */
22 
23 #include "xinclude.h"
24 #include "integer.h"
25 
26 
/* Lower bound for the max-cost cutoff computed in xdl_do_diff(). */
#define XDL_MAX_COST_MIN 256
/* Edit cost that must be exceeded before the split heuristic in
 * xdl_split() is considered. */
#define XDL_HEUR_MIN_COST 256
/* Largest positive long; used as the sentinel for backward K values. */
#define XDL_LINE_MAX (long)((1UL << (CHAR_BIT * sizeof(long) - 1)) - 1)
/* Minimum run of matching records for a diagonal to count as a "snake". */
#define XDL_SNAKE_CNT 20
/* Magic multiplier used by the "interesting path" heuristic. */
#define XDL_K_HEUR 4
32 
/** Declare a function as always inlined. Portable "static inline"
 *  marker: MSVC spelling, GCC spelling, or plain static as fallback. */
#if defined(_MSC_VER)
# define XDL_INLINE(type) static __inline type
#elif defined(__GNUC__)
# define XDL_INLINE(type) static __inline__ type
#else
# define XDL_INLINE(type) static type
#endif
41 
/*
 * Split point produced by xdl_split(): (i1, i2) is the coordinate at
 * which the box is divided; min_lo/min_hi tell whether the lower/upper
 * sub-box still needs a minimal (non-heuristic) diff.
 */
typedef struct s_xdpsplit {
	long i1, i2;
	int min_lo, min_hi;
} xdpsplit_t;
46 
47 
48 
49 
/* Forward declarations for helpers defined later in this file. */
static long xdl_split(unsigned long const *ha1, long off1, long lim1,
		      unsigned long const *ha2, long off2, long lim2,
		      long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
		      xdalgoenv_t *xenv);
static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2);
55 
56 
57 
58 
59 
/*
 * See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers.
 * Basically considers a "box" (off1, off2, lim1, lim2) and scans from both
 * the forward diagonal starting from (off1, off2) and the backward diagonal
 * starting from (lim1, lim2). If the K values on the same diagonal cross,
 * it returns the furthest point of reach. We might end up with expensive
 * cases using this algorithm in full, so a little bit of heuristic is
 * needed to cut the search and to return a suboptimal point.
 *
 * Returns the edit cost (the number of outer iterations performed) and
 * stores the chosen split point in *spl; spl->min_lo/min_hi record whether
 * the lower/upper sub-box still requires a minimal (non-heuristic) diff.
 */
static long xdl_split(unsigned long const *ha1, long off1, long lim1,
		      unsigned long const *ha2, long off2, long lim2,
		      long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
		      xdalgoenv_t *xenv) {
	long dmin = off1 - lim2, dmax = lim1 - off2;
	long fmid = off1 - off2, bmid = lim1 - lim2;
	long odd = (fmid - bmid) & 1;
	long fmin = fmid, fmax = fmid;
	long bmin = bmid, bmax = bmid;
	long ec, d, i1, i2, prev1, best, dd, v, k;

	/*
	 * Set initial diagonal values for both forward and backward path.
	 */
	kvdf[fmid] = off1;
	kvdb[bmid] = lim1;

	for (ec = 1;; ec++) {
		int got_snake = 0;

		/*
		 * We need to extend the diagonal "domain" by one. If the next
		 * value exits the box boundaries we need to change it in the
		 * opposite direction because (fmax - fmin) must stay even so
		 * that the "d -= 2" scan below visits every active diagonal.
		 * Also we initialize the external K value to -1 so that we
		 * can avoid extra condition checks inside the core loop.
		 */
		if (fmin > dmin)
			kvdf[--fmin - 1] = -1;
		else
			++fmin;
		if (fmax < dmax)
			kvdf[++fmax + 1] = -1;
		else
			--fmax;

		for (d = fmax; d >= fmin; d -= 2) {
			if (kvdf[d - 1] >= kvdf[d + 1])
				i1 = kvdf[d - 1] + 1;
			else
				i1 = kvdf[d + 1];
			prev1 = i1;
			i2 = i1 - d;
			/* Follow the diagonal snake of matching records. */
			for (; i1 < lim1 && i2 < lim2 && ha1[i1] == ha2[i2]; i1++, i2++);
			if (i1 - prev1 > xenv->snake_cnt)
				got_snake = 1;
			kvdf[d] = i1;
			/* Paths overlapped: this is an optimal split point. */
			if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) {
				spl->i1 = i1;
				spl->i2 = i2;
				spl->min_lo = spl->min_hi = 1;
				return ec;
			}
		}

		/*
		 * Same domain-extension logic for the backward path, except
		 * that the external K value is initialized to XDL_LINE_MAX
		 * because the backward scan takes minima.
		 */
		if (bmin > dmin)
			kvdb[--bmin - 1] = XDL_LINE_MAX;
		else
			++bmin;
		if (bmax < dmax)
			kvdb[++bmax + 1] = XDL_LINE_MAX;
		else
			--bmax;

		for (d = bmax; d >= bmin; d -= 2) {
			if (kvdb[d - 1] < kvdb[d + 1])
				i1 = kvdb[d - 1];
			else
				i1 = kvdb[d + 1] - 1;
			prev1 = i1;
			i2 = i1 - d;
			/* Follow the diagonal snake backwards. */
			for (; i1 > off1 && i2 > off2 && ha1[i1 - 1] == ha2[i2 - 1]; i1--, i2--);
			if (prev1 - i1 > xenv->snake_cnt)
				got_snake = 1;
			kvdb[d] = i1;
			if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) {
				spl->i1 = i1;
				spl->i2 = i2;
				spl->min_lo = spl->min_hi = 1;
				return ec;
			}
		}

		if (need_min)
			continue;

		/*
		 * If the edit cost is above the heuristic trigger and if
		 * we got a good snake, we sample current diagonals to see
		 * if some of them have reached an "interesting" path. Our
		 * measure is a function of the distance from the diagonal
		 * corner (i1 + i2) penalized with the distance from the
		 * mid diagonal itself. If this value is above the current
		 * edit cost times a magic factor (XDL_K_HEUR) we consider
		 * it interesting.
		 */
		if (got_snake && ec > xenv->heur_min) {
			for (best = 0, d = fmax; d >= fmin; d -= 2) {
				dd = d > fmid ? d - fmid: fmid - d;
				i1 = kvdf[d];
				i2 = i1 - d;
				v = (i1 - off1) + (i2 - off2) - dd;

				if (v > XDL_K_HEUR * ec && v > best &&
				    off1 + xenv->snake_cnt <= i1 && i1 < lim1 &&
				    off2 + xenv->snake_cnt <= i2 && i2 < lim2) {
					/* Only accept a point preceded by a full snake. */
					for (k = 1; ha1[i1 - k] == ha2[i2 - k]; k++)
						if (k == xenv->snake_cnt) {
							best = v;
							spl->i1 = i1;
							spl->i2 = i2;
							break;
						}
				}
			}
			if (best > 0) {
				spl->min_lo = 1;
				spl->min_hi = 0;
				return ec;
			}

			for (best = 0, d = bmax; d >= bmin; d -= 2) {
				dd = d > bmid ? d - bmid: bmid - d;
				i1 = kvdb[d];
				i2 = i1 - d;
				v = (lim1 - i1) + (lim2 - i2) - dd;

				if (v > XDL_K_HEUR * ec && v > best &&
				    off1 < i1 && i1 <= lim1 - xenv->snake_cnt &&
				    off2 < i2 && i2 <= lim2 - xenv->snake_cnt) {
					/* Only accept a point followed by a full snake. */
					for (k = 0; ha1[i1 + k] == ha2[i2 + k]; k++)
						if (k == xenv->snake_cnt - 1) {
							best = v;
							spl->i1 = i1;
							spl->i2 = i2;
							break;
						}
				}
			}
			if (best > 0) {
				spl->min_lo = 0;
				spl->min_hi = 1;
				return ec;
			}
		}

		/*
		 * Enough is enough. We spent too much time here and now we
		 * collect the furthest reaching path using the (i1 + i2)
		 * measure.
		 */
		if (ec >= xenv->mxcost) {
			long fbest, fbest1, bbest, bbest1;

			fbest = fbest1 = -1;
			for (d = fmax; d >= fmin; d -= 2) {
				i1 = XDL_MIN(kvdf[d], lim1);
				i2 = i1 - d;
				if (lim2 < i2)
					i1 = lim2 + d, i2 = lim2;
				if (fbest < i1 + i2) {
					fbest = i1 + i2;
					fbest1 = i1;
				}
			}

			bbest = bbest1 = XDL_LINE_MAX;
			for (d = bmax; d >= bmin; d -= 2) {
				i1 = XDL_MAX(off1, kvdb[d]);
				i2 = i1 - d;
				if (i2 < off2)
					i1 = off2 + d, i2 = off2;
				if (i1 + i2 < bbest) {
					bbest = i1 + i2;
					bbest1 = i1;
				}
			}

			/* Pick whichever direction reached further into the box. */
			if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) {
				spl->i1 = fbest1;
				spl->i2 = fbest - fbest1;
				spl->min_lo = 1;
				spl->min_hi = 0;
			} else {
				spl->i1 = bbest1;
				spl->i2 = bbest - bbest1;
				spl->min_lo = 0;
				spl->min_hi = 1;
			}
			return ec;
		}
	}
}
268 
269 
/*
 * Rule: "Divide et Impera". Recursively split the box in sub-boxes by
 * calling the box splitting function. Note that the real job (marking
 * changed lines) is done in the two boundary reaching checks.
 *
 * The rindex arrays translate positions within the compacted record
 * sequence into indices into the rchg change bitmaps. Returns 0 on
 * success or -1 if the split function fails.
 */
int xdl_recs_cmp(diffdata_t *dd1, long off1, long lim1,
		 diffdata_t *dd2, long off2, long lim2,
		 long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv) {
	unsigned long const *ha1 = dd1->ha, *ha2 = dd2->ha;

	/*
	 * Shrink the box by walking through each diagonal snake (SW and NE).
	 */
	for (; off1 < lim1 && off2 < lim2 && ha1[off1] == ha2[off2]; off1++, off2++);
	for (; off1 < lim1 && off2 < lim2 && ha1[lim1 - 1] == ha2[lim2 - 1]; lim1--, lim2--);

	/*
	 * If one dimension is empty, then all records on the other one must
	 * be obviously changed.
	 */
	if (off1 == lim1) {
		char *rchg2 = dd2->rchg;
		long *rindex2 = dd2->rindex;

		for (; off2 < lim2; off2++)
			rchg2[rindex2[off2]] = 1;
	} else if (off2 == lim2) {
		char *rchg1 = dd1->rchg;
		long *rindex1 = dd1->rindex;

		for (; off1 < lim1; off1++)
			rchg1[rindex1[off1]] = 1;
	} else {
		xdpsplit_t spl;
		spl.i1 = spl.i2 = 0;

		/*
		 * Divide ...
		 */
		if (xdl_split(ha1, off1, lim1, ha2, off2, lim2, kvdf, kvdb,
			      need_min, &spl, xenv) < 0) {

			return -1;
		}

		/*
		 * ... et Impera. Recurse on the sub-boxes on either side of
		 * the split point; each recursion demands a minimal diff
		 * only if the split reported it (spl.min_lo / spl.min_hi).
		 */
		if (xdl_recs_cmp(dd1, off1, spl.i1, dd2, off2, spl.i2,
				 kvdf, kvdb, spl.min_lo, xenv) < 0 ||
		    xdl_recs_cmp(dd1, spl.i1, lim1, dd2, spl.i2, lim2,
				 kvdf, kvdb, spl.min_hi, xenv) < 0) {

			return -1;
		}
	}

	return 0;
}
329 
330 
/*
 * Run the record-level diff between mf1 and mf2, filling the change
 * bitmaps inside xe. Dispatches to the patience or histogram algorithm
 * when requested by xpp->flags; otherwise prepares the environment and
 * runs the Myers algorithm via xdl_recs_cmp(). Returns 0 on success or
 * -1 on failure.
 */
int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
		xdfenv_t *xe) {
	size_t ndiags, allocsize;
	long *kvd, *kvdf, *kvdb;
	xdalgoenv_t xenv;
	diffdata_t dd1, dd2;

	/* Alternative algorithms bypass the Myers setup entirely. */
	if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF)
		return xdl_do_patience_diff(mf1, mf2, xpp, xe);

	if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF)
		return xdl_do_histogram_diff(mf1, mf2, xpp, xe);

	if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) {

		return -1;
	}

	/*
	 * Allocate and setup K vectors to be used by the differential
	 * algorithm: one stores the forward path, the other the backward
	 * path. The checked-arithmetic macros guard the size computation
	 * against overflow; allocsize ends up as
	 * (2 * ndiags + 2) * sizeof(long).
	 */
	GIT_ERROR_CHECK_ALLOC_ADD3(&ndiags, xe->xdf1.nreff, xe->xdf2.nreff, 3);
	GIT_ERROR_CHECK_ALLOC_MULTIPLY(&allocsize, ndiags, 2);
	GIT_ERROR_CHECK_ALLOC_ADD(&allocsize, allocsize, 2);
	GIT_ERROR_CHECK_ALLOC_MULTIPLY(&allocsize, allocsize, sizeof(long));

	if (!(kvd = (long *) xdl_malloc(allocsize))) {
		xdl_free_env(xe);
		return -1;
	}
	kvdf = kvd;
	kvdb = kvdf + ndiags;
	/* Bias both vectors so they can be indexed by negative diagonals. */
	kvdf += xe->xdf2.nreff + 1;
	kvdb += xe->xdf2.nreff + 1;

	/* Heuristic cutoffs for xdl_split(), clamped to a sane minimum. */
	xenv.mxcost = xdl_bogosqrt(ndiags);
	if (xenv.mxcost < XDL_MAX_COST_MIN)
		xenv.mxcost = XDL_MAX_COST_MIN;
	xenv.snake_cnt = XDL_SNAKE_CNT;
	xenv.heur_min = XDL_HEUR_MIN_COST;

	dd1.nrec = xe->xdf1.nreff;
	dd1.ha = xe->xdf1.ha;
	dd1.rchg = xe->xdf1.rchg;
	dd1.rindex = xe->xdf1.rindex;
	dd2.nrec = xe->xdf2.nreff;
	dd2.ha = xe->xdf2.ha;
	dd2.rchg = xe->xdf2.rchg;
	dd2.rindex = xe->xdf2.rindex;

	if (xdl_recs_cmp(&dd1, 0, dd1.nrec, &dd2, 0, dd2.nrec,
			 kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0, &xenv) < 0) {

		xdl_free(kvd);
		xdl_free_env(xe);
		return -1;
	}

	xdl_free(kvd);

	return 0;
}
394 
395 
/*
 * Allocate a new change record covering chg1 lines starting at i1 in the
 * first file and chg2 lines starting at i2 in the second, and push it onto
 * the front of the script list xscr. Returns the new list head, or NULL
 * on allocation failure (xscr itself is left untouched).
 */
static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2) {
	xdchange_t *xch = (xdchange_t *) xdl_malloc(sizeof(xdchange_t));

	if (xch == NULL)
		return NULL;

	xch->i1 = i1;
	xch->chg1 = chg1;
	xch->i2 = i2;
	xch->chg2 = chg2;
	xch->ignore = 0;
	xch->next = xscr;

	return xch;
}
411 
412 
/*
 * Two records match when their hashes agree and their contents also
 * compare equal via xdl_recmatch() under the given flags.
 */
static int recs_match(xrecord_t *rec1, xrecord_t *rec2, long flags)
{
	/* Cheap hash test first; only then do the full content compare. */
	if (rec1->ha != rec2->ha)
		return 0;

	return xdl_recmatch(rec1->ptr, rec1->size,
			    rec2->ptr, rec2->size, flags);
}
420 
/*
 * If a line is indented more than this, get_indent() just returns this
 * value. This avoids having to do absurd amounts of work for data that
 * is not human-readable text, and also ensures that the output of
 * get_indent() fits within an int.
 */
#define MAX_INDENT 200
428 
429 /*
430  * Return the amount of indentation of the specified line, treating TAB as 8
431  * columns. Return -1 if line is empty or contains only whitespace. Clamp the
432  * output value at MAX_INDENT.
433  */
get_indent(xrecord_t * rec)434 static int get_indent(xrecord_t *rec)
435 {
436 	long i;
437 	int ret = 0;
438 
439 	for (i = 0; i < rec->size; i++) {
440 		char c = rec->ptr[i];
441 
442 		if (!XDL_ISSPACE(c))
443 			return ret;
444 		else if (c == ' ')
445 			ret += 1;
446 		else if (c == '\t')
447 			ret += 8 - ret % 8;
448 		/* ignore other whitespace characters */
449 
450 		if (ret >= MAX_INDENT)
451 			return MAX_INDENT;
452 	}
453 
454 	/* The line contains only whitespace. */
455 	return -1;
456 }
457 
/*
 * If more than this number of consecutive blank rows are found, just
 * return this value. This avoids requiring O(N^2) work for pathological
 * cases, and also ensures that the output of score_split fits in an int.
 */
#define MAX_BLANKS 20
464 
/* Characteristics measured about a hypothetical split position. */
struct split_measurement {
	/*
	 * Is the split at the end of the file (aside from any blank lines)?
	 */
	int end_of_file;

	/*
	 * Indentation of the line immediately following the split, or -1
	 * if that line is blank.
	 */
	int indent;

	/*
	 * Number of consecutive blank lines immediately above the split
	 * (capped at MAX_BLANKS).
	 */
	int pre_blank;

	/*
	 * Indentation of the nearest non-blank line above the split, or -1
	 * if there is no such line.
	 */
	int pre_indent;

	/*
	 * Number of blank lines after the line following the split
	 * (capped at MAX_BLANKS).
	 */
	int post_blank;

	/*
	 * Indentation of the nearest non-blank line after the line
	 * following the split, or -1 if there is no such line.
	 */
	int post_indent;
};
500 
/* Badness score for a hypothetical split; see score_cmp(). */
struct split_score {
	/* The effective indent of this split (smaller is preferred). */
	int effective_indent;

	/* Penalty for this split (smaller is preferred). */
	int penalty;
};
508 
509 /*
510  * Fill m with information about a hypothetical split of xdf above line split.
511  */
measure_split(const xdfile_t * xdf,long split,struct split_measurement * m)512 static void measure_split(const xdfile_t *xdf, long split,
513 			  struct split_measurement *m)
514 {
515 	long i;
516 
517 	if (split >= xdf->nrec) {
518 		m->end_of_file = 1;
519 		m->indent = -1;
520 	} else {
521 		m->end_of_file = 0;
522 		m->indent = get_indent(xdf->recs[split]);
523 	}
524 
525 	m->pre_blank = 0;
526 	m->pre_indent = -1;
527 	for (i = split - 1; i >= 0; i--) {
528 		m->pre_indent = get_indent(xdf->recs[i]);
529 		if (m->pre_indent != -1)
530 			break;
531 		m->pre_blank += 1;
532 		if (m->pre_blank == MAX_BLANKS) {
533 			m->pre_indent = 0;
534 			break;
535 		}
536 	}
537 
538 	m->post_blank = 0;
539 	m->post_indent = -1;
540 	for (i = split + 1; i < xdf->nrec; i++) {
541 		m->post_indent = get_indent(xdf->recs[i]);
542 		if (m->post_indent != -1)
543 			break;
544 		m->post_blank += 1;
545 		if (m->post_blank == MAX_BLANKS) {
546 			m->post_indent = 0;
547 			break;
548 		}
549 	}
550 }
551 
/*
 * The empirically-determined weight factors used by score_add_split()
 * below. Larger values mean that the position is a less favorable place
 * to split.
 *
 * Note that scores are only ever compared against each other, so
 * multiplying all of these weight/penalty values by the same factor
 * wouldn't change the heuristic's behavior. Still, we need to set that
 * arbitrary scale *somehow*. In practice, these numbers are chosen to be
 * large enough that they can be adjusted relative to each other with
 * sufficient precision despite using integer math.
 */

/* Penalty if there are no non-blank lines before the split */
#define START_OF_FILE_PENALTY 1

/* Penalty if there are no non-blank lines after the split */
#define END_OF_FILE_PENALTY 21

/* Multiplier for the number of blank lines around the split */
#define TOTAL_BLANK_WEIGHT (-30)

/* Multiplier for the number of blank lines after the split */
#define POST_BLANK_WEIGHT 6

/*
 * Penalties applied if the line is indented more than its predecessor
 */
#define RELATIVE_INDENT_PENALTY (-4)
#define RELATIVE_INDENT_WITH_BLANK_PENALTY 10

/*
 * Penalties applied if the line is indented less than both its predecessor
 * and its successor
 */
#define RELATIVE_OUTDENT_PENALTY 24
#define RELATIVE_OUTDENT_WITH_BLANK_PENALTY 17

/*
 * Penalties applied if the line is indented less than its predecessor but
 * not less than its successor
 */
#define RELATIVE_DEDENT_PENALTY 23
#define RELATIVE_DEDENT_WITH_BLANK_PENALTY 17

/*
 * We only consider whether the sum of the effective indents for splits are
 * less than (-1), equal to (0), or greater than (+1) each other. The
 * resulting value is multiplied by the following weight and combined with
 * the penalty to determine the better of two scores.
 */
#define INDENT_WEIGHT 60
603 
604 /*
605  * Compute a badness score for the hypothetical split whose measurements are
606  * stored in m. The weight factors were determined empirically using the tools and
607  * corpus described in
608  *
609  *     https://github.com/mhagger/diff-slider-tools
610  *
611  * Also see that project if you want to improve the weights based on, for example,
612  * a larger or more diverse corpus.
613  */
score_add_split(const struct split_measurement * m,struct split_score * s)614 static void score_add_split(const struct split_measurement *m, struct split_score *s)
615 {
616 	/*
617 	 * A place to accumulate penalty factors (positive makes this index more
618 	 * favored):
619 	 */
620 	int post_blank, total_blank, indent, any_blanks;
621 
622 	if (m->pre_indent == -1 && m->pre_blank == 0)
623 		s->penalty += START_OF_FILE_PENALTY;
624 
625 	if (m->end_of_file)
626 		s->penalty += END_OF_FILE_PENALTY;
627 
628 	/*
629 	 * Set post_blank to the number of blank lines following the split,
630 	 * including the line immediately after the split:
631 	 */
632 	post_blank = (m->indent == -1) ? 1 + m->post_blank : 0;
633 	total_blank = m->pre_blank + post_blank;
634 
635 	/* Penalties based on nearby blank lines: */
636 	s->penalty += TOTAL_BLANK_WEIGHT * total_blank;
637 	s->penalty += POST_BLANK_WEIGHT * post_blank;
638 
639 	if (m->indent != -1)
640 		indent = m->indent;
641 	else
642 		indent = m->post_indent;
643 
644 	any_blanks = (total_blank != 0);
645 
646 	/* Note that the effective indent is -1 at the end of the file: */
647 	s->effective_indent += indent;
648 
649 	if (indent == -1) {
650 		/* No additional adjustments needed. */
651 	} else if (m->pre_indent == -1) {
652 		/* No additional adjustments needed. */
653 	} else if (indent > m->pre_indent) {
654 		/*
655 		 * The line is indented more than its predecessor.
656 		 */
657 		s->penalty += any_blanks ?
658 			RELATIVE_INDENT_WITH_BLANK_PENALTY :
659 			RELATIVE_INDENT_PENALTY;
660 	} else if (indent == m->pre_indent) {
661 		/*
662 		 * The line has the same indentation level as its predecessor.
663 		 * No additional adjustments needed.
664 		 */
665 	} else {
666 		/*
667 		 * The line is indented less than its predecessor. It could be
668 		 * the block terminator of the previous block, but it could
669 		 * also be the start of a new block (e.g., an "else" block, or
670 		 * maybe the previous block didn't have a block terminator).
671 		 * Try to distinguish those cases based on what comes next:
672 		 */
673 		if (m->post_indent != -1 && m->post_indent > indent) {
674 			/*
675 			 * The following line is indented more. So it is likely
676 			 * that this line is the start of a block.
677 			 */
678 			s->penalty += any_blanks ?
679 				RELATIVE_OUTDENT_WITH_BLANK_PENALTY :
680 				RELATIVE_OUTDENT_PENALTY;
681 		} else {
682 			/*
683 			 * That was probably the end of a block.
684 			 */
685 			s->penalty += any_blanks ?
686 				RELATIVE_DEDENT_WITH_BLANK_PENALTY :
687 				RELATIVE_DEDENT_PENALTY;
688 		}
689 	}
690 }
691 
score_cmp(struct split_score * s1,struct split_score * s2)692 static int score_cmp(struct split_score *s1, struct split_score *s2)
693 {
694 	/* -1 if s1.effective_indent < s2->effective_indent, etc. */
695 	int cmp_indents = ((s1->effective_indent > s2->effective_indent) -
696 			   (s1->effective_indent < s2->effective_indent));
697 
698 	return INDENT_WEIGHT * cmp_indents + (s1->penalty - s2->penalty);
699 }
700 
/*
 * Represent a group of changed lines in an xdfile_t (i.e., a contiguous
 * group of lines that was inserted or deleted from the corresponding
 * version of the file). We consider there to be such a group at the
 * beginning of the file, at the end of the file, and between any two
 * unchanged lines, though most such groups will usually be empty.
 *
 * If the first line in a group is equal to the line following the group,
 * then the group can be slid down. Similarly, if the last line in a group
 * is equal to the line preceding the group, then the group can be slid up.
 * See group_slide_down() and group_slide_up().
 *
 * Note that loops that are testing for changed lines in xdf->rchg do not
 * need index bounding since the array is prepared with a zero at position
 * -1 and N.
 */
struct xdlgroup {
	/*
	 * The index of the first changed line in the group, or the index of
	 * the unchanged line above which the (empty) group is located.
	 */
	long start;

	/*
	 * The index of the first unchanged line after the group. For an
	 * empty group, end is equal to start.
	 */
	long end;
};
729 
730 /*
731  * Initialize g to point at the first group in xdf.
732  */
group_init(xdfile_t * xdf,struct xdlgroup * g)733 static void group_init(xdfile_t *xdf, struct xdlgroup *g)
734 {
735 	g->start = g->end = 0;
736 	while (xdf->rchg[g->end])
737 		g->end++;
738 }
739 
740 /*
741  * Move g to describe the next (possibly empty) group in xdf and return 0. If g
742  * is already at the end of the file, do nothing and return -1.
743  */
group_next(xdfile_t * xdf,struct xdlgroup * g)744 XDL_INLINE(int) group_next(xdfile_t *xdf, struct xdlgroup *g)
745 {
746 	if (g->end == xdf->nrec)
747 		return -1;
748 
749 	g->start = g->end + 1;
750 	for (g->end = g->start; xdf->rchg[g->end]; g->end++)
751 		;
752 
753 	return 0;
754 }
755 
756 /*
757  * Move g to describe the previous (possibly empty) group in xdf and return 0.
758  * If g is already at the beginning of the file, do nothing and return -1.
759  */
group_previous(xdfile_t * xdf,struct xdlgroup * g)760 XDL_INLINE(int) group_previous(xdfile_t *xdf, struct xdlgroup *g)
761 {
762 	if (g->start == 0)
763 		return -1;
764 
765 	g->end = g->start - 1;
766 	for (g->start = g->end; xdf->rchg[g->start - 1]; g->start--)
767 		;
768 
769 	return 0;
770 }
771 
772 /*
773  * If g can be slid toward the end of the file, do so, and if it bumps into a
774  * following group, expand this group to include it. Return 0 on success or -1
775  * if g cannot be slid down.
776  */
group_slide_down(xdfile_t * xdf,struct xdlgroup * g,long flags)777 static int group_slide_down(xdfile_t *xdf, struct xdlgroup *g, long flags)
778 {
779 	if (g->end < xdf->nrec &&
780 	    recs_match(xdf->recs[g->start], xdf->recs[g->end], flags)) {
781 		xdf->rchg[g->start++] = 0;
782 		xdf->rchg[g->end++] = 1;
783 
784 		while (xdf->rchg[g->end])
785 			g->end++;
786 
787 		return 0;
788 	} else {
789 		return -1;
790 	}
791 }
792 
793 /*
794  * If g can be slid toward the beginning of the file, do so, and if it bumps
795  * into a previous group, expand this group to include it. Return 0 on success
796  * or -1 if g cannot be slid up.
797  */
group_slide_up(xdfile_t * xdf,struct xdlgroup * g,long flags)798 static int group_slide_up(xdfile_t *xdf, struct xdlgroup *g, long flags)
799 {
800 	if (g->start > 0 &&
801 	    recs_match(xdf->recs[g->start - 1], xdf->recs[g->end - 1], flags)) {
802 		xdf->rchg[--g->start] = 1;
803 		xdf->rchg[--g->end] = 0;
804 
805 		while (xdf->rchg[g->start - 1])
806 			g->start--;
807 
808 		return 0;
809 	} else {
810 		return -1;
811 	}
812 }
813 
/*
 * Report a broken internal invariant and terminate the process. Called
 * from xdl_change_compact() when the group walk over the two files gets
 * out of sync.
 */
static void xdl_bug(const char *msg)
{
	fprintf(stderr, "BUG: %s\n", msg);
	exit(1);
}
819 
/*
 * Move back and forward change groups for a consistent and pretty diff
 * output. This also helps in finding joinable change groups and reducing
 * the diff size.
 *
 * xdf is the file whose change groups are being shifted; xdfo is the
 * other side, whose groups are walked in lock-step so alignments between
 * the two can be detected. Always returns 0; internal inconsistencies in
 * the lock-step walk abort the process via xdl_bug().
 */
int xdl_change_compact(xdfile_t *xdf, xdfile_t *xdfo, long flags) {
	struct xdlgroup g, go;
	long earliest_end, end_matching_other;
	long groupsize;

	group_init(xdf, &g);
	group_init(xdfo, &go);

	while (1) {
		/* If the group is empty in the to-be-compacted file, skip it: */
		if (g.end == g.start)
			goto next;

		/*
		 * Now shift the change up and then down as far as possible in
		 * each direction. If it bumps into any other changes, merge
		 * them.
		 */
		do {
			groupsize = g.end - g.start;

			/*
			 * Keep track of the last "end" index that causes this
			 * group to align with a group of changed lines in the
			 * other file. -1 indicates that we haven't found such
			 * a match yet:
			 */
			end_matching_other = -1;

			/* Shift the group backward as much as possible: */
			while (!group_slide_up(xdf, &g, flags))
				if (group_previous(xdfo, &go))
					xdl_bug("group sync broken sliding up");

			/*
			 * This is the highest the group can be shifted.
			 * Record its end index:
			 */
			earliest_end = g.end;

			if (go.end > go.start)
				end_matching_other = g.end;

			/* Now shift the group forward as far as possible: */
			while (1) {
				if (group_slide_down(xdf, &g, flags))
					break;
				if (group_next(xdfo, &go))
					xdl_bug("group sync broken sliding down");

				if (go.end > go.start)
					end_matching_other = g.end;
			}
		} while (groupsize != g.end - g.start);

		/*
		 * If the group can be shifted, then we can possibly use this
		 * freedom to produce a more intuitive diff.
		 *
		 * The group is currently shifted as far down as possible, so
		 * the heuristics below only have to handle upwards shifts.
		 */

		if (g.end == earliest_end) {
			/* no shifting was possible */
		} else if (end_matching_other != -1) {
			/*
			 * Move the possibly merged group of changes back to
			 * line up with the last group of changes from the
			 * other file that it can align with.
			 */
			while (go.end == go.start) {
				if (group_slide_up(xdf, &g, flags))
					xdl_bug("match disappeared");
				if (group_previous(xdfo, &go))
					xdl_bug("group sync broken sliding to match");
			}
		} else if (flags & XDF_INDENT_HEURISTIC) {
			/*
			 * Indent heuristic: a group of pure add/delete lines
			 * implies two splits, one between the end of the
			 * "before" context and the start of the group, and
			 * another between the end of the group and the
			 * beginning of the "after" context. Some splits are
			 * aesthetically better and some are worse. We compute
			 * a badness "score" for each split, and add the scores
			 * for the two splits to define a "score" for each
			 * position that the group can be shifted to. Then we
			 * pick the shift with the lowest score.
			 */
			long shift, best_shift = -1;
			struct split_score best_score;

			for (shift = earliest_end; shift <= g.end; shift++) {
				struct split_measurement m;
				struct split_score score = {0, 0};

				/* Score the split below and above the group. */
				measure_split(xdf, shift, &m);
				score_add_split(&m, &score);
				measure_split(xdf, shift - groupsize, &m);
				score_add_split(&m, &score);
				if (best_shift == -1 ||
				    score_cmp(&score, &best_score) <= 0) {
					best_score.effective_indent = score.effective_indent;
					best_score.penalty = score.penalty;
					best_shift = shift;
				}
			}

			while (g.end > best_shift) {
				if (group_slide_up(xdf, &g, flags))
					xdl_bug("best shift unreached");
				if (group_previous(xdfo, &go))
					xdl_bug("group sync broken sliding to blank line");
			}
		}

	next:
		/* Move past the just-processed group: */
		if (group_next(xdf, &g))
			break;
		if (group_next(xdfo, &go))
			xdl_bug("group sync broken moving to next group");
	}

	/* Both walks must reach the end of their file together. */
	if (!group_next(xdfo, &go))
		xdl_bug("group sync broken at end of file");

	return 0;
}
953 
954 
/*
 * Walk the change bitmaps from the end of both files towards the
 * beginning, collecting each maximal run of changed records into an
 * xdchange_t entry (the edit script), stored in *xscr. The rchg arrays
 * carry a zero sentinel at positions -1 and nrec (see the note above
 * struct xdlgroup), which keeps the backward scans in bounds without
 * explicit index checks. Returns 0 on success, -1 on allocation failure.
 */
int xdl_build_script(xdfenv_t *xe, xdchange_t **xscr) {
	xdchange_t *cscr = NULL, *xch;
	char *rchg1 = xe->xdf1.rchg, *rchg2 = xe->xdf2.rchg;
	long i1, i2, l1, l2;

	/*
	 * Trivial. Collects "groups" of changes and creates an edit script.
	 */
	for (i1 = xe->xdf1.nrec, i2 = xe->xdf2.nrec; i1 >= 0 || i2 >= 0; i1--, i2--)
		if (rchg1[i1 - 1] || rchg2[i2 - 1]) {
			/* Extend the group backward over all changed records. */
			for (l1 = i1; rchg1[i1 - 1]; i1--);
			for (l2 = i2; rchg2[i2 - 1]; i2--);

			/* Prepend; backward walk yields a forward-ordered list. */
			if (!(xch = xdl_add_change(cscr, i1, i2, l1 - i1, l2 - i2))) {
				xdl_free_script(cscr);
				return -1;
			}
			cscr = xch;
		}

	*xscr = cscr;

	return 0;
}
979 
980 
/*
 * Release every change record in the script list xscr.
 */
void xdl_free_script(xdchange_t *xscr) {
	while (xscr != NULL) {
		xdchange_t *next = xscr->next;

		xdl_free(xscr);
		xscr = next;
	}
}
989 
/*
 * Emit callback used when the caller supplied a hunk_func: instead of
 * producing diff text, report each hunk's boundaries (start and length in
 * both files) to xecfg->hunk_func. Returns 0 on success, or -1 if the
 * callback reports failure.
 */
static int xdl_call_hunk_func(xdfenv_t *xe, xdchange_t *xscr, xdemitcb_t *ecb,
			      xdemitconf_t const *xecfg)
{
	xdchange_t *xch = xscr;

	(void)xe;

	while (xch) {
		/* Group changes into the next hunk; xch becomes its head. */
		xdchange_t *xche = xdl_get_hunk(&xch, xecfg);

		if (!xch)
			break;
		if (xecfg->hunk_func(xch->i1, xche->i1 + xche->chg1 - xch->i1,
				     xch->i2, xche->i2 + xche->chg2 - xch->i2,
				     ecb->priv) < 0)
			return -1;
		xch = xche->next;
	}
	return 0;
}
1008 
/*
 * Set the ignore flag on every change whose lines, on both sides, consist
 * entirely of blank lines (as decided by xdl_blankline() under flags).
 */
static void xdl_mark_ignorable(xdchange_t *xscr, xdfenv_t *xe, long flags)
{
	xdchange_t *xch;

	for (xch = xscr; xch; xch = xch->next) {
		xrecord_t **rec1 = &xe->xdf1.recs[xch->i1];
		xrecord_t **rec2 = &xe->xdf2.recs[xch->i2];
		int blank_only = 1;
		long i;

		/* Stop scanning as soon as one non-blank line is found. */
		for (i = 0; blank_only && i < xch->chg1; i++)
			blank_only = xdl_blankline(rec1[i]->ptr, rec1[i]->size, flags);

		for (i = 0; blank_only && i < xch->chg2; i++)
			blank_only = xdl_blankline(rec2[i]->ptr, rec2[i]->size, flags);

		xch->ignore = blank_only;
	}
}
1029 
/*
 * Top-level diff driver: compute the diff, compact/beautify the change
 * groups on both sides, build the edit script, optionally mark
 * blank-line-only changes as ignorable, and finally emit the result
 * through either the caller's hunk callback or the standard emitter.
 * Returns 0 on success, -1 on failure (all resources released).
 */
int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
	     xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
	xdchange_t *xscr;
	xdfenv_t xe;
	/* With hunk_func set, report hunk boundaries instead of diff text. */
	emit_func_t ef = xecfg->hunk_func ? xdl_call_hunk_func : xdl_emit_diff;

	if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) {

		return -1;
	}
	if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 ||
	    xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 ||
	    xdl_build_script(&xe, &xscr) < 0) {

		xdl_free_env(&xe);
		return -1;
	}
	if (xscr) {
		if (xpp->flags & XDF_IGNORE_BLANK_LINES)
			xdl_mark_ignorable(xscr, &xe, xpp->flags);

		if (ef(&xe, xscr, ecb, xecfg) < 0) {

			xdl_free_script(xscr);
			xdl_free_env(&xe);
			return -1;
		}
		xdl_free_script(xscr);
	}
	xdl_free_env(&xe);

	return 0;
}
1063