1 /*
2 * LibXDiff by Davide Libenzi ( File Differential Library )
3 * Copyright (C) 2003 Davide Libenzi
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see
17 * <http://www.gnu.org/licenses/>.
18 *
19 * Davide Libenzi <davidel@xmailserver.org>
20 *
21 */
22
23 #include "xinclude.h"
24 #include "integer.h"
25
26
27 #define XDL_MAX_COST_MIN 256
28 #define XDL_HEUR_MIN_COST 256
29 #define XDL_LINE_MAX (long)((1UL << (CHAR_BIT * sizeof(long) - 1)) - 1)
30 #define XDL_SNAKE_CNT 20
31 #define XDL_K_HEUR 4
32
33 /** Declare a function as always inlined. */
34 #if defined(_MSC_VER)
35 # define XDL_INLINE(type) static __inline type
36 #elif defined(__GNUC__)
37 # define XDL_INLINE(type) static __inline__ type
38 #else
39 # define XDL_INLINE(type) static type
40 #endif
41
/*
 * Result of splitting a diff "box" in xdl_split(): the coordinates of the
 * split point and whether each resulting sub-box must still be solved with
 * the full (minimal) algorithm.
 */
typedef struct s_xdpsplit {
	long i1, i2;		/* split point: record index into file 1 / file 2 */
	int min_lo, min_hi;	/* need_min flag for the lower / upper sub-box */
} xdpsplit_t;




static long xdl_split(unsigned long const *ha1, long off1, long lim1,
		      unsigned long const *ha2, long off2, long lim2,
		      long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
		      xdalgoenv_t *xenv);
static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2);
55
56
57
58
59
60 /*
61 * See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers.
62 * Basically considers a "box" (off1, off2, lim1, lim2) and scan from both
63 * the forward diagonal starting from (off1, off2) and the backward diagonal
64 * starting from (lim1, lim2). If the K values on the same diagonal crosses
 * returns the furthest point of reach. We might end up with two expensive
 * cases using this algorithm in full, so a little bit of heuristic is needed
 * to cut the search and to return a suboptimal point.
68 */
static long xdl_split(unsigned long const *ha1, long off1, long lim1,
		      unsigned long const *ha2, long off2, long lim2,
		      long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
		      xdalgoenv_t *xenv) {
	/*
	 * Diagonals are indexed by k = i1 - i2; kvdf[k]/kvdb[k] hold the
	 * furthest-reaching i1 on diagonal k for the forward/backward search.
	 * Returns the edit cost (number of outer iterations) at which the
	 * split point stored in *spl was found.
	 */
	long dmin = off1 - lim2, dmax = lim1 - off2;
	long fmid = off1 - off2, bmid = lim1 - lim2;
	long odd = (fmid - bmid) & 1;
	long fmin = fmid, fmax = fmid;
	long bmin = bmid, bmax = bmid;
	long ec, d, i1, i2, prev1, best, dd, v, k;

	/*
	 * Set initial diagonal values for both forward and backward path.
	 */
	kvdf[fmid] = off1;
	kvdb[bmid] = lim1;

	for (ec = 1;; ec++) {
		int got_snake = 0;

		/*
		 * We need to extend the diagonal "domain" by one. If the next
		 * values exits the box boundaries we need to change it in the
		 * opposite direction because (max - min) must be a power of two.
		 * Also we initialize the external K value to -1 so that we can
		 * avoid extra conditions in the check inside the core loop.
		 */
		if (fmin > dmin)
			kvdf[--fmin - 1] = -1;
		else
			++fmin;
		if (fmax < dmax)
			kvdf[++fmax + 1] = -1;
		else
			--fmax;

		/* Extend every active forward diagonal, following snakes. */
		for (d = fmax; d >= fmin; d -= 2) {
			if (kvdf[d - 1] >= kvdf[d + 1])
				i1 = kvdf[d - 1] + 1;
			else
				i1 = kvdf[d + 1];
			prev1 = i1;
			i2 = i1 - d;
			/* Slide down the snake of matching records. */
			for (; i1 < lim1 && i2 < lim2 && ha1[i1] == ha2[i2]; i1++, i2++);
			if (i1 - prev1 > xenv->snake_cnt)
				got_snake = 1;
			kvdf[d] = i1;
			/* Forward and backward paths overlap: split found. */
			if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) {
				spl->i1 = i1;
				spl->i2 = i2;
				spl->min_lo = spl->min_hi = 1;
				return ec;
			}
		}

		/*
		 * We need to extend the diagonal "domain" by one. If the next
		 * values exits the box boundaries we need to change it in the
		 * opposite direction because (max - min) must be a power of two.
		 * Also we initialize the external K value to XDL_LINE_MAX so
		 * that we can avoid extra conditions in the check inside the
		 * core loop.
		 */
		if (bmin > dmin)
			kvdb[--bmin - 1] = XDL_LINE_MAX;
		else
			++bmin;
		if (bmax < dmax)
			kvdb[++bmax + 1] = XDL_LINE_MAX;
		else
			--bmax;

		/* Extend every active backward diagonal, following snakes. */
		for (d = bmax; d >= bmin; d -= 2) {
			if (kvdb[d - 1] < kvdb[d + 1])
				i1 = kvdb[d - 1];
			else
				i1 = kvdb[d + 1] - 1;
			prev1 = i1;
			i2 = i1 - d;
			/* Slide up the snake of matching records. */
			for (; i1 > off1 && i2 > off2 && ha1[i1 - 1] == ha2[i2 - 1]; i1--, i2--);
			if (prev1 - i1 > xenv->snake_cnt)
				got_snake = 1;
			kvdb[d] = i1;
			if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) {
				spl->i1 = i1;
				spl->i2 = i2;
				spl->min_lo = spl->min_hi = 1;
				return ec;
			}
		}

		/* Minimal diff requested: no heuristic cutoffs allowed. */
		if (need_min)
			continue;

		/*
		 * If the edit cost is above the heuristic trigger and if
		 * we got a good snake, we sample current diagonals to see
		 * if some of them have reached an "interesting" path. Our
		 * measure is a function of the distance from the diagonal
		 * corner (i1 + i2) penalized with the distance from the
		 * mid diagonal itself. If this value is above the current
		 * edit cost times a magic factor (XDL_K_HEUR) we consider
		 * it interesting.
		 */
		if (got_snake && ec > xenv->heur_min) {
			for (best = 0, d = fmax; d >= fmin; d -= 2) {
				dd = d > fmid ? d - fmid: fmid - d;
				i1 = kvdf[d];
				i2 = i1 - d;
				v = (i1 - off1) + (i2 - off2) - dd;

				if (v > XDL_K_HEUR * ec && v > best &&
				    off1 + xenv->snake_cnt <= i1 && i1 < lim1 &&
				    off2 + xenv->snake_cnt <= i2 && i2 < lim2) {
					/* Only accept if preceded by a full snake. */
					for (k = 1; ha1[i1 - k] == ha2[i2 - k]; k++)
						if (k == xenv->snake_cnt) {
							best = v;
							spl->i1 = i1;
							spl->i2 = i2;
							break;
						}
				}
			}
			if (best > 0) {
				spl->min_lo = 1;
				spl->min_hi = 0;
				return ec;
			}

			for (best = 0, d = bmax; d >= bmin; d -= 2) {
				dd = d > bmid ? d - bmid: bmid - d;
				i1 = kvdb[d];
				i2 = i1 - d;
				v = (lim1 - i1) + (lim2 - i2) - dd;

				if (v > XDL_K_HEUR * ec && v > best &&
				    off1 < i1 && i1 <= lim1 - xenv->snake_cnt &&
				    off2 < i2 && i2 <= lim2 - xenv->snake_cnt) {
					/* Only accept if followed by a full snake. */
					for (k = 0; ha1[i1 + k] == ha2[i2 + k]; k++)
						if (k == xenv->snake_cnt - 1) {
							best = v;
							spl->i1 = i1;
							spl->i2 = i2;
							break;
						}
				}
			}
			if (best > 0) {
				spl->min_lo = 0;
				spl->min_hi = 1;
				return ec;
			}
		}

		/*
		 * Enough is enough. We spent too much time here and now we
		 * collect the furthest reaching path using the (i1 + i2)
		 * measure.
		 */
		if (ec >= xenv->mxcost) {
			long fbest, fbest1, bbest, bbest1;

			fbest = fbest1 = -1;
			for (d = fmax; d >= fmin; d -= 2) {
				/* Clamp the reach back inside the box. */
				i1 = XDL_MIN(kvdf[d], lim1);
				i2 = i1 - d;
				if (lim2 < i2)
					i1 = lim2 + d, i2 = lim2;
				if (fbest < i1 + i2) {
					fbest = i1 + i2;
					fbest1 = i1;
				}
			}

			bbest = bbest1 = XDL_LINE_MAX;
			for (d = bmax; d >= bmin; d -= 2) {
				/* Clamp the reach back inside the box. */
				i1 = XDL_MAX(off1, kvdb[d]);
				i2 = i1 - d;
				if (i2 < off2)
					i1 = off2 + d, i2 = off2;
				if (i1 + i2 < bbest) {
					bbest = i1 + i2;
					bbest1 = i1;
				}
			}

			/* Pick whichever direction progressed further. */
			if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) {
				spl->i1 = fbest1;
				spl->i2 = fbest - fbest1;
				spl->min_lo = 1;
				spl->min_hi = 0;
			} else {
				spl->i1 = bbest1;
				spl->i2 = bbest - bbest1;
				spl->min_lo = 0;
				spl->min_hi = 1;
			}
			return ec;
		}
	}
}
268
269
270 /*
271 * Rule: "Divide et Impera". Recursively split the box in sub-boxes by calling
272 * the box splitting function. Note that the real job (marking changed lines)
273 * is done in the two boundary reaching checks.
274 */
int xdl_recs_cmp(diffdata_t *dd1, long off1, long lim1,
		 diffdata_t *dd2, long off2, long lim2,
		 long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv) {
	unsigned long const *ha1 = dd1->ha, *ha2 = dd2->ha;

	/* Trim matching records off the leading (SW) edge of the box. */
	while (off1 < lim1 && off2 < lim2 && ha1[off1] == ha2[off2]) {
		off1++;
		off2++;
	}
	/* Trim matching records off the trailing (NE) edge of the box. */
	while (off1 < lim1 && off2 < lim2 && ha1[lim1 - 1] == ha2[lim2 - 1]) {
		lim1--;
		lim2--;
	}

	/*
	 * If one dimension is empty, all remaining records on the other
	 * side are necessarily changed.
	 */
	if (off1 == lim1) {
		char *rchg2 = dd2->rchg;
		long *rindex2 = dd2->rindex;

		while (off2 < lim2)
			rchg2[rindex2[off2++]] = 1;
		return 0;
	}
	if (off2 == lim2) {
		char *rchg1 = dd1->rchg;
		long *rindex1 = dd1->rindex;

		while (off1 < lim1)
			rchg1[rindex1[off1++]] = 1;
		return 0;
	}

	{
		xdpsplit_t spl;
		spl.i1 = spl.i2 = 0;

		/* Divide: locate a split point for the remaining box ... */
		if (xdl_split(ha1, off1, lim1, ha2, off2, lim2, kvdf, kvdb,
			      need_min, &spl, xenv) < 0)
			return -1;

		/* ... et impera: recurse on both resulting sub-boxes. */
		if (xdl_recs_cmp(dd1, off1, spl.i1, dd2, off2, spl.i2,
				 kvdf, kvdb, spl.min_lo, xenv) < 0 ||
		    xdl_recs_cmp(dd1, spl.i1, lim1, dd2, spl.i2, lim2,
				 kvdf, kvdb, spl.min_hi, xenv) < 0)
			return -1;
	}

	return 0;
}
329
330
/*
 * Top-level diff driver: dispatch to the patience/histogram variants if
 * requested, otherwise prepare the record hashes and run the Myers
 * algorithm over them. Returns 0 on success, -1 on failure (preparation,
 * overflow check, or allocation error).
 */
int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
		xdfenv_t *xe) {
	size_t ndiags, allocsize;
	long *kvd, *kvdf, *kvdb;
	xdalgoenv_t xenv;
	diffdata_t dd1, dd2;

	/* Alternative algorithms bypass the Myers machinery entirely. */
	if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF)
		return xdl_do_patience_diff(mf1, mf2, xpp, xe);

	if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF)
		return xdl_do_histogram_diff(mf1, mf2, xpp, xe);

	if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) {

		return -1;
	}

	/*
	 * Allocate and setup K vectors to be used by the differential
	 * algorithm: one stores the forward path, one the backward path.
	 * ndiags = nreff1 + nreff2 + 3; the buffer holds 2 * ndiags + 2
	 * longs, computed with overflow-checked arithmetic (the
	 * GIT_ERROR_CHECK_* macros bail out with -1 on overflow).
	 */
	GIT_ERROR_CHECK_ALLOC_ADD3(&ndiags, xe->xdf1.nreff, xe->xdf2.nreff, 3);
	GIT_ERROR_CHECK_ALLOC_MULTIPLY(&allocsize, ndiags, 2);
	GIT_ERROR_CHECK_ALLOC_ADD(&allocsize, allocsize, 2);
	GIT_ERROR_CHECK_ALLOC_MULTIPLY(&allocsize, allocsize, sizeof(long));

	if (!(kvd = (long *) xdl_malloc(allocsize))) {
		xdl_free_env(xe);
		return -1;
	}
	/*
	 * Bias the two vectors so they can be indexed by the (possibly
	 * negative) diagonal number k = i1 - i2 directly.
	 */
	kvdf = kvd;
	kvdb = kvdf + ndiags;
	kvdf += xe->xdf2.nreff + 1;
	kvdb += xe->xdf2.nreff + 1;

	/* Heuristic parameters; mxcost grows with the problem size. */
	xenv.mxcost = xdl_bogosqrt(ndiags);
	if (xenv.mxcost < XDL_MAX_COST_MIN)
		xenv.mxcost = XDL_MAX_COST_MIN;
	xenv.snake_cnt = XDL_SNAKE_CNT;
	xenv.heur_min = XDL_HEUR_MIN_COST;

	dd1.nrec = xe->xdf1.nreff;
	dd1.ha = xe->xdf1.ha;
	dd1.rchg = xe->xdf1.rchg;
	dd1.rindex = xe->xdf1.rindex;
	dd2.nrec = xe->xdf2.nreff;
	dd2.ha = xe->xdf2.ha;
	dd2.rchg = xe->xdf2.rchg;
	dd2.rindex = xe->xdf2.rindex;

	if (xdl_recs_cmp(&dd1, 0, dd1.nrec, &dd2, 0, dd2.nrec,
			 kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0, &xenv) < 0) {

		xdl_free(kvd);
		xdl_free_env(xe);
		return -1;
	}

	xdl_free(kvd);

	return 0;
}
394
395
/*
 * Allocate a new change record covering chg1 lines at i1 / chg2 lines at
 * i2 and prepend it to the script list xscr. Returns the new list head,
 * or NULL on allocation failure (the existing list is left untouched).
 */
static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2) {
	xdchange_t *xch = (xdchange_t *) xdl_malloc(sizeof(xdchange_t));

	if (xch == NULL)
		return NULL;

	xch->next = xscr;
	xch->i1 = i1;
	xch->i2 = i2;
	xch->chg1 = chg1;
	xch->chg2 = chg2;
	xch->ignore = 0;

	return xch;
}
411
412
/*
 * Two records match when their hashes agree and an exact (flag-aware)
 * content comparison confirms it; the cheap hash test screens out most
 * mismatches before the byte comparison.
 */
static int recs_match(xrecord_t *rec1, xrecord_t *rec2, long flags)
{
	if (rec1->ha != rec2->ha)
		return 0;

	return xdl_recmatch(rec1->ptr, rec1->size,
			    rec2->ptr, rec2->size, flags);
}
420
421 /*
422 * If a line is indented more than this, get_indent() just returns this value.
423 * This avoids having to do absurd amounts of work for data that are not
424 * human-readable text, and also ensures that the output of get_indent fits within
425 * an int.
426 */
427 #define MAX_INDENT 200
428
429 /*
430 * Return the amount of indentation of the specified line, treating TAB as 8
431 * columns. Return -1 if line is empty or contains only whitespace. Clamp the
432 * output value at MAX_INDENT.
433 */
/*
 * Return the indentation of the given line in columns, counting a TAB as
 * advancing to the next multiple of 8. Returns -1 for a line that is empty
 * or all whitespace, and clamps the result at MAX_INDENT.
 */
static int get_indent(xrecord_t *rec)
{
	int width = 0;
	long pos;

	for (pos = 0; pos < rec->size; pos++) {
		char ch = rec->ptr[pos];

		if (!XDL_ISSPACE(ch))
			return width;

		if (ch == ' ')
			width++;
		else if (ch == '\t')
			width += 8 - width % 8;
		/* other whitespace characters contribute no columns */

		if (width >= MAX_INDENT)
			return MAX_INDENT;
	}

	/* Never saw a non-whitespace character: blank line. */
	return -1;
}
457
458 /*
459 * If more than this number of consecutive blank rows are found, just return this
460 * value. This avoids requiring O(N^2) work for pathological cases, and also
461 * ensures that the output of score_split fits in an int.
462 */
463 #define MAX_BLANKS 20
464
465 /* Characteristics measured about a hypothetical split position. */
struct split_measurement {
	/*
	 * Is the split at the end of the file (aside from any blank lines)?
	 */
	int end_of_file;

	/*
	 * How much is the line immediately following the split indented (or -1 if
	 * the line is blank)? Indent values are in columns, as computed by
	 * get_indent().
	 */
	int indent;

	/*
	 * How many consecutive lines above the split are blank?
	 * Capped at MAX_BLANKS by measure_split().
	 */
	int pre_blank;

	/*
	 * How much is the nearest non-blank line above the split indented (or -1
	 * if there is no such line)?
	 */
	int pre_indent;

	/*
	 * How many lines after the line following the split are blank?
	 * Capped at MAX_BLANKS by measure_split().
	 */
	int post_blank;

	/*
	 * How much is the nearest non-blank line after the line following the
	 * split indented (or -1 if there is no such line)?
	 */
	int post_indent;
};
500
/*
 * Badness score for a hypothetical split position; the two components are
 * combined by score_cmp() when comparing candidate positions.
 */
struct split_score {
	/* The effective indent of this split (smaller is preferred). */
	int effective_indent;

	/* Penalty for this split (smaller is preferred). */
	int penalty;
};
508
509 /*
510 * Fill m with information about a hypothetical split of xdf above line split.
511 */
measure_split(const xdfile_t * xdf,long split,struct split_measurement * m)512 static void measure_split(const xdfile_t *xdf, long split,
513 struct split_measurement *m)
514 {
515 long i;
516
517 if (split >= xdf->nrec) {
518 m->end_of_file = 1;
519 m->indent = -1;
520 } else {
521 m->end_of_file = 0;
522 m->indent = get_indent(xdf->recs[split]);
523 }
524
525 m->pre_blank = 0;
526 m->pre_indent = -1;
527 for (i = split - 1; i >= 0; i--) {
528 m->pre_indent = get_indent(xdf->recs[i]);
529 if (m->pre_indent != -1)
530 break;
531 m->pre_blank += 1;
532 if (m->pre_blank == MAX_BLANKS) {
533 m->pre_indent = 0;
534 break;
535 }
536 }
537
538 m->post_blank = 0;
539 m->post_indent = -1;
540 for (i = split + 1; i < xdf->nrec; i++) {
541 m->post_indent = get_indent(xdf->recs[i]);
542 if (m->post_indent != -1)
543 break;
544 m->post_blank += 1;
545 if (m->post_blank == MAX_BLANKS) {
546 m->post_indent = 0;
547 break;
548 }
549 }
550 }
551
552 /*
553 * The empirically-determined weight factors used by score_split() below.
554 * Larger values means that the position is a less favorable place to split.
555 *
556 * Note that scores are only ever compared against each other, so multiplying
557 * all of these weight/penalty values by the same factor wouldn't change the
558 * heuristic's behavior. Still, we need to set that arbitrary scale *somehow*.
559 * In practice, these numbers are chosen to be large enough that they can be
560 * adjusted relative to each other with sufficient precision despite using
561 * integer math.
562 */
563
564 /* Penalty if there are no non-blank lines before the split */
565 #define START_OF_FILE_PENALTY 1
566
567 /* Penalty if there are no non-blank lines after the split */
568 #define END_OF_FILE_PENALTY 21
569
570 /* Multiplier for the number of blank lines around the split */
571 #define TOTAL_BLANK_WEIGHT (-30)
572
573 /* Multiplier for the number of blank lines after the split */
574 #define POST_BLANK_WEIGHT 6
575
576 /*
577 * Penalties applied if the line is indented more than its predecessor
578 */
579 #define RELATIVE_INDENT_PENALTY (-4)
580 #define RELATIVE_INDENT_WITH_BLANK_PENALTY 10
581
582 /*
583 * Penalties applied if the line is indented less than both its predecessor and
584 * its successor
585 */
586 #define RELATIVE_OUTDENT_PENALTY 24
587 #define RELATIVE_OUTDENT_WITH_BLANK_PENALTY 17
588
589 /*
590 * Penalties applied if the line is indented less than its predecessor but not
591 * less than its successor
592 */
593 #define RELATIVE_DEDENT_PENALTY 23
594 #define RELATIVE_DEDENT_WITH_BLANK_PENALTY 17
595
596 /*
597 * We only consider whether the sum of the effective indents for splits are
598 * less than (-1), equal to (0), or greater than (+1) each other. The resulting
599 * value is multiplied by the following weight and combined with the penalty to
600 * determine the better of two scores.
601 */
602 #define INDENT_WEIGHT 60
603
604 /*
605 * Compute a badness score for the hypothetical split whose measurements are
606 * stored in m. The weight factors were determined empirically using the tools and
607 * corpus described in
608 *
609 * https://github.com/mhagger/diff-slider-tools
610 *
611 * Also see that project if you want to improve the weights based on, for example,
612 * a larger or more diverse corpus.
613 */
score_add_split(const struct split_measurement * m,struct split_score * s)614 static void score_add_split(const struct split_measurement *m, struct split_score *s)
615 {
616 /*
617 * A place to accumulate penalty factors (positive makes this index more
618 * favored):
619 */
620 int post_blank, total_blank, indent, any_blanks;
621
622 if (m->pre_indent == -1 && m->pre_blank == 0)
623 s->penalty += START_OF_FILE_PENALTY;
624
625 if (m->end_of_file)
626 s->penalty += END_OF_FILE_PENALTY;
627
628 /*
629 * Set post_blank to the number of blank lines following the split,
630 * including the line immediately after the split:
631 */
632 post_blank = (m->indent == -1) ? 1 + m->post_blank : 0;
633 total_blank = m->pre_blank + post_blank;
634
635 /* Penalties based on nearby blank lines: */
636 s->penalty += TOTAL_BLANK_WEIGHT * total_blank;
637 s->penalty += POST_BLANK_WEIGHT * post_blank;
638
639 if (m->indent != -1)
640 indent = m->indent;
641 else
642 indent = m->post_indent;
643
644 any_blanks = (total_blank != 0);
645
646 /* Note that the effective indent is -1 at the end of the file: */
647 s->effective_indent += indent;
648
649 if (indent == -1) {
650 /* No additional adjustments needed. */
651 } else if (m->pre_indent == -1) {
652 /* No additional adjustments needed. */
653 } else if (indent > m->pre_indent) {
654 /*
655 * The line is indented more than its predecessor.
656 */
657 s->penalty += any_blanks ?
658 RELATIVE_INDENT_WITH_BLANK_PENALTY :
659 RELATIVE_INDENT_PENALTY;
660 } else if (indent == m->pre_indent) {
661 /*
662 * The line has the same indentation level as its predecessor.
663 * No additional adjustments needed.
664 */
665 } else {
666 /*
667 * The line is indented less than its predecessor. It could be
668 * the block terminator of the previous block, but it could
669 * also be the start of a new block (e.g., an "else" block, or
670 * maybe the previous block didn't have a block terminator).
671 * Try to distinguish those cases based on what comes next:
672 */
673 if (m->post_indent != -1 && m->post_indent > indent) {
674 /*
675 * The following line is indented more. So it is likely
676 * that this line is the start of a block.
677 */
678 s->penalty += any_blanks ?
679 RELATIVE_OUTDENT_WITH_BLANK_PENALTY :
680 RELATIVE_OUTDENT_PENALTY;
681 } else {
682 /*
683 * That was probably the end of a block.
684 */
685 s->penalty += any_blanks ?
686 RELATIVE_DEDENT_WITH_BLANK_PENALTY :
687 RELATIVE_DEDENT_PENALTY;
688 }
689 }
690 }
691
score_cmp(struct split_score * s1,struct split_score * s2)692 static int score_cmp(struct split_score *s1, struct split_score *s2)
693 {
694 /* -1 if s1.effective_indent < s2->effective_indent, etc. */
695 int cmp_indents = ((s1->effective_indent > s2->effective_indent) -
696 (s1->effective_indent < s2->effective_indent));
697
698 return INDENT_WEIGHT * cmp_indents + (s1->penalty - s2->penalty);
699 }
700
701 /*
702 * Represent a group of changed lines in an xdfile_t (i.e., a contiguous group
703 * of lines that was inserted or deleted from the corresponding version of the
704 * file). We consider there to be such a group at the beginning of the file, at
705 * the end of the file, and between any two unchanged lines, though most such
706 * groups will usually be empty.
707 *
708 * If the first line in a group is equal to the line following the group, then
709 * the group can be slid down. Similarly, if the last line in a group is equal
710 * to the line preceding the group, then the group can be slid up. See
711 * group_slide_down() and group_slide_up().
712 *
713 * Note that loops that are testing for changed lines in xdf->rchg do not need
714 * index bounding since the array is prepared with a zero at position -1 and N.
715 */
/* Half-open range [start, end) of changed lines in an xdfile_t. */
struct xdlgroup {
	/*
	 * The index of the first changed line in the group, or the index of
	 * the unchanged line above which the (empty) group is located.
	 */
	long start;

	/*
	 * The index of the first unchanged line after the group. For an empty
	 * group, end is equal to start.
	 */
	long end;
};
729
730 /*
731 * Initialize g to point at the first group in xdf.
732 */
/*
 * Initialize g to the first group in xdf: the (possibly empty) run of
 * changed lines starting at line 0.
 */
static void group_init(xdfile_t *xdf, struct xdlgroup *g)
{
	long end = 0;

	while (xdf->rchg[end])
		end++;

	g->start = 0;
	g->end = end;
}
739
740 /*
741 * Move g to describe the next (possibly empty) group in xdf and return 0. If g
742 * is already at the end of the file, do nothing and return -1.
743 */
group_next(xdfile_t * xdf,struct xdlgroup * g)744 XDL_INLINE(int) group_next(xdfile_t *xdf, struct xdlgroup *g)
745 {
746 if (g->end == xdf->nrec)
747 return -1;
748
749 g->start = g->end + 1;
750 for (g->end = g->start; xdf->rchg[g->end]; g->end++)
751 ;
752
753 return 0;
754 }
755
756 /*
757 * Move g to describe the previous (possibly empty) group in xdf and return 0.
758 * If g is already at the beginning of the file, do nothing and return -1.
759 */
group_previous(xdfile_t * xdf,struct xdlgroup * g)760 XDL_INLINE(int) group_previous(xdfile_t *xdf, struct xdlgroup *g)
761 {
762 if (g->start == 0)
763 return -1;
764
765 g->end = g->start - 1;
766 for (g->start = g->end; xdf->rchg[g->start - 1]; g->start--)
767 ;
768
769 return 0;
770 }
771
772 /*
773 * If g can be slid toward the end of the file, do so, and if it bumps into a
774 * following group, expand this group to include it. Return 0 on success or -1
775 * if g cannot be slid down.
776 */
/*
 * Slide g one line toward the end of the file if its first line matches
 * the line just past it; if the slide bumps into a following group, merge
 * that group in. Returns 0 on success, -1 if g cannot slide down.
 */
static int group_slide_down(xdfile_t *xdf, struct xdlgroup *g, long flags)
{
	if (g->end >= xdf->nrec ||
	    !recs_match(xdf->recs[g->start], xdf->recs[g->end], flags))
		return -1;

	/* Move the boundary: first line leaves, the line past end joins. */
	xdf->rchg[g->start++] = 0;
	xdf->rchg[g->end++] = 1;

	/* Absorb any group we slid into. */
	while (xdf->rchg[g->end])
		g->end++;

	return 0;
}
792
793 /*
794 * If g can be slid toward the beginning of the file, do so, and if it bumps
795 * into a previous group, expand this group to include it. Return 0 on success
796 * or -1 if g cannot be slid up.
797 */
/*
 * Slide g one line toward the beginning of the file if its last line
 * matches the line just before it; if the slide bumps into a previous
 * group, merge that group in. Returns 0 on success, -1 if g cannot slide up.
 */
static int group_slide_up(xdfile_t *xdf, struct xdlgroup *g, long flags)
{
	if (g->start <= 0 ||
	    !recs_match(xdf->recs[g->start - 1], xdf->recs[g->end - 1], flags))
		return -1;

	/* Move the boundary: the line above joins, the last line leaves. */
	xdf->rchg[--g->start] = 1;
	xdf->rchg[--g->end] = 0;

	/* Absorb any group we slid into. */
	while (xdf->rchg[g->start - 1])
		g->start--;

	return 0;
}
813
/*
 * Report an internal inconsistency and abort the process. This is only
 * reached when the group-synchronization invariants of
 * xdl_change_compact() are violated, i.e. a programming error, never bad
 * user input.
 */
static void xdl_bug(const char *msg)
{
	fprintf(stderr, "BUG: %s\n", msg);
	exit(1);
}
819
820 /*
821 * Move back and forward change groups for a consistent and pretty diff output.
822 * This also helps in finding joinable change groups and reducing the diff
823 * size.
824 */
/*
 * Compact the change groups of xdf (the file being processed) while
 * keeping the group cursor of xdfo (the other file) in lock-step, so that
 * the two sides of the diff stay synchronized.
 */
int xdl_change_compact(xdfile_t *xdf, xdfile_t *xdfo, long flags) {
	struct xdlgroup g, go;
	long earliest_end, end_matching_other;
	long groupsize;

	group_init(xdf, &g);
	group_init(xdfo, &go);

	while (1) {
		/* If the group is empty in the to-be-compacted file, skip it: */
		if (g.end == g.start)
			goto next;

		/*
		 * Now shift the change up and then down as far as possible in
		 * each direction. If it bumps into any other changes, merge
		 * them. Repeat until the group size stops growing.
		 */
		do {
			groupsize = g.end - g.start;

			/*
			 * Keep track of the last "end" index that causes this
			 * group to align with a group of changed lines in the
			 * other file. -1 indicates that we haven't found such
			 * a match yet:
			 */
			end_matching_other = -1;

			/* Shift the group backward as much as possible: */
			while (!group_slide_up(xdf, &g, flags))
				if (group_previous(xdfo, &go))
					xdl_bug("group sync broken sliding up");

			/*
			 * This is the highest that this group can be shifted.
			 * Record its end index:
			 */
			earliest_end = g.end;

			if (go.end > go.start)
				end_matching_other = g.end;

			/* Now shift the group forward as far as possible: */
			while (1) {
				if (group_slide_down(xdf, &g, flags))
					break;
				if (group_next(xdfo, &go))
					xdl_bug("group sync broken sliding down");

				if (go.end > go.start)
					end_matching_other = g.end;
			}
		} while (groupsize != g.end - g.start);

		/*
		 * If the group can be shifted, then we can possibly use this
		 * freedom to produce a more intuitive diff.
		 *
		 * The group is currently shifted as far down as possible, so
		 * the heuristics below only have to handle upwards shifts.
		 */

		if (g.end == earliest_end) {
			/* no shifting was possible */
		} else if (end_matching_other != -1) {
			/*
			 * Move the possibly merged group of changes back to
			 * line up with the last group of changes from the
			 * other file that it can align with.
			 */
			while (go.end == go.start) {
				if (group_slide_up(xdf, &g, flags))
					xdl_bug("match disappeared");
				if (group_previous(xdfo, &go))
					xdl_bug("group sync broken sliding to match");
			}
		} else if (flags & XDF_INDENT_HEURISTIC) {
			/*
			 * Indent heuristic: a group of pure add/delete lines
			 * implies two splits, one between the end of the
			 * "before" context and the start of the group, and
			 * another between the end of the group and the
			 * beginning of the "after" context. Some splits are
			 * aesthetically better and some are worse. We compute
			 * a badness "score" for each split, and add the
			 * scores for the two splits to define a "score" for
			 * each position that the group can be shifted to.
			 * Then we pick the shift with the lowest score.
			 */
			long shift, best_shift = -1;
			struct split_score best_score;

			/* Try every end position from highest to lowest: */
			for (shift = earliest_end; shift <= g.end; shift++) {
				struct split_measurement m;
				struct split_score score = {0, 0};

				/* Score the split below and above the group: */
				measure_split(xdf, shift, &m);
				score_add_split(&m, &score);
				measure_split(xdf, shift - groupsize, &m);
				score_add_split(&m, &score);
				/* <= keeps the lowest (i.e. latest-scanned) tie. */
				if (best_shift == -1 ||
				    score_cmp(&score, &best_score) <= 0) {
					best_score.effective_indent = score.effective_indent;
					best_score.penalty = score.penalty;
					best_shift = shift;
				}
			}

			/* Slide back up to the winning position: */
			while (g.end > best_shift) {
				if (group_slide_up(xdf, &g, flags))
					xdl_bug("best shift unreached");
				if (group_previous(xdfo, &go))
					xdl_bug("group sync broken sliding to blank line");
			}
		}

	next:
		/* Move past the just-processed group: */
		if (group_next(xdf, &g))
			break;
		if (group_next(xdfo, &go))
			xdl_bug("group sync broken moving to next group");
	}

	/* Both cursors must reach the end of their files together. */
	if (!group_next(xdfo, &go))
		xdl_bug("group sync broken at end of file");

	return 0;
}
953
954
/*
 * Build the edit script (*xscr) from the rchg change flags in xe: scan
 * both files from the end toward the beginning, and emit one xdchange_t
 * per contiguous run of changed lines. Returns 0 on success, -1 on
 * allocation failure (any partial script is freed).
 */
int xdl_build_script(xdfenv_t *xe, xdchange_t **xscr) {
	xdchange_t *cscr = NULL, *xch;
	char *rchg1 = xe->xdf1.rchg, *rchg2 = xe->xdf2.rchg;
	long i1, i2, l1, l2;

	/*
	 * Trivial. Collects "groups" of changes and creates an edit script.
	 * The rchg arrays are prepared with a zero sentinel at position -1
	 * (see the note above struct xdlgroup), so the [i - 1] look-behind
	 * indexing below never runs off the front of the arrays.
	 */
	for (i1 = xe->xdf1.nrec, i2 = xe->xdf2.nrec; i1 >= 0 || i2 >= 0; i1--, i2--)
		if (rchg1[i1 - 1] || rchg2[i2 - 1]) {
			/* Walk back over the whole changed run on each side. */
			for (l1 = i1; rchg1[i1 - 1]; i1--);
			for (l2 = i2; rchg2[i2 - 1]; i2--);

			if (!(xch = xdl_add_change(cscr, i1, i2, l1 - i1, l2 - i2))) {
				xdl_free_script(cscr);
				return -1;
			}
			cscr = xch;
		}

	*xscr = cscr;

	return 0;
}
979
980
/* Release every change record in the script list xscr. */
void xdl_free_script(xdchange_t *xscr) {
	while (xscr != NULL) {
		xdchange_t *next = xscr->next;

		xdl_free(xscr);
		xscr = next;
	}
}
989
/*
 * Emit-function used when the caller supplied a hunk callback instead of
 * a textual diff emitter: group the change script into hunks via
 * xdl_get_hunk() and invoke xecfg->hunk_func once per hunk with the hunk's
 * line ranges. Returns 0 on success, or -1 if the callback reports an
 * error.
 */
static int xdl_call_hunk_func(xdfenv_t *xe, xdchange_t *xscr, xdemitcb_t *ecb,
			      xdemitconf_t const *xecfg)
{
	xdchange_t *xch, *xche;

	/* The environment is unused here; it is part of the emit_func_t
	 * signature shared with xdl_emit_diff. */
	(void)xe;

	for (xch = xscr; xch; xch = xche->next) {
		/* xdl_get_hunk advances xch to the hunk start and returns
		 * the last change in the hunk (NULL xch means no hunk). */
		xche = xdl_get_hunk(&xch, xecfg);
		if (!xch)
			break;
		if (xecfg->hunk_func(xch->i1, xche->i1 + xche->chg1 - xch->i1,
				     xch->i2, xche->i2 + xche->chg2 - xch->i2,
				     ecb->priv) < 0)
			return -1;
	}
	return 0;
}
1008
/*
 * Mark each change in the script as ignorable when every record it touches,
 * on both sides, is a blank line according to xdl_blankline() and flags.
 */
static void xdl_mark_ignorable(xdchange_t *xscr, xdfenv_t *xe, long flags)
{
	xdchange_t *xch;

	for (xch = xscr; xch; xch = xch->next) {
		xrecord_t **rec;
		long i;
		int ignore = 1;

		/* Side 1: stop scanning as soon as a non-blank line appears. */
		rec = &xe->xdf1.recs[xch->i1];
		for (i = 0; ignore && i < xch->chg1; i++)
			ignore = xdl_blankline(rec[i]->ptr, rec[i]->size, flags);

		/* Side 2: likewise. */
		rec = &xe->xdf2.recs[xch->i2];
		for (i = 0; ignore && i < xch->chg2; i++)
			ignore = xdl_blankline(rec[i]->ptr, rec[i]->size, flags);

		xch->ignore = ignore;
	}
}
1029
/*
 * Public diff entry point: compute the diff of mf1 and mf2, compact the
 * change groups on both sides, build the edit script and feed it to the
 * configured emitter (or the caller's hunk callback). Returns 0 on
 * success, -1 on error; all intermediate state is released on every path.
 */
int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
	     xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
	xdchange_t *xscr;
	xdfenv_t xe;
	/* A hunk callback overrides the normal textual emitter. */
	emit_func_t ef = xecfg->hunk_func ? xdl_call_hunk_func : xdl_emit_diff;

	if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) {

		return -1;
	}
	/* Compact each side against the other before building the script. */
	if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 ||
	    xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 ||
	    xdl_build_script(&xe, &xscr) < 0) {

		xdl_free_env(&xe);
		return -1;
	}
	if (xscr) {
		if (xpp->flags & XDF_IGNORE_BLANK_LINES)
			xdl_mark_ignorable(xscr, &xe, xpp->flags);

		if (ef(&xe, xscr, ecb, xecfg) < 0) {

			xdl_free_script(xscr);
			xdl_free_env(&xe);
			return -1;
		}
		xdl_free_script(xscr);
	}
	xdl_free_env(&xe);

	return 0;
}
1063