1 /* Analyze differences between two vectors. 2 3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2018 Free Software 4 Foundation, Inc. 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program. If not, see <https://www.gnu.org/licenses/>. */ 18 19 20 /* The basic idea is to consider two vectors as similar if, when 21 transforming the first vector into the second vector through a 22 sequence of edits (inserts and deletes of one element each), 23 this sequence is short - or equivalently, if the ordered list 24 of elements that are untouched by these edits is long. For a 25 good introduction to the subject, read about the "Levenshtein 26 distance" in Wikipedia. 27 28 The basic algorithm is described in: 29 "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers, 30 Algorithmica Vol. 1, 1986, pp. 251-266, 31 <http://dx.doi.org/10.1007/BF01840446>. 32 See especially section 4.2, which describes the variation used below. 33 34 The basic algorithm was independently discovered as described in: 35 "Algorithms for Approximate String Matching", Esko Ukkonen, 36 Information and Control Vol. 64, 1985, pp. 100-118, 37 <http://dx.doi.org/10.1016/S0019-9958(85)80046-2>. 38 39 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE 40 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) 41 at the price of producing suboptimal output for large inputs with 42 many differences. */ 43 44 /* Before including this file, you need to define: 45 ELEMENT The element type of the vectors being compared. 46 EQUAL A two-argument macro that tests two elements for 47 equality. 48 OFFSET A signed integer type sufficient to hold the 49 difference between two indices. Usually 50 something like ptrdiff_t. 51 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'. 52 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff]. 53 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff]. 54 EARLY_ABORT(ctxt) (Optional) A boolean expression that triggers an 55 early abort of the computation. 56 USE_HEURISTIC (Optional) Define if you want to support the 57 heuristic for large vectors. 58 It is also possible to use this file with abstract arrays. In this case, 59 xvec and yvec are not represented in memory. They only exist conceptually. 60 In this case, the list of defines above is amended as follows: 61 ELEMENT Undefined. 62 EQUAL Undefined. 63 XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff) 64 A three-argument macro: References xvec[xoff] and 65 yvec[yoff] and tests these elements for equality. 66 Before including this file, you also need to include: 67 #include <limits.h> 68 #include <stdbool.h> 69 #include "minmax.h" 70 */ 71 72 /* Maximum value of type OFFSET. */ 73 #define OFFSET_MAX \ 74 ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1) 75 76 /* Default to no early abort. */ 77 #ifndef EARLY_ABORT 78 # define EARLY_ABORT(ctxt) false 79 #endif 80 81 /* Use this to suppress gcc's "...may be used before initialized" warnings. 82 Beware: The Code argument must not contain commas. */ 83 #ifndef IF_LINT 84 # if defined GCC_LINT || defined lint 85 # define IF_LINT(Code) Code 86 # else 87 # define IF_LINT(Code) /* empty */ 88 # endif 89 #endif 90 91 /* As above, but when Code must contain one comma. */ 92 #ifndef IF_LINT2 93 # if defined GCC_LINT || defined lint 94 # define IF_LINT2(Code1, Code2) Code1, Code2 95 # else 96 # define IF_LINT2(Code1, Code2) /* empty */ 97 # endif 98 #endif 99 100 /* 101 * Context of comparison operation. 102 */ 103 struct context 104 { 105 #ifdef ELEMENT 106 /* Vectors being compared. */ 107 ELEMENT const *xvec; 108 ELEMENT const *yvec; 109 #endif 110 111 /* Extra fields. */ 112 EXTRA_CONTEXT_FIELDS 113 114 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point 115 furthest along the given diagonal in the forward search of the edit 116 matrix. */ 117 OFFSET *fdiag; 118 119 /* Vector, indexed by diagonal, containing the X coordinate of the point 120 furthest along the given diagonal in the backward search of the edit 121 matrix. */ 122 OFFSET *bdiag; 123 124 #ifdef USE_HEURISTIC 125 /* This corresponds to the diff --speed-large-files flag. With this 126 heuristic, for vectors with a constant small density of changes, 127 the algorithm is linear in the vector size. */ 128 bool heuristic; 129 #endif 130 131 /* Edit scripts longer than this are too expensive to compute. */ 132 OFFSET too_expensive; 133 134 /* Snakes bigger than this are considered "big". */ 135 #define SNAKE_LIMIT 20 136 }; 137 138 struct partition 139 { 140 /* Midpoints of this partition. */ 141 OFFSET xmid; 142 OFFSET ymid; 143 144 /* True if low half will be analyzed minimally. */ 145 bool lo_minimal; 146 147 /* Likewise for high half. */ 148 bool hi_minimal; 149 }; 150 151 152 /* Find the midpoint of the shortest edit script for a specified portion 153 of the two vectors. 154 155 Scan from the beginnings of the vectors, and simultaneously from the ends, 156 doing a breadth-first search through the space of edit-sequence. 157 When the two searches meet, we have found the midpoint of the shortest 158 edit sequence. 159 160 If FIND_MINIMAL is true, find the minimal edit script regardless of 161 expense. Otherwise, if the search is too expensive, use heuristics to 162 stop the search and report a suboptimal answer. 163 164 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number 165 XMID - YMID equals the number of inserted elements minus the number 166 of deleted elements (counting only elements before the midpoint). 167 168 Set PART->lo_minimal to true iff the minimal edit script for the 169 left half of the partition is known; similarly for PART->hi_minimal. 170 171 This function assumes that the first elements of the specified portions 172 of the two vectors do not match, and likewise that the last elements do not 173 match. The caller must trim matching elements from the beginning and end 174 of the portions it is going to specify. 175 176 If we return the "wrong" partitions, the worst this can do is cause 177 suboptimal diff output. It cannot cause incorrect diff output. */ 178 179 static void 180 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal, 181 struct partition *part, struct context *ctxt) 182 { 183 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */ 184 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */ 185 #ifdef ELEMENT 186 ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */ 187 ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */ 188 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y]) 189 #else 190 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y) 191 #endif 192 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */ 193 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */ 194 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */ 195 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ 196 OFFSET fmin = fmid; 197 OFFSET fmax = fmid; /* Limits of top-down search. */ 198 OFFSET bmin = bmid; 199 OFFSET bmax = bmid; /* Limits of bottom-up search. */ 200 OFFSET c; /* Cost. */ 201 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd 202 diagonal with respect to the northwest. */ 203 204 fd[fmid] = xoff; 205 bd[bmid] = xlim; 206 207 for (c = 1;; ++c) 208 { 209 OFFSET d; /* Active diagonal. */ 210 bool big_snake = false; 211 212 /* Extend the top-down search by an edit step in each diagonal. */ 213 if (fmin > dmin) 214 fd[--fmin - 1] = -1; 215 else 216 ++fmin; 217 if (fmax < dmax) 218 fd[++fmax + 1] = -1; 219 else 220 --fmax; 221 for (d = fmax; d >= fmin; d -= 2) 222 { 223 OFFSET x; 224 OFFSET y; 225 OFFSET tlo = fd[d - 1]; 226 OFFSET thi = fd[d + 1]; 227 OFFSET x0 = tlo < thi ? thi : tlo + 1; 228 229 for (x = x0, y = x0 - d; 230 x < xlim && y < ylim && XREF_YREF_EQUAL (x, y); 231 x++, y++) 232 continue; 233 if (x - x0 > SNAKE_LIMIT) 234 big_snake = true; 235 fd[d] = x; 236 if (odd && bmin <= d && d <= bmax && bd[d] <= x) 237 { 238 part->xmid = x; 239 part->ymid = y; 240 part->lo_minimal = part->hi_minimal = true; 241 return; 242 } 243 } 244 245 /* Similarly extend the bottom-up search. */ 246 if (bmin > dmin) 247 bd[--bmin - 1] = OFFSET_MAX; 248 else 249 ++bmin; 250 if (bmax < dmax) 251 bd[++bmax + 1] = OFFSET_MAX; 252 else 253 --bmax; 254 for (d = bmax; d >= bmin; d -= 2) 255 { 256 OFFSET x; 257 OFFSET y; 258 OFFSET tlo = bd[d - 1]; 259 OFFSET thi = bd[d + 1]; 260 OFFSET x0 = tlo < thi ? tlo : thi - 1; 261 262 for (x = x0, y = x0 - d; 263 xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1); 264 x--, y--) 265 continue; 266 if (x0 - x > SNAKE_LIMIT) 267 big_snake = true; 268 bd[d] = x; 269 if (!odd && fmin <= d && d <= fmax && x <= fd[d]) 270 { 271 part->xmid = x; 272 part->ymid = y; 273 part->lo_minimal = part->hi_minimal = true; 274 return; 275 } 276 } 277 278 if (find_minimal) 279 continue; 280 281 #ifdef USE_HEURISTIC 282 bool heuristic = ctxt->heuristic; 283 #else 284 bool heuristic = false; 285 #endif 286 287 /* Heuristic: check occasionally for a diagonal that has made lots 288 of progress compared with the edit distance. If we have any 289 such, find the one that has made the most progress and return it 290 as if it had succeeded. 291 292 With this heuristic, for vectors with a constant small density 293 of changes, the algorithm is linear in the vector size. */ 294 295 if (200 < c && big_snake && heuristic) 296 { 297 { 298 OFFSET best = 0; 299 300 for (d = fmax; d >= fmin; d -= 2) 301 { 302 OFFSET dd = d - fmid; 303 OFFSET x = fd[d]; 304 OFFSET y = x - d; 305 OFFSET v = (x - xoff) * 2 - dd; 306 307 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 308 { 309 if (v > best 310 && xoff + SNAKE_LIMIT <= x && x < xlim 311 && yoff + SNAKE_LIMIT <= y && y < ylim) 312 { 313 /* We have a good enough best diagonal; now insist 314 that it end with a significant snake. */ 315 int k; 316 317 for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++) 318 if (k == SNAKE_LIMIT) 319 { 320 best = v; 321 part->xmid = x; 322 part->ymid = y; 323 break; 324 } 325 } 326 } 327 } 328 if (best > 0) 329 { 330 part->lo_minimal = true; 331 part->hi_minimal = false; 332 return; 333 } 334 } 335 336 { 337 OFFSET best = 0; 338 339 for (d = bmax; d >= bmin; d -= 2) 340 { 341 OFFSET dd = d - bmid; 342 OFFSET x = bd[d]; 343 OFFSET y = x - d; 344 OFFSET v = (xlim - x) * 2 + dd; 345 346 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 347 { 348 if (v > best 349 && xoff < x && x <= xlim - SNAKE_LIMIT 350 && yoff < y && y <= ylim - SNAKE_LIMIT) 351 { 352 /* We have a good enough best diagonal; now insist 353 that it end with a significant snake. */ 354 int k; 355 356 for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++) 357 if (k == SNAKE_LIMIT - 1) 358 { 359 best = v; 360 part->xmid = x; 361 part->ymid = y; 362 break; 363 } 364 } 365 } 366 } 367 if (best > 0) 368 { 369 part->lo_minimal = false; 370 part->hi_minimal = true; 371 return; 372 } 373 } 374 } 375 376 /* Heuristic: if we've gone well beyond the call of duty, give up 377 and report halfway between our best results so far. */ 378 if (c >= ctxt->too_expensive) 379 { 380 OFFSET fxybest; 381 OFFSET fxbest IF_LINT (= 0); 382 OFFSET bxybest; 383 OFFSET bxbest IF_LINT (= 0); 384 385 /* Find forward diagonal that maximizes X + Y. */ 386 fxybest = -1; 387 for (d = fmax; d >= fmin; d -= 2) 388 { 389 OFFSET x = MIN (fd[d], xlim); 390 OFFSET y = x - d; 391 if (ylim < y) 392 { 393 x = ylim + d; 394 y = ylim; 395 } 396 if (fxybest < x + y) 397 { 398 fxybest = x + y; 399 fxbest = x; 400 } 401 } 402 403 /* Find backward diagonal that minimizes X + Y. */ 404 bxybest = OFFSET_MAX; 405 for (d = bmax; d >= bmin; d -= 2) 406 { 407 OFFSET x = MAX (xoff, bd[d]); 408 OFFSET y = x - d; 409 if (y < yoff) 410 { 411 x = yoff + d; 412 y = yoff; 413 } 414 if (x + y < bxybest) 415 { 416 bxybest = x + y; 417 bxbest = x; 418 } 419 } 420 421 /* Use the better of the two diagonals. */ 422 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) 423 { 424 part->xmid = fxbest; 425 part->ymid = fxybest - fxbest; 426 part->lo_minimal = true; 427 part->hi_minimal = false; 428 } 429 else 430 { 431 part->xmid = bxbest; 432 part->ymid = bxybest - bxbest; 433 part->lo_minimal = false; 434 part->hi_minimal = true; 435 } 436 return; 437 } 438 } 439 #undef XREF_YREF_EQUAL 440 } 441 442 443 /* Compare in detail contiguous subsequences of the two vectors 444 which are known, as a whole, to match each other. 445 446 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1. 447 448 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors 449 are origin-0. 450 451 If FIND_MINIMAL, find a minimal difference no matter how 452 expensive it is. 453 454 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT. 455 456 Return false if terminated normally, or true if terminated through early 457 abort. */ 458 459 static bool 460 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, 461 bool find_minimal, struct context *ctxt) 462 { 463 #ifdef ELEMENT 464 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */ 465 ELEMENT const *yv = ctxt->yvec; 466 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y]) 467 #else 468 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y) 469 #endif 470 471 /* Slide down the bottom initial diagonal. */ 472 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff)) 473 { 474 xoff++; 475 yoff++; 476 } 477 478 /* Slide up the top initial diagonal. */ 479 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1)) 480 { 481 xlim--; 482 ylim--; 483 } 484 485 /* Handle simple cases. */ 486 if (xoff == xlim) 487 while (yoff < ylim) 488 { 489 NOTE_INSERT (ctxt, yoff); 490 if (EARLY_ABORT (ctxt)) 491 return true; 492 yoff++; 493 } 494 else if (yoff == ylim) 495 while (xoff < xlim) 496 { 497 NOTE_DELETE (ctxt, xoff); 498 if (EARLY_ABORT (ctxt)) 499 return true; 500 xoff++; 501 } 502 else 503 { 504 struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 }); 505 506 /* Find a point of correspondence in the middle of the vectors. */ 507 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt); 508 509 /* Use the partitions to split this problem into subproblems. */ 510 if (compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt)) 511 return true; 512 if (compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt)) 513 return true; 514 } 515 516 return false; 517 #undef XREF_YREF_EQUAL 518 } 519 520 #undef ELEMENT 521 #undef EQUAL 522 #undef OFFSET 523 #undef EXTRA_CONTEXT_FIELDS 524 #undef NOTE_DELETE 525 #undef NOTE_INSERT 526 #undef EARLY_ABORT 527 #undef USE_HEURISTIC 528 #undef XVECREF_YVECREF_EQUAL 529 #undef OFFSET_MAX 530