1 /* Analyze differences between two vectors. 2 3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2010 Free Software 4 Foundation, Inc. 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 18 19 20 /* The basic idea is to consider two vectors as similar if, when 21 transforming the first vector into the second vector through a 22 sequence of edits (inserts and deletes of one element each), 23 this sequence is short - or equivalently, if the ordered list 24 of elements that are untouched by these edits is long. For a 25 good introduction to the subject, read about the "Levenshtein 26 distance" in Wikipedia. 27 28 The basic algorithm is described in: 29 "An O(ND) Difference Algorithm and its Variations", Eugene Myers, 30 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; 31 see especially section 4.2, which describes the variation used below. 32 33 The basic algorithm was independently discovered as described in: 34 "Algorithms for Approximate String Matching", E. Ukkonen, 35 Information and Control Vol. 64, 1985, pp. 100-118. 36 37 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE 38 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) 39 at the price of producing suboptimal output for large inputs with 40 many differences. */ 41 42 /* Before including this file, you need to define: 43 ELEMENT The element type of the vectors being compared. 44 EQUAL A two-argument macro that tests two elements for 45 equality. 46 OFFSET A signed integer type sufficient to hold the 47 difference between two indices. Usually 48 something like ssize_t. 49 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'. 50 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff]. 51 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff]. 52 EARLY_ABORT(ctxt) (Optional) A boolean expression that triggers an 53 early abort of the computation. 54 USE_HEURISTIC (Optional) Define if you want to support the 55 heuristic for large vectors. 56 It is also possible to use this file with abstract arrays. In this case, 57 xvec and yvec are not represented in memory. They only exist conceptually. 58 In this case, the list of defines above is amended as follows: 59 ELEMENT Undefined. 60 EQUAL Undefined. 61 XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff) 62 A three-argument macro: References xvec[xoff] and 63 yvec[yoff] and tests these elements for equality. 64 Before including this file, you also need to include: 65 #include <limits.h> 66 #include <stdbool.h> 67 #include "minmax.h" 68 */ 69 70 /* Maximum value of type OFFSET. */ 71 #define OFFSET_MAX \ 72 ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1) 73 74 /* Default to no early abort. */ 75 #ifndef EARLY_ABORT 76 # define EARLY_ABORT(ctxt) false 77 #endif 78 79 /* Use this to suppress gcc's `...may be used before initialized' warnings. 80 Beware: The Code argument must not contain commas. */ 81 #ifndef IF_LINT 82 # ifdef lint 83 # define IF_LINT(Code) Code 84 # else 85 # define IF_LINT(Code) /* empty */ 86 # endif 87 #endif 88 89 /* As above, but when Code must contain one comma. */ 90 #ifndef IF_LINT2 91 # ifdef lint 92 # define IF_LINT2(Code1, Code2) Code1, Code2 93 # else 94 # define IF_LINT2(Code1, Code2) /* empty */ 95 # endif 96 #endif 97 98 /* 99 * Context of comparison operation. 100 */ 101 struct context 102 { 103 #ifdef ELEMENT 104 /* Vectors being compared. */ 105 ELEMENT const *xvec; 106 ELEMENT const *yvec; 107 #endif 108 109 /* Extra fields. */ 110 EXTRA_CONTEXT_FIELDS 111 112 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point 113 furthest along the given diagonal in the forward search of the edit 114 matrix. */ 115 OFFSET *fdiag; 116 117 /* Vector, indexed by diagonal, containing the X coordinate of the point 118 furthest along the given diagonal in the backward search of the edit 119 matrix. */ 120 OFFSET *bdiag; 121 122 #ifdef USE_HEURISTIC 123 /* This corresponds to the diff -H flag. With this heuristic, for 124 vectors with a constant small density of changes, the algorithm is 125 linear in the vectors size. */ 126 bool heuristic; 127 #endif 128 129 /* Edit scripts longer than this are too expensive to compute. */ 130 OFFSET too_expensive; 131 132 /* Snakes bigger than this are considered `big'. */ 133 #define SNAKE_LIMIT 20 134 }; 135 136 struct partition 137 { 138 /* Midpoints of this partition. */ 139 OFFSET xmid; 140 OFFSET ymid; 141 142 /* True if low half will be analyzed minimally. */ 143 bool lo_minimal; 144 145 /* Likewise for high half. */ 146 bool hi_minimal; 147 }; 148 149 150 /* Find the midpoint of the shortest edit script for a specified portion 151 of the two vectors. 152 153 Scan from the beginnings of the vectors, and simultaneously from the ends, 154 doing a breadth-first search through the space of edit-sequence. 155 When the two searches meet, we have found the midpoint of the shortest 156 edit sequence. 157 158 If FIND_MINIMAL is true, find the minimal edit script regardless of 159 expense. Otherwise, if the search is too expensive, use heuristics to 160 stop the search and report a suboptimal answer. 161 162 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number 163 XMID - YMID equals the number of inserted elements minus the number 164 of deleted elements (counting only elements before the midpoint). 165 166 Set PART->lo_minimal to true iff the minimal edit script for the 167 left half of the partition is known; similarly for PART->hi_minimal. 168 169 This function assumes that the first elements of the specified portions 170 of the two vectors do not match, and likewise that the last elements do not 171 match. The caller must trim matching elements from the beginning and end 172 of the portions it is going to specify. 173 174 If we return the "wrong" partitions, the worst this can do is cause 175 suboptimal diff output. It cannot cause incorrect diff output. */ 176 177 static void 178 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal, 179 struct partition *part, struct context *ctxt) 180 { 181 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */ 182 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */ 183 #ifdef ELEMENT 184 ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */ 185 ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */ 186 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y]) 187 #else 188 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y) 189 #endif 190 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */ 191 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */ 192 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */ 193 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ 194 OFFSET fmin = fmid; 195 OFFSET fmax = fmid; /* Limits of top-down search. */ 196 OFFSET bmin = bmid; 197 OFFSET bmax = bmid; /* Limits of bottom-up search. */ 198 OFFSET c; /* Cost. */ 199 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd 200 diagonal with respect to the northwest. */ 201 202 fd[fmid] = xoff; 203 bd[bmid] = xlim; 204 205 for (c = 1;; ++c) 206 { 207 OFFSET d; /* Active diagonal. */ 208 bool big_snake = false; 209 210 /* Extend the top-down search by an edit step in each diagonal. */ 211 if (fmin > dmin) 212 fd[--fmin - 1] = -1; 213 else 214 ++fmin; 215 if (fmax < dmax) 216 fd[++fmax + 1] = -1; 217 else 218 --fmax; 219 for (d = fmax; d >= fmin; d -= 2) 220 { 221 OFFSET x; 222 OFFSET y; 223 OFFSET tlo = fd[d - 1]; 224 OFFSET thi = fd[d + 1]; 225 OFFSET x0 = tlo < thi ? thi : tlo + 1; 226 227 for (x = x0, y = x0 - d; 228 x < xlim && y < ylim && XREF_YREF_EQUAL (x, y); 229 x++, y++) 230 continue; 231 if (x - x0 > SNAKE_LIMIT) 232 big_snake = true; 233 fd[d] = x; 234 if (odd && bmin <= d && d <= bmax && bd[d] <= x) 235 { 236 part->xmid = x; 237 part->ymid = y; 238 part->lo_minimal = part->hi_minimal = true; 239 return; 240 } 241 } 242 243 /* Similarly extend the bottom-up search. */ 244 if (bmin > dmin) 245 bd[--bmin - 1] = OFFSET_MAX; 246 else 247 ++bmin; 248 if (bmax < dmax) 249 bd[++bmax + 1] = OFFSET_MAX; 250 else 251 --bmax; 252 for (d = bmax; d >= bmin; d -= 2) 253 { 254 OFFSET x; 255 OFFSET y; 256 OFFSET tlo = bd[d - 1]; 257 OFFSET thi = bd[d + 1]; 258 OFFSET x0 = tlo < thi ? tlo : thi - 1; 259 260 for (x = x0, y = x0 - d; 261 xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1); 262 x--, y--) 263 continue; 264 if (x0 - x > SNAKE_LIMIT) 265 big_snake = true; 266 bd[d] = x; 267 if (!odd && fmin <= d && d <= fmax && x <= fd[d]) 268 { 269 part->xmid = x; 270 part->ymid = y; 271 part->lo_minimal = part->hi_minimal = true; 272 return; 273 } 274 } 275 276 if (find_minimal) 277 continue; 278 279 #ifdef USE_HEURISTIC 280 /* Heuristic: check occasionally for a diagonal that has made lots 281 of progress compared with the edit distance. If we have any 282 such, find the one that has made the most progress and return it 283 as if it had succeeded. 284 285 With this heuristic, for vectors with a constant small density 286 of changes, the algorithm is linear in the vector size. */ 287 288 if (200 < c && big_snake && ctxt->heuristic) 289 { 290 { 291 OFFSET best = 0; 292 293 for (d = fmax; d >= fmin; d -= 2) 294 { 295 OFFSET dd = d - fmid; 296 OFFSET x = fd[d]; 297 OFFSET y = x - d; 298 OFFSET v = (x - xoff) * 2 - dd; 299 300 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 301 { 302 if (v > best 303 && xoff + SNAKE_LIMIT <= x && x < xlim 304 && yoff + SNAKE_LIMIT <= y && y < ylim) 305 { 306 /* We have a good enough best diagonal; now insist 307 that it end with a significant snake. */ 308 int k; 309 310 for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++) 311 if (k == SNAKE_LIMIT) 312 { 313 best = v; 314 part->xmid = x; 315 part->ymid = y; 316 break; 317 } 318 } 319 } 320 } 321 if (best > 0) 322 { 323 part->lo_minimal = true; 324 part->hi_minimal = false; 325 return; 326 } 327 } 328 329 { 330 OFFSET best = 0; 331 332 for (d = bmax; d >= bmin; d -= 2) 333 { 334 OFFSET dd = d - bmid; 335 OFFSET x = bd[d]; 336 OFFSET y = x - d; 337 OFFSET v = (xlim - x) * 2 + dd; 338 339 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 340 { 341 if (v > best 342 && xoff < x && x <= xlim - SNAKE_LIMIT 343 && yoff < y && y <= ylim - SNAKE_LIMIT) 344 { 345 /* We have a good enough best diagonal; now insist 346 that it end with a significant snake. */ 347 int k; 348 349 for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++) 350 if (k == SNAKE_LIMIT - 1) 351 { 352 best = v; 353 part->xmid = x; 354 part->ymid = y; 355 break; 356 } 357 } 358 } 359 } 360 if (best > 0) 361 { 362 part->lo_minimal = false; 363 part->hi_minimal = true; 364 return; 365 } 366 } 367 } 368 #endif /* USE_HEURISTIC */ 369 370 /* Heuristic: if we've gone well beyond the call of duty, give up 371 and report halfway between our best results so far. */ 372 if (c >= ctxt->too_expensive) 373 { 374 OFFSET fxybest; 375 OFFSET fxbest IF_LINT (= 0); 376 OFFSET bxybest; 377 OFFSET bxbest IF_LINT (= 0); 378 379 /* Find forward diagonal that maximizes X + Y. */ 380 fxybest = -1; 381 for (d = fmax; d >= fmin; d -= 2) 382 { 383 OFFSET x = MIN (fd[d], xlim); 384 OFFSET y = x - d; 385 if (ylim < y) 386 { 387 x = ylim + d; 388 y = ylim; 389 } 390 if (fxybest < x + y) 391 { 392 fxybest = x + y; 393 fxbest = x; 394 } 395 } 396 397 /* Find backward diagonal that minimizes X + Y. */ 398 bxybest = OFFSET_MAX; 399 for (d = bmax; d >= bmin; d -= 2) 400 { 401 OFFSET x = MAX (xoff, bd[d]); 402 OFFSET y = x - d; 403 if (y < yoff) 404 { 405 x = yoff + d; 406 y = yoff; 407 } 408 if (x + y < bxybest) 409 { 410 bxybest = x + y; 411 bxbest = x; 412 } 413 } 414 415 /* Use the better of the two diagonals. */ 416 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) 417 { 418 part->xmid = fxbest; 419 part->ymid = fxybest - fxbest; 420 part->lo_minimal = true; 421 part->hi_minimal = false; 422 } 423 else 424 { 425 part->xmid = bxbest; 426 part->ymid = bxybest - bxbest; 427 part->lo_minimal = false; 428 part->hi_minimal = true; 429 } 430 return; 431 } 432 } 433 #undef XREF_YREF_EQUAL 434 } 435 436 437 /* Compare in detail contiguous subsequences of the two vectors 438 which are known, as a whole, to match each other. 439 440 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1. 441 442 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors 443 are origin-0. 444 445 If FIND_MINIMAL, find a minimal difference no matter how 446 expensive it is. 447 448 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT. 449 450 Return false if terminated normally, or true if terminated through early 451 abort. */ 452 453 static bool 454 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, 455 bool find_minimal, struct context *ctxt) 456 { 457 #ifdef ELEMENT 458 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */ 459 ELEMENT const *yv = ctxt->yvec; 460 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y]) 461 #else 462 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y) 463 #endif 464 465 /* Slide down the bottom initial diagonal. */ 466 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff)) 467 { 468 xoff++; 469 yoff++; 470 } 471 472 /* Slide up the top initial diagonal. */ 473 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1)) 474 { 475 xlim--; 476 ylim--; 477 } 478 479 /* Handle simple cases. */ 480 if (xoff == xlim) 481 while (yoff < ylim) 482 { 483 NOTE_INSERT (ctxt, yoff); 484 if (EARLY_ABORT (ctxt)) 485 return true; 486 yoff++; 487 } 488 else if (yoff == ylim) 489 while (xoff < xlim) 490 { 491 NOTE_DELETE (ctxt, xoff); 492 if (EARLY_ABORT (ctxt)) 493 return true; 494 xoff++; 495 } 496 else 497 { 498 struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 }); 499 500 /* Find a point of correspondence in the middle of the vectors. */ 501 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt); 502 503 /* Use the partitions to split this problem into subproblems. */ 504 if (compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt)) 505 return true; 506 if (compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt)) 507 return true; 508 } 509 510 return false; 511 #undef XREF_YREF_EQUAL 512 } 513 514 #undef ELEMENT 515 #undef EQUAL 516 #undef OFFSET 517 #undef EXTRA_CONTEXT_FIELDS 518 #undef NOTE_DELETE 519 #undef NOTE_INSERT 520 #undef EARLY_ABORT 521 #undef USE_HEURISTIC 522 #undef XVECREF_YVECREF_EQUAL 523 #undef OFFSET_MAX 524