1#!/usr/local/bin/python3.8 2 3from __future__ import division 4 5"""Diff Match and Patch 6Copyright 2018 The diff-match-patch Authors. 7https://github.com/google/diff-match-patch 8 9Licensed under the Apache License, Version 2.0 (the "License"); 10you may not use this file except in compliance with the License. 11You may obtain a copy of the License at 12 13 http://www.apache.org/licenses/LICENSE-2.0 14 15Unless required by applicable law or agreed to in writing, software 16distributed under the License is distributed on an "AS IS" BASIS, 17WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18See the License for the specific language governing permissions and 19limitations under the License. 20""" 21 22"""Functions for diff, match and patch. 23 24Computes the difference between two texts to create a patch. 25Applies the patch onto another text, allowing for errors. 26""" 27 28__author__ = "fraser@google.com (Neil Fraser)" 29 30import re 31import sys 32import time 33import urllib 34 35 36class diff_match_patch: 37 """Class containing the diff, match and patch methods. 38 39 Also contains the behaviour settings. 40 """ 41 42 def __init__(self): 43 """Inits a diff_match_patch object with default settings. 44 Redefine these in your program to override the defaults. 45 """ 46 47 # Number of seconds to map a diff before giving up (0 for infinity). 48 self.Diff_Timeout = 1.0 49 # Cost of an empty edit operation in terms of edit characters. 50 self.Diff_EditCost = 4 51 # At what point is no match declared (0.0 = perfection, 1.0 = very loose). 52 self.Match_Threshold = 0.5 53 # How far to search for a match (0 = exact location, 1000+ = broad match). 54 # A match this many characters away from the expected location will add 55 # 1.0 to the score (0.0 is a perfect match). 56 self.Match_Distance = 1000 57 # When deleting a large block of text (over ~64 characters), how close do 58 # the contents have to be to match the expected contents. (0.0 = perfection, 59 # 1.0 = very loose). Note that Match_Threshold controls how closely the 60 # end points of a delete need to match. 61 self.Patch_DeleteThreshold = 0.5 62 # Chunk size for context length. 63 self.Patch_Margin = 4 64 65 # The number of bits in an int. 66 # Python has no maximum, thus to disable patch splitting set to 0. 67 # However to avoid long patches in certain pathological cases, use 32. 68 # Multiple short patches (using native ints) are much faster than long ones. 69 self.Match_MaxBits = 32 70 71 # DIFF FUNCTIONS 72 73 # The data structure representing a diff is an array of tuples: 74 # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")] 75 # which means: delete "Hello", add "Goodbye" and keep " world." 76 DIFF_DELETE = -1 77 DIFF_INSERT = 1 78 DIFF_EQUAL = 0 79 80 def diff_main(self, text1, text2, checklines=True, deadline=None): 81 """Find the differences between two texts. Simplifies the problem by 82 stripping any common prefix or suffix off the texts before diffing. 83 84 Args: 85 text1: Old string to be diffed. 86 text2: New string to be diffed. 87 checklines: Optional speedup flag. If present and false, then don't run 88 a line-level diff first to identify the changed areas. 89 Defaults to true, which does a faster, slightly less optimal diff. 90 deadline: Optional time when the diff should be complete by. Used 91 internally for recursive calls. Users should set DiffTimeout instead. 92 93 Returns: 94 Array of changes. 95 """ 96 # Set a deadline by which time the diff must be complete. 97 if deadline == None: 98 # Unlike in most languages, Python counts time in seconds. 99 if self.Diff_Timeout <= 0: 100 deadline = sys.maxint 101 else: 102 deadline = time.time() + self.Diff_Timeout 103 104 # Check for null inputs. 105 if text1 == None or text2 == None: 106 raise ValueError("Null inputs. (diff_main)") 107 108 # Check for equality (speedup). 109 if text1 == text2: 110 if text1: 111 return [(self.DIFF_EQUAL, text1)] 112 return [] 113 114 # Trim off common prefix (speedup). 115 commonlength = self.diff_commonPrefix(text1, text2) 116 commonprefix = text1[:commonlength] 117 text1 = text1[commonlength:] 118 text2 = text2[commonlength:] 119 120 # Trim off common suffix (speedup). 121 commonlength = self.diff_commonSuffix(text1, text2) 122 if commonlength == 0: 123 commonsuffix = "" 124 else: 125 commonsuffix = text1[-commonlength:] 126 text1 = text1[:-commonlength] 127 text2 = text2[:-commonlength] 128 129 # Compute the diff on the middle block. 130 diffs = self.diff_compute(text1, text2, checklines, deadline) 131 132 # Restore the prefix and suffix. 133 if commonprefix: 134 diffs[:0] = [(self.DIFF_EQUAL, commonprefix)] 135 if commonsuffix: 136 diffs.append((self.DIFF_EQUAL, commonsuffix)) 137 self.diff_cleanupMerge(diffs) 138 return diffs 139 140 def diff_compute(self, text1, text2, checklines, deadline): 141 """Find the differences between two texts. Assumes that the texts do not 142 have any common prefix or suffix. 143 144 Args: 145 text1: Old string to be diffed. 146 text2: New string to be diffed. 147 checklines: Speedup flag. If false, then don't run a line-level diff 148 first to identify the changed areas. 149 If true, then run a faster, slightly less optimal diff. 150 deadline: Time when the diff should be complete by. 151 152 Returns: 153 Array of changes. 154 """ 155 if not text1: 156 # Just add some text (speedup). 157 return [(self.DIFF_INSERT, text2)] 158 159 if not text2: 160 # Just delete some text (speedup). 161 return [(self.DIFF_DELETE, text1)] 162 163 if len(text1) > len(text2): 164 (longtext, shorttext) = (text1, text2) 165 else: 166 (shorttext, longtext) = (text1, text2) 167 i = longtext.find(shorttext) 168 if i != -1: 169 # Shorter text is inside the longer text (speedup). 170 diffs = [ 171 (self.DIFF_INSERT, longtext[:i]), 172 (self.DIFF_EQUAL, shorttext), 173 (self.DIFF_INSERT, longtext[i + len(shorttext) :]), 174 ] 175 # Swap insertions for deletions if diff is reversed. 176 if len(text1) > len(text2): 177 diffs[0] = (self.DIFF_DELETE, diffs[0][1]) 178 diffs[2] = (self.DIFF_DELETE, diffs[2][1]) 179 return diffs 180 181 if len(shorttext) == 1: 182 # Single character string. 183 # After the previous speedup, the character can't be an equality. 184 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] 185 186 # Check to see if the problem can be split in two. 187 hm = self.diff_halfMatch(text1, text2) 188 if hm: 189 # A half-match was found, sort out the return data. 190 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm 191 # Send both pairs off for separate processing. 192 diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline) 193 diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline) 194 # Merge the results. 195 return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b 196 197 if checklines and len(text1) > 100 and len(text2) > 100: 198 return self.diff_lineMode(text1, text2, deadline) 199 200 return self.diff_bisect(text1, text2, deadline) 201 202 def diff_lineMode(self, text1, text2, deadline): 203 """Do a quick line-level diff on both strings, then rediff the parts for 204 greater accuracy. 205 This speedup can produce non-minimal diffs. 206 207 Args: 208 text1: Old string to be diffed. 209 text2: New string to be diffed. 210 deadline: Time when the diff should be complete by. 211 212 Returns: 213 Array of changes. 214 """ 215 216 # Scan the text on a line-by-line basis first. 217 (text1, text2, linearray) = self.diff_linesToChars(text1, text2) 218 219 diffs = self.diff_main(text1, text2, False, deadline) 220 221 # Convert the diff back to original text. 222 self.diff_charsToLines(diffs, linearray) 223 # Eliminate freak matches (e.g. blank lines) 224 self.diff_cleanupSemantic(diffs) 225 226 # Rediff any replacement blocks, this time character-by-character. 227 # Add a dummy entry at the end. 228 diffs.append((self.DIFF_EQUAL, "")) 229 pointer = 0 230 count_delete = 0 231 count_insert = 0 232 text_delete = "" 233 text_insert = "" 234 while pointer < len(diffs): 235 if diffs[pointer][0] == self.DIFF_INSERT: 236 count_insert += 1 237 text_insert += diffs[pointer][1] 238 elif diffs[pointer][0] == self.DIFF_DELETE: 239 count_delete += 1 240 text_delete += diffs[pointer][1] 241 elif diffs[pointer][0] == self.DIFF_EQUAL: 242 # Upon reaching an equality, check for prior redundancies. 243 if count_delete >= 1 and count_insert >= 1: 244 # Delete the offending records and add the merged ones. 245 subDiff = self.diff_main(text_delete, text_insert, False, deadline) 246 diffs[pointer - count_delete - count_insert : pointer] = subDiff 247 pointer = pointer - count_delete - count_insert + len(subDiff) 248 count_insert = 0 249 count_delete = 0 250 text_delete = "" 251 text_insert = "" 252 253 pointer += 1 254 255 diffs.pop() # Remove the dummy entry at the end. 256 257 return diffs 258 259 def diff_bisect(self, text1, text2, deadline): 260 """Find the 'middle snake' of a diff, split the problem in two 261 and return the recursively constructed diff. 262 See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. 263 264 Args: 265 text1: Old string to be diffed. 266 text2: New string to be diffed. 267 deadline: Time at which to bail if not yet complete. 268 269 Returns: 270 Array of diff tuples. 271 """ 272 273 # Cache the text lengths to prevent multiple calls. 274 text1_length = len(text1) 275 text2_length = len(text2) 276 max_d = (text1_length + text2_length + 1) // 2 277 v_offset = max_d 278 v_length = 2 * max_d 279 v1 = [-1] * v_length 280 v1[v_offset + 1] = 0 281 v2 = v1[:] 282 delta = text1_length - text2_length 283 # If the total number of characters is odd, then the front path will 284 # collide with the reverse path. 285 front = delta % 2 != 0 286 # Offsets for start and end of k loop. 287 # Prevents mapping of space beyond the grid. 288 k1start = 0 289 k1end = 0 290 k2start = 0 291 k2end = 0 292 for d in xrange(max_d): 293 # Bail out if deadline is reached. 294 if time.time() > deadline: 295 break 296 297 # Walk the front path one step. 298 for k1 in xrange(-d + k1start, d + 1 - k1end, 2): 299 k1_offset = v_offset + k1 300 if k1 == -d or (k1 != d and v1[k1_offset - 1] < v1[k1_offset + 1]): 301 x1 = v1[k1_offset + 1] 302 else: 303 x1 = v1[k1_offset - 1] + 1 304 y1 = x1 - k1 305 while ( 306 x1 < text1_length and y1 < text2_length and text1[x1] == text2[y1] 307 ): 308 x1 += 1 309 y1 += 1 310 v1[k1_offset] = x1 311 if x1 > text1_length: 312 # Ran off the right of the graph. 313 k1end += 2 314 elif y1 > text2_length: 315 # Ran off the bottom of the graph. 316 k1start += 2 317 elif front: 318 k2_offset = v_offset + delta - k1 319 if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1: 320 # Mirror x2 onto top-left coordinate system. 321 x2 = text1_length - v2[k2_offset] 322 if x1 >= x2: 323 # Overlap detected. 324 return self.diff_bisectSplit(text1, text2, x1, y1, deadline) 325 326 # Walk the reverse path one step. 327 for k2 in xrange(-d + k2start, d + 1 - k2end, 2): 328 k2_offset = v_offset + k2 329 if k2 == -d or (k2 != d and v2[k2_offset - 1] < v2[k2_offset + 1]): 330 x2 = v2[k2_offset + 1] 331 else: 332 x2 = v2[k2_offset - 1] + 1 333 y2 = x2 - k2 334 while ( 335 x2 < text1_length 336 and y2 < text2_length 337 and text1[-x2 - 1] == text2[-y2 - 1] 338 ): 339 x2 += 1 340 y2 += 1 341 v2[k2_offset] = x2 342 if x2 > text1_length: 343 # Ran off the left of the graph. 344 k2end += 2 345 elif y2 > text2_length: 346 # Ran off the top of the graph. 347 k2start += 2 348 elif not front: 349 k1_offset = v_offset + delta - k2 350 if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1: 351 x1 = v1[k1_offset] 352 y1 = v_offset + x1 - k1_offset 353 # Mirror x2 onto top-left coordinate system. 354 x2 = text1_length - x2 355 if x1 >= x2: 356 # Overlap detected. 357 return self.diff_bisectSplit(text1, text2, x1, y1, deadline) 358 359 # Diff took too long and hit the deadline or 360 # number of diffs equals number of characters, no commonality at all. 361 return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] 362 363 def diff_bisectSplit(self, text1, text2, x, y, deadline): 364 """Given the location of the 'middle snake', split the diff in two parts 365 and recurse. 366 367 Args: 368 text1: Old string to be diffed. 369 text2: New string to be diffed. 370 x: Index of split point in text1. 371 y: Index of split point in text2. 372 deadline: Time at which to bail if not yet complete. 373 374 Returns: 375 Array of diff tuples. 376 """ 377 text1a = text1[:x] 378 text2a = text2[:y] 379 text1b = text1[x:] 380 text2b = text2[y:] 381 382 # Compute both diffs serially. 383 diffs = self.diff_main(text1a, text2a, False, deadline) 384 diffsb = self.diff_main(text1b, text2b, False, deadline) 385 386 return diffs + diffsb 387 388 def diff_linesToChars(self, text1, text2): 389 """Split two texts into an array of strings. Reduce the texts to a string 390 of hashes where each Unicode character represents one line. 391 392 Args: 393 text1: First string. 394 text2: Second string. 395 396 Returns: 397 Three element tuple, containing the encoded text1, the encoded text2 and 398 the array of unique strings. The zeroth element of the array of unique 399 strings is intentionally blank. 400 """ 401 lineArray = [] # e.g. lineArray[4] == "Hello\n" 402 lineHash = {} # e.g. lineHash["Hello\n"] == 4 403 404 # "\x00" is a valid character, but various debuggers don't like it. 405 # So we'll insert a junk entry to avoid generating a null character. 406 lineArray.append("") 407 408 def diff_linesToCharsMunge(text): 409 """Split a text into an array of strings. Reduce the texts to a string 410 of hashes where each Unicode character represents one line. 411 Modifies linearray and linehash through being a closure. 412 413 Args: 414 text: String to encode. 415 416 Returns: 417 Encoded string. 418 """ 419 chars = [] 420 # Walk the text, pulling out a substring for each line. 421 # text.split('\n') would would temporarily double our memory footprint. 422 # Modifying text would create many large strings to garbage collect. 423 lineStart = 0 424 lineEnd = -1 425 while lineEnd < len(text) - 1: 426 lineEnd = text.find("\n", lineStart) 427 if lineEnd == -1: 428 lineEnd = len(text) - 1 429 line = text[lineStart : lineEnd + 1] 430 431 if line in lineHash: 432 chars.append(unichr(lineHash[line])) 433 else: 434 if len(lineArray) == maxLines: 435 # Bail out at 65535 because unichr(65536) throws. 436 line = text[lineStart:] 437 lineEnd = len(text) 438 lineArray.append(line) 439 lineHash[line] = len(lineArray) - 1 440 chars.append(unichr(len(lineArray) - 1)) 441 lineStart = lineEnd + 1 442 return "".join(chars) 443 444 # Allocate 2/3rds of the space for text1, the rest for text2. 445 maxLines = 40000 446 chars1 = diff_linesToCharsMunge(text1) 447 maxLines = 65535 448 chars2 = diff_linesToCharsMunge(text2) 449 return (chars1, chars2, lineArray) 450 451 def diff_charsToLines(self, diffs, lineArray): 452 """Rehydrate the text in a diff from a string of line hashes to real lines 453 of text. 454 455 Args: 456 diffs: Array of diff tuples. 457 lineArray: Array of unique strings. 458 """ 459 for i in xrange(len(diffs)): 460 text = [] 461 for char in diffs[i][1]: 462 text.append(lineArray[ord(char)]) 463 diffs[i] = (diffs[i][0], "".join(text)) 464 465 def diff_commonPrefix(self, text1, text2): 466 """Determine the common prefix of two strings. 467 468 Args: 469 text1: First string. 470 text2: Second string. 471 472 Returns: 473 The number of characters common to the start of each string. 474 """ 475 # Quick check for common null cases. 476 if not text1 or not text2 or text1[0] != text2[0]: 477 return 0 478 # Binary search. 479 # Performance analysis: https://neil.fraser.name/news/2007/10/09/ 480 pointermin = 0 481 pointermax = min(len(text1), len(text2)) 482 pointermid = pointermax 483 pointerstart = 0 484 while pointermin < pointermid: 485 if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]: 486 pointermin = pointermid 487 pointerstart = pointermin 488 else: 489 pointermax = pointermid 490 pointermid = (pointermax - pointermin) // 2 + pointermin 491 return pointermid 492 493 def diff_commonSuffix(self, text1, text2): 494 """Determine the common suffix of two strings. 495 496 Args: 497 text1: First string. 498 text2: Second string. 499 500 Returns: 501 The number of characters common to the end of each string. 502 """ 503 # Quick check for common null cases. 504 if not text1 or not text2 or text1[-1] != text2[-1]: 505 return 0 506 # Binary search. 507 # Performance analysis: https://neil.fraser.name/news/2007/10/09/ 508 pointermin = 0 509 pointermax = min(len(text1), len(text2)) 510 pointermid = pointermax 511 pointerend = 0 512 while pointermin < pointermid: 513 if ( 514 text1[-pointermid : len(text1) - pointerend] 515 == text2[-pointermid : len(text2) - pointerend] 516 ): 517 pointermin = pointermid 518 pointerend = pointermin 519 else: 520 pointermax = pointermid 521 pointermid = (pointermax - pointermin) // 2 + pointermin 522 return pointermid 523 524 def diff_commonOverlap(self, text1, text2): 525 """Determine if the suffix of one string is the prefix of another. 526 527 Args: 528 text1 First string. 529 text2 Second string. 530 531 Returns: 532 The number of characters common to the end of the first 533 string and the start of the second string. 534 """ 535 # Cache the text lengths to prevent multiple calls. 536 text1_length = len(text1) 537 text2_length = len(text2) 538 # Eliminate the null case. 539 if text1_length == 0 or text2_length == 0: 540 return 0 541 # Truncate the longer string. 542 if text1_length > text2_length: 543 text1 = text1[-text2_length:] 544 elif text1_length < text2_length: 545 text2 = text2[:text1_length] 546 text_length = min(text1_length, text2_length) 547 # Quick check for the worst case. 548 if text1 == text2: 549 return text_length 550 551 # Start by looking for a single character match 552 # and increase length until no match is found. 553 # Performance analysis: https://neil.fraser.name/news/2010/11/04/ 554 best = 0 555 length = 1 556 while True: 557 pattern = text1[-length:] 558 found = text2.find(pattern) 559 if found == -1: 560 return best 561 length += found 562 if found == 0 or text1[-length:] == text2[:length]: 563 best = length 564 length += 1 565 566 def diff_halfMatch(self, text1, text2): 567 """Do the two texts share a substring which is at least half the length of 568 the longer text? 569 This speedup can produce non-minimal diffs. 570 571 Args: 572 text1: First string. 573 text2: Second string. 574 575 Returns: 576 Five element Array, containing the prefix of text1, the suffix of text1, 577 the prefix of text2, the suffix of text2 and the common middle. Or None 578 if there was no match. 579 """ 580 if self.Diff_Timeout <= 0: 581 # Don't risk returning a non-optimal diff if we have unlimited time. 582 return None 583 if len(text1) > len(text2): 584 (longtext, shorttext) = (text1, text2) 585 else: 586 (shorttext, longtext) = (text1, text2) 587 if len(longtext) < 4 or len(shorttext) * 2 < len(longtext): 588 return None # Pointless. 589 590 def diff_halfMatchI(longtext, shorttext, i): 591 """Does a substring of shorttext exist within longtext such that the 592 substring is at least half the length of longtext? 593 Closure, but does not reference any external variables. 594 595 Args: 596 longtext: Longer string. 597 shorttext: Shorter string. 598 i: Start index of quarter length substring within longtext. 599 600 Returns: 601 Five element Array, containing the prefix of longtext, the suffix of 602 longtext, the prefix of shorttext, the suffix of shorttext and the 603 common middle. Or None if there was no match. 604 """ 605 seed = longtext[i : i + len(longtext) // 4] 606 best_common = "" 607 j = shorttext.find(seed) 608 while j != -1: 609 prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:]) 610 suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j]) 611 if len(best_common) < suffixLength + prefixLength: 612 best_common = ( 613 shorttext[j - suffixLength : j] 614 + shorttext[j : j + prefixLength] 615 ) 616 best_longtext_a = longtext[: i - suffixLength] 617 best_longtext_b = longtext[i + prefixLength :] 618 best_shorttext_a = shorttext[: j - suffixLength] 619 best_shorttext_b = shorttext[j + prefixLength :] 620 j = shorttext.find(seed, j + 1) 621 622 if len(best_common) * 2 >= len(longtext): 623 return ( 624 best_longtext_a, 625 best_longtext_b, 626 best_shorttext_a, 627 best_shorttext_b, 628 best_common, 629 ) 630 else: 631 return None 632 633 # First check if the second quarter is the seed for a half-match. 634 hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4) 635 # Check again based on the third quarter. 636 hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2) 637 if not hm1 and not hm2: 638 return None 639 elif not hm2: 640 hm = hm1 641 elif not hm1: 642 hm = hm2 643 else: 644 # Both matched. Select the longest. 645 if len(hm1[4]) > len(hm2[4]): 646 hm = hm1 647 else: 648 hm = hm2 649 650 # A half-match was found, sort out the return data. 651 if len(text1) > len(text2): 652 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm 653 else: 654 (text2_a, text2_b, text1_a, text1_b, mid_common) = hm 655 return (text1_a, text1_b, text2_a, text2_b, mid_common) 656 657 def diff_cleanupSemantic(self, diffs): 658 """Reduce the number of edits by eliminating semantically trivial 659 equalities. 660 661 Args: 662 diffs: Array of diff tuples. 663 """ 664 changes = False 665 equalities = [] # Stack of indices where equalities are found. 666 lastEquality = None # Always equal to diffs[equalities[-1]][1] 667 pointer = 0 # Index of current position. 668 # Number of chars that changed prior to the equality. 669 length_insertions1, length_deletions1 = 0, 0 670 # Number of chars that changed after the equality. 671 length_insertions2, length_deletions2 = 0, 0 672 while pointer < len(diffs): 673 if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found. 674 equalities.append(pointer) 675 length_insertions1, length_insertions2 = length_insertions2, 0 676 length_deletions1, length_deletions2 = length_deletions2, 0 677 lastEquality = diffs[pointer][1] 678 else: # An insertion or deletion. 679 if diffs[pointer][0] == self.DIFF_INSERT: 680 length_insertions2 += len(diffs[pointer][1]) 681 else: 682 length_deletions2 += len(diffs[pointer][1]) 683 # Eliminate an equality that is smaller or equal to the edits on both 684 # sides of it. 685 if ( 686 lastEquality 687 and ( 688 len(lastEquality) <= max(length_insertions1, length_deletions1) 689 ) 690 and ( 691 len(lastEquality) <= max(length_insertions2, length_deletions2) 692 ) 693 ): 694 # Duplicate record. 695 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastEquality)) 696 # Change second copy to insert. 697 diffs[equalities[-1] + 1] = ( 698 self.DIFF_INSERT, 699 diffs[equalities[-1] + 1][1], 700 ) 701 # Throw away the equality we just deleted. 702 equalities.pop() 703 # Throw away the previous equality (it needs to be reevaluated). 704 if len(equalities): 705 equalities.pop() 706 if len(equalities): 707 pointer = equalities[-1] 708 else: 709 pointer = -1 710 # Reset the counters. 711 length_insertions1, length_deletions1 = 0, 0 712 length_insertions2, length_deletions2 = 0, 0 713 lastEquality = None 714 changes = True 715 pointer += 1 716 717 # Normalize the diff. 718 if changes: 719 self.diff_cleanupMerge(diffs) 720 self.diff_cleanupSemanticLossless(diffs) 721 722 # Find any overlaps between deletions and insertions. 723 # e.g: <del>abcxxx</del><ins>xxxdef</ins> 724 # -> <del>abc</del>xxx<ins>def</ins> 725 # e.g: <del>xxxabc</del><ins>defxxx</ins> 726 # -> <ins>def</ins>xxx<del>abc</del> 727 # Only extract an overlap if it is as big as the edit ahead or behind it. 728 pointer = 1 729 while pointer < len(diffs): 730 if ( 731 diffs[pointer - 1][0] == self.DIFF_DELETE 732 and diffs[pointer][0] == self.DIFF_INSERT 733 ): 734 deletion = diffs[pointer - 1][1] 735 insertion = diffs[pointer][1] 736 overlap_length1 = self.diff_commonOverlap(deletion, insertion) 737 overlap_length2 = self.diff_commonOverlap(insertion, deletion) 738 if overlap_length1 >= overlap_length2: 739 if ( 740 overlap_length1 >= len(deletion) / 2.0 741 or overlap_length1 >= len(insertion) / 2.0 742 ): 743 # Overlap found. Insert an equality and trim the surrounding edits. 744 diffs.insert( 745 pointer, (self.DIFF_EQUAL, insertion[:overlap_length1]) 746 ) 747 diffs[pointer - 1] = ( 748 self.DIFF_DELETE, 749 deletion[: len(deletion) - overlap_length1], 750 ) 751 diffs[pointer + 1] = ( 752 self.DIFF_INSERT, 753 insertion[overlap_length1:], 754 ) 755 pointer += 1 756 else: 757 if ( 758 overlap_length2 >= len(deletion) / 2.0 759 or overlap_length2 >= len(insertion) / 2.0 760 ): 761 # Reverse overlap found. 762 # Insert an equality and swap and trim the surrounding edits. 763 diffs.insert( 764 pointer, (self.DIFF_EQUAL, deletion[:overlap_length2]) 765 ) 766 diffs[pointer - 1] = ( 767 self.DIFF_INSERT, 768 insertion[: len(insertion) - overlap_length2], 769 ) 770 diffs[pointer + 1] = ( 771 self.DIFF_DELETE, 772 deletion[overlap_length2:], 773 ) 774 pointer += 1 775 pointer += 1 776 pointer += 1 777 778 def diff_cleanupSemanticLossless(self, diffs): 779 """Look for single edits surrounded on both sides by equalities 780 which can be shifted sideways to align the edit to a word boundary. 781 e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came. 782 783 Args: 784 diffs: Array of diff tuples. 785 """ 786 787 def diff_cleanupSemanticScore(one, two): 788 """Given two strings, compute a score representing whether the 789 internal boundary falls on logical boundaries. 790 Scores range from 6 (best) to 0 (worst). 791 Closure, but does not reference any external variables. 792 793 Args: 794 one: First string. 795 two: Second string. 796 797 Returns: 798 The score. 799 """ 800 if not one or not two: 801 # Edges are the best. 802 return 6 803 804 # Each port of this function behaves slightly differently due to 805 # subtle differences in each language's definition of things like 806 # 'whitespace'. Since this function's purpose is largely cosmetic, 807 # the choice has been made to use each language's native features 808 # rather than force total conformity. 809 char1 = one[-1] 810 char2 = two[0] 811 nonAlphaNumeric1 = not char1.isalnum() 812 nonAlphaNumeric2 = not char2.isalnum() 813 whitespace1 = nonAlphaNumeric1 and char1.isspace() 814 whitespace2 = nonAlphaNumeric2 and char2.isspace() 815 lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n") 816 lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n") 817 blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one) 818 blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two) 819 820 if blankLine1 or blankLine2: 821 # Five points for blank lines. 822 return 5 823 elif lineBreak1 or lineBreak2: 824 # Four points for line breaks. 825 return 4 826 elif nonAlphaNumeric1 and not whitespace1 and whitespace2: 827 # Three points for end of sentences. 828 return 3 829 elif whitespace1 or whitespace2: 830 # Two points for whitespace. 831 return 2 832 elif nonAlphaNumeric1 or nonAlphaNumeric2: 833 # One point for non-alphanumeric. 834 return 1 835 return 0 836 837 pointer = 1 838 # Intentionally ignore the first and last element (don't need checking). 839 while pointer < len(diffs) - 1: 840 if ( 841 diffs[pointer - 1][0] == self.DIFF_EQUAL 842 and diffs[pointer + 1][0] == self.DIFF_EQUAL 843 ): 844 # This is a single edit surrounded by equalities. 845 equality1 = diffs[pointer - 1][1] 846 edit = diffs[pointer][1] 847 equality2 = diffs[pointer + 1][1] 848 849 # First, shift the edit as far left as possible. 850 commonOffset = self.diff_commonSuffix(equality1, edit) 851 if commonOffset: 852 commonString = edit[-commonOffset:] 853 equality1 = equality1[:-commonOffset] 854 edit = commonString + edit[:-commonOffset] 855 equality2 = commonString + equality2 856 857 # Second, step character by character right, looking for the best fit. 858 bestEquality1 = equality1 859 bestEdit = edit 860 bestEquality2 = equality2 861 bestScore = diff_cleanupSemanticScore( 862 equality1, edit 863 ) + diff_cleanupSemanticScore(edit, equality2) 864 while edit and equality2 and edit[0] == equality2[0]: 865 equality1 += edit[0] 866 edit = edit[1:] + equality2[0] 867 equality2 = equality2[1:] 868 score = diff_cleanupSemanticScore( 869 equality1, edit 870 ) + diff_cleanupSemanticScore(edit, equality2) 871 # The >= encourages trailing rather than leading whitespace on edits. 872 if score >= bestScore: 873 bestScore = score 874 bestEquality1 = equality1 875 bestEdit = edit 876 bestEquality2 = equality2 877 878 if diffs[pointer - 1][1] != bestEquality1: 879 # We have an improvement, save it back to the diff. 880 if bestEquality1: 881 diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1) 882 else: 883 del diffs[pointer - 1] 884 pointer -= 1 885 diffs[pointer] = (diffs[pointer][0], bestEdit) 886 if bestEquality2: 887 diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2) 888 else: 889 del diffs[pointer + 1] 890 pointer -= 1 891 pointer += 1 892 893 # Define some regex patterns for matching boundaries. 894 BLANKLINEEND = re.compile(r"\n\r?\n$") 895 BLANKLINESTART = re.compile(r"^\r?\n\r?\n") 896 897 def diff_cleanupEfficiency(self, diffs): 898 """Reduce the number of edits by eliminating operationally trivial 899 equalities. 900 901 Args: 902 diffs: Array of diff tuples. 903 """ 904 changes = False 905 equalities = [] # Stack of indices where equalities are found. 906 lastEquality = None # Always equal to diffs[equalities[-1]][1] 907 pointer = 0 # Index of current position. 908 pre_ins = False # Is there an insertion operation before the last equality. 909 pre_del = False # Is there a deletion operation before the last equality. 910 post_ins = False # Is there an insertion operation after the last equality. 911 post_del = False # Is there a deletion operation after the last equality. 912 while pointer < len(diffs): 913 if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found. 914 if len(diffs[pointer][1]) < self.Diff_EditCost and ( 915 post_ins or post_del 916 ): 917 # Candidate found. 918 equalities.append(pointer) 919 pre_ins = post_ins 920 pre_del = post_del 921 lastEquality = diffs[pointer][1] 922 else: 923 # Not a candidate, and can never become one. 924 equalities = [] 925 lastEquality = None 926 927 post_ins = post_del = False 928 else: # An insertion or deletion. 929 if diffs[pointer][0] == self.DIFF_DELETE: 930 post_del = True 931 else: 932 post_ins = True 933 934 # Five types to be split: 935 # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del> 936 # <ins>A</ins>X<ins>C</ins><del>D</del> 937 # <ins>A</ins><del>B</del>X<ins>C</ins> 938 # <ins>A</del>X<ins>C</ins><del>D</del> 939 # <ins>A</ins><del>B</del>X<del>C</del> 940 941 if lastEquality and ( 942 (pre_ins and pre_del and post_ins and post_del) 943 or ( 944 (len(lastEquality) < self.Diff_EditCost / 2) 945 and (pre_ins + pre_del + post_ins + post_del) == 3 946 ) 947 ): 948 # Duplicate record. 949 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastEquality)) 950 # Change second copy to insert. 951 diffs[equalities[-1] + 1] = ( 952 self.DIFF_INSERT, 953 diffs[equalities[-1] + 1][1], 954 ) 955 equalities.pop() # Throw away the equality we just deleted. 956 lastEquality = None 957 if pre_ins and pre_del: 958 # No changes made which could affect previous entry, keep going. 959 post_ins = post_del = True 960 equalities = [] 961 else: 962 if len(equalities): 963 equalities.pop() # Throw away the previous equality. 964 if len(equalities): 965 pointer = equalities[-1] 966 else: 967 pointer = -1 968 post_ins = post_del = False 969 changes = True 970 pointer += 1 971 972 if changes: 973 self.diff_cleanupMerge(diffs) 974 975 def diff_cleanupMerge(self, diffs): 976 """Reorder and merge like edit sections. Merge equalities. 977 Any edit section can move as long as it doesn't cross an equality. 978 979 Args: 980 diffs: Array of diff tuples. 981 """ 982 diffs.append((self.DIFF_EQUAL, "")) # Add a dummy entry at the end. 983 pointer = 0 984 count_delete = 0 985 count_insert = 0 986 text_delete = "" 987 text_insert = "" 988 while pointer < len(diffs): 989 if diffs[pointer][0] == self.DIFF_INSERT: 990 count_insert += 1 991 text_insert += diffs[pointer][1] 992 pointer += 1 993 elif diffs[pointer][0] == self.DIFF_DELETE: 994 count_delete += 1 995 text_delete += diffs[pointer][1] 996 pointer += 1 997 elif diffs[pointer][0] == self.DIFF_EQUAL: 998 # Upon reaching an equality, check for prior redundancies. 999 if count_delete + count_insert > 1: 1000 if count_delete != 0 and count_insert != 0: 1001 # Factor out any common prefixies. 1002 commonlength = self.diff_commonPrefix(text_insert, text_delete) 1003 if commonlength != 0: 1004 x = pointer - count_delete - count_insert - 1 1005 if x >= 0 and diffs[x][0] == self.DIFF_EQUAL: 1006 diffs[x] = ( 1007 diffs[x][0], 1008 diffs[x][1] + text_insert[:commonlength], 1009 ) 1010 else: 1011 diffs.insert( 1012 0, (self.DIFF_EQUAL, text_insert[:commonlength]) 1013 ) 1014 pointer += 1 1015 text_insert = text_insert[commonlength:] 1016 text_delete = text_delete[commonlength:] 1017 # Factor out any common suffixies. 1018 commonlength = self.diff_commonSuffix(text_insert, text_delete) 1019 if commonlength != 0: 1020 diffs[pointer] = ( 1021 diffs[pointer][0], 1022 text_insert[-commonlength:] + diffs[pointer][1], 1023 ) 1024 text_insert = text_insert[:-commonlength] 1025 text_delete = text_delete[:-commonlength] 1026 # Delete the offending records and add the merged ones. 1027 new_ops = [] 1028 if len(text_delete) != 0: 1029 new_ops.append((self.DIFF_DELETE, text_delete)) 1030 if len(text_insert) != 0: 1031 new_ops.append((self.DIFF_INSERT, text_insert)) 1032 pointer -= count_delete + count_insert 1033 diffs[pointer : pointer + count_delete + count_insert] = new_ops 1034 pointer += len(new_ops) + 1 1035 elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL: 1036 # Merge this equality with the previous one. 1037 diffs[pointer - 1] = ( 1038 diffs[pointer - 1][0], 1039 diffs[pointer - 1][1] + diffs[pointer][1], 1040 ) 1041 del diffs[pointer] 1042 else: 1043 pointer += 1 1044 1045 count_insert = 0 1046 count_delete = 0 1047 text_delete = "" 1048 text_insert = "" 1049 1050 if diffs[-1][1] == "": 1051 diffs.pop() # Remove the dummy entry at the end. 1052 1053 # Second pass: look for single edits surrounded on both sides by equalities 1054 # which can be shifted sideways to eliminate an equality. 1055 # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC 1056 changes = False 1057 pointer = 1 1058 # Intentionally ignore the first and last element (don't need checking). 1059 while pointer < len(diffs) - 1: 1060 if ( 1061 diffs[pointer - 1][0] == self.DIFF_EQUAL 1062 and diffs[pointer + 1][0] == self.DIFF_EQUAL 1063 ): 1064 # This is a single edit surrounded by equalities. 1065 if diffs[pointer][1].endswith(diffs[pointer - 1][1]): 1066 # Shift the edit over the previous equality. 1067 if diffs[pointer - 1][1] != "": 1068 diffs[pointer] = ( 1069 diffs[pointer][0], 1070 diffs[pointer - 1][1] 1071 + diffs[pointer][1][: -len(diffs[pointer - 1][1])], 1072 ) 1073 diffs[pointer + 1] = ( 1074 diffs[pointer + 1][0], 1075 diffs[pointer - 1][1] + diffs[pointer + 1][1], 1076 ) 1077 del diffs[pointer - 1] 1078 changes = True 1079 elif diffs[pointer][1].startswith(diffs[pointer + 1][1]): 1080 # Shift the edit over the next equality. 1081 diffs[pointer - 1] = ( 1082 diffs[pointer - 1][0], 1083 diffs[pointer - 1][1] + diffs[pointer + 1][1], 1084 ) 1085 diffs[pointer] = ( 1086 diffs[pointer][0], 1087 diffs[pointer][1][len(diffs[pointer + 1][1]) :] 1088 + diffs[pointer + 1][1], 1089 ) 1090 del diffs[pointer + 1] 1091 changes = True 1092 pointer += 1 1093 1094 # If shifts were made, the diff needs reordering and another shift sweep. 1095 if changes: 1096 self.diff_cleanupMerge(diffs) 1097 1098 def diff_xIndex(self, diffs, loc): 1099 """loc is a location in text1, compute and return the equivalent location 1100 in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8 1101 1102 Args: 1103 diffs: Array of diff tuples. 1104 loc: Location within text1. 1105 1106 Returns: 1107 Location within text2. 1108 """ 1109 chars1 = 0 1110 chars2 = 0 1111 last_chars1 = 0 1112 last_chars2 = 0 1113 for x in xrange(len(diffs)): 1114 (op, text) = diffs[x] 1115 if op != self.DIFF_INSERT: # Equality or deletion. 1116 chars1 += len(text) 1117 if op != self.DIFF_DELETE: # Equality or insertion. 1118 chars2 += len(text) 1119 if chars1 > loc: # Overshot the location. 1120 break 1121 last_chars1 = chars1 1122 last_chars2 = chars2 1123 1124 if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE: 1125 # The location was deleted. 1126 return last_chars2 1127 # Add the remaining len(character). 1128 return last_chars2 + (loc - last_chars1) 1129 1130 def diff_prettyHtml(self, diffs): 1131 """Convert a diff array into a pretty HTML report. 1132 1133 Args: 1134 diffs: Array of diff tuples. 1135 1136 Returns: 1137 HTML representation. 1138 """ 1139 html = [] 1140 for (op, data) in diffs: 1141 text = ( 1142 data.replace("&", "&") 1143 .replace("<", "<") 1144 .replace(">", ">") 1145 .replace("\n", "¶<br>") 1146 ) 1147 if op == self.DIFF_INSERT: 1148 html.append('<ins style="background:#e6ffe6;">%s</ins>' % text) 1149 elif op == self.DIFF_DELETE: 1150 html.append('<del style="background:#ffe6e6;">%s</del>' % text) 1151 elif op == self.DIFF_EQUAL: 1152 html.append("<span>%s</span>" % text) 1153 return "".join(html) 1154 1155 def diff_text1(self, diffs): 1156 """Compute and return the source text (all equalities and deletions). 1157 1158 Args: 1159 diffs: Array of diff tuples. 1160 1161 Returns: 1162 Source text. 1163 """ 1164 text = [] 1165 for (op, data) in diffs: 1166 if op != self.DIFF_INSERT: 1167 text.append(data) 1168 return "".join(text) 1169 1170 def diff_text2(self, diffs): 1171 """Compute and return the destination text (all equalities and insertions). 1172 1173 Args: 1174 diffs: Array of diff tuples. 1175 1176 Returns: 1177 Destination text. 1178 """ 1179 text = [] 1180 for (op, data) in diffs: 1181 if op != self.DIFF_DELETE: 1182 text.append(data) 1183 return "".join(text) 1184 1185 def diff_levenshtein(self, diffs): 1186 """Compute the Levenshtein distance; the number of inserted, deleted or 1187 substituted characters. 1188 1189 Args: 1190 diffs: Array of diff tuples. 1191 1192 Returns: 1193 Number of changes. 1194 """ 1195 levenshtein = 0 1196 insertions = 0 1197 deletions = 0 1198 for (op, data) in diffs: 1199 if op == self.DIFF_INSERT: 1200 insertions += len(data) 1201 elif op == self.DIFF_DELETE: 1202 deletions += len(data) 1203 elif op == self.DIFF_EQUAL: 1204 # A deletion and an insertion is one substitution. 1205 levenshtein += max(insertions, deletions) 1206 insertions = 0 1207 deletions = 0 1208 levenshtein += max(insertions, deletions) 1209 return levenshtein 1210 1211 def diff_toDelta(self, diffs): 1212 """Crush the diff into an encoded string which describes the operations 1213 required to transform text1 into text2. 1214 E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. 1215 Operations are tab-separated. Inserted text is escaped using %xx notation. 1216 1217 Args: 1218 diffs: Array of diff tuples. 1219 1220 Returns: 1221 Delta text. 1222 """ 1223 text = [] 1224 for (op, data) in diffs: 1225 if op == self.DIFF_INSERT: 1226 # High ascii will raise UnicodeDecodeError. Use Unicode instead. 1227 data = data.encode("utf-8") 1228 text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# ")) 1229 elif op == self.DIFF_DELETE: 1230 text.append("-%d" % len(data)) 1231 elif op == self.DIFF_EQUAL: 1232 text.append("=%d" % len(data)) 1233 return "\t".join(text) 1234 1235 def diff_fromDelta(self, text1, delta): 1236 """Given the original text1, and an encoded string which describes the 1237 operations required to transform text1 into text2, compute the full diff. 1238 1239 Args: 1240 text1: Source string for the diff. 1241 delta: Delta text. 1242 1243 Returns: 1244 Array of diff tuples. 1245 1246 Raises: 1247 ValueError: If invalid input. 1248 """ 1249 if type(delta) == unicode: 1250 # Deltas should be composed of a subset of ascii chars, Unicode not 1251 # required. If this encode raises UnicodeEncodeError, delta is invalid. 1252 delta = delta.encode("ascii") 1253 diffs = [] 1254 pointer = 0 # Cursor in text1 1255 tokens = delta.split("\t") 1256 for token in tokens: 1257 if token == "": 1258 # Blank tokens are ok (from a trailing \t). 1259 continue 1260 # Each token begins with a one character parameter which specifies the 1261 # operation of this token (delete, insert, equality). 1262 param = token[1:] 1263 if token[0] == "+": 1264 param = urllib.unquote(param).decode("utf-8") 1265 diffs.append((self.DIFF_INSERT, param)) 1266 elif token[0] == "-" or token[0] == "=": 1267 try: 1268 n = int(param) 1269 except ValueError: 1270 raise ValueError("Invalid number in diff_fromDelta: " + param) 1271 if n < 0: 1272 raise ValueError("Negative number in diff_fromDelta: " + param) 1273 text = text1[pointer : pointer + n] 1274 pointer += n 1275 if token[0] == "=": 1276 diffs.append((self.DIFF_EQUAL, text)) 1277 else: 1278 diffs.append((self.DIFF_DELETE, text)) 1279 else: 1280 # Anything else is an error. 1281 raise ValueError( 1282 "Invalid diff operation in diff_fromDelta: " + token[0] 1283 ) 1284 if pointer != len(text1): 1285 raise ValueError( 1286 "Delta length (%d) does not equal source text length (%d)." 1287 % (pointer, len(text1)) 1288 ) 1289 return diffs 1290 1291 # MATCH FUNCTIONS 1292 1293 def match_main(self, text, pattern, loc): 1294 """Locate the best instance of 'pattern' in 'text' near 'loc'. 1295 1296 Args: 1297 text: The text to search. 1298 pattern: The pattern to search for. 1299 loc: The location to search around. 1300 1301 Returns: 1302 Best match index or -1. 1303 """ 1304 # Check for null inputs. 1305 if text == None or pattern == None: 1306 raise ValueError("Null inputs. (match_main)") 1307 1308 loc = max(0, min(loc, len(text))) 1309 if text == pattern: 1310 # Shortcut (potentially not guaranteed by the algorithm) 1311 return 0 1312 elif not text: 1313 # Nothing to match. 1314 return -1 1315 elif text[loc : loc + len(pattern)] == pattern: 1316 # Perfect match at the perfect spot! (Includes case of null pattern) 1317 return loc 1318 else: 1319 # Do a fuzzy compare. 1320 match = self.match_bitap(text, pattern, loc) 1321 return match 1322 1323 def match_bitap(self, text, pattern, loc): 1324 """Locate the best instance of 'pattern' in 'text' near 'loc' using the 1325 Bitap algorithm. 1326 1327 Args: 1328 text: The text to search. 1329 pattern: The pattern to search for. 1330 loc: The location to search around. 1331 1332 Returns: 1333 Best match index or -1. 1334 """ 1335 # Python doesn't have a maxint limit, so ignore this check. 1336 # if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits: 1337 # raise ValueError("Pattern too long for this application.") 1338 1339 # Initialise the alphabet. 1340 s = self.match_alphabet(pattern) 1341 1342 def match_bitapScore(e, x): 1343 """Compute and return the score for a match with e errors and x location. 1344 Accesses loc and pattern through being a closure. 1345 1346 Args: 1347 e: Number of errors in match. 1348 x: Location of match. 1349 1350 Returns: 1351 Overall score for match (0.0 = good, 1.0 = bad). 1352 """ 1353 accuracy = float(e) / len(pattern) 1354 proximity = abs(loc - x) 1355 if not self.Match_Distance: 1356 # Dodge divide by zero error. 1357 return proximity and 1.0 or accuracy 1358 return accuracy + (proximity / float(self.Match_Distance)) 1359 1360 # Highest score beyond which we give up. 1361 score_threshold = self.Match_Threshold 1362 # Is there a nearby exact match? (speedup) 1363 best_loc = text.find(pattern, loc) 1364 if best_loc != -1: 1365 score_threshold = min(match_bitapScore(0, best_loc), score_threshold) 1366 # What about in the other direction? (speedup) 1367 best_loc = text.rfind(pattern, loc + len(pattern)) 1368 if best_loc != -1: 1369 score_threshold = min(match_bitapScore(0, best_loc), score_threshold) 1370 1371 # Initialise the bit arrays. 1372 matchmask = 1 << (len(pattern) - 1) 1373 best_loc = -1 1374 1375 bin_max = len(pattern) + len(text) 1376 # Empty initialization added to appease pychecker. 1377 last_rd = None 1378 for d in xrange(len(pattern)): 1379 # Scan for the best match each iteration allows for one more error. 1380 # Run a binary search to determine how far from 'loc' we can stray at 1381 # this error level. 1382 bin_min = 0 1383 bin_mid = bin_max 1384 while bin_min < bin_mid: 1385 if match_bitapScore(d, loc + bin_mid) <= score_threshold: 1386 bin_min = bin_mid 1387 else: 1388 bin_max = bin_mid 1389 bin_mid = (bin_max - bin_min) // 2 + bin_min 1390 1391 # Use the result from this iteration as the maximum for the next. 1392 bin_max = bin_mid 1393 start = max(1, loc - bin_mid + 1) 1394 finish = min(loc + bin_mid, len(text)) + len(pattern) 1395 1396 rd = [0] * (finish + 2) 1397 rd[finish + 1] = (1 << d) - 1 1398 for j in xrange(finish, start - 1, -1): 1399 if len(text) <= j - 1: 1400 # Out of range. 1401 charMatch = 0 1402 else: 1403 charMatch = s.get(text[j - 1], 0) 1404 if d == 0: # First pass: exact match. 1405 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch 1406 else: # Subsequent passes: fuzzy match. 1407 rd[j] = ( 1408 (((rd[j + 1] << 1) | 1) & charMatch) 1409 | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) 1410 | last_rd[j + 1] 1411 ) 1412 if rd[j] & matchmask: 1413 score = match_bitapScore(d, j - 1) 1414 # This match will almost certainly be better than any existing match. 1415 # But check anyway. 1416 if score <= score_threshold: 1417 # Told you so. 1418 score_threshold = score 1419 best_loc = j - 1 1420 if best_loc > loc: 1421 # When passing loc, don't exceed our current distance from loc. 1422 start = max(1, 2 * loc - best_loc) 1423 else: 1424 # Already passed loc, downhill from here on in. 1425 break 1426 # No hope for a (better) match at greater error levels. 1427 if match_bitapScore(d + 1, loc) > score_threshold: 1428 break 1429 last_rd = rd 1430 return best_loc 1431 1432 def match_alphabet(self, pattern): 1433 """Initialise the alphabet for the Bitap algorithm. 1434 1435 Args: 1436 pattern: The text to encode. 1437 1438 Returns: 1439 Hash of character locations. 1440 """ 1441 s = {} 1442 for char in pattern: 1443 s[char] = 0 1444 for i in xrange(len(pattern)): 1445 s[pattern[i]] |= 1 << (len(pattern) - i - 1) 1446 return s 1447 1448 # PATCH FUNCTIONS 1449 1450 def patch_addContext(self, patch, text): 1451 """Increase the context until it is unique, 1452 but don't let the pattern expand beyond Match_MaxBits. 1453 1454 Args: 1455 patch: The patch to grow. 1456 text: Source text. 1457 """ 1458 if len(text) == 0: 1459 return 1460 pattern = text[patch.start2 : patch.start2 + patch.length1] 1461 padding = 0 1462 1463 # Look for the first and last matches of pattern in text. If two different 1464 # matches are found, increase the pattern length. 1465 while text.find(pattern) != text.rfind(pattern) and ( 1466 self.Match_MaxBits == 0 1467 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - self.Patch_Margin 1468 ): 1469 padding += self.Patch_Margin 1470 pattern = text[ 1471 max(0, patch.start2 - padding) : patch.start2 + patch.length1 + padding 1472 ] 1473 # Add one chunk for good luck. 1474 padding += self.Patch_Margin 1475 1476 # Add the prefix. 1477 prefix = text[max(0, patch.start2 - padding) : patch.start2] 1478 if prefix: 1479 patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)] 1480 # Add the suffix. 1481 suffix = text[ 1482 patch.start2 + patch.length1 : patch.start2 + patch.length1 + padding 1483 ] 1484 if suffix: 1485 patch.diffs.append((self.DIFF_EQUAL, suffix)) 1486 1487 # Roll back the start points. 1488 patch.start1 -= len(prefix) 1489 patch.start2 -= len(prefix) 1490 # Extend lengths. 1491 patch.length1 += len(prefix) + len(suffix) 1492 patch.length2 += len(prefix) + len(suffix) 1493 1494 def patch_make(self, a, b=None, c=None): 1495 """Compute a list of patches to turn text1 into text2. 1496 Use diffs if provided, otherwise compute it ourselves. 1497 There are four ways to call this function, depending on what data is 1498 available to the caller: 1499 Method 1: 1500 a = text1, b = text2 1501 Method 2: 1502 a = diffs 1503 Method 3 (optimal): 1504 a = text1, b = diffs 1505 Method 4 (deprecated, use method 3): 1506 a = text1, b = text2, c = diffs 1507 1508 Args: 1509 a: text1 (methods 1,3,4) or Array of diff tuples for text1 to 1510 text2 (method 2). 1511 b: text2 (methods 1,4) or Array of diff tuples for text1 to 1512 text2 (method 3) or undefined (method 2). 1513 c: Array of diff tuples for text1 to text2 (method 4) or 1514 undefined (methods 1,2,3). 1515 1516 Returns: 1517 Array of Patch objects. 1518 """ 1519 text1 = None 1520 diffs = None 1521 # Note that texts may arrive as 'str' or 'unicode'. 1522 if isinstance(a, basestring) and isinstance(b, basestring) and c is None: 1523 # Method 1: text1, text2 1524 # Compute diffs from text1 and text2. 1525 text1 = a 1526 diffs = self.diff_main(text1, b, True) 1527 if len(diffs) > 2: 1528 self.diff_cleanupSemantic(diffs) 1529 self.diff_cleanupEfficiency(diffs) 1530 elif isinstance(a, list) and b is None and c is None: 1531 # Method 2: diffs 1532 # Compute text1 from diffs. 1533 diffs = a 1534 text1 = self.diff_text1(diffs) 1535 elif isinstance(a, basestring) and isinstance(b, list) and c is None: 1536 # Method 3: text1, diffs 1537 text1 = a 1538 diffs = b 1539 elif ( 1540 isinstance(a, basestring) 1541 and isinstance(b, basestring) 1542 and isinstance(c, list) 1543 ): 1544 # Method 4: text1, text2, diffs 1545 # text2 is not used. 1546 text1 = a 1547 diffs = c 1548 else: 1549 raise ValueError("Unknown call format to patch_make.") 1550 1551 if not diffs: 1552 return [] # Get rid of the None case. 1553 patches = [] 1554 patch = patch_obj() 1555 char_count1 = 0 # Number of characters into the text1 string. 1556 char_count2 = 0 # Number of characters into the text2 string. 1557 prepatch_text = text1 # Recreate the patches to determine context info. 1558 postpatch_text = text1 1559 for x in xrange(len(diffs)): 1560 (diff_type, diff_text) = diffs[x] 1561 if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL: 1562 # A new patch starts here. 1563 patch.start1 = char_count1 1564 patch.start2 = char_count2 1565 if diff_type == self.DIFF_INSERT: 1566 # Insertion 1567 patch.diffs.append(diffs[x]) 1568 patch.length2 += len(diff_text) 1569 postpatch_text = ( 1570 postpatch_text[:char_count2] 1571 + diff_text 1572 + postpatch_text[char_count2:] 1573 ) 1574 elif diff_type == self.DIFF_DELETE: 1575 # Deletion. 1576 patch.length1 += len(diff_text) 1577 patch.diffs.append(diffs[x]) 1578 postpatch_text = ( 1579 postpatch_text[:char_count2] 1580 + postpatch_text[char_count2 + len(diff_text) :] 1581 ) 1582 elif ( 1583 diff_type == self.DIFF_EQUAL 1584 and len(diff_text) <= 2 * self.Patch_Margin 1585 and len(patch.diffs) != 0 1586 and len(diffs) != x + 1 1587 ): 1588 # Small equality inside a patch. 1589 patch.diffs.append(diffs[x]) 1590 patch.length1 += len(diff_text) 1591 patch.length2 += len(diff_text) 1592 1593 if diff_type == self.DIFF_EQUAL and len(diff_text) >= 2 * self.Patch_Margin: 1594 # Time for a new patch. 1595 if len(patch.diffs) != 0: 1596 self.patch_addContext(patch, prepatch_text) 1597 patches.append(patch) 1598 patch = patch_obj() 1599 # Unlike Unidiff, our patch lists have a rolling context. 1600 # https://github.com/google/diff-match-patch/wiki/Unidiff 1601 # Update prepatch text & pos to reflect the application of the 1602 # just completed patch. 1603 prepatch_text = postpatch_text 1604 char_count1 = char_count2 1605 1606 # Update the current character count. 1607 if diff_type != self.DIFF_INSERT: 1608 char_count1 += len(diff_text) 1609 if diff_type != self.DIFF_DELETE: 1610 char_count2 += len(diff_text) 1611 1612 # Pick up the leftover patch if not empty. 1613 if len(patch.diffs) != 0: 1614 self.patch_addContext(patch, prepatch_text) 1615 patches.append(patch) 1616 return patches 1617 1618 def patch_deepCopy(self, patches): 1619 """Given an array of patches, return another array that is identical. 1620 1621 Args: 1622 patches: Array of Patch objects. 1623 1624 Returns: 1625 Array of Patch objects. 1626 """ 1627 patchesCopy = [] 1628 for patch in patches: 1629 patchCopy = patch_obj() 1630 # No need to deep copy the tuples since they are immutable. 1631 patchCopy.diffs = patch.diffs[:] 1632 patchCopy.start1 = patch.start1 1633 patchCopy.start2 = patch.start2 1634 patchCopy.length1 = patch.length1 1635 patchCopy.length2 = patch.length2 1636 patchesCopy.append(patchCopy) 1637 return patchesCopy 1638 1639 def patch_apply(self, patches, text): 1640 """Merge a set of patches onto the text. Return a patched text, as well 1641 as a list of true/false values indicating which patches were applied. 1642 1643 Args: 1644 patches: Array of Patch objects. 1645 text: Old text. 1646 1647 Returns: 1648 Two element Array, containing the new text and an array of boolean values. 1649 """ 1650 if not patches: 1651 return (text, []) 1652 1653 # Deep copy the patches so that no changes are made to originals. 1654 patches = self.patch_deepCopy(patches) 1655 1656 nullPadding = self.patch_addPadding(patches) 1657 text = nullPadding + text + nullPadding 1658 self.patch_splitMax(patches) 1659 1660 # delta keeps track of the offset between the expected and actual location 1661 # of the previous patch. If there are patches expected at positions 10 and 1662 # 20, but the first patch was found at 12, delta is 2 and the second patch 1663 # has an effective expected position of 22. 1664 delta = 0 1665 results = [] 1666 for patch in patches: 1667 expected_loc = patch.start2 + delta 1668 text1 = self.diff_text1(patch.diffs) 1669 end_loc = -1 1670 if len(text1) > self.Match_MaxBits: 1671 # patch_splitMax will only provide an oversized pattern in the case of 1672 # a monster delete. 1673 start_loc = self.match_main( 1674 text, text1[: self.Match_MaxBits], expected_loc 1675 ) 1676 if start_loc != -1: 1677 end_loc = self.match_main( 1678 text, 1679 text1[-self.Match_MaxBits :], 1680 expected_loc + len(text1) - self.Match_MaxBits, 1681 ) 1682 if end_loc == -1 or start_loc >= end_loc: 1683 # Can't find valid trailing context. Drop this patch. 1684 start_loc = -1 1685 else: 1686 start_loc = self.match_main(text, text1, expected_loc) 1687 if start_loc == -1: 1688 # No match found. :( 1689 results.append(False) 1690 # Subtract the delta for this failed patch from subsequent patches. 1691 delta -= patch.length2 - patch.length1 1692 else: 1693 # Found a match. :) 1694 results.append(True) 1695 delta = start_loc - expected_loc 1696 if end_loc == -1: 1697 text2 = text[start_loc : start_loc + len(text1)] 1698 else: 1699 text2 = text[start_loc : end_loc + self.Match_MaxBits] 1700 if text1 == text2: 1701 # Perfect match, just shove the replacement text in. 1702 text = ( 1703 text[:start_loc] 1704 + self.diff_text2(patch.diffs) 1705 + text[start_loc + len(text1) :] 1706 ) 1707 else: 1708 # Imperfect match. 1709 # Run a diff to get a framework of equivalent indices. 1710 diffs = self.diff_main(text1, text2, False) 1711 if ( 1712 len(text1) > self.Match_MaxBits 1713 and self.diff_levenshtein(diffs) / float(len(text1)) 1714 > self.Patch_DeleteThreshold 1715 ): 1716 # The end points match, but the content is unacceptably bad. 1717 results[-1] = False 1718 else: 1719 self.diff_cleanupSemanticLossless(diffs) 1720 index1 = 0 1721 for (op, data) in patch.diffs: 1722 if op != self.DIFF_EQUAL: 1723 index2 = self.diff_xIndex(diffs, index1) 1724 if op == self.DIFF_INSERT: # Insertion 1725 text = ( 1726 text[: start_loc + index2] 1727 + data 1728 + text[start_loc + index2 :] 1729 ) 1730 elif op == self.DIFF_DELETE: # Deletion 1731 text = ( 1732 text[: start_loc + index2] 1733 + text[ 1734 start_loc 1735 + self.diff_xIndex(diffs, index1 + len(data)) : 1736 ] 1737 ) 1738 if op != self.DIFF_DELETE: 1739 index1 += len(data) 1740 # Strip the padding off. 1741 text = text[len(nullPadding) : -len(nullPadding)] 1742 return (text, results) 1743 1744 def patch_addPadding(self, patches): 1745 """Add some padding on text start and end so that edges can match 1746 something. Intended to be called only from within patch_apply. 1747 1748 Args: 1749 patches: Array of Patch objects. 1750 1751 Returns: 1752 The padding string added to each side. 1753 """ 1754 paddingLength = self.Patch_Margin 1755 nullPadding = "" 1756 for x in xrange(1, paddingLength + 1): 1757 nullPadding += chr(x) 1758 1759 # Bump all the patches forward. 1760 for patch in patches: 1761 patch.start1 += paddingLength 1762 patch.start2 += paddingLength 1763 1764 # Add some padding on start of first diff. 1765 patch = patches[0] 1766 diffs = patch.diffs 1767 if not diffs or diffs[0][0] != self.DIFF_EQUAL: 1768 # Add nullPadding equality. 1769 diffs.insert(0, (self.DIFF_EQUAL, nullPadding)) 1770 patch.start1 -= paddingLength # Should be 0. 1771 patch.start2 -= paddingLength # Should be 0. 1772 patch.length1 += paddingLength 1773 patch.length2 += paddingLength 1774 elif paddingLength > len(diffs[0][1]): 1775 # Grow first equality. 1776 extraLength = paddingLength - len(diffs[0][1]) 1777 newText = nullPadding[len(diffs[0][1]) :] + diffs[0][1] 1778 diffs[0] = (diffs[0][0], newText) 1779 patch.start1 -= extraLength 1780 patch.start2 -= extraLength 1781 patch.length1 += extraLength 1782 patch.length2 += extraLength 1783 1784 # Add some padding on end of last diff. 1785 patch = patches[-1] 1786 diffs = patch.diffs 1787 if not diffs or diffs[-1][0] != self.DIFF_EQUAL: 1788 # Add nullPadding equality. 1789 diffs.append((self.DIFF_EQUAL, nullPadding)) 1790 patch.length1 += paddingLength 1791 patch.length2 += paddingLength 1792 elif paddingLength > len(diffs[-1][1]): 1793 # Grow last equality. 1794 extraLength = paddingLength - len(diffs[-1][1]) 1795 newText = diffs[-1][1] + nullPadding[:extraLength] 1796 diffs[-1] = (diffs[-1][0], newText) 1797 patch.length1 += extraLength 1798 patch.length2 += extraLength 1799 1800 return nullPadding 1801 1802 def patch_splitMax(self, patches): 1803 """Look through the patches and break up any which are longer than the 1804 maximum limit of the match algorithm. 1805 Intended to be called only from within patch_apply. 1806 1807 Args: 1808 patches: Array of Patch objects. 1809 """ 1810 patch_size = self.Match_MaxBits 1811 if patch_size == 0: 1812 # Python has the option of not splitting strings due to its ability 1813 # to handle integers of arbitrary precision. 1814 return 1815 for x in xrange(len(patches)): 1816 if patches[x].length1 <= patch_size: 1817 continue 1818 bigpatch = patches[x] 1819 # Remove the big old patch. 1820 del patches[x] 1821 x -= 1 1822 start1 = bigpatch.start1 1823 start2 = bigpatch.start2 1824 precontext = "" 1825 while len(bigpatch.diffs) != 0: 1826 # Create one of several smaller patches. 1827 patch = patch_obj() 1828 empty = True 1829 patch.start1 = start1 - len(precontext) 1830 patch.start2 = start2 - len(precontext) 1831 if precontext: 1832 patch.length1 = patch.length2 = len(precontext) 1833 patch.diffs.append((self.DIFF_EQUAL, precontext)) 1834 1835 while ( 1836 len(bigpatch.diffs) != 0 1837 and patch.length1 < patch_size - self.Patch_Margin 1838 ): 1839 (diff_type, diff_text) = bigpatch.diffs[0] 1840 if diff_type == self.DIFF_INSERT: 1841 # Insertions are harmless. 1842 patch.length2 += len(diff_text) 1843 start2 += len(diff_text) 1844 patch.diffs.append(bigpatch.diffs.pop(0)) 1845 empty = False 1846 elif ( 1847 diff_type == self.DIFF_DELETE 1848 and len(patch.diffs) == 1 1849 and patch.diffs[0][0] == self.DIFF_EQUAL 1850 and len(diff_text) > 2 * patch_size 1851 ): 1852 # This is a large deletion. Let it pass in one chunk. 1853 patch.length1 += len(diff_text) 1854 start1 += len(diff_text) 1855 empty = False 1856 patch.diffs.append((diff_type, diff_text)) 1857 del bigpatch.diffs[0] 1858 else: 1859 # Deletion or equality. Only take as much as we can stomach. 1860 diff_text = diff_text[ 1861 : patch_size - patch.length1 - self.Patch_Margin 1862 ] 1863 patch.length1 += len(diff_text) 1864 start1 += len(diff_text) 1865 if diff_type == self.DIFF_EQUAL: 1866 patch.length2 += len(diff_text) 1867 start2 += len(diff_text) 1868 else: 1869 empty = False 1870 1871 patch.diffs.append((diff_type, diff_text)) 1872 if diff_text == bigpatch.diffs[0][1]: 1873 del bigpatch.diffs[0] 1874 else: 1875 bigpatch.diffs[0] = ( 1876 bigpatch.diffs[0][0], 1877 bigpatch.diffs[0][1][len(diff_text) :], 1878 ) 1879 1880 # Compute the head context for the next patch. 1881 precontext = self.diff_text2(patch.diffs) 1882 precontext = precontext[-self.Patch_Margin :] 1883 # Append the end context for this patch. 1884 postcontext = self.diff_text1(bigpatch.diffs)[: self.Patch_Margin] 1885 if postcontext: 1886 patch.length1 += len(postcontext) 1887 patch.length2 += len(postcontext) 1888 if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL: 1889 patch.diffs[-1] = ( 1890 self.DIFF_EQUAL, 1891 patch.diffs[-1][1] + postcontext, 1892 ) 1893 else: 1894 patch.diffs.append((self.DIFF_EQUAL, postcontext)) 1895 1896 if not empty: 1897 x += 1 1898 patches.insert(x, patch) 1899 1900 def patch_toText(self, patches): 1901 """Take a list of patches and return a textual representation. 1902 1903 Args: 1904 patches: Array of Patch objects. 1905 1906 Returns: 1907 Text representation of patches. 1908 """ 1909 text = [] 1910 for patch in patches: 1911 text.append(str(patch)) 1912 return "".join(text) 1913 1914 def patch_fromText(self, textline): 1915 """Parse a textual representation of patches and return a list of patch 1916 objects. 1917 1918 Args: 1919 textline: Text representation of patches. 1920 1921 Returns: 1922 Array of Patch objects. 1923 1924 Raises: 1925 ValueError: If invalid input. 1926 """ 1927 if type(textline) == unicode: 1928 # Patches should be composed of a subset of ascii chars, Unicode not 1929 # required. If this encode raises UnicodeEncodeError, patch is invalid. 1930 textline = textline.encode("ascii") 1931 patches = [] 1932 if not textline: 1933 return patches 1934 text = textline.split("\n") 1935 while len(text) != 0: 1936 m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0]) 1937 if not m: 1938 raise ValueError("Invalid patch string: " + text[0]) 1939 patch = patch_obj() 1940 patches.append(patch) 1941 patch.start1 = int(m.group(1)) 1942 if m.group(2) == "": 1943 patch.start1 -= 1 1944 patch.length1 = 1 1945 elif m.group(2) == "0": 1946 patch.length1 = 0 1947 else: 1948 patch.start1 -= 1 1949 patch.length1 = int(m.group(2)) 1950 1951 patch.start2 = int(m.group(3)) 1952 if m.group(4) == "": 1953 patch.start2 -= 1 1954 patch.length2 = 1 1955 elif m.group(4) == "0": 1956 patch.length2 = 0 1957 else: 1958 patch.start2 -= 1 1959 patch.length2 = int(m.group(4)) 1960 1961 del text[0] 1962 1963 while len(text) != 0: 1964 if text[0]: 1965 sign = text[0][0] 1966 else: 1967 sign = "" 1968 line = urllib.unquote(text[0][1:]) 1969 line = line.decode("utf-8") 1970 if sign == "+": 1971 # Insertion. 1972 patch.diffs.append((self.DIFF_INSERT, line)) 1973 elif sign == "-": 1974 # Deletion. 1975 patch.diffs.append((self.DIFF_DELETE, line)) 1976 elif sign == " ": 1977 # Minor equality. 1978 patch.diffs.append((self.DIFF_EQUAL, line)) 1979 elif sign == "@": 1980 # Start of next patch. 1981 break 1982 elif sign == "": 1983 # Blank line? Whatever. 1984 pass 1985 else: 1986 # WTF? 1987 raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line)) 1988 del text[0] 1989 return patches 1990 1991 1992class patch_obj: 1993 """Class representing one patch operation. 1994 """ 1995 1996 def __init__(self): 1997 """Initializes with an empty list of diffs. 1998 """ 1999 self.diffs = [] 2000 self.start1 = None 2001 self.start2 = None 2002 self.length1 = 0 2003 self.length2 = 0 2004 2005 def __str__(self): 2006 """Emulate GNU diff's format. 2007 Header: @@ -382,8 +481,9 @@ 2008 Indices are printed as 1-based, not 0-based. 2009 2010 Returns: 2011 The GNU diff string. 2012 """ 2013 if self.length1 == 0: 2014 coords1 = str(self.start1) + ",0" 2015 elif self.length1 == 1: 2016 coords1 = str(self.start1 + 1) 2017 else: 2018 coords1 = str(self.start1 + 1) + "," + str(self.length1) 2019 if self.length2 == 0: 2020 coords2 = str(self.start2) + ",0" 2021 elif self.length2 == 1: 2022 coords2 = str(self.start2 + 1) 2023 else: 2024 coords2 = str(self.start2 + 1) + "," + str(self.length2) 2025 text = ["@@ -", coords1, " +", coords2, " @@\n"] 2026 # Escape the body of the patch with %xx notation. 2027 for (op, data) in self.diffs: 2028 if op == diff_match_patch.DIFF_INSERT: 2029 text.append("+") 2030 elif op == diff_match_patch.DIFF_DELETE: 2031 text.append("-") 2032 elif op == diff_match_patch.DIFF_EQUAL: 2033 text.append(" ") 2034 # High ascii will raise UnicodeDecodeError. Use Unicode instead. 2035 data = data.encode("utf-8") 2036 text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + "\n") 2037 return "".join(text) 2038