1# mdiff.py - diff and patch routines for mercurial 2# 3# Copyright 2005, 2006 Olivia Mackall <olivia@selenic.com> 4# 5# This software may be used and distributed according to the terms of the 6# GNU General Public License version 2 or any later version. 7 8from __future__ import absolute_import 9 10import re 11import struct 12import zlib 13 14from .i18n import _ 15from .pycompat import ( 16 getattr, 17 setattr, 18) 19from . import ( 20 diffhelper, 21 encoding, 22 error, 23 policy, 24 pycompat, 25 util, 26) 27from .utils import dateutil 28 29bdiff = policy.importmod('bdiff') 30mpatch = policy.importmod('mpatch') 31 32blocks = bdiff.blocks 33fixws = bdiff.fixws 34patches = mpatch.patches 35patchedsize = mpatch.patchedsize 36textdiff = bdiff.bdiff 37splitnewlines = bdiff.splitnewlines 38 39 40# TODO: this looks like it could be an attrs, which might help pytype 41class diffopts(object): 42 """context is the number of context lines 43 text treats all files as text 44 showfunc enables diff -p output 45 git enables the git extended patch format 46 nodates removes dates from diff headers 47 nobinary ignores binary files 48 noprefix disables the 'a/' and 'b/' prefixes (ignored in plain mode) 49 ignorews ignores all whitespace changes in the diff 50 ignorewsamount ignores changes in the amount of whitespace 51 ignoreblanklines ignores changes whose lines are all blank 52 upgrade generates git diffs to avoid data loss 53 """ 54 55 _HAS_DYNAMIC_ATTRIBUTES = True 56 57 defaults = { 58 b'context': 3, 59 b'text': False, 60 b'showfunc': False, 61 b'git': False, 62 b'nodates': False, 63 b'nobinary': False, 64 b'noprefix': False, 65 b'index': 0, 66 b'ignorews': False, 67 b'ignorewsamount': False, 68 b'ignorewseol': False, 69 b'ignoreblanklines': False, 70 b'upgrade': False, 71 b'showsimilarity': False, 72 b'worddiff': False, 73 b'xdiff': False, 74 } 75 76 def __init__(self, **opts): 77 opts = pycompat.byteskwargs(opts) 78 for k in self.defaults.keys(): 79 v = opts.get(k) 80 if v is None: 81 v = self.defaults[k] 82 setattr(self, k, v) 83 84 try: 85 self.context = int(self.context) 86 except ValueError: 87 raise error.Abort( 88 _(b'diff context lines count must be an integer, not %r') 89 % pycompat.bytestr(self.context) 90 ) 91 92 def copy(self, **kwargs): 93 opts = {k: getattr(self, k) for k in self.defaults} 94 opts = pycompat.strkwargs(opts) 95 opts.update(kwargs) 96 return diffopts(**opts) 97 98 99defaultopts = diffopts() 100 101 102def wsclean(opts, text, blank=True): 103 if opts.ignorews: 104 text = bdiff.fixws(text, 1) 105 elif opts.ignorewsamount: 106 text = bdiff.fixws(text, 0) 107 if blank and opts.ignoreblanklines: 108 text = re.sub(b'\n+', b'\n', text).strip(b'\n') 109 if opts.ignorewseol: 110 text = re.sub(br'[ \t\r\f]+\n', br'\n', text) 111 return text 112 113 114def splitblock(base1, lines1, base2, lines2, opts): 115 # The input lines matches except for interwoven blank lines. We 116 # transform it into a sequence of matching blocks and blank blocks. 117 lines1 = [(wsclean(opts, l) and 1 or 0) for l in lines1] 118 lines2 = [(wsclean(opts, l) and 1 or 0) for l in lines2] 119 s1, e1 = 0, len(lines1) 120 s2, e2 = 0, len(lines2) 121 while s1 < e1 or s2 < e2: 122 i1, i2, btype = s1, s2, b'=' 123 if i1 >= e1 or lines1[i1] == 0 or i2 >= e2 or lines2[i2] == 0: 124 # Consume the block of blank lines 125 btype = b'~' 126 while i1 < e1 and lines1[i1] == 0: 127 i1 += 1 128 while i2 < e2 and lines2[i2] == 0: 129 i2 += 1 130 else: 131 # Consume the matching lines 132 while i1 < e1 and lines1[i1] == 1 and lines2[i2] == 1: 133 i1 += 1 134 i2 += 1 135 yield [base1 + s1, base1 + i1, base2 + s2, base2 + i2], btype 136 s1 = i1 137 s2 = i2 138 139 140def hunkinrange(hunk, linerange): 141 """Return True if `hunk` defined as (start, length) is in `linerange` 142 defined as (lowerbound, upperbound). 143 144 >>> hunkinrange((5, 10), (2, 7)) 145 True 146 >>> hunkinrange((5, 10), (6, 12)) 147 True 148 >>> hunkinrange((5, 10), (13, 17)) 149 True 150 >>> hunkinrange((5, 10), (3, 17)) 151 True 152 >>> hunkinrange((5, 10), (1, 3)) 153 False 154 >>> hunkinrange((5, 10), (18, 20)) 155 False 156 >>> hunkinrange((5, 10), (1, 5)) 157 False 158 >>> hunkinrange((5, 10), (15, 27)) 159 False 160 """ 161 start, length = hunk 162 lowerbound, upperbound = linerange 163 return lowerbound < start + length and start < upperbound 164 165 166def blocksinrange(blocks, rangeb): 167 """filter `blocks` like (a1, a2, b1, b2) from items outside line range 168 `rangeb` from ``(b1, b2)`` point of view. 169 170 Return `filteredblocks, rangea` where: 171 172 * `filteredblocks` is list of ``block = (a1, a2, b1, b2), stype`` items of 173 `blocks` that are inside `rangeb` from ``(b1, b2)`` point of view; a 174 block ``(b1, b2)`` being inside `rangeb` if 175 ``rangeb[0] < b2 and b1 < rangeb[1]``; 176 * `rangea` is the line range w.r.t. to ``(a1, a2)`` parts of `blocks`. 177 """ 178 lbb, ubb = rangeb 179 lba, uba = None, None 180 filteredblocks = [] 181 for block in blocks: 182 (a1, a2, b1, b2), stype = block 183 if lbb >= b1 and ubb <= b2 and stype == b'=': 184 # rangeb is within a single "=" hunk, restrict back linerange1 185 # by offsetting rangeb 186 lba = lbb - b1 + a1 187 uba = ubb - b1 + a1 188 else: 189 if b1 <= lbb < b2: 190 if stype == b'=': 191 lba = a2 - (b2 - lbb) 192 else: 193 lba = a1 194 if b1 < ubb <= b2: 195 if stype == b'=': 196 uba = a1 + (ubb - b1) 197 else: 198 uba = a2 199 if hunkinrange((b1, (b2 - b1)), rangeb): 200 filteredblocks.append(block) 201 if lba is None or uba is None or uba < lba: 202 raise error.InputError(_(b'line range exceeds file size')) 203 return filteredblocks, (lba, uba) 204 205 206def chooseblocksfunc(opts=None): 207 if ( 208 opts is None 209 or not opts.xdiff 210 or not util.safehasattr(bdiff, b'xdiffblocks') 211 ): 212 return bdiff.blocks 213 else: 214 return bdiff.xdiffblocks 215 216 217def allblocks(text1, text2, opts=None, lines1=None, lines2=None): 218 """Return (block, type) tuples, where block is an mdiff.blocks 219 line entry. type is '=' for blocks matching exactly one another 220 (bdiff blocks), '!' for non-matching blocks and '~' for blocks 221 matching only after having filtered blank lines. 222 line1 and line2 are text1 and text2 split with splitnewlines() if 223 they are already available. 224 """ 225 if opts is None: 226 opts = defaultopts 227 if opts.ignorews or opts.ignorewsamount or opts.ignorewseol: 228 text1 = wsclean(opts, text1, False) 229 text2 = wsclean(opts, text2, False) 230 diff = chooseblocksfunc(opts)(text1, text2) 231 for i, s1 in enumerate(diff): 232 # The first match is special. 233 # we've either found a match starting at line 0 or a match later 234 # in the file. If it starts later, old and new below will both be 235 # empty and we'll continue to the next match. 236 if i > 0: 237 s = diff[i - 1] 238 else: 239 s = [0, 0, 0, 0] 240 s = [s[1], s1[0], s[3], s1[2]] 241 242 # bdiff sometimes gives huge matches past eof, this check eats them, 243 # and deals with the special first match case described above 244 if s[0] != s[1] or s[2] != s[3]: 245 type = b'!' 246 if opts.ignoreblanklines: 247 if lines1 is None: 248 lines1 = splitnewlines(text1) 249 if lines2 is None: 250 lines2 = splitnewlines(text2) 251 old = wsclean(opts, b"".join(lines1[s[0] : s[1]])) 252 new = wsclean(opts, b"".join(lines2[s[2] : s[3]])) 253 if old == new: 254 type = b'~' 255 yield s, type 256 yield s1, b'=' 257 258 259def unidiff(a, ad, b, bd, fn1, fn2, binary, opts=defaultopts): 260 """Return a unified diff as a (headers, hunks) tuple. 261 262 If the diff is not null, `headers` is a list with unified diff header 263 lines "--- <original>" and "+++ <new>" and `hunks` is a generator yielding 264 (hunkrange, hunklines) coming from _unidiff(). 265 Otherwise, `headers` and `hunks` are empty. 266 267 Set binary=True if either a or b should be taken as a binary file. 268 """ 269 270 def datetag(date, fn=None): 271 if not opts.git and not opts.nodates: 272 return b'\t%s' % date 273 if fn and b' ' in fn: 274 return b'\t' 275 return b'' 276 277 sentinel = [], () 278 if not a and not b: 279 return sentinel 280 281 if opts.noprefix: 282 aprefix = bprefix = b'' 283 else: 284 aprefix = b'a/' 285 bprefix = b'b/' 286 287 epoch = dateutil.datestr((0, 0)) 288 289 fn1 = util.pconvert(fn1) 290 fn2 = util.pconvert(fn2) 291 292 if binary: 293 if a and b and len(a) == len(b) and a == b: 294 return sentinel 295 headerlines = [] 296 hunks = ((None, [b'Binary file %s has changed\n' % fn1]),) 297 elif not a: 298 without_newline = not b.endswith(b'\n') 299 b = splitnewlines(b) 300 if a is None: 301 l1 = b'--- /dev/null%s' % datetag(epoch) 302 else: 303 l1 = b"--- %s%s%s" % (aprefix, fn1, datetag(ad, fn1)) 304 l2 = b"+++ %s%s" % (bprefix + fn2, datetag(bd, fn2)) 305 headerlines = [l1, l2] 306 size = len(b) 307 hunkrange = (0, 0, 1, size) 308 hunklines = [b"@@ -0,0 +1,%d @@\n" % size] + [b"+" + e for e in b] 309 if without_newline: 310 hunklines[-1] += b'\n' 311 hunklines.append(diffhelper.MISSING_NEWLINE_MARKER) 312 hunks = ((hunkrange, hunklines),) 313 elif not b: 314 without_newline = not a.endswith(b'\n') 315 a = splitnewlines(a) 316 l1 = b"--- %s%s%s" % (aprefix, fn1, datetag(ad, fn1)) 317 if b is None: 318 l2 = b'+++ /dev/null%s' % datetag(epoch) 319 else: 320 l2 = b"+++ %s%s%s" % (bprefix, fn2, datetag(bd, fn2)) 321 headerlines = [l1, l2] 322 size = len(a) 323 hunkrange = (1, size, 0, 0) 324 hunklines = [b"@@ -1,%d +0,0 @@\n" % size] + [b"-" + e for e in a] 325 if without_newline: 326 hunklines[-1] += b'\n' 327 hunklines.append(diffhelper.MISSING_NEWLINE_MARKER) 328 hunks = ((hunkrange, hunklines),) 329 else: 330 hunks = _unidiff(a, b, opts=opts) 331 if not next(hunks): 332 return sentinel 333 334 headerlines = [ 335 b"--- %s%s%s" % (aprefix, fn1, datetag(ad, fn1)), 336 b"+++ %s%s%s" % (bprefix, fn2, datetag(bd, fn2)), 337 ] 338 339 return headerlines, hunks 340 341 342def _unidiff(t1, t2, opts=defaultopts): 343 """Yield hunks of a headerless unified diff from t1 and t2 texts. 344 345 Each hunk consists of a (hunkrange, hunklines) tuple where `hunkrange` is a 346 tuple (s1, l1, s2, l2) representing the range information of the hunk to 347 form the '@@ -s1,l1 +s2,l2 @@' header and `hunklines` is a list of lines 348 of the hunk combining said header followed by line additions and 349 deletions. 350 351 The hunks are prefixed with a bool. 352 """ 353 l1 = splitnewlines(t1) 354 l2 = splitnewlines(t2) 355 356 def contextend(l, len): 357 ret = l + opts.context 358 if ret > len: 359 ret = len 360 return ret 361 362 def contextstart(l): 363 ret = l - opts.context 364 if ret < 0: 365 return 0 366 return ret 367 368 lastfunc = [0, b''] 369 370 def yieldhunk(hunk): 371 (astart, a2, bstart, b2, delta) = hunk 372 aend = contextend(a2, len(l1)) 373 alen = aend - astart 374 blen = b2 - bstart + aend - a2 375 376 func = b"" 377 if opts.showfunc: 378 lastpos, func = lastfunc 379 # walk backwards from the start of the context up to the start of 380 # the previous hunk context until we find a line starting with an 381 # alphanumeric char. 382 for i in pycompat.xrange(astart - 1, lastpos - 1, -1): 383 if l1[i][0:1].isalnum(): 384 func = b' ' + l1[i].rstrip() 385 # split long function name if ASCII. otherwise we have no 386 # idea where the multi-byte boundary is, so just leave it. 387 if encoding.isasciistr(func): 388 func = func[:41] 389 lastfunc[1] = func 390 break 391 # by recording this hunk's starting point as the next place to 392 # start looking for function lines, we avoid reading any line in 393 # the file more than once. 394 lastfunc[0] = astart 395 396 # zero-length hunk ranges report their start line as one less 397 if alen: 398 astart += 1 399 if blen: 400 bstart += 1 401 402 hunkrange = astart, alen, bstart, blen 403 hunklines = ( 404 [b"@@ -%d,%d +%d,%d @@%s\n" % (hunkrange + (func,))] 405 + delta 406 + [b' ' + l1[x] for x in pycompat.xrange(a2, aend)] 407 ) 408 # If either file ends without a newline and the last line of 409 # that file is part of a hunk, a marker is printed. If the 410 # last line of both files is identical and neither ends in 411 # a newline, print only one marker. That's the only case in 412 # which the hunk can end in a shared line without a newline. 413 skip = False 414 if not t1.endswith(b'\n') and astart + alen == len(l1) + 1: 415 for i in pycompat.xrange(len(hunklines) - 1, -1, -1): 416 if hunklines[i].startswith((b'-', b' ')): 417 if hunklines[i].startswith(b' '): 418 skip = True 419 hunklines[i] += b'\n' 420 hunklines.insert(i + 1, diffhelper.MISSING_NEWLINE_MARKER) 421 break 422 if not skip and not t2.endswith(b'\n') and bstart + blen == len(l2) + 1: 423 for i in pycompat.xrange(len(hunklines) - 1, -1, -1): 424 if hunklines[i].startswith(b'+'): 425 hunklines[i] += b'\n' 426 hunklines.insert(i + 1, diffhelper.MISSING_NEWLINE_MARKER) 427 break 428 yield hunkrange, hunklines 429 430 # bdiff.blocks gives us the matching sequences in the files. The loop 431 # below finds the spaces between those matching sequences and translates 432 # them into diff output. 433 # 434 hunk = None 435 ignoredlines = 0 436 has_hunks = False 437 for s, stype in allblocks(t1, t2, opts, l1, l2): 438 a1, a2, b1, b2 = s 439 if stype != b'!': 440 if stype == b'~': 441 # The diff context lines are based on t1 content. When 442 # blank lines are ignored, the new lines offsets must 443 # be adjusted as if equivalent blocks ('~') had the 444 # same sizes on both sides. 445 ignoredlines += (b2 - b1) - (a2 - a1) 446 continue 447 delta = [] 448 old = l1[a1:a2] 449 new = l2[b1:b2] 450 451 b1 -= ignoredlines 452 b2 -= ignoredlines 453 astart = contextstart(a1) 454 bstart = contextstart(b1) 455 prev = None 456 if hunk: 457 # join with the previous hunk if it falls inside the context 458 if astart < hunk[1] + opts.context + 1: 459 prev = hunk 460 astart = hunk[1] 461 bstart = hunk[3] 462 else: 463 if not has_hunks: 464 has_hunks = True 465 yield True 466 for x in yieldhunk(hunk): 467 yield x 468 if prev: 469 # we've joined the previous hunk, record the new ending points. 470 hunk[1] = a2 471 hunk[3] = b2 472 delta = hunk[4] 473 else: 474 # create a new hunk 475 hunk = [astart, a2, bstart, b2, delta] 476 477 delta[len(delta) :] = [b' ' + x for x in l1[astart:a1]] 478 delta[len(delta) :] = [b'-' + x for x in old] 479 delta[len(delta) :] = [b'+' + x for x in new] 480 481 if hunk: 482 if not has_hunks: 483 has_hunks = True 484 yield True 485 for x in yieldhunk(hunk): 486 yield x 487 elif not has_hunks: 488 yield False 489 490 491def b85diff(to, tn): 492 '''print base85-encoded binary diff''' 493 494 def fmtline(line): 495 l = len(line) 496 if l <= 26: 497 l = pycompat.bytechr(ord(b'A') + l - 1) 498 else: 499 l = pycompat.bytechr(l - 26 + ord(b'a') - 1) 500 return b'%c%s\n' % (l, util.b85encode(line, True)) 501 502 def chunk(text, csize=52): 503 l = len(text) 504 i = 0 505 while i < l: 506 yield text[i : i + csize] 507 i += csize 508 509 if to is None: 510 to = b'' 511 if tn is None: 512 tn = b'' 513 514 if to == tn: 515 return b'' 516 517 # TODO: deltas 518 ret = [] 519 ret.append(b'GIT binary patch\n') 520 ret.append(b'literal %d\n' % len(tn)) 521 for l in chunk(zlib.compress(tn)): 522 ret.append(fmtline(l)) 523 ret.append(b'\n') 524 525 return b''.join(ret) 526 527 528def patchtext(bin): 529 pos = 0 530 t = [] 531 while pos < len(bin): 532 p1, p2, l = struct.unpack(b">lll", bin[pos : pos + 12]) 533 pos += 12 534 t.append(bin[pos : pos + l]) 535 pos += l 536 return b"".join(t) 537 538 539def patch(a, bin): 540 if len(a) == 0: 541 # skip over trivial delta header 542 return util.buffer(bin, 12) 543 return mpatch.patches(a, [bin]) 544 545 546# similar to difflib.SequenceMatcher.get_matching_blocks 547def get_matching_blocks(a, b): 548 return [(d[0], d[2], d[1] - d[0]) for d in bdiff.blocks(a, b)] 549 550 551def trivialdiffheader(length): 552 return struct.pack(b">lll", 0, 0, length) if length else b'' 553 554 555def replacediffheader(oldlen, newlen): 556 return struct.pack(b">lll", 0, oldlen, newlen) 557