1# mdiff.py - diff and patch routines for mercurial
2#
3# Copyright 2005, 2006 Olivia Mackall <olivia@selenic.com>
4#
5# This software may be used and distributed according to the terms of the
6# GNU General Public License version 2 or any later version.
7
8from __future__ import absolute_import
9
10import re
11import struct
12import zlib
13
14from .i18n import _
15from .pycompat import (
16    getattr,
17    setattr,
18)
19from . import (
20    diffhelper,
21    encoding,
22    error,
23    policy,
24    pycompat,
25    util,
26)
27from .utils import dateutil
28
29bdiff = policy.importmod('bdiff')
30mpatch = policy.importmod('mpatch')
31
32blocks = bdiff.blocks
33fixws = bdiff.fixws
34patches = mpatch.patches
35patchedsize = mpatch.patchedsize
36textdiff = bdiff.bdiff
37splitnewlines = bdiff.splitnewlines
38
39
40# TODO: this looks like it could be an attrs, which might help pytype
41class diffopts(object):
42    """context is the number of context lines
43    text treats all files as text
44    showfunc enables diff -p output
45    git enables the git extended patch format
46    nodates removes dates from diff headers
47    nobinary ignores binary files
48    noprefix disables the 'a/' and 'b/' prefixes (ignored in plain mode)
49    ignorews ignores all whitespace changes in the diff
50    ignorewsamount ignores changes in the amount of whitespace
51    ignoreblanklines ignores changes whose lines are all blank
52    upgrade generates git diffs to avoid data loss
53    """
54
55    _HAS_DYNAMIC_ATTRIBUTES = True
56
57    defaults = {
58        b'context': 3,
59        b'text': False,
60        b'showfunc': False,
61        b'git': False,
62        b'nodates': False,
63        b'nobinary': False,
64        b'noprefix': False,
65        b'index': 0,
66        b'ignorews': False,
67        b'ignorewsamount': False,
68        b'ignorewseol': False,
69        b'ignoreblanklines': False,
70        b'upgrade': False,
71        b'showsimilarity': False,
72        b'worddiff': False,
73        b'xdiff': False,
74    }
75
76    def __init__(self, **opts):
77        opts = pycompat.byteskwargs(opts)
78        for k in self.defaults.keys():
79            v = opts.get(k)
80            if v is None:
81                v = self.defaults[k]
82            setattr(self, k, v)
83
84        try:
85            self.context = int(self.context)
86        except ValueError:
87            raise error.Abort(
88                _(b'diff context lines count must be an integer, not %r')
89                % pycompat.bytestr(self.context)
90            )
91
92    def copy(self, **kwargs):
93        opts = {k: getattr(self, k) for k in self.defaults}
94        opts = pycompat.strkwargs(opts)
95        opts.update(kwargs)
96        return diffopts(**opts)
97
98
99defaultopts = diffopts()
100
101
102def wsclean(opts, text, blank=True):
103    if opts.ignorews:
104        text = bdiff.fixws(text, 1)
105    elif opts.ignorewsamount:
106        text = bdiff.fixws(text, 0)
107    if blank and opts.ignoreblanklines:
108        text = re.sub(b'\n+', b'\n', text).strip(b'\n')
109    if opts.ignorewseol:
110        text = re.sub(br'[ \t\r\f]+\n', br'\n', text)
111    return text
112
113
114def splitblock(base1, lines1, base2, lines2, opts):
115    # The input lines matches except for interwoven blank lines. We
116    # transform it into a sequence of matching blocks and blank blocks.
117    lines1 = [(wsclean(opts, l) and 1 or 0) for l in lines1]
118    lines2 = [(wsclean(opts, l) and 1 or 0) for l in lines2]
119    s1, e1 = 0, len(lines1)
120    s2, e2 = 0, len(lines2)
121    while s1 < e1 or s2 < e2:
122        i1, i2, btype = s1, s2, b'='
123        if i1 >= e1 or lines1[i1] == 0 or i2 >= e2 or lines2[i2] == 0:
124            # Consume the block of blank lines
125            btype = b'~'
126            while i1 < e1 and lines1[i1] == 0:
127                i1 += 1
128            while i2 < e2 and lines2[i2] == 0:
129                i2 += 1
130        else:
131            # Consume the matching lines
132            while i1 < e1 and lines1[i1] == 1 and lines2[i2] == 1:
133                i1 += 1
134                i2 += 1
135        yield [base1 + s1, base1 + i1, base2 + s2, base2 + i2], btype
136        s1 = i1
137        s2 = i2
138
139
140def hunkinrange(hunk, linerange):
141    """Return True if `hunk` defined as (start, length) is in `linerange`
142    defined as (lowerbound, upperbound).
143
144    >>> hunkinrange((5, 10), (2, 7))
145    True
146    >>> hunkinrange((5, 10), (6, 12))
147    True
148    >>> hunkinrange((5, 10), (13, 17))
149    True
150    >>> hunkinrange((5, 10), (3, 17))
151    True
152    >>> hunkinrange((5, 10), (1, 3))
153    False
154    >>> hunkinrange((5, 10), (18, 20))
155    False
156    >>> hunkinrange((5, 10), (1, 5))
157    False
158    >>> hunkinrange((5, 10), (15, 27))
159    False
160    """
161    start, length = hunk
162    lowerbound, upperbound = linerange
163    return lowerbound < start + length and start < upperbound
164
165
166def blocksinrange(blocks, rangeb):
167    """filter `blocks` like (a1, a2, b1, b2) from items outside line range
168    `rangeb` from ``(b1, b2)`` point of view.
169
170    Return `filteredblocks, rangea` where:
171
172    * `filteredblocks` is list of ``block = (a1, a2, b1, b2), stype`` items of
173      `blocks` that are inside `rangeb` from ``(b1, b2)`` point of view; a
174      block ``(b1, b2)`` being inside `rangeb` if
175      ``rangeb[0] < b2 and b1 < rangeb[1]``;
176    * `rangea` is the line range w.r.t. to ``(a1, a2)`` parts of `blocks`.
177    """
178    lbb, ubb = rangeb
179    lba, uba = None, None
180    filteredblocks = []
181    for block in blocks:
182        (a1, a2, b1, b2), stype = block
183        if lbb >= b1 and ubb <= b2 and stype == b'=':
184            # rangeb is within a single "=" hunk, restrict back linerange1
185            # by offsetting rangeb
186            lba = lbb - b1 + a1
187            uba = ubb - b1 + a1
188        else:
189            if b1 <= lbb < b2:
190                if stype == b'=':
191                    lba = a2 - (b2 - lbb)
192                else:
193                    lba = a1
194            if b1 < ubb <= b2:
195                if stype == b'=':
196                    uba = a1 + (ubb - b1)
197                else:
198                    uba = a2
199        if hunkinrange((b1, (b2 - b1)), rangeb):
200            filteredblocks.append(block)
201    if lba is None or uba is None or uba < lba:
202        raise error.InputError(_(b'line range exceeds file size'))
203    return filteredblocks, (lba, uba)
204
205
206def chooseblocksfunc(opts=None):
207    if (
208        opts is None
209        or not opts.xdiff
210        or not util.safehasattr(bdiff, b'xdiffblocks')
211    ):
212        return bdiff.blocks
213    else:
214        return bdiff.xdiffblocks
215
216
217def allblocks(text1, text2, opts=None, lines1=None, lines2=None):
218    """Return (block, type) tuples, where block is an mdiff.blocks
219    line entry. type is '=' for blocks matching exactly one another
220    (bdiff blocks), '!' for non-matching blocks and '~' for blocks
221    matching only after having filtered blank lines.
222    line1 and line2 are text1 and text2 split with splitnewlines() if
223    they are already available.
224    """
225    if opts is None:
226        opts = defaultopts
227    if opts.ignorews or opts.ignorewsamount or opts.ignorewseol:
228        text1 = wsclean(opts, text1, False)
229        text2 = wsclean(opts, text2, False)
230    diff = chooseblocksfunc(opts)(text1, text2)
231    for i, s1 in enumerate(diff):
232        # The first match is special.
233        # we've either found a match starting at line 0 or a match later
234        # in the file.  If it starts later, old and new below will both be
235        # empty and we'll continue to the next match.
236        if i > 0:
237            s = diff[i - 1]
238        else:
239            s = [0, 0, 0, 0]
240        s = [s[1], s1[0], s[3], s1[2]]
241
242        # bdiff sometimes gives huge matches past eof, this check eats them,
243        # and deals with the special first match case described above
244        if s[0] != s[1] or s[2] != s[3]:
245            type = b'!'
246            if opts.ignoreblanklines:
247                if lines1 is None:
248                    lines1 = splitnewlines(text1)
249                if lines2 is None:
250                    lines2 = splitnewlines(text2)
251                old = wsclean(opts, b"".join(lines1[s[0] : s[1]]))
252                new = wsclean(opts, b"".join(lines2[s[2] : s[3]]))
253                if old == new:
254                    type = b'~'
255            yield s, type
256        yield s1, b'='
257
258
259def unidiff(a, ad, b, bd, fn1, fn2, binary, opts=defaultopts):
260    """Return a unified diff as a (headers, hunks) tuple.
261
262    If the diff is not null, `headers` is a list with unified diff header
263    lines "--- <original>" and "+++ <new>" and `hunks` is a generator yielding
264    (hunkrange, hunklines) coming from _unidiff().
265    Otherwise, `headers` and `hunks` are empty.
266
267    Set binary=True if either a or b should be taken as a binary file.
268    """
269
270    def datetag(date, fn=None):
271        if not opts.git and not opts.nodates:
272            return b'\t%s' % date
273        if fn and b' ' in fn:
274            return b'\t'
275        return b''
276
277    sentinel = [], ()
278    if not a and not b:
279        return sentinel
280
281    if opts.noprefix:
282        aprefix = bprefix = b''
283    else:
284        aprefix = b'a/'
285        bprefix = b'b/'
286
287    epoch = dateutil.datestr((0, 0))
288
289    fn1 = util.pconvert(fn1)
290    fn2 = util.pconvert(fn2)
291
292    if binary:
293        if a and b and len(a) == len(b) and a == b:
294            return sentinel
295        headerlines = []
296        hunks = ((None, [b'Binary file %s has changed\n' % fn1]),)
297    elif not a:
298        without_newline = not b.endswith(b'\n')
299        b = splitnewlines(b)
300        if a is None:
301            l1 = b'--- /dev/null%s' % datetag(epoch)
302        else:
303            l1 = b"--- %s%s%s" % (aprefix, fn1, datetag(ad, fn1))
304        l2 = b"+++ %s%s" % (bprefix + fn2, datetag(bd, fn2))
305        headerlines = [l1, l2]
306        size = len(b)
307        hunkrange = (0, 0, 1, size)
308        hunklines = [b"@@ -0,0 +1,%d @@\n" % size] + [b"+" + e for e in b]
309        if without_newline:
310            hunklines[-1] += b'\n'
311            hunklines.append(diffhelper.MISSING_NEWLINE_MARKER)
312        hunks = ((hunkrange, hunklines),)
313    elif not b:
314        without_newline = not a.endswith(b'\n')
315        a = splitnewlines(a)
316        l1 = b"--- %s%s%s" % (aprefix, fn1, datetag(ad, fn1))
317        if b is None:
318            l2 = b'+++ /dev/null%s' % datetag(epoch)
319        else:
320            l2 = b"+++ %s%s%s" % (bprefix, fn2, datetag(bd, fn2))
321        headerlines = [l1, l2]
322        size = len(a)
323        hunkrange = (1, size, 0, 0)
324        hunklines = [b"@@ -1,%d +0,0 @@\n" % size] + [b"-" + e for e in a]
325        if without_newline:
326            hunklines[-1] += b'\n'
327            hunklines.append(diffhelper.MISSING_NEWLINE_MARKER)
328        hunks = ((hunkrange, hunklines),)
329    else:
330        hunks = _unidiff(a, b, opts=opts)
331        if not next(hunks):
332            return sentinel
333
334        headerlines = [
335            b"--- %s%s%s" % (aprefix, fn1, datetag(ad, fn1)),
336            b"+++ %s%s%s" % (bprefix, fn2, datetag(bd, fn2)),
337        ]
338
339    return headerlines, hunks
340
341
342def _unidiff(t1, t2, opts=defaultopts):
343    """Yield hunks of a headerless unified diff from t1 and t2 texts.
344
345    Each hunk consists of a (hunkrange, hunklines) tuple where `hunkrange` is a
346    tuple (s1, l1, s2, l2) representing the range information of the hunk to
347    form the '@@ -s1,l1 +s2,l2 @@' header and `hunklines` is a list of lines
348    of the hunk combining said header followed by line additions and
349    deletions.
350
351    The hunks are prefixed with a bool.
352    """
353    l1 = splitnewlines(t1)
354    l2 = splitnewlines(t2)
355
356    def contextend(l, len):
357        ret = l + opts.context
358        if ret > len:
359            ret = len
360        return ret
361
362    def contextstart(l):
363        ret = l - opts.context
364        if ret < 0:
365            return 0
366        return ret
367
368    lastfunc = [0, b'']
369
370    def yieldhunk(hunk):
371        (astart, a2, bstart, b2, delta) = hunk
372        aend = contextend(a2, len(l1))
373        alen = aend - astart
374        blen = b2 - bstart + aend - a2
375
376        func = b""
377        if opts.showfunc:
378            lastpos, func = lastfunc
379            # walk backwards from the start of the context up to the start of
380            # the previous hunk context until we find a line starting with an
381            # alphanumeric char.
382            for i in pycompat.xrange(astart - 1, lastpos - 1, -1):
383                if l1[i][0:1].isalnum():
384                    func = b' ' + l1[i].rstrip()
385                    # split long function name if ASCII. otherwise we have no
386                    # idea where the multi-byte boundary is, so just leave it.
387                    if encoding.isasciistr(func):
388                        func = func[:41]
389                    lastfunc[1] = func
390                    break
391            # by recording this hunk's starting point as the next place to
392            # start looking for function lines, we avoid reading any line in
393            # the file more than once.
394            lastfunc[0] = astart
395
396        # zero-length hunk ranges report their start line as one less
397        if alen:
398            astart += 1
399        if blen:
400            bstart += 1
401
402        hunkrange = astart, alen, bstart, blen
403        hunklines = (
404            [b"@@ -%d,%d +%d,%d @@%s\n" % (hunkrange + (func,))]
405            + delta
406            + [b' ' + l1[x] for x in pycompat.xrange(a2, aend)]
407        )
408        # If either file ends without a newline and the last line of
409        # that file is part of a hunk, a marker is printed. If the
410        # last line of both files is identical and neither ends in
411        # a newline, print only one marker. That's the only case in
412        # which the hunk can end in a shared line without a newline.
413        skip = False
414        if not t1.endswith(b'\n') and astart + alen == len(l1) + 1:
415            for i in pycompat.xrange(len(hunklines) - 1, -1, -1):
416                if hunklines[i].startswith((b'-', b' ')):
417                    if hunklines[i].startswith(b' '):
418                        skip = True
419                    hunklines[i] += b'\n'
420                    hunklines.insert(i + 1, diffhelper.MISSING_NEWLINE_MARKER)
421                    break
422        if not skip and not t2.endswith(b'\n') and bstart + blen == len(l2) + 1:
423            for i in pycompat.xrange(len(hunklines) - 1, -1, -1):
424                if hunklines[i].startswith(b'+'):
425                    hunklines[i] += b'\n'
426                    hunklines.insert(i + 1, diffhelper.MISSING_NEWLINE_MARKER)
427                    break
428        yield hunkrange, hunklines
429
430    # bdiff.blocks gives us the matching sequences in the files.  The loop
431    # below finds the spaces between those matching sequences and translates
432    # them into diff output.
433    #
434    hunk = None
435    ignoredlines = 0
436    has_hunks = False
437    for s, stype in allblocks(t1, t2, opts, l1, l2):
438        a1, a2, b1, b2 = s
439        if stype != b'!':
440            if stype == b'~':
441                # The diff context lines are based on t1 content. When
442                # blank lines are ignored, the new lines offsets must
443                # be adjusted as if equivalent blocks ('~') had the
444                # same sizes on both sides.
445                ignoredlines += (b2 - b1) - (a2 - a1)
446            continue
447        delta = []
448        old = l1[a1:a2]
449        new = l2[b1:b2]
450
451        b1 -= ignoredlines
452        b2 -= ignoredlines
453        astart = contextstart(a1)
454        bstart = contextstart(b1)
455        prev = None
456        if hunk:
457            # join with the previous hunk if it falls inside the context
458            if astart < hunk[1] + opts.context + 1:
459                prev = hunk
460                astart = hunk[1]
461                bstart = hunk[3]
462            else:
463                if not has_hunks:
464                    has_hunks = True
465                    yield True
466                for x in yieldhunk(hunk):
467                    yield x
468        if prev:
469            # we've joined the previous hunk, record the new ending points.
470            hunk[1] = a2
471            hunk[3] = b2
472            delta = hunk[4]
473        else:
474            # create a new hunk
475            hunk = [astart, a2, bstart, b2, delta]
476
477        delta[len(delta) :] = [b' ' + x for x in l1[astart:a1]]
478        delta[len(delta) :] = [b'-' + x for x in old]
479        delta[len(delta) :] = [b'+' + x for x in new]
480
481    if hunk:
482        if not has_hunks:
483            has_hunks = True
484            yield True
485        for x in yieldhunk(hunk):
486            yield x
487    elif not has_hunks:
488        yield False
489
490
491def b85diff(to, tn):
492    '''print base85-encoded binary diff'''
493
494    def fmtline(line):
495        l = len(line)
496        if l <= 26:
497            l = pycompat.bytechr(ord(b'A') + l - 1)
498        else:
499            l = pycompat.bytechr(l - 26 + ord(b'a') - 1)
500        return b'%c%s\n' % (l, util.b85encode(line, True))
501
502    def chunk(text, csize=52):
503        l = len(text)
504        i = 0
505        while i < l:
506            yield text[i : i + csize]
507            i += csize
508
509    if to is None:
510        to = b''
511    if tn is None:
512        tn = b''
513
514    if to == tn:
515        return b''
516
517    # TODO: deltas
518    ret = []
519    ret.append(b'GIT binary patch\n')
520    ret.append(b'literal %d\n' % len(tn))
521    for l in chunk(zlib.compress(tn)):
522        ret.append(fmtline(l))
523    ret.append(b'\n')
524
525    return b''.join(ret)
526
527
528def patchtext(bin):
529    pos = 0
530    t = []
531    while pos < len(bin):
532        p1, p2, l = struct.unpack(b">lll", bin[pos : pos + 12])
533        pos += 12
534        t.append(bin[pos : pos + l])
535        pos += l
536    return b"".join(t)
537
538
539def patch(a, bin):
540    if len(a) == 0:
541        # skip over trivial delta header
542        return util.buffer(bin, 12)
543    return mpatch.patches(a, [bin])
544
545
546# similar to difflib.SequenceMatcher.get_matching_blocks
547def get_matching_blocks(a, b):
548    return [(d[0], d[2], d[1] - d[0]) for d in bdiff.blocks(a, b)]
549
550
551def trivialdiffheader(length):
552    return struct.pack(b">lll", 0, 0, length) if length else b''
553
554
555def replacediffheader(oldlen, newlen):
556    return struct.pack(b">lll", 0, oldlen, newlen)
557