1"""
2Module difflib -- helpers for computing deltas between objects.
3
4Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
5    Use SequenceMatcher to return list of the best "good enough" matches.
6
7Function context_diff(a, b):
8    For two lists of strings, return a delta in context diff format.
9
10Function ndiff(a, b):
11    Return a delta: the difference between `a` and `b` (lists of strings).
12
13Function restore(delta, which):
14    Return one of the two sequences that generated an ndiff delta.
15
16Function unified_diff(a, b):
17    For two lists of strings, return a delta in unified diff format.
18
19Class SequenceMatcher:
20    A flexible class for comparing pairs of sequences of any type.
21
22Class Differ:
23    For producing human-readable deltas from sequences of lines of text.
24
25Class HtmlDiff:
26    For producing HTML side by side comparison with change highlights.
27"""
28
29__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
30           'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
31           'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']
32
33from heapq import nlargest as _nlargest
34from collections import namedtuple as _namedtuple
35
36Match = _namedtuple('Match', 'a b size')
37
38def _calculate_ratio(matches, length):
39    if length:
40        return 2.0 * matches / length
41    return 1.0
42
43class SequenceMatcher:
44
45    """
46    SequenceMatcher is a flexible class for comparing pairs of sequences of
47    any type, so long as the sequence elements are hashable.  The basic
48    algorithm predates, and is a little fancier than, an algorithm
49    published in the late 1980's by Ratcliff and Obershelp under the
50    hyperbolic name "gestalt pattern matching".  The basic idea is to find
51    the longest contiguous matching subsequence that contains no "junk"
52    elements (R-O doesn't address junk).  The same idea is then applied
53    recursively to the pieces of the sequences to the left and to the right
54    of the matching subsequence.  This does not yield minimal edit
55    sequences, but does tend to yield matches that "look right" to people.
56
57    SequenceMatcher tries to compute a "human-friendly diff" between two
58    sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
59    longest *contiguous* & junk-free matching subsequence.  That's what
60    catches peoples' eyes.  The Windows(tm) windiff has another interesting
61    notion, pairing up elements that appear uniquely in each sequence.
62    That, and the method here, appear to yield more intuitive difference
63    reports than does diff.  This method appears to be the least vulnerable
64    to synching up on blocks of "junk lines", though (like blank lines in
65    ordinary text files, or maybe "<P>" lines in HTML files).  That may be
66    because this is the only method of the 3 that has a *concept* of
67    "junk" <wink>.
68
69    Example, comparing two strings, and considering blanks to be "junk":
70
71    >>> s = SequenceMatcher(lambda x: x == " ",
72    ...                     "private Thread currentThread;",
73    ...                     "private volatile Thread currentThread;")
74    >>>
75
76    .ratio() returns a float in [0, 1], measuring the "similarity" of the
77    sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
78    sequences are close matches:
79
80    >>> print(round(s.ratio(), 3))
81    0.866
82    >>>
83
84    If you're only interested in where the sequences match,
85    .get_matching_blocks() is handy:
86
87    >>> for block in s.get_matching_blocks():
88    ...     print("a[%d] and b[%d] match for %d elements" % block)
89    a[0] and b[0] match for 8 elements
90    a[8] and b[17] match for 21 elements
91    a[29] and b[38] match for 0 elements
92
93    Note that the last tuple returned by .get_matching_blocks() is always a
94    dummy, (len(a), len(b), 0), and this is the only case in which the last
95    tuple element (number of elements matched) is 0.
96
97    If you want to know how to change the first sequence into the second,
98    use .get_opcodes():
99
100    >>> for opcode in s.get_opcodes():
101    ...     print("%6s a[%d:%d] b[%d:%d]" % opcode)
102     equal a[0:8] b[0:8]
103    insert a[8:8] b[8:17]
104     equal a[8:29] b[17:38]
105
106    See the Differ class for a fancy human-friendly file differencer, which
107    uses SequenceMatcher both to compare sequences of lines, and to compare
108    sequences of characters within similar (near-matching) lines.
109
110    See also function get_close_matches() in this module, which shows how
111    simple code building on SequenceMatcher can be used to do useful work.
112
113    Timing:  Basic R-O is cubic time worst case and quadratic time expected
114    case.  SequenceMatcher is quadratic time for the worst case and has
115    expected-case behavior dependent in a complicated way on how many
116    elements the sequences have in common; best case time is linear.
117
118    Methods:
119
120    __init__(isjunk=None, a='', b='')
121        Construct a SequenceMatcher.
122
123    set_seqs(a, b)
124        Set the two sequences to be compared.
125
126    set_seq1(a)
127        Set the first sequence to be compared.
128
129    set_seq2(b)
130        Set the second sequence to be compared.
131
132    find_longest_match(alo, ahi, blo, bhi)
133        Find longest matching block in a[alo:ahi] and b[blo:bhi].
134
135    get_matching_blocks()
136        Return list of triples describing matching subsequences.
137
138    get_opcodes()
139        Return list of 5-tuples describing how to turn a into b.
140
141    ratio()
142        Return a measure of the sequences' similarity (float in [0,1]).
143
144    quick_ratio()
145        Return an upper bound on .ratio() relatively quickly.
146
147    real_quick_ratio()
148        Return an upper bound on ratio() very quickly.
149    """
150
151    def __init__(self, isjunk=None, a='', b='', autojunk=True):
152        """Construct a SequenceMatcher.
153
154        Optional arg isjunk is None (the default), or a one-argument
155        function that takes a sequence element and returns true iff the
156        element is junk.  None is equivalent to passing "lambda x: 0", i.e.
157        no elements are considered to be junk.  For example, pass
158            lambda x: x in " \\t"
159        if you're comparing lines as sequences of characters, and don't
160        want to synch up on blanks or hard tabs.
161
162        Optional arg a is the first of two sequences to be compared.  By
163        default, an empty string.  The elements of a must be hashable.  See
164        also .set_seqs() and .set_seq1().
165
166        Optional arg b is the second of two sequences to be compared.  By
167        default, an empty string.  The elements of b must be hashable. See
168        also .set_seqs() and .set_seq2().
169
170        Optional arg autojunk should be set to False to disable the
171        "automatic junk heuristic" that treats popular elements as junk
172        (see module documentation for more information).
173        """
174
175        # Members:
176        # a
177        #      first sequence
178        # b
179        #      second sequence; differences are computed as "what do
180        #      we need to do to 'a' to change it into 'b'?"
181        # b2j
182        #      for x in b, b2j[x] is a list of the indices (into b)
183        #      at which x appears; junk and popular elements do not appear
184        # fullbcount
185        #      for x in b, fullbcount[x] == the number of times x
186        #      appears in b; only materialized if really needed (used
187        #      only for computing quick_ratio())
188        # matching_blocks
189        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
190        #      ascending & non-overlapping in i and in j; terminated by
191        #      a dummy (len(a), len(b), 0) sentinel
192        # opcodes
193        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
194        #      one of
195        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
196        #          'delete'    a[i1:i2] should be deleted
197        #          'insert'    b[j1:j2] should be inserted
198        #          'equal'     a[i1:i2] == b[j1:j2]
199        # isjunk
200        #      a user-supplied function taking a sequence element and
201        #      returning true iff the element is "junk" -- this has
202        #      subtle but helpful effects on the algorithm, which I'll
203        #      get around to writing up someday <0.9 wink>.
204        #      DON'T USE!  Only __chain_b uses this.  Use "in self.bjunk".
205        # bjunk
206        #      the items in b for which isjunk is True.
207        # bpopular
208        #      nonjunk items in b treated as junk by the heuristic (if used).
209
210        self.isjunk = isjunk
211        self.a = self.b = None
212        self.autojunk = autojunk
213        self.set_seqs(a, b)
214
215    def set_seqs(self, a, b):
216        """Set the two sequences to be compared.
217
218        >>> s = SequenceMatcher()
219        >>> s.set_seqs("abcd", "bcde")
220        >>> s.ratio()
221        0.75
222        """
223
224        self.set_seq1(a)
225        self.set_seq2(b)
226
227    def set_seq1(self, a):
228        """Set the first sequence to be compared.
229
230        The second sequence to be compared is not changed.
231
232        >>> s = SequenceMatcher(None, "abcd", "bcde")
233        >>> s.ratio()
234        0.75
235        >>> s.set_seq1("bcde")
236        >>> s.ratio()
237        1.0
238        >>>
239
240        SequenceMatcher computes and caches detailed information about the
241        second sequence, so if you want to compare one sequence S against
242        many sequences, use .set_seq2(S) once and call .set_seq1(x)
243        repeatedly for each of the other sequences.
244
245        See also set_seqs() and set_seq2().
246        """
247
248        if a is self.a:
249            return
250        self.a = a
251        self.matching_blocks = self.opcodes = None
252
253    def set_seq2(self, b):
254        """Set the second sequence to be compared.
255
256        The first sequence to be compared is not changed.
257
258        >>> s = SequenceMatcher(None, "abcd", "bcde")
259        >>> s.ratio()
260        0.75
261        >>> s.set_seq2("abcd")
262        >>> s.ratio()
263        1.0
264        >>>
265
266        SequenceMatcher computes and caches detailed information about the
267        second sequence, so if you want to compare one sequence S against
268        many sequences, use .set_seq2(S) once and call .set_seq1(x)
269        repeatedly for each of the other sequences.
270
271        See also set_seqs() and set_seq1().
272        """
273
274        if b is self.b:
275            return
276        self.b = b
277        self.matching_blocks = self.opcodes = None
278        self.fullbcount = None
279        self.__chain_b()
280
281    # For each element x in b, set b2j[x] to a list of the indices in
282    # b where x appears; the indices are in increasing order; note that
283    # the number of times x appears in b is len(b2j[x]) ...
284    # when self.isjunk is defined, junk elements don't show up in this
285    # map at all, which stops the central find_longest_match method
286    # from starting any matching block at a junk element ...
287    # b2j also does not contain entries for "popular" elements, meaning
288    # elements that account for more than 1 + 1% of the total elements, and
289    # when the sequence is reasonably large (>= 200 elements); this can
290    # be viewed as an adaptive notion of semi-junk, and yields an enormous
291    # speedup when, e.g., comparing program files with hundreds of
292    # instances of "return NULL;" ...
293    # note that this is only called when b changes; so for cross-product
294    # kinds of matches, it's best to call set_seq2 once, then set_seq1
295    # repeatedly
296
297    def __chain_b(self):
298        # Because isjunk is a user-defined (not C) function, and we test
299        # for junk a LOT, it's important to minimize the number of calls.
300        # Before the tricks described here, __chain_b was by far the most
301        # time-consuming routine in the whole module!  If anyone sees
302        # Jim Roskind, thank him again for profile.py -- I never would
303        # have guessed that.
304        # The first trick is to build b2j ignoring the possibility
305        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
306        # out the junk later is much cheaper than building b2j "right"
307        # from the start.
308        b = self.b
309        self.b2j = b2j = {}
310
311        for i, elt in enumerate(b):
312            indices = b2j.setdefault(elt, [])
313            indices.append(i)
314
315        # Purge junk elements
316        self.bjunk = junk = set()
317        isjunk = self.isjunk
318        if isjunk:
319            for elt in b2j.keys():
320                if isjunk(elt):
321                    junk.add(elt)
322            for elt in junk: # separate loop avoids separate list of keys
323                del b2j[elt]
324
325        # Purge popular elements that are not junk
326        self.bpopular = popular = set()
327        n = len(b)
328        if self.autojunk and n >= 200:
329            ntest = n // 100 + 1
330            for elt, idxs in b2j.items():
331                if len(idxs) > ntest:
332                    popular.add(elt)
333            for elt in popular: # ditto; as fast for 1% deletion
334                del b2j[elt]
335
336    def find_longest_match(self, alo, ahi, blo, bhi):
337        """Find longest matching block in a[alo:ahi] and b[blo:bhi].
338
339        If isjunk is not defined:
340
341        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
342            alo <= i <= i+k <= ahi
343            blo <= j <= j+k <= bhi
344        and for all (i',j',k') meeting those conditions,
345            k >= k'
346            i <= i'
347            and if i == i', j <= j'
348
349        In other words, of all maximal matching blocks, return one that
350        starts earliest in a, and of all those maximal matching blocks that
351        start earliest in a, return the one that starts earliest in b.
352
353        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
354        >>> s.find_longest_match(0, 5, 0, 9)
355        Match(a=0, b=4, size=5)
356
357        If isjunk is defined, first the longest matching block is
358        determined as above, but with the additional restriction that no
359        junk element appears in the block.  Then that block is extended as
360        far as possible by matching (only) junk elements on both sides.  So
361        the resulting block never matches on junk except as identical junk
362        happens to be adjacent to an "interesting" match.
363
364        Here's the same example as before, but considering blanks to be
365        junk.  That prevents " abcd" from matching the " abcd" at the tail
366        end of the second sequence directly.  Instead only the "abcd" can
367        match, and matches the leftmost "abcd" in the second sequence:
368
369        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
370        >>> s.find_longest_match(0, 5, 0, 9)
371        Match(a=1, b=0, size=4)
372
373        If no blocks match, return (alo, blo, 0).
374
375        >>> s = SequenceMatcher(None, "ab", "c")
376        >>> s.find_longest_match(0, 2, 0, 1)
377        Match(a=0, b=0, size=0)
378        """
379
380        # CAUTION:  stripping common prefix or suffix would be incorrect.
381        # E.g.,
382        #    ab
383        #    acab
384        # Longest matching block is "ab", but if common prefix is
385        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
386        # strip, so ends up claiming that ab is changed to acab by
387        # inserting "ca" in the middle.  That's minimal but unintuitive:
388        # "it's obvious" that someone inserted "ac" at the front.
389        # Windiff ends up at the same place as diff, but by pairing up
390        # the unique 'b's and then matching the first two 'a's.
391
392        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__
393        besti, bestj, bestsize = alo, blo, 0
394        # find longest junk-free match
395        # during an iteration of the loop, j2len[j] = length of longest
396        # junk-free match ending with a[i-1] and b[j]
397        j2len = {}
398        nothing = []
399        for i in range(alo, ahi):
400            # look at all instances of a[i] in b; note that because
401            # b2j has no junk keys, the loop is skipped if a[i] is junk
402            j2lenget = j2len.get
403            newj2len = {}
404            for j in b2j.get(a[i], nothing):
405                # a[i] matches b[j]
406                if j < blo:
407                    continue
408                if j >= bhi:
409                    break
410                k = newj2len[j] = j2lenget(j-1, 0) + 1
411                if k > bestsize:
412                    besti, bestj, bestsize = i-k+1, j-k+1, k
413            j2len = newj2len
414
415        # Extend the best by non-junk elements on each end.  In particular,
416        # "popular" non-junk elements aren't in b2j, which greatly speeds
417        # the inner loop above, but also means "the best" match so far
418        # doesn't contain any junk *or* popular non-junk elements.
419        while besti > alo and bestj > blo and \
420              not isbjunk(b[bestj-1]) and \
421              a[besti-1] == b[bestj-1]:
422            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
423        while besti+bestsize < ahi and bestj+bestsize < bhi and \
424              not isbjunk(b[bestj+bestsize]) and \
425              a[besti+bestsize] == b[bestj+bestsize]:
426            bestsize += 1
427
428        # Now that we have a wholly interesting match (albeit possibly
429        # empty!), we may as well suck up the matching junk on each
430        # side of it too.  Can't think of a good reason not to, and it
431        # saves post-processing the (possibly considerable) expense of
432        # figuring out what to do with it.  In the case of an empty
433        # interesting match, this is clearly the right thing to do,
434        # because no other kind of match is possible in the regions.
435        while besti > alo and bestj > blo and \
436              isbjunk(b[bestj-1]) and \
437              a[besti-1] == b[bestj-1]:
438            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
439        while besti+bestsize < ahi and bestj+bestsize < bhi and \
440              isbjunk(b[bestj+bestsize]) and \
441              a[besti+bestsize] == b[bestj+bestsize]:
442            bestsize = bestsize + 1
443
444        return Match(besti, bestj, bestsize)
445
446    def get_matching_blocks(self):
447        """Return list of triples describing matching subsequences.
448
449        Each triple is of the form (i, j, n), and means that
450        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
451        i and in j.  New in Python 2.5, it's also guaranteed that if
452        (i, j, n) and (i', j', n') are adjacent triples in the list, and
453        the second is not the last triple in the list, then i+n != i' or
454        j+n != j'.  IOW, adjacent triples never describe adjacent equal
455        blocks.
456
457        The last triple is a dummy, (len(a), len(b), 0), and is the only
458        triple with n==0.
459
460        >>> s = SequenceMatcher(None, "abxcd", "abcd")
461        >>> list(s.get_matching_blocks())
462        [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
463        """
464
465        if self.matching_blocks is not None:
466            return self.matching_blocks
467        la, lb = len(self.a), len(self.b)
468
469        # This is most naturally expressed as a recursive algorithm, but
470        # at least one user bumped into extreme use cases that exceeded
471        # the recursion limit on their box.  So, now we maintain a list
472        # ('queue`) of blocks we still need to look at, and append partial
473        # results to `matching_blocks` in a loop; the matches are sorted
474        # at the end.
475        queue = [(0, la, 0, lb)]
476        matching_blocks = []
477        while queue:
478            alo, ahi, blo, bhi = queue.pop()
479            i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
480            # a[alo:i] vs b[blo:j] unknown
481            # a[i:i+k] same as b[j:j+k]
482            # a[i+k:ahi] vs b[j+k:bhi] unknown
483            if k:   # if k is 0, there was no matching block
484                matching_blocks.append(x)
485                if alo < i and blo < j:
486                    queue.append((alo, i, blo, j))
487                if i+k < ahi and j+k < bhi:
488                    queue.append((i+k, ahi, j+k, bhi))
489        matching_blocks.sort()
490
491        # It's possible that we have adjacent equal blocks in the
492        # matching_blocks list now.  Starting with 2.5, this code was added
493        # to collapse them.
494        i1 = j1 = k1 = 0
495        non_adjacent = []
496        for i2, j2, k2 in matching_blocks:
497            # Is this block adjacent to i1, j1, k1?
498            if i1 + k1 == i2 and j1 + k1 == j2:
499                # Yes, so collapse them -- this just increases the length of
500                # the first block by the length of the second, and the first
501                # block so lengthened remains the block to compare against.
502                k1 += k2
503            else:
504                # Not adjacent.  Remember the first block (k1==0 means it's
505                # the dummy we started with), and make the second block the
506                # new block to compare against.
507                if k1:
508                    non_adjacent.append((i1, j1, k1))
509                i1, j1, k1 = i2, j2, k2
510        if k1:
511            non_adjacent.append((i1, j1, k1))
512
513        non_adjacent.append( (la, lb, 0) )
514        self.matching_blocks = list(map(Match._make, non_adjacent))
515        return self.matching_blocks
516
517    def get_opcodes(self):
518        """Return list of 5-tuples describing how to turn a into b.
519
520        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
521        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
522        tuple preceding it, and likewise for j1 == the previous j2.
523
524        The tags are strings, with these meanings:
525
526        'replace':  a[i1:i2] should be replaced by b[j1:j2]
527        'delete':   a[i1:i2] should be deleted.
528                    Note that j1==j2 in this case.
529        'insert':   b[j1:j2] should be inserted at a[i1:i1].
530                    Note that i1==i2 in this case.
531        'equal':    a[i1:i2] == b[j1:j2]
532
533        >>> a = "qabxcd"
534        >>> b = "abycdf"
535        >>> s = SequenceMatcher(None, a, b)
536        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
537        ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
538        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
539         delete a[0:1] (q) b[0:0] ()
540          equal a[1:3] (ab) b[0:2] (ab)
541        replace a[3:4] (x) b[2:3] (y)
542          equal a[4:6] (cd) b[3:5] (cd)
543         insert a[6:6] () b[5:6] (f)
544        """
545
546        if self.opcodes is not None:
547            return self.opcodes
548        i = j = 0
549        self.opcodes = answer = []
550        for ai, bj, size in self.get_matching_blocks():
551            # invariant:  we've pumped out correct diffs to change
552            # a[:i] into b[:j], and the next matching block is
553            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
554            # out a diff to change a[i:ai] into b[j:bj], pump out
555            # the matching block, and move (i,j) beyond the match
556            tag = ''
557            if i < ai and j < bj:
558                tag = 'replace'
559            elif i < ai:
560                tag = 'delete'
561            elif j < bj:
562                tag = 'insert'
563            if tag:
564                answer.append( (tag, i, ai, j, bj) )
565            i, j = ai+size, bj+size
566            # the list of matching blocks is terminated by a
567            # sentinel with size 0
568            if size:
569                answer.append( ('equal', ai, i, bj, j) )
570        return answer
571
572    def get_grouped_opcodes(self, n=3):
573        """ Isolate change clusters by eliminating ranges with no changes.
574
575        Return a generator of groups with up to n lines of context.
576        Each group is in the same format as returned by get_opcodes().
577
578        >>> from pprint import pprint
579        >>> a = list(map(str, range(1,40)))
580        >>> b = a[:]
581        >>> b[8:8] = ['i']     # Make an insertion
582        >>> b[20] += 'x'       # Make a replacement
583        >>> b[23:28] = []      # Make a deletion
584        >>> b[30] += 'y'       # Make another replacement
585        >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
586        [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
587         [('equal', 16, 19, 17, 20),
588          ('replace', 19, 20, 20, 21),
589          ('equal', 20, 22, 21, 23),
590          ('delete', 22, 27, 23, 23),
591          ('equal', 27, 30, 23, 26)],
592         [('equal', 31, 34, 27, 30),
593          ('replace', 34, 35, 30, 31),
594          ('equal', 35, 38, 31, 34)]]
595        """
596
597        codes = self.get_opcodes()
598        if not codes:
599            codes = [("equal", 0, 1, 0, 1)]
600        # Fixup leading and trailing groups if they show no changes.
601        if codes[0][0] == 'equal':
602            tag, i1, i2, j1, j2 = codes[0]
603            codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
604        if codes[-1][0] == 'equal':
605            tag, i1, i2, j1, j2 = codes[-1]
606            codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
607
608        nn = n + n
609        group = []
610        for tag, i1, i2, j1, j2 in codes:
611            # End the current group and start a new one whenever
612            # there is a large range with no changes.
613            if tag == 'equal' and i2-i1 > nn:
614                group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
615                yield group
616                group = []
617                i1, j1 = max(i1, i2-n), max(j1, j2-n)
618            group.append((tag, i1, i2, j1 ,j2))
619        if group and not (len(group)==1 and group[0][0] == 'equal'):
620            yield group
621
622    def ratio(self):
623        """Return a measure of the sequences' similarity (float in [0,1]).
624
625        Where T is the total number of elements in both sequences, and
626        M is the number of matches, this is 2.0*M / T.
627        Note that this is 1 if the sequences are identical, and 0 if
628        they have nothing in common.
629
630        .ratio() is expensive to compute if you haven't already computed
631        .get_matching_blocks() or .get_opcodes(), in which case you may
632        want to try .quick_ratio() or .real_quick_ratio() first to get an
633        upper bound.
634
635        >>> s = SequenceMatcher(None, "abcd", "bcde")
636        >>> s.ratio()
637        0.75
638        >>> s.quick_ratio()
639        0.75
640        >>> s.real_quick_ratio()
641        1.0
642        """
643
644        matches = sum(triple[-1] for triple in self.get_matching_blocks())
645        return _calculate_ratio(matches, len(self.a) + len(self.b))
646
647    def quick_ratio(self):
648        """Return an upper bound on ratio() relatively quickly.
649
650        This isn't defined beyond that it is an upper bound on .ratio(), and
651        is faster to compute.
652        """
653
654        # viewing a and b as multisets, set matches to the cardinality
655        # of their intersection; this counts the number of matches
656        # without regard to order, so is clearly an upper bound
657        if self.fullbcount is None:
658            self.fullbcount = fullbcount = {}
659            for elt in self.b:
660                fullbcount[elt] = fullbcount.get(elt, 0) + 1
661        fullbcount = self.fullbcount
662        # avail[x] is the number of times x appears in 'b' less the
663        # number of times we've seen it in 'a' so far ... kinda
664        avail = {}
665        availhas, matches = avail.__contains__, 0
666        for elt in self.a:
667            if availhas(elt):
668                numb = avail[elt]
669            else:
670                numb = fullbcount.get(elt, 0)
671            avail[elt] = numb - 1
672            if numb > 0:
673                matches = matches + 1
674        return _calculate_ratio(matches, len(self.a) + len(self.b))
675
676    def real_quick_ratio(self):
677        """Return an upper bound on ratio() very quickly.
678
679        This isn't defined beyond that it is an upper bound on .ratio(), and
680        is faster to compute than either .ratio() or .quick_ratio().
681        """
682
683        la, lb = len(self.a), len(self.b)
684        # can't have more matches than the number of elements in the
685        # shorter sequence
686        return _calculate_ratio(min(la, lb), la + lb)
687
688def get_close_matches(word, possibilities, n=3, cutoff=0.6):
689    """Use SequenceMatcher to return list of the best "good enough" matches.
690
691    word is a sequence for which close matches are desired (typically a
692    string).
693
694    possibilities is a list of sequences against which to match word
695    (typically a list of strings).
696
697    Optional arg n (default 3) is the maximum number of close matches to
698    return.  n must be > 0.
699
700    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
701    that don't score at least that similar to word are ignored.
702
703    The best (no more than n) matches among the possibilities are returned
704    in a list, sorted by similarity score, most similar first.
705
706    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
707    ['apple', 'ape']
708    >>> import keyword as _keyword
709    >>> get_close_matches("wheel", _keyword.kwlist)
710    ['while']
711    >>> get_close_matches("Apple", _keyword.kwlist)
712    []
713    >>> get_close_matches("accept", _keyword.kwlist)
714    ['except']
715    """
716
717    if not n >  0:
718        raise ValueError("n must be > 0: %r" % (n,))
719    if not 0.0 <= cutoff <= 1.0:
720        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
721    result = []
722    s = SequenceMatcher()
723    s.set_seq2(word)
724    for x in possibilities:
725        s.set_seq1(x)
726        if s.real_quick_ratio() >= cutoff and \
727           s.quick_ratio() >= cutoff and \
728           s.ratio() >= cutoff:
729            result.append((s.ratio(), x))
730
731    # Move the best scorers to head of list
732    result = _nlargest(n, result)
733    # Strip scores for the best n matches
734    return [x for score, x in result]
735
736
737def _keep_original_ws(s, tag_s):
738    """Replace whitespace with the original whitespace characters in `s`"""
739    return ''.join(
740        c if tag_c == " " and c.isspace() else tag_c
741        for c, tag_c in zip(s, tag_s)
742    )
743
744
745
746class Differ:
747    r"""
748    Differ is a class for comparing sequences of lines of text, and
749    producing human-readable differences or deltas.  Differ uses
750    SequenceMatcher both to compare sequences of lines, and to compare
751    sequences of characters within similar (near-matching) lines.
752
753    Each line of a Differ delta begins with a two-letter code:
754
755        '- '    line unique to sequence 1
756        '+ '    line unique to sequence 2
757        '  '    line common to both sequences
758        '? '    line not present in either input sequence
759
760    Lines beginning with '? ' attempt to guide the eye to intraline
761    differences, and were not present in either input sequence.  These lines
762    can be confusing if the sequences contain tab characters.
763
764    Note that Differ makes no claim to produce a *minimal* diff.  To the
765    contrary, minimal diffs are often counter-intuitive, because they synch
766    up anywhere possible, sometimes accidental matches 100 pages apart.
767    Restricting synch points to contiguous matches preserves some notion of
768    locality, at the occasional cost of producing a longer diff.
769
770    Example: Comparing two texts.
771
772    First we set up the texts, sequences of individual single-line strings
773    ending with newlines (such sequences can also be obtained from the
774    `readlines()` method of file-like objects):
775
776    >>> text1 = '''  1. Beautiful is better than ugly.
777    ...   2. Explicit is better than implicit.
778    ...   3. Simple is better than complex.
779    ...   4. Complex is better than complicated.
780    ... '''.splitlines(keepends=True)
781    >>> len(text1)
782    4
783    >>> text1[0][-1]
784    '\n'
785    >>> text2 = '''  1. Beautiful is better than ugly.
786    ...   3.   Simple is better than complex.
787    ...   4. Complicated is better than complex.
788    ...   5. Flat is better than nested.
789    ... '''.splitlines(keepends=True)
790
791    Next we instantiate a Differ object:
792
793    >>> d = Differ()
794
795    Note that when instantiating a Differ object we may pass functions to
796    filter out line and character 'junk'.  See Differ.__init__ for details.
797
798    Finally, we compare the two:
799
800    >>> result = list(d.compare(text1, text2))
801
802    'result' is a list of strings, so let's pretty-print it:
803
804    >>> from pprint import pprint as _pprint
805    >>> _pprint(result)
806    ['    1. Beautiful is better than ugly.\n',
807     '-   2. Explicit is better than implicit.\n',
808     '-   3. Simple is better than complex.\n',
809     '+   3.   Simple is better than complex.\n',
810     '?     ++\n',
811     '-   4. Complex is better than complicated.\n',
812     '?            ^                     ---- ^\n',
813     '+   4. Complicated is better than complex.\n',
814     '?           ++++ ^                      ^\n',
815     '+   5. Flat is better than nested.\n']
816
817    As a single multi-line string it looks like this:
818
819    >>> print(''.join(result), end="")
820        1. Beautiful is better than ugly.
821    -   2. Explicit is better than implicit.
822    -   3. Simple is better than complex.
823    +   3.   Simple is better than complex.
824    ?     ++
825    -   4. Complex is better than complicated.
826    ?            ^                     ---- ^
827    +   4. Complicated is better than complex.
828    ?           ++++ ^                      ^
829    +   5. Flat is better than nested.
830
831    Methods:
832
833    __init__(linejunk=None, charjunk=None)
834        Construct a text differencer, with optional filters.
835
836    compare(a, b)
837        Compare two sequences of lines; generate the resulting delta.
838    """
839
840    def __init__(self, linejunk=None, charjunk=None):
841        """
842        Construct a text differencer, with optional filters.
843
844        The two optional keyword parameters are for filter functions:
845
846        - `linejunk`: A function that should accept a single string argument,
847          and return true iff the string is junk. The module-level function
848          `IS_LINE_JUNK` may be used to filter out lines without visible
849          characters, except for at most one splat ('#').  It is recommended
850          to leave linejunk None; the underlying SequenceMatcher class has
851          an adaptive notion of "noise" lines that's better than any static
852          definition the author has ever been able to craft.
853
854        - `charjunk`: A function that should accept a string of length 1. The
855          module-level function `IS_CHARACTER_JUNK` may be used to filter out
856          whitespace characters (a blank or tab; **note**: bad idea to include
857          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
858        """
859
860        self.linejunk = linejunk
861        self.charjunk = charjunk
862
863    def compare(self, a, b):
864        r"""
865        Compare two sequences of lines; generate the resulting delta.
866
867        Each sequence must contain individual single-line strings ending with
868        newlines. Such sequences can be obtained from the `readlines()` method
869        of file-like objects.  The delta generated also consists of newline-
870        terminated strings, ready to be printed as-is via the writeline()
871        method of a file-like object.
872
873        Example:
874
875        >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(True),
876        ...                                'ore\ntree\nemu\n'.splitlines(True))),
877        ...       end="")
878        - one
879        ?  ^
880        + ore
881        ?  ^
882        - two
883        - three
884        ?  -
885        + tree
886        + emu
887        """
888
889        cruncher = SequenceMatcher(self.linejunk, a, b)
890        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
891            if tag == 'replace':
892                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
893            elif tag == 'delete':
894                g = self._dump('-', a, alo, ahi)
895            elif tag == 'insert':
896                g = self._dump('+', b, blo, bhi)
897            elif tag == 'equal':
898                g = self._dump(' ', a, alo, ahi)
899            else:
900                raise ValueError('unknown tag %r' % (tag,))
901
902            yield from g
903
904    def _dump(self, tag, x, lo, hi):
905        """Generate comparison results for a same-tagged range."""
906        for i in range(lo, hi):
907            yield '%s %s' % (tag, x[i])
908
909    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
910        assert alo < ahi and blo < bhi
911        # dump the shorter block first -- reduces the burden on short-term
912        # memory if the blocks are of very different sizes
913        if bhi - blo < ahi - alo:
914            first  = self._dump('+', b, blo, bhi)
915            second = self._dump('-', a, alo, ahi)
916        else:
917            first  = self._dump('-', a, alo, ahi)
918            second = self._dump('+', b, blo, bhi)
919
920        for g in first, second:
921            yield from g
922
923    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
924        r"""
925        When replacing one block of lines with another, search the blocks
926        for *similar* lines; the best-matching pair (if any) is used as a
927        synch point, and intraline difference marking is done on the
928        similar pair. Lots of work, but often worth it.
929
930        Example:
931
932        >>> d = Differ()
933        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
934        ...                            ['abcdefGhijkl\n'], 0, 1)
935        >>> print(''.join(results), end="")
936        - abcDefghiJkl
937        ?    ^  ^  ^
938        + abcdefGhijkl
939        ?    ^  ^  ^
940        """
941
942        # don't synch up unless the lines have a similarity score of at
943        # least cutoff; best_ratio tracks the best score seen so far
944        best_ratio, cutoff = 0.74, 0.75
945        cruncher = SequenceMatcher(self.charjunk)
946        eqi, eqj = None, None   # 1st indices of equal lines (if any)
947
948        # search for the pair that matches best without being identical
949        # (identical lines must be junk lines, & we don't want to synch up
950        # on junk -- unless we have to)
951        for j in range(blo, bhi):
952            bj = b[j]
953            cruncher.set_seq2(bj)
954            for i in range(alo, ahi):
955                ai = a[i]
956                if ai == bj:
957                    if eqi is None:
958                        eqi, eqj = i, j
959                    continue
960                cruncher.set_seq1(ai)
961                # computing similarity is expensive, so use the quick
962                # upper bounds first -- have seen this speed up messy
963                # compares by a factor of 3.
964                # note that ratio() is only expensive to compute the first
965                # time it's called on a sequence pair; the expensive part
966                # of the computation is cached by cruncher
967                if cruncher.real_quick_ratio() > best_ratio and \
968                      cruncher.quick_ratio() > best_ratio and \
969                      cruncher.ratio() > best_ratio:
970                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
971        if best_ratio < cutoff:
972            # no non-identical "pretty close" pair
973            if eqi is None:
974                # no identical pair either -- treat it as a straight replace
975                yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
976                return
977            # no close pair, but an identical pair -- synch up on that
978            best_i, best_j, best_ratio = eqi, eqj, 1.0
979        else:
980            # there's a close pair, so forget the identical pair (if any)
981            eqi = None
982
983        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
984        # identical
985
986        # pump out diffs from before the synch point
987        yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)
988
989        # do intraline marking on the synch pair
990        aelt, belt = a[best_i], b[best_j]
991        if eqi is None:
992            # pump out a '-', '?', '+', '?' quad for the synched lines
993            atags = btags = ""
994            cruncher.set_seqs(aelt, belt)
995            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
996                la, lb = ai2 - ai1, bj2 - bj1
997                if tag == 'replace':
998                    atags += '^' * la
999                    btags += '^' * lb
1000                elif tag == 'delete':
1001                    atags += '-' * la
1002                elif tag == 'insert':
1003                    btags += '+' * lb
1004                elif tag == 'equal':
1005                    atags += ' ' * la
1006                    btags += ' ' * lb
1007                else:
1008                    raise ValueError('unknown tag %r' % (tag,))
1009            yield from self._qformat(aelt, belt, atags, btags)
1010        else:
1011            # the synch pair is identical
1012            yield '  ' + aelt
1013
1014        # pump out diffs from after the synch point
1015        yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
1016
1017    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
1018        g = []
1019        if alo < ahi:
1020            if blo < bhi:
1021                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
1022            else:
1023                g = self._dump('-', a, alo, ahi)
1024        elif blo < bhi:
1025            g = self._dump('+', b, blo, bhi)
1026
1027        yield from g
1028
1029    def _qformat(self, aline, bline, atags, btags):
1030        r"""
1031        Format "?" output and deal with tabs.
1032
1033        Example:
1034
1035        >>> d = Differ()
1036        >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
1037        ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
1038        >>> for line in results: print(repr(line))
1039        ...
1040        '- \tabcDefghiJkl\n'
1041        '? \t ^ ^  ^\n'
1042        '+ \tabcdefGhijkl\n'
1043        '? \t ^ ^  ^\n'
1044        """
1045        atags = _keep_original_ws(aline, atags).rstrip()
1046        btags = _keep_original_ws(bline, btags).rstrip()
1047
1048        yield "- " + aline
1049        if atags:
1050            yield f"? {atags}\n"
1051
1052        yield "+ " + bline
1053        if btags:
1054            yield f"? {btags}\n"
1055
1056# With respect to junk, an earlier version of ndiff simply refused to
1057# *start* a match with a junk element.  The result was cases like this:
1058#     before: private Thread currentThread;
1059#     after:  private volatile Thread currentThread;
1060# If you consider whitespace to be junk, the longest contiguous match
1061# not starting with junk is "e Thread currentThread".  So ndiff reported
1062# that "e volatil" was inserted between the 't' and the 'e' in "private".
1063# While an accurate view, to people that's absurd.  The current version
1064# looks for matching blocks that are entirely junk-free, then extends the
1065# longest one of those as far as possible but only with matching junk.
1066# So now "currentThread" is matched, then extended to suck up the
1067# preceding blank; then "private" is matched, and extended to suck up the
1068# following blank; then "Thread" is matched; and finally ndiff reports
1069# that "volatile " was inserted before "Thread".  The only quibble
1070# remaining is that perhaps it was really the case that " volatile"
1071# was inserted after "private".  I can live with that <wink>.
1072
1073import re
1074
1075def IS_LINE_JUNK(line, pat=re.compile(r"\s*(?:#\s*)?$").match):
1076    r"""
1077    Return True for ignorable line: iff `line` is blank or contains a single '#'.
1078
1079    Examples:
1080
1081    >>> IS_LINE_JUNK('\n')
1082    True
1083    >>> IS_LINE_JUNK('  #   \n')
1084    True
1085    >>> IS_LINE_JUNK('hello\n')
1086    False
1087    """
1088
1089    return pat(line) is not None
1090
1091def IS_CHARACTER_JUNK(ch, ws=" \t"):
1092    r"""
1093    Return True for ignorable character: iff `ch` is a space or tab.
1094
1095    Examples:
1096
1097    >>> IS_CHARACTER_JUNK(' ')
1098    True
1099    >>> IS_CHARACTER_JUNK('\t')
1100    True
1101    >>> IS_CHARACTER_JUNK('\n')
1102    False
1103    >>> IS_CHARACTER_JUNK('x')
1104    False
1105    """
1106
1107    return ch in ws
1108
1109
1110########################################################################
1111###  Unified Diff
1112########################################################################
1113
1114def _format_range_unified(start, stop):
1115    'Convert range to the "ed" format'
1116    # Per the diff spec at http://www.unix.org/single_unix_specification/
1117    beginning = start + 1     # lines start numbering with one
1118    length = stop - start
1119    if length == 1:
1120        return '{}'.format(beginning)
1121    if not length:
1122        beginning -= 1        # empty ranges begin at line just before the range
1123    return '{},{}'.format(beginning, length)
1124
1125def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
1126                 tofiledate='', n=3, lineterm='\n'):
1127    r"""
1128    Compare two sequences of lines; generate the delta as a unified diff.
1129
1130    Unified diffs are a compact way of showing line changes and a few
1131    lines of context.  The number of context lines is set by 'n' which
1132    defaults to three.
1133
1134    By default, the diff control lines (those with ---, +++, or @@) are
1135    created with a trailing newline.  This is helpful so that inputs
1136    created from file.readlines() result in diffs that are suitable for
1137    file.writelines() since both the inputs and outputs have trailing
1138    newlines.
1139
1140    For inputs that do not have trailing newlines, set the lineterm
1141    argument to "" so that the output will be uniformly newline free.
1142
1143    The unidiff format normally has a header for filenames and modification
1144    times.  Any or all of these may be specified using strings for
1145    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
1146    The modification times are normally expressed in the ISO 8601 format.
1147
1148    Example:
1149
1150    >>> for line in unified_diff('one two three four'.split(),
1151    ...             'zero one tree four'.split(), 'Original', 'Current',
1152    ...             '2005-01-26 23:30:50', '2010-04-02 10:20:52',
1153    ...             lineterm=''):
1154    ...     print(line)                 # doctest: +NORMALIZE_WHITESPACE
1155    --- Original        2005-01-26 23:30:50
1156    +++ Current         2010-04-02 10:20:52
1157    @@ -1,4 +1,4 @@
1158    +zero
1159     one
1160    -two
1161    -three
1162    +tree
1163     four
1164    """
1165
1166    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
1167    started = False
1168    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
1169        if not started:
1170            started = True
1171            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
1172            todate = '\t{}'.format(tofiledate) if tofiledate else ''
1173            yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
1174            yield '+++ {}{}{}'.format(tofile, todate, lineterm)
1175
1176        first, last = group[0], group[-1]
1177        file1_range = _format_range_unified(first[1], last[2])
1178        file2_range = _format_range_unified(first[3], last[4])
1179        yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)
1180
1181        for tag, i1, i2, j1, j2 in group:
1182            if tag == 'equal':
1183                for line in a[i1:i2]:
1184                    yield ' ' + line
1185                continue
1186            if tag in {'replace', 'delete'}:
1187                for line in a[i1:i2]:
1188                    yield '-' + line
1189            if tag in {'replace', 'insert'}:
1190                for line in b[j1:j2]:
1191                    yield '+' + line
1192
1193
1194########################################################################
1195###  Context Diff
1196########################################################################
1197
1198def _format_range_context(start, stop):
1199    'Convert range to the "ed" format'
1200    # Per the diff spec at http://www.unix.org/single_unix_specification/
1201    beginning = start + 1     # lines start numbering with one
1202    length = stop - start
1203    if not length:
1204        beginning -= 1        # empty ranges begin at line just before the range
1205    if length <= 1:
1206        return '{}'.format(beginning)
1207    return '{},{}'.format(beginning, beginning + length - 1)
1208
1209# See http://www.unix.org/single_unix_specification/
1210def context_diff(a, b, fromfile='', tofile='',
1211                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
1212    r"""
1213    Compare two sequences of lines; generate the delta as a context diff.
1214
1215    Context diffs are a compact way of showing line changes and a few
1216    lines of context.  The number of context lines is set by 'n' which
1217    defaults to three.
1218
1219    By default, the diff control lines (those with *** or ---) are
1220    created with a trailing newline.  This is helpful so that inputs
1221    created from file.readlines() result in diffs that are suitable for
1222    file.writelines() since both the inputs and outputs have trailing
1223    newlines.
1224
1225    For inputs that do not have trailing newlines, set the lineterm
1226    argument to "" so that the output will be uniformly newline free.
1227
1228    The context diff format normally has a header for filenames and
1229    modification times.  Any or all of these may be specified using
1230    strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
1231    The modification times are normally expressed in the ISO 8601 format.
1232    If not specified, the strings default to blanks.
1233
1234    Example:
1235
1236    >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(True),
1237    ...       'zero\none\ntree\nfour\n'.splitlines(True), 'Original', 'Current')),
1238    ...       end="")
1239    *** Original
1240    --- Current
1241    ***************
1242    *** 1,4 ****
1243      one
1244    ! two
1245    ! three
1246      four
1247    --- 1,4 ----
1248    + zero
1249      one
1250    ! tree
1251      four
1252    """
1253
1254    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
1255    prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
1256    started = False
1257    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
1258        if not started:
1259            started = True
1260            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
1261            todate = '\t{}'.format(tofiledate) if tofiledate else ''
1262            yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
1263            yield '--- {}{}{}'.format(tofile, todate, lineterm)
1264
1265        first, last = group[0], group[-1]
1266        yield '***************' + lineterm
1267
1268        file1_range = _format_range_context(first[1], last[2])
1269        yield '*** {} ****{}'.format(file1_range, lineterm)
1270
1271        if any(tag in {'replace', 'delete'} for tag, _, _, _, _ in group):
1272            for tag, i1, i2, _, _ in group:
1273                if tag != 'insert':
1274                    for line in a[i1:i2]:
1275                        yield prefix[tag] + line
1276
1277        file2_range = _format_range_context(first[3], last[4])
1278        yield '--- {} ----{}'.format(file2_range, lineterm)
1279
1280        if any(tag in {'replace', 'insert'} for tag, _, _, _, _ in group):
1281            for tag, _, _, j1, j2 in group:
1282                if tag != 'delete':
1283                    for line in b[j1:j2]:
1284                        yield prefix[tag] + line
1285
1286def _check_types(a, b, *args):
1287    # Checking types is weird, but the alternative is garbled output when
1288    # someone passes mixed bytes and str to {unified,context}_diff(). E.g.
1289    # without this check, passing filenames as bytes results in output like
1290    #   --- b'oldfile.txt'
1291    #   +++ b'newfile.txt'
1292    # because of how str.format() incorporates bytes objects.
1293    if a and not isinstance(a[0], str):
1294        raise TypeError('lines to compare must be str, not %s (%r)' %
1295                        (type(a[0]).__name__, a[0]))
1296    if b and not isinstance(b[0], str):
1297        raise TypeError('lines to compare must be str, not %s (%r)' %
1298                        (type(b[0]).__name__, b[0]))
1299    for arg in args:
1300        if not isinstance(arg, str):
1301            raise TypeError('all arguments must be str, not: %r' % (arg,))
1302
1303def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
1304               fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
1305    r"""
1306    Compare `a` and `b`, two sequences of lines represented as bytes rather
1307    than str. This is a wrapper for `dfunc`, which is typically either
1308    unified_diff() or context_diff(). Inputs are losslessly converted to
1309    strings so that `dfunc` only has to worry about strings, and encoded
1310    back to bytes on return. This is necessary to compare files with
1311    unknown or inconsistent encoding. All other inputs (except `n`) must be
1312    bytes rather than str.
1313    """
1314    def decode(s):
1315        try:
1316            return s.decode('ascii', 'surrogateescape')
1317        except AttributeError as err:
1318            msg = ('all arguments must be bytes, not %s (%r)' %
1319                   (type(s).__name__, s))
1320            raise TypeError(msg) from err
1321    a = list(map(decode, a))
1322    b = list(map(decode, b))
1323    fromfile = decode(fromfile)
1324    tofile = decode(tofile)
1325    fromfiledate = decode(fromfiledate)
1326    tofiledate = decode(tofiledate)
1327    lineterm = decode(lineterm)
1328
1329    lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)
1330    for line in lines:
1331        yield line.encode('ascii', 'surrogateescape')
1332
1333def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
1334    r"""
1335    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
1336
1337    Optional keyword parameters `linejunk` and `charjunk` are for filter
1338    functions, or can be None:
1339
1340    - linejunk: A function that should accept a single string argument and
1341      return true iff the string is junk.  The default is None, and is
1342      recommended; the underlying SequenceMatcher class has an adaptive
1343      notion of "noise" lines.
1344
1345    - charjunk: A function that accepts a character (string of length
1346      1), and returns true iff the character is junk. The default is
1347      the module-level function IS_CHARACTER_JUNK, which filters out
1348      whitespace characters (a blank or tab; note: it's a bad idea to
1349      include newline in this!).
1350
1351    Tools/scripts/ndiff.py is a command-line front-end to this function.
1352
1353    Example:
1354
1355    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
1356    ...              'ore\ntree\nemu\n'.splitlines(keepends=True))
1357    >>> print(''.join(diff), end="")
1358    - one
1359    ?  ^
1360    + ore
1361    ?  ^
1362    - two
1363    - three
1364    ?  -
1365    + tree
1366    + emu
1367    """
1368    return Differ(linejunk, charjunk).compare(a, b)
1369
1370def _mdiff(fromlines, tolines, context=None, linejunk=None,
1371           charjunk=IS_CHARACTER_JUNK):
1372    r"""Returns generator yielding marked up from/to side by side differences.
1373
1374    Arguments:
1375    fromlines -- list of text lines to compared to tolines
1376    tolines -- list of text lines to be compared to fromlines
1377    context -- number of context lines to display on each side of difference,
1378               if None, all from/to text lines will be generated.
1379    linejunk -- passed on to ndiff (see ndiff documentation)
1380    charjunk -- passed on to ndiff (see ndiff documentation)
1381
1382    This function returns an iterator which returns a tuple:
1383    (from line tuple, to line tuple, boolean flag)
1384
1385    from/to line tuple -- (line num, line text)
1386        line num -- integer or None (to indicate a context separation)
1387        line text -- original line text with following markers inserted:
1388            '\0+' -- marks start of added text
1389            '\0-' -- marks start of deleted text
1390            '\0^' -- marks start of changed text
1391            '\1' -- marks end of added/deleted/changed text
1392
1393    boolean flag -- None indicates context separation, True indicates
1394        either "from" or "to" line contains a change, otherwise False.
1395
1396    This function/iterator was originally developed to generate side by side
1397    file difference for making HTML pages (see HtmlDiff class for example
1398    usage).
1399
1400    Note, this function utilizes the ndiff function to generate the side by
1401    side difference markup.  Optional ndiff arguments may be passed to this
1402    function and they in turn will be passed to ndiff.
1403    """
1404    import re
1405
1406    # regular expression for finding intraline change indices
1407    change_re = re.compile(r'(\++|\-+|\^+)')
1408
1409    # create the difference iterator to generate the differences
1410    diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
1411
1412    def _make_line(lines, format_key, side, num_lines=[0,0]):
1413        """Returns line of text with user's change markup and line formatting.
1414
1415        lines -- list of lines from the ndiff generator to produce a line of
1416                 text from.  When producing the line of text to return, the
1417                 lines used are removed from this list.
1418        format_key -- '+' return first line in list with "add" markup around
1419                          the entire line.
1420                      '-' return first line in list with "delete" markup around
1421                          the entire line.
1422                      '?' return first line in list with add/delete/change
1423                          intraline markup (indices obtained from second line)
1424                      None return first line in list with no markup
1425        side -- indice into the num_lines list (0=from,1=to)
1426        num_lines -- from/to current line number.  This is NOT intended to be a
1427                     passed parameter.  It is present as a keyword argument to
1428                     maintain memory of the current line numbers between calls
1429                     of this function.
1430
1431        Note, this function is purposefully not defined at the module scope so
1432        that data it needs from its parent function (within whose context it
1433        is defined) does not need to be of module scope.
1434        """
1435        num_lines[side] += 1
1436        # Handle case where no user markup is to be added, just return line of
1437        # text with user's line format to allow for usage of the line number.
1438        if format_key is None:
1439            return (num_lines[side],lines.pop(0)[2:])
1440        # Handle case of intraline changes
1441        if format_key == '?':
1442            text, markers = lines.pop(0), lines.pop(0)
1443            # find intraline changes (store change type and indices in tuples)
1444            sub_info = []
1445            def record_sub_info(match_object,sub_info=sub_info):
1446                sub_info.append([match_object.group(1)[0],match_object.span()])
1447                return match_object.group(1)
1448            change_re.sub(record_sub_info,markers)
1449            # process each tuple inserting our special marks that won't be
1450            # noticed by an xml/html escaper.
1451            for key,(begin,end) in reversed(sub_info):
1452                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
1453            text = text[2:]
1454        # Handle case of add/delete entire line
1455        else:
1456            text = lines.pop(0)[2:]
1457            # if line of text is just a newline, insert a space so there is
1458            # something for the user to highlight and see.
1459            if not text:
1460                text = ' '
1461            # insert marks that won't be noticed by an xml/html escaper.
1462            text = '\0' + format_key + text + '\1'
1463        # Return line of text, first allow user's line formatter to do its
1464        # thing (such as adding the line number) then replace the special
1465        # marks with what the user's change markup.
1466        return (num_lines[side],text)
1467
1468    def _line_iterator():
1469        """Yields from/to lines of text with a change indication.
1470
1471        This function is an iterator.  It itself pulls lines from a
1472        differencing iterator, processes them and yields them.  When it can
1473        it yields both a "from" and a "to" line, otherwise it will yield one
1474        or the other.  In addition to yielding the lines of from/to text, a
1475        boolean flag is yielded to indicate if the text line(s) have
1476        differences in them.
1477
1478        Note, this function is purposefully not defined at the module scope so
1479        that data it needs from its parent function (within whose context it
1480        is defined) does not need to be of module scope.
1481        """
1482        lines = []
1483        num_blanks_pending, num_blanks_to_yield = 0, 0
1484        while True:
1485            # Load up next 4 lines so we can look ahead, create strings which
1486            # are a concatenation of the first character of each of the 4 lines
1487            # so we can do some very readable comparisons.
1488            while len(lines) < 4:
1489                lines.append(next(diff_lines_iterator, 'X'))
1490            s = ''.join([line[0] for line in lines])
1491            if s.startswith('X'):
1492                # When no more lines, pump out any remaining blank lines so the
1493                # corresponding add/delete lines get a matching blank line so
1494                # all line pairs get yielded at the next level.
1495                num_blanks_to_yield = num_blanks_pending
1496            elif s.startswith('-?+?'):
1497                # simple intraline change
1498                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
1499                continue
1500            elif s.startswith('--++'):
1501                # in delete block, add block coming: we do NOT want to get
1502                # caught up on blank lines yet, just process the delete line
1503                num_blanks_pending -= 1
1504                yield _make_line(lines,'-',0), None, True
1505                continue
1506            elif s.startswith(('--?+', '--+', '- ')):
1507                # in delete block and see an intraline change or unchanged line
1508                # coming: yield the delete line and then blanks
1509                from_line,to_line = _make_line(lines,'-',0), None
1510                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
1511            elif s.startswith('-+?'):
1512                # intraline change
1513                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
1514                continue
1515            elif s.startswith('-?+'):
1516                # intraline change
1517                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
1518                continue
1519            elif s.startswith('-'):
1520                # delete FROM line
1521                num_blanks_pending -= 1
1522                yield _make_line(lines,'-',0), None, True
1523                continue
1524            elif s.startswith('+--'):
1525                # in add block, delete block coming: we do NOT want to get
1526                # caught up on blank lines yet, just process the add line
1527                num_blanks_pending += 1
1528                yield None, _make_line(lines,'+',1), True
1529                continue
1530            elif s.startswith(('+ ', '+-')):
1531                # will be leaving an add block: yield blanks then add line
1532                from_line, to_line = None, _make_line(lines,'+',1)
1533                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
1534            elif s.startswith('+'):
1535                # inside an add block, yield the add line
1536                num_blanks_pending += 1
1537                yield None, _make_line(lines,'+',1), True
1538                continue
1539            elif s.startswith(' '):
1540                # unchanged text, yield it to both sides
1541                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
1542                continue
1543            # Catch up on the blank lines so when we yield the next from/to
1544            # pair, they are lined up.
1545            while(num_blanks_to_yield < 0):
1546                num_blanks_to_yield += 1
1547                yield None,('','\n'),True
1548            while(num_blanks_to_yield > 0):
1549                num_blanks_to_yield -= 1
1550                yield ('','\n'),None,True
1551            if s.startswith('X'):
1552                return
1553            else:
1554                yield from_line,to_line,True
1555
1556    def _line_pair_iterator():
1557        """Yields from/to lines of text with a change indication.
1558
1559        This function is an iterator.  It itself pulls lines from the line
1560        iterator.  Its difference from that iterator is that this function
1561        always yields a pair of from/to text lines (with the change
1562        indication).  If necessary it will collect single from/to lines
1563        until it has a matching pair from/to pair to yield.
1564
1565        Note, this function is purposefully not defined at the module scope so
1566        that data it needs from its parent function (within whose context it
1567        is defined) does not need to be of module scope.
1568        """
1569        line_iterator = _line_iterator()
1570        fromlines,tolines=[],[]
1571        while True:
1572            # Collecting lines of text until we have a from/to pair
1573            while (len(fromlines)==0 or len(tolines)==0):
1574                try:
1575                    from_line, to_line, found_diff = next(line_iterator)
1576                except StopIteration:
1577                    return
1578                if from_line is not None:
1579                    fromlines.append((from_line,found_diff))
1580                if to_line is not None:
1581                    tolines.append((to_line,found_diff))
1582            # Once we have a pair, remove them from the collection and yield it
1583            from_line, fromDiff = fromlines.pop(0)
1584            to_line, to_diff = tolines.pop(0)
1585            yield (from_line,to_line,fromDiff or to_diff)
1586
1587    # Handle case where user does not want context differencing, just yield
1588    # them up without doing anything else with them.
1589    line_pair_iterator = _line_pair_iterator()
1590    if context is None:
1591        yield from line_pair_iterator
1592    # Handle case where user wants context differencing.  We must do some
1593    # storage of lines until we know for sure that they are to be yielded.
1594    else:
1595        context += 1
1596        lines_to_write = 0
1597        while True:
1598            # Store lines up until we find a difference, note use of a
1599            # circular queue because we only need to keep around what
1600            # we need for context.
1601            index, contextLines = 0, [None]*(context)
1602            found_diff = False
1603            while(found_diff is False):
1604                try:
1605                    from_line, to_line, found_diff = next(line_pair_iterator)
1606                except StopIteration:
1607                    return
1608                i = index % context
1609                contextLines[i] = (from_line, to_line, found_diff)
1610                index += 1
1611            # Yield lines that we have collected so far, but first yield
1612            # the user's separator.
1613            if index > context:
1614                yield None, None, None
1615                lines_to_write = context
1616            else:
1617                lines_to_write = index
1618                index = 0
1619            while(lines_to_write):
1620                i = index % context
1621                index += 1
1622                yield contextLines[i]
1623                lines_to_write -= 1
1624            # Now yield the context lines after the change
1625            lines_to_write = context-1
1626            try:
1627                while(lines_to_write):
1628                    from_line, to_line, found_diff = next(line_pair_iterator)
1629                    # If another change within the context, extend the context
1630                    if found_diff:
1631                        lines_to_write = context-1
1632                    else:
1633                        lines_to_write -= 1
1634                    yield from_line, to_line, found_diff
1635            except StopIteration:
1636                # Catch exception from next() and return normally
1637                return
1638
1639
1640_file_template = """
1641<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
1642          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
1643
1644<html>
1645
1646<head>
1647    <meta http-equiv="Content-Type"
1648          content="text/html; charset=%(charset)s" />
1649    <title></title>
1650    <style type="text/css">%(styles)s
1651    </style>
1652</head>
1653
1654<body>
1655    %(table)s%(legend)s
1656</body>
1657
1658</html>"""
1659
1660_styles = """
1661        table.diff {font-family:Courier; border:medium;}
1662        .diff_header {background-color:#e0e0e0}
1663        td.diff_header {text-align:right}
1664        .diff_next {background-color:#c0c0c0}
1665        .diff_add {background-color:#aaffaa}
1666        .diff_chg {background-color:#ffff77}
1667        .diff_sub {background-color:#ffaaaa}"""
1668
1669_table_template = """
1670    <table class="diff" id="difflib_chg_%(prefix)s_top"
1671           cellspacing="0" cellpadding="0" rules="groups" >
1672        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
1673        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
1674        %(header_row)s
1675        <tbody>
1676%(data_rows)s        </tbody>
1677    </table>"""
1678
1679_legend = """
1680    <table class="diff" summary="Legends">
1681        <tr> <th colspan="2"> Legends </th> </tr>
1682        <tr> <td> <table border="" summary="Colors">
1683                      <tr><th> Colors </th> </tr>
1684                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
1685                      <tr><td class="diff_chg">Changed</td> </tr>
1686                      <tr><td class="diff_sub">Deleted</td> </tr>
1687                  </table></td>
1688             <td> <table border="" summary="Links">
1689                      <tr><th colspan="2"> Links </th> </tr>
1690                      <tr><td>(f)irst change</td> </tr>
1691                      <tr><td>(n)ext change</td> </tr>
1692                      <tr><td>(t)op</td> </tr>
1693                  </table></td> </tr>
1694    </table>"""
1695
1696class HtmlDiff(object):
1697    """For producing HTML side by side comparison with change highlights.
1698
1699    This class can be used to create an HTML table (or a complete HTML file
1700    containing the table) showing a side by side, line by line comparison
1701    of text with inter-line and intra-line change highlights.  The table can
1702    be generated in either full or contextual difference mode.
1703
1704    The following methods are provided for HTML generation:
1705
1706    make_table -- generates HTML for a single side by side table
1707    make_file -- generates complete HTML file with a single side by side table
1708
1709    See tools/scripts/diff.py for an example usage of this class.
1710    """
1711
1712    _file_template = _file_template
1713    _styles = _styles
1714    _table_template = _table_template
1715    _legend = _legend
1716    _default_prefix = 0
1717
1718    def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
1719                 charjunk=IS_CHARACTER_JUNK):
1720        """HtmlDiff instance initializer
1721
1722        Arguments:
1723        tabsize -- tab stop spacing, defaults to 8.
1724        wrapcolumn -- column number where lines are broken and wrapped,
1725            defaults to None where lines are not wrapped.
1726        linejunk,charjunk -- keyword arguments passed into ndiff() (used by
1727            HtmlDiff() to generate the side by side HTML differences).  See
1728            ndiff() documentation for argument default values and descriptions.
1729        """
1730        self._tabsize = tabsize
1731        self._wrapcolumn = wrapcolumn
1732        self._linejunk = linejunk
1733        self._charjunk = charjunk
1734
1735    def make_file(self, fromlines, tolines, fromdesc='', todesc='',
1736                  context=False, numlines=5, *, charset='utf-8'):
1737        """Returns HTML file of side by side comparison with change highlights
1738
1739        Arguments:
1740        fromlines -- list of "from" lines
1741        tolines -- list of "to" lines
1742        fromdesc -- "from" file column header string
1743        todesc -- "to" file column header string
1744        context -- set to True for contextual differences (defaults to False
1745            which shows full differences).
1746        numlines -- number of context lines.  When context is set True,
1747            controls number of lines displayed before and after the change.
1748            When context is False, controls the number of lines to place
1749            the "next" link anchors before the next change (so click of
1750            "next" link jumps to just before the change).
1751        charset -- charset of the HTML document
1752        """
1753
1754        return (self._file_template % dict(
1755            styles=self._styles,
1756            legend=self._legend,
1757            table=self.make_table(fromlines, tolines, fromdesc, todesc,
1758                                  context=context, numlines=numlines),
1759            charset=charset
1760        )).encode(charset, 'xmlcharrefreplace').decode(charset)
1761
1762    def _tab_newline_replace(self,fromlines,tolines):
1763        """Returns from/to line lists with tabs expanded and newlines removed.
1764
1765        Instead of tab characters being replaced by the number of spaces
1766        needed to fill in to the next tab stop, this function will fill
1767        the space with tab characters.  This is done so that the difference
1768        algorithms can identify changes in a file when tabs are replaced by
1769        spaces and vice versa.  At the end of the HTML generation, the tab
1770        characters will be replaced with a nonbreakable space.
1771        """
1772        def expand_tabs(line):
1773            # hide real spaces
1774            line = line.replace(' ','\0')
1775            # expand tabs into spaces
1776            line = line.expandtabs(self._tabsize)
1777            # replace spaces from expanded tabs back into tab characters
1778            # (we'll replace them with markup after we do differencing)
1779            line = line.replace(' ','\t')
1780            return line.replace('\0',' ').rstrip('\n')
1781        fromlines = [expand_tabs(line) for line in fromlines]
1782        tolines = [expand_tabs(line) for line in tolines]
1783        return fromlines,tolines
1784
1785    def _split_line(self,data_list,line_num,text):
1786        """Builds list of text lines by splitting text lines at wrap point
1787
1788        This function will determine if the input text line needs to be
1789        wrapped (split) into separate lines.  If so, the first wrap point
1790        will be determined and the first line appended to the output
1791        text line list.  This function is used recursively to handle
1792        the second part of the split line to further split it.
1793        """
1794        # if blank line or context separator, just add it to the output list
1795        if not line_num:
1796            data_list.append((line_num,text))
1797            return
1798
1799        # if line text doesn't need wrapping, just add it to the output list
1800        size = len(text)
1801        max = self._wrapcolumn
1802        if (size <= max) or ((size -(text.count('\0')*3)) <= max):
1803            data_list.append((line_num,text))
1804            return
1805
1806        # scan text looking for the wrap point, keeping track if the wrap
1807        # point is inside markers
1808        i = 0
1809        n = 0
1810        mark = ''
1811        while n < max and i < size:
1812            if text[i] == '\0':
1813                i += 1
1814                mark = text[i]
1815                i += 1
1816            elif text[i] == '\1':
1817                i += 1
1818                mark = ''
1819            else:
1820                i += 1
1821                n += 1
1822
1823        # wrap point is inside text, break it up into separate lines
1824        line1 = text[:i]
1825        line2 = text[i:]
1826
1827        # if wrap point is inside markers, place end marker at end of first
1828        # line and start marker at beginning of second line because each
1829        # line will have its own table tag markup around it.
1830        if mark:
1831            line1 = line1 + '\1'
1832            line2 = '\0' + mark + line2
1833
1834        # tack on first line onto the output list
1835        data_list.append((line_num,line1))
1836
1837        # use this routine again to wrap the remaining text
1838        self._split_line(data_list,'>',line2)
1839
1840    def _line_wrapper(self,diffs):
1841        """Returns iterator that splits (wraps) mdiff text lines"""
1842
1843        # pull from/to data and flags from mdiff iterator
1844        for fromdata,todata,flag in diffs:
1845            # check for context separators and pass them through
1846            if flag is None:
1847                yield fromdata,todata,flag
1848                continue
1849            (fromline,fromtext),(toline,totext) = fromdata,todata
1850            # for each from/to line split it at the wrap column to form
1851            # list of text lines.
1852            fromlist,tolist = [],[]
1853            self._split_line(fromlist,fromline,fromtext)
1854            self._split_line(tolist,toline,totext)
1855            # yield from/to line in pairs inserting blank lines as
1856            # necessary when one side has more wrapped lines
1857            while fromlist or tolist:
1858                if fromlist:
1859                    fromdata = fromlist.pop(0)
1860                else:
1861                    fromdata = ('',' ')
1862                if tolist:
1863                    todata = tolist.pop(0)
1864                else:
1865                    todata = ('',' ')
1866                yield fromdata,todata,flag
1867
1868    def _collect_lines(self,diffs):
1869        """Collects mdiff output into separate lists
1870
1871        Before storing the mdiff from/to data into a list, it is converted
1872        into a single line of text with HTML markup.
1873        """
1874
1875        fromlist,tolist,flaglist = [],[],[]
1876        # pull from/to data and flags from mdiff style iterator
1877        for fromdata,todata,flag in diffs:
1878            try:
1879                # store HTML markup of the lines into the lists
1880                fromlist.append(self._format_line(0,flag,*fromdata))
1881                tolist.append(self._format_line(1,flag,*todata))
1882            except TypeError:
1883                # exceptions occur for lines where context separators go
1884                fromlist.append(None)
1885                tolist.append(None)
1886            flaglist.append(flag)
1887        return fromlist,tolist,flaglist
1888
1889    def _format_line(self,side,flag,linenum,text):
1890        """Returns HTML markup of "from" / "to" text lines
1891
1892        side -- 0 or 1 indicating "from" or "to" text
1893        flag -- indicates if difference on line
1894        linenum -- line number (used for line number column)
1895        text -- line text to be marked up
1896        """
1897        try:
1898            linenum = '%d' % linenum
1899            id = ' id="%s%s"' % (self._prefix[side],linenum)
1900        except TypeError:
1901            # handle blank lines where linenum is '>' or ''
1902            id = ''
1903        # replace those things that would get confused with HTML symbols
1904        text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
1905
1906        # make space non-breakable so they don't get compressed or line wrapped
1907        text = text.replace(' ','&nbsp;').rstrip()
1908
1909        return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
1910               % (id,linenum,text)
1911
1912    def _make_prefix(self):
1913        """Create unique anchor prefixes"""
1914
1915        # Generate a unique anchor prefix so multiple tables
1916        # can exist on the same HTML page without conflicts.
1917        fromprefix = "from%d_" % HtmlDiff._default_prefix
1918        toprefix = "to%d_" % HtmlDiff._default_prefix
1919        HtmlDiff._default_prefix += 1
1920        # store prefixes so line format method has access
1921        self._prefix = [fromprefix,toprefix]
1922
1923    def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
1924        """Makes list of "next" links"""
1925
1926        # all anchor names will be generated using the unique "to" prefix
1927        toprefix = self._prefix[1]
1928
1929        # process change flags, generating middle column of next anchors/links
1930        next_id = ['']*len(flaglist)
1931        next_href = ['']*len(flaglist)
1932        num_chg, in_change = 0, False
1933        last = 0
1934        for i,flag in enumerate(flaglist):
1935            if flag:
1936                if not in_change:
1937                    in_change = True
1938                    last = i
1939                    # at the beginning of a change, drop an anchor a few lines
1940                    # (the context lines) before the change for the previous
1941                    # link
1942                    i = max([0,i-numlines])
1943                    next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
1944                    # at the beginning of a change, drop a link to the next
1945                    # change
1946                    num_chg += 1
1947                    next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
1948                         toprefix,num_chg)
1949            else:
1950                in_change = False
1951        # check for cases where there is no content to avoid exceptions
1952        if not flaglist:
1953            flaglist = [False]
1954            next_id = ['']
1955            next_href = ['']
1956            last = 0
1957            if context:
1958                fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
1959                tolist = fromlist
1960            else:
1961                fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
1962        # if not a change on first line, drop a link
1963        if not flaglist[0]:
1964            next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
1965        # redo the last link to link to the top
1966        next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
1967
1968        return fromlist,tolist,flaglist,next_href,next_id
1969
1970    def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
1971                   numlines=5):
1972        """Returns HTML table of side by side comparison with change highlights
1973
1974        Arguments:
1975        fromlines -- list of "from" lines
1976        tolines -- list of "to" lines
1977        fromdesc -- "from" file column header string
1978        todesc -- "to" file column header string
1979        context -- set to True for contextual differences (defaults to False
1980            which shows full differences).
1981        numlines -- number of context lines.  When context is set True,
1982            controls number of lines displayed before and after the change.
1983            When context is False, controls the number of lines to place
1984            the "next" link anchors before the next change (so click of
1985            "next" link jumps to just before the change).
1986        """
1987
1988        # make unique anchor prefixes so that multiple tables may exist
1989        # on the same page without conflict.
1990        self._make_prefix()
1991
1992        # change tabs to spaces before it gets more difficult after we insert
1993        # markup
1994        fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
1995
1996        # create diffs iterator which generates side by side from/to data
1997        if context:
1998            context_lines = numlines
1999        else:
2000            context_lines = None
2001        diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
2002                      charjunk=self._charjunk)
2003
2004        # set up iterator to wrap lines that exceed desired width
2005        if self._wrapcolumn:
2006            diffs = self._line_wrapper(diffs)
2007
2008        # collect up from/to lines and flags into lists (also format the lines)
2009        fromlist,tolist,flaglist = self._collect_lines(diffs)
2010
2011        # process change flags, generating middle column of next anchors/links
2012        fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
2013            fromlist,tolist,flaglist,context,numlines)
2014
2015        s = []
2016        fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
2017              '<td class="diff_next">%s</td>%s</tr>\n'
2018        for i in range(len(flaglist)):
2019            if flaglist[i] is None:
2020                # mdiff yields None on separator lines skip the bogus ones
2021                # generated for the first line
2022                if i > 0:
2023                    s.append('        </tbody>        \n        <tbody>\n')
2024            else:
2025                s.append( fmt % (next_id[i],next_href[i],fromlist[i],
2026                                           next_href[i],tolist[i]))
2027        if fromdesc or todesc:
2028            header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
2029                '<th class="diff_next"><br /></th>',
2030                '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
2031                '<th class="diff_next"><br /></th>',
2032                '<th colspan="2" class="diff_header">%s</th>' % todesc)
2033        else:
2034            header_row = ''
2035
2036        table = self._table_template % dict(
2037            data_rows=''.join(s),
2038            header_row=header_row,
2039            prefix=self._prefix[1])
2040
2041        return table.replace('\0+','<span class="diff_add">'). \
2042                     replace('\0-','<span class="diff_sub">'). \
2043                     replace('\0^','<span class="diff_chg">'). \
2044                     replace('\1','</span>'). \
2045                     replace('\t','&nbsp;')
2046
2047del re
2048
2049def restore(delta, which):
2050    r"""
2051    Generate one of the two sequences that generated a delta.
2052
2053    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
2054    lines originating from file 1 or 2 (parameter `which`), stripping off line
2055    prefixes.
2056
2057    Examples:
2058
2059    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
2060    ...              'ore\ntree\nemu\n'.splitlines(keepends=True))
2061    >>> diff = list(diff)
2062    >>> print(''.join(restore(diff, 1)), end="")
2063    one
2064    two
2065    three
2066    >>> print(''.join(restore(diff, 2)), end="")
2067    ore
2068    tree
2069    emu
2070    """
2071    try:
2072        tag = {1: "- ", 2: "+ "}[int(which)]
2073    except KeyError:
2074        raise ValueError('unknown delta choice (must be 1 or 2): %r'
2075                           % which) from None
2076    prefixes = ("  ", tag)
2077    for line in delta:
2078        if line[:2] in prefixes:
2079            yield line[2:]
2080
2081def _test():
2082    import doctest, difflib
2083    return doctest.testmod(difflib)
2084
2085if __name__ == "__main__":
2086    _test()
2087