1#
2# diffoscope: in-depth comparison of files, archives, and directories
3#
4# Copyright © 2014-2015 Jérémy Bobbio <lunar@debian.org>
5# Copyright © 2017-2021 Chris Lamb <lamby@debian.org>
6#
7# diffoscope is free software: you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# diffoscope is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
19
20import re
21import io
22import os
23import errno
24import fcntl
25import hashlib
26import logging
27import itertools
28import threading
29import subprocess
30
31from difflib import Differ
32from multiprocessing.dummy import Queue
33
34from .tools import get_tool_name, tool_required
35from .config import Config
36from .profiling import profile
37from .tempfiles import get_temporary_directory
38
39DIFF_CHUNK = 4096
40
41logger = logging.getLogger(__name__)
42re_diff_change = re.compile(r"^([+-@]).*", re.MULTILINE)
43
44
45class DiffParser:
46    RANGE_RE = re.compile(
47        # example: '@@ -26814,9 +26814,8 @@'
48        rb"""
49          ^
50            @@\s+
51              -
52              (?P<start1>\d+)(,(?P<len1>\d+))?
53            \s+
54              \+
55              (?P<start2>\d+)(,(?P<len2>\d+))?
56            \s+@@
57          $
58        """,
59        re.VERBOSE,
60    )
61
62    def __init__(self, output, end_nl_q1, end_nl_q2):
63        self._output = output
64        self._end_nl_q1 = end_nl_q1
65        self._end_nl_q2 = end_nl_q2
66        self._action = self.read_headers
67        self._diff = io.BytesIO()
68        self._success = False
69        self._remaining_hunk_lines = None
70        self._block_len = None
71        self._direction = None
72        self._end_nl = None
73        self._max_lines = Config().max_diff_block_lines_saved
74
75    @property
76    def diff(self):
77        return self._diff.getvalue().decode("UTF-8", errors="replace")
78
79    @property
80    def success(self):
81        return self._success
82
83    def parse(self):
84        for line in self._output.split(b"\n"):
85            self._action = self._action(line)
86
87        self._action = None
88        self._success = True
89
90    def read_headers(self, line):
91        if not line:
92            return None
93
94        if line.startswith(b"---"):
95            return self.read_headers
96
97        if line.startswith(b"+++"):
98            return self.read_headers
99
100        found = DiffParser.RANGE_RE.match(line)
101
102        if not found:
103            raise ValueError(f"Unable to parse diff headers: {line!r}")
104
105        self._diff.write(line + b"\n")
106        if found.group("len1"):
107            self._remaining_hunk_lines = int(found.group("len1"))
108        else:
109            self._remaining_hunk_lines = 1
110        if found.group("len2"):
111            self._remaining_hunk_lines += int(found.group("len2"))
112        else:
113            self._remaining_hunk_lines += 1
114
115        self._direction = None
116
117        return self.read_hunk
118
119    def read_hunk(self, line):
120        if not line:
121            return None
122
123        if line[:1] == b" ":
124            self._remaining_hunk_lines -= 2
125        elif line[:1] == b"+":
126            self._remaining_hunk_lines -= 1
127        elif line[:1] == b"-":
128            self._remaining_hunk_lines -= 1
129        elif line[:1] == b"\\":
130            # When both files don't end with \n, do not show it as a difference
131            if self._end_nl is None:
132                end_nl1 = self._end_nl_q1.get()
133                end_nl2 = self._end_nl_q2.get()
134                self._end_nl = end_nl1 and end_nl2
135            if not self._end_nl:
136                return self.read_hunk
137        elif self._remaining_hunk_lines == 0:
138            return self.read_headers(line)
139        else:
140            raise ValueError(f"Unable to parse diff hunk: {line!r}")
141
142        self._diff.write(line + b"\n")
143
144        if line[:1] in (b"-", b"+"):
145            if line[:1] == self._direction:
146                self._block_len += 1
147            else:
148                self._block_len = 1
149                self._direction = line[:1]
150
151            if self._block_len >= self._max_lines:
152                return self.skip_block
153        else:
154            self._block_len = 1
155            self._direction = line[:1]
156
157        return self.read_hunk
158
159    def skip_block(self, line):
160        if self._remaining_hunk_lines == 0 or line[:1] != self._direction:
161            removed = self._block_len - Config().max_diff_block_lines_saved
162            if removed:
163                self._diff.write(
164                    b"%s[ %d lines removed ]\n" % (self._direction, removed)
165                )
166            return self.read_hunk(line)
167
168        self._block_len += 1
169        self._remaining_hunk_lines -= 1
170
171        return self.skip_block
172
173
174@tool_required("diff")
175def run_diff(fifo1, fifo2, end_nl_q1, end_nl_q2):
176    cmd = [
177        get_tool_name("diff"),
178        "-a",
179        "-U" + str(Config().diff_context),
180        fifo1,
181        fifo2,
182    ]
183
184    logger.debug("Running %s", " ".join(cmd))
185
186    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
187
188    parser = DiffParser(p.stdout, end_nl_q1, end_nl_q2)
189    parser.parse()
190
191    logger.debug(
192        "%s: returncode %d, parsed %s",
193        " ".join(cmd),
194        p.returncode,
195        parser.success,
196    )
197
198    if not parser.success and p.returncode not in (0, 1):
199        raise subprocess.CalledProcessError(
200            p.returncode, cmd, output=parser.diff.encode("utf-8")
201        )
202
203    if p.returncode == 0:
204        return None
205
206    return parser.diff
207
208
209class FIFOFeeder(threading.Thread):
210    def __init__(self, feeder, fifo_path, end_nl_q=None, daemon=True, *args):
211        os.mkfifo(fifo_path)
212        super().__init__(daemon=daemon)
213        self.feeder = feeder
214        self.fifo_path = fifo_path
215        self.end_nl_q = Queue() if end_nl_q is None else end_nl_q
216        self._exception = None
217        self._want_join = threading.Event()
218
219    def __enter__(self):
220        self.start()
221        return self
222
223    def __exit__(self, exc_type, exc_value, exc_tb):
224        os.remove(self.fifo_path)
225        self.join()
226
227    def run(self):
228        try:
229            # Try to open the FIFO nonblocking, so we can periodically check
230            # if the main thread wants us to wind down.  If it does, there's no
231            # more need for the FIFO, so stop the thread.
232            while True:
233                try:
234                    fd = os.open(self.fifo_path, os.O_WRONLY | os.O_NONBLOCK)
235                except OSError as e:
236                    if e.errno != errno.ENXIO:
237                        raise
238                    if self._want_join.is_set():
239                        return
240                else:
241                    break
242
243            # Now clear the fd's nonblocking flag to let writes block normally.
244            fcntl.fcntl(fd, fcntl.F_SETFL, 0)
245
246            with open(fd, "wb") as fifo:
247                # The queue works around a unified diff limitation: if there's
248                # no newlines in both don't make it a difference
249                end_nl = self.feeder(fifo)
250                self.end_nl_q.put(end_nl)
251        except Exception as error:
252            self._exception = error
253
254    def join(self):
255        self._want_join.set()
256        super().join()
257        if self._exception is not None:
258            raise self._exception
259
260
261class _Feeder:
262    """
263    A 'feeder' is a specialized writer.
264
265    A 'feeder' is a callable that takes as argument a writeable file, and
266    writes to it.  Feeders can transform the written data, truncate it,
267    checksum it, and so on.  The callable must return True to represent that
268    the data had a terminating newline, and False otherwise.
269
270    Feeders are created by the functions make_feeder_from_raw_reader() and
271    empty_file_feeder().  The returned objects are closures, and are not
272    (currently?) instances of any particular class.
273    """
274
275
276def empty_file_feeder():
277    """
278    Returns a feeder that simulates an empty file.
279
280    See _Feeder for feeders.
281    """
282
283    def feeder(f):
284        return False
285
286    return feeder
287
288
289def make_feeder_from_raw_reader(in_file, filterfn=None):
290    """
291    Create a feeder that checksums, truncates, and transcodes the data.  The
292    optional argument filterfn is a callable that gets passed each line, and
293    returns the line that should be used in its stead.  (There is no facility
294    for filterfn to discard a line entirely.)
295
296    See _Feeder for feeders.
297    """
298
299    def feeder(out_file):
300        h = None
301        end_nl = False
302        max_lines = Config().max_diff_input_lines
303        line_count = 0
304
305        if max_lines < float("inf"):
306            h = hashlib.sha256()
307
308        for buf in in_file:
309            line_count += 1
310            out = filterfn(buf) if filterfn else buf
311            if h:
312                h.update(out)
313            if line_count < max_lines:
314                out_file.write(out)
315            end_nl = buf[-1] == "\n"
316
317        if h and line_count >= max_lines:
318            out_file.write(
319                "[ Too much input for diff (SHA: {}) ]\n".format(
320                    h.hexdigest()
321                ).encode("utf-8")
322            )
323            end_nl = True
324
325        return end_nl
326
327    return feeder
328
329
330def diff(feeder1, feeder2):
331    with get_temporary_directory() as tmpdir:
332        fifo1_path = os.path.join(tmpdir, "fifo1")
333        fifo2_path = os.path.join(tmpdir, "fifo2")
334        with FIFOFeeder(feeder1, fifo1_path) as fifo1, FIFOFeeder(
335            feeder2, fifo2_path
336        ) as fifo2:
337            return run_diff(
338                fifo1_path, fifo2_path, fifo1.end_nl_q, fifo2.end_nl_q
339            )
340
341
342def diff_split_lines(diff, keepends=True):
343    lines = diff.split("\n")
344    if not keepends:
345        return lines
346    return [line + "\n" for line in lines[:-1]] + (
347        [lines[-1]] if lines[-1] else []
348    )
349
350
351def reverse_unified_diff(diff):
352    assert isinstance(diff, str)
353    res = []
354    # XXX - should move to just use plain bytes nearly everywhere until ready
355    # to print instead of using strings internally as well.
356    for line in diff_split_lines(diff):
357        line = line.encode("utf-8", errors="replace")
358        assert isinstance(line, bytes)
359        found = DiffParser.RANGE_RE.match(line)
360
361        if found:
362            before = found.group("start2")
363            if found.group("len2") is not None:
364                before += b"," + found.group("len2")
365
366            after = found.group("start1")
367            if found.group("len1") is not None:
368                after += b"," + found.group("len1")
369
370            res.append(b"@@ -%s +%s @@\n" % (before, after))
371        elif line.startswith(b"-"):
372            res.append(b"+")
373            res.append(line[1:])
374        elif line.startswith(b"+"):
375            res.append(b"-")
376            res.append(line[1:])
377        else:
378            res.append(line)
379    return b"".join(res).decode("utf-8", errors="replace")
380
381
382def color_unified_diff(diff):
383    RESET = "\033[0m"
384    RED, GREEN, CYAN = "\033[31m", "\033[32m", "\033[0;36m"
385
386    def repl(m):
387        return "{}{}{}".format(
388            {"-": RED, "@": CYAN, "+": GREEN}[m.group(1)], m.group(0), RESET
389        )
390
391    return re_diff_change.sub(repl, diff)
392
393
394DIFFON = "\x01"
395DIFFOFF = "\x02"
396MAX_WAGNER_FISCHER_SIZE = (
397    256  # any higher, and linediff takes >1 second and >200MB RAM
398)
399
400
401def _linediff_sane(x):
402    # turn non-printable chars into "."
403    return "." if ord(x) < 32 and x not in "\t\n" else x
404
405
406def diffinput_truncate(s, sz):
407    # Truncate, preserving uniqueness
408    if len(s) > sz:
409        s = s[
410            :sz
411        ] + "[ ... truncated by diffoscope; len: {}, SHA: {} ... ]".format(
412            len(s[sz:]), hashlib.sha256(s[sz:].encode("utf-8")).hexdigest()
413        )
414    return s
415
416
417def linediff(s, t, diffon, diffoff):
418    # calculate common prefix/suffix, easy optimisation for Wagner-Fischer
419    prefix = os.path.commonprefix((s, t))
420    if prefix:
421        s = s[len(prefix) :]
422        t = t[len(prefix) :]
423    suffix = os.path.commonprefix((s[::-1], t[::-1]))[::-1]
424    if suffix:
425        s = s[: -len(suffix)]
426        t = t[: -len(suffix)]
427
428    # truncate so Wagner-Fischer doesn't blow up RAM
429    s = diffinput_truncate(s, MAX_WAGNER_FISCHER_SIZE)
430    t = diffinput_truncate(t, MAX_WAGNER_FISCHER_SIZE)
431    l1, l2 = zip(*linediff_simplify(linediff_wagnerfischer(s, t)))
432
433    def to_string(k, v):
434        sanev = "".join(_linediff_sane(c) for c in v)
435        return (diffon + sanev + diffoff) if k else sanev
436
437    s1 = "".join(to_string(*p) for p in l1)
438    t1 = "".join(to_string(*p) for p in l2)
439
440    return f"{prefix}{s1}{suffix}", f"{prefix}{t1}{suffix}"
441
442
443def linediff_wagnerfischer(s, t):
444    """
445    Line diff algorithm, originally from diff2html.
446
447    https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm
448
449    Finds the minimum (levenshtein) edit distance between two strings, but has
450    quadratic performance O(m*n).
451    """
452    m, n = len(s), len(t)
453    d = [[(0, 0) for i in range(n + 1)] for i in range(m + 1)]
454
455    d[0][0] = (0, (0, 0))
456    for i in range(1, m + 1):
457        d[i][0] = (i, (i - 1, 0))
458    for j in range(1, n + 1):
459        d[0][j] = (j, (0, j - 1))
460
461    # NB. This loop is O(len(s) * len(t))
462    for i in range(1, m + 1):
463        for j in range(1, n + 1):
464            if s[i - 1] == t[j - 1]:
465                cost = 0
466            else:
467                cost = 1
468            d[i][j] = min(
469                (d[i - 1][j][0] + 1, (i - 1, j)),
470                (d[i][j - 1][0] + 1, (i, j - 1)),
471                (d[i - 1][j - 1][0] + cost, (i - 1, j - 1)),
472            )
473
474    coords = []
475    coord = (m, n)
476    while coord != (0, 0):
477        coords.insert(0, coord)
478        x, y = coord
479        coord = d[x][y][1]
480
481    for coord in coords:
482        cx, cy = coord
483        child_val = d[cx][cy][0]
484
485        father_coord = d[cx][cy][1]
486        fx, fy = father_coord
487        father_val = d[fx][fy][0]
488
489        diff = (cx - fx, cy - fy)
490
491        if diff == (0, 1):
492            yield (False, ""), (True, t[fy])
493        elif diff == (1, 0):
494            yield (True, s[fx]), (False, "")
495        elif child_val - father_val == 1:
496            yield (True, s[fx]), (True, t[fy])
497        else:
498            assert s[fx] == t[fy]
499            yield (False, s[fx]), (False, t[fy])
500
501
502def linediff_simplify(g):
503    """Simplify the output of Wagner-Fischer."""
504    current = None
505    for l, r in g:
506        if not current:
507            current = l, r
508        elif current[0][0] == l[0] and current[1][0] == r[0]:
509            current = (
510                (l[0], current[0][1] + l[1]),
511                (r[0], current[1][1] + r[1]),
512            )
513        else:
514            yield current
515            current = l, r
516    if current:
517        yield current
518
519
520class SideBySideDiff:
521    """Calculates a side-by-side diff from a unified diff."""
522
523    def __init__(self, unified_diff, diffon=DIFFON, diffoff=DIFFOFF):
524        self.unified_diff = unified_diff
525        self.diffon = diffon
526        self.diffoff = diffoff
527        self.reset()
528
529    def reset(self):
530        self.differ = Differ()
531        self.buf = []
532        self.add_cpt = 0
533        self.del_cpt = 0
534        self.line1 = 0
535        self.line2 = 0
536        self.hunk_off1 = 0
537        self.hunk_size1 = 0
538        self.hunk_off2 = 0
539        self.hunk_size2 = 0
540        self._bytes_processed = 0
541
542    @property
543    def bytes_processed(self):
544        return self._bytes_processed
545
546    def match_lines(self, l0, l1):
547        """
548        Given two lists of lines, use python's difflib to make the diff a bit
549        "smarter", as it better detects added lines (see !64)
550        """
551
552        if len(l0) + len(l1) > 750:
553            # difflib.Differ.compare is at least O(n^2), so don't call it if
554            # our inputs are too large.
555            yield "C", "Diff chunk too large, falling back to line-by-line diff ({} lines added, {} lines removed)".format(
556                self.add_cpt, self.del_cpt
557            )
558            for line0, line1 in itertools.zip_longest(l0, l1, fillvalue=""):
559                yield from self.yield_line(line0, line1)
560            return
561
562        saved_line = None
563        for line in self.differ.compare(l0, l1):
564            # Skip lines with details as they don't matter to us
565            if line.startswith("? "):
566                continue
567
568            # When we encounter a +, it may either be because a line was
569            # added, or because a line was modified (in that case,
570            # save_line will not be None)
571            if line.startswith("+ "):
572                yield from self.yield_line(saved_line or "", line[2:])
573                saved_line = None
574                continue
575
576            # If saved_line is not None, we need to yield from it before
577            # moving on
578            if saved_line is not None:
579                yield from self.yield_line(saved_line, "")
580                saved_line = None
581
582            # Handle removed and unchanged lines
583            if line.startswith("- "):
584                saved_line = line[2:]
585            elif line.startswith("  "):
586                line = line[2:]
587                yield from self.yield_line(line, line)
588
589        # Make sure we don't forget a line
590        if saved_line is not None:
591            yield from self.yield_line(saved_line, "")
592
593    def empty_buffer(self):
594        if self.del_cpt == 0 or self.add_cpt == 0:
595            # All lines are unchanged lines
596            for l in self.buf:
597                yield from self.yield_line(l[0], l[1])
598        elif self.del_cpt != 0 and self.add_cpt != 0:
599            l0 = [pair[0] for pair in self.buf if pair[0] is not None]
600            l1 = [pair[1] for pair in self.buf if pair[1] is not None]
601            yield from self.match_lines(l0, l1)
602
603    def yield_line(self, s1, s2):
604        orig1 = s1
605        orig2 = s2
606
607        if s1 is None and s2 is None:
608            type_name = "unmodified"
609        elif s1 == "" and s2 == "":
610            type_name = "unmodified"
611        elif s1 is None or s1 == "":
612            type_name = "added"
613        elif s2 is None or s2 == "":
614            type_name = "deleted"
615        elif (
616            orig1 == orig2
617            and not s1.endswith("lines removed ]")
618            and not s2.endswith("lines removed ]")
619        ):
620            type_name = "unmodified"
621        else:
622            type_name = "changed"
623            with profile("diff", "linediff"):
624                s1, s2 = linediff(s1, s2, self.diffon, self.diffoff)
625
626        yield "L", (type_name, s1, self.line1, s2, self.line2)
627
628        m = orig1 and re.match(r"^\[ (\d+) lines removed \]$", orig1)
629        if m:
630            self.line1 += int(m.group(1))
631        elif orig1:
632            self.line1 += 1
633        m = orig2 and re.match(r"^\[ (\d+) lines removed \]$", orig2)
634        if m:
635            self.line2 += int(m.group(1))
636        elif orig2:
637            self.line2 += 1
638
639        self.add_cpt = 0
640        self.del_cpt = 0
641        self.buf = []
642
643    def items(self):
644        """Yield the items that form the side-by-side diff.
645
646        Each item is a (type, value) tuple, as follows:
647
648        type == "H", value is a tuple representing a hunk header
649            hunk_offset1, hunk_size1, hunk_offset2, hunk_size2 = value
650            all ints
651
652        type == "L", value is a tuple representing a line of a hunk
653            mode, line1, lineno1, line2, lineno2 = value
654            where mode is one of {"unmodified", "added", "deleted", "changed"}
655            line* are strings
656            lineno* are ints
657
658        type == "C", value is a comment
659            comment = value
660            a string
661        """
662        self.reset()
663
664        for l in diff_split_lines(self.unified_diff, False):
665            self._bytes_processed += len(l) + 1
666            m = re.match(r"^--- ([^\s]*)", l)
667            if m:
668                yield from self.empty_buffer()
669                continue
670            m = re.match(r"^\+\+\+ ([^\s]*)", l)
671            if m:
672                yield from self.empty_buffer()
673                continue
674
675            m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*)", l)
676            if m:
677                yield from self.empty_buffer()
678                hunk_data = map(lambda x: x == "" and 1 or int(x), m.groups())
679                (
680                    self.hunk_off1,
681                    self.hunk_size1,
682                    self.hunk_off2,
683                    self.hunk_size2,
684                ) = hunk_data
685                self.line1, self.line2 = self.hunk_off1, self.hunk_off2
686                yield "H", (
687                    self.hunk_off1,
688                    self.hunk_size1,
689                    self.hunk_off2,
690                    self.hunk_size2,
691                )
692                continue
693
694            if re.match(r"^\[", l):
695                yield from self.empty_buffer()
696                yield "C", l
697
698            if re.match(r"^\\ No newline", l):
699                if self.hunk_size2 == 0:
700                    self.buf[-1] = (
701                        self.buf[-1][0],
702                        self.buf[-1][1] + "\n" + l[2:],
703                    )
704                else:
705                    self.buf[-1] = (
706                        self.buf[-1][0] + "\n" + l[2:],
707                        self.buf[-1][1],
708                    )
709                continue
710
711            if self.hunk_size1 <= 0 and self.hunk_size2 <= 0:
712                yield from self.empty_buffer()
713                continue
714
715            m = re.match(r"^\+\[ (\d+) lines removed \]$", l)
716            if m:
717                self.add_cpt += int(m.group(1))
718                self.hunk_size2 -= int(m.group(1))
719                self.buf.append((None, l[1:]))
720                continue
721
722            if re.match(r"^\+", l):
723                self.add_cpt += 1
724                self.hunk_size2 -= 1
725                self.buf.append((None, l[1:]))
726                continue
727
728            m = re.match(r"^-\[ (\d+) lines removed \]$", l)
729            if m:
730                self.del_cpt += int(m.group(1))
731                self.hunk_size1 -= int(m.group(1))
732                self.buf.append((l[1:], None))
733                continue
734
735            if re.match(r"^-", l):
736                self.del_cpt += 1
737                self.hunk_size1 -= 1
738                self.buf.append((l[1:], None))
739                continue
740
741            if re.match(r"^ ", l) and self.hunk_size1 and self.hunk_size2:
742                yield from self.empty_buffer()
743                self.hunk_size1 -= 1
744                self.hunk_size2 -= 1
745                self.buf.append((l[1:], l[1:]))
746                continue
747
748            yield from self.empty_buffer()
749
750        yield from self.empty_buffer()
751