diffoscope-204/diffoscope/diff.py

#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2014-2015 Jérémy Bobbio <lunar@debian.org>
# Copyright © 2017-2021 Chris Lamb <lamby@debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.

import re
import io
import os
import errno
import fcntl
import hashlib
import logging
import itertools
import threading
import subprocess

from difflib import Differ
from multiprocessing.dummy import Queue

from .tools import get_tool_name, tool_required
from .config import Config
from .profiling import profile
from .tempfiles import get_temporary_directory

DIFF_CHUNK = 4096

logger = logging.getLogger(__name__)
re_diff_change = re.compile(r"^([+-@]).*", re.MULTILINE)


class DiffParser:
    RANGE_RE = re.compile(
        # example: '@@ -26814,9 +26814,8 @@'
        rb"""
          ^
            @@\s+
              -
              (?P<start1>\d+)(,(?P<len1>\d+))?
            \s+
              \+
              (?P<start2>\d+)(,(?P<len2>\d+))?
            \s+@@
          $
        """,
        re.VERBOSE,
    )

    def __init__(self, output, end_nl_q1, end_nl_q2):
        self._output = output
        self._end_nl_q1 = end_nl_q1
        self._end_nl_q2 = end_nl_q2
        self._action = self.read_headers
        self._diff = io.BytesIO()
        self._success = False
        self._remaining_hunk_lines = None
        self._block_len = None
        self._direction = None
        self._end_nl = None
        self._max_lines = Config().max_diff_block_lines_saved

    @property
    def diff(self):
        return self._diff.getvalue().decode("UTF-8", errors="replace")

    @property
    def success(self):
        return self._success

    def parse(self):
        for line in self._output.split(b"\n"):
            self._action = self._action(line)

        self._action = None
        self._success = True

    def read_headers(self, line):
        if not line:
            return None

        if line.startswith(b"---"):
            return self.read_headers

        if line.startswith(b"+++"):
            return self.read_headers

        found = DiffParser.RANGE_RE.match(line)

        if not found:
            raise ValueError(f"Unable to parse diff headers: {line!r}")

        self._diff.write(line + b"\n")
        if found.group("len1"):
            self._remaining_hunk_lines = int(found.group("len1"))
        else:
            self._remaining_hunk_lines = 1
        if found.group("len2"):
            self._remaining_hunk_lines += int(found.group("len2"))
        else:
            self._remaining_hunk_lines += 1

        self._direction = None

        return self.read_hunk

    def read_hunk(self, line):
        if not line:
            return None

        if line[:1] == b" ":
            self._remaining_hunk_lines -= 2
        elif line[:1] == b"+":
            self._remaining_hunk_lines -= 1
        elif line[:1] == b"-":
            self._remaining_hunk_lines -= 1
        elif line[:1] == b"\\":
            # When both files don't end with \n, do not show it as a difference
            if self._end_nl is None:
                end_nl1 = self._end_nl_q1.get()
                end_nl2 = self._end_nl_q2.get()
                self._end_nl = end_nl1 and end_nl2
            if not self._end_nl:
                return self.read_hunk
        elif self._remaining_hunk_lines == 0:
            return self.read_headers(line)
        else:
            raise ValueError(f"Unable to parse diff hunk: {line!r}")

        self._diff.write(line + b"\n")

        if line[:1] in (b"-", b"+"):
            if line[:1] == self._direction:
                self._block_len += 1
            else:
                self._block_len = 1
                self._direction = line[:1]

            if self._block_len >= self._max_lines:
                return self.skip_block
        else:
            self._block_len = 1
            self._direction = line[:1]

        return self.read_hunk

    def skip_block(self, line):
        if self._remaining_hunk_lines == 0 or line[:1] != self._direction:
            removed = self._block_len - Config().max_diff_block_lines_saved
            if removed:
                self._diff.write(
                    b"%s[ %d lines removed ]\n" % (self._direction, removed)
                )
            return self.read_hunk(line)

        self._block_len += 1
        self._remaining_hunk_lines -= 1

        return self.skip_block


@tool_required("diff")
def run_diff(fifo1, fifo2, end_nl_q1, end_nl_q2):
    cmd = [
        get_tool_name("diff"),
        "-a",
        "-U" + str(Config().diff_context),
        fifo1,
        fifo2,
    ]

    logger.debug("Running %s", " ".join(cmd))

    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    parser = DiffParser(p.stdout, end_nl_q1, end_nl_q2)
    parser.parse()

    logger.debug(
        "%s: returncode %d, parsed %s",
        " ".join(cmd),
        p.returncode,
        parser.success,
    )

    if not parser.success and p.returncode not in (0, 1):
        raise subprocess.CalledProcessError(
            p.returncode, cmd, output=parser.diff.encode("utf-8")
        )

    if p.returncode == 0:
        return None

    return parser.diff


class FIFOFeeder(threading.Thread):
    def __init__(self, feeder, fifo_path, end_nl_q=None, daemon=True, *args):
        os.mkfifo(fifo_path)
        super().__init__(daemon=daemon)
        self.feeder = feeder
        self.fifo_path = fifo_path
        self.end_nl_q = Queue() if end_nl_q is None else end_nl_q
        self._exception = None
        self._want_join = threading.Event()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        os.remove(self.fifo_path)
        self.join()

    def run(self):
        try:
            # Try to open the FIFO nonblocking, so we can periodically check
            # if the main thread wants us to wind down.  If it does, there's no
            # more need for the FIFO, so stop the thread.
            while True:
                try:
                    fd = os.open(self.fifo_path, os.O_WRONLY | os.O_NONBLOCK)
                except OSError as e:
                    if e.errno != errno.ENXIO:
                        raise
                    if self._want_join.is_set():
                        return
                else:
                    break

            # Now clear the fd's nonblocking flag to let writes block normally.
            fcntl.fcntl(fd, fcntl.F_SETFL, 0)

            with open(fd, "wb") as fifo:
                # The queue works around a unified diff limitation: if there's
                # no newlines in both don't make it a difference
                end_nl = self.feeder(fifo)
                self.end_nl_q.put(end_nl)
        except Exception as error:
            self._exception = error

    def join(self):
        self._want_join.set()
        super().join()
        if self._exception is not None:
            raise self._exception


class _Feeder:
    """
    A 'feeder' is a specialized writer.

    A 'feeder' is a callable that takes as argument a writeable file, and
    writes to it.  Feeders can transform the written data, truncate it,
    checksum it, and so on.  The callable must return True to represent that
    the data had a terminating newline, and False otherwise.

    Feeders are created by the functions make_feeder_from_raw_reader() and
    empty_file_feeder().  The returned objects are closures, and are not
    (currently?) instances of any particular class.
    """


def empty_file_feeder():
    """
    Returns a feeder that simulates an empty file.

    See _Feeder for feeders.
    """

    def feeder(f):
        return False

    return feeder


def make_feeder_from_raw_reader(in_file, filterfn=None):
    """
    Create a feeder that checksums, truncates, and transcodes the data.  The
    optional argument filterfn is a callable that gets passed each line, and
    returns the line that should be used in its stead.  (There is no facility
    for filterfn to discard a line entirely.)

    See _Feeder for feeders.
    """

    def feeder(out_file):
        h = None
        end_nl = False
        max_lines = Config().max_diff_input_lines
        line_count = 0

        if max_lines < float("inf"):
            h = hashlib.sha256()

        for buf in in_file:
            line_count += 1
            out = filterfn(buf) if filterfn else buf
            if h:
                h.update(out)
            if line_count < max_lines:
                out_file.write(out)
            end_nl = buf[-1] == "\n"

        if h and line_count >= max_lines:
            out_file.write(
                "[ Too much input for diff (SHA: {}) ]\n".format(
                    h.hexdigest()
                ).encode("utf-8")
            )
            end_nl = True

        return end_nl

    return feeder


def diff(feeder1, feeder2):
    with get_temporary_directory() as tmpdir:
        fifo1_path = os.path.join(tmpdir, "fifo1")
        fifo2_path = os.path.join(tmpdir, "fifo2")
        with FIFOFeeder(feeder1, fifo1_path) as fifo1, FIFOFeeder(
            feeder2, fifo2_path
        ) as fifo2:
            return run_diff(
                fifo1_path, fifo2_path, fifo1.end_nl_q, fifo2.end_nl_q
            )


def diff_split_lines(diff, keepends=True):
    lines = diff.split("\n")
    if not keepends:
        return lines
    return [line + "\n" for line in lines[:-1]] + (
        [lines[-1]] if lines[-1] else []
    )


def reverse_unified_diff(diff):
    assert isinstance(diff, str)
    res = []
    # XXX - should move to just use plain bytes nearly everywhere until ready
    # to print instead of using strings internally as well.
    for line in diff_split_lines(diff):
        line = line.encode("utf-8", errors="replace")
        assert isinstance(line, bytes)
        found = DiffParser.RANGE_RE.match(line)

        if found:
            before = found.group("start2")
            if found.group("len2") is not None:
                before += b"," + found.group("len2")

            after = found.group("start1")
            if found.group("len1") is not None:
                after += b"," + found.group("len1")

            res.append(b"@@ -%s +%s @@\n" % (before, after))
        elif line.startswith(b"-"):
            res.append(b"+")
            res.append(line[1:])
        elif line.startswith(b"+"):
            res.append(b"-")
            res.append(line[1:])
        else:
            res.append(line)
    return b"".join(res).decode("utf-8", errors="replace")


def color_unified_diff(diff):
    RESET = "\033[0m"
    RED, GREEN, CYAN = "\033[31m", "\033[32m", "\033[0;36m"

    def repl(m):
        return "{}{}{}".format(
            {"-": RED, "@": CYAN, "+": GREEN}[m.group(1)], m.group(0), RESET
        )

    return re_diff_change.sub(repl, diff)


DIFFON = "\x01"
DIFFOFF = "\x02"
MAX_WAGNER_FISCHER_SIZE = (
    256  # any higher, and linediff takes >1 second and >200MB RAM
)


def _linediff_sane(x):
    # turn non-printable chars into "."
    return "." if ord(x) < 32 and x not in "\t\n" else x


def diffinput_truncate(s, sz):
    # Truncate, preserving uniqueness
    if len(s) > sz:
        s = s[
            :sz
        ] + "[ ... truncated by diffoscope; len: {}, SHA: {} ... ]".format(
            len(s[sz:]), hashlib.sha256(s[sz:].encode("utf-8")).hexdigest()
        )
    return s


def linediff(s, t, diffon, diffoff):
    # calculate common prefix/suffix, easy optimisation for Wagner-Fischer
    prefix = os.path.commonprefix((s, t))
    if prefix:
        s = s[len(prefix) :]
        t = t[len(prefix) :]
    suffix = os.path.commonprefix((s[::-1], t[::-1]))[::-1]
    if suffix:
        s = s[: -len(suffix)]
        t = t[: -len(suffix)]

    # truncate so Wagner-Fischer doesn't blow up RAM
    s = diffinput_truncate(s, MAX_WAGNER_FISCHER_SIZE)
    t = diffinput_truncate(t, MAX_WAGNER_FISCHER_SIZE)
    l1, l2 = zip(*linediff_simplify(linediff_wagnerfischer(s, t)))

    def to_string(k, v):
        sanev = "".join(_linediff_sane(c) for c in v)
        return (diffon + sanev + diffoff) if k else sanev

    s1 = "".join(to_string(*p) for p in l1)
    t1 = "".join(to_string(*p) for p in l2)

    return f"{prefix}{s1}{suffix}", f"{prefix}{t1}{suffix}"


def linediff_wagnerfischer(s, t):
    """
    Line diff algorithm, originally from diff2html.

    https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm

    Finds the minimum (levenshtein) edit distance between two strings, but has
    quadratic performance O(m*n).
    """
    m, n = len(s), len(t)
    d = [[(0, 0) for i in range(n + 1)] for i in range(m + 1)]

    d[0][0] = (0, (0, 0))
    for i in range(1, m + 1):
        d[i][0] = (i, (i - 1, 0))
    for j in range(1, n + 1):
        d[0][j] = (j, (0, j - 1))

    # NB. This loop is O(len(s) * len(t))
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if s[i - 1] == t[j - 1]:
                cost = 0
            else:
                cost = 1
            d[i][j] = min(
                (d[i - 1][j][0] + 1, (i - 1, j)),
                (d[i][j - 1][0] + 1, (i, j - 1)),
                (d[i - 1][j - 1][0] + cost, (i - 1, j - 1)),
            )

    coords = []
    coord = (m, n)
    while coord != (0, 0):
        coords.insert(0, coord)
        x, y = coord
        coord = d[x][y][1]

    for coord in coords:
        cx, cy = coord
        child_val = d[cx][cy][0]

        father_coord = d[cx][cy][1]
        fx, fy = father_coord
        father_val = d[fx][fy][0]

        diff = (cx - fx, cy - fy)

        if diff == (0, 1):
            yield (False, ""), (True, t[fy])
        elif diff == (1, 0):
            yield (True, s[fx]), (False, "")
        elif child_val - father_val == 1:
            yield (True, s[fx]), (True, t[fy])
        else:
            assert s[fx] == t[fy]
            yield (False, s[fx]), (False, t[fy])


def linediff_simplify(g):
    """Simplify the output of Wagner-Fischer."""
    current = None
    for l, r in g:
        if not current:
            current = l, r
        elif current[0][0] == l[0] and current[1][0] == r[0]:
            current = (
                (l[0], current[0][1] + l[1]),
                (r[0], current[1][1] + r[1]),
            )
        else:
            yield current
            current = l, r
    if current:
        yield current


class SideBySideDiff:
    """Calculates a side-by-side diff from a unified diff."""

    def __init__(self, unified_diff, diffon=DIFFON, diffoff=DIFFOFF):
        self.unified_diff = unified_diff
        self.diffon = diffon
        self.diffoff = diffoff
        self.reset()

    def reset(self):
        self.differ = Differ()
        self.buf = []
        self.add_cpt = 0
        self.del_cpt = 0
        self.line1 = 0
        self.line2 = 0
        self.hunk_off1 = 0
        self.hunk_size1 = 0
        self.hunk_off2 = 0
        self.hunk_size2 = 0
        self._bytes_processed = 0

    @property
    def bytes_processed(self):
        return self._bytes_processed

    def match_lines(self, l0, l1):
        """
        Given two lists of lines, use python's difflib to make the diff a bit
        "smarter", as it better detects added lines (see !64)
        """

        if len(l0) + len(l1) > 750:
            # difflib.Differ.compare is at least O(n^2), so don't call it if
            # our inputs are too large.
            yield "C", "Diff chunk too large, falling back to line-by-line diff ({} lines added, {} lines removed)".format(
                self.add_cpt, self.del_cpt
            )
            for line0, line1 in itertools.zip_longest(l0, l1, fillvalue=""):
                yield from self.yield_line(line0, line1)
            return

        saved_line = None
        for line in self.differ.compare(l0, l1):
            # Skip lines with details as they don't matter to us
            if line.startswith("? "):
                continue

            # When we encounter a +, it may either be because a line was
            # added, or because a line was modified (in that case,
            # save_line will not be None)
            if line.startswith("+ "):
                yield from self.yield_line(saved_line or "", line[2:])
                saved_line = None
                continue

            # If saved_line is not None, we need to yield from it before
            # moving on
            if saved_line is not None:
                yield from self.yield_line(saved_line, "")
                saved_line = None

            # Handle removed and unchanged lines
            if line.startswith("- "):
                saved_line = line[2:]
            elif line.startswith("  "):
                line = line[2:]
                yield from self.yield_line(line, line)

        # Make sure we don't forget a line
        if saved_line is not None:
            yield from self.yield_line(saved_line, "")

    def empty_buffer(self):
        if self.del_cpt == 0 or self.add_cpt == 0:
            # All lines are unchanged lines
            for l in self.buf:
                yield from self.yield_line(l[0], l[1])
        elif self.del_cpt != 0 and self.add_cpt != 0:
            l0 = [pair[0] for pair in self.buf if pair[0] is not None]
            l1 = [pair[1] for pair in self.buf if pair[1] is not None]
            yield from self.match_lines(l0, l1)

    def yield_line(self, s1, s2):
        orig1 = s1
        orig2 = s2

        if s1 is None and s2 is None:
            type_name = "unmodified"
        elif s1 == "" and s2 == "":
            type_name = "unmodified"
        elif s1 is None or s1 == "":
            type_name = "added"
        elif s2 is None or s2 == "":
            type_name = "deleted"
        elif (
            orig1 == orig2
            and not s1.endswith("lines removed ]")
            and not s2.endswith("lines removed ]")
        ):
            type_name = "unmodified"
        else:
            type_name = "changed"
            with profile("diff", "linediff"):
                s1, s2 = linediff(s1, s2, self.diffon, self.diffoff)

        yield "L", (type_name, s1, self.line1, s2, self.line2)

        m = orig1 and re.match(r"^\[ (\d+) lines removed \]$", orig1)
        if m:
            self.line1 += int(m.group(1))
        elif orig1:
            self.line1 += 1
        m = orig2 and re.match(r"^\[ (\d+) lines removed \]$", orig2)
        if m:
            self.line2 += int(m.group(1))
        elif orig2:
            self.line2 += 1

        self.add_cpt = 0
        self.del_cpt = 0
        self.buf = []

    def items(self):
        """Yield the items that form the side-by-side diff.

        Each item is a (type, value) tuple, as follows:

        type == "H", value is a tuple representing a hunk header
            hunk_offset1, hunk_size1, hunk_offset2, hunk_size2 = value
            all ints

        type == "L", value is a tuple representing a line of a hunk
            mode, line1, lineno1, line2, lineno2 = value
            where mode is one of {"unmodified", "added", "deleted", "changed"}
            line* are strings
            lineno* are ints

        type == "C", value is a comment
            comment = value
            a string
        """
        self.reset()

        for l in diff_split_lines(self.unified_diff, False):
            self._bytes_processed += len(l) + 1
            m = re.match(r"^--- ([^\s]*)", l)
            if m:
                yield from self.empty_buffer()
                continue
            m = re.match(r"^\+\+\+ ([^\s]*)", l)
            if m:
                yield from self.empty_buffer()
                continue

            m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*)", l)
            if m:
                yield from self.empty_buffer()
                hunk_data = map(lambda x: x == "" and 1 or int(x), m.groups())
                (
                    self.hunk_off1,
                    self.hunk_size1,
                    self.hunk_off2,
                    self.hunk_size2,
                ) = hunk_data
                self.line1, self.line2 = self.hunk_off1, self.hunk_off2
                yield "H", (
                    self.hunk_off1,
                    self.hunk_size1,
                    self.hunk_off2,
                    self.hunk_size2,
                )
                continue

            if re.match(r"^\[", l):
                yield from self.empty_buffer()
                yield "C", l

            if re.match(r"^\\ No newline", l):
                if self.hunk_size2 == 0:
                    self.buf[-1] = (
                        self.buf[-1][0],
                        self.buf[-1][1] + "\n" + l[2:],
                    )
                else:
                    self.buf[-1] = (
                        self.buf[-1][0] + "\n" + l[2:],
                        self.buf[-1][1],
                    )
                continue

            if self.hunk_size1 <= 0 and self.hunk_size2 <= 0:
                yield from self.empty_buffer()
                continue

            m = re.match(r"^\+\[ (\d+) lines removed \]$", l)
            if m:
                self.add_cpt += int(m.group(1))
                self.hunk_size2 -= int(m.group(1))
                self.buf.append((None, l[1:]))
                continue

            if re.match(r"^\+", l):
                self.add_cpt += 1
                self.hunk_size2 -= 1
                self.buf.append((None, l[1:]))
                continue

            m = re.match(r"^-\[ (\d+) lines removed \]$", l)
            if m:
                self.del_cpt += int(m.group(1))
                self.hunk_size1 -= int(m.group(1))
                self.buf.append((l[1:], None))
                continue

            if re.match(r"^-", l):
                self.del_cpt += 1
                self.hunk_size1 -= 1
                self.buf.append((l[1:], None))
                continue

            if re.match(r"^ ", l) and self.hunk_size1 and self.hunk_size2:
                yield from self.empty_buffer()
                self.hunk_size1 -= 1
                self.hunk_size2 -= 1
                self.buf.append((l[1:], l[1:]))
                continue

            yield from self.empty_buffer()

        yield from self.empty_buffer()