#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2014-2015 Jérémy Bobbio <lunar@debian.org>
# Copyright © 2017-2021 Chris Lamb <lamby@debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.

import re
import io
import os
import errno
import fcntl
import hashlib
import logging
import itertools
import threading
import subprocess

from difflib import Differ
from multiprocessing.dummy import Queue

from .tools import get_tool_name, tool_required
from .config import Config
from .profiling import profile
from .tempfiles import get_temporary_directory

DIFF_CHUNK = 4096

logger = logging.getLogger(__name__)

# Matches the first character of added/removed/hunk-header lines, used by
# color_unified_diff() to pick an ANSI colour.
re_diff_change = re.compile(r"^([+-@]).*", re.MULTILINE)


class DiffParser:
    """
    Incremental parser for the output of `diff -u`.

    Feeds each line through a small state machine (read_headers ->
    read_hunk -> skip_block) and accumulates a possibly-truncated copy of
    the diff in an internal buffer: runs of added/removed lines longer than
    Config().max_diff_block_lines_saved are collapsed into a
    "[ N lines removed ]" marker.
    """

    RANGE_RE = re.compile(
        # example: '@@ -26814,9 +26814,8 @@'
        rb"""
          ^
            @@\s+
            -
            (?P<start1>\d+)(,(?P<len1>\d+))?
          \s+
            \+
            (?P<start2>\d+)(,(?P<len2>\d+))?
          \s+@@
          $
        """,
        re.VERBOSE,
    )

    def __init__(self, output, end_nl_q1, end_nl_q2):
        # output: raw bytes produced by diff(1).
        # end_nl_q1/end_nl_q2: queues that will eventually hold a boolean
        # telling whether each input file ended with a newline (filled by
        # the corresponding feeder thread).
        self._output = output
        self._end_nl_q1 = end_nl_q1
        self._end_nl_q2 = end_nl_q2
        self._action = self.read_headers
        self._diff = io.BytesIO()
        self._success = False
        self._remaining_hunk_lines = None
        self._block_len = None
        self._direction = None
        self._end_nl = None
        self._max_lines = Config().max_diff_block_lines_saved

    @property
    def diff(self):
        return self._diff.getvalue().decode("UTF-8", errors="replace")

    @property
    def success(self):
        return self._success

    def parse(self):
        # Each state handler returns the next handler (or None at EOF).
        for line in self._output.split(b"\n"):
            self._action = self._action(line)

        self._action = None
        self._success = True

    def read_headers(self, line):
        """Consume '---'/'+++' header lines until a '@@' hunk header."""
        if not line:
            return None

        if line.startswith(b"---"):
            return self.read_headers

        if line.startswith(b"+++"):
            return self.read_headers

        found = DiffParser.RANGE_RE.match(line)

        if not found:
            raise ValueError(f"Unable to parse diff headers: {line!r}")

        self._diff.write(line + b"\n")
        # The hunk is exhausted once len1 + len2 lines have been consumed
        # (a missing length in the @@ header means 1).
        if found.group("len1"):
            self._remaining_hunk_lines = int(found.group("len1"))
        else:
            self._remaining_hunk_lines = 1
        if found.group("len2"):
            self._remaining_hunk_lines += int(found.group("len2"))
        else:
            self._remaining_hunk_lines += 1

        self._direction = None

        return self.read_hunk

    def read_hunk(self, line):
        """Consume one hunk body line, tracking run lengths of +/- lines."""
        if not line:
            return None

        if line[:1] == b" ":
            # A context line counts against both sides of the hunk.
            self._remaining_hunk_lines -= 2
        elif line[:1] == b"+":
            self._remaining_hunk_lines -= 1
        elif line[:1] == b"-":
            self._remaining_hunk_lines -= 1
        elif line[:1] == b"\\":
            # When both files don't end with \n, do not show it as a difference
            if self._end_nl is None:
                end_nl1 = self._end_nl_q1.get()
                end_nl2 = self._end_nl_q2.get()
                self._end_nl = end_nl1 and end_nl2
            if not self._end_nl:
                return self.read_hunk
        elif self._remaining_hunk_lines == 0:
            # Hunk finished; this line must be the next hunk's header.
            return self.read_headers(line)
        else:
            raise ValueError(f"Unable to parse diff hunk: {line!r}")

        self._diff.write(line + b"\n")

        if line[:1] in (b"-", b"+"):
            if line[:1] == self._direction:
                self._block_len += 1
            else:
                self._block_len = 1
                self._direction = line[:1]

            # Too many consecutive lines in the same direction: start
            # skipping them instead of saving them.
            if self._block_len >= self._max_lines:
                return self.skip_block
        else:
            self._block_len = 1
            self._direction = line[:1]

        return self.read_hunk

    def skip_block(self, line):
        """Swallow the rest of an over-long +/- run, then note how many
        lines were dropped."""
        if self._remaining_hunk_lines == 0 or line[:1] != self._direction:
            removed = self._block_len - Config().max_diff_block_lines_saved
            if removed:
                self._diff.write(
                    b"%s[ %d lines removed ]\n" % (self._direction, removed)
                )
            return self.read_hunk(line)

        self._block_len += 1
        self._remaining_hunk_lines -= 1

        return self.skip_block


@tool_required("diff")
def run_diff(fifo1, fifo2, end_nl_q1, end_nl_q2):
    """
    Run diff(1) over the two named FIFOs and parse its output.

    Returns None when the files are identical (diff exited 0), otherwise
    the (possibly truncated) unified diff as a str.  Raises
    subprocess.CalledProcessError when diff fails and the output could not
    be parsed.
    """
    cmd = [
        get_tool_name("diff"),
        "-a",
        "-U" + str(Config().diff_context),
        fifo1,
        fifo2,
    ]

    logger.debug("Running %s", " ".join(cmd))

    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    parser = DiffParser(p.stdout, end_nl_q1, end_nl_q2)
    parser.parse()

    logger.debug(
        "%s: returncode %d, parsed %s",
        " ".join(cmd),
        p.returncode,
        parser.success,
    )

    # diff exits 0 for "same", 1 for "different"; anything else is an error
    # unless we managed to parse the output anyway.
    if not parser.success and p.returncode not in (0, 1):
        raise subprocess.CalledProcessError(
            p.returncode, cmd, output=parser.diff.encode("utf-8")
        )

    if p.returncode == 0:
        return None

    return parser.diff


class FIFOFeeder(threading.Thread):
    """
    Thread that creates a FIFO and writes a feeder's output into it.

    Used as a context manager: the FIFO is created in __init__, the thread
    started by __enter__, and the FIFO removed (and any feeder exception
    re-raised) on __exit__.
    """

    def __init__(self, feeder, fifo_path, end_nl_q=None, daemon=True, *args):
        os.mkfifo(fifo_path)
        super().__init__(daemon=daemon)
        self.feeder = feeder
        self.fifo_path = fifo_path
        # Receives the feeder's "ended with newline" boolean.
        self.end_nl_q = Queue() if end_nl_q is None else end_nl_q
        self._exception = None
        self._want_join = threading.Event()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        os.remove(self.fifo_path)
        self.join()

    def run(self):
        try:
            # Try to open the FIFO nonblocking, so we can periodically check
            # if the main thread wants us to wind down.  If it does, there's no
            # more need for the FIFO, so stop the thread.
            while True:
                try:
                    fd = os.open(self.fifo_path, os.O_WRONLY | os.O_NONBLOCK)
                except OSError as e:
                    if e.errno != errno.ENXIO:
                        raise
                    if self._want_join.is_set():
                        return
                else:
                    break

            # Now clear the fd's nonblocking flag to let writes block normally.
            fcntl.fcntl(fd, fcntl.F_SETFL, 0)

            with open(fd, "wb") as fifo:
                # The queue works around a unified diff limitation: if there's
                # no newlines in both don't make it a difference
                end_nl = self.feeder(fifo)
                self.end_nl_q.put(end_nl)
        except Exception as error:
            # Stash the exception; join() re-raises it in the caller's thread.
            self._exception = error

    def join(self):
        self._want_join.set()
        super().join()
        if self._exception is not None:
            raise self._exception


class _Feeder:
    """
    A 'feeder' is a specialized writer.

    A 'feeder' is a callable that takes as argument a writeable file, and
    writes to it.  Feeders can transform the written data, truncate it,
    checksum it, and so on.  The callable must return True to represent that
    the data had a terminating newline, and False otherwise.

    Feeders are created by the functions make_feeder_from_raw_reader() and
    empty_file_feeder().  The returned objects are closures, and are not
    (currently?) instances of any particular class.
    """


def empty_file_feeder():
    """
    Returns a feeder that simulates an empty file.

    See _Feeder for feeders.
    """

    def feeder(f):
        return False

    return feeder


def make_feeder_from_raw_reader(in_file, filterfn=None):
    """
    Create a feeder that checksums, truncates, and transcodes the data.  The
    optional argument filterfn is a callable that gets passed each line, and
    returns the line that should be used in its stead.  (There is no facility
    for filterfn to discard a line entirely.)

    See _Feeder for feeders.
    """

    def feeder(out_file):
        h = None
        end_nl = False
        max_lines = Config().max_diff_input_lines
        line_count = 0

        # Only bother hashing when truncation is actually possible.
        if max_lines < float("inf"):
            h = hashlib.sha256()

        for buf in in_file:
            line_count += 1
            out = filterfn(buf) if filterfn else buf
            if h:
                h.update(out)
            if line_count < max_lines:
                out_file.write(out)
            # NOTE(review): if in_file yields bytes, buf[-1] is an int and
            # this comparison is always False — verify in_file yields str
            # here, or the "\n" should be b"\n"[0].
            end_nl = buf[-1] == "\n"

        if h and line_count >= max_lines:
            out_file.write(
                "[ Too much input for diff (SHA: {}) ]\n".format(
                    h.hexdigest()
                ).encode("utf-8")
            )
            end_nl = True

        return end_nl

    return feeder


def diff(feeder1, feeder2):
    """Diff the output of two feeders via FIFOs; see run_diff()."""
    with get_temporary_directory() as tmpdir:
        fifo1_path = os.path.join(tmpdir, "fifo1")
        fifo2_path = os.path.join(tmpdir, "fifo2")
        with FIFOFeeder(feeder1, fifo1_path) as fifo1, FIFOFeeder(
            feeder2, fifo2_path
        ) as fifo2:
            return run_diff(
                fifo1_path, fifo2_path, fifo1.end_nl_q, fifo2.end_nl_q
            )


def diff_split_lines(diff, keepends=True):
    """Split a diff into lines, optionally keeping the trailing newlines
    (a trailing empty fragment is dropped when keepends is true)."""
    lines = diff.split("\n")
    if not keepends:
        return lines
    return [line + "\n" for line in lines[:-1]] + (
        [lines[-1]] if lines[-1] else []
    )


def reverse_unified_diff(diff):
    """Swap the two sides of a unified diff (+ becomes -, hunk ranges
    exchanged), returning the reversed diff as a str."""
    assert isinstance(diff, str)
    res = []
    # XXX - should move to just use plain bytes nearly everywhere until ready
    # to print instead of using strings internally as well.
    for line in diff_split_lines(diff):
        line = line.encode("utf-8", errors="replace")
        assert isinstance(line, bytes)
        found = DiffParser.RANGE_RE.match(line)

        if found:
            before = found.group("start2")
            if found.group("len2") is not None:
                before += b"," + found.group("len2")

            after = found.group("start1")
            if found.group("len1") is not None:
                after += b"," + found.group("len1")

            res.append(b"@@ -%s +%s @@\n" % (before, after))
        elif line.startswith(b"-"):
            res.append(b"+")
            res.append(line[1:])
        elif line.startswith(b"+"):
            res.append(b"-")
            res.append(line[1:])
        else:
            res.append(line)
    return b"".join(res).decode("utf-8", errors="replace")


def color_unified_diff(diff):
    """Wrap removed/added/hunk-header lines of a unified diff in ANSI
    colour escapes (red/green/cyan)."""
    RESET = "\033[0m"
    RED, GREEN, CYAN = "\033[31m", "\033[32m", "\033[0;36m"

    def repl(m):
        return "{}{}{}".format(
            {"-": RED, "@": CYAN, "+": GREEN}[m.group(1)], m.group(0), RESET
        )

    return re_diff_change.sub(repl, diff)


# In-band markers bracketing the changed portion of a line in linediff().
DIFFON = "\x01"
DIFFOFF = "\x02"
MAX_WAGNER_FISCHER_SIZE = (
    256  # any higher, and linediff takes >1 second and >200MB RAM
)


def _linediff_sane(x):
    # turn non-printable chars into "."
    return "." if ord(x) < 32 and x not in "\t\n" else x


def diffinput_truncate(s, sz):
    # Truncate, preserving uniqueness
    if len(s) > sz:
        s = s[
            :sz
        ] + "[ ... truncated by diffoscope; len: {}, SHA: {} ... ]".format(
            len(s[sz:]), hashlib.sha256(s[sz:].encode("utf-8")).hexdigest()
        )
    return s


def linediff(s, t, diffon, diffoff):
    """
    Intra-line diff: return (s', t') where the differing portions of the
    two strings are bracketed by diffon/diffoff markers.
    """
    # calculate common prefix/suffix, easy optimisation for Wagner-Fischer
    prefix = os.path.commonprefix((s, t))
    if prefix:
        s = s[len(prefix) :]
        t = t[len(prefix) :]
    suffix = os.path.commonprefix((s[::-1], t[::-1]))[::-1]
    if suffix:
        s = s[: -len(suffix)]
        t = t[: -len(suffix)]

    # truncate so Wagner-Fischer doesn't blow up RAM
    s = diffinput_truncate(s, MAX_WAGNER_FISCHER_SIZE)
    t = diffinput_truncate(t, MAX_WAGNER_FISCHER_SIZE)
    l1, l2 = zip(*linediff_simplify(linediff_wagnerfischer(s, t)))

    def to_string(k, v):
        sanev = "".join(_linediff_sane(c) for c in v)
        return (diffon + sanev + diffoff) if k else sanev

    s1 = "".join(to_string(*p) for p in l1)
    t1 = "".join(to_string(*p) for p in l2)

    return f"{prefix}{s1}{suffix}", f"{prefix}{t1}{suffix}"


def linediff_wagnerfischer(s, t):
    """
    Line diff algorithm, originally from diff2html.

    https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm

    Finds the minimum (levenshtein) edit distance between two strings, but has
    quadratic performance O(m*n).
    """
    m, n = len(s), len(t)
    # d[i][j] = (edit distance, coordinates of the predecessor cell)
    d = [[(0, 0) for i in range(n + 1)] for i in range(m + 1)]

    d[0][0] = (0, (0, 0))
    for i in range(1, m + 1):
        d[i][0] = (i, (i - 1, 0))
    for j in range(1, n + 1):
        d[0][j] = (j, (0, j - 1))

    # NB. This loop is O(len(s) * len(t))
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if s[i - 1] == t[j - 1]:
                cost = 0
            else:
                cost = 1
            d[i][j] = min(
                (d[i - 1][j][0] + 1, (i - 1, j)),
                (d[i][j - 1][0] + 1, (i, j - 1)),
                (d[i - 1][j - 1][0] + cost, (i - 1, j - 1)),
            )

    # Walk the predecessor chain back from (m, n) to recover the edit path.
    coords = []
    coord = (m, n)
    while coord != (0, 0):
        coords.insert(0, coord)
        x, y = coord
        coord = d[x][y][1]

    for coord in coords:
        cx, cy = coord
        child_val = d[cx][cy][0]

        father_coord = d[cx][cy][1]
        fx, fy = father_coord
        father_val = d[fx][fy][0]

        diff = (cx - fx, cy - fy)

        if diff == (0, 1):
            # insertion into t
            yield (False, ""), (True, t[fy])
        elif diff == (1, 0):
            # deletion from s
            yield (True, s[fx]), (False, "")
        elif child_val - father_val == 1:
            # substitution
            yield (True, s[fx]), (True, t[fy])
        else:
            assert s[fx] == t[fy]
            yield (False, s[fx]), (False, t[fy])


def linediff_simplify(g):
    """Simplify the output of Wagner-Fischer by merging consecutive
    fragments with the same changed/unchanged flags."""
    current = None
    for l, r in g:
        if not current:
            current = l, r
        elif current[0][0] == l[0] and current[1][0] == r[0]:
            current = (
                (l[0], current[0][1] + l[1]),
                (r[0], current[1][1] + r[1]),
            )
        else:
            yield current
            current = l, r
    if current:
        yield current


class SideBySideDiff:
    """Calculates a side-by-side diff from a unified diff."""

    def __init__(self, unified_diff, diffon=DIFFON, diffoff=DIFFOFF):
        self.unified_diff = unified_diff
        self.diffon = diffon
        self.diffoff = diffoff
        self.reset()

    def reset(self):
        self.differ = Differ()
        # buf holds (left, right) line pairs of the current run; *_cpt count
        # added/deleted lines in it; line1/line2 are current line numbers;
        # hunk_* track the current @@ header.
        self.buf = []
        self.add_cpt = 0
        self.del_cpt = 0
        self.line1 = 0
        self.line2 = 0
        self.hunk_off1 = 0
        self.hunk_size1 = 0
        self.hunk_off2 = 0
        self.hunk_size2 = 0
        self._bytes_processed = 0

    @property
    def bytes_processed(self):
        return self._bytes_processed

    def match_lines(self, l0, l1):
        """
        Given two lists of lines, use python's difflib to make the diff a bit
        "smarter", as it better detects added lines (see !64)
        """

        if len(l0) + len(l1) > 750:
            # difflib.Differ.compare is at least O(n^2), so don't call it if
            # our inputs are too large.
            yield "C", "Diff chunk too large, falling back to line-by-line diff ({} lines added, {} lines removed)".format(
                self.add_cpt, self.del_cpt
            )
            for line0, line1 in itertools.zip_longest(l0, l1, fillvalue=""):
                yield from self.yield_line(line0, line1)
            return

        saved_line = None
        for line in self.differ.compare(l0, l1):
            # Skip lines with details as they don't matter to us
            if line.startswith("? "):
                continue

            # When we encounter a +, it may either be because a line was
            # added, or because a line was modified (in that case,
            # save_line will not be None)
            if line.startswith("+ "):
                yield from self.yield_line(saved_line or "", line[2:])
                saved_line = None
                continue

            # If saved_line is not None, we need to yield from it before
            # moving on
            if saved_line is not None:
                yield from self.yield_line(saved_line, "")
                saved_line = None

            # Handle removed and unchanged lines
            if line.startswith("- "):
                saved_line = line[2:]
            elif line.startswith("  "):
                line = line[2:]
                yield from self.yield_line(line, line)

        # Make sure we don't forget a line
        if saved_line is not None:
            yield from self.yield_line(saved_line, "")

    def empty_buffer(self):
        """Flush the buffered run of lines, matching +/- runs when both
        sides are present."""
        if self.del_cpt == 0 or self.add_cpt == 0:
            # All lines are unchanged lines
            for l in self.buf:
                yield from self.yield_line(l[0], l[1])
        elif self.del_cpt != 0 and self.add_cpt != 0:
            l0 = [pair[0] for pair in self.buf if pair[0] is not None]
            l1 = [pair[1] for pair in self.buf if pair[1] is not None]
            yield from self.match_lines(l0, l1)

    def yield_line(self, s1, s2):
        """Emit one "L" item classifying the (left, right) line pair and
        advancing the line counters."""
        orig1 = s1
        orig2 = s2

        if s1 is None and s2 is None:
            type_name = "unmodified"
        elif s1 == "" and s2 == "":
            type_name = "unmodified"
        elif s1 is None or s1 == "":
            type_name = "added"
        elif s2 is None or s2 == "":
            type_name = "deleted"
        elif (
            orig1 == orig2
            and not s1.endswith("lines removed ]")
            and not s2.endswith("lines removed ]")
        ):
            type_name = "unmodified"
        else:
            type_name = "changed"
            with profile("diff", "linediff"):
                s1, s2 = linediff(s1, s2, self.diffon, self.diffoff)

        yield "L", (type_name, s1, self.line1, s2, self.line2)

        # A "[ N lines removed ]" marker stands for N real lines.
        m = orig1 and re.match(r"^\[ (\d+) lines removed \]$", orig1)
        if m:
            self.line1 += int(m.group(1))
        elif orig1:
            self.line1 += 1
        m = orig2 and re.match(r"^\[ (\d+) lines removed \]$", orig2)
        if m:
            self.line2 += int(m.group(1))
        elif orig2:
            self.line2 += 1

        self.add_cpt = 0
        self.del_cpt = 0
        self.buf = []

    def items(self):
        """Yield the items that form the side-by-side diff.

        Each item is a (type, value) tuple, as follows:

        type == "H", value is a tuple representing a hunk header
            hunk_offset1, hunk_size1, hunk_offset2, hunk_size2 = value
            all ints

        type == "L", value is a tuple representing a line of a hunk
            mode, line1, lineno1, line2, lineno2 = value
            where mode is one of {"unmodified", "added", "deleted", "changed"}
            line* are strings
            lineno* are ints

        type == "C", value is a comment
            comment = value
            a string
        """
        self.reset()

        for l in diff_split_lines(self.unified_diff, False):
            self._bytes_processed += len(l) + 1
            m = re.match(r"^--- ([^\s]*)", l)
            if m:
                yield from self.empty_buffer()
                continue
            m = re.match(r"^\+\+\+ ([^\s]*)", l)
            if m:
                yield from self.empty_buffer()
                continue

            m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*)", l)
            if m:
                yield from self.empty_buffer()
                hunk_data = map(lambda x: x == "" and 1 or int(x), m.groups())
                (
                    self.hunk_off1,
                    self.hunk_size1,
                    self.hunk_off2,
                    self.hunk_size2,
                ) = hunk_data
                self.line1, self.line2 = self.hunk_off1, self.hunk_off2
                yield "H", (
                    self.hunk_off1,
                    self.hunk_size1,
                    self.hunk_off2,
                    self.hunk_size2,
                )
                continue

            if re.match(r"^\[", l):
                yield from self.empty_buffer()
                yield "C", l

            if re.match(r"^\\ No newline", l):
                if self.hunk_size2 == 0:
                    self.buf[-1] = (
                        self.buf[-1][0],
                        self.buf[-1][1] + "\n" + l[2:],
                    )
                else:
                    self.buf[-1] = (
                        self.buf[-1][0] + "\n" + l[2:],
                        self.buf[-1][1],
                    )
                continue

            if self.hunk_size1 <= 0 and self.hunk_size2 <= 0:
                yield from self.empty_buffer()
                continue

            m = re.match(r"^\+\[ (\d+) lines removed \]$", l)
            if m:
                self.add_cpt += int(m.group(1))
                self.hunk_size2 -= int(m.group(1))
                self.buf.append((None, l[1:]))
                continue

            if re.match(r"^\+", l):
                self.add_cpt += 1
                self.hunk_size2 -= 1
                self.buf.append((None, l[1:]))
                continue

            m = re.match(r"^-\[ (\d+) lines removed \]$", l)
            if m:
                self.del_cpt += int(m.group(1))
                self.hunk_size1 -= int(m.group(1))
                self.buf.append((l[1:], None))
                continue

            if re.match(r"^-", l):
                self.del_cpt += 1
                self.hunk_size1 -= 1
                self.buf.append((l[1:], None))
                continue

            if re.match(r"^ ", l) and self.hunk_size1 and self.hunk_size2:
                yield from self.empty_buffer()
                self.hunk_size1 -= 1
                self.hunk_size2 -= 1
                self.buf.append((l[1:], l[1:]))
                continue

            yield from self.empty_buffer()

        yield from self.empty_buffer()