<?php

/**
 * Parses diffs from a working copy.
 *
 * The parser keeps a cursor (`$line`) into the diff text split into lines
 * (`$text`) and builds a map of @{class:ArcanistDiffChange} objects in
 * `$changes`, keyed by current path where a path is known.
 */
final class ArcanistDiffParser extends Phobject {

  protected $repositoryAPI;
  protected $text;
  protected $line;
  protected $lineSaved;
  protected $isGit;
  protected $isMercurial;
  protected $isRCS;
  protected $detectBinaryFiles = false;
  protected $tryEncoding;
  protected $rawDiff;
  protected $writeDiffOnFailure;

  protected $changes = array();
  private $forcePath;

  /**
   * Set the repository API associated with this parser.
   *
   * @param ArcanistRepositoryAPI Repository API.
   * @return this
   */
  public function setRepositoryAPI(ArcanistRepositoryAPI $repository_api) {
    $this->repositoryAPI = $repository_api;
    return $this;
  }

  /**
   * Enable or disable detection of binary content in hunks. When enabled,
   * @{method:parseChangeset} marks a change as binary if a hunk corpus is not
   * valid UTF-8 (subject to the encoding set with @{method:setTryEncoding}).
   *
   * @param bool True to detect binary files.
   * @return this
   */
  public function setDetectBinaryFiles($detect) {
    $this->detectBinaryFiles = $detect;
    return $this;
  }

  /**
   * Set an encoding to try converting non-UTF-8 hunks from. Only consulted
   * when binary detection is enabled.
   *
   * @param string Encoding name.
   * @return this
   */
  public function setTryEncoding($encoding) {
    $this->tryEncoding = $encoding;
    return $this;
  }

  /**
   * Force @{method:buildChange} to return the change registered under this
   * path whenever it is asked for a path which has no change yet.
   *
   * @param string Path key into the change map.
   * @return this
   */
  public function forcePath($path) {
    $this->forcePath = $path;
    return $this;
  }

  /**
   * Replace the change map with a list of existing changes, keyed by their
   * current path.
   *
   * @param list<ArcanistDiffChange> Changes.
   * @return this
   */
  public function setChanges(array $changes) {
    assert_instances_of($changes, 'ArcanistDiffChange');
    $this->changes = mpull($changes, null, 'getCurrentPath');
    return $this;
  }

  /**
   * Build changes for a set of Subversion working copy paths.
   *
   * Untracked, conflicted and missing paths are ignored. For every remaining
   * path the raw diff is parsed and copy/move information from `svn info` is
   * used to link changes to their origin paths.
   *
   * @param ArcanistSubversionAPI Subversion API for the working copy.
   * @param map<string, int> Map of paths to status flag bitmasks.
   * @return map<string, ArcanistDiffChange> The change map.
   */
  public function parseSubversionDiff(ArcanistSubversionAPI $api, $paths) {
    $this->setRepositoryAPI($api);

    foreach ($paths as $path => $status) {
      if ($status & ArcanistRepositoryAPI::FLAG_UNTRACKED ||
          $status & ArcanistRepositoryAPI::FLAG_CONFLICT ||
          $status & ArcanistRepositoryAPI::FLAG_MISSING) {
        unset($paths[$path]);
      }
    }

    $root = null;
    $from = array();
    foreach ($paths as $path => $status) {
      $change = $this->buildChange($path);

      if ($status & ArcanistRepositoryAPI::FLAG_ADDED) {
        $change->setType(ArcanistDiffChangeType::TYPE_ADD);
      } else if ($status & ArcanistRepositoryAPI::FLAG_DELETED) {
        $change->setType(ArcanistDiffChangeType::TYPE_DELETE);
      } else {
        $change->setType(ArcanistDiffChangeType::TYPE_CHANGE);
      }

      $is_dir = is_dir($api->getPath($path));
      if ($is_dir) {
        $change->setFileType(ArcanistDiffChangeType::FILE_DIRECTORY);
        // We have to go hit the diff even for directories because they may
        // have property changes or moves, etc.
      }
      $is_link = is_link($api->getPath($path));
      if ($is_link) {
        $change->setFileType(ArcanistDiffChangeType::FILE_SYMLINK);
      }

      $diff = $api->getRawDiffText($path);
      if ($diff) {
        $this->parseDiff($diff);
      }

      $info = $api->getSVNInfo($path);
      if (idx($info, 'Copied From URL')) {
        if (!$root) {
          $rinfo = $api->getSVNInfo('.');
          $root = $rinfo['URL'].'/';
        }
        $cpath = $info['Copied From URL'];
        $root_len = strlen($root);
        if (!strncmp($cpath, $root, $root_len)) {
          $cpath = substr($cpath, $root_len);
          // The user can "svn cp /path/to/file@12345 x", which pulls a file out
          // of version history at a specific revision. If we just use the path,
          // we'll collide with possible changes to that path in the working
          // copy below. In particular, "svn cp"-ing a path which no longer
          // exists somewhere in the working copy and then adding that path
          // gets us to the "origin change type" branches below with a
          // TYPE_ADD state on the path. To avoid this, append the origin
          // revision to the path so we'll necessarily generate a new change.
          // TODO: In theory, you could have an '@' in your path and this could
          // cause a collision, e.g. two files named 'f' and 'f@12345'. This is
          // at least somewhat the user's fault, though.

          // Read these keys with idx() so that "svn info" output which lacks
          // them does not raise undefined index warnings.
          $copied_rev = idx($info, 'Copied From Rev');
          if ($copied_rev) {
            if ($copied_rev != idx($info, 'Revision')) {
              $cpath .= '@'.$copied_rev;
            }
          }
          $change->setOldPath($cpath);
          $from[$path] = $cpath;
        }
      }

      $type = $change->getType();
      if (($type === ArcanistDiffChangeType::TYPE_MOVE_AWAY ||
           $type === ArcanistDiffChangeType::TYPE_DELETE) &&
          idx($info, 'Node Kind') === 'directory') {
        $change->setFileType(ArcanistDiffChangeType::FILE_DIRECTORY);
      }
    }

    foreach ($paths as $path => $status) {
      $change = $this->buildChange($path);
      if (empty($from[$path])) {
        continue;
      }

      if (empty($this->changes[$from[$path]])) {
        if ($change->getType() == ArcanistDiffChangeType::TYPE_COPY_HERE) {
          // If the origin path wasn't changed (or isn't included in this diff)
          // and we only copied it, don't generate a changeset for it. This
          // keeps us out of trouble when we go to 'arc commit' and need to
          // figure out which files should be included in the commit list.
          continue;
        }
      }

      $origin = $this->buildChange($from[$path]);
      $origin->addAwayPath($change->getCurrentPath());

      $type = $origin->getType();
      switch ($type) {
        case ArcanistDiffChangeType::TYPE_MULTICOPY:
        case ArcanistDiffChangeType::TYPE_COPY_AWAY:
        // "Add" is possible if you do some bizarre tricks with svn:ignore and
        // "svn copy"'ing URLs straight from the repository; you can end up with
        // a file that is a copy of itself. See T271.
        case ArcanistDiffChangeType::TYPE_ADD:
          break;
        case ArcanistDiffChangeType::TYPE_DELETE:
          $origin->setType(ArcanistDiffChangeType::TYPE_MOVE_AWAY);
          break;
        case ArcanistDiffChangeType::TYPE_MOVE_AWAY:
          $origin->setType(ArcanistDiffChangeType::TYPE_MULTICOPY);
          break;
        case ArcanistDiffChangeType::TYPE_CHANGE:
          $origin->setType(ArcanistDiffChangeType::TYPE_COPY_AWAY);
          break;
        default:
          throw new Exception(pht('Bad origin state %s.', $type));
      }

      $type = $origin->getType();
      switch ($type) {
        case ArcanistDiffChangeType::TYPE_MULTICOPY:
        case ArcanistDiffChangeType::TYPE_MOVE_AWAY:
          $change->setType(ArcanistDiffChangeType::TYPE_MOVE_HERE);
          break;
        case ArcanistDiffChangeType::TYPE_ADD:
        case ArcanistDiffChangeType::TYPE_COPY_AWAY:
          $change->setType(ArcanistDiffChangeType::TYPE_COPY_HERE);
          break;
        default:
          throw new Exception(pht('Bad origin state %s.', $type));
      }
    }

    return $this->changes;
  }

  /**
   * Parse a raw diff (SVN, Git, Mercurial, RCS or plain unified) into
   * changes.
   *
   * @param string Raw diff text.
   * @return map<string, ArcanistDiffChange> The change map.
   * @throws Exception If the diff is empty or can not be parsed.
   */
  public function parseDiff($diff) {
    if (!strlen(trim($diff))) {
      throw new Exception(pht("Can't parse an empty diff!"));
    }

    // Detect `git-format-patch`, by looking for a "---" line somewhere in
    // the file and then a footer with Git version number, which looks like
    // this:
    //
    //   --
    //   1.8.4.2
    //
    // Note that `git-format-patch` adds a space after the "--", but we don't
    // require it when detecting patches, as trailing whitespace can easily be
    // lost in transit.
    $detect_patch = '/^---$.*^-- ?[\s\d.]+\z/ms';
    $message = null;
    if (preg_match($detect_patch, $diff)) {
      list($message, $diff) = $this->stripGitFormatPatch($diff);
    }

    $this->didStartParse($diff);

    // Strip off header comments. While `patch` allows comments anywhere in the
    // file, `git apply` is more strict. We get these comments in `hg export`
    // diffs, and Eclipse can also produce them.
    $line = $this->getLineTrimmed();
    while (preg_match('/^#/', $line)) {
      $line = $this->nextLine();
    }

    // NOTE: $message stays null unless a format-patch header was stripped
    // above; check for null explicitly because passing null to strlen() is
    // deprecated in PHP 8.1.
    if ($message !== null && strlen($message)) {
      // If we found a message during pre-parse steps, add it to the resulting
      // changes here.
      $this->buildChange(null)
        ->setType(ArcanistDiffChangeType::TYPE_MESSAGE)
        ->setMetadata('message', $message);
    }

    // These header patterns do not depend on the parse state, so build them
    // once rather than on every iteration of the loop below.
    $patterns = array(
      // This is a normal SVN text change, probably from "svn diff".
      '(?P<type>Index): (?P<cur>.+)',
      // This is an SVN text change, probably from "svnlook diff".
      '(?P<type>Modified|Added|Deleted|Copied): (?P<cur>.+)',
      // This is an SVN property change, probably from "svn diff".
      '(?P<type>Property changes on): (?P<cur>.+)',
      // This is a git commit message, probably from "git show".
      '(?P<type>commit) (?P<hash>[a-f0-9]+)(?: \(.*\))?',
      // This is a git diff, probably from "git show" or "git diff".
      // Note that the filenames may appear quoted.
      '(?P<type>diff --git) (?P<oldnew>.*)',
      // RCS Diff
      '(?P<type>rcsdiff -u) (?P<oldnew>.*)',
      // This is a unified diff, probably from "diff -u" or synthetic diffing.
      '(?P<type>---) (?P<old>.+)\s+\d{4}-\d{2}-\d{2}.*',
      '(?P<binary>Binary files|Files) '.
        '(?P<old>.+)\s+\d{4}-\d{2}-\d{2} and '.
        '(?P<new>.+)\s+\d{4}-\d{2}-\d{2} differ.*',
      // This is a normal Mercurial text change, probably from "hg diff". It
      // may have two "-r" blocks if it came from "hg diff -r x:y".
      '(?P<type>diff -r) (?P<hgrev>[a-f0-9]+) (?:-r [a-f0-9]+ )?(?P<cur>.+)',
    );

    do {
      $line = $this->getLineTrimmed();
      $match = null;
      $ok = $this->tryMatchHeader($patterns, $line, $match);

      $failed_parse = false;
      if (!$ok && $this->isFirstNonEmptyLine()) {
        // 'hg export' command creates so called "extended diff" that
        // contains some meta information and comment at the beginning
        // (isFirstNonEmptyLine() to check for beginning). Actual mercurial
        // code detects where comment ends and unified diff starts by
        // searching for "diff -r" or "diff --git" in the text.
        $this->saveLine();
        $line = $this->nextLineThatLooksLikeDiffStart();
        if (!$this->tryMatchHeader($patterns, $line, $match)) {
          // Restore line before guessing to display correct error.
          $this->restoreLine();
          $failed_parse = true;
        }
      } else if (!$ok) {
        $failed_parse = true;
      }

      if ($failed_parse) {
        $this->didFailParse(
          pht(
            "Expected a hunk header, like '%s' (svn), '%s' (svn properties), ".
            "'%s' (git show), '%s' (git diff), '%s' (unified diff), or ".
            "'%s' (hg diff or patch).",
            'Index: /path/to/file.ext',
            'Property changes on: /path/to/file.ext',
            'commit 59bcc3ad6775562f845953cf01624225',
            'diff --git',
            '--- filename',
            'diff -r'));
      }

      if (isset($match['type'])) {
        if ($match['type'] == 'diff --git') {
          $filename = self::extractGitCommonFilename($match['oldnew']);
          if ($filename !== null) {
            $match['old'] = $filename;
            $match['cur'] = $filename;
          }
        }
      }

      $change = $this->buildChange(idx($match, 'cur'));

      if (isset($match['old'])) {
        $change->setOldPath($match['old']);
      }

      if (isset($match['hash'])) {
        $change->setCommitHash($match['hash']);
      }

      if (isset($match['binary'])) {
        $change->setFileType(ArcanistDiffChangeType::FILE_BINARY);
        $line = $this->nextNonemptyLine();
        continue;
      }

      $line = $this->nextLine();

      switch ($match['type']) {
        case 'Index':
        case 'Modified':
        case 'Added':
        case 'Deleted':
        case 'Copied':
          $this->parseIndexHunk($change);
          break;
        case 'Property changes on':
          $this->parsePropertyHunk($change);
          break;
        case 'diff --git':
          $this->setIsGit(true);
          $this->parseIndexHunk($change);
          break;
        case 'commit':
          $this->setIsGit(true);
          $this->parseCommitMessage($change);
          break;
        case '---':
          $ok = preg_match(
            '@^(?:\+\+\+) (.*)\s+\d{4}-\d{2}-\d{2}.*$@',
            $line,
            $match);
          if (!$ok) {
            $this->didFailParse(pht(
              "Expected '%s' in unified diff.",
              '+++ filename'));
          }
          $change->setCurrentPath($match[1]);
          $line = $this->nextLine();
          $this->parseChangeset($change);
          break;
        case 'diff -r':
          $this->setIsMercurial(true);
          $this->parseIndexHunk($change);
          break;
        case 'rcsdiff -u':
          $this->isRCS = true;
          $this->parseIndexHunk($change);
          break;
        default:
          $this->didFailParse(pht('Unknown diff type.'));
          break;
      }
    } while ($this->getLine() !== null);

    $this->didFinishParse();

    $this->loadSyntheticData();

    return $this->changes;
  }

  /**
   * Try each header pattern against a line, anchored at both ends.
   *
   * @param list<string> Pattern bodies, without delimiters or anchors.
   * @param string Line to test.
   * @param wild Set to the match groups of the last pattern tried.
   * @return bool True if some pattern matched.
   */
  protected function tryMatchHeader($patterns, $line, &$match) {
    foreach ($patterns as $pattern) {
      if (preg_match('@^'.$pattern.'$@', $line, $match)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Parse the "Author:" / "Date:" header and message body which follow a
   * "commit <hash>" line, storing the message as change metadata.
   *
   * @param ArcanistDiffChange Change to mark as a message.
   * @return void
   */
  protected function parseCommitMessage(ArcanistDiffChange $change) {
    $change->setType(ArcanistDiffChangeType::TYPE_MESSAGE);

    $message = array();

    $line = $this->getLine();
    if (preg_match('/^Merge: /', $line)) {
      $this->nextLine();
    }

    $line = $this->getLine();
    if (!preg_match('/^Author: /', $line)) {
      $this->didFailParse(pht("Expected 'Author:'."));
    }

    $line = $this->nextLine();
    if (!preg_match('/^Date: /', $line)) {
      $this->didFailParse(pht("Expected 'Date:'."));
    }

    while (($line = $this->nextLineTrimmed()) !== null) {
      if (strlen($line) && $line[0] != ' ') {
        break;
      }

      // Strip leading spaces from Git commit messages. Note that empty lines
      // are represented as just "\n"; don't touch those.
      $message[] = preg_replace('/^ /', '', $this->getLine());
    }

    $message = rtrim(implode('', $message), "\r\n");
    $change->setMetadata('message', $message);
  }

  /**
   * Parse an SVN property change hunk. These hunks are ambiguous so just sort
   * of try to get it mostly right. It's entirely possible to foil this parser
   * (or any other parser) with a carefully constructed property change.
   */
  protected function parsePropertyHunk(ArcanistDiffChange $change) {
    $line = $this->getLineTrimmed();
    if (!preg_match('/^_+$/', $line)) {
      $this->didFailParse(pht("Expected '%s'.", '______________________'));
    }

    $line = $this->nextLine();
    while ($line !== null) {
      $done = preg_match('/^(Index|Property changes on):/', $line);
      if ($done) {
        break;
      }

      // NOTE: Before 1.5, SVN uses "Name". At 1.5 and later, SVN uses
      // "Modified", "Added" and "Deleted".

      $matches = null;
      $ok = preg_match(
        '/^(Name|Modified|Added|Deleted): (.*)$/',
        $line,
        $matches);
      if (!$ok) {
        $this->didFailParse(
          pht("Expected 'Name', 'Added', 'Deleted', or 'Modified'."));
      }

      $op = $matches[1];
      $prop = $matches[2];

      list($old, $new) = $this->parseSVNPropertyChange($op, $prop);

      if ($old !== null) {
        $change->setOldProperty($prop, $old);
      }

      if ($new !== null) {
        $change->setNewProperty($prop, $new);
      }

      $line = $this->getLine();
    }
  }

  /**
   * Read the body of a single SVN property change, starting on the line after
   * the "Name:"/"Modified:"/"Added:"/"Deleted:" header.
   *
   * @param string Operation from the header line.
   * @param string Property name from the header line (not used here).
   * @return pair<string|null, string|null> Old and new values; null when the
   *   corresponding value is empty.
   */
  private function parseSVNPropertyChange($op, $prop) {
    $old = array();
    $new = array();

    $target = null;

    $line = $this->nextLine();
    $prop_index = 2;
    while ($line !== null) {
      $done = preg_match(
        '/^(Modified|Added|Deleted|Index|Property changes on):/',
        $line);
      if ($done) {
        break;
      }
      $trimline = ltrim($line);
      if ($trimline && $trimline[0] == '#') {
        // in svn1.7, a line like ## -0,0 +1 ## is put between the Added: line
        // and the line with the property change. If we have such a line, we'll
        // just ignore it (:
        $line = $this->nextLine();
        $prop_index = 1;
        $trimline = ltrim($line);
      }
      if ($trimline && $trimline[0] == '+') {
        if ($op == 'Deleted') {
          $this->didFailParse(pht(
            'Unexpected "%s" section in property deletion.',
            '+'));
        }
        $target = 'new';
        $line = substr($trimline, $prop_index);
      } else if ($trimline && $trimline[0] == '-') {
        if ($op == 'Added') {
          $this->didFailParse(pht(
            'Unexpected "%s" section in property addition.',
            '-'));
        }
        $target = 'old';
        $line = substr($trimline, $prop_index);
      } else if (!strncmp($trimline, 'Merged', 6)) {
        if ($op == 'Added') {
          $target = 'new';
        } else {
          // These can appear on merges. No idea how to interpret this (unclear
          // what the old / new values are) and it's of dubious usefulness so
          // just throw it away until someone complains.
          $target = null;
        }
        $line = $trimline;
      }

      if ($target == 'new') {
        $new[] = $line;
      } else if ($target == 'old') {
        $old[] = $line;
      }

      $line = $this->nextLine();
    }

    $old = rtrim(implode('', $old));
    $new = rtrim(implode('', $new));

    if (!strlen($old)) {
      $old = null;
    }

    if (!strlen($new)) {
      $new = null;
    }

    return array($old, $new);
  }

  /**
   * Record whether the diff being parsed is a Git diff.
   *
   * @param bool True if this is a Git diff.
   * @return this
   * @throws Exception If a different value was already recorded.
   */
  protected function setIsGit($git) {
    if ($this->isGit !== null && $this->isGit != $git) {
      throw new Exception(pht('Git status has changed!'));
    }
    $this->isGit = $git;
    return $this;
  }

  /**
   * @return bool|null Recorded Git flag, or null if none was recorded.
   */
  protected function getIsGit() {
    return $this->isGit;
  }

  /**
   * Record whether the diff being parsed is a Mercurial diff.
   *
   * @param bool True if this is a Mercurial diff.
   * @return this
   */
  public function setIsMercurial($is_mercurial) {
    $this->isMercurial = $is_mercurial;
    return $this;
  }

  /**
   * @return bool|null Recorded Mercurial flag, or null if none was recorded.
   */
  public function getIsMercurial() {
    return $this->isMercurial;
  }

  /**
   * Parse the block which follows a per-file header ("Index:", "diff --git",
   * "diff -r", "rcsdiff -u", ...): extended Git headers, the divider or index
   * line, binary markers, hunk targets and finally the hunks themselves.
   *
   * @param ArcanistDiffChange Change to populate.
   * @return void
   */
  protected function parseIndexHunk(ArcanistDiffChange $change) {
    $is_git = $this->getIsGit();
    $is_mercurial = $this->getIsMercurial();
    $is_svn = (!$is_git && !$is_mercurial);

    $move_source = null;

    $line = $this->getLine();
    if ($is_git) {
      // Extended header patterns are loop-invariant; build them once.
      $patterns = array(
        '(?P<new>new) file mode (?P<newmode>\d+)',
        '(?P<deleted>deleted) file mode (?P<oldmode>\d+)',
        // These occur when someone uses `chmod` on a file.
        'old mode (?P<oldmode>\d+)',
        'new mode (?P<newmode>\d+)',
        // These occur when you `mv` a file and git figures it out.
        'similarity index ',
        'rename from (?P<old>.*)',
        '(?P<move>rename) to (?P<cur>.*)',
        'copy from (?P<old>.*)',
        '(?P<copy>copy) to (?P<cur>.*)',
      );

      do {
        $ok = false;
        $match = null;
        // At the end of input $line is null; skip matching rather than pass
        // null to preg_match(), which is deprecated in PHP 8.1.
        if ($line !== null) {
          foreach ($patterns as $pattern) {
            $ok = preg_match('@^'.$pattern.'@', $line, $match);
            if ($ok) {
              break;
            }
          }
        }

        if (!$ok) {
          if ($line === null ||
              preg_match('/^(diff --git|commit) /', $line)) {
            // In this case, there are ONLY file mode changes, or this is a
            // pure move. If it's a move, flag these changesets so we can build
            // synthetic changes later, enabling us to show file contents in
            // Differential -- git only gives us a block like this:
            //
            //   diff --git a/README b/READYOU
            //   similarity index 100%
            //   rename from README
            //   rename to READYOU
            //
            // ...i.e., there is no associated diff.

            // This allows us to distinguish between property changes only
            // and actual moves. For property changes only, we can't currently
            // build a synthetic diff correctly, so just skip it.
            // TODO: Build synthetic diffs for property changes, too.
            if ($change->getType() != ArcanistDiffChangeType::TYPE_CHANGE) {
              $change->setNeedsSyntheticGitHunks(true);
              if ($move_source) {
                $move_source->setNeedsSyntheticGitHunks(true);
              }
            }
            return;
          }
          break;
        }

        if (!empty($match['oldmode'])) {
          $change->setOldProperty('unix:filemode', $match['oldmode']);
        }
        if (!empty($match['newmode'])) {
          $change->setNewProperty('unix:filemode', $match['newmode']);
        }

        if (!empty($match['deleted'])) {
          $change->setType(ArcanistDiffChangeType::TYPE_DELETE);
        }

        if (!empty($match['new'])) {
          // If you replace a symlink with a normal file, git renders the change
          // as a "delete" of the symlink plus an "add" of the new file. We
          // prefer to represent this as a change.
          if ($change->getType() == ArcanistDiffChangeType::TYPE_DELETE) {
            $change->setType(ArcanistDiffChangeType::TYPE_CHANGE);
          } else {
            $change->setType(ArcanistDiffChangeType::TYPE_ADD);
          }
        }

        if (!empty($match['old'])) {
          $match['old'] = self::unescapeFilename($match['old']);
          $change->setOldPath($match['old']);
        }

        if (!empty($match['cur'])) {
          $match['cur'] = self::unescapeFilename($match['cur']);
          $change->setCurrentPath($match['cur']);
        }

        if (!empty($match['copy'])) {
          $change->setType(ArcanistDiffChangeType::TYPE_COPY_HERE);
          $old = $this->buildChange($change->getOldPath());
          $type = $old->getType();

          if ($type == ArcanistDiffChangeType::TYPE_MOVE_AWAY) {
            $old->setType(ArcanistDiffChangeType::TYPE_MULTICOPY);
          } else {
            $old->setType(ArcanistDiffChangeType::TYPE_COPY_AWAY);
          }

          $old->addAwayPath($change->getCurrentPath());
        }

        if (!empty($match['move'])) {
          $change->setType(ArcanistDiffChangeType::TYPE_MOVE_HERE);
          $old = $this->buildChange($change->getOldPath());
          $type = $old->getType();

          if ($type == ArcanistDiffChangeType::TYPE_MULTICOPY) {
            // Great, no change.
          } else if ($type == ArcanistDiffChangeType::TYPE_MOVE_AWAY) {
            $old->setType(ArcanistDiffChangeType::TYPE_MULTICOPY);
          } else if ($type == ArcanistDiffChangeType::TYPE_COPY_AWAY) {
            $old->setType(ArcanistDiffChangeType::TYPE_MULTICOPY);
          } else {
            $old->setType(ArcanistDiffChangeType::TYPE_MOVE_AWAY);
          }

          // We'll reference this above.
          $move_source = $old;

          $old->addAwayPath($change->getCurrentPath());
        }

        $line = $this->nextNonemptyLine();
      } while (true);
    }

    $line = $this->getLine();

    if ($is_svn) {
      // A null line (end of input) can not be a divider; treat it as a
      // mismatch without handing null to preg_match().
      $ok = ($line !== null) && preg_match('/^=+\s*$/', $line);
      if (!$ok) {
        $this->didFailParse(pht(
          "Expected '%s' divider line.",
          '======================='));
      } else {
        // Adding an empty file in SVN can produce an empty line here.
        $line = $this->nextNonemptyLine();
      }
    } else if ($is_git) {
      $ok = preg_match('/^index .*$/', $line);
      if (!$ok) {
        // TODO: "hg diff -g" diffs ("mercurial git-style diffs") do not include
        // this line, so we can't parse them if we fail on it. Maybe introduce
        // a flag saying "parse this diff using relaxed git-style diff rules"?

        // $this->didFailParse("Expected 'index af23f...a98bc' header line.");
      } else {
        // NOTE: In the git case, where this patch is the last change in the
        // file, we may have a final terminal newline. Skip over it so that
        // we'll hit the '$line === null' block below. This is covered by the
        // 'git-empty-file.gitdiff' test case.
        $line = $this->nextNonemptyLine();
      }
    }

    // If there are files with only whitespace changes and -b or -w are
    // supplied as command-line flags to `diff', svn and git both produce
    // changes without any body.
    if ($line === null ||
        preg_match(
          '/^(Index:|Property changes on:|diff --git|commit) /',
          $line)) {
      return;
    }

    $is_binary_add = preg_match(
      '/^Cannot display: file marked as a binary type\.$/',
      rtrim($line));
    if ($is_binary_add) {
      $this->nextLine(); // Cannot display: file marked as a binary type.
      $this->nextNonemptyLine(); // svn:mime-type = application/octet-stream
      $this->markBinary($change);
      return;
    }

    // We can get this in git, or in SVN when a file exists in the repository
    // WITHOUT a binary mime-type and is changed and given a binary mime-type.
    $is_binary_diff = preg_match(
      '/^(Binary files|Files) .* and .* differ$/',
      rtrim($line));
    if ($is_binary_diff) {
      $this->nextNonemptyLine(); // Binary files x and y differ
      $this->markBinary($change);
      return;
    }

    // This occurs under "hg diff --git" when a binary file is removed. See
    // test case "hg-binary-delete.hgdiff". (I believe it never occurs under
    // git, which reports the "files X and /dev/null differ" string above. Git
    // can not apply these patches.)
    $is_hg_binary_delete = preg_match(
      '/^Binary file .* has changed$/',
      rtrim($line));
    if ($is_hg_binary_delete) {
      $this->nextNonemptyLine();
      $this->markBinary($change);
      return;
    }

    // With "git diff --binary" (not a normal mode, but one users may explicitly
    // invoke and then, e.g., copy-paste into the web console) or "hg diff
    // --git" (normal under hg workflows), we may encounter a literal binary
    // patch.
    $is_git_binary_patch = preg_match(
      '/^GIT binary patch$/',
      rtrim($line));
    if ($is_git_binary_patch) {
      $this->nextLine();
      $this->parseGitBinaryPatch();
      $line = $this->getLine();
      if (preg_match('/^literal/', $line)) {
        // We may have old/new binaries (change) or just a new binary (hg add).
        // If there are two blocks, parse both.
        $this->parseGitBinaryPatch();
      }
      $this->markBinary($change);
      return;
    }

    if ($is_git) {
      // "git diff -b" ignores whitespace, but has an empty hunk target
      if (preg_match('@^diff --git .*$@', $line)) {
        $this->nextLine();
        return null;
      }
    }

    if ($this->isRCS) {
      // Skip the RCS headers.
      $this->nextLine();
      $this->nextLine();
      $this->nextLine();
    }

    $old_file = $this->parseHunkTarget();
    $new_file = $this->parseHunkTarget();

    if ($this->isRCS) {
      $change->setCurrentPath($new_file);
    }

    $change->setOldPath($old_file);

    $this->parseChangeset($change);
  }

  /**
   * Skip over one "literal NNNN" block of a Git binary patch. The data is
   * validated superficially but not decoded.
   *
   * @return void
   */
  private function parseGitBinaryPatch() {

    // TODO: We could decode the patches, but it's a giant mess so don't bother
    // for now. We'll pick up the data from the working copy in the common
    // case ("arc diff").

    $line = $this->getLine();
    if (!preg_match('/^literal /', $line)) {
      $this->didFailParse(
        pht("Expected '%s' to start git binary patch.", 'literal NNNN'));
    }
    do {
      $line = $this->nextLineTrimmed();
      if ($line === '' || $line === null) {
        // Some versions of Mercurial apparently omit the terminal newline,
        // although it's unclear if Git will ever do this. In either case,
        // rely on the base85 check for sanity.
        $this->nextNonemptyLine();
        return;
      } else if (!preg_match('/^[a-zA-Z]/', $line)) {
        $this->didFailParse(
          pht('Expected base85 line length character (a-zA-Z).'));
      }
    } while (true);
  }

  /**
   * Parse a "--- path" or "+++ path" hunk target line and advance past it.
   *
   * @return string Path from the target line, without any "a/" or "b/"
   *   prefix.
   */
  protected function parseHunkTarget() {
    $line = $this->getLine();
    $matches = null;

    $remainder = '(?:\s*\(.*\))?';
    if ($this->getIsMercurial()) {
      // Something like "Fri Aug 26 01:20:50 2005 -0700", don't bother trying
      // to parse it.
      $remainder = '\t.*';
    } else if ($this->isRCS) {
      $remainder = '\s.*';
    } else if ($this->getIsGit()) {
      // When filenames contain spaces, Git terminates this line with a tab.
      // Normally, the tab is not present. If there's a tab, ignore it.
      $remainder = '(?:\t.*)?';
    }

    $ok = preg_match(
      '@^[-+]{3} (?:[ab]/)?(?P<path>.*?)'.$remainder.'$@',
      $line,
      $matches);

    if (!$ok) {
      $this->didFailParse(
        pht(
          "Expected hunk target '%s'.",
          '+++ path/to/file.ext (revision N)'));
    }

    $this->nextLine();
    return $matches['path'];
  }

  /**
   * Mark a change as affecting a binary file.
   *
   * @param ArcanistDiffChange Change to mark.
   * @return this
   */
  protected function markBinary(ArcanistDiffChange $change) {
    $change->setFileType(ArcanistDiffChangeType::FILE_BINARY);
    return $this;
  }

  /**
   * Parse one or more consecutive "@@ ... @@" hunks and attach them to a
   * change, replacing any hunks the change already has.
   *
   * @param ArcanistDiffChange Change to attach hunks to.
   * @return void
   */
  protected function parseChangeset(ArcanistDiffChange $change) {
    // If a diff includes two sets of changes to the same file, let the
    // second one win. In particular, this occurs when adding subdirectories
    // in Subversion that contain files: the file text will be present in
    // both the directory diff and the file diff. See T5555. Dropping the
    // hunks lets whichever one shows up later win instead of showing changes
    // twice.
    $change->dropHunks();

    $all_changes = array();
    do {
      $hunk = new ArcanistDiffHunk();
      $line = $this->getLineTrimmed();
      $real = array();

      // In the case where only one line is changed, the length is omitted.
      // The final group is for git, which appends a guess at the function
      // context to the diff.
      $matches = null;
      $ok = preg_match(
        '/^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(?: .*?)?$/U',
        $line,
        $matches);

      if (!$ok) {
        // It's possible we hit the style of an svn1.7 property change.
        // This is a 4-line Index block, followed by an empty line, followed
        // by a "Property changes on:" section similar to svn1.6.
        if ($line == '') {
          $line = $this->nextNonemptyLine();
          $ok = preg_match('/^Property changes on:/', $line);
          if (!$ok) {
            $this->didFailParse(pht('Confused by empty line'));
          }
          $line = $this->nextLine();
          return $this->parsePropertyHunk($change);
        }
        $this->didFailParse(pht(
          "Expected hunk header '%s'.",
          '@@ -NN,NN +NN,NN @@'));
      }

      $hunk->setOldOffset($matches[1]);
      $hunk->setNewOffset($matches[3]);

      // Cover for the cases where length wasn't present (implying one line).
      // Default to '' so a missing group never reaches strlen() as null.
      $old_len = idx($matches, 2, '');
      if (!strlen($old_len)) {
        $old_len = 1;
      }
      $new_len = idx($matches, 4, '');
      if (!strlen($new_len)) {
        $new_len = 1;
      }

      $hunk->setOldLength($old_len);
      $hunk->setNewLength($new_len);

      $add = 0;
      $del = 0;

      $hit_next_hunk = false;
      while ((($line = $this->nextLine()) !== null)) {
        if (strlen(rtrim($line, "\r\n"))) {
          $char = $line[0];
        } else {
          // Normally, we do not encounter empty lines in diffs, because
          // unchanged lines have an initial space. However, in Git, with
          // the option `diff.suppress-blank-empty` set, unchanged blank lines
          // emit as completely empty. If we encounter a completely empty line,
          // treat it as a ' ' (i.e., unchanged empty line) line.
          $char = ' ';
        }
        switch ($char) {
          case '\\':
            if (!preg_match('@\\ No newline at end of file@', $line)) {
              $this->didFailParse(
                pht("Expected '\ No newline at end of file'."));
            }
            $real[] = $line;
            if ($new_len) {
              $hunk->setIsMissingOldNewline(true);
            } else {
              $hunk->setIsMissingNewNewline(true);
              // No new-side lines remain, so this marker ends the hunk.
              break 2;
            }
            break;
          case '+':
            ++$add;
            --$new_len;
            $real[] = $line;
            break;
          case '-':
            if (!$old_len) {
              // In this case, we've hit "---" from a new file. So don't
              // advance the line cursor.
              $hit_next_hunk = true;
              break 2;
            }
            ++$del;
            --$old_len;
            $real[] = $line;
            break;
          case ' ':
            if (!$old_len && !$new_len) {
              break 2;
            }
            --$old_len;
            --$new_len;
            $real[] = $line;
            break;
          default:
            // We hit something, likely another hunk.
            $hit_next_hunk = true;
            break 2;
        }
      }

      if ($old_len || $new_len) {
        $this->didFailParse(pht('Found the wrong number of hunk lines.'));
      }

      $corpus = implode('', $real);

      $is_binary = false;
      if ($this->detectBinaryFiles) {
        $is_binary = !phutil_is_utf8($corpus);
        $try_encoding = $this->tryEncoding;

        if ($is_binary && $try_encoding) {
          $is_binary = ArcanistDiffUtils::isHeuristicBinaryFile($corpus);
          if (!$is_binary) {
            $corpus = phutil_utf8_convert($corpus, 'UTF-8', $try_encoding);
            if (!phutil_is_utf8($corpus)) {
              throw new Exception(
                pht(
                  "Failed to convert a hunk from '%s' to UTF-8. ".
                  "Check that the specified encoding is correct.",
                  $try_encoding));
            }
          }
        }

      }

      if ($is_binary) {
        // SVN happily treats binary files which aren't marked with the right
        // mime type as text files. Detect that junk here and mark the file
        // binary. We'll catch stuff with unicode too, but that's verboten
        // anyway. If there are too many false positives with this we might
        // need to make it threshold-triggered instead of triggering on any
        // unprintable byte.
        $change->setFileType(ArcanistDiffChangeType::FILE_BINARY);
      } else {
        $hunk->setCorpus($corpus);
        $hunk->setAddLines($add);
        $hunk->setDelLines($del);
        $change->addHunk($hunk);
      }

      if (!$hit_next_hunk) {
        $line = $this->nextNonemptyLine();
      }

      // $line is null once the input is exhausted; stop without passing null
      // to preg_match().
    } while ($line !== null && preg_match('/^@@ /', $line));
  }

  /**
   * Return the change for a path, creating and registering a new one if the
   * path has no change yet. If a forced path was set with
   * @{method:forcePath}, the change registered under the forced path is
   * returned instead of creating a new one.
   *
   * @param string|null Path, or null for a change with no path.
   * @return ArcanistDiffChange Change object.
   */
  protected function buildChange($path = null) {
    if ($path !== null) {
      if (!empty($this->changes[$path])) {
        return $this->changes[$path];
      }
    }

    if ($this->forcePath) {
      return $this->changes[$this->forcePath];
    }

    $change = new ArcanistDiffChange();
    if ($path !== null) {
      $change->setCurrentPath($path);
      $this->changes[$path] = $change;
    } else {
      $this->changes[] = $change;
    }

    return $change;
  }

  /**
   * Begin a parse: remember the raw text, normalize it, split it into lines
   * and reset the line cursor.
   *
   * @param string Diff text.
   * @return void
   */
  protected function didStartParse($text) {
    $this->rawDiff = $text;

    // Eat leading whitespace. This may happen if the first change in the diff
    // is an SVN property change.
    $text = ltrim($text);

    // Try to strip ANSI color codes from colorized diffs. ANSI color codes
    // might be present in two cases:
    //
    //   - You piped a colorized diff into 'arc --raw' or similar (normally
    //     we're able to disable colorization on diffs we control the generation
    //     of).
    //   - You're diffing a file which actually contains ANSI color codes.
    //
    // The former is vastly more likely, but we try to distinguish between the
    // two cases by testing for a color code at the beginning of a line. If
    // we find one, we know it's a colorized diff (since the beginning of the
    // line should be "+", "-" or " " if the code is in the diff text).
    //
    // While it's possible a diff might be colorized and fail this test, it's
    // unlikely, and it covers hg's color extension which seems to be the most
    // stubborn about colorizing text despite stdout not being a TTY.
    //
    // We might incorrectly strip color codes from a colorized diff of a text
    // file with color codes inside it, but this case is stupid and pathological
    // and you've dug your own grave.

    $ansi_color_pattern = '\x1B\[[\d;]*m';
    if (preg_match('/^'.$ansi_color_pattern.'/m', $text)) {
      $text = preg_replace('/'.$ansi_color_pattern.'/', '', $text);
    }

    $this->text = phutil_split_lines($text);
    $this->line = 0;
  }

  /**
   * Get the line under the cursor.
   *
   * @return string|null Current line, or null past the end of input.
   * @throws Exception If no parse is in progress.
   */
  protected function getLine() {
    if ($this->text === null) {
      throw new Exception(pht('Not parsing!'));
    }
    if (isset($this->text[$this->line])) {
      return $this->text[$this->line];
    }
    return null;
  }

  /**
   * Get the line under the cursor with CR/LF characters trimmed from both
   * ends.
   *
   * @return string|null Trimmed line, or null past the end of input.
   */
  protected function getLineTrimmed() {
    $line = $this->getLine();
    if ($line !== null) {
      $line = trim($line, "\r\n");
    }
    return $line;
  }

  /**
   * Advance the cursor by one line and return the new current line.
   *
   * @return string|null New current line, or null past the end of input.
   */
  protected function nextLine() {
    $this->line++;
    return $this->getLine();
  }

  /**
   * Advance the cursor by one line and return the new current line with
   * CR/LF characters trimmed from both ends.
   *
   * @return string|null Trimmed line, or null past the end of input.
   */
  protected function nextLineTrimmed() {
    $line = $this->nextLine();
    if ($line !== null) {
      $line = trim($line, "\r\n");
    }
    return $line;
  }

  /**
   * Advance the cursor to the next line which is not entirely whitespace.
   *
   * @return string|null That line, or null if the input was exhausted.
   */
  protected function nextNonemptyLine() {
    while (($line = $this->nextLine()) !== null) {
      if (strlen(trim($line)) !== 0) {
        break;
      }
    }
    return $this->getLine();
  }

  /**
   * Advance the cursor to the next line which begins (after optional
   * whitespace) with "diff -r" or "diff --git".
   *
   * @return string|null That line, or null if the input was exhausted.
   */
  protected function nextLineThatLooksLikeDiffStart() {
    while (($line = $this->nextLine()) !== null) {
      if (preg_match('/^\s*diff\s+-(?:r|-git)/', $line)) {
        break;
      }
    }
    return $this->getLine();
  }

  /**
   * Remember the cursor position so @{method:restoreLine} can return to it.
   *
   * @return void
   */
  protected function saveLine() {
    $this->lineSaved = $this->line;
  }

  /**
   * Move the cursor back to the position stored by @{method:saveLine}.
   *
   * @return void
   */
  protected function restoreLine() {
    $this->line = $this->lineSaved;
  }

  /**
   * Test whether the cursor is on the first line of the input which is
   * neither blank nor a "#" comment.
   *
   * @return bool True if the cursor is on that line.
   */
  protected function isFirstNonEmptyLine() {
    $len = count($this->text);
    for ($ii = 0; $ii < $len; $ii++) {
      $line = $this->text[$ii];

      if (!strlen(trim($line))) {
        // This line is empty, skip it.
        continue;
      }

      if (preg_match('/^#/', $line)) {
        // This line is a comment, skip it.
        continue;
      }

      return ($ii == $this->line);
    }

    // Entire file is empty.
    return false;
  }

  /**
   * Finish a parse by discarding the split text. After this,
   * @{method:getLine} throws until a new parse is started.
   *
   * @return void
   */
  protected function didFinishParse() {
    $this->text = null;
  }

  /**
   * Choose whether the raw diff is written to a preserved temporary file
   * when parsing fails.
   *
   * @param bool True to write the raw diff on failure.
   * @return this
   */
  public function setWriteDiffOnFailure($write) {
    $this->writeDiffOnFailure = $write;
    return $this;
  }

  /**
   * Abort the parse with an exception describing the failure, including a
   * few lines of input around the cursor.
   *
   * @param string Human-readable description of the problem.
   * @return void This method always throws.
   * @throws Exception Always.
   */
  protected function didFailParse($message) {
    $context_lines = 5;
    $min = max(0, $this->line - $context_lines);
    $max = min($this->line + $context_lines, count($this->text) - 1);

    // The input lines are appended as-is, so the format string carries no
    // line terminator of its own.
    $context = '';
    for ($ii = $min; $ii <= $max; $ii++) {
      $context .= sprintf(
        '%8.8s %6.6s %s',
        ($ii == $this->line) ? '>>> ' : '',
        $ii + 1,
        $this->text[$ii]);
    }

    $out = array();
    $out[] = pht('Diff Parse Exception: %s', $message);

    if ($this->writeDiffOnFailure) {
      $temp = new TempFile();
      $temp->setPreserveFile(true);

      Filesystem::writeFile($temp, $this->rawDiff);
      $out[] = pht('Raw input file was written to: %s', $temp);
    }

    $out[] = $context;
    $out = implode("\n\n", $out);

    throw new Exception($out);
  }

  /**
   * Unescape escaped filenames, e.g. from "git diff".
1228 */ 1229 private static function unescapeFilename($name) { 1230 if (preg_match('/^".+"$/', $name)) { 1231 return stripcslashes(substr($name, 1, -1)); 1232 } else { 1233 return $name; 1234 } 1235 } 1236 1237 private function loadSyntheticData() { 1238 if (!$this->changes) { 1239 return; 1240 } 1241 1242 $repository_api = $this->repositoryAPI; 1243 if (!$repository_api) { 1244 return; 1245 } 1246 1247 $imagechanges = array(); 1248 1249 $changes = $this->changes; 1250 foreach ($changes as $change) { 1251 $path = $change->getCurrentPath(); 1252 1253 // Certain types of changes (moves and copies) don't contain change data 1254 // when expressed in raw "git diff" form. Augment any such diffs with 1255 // textual data. 1256 if ($change->getNeedsSyntheticGitHunks() && 1257 ($repository_api instanceof ArcanistGitAPI)) { 1258 $diff = $repository_api->getRawDiffText($path, $moves = false); 1259 1260 // NOTE: We're reusing the parser and it doesn't reset change state 1261 // between parses because there's an oddball SVN workflow in Phabricator 1262 // which relies on being able to inject changes. 1263 // TODO: Fix this. 1264 $parser = clone $this; 1265 $parser->setChanges(array()); 1266 $raw_changes = $parser->parseDiff($diff); 1267 1268 foreach ($raw_changes as $raw_change) { 1269 if ($raw_change->getCurrentPath() == $path) { 1270 $change->setFileType($raw_change->getFileType()); 1271 foreach ($raw_change->getHunks() as $hunk) { 1272 // Git thinks that this file has been added. But we know that it 1273 // has been moved or copied without a change. 
1274 $hunk->setCorpus( 1275 preg_replace('/^\+/m', ' ', $hunk->getCorpus())); 1276 $change->addHunk($hunk); 1277 } 1278 break; 1279 } 1280 } 1281 1282 $change->setNeedsSyntheticGitHunks(false); 1283 } 1284 1285 if ($change->getFileType() != ArcanistDiffChangeType::FILE_BINARY && 1286 $change->getFileType() != ArcanistDiffChangeType::FILE_IMAGE) { 1287 continue; 1288 } 1289 1290 $imagechanges[$path] = $change; 1291 } 1292 1293 // Fetch the actual file contents in batches so repositories 1294 // that have slow random file accesses (i.e. mercurial) can 1295 // optimize the retrieval. 1296 $paths = array_keys($imagechanges); 1297 1298 $filedata = $repository_api->getBulkOriginalFileData($paths); 1299 foreach ($filedata as $path => $data) { 1300 $imagechanges[$path]->setOriginalFileData($data); 1301 } 1302 1303 $filedata = $repository_api->getBulkCurrentFileData($paths); 1304 foreach ($filedata as $path => $data) { 1305 $imagechanges[$path]->setCurrentFileData($data); 1306 } 1307 1308 $this->changes = $changes; 1309 } 1310 1311 1312 /** 1313 * Extracts the common filename from two strings with differing path 1314 * prefixes as found after `diff --git`. These strings may be 1315 * quoted; if so, the filename is returned unescaped. The prefixes 1316 * default to "a/" and "b/", but may be any string -- or may be 1317 * entierly absent. This function may return "null" if the hunk 1318 * represents a file move or copy, and with pathological renames may 1319 * return an incorrect value. Such cases are expected to be 1320 * recovered by later rename detection codepaths. 1321 * 1322 * @param string Text from a diff line after "diff --git ". 1323 * @return string Filename being altered, or null for a rename. 1324 */ 1325 public static function extractGitCommonFilename($paths) { 1326 $matches = null; 1327 $paths = rtrim($paths, "\r\n"); 1328 1329 // Try the exact same string twice in a row separated by a 1330 // space, with an optional prefix. 
This can hit a false 1331 // positive for moves from files like "old file old" to "file", 1332 // but such a cases will be caught by the "rename from" / 1333 // "rename to" lines. 1334 $prefix = '(?:[^/]+/)?'; 1335 $pattern = 1336 "@^(?P<old>(?P<oldq>\"?){$prefix}(?P<common>.+)\\k<oldq>)" 1337 ." " 1338 ."(?P<new>(?P<newq>\"?){$prefix}\\k<common>\\k<newq>)$@"; 1339 1340 if (!preg_match($pattern, $paths, $matches)) { 1341 // A rename or some form; return null for now, and let the 1342 // "rename from" / "rename to" lines fix it up. 1343 return null; 1344 } 1345 1346 // Use the common subpart. There may be ambiguity here: "src/file 1347 // dst/file" may _either_ be a prefix-less move, or a change with 1348 // two custom prefixes. We assume it is the latter; if it is a 1349 // rename, diff parsing will update based on the "rename from" / 1350 // "rename to" lines. 1351 1352 // This re-assembles with the differing prefixes removed, but the 1353 // quoting from the original. Necessary so we know if we should 1354 // unescape characters from the common string. 1355 $new = $matches['newq'].$matches['common'].$matches['newq']; 1356 $new = self::unescapeFilename($new); 1357 1358 return $new; 1359 } 1360 1361 1362 /** 1363 * Strip the header and footer off a `git-format-patch` diff. 1364 * 1365 * Returns a parseable normal diff and a textual commit message. 1366 */ 1367 private function stripGitFormatPatch($diff) { 1368 // We can parse this by splitting it into two pieces over and over again 1369 // along different section dividers: 1370 // 1371 // 1. Mail headers. 1372 // 2. ("\n\n") 1373 // 3. Mail body. 1374 // 4. ("---") 1375 // 5. Diff stat section. 1376 // 6. ("\n\n") 1377 // 7. Actual diff body. 1378 // 8. ("--") 1379 // 9. Patch footer. 
1380 1381 list($head, $tail) = preg_split('/^---$/m', $diff, 2); 1382 list($mail_headers, $mail_body) = explode("\n\n", $head, 2); 1383 list($body, $foot) = preg_split('/^-- ?$/m', $tail, 2); 1384 list($stat, $diff) = explode("\n\n", $body, 2); 1385 1386 // Rebuild the commit message by putting the subject line back on top of it, 1387 // if we can find one. 1388 $matches = null; 1389 $pattern = '/^Subject: (?:\[PATCH\] )?(.*)$/mi'; 1390 if (preg_match($pattern, $mail_headers, $matches)) { 1391 $mail_body = $matches[1]."\n\n".$mail_body; 1392 $mail_body = rtrim($mail_body); 1393 } 1394 1395 return array($mail_body, $diff); 1396 } 1397 1398} 1399