1<?php
2
3/**
4 * Parses diffs from a working copy.
5 */
6final class ArcanistDiffParser extends Phobject {
7
8  protected $repositoryAPI;
9  protected $text;
10  protected $line;
11  protected $lineSaved;
12  protected $isGit;
13  protected $isMercurial;
14  protected $isRCS;
15  protected $detectBinaryFiles = false;
16  protected $tryEncoding;
17  protected $rawDiff;
18  protected $writeDiffOnFailure;
19
20  protected $changes = array();
21  private $forcePath;
22
  /**
   * Set the repository API object this parser holds.
   *
   * @param ArcanistRepositoryAPI Repository API to store on the parser.
   * @return this
   */
  public function setRepositoryAPI(ArcanistRepositoryAPI $repository_api) {
    $this->repositoryAPI = $repository_api;
    return $this;
  }
27
  /**
   * Store the binary-detection flag on the parser. The flag defaults to
   * `false`.
   *
   * NOTE(review): the code which consumes this flag is not visible in this
   * part of the file; presumably it enables content-based detection of binary
   * files. Confirm against the consumer.
   *
   * @param bool New value for the flag.
   * @return this
   */
  public function setDetectBinaryFiles($detect) {
    $this->detectBinaryFiles = $detect;
    return $this;
  }
32
  /**
   * Store an encoding on the parser.
   *
   * NOTE(review): how the encoding is used is not visible in this part of the
   * file; presumably it is tried when converting diff text. Confirm against
   * the consumer.
   *
   * @param string Encoding name.
   * @return this
   */
  public function setTryEncoding($encoding) {
    $this->tryEncoding = $encoding;
    return $this;
  }
37
  /**
   * Store a forced path on the parser.
   *
   * NOTE(review): the code which reads the forced path is not visible in this
   * part of the file; presumably it overrides the path that parsed changes
   * are attributed to. Confirm against the consumer.
   *
   * @param string Path to store.
   * @return this
   */
  public function forcePath($path) {
    $this->forcePath = $path;
    return $this;
  }
42
43  public function setChanges(array $changes) {
44    assert_instances_of($changes, 'ArcanistDiffChange');
45    $this->changes = mpull($changes, null, 'getCurrentPath');
46    return $this;
47  }
48
49  public function parseSubversionDiff(ArcanistSubversionAPI $api, $paths) {
50    $this->setRepositoryAPI($api);
51
52    $diffs = array();
53
54    foreach ($paths as $path => $status) {
55      if ($status & ArcanistRepositoryAPI::FLAG_UNTRACKED ||
56          $status & ArcanistRepositoryAPI::FLAG_CONFLICT ||
57          $status & ArcanistRepositoryAPI::FLAG_MISSING) {
58        unset($paths[$path]);
59      }
60    }
61
62    $root = null;
63    $from = array();
64    foreach ($paths as $path => $status) {
65      $change = $this->buildChange($path);
66
67      if ($status & ArcanistRepositoryAPI::FLAG_ADDED) {
68        $change->setType(ArcanistDiffChangeType::TYPE_ADD);
69      } else if ($status & ArcanistRepositoryAPI::FLAG_DELETED) {
70        $change->setType(ArcanistDiffChangeType::TYPE_DELETE);
71      } else {
72        $change->setType(ArcanistDiffChangeType::TYPE_CHANGE);
73      }
74
75      $is_dir = is_dir($api->getPath($path));
76      if ($is_dir) {
77        $change->setFileType(ArcanistDiffChangeType::FILE_DIRECTORY);
78        // We have to go hit the diff even for directories because they may
79        // have property changes or moves, etc.
80      }
81      $is_link = is_link($api->getPath($path));
82      if ($is_link) {
83        $change->setFileType(ArcanistDiffChangeType::FILE_SYMLINK);
84      }
85
86      $diff = $api->getRawDiffText($path);
87      if ($diff) {
88        $this->parseDiff($diff);
89      }
90
91      $info = $api->getSVNInfo($path);
92      if (idx($info, 'Copied From URL')) {
93        if (!$root) {
94          $rinfo = $api->getSVNInfo('.');
95          $root = $rinfo['URL'].'/';
96        }
97        $cpath = $info['Copied From URL'];
98        $root_len = strlen($root);
99        if (!strncmp($cpath, $root, $root_len)) {
100          $cpath = substr($cpath, $root_len);
101          // The user can "svn cp /path/to/file@12345 x", which pulls a file out
102          // of version history at a specific revision. If we just use the path,
103          // we'll collide with possible changes to that path in the working
104          // copy below. In particular, "svn cp"-ing a path which no longer
105          // exists somewhere in the working copy and then adding that path
106          // gets us to the "origin change type" branches below with a
107          // TYPE_ADD state on the path. To avoid this, append the origin
108          // revision to the path so we'll necessarily generate a new change.
109          // TODO: In theory, you could have an '@' in your path and this could
110          // cause a collision, e.g. two files named 'f' and 'f@12345'. This is
111          // at least somewhat the user's fault, though.
112          if ($info['Copied From Rev']) {
113            if ($info['Copied From Rev'] != $info['Revision']) {
114              $cpath .= '@'.$info['Copied From Rev'];
115            }
116          }
117          $change->setOldPath($cpath);
118          $from[$path] = $cpath;
119        }
120      }
121
122      $type = $change->getType();
123      if (($type === ArcanistDiffChangeType::TYPE_MOVE_AWAY ||
124           $type === ArcanistDiffChangeType::TYPE_DELETE) &&
125          idx($info, 'Node Kind') === 'directory') {
126        $change->setFileType(ArcanistDiffChangeType::FILE_DIRECTORY);
127      }
128    }
129
130    foreach ($paths as $path => $status) {
131      $change = $this->buildChange($path);
132      if (empty($from[$path])) {
133        continue;
134      }
135
136      if (empty($this->changes[$from[$path]])) {
137        if ($change->getType() == ArcanistDiffChangeType::TYPE_COPY_HERE) {
138          // If the origin path wasn't changed (or isn't included in this diff)
139          // and we only copied it, don't generate a changeset for it. This
140          // keeps us out of trouble when we go to 'arc commit' and need to
141          // figure out which files should be included in the commit list.
142          continue;
143        }
144      }
145
146      $origin = $this->buildChange($from[$path]);
147      $origin->addAwayPath($change->getCurrentPath());
148
149      $type = $origin->getType();
150      switch ($type) {
151        case ArcanistDiffChangeType::TYPE_MULTICOPY:
152        case ArcanistDiffChangeType::TYPE_COPY_AWAY:
153        // "Add" is possible if you do some bizarre tricks with svn:ignore and
154        // "svn copy"'ing URLs straight from the repository; you can end up with
155        // a file that is a copy of itself. See T271.
156        case ArcanistDiffChangeType::TYPE_ADD:
157          break;
158        case ArcanistDiffChangeType::TYPE_DELETE:
159          $origin->setType(ArcanistDiffChangeType::TYPE_MOVE_AWAY);
160          break;
161        case ArcanistDiffChangeType::TYPE_MOVE_AWAY:
162          $origin->setType(ArcanistDiffChangeType::TYPE_MULTICOPY);
163          break;
164        case ArcanistDiffChangeType::TYPE_CHANGE:
165          $origin->setType(ArcanistDiffChangeType::TYPE_COPY_AWAY);
166          break;
167        default:
168          throw new Exception(pht('Bad origin state %s.', $type));
169      }
170
171      $type = $origin->getType();
172      switch ($type) {
173        case ArcanistDiffChangeType::TYPE_MULTICOPY:
174        case ArcanistDiffChangeType::TYPE_MOVE_AWAY:
175          $change->setType(ArcanistDiffChangeType::TYPE_MOVE_HERE);
176          break;
177        case ArcanistDiffChangeType::TYPE_ADD:
178        case ArcanistDiffChangeType::TYPE_COPY_AWAY:
179          $change->setType(ArcanistDiffChangeType::TYPE_COPY_HERE);
180          break;
181        default:
182          throw new Exception(pht('Bad origin state %s.', $type));
183      }
184    }
185
186    return $this->changes;
187  }
188
189  public function parseDiff($diff) {
190    if (!strlen(trim($diff))) {
191      throw new Exception(pht("Can't parse an empty diff!"));
192    }
193
194    // Detect `git-format-patch`, by looking for a "---" line somewhere in
195    // the file and then a footer with Git version number, which looks like
196    // this:
197    //
198    //   --
199    //   1.8.4.2
200    //
201    // Note that `git-format-patch` adds a space after the "--", but we don't
202    // require it when detecting patches, as trailing whitespace can easily be
203    // lost in transit.
204    $detect_patch = '/^---$.*^-- ?[\s\d.]+\z/ms';
205    $message = null;
206    if (preg_match($detect_patch, $diff)) {
207      list($message, $diff) = $this->stripGitFormatPatch($diff);
208    }
209
210    $this->didStartParse($diff);
211
212    // Strip off header comments. While `patch` allows comments anywhere in the
213    // file, `git apply` is more strict. We get these comments in `hg export`
214    // diffs, and Eclipse can also produce them.
215    $line = $this->getLineTrimmed();
216    while (preg_match('/^#/', $line)) {
217      $line = $this->nextLine();
218    }
219
220    if (strlen($message)) {
221      // If we found a message during pre-parse steps, add it to the resulting
222      // changes here.
223      $change = $this->buildChange(null)
224        ->setType(ArcanistDiffChangeType::TYPE_MESSAGE)
225        ->setMetadata('message', $message);
226    }
227
228    do {
229      $patterns = array(
230        // This is a normal SVN text change, probably from "svn diff".
231        '(?P<type>Index): (?P<cur>.+)',
232        // This is an SVN text change, probably from "svnlook diff".
233        '(?P<type>Modified|Added|Deleted|Copied): (?P<cur>.+)',
234        // This is an SVN property change, probably from "svn diff".
235        '(?P<type>Property changes on): (?P<cur>.+)',
236        // This is a git commit message, probably from "git show".
237        '(?P<type>commit) (?P<hash>[a-f0-9]+)(?: \(.*\))?',
238        // This is a git diff, probably from "git show" or "git diff".
239        // Note that the filenames may appear quoted.
240        '(?P<type>diff --git) (?P<oldnew>.*)',
241        // RCS Diff
242        '(?P<type>rcsdiff -u) (?P<oldnew>.*)',
243        // This is a unified diff, probably from "diff -u" or synthetic diffing.
244        '(?P<type>---) (?P<old>.+)\s+\d{4}-\d{2}-\d{2}.*',
245        '(?P<binary>Binary files|Files) '.
246          '(?P<old>.+)\s+\d{4}-\d{2}-\d{2} and '.
247          '(?P<new>.+)\s+\d{4}-\d{2}-\d{2} differ.*',
248        // This is a normal Mercurial text change, probably from "hg diff". It
249        // may have two "-r" blocks if it came from "hg diff -r x:y".
250        '(?P<type>diff -r) (?P<hgrev>[a-f0-9]+) (?:-r [a-f0-9]+ )?(?P<cur>.+)',
251      );
252
253      $line = $this->getLineTrimmed();
254      $match = null;
255      $ok = $this->tryMatchHeader($patterns, $line, $match);
256
257      $failed_parse = false;
258      if (!$ok && $this->isFirstNonEmptyLine()) {
259        // 'hg export' command creates so called "extended diff" that
260        // contains some meta information and comment at the beginning
261        // (isFirstNonEmptyLine() to check for beginning). Actual mercurial
262        // code detects where comment ends and unified diff starts by
263        // searching for "diff -r" or "diff --git" in the text.
264        $this->saveLine();
265        $line = $this->nextLineThatLooksLikeDiffStart();
266        if (!$this->tryMatchHeader($patterns, $line, $match)) {
267          // Restore line before guessing to display correct error.
268          $this->restoreLine();
269          $failed_parse = true;
270        }
271      } else if (!$ok) {
272        $failed_parse = true;
273      }
274
275      if ($failed_parse) {
276        $this->didFailParse(
277          pht(
278            "Expected a hunk header, like '%s' (svn), '%s' (svn properties), ".
279            "'%s' (git show), '%s' (git diff), '%s' (unified diff), or ".
280            "'%s' (hg diff or patch).",
281            'Index: /path/to/file.ext',
282            'Property changes on: /path/to/file.ext',
283            'commit 59bcc3ad6775562f845953cf01624225',
284            'diff --git',
285            '--- filename',
286            'diff -r'));
287      }
288
289      if (isset($match['type'])) {
290        if ($match['type'] == 'diff --git') {
291          $filename = self::extractGitCommonFilename($match['oldnew']);
292          if ($filename !== null) {
293            $match['old'] = $filename;
294            $match['cur'] = $filename;
295          }
296        }
297      }
298
299      $change = $this->buildChange(idx($match, 'cur'));
300
301      if (isset($match['old'])) {
302        $change->setOldPath($match['old']);
303      }
304
305      if (isset($match['hash'])) {
306        $change->setCommitHash($match['hash']);
307      }
308
309      if (isset($match['binary'])) {
310        $change->setFileType(ArcanistDiffChangeType::FILE_BINARY);
311        $line = $this->nextNonemptyLine();
312        continue;
313      }
314
315      $line = $this->nextLine();
316
317      switch ($match['type']) {
318        case 'Index':
319        case 'Modified':
320        case 'Added':
321        case 'Deleted':
322        case 'Copied':
323          $this->parseIndexHunk($change);
324          break;
325        case 'Property changes on':
326          $this->parsePropertyHunk($change);
327          break;
328        case 'diff --git':
329          $this->setIsGit(true);
330          $this->parseIndexHunk($change);
331          break;
332        case 'commit':
333          $this->setIsGit(true);
334          $this->parseCommitMessage($change);
335          break;
336        case '---':
337          $ok = preg_match(
338            '@^(?:\+\+\+) (.*)\s+\d{4}-\d{2}-\d{2}.*$@',
339            $line,
340            $match);
341          if (!$ok) {
342            $this->didFailParse(pht(
343              "Expected '%s' in unified diff.",
344              '+++ filename'));
345          }
346          $change->setCurrentPath($match[1]);
347          $line = $this->nextLine();
348          $this->parseChangeset($change);
349          break;
350        case 'diff -r':
351          $this->setIsMercurial(true);
352          $this->parseIndexHunk($change);
353          break;
354        case 'rcsdiff -u':
355          $this->isRCS = true;
356          $this->parseIndexHunk($change);
357          break;
358        default:
359          $this->didFailParse(pht('Unknown diff type.'));
360          break;
361      }
362    } while ($this->getLine() !== null);
363
364    $this->didFinishParse();
365
366    $this->loadSyntheticData();
367
368    return $this->changes;
369  }
370
371  protected function tryMatchHeader($patterns, $line, &$match) {
372    foreach ($patterns as $pattern) {
373      if (preg_match('@^'.$pattern.'$@', $line, $match)) {
374        return true;
375      }
376    }
377    return false;
378  }
379
380  protected function parseCommitMessage(ArcanistDiffChange $change) {
381    $change->setType(ArcanistDiffChangeType::TYPE_MESSAGE);
382
383    $message = array();
384
385    $line = $this->getLine();
386    if (preg_match('/^Merge: /', $line)) {
387      $this->nextLine();
388    }
389
390    $line = $this->getLine();
391    if (!preg_match('/^Author: /', $line)) {
392      $this->didFailParse(pht("Expected 'Author:'."));
393    }
394
395    $line = $this->nextLine();
396    if (!preg_match('/^Date: /', $line)) {
397      $this->didFailParse(pht("Expected 'Date:'."));
398    }
399
400    while (($line = $this->nextLineTrimmed()) !== null) {
401      if (strlen($line) && $line[0] != ' ') {
402        break;
403      }
404
405      // Strip leading spaces from Git commit messages. Note that empty lines
406      // are represented as just "\n"; don't touch those.
407      $message[] = preg_replace('/^    /', '', $this->getLine());
408    }
409
410    $message = rtrim(implode('', $message), "\r\n");
411    $change->setMetadata('message', $message);
412  }
413
414  /**
415   * Parse an SVN property change hunk. These hunks are ambiguous so just sort
416   * of try to get it mostly right. It's entirely possible to foil this parser
417   * (or any other parser) with a carefully constructed property change.
418   */
419  protected function parsePropertyHunk(ArcanistDiffChange $change) {
420    $line = $this->getLineTrimmed();
421    if (!preg_match('/^_+$/', $line)) {
422      $this->didFailParse(pht("Expected '%s'.", '______________________'));
423    }
424
425    $line = $this->nextLine();
426    while ($line !== null) {
427      $done = preg_match('/^(Index|Property changes on):/', $line);
428      if ($done) {
429        break;
430      }
431
432      // NOTE: Before 1.5, SVN uses "Name". At 1.5 and later, SVN uses
433      // "Modified", "Added" and "Deleted".
434
435      $matches = null;
436      $ok = preg_match(
437        '/^(Name|Modified|Added|Deleted): (.*)$/',
438        $line,
439        $matches);
440      if (!$ok) {
441        $this->didFailParse(
442          pht("Expected 'Name', 'Added', 'Deleted', or 'Modified'."));
443      }
444
445      $op = $matches[1];
446      $prop = $matches[2];
447
448      list($old, $new) = $this->parseSVNPropertyChange($op, $prop);
449
450      if ($old !== null) {
451        $change->setOldProperty($prop, $old);
452      }
453
454      if ($new !== null) {
455        $change->setNewProperty($prop, $new);
456      }
457
458      $line = $this->getLine();
459    }
460  }
461
462  private function parseSVNPropertyChange($op, $prop) {
463    $old = array();
464    $new = array();
465
466    $target = null;
467
468    $line = $this->nextLine();
469    $prop_index = 2;
470    while ($line !== null) {
471      $done = preg_match(
472        '/^(Modified|Added|Deleted|Index|Property changes on):/',
473        $line);
474      if ($done) {
475        break;
476      }
477      $trimline = ltrim($line);
478      if ($trimline && $trimline[0] == '#') {
479        // in svn1.7, a line like ## -0,0 +1 ## is put between the Added: line
480        // and the line with the property change. If we have such a line, we'll
481        // just ignore it (:
482        $line = $this->nextLine();
483        $prop_index = 1;
484        $trimline = ltrim($line);
485      }
486      if ($trimline && $trimline[0] == '+') {
487        if ($op == 'Deleted') {
488          $this->didFailParse(pht(
489            'Unexpected "%s" section in property deletion.',
490            '+'));
491        }
492        $target = 'new';
493        $line = substr($trimline, $prop_index);
494      } else if ($trimline && $trimline[0] == '-') {
495        if ($op == 'Added') {
496          $this->didFailParse(pht(
497            'Unexpected "%s" section in property addition.',
498            '-'));
499        }
500        $target = 'old';
501        $line = substr($trimline, $prop_index);
502      } else if (!strncmp($trimline, 'Merged', 6)) {
503        if ($op == 'Added') {
504          $target = 'new';
505        } else {
506          // These can appear on merges. No idea how to interpret this (unclear
507          // what the old / new values are) and it's of dubious usefulness so
508          // just throw it away until someone complains.
509          $target = null;
510        }
511        $line = $trimline;
512      }
513
514      if ($target == 'new') {
515        $new[] = $line;
516      } else if ($target == 'old') {
517        $old[] = $line;
518      }
519
520      $line = $this->nextLine();
521    }
522
523    $old = rtrim(implode('', $old));
524    $new = rtrim(implode('', $new));
525
526    if (!strlen($old)) {
527      $old = null;
528    }
529
530    if (!strlen($new)) {
531      $new = null;
532    }
533
534    return array($old, $new);
535  }
536
537  protected function setIsGit($git) {
538    if ($this->isGit !== null && $this->isGit != $git) {
539      throw new Exception(pht('Git status has changed!'));
540    }
541    $this->isGit = $git;
542    return $this;
543  }
544
  /**
   * Get the value recorded by @{method:setIsGit}.
   *
   * @return bool|null The recorded value, or `null` if none has been
   *   recorded.
   */
  protected function getIsGit() {
    return $this->isGit;
  }
548
  /**
   * Record whether the diff being parsed is a Mercurial diff.
   *
   * @param bool Whether the diff is a Mercurial diff.
   * @return this
   */
  public function setIsMercurial($is_mercurial) {
    $this->isMercurial = $is_mercurial;
    return $this;
  }
553
  /**
   * Get the value recorded by @{method:setIsMercurial}.
   *
   * @return bool|null The recorded value, or `null` if none has been
   *   recorded.
   */
  public function getIsMercurial() {
    return $this->isMercurial;
  }
557
558  protected function parseIndexHunk(ArcanistDiffChange $change) {
559    $is_git = $this->getIsGit();
560    $is_mercurial = $this->getIsMercurial();
561    $is_svn = (!$is_git && !$is_mercurial);
562
563    $move_source = null;
564
565    $line = $this->getLine();
566    if ($is_git) {
567      do {
568
569        $patterns = array(
570          '(?P<new>new) file mode (?P<newmode>\d+)',
571          '(?P<deleted>deleted) file mode (?P<oldmode>\d+)',
572          // These occur when someone uses `chmod` on a file.
573          'old mode (?P<oldmode>\d+)',
574          'new mode (?P<newmode>\d+)',
575          // These occur when you `mv` a file and git figures it out.
576          'similarity index ',
577          'rename from (?P<old>.*)',
578          '(?P<move>rename) to (?P<cur>.*)',
579          'copy from (?P<old>.*)',
580          '(?P<copy>copy) to (?P<cur>.*)',
581        );
582
583        $ok = false;
584        $match = null;
585        foreach ($patterns as $pattern) {
586          $ok = preg_match('@^'.$pattern.'@', $line, $match);
587          if ($ok) {
588            break;
589          }
590        }
591
592        if (!$ok) {
593          if ($line === null ||
594              preg_match('/^(diff --git|commit) /', $line)) {
595            // In this case, there are ONLY file mode changes, or this is a
596            // pure move. If it's a move, flag these changesets so we can build
597            // synthetic changes later, enabling us to show file contents in
598            // Differential -- git only gives us a block like this:
599            //
600            //   diff --git a/README b/READYOU
601            //   similarity index 100%
602            //   rename from README
603            //   rename to READYOU
604            //
605            // ...i.e., there is no associated diff.
606
607            // This allows us to distinguish between property changes only
608            // and actual moves. For property changes only, we can't currently
609            // build a synthetic diff correctly, so just skip it.
610            // TODO: Build synthetic diffs for property changes, too.
611            if ($change->getType() != ArcanistDiffChangeType::TYPE_CHANGE) {
612              $change->setNeedsSyntheticGitHunks(true);
613              if ($move_source) {
614                $move_source->setNeedsSyntheticGitHunks(true);
615              }
616            }
617            return;
618          }
619          break;
620        }
621
622        if (!empty($match['oldmode'])) {
623          $change->setOldProperty('unix:filemode', $match['oldmode']);
624        }
625        if (!empty($match['newmode'])) {
626          $change->setNewProperty('unix:filemode', $match['newmode']);
627        }
628
629        if (!empty($match['deleted'])) {
630          $change->setType(ArcanistDiffChangeType::TYPE_DELETE);
631        }
632
633        if (!empty($match['new'])) {
634          // If you replace a symlink with a normal file, git renders the change
635          // as a "delete" of the symlink plus an "add" of the new file. We
636          // prefer to represent this as a change.
637          if ($change->getType() == ArcanistDiffChangeType::TYPE_DELETE) {
638            $change->setType(ArcanistDiffChangeType::TYPE_CHANGE);
639          } else {
640            $change->setType(ArcanistDiffChangeType::TYPE_ADD);
641          }
642        }
643
644        if (!empty($match['old'])) {
645          $match['old'] = self::unescapeFilename($match['old']);
646          $change->setOldPath($match['old']);
647        }
648
649        if (!empty($match['cur'])) {
650          $match['cur'] = self::unescapeFilename($match['cur']);
651          $change->setCurrentPath($match['cur']);
652        }
653
654        if (!empty($match['copy'])) {
655          $change->setType(ArcanistDiffChangeType::TYPE_COPY_HERE);
656          $old = $this->buildChange($change->getOldPath());
657          $type = $old->getType();
658
659          if ($type == ArcanistDiffChangeType::TYPE_MOVE_AWAY) {
660            $old->setType(ArcanistDiffChangeType::TYPE_MULTICOPY);
661          } else {
662            $old->setType(ArcanistDiffChangeType::TYPE_COPY_AWAY);
663          }
664
665          $old->addAwayPath($change->getCurrentPath());
666        }
667
668        if (!empty($match['move'])) {
669          $change->setType(ArcanistDiffChangeType::TYPE_MOVE_HERE);
670          $old = $this->buildChange($change->getOldPath());
671          $type = $old->getType();
672
673          if ($type == ArcanistDiffChangeType::TYPE_MULTICOPY) {
674            // Great, no change.
675          } else if ($type == ArcanistDiffChangeType::TYPE_MOVE_AWAY) {
676            $old->setType(ArcanistDiffChangeType::TYPE_MULTICOPY);
677          } else if ($type == ArcanistDiffChangeType::TYPE_COPY_AWAY) {
678            $old->setType(ArcanistDiffChangeType::TYPE_MULTICOPY);
679          } else {
680            $old->setType(ArcanistDiffChangeType::TYPE_MOVE_AWAY);
681          }
682
683          // We'll reference this above.
684          $move_source = $old;
685
686          $old->addAwayPath($change->getCurrentPath());
687        }
688
689        $line = $this->nextNonemptyLine();
690      } while (true);
691    }
692
693    $line = $this->getLine();
694
695    if ($is_svn) {
696      $ok = preg_match('/^=+\s*$/', $line);
697      if (!$ok) {
698        $this->didFailParse(pht(
699          "Expected '%s' divider line.",
700          '======================='));
701      } else {
702        // Adding an empty file in SVN can produce an empty line here.
703        $line = $this->nextNonemptyLine();
704      }
705    } else if ($is_git) {
706      $ok = preg_match('/^index .*$/', $line);
707      if (!$ok) {
708        // TODO: "hg diff -g" diffs ("mercurial git-style diffs") do not include
709        // this line, so we can't parse them if we fail on it. Maybe introduce
710        // a flag saying "parse this diff using relaxed git-style diff rules"?
711
712        // $this->didFailParse("Expected 'index af23f...a98bc' header line.");
713      } else {
714        // NOTE: In the git case, where this patch is the last change in the
715        // file, we may have a final terminal newline. Skip over it so that
716        // we'll hit the '$line === null' block below. This is covered by the
717        // 'git-empty-file.gitdiff' test case.
718        $line = $this->nextNonemptyLine();
719      }
720    }
721
722    // If there are files with only whitespace changes and -b or -w are
723    // supplied as command-line flags to `diff', svn and git both produce
724    // changes without any body.
725    if ($line === null ||
726        preg_match(
727          '/^(Index:|Property changes on:|diff --git|commit) /',
728          $line)) {
729      return;
730    }
731
732    $is_binary_add = preg_match(
733      '/^Cannot display: file marked as a binary type\.$/',
734      rtrim($line));
735    if ($is_binary_add) {
736      $this->nextLine(); // Cannot display: file marked as a binary type.
737      $this->nextNonemptyLine(); // svn:mime-type = application/octet-stream
738      $this->markBinary($change);
739      return;
740    }
741
742    // We can get this in git, or in SVN when a file exists in the repository
743    // WITHOUT a binary mime-type and is changed and given a binary mime-type.
744    $is_binary_diff = preg_match(
745      '/^(Binary files|Files) .* and .* differ$/',
746      rtrim($line));
747    if ($is_binary_diff) {
748      $this->nextNonemptyLine(); // Binary files x and y differ
749      $this->markBinary($change);
750      return;
751    }
752
753    // This occurs under "hg diff --git" when a binary file is removed. See
754    // test case "hg-binary-delete.hgdiff". (I believe it never occurs under
755    // git, which reports the "files X and /dev/null differ" string above. Git
756    // can not apply these patches.)
757    $is_hg_binary_delete = preg_match(
758      '/^Binary file .* has changed$/',
759      rtrim($line));
760    if ($is_hg_binary_delete) {
761      $this->nextNonemptyLine();
762      $this->markBinary($change);
763      return;
764    }
765
766    // With "git diff --binary" (not a normal mode, but one users may explicitly
767    // invoke and then, e.g., copy-paste into the web console) or "hg diff
768    // --git" (normal under hg workflows), we may encounter a literal binary
769    // patch.
770    $is_git_binary_patch = preg_match(
771      '/^GIT binary patch$/',
772      rtrim($line));
773    if ($is_git_binary_patch) {
774      $this->nextLine();
775      $this->parseGitBinaryPatch();
776      $line = $this->getLine();
777      if (preg_match('/^literal/', $line)) {
778        // We may have old/new binaries (change) or just a new binary (hg add).
779        // If there are two blocks, parse both.
780        $this->parseGitBinaryPatch();
781      }
782      $this->markBinary($change);
783      return;
784    }
785
786    if ($is_git) {
787      // "git diff -b" ignores whitespace, but has an empty hunk target
788      if (preg_match('@^diff --git .*$@', $line)) {
789        $this->nextLine();
790        return null;
791      }
792    }
793
794    if ($this->isRCS) {
795      // Skip the RCS headers.
796      $this->nextLine();
797      $this->nextLine();
798      $this->nextLine();
799    }
800
801    $old_file = $this->parseHunkTarget();
802    $new_file = $this->parseHunkTarget();
803
804    if ($this->isRCS) {
805      $change->setCurrentPath($new_file);
806    }
807
808    $change->setOldPath($old_file);
809
810    $this->parseChangeset($change);
811  }
812
813  private function parseGitBinaryPatch() {
814
815    // TODO: We could decode the patches, but it's a giant mess so don't bother
816    // for now. We'll pick up the data from the working copy in the common
817    // case ("arc diff").
818
819    $line = $this->getLine();
820    if (!preg_match('/^literal /', $line)) {
821      $this->didFailParse(
822        pht("Expected '%s' to start git binary patch.", 'literal NNNN'));
823    }
824    do {
825      $line = $this->nextLineTrimmed();
826      if ($line === '' || $line === null) {
827        // Some versions of Mercurial apparently omit the terminal newline,
828        // although it's unclear if Git will ever do this. In either case,
829        // rely on the base85 check for sanity.
830        $this->nextNonemptyLine();
831        return;
832      } else if (!preg_match('/^[a-zA-Z]/', $line)) {
833        $this->didFailParse(
834          pht('Expected base85 line length character (a-zA-Z).'));
835      }
836    } while (true);
837  }
838
839  protected function parseHunkTarget() {
840    $line = $this->getLine();
841    $matches = null;
842
843    $remainder = '(?:\s*\(.*\))?';
844    if ($this->getIsMercurial()) {
845      // Something like "Fri Aug 26 01:20:50 2005 -0700", don't bother trying
846      // to parse it.
847      $remainder = '\t.*';
848    } else if ($this->isRCS) {
849      $remainder = '\s.*';
850    } else if ($this->getIsGit()) {
851      // When filenames contain spaces, Git terminates this line with a tab.
852      // Normally, the tab is not present. If there's a tab, ignore it.
853      $remainder = '(?:\t.*)?';
854    }
855
856    $ok = preg_match(
857      '@^[-+]{3} (?:[ab]/)?(?P<path>.*?)'.$remainder.'$@',
858      $line,
859      $matches);
860
861    if (!$ok) {
862      $this->didFailParse(
863        pht(
864          "Expected hunk target '%s'.",
865          '+++ path/to/file.ext (revision N)'));
866    }
867
868    $this->nextLine();
869    return $matches['path'];
870  }
871
  /**
   * Mark a change as affecting a binary file by setting its file type to
   * `ArcanistDiffChangeType::FILE_BINARY`.
   *
   * @param ArcanistDiffChange Change to mark.
   * @return this
   */
  protected function markBinary(ArcanistDiffChange $change) {
    $change->setFileType(ArcanistDiffChangeType::FILE_BINARY);
    return $this;
  }
876
877  protected function parseChangeset(ArcanistDiffChange $change) {
878    // If a diff includes two sets of changes to the same file, let the
879    // second one win. In particular, this occurs when adding subdirectories
880    // in Subversion that contain files: the file text will be present in
881    // both the directory diff and the file diff. See T5555. Dropping the
882    // hunks lets whichever one shows up later win instead of showing changes
883    // twice.
884    $change->dropHunks();
885
886    $all_changes = array();
887    do {
888      $hunk = new ArcanistDiffHunk();
889      $line = $this->getLineTrimmed();
890      $real = array();
891
892      // In the case where only one line is changed, the length is omitted.
893      // The final group is for git, which appends a guess at the function
894      // context to the diff.
895      $matches = null;
896      $ok = preg_match(
897        '/^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(?: .*?)?$/U',
898        $line,
899        $matches);
900
901      if (!$ok) {
902        // It's possible we hit the style of an svn1.7 property change.
903        // This is a 4-line Index block, followed by an empty line, followed
904        // by a "Property changes on:" section similar to svn1.6.
905        if ($line == '') {
906          $line = $this->nextNonemptyLine();
907          $ok = preg_match('/^Property changes on:/', $line);
908          if (!$ok) {
909            $this->didFailParse(pht('Confused by empty line'));
910          }
911          $line = $this->nextLine();
912          return $this->parsePropertyHunk($change);
913        }
914        $this->didFailParse(pht(
915          "Expected hunk header '%s'.",
916          '@@ -NN,NN +NN,NN @@'));
917      }
918
919      $hunk->setOldOffset($matches[1]);
920      $hunk->setNewOffset($matches[3]);
921
922      // Cover for the cases where length wasn't present (implying one line).
923      $old_len = idx($matches, 2);
924      if (!strlen($old_len)) {
925        $old_len = 1;
926      }
927      $new_len = idx($matches, 4);
928      if (!strlen($new_len)) {
929        $new_len = 1;
930      }
931
932      $hunk->setOldLength($old_len);
933      $hunk->setNewLength($new_len);
934
935      $add = 0;
936      $del = 0;
937
938      $hit_next_hunk = false;
939      while ((($line = $this->nextLine()) !== null)) {
940        if (strlen(rtrim($line, "\r\n"))) {
941          $char = $line[0];
942        } else {
943          // Normally, we do not encouter empty lines in diffs, because
944          // unchanged lines have an initial space. However, in Git, with
945          // the option `diff.suppress-blank-empty` set, unchanged blank lines
946          // emit as completely empty. If we encounter a completely empty line,
947          // treat it as a ' ' (i.e., unchanged empty line) line.
948          $char = ' ';
949        }
950        switch ($char) {
951          case '\\':
952            if (!preg_match('@\\ No newline at end of file@', $line)) {
953              $this->didFailParse(
954                pht("Expected '\ No newline at end of file'."));
955            }
956            if ($new_len) {
957              $real[] = $line;
958              $hunk->setIsMissingOldNewline(true);
959            } else {
960              $real[] = $line;
961              $hunk->setIsMissingNewNewline(true);
962            }
963            if (!$new_len) {
964              break 2;
965            }
966            break;
967          case '+':
968            ++$add;
969            --$new_len;
970            $real[] = $line;
971            break;
972          case '-':
973            if (!$old_len) {
974              // In this case, we've hit "---" from a new file. So don't
975              // advance the line cursor.
976              $hit_next_hunk = true;
977              break 2;
978            }
979            ++$del;
980            --$old_len;
981            $real[] = $line;
982            break;
983          case ' ':
984            if (!$old_len && !$new_len) {
985              break 2;
986            }
987            --$old_len;
988            --$new_len;
989            $real[] = $line;
990            break;
991          default:
992            // We hit something, likely another hunk.
993            $hit_next_hunk = true;
994            break 2;
995        }
996      }
997
998      if ($old_len || $new_len) {
999        $this->didFailParse(pht('Found the wrong number of hunk lines.'));
1000      }
1001
1002      $corpus = implode('', $real);
1003
1004      $is_binary = false;
1005      if ($this->detectBinaryFiles) {
1006        $is_binary = !phutil_is_utf8($corpus);
1007        $try_encoding = $this->tryEncoding;
1008
1009        if ($is_binary && $try_encoding) {
1010          $is_binary = ArcanistDiffUtils::isHeuristicBinaryFile($corpus);
1011          if (!$is_binary) {
1012            $corpus = phutil_utf8_convert($corpus, 'UTF-8', $try_encoding);
1013            if (!phutil_is_utf8($corpus)) {
1014              throw new Exception(
1015                pht(
1016                  "Failed to convert a hunk from '%s' to UTF-8. ".
1017                  "Check that the specified encoding is correct.",
1018                  $try_encoding));
1019            }
1020          }
1021        }
1022
1023      }
1024
1025      if ($is_binary) {
1026        // SVN happily treats binary files which aren't marked with the right
1027        // mime type as text files. Detect that junk here and mark the file
1028        // binary. We'll catch stuff with unicode too, but that's verboten
1029        // anyway. If there are too many false positives with this we might
1030        // need to make it threshold-triggered instead of triggering on any
1031        // unprintable byte.
1032        $change->setFileType(ArcanistDiffChangeType::FILE_BINARY);
1033      } else {
1034        $hunk->setCorpus($corpus);
1035        $hunk->setAddLines($add);
1036        $hunk->setDelLines($del);
1037        $change->addHunk($hunk);
1038      }
1039
1040      if (!$hit_next_hunk) {
1041        $line = $this->nextNonemptyLine();
1042      }
1043
1044    } while (preg_match('/^@@ /', $line));
1045  }
1046
1047  protected function buildChange($path = null) {
1048    $change = null;
1049    if ($path !== null) {
1050      if (!empty($this->changes[$path])) {
1051        return $this->changes[$path];
1052      }
1053    }
1054
1055    if ($this->forcePath) {
1056      return $this->changes[$this->forcePath];
1057    }
1058
1059    $change = new ArcanistDiffChange();
1060    if ($path !== null) {
1061      $change->setCurrentPath($path);
1062      $this->changes[$path] = $change;
1063    } else {
1064      $this->changes[] = $change;
1065    }
1066
1067    return $change;
1068  }
1069
1070  protected function didStartParse($text) {
1071    $this->rawDiff = $text;
1072
1073    // Eat leading whitespace. This may happen if the first change in the diff
1074    // is an SVN property change.
1075    $text = ltrim($text);
1076
1077    // Try to strip ANSI color codes from colorized diffs. ANSI color codes
1078    // might be present in two cases:
1079    //
1080    //   - You piped a colorized diff into 'arc --raw' or similar (normally
1081    //     we're able to disable colorization on diffs we control the generation
1082    //     of).
1083    //   - You're diffing a file which actually contains ANSI color codes.
1084    //
1085    // The former is vastly more likely, but we try to distinguish between the
1086    // two cases by testing for a color code at the beginning of a line. If
1087    // we find one, we know it's a colorized diff (since the beginning of the
1088    // line should be "+", "-" or " " if the code is in the diff text).
1089    //
1090    // While it's possible a diff might be colorized and fail this test, it's
1091    // unlikely, and it covers hg's color extension which seems to be the most
1092    // stubborn about colorizing text despite stdout not being a TTY.
1093    //
1094    // We might incorrectly strip color codes from a colorized diff of a text
1095    // file with color codes inside it, but this case is stupid and pathological
1096    // and you've dug your own grave.
1097
1098    $ansi_color_pattern = '\x1B\[[\d;]*m';
1099    if (preg_match('/^'.$ansi_color_pattern.'/m', $text)) {
1100      $text = preg_replace('/'.$ansi_color_pattern.'/', '', $text);
1101    }
1102
1103    $this->text = phutil_split_lines($text);
1104    $this->line = 0;
1105  }
1106
1107  protected function getLine() {
1108    if ($this->text === null) {
1109      throw new Exception(pht('Not parsing!'));
1110    }
1111    if (isset($this->text[$this->line])) {
1112      return $this->text[$this->line];
1113    }
1114    return null;
1115  }
1116
1117  protected function getLineTrimmed() {
1118    $line = $this->getLine();
1119    if ($line !== null) {
1120      $line = trim($line, "\r\n");
1121    }
1122    return $line;
1123  }
1124
1125  protected function nextLine() {
1126    $this->line++;
1127    return $this->getLine();
1128  }
1129
1130  protected function nextLineTrimmed() {
1131    $line = $this->nextLine();
1132    if ($line !== null) {
1133      $line = trim($line, "\r\n");
1134    }
1135    return $line;
1136  }
1137
1138  protected function nextNonemptyLine() {
1139    while (($line = $this->nextLine()) !== null) {
1140      if (strlen(trim($line)) !== 0) {
1141        break;
1142      }
1143    }
1144    return $this->getLine();
1145  }
1146
1147  protected function nextLineThatLooksLikeDiffStart() {
1148    while (($line = $this->nextLine()) !== null) {
1149      if (preg_match('/^\s*diff\s+-(?:r|-git)/', $line)) {
1150        break;
1151      }
1152    }
1153    return $this->getLine();
1154  }
1155
1156  protected function saveLine() {
1157    $this->lineSaved = $this->line;
1158  }
1159
1160  protected function restoreLine() {
1161    $this->line = $this->lineSaved;
1162  }
1163
1164  protected function isFirstNonEmptyLine() {
1165    $len = count($this->text);
1166    for ($ii = 0; $ii < $len; $ii++) {
1167      $line = $this->text[$ii];
1168
1169      if (!strlen(trim($line))) {
1170        // This line is empty, skip it.
1171        continue;
1172      }
1173
1174      if (preg_match('/^#/', $line)) {
1175        // This line is a comment, skip it.
1176        continue;
1177      }
1178
1179      return ($ii == $this->line);
1180    }
1181
1182    // Entire file is empty.
1183    return false;
1184  }
1185
1186  protected function didFinishParse() {
1187    $this->text = null;
1188  }
1189
1190  public function setWriteDiffOnFailure($write) {
1191    $this->writeDiffOnFailure = $write;
1192    return $this;
1193  }
1194
1195  protected function didFailParse($message) {
1196    $context = 5;
1197    $min = max(0, $this->line - $context);
1198    $max = min($this->line + $context, count($this->text) - 1);
1199
1200    $context = '';
1201    for ($ii = $min; $ii <= $max; $ii++) {
1202      $context .= sprintf(
1203        '%8.8s %6.6s   %s',
1204        ($ii == $this->line) ? '>>>  ' : '',
1205        $ii + 1,
1206        $this->text[$ii]);
1207    }
1208
1209    $out = array();
1210    $out[] = pht('Diff Parse Exception: %s', $message);
1211
1212    if ($this->writeDiffOnFailure) {
1213      $temp = new TempFile();
1214      $temp->setPreserveFile(true);
1215
1216      Filesystem::writeFile($temp, $this->rawDiff);
1217      $out[] = pht('Raw input file was written to: %s', $temp);
1218    }
1219
1220    $out[] = $context;
1221    $out = implode("\n\n", $out);
1222
1223    throw new Exception($out);
1224  }
1225
1226  /**
1227   * Unescape escaped filenames, e.g. from "git diff".
1228   */
1229  private static function unescapeFilename($name) {
1230    if (preg_match('/^".+"$/', $name)) {
1231      return stripcslashes(substr($name, 1, -1));
1232    } else {
1233      return $name;
1234    }
1235  }
1236
1237  private function loadSyntheticData() {
1238    if (!$this->changes) {
1239      return;
1240    }
1241
1242    $repository_api = $this->repositoryAPI;
1243    if (!$repository_api) {
1244      return;
1245    }
1246
1247    $imagechanges = array();
1248
1249    $changes = $this->changes;
1250    foreach ($changes as $change) {
1251      $path = $change->getCurrentPath();
1252
1253      // Certain types of changes (moves and copies) don't contain change data
1254      // when expressed in raw "git diff" form. Augment any such diffs with
1255      // textual data.
1256      if ($change->getNeedsSyntheticGitHunks() &&
1257          ($repository_api instanceof ArcanistGitAPI)) {
1258        $diff = $repository_api->getRawDiffText($path, $moves = false);
1259
1260        // NOTE: We're reusing the parser and it doesn't reset change state
1261        // between parses because there's an oddball SVN workflow in Phabricator
1262        // which relies on being able to inject changes.
1263        // TODO: Fix this.
1264        $parser = clone $this;
1265        $parser->setChanges(array());
1266        $raw_changes = $parser->parseDiff($diff);
1267
1268        foreach ($raw_changes as $raw_change) {
1269          if ($raw_change->getCurrentPath() == $path) {
1270            $change->setFileType($raw_change->getFileType());
1271            foreach ($raw_change->getHunks() as $hunk) {
1272              // Git thinks that this file has been added. But we know that it
1273              // has been moved or copied without a change.
1274              $hunk->setCorpus(
1275                preg_replace('/^\+/m', ' ', $hunk->getCorpus()));
1276              $change->addHunk($hunk);
1277            }
1278            break;
1279          }
1280        }
1281
1282        $change->setNeedsSyntheticGitHunks(false);
1283      }
1284
1285      if ($change->getFileType() != ArcanistDiffChangeType::FILE_BINARY &&
1286          $change->getFileType() != ArcanistDiffChangeType::FILE_IMAGE) {
1287        continue;
1288      }
1289
1290      $imagechanges[$path] = $change;
1291    }
1292
1293    // Fetch the actual file contents in batches so repositories
1294    // that have slow random file accesses (i.e. mercurial) can
1295    // optimize the retrieval.
1296    $paths = array_keys($imagechanges);
1297
1298    $filedata = $repository_api->getBulkOriginalFileData($paths);
1299    foreach ($filedata as $path => $data) {
1300      $imagechanges[$path]->setOriginalFileData($data);
1301    }
1302
1303    $filedata = $repository_api->getBulkCurrentFileData($paths);
1304    foreach ($filedata as $path => $data) {
1305      $imagechanges[$path]->setCurrentFileData($data);
1306    }
1307
1308    $this->changes = $changes;
1309  }
1310
1311
1312  /**
1313   * Extracts the common filename from two strings with differing path
1314   * prefixes as found after `diff --git`.  These strings may be
1315   * quoted; if so, the filename is returned unescaped.  The prefixes
1316   * default to "a/" and "b/", but may be any string -- or may be
1317   * entierly absent.  This function may return "null" if the hunk
1318   * represents a file move or copy, and with pathological renames may
1319   * return an incorrect value.  Such cases are expected to be
1320   * recovered by later rename detection codepaths.
1321   *
1322   * @param string Text from a diff line after "diff --git ".
1323   * @return string Filename being altered, or null for a rename.
1324   */
1325  public static function extractGitCommonFilename($paths) {
1326    $matches = null;
1327    $paths = rtrim($paths, "\r\n");
1328
1329    // Try the exact same string twice in a row separated by a
1330    // space, with an optional prefix.  This can hit a false
1331    // positive for moves from files like "old file old" to "file",
1332    // but such a cases will be caught by the "rename from" /
1333    // "rename to" lines.
1334    $prefix = '(?:[^/]+/)?';
1335    $pattern =
1336             "@^(?P<old>(?P<oldq>\"?){$prefix}(?P<common>.+)\\k<oldq>)"
1337             ." "
1338             ."(?P<new>(?P<newq>\"?){$prefix}\\k<common>\\k<newq>)$@";
1339
1340    if (!preg_match($pattern, $paths, $matches)) {
1341      // A rename or some form; return null for now, and let the
1342      // "rename from" / "rename to" lines fix it up.
1343      return null;
1344    }
1345
1346    // Use the common subpart.  There may be ambiguity here: "src/file
1347    // dst/file" may _either_ be a prefix-less move, or a change with
1348    // two custom prefixes.  We assume it is the latter; if it is a
1349    // rename, diff parsing will update based on the "rename from" /
1350    // "rename to" lines.
1351
1352    // This re-assembles with the differing prefixes removed, but the
1353    // quoting from the original.  Necessary so we know if we should
1354    // unescape characters from the common string.
1355    $new = $matches['newq'].$matches['common'].$matches['newq'];
1356    $new = self::unescapeFilename($new);
1357
1358    return $new;
1359  }
1360
1361
1362  /**
1363   * Strip the header and footer off a `git-format-patch` diff.
1364   *
1365   * Returns a parseable normal diff and a textual commit message.
1366   */
1367  private function stripGitFormatPatch($diff) {
1368    // We can parse this by splitting it into two pieces over and over again
1369    // along different section dividers:
1370    //
1371    //   1. Mail headers.
1372    //   2. ("\n\n")
1373    //   3. Mail body.
1374    //   4. ("---")
1375    //   5. Diff stat section.
1376    //   6. ("\n\n")
1377    //   7. Actual diff body.
1378    //   8. ("--")
1379    //   9. Patch footer.
1380
1381    list($head, $tail) = preg_split('/^---$/m', $diff, 2);
1382    list($mail_headers, $mail_body) = explode("\n\n", $head, 2);
1383    list($body, $foot) = preg_split('/^-- ?$/m', $tail, 2);
1384    list($stat, $diff) = explode("\n\n", $body, 2);
1385
1386    // Rebuild the commit message by putting the subject line back on top of it,
1387    // if we can find one.
1388    $matches = null;
1389    $pattern = '/^Subject: (?:\[PATCH\] )?(.*)$/mi';
1390    if (preg_match($pattern, $mail_headers, $matches)) {
1391      $mail_body = $matches[1]."\n\n".$mail_body;
1392      $mail_body = rtrim($mail_body);
1393    }
1394
1395    return array($mail_body, $diff);
1396  }
1397
1398}
1399