1#!/usr/local/bin/python3.8
2
3"""
4git-filter-repo filters git repositories, similar to git filter-branch, BFG
5repo cleaner, and others.  The basic idea is that it works by running
6   git fast-export <options> | filter | git fast-import <options>
7where this program not only launches the whole pipeline but also serves as
8the 'filter' in the middle.  It does a few additional things on top as well
9in order to make it into a well-rounded filtering tool.
10
11git-filter-repo can also be used as a library for more involved filtering
12operations; however:
13  ***** API BACKWARD COMPATIBILITY CAVEAT *****
14  Programs using git-filter-repo as a library can reach pretty far into its
15  internals, but I am not prepared to guarantee backward compatibility of
16  all APIs.  I suspect changes will be rare, but I reserve the right to
17  change any API.  Since it is assumed that repository filtering is
18  something one would do very rarely, and in particular that it's a
19  one-shot operation, this should not be a problem in practice for anyone.
20  However, if you want to re-use a program you have written that uses
21  git-filter-repo as a library (or makes use of one of its --*-callback
22  arguments), you should either make sure you are using the same version of
23  git and git-filter-repo, or make sure to re-test it.
24
25  If there are particular pieces of the API you are concerned about, and
26  there is not already a testcase for it in t9391-lib-usage.sh or
27  t9392-python-callback.sh, please contribute a testcase.  That will not
28  prevent me from changing the API, but it will allow you to look at the
29  history of a testcase to see whether and how the API changed.
30  ***** END API BACKWARD COMPATIBILITY CAVEAT *****
31"""
32
33import argparse
34import collections
35import fnmatch
36import gettext
37import io
38import os
39import platform
40import re
41import shutil
42import subprocess
43import sys
44import time
45import textwrap
46
47from datetime import tzinfo, timedelta, datetime
48
49__all__ = ["Blob", "Reset", "FileChange", "Commit", "Tag", "Progress",
50           "Checkpoint", "FastExportParser", "ProgressWriter",
51           "string_to_date", "date_to_string",
52           "record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"]
53
54deleted_hash = b'0'*40
55write_marks = True
56date_format_permissive = True
57
58def gettext_poison(msg):
59  if "GIT_TEST_GETTEXT_POISON" in os.environ: # pragma: no cover
60    return "# GETTEXT POISON #"
61  return gettext.gettext(msg)
62
63_ = gettext_poison
64
65def setup_gettext():
66  TEXTDOMAIN="git-filter-repo"
67  podir = os.environ.get("GIT_TEXTDOMAINDIR") or "@@LOCALEDIR@@"
68  if not os.path.isdir(podir): # pragma: no cover
69    podir = None  # Python has its own fallback; use that
70
71  ## This looks like the most straightforward translation of the relevant
72  ## code in git.git:gettext.c and git.git:perl/Git/I18n.pm:
73  #import locale
74  #locale.setlocale(locale.LC_MESSAGES, "");
75  #locale.setlocale(locale.LC_TIME, "");
76  #locale.textdomain(TEXTDOMAIN);
77  #locale.bindtextdomain(TEXTDOMAIN, podir);
78  ## but the python docs suggest using the gettext module (which doesn't
79  ## have setlocale()) instead, so:
80  gettext.textdomain(TEXTDOMAIN);
81  gettext.bindtextdomain(TEXTDOMAIN, podir);
82
83def _timedelta_to_seconds(delta):
84  """
85  Converts timedelta to seconds
86  """
87  offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000
88  return round(offset)
89
90class FixedTimeZone(tzinfo):
91  """
92  Fixed offset in minutes east from UTC.
93  """
94
95  tz_re = re.compile(br'^([-+]?)(\d\d)(\d\d)$')
96
97  def __init__(self, offset_string):
98    tzinfo.__init__(self)
99    sign, hh, mm = FixedTimeZone.tz_re.match(offset_string).groups()
100    factor = -1 if (sign and sign == b'-') else 1
101    self._offset = timedelta(minutes = factor*(60*int(hh) + int(mm)))
102    self._offset_string = offset_string
103
104  def utcoffset(self, dt):
105    return self._offset
106
107  def tzname(self, dt):
108    return self._offset_string
109
110  def dst(self, dt):
111    return timedelta(0)
112
113def string_to_date(datestring):
114  (unix_timestamp, tz_offset) = datestring.split()
115  return datetime.fromtimestamp(int(unix_timestamp),
116                                FixedTimeZone(tz_offset))
117
118def date_to_string(dateobj):
119  epoch = datetime.fromtimestamp(0, dateobj.tzinfo)
120  return(b'%d %s' % (int(_timedelta_to_seconds(dateobj - epoch)),
121                     dateobj.tzinfo.tzname(0)))
122
123def decode(bytestr):
124  'Try to convert bytestr to utf-8 for outputting as an error message.'
125  return bytestr.decode('utf-8', 'backslashreplace')
126
127def glob_to_regex(glob_bytestr):
128  'Translate glob_bytestr into a regex on bytestrings'
129
130  # fnmatch.translate is idiotic and won't accept bytestrings
131  if (decode(glob_bytestr).encode() != glob_bytestr): # pragma: no cover
132    raise SystemExit(_("Error: Cannot handle glob %s").format(glob_bytestr))
133
134  # Create regex operating on string
135  regex = fnmatch.translate(decode(glob_bytestr))
136
137  # FIXME: This is an ugly hack...
138  # fnmatch.translate tries to do multi-line matching and wants the glob to
139  # match up to the end of the input, which isn't relevant for us, so we
140  # have to modify the regex.  fnmatch.translate has used different regex
141  # constructs to achieve this with different python versions, so we have
142  # to check for each of them and then fix it up.  It would be much better
143  # if fnmatch.translate could just take some flags to allow us to specify
144  # what we want rather than employing this hackery, but since it
145  # doesn't...
146  if regex.endswith(r'\Z(?ms)'): # pragma: no cover
147    regex = regex[0:-7]
148  elif regex.startswith(r'(?s:') and regex.endswith(r')\Z'): # pragma: no cover
149    regex = regex[4:-3]
150
151  # Finally, convert back to regex operating on bytestr
152  return regex.encode()
153
154class PathQuoting:
155  _unescape = {b'a': b'\a',
156               b'b': b'\b',
157               b'f': b'\f',
158               b'n': b'\n',
159               b'r': b'\r',
160               b't': b'\t',
161               b'v': b'\v',
162               b'"': b'"',
163               b'\\':b'\\'}
164  _unescape_re = re.compile(br'\\([a-z"\\]|[0-9]{3})')
165  _escape = [bytes([x]) for x in range(127)]+[
166             b'\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)]
167  _reverse = dict(map(reversed, _unescape.items()))
168  for x in _reverse:
169    _escape[ord(x)] = b'\\'+_reverse[x]
170  _special_chars = [len(x) > 1 for x in _escape]
171
172  @staticmethod
173  def unescape_sequence(orig):
174    seq = orig.group(1)
175    return PathQuoting._unescape[seq] if len(seq) == 1 else bytes([int(seq, 8)])
176
177  @staticmethod
178  def dequote(quoted_string):
179    if quoted_string.startswith(b'"'):
180      assert quoted_string.endswith(b'"')
181      return PathQuoting._unescape_re.sub(PathQuoting.unescape_sequence,
182                                          quoted_string[1:-1])
183    return quoted_string
184
185  @staticmethod
186  def enquote(unquoted_string):
187    # Option 1: Quoting when fast-export would:
188    #    pqsc = PathQuoting._special_chars
189    #    if any(pqsc[x] for x in set(unquoted_string)):
190    # Option 2, perf hack: do minimal amount of quoting required by fast-import
191    if unquoted_string.startswith(b'"') or b'\n' in unquoted_string:
192      pqe = PathQuoting._escape
193      return b'"' + b''.join(pqe[x] for x in unquoted_string) + b'"'
194    return unquoted_string
195
196class AncestryGraph(object):
197  """
198  A class that maintains a direct acycle graph of commits for the purpose of
199  determining if one commit is the ancestor of another.
200  """
201
202  def __init__(self):
203    self.cur_value = 0
204
205    # A mapping from the external identifers given to us to the simple integers
206    # we use in self.graph
207    self.value = {}
208
209    # A tuple of (depth, list-of-ancestors).  Values and keys in this graph are
210    # all integers from the self.value dict.  The depth of a commit is one more
211    # than the max depth of any of its ancestors.
212    self.graph = {}
213
214    # Cached results from previous calls to is_ancestor().
215    self._cached_is_ancestor = {}
216
217  def record_external_commits(self, external_commits):
218    """
219    Record in graph that each commit in external_commits exists, and is
220    treated as a root commit with no parents.
221    """
222    for c in external_commits:
223      if c not in self.value:
224        self.cur_value += 1
225        self.value[c] = self.cur_value
226        self.graph[self.cur_value] = (1, [])
227
228  def add_commit_and_parents(self, commit, parents):
229    """
230    Record in graph that commit has the given parents.  parents _MUST_ have
231    been first recorded.  commit _MUST_ not have been recorded yet.
232    """
233    assert all(p in self.value for p in parents)
234    assert commit not in self.value
235
236    # Get values for commit and parents
237    self.cur_value += 1
238    self.value[commit] = self.cur_value
239    graph_parents = [self.value[x] for x in parents]
240
241    # Determine depth for commit, then insert the info into the graph
242    depth = 1
243    if parents:
244      depth += max(self.graph[p][0] for p in graph_parents)
245    self.graph[self.cur_value] = (depth, graph_parents)
246
247  def is_ancestor(self, possible_ancestor, check):
248    """
249    Return whether possible_ancestor is an ancestor of check
250    """
251    a, b = self.value[possible_ancestor], self.value[check]
252    original_pair = (a,b)
253    a_depth = self.graph[a][0]
254    ancestors = [b]
255    visited = set()
256    while ancestors:
257      ancestor = ancestors.pop()
258      prev_pair = (a, ancestor)
259      if prev_pair in self._cached_is_ancestor:
260        if not self._cached_is_ancestor[prev_pair]:
261          continue
262        self._cached_is_ancestor[original_pair] = True
263        return True
264      if ancestor in visited:
265        continue
266      visited.add(ancestor)
267      depth, more_ancestors = self.graph[ancestor]
268      if ancestor == a:
269        self._cached_is_ancestor[original_pair] = True
270        return True
271      elif depth <= a_depth:
272        continue
273      ancestors.extend(more_ancestors)
274    self._cached_is_ancestor[original_pair] = False
275    return False
276
277class MailmapInfo(object):
278  def __init__(self, filename):
279    self.changes = {}
280    self._parse_file(filename)
281
282  def _parse_file(self, filename):
283    name_and_email_re = re.compile(br'(.*?)\s*<([^>]*)>\s*')
284    comment_re = re.compile(br'\s*#.*')
285    if not os.access(filename, os.R_OK):
286      raise SystemExit(_("Cannot read %s") % decode(filename))
287    with open(filename, 'br') as f:
288      count = 0
289      for line in f:
290        count += 1
291        err = "Unparseable mailmap file: line #{} is bad: {}".format(count, line)
292        # Remove comments
293        line = comment_re.sub(b'', line)
294        # Remove leading and trailing whitespace
295        line = line.strip()
296        if not line:
297          continue
298
299        m = name_and_email_re.match(line)
300        if not m:
301          raise SystemExit(err)
302        proper_name, proper_email = m.groups()
303        if len(line) == m.end():
304          self.changes[(None, proper_email)] = (proper_name, proper_email)
305          continue
306        rest = line[m.end():]
307        m = name_and_email_re.match(rest)
308        if m:
309          commit_name, commit_email = m.groups()
310          if len(rest) != m.end():
311            raise SystemExit(err)
312        else:
313          commit_name, commit_email = rest, None
314        self.changes[(commit_name, commit_email)] = (proper_name, proper_email)
315
316  def translate(self, name, email):
317    ''' Given a name and email, return the expected new name and email from the
318        mailmap if there is a translation rule for it, otherwise just return
319        the given name and email.'''
320    for old, new in self.changes.items():
321      old_name, old_email = old
322      new_name, new_email = new
323      if (not old_email or email.lower() == old_email.lower()) and (
324          name  == old_name or not old_name):
325        return (new_name or name, new_email or email)
326    return (name, email)
327
328class ProgressWriter(object):
329  def __init__(self):
330    self._last_progress_update = time.time()
331    self._last_message = None
332
333  def show(self, msg):
334    self._last_message = msg
335    now = time.time()
336    if now - self._last_progress_update > .1:
337      self._last_progress_update = now
338      sys.stdout.write("\r{}".format(msg))
339      sys.stdout.flush()
340
341  def finish(self):
342    self._last_progress_update = 0
343    if self._last_message:
344      self.show(self._last_message)
345    sys.stdout.write("\n")
346
347class _IDs(object):
348  """
349  A class that maintains the 'name domain' of all the 'marks' (short int
350  id for a blob/commit git object). The reason this mechanism is necessary
351  is because the text of fast-export may refer to an object using a different
352  mark than the mark that was assigned to that object using IDS.new(). This
353  class allows you to translate the fast-export marks (old) to the marks
354  assigned from IDS.new() (new).
355
356  Note that there are two reasons why the marks may differ: (1) The
357  user manually creates Blob or Commit objects (for insertion into the
358  stream) (2) We're reading the data from two different repositories
359  and trying to combine the data (git fast-export will number ids from
360  1...n, and having two 1's, two 2's, two 3's, causes issues).
361  """
362
363  def __init__(self):
364    """
365    Init
366    """
367    # The id for the next created blob/commit object
368    self._next_id = 1
369
370    # A map of old-ids to new-ids (1:1 map)
371    self._translation = {}
372
373    # A map of new-ids to every old-id that points to the new-id (1:N map)
374    self._reverse_translation = {}
375
376  def has_renames(self):
377    """
378    Return whether there have been ids remapped to new values
379    """
380    return bool(self._translation)
381
382  def new(self):
383    """
384    Should be called whenever a new blob or commit object is created. The
385    returned value should be used as the id/mark for that object.
386    """
387    rv = self._next_id
388    self._next_id += 1
389    return rv
390
391  def record_rename(self, old_id, new_id, handle_transitivity = False):
392    """
393    Record that old_id is being renamed to new_id.
394    """
395    if old_id != new_id:
396      # old_id -> new_id
397      self._translation[old_id] = new_id
398
399      # Transitivity will be needed if new commits are being inserted mid-way
400      # through a branch.
401      if handle_transitivity:
402        # Anything that points to old_id should point to new_id
403        if old_id in self._reverse_translation:
404          for id_ in self._reverse_translation[old_id]:
405            self._translation[id_] = new_id
406
407      # Record that new_id is pointed to by old_id
408      if new_id not in self._reverse_translation:
409        self._reverse_translation[new_id] = []
410      self._reverse_translation[new_id].append(old_id)
411
412  def translate(self, old_id):
413    """
414    If old_id has been mapped to an alternate id, return the alternate id.
415    """
416    if old_id in self._translation:
417      return self._translation[old_id]
418    else:
419      return old_id
420
421  def __str__(self):
422    """
423    Convert IDs to string; used for debugging
424    """
425    rv = "Current count: %d\nTranslation:\n" % self._next_id
426    for k in sorted(self._translation):
427      rv += "  %d -> %s\n" % (k, self._translation[k])
428
429    rv += "Reverse translation:\n"
430    for k in sorted(self._reverse_translation):
431      rv += "  " + str(k) + " -> " + str(self._reverse_translation[k]) + "\n"
432
433    return rv
434
435class _GitElement(object):
436  """
437  The base class for all git elements that we create.
438  """
439
440  def __init__(self):
441    # A string that describes what type of Git element this is
442    self.type = None
443
444    # A flag telling us if this Git element has been dumped
445    # (i.e. printed) or skipped.  Typically elements that have been
446    # dumped or skipped will not be dumped again.
447    self.dumped = 0
448
449  def dump(self, file_):
450    """
451    This version should never be called. Derived classes need to
452    override! We should note that subclasses should implement this
453    method such that the output would match the format produced by
454    fast-export.
455    """
456    raise SystemExit(_("Unimplemented function: %s") % type(self).__name__
457                     +".dump()") # pragma: no cover
458
459  def __bytes__(self):
460    """
461    Convert GitElement to bytestring; used for debugging
462    """
463    old_dumped = self.dumped
464    writeme = io.BytesIO()
465    self.dump(writeme)
466    output_lines = writeme.getvalue().splitlines()
467    writeme.close()
468    self.dumped = old_dumped
469    return b"%s:\n  %s" % (type(self).__name__.encode(),
470                           b"\n  ".join(output_lines))
471
472  def skip(self, new_id=None):
473    """
474    Ensures this element will not be written to output
475    """
476    self.dumped = 2
477
478class _GitElementWithId(_GitElement):
479  """
480  The base class for Git elements that have IDs (commits and blobs)
481  """
482
483  def __init__(self):
484    _GitElement.__init__(self)
485
486    # The mark (short, portable id) for this element
487    self.id = _IDS.new()
488
489    # The previous mark for this element
490    self.old_id = None
491
492  def skip(self, new_id=None):
493    """
494    This element will no longer be automatically written to output. When a
495    commit gets skipped, it's ID will need to be translated to that of its
496    parent.
497    """
498    self.dumped = 2
499
500    _IDS.record_rename(self.old_id or self.id, new_id)
501
502class Blob(_GitElementWithId):
503  """
504  This class defines our representation of git blob elements (i.e. our
505  way of representing file contents).
506  """
507
508  def __init__(self, data, original_id = None):
509    _GitElementWithId.__init__(self)
510
511    # Denote that this is a blob
512    self.type = 'blob'
513
514    # Record original id
515    self.original_id = original_id
516
517    # Stores the blob's data
518    assert(type(data) == bytes)
519    self.data = data
520
521  def dump(self, file_):
522    """
523    Write this blob element to a file.
524    """
525    self.dumped = 1
526    HASH_TO_ID[self.original_id] = self.id
527    ID_TO_HASH[self.id] = self.original_id
528
529    file_.write(b'blob\n')
530    file_.write(b'mark :%d\n' % self.id)
531    file_.write(b'data %d\n%s' % (len(self.data), self.data))
532    file_.write(b'\n')
533
534
535class Reset(_GitElement):
536  """
537  This class defines our representation of git reset elements.  A reset
538  event is the creation (or recreation) of a named branch, optionally
539  starting from a specific revision).
540  """
541
542  def __init__(self, ref, from_ref = None):
543    _GitElement.__init__(self)
544
545    # Denote that this is a reset
546    self.type = 'reset'
547
548    # The name of the branch being (re)created
549    self.ref = ref
550
551    # Some reference to the branch/commit we are resetting from
552    self.from_ref = from_ref
553
554  def dump(self, file_):
555    """
556    Write this reset element to a file
557    """
558    self.dumped = 1
559
560    file_.write(b'reset %s\n' % self.ref)
561    if self.from_ref:
562      if isinstance(self.from_ref, int):
563        file_.write(b'from :%d\n' % self.from_ref)
564      else:
565        file_.write(b'from %s\n' % self.from_ref)
566      file_.write(b'\n')
567
568class FileChange(_GitElement):
569  """
570  This class defines our representation of file change elements. File change
571  elements are components within a Commit element.
572  """
573
574  def __init__(self, type_, filename = None, id_ = None, mode = None):
575    _GitElement.__init__(self)
576
577    # Denote the type of file-change (b'M' for modify, b'D' for delete, etc)
578    # We could
579    #   assert(type(type_) == bytes)
580    # here but I don't just due to worries about performance overhead...
581    self.type = type_
582
583    # Record the name of the file being changed
584    self.filename = filename
585
586    # Record the mode (mode describes type of file entry (non-executable,
587    # executable, or symlink)).
588    self.mode = mode
589
590    # blob_id is the id (mark) of the affected blob
591    self.blob_id = id_
592
593    if type_ == b'DELETEALL':
594      assert filename is None and id_ is None and mode is None
595      self.filename = b'' # Just so PathQuoting.enquote doesn't die
596    else:
597      assert filename is not None
598
599    if type_ == b'M':
600      assert id_ is not None and mode is not None
601    elif type_ == b'D':
602      assert id_ is None and mode is None
603    elif type_ == b'R':  # pragma: no cover (now avoid fast-export renames)
604      assert mode is None
605      if id_ is None:
606        raise SystemExit(_("new name needed for rename of %s") % filename)
607      self.filename = (self.filename, id_)
608      self.blob_id = None
609
610  def dump(self, file_):
611    """
612    Write this file-change element to a file
613    """
614    skipped_blob = (self.type == b'M' and self.blob_id is None)
615    if skipped_blob: return
616    self.dumped = 1
617
618    quoted_filename = PathQuoting.enquote(self.filename)
619    if self.type == b'M' and isinstance(self.blob_id, int):
620      file_.write(b'M %s :%d %s\n' % (self.mode, self.blob_id, quoted_filename))
621    elif self.type == b'M':
622      file_.write(b'M %s %s %s\n' % (self.mode, self.blob_id, quoted_filename))
623    elif self.type == b'D':
624      file_.write(b'D %s\n' % quoted_filename)
625    elif self.type == b'DELETEALL':
626      file_.write(b'deleteall\n')
627    else:
628      raise SystemExit(_("Unhandled filechange type: %s") % self.type) # pragma: no cover
629
630class Commit(_GitElementWithId):
631  """
632  This class defines our representation of commit elements. Commit elements
633  contain all the information associated with a commit.
634  """
635
636  def __init__(self, branch,
637               author_name,    author_email,    author_date,
638               committer_name, committer_email, committer_date,
639               message,
640               file_changes,
641               parents,
642               original_id = None,
643               encoding = None, # encoding for message; None implies UTF-8
644               **kwargs):
645    _GitElementWithId.__init__(self)
646    self.old_id = self.id
647
648    # Denote that this is a commit element
649    self.type = 'commit'
650
651    # Record the affected branch
652    self.branch = branch
653
654    # Record original id
655    self.original_id = original_id
656
657    # Record author's name
658    self.author_name  = author_name
659
660    # Record author's email
661    self.author_email = author_email
662
663    # Record date of authoring
664    self.author_date  = author_date
665
666    # Record committer's name
667    self.committer_name  = committer_name
668
669    # Record committer's email
670    self.committer_email = committer_email
671
672    # Record date the commit was made
673    self.committer_date  = committer_date
674
675    # Record commit message and its encoding
676    self.encoding = encoding
677    self.message = message
678
679    # List of file-changes associated with this commit. Note that file-changes
680    # are also represented as git elements
681    self.file_changes = file_changes
682
683    self.parents = parents
684
685  def dump(self, file_):
686    """
687    Write this commit element to a file.
688    """
689    self.dumped = 1
690    HASH_TO_ID[self.original_id] = self.id
691    ID_TO_HASH[self.id] = self.original_id
692
693    # Make output to fast-import slightly easier for humans to read if the
694    # message has no trailing newline of its own; cosmetic, but a nice touch...
695    extra_newline = b'\n'
696    if self.message.endswith(b'\n') or not (self.parents or self.file_changes):
697      extra_newline = b''
698
699    if not self.parents:
700      file_.write(b'reset %s\n' % self.branch)
701    file_.write((b'commit %s\n'
702                 b'mark :%d\n'
703                 b'author %s <%s> %s\n'
704                 b'committer %s <%s> %s\n'
705                ) % (
706                  self.branch, self.id,
707                  self.author_name, self.author_email, self.author_date,
708                  self.committer_name, self.committer_email, self.committer_date
709               ))
710    if self.encoding:
711      file_.write(b'encoding %s\n' % self.encoding)
712    file_.write(b'data %d\n%s%s' %
713                (len(self.message), self.message, extra_newline))
714    for i, parent in enumerate(self.parents):
715      file_.write(b'from ' if i==0 else b'merge ')
716      if isinstance(parent, int):
717        file_.write(b':%d\n' % parent)
718      else:
719        file_.write(b'%s\n' % parent)
720    for change in self.file_changes:
721      change.dump(file_)
722    if not self.parents and not self.file_changes:
723      # Workaround a bug in pre-git-2.22 versions of fast-import with
724      # the get-mark directive.
725      file_.write(b'\n')
726    file_.write(b'\n')
727
728  def first_parent(self):
729    """
730    Return first parent commit
731    """
732    if self.parents:
733      return self.parents[0]
734    return None
735
736  def skip(self, new_id=None):
737    _SKIPPED_COMMITS.add(self.old_id or self.id)
738    _GitElementWithId.skip(self, new_id)
739
740class Tag(_GitElementWithId):
741  """
742  This class defines our representation of annotated tag elements.
743  """
744
745  def __init__(self, ref, from_ref,
746               tagger_name, tagger_email, tagger_date, tag_msg,
747               original_id = None):
748    _GitElementWithId.__init__(self)
749    self.old_id = self.id
750
751    # Denote that this is a tag element
752    self.type = 'tag'
753
754    # Store the name of the tag
755    self.ref = ref
756
757    # Store the entity being tagged (this should be a commit)
758    self.from_ref = from_ref
759
760    # Record original id
761    self.original_id = original_id
762
763    # Store the name of the tagger
764    self.tagger_name  = tagger_name
765
766    # Store the email of the tagger
767    self.tagger_email = tagger_email
768
769    # Store the date
770    self.tagger_date  = tagger_date
771
772    # Store the tag message
773    self.message = tag_msg
774
775  def dump(self, file_):
776    """
777    Write this tag element to a file
778    """
779
780    self.dumped = 1
781    HASH_TO_ID[self.original_id] = self.id
782    ID_TO_HASH[self.id] = self.original_id
783
784    file_.write(b'tag %s\n' % self.ref)
785    if (write_marks and self.id):
786      file_.write(b'mark :%d\n' % self.id)
787    markfmt = b'from :%d\n' if isinstance(self.from_ref, int) else b'from %s\n'
788    file_.write(markfmt % self.from_ref)
789    if self.tagger_name:
790      file_.write(b'tagger %s <%s> ' % (self.tagger_name, self.tagger_email))
791      file_.write(self.tagger_date)
792      file_.write(b'\n')
793    file_.write(b'data %d\n%s' % (len(self.message), self.message))
794    file_.write(b'\n')
795
796class Progress(_GitElement):
797  """
798  This class defines our representation of progress elements. The progress
799  element only contains a progress message, which is printed by fast-import
800  when it processes the progress output.
801  """
802
803  def __init__(self, message):
804    _GitElement.__init__(self)
805
806    # Denote that this is a progress element
807    self.type = 'progress'
808
809    # Store the progress message
810    self.message = message
811
812  def dump(self, file_):
813    """
814    Write this progress element to a file
815    """
816    self.dumped = 1
817
818    file_.write(b'progress %s\n' % self.message)
819    file_.write(b'\n')
820
821class Checkpoint(_GitElement):
822  """
823  This class defines our representation of checkpoint elements.  These
824  elements represent events which force fast-import to close the current
825  packfile, start a new one, and to save out all current branch refs, tags
826  and marks.
827  """
828
829  def __init__(self):
830    _GitElement.__init__(self)
831
832    # Denote that this is a checkpoint element
833    self.type = 'checkpoint'
834
835  def dump(self, file_):
836    """
837    Write this checkpoint element to a file
838    """
839    self.dumped = 1
840
841    file_.write(b'checkpoint\n')
842    file_.write(b'\n')
843
844class LiteralCommand(_GitElement):
845  """
846  This class defines our representation of commands. The literal command
847  includes only a single line, and is not processed in any special way.
848  """
849
850  def __init__(self, line):
851    _GitElement.__init__(self)
852
853    # Denote that this is a literal element
854    self.type = 'literal'
855
856    # Store the command
857    self.line = line
858
859  def dump(self, file_):
860    """
861    Write this progress element to a file
862    """
863    self.dumped = 1
864
865    file_.write(self.line)
866
867class Alias(_GitElement):
868  """
869  This class defines our representation of fast-import alias elements.  An
870  alias element is the setting of one mark to the same sha1sum as another,
871  usually because the newer mark corresponded to a pruned commit.
872  """
873
874  def __init__(self, ref, to_ref):
875    _GitElement.__init__(self)
876    # Denote that this is a reset
877    self.type = 'alias'
878
879    self.ref = ref
880    self.to_ref = to_ref
881
882  def dump(self, file_):
883    """
884    Write this reset element to a file
885    """
886    self.dumped = 1
887
888    file_.write(b'alias\nmark :%d\nto :%d\n\n' % (self.ref, self.to_ref))
889
890class FastExportParser(object):
891  """
892  A class for parsing and handling the output from fast-export. This
893  class allows the user to register callbacks when various types of
894  data are encountered in the fast-export output. The basic idea is that,
895  FastExportParser takes fast-export output, creates the various objects
896  as it encounters them, the user gets to use/modify these objects via
897  callbacks, and finally FastExportParser outputs the modified objects
898  in fast-import format (presumably so they can be used to create a new
899  repo).
900  """
901
902  def __init__(self,
903               tag_callback = None,   commit_callback = None,
904               blob_callback = None,  progress_callback = None,
905               reset_callback = None, checkpoint_callback = None,
906               done_callback = None):
907    # Members below simply store callback functions for the various git
908    # elements
909    self._tag_callback        = tag_callback
910    self._blob_callback       = blob_callback
911    self._reset_callback      = reset_callback
912    self._commit_callback     = commit_callback
913    self._progress_callback   = progress_callback
914    self._checkpoint_callback = checkpoint_callback
915    self._done_callback       = done_callback
916
917    # Keep track of which refs appear from the export, and which make it to
918    # the import (pruning of empty commits, renaming of refs, and creating
919    # new manual objects and inserting them can cause these to differ).
920    self._exported_refs = set()
921    self._imported_refs = set()
922
923    # A list of the branches we've seen, plus the last known commit they
924    # pointed to.  An entry in latest_*commit will be deleted if we get a
925    # reset for that branch.  These are used because of fast-import's weird
926    # decision to allow having an implicit parent via naming the branch
927    # instead of requiring branches to be specified via 'from' directives.
928    self._latest_commit = {}
929    self._latest_orig_commit = {}
930
931    # A handle to the input source for the fast-export data
932    self._input = None
933
934    # A handle to the output file for the output we generate (we call dump
935    # on many of the git elements we create).
936    self._output = None
937
938    # Stores the contents of the current line of input being parsed
939    self._currentline = ''
940
941    # Compile some regexes and cache those
942    self._mark_re = re.compile(br'mark :(\d+)\n$')
943    self._parent_regexes = {}
944    parent_regex_rules = (b' :(\d+)\n$', b' ([0-9a-f]{40})\n')
945    for parent_refname in (b'from', b'merge'):
946      ans = [re.compile(parent_refname+x) for x in parent_regex_rules]
947      self._parent_regexes[parent_refname] = ans
948    self._quoted_string_re = re.compile(br'"(?:[^"\\]|\\.)*"')
949    self._refline_regexes = {}
950    for refline_name in (b'reset', b'commit', b'tag', b'progress'):
951      self._refline_regexes[refline_name] = re.compile(refline_name+b' (.*)\n$')
952    self._user_regexes = {}
953    for user in (b'author', b'committer', b'tagger'):
954      self._user_regexes[user] = re.compile(user + b' (.*?) <(.*?)> (.*)\n$')
955
956  def _advance_currentline(self):
957    """
958    Grab the next line of input
959    """
960    self._currentline = self._input.readline()
961
962  def _parse_optional_mark(self):
963    """
964    If the current line contains a mark, parse it and advance to the
965    next line; return None otherwise
966    """
967    mark = None
968    matches = self._mark_re.match(self._currentline)
969    if matches:
970      mark = int(matches.group(1))
971      self._advance_currentline()
972    return mark
973
974  def _parse_optional_parent_ref(self, refname):
975    """
976    If the current line contains a reference to a parent commit, then
977    parse it and advance the current line; otherwise return None. Note
978    that the name of the reference ('from', 'merge') must match the
979    refname arg.
980    """
981    orig_baseref, baseref = None, None
982    rule, altrule = self._parent_regexes[refname]
983    matches = rule.match(self._currentline)
984    if matches:
985      orig_baseref = int(matches.group(1))
986      # We translate the parent commit mark to what it needs to be in
987      # our mark namespace
988      baseref = _IDS.translate(orig_baseref)
989      self._advance_currentline()
990    else:
991      matches = altrule.match(self._currentline)
992      if matches:
993        orig_baseref = matches.group(1)
994        baseref = orig_baseref
995        self._advance_currentline()
996    return orig_baseref, baseref
997
998  def _parse_optional_filechange(self):
999    """
1000    If the current line contains a file-change object, then parse it
1001    and advance the current line; otherwise return None. We only care
1002    about file changes of type b'M' and b'D' (these are the only types
1003    of file-changes that fast-export will provide).
1004    """
1005    filechange = None
1006    changetype = self._currentline[0:1]
1007    if changetype == b'M':
1008      (changetype, mode, idnum, path) = self._currentline.split(None, 3)
1009      if idnum[0:1] == b':':
1010        idnum = idnum[1:]
1011      path = path.rstrip(b'\n')
1012      # We translate the idnum to our id system
1013      if len(idnum) != 40:
1014        idnum = _IDS.translate( int(idnum) )
1015      if idnum is not None:
1016        if path.startswith(b'"'):
1017          path = PathQuoting.dequote(path)
1018        filechange = FileChange(b'M', path, idnum, mode)
1019      else:
1020        filechange = b'skipped'
1021      self._advance_currentline()
1022    elif changetype == b'D':
1023      (changetype, path) = self._currentline.split(None, 1)
1024      path = path.rstrip(b'\n')
1025      if path.startswith(b'"'):
1026        path = PathQuoting.dequote(path)
1027      filechange = FileChange(b'D', path)
1028      self._advance_currentline()
1029    elif changetype == b'R':  # pragma: no cover (now avoid fast-export renames)
1030      rest = self._currentline[2:-1]
1031      if rest.startswith(b'"'):
1032        m = self._quoted_string_re.match(rest)
1033        if not m:
1034          raise SystemExit(_("Couldn't parse rename source"))
1035        orig = PathQuoting.dequote(m.group(0))
1036        new = rest[m.end()+1:]
1037      else:
1038        orig, new = rest.split(b' ', 1)
1039      if new.startswith(b'"'):
1040        new = PathQuoting.dequote(new)
1041      filechange = FileChange(b'R', orig, new)
1042      self._advance_currentline()
1043    return filechange
1044
1045  def _parse_original_id(self):
1046    original_id = self._currentline[len(b'original-oid '):].rstrip()
1047    self._advance_currentline()
1048    return original_id
1049
1050  def _parse_encoding(self):
1051    encoding = self._currentline[len(b'encoding '):].rstrip()
1052    self._advance_currentline()
1053    return encoding
1054
1055  def _parse_ref_line(self, refname):
1056    """
1057    Parses string data (often a branch name) from current-line. The name of
1058    the string data must match the refname arg. The program will crash if
1059    current-line does not match, so current-line will always be advanced if
1060    this method returns.
1061    """
1062    matches = self._refline_regexes[refname].match(self._currentline)
1063    if not matches:
1064      raise SystemExit(_("Malformed %(refname)s line: '%(line)s'") %
1065                       ({'refname': refname, 'line':self._currentline})
1066                       ) # pragma: no cover
1067    ref = matches.group(1)
1068    self._advance_currentline()
1069    return ref
1070
1071  def _parse_user(self, usertype):
1072    """
1073    Get user name, email, datestamp from current-line. Current-line will
1074    be advanced.
1075    """
1076    user_regex = self._user_regexes[usertype]
1077    (name, email, when) = user_regex.match(self._currentline).groups()
1078
1079    self._advance_currentline()
1080    return (name, email, when)
1081
1082  def _parse_data(self):
1083    """
1084    Reads data from _input. Current-line will be advanced until it is beyond
1085    the data.
1086    """
1087    fields = self._currentline.split()
1088    assert fields[0] == b'data'
1089    size = int(fields[1])
1090    data = self._input.read(size)
1091    self._advance_currentline()
1092    if self._currentline == b'\n':
1093      self._advance_currentline()
1094    return data
1095
1096  def _parse_blob(self):
1097    """
1098    Parse input data into a Blob object. Once the Blob has been created, it
1099    will be handed off to the appropriate callbacks. Current-line will be
1100    advanced until it is beyond this blob's data. The Blob will be dumped
1101    to _output once everything else is done (unless it has been skipped by
1102    the callback).
1103    """
1104    # Parse the Blob
1105    self._advance_currentline()
1106    id_ = self._parse_optional_mark()
1107
1108    original_id = None
1109    if self._currentline.startswith(b'original-oid'):
1110      original_id = self._parse_original_id();
1111
1112    data = self._parse_data()
1113    if self._currentline == b'\n':
1114      self._advance_currentline()
1115
1116    # Create the blob
1117    blob = Blob(data, original_id)
1118
1119    # If fast-export text had a mark for this blob, need to make sure this
1120    # mark translates to the blob's true id.
1121    if id_:
1122      blob.old_id = id_
1123      _IDS.record_rename(id_, blob.id)
1124
1125    # Call any user callback to allow them to use/modify the blob
1126    if self._blob_callback:
1127      self._blob_callback(blob)
1128
1129    # Now print the resulting blob
1130    if not blob.dumped:
1131      blob.dump(self._output)
1132
1133  def _parse_reset(self):
1134    """
1135    Parse input data into a Reset object. Once the Reset has been created,
1136    it will be handed off to the appropriate callbacks. Current-line will
1137    be advanced until it is beyond the reset data. The Reset will be dumped
1138    to _output once everything else is done (unless it has been skipped by
1139    the callback).
1140    """
1141    # Parse the Reset
1142    ref = self._parse_ref_line(b'reset')
1143    self._exported_refs.add(ref)
1144    ignoreme, from_ref = self._parse_optional_parent_ref(b'from')
1145    if self._currentline == b'\n':
1146      self._advance_currentline()
1147
1148    # fast-export likes to print extraneous resets that serve no purpose.
1149    # While we could continue processing such resets, that is a waste of
1150    # resources.  Also, we want to avoid recording that this ref was
1151    # seen in such cases, since this ref could be rewritten to nothing.
1152    if not from_ref:
1153      self._latest_commit.pop(ref, None)
1154      self._latest_orig_commit.pop(ref, None)
1155      return
1156
1157    # Create the reset
1158    reset = Reset(ref, from_ref)
1159
1160    # Call any user callback to allow them to modify the reset
1161    if self._reset_callback:
1162      self._reset_callback(reset)
1163
1164    # Update metadata
1165    self._latest_commit[reset.ref] = reset.from_ref
1166    self._latest_orig_commit[reset.ref] = reset.from_ref
1167
1168    # Now print the resulting reset
1169    if not reset.dumped:
1170      self._imported_refs.add(reset.ref)
1171      reset.dump(self._output)
1172
1173  def _parse_commit(self):
1174    """
1175    Parse input data into a Commit object. Once the Commit has been created,
1176    it will be handed off to the appropriate callbacks. Current-line will
1177    be advanced until it is beyond the commit data. The Commit will be dumped
1178    to _output once everything else is done (unless it has been skipped by
1179    the callback OR the callback has removed all file-changes from the commit).
1180    """
1181    # Parse the Commit. This may look involved, but it's pretty simple; it only
1182    # looks bad because a commit object contains many pieces of data.
1183    branch = self._parse_ref_line(b'commit')
1184    self._exported_refs.add(branch)
1185    id_ = self._parse_optional_mark()
1186
1187    original_id = None
1188    if self._currentline.startswith(b'original-oid'):
1189      original_id = self._parse_original_id();
1190
1191    author_name = None
1192    author_email = None
1193    if self._currentline.startswith(b'author'):
1194      (author_name, author_email, author_date) = self._parse_user(b'author')
1195
1196    (committer_name, committer_email, committer_date) = \
1197      self._parse_user(b'committer')
1198
1199    if not author_name and not author_email:
1200      (author_name, author_email, author_date) = \
1201        (committer_name, committer_email, committer_date)
1202
1203    encoding = None
1204    if self._currentline.startswith(b'encoding '):
1205      encoding = self._parse_encoding()
1206
1207    commit_msg = self._parse_data()
1208
1209    pinfo = [self._parse_optional_parent_ref(b'from')]
1210    # Due to empty pruning, we can have real 'from' and 'merge' lines that
1211    # due to commit rewriting map to a parent of None.  We need to record
1212    # 'from' if its non-None, and we need to parse all 'merge' lines.
1213    while self._currentline.startswith(b'merge '):
1214      pinfo.append(self._parse_optional_parent_ref(b'merge'))
1215    orig_parents, parents = [list(tmp) for tmp in zip(*pinfo)]
1216
1217    # No parents is oddly represented as [None] instead of [], due to the
1218    # special 'from' handling.  Convert it here to a more canonical form.
1219    if parents == [None]:
1220      parents = []
1221    if orig_parents == [None]:
1222      orig_parents = []
1223
1224    # fast-import format is kinda stupid in that it allows implicit parents
1225    # based on the branch name instead of requiring them to be specified by
1226    # 'from' directives.  The only way to get no parent is by using a reset
1227    # directive first, which clears the latest_commit_for_this_branch tracking.
1228    if not orig_parents and self._latest_commit.get(branch):
1229      parents = [self._latest_commit[branch]]
1230    if not orig_parents and self._latest_orig_commit.get(branch):
1231      orig_parents = [self._latest_orig_commit[branch]]
1232
1233    # Get the list of file changes
1234    file_changes = []
1235    file_change = self._parse_optional_filechange()
1236    had_file_changes = file_change is not None
1237    while file_change:
1238      if not (type(file_change) == bytes and file_change == b'skipped'):
1239        file_changes.append(file_change)
1240      file_change = self._parse_optional_filechange()
1241    if self._currentline == b'\n':
1242      self._advance_currentline()
1243
1244    # Okay, now we can finally create the Commit object
1245    commit = Commit(branch,
1246                    author_name,    author_email,    author_date,
1247                    committer_name, committer_email, committer_date,
1248                    commit_msg, file_changes, parents, original_id, encoding)
1249
1250    # If fast-export text had a mark for this commit, need to make sure this
1251    # mark translates to the commit's true id.
1252    if id_:
1253      commit.old_id = id_
1254      _IDS.record_rename(id_, commit.id)
1255
1256    # Call any user callback to allow them to modify the commit
1257    aux_info = {'orig_parents': orig_parents,
1258                'had_file_changes': had_file_changes}
1259    if self._commit_callback:
1260      self._commit_callback(commit, aux_info)
1261
1262    # Now print the resulting commit, or if prunable skip it
1263    self._latest_orig_commit[branch] = commit.id
1264    if not (commit.old_id or commit.id) in _SKIPPED_COMMITS:
1265      self._latest_commit[branch] = commit.id
1266    if not commit.dumped:
1267      self._imported_refs.add(commit.branch)
1268      commit.dump(self._output)
1269
1270  def _parse_tag(self):
1271    """
1272    Parse input data into a Tag object. Once the Tag has been created,
1273    it will be handed off to the appropriate callbacks. Current-line will
1274    be advanced until it is beyond the tag data. The Tag will be dumped
1275    to _output once everything else is done (unless it has been skipped by
1276    the callback).
1277    """
1278    # Parse the Tag
1279    tag = self._parse_ref_line(b'tag')
1280    self._exported_refs.add(b'refs/tags/'+tag)
1281    id_ = self._parse_optional_mark()
1282    ignoreme, from_ref = self._parse_optional_parent_ref(b'from')
1283
1284    original_id = None
1285    if self._currentline.startswith(b'original-oid'):
1286      original_id = self._parse_original_id();
1287
1288    tagger_name, tagger_email, tagger_date = None, None, None
1289    if self._currentline.startswith(b'tagger'):
1290      (tagger_name, tagger_email, tagger_date) = self._parse_user(b'tagger')
1291    tag_msg = self._parse_data()
1292    if self._currentline == b'\n':
1293      self._advance_currentline()
1294
1295    # Create the tag
1296    tag = Tag(tag, from_ref,
1297              tagger_name, tagger_email, tagger_date, tag_msg,
1298              original_id)
1299
1300    # If fast-export text had a mark for this tag, need to make sure this
1301    # mark translates to the tag's true id.
1302    if id_:
1303      tag.old_id = id_
1304      _IDS.record_rename(id_, tag.id)
1305
1306    # Call any user callback to allow them to modify the tag
1307    if self._tag_callback:
1308      self._tag_callback(tag)
1309
1310    # The tag might not point at anything that still exists (self.from_ref
1311    # will be None if the commit it pointed to and all its ancestors were
1312    # pruned due to being empty)
1313    if tag.from_ref:
1314      # Print out this tag's information
1315      if not tag.dumped:
1316        self._imported_refs.add(b'refs/tags/'+tag.ref)
1317        tag.dump(self._output)
1318    else:
1319      tag.skip()
1320
1321  def _parse_progress(self):
1322    """
1323    Parse input data into a Progress object. Once the Progress has
1324    been created, it will be handed off to the appropriate
1325    callbacks. Current-line will be advanced until it is beyond the
1326    progress data. The Progress will be dumped to _output once
1327    everything else is done (unless it has been skipped by the callback).
1328    """
1329    # Parse the Progress
1330    message = self._parse_ref_line(b'progress')
1331    if self._currentline == b'\n':
1332      self._advance_currentline()
1333
1334    # Create the progress message
1335    progress = Progress(message)
1336
1337    # Call any user callback to allow them to modify the progress messsage
1338    if self._progress_callback:
1339      self._progress_callback(progress)
1340
1341    # NOTE: By default, we do NOT print the progress message; git
1342    # fast-import would write it to fast_import_pipes which could mess with
1343    # our parsing of output from the 'ls' and 'get-mark' directives we send
1344    # to fast-import.  If users want these messages, they need to process
1345    # and handle them in the appropriate callback above.
1346
1347  def _parse_checkpoint(self):
1348    """
1349    Parse input data into a Checkpoint object. Once the Checkpoint has
1350    been created, it will be handed off to the appropriate
1351    callbacks. Current-line will be advanced until it is beyond the
1352    checkpoint data. The Checkpoint will be dumped to _output once
1353    everything else is done (unless it has been skipped by the callback).
1354    """
1355    # Parse the Checkpoint
1356    self._advance_currentline()
1357    if self._currentline == b'\n':
1358      self._advance_currentline()
1359
1360    # Create the checkpoint
1361    checkpoint = Checkpoint()
1362
1363    # Call any user callback to allow them to drop the checkpoint
1364    if self._checkpoint_callback:
1365      self._checkpoint_callback(checkpoint)
1366
1367    # NOTE: By default, we do NOT print the checkpoint message; although it
1368    # we would only realistically get them with --stdin, the fact that we
1369    # are filtering makes me think the checkpointing is less likely to be
1370    # reasonable.  In fact, I don't think it's necessary in general.  If
1371    # users do want it, they should process it in the checkpoint_callback.
1372
1373  def _parse_literal_command(self):
1374    """
1375    Parse literal command.  Then just dump the line as is.
1376    """
1377    # Create the literal command object
1378    command = LiteralCommand(self._currentline)
1379    self._advance_currentline()
1380
1381    # Now print the resulting literal command
1382    if not command.dumped:
1383      command.dump(self._output)
1384
1385  def insert(self, obj):
1386    assert not obj.dumped
1387    obj.dump(self._output)
1388    if type(obj) == Commit:
1389      self._imported_refs.add(obj.branch)
1390    elif type(obj) in (Reset, Tag):
1391      self._imported_refs.add(obj.ref)
1392
1393  def run(self, input, output):
1394    """
1395    This method filters fast export output.
1396    """
1397    # Set input. If no args provided, use stdin.
1398    self._input = input
1399    self._output = output
1400
1401    # Run over the input and do the filtering
1402    self._advance_currentline()
1403    while self._currentline:
1404      if   self._currentline.startswith(b'blob'):
1405        self._parse_blob()
1406      elif self._currentline.startswith(b'reset'):
1407        self._parse_reset()
1408      elif self._currentline.startswith(b'commit'):
1409        self._parse_commit()
1410      elif self._currentline.startswith(b'tag'):
1411        self._parse_tag()
1412      elif self._currentline.startswith(b'progress'):
1413        self._parse_progress()
1414      elif self._currentline.startswith(b'checkpoint'):
1415        self._parse_checkpoint()
1416      elif self._currentline.startswith(b'feature'):
1417        self._parse_literal_command()
1418      elif self._currentline.startswith(b'option'):
1419        self._parse_literal_command()
1420      elif self._currentline.startswith(b'done'):
1421        if self._done_callback:
1422          self._done_callback()
1423        self._parse_literal_command()
1424        # Prevent confusion from others writing additional stuff that'll just
1425        # be ignored
1426        self._output.close()
1427      elif self._currentline.startswith(b'#'):
1428        self._parse_literal_command()
1429      elif self._currentline.startswith(b'get-mark') or \
1430           self._currentline.startswith(b'cat-blob') or \
1431           self._currentline.startswith(b'ls'):
1432        raise SystemExit(_("Unsupported command: '%s'") % self._currentline)
1433      else:
1434        raise SystemExit(_("Could not parse line: '%s'") % self._currentline)
1435
1436  def get_exported_and_imported_refs(self):
1437    return self._exported_refs, self._imported_refs
1438
1439def record_id_rename(old_id, new_id):
1440  """
1441  Register a new translation
1442  """
1443  handle_transitivity = True
1444  _IDS.record_rename(old_id, new_id, handle_transitivity)
1445
1446# Internal globals
1447_IDS = _IDs()
1448_SKIPPED_COMMITS = set()
1449HASH_TO_ID = {}
1450ID_TO_HASH = {}
1451
1452class SubprocessWrapper(object):
1453  @staticmethod
1454  def decodify(args):
1455    if type(args) == str:
1456      return args
1457    else:
1458      assert type(args) == list
1459      return [decode(x) if type(x)==bytes else x for x in args]
1460
1461  @staticmethod
1462  def call(*args, **kwargs):
1463    if 'cwd' in kwargs:
1464      kwargs['cwd'] = decode(kwargs['cwd'])
1465    return subprocess.call(SubprocessWrapper.decodify(*args), **kwargs)
1466
1467  @staticmethod
1468  def check_output(*args, **kwargs):
1469    if 'cwd' in kwargs:
1470      kwargs['cwd'] = decode(kwargs['cwd'])
1471    return subprocess.check_output(SubprocessWrapper.decodify(*args), **kwargs)
1472
1473  @staticmethod
1474  def check_call(*args, **kwargs): # pragma: no cover  # used by filter-lamely
1475    if 'cwd' in kwargs:
1476      kwargs['cwd'] = decode(kwargs['cwd'])
1477    return subprocess.check_call(SubprocessWrapper.decodify(*args), **kwargs)
1478
1479  @staticmethod
1480  def Popen(*args, **kwargs):
1481    if 'cwd' in kwargs:
1482      kwargs['cwd'] = decode(kwargs['cwd'])
1483    return subprocess.Popen(SubprocessWrapper.decodify(*args), **kwargs)
1484
1485subproc = subprocess
1486if platform.system() == 'Windows' or 'PRETEND_UNICODE_ARGS' in os.environ:
1487  subproc = SubprocessWrapper
1488
1489class GitUtils(object):
1490  @staticmethod
1491  def get_commit_count(repo, *args):
1492    """
1493    Return the number of commits that have been made on repo.
1494    """
1495    if not args:
1496      args = ['--all']
1497    if len(args) == 1 and isinstance(args[0], list):
1498      args = args[0]
1499    p = subproc.Popen(["git", "rev-list", "--count"] + args,
1500                      stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1501                      cwd=repo)
1502    if p.wait() != 0:
1503      raise SystemExit(_("%s does not appear to be a valid git repository")
1504                       % decode(repo))
1505    return int(p.stdout.read())
1506
1507  @staticmethod
1508  def get_total_objects(repo):
1509    """
1510    Return the number of objects (both packed and unpacked)
1511    """
1512    p1 = subproc.Popen(["git", "count-objects", "-v"],
1513                          stdout=subprocess.PIPE, cwd=repo)
1514    lines = p1.stdout.read().splitlines()
1515    # Return unpacked objects + packed-objects
1516    return int(lines[0].split()[1]) + int(lines[2].split()[1])
1517
1518  @staticmethod
1519  def is_repository_bare(repo_working_dir):
1520    out = subproc.check_output('git rev-parse --is-bare-repository'.split(),
1521                               cwd=repo_working_dir)
1522    return (out.strip() == b'true')
1523
1524  @staticmethod
1525  def determine_git_dir(repo_working_dir):
1526    d = subproc.check_output('git rev-parse --git-dir'.split(),
1527                             cwd=repo_working_dir).strip()
1528    if repo_working_dir==b'.' or d.startswith(b'/'):
1529      return d
1530    return os.path.join(repo_working_dir, d)
1531
1532  @staticmethod
1533  def get_refs(repo_working_dir):
1534    try:
1535      output = subproc.check_output('git show-ref'.split(),
1536                                    cwd=repo_working_dir)
1537    except subprocess.CalledProcessError as e:
1538      # If error code is 1, there just aren't any refs; i.e. new repo.
1539      # If error code is other than 1, some other error (e.g. not a git repo)
1540      if e.returncode != 1:
1541        raise SystemExit('fatal: {}'.format(e))
1542      output = ''
1543    return dict(reversed(x.split()) for x in output.splitlines())
1544
1545  @staticmethod
1546  def get_blob_sizes(quiet = False):
1547    blob_size_progress = ProgressWriter()
1548    num_blobs = 0
1549    processed_blobs_msg = _("Processed %d blob sizes")
1550
1551    # Get sizes of blobs by sha1
1552    cmd = '--batch-check=%(objectname) %(objecttype) ' + \
1553          '%(objectsize) %(objectsize:disk)'
1554    cf = subproc.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
1555                       bufsize = -1,
1556                       stdout = subprocess.PIPE)
1557    unpacked_size = {}
1558    packed_size = {}
1559    for line in cf.stdout:
1560      sha, objtype, objsize, objdisksize = line.split()
1561      objsize, objdisksize = int(objsize), int(objdisksize)
1562      if objtype == b'blob':
1563        unpacked_size[sha] = objsize
1564        packed_size[sha] = objdisksize
1565        num_blobs += 1
1566      if not quiet:
1567        blob_size_progress.show(processed_blobs_msg % num_blobs)
1568    cf.wait()
1569    if not quiet:
1570      blob_size_progress.finish()
1571    return unpacked_size, packed_size
1572
1573  @staticmethod
1574  def get_file_changes(repo, parent_hash, commit_hash):
1575    """
1576    Return a FileChanges list with the differences between parent_hash
1577    and commit_hash
1578    """
1579    file_changes = []
1580
1581    cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash]
1582    output = subproc.check_output(cmd, cwd=repo)
1583    for line in output.splitlines():
1584      fileinfo, path = line.split(b'\t', 1)
1585      if path.startswith(b'"'):
1586        path = PathQuoting.dequote(path)
1587      oldmode, mode, oldhash, newhash, changetype = fileinfo.split()
1588      if changetype == b'D':
1589        file_changes.append(FileChange(b'D', path))
1590      elif changetype in (b'A', b'M', b'T'):
1591        identifier = HASH_TO_ID.get(newhash, newhash)
1592        file_changes.append(FileChange(b'M', path, identifier, mode))
1593      else: # pragma: no cover
1594        raise SystemExit("Unknown change type for line {}".format(line))
1595
1596    return file_changes
1597
1598  @staticmethod
1599  def print_my_version():
1600    with open(__file__, 'br') as f:
1601      contents = f.read()
1602    # If people replaced @@LOCALEDIR@@ string to point at their local
1603    # directory, undo it so we can get original source version.
1604    contents = re.sub(br'\A#\!.*',
1605                      br'#!/usr/bin/env python3', contents)
1606    contents = re.sub(br'(\("GIT_TEXTDOMAINDIR"\) or ").*"',
1607                      br'\1@@LOCALEDIR@@"', contents)
1608
1609    cmd = 'git hash-object --stdin'.split()
1610    version = subproc.check_output(cmd, input=contents).strip()
1611    print(decode(version[0:12]))
1612
1613class FilteringOptions(object):
1614  default_replace_text = b'***REMOVED***'
1615  class AppendFilter(argparse.Action):
1616    def __call__(self, parser, namespace, values, option_string=None):
1617      user_path = values
1618      suffix = option_string[len('--path-'):] or 'match'
1619      if suffix.startswith('rename'):
1620        mod_type = 'rename'
1621        match_type = option_string[len('--path-rename-'):] or 'match'
1622        values = values.split(b':')
1623        if len(values) != 2:
1624          raise SystemExit(_("Error: --path-rename expects one colon in its"
1625                             " argument: <old_name:new_name>."))
1626        if values[0] and values[1] and not (
1627           values[0].endswith(b'/') == values[1].endswith(b'/')):
1628          raise SystemExit(_("Error: With --path-rename, if OLD_NAME and "
1629                             "NEW_NAME are both non-empty and either ends "
1630                             "with a slash then both must."))
1631        if any(v.startswith(b'/') for v in values):
1632          raise SystemExit(_("Error: Pathnames cannot begin with a '/'"))
1633        components = values[0].split(b'/') + values[1].split(b'/')
1634      else:
1635        mod_type = 'filter'
1636        match_type = suffix
1637        components = values.split(b'/')
1638        if values.startswith(b'/'):
1639          raise SystemExit(_("Error: Pathnames cannot begin with a '/'"))
1640      for illegal_path in [b'.', b'..']:
1641        if illegal_path in components:
1642          raise SystemExit(_("Error: Invalid path component '%s' found in '%s'")
1643                           % (decode(illegal_path), decode(user_path)))
1644      if match_type == 'regex':
1645        values = re.compile(values)
1646      items = getattr(namespace, self.dest, []) or []
1647      items.append((mod_type, match_type, values))
1648      if (match_type, mod_type) == ('glob', 'filter'):
1649        if not values.endswith(b'*'):
1650          extension = b'*' if values.endswith(b'/') else b'/*'
1651          items.append((mod_type, match_type, values+extension))
1652      setattr(namespace, self.dest, items)
1653
1654  class HelperFilter(argparse.Action):
1655    def __call__(self, parser, namespace, values, option_string=None):
1656      af = FilteringOptions.AppendFilter(dest='path_changes',
1657                                         option_strings=None)
1658      dirname = values if values[-1:] == b'/' else values+b'/'
1659      if option_string == '--subdirectory-filter':
1660        af(parser, namespace, dirname,     '--path-match')
1661        af(parser, namespace, dirname+b':', '--path-rename')
1662      elif option_string == '--to-subdirectory-filter':
1663        af(parser, namespace, b':'+dirname, '--path-rename')
1664      else:
1665        raise SystemExit(_("Error: HelperFilter given invalid option_string: %s")
1666                         % option_string) # pragma: no cover
1667
1668  class FileWithPathsFilter(argparse.Action):
1669    def __call__(self, parser, namespace, values, option_string=None):
1670      if not namespace.path_changes:
1671        namespace.path_changes = []
1672      namespace.path_changes += FilteringOptions.get_paths_from_file(values)
1673
1674  @staticmethod
1675  def create_arg_parser():
1676    # Include usage in the summary, so we can put the description first
1677    summary = _('''Rewrite (or analyze) repository history
1678
1679    git-filter-repo destructively rewrites history (unless --analyze or
1680    --dry-run are given) according to specified rules.  It refuses to do any
1681    rewriting unless either run from a clean fresh clone, or --force was
1682    given.
1683
1684    Basic Usage:
1685      git-filter-repo --analyze
1686      git-filter-repo [FILTER/RENAME/CONTROL OPTIONS]
1687
1688    See EXAMPLES section for details.
1689    ''').rstrip()
1690
1691    # Provide a long helpful examples section
1692    example_text = _('''CALLBACKS
1693
1694    All callback functions are of the same general format.  For a command line
1695    argument like
1696      --foo-callback 'BODY'
1697
1698    the following code will be compiled and called:
1699      def foo_callback(foo):
1700        BODY
1701
1702    Thus, to replace 'Jon' with 'John' in author/committer/tagger names:
1703      git filter-repo --name-callback 'return name.replace(b"Jon", b"John")'
1704
1705    To remove all 'Tested-by' tags in commit (or tag) messages:
1706      git filter-repo --message-callback 'return re.sub(br"\\nTested-by:.*", "", message)'
1707
1708    To remove all .DS_Store files:
1709      git filter-repo --filename-callback 'return None if os.path.basename(filename) == b".DS_Store" else filename'
1710
1711    Note that if BODY resolves to a filename, then the contents of that file
1712    will be used as the BODY in the callback function.
1713
1714    For more detailed examples and explanations AND caveats, see
1715      https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#CALLBACKS
1716
1717EXAMPLES
1718
1719    To get a bunch of reports mentioning renames that have occurred in
1720    your repo and listing sizes of objects aggregated by any of path,
1721    directory, extension, or blob-id:
1722      git filter-repo --analyze
1723
1724    (These reports can help you choose how to filter your repo; it can
1725    be useful to re-run this command after filtering to regenerate the
1726    report and verify the changes look correct.)
1727
1728    To extract the history that touched just 'guides' and 'tools/releases':
1729      git filter-repo --path guides/ --path tools/releases
1730
1731    To remove foo.zip and bar/baz/zips from every revision in history:
1732      git filter-repo --path foo.zip --path bar/baz/zips/ --invert-paths
1733
1734    To replace the text 'password' with 'p455w0rd':
1735      git filter-repo --replace-text <(echo "password==>p455w0rd")
1736
1737    To use the current version of the .mailmap file to update authors,
1738    committers, and taggers throughout history and make it permanent:
1739      git filter-repo --use-mailmap
1740
1741    To extract the history of 'src/', rename all files to have a new leading
1742    directory 'my-module' (e.g. src/foo.java -> my-module/src/foo.java), and
1743    add a 'my-module-' prefix to all tags:
1744      git filter-repo --path src/ --to-subdirectory-filter my-module --tag-rename '':'my-module-'
1745
1746    For more detailed examples and explanations, see
1747      https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#EXAMPLES''')
1748
1749    # Create the basic parser
1750    parser = argparse.ArgumentParser(description=summary,
1751                                     usage = argparse.SUPPRESS,
1752                                     add_help = False,
1753                                     epilog = example_text,
1754                                     formatter_class=argparse.RawDescriptionHelpFormatter)
1755
1756    analyze = parser.add_argument_group(title=_("Analysis"))
1757    analyze.add_argument('--analyze', action='store_true',
1758        help=_("Analyze repository history and create a report that may be "
1759               "useful in determining what to filter in a subsequent run. "
1760               "Will not modify your repo."))
1761    analyze.add_argument('--report-dir',
1762        metavar='DIR_OR_FILE',
1763        type=os.fsencode,
1764        dest='report_dir',
1765        help=_("Directory to write report, defaults to GIT_DIR/filter_repo/analysis,"
1766               "refuses to run if exists, --force delete existing dir first."))
1767
1768    path = parser.add_argument_group(title=_("Filtering based on paths "
1769                                             "(see also --filename-callback)"),
1770                                     description=textwrap.dedent(_("""
1771           These options specify the paths to select.  Note that much like git
1772           itself, renames are NOT followed so you may need to specify multiple
1773           paths, e.g. `--path olddir/ --path newdir/`
1774           """[1:])))
1775
1776    path.add_argument('--invert-paths', action='store_false', dest='inclusive',
1777        help=_("Invert the selection of files from the specified "
1778               "--path-{match,glob,regex} options below, i.e. only select "
1779               "files matching none of those options."))
1780
1781    path.add_argument('--path-match', '--path', metavar='DIR_OR_FILE',
1782        type=os.fsencode,
1783        action=FilteringOptions.AppendFilter, dest='path_changes',
1784        help=_("Exact paths (files or directories) to include in filtered "
1785               "history.  Multiple --path options can be specified to get "
1786               "a union of paths."))
1787    path.add_argument('--path-glob', metavar='GLOB', type=os.fsencode,
1788        action=FilteringOptions.AppendFilter, dest='path_changes',
1789        help=_("Glob of paths to include in filtered history. Multiple "
1790               "--path-glob options can be specified to get a union of "
1791               "paths."))
1792    path.add_argument('--path-regex', metavar='REGEX', type=os.fsencode,
1793        action=FilteringOptions.AppendFilter, dest='path_changes',
1794        help=_("Regex of paths to include in filtered history. Multiple "
1795               "--path-regex options can be specified to get a union of "
1796               "paths"))
1797    path.add_argument('--use-base-name', action='store_true',
1798        help=_("Match on file base name instead of full path from the top "
1799               "of the repo.  Incompatible with --path-rename, and "
1800               "incompatible with matching against directory names."))
1801
1802    rename = parser.add_argument_group(title=_("Renaming based on paths "
1803                                             "(see also --filename-callback)"))
1804    rename.add_argument('--path-rename', '--path-rename-match',
1805        metavar='OLD_NAME:NEW_NAME', dest='path_changes', type=os.fsencode,
1806        action=FilteringOptions.AppendFilter,
1807        help=_("Path to rename; if filename or directory matches OLD_NAME "
1808               "rename to NEW_NAME.  Multiple --path-rename options can be "
1809               "specified.  NOTE: If you combine filtering options with "
1810               "renaming ones, do not rely on a rename argument to select "
1811               "paths; you also need a filter to select them."))
1812
1813    helpers = parser.add_argument_group(title=_("Path shortcuts"))
1814    helpers.add_argument('--paths-from-file', metavar='FILENAME',
1815        type=os.fsencode,
1816        action=FilteringOptions.FileWithPathsFilter, dest='path_changes',
1817        help=_("Specify several path filtering and renaming directives, one "
1818               "per line.  Lines with '==>' in them specify path renames, "
1819               "and lines can begin with 'literal:' (the default), 'glob:', "
1820               "or 'regex:' to specify different matching styles.  Blank "
1821               "lines and lines starting with a '#' are ignored."))
1822    helpers.add_argument('--subdirectory-filter', metavar='DIRECTORY',
1823        action=FilteringOptions.HelperFilter, type=os.fsencode,
1824        help=_("Only look at history that touches the given subdirectory "
1825               "and treat that directory as the project root.  Equivalent "
1826               "to using '--path DIRECTORY/ --path-rename DIRECTORY/:'"))
1827    helpers.add_argument('--to-subdirectory-filter', metavar='DIRECTORY',
1828        action=FilteringOptions.HelperFilter, type=os.fsencode,
1829        help=_("Treat the project root as instead being under DIRECTORY. "
1830               "Equivalent to using '--path-rename :DIRECTORY/'"))
1831
1832    contents = parser.add_argument_group(title=_("Content editing filters "
1833                                                 "(see also --blob-callback)"))
1834    contents.add_argument('--replace-text', metavar='EXPRESSIONS_FILE',
1835        help=_("A file with expressions that, if found, will be replaced. "
1836               "By default, each expression is treated as literal text, "
1837               "but 'regex:' and 'glob:' prefixes are supported.  You can "
1838               "end the line with '==>' and some replacement text to "
1839               "choose a replacement choice other than the default of '{}'."
1840               .format(decode(FilteringOptions.default_replace_text))))
1841    contents.add_argument('--strip-blobs-bigger-than', metavar='SIZE',
1842                          dest='max_blob_size', default=0,
1843        help=_("Strip blobs (files) bigger than specified size (e.g. '5M', "
1844               "'2G', etc)"))
1845    contents.add_argument('--strip-blobs-with-ids', metavar='BLOB-ID-FILENAME',
1846        help=_("Read git object ids from each line of the given file, and "
1847               "strip all of them from history"))
1848
1849    refrename = parser.add_argument_group(title=_("Renaming of refs "
1850                                              "(see also --refname-callback)"))
1851    refrename.add_argument('--tag-rename', metavar='OLD:NEW', type=os.fsencode,
1852        help=_("Rename tags starting with OLD to start with NEW.  For "
1853               "example, --tag-rename foo:bar will rename tag foo-1.2.3 "
1854               "to bar-1.2.3; either OLD or NEW can be empty."))
1855
1856    messages = parser.add_argument_group(title=_("Filtering of commit messages "
1857                                               "(see also --message-callback)"))
1858    messages.add_argument('--replace-message', metavar='EXPRESSIONS_FILE',
1859        help=_("A file with expressions that, if found in commit messages, "
1860               "will be replaced. This file uses the same syntax as "
1861               "--replace-text."))
1862    messages.add_argument('--preserve-commit-hashes', action='store_true',
1863        help=_("By default, since commits are rewritten and thus gain new "
1864               "hashes, references to old commit hashes in commit messages "
1865               "are replaced with new commit hashes (abbreviated to the same "
1866               "length as the old reference).  Use this flag to turn off "
1867               "updating commit hashes in commit messages."))
1868    messages.add_argument('--preserve-commit-encoding', action='store_true',
1869        help=_("Do not reencode commit messages into UTF-8.  By default, if "
1870               "the commit object specifies an encoding for the commit "
1871               "message, the message is re-encoded into UTF-8."))
1872
1873    people = parser.add_argument_group(title=_("Filtering of names & emails "
1874                                               "(see also --name-callback "
1875                                               "and --email-callback)"))
1876    people.add_argument('--mailmap', dest='mailmap', metavar='FILENAME',
1877        type=os.fsencode,
1878        help=_("Use specified mailmap file (see git-shortlog(1) for "
1879               "details on the format) when rewriting author, committer, "
1880               "and tagger names and emails.  If the specified file is "
1881               "part of git history, historical versions of the file will "
1882               "be ignored; only the current contents are consulted."))
1883    people.add_argument('--use-mailmap', dest='mailmap',
1884        action='store_const', const=b'.mailmap',
1885        help=_("Same as: '--mailmap .mailmap' "))
1886
1887    parents = parser.add_argument_group(title=_("Parent rewriting"))
1888    parents.add_argument('--replace-refs', default=None,
1889                         choices=['delete-no-add', 'delete-and-add',
1890                                  'update-no-add', 'update-or-add',
1891                                  'update-and-add'],
1892        help=_("Replace refs (see git-replace(1)) are used to rewrite "
1893               "parents (unless turned off by the usual git mechanism); this "
1894               "flag specifies what do do with those refs afterward. "
1895               "Replace refs can either be deleted or updated to point at new "
1896               "commit hashes.  Also, new replace refs can be added for each "
1897               "commit rewrite.  With 'update-or-add', new replace refs are "
1898               "only added for commit rewrites that aren't used to update an "
1899               "existing replace ref. default is 'update-and-add' if "
1900               "$GIT_DIR/filter-repo/already_ran does not exist; "
1901               "'update-or-add' otherwise."))
1902    parents.add_argument('--prune-empty', default='auto',
1903                         choices=['always', 'auto', 'never'],
1904        help=_("Whether to prune empty commits.  'auto' (the default) means "
1905               "only prune commits which become empty (not commits which were "
1906               "empty in the original repo, unless their parent was pruned). "
1907               "When the parent of a commit is pruned, the first non-pruned "
1908               "ancestor becomes the new parent."))
1909    parents.add_argument('--prune-degenerate', default='auto',
1910                         choices=['always', 'auto', 'never'],
1911        help=_("Since merge commits are needed for history topology, they "
1912               "are typically exempt from pruning.  However, they can become "
1913               "degenerate with the pruning of other commits (having fewer "
1914               "than two parents, having one commit serve as both parents, or "
1915               "having one parent as the ancestor of the other.)  If such "
1916               "merge commits have no file changes, they can be pruned.  The "
1917               "default ('auto') is to only prune empty merge commits which "
1918               "become degenerate (not which started as such)."))
1919    parents.add_argument('--no-ff', action='store_true',
1920        help=_("Even if the first parent is or becomes an ancestor of another "
1921               "parent, do not prune it.  This modifies how "
1922               "--prune-degenerate behaves, and may be useful in projects who "
1923               "always use merge --no-ff."))
1924
1925    callback = parser.add_argument_group(title=_("Generic callback code snippets"))
1926    callback.add_argument('--filename-callback', metavar="FUNCTION_BODY_OR_FILE",
1927        help=_("Python code body for processing filenames; see CALLBACKS "
1928               "sections below."))
1929    callback.add_argument('--message-callback', metavar="FUNCTION_BODY_OR_FILE",
1930        help=_("Python code body for processing messages (both commit "
1931               "messages and tag messages); see CALLBACKS section below."))
1932    callback.add_argument('--name-callback', metavar="FUNCTION_BODY_OR_FILE",
1933        help=_("Python code body for processing names of people; see "
1934               "CALLBACKS section below."))
1935    callback.add_argument('--email-callback', metavar="FUNCTION_BODY_OR_FILE",
1936        help=_("Python code body for processing emails addresses; see "
1937               "CALLBACKS section below."))
1938    callback.add_argument('--refname-callback', metavar="FUNCTION_BODY_OR_FILE",
1939        help=_("Python code body for processing refnames; see CALLBACKS "
1940               "section below."))
1941
1942    callback.add_argument('--blob-callback', metavar="FUNCTION_BODY_OR_FILE",
1943        help=_("Python code body for processing blob objects; see "
1944               "CALLBACKS section below."))
1945    callback.add_argument('--commit-callback', metavar="FUNCTION_BODY_OR_FILE",
1946        help=_("Python code body for processing commit objects; see "
1947               "CALLBACKS section below."))
1948    callback.add_argument('--tag-callback', metavar="FUNCTION_BODY_OR_FILE",
1949        help=_("Python code body for processing tag objects; see CALLBACKS "
1950               "section below."))
1951    callback.add_argument('--reset-callback', metavar="FUNCTION_BODY_OR_FILE",
1952        help=_("Python code body for processing reset objects; see "
1953               "CALLBACKS section below."))
1954
1955    desc = _(
1956      "Specifying alternate source or target locations implies --partial,\n"
1957      "except that the normal default for --replace-refs is used.  However,\n"
1958      "unlike normal uses of --partial, this doesn't risk mixing old and new\n"
1959      "history since the old and new histories are in different repositories.")
1960    location = parser.add_argument_group(title=_("Location to filter from/to"),
1961                                         description=desc)
1962    location.add_argument('--source', type=os.fsencode,
1963                          help=_("Git repository to read from"))
1964    location.add_argument('--target', type=os.fsencode,
1965        help=_("Git repository to overwrite with filtered history"))
1966
1967    misc = parser.add_argument_group(title=_("Miscellaneous options"))
1968    misc.add_argument('--help', '-h', action='store_true',
1969        help=_("Show this help message and exit."))
1970    misc.add_argument('--version', action='store_true',
1971        help=_("Display filter-repo's version and exit."))
1972    misc.add_argument('--force', '-f', action='store_true',
1973        help=_("Rewrite repository history even if the current repo does not "
1974               "look like a fresh clone.  History rewriting is irreversible "
1975               "(and includes immediate pruning of reflogs and old objects), "
1976               "so be cautious about using this flag."))
1977    misc.add_argument('--partial', action='store_true',
1978        help=_("Do a partial history rewrite, resulting in the mixture of "
1979               "old and new history.  This implies a default of "
1980               "update-no-add for --replace-refs, disables rewriting "
1981               "refs/remotes/origin/* to refs/heads/*, disables removing "
1982               "of the 'origin' remote, disables removing unexported refs, "
1983               "disables expiring the reflog, and disables the automatic "
1984               "post-filter gc.  Also, this modifies --tag-rename and "
1985               "--refname-callback options such that instead of replacing "
1986               "old refs with new refnames, it will instead create new "
1987               "refs and keep the old ones around.  Use with caution."))
1988    # WARNING: --refs presents a problem with become-degenerate pruning:
1989    #   * Excluding a commit also excludes its ancestors so when some other
1990    #     commit has an excluded ancestor as a parent we have no way of
1991    #     knowing what it is an ancestor of without doing a special
1992    #     full-graph walk.
1993    misc.add_argument('--refs', nargs='+',
1994        help=_("Limit history rewriting to the specified refs.  Implies "
1995               "--partial.  In addition to the normal caveats of --partial "
1996               "(mixing old and new history, no automatic remapping of "
1997               "refs/remotes/origin/* to refs/heads/*, etc.), this also may "
1998               "cause problems for pruning of degenerate empty merge "
1999               "commits when negative revisions are specified."))
2000
2001    misc.add_argument('--dry-run', action='store_true',
2002        help=_("Do not change the repository.  Run `git fast-export` and "
2003               "filter its output, and save both the original and the "
2004               "filtered version for comparison.  This also disables "
2005               "rewriting commit messages due to not knowing new commit "
2006               "IDs and disables filtering of some empty commits due to "
2007               "inability to query the fast-import backend." ))
2008    misc.add_argument('--debug', action='store_true',
2009        help=_("Print additional information about operations being "
2010               "performed and commands being run.  When used together "
2011               "with --dry-run, also show extra information about what "
2012               "would be run."))
2013    # WARNING: --state-branch has some problems:
2014    #   * It does not work well with manually inserted objects (user creating
2015    #     Blob() or Commit() or Tag() objects and calling
2016    #     RepoFilter.insert(obj) on them).
2017    #   * It does not work well with multiple source or multiple target repos
2018    #   * It doesn't work so well with pruning become-empty commits (though
2019    #     --refs doesn't work so well with it either)
2020    # These are probably fixable, given some work (e.g. re-importing the
2021    # graph at the beginning to get the AncestryGraph right, doing our own
2022    # export of marks instead of using fast-export --export-marks, etc.), but
2023    # for now just hide the option.
2024    misc.add_argument('--state-branch',
2025        #help=_("Enable incremental filtering by saving the mapping of old "
2026        #       "to new objects to the specified branch upon exit, and"
2027        #       "loading that mapping from that branch (if it exists) "
2028        #       "upon startup."))
2029        help=argparse.SUPPRESS)
2030    misc.add_argument('--stdin', action='store_true',
2031        help=_("Instead of running `git fast-export` and filtering its "
2032               "output, filter the fast-export stream from stdin.    The "
2033               "stdin must be in the expected input format (e.g. it needs "
2034               "to include original-oid directives)."))
2035    misc.add_argument('--quiet', action='store_true',
2036        help=_("Pass --quiet to other git commands called"))
2037    return parser
2038
2039  @staticmethod
2040  def sanity_check_args(args):
2041    if args.analyze and args.path_changes:
2042      raise SystemExit(_("Error: --analyze is incompatible with --path* flags; "
2043                         "it's a read-only operation."))
2044    if args.analyze and args.stdin:
2045      raise SystemExit(_("Error: --analyze is incompatible with --stdin."))
2046    # If no path_changes are found, initialize with empty list but mark as
2047    # not inclusive so that all files match
2048    if args.path_changes == None:
2049      args.path_changes = []
2050      args.inclusive = False
2051    else:
2052      # Similarly, if we have no filtering paths, then no path should be
2053      # filtered out.  Based on how newname() works, the easiest way to
2054      # achieve that is setting args.inclusive to False.
2055      if not any(x[0] == 'filter' for x in args.path_changes):
2056        args.inclusive = False
2057      # Also check for incompatible --use-base-name and --path-rename flags.
2058      if args.use_base_name:
2059        if any(x[0] == 'rename' for x in args.path_changes):
2060          raise SystemExit(_("Error: --use-base-name and --path-rename are "
2061                             "incompatible."))
2062    # Also throw some sanity checks on git version here;
2063    # PERF: remove these checks once new enough git versions are common
2064    p = subproc.Popen('git fast-export -h'.split(),
2065                      stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2066    output = p.stdout.read()
2067    if b'--anonymize-map' not in output: # pragma: no cover
2068      global date_format_permissive
2069      date_format_permissive = False
2070    if b'--mark-tags' not in output: # pragma: no cover
2071      global write_marks
2072      write_marks = False
2073      if args.state_branch:
2074        # We need a version of git-fast-export with --mark-tags
2075        raise SystemExit(_("Error: need git >= 2.24.0"))
2076    if b'--reencode' not in output: # pragma: no cover
2077      if args.preserve_commit_encoding:
2078        # We need a version of git-fast-export with --reencode
2079        raise SystemExit(_("Error: need git >= 2.23.0"))
2080      else:
2081        # Set args.preserve_commit_encoding to None which we'll check for later
2082        # to avoid passing --reencode=yes to fast-export (that option was the
2083        # default prior to git-2.23)
2084        args.preserve_commit_encoding = None
2085      # If we don't have fast-exoprt --reencode, we may also be missing
2086      # diff-tree --combined-all-paths, which is even more important...
2087      p = subproc.Popen('git diff-tree -h'.split(),
2088                        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2089      output = p.stdout.read()
2090      if b'--combined-all-paths' not in output:
2091        # We need a version of git-diff-tree with --combined-all-paths
2092        raise SystemExit(_("Error: need git >= 2.22.0"))
2093    # End of sanity checks on git version
2094    if args.max_blob_size:
2095      suffix = args.max_blob_size[-1]
2096      if suffix not in '1234567890':
2097        mult = {'K': 1024, 'M': 1024**2, 'G': 1024**3}
2098        if suffix not in mult:
2099          raise SystemExit(_("Error: could not parse --strip-blobs-bigger-than"
2100                             " argument %s")
2101                           % args.max_blob_size)
2102        args.max_blob_size = int(args.max_blob_size[0:-1]) * mult[suffix]
2103      else:
2104        args.max_blob_size = int(args.max_blob_size)
2105
2106  @staticmethod
2107  def get_replace_text(filename):
2108    replace_literals = []
2109    replace_regexes = []
2110    with open(filename, 'br') as f:
2111      for line in f:
2112        line = line.rstrip(b'\r\n')
2113
2114        # Determine the replacement
2115        replacement = FilteringOptions.default_replace_text
2116        if b'==>' in line:
2117          line, replacement = line.rsplit(b'==>', 1)
2118
2119        # See if we need to match via regex
2120        regex = None
2121        if line.startswith(b'regex:'):
2122          regex = line[6:]
2123        elif line.startswith(b'glob:'):
2124          regex = glob_to_regex(line[5:])
2125        if regex:
2126          replace_regexes.append((re.compile(regex), replacement))
2127        else:
2128          # Otherwise, find the literal we need to replace
2129          if line.startswith(b'literal:'):
2130            line = line[8:]
2131          if not line:
2132            continue
2133          replace_literals.append((line, replacement))
2134    return {'literals': replace_literals, 'regexes':  replace_regexes}
2135
2136  @staticmethod
2137  def get_paths_from_file(filename):
2138    new_path_changes = []
2139    with open(filename, 'br') as f:
2140      for line in f:
2141        line = line.rstrip(b'\r\n')
2142
2143        # Skip blank lines
2144        if not line:
2145          continue
2146        # Skip comment lines
2147        if line.startswith(b'#'):
2148          continue
2149
2150        # Determine the replacement
2151        match_type, repl = 'literal', None
2152        if b'==>' in line:
2153          line, repl = line.rsplit(b'==>', 1)
2154
2155        # See if we need to match via regex
2156        match_type = 'match' # a.k.a. 'literal'
2157        if line.startswith(b'regex:'):
2158          match_type = 'regex'
2159          match = re.compile(line[6:])
2160        elif line.startswith(b'glob:'):
2161          match_type = 'glob'
2162          match = line[5:]
2163          if repl:
2164            raise SystemExit(_("Error: In %s, 'glob:' and '==>' are incompatible (renaming globs makes no sense)" % decode(filename)))
2165        else:
2166          if line.startswith(b'literal:'):
2167            match = line[8:]
2168          else:
2169            match = line
2170          if repl is not None:
2171            if match and repl and match.endswith(b'/') != repl.endswith(b'/'):
2172              raise SystemExit(_("Error: When rename directories, if OLDNAME "
2173                                 "and NEW_NAME are both non-empty and either "
2174                                 "ends with a slash then both must."))
2175
2176        # Record the filter or rename
2177        if repl is not None:
2178          new_path_changes.append(['rename', match_type, (match, repl)])
2179        else:
2180          new_path_changes.append(['filter', match_type, match])
2181          if match_type == 'glob' and not match.endswith(b'*'):
2182            extension = b'*' if match.endswith(b'/') else b'/*'
2183            new_path_changes.append(['filter', match_type, match+extension])
2184      return new_path_changes
2185
2186  @staticmethod
2187  def default_options():
2188    return FilteringOptions.parse_args([], error_on_empty = False)
2189
2190  @staticmethod
2191  def parse_args(input_args, error_on_empty = True):
2192    parser = FilteringOptions.create_arg_parser()
2193    if not input_args and error_on_empty:
2194      parser.print_usage()
2195      raise SystemExit(_("No arguments specified."))
2196    args = parser.parse_args(input_args)
2197    if args.help:
2198      parser.print_help()
2199      raise SystemExit()
2200    if args.version:
2201      GitUtils.print_my_version()
2202      raise SystemExit()
2203    FilteringOptions.sanity_check_args(args)
2204    if args.mailmap:
2205      args.mailmap = MailmapInfo(args.mailmap)
2206    if args.replace_text:
2207      args.replace_text = FilteringOptions.get_replace_text(args.replace_text)
2208    if args.replace_message:
2209      args.replace_message = FilteringOptions.get_replace_text(args.replace_message)
2210    if args.strip_blobs_with_ids:
2211      with open(args.strip_blobs_with_ids, 'br') as f:
2212        args.strip_blobs_with_ids = set(f.read().split())
2213    else:
2214      args.strip_blobs_with_ids = set()
2215    if (args.partial or args.refs) and not args.replace_refs:
2216      args.replace_refs = 'update-no-add'
2217    args.repack = not (args.partial or args.refs)
2218    if args.refs or args.source or args.target:
2219      args.partial = True
2220    if not args.refs:
2221      args.refs = ['--all']
2222    return args
2223
2224class RepoAnalyze(object):
2225
2226  # First, several helper functions for analyze_commit()
2227
2228  @staticmethod
2229  def equiv_class(stats, filename):
2230    return stats['equivalence'].get(filename, (filename,))
2231
2232  @staticmethod
2233  def setup_equivalence_for_rename(stats, oldname, newname):
2234    # if A is renamed to B and B is renamed to C, then the user thinks of
2235    # A, B, and C as all being different names for the same 'file'.  We record
2236    # this as an equivalence class:
2237    #   stats['equivalence'][name] = (A,B,C)
2238    # for name being each of A, B, and C.
2239    old_tuple = stats['equivalence'].get(oldname, ())
2240    if newname in old_tuple:
2241      return
2242    elif old_tuple:
2243      new_tuple = tuple(list(old_tuple)+[newname])
2244    else:
2245      new_tuple = (oldname, newname)
2246    for f in new_tuple:
2247      stats['equivalence'][f] = new_tuple
2248
2249  @staticmethod
2250  def setup_or_update_rename_history(stats, commit, oldname, newname):
2251    rename_commits = stats['rename_history'].get(oldname, set())
2252    rename_commits.add(commit)
2253    stats['rename_history'][oldname] = rename_commits
2254
2255  @staticmethod
2256  def handle_renames(stats, commit, change_types, filenames):
2257    for index, change_type in enumerate(change_types):
2258      if change_type == ord(b'R'):
2259        oldname, newname = filenames[index], filenames[-1]
2260        RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
2261        RepoAnalyze.setup_or_update_rename_history(stats, commit,
2262                                                   oldname, newname)
2263
2264  @staticmethod
2265  def handle_file(stats, graph, commit, modes, shas, filenames):
2266    mode, sha, filename = modes[-1], shas[-1], filenames[-1]
2267
2268    # Figure out kind of deletions to undo for this file, and update lists
2269    # of all-names-by-sha and all-filenames
2270    delmode = 'tree_deletions'
2271    if mode != b'040000':
2272      delmode = 'file_deletions'
2273      stats['names'][sha].add(filename)
2274      stats['allnames'].add(filename)
2275
2276    # If the file (or equivalence class of files) was recorded as deleted,
2277    # clearly it isn't anymore
2278    equiv = RepoAnalyze.equiv_class(stats, filename)
2279    for f in equiv:
2280      stats[delmode].pop(f, None)
2281
2282    # If we get a modify/add for a path that was renamed, we may need to break
2283    # the equivalence class.  However, if the modify/add was on a branch that
2284    # doesn't have the rename in its history, we are still okay.
2285    need_to_break_equivalence = False
2286    if equiv[-1] != filename:
2287      for rename_commit in stats['rename_history'][filename]:
2288        if graph.is_ancestor(rename_commit, commit):
2289          need_to_break_equivalence = True
2290
2291    if need_to_break_equivalence:
2292      for f in equiv:
2293        if f in stats['equivalence']:
2294          del stats['equivalence'][f]
2295
2296  @staticmethod
2297  def analyze_commit(stats, graph, commit, parents, date, file_changes):
2298    graph.add_commit_and_parents(commit, parents)
2299    for change in file_changes:
2300      modes, shas, change_types, filenames = change
2301      if len(parents) == 1 and change_types.startswith(b'R'):
2302        change_types = b'R'  # remove the rename score; we don't care
2303      if modes[-1] == b'160000':
2304        continue
2305      elif modes[-1] == b'000000':
2306        # Track when files/directories are deleted
2307        for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
2308          if any(x == b'040000' for x in modes[0:-1]):
2309            stats['tree_deletions'][f] = date
2310          else:
2311            stats['file_deletions'][f] = date
2312      elif change_types.strip(b'AMT') == b'':
2313        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
2314      elif modes[-1] == b'040000' and change_types.strip(b'RAM') == b'':
2315        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
2316      elif change_types.strip(b'RAMT') == b'':
2317        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
2318        RepoAnalyze.handle_renames(stats, commit, change_types, filenames)
2319      else:
2320        raise SystemExit(_("Unhandled change type(s): %(change_type)s "
2321                           "(in commit %(commit)s)")
2322                         % ({'change_type': change_types, 'commit': commit})
2323                         ) # pragma: no cover
2324
2325  @staticmethod
2326  def gather_data(args):
2327    unpacked_size, packed_size = GitUtils.get_blob_sizes()
2328    stats = {'names': collections.defaultdict(set),
2329             'allnames' : set(),
2330             'file_deletions': {},
2331             'tree_deletions': {},
2332             'equivalence': {},
2333             'rename_history': collections.defaultdict(set),
2334             'unpacked_size': unpacked_size,
2335             'packed_size': packed_size,
2336             'num_commits': 0}
2337
2338    # Setup the rev-list/diff-tree process
2339    processed_commits_msg = _("Processed %d commits")
2340    commit_parse_progress = ProgressWriter()
2341    num_commits = 0
2342    cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) +
2343           ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
2344           ' --date=short -M -t -c --raw --combined-all-paths')
2345    dtp = subproc.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
2346    f = dtp.stdout
2347    line = f.readline()
2348    if not line:
2349      raise SystemExit(_("Nothing to analyze; repository is empty."))
2350    cont = bool(line)
2351    graph = AncestryGraph()
2352    while cont:
2353      commit = line.rstrip()
2354      parents = f.readline().split()
2355      date = f.readline().rstrip()
2356
2357      # We expect a blank line next; if we get a non-blank line then
2358      # this commit modified no files and we need to move on to the next.
2359      # If there is no line, we've reached end-of-input.
2360      line = f.readline()
2361      if not line:
2362        cont = False
2363      line = line.rstrip()
2364
2365      # If we haven't reached end of input, and we got a blank line meaning
2366      # a commit that has modified files, then get the file changes associated
2367      # with this commit.
2368      file_changes = []
2369      if cont and not line:
2370        cont = False
2371        for line in f:
2372          if not line.startswith(b':'):
2373            cont = True
2374            break
2375          n = 1+max(1, len(parents))
2376          assert line.startswith(b':'*(n-1))
2377          relevant = line[n-1:-1]
2378          splits = relevant.split(None, n)
2379          modes = splits[0:n]
2380          splits = splits[n].split(None, n)
2381          shas = splits[0:n]
2382          splits = splits[n].split(b'\t')
2383          change_types = splits[0]
2384          filenames = [PathQuoting.dequote(x) for x in splits[1:]]
2385          file_changes.append([modes, shas, change_types, filenames])
2386
2387      # If someone is trying to analyze a subset of the history, make sure
2388      # to avoid dying on commits with parents that we haven't seen before
2389      if args.refs:
2390        graph.record_external_commits([p for p in parents
2391                                       if not p in graph.value])
2392
2393      # Analyze this commit and update progress
2394      RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
2395                                 file_changes)
2396      num_commits += 1
2397      commit_parse_progress.show(processed_commits_msg % num_commits)
2398
2399    # Show the final commits processed message and record the number of commits
2400    commit_parse_progress.finish()
2401    stats['num_commits'] = num_commits
2402
2403    # Close the output, ensure rev-list|diff-tree pipeline completed successfully
2404    dtp.stdout.close()
2405    if dtp.wait():
2406      raise SystemExit(_("Error: rev-list|diff-tree pipeline failed; see above.")) # pragma: no cover
2407
2408    return stats
2409
2410  @staticmethod
2411  def write_report(reportdir, stats):
2412    def datestr(datetimestr):
2413      return datetimestr if datetimestr else _('<present>').encode()
2414
2415    def dirnames(path):
2416      while True:
2417        path = os.path.dirname(path)
2418        yield path
2419        if path == b'':
2420          break
2421
2422    # Compute aggregate size information for paths, extensions, and dirs
2423    total_size = {'packed': 0, 'unpacked': 0}
2424    path_size = {'packed': collections.defaultdict(int),
2425                 'unpacked': collections.defaultdict(int)}
2426    ext_size = {'packed': collections.defaultdict(int),
2427                'unpacked': collections.defaultdict(int)}
2428    dir_size = {'packed': collections.defaultdict(int),
2429                'unpacked': collections.defaultdict(int)}
2430    for sha in stats['names']:
2431      size = {'packed': stats['packed_size'][sha],
2432              'unpacked': stats['unpacked_size'][sha]}
2433      for which in ('packed', 'unpacked'):
2434        for name in stats['names'][sha]:
2435          total_size[which] += size[which]
2436          path_size[which][name] += size[which]
2437          basename, ext = os.path.splitext(name)
2438          ext_size[which][ext] += size[which]
2439          for dirname in dirnames(name):
2440            dir_size[which][dirname] += size[which]
2441
2442    # Determine if and when extensions and directories were deleted
2443    ext_deleted_data = {}
2444    for name in stats['allnames']:
2445      when = stats['file_deletions'].get(name, None)
2446
2447      # Update the extension
2448      basename, ext = os.path.splitext(name)
2449      if when is None:
2450        ext_deleted_data[ext] = None
2451      elif ext in ext_deleted_data:
2452        if ext_deleted_data[ext] is not None:
2453          ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
2454      else:
2455        ext_deleted_data[ext] = when
2456
2457    dir_deleted_data = {}
2458    for name in dir_size['packed']:
2459      dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
2460
2461    with open(os.path.join(reportdir, b"README"), 'bw') as f:
2462      # Give a basic overview of this file
2463      f.write(b"== %s ==\n" % _("Overall Statistics").encode())
2464      f.write(("  %s: %d\n" % (_("Number of commits"),
2465                               stats['num_commits'])).encode())
2466      f.write(("  %s: %d\n" % (_("Number of filenames"),
2467                               len(path_size['packed']))).encode())
2468      f.write(("  %s: %d\n" % (_("Number of directories"),
2469                               len(dir_size['packed']))).encode())
2470      f.write(("  %s: %d\n" % (_("Number of file extensions"),
2471                               len(ext_size['packed']))).encode())
2472      f.write(b"\n")
2473      f.write(("  %s: %d\n" % (_("Total unpacked size (bytes)"),
2474                               total_size['unpacked'])).encode())
2475      f.write(("  %s: %d\n" % (_("Total packed size (bytes)"),
2476                               total_size['packed'])).encode())
2477      f.write(b"\n")
2478
2479      # Mention issues with the report
2480      f.write(("== %s ==\n" % _("Caveats")).encode())
2481      f.write(("=== %s ===\n" % _("Sizes")).encode())
2482      f.write(textwrap.dedent(_("""
2483        Packed size represents what size your repository would be if no
2484        trees, commits, tags, or other metadata were included (though it may
2485        fail to represent de-duplication; see below).  It also represents the
2486        current packing, which may be suboptimal if you haven't gc'ed for a
2487        while.
2488
2489        Unpacked size represents what size your repository would be if no
2490        trees, commits, tags, or other metadata were included AND if no
2491        files were packed; i.e., without delta-ing or compression.
2492
2493        Both unpacked and packed sizes can be slightly misleading.  Deleting
2494        a blob from history not save as much space as the unpacked size,
2495        because it is obviously normally stored in packed form.  Also,
2496        deleting a blob from history may not save as much space as its packed
2497        size either, because another blob could be stored as a delta against
2498        that blob, so when you remove one blob another blob's packed size may
2499        grow.
2500
2501        Also, the sum of the packed sizes can add up to more than the
2502        repository size; if the same contents appeared in the repository in
2503        multiple places, git will automatically de-dupe and store only one
2504        copy, while the way sizes are added in this analysis adds the size
2505        for each file path that has those contents.  Further, if a file is
2506        ever reverted to a previous version's contents, the previous
2507        version's size will be counted multiple times in this analysis, even
2508        though git will only store it once.
2509        """)[1:]).encode())
2510      f.write(b"\n")
2511      f.write(("=== %s ===\n" % _("Deletions")).encode())
2512      f.write(textwrap.dedent(_("""
2513        Whether a file is deleted is not a binary quality, since it can be
2514        deleted on some branches but still exist in others.  Also, it might
2515        exist in an old tag, but have been deleted in versions newer than
2516        that.  More thorough tracking could be done, including looking at
2517        merge commits where one side of history deleted and the other modified,
2518        in order to give a more holistic picture of deletions.  However, that
2519        algorithm would not only be more complex to implement, it'd also be
2520        quite difficult to present and interpret by users.  Since --analyze
2521        is just about getting a high-level rough picture of history, it instead
2522        implements the simplistic rule that is good enough for 98% of cases:
2523          A file is marked as deleted if the last commit in the fast-export
2524          stream that mentions the file lists it as deleted.
2525        This makes it dependent on topological ordering, but generally gives
2526        the "right" answer.
2527        """)[1:]).encode())
2528      f.write(b"\n")
2529      f.write(("=== %s ===\n" % _("Renames")).encode())
2530      f.write(textwrap.dedent(_("""
2531        Renames share the same non-binary nature that deletions do, plus
2532        additional challenges:
2533          * If the renamed file is renamed again, instead of just two names for
2534            a path you can have three or more.
2535          * Rename pairs of the form (oldname, newname) that we consider to be
2536            different names of the "same file" might only be valid over certain
2537            commit ranges.  For example, if a new commit reintroduces a file
2538            named oldname, then new versions of oldname aren't the "same file"
2539            anymore.  We could try to portray this to the user, but it's easier
2540            for the user to just break the pairing and only report unbroken
2541            rename pairings to the user.
2542          * The ability for users to rename files differently in different
2543            branches means that our chains of renames will not necessarily be
2544            linear but may branch out.
2545        """)[1:]).encode())
2546      f.write(b"\n")
2547
2548    # Equivalence classes for names, so if folks only want to keep a
2549    # certain set of paths, they know the old names they want to include
2550    # too.
2551    with open(os.path.join(reportdir, b"renames.txt"), 'bw') as f:
2552      seen = set()
2553      for pathname,equiv_group in sorted(stats['equivalence'].items(),
2554                                         key=lambda x:(x[1], x[0])):
2555        if equiv_group in seen:
2556          continue
2557        seen.add(equiv_group)
2558        f.write(("{} ->\n    ".format(decode(equiv_group[0])) +
2559                     "\n    ".join(decode(x) for x in equiv_group[1:]) +
2560                 "\n").encode())
2561
2562    # List directories in reverse sorted order of unpacked size
2563    with open(os.path.join(reportdir, b"directories-deleted-sizes.txt"), 'bw') as f:
2564      msg = "=== %s ===\n" % _("Deleted directories by reverse size")
2565      f.write(msg.encode())
2566      msg = _("Format: unpacked size, packed size, date deleted, directory name\n")
2567      f.write(msg.encode())
2568      for dirname, size in sorted(dir_size['packed'].items(),
2569                                  key=lambda x:(x[1],x[0]), reverse=True):
2570        if (dir_deleted_data[dirname]):
2571          f.write(b"  %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname],
2572                                              size,
2573                                              datestr(dir_deleted_data[dirname]),
2574                                              dirname or _('<toplevel>').encode()))
2575
2576    with open(os.path.join(reportdir, b"directories-all-sizes.txt"), 'bw') as f:
2577      f.write(("=== %s ===\n" % _("All directories by reverse size")).encode())
2578      msg = _("Format: unpacked size, packed size, date deleted, directory name\n")
2579      f.write(msg.encode())
2580      for dirname, size in sorted(dir_size['packed'].items(),
2581                                  key=lambda x:(x[1],x[0]), reverse=True):
2582        f.write(b"  %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname],
2583                                            size,
2584                                            datestr(dir_deleted_data[dirname]),
2585                                            dirname or _("<toplevel>").encode()))
2586
2587    # List extensions in reverse sorted order of unpacked size
2588    with open(os.path.join(reportdir, b"extensions-deleted-sizes.txt"), 'bw') as f:
2589      msg = "=== %s ===\n" % _("Deleted extensions by reverse size")
2590      f.write(msg.encode())
2591      msg = _("Format: unpacked size, packed size, date deleted, extension name\n")
2592      f.write(msg.encode())
2593      for extname, size in sorted(ext_size['packed'].items(),
2594                                  key=lambda x:(x[1],x[0]), reverse=True):
2595        if (ext_deleted_data[extname]):
2596          f.write(b"  %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname],
2597                                              size,
2598                                              datestr(ext_deleted_data[extname]),
2599                                              extname or _('<no extension>').encode()))
2600
2601    with open(os.path.join(reportdir, b"extensions-all-sizes.txt"), 'bw') as f:
2602      f.write(("=== %s ===\n" % _("All extensions by reverse size")).encode())
2603      msg = _("Format: unpacked size, packed size, date deleted, extension name\n")
2604      f.write(msg.encode())
2605      for extname, size in sorted(ext_size['packed'].items(),
2606                                  key=lambda x:(x[1],x[0]), reverse=True):
2607        f.write(b"  %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname],
2608                                            size,
2609                                            datestr(ext_deleted_data[extname]),
2610                                            extname or _('<no extension>').encode()))
2611
2612    # List files in reverse sorted order of unpacked size
2613    with open(os.path.join(reportdir, b"path-deleted-sizes.txt"), 'bw') as f:
2614      msg = "=== %s ===\n" % _("Deleted paths by reverse accumulated size")
2615      f.write(msg.encode())
2616      msg = _("Format: unpacked size, packed size, date deleted, path name(s)\n")
2617      f.write(msg.encode())
2618      for pathname, size in sorted(path_size['packed'].items(),
2619                                   key=lambda x:(x[1],x[0]), reverse=True):
2620        when = stats['file_deletions'].get(pathname, None)
2621        if when:
2622          f.write(b"  %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname],
2623                                              size,
2624                                              datestr(when),
2625                                              pathname))
2626
2627    with open(os.path.join(reportdir, b"path-all-sizes.txt"), 'bw') as f:
2628      msg = "=== %s ===\n" % _("All paths by reverse accumulated size")
2629      f.write(msg.encode())
2630      msg = _("Format: unpacked size, packed size, date deleted, path name\n")
2631      f.write(msg.encode())
2632      for pathname, size in sorted(path_size['packed'].items(),
2633                                   key=lambda x:(x[1],x[0]), reverse=True):
2634        when = stats['file_deletions'].get(pathname, None)
2635        f.write(b"  %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname],
2636                                            size,
2637                                            datestr(when),
2638                                            pathname))
2639
2640    # List of filenames and sizes in descending order
2641    with open(os.path.join(reportdir, b"blob-shas-and-paths.txt"), 'bw') as f:
2642      f.write(("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")).encode())
2643      f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n").encode())
2644      for sha, size in sorted(stats['packed_size'].items(),
2645                              key=lambda x:(x[1],x[0]), reverse=True):
2646        if sha not in stats['names']:
2647          # Some objects in the repository might not be referenced, or not
2648          # referenced by the branches/tags the user cares about; skip them.
2649          continue
2650        names_with_sha = stats['names'][sha]
2651        if len(names_with_sha) == 1:
2652          names_with_sha = names_with_sha.pop()
2653        else:
2654          names_with_sha = b'[' + b', '.join(sorted(names_with_sha)) + b']'
2655        f.write(b"  %s %10d %10d %s\n" % (sha,
2656                                          stats['unpacked_size'][sha],
2657                                          size,
2658                                          names_with_sha))
2659
2660  @staticmethod
2661  def run(args):
2662    if args.report_dir:
2663      reportdir = args.report_dir
2664    else:
2665      git_dir = GitUtils.determine_git_dir(b'.')
2666
2667    # Create the report directory as necessary
2668      results_tmp_dir = os.path.join(git_dir, b'filter-repo')
2669      if not os.path.isdir(results_tmp_dir):
2670        os.mkdir(results_tmp_dir)
2671      reportdir = os.path.join(results_tmp_dir, b"analysis")
2672
2673    if os.path.isdir(reportdir):
2674      if args.force:
2675        sys.stdout.write(_("Warning: Removing recursively: \"%s\"") % decode(reportdir))
2676        shutil.rmtree(reportdir)
2677      else:
2678        sys.stdout.write(_("Error: dir already exists (use --force to delete): \"%s\"\n") % decode(reportdir))
2679        sys.exit(1)
2680
2681    os.mkdir(reportdir)
2682
2683    # Gather the data we need
2684    stats = RepoAnalyze.gather_data(args)
2685
2686    # Write the reports
2687    sys.stdout.write(_("Writing reports to %s...") % decode(reportdir))
2688    sys.stdout.flush()
2689    RepoAnalyze.write_report(reportdir, stats)
2690    sys.stdout.write(_("done.\n"))
2691
2692class InputFileBackup:
2693  def __init__(self, input_file, output_file):
2694    self.input_file  = input_file
2695    self.output_file = output_file
2696
2697  def close(self):
2698    self.input_file.close()
2699    self.output_file.close()
2700
2701  def read(self, size):
2702    output = self.input_file.read(size)
2703    self.output_file.write(output)
2704    return output
2705
2706  def readline(self):
2707    line = self.input_file.readline()
2708    self.output_file.write(line)
2709    return line
2710
2711class DualFileWriter:
2712  def __init__(self, file1, file2):
2713    self.file1 = file1
2714    self.file2 = file2
2715
2716  def write(self, *args):
2717    self.file1.write(*args)
2718    self.file2.write(*args)
2719
2720  def flush(self):
2721    self.file1.flush()
2722    self.file2.flush()
2723
2724  def close(self):
2725    self.file1.close()
2726    self.file2.close()
2727
2728class RepoFilter(object):
2729  def __init__(self,
2730               args,
2731               filename_callback = None,
2732               message_callback = None,
2733               name_callback = None,
2734               email_callback = None,
2735               refname_callback = None,
2736               blob_callback = None,
2737               commit_callback = None,
2738               tag_callback = None,
2739               reset_callback = None,
2740               done_callback = None):
2741
2742    self._args = args
2743
2744    # Repo we are exporting
2745    self._repo_working_dir = None
2746
2747    # Store callbacks for acting on objects printed by FastExport
2748    self._blob_callback        = blob_callback
2749    self._commit_callback      = commit_callback
2750    self._tag_callback         = tag_callback
2751    self._reset_callback       = reset_callback
2752    self._done_callback        = done_callback
2753
2754    # Store callbacks for acting on slices of FastExport objects
2755    self._filename_callback    = filename_callback  # filenames from commits
2756    self._message_callback     = message_callback   # commit OR tag message
2757    self._name_callback        = name_callback      # author, committer, tagger
2758    self._email_callback       = email_callback     # author, committer, tagger
2759    self._refname_callback     = refname_callback   # from commit/tag/reset
2760    self._handle_arg_callbacks()
2761
2762    # Defaults for input
2763    self._input = None
2764    self._fep = None  # Fast Export Process
2765    self._fe_orig = None  # Path to where original fast-export output stored
2766    self._fe_filt = None  # Path to where filtered fast-export output stored
2767    self._parser = None # FastExportParser object we are working with
2768
2769    # Defaults for output
2770    self._output = None
2771    self._fip = None  # Fast Import Process
2772    self._import_pipes = None
2773    self._managed_output = True
2774
2775    # A tuple of (depth, list-of-ancestors).  Commits and ancestors are
2776    # identified by their id (their 'mark' in fast-export or fast-import
2777    # speak).  The depth of a commit is one more than the max depth of any
2778    # of its ancestors.
2779    self._graph = AncestryGraph()
2780    # Another one, for ancestry of commits in the original repo
2781    self._orig_graph = AncestryGraph()
2782
2783    # Names of files that were tweaked in any commit; such paths could lead
2784    # to subsequent commits being empty
2785    self._files_tweaked = set()
2786
2787    # A set of commit hash pairs (oldhash, newhash) which used to be merge
2788    # commits but due to filtering were turned into non-merge commits.
2789    # The commits probably have suboptimal commit messages (e.g. "Merge branch
2790    # next into master").
2791    self._commits_no_longer_merges = []
2792
2793    # A dict of original_ids to new_ids; filtering commits means getting
2794    # new commit hash (sha1sums), and we record the mapping both for
2795    # diagnostic purposes and so we can rewrite commit messages.  Note that
2796    # the new_id can be None rather than a commit hash if the original
2797    # commit became empty and was pruned or was otherwise dropped.
2798    self._commit_renames = {}
2799
2800    # A set of original_ids for which we have not yet gotten the
2801    # new_ids; we use OrderedDict because we need to know the order of
2802    # insertion, but the values are always ignored (and set to None).
2803    # If there was an OrderedSet class, I'd use it instead.
2804    self._pending_renames = collections.OrderedDict()
2805
2806    # A dict of commit_hash[0:7] -> set(commit_hashes with that prefix).
2807    #
2808    # It's common for commit messages to refer to commits by abbreviated
2809    # commit hashes, as short as 7 characters.  To facilitate translating
2810    # such short hashes, we have a mapping of prefixes to full old hashes.
2811    self._commit_short_old_hashes = collections.defaultdict(set)
2812
2813    # A set of commit hash references appearing in commit messages which
2814    # mapped to a valid commit that was removed entirely in the filtering
2815    # process.  The commit message will continue to reference the
2816    # now-missing commit hash, since there was nothing to map it to.
2817    self._commits_referenced_but_removed = set()
2818
2819    # Progress handling (number of commits parsed, etc.)
2820    self._progress_writer = ProgressWriter()
2821    self._num_commits = 0
2822
2823    # Size of blobs in the repo
2824    self._unpacked_size = {}
2825
2826    # Other vars
2827    self._sanity_checks_handled = False
2828    self._finalize_handled = False
2829    self._orig_refs = None
2830    self._newnames = {}
2831
2832    # Cache a few message translations for performance reasons
2833    self._parsed_message = _("Parsed %d commits")
2834
2835    # Compile some regexes and cache those
2836    self._hash_re = re.compile(br'(\b[0-9a-f]{7,40}\b)')
2837
2838  def _handle_arg_callbacks(self):
2839    def make_callback(argname, str):
2840      exec('def callback({}, _do_not_use_this_var = None):\n'.format(argname)+
2841           '  '+'\n  '.join(str.splitlines()), globals())
2842      return callback #namespace['callback']
2843    def handle(type):
2844      callback_field = '_{}_callback'.format(type)
2845      code_string = getattr(self._args, type+'_callback')
2846      if code_string:
2847        if os.path.exists(code_string):
2848          with open(code_string, 'r', encoding='utf-8') as f:
2849            code_string = f.read()
2850        if getattr(self, callback_field):
2851          raise SystemExit(_("Error: Cannot pass a %s_callback to RepoFilter "
2852                             "AND pass --%s-callback"
2853                           % (type, type)))
2854        if 'return ' not in code_string and \
2855           type not in ('blob', 'commit', 'tag', 'reset'):
2856          raise SystemExit(_("Error: --%s-callback should have a return statement")
2857                           % type)
2858        setattr(self, callback_field, make_callback(type, code_string))
2859    handle('filename')
2860    handle('message')
2861    handle('name')
2862    handle('email')
2863    handle('refname')
2864    handle('blob')
2865    handle('commit')
2866    handle('tag')
2867    handle('reset')
2868
2869  def _run_sanity_checks(self):
2870    self._sanity_checks_handled = True
2871    if not self._managed_output:
2872      if not self._args.replace_refs:
2873        # If not _managed_output we don't want to make extra changes to the
2874        # repo, so set default to no-op 'update-no-add'
2875        self._args.replace_refs = 'update-no-add'
2876      return
2877
2878    if self._args.debug:
2879      print("[DEBUG] Passed arguments:\n{}".format(self._args))
2880
2881    # Determine basic repository information
2882    target_working_dir = self._args.target or b'.'
2883    self._orig_refs = GitUtils.get_refs(target_working_dir)
2884    is_bare = GitUtils.is_repository_bare(target_working_dir)
2885
2886    # Determine if this is second or later run of filter-repo
2887    tmp_dir = self.results_tmp_dir(create_if_missing=False)
2888    already_ran = os.path.isfile(os.path.join(tmp_dir, b'already_ran'))
2889
2890    # Default for --replace-refs
2891    if not self._args.replace_refs:
2892        self._args.replace_refs = ('update-or-add' if already_ran
2893                                   else 'update-and-add')
2894
2895    # Do sanity checks from the correct directory
2896    if not self._args.force and not already_ran:
2897      cwd = os.getcwd()
2898      os.chdir(target_working_dir)
2899      RepoFilter.sanity_check(self._orig_refs, is_bare)
2900      os.chdir(cwd)
2901
2902  @staticmethod
2903  def sanity_check(refs, is_bare):
2904    def abort(reason):
2905      try:
2906        cmd = 'git config remote.origin.url'
2907        output = subproc.check_output(cmd.split()).strip()
2908      except subprocess.CalledProcessError as e:
2909        output = None
2910      msg = ""
2911      if output and os.path.isdir(output):
2912        msg = _("Note: when cloning local repositories, you need to pass\n"
2913                "      --no-local to git clone to avoid this issue.\n")
2914      raise SystemExit(
2915        _("Aborting: Refusing to destructively overwrite repo history since\n"
2916          "this does not look like a fresh clone.\n"
2917          "  (%s)\n%s"
2918          "Please operate on a fresh clone instead.  If you want to proceed\n"
2919          "anyway, use --force.") % (reason, msg))
2920
2921    # Make sure repo is fully packed, just like a fresh clone would be.
2922    # Note that transfer.unpackLimit defaults to 100, meaning that a
2923    # repository with no packs and less than 100 objects should be considered
2924    # fully packed.
2925    output = subproc.check_output('git count-objects -v'.split())
2926    stats = dict(x.split(b': ') for x in output.splitlines())
2927    num_packs = int(stats[b'packs'])
2928    num_loose_objects = int(stats[b'count'])
2929    if num_packs > 1 or \
2930       (num_packs == 1 and num_loose_objects > 0) or \
2931       num_loose_objects >= 100:
2932      abort(_("expected freshly packed repo"))
2933
2934    # Make sure there is precisely one remote, named "origin"...or that this
2935    # is a new bare repo with no packs and no remotes
2936    output = subproc.check_output('git remote'.split()).strip()
2937    if not (output == b"origin" or (num_packs == 0 and not output)):
2938      abort(_("expected one remote, origin"))
2939
2940    # Avoid letting people running with weird setups and overwriting GIT_DIR
2941    # elsewhere
2942    git_dir = GitUtils.determine_git_dir(b'.')
2943    if is_bare and git_dir != b'.':
2944      abort(_("GIT_DIR must be ."))
2945    elif not is_bare and git_dir != b'.git':
2946      abort(_("GIT_DIR must be .git"))
2947
2948    # Make sure that all reflogs have precisely one entry
2949    reflog_dir=os.path.join(git_dir, b'logs')
2950    for root, dirs, files in os.walk(reflog_dir):
2951      for filename in files:
2952        pathname = os.path.join(root, filename)
2953        with open(pathname, 'br') as f:
2954          if len(f.read().splitlines()) > 1:
2955            shortpath = pathname[len(reflog_dir)+1:]
2956            abort(_("expected at most one entry in the reflog for %s") %
2957                  decode(shortpath))
2958
2959    # Make sure there are no stashed changes
2960    if b'refs/stash' in refs:
2961      abort(_("has stashed changes"))
2962
2963    # Do extra checks in non-bare repos
2964    if not is_bare:
2965      # Avoid uncommitted, unstaged, or untracked changes
2966      if subproc.call('git diff --staged --quiet'.split()):
2967        abort(_("you have uncommitted changes"))
2968      if subproc.call('git diff --quiet'.split()):
2969        abort(_("you have unstaged changes"))
2970      if len(subproc.check_output('git ls-files -o'.split())) > 0:
2971        abort(_("you have untracked changes"))
2972
2973      # Avoid unpushed changes
2974      for refname, rev in refs.items():
2975        if not refname.startswith(b'refs/heads/'):
2976          continue
2977        origin_ref = refname.replace(b'refs/heads/', b'refs/remotes/origin/')
2978        if origin_ref not in refs:
2979          abort(_('%s exists, but %s not found') % (decode(refname),
2980                                                    decode(origin_ref)))
2981        if rev != refs[origin_ref]:
2982          abort(_('%s does not match %s') % (decode(refname),
2983                                             decode(origin_ref)))
2984
2985      # Make sure there is only one worktree
2986      output = subproc.check_output('git worktree list'.split())
2987      if len(output.splitlines()) > 1:
2988        abort(_('you have multiple worktrees'))
2989
2990  @staticmethod
2991  def cleanup(repo, repack, reset, run_quietly=False, show_debuginfo=False):
2992    ''' cleanup repo; if repack then expire reflogs and do a gc --prune=now.
2993        if reset then do a reset --hard.  Optionally also curb output if
2994        run_quietly is True, or go the opposite direction and show extra
2995        output if show_debuginfo is True. '''
2996    assert not (run_quietly and show_debuginfo)
2997
2998    if (repack and not run_quietly and not show_debuginfo):
2999      print(_("Repacking your repo and cleaning out old unneeded objects"))
3000    quiet_flags = '--quiet' if run_quietly else ''
3001    cleanup_cmds = []
3002    if repack:
3003      cleanup_cmds = ['git reflog expire --expire=now --all'.split(),
3004                      'git gc {} --prune=now'.format(quiet_flags).split()]
3005    if reset:
3006      cleanup_cmds.insert(0, 'git reset {} --hard'.format(quiet_flags).split())
3007    location_info = ' (in {})'.format(decode(repo)) if repo != b'.' else ''
3008    for cmd in cleanup_cmds:
3009      if show_debuginfo:
3010        print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd)))
3011      subproc.call(cmd, cwd=repo)
3012
3013  def _get_rename(self, old_hash):
3014    # If we already know the rename, just return it
3015    new_hash = self._commit_renames.get(old_hash, None)
3016    if new_hash:
3017      return new_hash
3018
3019    # If it's not in the remaining pending renames, we don't know it
3020    if old_hash is not None and old_hash not in self._pending_renames:
3021      return None
3022
3023    # Read through the pending renames until we find it or we've read them all,
3024    # and return whatever we might find
3025    self._flush_renames(old_hash)
3026    return self._commit_renames.get(old_hash, None)
3027
3028  def _flush_renames(self, old_hash=None, limit=0):
3029    # Parse through self._pending_renames until we have read enough.  We have
3030    # read enough if:
3031    #   self._pending_renames is empty
3032    #   old_hash != None and we found a rename for old_hash
3033    #   limit > 0 and len(self._pending_renames) started less than 2*limit
3034    #   limit > 0 and len(self._pending_renames) < limit
3035    if limit and len(self._pending_renames) < 2 * limit:
3036      return
3037    fi_input, fi_output = self._import_pipes
3038    while self._pending_renames:
3039      orig_id, ignore = self._pending_renames.popitem(last=False)
3040      new_id = fi_output.readline().rstrip()
3041      self._commit_renames[orig_id] = new_id
3042      if old_hash == orig_id:
3043        return
3044      if limit and len(self._pending_renames) < limit:
3045        return
3046
3047  def _translate_commit_hash(self, matchobj_or_oldhash):
3048    old_hash = matchobj_or_oldhash
3049    if not isinstance(matchobj_or_oldhash, bytes):
3050      old_hash = matchobj_or_oldhash.group(1)
3051    orig_len = len(old_hash)
3052    new_hash = self._get_rename(old_hash)
3053    if new_hash is None:
3054      if old_hash[0:7] not in self._commit_short_old_hashes:
3055        self._commits_referenced_but_removed.add(old_hash)
3056        return old_hash
3057      possibilities = self._commit_short_old_hashes[old_hash[0:7]]
3058      matches = [x for x in possibilities
3059                 if x[0:orig_len] == old_hash]
3060      if len(matches) != 1:
3061        self._commits_referenced_but_removed.add(old_hash)
3062        return old_hash
3063      old_hash = matches[0]
3064      new_hash = self._get_rename(old_hash)
3065
3066    assert new_hash is not None
3067    return new_hash[0:orig_len]
3068
3069  def _trim_extra_parents(self, orig_parents, parents):
3070    '''Due to pruning of empty commits, some parents could be non-existent
3071       (None) or otherwise redundant.  Remove the non-existent parents, and
3072       remove redundant parents so long as that doesn't transform a merge
3073       commit into a non-merge commit.
3074
3075       Returns a tuple:
3076         (parents, new_first_parent_if_would_become_non_merge)'''
3077
3078    always_prune = (self._args.prune_degenerate == 'always')
3079
3080    # Pruning of empty commits means multiple things:
3081    #   * An original parent of this commit may have been pruned causing the
3082    #     need to rewrite the reported parent to the nearest ancestor.  We
3083    #     want to know when we're dealing with such a parent.
3084    #   * Further, there may be no "nearest ancestor" if the entire history
3085    #     of that parent was also pruned.  (Detectable by the parent being
3086    #     'None')
3087    # Remove all parents rewritten to None, and keep track of which parents
3088    # were rewritten to an ancestor.
3089    tmp = zip(parents,
3090              orig_parents,
3091              [(x in _SKIPPED_COMMITS or always_prune) for x in orig_parents])
3092    tmp2 = [x for x in tmp if x[0] is not None]
3093    if not tmp2:
3094      # All ancestors have been pruned; we have no parents.
3095      return [], None
3096    parents, orig_parents, is_rewritten = [list(x) for x in zip(*tmp2)]
3097
3098    # We can't have redundant parents if we don't have at least 2 parents
3099    if len(parents) < 2:
3100      return parents, None
3101
3102    # Don't remove redundant parents if user doesn't want us to
3103    if self._args.prune_degenerate == 'never':
3104      return parents, None
3105
3106    # Remove duplicate parents (if both sides of history have lots of commits
3107    # which become empty due to pruning, the most recent ancestor on both
3108    # sides may be the same commit), except only remove parents that have
3109    # been rewritten due to previous empty pruning.
3110    seen = set()
3111    seen_add = seen.add
3112    # Deleting duplicate rewritten parents means keeping parents if either
3113    # they have not been seen or they are ones that have not been rewritten.
3114    parents_copy = parents
3115    uniq = [[p, orig_parents[i], is_rewritten[i]] for i, p in enumerate(parents)
3116            if not (p in seen or seen_add(p)) or not is_rewritten[i]]
3117    parents, orig_parents, is_rewritten = [list(x) for x in zip(*uniq)]
3118    if len(parents) < 2:
3119      return parents_copy, parents[0]
3120
3121    # Flatten unnecessary merges.  (If one side of history is entirely
3122    # empty commits that were pruned, we may end up attempting to
3123    # merge a commit with its ancestor.  Remove parents that are an
3124    # ancestor of another parent.)
3125    num_parents = len(parents)
3126    to_remove = []
3127    for cur in range(num_parents):
3128      if not is_rewritten[cur]:
3129        continue
3130      for other in range(num_parents):
3131        if cur == other:
3132          continue
3133        if not self._graph.is_ancestor(parents[cur], parents[other]):
3134          continue
3135        # parents[cur] is an ancestor of parents[other], so parents[cur]
3136        # seems redundant.  However, if it was intentionally redundant
3137        # (e.g. a no-ff merge) in the original, then we want to keep it.
3138        if not always_prune and \
3139           self._orig_graph.is_ancestor(orig_parents[cur],
3140                                        orig_parents[other]):
3141          continue
3142        # Some folks want their history to have all first parents be merge
3143        # commits (except for any root commits), and always do a merge --no-ff.
3144        # For such folks, don't remove the first parent even if it's an
3145        # ancestor of other commits.
3146        if self._args.no_ff and cur == 0:
3147          continue
3148        # Okay so the cur-th parent is an ancestor of the other-th parent,
3149        # and it wasn't that way in the original repository; mark the
3150        # cur-th parent as removable.
3151        to_remove.append(cur)
3152        break # cur removed, so skip rest of others -- i.e. check cur+=1
3153    for x in reversed(to_remove):
3154      parents.pop(x)
3155    if len(parents) < 2:
3156      return parents_copy, parents[0]
3157
3158    return parents, None
3159
3160  def _prunable(self, commit, new_1st_parent, had_file_changes, orig_parents):
3161    parents = commit.parents
3162
3163    if self._args.prune_empty == 'never':
3164      return False
3165    always_prune = (self._args.prune_empty == 'always')
3166
3167    # For merge commits, unless there are prunable (redundant) parents, we
3168    # do not want to prune
3169    if len(parents) >= 2 and not new_1st_parent:
3170      return False
3171
3172    if len(parents) < 2:
3173      # Special logic for commits that started empty...
3174      if not had_file_changes and not always_prune:
3175        had_parents_pruned = (len(parents) < len(orig_parents) or
3176                              (len(orig_parents) == 1 and
3177                               orig_parents[0] in _SKIPPED_COMMITS))
3178        # If the commit remains empty and had parents which were pruned,
3179        # then prune this commit; otherwise, retain it
3180        return (not commit.file_changes and had_parents_pruned)
3181
3182      # We can only get here if the commit didn't start empty, so if it's
3183      # empty now, it obviously became empty
3184      if not commit.file_changes:
3185        return True
3186
3187    # If there are no parents of this commit and we didn't match the case
3188    # above, then this commit cannot be pruned.  Since we have no parent(s)
3189    # to compare to, abort now to prevent future checks from failing.
3190    if not parents:
3191      return False
3192
3193    # Similarly, we cannot handle the hard cases if we don't have a pipe
3194    # to communicate with fast-import
3195    if not self._import_pipes:
3196      return False
3197
3198    # If there have not been renames/remappings of IDs (due to insertion of
3199    # new blobs), then we can sometimes know things aren't prunable with a
3200    # simple check
3201    if not _IDS.has_renames():
3202      # non-merge commits can only be empty if blob/file-change editing caused
3203      # all file changes in the commit to have the same file contents as
3204      # the parent.
3205      changed_files = set(change.filename for change in commit.file_changes)
3206      if len(orig_parents) < 2 and changed_files - self._files_tweaked:
3207        return False
3208
3209    # Finally, the hard case: due to either blob rewriting, or due to pruning
3210    # of empty commits wiping out the first parent history back to the merge
3211    # base, the list of file_changes we have may not actually differ from our
3212    # (new) first parent's version of the files, i.e. this would actually be
3213    # an empty commit.  Check by comparing the contents of this commit to its
3214    # (remaining) parent.
3215    #
3216    # NOTE on why this works, for the case of original first parent history
3217    # having been pruned away due to being empty:
3218    #     The first parent history having been pruned away due to being
3219    #     empty implies the original first parent would have a tree (after
3220    #     filtering) that matched the merge base's tree.  Since
3221    #     file_changes has the changes needed to go from what would have
3222    #     been the first parent to our new commit, and what would have been
3223    #     our first parent has a tree that matches the merge base, then if
3224    #     the new first parent has a tree matching the versions of files in
3225    #     file_changes, then this new commit is empty and thus prunable.
3226    fi_input, fi_output = self._import_pipes
3227    self._flush_renames()  # Avoid fi_output having other stuff present
3228    # Optimization note: we could have two loops over file_changes, the
3229    # first doing all the self._output.write() calls, and the second doing
3230    # the rest.  But I'm worried about fast-import blocking on fi_output
3231    # buffers filling up so I instead read from it as I go.
3232    for change in commit.file_changes:
3233      parent = new_1st_parent or commit.parents[0] # exists due to above checks
3234      quoted_filename = PathQuoting.enquote(change.filename)
3235      if isinstance(parent, int):
3236        self._output.write(b"ls :%d %s\n" % (parent, quoted_filename))
3237      else:
3238        self._output.write(b"ls %s %s\n" % (parent, quoted_filename))
3239      self._output.flush()
3240      parent_version = fi_output.readline().split()
3241      if change.type == b'D':
3242        if parent_version != [b'missing', quoted_filename]:
3243          return False
3244      else:
3245        blob_sha = change.blob_id
3246        if isinstance(change.blob_id, int):
3247          self._output.write(b"get-mark :%d\n" % change.blob_id)
3248          self._output.flush()
3249          blob_sha = fi_output.readline().rstrip()
3250        if parent_version != [change.mode, b'blob', blob_sha, quoted_filename]:
3251          return False
3252
3253    return True
3254
3255  def _record_remapping(self, commit, orig_parents):
3256    new_id = None
3257    # Record the mapping of old commit hash to new one
3258    if commit.original_id and self._import_pipes:
3259      fi_input, fi_output = self._import_pipes
3260      self._output.write(b"get-mark :%d\n" % commit.id)
3261      self._output.flush()
3262      orig_id = commit.original_id
3263      self._commit_short_old_hashes[orig_id[0:7]].add(orig_id)
3264      # Note that we have queued up an id for later reading; flush a
3265      # few of the older ones if we have too many queued up
3266      self._pending_renames[orig_id] = None
3267      self._flush_renames(None, limit=40)
3268    # Also, record if this was a merge commit that turned into a non-merge
3269    # commit.
3270    if len(orig_parents) >= 2 and len(commit.parents) < 2:
3271      self._commits_no_longer_merges.append((commit.original_id, new_id))
3272
3273  def callback_metadata(self, extra_items = dict()):
3274    return {'commit_rename_func': self._translate_commit_hash,
3275            'ancestry_graph': self._graph,
3276            'original_ancestry_graph': self._orig_graph,
3277            **extra_items}
3278
3279  def _tweak_blob(self, blob):
3280    if self._args.max_blob_size and len(blob.data) > self._args.max_blob_size:
3281      blob.skip()
3282
3283    if blob.original_id in self._args.strip_blobs_with_ids:
3284      blob.skip()
3285
3286    if ( self._args.replace_text
3287        # not (if blob contains zero byte in the first 8Kb, that is, if blob is binary data)
3288        and not b"\0" in blob.data[0:8192]
3289    ):
3290      for literal, replacement in self._args.replace_text['literals']:
3291        blob.data = blob.data.replace(literal, replacement)
3292      for regex,   replacement in self._args.replace_text['regexes']:
3293        blob.data = regex.sub(replacement, blob.data)
3294
3295    if self._blob_callback:
3296      self._blob_callback(blob, self.callback_metadata())
3297
3298  def _filter_files(self, commit):
3299    def filename_matches(path_expression, pathname):
3300      ''' Returns whether path_expression matches pathname or a leading
3301          directory thereof, allowing path_expression to not have a trailing
3302          slash even if it is meant to match a leading directory. '''
3303      if path_expression == b'':
3304        return True
3305      n = len(path_expression)
3306      if (pathname.startswith(path_expression) and
3307          (path_expression[n-1:n] == b'/' or
3308           len(pathname) == n or
3309           pathname[n:n+1] == b'/')):
3310        return True
3311      return False
3312
3313    def newname(path_changes, pathname, use_base_name, filtering_is_inclusive):
3314      ''' Applies filtering and rename changes from path_changes to pathname,
3315          returning any of None (file isn't wanted), original filename (file
3316          is wanted with original name), or new filename. '''
3317      wanted = False
3318      full_pathname = pathname
3319      if use_base_name:
3320        pathname = os.path.basename(pathname)
3321      for (mod_type, match_type, path_exp) in path_changes:
3322        if mod_type == 'filter' and not wanted:
3323          assert match_type in ('match', 'glob', 'regex')
3324          if match_type == 'match' and filename_matches(path_exp, pathname):
3325            wanted = True
3326          if match_type == 'glob' and fnmatch.fnmatch(pathname, path_exp):
3327            wanted = True
3328          if match_type == 'regex' and path_exp.search(pathname):
3329            wanted = True
3330        elif mod_type == 'rename':
3331          match, repl = path_exp
3332          assert match_type in ('match','regex') # glob was translated to regex
3333          if match_type == 'match' and filename_matches(match, full_pathname):
3334            full_pathname = full_pathname.replace(match, repl, 1)
3335          if match_type == 'regex':
3336            full_pathname = match.sub(repl, full_pathname)
3337      return full_pathname if (wanted == filtering_is_inclusive) else None
3338
3339    args = self._args
3340    new_file_changes = {}  # Assumes no renames or copies, otherwise collisions
3341    for change in commit.file_changes:
3342      # NEEDSWORK: _If_ we ever want to pass `--full-tree` to fast-export and
3343      # parse that output, we'll need to modify this block; `--full-tree`
3344      # issues a deleteall directive which has no filename, and thus this
3345      # block would normally strip it.  Of course, FileChange() and
3346      # _parse_optional_filechange() would need updates too.
3347      if change.type == b'DELETEALL':
3348        new_file_changes[b''] = change
3349        continue
3350      if change.filename in self._newnames:
3351        change.filename = self._newnames[change.filename]
3352      else:
3353        original_filename = change.filename
3354        change.filename = newname(args.path_changes, change.filename,
3355                                  args.use_base_name, args.inclusive)
3356        if self._filename_callback:
3357          change.filename = self._filename_callback(change.filename)
3358        self._newnames[original_filename] = change.filename
3359      if not change.filename:
3360        continue # Filtering criteria excluded this file; move on to next one
3361      if change.filename in new_file_changes:
3362        # Getting here means that path renaming is in effect, and caused one
3363        # path to collide with another.  That's usually bad, but can be okay
3364        # under two circumstances:
3365        #   1) Sometimes people have a file named OLDFILE in old revisions of
3366        #      history, and they rename to NEWFILE, and would like to rewrite
3367        #      history so that all revisions refer to it as NEWFILE.  As such,
3368        #      we can allow a collision when (at least) one of the two paths
3369        #      is a deletion.  Note that if OLDFILE and NEWFILE are unrelated
3370        #      this also allows the rewrite to continue, which makes sense
3371        #      since OLDFILE is no longer in the way.
3372        #   2) If OLDFILE and NEWFILE are exactly equal, then writing them
3373        #      both to the same location poses no problem; we only need one
3374        #      file.  (This could come up if someone copied a file in some
3375        #      commit, then later either deleted the file or kept it exactly
3376        #      in sync with the original with any changes, and then decides
3377        #      they want to rewrite history to only have one of the two files)
3378        colliding_change = new_file_changes[change.filename]
3379        if change.type == b'D':
3380          # We can just throw this one away and keep the other
3381          continue
3382        elif change.type == b'M' and (
3383            change.mode == colliding_change.mode and
3384            change.blob_id == colliding_change.blob_id):
3385          # The two are identical, so we can throw this one away and keep other
3386          continue
3387        elif new_file_changes[change.filename].type != b'D':
3388          raise SystemExit(_("File renaming caused colliding pathnames!\n") +
3389                           _("  Commit: {}\n").format(commit.original_id) +
3390                           _("  Filename: {}").format(change.filename))
3391      # Strip files that are too large
3392      if self._args.max_blob_size and \
3393         self._unpacked_size.get(change.blob_id, 0) > self._args.max_blob_size:
3394        continue
3395      if self._args.strip_blobs_with_ids and \
3396         change.blob_id in self._args.strip_blobs_with_ids:
3397        continue
3398      # Otherwise, record the change
3399      new_file_changes[change.filename] = change
3400    commit.file_changes = [v for k,v in sorted(new_file_changes.items())]
3401
3402  def _tweak_commit(self, commit, aux_info):
3403    # Change the commit message according to callback
3404    if not self._args.preserve_commit_hashes:
3405      commit.message = self._hash_re.sub(self._translate_commit_hash,
3406                                         commit.message)
3407    if self._args.replace_message:
3408      for literal, replacement in self._args.replace_message['literals']:
3409        commit.message = commit.message.replace(literal, replacement)
3410      for regex,   replacement in self._args.replace_message['regexes']:
3411        commit.message = regex.sub(replacement, commit.message)
3412    if self._message_callback:
3413      commit.message = self._message_callback(commit.message)
3414
3415    # Change the author & committer according to mailmap rules
3416    args = self._args
3417    if args.mailmap:
3418      commit.author_name, commit.author_email = \
3419          args.mailmap.translate(commit.author_name, commit.author_email)
3420      commit.committer_name, commit.committer_email = \
3421          args.mailmap.translate(commit.committer_name, commit.committer_email)
3422    # Change author & committer according to callbacks
3423    if self._name_callback:
3424      commit.author_name = self._name_callback(commit.author_name)
3425      commit.committer_name = self._name_callback(commit.committer_name)
3426    if self._email_callback:
3427      commit.author_email = self._email_callback(commit.author_email)
3428      commit.committer_email = self._email_callback(commit.committer_email)
3429
3430    # Sometimes the 'branch' given is a tag; if so, rename it as requested so
3431    # we don't get any old tagnames
3432    if self._args.tag_rename:
3433      commit.branch = RepoFilter._do_tag_rename(args.tag_rename, commit.branch)
3434    if self._refname_callback:
3435      commit.branch = self._refname_callback(commit.branch)
3436
3437    # Filter or rename the list of file changes
3438    orig_file_changes = set(commit.file_changes)
3439    self._filter_files(commit)
3440
3441    # Record ancestry graph
3442    parents, orig_parents = commit.parents, aux_info['orig_parents']
3443    if self._args.state_branch:
3444      external_parents = parents
3445    else:
3446      external_parents = [p for p in parents if not isinstance(p, int)]
3447    self._graph.record_external_commits(external_parents)
3448    self._orig_graph.record_external_commits(external_parents)
3449    self._graph.add_commit_and_parents(commit.id, parents)
3450    self._orig_graph.add_commit_and_parents(commit.old_id, orig_parents)
3451
3452    # Prune parents (due to pruning of empty commits) if relevant
3453    old_1st_parent = parents[0] if parents else None
3454    parents, new_1st_parent = self._trim_extra_parents(orig_parents, parents)
3455    commit.parents = parents
3456
3457    # If parents were pruned, then we need our file changes to be relative
3458    # to the new first parent
3459    if parents and old_1st_parent != parents[0]:
3460      commit.file_changes = GitUtils.get_file_changes(self._repo_working_dir,
3461                                                      ID_TO_HASH[parents[0]],
3462                                                      commit.original_id)
3463      orig_file_changes = set(commit.file_changes)
3464      self._filter_files(commit)
3465
3466    # Find out which files were modified by the callbacks.  Such paths could
3467    # lead to subsequent commits being empty (e.g. if removing a line containing
3468    # a password from every version of a file that had the password, and some
3469    # later commit did nothing more than remove that line)
3470    final_file_changes = set(commit.file_changes)
3471    if self._args.replace_text or self._blob_callback:
3472      differences = orig_file_changes.union(final_file_changes)
3473    else:
3474      differences = orig_file_changes.symmetric_difference(final_file_changes)
3475    self._files_tweaked.update(x.filename for x in differences)
3476
3477    # Call the user-defined callback, if any
3478    if self._commit_callback:
3479      self._commit_callback(commit, self.callback_metadata(aux_info))
3480
3481    # Now print the resulting commit, or if prunable skip it
3482    if not commit.dumped:
3483      if not self._prunable(commit, new_1st_parent,
3484                            aux_info['had_file_changes'], orig_parents):
3485        self._insert_into_stream(commit)
3486        self._record_remapping(commit, orig_parents)
3487      else:
3488        rewrite_to = new_1st_parent or commit.first_parent()
3489        commit.skip(new_id = rewrite_to)
3490        if self._args.state_branch:
3491          alias = Alias(commit.old_id or commit.id, rewrite_to or deleted_hash)
3492          self._insert_into_stream(alias)
3493        reset = Reset(commit.branch, rewrite_to or deleted_hash)
3494        self._insert_into_stream(reset)
3495        self._commit_renames[commit.original_id] = None
3496
3497    # Show progress
3498    self._num_commits += 1
3499    if not self._args.quiet:
3500      self._progress_writer.show(self._parsed_message % self._num_commits)
3501
3502  @staticmethod
3503  def _do_tag_rename(rename_pair, tagname):
3504    old, new = rename_pair.split(b':', 1)
3505    old, new = b'refs/tags/'+old, b'refs/tags/'+new
3506    if tagname.startswith(old):
3507      return tagname.replace(old, new, 1)
3508    return tagname
3509
3510  def _tweak_tag(self, tag):
3511    # Tweak the tag message according to callbacks
3512    if self._args.replace_message:
3513      for literal, replacement in self._args.replace_message['literals']:
3514        tag.message = tag.message.replace(literal, replacement)
3515      for regex,   replacement in self._args.replace_message['regexes']:
3516        tag.message = regex.sub(replacement, tag.message)
3517    if self._message_callback:
3518      tag.message = self._message_callback(tag.message)
3519
3520    # Tweak the tag name according to tag-name-related callbacks
3521    tag_prefix = b'refs/tags/'
3522    fullref = tag_prefix+tag.ref
3523    if self._args.tag_rename:
3524      fullref = RepoFilter._do_tag_rename(self._args.tag_rename, fullref)
3525    if self._refname_callback:
3526      fullref = self._refname_callback(fullref)
3527      if not fullref.startswith(tag_prefix):
3528        msg = "Error: fast-import requires tags to be in refs/tags/ namespace."
3529        msg += "\n       {} renamed to {}".format(tag_prefix+tag.ref, fullref)
3530        raise SystemExit(msg)
3531    tag.ref = fullref[len(tag_prefix):]
3532
3533    # Tweak the tagger according to callbacks
3534    if self._args.mailmap:
3535      tag.tagger_name, tag.tagger_email = \
3536          self._args.mailmap.translate(tag.tagger_name, tag.tagger_email)
3537    if self._name_callback:
3538      tag.tagger_name = self._name_callback(tag.tagger_name)
3539    if self._email_callback:
3540      tag.tagger_email = self._email_callback(tag.tagger_email)
3541
3542    # Call general purpose tag callback
3543    if self._tag_callback:
3544      self._tag_callback(tag, self.callback_metadata())
3545
3546  def _tweak_reset(self, reset):
3547    if self._args.tag_rename:
3548      reset.ref = RepoFilter._do_tag_rename(self._args.tag_rename, reset.ref)
3549    if self._refname_callback:
3550      reset.ref = self._refname_callback(reset.ref)
3551    if self._reset_callback:
3552      self._reset_callback(reset, self.callback_metadata())
3553
3554  def results_tmp_dir(self, create_if_missing=True):
3555    target_working_dir = self._args.target or b'.'
3556    git_dir = GitUtils.determine_git_dir(target_working_dir)
3557    d = os.path.join(git_dir, b'filter-repo')
3558    if create_if_missing and not os.path.isdir(d):
3559      os.mkdir(d)
3560    return d
3561
3562  def _load_marks_file(self, marks_basename):
3563    full_branch = 'refs/heads/{}'.format(self._args.state_branch)
3564    marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
3565    working_dir = self._args.target or b'.'
3566    cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
3567    contents = b''
3568    if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
3569      cmd = ['git', '-C', working_dir, 'show',
3570             '%s:%s' % (full_branch, decode(marks_basename))]
3571      try:
3572        contents = subproc.check_output(cmd)
3573      except subprocess.CalledProcessError as e: # pragma: no cover
3574        raise SystemExit(_("Failed loading %s from %s") %
3575                         (decode(marks_basename), full_branch))
3576    if contents:
3577      biggest_id = max(int(x.split()[0][1:]) for x in contents.splitlines())
3578      _IDS._next_id = max(_IDS._next_id, biggest_id+1)
3579    with open(marks_file, 'bw') as f:
3580      f.write(contents)
3581    return marks_file
3582
3583  def _save_marks_files(self):
3584    basenames = [b'source-marks', b'target-marks']
3585    working_dir = self._args.target or b'.'
3586
3587    # Check whether the branch exists
3588    parent = []
3589    full_branch = 'refs/heads/{}'.format(self._args.state_branch)
3590    cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
3591    if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
3592      parent = ['-p', full_branch]
3593
3594    # Run 'git hash-object $MARKS_FILE' for each marks file, save result
3595    blob_hashes = {}
3596    for marks_basename in basenames:
3597      marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
3598      if not os.path.isfile(marks_file): # pragma: no cover
3599        raise SystemExit(_("Failed to find %s to save to %s")
3600                         % (marks_file, self._args.state_branch))
3601      cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file]
3602      blob_hashes[marks_basename] = subproc.check_output(cmd).strip()
3603
3604    # Run 'git mktree' to create a tree out of it
3605    p = subproc.Popen(['git', '-C', working_dir, 'mktree'],
3606                      stdin=subprocess.PIPE, stdout=subprocess.PIPE)
3607    for b in basenames:
3608      p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b))
3609    p.stdin.close()
3610    p.wait()
3611    tree = p.stdout.read().strip()
3612
3613    # Create the new commit
3614    cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files',
3615            tree] + parent)
3616    commit = subproc.check_output(cmd).strip()
3617    subproc.call(['git', '-C', working_dir, 'update-ref', full_branch, commit])
3618
3619  def importer_only(self):
3620    self._run_sanity_checks()
3621    self._setup_output()
3622
3623  def set_output(self, outputRepoFilter):
3624    assert outputRepoFilter._output
3625
3626    # set_output implies this RepoFilter is doing exporting, though may not
3627    # be the only one.
3628    self._setup_input(use_done_feature = False)
3629
3630    # Set our output management up to pipe to outputRepoFilter's locations
3631    self._managed_output = False
3632    self._output = outputRepoFilter._output
3633    self._import_pipes = outputRepoFilter._import_pipes
3634
3635    # Handle sanity checks, though currently none needed for export-only cases
3636    self._run_sanity_checks()
3637
3638  def _setup_input(self, use_done_feature):
3639    if self._args.stdin:
3640      self._input = sys.stdin.detach()
3641      sys.stdin = None # Make sure no one tries to accidentally use it
3642      self._fe_orig = None
3643    else:
3644      skip_blobs = (self._blob_callback is None and
3645                    self._args.replace_text is None and
3646                    self._args.source == self._args.target)
3647      extra_flags = []
3648      if skip_blobs:
3649        extra_flags.append('--no-data')
3650        if self._args.max_blob_size:
3651          self._unpacked_size, packed_size = GitUtils.get_blob_sizes()
3652      if use_done_feature:
3653        extra_flags.append('--use-done-feature')
3654      if write_marks:
3655        extra_flags.append(b'--mark-tags')
3656      if self._args.state_branch:
3657        assert(write_marks)
3658        source_marks_file = self._load_marks_file(b'source-marks')
3659        extra_flags.extend([b'--export-marks='+source_marks_file,
3660                            b'--import-marks='+source_marks_file])
3661      if self._args.preserve_commit_encoding is not None: # pragma: no cover
3662        reencode = 'no' if self._args.preserve_commit_encoding else 'yes'
3663        extra_flags.append('--reencode='+reencode)
3664      location = ['-C', self._args.source] if self._args.source else []
3665      fep_cmd = ['git'] + location + ['fast-export', '--show-original-ids',
3666                 '--signed-tags=strip', '--tag-of-filtered-object=rewrite',
3667                 '--fake-missing-tagger', '--reference-excluded-parents'
3668                 ] + extra_flags + self._args.refs
3669      self._fep = subproc.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE)
3670      self._input = self._fep.stdout
3671      if self._args.dry_run or self._args.debug:
3672        self._fe_orig = os.path.join(self.results_tmp_dir(),
3673                                     b'fast-export.original')
3674        output = open(self._fe_orig, 'bw')
3675        self._input = InputFileBackup(self._input, output)
3676        if self._args.debug:
3677          tmp = [decode(x) if isinstance(x, bytes) else x for x in fep_cmd]
3678          print("[DEBUG] Running: {}".format(' '.join(tmp)))
3679          print("  (saving a copy of the output at {})"
3680                .format(decode(self._fe_orig)))
3681
3682  def _setup_output(self):
3683    if not self._args.dry_run:
3684      location = ['-C', self._args.target] if self._args.target else []
3685      fip_cmd = ['git'] + location + ['-c', 'core.ignorecase=false',
3686                                      'fast-import', '--force', '--quiet']
3687      if date_format_permissive:
3688        fip_cmd.append('--date-format=raw-permissive')
3689      if self._args.state_branch:
3690        target_marks_file = self._load_marks_file(b'target-marks')
3691        fip_cmd.extend([b'--export-marks='+target_marks_file,
3692                        b'--import-marks='+target_marks_file])
3693      self._fip = subproc.Popen(fip_cmd, bufsize=-1,
3694                                stdin=subprocess.PIPE, stdout=subprocess.PIPE)
3695      self._import_pipes = (self._fip.stdin, self._fip.stdout)
3696    if self._args.dry_run or self._args.debug:
3697      self._fe_filt = os.path.join(self.results_tmp_dir(),
3698                                   b'fast-export.filtered')
3699      self._output = open(self._fe_filt, 'bw')
3700    else:
3701      self._output = self._fip.stdin
3702    if self._args.debug and not self._args.dry_run:
3703      self._output = DualFileWriter(self._fip.stdin, self._output)
3704      tmp = [decode(x) if isinstance(x, bytes) else x for x in fip_cmd]
3705      print("[DEBUG] Running: {}".format(' '.join(tmp)))
3706      print("  (using the following file as input: {})"
3707            .format(decode(self._fe_filt)))
3708
3709  def _migrate_origin_to_heads(self):
3710    refs_to_migrate = set(x for x in self._orig_refs
3711                          if x.startswith(b'refs/remotes/origin/'))
3712    if not refs_to_migrate:
3713      return
3714    if self._args.debug:
3715      print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*")
3716    target_working_dir = self._args.target or b'.'
3717    p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
3718                      stdin=subprocess.PIPE, cwd=target_working_dir)
3719    for ref in refs_to_migrate:
3720      if ref == b'refs/remotes/origin/HEAD':
3721        p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
3722        del self._orig_refs[ref]
3723        continue
3724      newref = ref.replace(b'refs/remotes/origin/', b'refs/heads/')
3725      if newref not in self._orig_refs:
3726        p.stdin.write(b'create %s %s\n' % (newref, self._orig_refs[ref]))
3727      p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
3728      self._orig_refs[newref] = self._orig_refs[ref]
3729      del self._orig_refs[ref]
3730    p.stdin.close()
3731    if p.wait():
3732      raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover
3733
3734    # Now remove
3735    if self._args.debug:
3736      print("[DEBUG] Removing 'origin' remote (rewritten history will no ")
3737      print("        longer be related; consider re-pushing it elsewhere.")
3738    subproc.call('git remote rm origin'.split(), cwd=target_working_dir)
3739
3740  def _final_commands(self):
3741    self._finalize_handled = True
3742    self._done_callback and self._done_callback()
3743
3744    if not self._args.quiet:
3745      self._progress_writer.finish()
3746
3747  def _ref_update(self, target_working_dir):
3748    # Start the update-ref process
3749    p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
3750                      stdin=subprocess.PIPE,
3751                      cwd=target_working_dir)
3752
3753    # Remove replace_refs from _orig_refs
3754    replace_refs = {k:v for k, v in self._orig_refs.items()
3755                    if k.startswith(b'refs/replace/')}
3756    reverse_replace_refs = collections.defaultdict(list)
3757    for k,v in replace_refs.items():
3758      reverse_replace_refs[v].append(k)
3759    all(map(self._orig_refs.pop, replace_refs))
3760
3761    # Remove unused refs
3762    exported_refs, imported_refs = self.get_exported_and_imported_refs()
3763    refs_to_nuke = exported_refs - imported_refs
3764    if self._args.partial:
3765      refs_to_nuke = set()
3766    if refs_to_nuke and self._args.debug:
3767      print("[DEBUG] Deleting the following refs:\n  "+
3768            decode(b"\n  ".join(refs_to_nuke)))
3769    p.stdin.write(b''.join([b"delete %s\n" % x
3770                           for x in refs_to_nuke]))
3771
3772    # Delete or update and add replace_refs; note that fast-export automatically
3773    # handles 'update-no-add', we only need to take action for the other four
3774    # choices for replace_refs.
3775    self._flush_renames()
3776    actual_renames = {k:v for k,v in self._commit_renames.items() if k != v}
3777    if self._args.replace_refs in ['delete-no-add', 'delete-and-add']:
3778      # Delete old replace refs, if unwanted
3779      replace_refs_to_nuke = set(replace_refs)
3780      if self._args.replace_refs == 'delete-and-add':
3781        # git-update-ref won't allow us to update a ref twice, so be careful
3782        # to avoid deleting refs we'll later update
3783        replace_refs_to_nuke = replace_refs_to_nuke.difference(
3784                                 [b'refs/replace/'+x for x in actual_renames])
3785      p.stdin.write(b''.join([b"delete %s\n" % x
3786                             for x in replace_refs_to_nuke]))
3787    if self._args.replace_refs in ['delete-and-add', 'update-or-add',
3788                                   'update-and-add']:
3789      # Add new replace refs
3790      update_only = (self._args.replace_refs == 'update-or-add')
3791      p.stdin.write(b''.join([b"update refs/replace/%s %s\n" % (old, new)
3792                              for old,new in actual_renames.items()
3793                              if new and not (update_only and
3794                                              old in reverse_replace_refs)]))
3795
3796    # Complete the update-ref process
3797    p.stdin.close()
3798    if p.wait():
3799      raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover
3800
3801  def _record_metadata(self, metadata_dir, orig_refs):
3802    self._flush_renames()
3803    with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f:
3804      f.write(("%-40s %s\n" % (_("old"), _("new"))).encode())
3805      for (old,new) in self._commit_renames.items():
3806        msg = b'%s %s\n' % (old, new if new != None else deleted_hash)
3807        f.write(msg)
3808
3809    exported_refs, imported_refs = self.get_exported_and_imported_refs()
3810
3811    batch_check_process = None
3812    batch_check_output_re = re.compile(b'^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$')
3813    with open(os.path.join(metadata_dir, b'ref-map'), 'bw') as f:
3814      for refname, old_hash in orig_refs.items():
3815        if refname not in exported_refs:
3816          continue
3817        if refname not in imported_refs:
3818          new_hash = deleted_hash
3819        elif old_hash in self._commit_renames:
3820          new_hash = self._commit_renames[old_hash]
3821          new_hash = new_hash if new_hash != None else deleted_hash
3822        else: # Must be either an annotated tag, or a ref whose tip was pruned
3823          if not batch_check_process:
3824            cmd = 'git cat-file --batch-check'.split()
3825            target_working_dir = self._args.target or b'.'
3826            batch_check_process = subproc.Popen(cmd,
3827                                                stdin=subprocess.PIPE,
3828                                                stdout=subprocess.PIPE,
3829                                                cwd=target_working_dir)
3830          batch_check_process.stdin.write(refname+b"\n")
3831          batch_check_process.stdin.flush()
3832          line = batch_check_process.stdout.readline()
3833          m = batch_check_output_re.match(line)
3834          if m and m.group(2) in (b'tag', b'commit'):
3835            new_hash = m.group(1)
3836          elif line.endswith(b' missing\n'):
3837            new_hash = deleted_hash
3838          else:
3839            raise SystemExit(_("Failed to find new id for %(refname)s "
3840                               "(old id was %(old_hash)s)")
3841                             % ({'refname': refname, 'old_hash': old_hash})
3842                             ) # pragma: no cover
3843        f.write(b'%s %s %s\n' % (old_hash, new_hash, refname))
3844      if self._args.source or self._args.target:
3845        new_refs = GitUtils.get_refs(self._args.target or b'.')
3846        for ref, new_hash in new_refs.items():
3847          if ref not in orig_refs and not ref.startswith(b'refs/replace/'):
3848            old_hash = b'0'*len(new_hash)
3849            f.write(b'%s %s %s\n' % (old_hash, new_hash, ref))
3850    if batch_check_process:
3851      batch_check_process.stdin.close()
3852      batch_check_process.wait()
3853
3854    with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f:
3855      issues_found = False
3856      if self._commits_no_longer_merges:
3857        issues_found = True
3858
3859        f.write(textwrap.dedent(_('''
3860          The following commits used to be merge commits but due to filtering
3861          are now regular commits; they likely have suboptimal commit messages
3862          (e.g. "Merge branch next into master").  Original commit hash on the
3863          left, commit hash after filtering/rewriting on the right:
3864          ''')[1:]).encode())
3865        for oldhash, newhash in self._commits_no_longer_merges:
3866          f.write('  {} {}\n'.format(oldhash, newhash).encode())
3867        f.write(b'\n')
3868
3869      if self._commits_referenced_but_removed:
3870        issues_found = True
3871        f.write(textwrap.dedent(_('''
3872          The following commits were filtered out, but referenced in another
3873          commit message.  The reference to the now-nonexistent commit hash
3874          (or a substring thereof) was left as-is in any commit messages:
3875          ''')[1:]).encode())
3876        for bad_commit_reference in self._commits_referenced_but_removed:
3877          f.write('  {}\n'.format(bad_commit_reference).encode())
3878        f.write(b'\n')
3879
3880      if not issues_found:
3881        f.write(_("No filtering problems encountered.\n").encode())
3882
3883    with open(os.path.join(metadata_dir, b'already_ran'), 'bw') as f:
3884       f.write(_("This file exists to allow you to filter again without --force.\n").encode())
3885
3886  def finish(self):
3887    ''' Alternative to run() when there is no input of our own to parse,
3888        meaning that run only really needs to close the handle to fast-import
3889        and let it finish, thus making a call to "run" feel like a misnomer. '''
3890    assert not self._input
3891    assert self._managed_output
3892    self.run()
3893
3894  def insert(self, obj, direct_insertion = False):
3895    if not direct_insertion:
3896      if type(obj) == Blob:
3897        self._tweak_blob(obj)
3898      elif type(obj) == Commit:
3899        aux_info = {'orig_parents': obj.parents,
3900                    'had_file_changes': bool(obj.file_changes)}
3901        self._tweak_commit(obj, aux_info)
3902      elif type(obj) == Reset:
3903        self._tweak_reset(obj)
3904      elif type(obj) == Tag:
3905        self._tweak_tag(obj)
3906    self._insert_into_stream(obj)
3907
3908  def _insert_into_stream(self, obj):
3909    if not obj.dumped:
3910      if self._parser:
3911        self._parser.insert(obj)
3912      else:
3913        obj.dump(self._output)
3914
3915  def get_exported_and_imported_refs(self):
3916    return self._parser.get_exported_and_imported_refs()
3917
3918  def run(self):
3919    start = time.time()
3920    if not self._input and not self._output:
3921      self._run_sanity_checks()
3922      if not self._args.dry_run and not self._args.partial:
3923        self._migrate_origin_to_heads()
3924      self._setup_input(use_done_feature = True)
3925      self._setup_output()
3926    assert self._sanity_checks_handled
3927
3928    if self._input:
3929      # Create and run the filter
3930      self._repo_working_dir = self._args.source or b'.'
3931      self._parser = FastExportParser(blob_callback   = self._tweak_blob,
3932                                      commit_callback = self._tweak_commit,
3933                                      tag_callback    = self._tweak_tag,
3934                                      reset_callback  = self._tweak_reset,
3935                                      done_callback   = self._final_commands)
3936      self._parser.run(self._input, self._output)
3937      if not self._finalize_handled:
3938        self._final_commands()
3939
3940      # Make sure fast-export completed successfully
3941      if not self._args.stdin and self._fep.wait():
3942        raise SystemExit(_("Error: fast-export failed; see above.")) # pragma: no cover
3943      self._input.close()
3944
3945    # If we're not the manager of self._output, we should avoid post-run cleanup
3946    if not self._managed_output:
3947      return
3948
3949    # Close the output and ensure fast-import successfully completes
3950    self._output.close()
3951    if not self._args.dry_run and self._fip.wait():
3952      raise SystemExit(_("Error: fast-import failed; see above.")) # pragma: no cover
3953
3954    # With fast-export and fast-import complete, update state if requested
3955    if self._args.state_branch:
3956      self._save_marks_files()
3957
3958    # Notify user how long it took, before doing a gc and such
3959    msg = "New history written in {:.2f} seconds..."
3960    if self._args.repack:
3961      msg = "New history written in {:.2f} seconds; now repacking/cleaning..."
3962    print(msg.format(time.time()-start))
3963
3964    # Exit early, if requested
3965    if self._args.dry_run:
3966      print(_("NOTE: Not running fast-import or cleaning up; --dry-run passed."))
3967      if self._fe_orig:
3968        print(_("      Requested filtering can be seen by comparing:"))
3969        print("        " + decode(self._fe_orig))
3970      else:
3971        print(_("      Requested filtering can be seen at:"))
3972      print("        " + decode(self._fe_filt))
3973      return
3974
3975    target_working_dir = self._args.target or b'.'
3976    if self._input:
3977      self._ref_update(target_working_dir)
3978
3979      # Write out data about run
3980      self._record_metadata(self.results_tmp_dir(), self._orig_refs)
3981
3982    # Final cleanup:
3983    #   If we need a repack, then nuke the reflogs and repack.
3984    #   If we need a reset, do a reset --hard
3985    reset = not GitUtils.is_repository_bare(target_working_dir)
3986    RepoFilter.cleanup(target_working_dir, self._args.repack, reset,
3987                       run_quietly=self._args.quiet,
3988                       show_debuginfo=self._args.debug)
3989
3990    # Let user know how long it took
3991    print(_("Completely finished after {:.2f} seconds.")
3992          .format(time.time()-start))
3993
3994def main():
3995  setup_gettext()
3996  args = FilteringOptions.parse_args(sys.argv[1:])
3997  if args.analyze:
3998    RepoAnalyze.run(args)
3999  else:
4000    filter = RepoFilter(args)
4001    filter.run()
4002
4003if __name__ == '__main__':
4004  main()
4005