1from __future__ import absolute_import, division, unicode_literals
2from binascii import hexlify, unhexlify
3try:
4    from itertools import izip as zip
5except ImportError:
6    pass
7import io
8import os
9import shutil
10import subprocess
11import sys
12try:
13    from urllib.parse import quote_from_bytes, unquote_to_bytes
14except ImportError:
15    from urllib import quote as quote_from_bytes
16    from urllib import unquote as unquote_to_bytes
17from collections import (
18    OrderedDict,
19    defaultdict,
20)
21try:
22    from collections.abc import Sequence
23except ImportError:
24    from collections import Sequence
25try:
26    from urllib2 import URLError
27except ImportError:
28    from urllib.error import URLError
29try:
30    from urlparse import urlparse
31except ImportError:
32    from urllib.parse import urlparse
33from .exceptions import (
34    Abort,
35    AmbiguousGraftAbort,
36    NothingToGraftException,
37    OldUpgradeAbort,
38    UpgradeAbort,
39)
40from .util import (
41    HTTPReader,
42    Seekable,
43    byte_diff,
44    check_enabled,
45    one,
46    VersionedDict,
47)
48from .git import (
49    EMPTY_BLOB,
50    EMPTY_TREE,
51    Git,
52    GitProcess,
53    NULL_NODE_ID,
54)
55from .hg.changegroup import (
56    RawRevChunk,
57    RevDiff,
58)
59from .hg.objects import (
60    Authorship,
61    Changeset,
62    File,
63    Manifest,
64)
65from .helper import GitHgHelper
66from .util import progress_iter
67from cinnabar import util
68from cinnabar.util import fsdecode
69
70import logging
71
72
# An empty mercurial file with no parent has a fixed sha1 which is that of
# "\0" * 40 (incidentally, this is the same as for an empty manifest with
# no parent).
HG_EMPTY_FILE = b'b80de5d138758541c5f05265ad144ab9fa86d1db'


# Logger registered under the 'revchunks' name.
revchunk_log = logging.getLogger('revchunks')
80
81
class FileFindParents(object):
    """Helpers picking the parents of a file so its sha1 matches its node."""

    logger = logging.getLogger('generated_file')

    @staticmethod
    def _invalid_if_new(file):
        """Raise when `file` has the null node id as its node."""
        if file.node != NULL_NODE_ID:
            return
        raise Exception('Trying to create an invalid file. '
                        'Please open an issue with details.')

    @staticmethod
    def set_parents(file, parent1=NULL_NODE_ID, parent2=NULL_NODE_ID):
        """Set `file.parents` from the given parents.

        The parents are first normalized; when the file has a non-null node
        that the normalized parents don't reproduce, the parents as given
        and then a few fallback combinations are tried instead.
        """
        # Null nodes don't count as parents.
        given = tuple(p for p in (parent1, parent2) if p != NULL_NODE_ID)
        effective = given

        # On merges, a file with copy metadata has either no parent, or only
        # one. In that latter case, the parent is always set as second parent.
        # On non-merges, a file with copy metadata doesn't have a parent.
        if file.metadata or file.content.startswith(b'\1\n'):
            if len(given) == 2:
                FileFindParents._invalid_if_new(file)
            elif len(given) == 1:
                effective = (NULL_NODE_ID, given[0])
        elif len(given) == 2 and given[0] == given[1]:
            # The same parent given twice is only recorded once.
            effective = given[:1]

        file.parents = effective
        if file.node == NULL_NODE_ID or file.node == file.sha1:
            return

        if effective != given and FileFindParents._try_parents(file, *given):
            FileFindParents.logger.debug(
                'Right parents given for %s, but they don\'t match '
                'what modern mercurial normally would do', file.node)
            return
        FileFindParents._set_parents_fallback(file, parent1, parent2)

    @staticmethod
    def _set_parents_fallback(file, parent1=NULL_NODE_ID,
                              parent2=NULL_NODE_ID):
        """Try alternative parent combinations until the sha1 matches.

        Raises an Exception when none of the combinations works.
        """
        candidates = (
            # In some cases, only one parent is stored in a merge, because
            # the other parent is actually an ancestor of the first one, but
            # checking that is likely more expensive than to check if the
            # sha1 matches with either parent.
            (parent1,),
            (parent2,),
            # Some mercurial versions stores the first parent twice in
            # merges.
            (parent1, parent1),
            # As last resort, try without any parents.
            (),
        )
        found = any(FileFindParents._try_parents(file, *candidate)
                    for candidate in candidates)

        log = FileFindParents.logger
        log.debug('Wrong parents given for %s', file.node)
        log.debug('  Got: %s %s', parent1, parent2)
        if found:
            log.debug('  Expected: %s %s', file.parent1, file.parent2)
        else:
            # If none of the above worked, we failed big time
            raise Exception('Failed to create file. '
                            'Please open an issue with details.')

    @staticmethod
    def _try_parents(file, *parents):
        """Assign `parents` to `file`; tell whether its sha1 now matches."""
        file.parents = parents
        matches = file.node == file.sha1
        return matches
149
150
class ChangesetPatcher(bytes):
    """Serialized set of changes to apply to a changeset.

    The bytes content is a newline-separated list of `key value` items, as
    produced by from_diff() and consumed by apply(). The keys handled by
    apply() are `changeset`, `manifest`, `author`, `extra`, `files` and
    `patch`.
    """
    class ChangesetPatch(RawRevChunk):
        """Chunk-like view of a changeset, associated with a patch.

        Attributes that are not found on the instance are looked up on the
        wrapped changeset, `delta_node` being mapped to its `node`.
        """
        __slots__ = ('patch', '_changeset')

        class Patch(RevDiff):
            """Patch kept as NUL-separated `start,end,quoted_data` records."""
            class Part(object):
                # One record of the patch, filled by Patch.__iter__: `start`
                # and `end` are integers, `text_data` the unquoted data.
                __slots__ = ('start', 'end', 'text_data')

            def __init__(self, buf):
                # `buf` is the serialized form, as returned by to_str().
                self._buf = buf

            def __str__(self):
                raise RuntimeError('Use to_str()')

            def to_str(self):
                """Return the serialized patch."""
                return self._buf

            def __iter__(self):
                """Yield a Part for each non-empty record of the patch."""
                for line in self._buf.split(b'\0'):
                    if line:
                        part = self.Part()
                        start, end, text_data = line.split(b',')
                        part.start = int(start)
                        part.end = int(end)
                        part.text_data = unquote_to_bytes(text_data)
                        yield part

            @classmethod
            def from_items(cls, items):
                """Create a Patch from `(start, end, text_data)` tuples.

                The data is url-quoted, which keeps the `,` and NUL
                separators unambiguous in the serialized form.
                """
                return cls(b'\0'.join(
                    b','.join((b'%d,%d' % (start, end),
                               quote_from_bytes(text_data).encode('ascii')))
                    for start, end, text_data in items))

        def __init__(self, changeset, patch_data):
            self._changeset = changeset
            self.patch = self.Patch(patch_data)

        def __getattr__(self, name):
            # Only called for attributes not found through normal lookup.
            if name == 'delta_node':
                name = 'node'
            return getattr(self._changeset, name)

    def apply(self, changeset):
        """Return a copy of `changeset` with the items of this patcher applied.

        The given changeset is left alone: changes are made to a copy
        created through Changeset.from_chunk.
        """
        # Sneaky way to create a copy of the changeset
        chunk = self.ChangesetPatch(changeset, b'')
        changeset = Changeset.from_chunk(chunk, changeset)

        for k, v in (l.split(b' ', 1) for l in self.splitlines()):
            if k == b'changeset':
                changeset.node = v
            elif k == b'manifest':
                changeset.manifest = v
            elif k == b'author':
                changeset.author = v
            elif k == b'extra':
                extra = changeset.extra
                changeset.extra = v
                # Entries of the previous extra that the patcher doesn't
                # override are kept.
                if extra is not None:
                    changeset.extra.update(
                        (k, v) for k, v in extra.items()
                        if k not in changeset.extra)
            elif k == b'files':
                changeset.files = v.split(b'\0')
            elif k == b'patch':
                chunk = self.ChangesetPatch(changeset, v)
                changeset = Changeset.from_chunk(chunk, changeset)

        # This should not occur in normal changeset bodies. If it occurs,
        # it likely comes from our handling of conflicting commits.
        # So in that case, adjust until we have the right sha1.
        while changeset.body.endswith(b'\0') and \
                changeset.sha1 != changeset.node:
            changeset.body = changeset.body[:-1]

        return changeset

    @classmethod
    def from_diff(cls, changeset1, changeset2):
        """Create the patcher that turns `changeset1` into `changeset2`.

        Differences in node, manifest, author, extra and files are recorded
        as individual items. When applying those to `changeset1` still
        doesn't give the raw data of `changeset2`, a `patch` item with the
        remaining byte-level differences is added.
        """
        items = []
        if changeset1.node != changeset2.node:
            items.append(b'changeset %s' % changeset2.node)
        if changeset1.manifest != changeset2.manifest:
            items.append(b'manifest %s' % changeset2.manifest)
        if changeset1.author != changeset2.author:
            items.append(b'author %s' % changeset2.author)
        if changeset1.extra != changeset2.extra:
            if changeset2.extra is not None:
                # Only record the entries that changeset1 doesn't already
                # have with the same value.
                items.append(b'extra %s' % Changeset.ExtraData({
                    k: v
                    for k, v in changeset2.extra.items()
                    if not changeset1.extra or changeset1.extra.get(k) != v
                }).to_str())
        if changeset1.files != changeset2.files:
            items.append(b'files %s' % b'\0'.join(changeset2.files))

        this = cls(b'\n'.join(items))
        # Check what the items gathered so far give, and cover whatever
        # difference is left with a patch.
        new = this.apply(changeset1)
        if new.raw_data != changeset2.raw_data:
            items.append(b'patch %s' % cls.ChangesetPatch.Patch.from_items(
                byte_diff(new.raw_data, changeset2.raw_data)).to_str())
            this = cls(b'\n'.join(items))

        return this
255
256
class Changeset(Changeset):
    """Changeset extended with a constructor taking a git commit."""

    @classmethod
    def from_git_commit(cls, git_commit):
        """Create a changeset from the author, committer and body of a commit.

        `git_commit` is either a GitCommit or a value GitCommit can be
        created from. The committer is only set when it differs from the
        author.
        """
        commit = (git_commit if isinstance(git_commit, GitCommit)
                  else GitCommit(git_commit))

        result = cls()

        author, timestamp, utcoffset = \
            Authorship.from_git_str(commit.author).to_hg()
        result.author = author
        result.timestamp = timestamp
        result.utcoffset = utcoffset

        if commit.committer != commit.author:
            committer = Authorship.from_git_str(commit.committer)
            result.committer = committer.to_hg_str()

        result.body = commit.body

        return result
275
276
class GeneratedManifestInfo(Manifest):
    """Manifest that also records which of its entries were modified."""

    __slots__ = ('delta_node', 'removed', 'modified')

    def __init__(self, node):
        super(GeneratedManifestInfo, self).__init__(node)
        # path -> (sha1, attr) for the entries added with modified=True.
        self.modified = {}
        # Starts empty; this class doesn't fill it by itself.
        self.removed = set()

    def add(self, path, sha1=None, attr=b'', modified=False):
        """Add an entry, tracking it as modified when `modified` is set."""
        super(GeneratedManifestInfo, self).add(path, sha1, attr)
        if not modified:
            return
        self.modified[path] = (sha1, attr)
289
290
class TagSet(object):
    """Ordered tag -> node mapping remembering the nodes tags used to have."""

    def __init__(self):
        # For each tag, the nodes it pointed to before its current one.
        self._taghist = defaultdict(set)
        # Current node of each tag, in insertion order.
        self._tags = OrderedDict()

    def __setitem__(self, key, value):
        previous = self._tags.get(key)
        if previous:
            self._taghist[key].add(previous)
        self._tags[key] = value

    def __getitem__(self, key):
        return self._tags[key]

    def update(self, other):
        """Merge the tags and tag histories of another TagSet into this one."""
        if not other:
            return
        assert isinstance(other, TagSet)
        for key, theirs in util.iteritems(other._tags):
            # derived from mercurial's _updatetags
            their_hist = other._taghist[key]
            if key in self._tags:
                ours = self._tags[key]
                our_hist = self._taghist[key]
                keep_ours = (
                    ours != theirs and theirs in our_hist and
                    (ours not in their_hist or
                     len(our_hist) > len(their_hist)))
                self._tags[key] = ours if keep_ours else theirs
                self._taghist[key] = their_hist.union(our_hist)
            else:
                self._tags[key] = theirs
                self._taghist[key] = set(their_hist)

    def __iter__(self):
        return util.iteritems(self._tags)

    def hist(self, key):
        """Iterate over the previous nodes of a tag, in sorted order."""
        previous_nodes = sorted(self._taghist[key])
        return iter(previous_nodes)
330
331
class GitCommit(object):
    """Parsed content of a git commit.

    The commit is read through GitHgHelper.cat_file. The `tree`, `author`
    and `committer` attributes hold the raw data of the corresponding header
    lines (and are left unset when the header doesn't have them), `parents`
    is a tuple with the data of each `parent` line, and `body` is everything
    that follows the first empty line.
    """
    __slots__ = ('sha1', 'body', 'parents', 'tree', 'author', 'committer')

    def __init__(self, sha1):
        self.sha1 = sha1
        commit = GitHgHelper.cat_file(b'commit', sha1)
        header, self.body = commit.split(b'\n\n', 1)
        parents = []
        # splitlines() strips line terminators, so there is no need to look
        # for a bare newline marking the end of the header: every line here
        # is a header line.
        for line in header.splitlines():
            typ, data = line.split(b' ', 1)
            typ = typ.decode('ascii')
            if typ == 'parent':
                parents.append(data.strip())
            elif typ in self.__slots__:
                # Each of those headers is only expected once.
                assert not hasattr(self, typ)
                setattr(self, typ, data)
        self.parents = tuple(parents)
351
352
class PseudoGitCommit(GitCommit):
    """GitCommit that only records its sha1.

    Contrary to GitCommit, the constructor doesn't read the commit, so the
    other attributes are left unset.
    """
    def __init__(self, sha1):
        self.sha1 = sha1
356
357
def autohexlify(h):
    """Return the 40-characters hexadecimal form of a node id.

    `h` is returned as is when it is already 40 characters long, and is
    hexlified when it is 20 bytes long.

    Raises AssertionError for any other length. The error is raised
    explicitly rather than through an `assert` statement so that the check
    is still performed when python runs with -O.
    """
    size = len(h)
    if size == 40:
        return h
    if size == 20:
        return hexlify(h)
    raise AssertionError('Unexpected node id length: %d' % size)
364
365
class BranchMap(object):
    """Heads and tips of the branches of a remote mercurial repository."""

    __slots__ = "_heads", "_all_heads", "_tips", "_git_sha1s", "_unknown_heads"

    def __init__(self, store, remote_branchmap, remote_heads):
        """Index the branch map and heads of a remote against `store`.

        Heads the store has no git commit for are recorded as unknown.
        """
        self._all_heads = tuple(autohexlify(h) for h in remote_heads)
        self._heads = {}
        self._tips = {}
        self._git_sha1s = {}
        self._unknown_heads = set()
        for branch, heads in util.iteritems(remote_branchmap):
            # We can't keep track of tips if the list of heads is not sequenced
            ordered = isinstance(heads, Sequence) or len(heads) == 1
            hex_heads = []
            for raw_head in heads:
                hex_head = autohexlify(raw_head)
                hex_heads.append(hex_head)
                git_sha1 = store.changeset_ref(hex_head)
                if git_sha1:
                    assert hex_head not in self._git_sha1s
                    self._git_sha1s[hex_head] = git_sha1
                else:
                    self._unknown_heads.add(hex_head)
            if not branch:
                continue
            # Use last non-closed head as tip if there's more than one head.
            # Caveat: we don't know a head is closed until we've pulled it.
            if heads and ordered:
                for candidate in reversed(hex_heads):
                    self._tips[branch] = candidate
                    closed = (candidate in self._git_sha1s and
                              store.changeset(candidate).close)
                    if not closed:
                        break
            self._heads[branch] = tuple(hex_heads)

    def names(self):
        """Return the names of the branches."""
        return self._heads.keys()

    def heads(self, branch=None):
        """Return the heads of `branch`, or all the heads without a branch."""
        return self._heads.get(branch, ()) if branch else self._all_heads

    def unknown_heads(self):
        """Return the heads that had no git commit in the store."""
        return self._unknown_heads

    def git_sha1(self, head):
        """Return the git commit recorded for `head`, or b'?' if unknown."""
        return self._git_sha1s.get(head, b'?')

    def tip(self, branch):
        """Return the tip chosen for `branch`, or None when there is none."""
        return self._tips.get(branch)
417
418
class Grafter(object):
    """Associate mercurial changesets with pre-existing git commits.

    Candidate commits are read once, at construction time, and indexed by
    tree; graft() then picks the commit to use for each changeset.
    """
    __slots__ = "_store", "_early_history", "_graft_trees", "_grafted"

    def __init__(self, store):
        """Collect the graft candidates for the repository behind `store`."""
        self._store = store
        # Commits that graft() considers part of the "early history".
        self._early_history = set()
        # Candidate commits, grouped by their tree.
        self._graft_trees = defaultdict(list)
        # Whether graft() found a commit to graft at least once.
        self._grafted = False
        # Everything but cinnabar's own refs is a candidate.
        refs = [
            b'--exclude=refs/cinnabar/*',
            b'--exclude=refs/notes/cinnabar',
            b'--exclude=refs/original/*',
            b'--all',
        ]
        if store._has_metadata:
            refs += [b'--not', b'refs/cinnabar/metadata^']
        for node, tree, parents in progress_iter(
                'Reading {} graft candidates',
                GitHgHelper.rev_list(b'--full-history', *refs)):
            self._graft_trees[tree].append(node)

    def _is_cinnabar_commit(self, commit):
        """Tell whether the store has patch-free changeset data for `commit`.

        Returns False when the store has no changeset data for the commit.
        """
        data = self._store.read_changeset_data(commit)
        return b'\npatch' not in data if data else False

    def _graft(self, changeset, parents):
        """Find the candidate commit matching `changeset`.

        `parents` are the git commits of the changeset parents. Returns the
        GitCommit to graft, removing it from the candidates, or None when
        there is no match. Raises AmbiguousGraftAbort when several
        candidates are left after all the heuristics.
        """
        store = self._store
        tree = store.git_tree(changeset.manifest, *changeset.parents[:1])
        do_graft = tree and tree in self._graft_trees
        if not do_graft:
            return None

        # Cache of the GitCommit of the candidates looked at.
        commits = {}

        def graftable(c):
            # A candidate needs the same timestamp as the changeset, and
            # either the same parents (after replacements), or a parent in
            # early history.
            commit = commits.get(c)
            if not commit:
                commit = commits[c] = GitCommit(c)
            if (Authorship.from_git_str(commit.author).timestamp !=
                    int(changeset.timestamp)):
                return False

            if all(store._replace.get(p1, p1) == store._replace.get(p2, p2)
                   for p1, p2 in zip(commit.parents, parents)):
                return True

            # Allow to graft if one of the parents is from early history
            return any(p in self._early_history for p in parents)

        nodes = tuple(c for c in self._graft_trees[tree] if graftable(c))

        if len(nodes) > 1:
            # Ideally, this should all be tried with fuzziness, and
            # independently of the number of nodes we got, but the
            # following is enough to graft github.com/mozilla/gecko-dev
            # to mozilla-central and related repositories.
            # Try with commits with the same subject line
            subject = changeset.body.split(b'\n', 1)[0]
            possible_nodes = tuple(
                n for n in nodes
                if commits[n].body.split(b'\n', 1)[0] == subject
            )
            if len(possible_nodes) > 1:
                # Try with commits with the same author ; this is attempted
                # separately from checking timestamps because author may
                # have been munged.
                possible_nodes = tuple(
                    n for n in possible_nodes
                    if (Authorship.from_git_str(commits[n].author)
                        .to_hg()[0] == changeset.author)
                )
            if len(possible_nodes) == 1:
                nodes = possible_nodes

        # If we still have multiple nodes, check if one of them is one that
        # cinnabar would have created. If it is, we prefer other commits on
        # the premise that it means we've been asked to reclone with a graft
        # on a repo that was already handled by cinnabar.
        if len(nodes) > 1:
            possible_nodes = []
            for node in nodes:
                commit = commits[node]
                cs = Changeset.from_git_commit(commit)
                patcher = ChangesetPatcher.from_diff(cs, changeset)
                # Only keep the commits that need a patch item to give the
                # changeset.
                if b'\npatch' in patcher:
                    possible_nodes.append(node)
            nodes = possible_nodes

        if len(nodes) > 1:
            raise AmbiguousGraftAbort(
                'Cannot graft changeset %s. Candidates: %s'
                % (changeset.node.decode('ascii'),
                   ', '.join(n.decode('ascii') for n in nodes)))

        if nodes:
            node = nodes[0]
            # A commit can only be grafted once.
            self._graft_trees[tree].remove(node)
            return commits[node]
        return None

    def graft(self, changeset):
        """Store `changeset`, grafting it to an existing commit if possible.

        Raises NothingToGraftException when no commit can be grafted and the
        changeset is not considered part of early history.
        """
        # TODO: clarify this function because it's hard to follow.
        store = self._store
        parents = tuple(store.changeset_ref(p) for p in changeset.parents)
        # Don't try to graft when a parent has no git commit.
        if None in parents:
            result = None
        else:
            result = self._graft(changeset, parents)
        # A changeset is in early history when all its parents are, or, for
        # a changeset without parents, when there is nothing to graft it to.
        if parents:
            is_early_history = all(p in self._early_history for p in parents)
        else:
            is_early_history = not result
        if not (is_early_history or result):
            raise NothingToGraftException()
        if is_early_history or not result:
            commit = store.changeset_ref(changeset.node)
        else:
            commit = result
        store.store_changeset(changeset, commit or False)
        commit = store.changeset_ref(changeset.node)
        if is_early_history:
            if result and result.sha1 != commit:
                # Map the graft candidate to the commit the store ended up
                # with for the changeset.
                store._replace[result.sha1] = commit
            else:
                self._early_history.add(commit)
        elif not parents:
            if result:
                commit = result.sha1
            if self._is_cinnabar_commit(commit):
                self._early_history.add(commit)

        if result:
            self._grafted = True

    def close(self):
        """Raise NothingToGraftException if only early history was seen."""
        if not self._grafted and self._early_history:
            raise NothingToGraftException()
556
557
558class GitHgStore(object):
    # Flags expected in the body of the metadata commit: metadata() refuses
    # to use a repository whose flags differ from these.
    FLAGS = [
        b'files-meta',
        b'unified-manifests-v2',
    ]

    # Names that _metadata() associates, in this order, with the parents of
    # the metadata commit.
    METADATA_REFS = (
        b'refs/cinnabar/changesets',
        b'refs/cinnabar/manifests',
        b'refs/cinnabar/hg2git',
        b'refs/notes/cinnabar',
        b'refs/cinnabar/files-meta',
    )
571
572    def _metadata(self):
573        if self._metadata_sha1:
574            metadata = GitCommit(self._metadata_sha1)
575            self._flags = set(metadata.body.split())
576            refs = self.METADATA_REFS
577            if b'files-meta' not in self._flags:
578                refs = list(refs)
579                refs.remove(b'refs/cinnabar/files-meta')
580            return metadata, dict(zip(refs, metadata.parents))
581
582    def metadata(self):
583        metadata = self._metadata()
584        if metadata:
585            if len(self._flags) > len(self.FLAGS):
586                raise UpgradeAbort(
587                    'It looks like this repository was used with a newer '
588                    'version of git-cinnabar. Cannot use this version.')
589            if set(self._flags) != set(self.FLAGS):
590                raise UpgradeAbort()
591        return metadata
592
    def __init__(self):
        """Load the cinnabar state of the repository.

        This reads the cinnabar refs (replace refs, metadata and tag cache),
        then the heads recorded in the metadata.

        Raises OldUpgradeAbort when refs/cinnabar/branches/* refs exist, or
        when there are replace refs but the tree of the metadata commit is
        empty. UpgradeAbort may be raised by metadata().
        """
        self._flags = set()
        self._closed = False
        self._graft = None

        self._hgheads = VersionedDict()
        self._branches = {}

        # NOTE: until it is wrapped in a VersionedDict below, this is the
        # very dict Git holds, which the loop below adds entries to.
        self._replace = Git._replace
        self._tagcache_ref = None
        self._metadata_sha1 = None
        # While doing a for_each_ref, ensure refs/notes/cinnabar is in the
        # cache.
        for sha1, ref in Git.for_each_ref('refs/cinnabar',
                                          'refs/notes/cinnabar'):
            if ref.startswith(b'refs/cinnabar/replace/'):
                # 22 is the length of the b'refs/cinnabar/replace/' prefix.
                self._replace[ref[22:]] = sha1
            elif ref.startswith(b'refs/cinnabar/branches/'):
                raise OldUpgradeAbort()
            elif ref == b'refs/cinnabar/metadata':
                self._metadata_sha1 = sha1
            elif ref == b'refs/cinnabar/tag_cache':
                self._tagcache_ref = sha1
        self._replace = VersionedDict(self._replace)

        self._tagcache = {}
        self._tagfiles = {}
        self._tags = {NULL_NODE_ID: {}}
        self._cached_changeset_ref = {}
        self._tagcache_items = set()
        if self._tagcache_ref:
            # Blobs with the b'x' attribute go to _tagfiles, the other
            # entries to _tagcache.
            for line in Git.ls_tree(self._tagcache_ref):
                mode, typ, sha1, path = line
                if typ == b'blob':
                    if self.ATTR[mode] == b'x':
                        self._tagfiles[path] = sha1
                    else:
                        self._tagcache[path] = sha1
                elif typ == b'commit':
                    assert sha1 == NULL_NODE_ID
                    self._tagcache[path] = sha1
                self._tagcache_items.add(path)

        self.tag_changes = False

        # This needs _metadata_sha1 to be set, and sets _flags.
        metadata = self.metadata()
        if metadata:
            metadata, refs = metadata
        self._has_metadata = bool(metadata)
        self._metadata_refs = refs if metadata else {}
        self._manifest_heads_orig = set()
        self._generation = 0
        if metadata:
            changesets_ref = self._metadata_refs.get(
                b'refs/cinnabar/changesets')
            if changesets_ref:
                # Each line of the body of that commit is a head followed by
                # its branch; the position of the line is kept as the
                # generation of the head.
                commit = GitCommit(changesets_ref)
                for n, head in enumerate(commit.body.splitlines()):
                    hghead, branch = head.split(b' ', 1)
                    self._hgheads._previous[hghead] = (branch, n)
                    self._generation = n + 1

            self._manifest_heads_orig = set(GitHgHelper.heads(b'manifests'))

            # Only used to check that replace refs come with entries in the
            # tree of the metadata commit.
            replace = {}
            for line in Git.ls_tree(metadata.tree):
                mode, typ, sha1, path = line
                replace[path] = sha1

            if self._replace and not replace:
                raise OldUpgradeAbort()

            # Delete old tag-cache, which may contain incomplete data.
            Git.delete_ref(b'refs/cinnabar/tag-cache')
667
    def prepare_graft(self):
        """Create the Grafter for this store and keep it in `self._graft`."""
        self._graft = Grafter(self)
670
671    @staticmethod
672    def _try_merge_branches(repo_url):
673        parsed_url = urlparse(repo_url)
674        branches = []
675        path = parsed_url.path.lstrip(b'/').rstrip(b'/')
676        if path:
677            parts = list(reversed(path.split(b'/')))
678        else:
679            parts = []
680        host = parsed_url.netloc.split(b':', 1)[0]
681        if host:
682            parts.append(host)
683        last_path = b''
684        for part in parts:
685            if last_path:
686                last_path = b'%s/%s' % (part, last_path)
687            else:
688                last_path = part
689            branches.append(last_path)
690        branches.append(b'metadata')
691        return branches
692
693    @staticmethod
694    def _find_branch(branches, remote_refs):
695        for branch in branches:
696            if branch in remote_refs:
697                return branch
698            if b'refs/cinnabar/%s' % branch in remote_refs:
699                return b'refs/cinnabar/%s' % branch
700            if b'refs/heads/%s' % branch in remote_refs:
701                return b'refs/heads/%s' % branch
702
703    def merge(self, git_repo_url, hg_repo_url, branch=None):
704        # Eventually we'll want to handle a full merge, but for now, we only
705        # handle the case where we don't have metadata to begin with.
706        # The caller should avoid calling this function otherwise.
707        assert not self._has_metadata
708        remote_refs = OrderedDict()
709        for line in Git.iter('ls-remote', fsdecode(git_repo_url),
710                             stderr=open(os.devnull, 'wb')):
711            sha1, ref = line.split(None, 1)
712            remote_refs[ref] = sha1
713        bundle = None
714        if not remote_refs and urlparse(git_repo_url).scheme in (b'http',
715                                                                 b'https'):
716            try:
717                bundle = HTTPReader(git_repo_url)
718            except URLError as e:
719                logging.error(e.reason)
720                return False
721            if bundle.fh.headers.get('Content-Encoding', 'identity') == 'gzip':
722                from gzip import GzipFile
723                bundle = Seekable(bundle, bundle.length)
724                bundle = GzipFile(mode='rb', fileobj=bundle)
725            BUNDLE_SIGNATURE = b'# v2 git bundle\n'
726            signature = bundle.read(len(BUNDLE_SIGNATURE))
727            if signature != BUNDLE_SIGNATURE:
728                logging.error('Could not find cinnabar metadata')
729                return False
730            bundle = io.BufferedReader(bundle)
731            while True:
732                line = bundle.readline().rstrip()
733                if not line:
734                    break
735                sha1, ref = line.split(b' ', 1)
736                remote_refs[ref] = sha1
737        if branch:
738            branches = [branch]
739        else:
740            branches = self._try_merge_branches(hg_repo_url)
741
742        ref = self._find_branch(branches, remote_refs)
743        if ref is None:
744            logging.error('Could not find cinnabar metadata')
745            return False
746
747        if bundle:
748            args = ('-v',) if util.progress else ()
749            proc = GitProcess('index-pack', '--stdin', '--fix-thin', *args,
750                              stdin=subprocess.PIPE,
751                              stdout=open(os.devnull, 'wb'))
752            shutil.copyfileobj(bundle, proc.stdin)
753        else:
754            fetch = ['fetch', '--no-tags', '--no-recurse-submodules', '-q']
755            fetch.append('--progress' if util.progress else '--no-progress')
756            fetch.append(fsdecode(git_repo_url))
757            cmd = fetch + [fsdecode(ref) + ':refs/cinnabar/fetch']
758            proc = GitProcess(*cmd, stdout=sys.stdout)
759        if proc.wait():
760            logging.error('Failed to fetch cinnabar metadata.')
761            return False
762
763        # Do some basic validation on the metadata we just got.
764        commit = GitCommit(remote_refs[ref])
765        if b'cinnabar@git' not in commit.author:
766            logging.error('Invalid cinnabar metadata.')
767            return False
768
769        flags = set(commit.body.split())
770        if b'files-meta' not in flags or b'unified-manifests-v2' not in flags \
771                or len(commit.parents) != len(self.METADATA_REFS):
772            logging.error('Invalid cinnabar metadata.')
773            return False
774
775        # At this point, we'll just assume this is good enough.
776
777        # Get replace refs.
778        if commit.tree != EMPTY_TREE:
779            errors = False
780            by_sha1 = {}
781            for k, v in util.iteritems(remote_refs):
782                if v not in by_sha1:
783                    by_sha1[v] = k
784            needed = []
785            for line in Git.ls_tree(commit.tree):
786                mode, typ, sha1, path = line
787                if sha1 in by_sha1:
788                    ref = b'refs/cinnabar/replace/%s' % path
789                    if bundle:
790                        Git.update_ref(ref, sha1)
791                    else:
792                        needed.append(
793                            fsdecode(b':'.join((by_sha1[sha1], ref))))
794                else:
795                    logging.error('Missing commit: %s', sha1)
796                    errors = True
797            if errors:
798                return False
799
800            if not bundle:
801                cmd = fetch + needed
802                proc = GitProcess(*cmd, stdout=sys.stdout)
803                if proc.wait():
804                    logging.error('Failed to fetch cinnabar metadata.')
805                    return False
806
807        Git.update_ref(b'refs/cinnabar/metadata', commit.sha1)
808        self._metadata_sha1 = commit.sha1
809        GitHgHelper.reload()
810        Git.delete_ref(b'refs/cinnabar/fetch')
811
812        # TODO: avoid the duplication of code with __init__
813        metadata = self.metadata()
814
815        if not metadata:
816            # This should never happen, but just in case.
817            logging.warn('Could not find cinnabar metadata')
818            Git.delete_ref(b'refs/cinnabar/metadata')
819            GitHgHelper.reload()
820            return False
821
822        metadata, refs = metadata
823        self._has_metadata = True
824        self._metadata_refs = refs if metadata else {}
825        changesets_ref = self._metadata_refs.get(b'refs/cinnabar/changesets')
826        self._generation = 0
827        if changesets_ref:
828            commit = GitCommit(changesets_ref)
829            for n, head in enumerate(commit.body.splitlines()):
830                hghead, branch = head.split(b' ', 1)
831                self._hgheads._previous[hghead] = (branch, 1)
832                self._generation = n + 1
833
834        self._manifest_heads_orig = set(GitHgHelper.heads(b'manifests'))
835
836        for line in Git.ls_tree(metadata.tree):
837            mode, typ, sha1, path = line
838            self._replace[path] = sha1
839
840        return True
841
842    def tags(self):
843        tags = TagSet()
844        heads = sorted((n, h) for h, (b, n) in util.iteritems(self._hgheads))
845        for _, h in heads:
846            h = self.changeset_ref(h)
847            tags.update(self._get_hgtags(h))
848        for tag, node in tags:
849            if node != NULL_NODE_ID:
850                yield tag, node
851
852    def _get_hgtags(self, head):
853        tags = TagSet()
854        if not self._tagcache.get(head):
855            ls = one(Git.ls_tree(head, b'.hgtags'))
856            if not ls:
857                self._tagcache[head] = NULL_NODE_ID
858                return tags
859            mode, typ, self._tagcache[head], path = ls
860        tagfile = self._tagcache[head]
861        if tagfile not in self._tags:
862            if tagfile in self._tagfiles:
863                data = GitHgHelper.cat_file(b'blob', self._tagfiles[tagfile])
864                for line in data.splitlines():
865                    tag, nodes = line.split(b'\0', 1)
866                    nodes = nodes.split(b' ')
867                    for node in reversed(nodes):
868                        tags[tag] = node
869            else:
870                data = GitHgHelper.cat_file(b'blob', tagfile) or b''
871                for line in data.splitlines():
872                    if not line:
873                        continue
874                    try:
875                        node, tag = line.split(b' ', 1)
876                    except ValueError:
877                        continue
878                    tag = tag.strip()
879                    try:
880                        unhexlify(node)
881                    except TypeError:
882                        continue
883                    if node != NULL_NODE_ID:
884                        node = self.cached_changeset_ref(node)
885                    if node:
886                        tags[tag] = node
887            self._tags[tagfile] = tags
888        return self._tags[tagfile]
889
890    def heads(self, branches={}):
891        if not isinstance(branches, (dict, set)):
892            branches = set(branches)
893        return set(h for h, (b, _) in util.iteritems(self._hgheads)
894                   if not branches or b in branches)
895
896    def _head_branch(self, head):
897        if head in self._hgheads:
898            return self._hgheads[head][0], head
899        if head in self._branches:
900            return self._branches[head], head
901        branch = self.changeset(head).branch or b'default'
902        self._branches[head] = branch
903        return branch, head
904
905    def add_head(self, head, parent1=NULL_NODE_ID, parent2=NULL_NODE_ID):
906        branch, head = self._head_branch(head)
907        for p in (parent1, parent2):
908            if p == NULL_NODE_ID:
909                continue
910            parent_branch, parent_head = self._head_branch(p)
911            if parent_branch == branch:
912                if parent_head in self._hgheads:
913                    assert parent_branch == self._hgheads[parent_head][0]
914                    del self._hgheads[parent_head]
915
916        generation = self._generation
917        self._generation += 1
918        self._hgheads[head] = (branch, generation)
919
920    def read_changeset_data(self, obj):
921        assert obj is not None
922        obj = bytes(obj)
923        data = GitHgHelper.git2hg(obj)
924        if data is None:
925            return None
926        ret = ChangesetPatcher(data)
927        return ret
928
929    def hg_changeset(self, sha1):
930        data = self.read_changeset_data(sha1)
931        if data:
932            assert data.startswith(b'changeset ')
933            return data[10:50]
934        return None
935
936    def hg_manifest(self, sha1):
937        git_commit = GitCommit(sha1)
938        assert len(git_commit.body) == 40
939        return git_commit.body
940
941    def _hg2git(self, sha1):
942        if not self._has_metadata and not GitHgHelper._helper:
943            return None
944        gitsha1 = GitHgHelper.hg2git(sha1)
945        if gitsha1 == NULL_NODE_ID:
946            gitsha1 = None
947        return gitsha1
948
949    def changeset(self, sha1, include_parents=False):
950        gitsha1 = self.changeset_ref(sha1)
951        assert gitsha1
952        return self._changeset(gitsha1, include_parents)
953
954    def _changeset(self, git_commit, include_parents=False):
955        if not isinstance(git_commit, GitCommit):
956            git_commit = GitCommit(git_commit)
957
958        metadata = self.read_changeset_data(git_commit.sha1)
959        if not metadata:
960            return None
961        changeset = Changeset.from_git_commit(git_commit)
962        changeset = metadata.apply(changeset)
963
964        if include_parents:
965            assert len(git_commit.parents) <= 2
966            changeset.parents = tuple(
967                self.hg_changeset(self._replace.get(p, p))
968                for p in git_commit.parents)
969
970        return changeset
971
    # Maps a git file mode (as bytes) to a one-character attribute flag.
    # NOTE(review): presumably the flags are Mercurial manifest flags
    # (none / executable / symlink) -- confirm against the users of ATTR.
    ATTR = {
        b'100644': b'',
        b'100755': b'x',
        b'120000': b'l',
    }
977
978    @staticmethod
979    def manifest_metadata_path(path):
980        return b'_' + path.replace(b'/', b'/_')
981
982    @staticmethod
983    def manifest_path(path):
984        return path[1:].replace(b'/_', b'/')
985
986    def manifest(self, sha1, include_parents=False):
987        manifest = GeneratedManifestInfo(sha1)
988        manifest.raw_data = GitHgHelper.manifest(sha1)
989        if include_parents:
990            git_sha1 = self.manifest_ref(sha1)
991            commit = GitCommit(git_sha1)
992            parents = (self.hg_manifest(p) for p in commit.parents)
993            manifest.parents = tuple(parents)
994        return manifest
995
996    def manifest_ref(self, sha1):
997        return self._hg2git(sha1)
998
999    def changeset_ref(self, sha1):
1000        return self._hg2git(sha1)
1001
1002    def cached_changeset_ref(self, sha1):
1003        try:
1004            return self._cached_changeset_ref[sha1]
1005        except KeyError:
1006            res = self._cached_changeset_ref[sha1] = self.changeset_ref(sha1)
1007            return res
1008
1009    def file_meta(self, sha1):
1010        return GitHgHelper.file_meta(sha1)
1011
1012    def file(self, sha1, file_parents=None):
1013        if sha1 == HG_EMPTY_FILE:
1014            content = b''
1015        else:
1016            content = GitHgHelper.cat_blob(b':h%s' % sha1)
1017
1018        file = File(sha1)
1019        meta = self.file_meta(sha1)
1020        if meta:
1021            file.metadata = meta
1022        file.content = content
1023        if file_parents is not None:
1024            FileFindParents.set_parents(file, *file_parents)
1025        return file
1026
1027    def git_file_ref(self, sha1):
1028        # Because an empty file and an empty manifest, both with no parents,
1029        # have the same sha1, we can't store both in the hg2git tree. So, we
1030        # choose to never store the file version, and make it forcibly resolve
1031        # to the empty blob. Which means we won't be storing an empty blob and
1032        # getting a mark for it, and will attempt to use it directly even if
1033        # it doesn't exist. The FastImport code works around this.
1034        # Theoretically, it is possible to have a non-modified child of the
1035        # empty file, and a non-modified child of the empty manifest, which
1036        # both would also have the same sha1, but, TTBOMK, it is only possible
1037        # to achieve with commands like hg debugparents.
1038        if sha1 == HG_EMPTY_FILE:
1039            return EMPTY_BLOB
1040        return self._hg2git(sha1)
1041
1042    def git_tree(self, manifest_sha1, ref_changeset=None):
1043        if manifest_sha1 == NULL_NODE_ID:
1044            return EMPTY_TREE
1045        return GitHgHelper.create_git_tree(manifest_sha1, ref_changeset)
1046
    def store_changeset(self, instance, commit=None):
        """Record the Mercurial changeset `instance` in the store.

        `commit`, when given, is the existing git commit (a ``GitCommit`` or
        anything ``GitCommit()`` accepts) the changeset maps to. Otherwise a
        new commit is created through the helper on ``refs/cinnabar/tip``,
        unless a graft is in progress, in which case the work is delegated
        to ``self._graft.graft`` and its result is returned.

        In the non-graft paths, the changeset -> commit mapping and the
        changeset metadata are stored through the helper, and the branch
        and head bookkeeping is updated.
        """
        if commit and not isinstance(commit, GitCommit):
            commit = GitCommit(commit)
        if commit is None and self._graft:
            return self._graft.graft(instance)

        if not commit:
            author = Authorship.from_hg(instance.author, instance.timestamp,
                                        instance.utcoffset)
            extra = instance.extra
            if extra and extra.get(b'committer'):
                committer = extra[b'committer']
                # A value ending in '>' is parsed like an hg author string,
                # reusing the changeset's own timestamp and utcoffset.
                if committer[-1:] == b'>':
                    committer = Authorship.from_hg(
                        committer, instance.timestamp, instance.utcoffset)
                else:
                    committer = Authorship.from_hg_str(
                        committer, maybe_git_utcoffset=True)
                    # NOTE(review): the `extra` computed below is not read
                    # again in this method -- confirm whether this branch is
                    # still needed.
                    if committer.to_hg() == committer:
                        extra = dict(instance.extra)
                        del extra[b'committer']
                        if not extra:
                            extra = None
            else:
                committer = author

            # Parents are referenced by their hg node, via ":h<node>".
            parents = tuple(b':h%s' % p for p in instance.parents)

            body = instance.body

            # There are cases where two changesets would map to the same
            # git commit because their differences are not in information
            # stored in the git commit (different manifest node, but
            # identical tree ; different branches ; etc.)
            # In that case, add invisible characters to the commit
            # message until we find a commit that doesn't map to another
            # changeset.
            # NOTE(review): nothing in this method appends such characters;
            # presumably this is handled elsewhere (e.g. by the helper, given
            # the pseudo_mark) -- confirm.
            committer = committer.to_git_str()
            author = author.to_git_str()
            with GitHgHelper.commit(
                ref=b'refs/cinnabar/tip',
                message=body,
                committer=committer,
                author=author,
                parents=parents,
                pseudo_mark=b':h%s' % instance.node,
            ) as c:
                # Set the whole tree of the commit to the tree generated
                # from the manifest; the first parent changeset (if any) is
                # passed to git_tree as its reference changeset.
                c.filemodify(b'', self.git_tree(instance.manifest,
                                                *instance.parents[:1]),
                             typ=b'tree')

            # Stand-in for the commit just created, referenced as ':1'.
            commit = PseudoGitCommit(b':1')
            commit.author = author
            commit.committer = committer
            commit.body = body

        GitHgHelper.set(b'changeset', instance.node, commit.sha1)
        # Store, as a blob, the difference between what can be rebuilt from
        # the git commit alone and the actual changeset.
        changeset = Changeset.from_git_commit(commit)
        GitHgHelper.put_blob(
            ChangesetPatcher.from_diff(changeset, instance), want_sha1=False)
        GitHgHelper.set(b'changeset-metadata', instance.node, b':1')

        self._branches[instance.node] = instance.branch or b'default'
        self.add_head(instance.node, instance.parent1, instance.parent2)
1111
    # Maps a manifest entry attribute flag to the mode given to
    # filemodify() by store_manifest() for manifest metadata entries.
    # These are 16xxxx modes, unlike the 10xxxx/120000 file modes in ATTR.
    MODE = {
        b'': b'160644',
        b'l': b'160000',
        b'x': b'160755',
    }
1117
1118    def store_manifest(self, instance):
1119        if getattr(instance, 'delta_node', NULL_NODE_ID) != NULL_NODE_ID:
1120            previous = b':h%s' % instance.delta_node
1121        else:
1122            previous = None
1123        parents = tuple(b':h%s' % p for p in instance.parents)
1124        with GitHgHelper.commit(
1125            ref=b'refs/cinnabar/manifests',
1126            from_commit=previous,
1127            parents=parents,
1128            message=instance.node,
1129            pseudo_mark=b':h%s' % instance.node,
1130        ) as commit:
1131            if hasattr(instance, 'delta_node'):
1132                for name in instance.removed:
1133                    commit.filedelete(self.manifest_metadata_path(name))
1134                modified = instance.modified.items()
1135            else:
1136                # slow
1137                modified = ((line.path, (line.sha1, line.attr))
1138                            for line in instance)
1139            for name, (node, attr) in modified:
1140                node = bytes(node)
1141                commit.filemodify(self.manifest_metadata_path(name), node,
1142                                  self.MODE[attr])
1143
1144        GitHgHelper.set(b'manifest', instance.node, b':1')
1145
1146        if check_enabled('manifests'):
1147            if not GitHgHelper.check_manifest(instance.node):
1148                raise Exception(
1149                    'sha1 mismatch for node %s with parents %s %s and '
1150                    'previous %s' %
1151                    (instance.node.decode('ascii'),
1152                     instance.parent1.decode('ascii'),
1153                     instance.parent2.decode('ascii'),
1154                     instance.delta_node.decode('ascii'))
1155                )
1156
    def close(self, refresh=()):
        """Finalize the store and shut the helper down.

        Writes, through the helper, whatever metadata commits are needed
        (hg2git, git2hg notes, changeset heads, manifest heads, files-meta,
        the top-level metadata commit and the tag cache), mirrors changes
        of ``self._replace`` to ``refs/cinnabar/replace/*``, then closes
        the helper and checks the files recorded in
        ``.hg.repo.stored_files``.

        Calling it again after a first call does nothing.

        `refresh` is a collection of ref names; when it contains
        ``b'refs/cinnabar/changesets'`` or ``b'refs/cinnabar/manifests'``,
        the corresponding commit is recreated even without changes.

        Raises ``Abort`` when the helper reports a broken file in the final
        check.
        """
        if self._closed:
            return
        if self._graft:
            self._graft.close()
        self._closed = True
        # If the helper is not running, we don't have anything to update.
        if not GitHgHelper._helper:
            return
        # Ref name -> sha1 of the commit created for that ref by this call.
        update_metadata = {}
        # NOTE(review): a NULL_NODE_ID result from store() presumably means
        # there is nothing to store for that kind of metadata -- confirm.
        tree = GitHgHelper.store(b'metadata', b'hg2git')
        if tree != NULL_NODE_ID:
            hg2git = self._metadata_refs.get(b'refs/cinnabar/hg2git')
            with GitHgHelper.commit(
                ref=b'refs/cinnabar/hg2git',
            ) as commit:
                commit.write(b'M 040000 %s \n' % tree)
            # Only record the ref when it differs from the known one.
            if commit.sha1 != hg2git:
                update_metadata[b'refs/cinnabar/hg2git'] = commit.sha1

        tree = GitHgHelper.store(b'metadata', b'git2hg')
        if tree != NULL_NODE_ID:
            notes = self._metadata_refs.get(b'refs/notes/cinnabar')
            with GitHgHelper.commit(
                ref=b'refs/notes/cinnabar',
            ) as commit:
                commit.write(b'M 040000 %s \n' % tree)
            if commit.sha1 != notes:
                update_metadata[b'refs/notes/cinnabar'] = commit.sha1

        # hg heads and, in the same order, the git commits they map to.
        hg_changeset_heads = list(self._hgheads)
        changeset_heads = list(self.changeset_ref(h)
                               for h in hg_changeset_heads)
        if (any(self._hgheads.iterchanges()) or
                b'refs/cinnabar/changesets' in refresh):
            # Sorted by (generation, branch, hg head, git head).
            heads = sorted((self._hgheads[h][1], self._hgheads[h][0], h, g)
                           for h, g in zip(hg_changeset_heads,
                                           changeset_heads))
            # The commit has the git heads as parents and one
            # "<hg head> <branch>" line per head as its message.
            with GitHgHelper.commit(
                ref=b'refs/cinnabar/changesets',
                parents=list(h for _, __, ___, h in heads),
                message=b'\n'.join(b'%s %s' % (h, b) for _, b, h, __ in heads),
            ) as commit:
                pass
            update_metadata[b'refs/cinnabar/changesets'] = commit.sha1

        # Only membership tests are needed from here on.
        changeset_heads = set(changeset_heads)

        manifest_heads = GitHgHelper.heads(b'manifests')
        # Recreate the manifests commit when the heads changed, when the
        # changesets commit was (re)created while there are no manifest
        # heads, or when explicitly requested.
        if (set(manifest_heads) != self._manifest_heads_orig or
                (b'refs/cinnabar/changesets' in update_metadata and
                 not manifest_heads) or b'refs/cinnabar/manifests' in refresh):
            with GitHgHelper.commit(
                ref=b'refs/cinnabar/manifests',
                parents=sorted(manifest_heads),
            ) as commit:
                pass
            update_metadata[b'refs/cinnabar/manifests'] = commit.sha1

        tree = GitHgHelper.store(b'metadata', b'files-meta')
        files_meta_ref = self._metadata_refs.get(b'refs/cinnabar/files-meta')
        # A files-meta commit (possibly with an empty tree) is created when
        # other metadata changed and there is either a tree to store or no
        # files-meta ref yet.
        if update_metadata and (tree != NULL_NODE_ID or not files_meta_ref):
            with GitHgHelper.commit(
                ref=b'refs/cinnabar/files-meta',
            ) as commit:
                if tree != NULL_NODE_ID:
                    commit.write(b'M 040000 %s \n' % tree)
            if commit.sha1 != files_meta_ref:
                update_metadata[b'refs/cinnabar/files-meta'] = commit.sha1

        # Mirror the changes of the replace map to refs/cinnabar/replace/*.
        replace_changed = False
        for status, ref, sha1 in self._replace.iterchanges():
            if status == VersionedDict.REMOVED:
                Git.delete_ref(b'refs/cinnabar/replace/%s' % ref)
            else:
                Git.update_ref(b'refs/cinnabar/replace/%s' % ref, sha1)
            replace_changed = True

        if update_metadata or replace_changed:
            # One parent per METADATA_REFS entry: the commit created above
            # if any, the previously known one otherwise.
            parents = list(update_metadata.get(r) or self._metadata_refs[r]
                           for r in self.METADATA_REFS)
            # The previous metadata commit, when there is one, is added as
            # an extra parent.
            metadata_sha1 = (Git.config('cinnabar.previous-metadata') or
                             self._metadata_sha1)
            if metadata_sha1:
                parents.append(metadata_sha1)
            # The message lists the flags; the tree holds the replace map.
            with GitHgHelper.commit(
                ref=b'refs/cinnabar/metadata',
                parents=parents,
                message=b' '.join(sorted(self.FLAGS)),
            ) as commit:
                for sha1, target in util.iteritems(self._replace):
                    commit.filemodify(sha1, target, b'commit')

        # Tag cache update. Entries for commits that are not heads anymore
        # are flagged with False (only values change, so iterating over the
        # mapping at the same time is fine).
        for c in self._tagcache:
            if c not in changeset_heads:
                self._tagcache[c] = False

        # _get_hgtags is called for its side effect of filling the tag
        # caches; its return value is not used.
        for c in changeset_heads:
            if c not in self._tagcache:
                tags = self._get_hgtags(c)

        files = set(util.itervalues(self._tagcache))
        deleted = set()
        # Entry name -> (sha1 or mark, type) to write to the tag cache.
        created = {}
        # NOTE(review): _tagcache_items presumably holds the entry names of
        # the existing tag cache tree -- confirm where it is initialized.
        for f in self._tagcache_items:
            if (f not in self._tagcache and f not in self._tagfiles or
                    f not in files and f in self._tagfiles):
                deleted.add(f)

        # Serialize a tag set, one "<tag>\0<value> <history...>\n" line per
        # tag.
        def tagset_lines(tags):
            for tag, value in tags:
                yield b'%s\0%s %s\n' % (tag, value,
                                        b' '.join(tags.hist(tag)))

        # Store the parsed form of every tag file that doesn't have one yet.
        for f, tags in util.iteritems(self._tags):
            if f not in self._tagfiles and f != NULL_NODE_ID:
                data = b''.join(tagset_lines(tags))
                mark = GitHgHelper.put_blob(data=data)
                created[f] = (mark, b'exec')

        # Note this is decided before head entries are considered below.
        if created or deleted:
            self.tag_changes = True

        for c, f in util.iteritems(self._tagcache):
            if (f and c not in self._tagcache_items):
                if f == NULL_NODE_ID:
                    created[c] = (f, b'commit')
                else:
                    created[c] = (f, b'regular')
            elif f is False and c in self._tagcache_items:
                deleted.add(c)

        if created or deleted:
            with GitHgHelper.commit(
                ref=b'refs/cinnabar/tag_cache',
                from_commit=self._tagcache_ref,
            ) as commit:
                for f in deleted:
                    commit.filedelete(f)

                for f, (filesha1, typ) in util.iteritems(created):
                    commit.filemodify(f, filesha1, typ)

        # refs/notes/cinnabar is kept for convenience
        for ref in update_metadata:
            if ref not in (b'refs/notes/cinnabar',):
                Git.delete_ref(ref)

        GitHgHelper.close(rollback=False)

        # Try to detect issue #207 as early as possible.
        # NOTE(review): _helper is reset right after close() and before
        # check_file() is called; presumably this lets the helper be started
        # again on demand -- confirm.
        GitHgHelper._helper = False
        busted = False
        # NOTE(review): imported here rather than at module level,
        # presumably to avoid a circular import -- confirm.
        from .hg.repo import getbundle_params, stored_files
        for (node, (parent1, parent2)) in progress_iter(
                "Checking {} imported file root and head revisions",
                util.iteritems(stored_files)):
            if not GitHgHelper.check_file(node, parent1, parent2):
                busted = True
                logging.error("Error in file %s" % node)
        if busted:
            import json
            extra = ""
            if getbundle_params:
                extra = \
                    "If it failed, please also copy/paste the following:\n"
                extra += json.dumps(getbundle_params, sort_keys=True, indent=4)
            raise Abort(
                "It seems you have hit a known, rare, and difficult to "
                "reproduce issue.\n"
                "Your help would be appreciated.\n"
                "Please try either `git cinnabar rollback` followed by the "
                "same command that just\n"
                "failed, or `git cinnabar reclone`.\n"
                "Please open a new issue "
                "(https://github.com/glandium/git-cinnabar/issues/new)\n"
                "mentioning issue #207 and reporting whether the second "
                "attempt succeeded.\n" + extra + "\n"
                "Please keep a copy of this repository."
            )
1337