1# Copyright (C) 2008-2018 Jelmer Vernooij <jelmer@jelmer.uk>
2# Copyright (C) 2007 Canonical Ltd
3# Copyright (C) 2008 John Carr
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation; either version 2 of the License, or
8# (at your option) any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with this program; if not, write to the Free Software
17# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
19"""Converters, etc for going between Bazaar and Git ids."""
20
21import base64
22import stat
23
24from .. import (
25    bencode,
26    errors,
27    foreign,
28    trace,
29    urlutils,
30    )
31from ..foreign import (
32    ForeignVcs,
33    VcsMappingRegistry,
34    ForeignRevision,
35    )
36from ..revision import (
37    NULL_REVISION,
38    Revision,
39    )
40from .errors import (
41    NoPushSupport,
42    )
43from .hg import (
44    format_hg_metadata,
45    extract_hg_metadata,
46    )
47from .roundtrip import (
48    extract_bzr_metadata,
49    inject_bzr_metadata,
50    CommitSupplement,
51    )
52
53
# Regular-file mode with 0o644 (rw-r--r--) permission bits.
DEFAULT_FILE_MODE = 0o644 | stat.S_IFREG

# Names of the extra commit headers that carry Mercurial metadata.
HG_RENAME_SOURCE = b"HG:rename-source"
HG_EXTRA = b"HG:extra"

# This HG extra is used to indicate the commit that this commit was based on.
HG_EXTRA_AMEND_SOURCE = b"amend_source"

# Prefix put in front of every file id generated from a git path.
FILE_ID_PREFIX = b'git:'

# File id used for the tree root (the empty path). Always the same.
ROOT_ID = b"TREE_ROOT"
65
66
class UnknownCommitExtra(errors.BzrError):
    """Raised when a commit carries extra fields this module cannot map.

    :ivar object: The object the unknown fields were found on.
    :ivar fields: The unknown field names, joined with commas.
    """

    _fmt = "Unknown extra fields in %(object)r: %(fields)r."

    def __init__(self, object, fields):
        """Record the offending object and its unknown field names.

        :param object: Object (commit) that carries the fields.
        :param fields: Iterable of text field names.
        """
        super(UnknownCommitExtra, self).__init__()
        self.object = object
        joined_names = ",".join(fields)
        self.fields = joined_names
74
75
class UnknownMercurialCommitExtra(errors.BzrError):
    """Raised when a commit carries unrecognised Mercurial extra fields.

    :ivar object: The object the unknown fields were found on.
    :ivar fields: The unknown field names (bytestrings), joined with commas.
    """

    _fmt = "Unknown mercurial extra fields in %(object)r: %(fields)r."

    def __init__(self, object, fields):
        """Record the offending object and its unknown field names.

        :param object: Object (commit) that carries the fields.
        :param fields: Iterable of bytestring field names.
        """
        super(UnknownMercurialCommitExtra, self).__init__()
        self.object = object
        joined_names = b",".join(fields)
        self.fields = joined_names
83
84
def escape_file_id(file_id):
    """Escape underscores, spaces and form feeds in a file id bytestring.

    ``_`` becomes ``__``, a space becomes ``_s`` and ``\\x0c`` becomes
    ``_c``. The underscore is handled first so that the underscores
    introduced by the other two escapes are not escaped again.
    """
    substitutions = (
        (b'_', b'__'),
        (b' ', b'_s'),
        (b'\x0c', b'_c'),
    )
    escaped = file_id
    for plain, replacement in substitutions:
        escaped = escaped.replace(plain, replacement)
    return escaped
90
91
def unescape_file_id(file_id):
    """Reverse escape_file_id().

    :param file_id: Escaped file id bytestring.
    :return: The unescaped bytestring.
    :raises ValueError: If an underscore is followed by anything other than
        ``_``, ``s`` or ``c`` (including the end of the input).
    """
    escapes = ((b'_', b'_'), (b's', b' '), (b'c', b'\x0c'))
    decoded = bytearray()
    pos = 0
    total = len(file_id)
    while pos < total:
        current = file_id[pos:pos + 1]
        if current != b'_':
            # Ordinary byte: copy it through unchanged.
            decoded += current
            pos += 1
            continue
        code = file_id[pos + 1:pos + 2]
        for marker, plain in escapes:
            if code == marker:
                decoded += plain
                break
        else:
            raise ValueError("unknown escape character %s" % code)
        # Skip both the underscore and the escape code.
        pos += 2
    return bytes(decoded)
111
112
def fix_person_identifier(text):
    """Normalise a person identifier to the ``name <email>`` form.

    :param text: Identifier as a bytestring.
    :return: ``b"name <email>"``. Text without any angle brackets is used
        as both name and email; text with ``<`` but no ``>`` just gets the
        closing bracket appended.
    :raises ValueError: If the last ``>`` comes before the last ``<``, or
        there is a ``>`` but no ``<``.
    """
    if b">" not in text:
        if b"<" in text:
            return text + b">"
        name = address = text
    else:
        last_close = text.rindex(b">")
        last_open = text.rindex(b"<")
        if last_close < last_open:
            raise ValueError(text)
        name, address = text.split(b"<", 2)[-2:]
        address = address.split(b">", 1)[0]
        # Drop the single space that normally separates name and "<".
        if name.endswith(b" "):
            name = name[:-1]
    return b"%s <%s>" % (name, address)
127
128
def decode_git_path(path):
    """Take a git path and decode it.

    Git paths are plain bytestrings. Valid UTF-8 decodes normally; bytes
    that are not valid UTF-8 are mapped to lone surrogates using the
    ``surrogateescape`` error handler, so that encode_git_path() can
    restore the original bytes.

    :param path: Path as a bytestring.
    :return: The path as a text string.
    """
    # For valid UTF-8 the error handler is never invoked, so this single
    # call gives the same result as a strict decode.
    return path.decode('utf-8', 'surrogateescape')
137
138
def encode_git_path(path):
    """Take a regular path and encode it for git.

    The path is encoded as UTF-8. Lone surrogates produced by the
    ``surrogateescape`` error handler (see decode_git_path()) are turned
    back into the raw bytes they stand for.

    :param path: Path as a text string.
    :return: The path as a bytestring.
    :raises UnicodeEncodeError: If the path contains surrogates that
        ``surrogateescape`` cannot map back to a byte.
    """
    # For text without surrogates the error handler is never invoked, so
    # this single call gives the same result as a strict encode.
    return path.encode('utf-8', 'surrogateescape')
147
148
def warn_escaped(commit, num_escaped):
    """Warn that XML-invalid characters in a commit had to be escaped.

    :param commit: The commit concerned (formatted with ``%s``).
    :param num_escaped: Number of characters that were escaped.
    """
    template = ("Escaped %d XML-invalid characters in %s. Will be unable "
                "to regenerate the SHA map.")
    trace.warning(template, num_escaped, commit)
152
153
def warn_unusual_mode(commit, path, mode):
    """Note in the debug log that a file has an unusual mode.

    :param commit: The commit concerned (formatted with ``%s``).
    :param path: Path of the file (formatted with ``%s``).
    :param mode: The file mode, logged in octal.
    """
    template = ("Unusual file mode %o for %s in %s. Storing as revision "
                "property. ")
    trace.mutter(template, mode, path, commit)
157
158
class BzrGitMapping(foreign.VcsMapping):
    """Class that maps between Git and Bazaar semantics."""
    experimental = False

    # Name of a placeholder file that is_special_file() reports as special;
    # None means the mapping has no such file.
    BZR_DUMMY_FILE = None  # type: Optional[str]

    def is_special_file(self, filename):
        """Return True if *filename* is this mapping's dummy file."""
        return (filename in (self.BZR_DUMMY_FILE, ))

    def __init__(self):
        super(BzrGitMapping, self).__init__(foreign_vcs_git)

    def __eq__(self, other):
        """Mappings are equal when they share type and revid prefix."""
        return (type(self) == type(other)
                and self.revid_prefix == other.revid_prefix)

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable; hash on
        # exactly the values __eq__ compares.
        return hash((type(self), self.revid_prefix))

    @classmethod
    def revision_id_foreign_to_bzr(cls, git_rev_id):
        """Convert a git revision id handle to a Bazaar revision id."""
        from dulwich.protocol import ZERO_SHA
        if git_rev_id == ZERO_SHA:
            return NULL_REVISION
        return b"%s:%s" % (cls.revid_prefix, git_rev_id)

    @classmethod
    def revision_id_bzr_to_foreign(cls, bzr_rev_id):
        """Convert a Bazaar revision id to a git revision id handle.

        :return: Tuple of (git revision id, new mapping instance).
        :raises errors.InvalidRevisionId: If the revision id does not start
            with this mapping's prefix.
        """
        if not bzr_rev_id.startswith(b"%s:" % cls.revid_prefix):
            raise errors.InvalidRevisionId(bzr_rev_id, cls)
        return bzr_rev_id[len(cls.revid_prefix) + 1:], cls()

    def generate_file_id(self, path):
        """Generate the file id for a path.

        :param path: Path as a bytestring or text string.
        :return: ROOT_ID for the empty path, otherwise the escaped path
            prefixed with FILE_ID_PREFIX.
        """
        # Git paths are just bytestrings
        # We must just hope they are valid UTF-8..
        if isinstance(path, str):
            # parse_file_id() decodes through decode_git_path(), whose
            # fallback maps undecodable bytes to lone surrogates; map them
            # back here so that such paths round-trip instead of raising
            # UnicodeEncodeError.
            path = path.encode("utf-8", "surrogateescape")
        if path == b"":
            return ROOT_ID
        return FILE_ID_PREFIX + escape_file_id(path)

    def parse_file_id(self, file_id):
        """Turn a file id made by generate_file_id() back into a path.

        :return: The path as a text string (empty for the root).
        :raises ValueError: If the file id does not carry FILE_ID_PREFIX or
            contains an invalid escape sequence.
        """
        if file_id == ROOT_ID:
            return u""
        if not file_id.startswith(FILE_ID_PREFIX):
            raise ValueError("not a git file id: %r" % (file_id,))
        return decode_git_path(unescape_file_id(file_id[len(FILE_ID_PREFIX):]))

    def import_unusual_file_modes(self, rev, unusual_file_modes):
        """Store unusual file modes on the revision as a property.

        :param rev: Revision whose ``properties`` are updated.
        :param unusual_file_modes: Dictionary mapping paths to modes;
            nothing is stored when it is empty.
        """
        if unusual_file_modes:
            ret = [(path, unusual_file_modes[path])
                   for path in sorted(unusual_file_modes)]
            # NOTE(review): this stores the bencode output as-is, whereas
            # export_unusual_file_modes() calls .encode() on the property;
            # confirm which type the property is expected to hold.
            rev.properties[u'file-modes'] = bencode.bencode(ret)

    def export_unusual_file_modes(self, rev):
        """Read back the modes stored by import_unusual_file_modes().

        :return: Dictionary built from the decoded (path, mode) pairs; empty
            if the revision has no ``file-modes`` property.
        """
        try:
            file_modes = rev.properties[u'file-modes']
        except KeyError:
            return {}
        else:
            return dict(bencode.bdecode(file_modes.encode("utf-8")))

    def _generate_git_svn_metadata(self, rev, encoding):
        """Return the git-svn-id trailer for a revision, as bytes.

        The result is appended to an already encoded commit message, so it
        has to be a bytestring.

        :return: ``b""`` if the revision has no ``git-svn-id`` property.
        """
        try:
            git_svn_id = rev.properties[u"git-svn-id"]
        except KeyError:
            return b""
        else:
            return b"\ngit-svn-id: %s\n" % git_svn_id.encode(encoding)

    def _generate_hg_message_tail(self, rev):
        """Format the Mercurial metadata stored in revision properties.

        :return: The bytestring produced by format_hg_metadata().
        :raises TypeError: If format_hg_metadata() returns something other
            than bytes.
        """
        extra_prefix = u'hg:extra:'
        extra = {}
        renames = []
        branch = 'default'
        for name, value in rev.properties.items():
            if name == u'hg:extra:branch':
                branch = value
            elif name.startswith(extra_prefix):
                # Test the same prefix that is stripped, so that a property
                # merely starting with "hg:extra" is not exported under a
                # mangled key.
                extra[name[len(extra_prefix):]] = base64.b64decode(value)
            elif name == u'hg:renames':
                renames = bencode.bdecode(base64.b64decode(value))
            # TODO: Export other properties as 'bzr:' extras?
        ret = format_hg_metadata(renames, branch, extra)
        if not isinstance(ret, bytes):
            raise TypeError(ret)
        return ret

    def _extract_git_svn_metadata(self, rev, message):
        """Strip a trailing git-svn-id line and record it on the revision.

        Works on both bytestrings and text strings; the return value has
        the same type as *message*.

        :param rev: Revision whose ``git-svn-id`` property is set.
        :param message: Commit message.
        :return: The message without the git-svn-id trailer, or the message
            unchanged if there is no such trailer.
        :raises ValueError: If the id is rejected by parse_git_svn_id().
        """
        is_bytes = isinstance(message, bytes)
        if is_bytes:
            newline, marker, separator = b"\n", b"git-svn-id:", b": "
        else:
            newline, marker, separator = "\n", "git-svn-id:", ": "
        lines = message.split(newline)
        if not (len(lines) >= 2 and not lines[-1]
                and lines[-2].startswith(marker)):
            return message
        _, found, git_svn_id = lines[-2].partition(separator)
        if not found:
            # "git-svn-id:" without the ": " separator carries no value.
            return message
        if is_bytes:
            # The property is encoded again on export, so it is kept as
            # text; surrogateescape keeps undecodable bytes recoverable.
            git_svn_id = git_svn_id.decode('utf-8', 'surrogateescape')
        rev.properties[u'git-svn-id'] = git_svn_id
        # The parsed parts are not used yet; the call is kept so that a
        # malformed id still raises ValueError.
        # FIXME: Convert this to converted-from property somehow..
        parse_git_svn_id(git_svn_id)
        return newline.join(lines[:-2])

    def _extract_hg_metadata(self, rev, message):
        """Strip Mercurial metadata from a message into revision properties.

        :return: The message as returned by extract_hg_metadata().
        """
        (message, renames, branch, extra) = extract_hg_metadata(message)
        if branch is not None:
            rev.properties[u'hg:extra:branch'] = branch
        # NOTE(review): assumes the extra names can be concatenated to a
        # text prefix; verify against extract_hg_metadata().
        for name, value in extra.items():
            rev.properties[u'hg:extra:' + name] = base64.b64encode(value)
        if renames:
            rev.properties[u'hg:renames'] = base64.b64encode(bencode.bencode(
                [(new, old) for (old, new) in renames.items()]))
        return message

    def _extract_bzr_metadata(self, rev, message):
        """Split a message into (message, bzr metadata)."""
        (message, metadata) = extract_bzr_metadata(message)
        return message, metadata

    def _decode_commit_message(self, rev, message, encoding):
        """Decode a commit message; returns (text, empty CommitSupplement)."""
        return message.decode(encoding), CommitSupplement()

    def _encode_commit_message(self, rev, message, encoding):
        """Encode a revision message to bytes for use in a git commit."""
        return message.encode(encoding)

    def export_commit(self, rev, tree_sha, parent_lookup, lossy,
                      verifiers):
        """Turn a Bazaar revision in to a Git commit

        :param rev: Bazaar revision to export
        :param tree_sha: Tree sha for the commit
        :param parent_lookup: Function for looking up the GIT sha equiv of a
            bzr revision; may raise KeyError for unknown revisions
        :param lossy: If true, no roundtripping information is stored.
        :param verifiers: Verifiers info
        :return: dulwich.objects.Commit representing the revision
        :raises NoPushSupport: If roundtripping metadata is needed but this
            mapping does not support roundtripping.
        """
        from dulwich.objects import Commit, Tag
        commit = Commit()
        commit.tree = tree_sha
        if not lossy:
            metadata = CommitSupplement()
            metadata.verifiers = verifiers
        else:
            metadata = None
        parents = []
        for p in rev.parent_ids:
            try:
                git_p = parent_lookup(p)
            except KeyError:
                git_p = None
                # A parent without a git equivalent cannot be expressed in
                # the commit itself; keep the full list in the metadata.
                if metadata is not None:
                    metadata.explicit_parent_ids = rev.parent_ids
            if git_p is not None:
                if len(git_p) != 40:
                    raise AssertionError(
                        "unexpected length for %r" % (git_p,))
                parents.append(git_p)
        commit.parents = parents
        explicit_encoding = rev.properties.get(u'git-explicit-encoding')
        if explicit_encoding is not None:
            commit.encoding = explicit_encoding.encode('ascii')
        if explicit_encoding is None or explicit_encoding == u'false':
            # import_commit() does not treat an "encoding false" header as
            # a codec name; it falls back to utf-8/latin1 and records a
            # non-default choice as git-implicit-encoding. Mirror that here.
            encoding = rev.properties.get(u'git-implicit-encoding', 'utf-8')
        else:
            encoding = explicit_encoding
        commit.committer = fix_person_identifier(rev.committer.encode(
            encoding))
        commit.author = fix_person_identifier(
            rev.get_apparent_authors()[0].encode(encoding))
        commit.commit_time = int(rev.timestamp)
        if u'author-timestamp' in rev.properties:
            commit.author_time = int(rev.properties[u'author-timestamp'])
        else:
            commit.author_time = commit.commit_time
        commit._commit_timezone_neg_utc = (
            u"commit-timezone-neg-utc" in rev.properties)
        commit.commit_timezone = rev.timezone
        commit._author_timezone_neg_utc = (
            u"author-timezone-neg-utc" in rev.properties)
        if u'author-timezone' in rev.properties:
            commit.author_timezone = int(rev.properties[u'author-timezone'])
        else:
            commit.author_timezone = commit.commit_timezone
        if u'git-gpg-signature' in rev.properties:
            commit.gpgsig = rev.properties[u'git-gpg-signature'].encode(
                'utf-8', 'surrogateescape')
        commit.message = self._encode_commit_message(rev, rev.message,
                                                     encoding)
        if not isinstance(commit.message, bytes):
            raise TypeError(commit.message)
        if metadata is not None:
            try:
                mapping_registry.parse_revision_id(rev.revision_id)
            except errors.InvalidRevisionId:
                # Not a git-derived revision id, so it has to be recorded.
                metadata.revision_id = rev.revision_id
            # Properties that are represented in the commit object itself.
            mapping_properties = {
                u'author', u'author-timezone', u'author-timezone-neg-utc',
                u'commit-timezone-neg-utc', u'git-implicit-encoding',
                u'git-gpg-signature', u'git-explicit-encoding',
                u'author-timestamp', u'file-modes'}
            for k, v in rev.properties.items():
                if k not in mapping_properties:
                    metadata.properties[k] = v
        if not lossy and metadata:
            if self.roundtripping:
                commit.message = inject_bzr_metadata(commit.message, metadata,
                                                     encoding)
            else:
                raise NoPushSupport(
                    None, None, self, revision_id=rev.revision_id)
        if not isinstance(commit.message, bytes):
            raise TypeError(commit.message)
        # Merge tags are stored as consecutively numbered properties.
        i = 0
        propname = u'git-mergetag-0'
        while propname in rev.properties:
            commit.mergetag.append(Tag.from_string(rev.properties[propname]))
            i += 1
            propname = u'git-mergetag-%d' % i
        if u'git-extra' in rev.properties:
            commit.extra.extend(
                [line.split(b' ', 1)
                 for line in rev.properties[u'git-extra'].splitlines()])
        return commit

    def get_revision_id(self, commit):
        """Determine the Bazaar revision id for a git commit.

        :return: The revision id stored in the commit's metadata if the
            message can be decoded and carries one, otherwise the id derived
            from the commit sha.
        """
        # "encoding false" is not a codec name; import_commit() handles it
        # like a missing header, so do the same here instead of failing
        # with LookupError.
        if commit.encoding and commit.encoding != b'false':
            encoding = commit.encoding.decode('ascii')
        else:
            encoding = 'utf-8'
        try:
            _message, metadata = self._decode_commit_message(
                None, commit.message, encoding)
        except UnicodeDecodeError:
            # Undecodable message: fall back to the sha-derived id below.
            pass
        else:
            if metadata.revision_id:
                return metadata.revision_id
        return self.revision_id_foreign_to_bzr(commit.id)

    def import_commit(self, commit, lookup_parent_revid, strict=True):
        """Convert a git commit to a bzr revision.

        :param commit: The git commit object
        :param lookup_parent_revid: Function mapping a parent sha to a
            revision id; may raise KeyError for unknown parents
        :param strict: If true, raise on commit extra fields that are not
            understood
        :return: Tuple of the revision (a ForeignRevision), the
            roundtripped revision id (or None) and the verifiers
        :raises UnknownCommitExtra: In strict mode, on unknown extra fields
        :raises UnknownMercurialCommitExtra: In strict mode, on unknown
            Mercurial extras
        """
        if commit is None:
            raise AssertionError("Commit object can't be None")
        rev = ForeignRevision(commit.id, self,
                              self.revision_id_foreign_to_bzr(commit.id))
        rev.git_metadata = None

        def decode_using_encoding(rev, commit, encoding):
            rev.committer = commit.committer.decode(encoding)
            if commit.committer != commit.author:
                rev.properties[u'author'] = commit.author.decode(encoding)
            rev.message, rev.git_metadata = self._decode_commit_message(
                rev, commit.message, encoding)

        if commit.encoding is not None:
            rev.properties[u'git-explicit-encoding'] = commit.encoding.decode(
                'ascii')
        if commit.encoding is not None and commit.encoding != b'false':
            decode_using_encoding(rev, commit, commit.encoding.decode('ascii'))
        else:
            # No usable encoding header: try the candidates in order and
            # remember which one worked if it was not the default.
            for encoding in ('utf-8', 'latin1'):
                try:
                    decode_using_encoding(rev, commit, encoding)
                except UnicodeDecodeError:
                    pass
                else:
                    if encoding != 'utf-8':
                        rev.properties[u'git-implicit-encoding'] = encoding
                    break
        if commit.commit_time != commit.author_time:
            rev.properties[u'author-timestamp'] = str(commit.author_time)
        if commit.commit_timezone != commit.author_timezone:
            rev.properties[u'author-timezone'] = "%d" % commit.author_timezone
        if commit._author_timezone_neg_utc:
            rev.properties[u'author-timezone-neg-utc'] = ""
        if commit._commit_timezone_neg_utc:
            rev.properties[u'commit-timezone-neg-utc'] = ""
        if commit.gpgsig:
            rev.properties[u'git-gpg-signature'] = commit.gpgsig.decode(
                'utf-8', 'surrogateescape')
        if commit.mergetag:
            for i, tag in enumerate(commit.mergetag):
                rev.properties[u'git-mergetag-%d' % i] = tag.as_raw_string()
        rev.timestamp = commit.commit_time
        rev.timezone = commit.commit_timezone
        rev.parent_ids = None
        if rev.git_metadata is not None:
            md = rev.git_metadata
            roundtrip_revid = md.revision_id
            if md.explicit_parent_ids:
                rev.parent_ids = md.explicit_parent_ids
            rev.properties.update(md.properties)
            verifiers = md.verifiers
        else:
            roundtrip_revid = None
            verifiers = {}
        if rev.parent_ids is None:
            parents = []
            for p in commit.parents:
                try:
                    parents.append(lookup_parent_revid(p))
                except KeyError:
                    parents.append(self.revision_id_foreign_to_bzr(p))
            rev.parent_ids = parents
        unknown_extra_fields = []
        extra_lines = []
        for k, v in commit.extra:
            if k == HG_RENAME_SOURCE:
                extra_lines.append(k + b' ' + v + b'\n')
            elif k == HG_EXTRA:
                # Only the key in front of the first colon matters here. A
                # value without any colon is treated as a key of its own,
                # so it is reported like any other unknown extra instead of
                # failing with an unpacking error.
                hgk = v.split(b':', 1)[0]
                if hgk not in (HG_EXTRA_AMEND_SOURCE, ) and strict:
                    raise UnknownMercurialCommitExtra(commit, [hgk])
                extra_lines.append(k + b' ' + v + b'\n')
            else:
                unknown_extra_fields.append(k)
        if unknown_extra_fields and strict:
            raise UnknownCommitExtra(
                commit,
                [f.decode('ascii', 'replace') for f in unknown_extra_fields])
        if extra_lines:
            rev.properties[u'git-extra'] = b''.join(extra_lines)
        return rev, roundtrip_revid, verifiers
484
485
class BzrGitMappingv1(BzrGitMapping):
    """The git mapping whose revision ids carry the ``git-v1`` prefix."""
    revid_prefix = b'git-v1'
    experimental = False

    def __str__(self):
        # revid_prefix is a bytestring, but __str__ has to return text;
        # returning the bytes would make str(mapping) raise TypeError.
        return self.revid_prefix.decode('utf-8')
492
493
class BzrGitMappingExperimental(BzrGitMappingv1):
    """Experimental mapping that also handles hg, git-svn and bzr metadata
    embedded in commit messages.
    """
    revid_prefix = b'git-experimental'
    experimental = True
    roundtripping = False

    BZR_DUMMY_FILE = '.bzrdummy'

    def _decode_commit_message(self, rev, message, encoding):
        """Strip embedded metadata from a message, then decode it.

        :param rev: Revision that receives the extracted properties; a
            scratch Revision is used when None.
        :param message: Raw commit message.
        :param encoding: Encoding used to decode what is left of the
            message.
        :return: Tuple of (decoded message, bzr metadata).
        """
        if rev is None:
            rev = Revision()
        message = self._extract_hg_metadata(rev, message)
        message = self._extract_git_svn_metadata(rev, message)
        message, metadata = self._extract_bzr_metadata(rev, message)
        return message.decode(encoding), metadata

    def _encode_commit_message(self, rev, message, encoding):
        """Encode a message and append the hg and git-svn metadata."""
        ret = message.encode(encoding)
        ret += self._generate_hg_message_tail(rev)
        ret += self._generate_git_svn_metadata(rev, encoding)
        return ret

    def import_commit(self, commit, lookup_parent_revid, strict=True):
        """Import a commit and record which git commit it came from.

        Same arguments and return value as the inherited import_commit();
        additionally sets the ``converted_revision`` property.
        """
        rev, roundtrip_revid, verifiers = super(
            BzrGitMappingExperimental, self).import_commit(
                commit, lookup_parent_revid, strict)
        # commit.id is a bytestring; decode it so the property holds the
        # plain hex sha instead of the repr of a bytes object.
        rev.properties[u'converted_revision'] = (
            "git %s\n" % commit.id.decode('ascii'))
        return rev, roundtrip_revid, verifiers
521
522
class GitMappingRegistry(VcsMappingRegistry):
    """Registry with available git mappings."""

    def revision_id_bzr_to_foreign(self, bzr_revid):
        """Split a Bazaar revision id into git sha and mapping.

        :param bzr_revid: Bazaar revision id (bytestring).
        :return: Whatever the registered mapping's
            revision_id_bzr_to_foreign() returns, or ``(ZERO_SHA, None)``
            for the null revision.
        :raises errors.InvalidRevisionId: If the revision id does not have
            the ``git-<mapping>:<sha>`` form.
        """
        if bzr_revid == NULL_REVISION:
            from dulwich.protocol import ZERO_SHA
            return ZERO_SHA, None
        if not bzr_revid.startswith(b"git-"):
            raise errors.InvalidRevisionId(bzr_revid, None)
        mapping_version, separator, _git_sha = bzr_revid.partition(b":")
        if not separator:
            # Callers only expect InvalidRevisionId for ids that are not
            # ours; a missing colon must not surface as an unpacking error.
            raise errors.InvalidRevisionId(bzr_revid, None)
        mapping = self.get(mapping_version)
        return mapping.revision_id_bzr_to_foreign(bzr_revid)

    parse_revision_id = revision_id_bzr_to_foreign
537
538
# Registry of the git mappings defined in this module, keyed by their
# revision id prefix.
mapping_registry = GitMappingRegistry()
mapping_registry.register_lazy(b'git-v1', __name__, "BzrGitMappingv1")
mapping_registry.register_lazy(
    b'git-experimental', __name__, "BzrGitMappingExperimental")
# To enable the experimental bzr-git mappings, use the commented-out line
# below in place of the set_default() call that follows it.
# This will make sure all bzr metadata is pushed into git, allowing for
# full roundtripping later.
# NOTE: THIS IS EXPERIMENTAL. IT MAY EAT YOUR DATA OR CORRUPT
# YOUR BZR OR GIT REPOSITORIES. USE WITH CARE.
# mapping_registry.set_default(b'git-experimental')
mapping_registry.set_default(b'git-v1')
551
552
class ForeignGit(ForeignVcs):
    """The Git Stupid Content Tracker"""

    def __init__(self):
        super().__init__(mapping_registry)
        self.abbreviation = "git"

    @property
    def branch_format(self):
        """A new LocalGitBranchFormat instance."""
        # Imported lazily, only when the property is read.
        from .branch import LocalGitBranchFormat
        return LocalGitBranchFormat()

    @property
    def repository_format(self):
        """A new GitRepositoryFormat instance."""
        # Imported lazily, only when the property is read.
        from .repository import GitRepositoryFormat
        return GitRepositoryFormat()

    @classmethod
    def serialize_foreign_revid(cls, foreign_revid):
        """Return the foreign revision id unchanged."""
        return foreign_revid

    @classmethod
    def show_foreign_revid(cls, foreign_revid):
        """Return a dictionary describing the revision id for display."""
        sha_text = foreign_revid.decode('utf-8')
        return {"git commit": sha_text}
577
578
# Module-level singletons: the ForeignVcs object that BzrGitMapping passes
# to its base class, and an instance of the registry's default mapping.
foreign_vcs_git = ForeignGit()
default_mapping = mapping_registry.get_default()()
581
582
def symlink_to_blob(symlink_target):
    """Create a dulwich Blob whose data is a symlink target.

    :param symlink_target: Target as bytes, or as text that is encoded
        with encode_git_path().
    :return: A new Blob holding the target.
    """
    from dulwich.objects import Blob
    result = Blob()
    target_bytes = symlink_target
    if isinstance(target_bytes, str):
        target_bytes = encode_git_path(target_bytes)
    result.data = target_bytes
    return result
590
591
def mode_is_executable(mode):
    """Check if mode should be considered executable.

    :param mode: Unix file mode.
    :return: True if any of the execute bits (0o111) is set.
    """
    execute_bits = mode & 0o111
    return execute_bits != 0
595
596
def mode_kind(mode):
    """Determine the Bazaar inventory kind based on Unix file mode.

    :param mode: Unix file mode, or None.
    :return: One of 'directory', 'file', 'symlink' or 'tree-reference';
        None if mode is None.
    :raises AssertionError: If the type bits of the mode are not recognised.
    """
    if mode is None:
        return None
    # Integer division keeps the values integral; true division would turn
    # them into floats.
    entry_kind = (mode & 0o700000) // 0o100000
    if entry_kind == 0:
        return 'directory'
    if entry_kind != 1:
        raise AssertionError(
            "Unknown kind, perms=%r." % (mode,))
    file_kind = (mode & 0o70000) // 0o10000
    if file_kind == 0:
        return 'file'
    if file_kind == 2:
        return 'symlink'
    if file_kind == 6:
        return 'tree-reference'
    raise AssertionError(
        "Unknown file kind %d, perms=%o." % (file_kind, mode,))
618
619
def object_mode(kind, executable):
    """Determine the git file mode for an inventory kind.

    :param kind: 'directory', 'symlink', 'file' or 'tree-reference'.
    :param executable: Whether the execute bits should be set; only used
        for files and symlinks.
    :return: The mode as an integer.
    :raises AssertionError: For any other kind.
    """
    if kind == 'directory':
        return stat.S_IFDIR
    if kind == 'tree-reference':
        from dulwich.objects import S_IFGITLINK
        return S_IFGITLINK
    if kind == 'symlink':
        base_mode = stat.S_IFLNK
    elif kind == 'file':
        base_mode = stat.S_IFREG | 0o644
    else:
        raise AssertionError
    return base_mode | 0o111 if executable else base_mode
638
639
def entry_mode(entry):
    """Determine the git file mode for an inventory entry.

    Entries without an ``executable`` attribute count as not executable.
    """
    kind = entry.kind
    is_executable = getattr(entry, 'executable', False)
    return object_mode(kind, is_executable)
643
644
def extract_unusual_modes(rev):
    """Return the unusual file modes recorded on a revision.

    :param rev: Revision to inspect.
    :return: The result of the mapping's export_unusual_file_modes(), or an
        empty dictionary if the revision id is not a git revision id.
    """
    try:
        _foreign_revid, mapping = mapping_registry.parse_revision_id(
            rev.revision_id)
    except errors.InvalidRevisionId:
        return {}
    return mapping.export_unusual_file_modes(rev)
653
654
def parse_git_svn_id(text):
    """Split a git-svn-id value into its parts.

    :param text: Text of the form ``<url>@<revision> <uuid>``.
    :return: Tuple of (url, revision number as int, uuid).
    :raises ValueError: If the text does not have that form.
    """
    location, repository_uuid = text.rsplit(" ", 1)
    url, revision_text = location.rsplit("@", 1)
    revision_number = int(revision_text)
    return (url, revision_number, repository_uuid)
659
660
def needs_roundtripping(repo, revid):
    """Tell whether a revision id is one the git mappings cannot parse.

    :param repo: Unused.
    :param revid: Bazaar revision id.
    :return: True if the mapping registry rejects the revision id as
        invalid, False if it can be parsed.
    """
    try:
        mapping_registry.parse_revision_id(revid)
    except errors.InvalidRevisionId:
        return True
    return False
668