1#!/usr/bin/env python3
2
3# This Source Code Form is subject to the terms of the Mozilla Public
4# License, v. 2.0. If a copy of the MPL was not distributed with this
5# file, You can obtain one at http://mozilla.org/MPL/2.0/.
6
7import os
8import re
9import subprocess
10import sys
11
12import pygit2
13import hglib
14
15DEBUG = False
16
17
18def eprint(*args, **kwargs):
19    print(*args, file=sys.stderr, **kwargs)
20
21
22def debugprint(*args, **kwargs):
23    if DEBUG:
24        eprint(*args, **kwargs)
25
26
27class HgCommit:
28    def __init__(self, parent1, parent2):
29        self.parents = []
30        if parent1 == NULL_PARENT_REV:
31            raise Exception(
32                "Encountered a hg changeset with no parents! We don't handle this...."
33            )
34        self.parents.append(parent1)
35        if parent2 != NULL_PARENT_REV:
36            self.parents.append(parent2)
37        self.touches_sync_code = False
38        self.children = []
39
40    def add_child(self, rev):
41        self.children.append(rev)
42
43
44class GitCommit:
45    def __init__(self, hg_rev, commit_obj):
46        self.hg_rev = hg_rev
47        self.commit_obj = commit_obj
48
49
50def load_git_repository():
51    commit_map = dict()
52    # First, scan the tags for "mozilla-xxx" that keep track of manually synchronized changes
53    sync_tags = filter(
54        lambda ref: ref.startswith("refs/tags/mozilla-"),
55        list(downstream_git_repo.references),
56    )
57    for desc in sync_tags:
58        commit = downstream_git_repo.lookup_reference(desc).peel()
59        # cut out the revision hash from the output
60        hg_rev = desc[18:]
61        commit_map[hg_rev] = GitCommit(hg_rev, commit)
62        debugprint("Loaded pre-existing tag hg %s -> git %s" % (hg_rev, commit.oid))
63
64    # Next, scan the commits for a specific message format
65    re_commitmsg = re.compile(
66        r"^\[(ghsync|wrupdater)\] From https://hg.mozilla.org/mozilla-central/rev/([0-9a-fA-F]+)$",
67        re.MULTILINE,
68    )
69    for commit in downstream_git_repo.walk(downstream_git_repo.head.target):
70        m = re_commitmsg.search(commit.message)
71        if not m:
72            continue
73        hg_rev = m.group(2)
74        commit_map[hg_rev] = GitCommit(hg_rev, commit)
75        debugprint("Loaded pre-existing commit hg %s -> git %s" % (hg_rev, commit.oid))
76    return commit_map
77
78
79def timeof(git_commit):
80    return git_commit.commit_obj.commit_time + git_commit.commit_obj.commit_time_offset
81
82
83def find_newest_commit(commit_map):
84    newest_hg_rev = None
85    newest_commit_time = None
86
87    for hg_rev, git_commit in commit_map.items():
88        if newest_hg_rev is None or timeof(git_commit) > newest_commit_time:
89            newest_hg_rev = hg_rev
90            newest_commit_time = timeof(git_commit)
91
92    return newest_hg_rev
93
94
95def get_single_rev(revset):
96    output = subprocess.check_output(
97        ["hg", "log", "-r", revset, "--template", "{node}"]
98    )
99    output = str(output, "ascii")
100    return output
101
102
103def get_multiple_revs(revset, template):
104    output = subprocess.check_output(
105        ["hg", "log", "-r", revset, "--template", template + "\\n"]
106    )
107    for line in output.splitlines():
108        yield str(line, "ascii")
109
110
111def get_base_hg_rev(commit_map):
112    base_hg_rev = find_newest_commit(commit_map)
113    eprint("Using %s as base hg revision" % base_hg_rev)
114    return base_hg_rev
115
116
117def load_hg_commits(commits, query):
118    for cset in get_multiple_revs(query, "{node} {p1node} {p2node}"):
119        tokens = cset.split()
120        commits[tokens[0]] = HgCommit(tokens[1], tokens[2])
121    return commits
122
123
124def get_real_base_hg_rev(hg_data, commit_map):
125    # Some of the HG commits we want to port to github may have landed on codelines
126    # that branched off central prior to base_hg_rev. So when we create the git
127    # equivalents, they will have parents that are not the HEAD of the git repo,
128    # but instead will be descendants of older commits in the git repo. In order
129    # to do this correctly, we need to find the hg-equivalents of all of those
130    # possible git parents. So first we identify all the "tail" hg revisions in
131    # our hg_data set (think "tail" as in opposite of "head" which is the tipmost
132    # commit). The "tail" hg revisions are the ones for which we don't have their
133    # ancestors in hg_data.
134    tails = []
135    for (rev, cset) in hg_data.items():
136        for parent in cset.parents:
137            if parent not in hg_data:
138                tails.append(rev)
139    eprint("Found hg tail revisions %s" % tails)
140    # Then we find their common ancestor, which will be some ancestor of base_hg_rev
141    # from which those codelines.
142    if len(tails) == 0:
143        common_ancestor = get_single_rev(".")
144    else:
145        common_ancestor = get_single_rev("ancestor(" + ",".join(tails) + ")")
146    eprint("Found common ancestor of tail revisions: %s" % common_ancestor)
147
148    # And then we find the newest git commit whose hg-equivalent is an ancestor of
149    # that common ancestor, to make sure we are starting from a known hg/git
150    # commit pair.
151    for git_commit in sorted(commit_map.values(), key=timeof, reverse=True):
152        new_base = get_single_rev(
153            "ancestor(" + common_ancestor + "," + git_commit.hg_rev + ")"
154        )
155        if new_base == common_ancestor:
156            eprint(
157                "Pre-existing git commit %s from hg rev %s is descendant of common ancestor; %s"
158                % (
159                    git_commit.commit_obj.id,
160                    git_commit.hg_rev,
161                    "walking back further...",
162                )
163            )
164            continue
165        if new_base != git_commit.hg_rev:
166            eprint(
167                "Pre-existing git commit %s from hg rev %s is on sibling branch"
168                " of common ancestor; %s"
169                % (
170                    git_commit.commit_obj.id,
171                    git_commit.hg_rev,
172                    "walking back further...",
173                )
174            )
175            continue
176        eprint(
177            "Pre-existing git commit %s from hg rev %s is sufficiently old; stopping walk"
178            % (git_commit.commit_obj.id, git_commit.hg_rev)
179        )
180        common_ancestor = new_base
181        break
182
183    return common_ancestor
184
185
186# Now we prune out all the uninteresting changesets from hg_commits. The
187# uninteresting ones are ones that don't touch the target code, are not merges,
188# and are not referenced by mozilla tags in the git repo.
189# We do this by rewriting the parents to the "interesting" ancestor.
190def prune_boring(rev):
191    while rev in hg_commits:
192        parent_pruned = False
193        for i in range(len(hg_commits[rev].parents)):
194            parent_rev = hg_commits[rev].parents[i]
195            if parent_rev not in hg_commits:
196                continue
197            if hg_commits[parent_rev].touches_sync_code:
198                continue
199            if len(hg_commits[parent_rev].parents) > 1:
200                continue
201            if parent_rev in hg_to_git_commit_map:
202                continue
203
204            # If we get here, then `parent_rev` is a boring revision and we can
205            # prune it. Connect `rev` to its grandparent, and prune the parent
206            grandparent_rev = hg_commits[parent_rev].parents[0]
207            hg_commits[rev].parents[i] = grandparent_rev
208            # eprint("Pruned %s as boring parent of %s, using %s now" %
209            #    (parent_rev, rev, grandparent_rev))
210            parent_pruned = True
211
212        if parent_pruned:
213            # If we pruned a parent, process `rev` again as we might want to
214            # prune more parents
215            continue
216
217        # Collapse identical parents, because if the parents are identical
218        # we don't need to keep multiple copies of them.
219        hg_commits[rev].parents = list(dict.fromkeys(hg_commits[rev].parents))
220
221        # If we get here, all of `rev`s parents are interesting, so we can't
222        # prune them. Move up to the parent rev and start processing that, or
223        # if we have multiple parents then recurse on those nodes.
224        if len(hg_commits[rev].parents) == 1:
225            rev = hg_commits[rev].parents[0]
226            continue
227
228        for parent_rev in hg_commits[rev].parents:
229            prune_boring(parent_rev)
230        return
231
232
233class FakeCommit:
234    def __init__(self, oid):
235        self.oid = oid
236
237
238def fake_commit(hg_rev, parent1, parent2):
239    if parent1 is None:
240        eprint("ERROR: Trying to build on None")
241        exit(1)
242    oid = "githash_%s" % hash(parent1)
243    eprint("Fake-built %s" % oid)
244    return FakeCommit(oid)
245
246
247def build_tree(builder, treedata):
248    for (name, value) in treedata.items():
249        if isinstance(value, dict):
250            subbuilder = downstream_git_repo.TreeBuilder()
251            build_tree(subbuilder, value)
252            builder.insert(name, subbuilder.write(), pygit2.GIT_FILEMODE_TREE)
253        else:
254            (filemode, contents) = value
255            blob_oid = downstream_git_repo.create_blob(contents)
256            builder.insert(name, blob_oid, filemode)
257
258
259def author_to_signature(author):
260    pieces = author.strip().split("<")
261    if len(pieces) != 2 or pieces[1][-1] != ">":
262        # We could probably handle this better
263        return pygit2.Signature(author, "")
264    name = pieces[0].strip()
265    email = pieces[1][:-1].strip()
266    return pygit2.Signature(name, email)
267
268
269def real_commit(hg_rev, parent1, parent2):
270    filetree = dict()
271    manifest = mozilla_hg_repo.manifest(rev=hg_rev)
272    for (nodeid, permission, executable, symlink, filename) in manifest:
273        if not filename.startswith(relative_path.encode("utf-8")):
274            continue
275        if symlink:
276            filemode = pygit2.GIT_FILEMODE_LINK
277        elif executable:
278            filemode = pygit2.GIT_FILEMODE_BLOB_EXECUTABLE
279        else:
280            filemode = pygit2.GIT_FILEMODE_BLOB
281        filecontent = mozilla_hg_repo.cat([filename], rev=hg_rev)
282        subtree = filetree
283        for component in filename.split(b"/")[2:-1]:
284            subtree = subtree.setdefault(component.decode("latin-1"), dict())
285        filename = filename.split(b"/")[-1]
286        subtree[filename.decode("latin-1")] = (filemode, filecontent)
287
288    builder = downstream_git_repo.TreeBuilder()
289    build_tree(builder, filetree)
290    tree_oid = builder.write()
291
292    parent1_obj = downstream_git_repo.get(parent1)
293    if parent1_obj.tree_id == tree_oid:
294        eprint("Early-exit; tree matched that of parent git commit %s" % parent1)
295        return parent1_obj
296
297    if parent2 is not None:
298        parent2_obj = downstream_git_repo.get(parent2)
299        if parent2_obj.tree_id == tree_oid:
300            eprint("Early-exit; tree matched that of parent git commit %s" % parent2)
301            return parent2_obj
302
303    hg_rev_obj = mozilla_hg_repo.log(revrange=hg_rev, limit=1)[0]
304    commit_author = hg_rev_obj[4].decode("latin-1")
305    commit_message = hg_rev_obj[5].decode("latin-1")
306    commit_message += (
307        "\n\n[ghsync] From https://hg.mozilla.org/mozilla-central/rev/%s" % hg_rev
308        + "\n"
309    )
310
311    parents = [parent1]
312    if parent2 is not None:
313        parents.append(parent2)
314    commit_oid = downstream_git_repo.create_commit(
315        None,
316        author_to_signature(commit_author),
317        author_to_signature(commit_author),
318        commit_message,
319        tree_oid,
320        parents,
321    )
322    eprint("Built git commit %s" % commit_oid)
323    return downstream_git_repo.get(commit_oid)
324
325
326def try_commit(hg_rev, parent1, parent2=None):
327    if False:
328        return fake_commit(hg_rev, parent1, parent2)
329    else:
330        return real_commit(hg_rev, parent1, parent2)
331
332
333def build_git_commits(rev):
334    debugprint("build_git_commit(%s)..." % rev)
335    if rev in hg_to_git_commit_map:
336        debugprint("  maps to %s" % hg_to_git_commit_map[rev].commit_obj.oid)
337        return hg_to_git_commit_map[rev].commit_obj.oid
338
339    if rev not in hg_commits:
340        debugprint("  not in hg_commits")
341        return None
342
343    if len(hg_commits[rev].parents) == 1:
344        git_parent = build_git_commits(hg_commits[rev].parents[0])
345        if not hg_commits[rev].touches_sync_code:
346            eprint(
347                "WARNING: Found rev %s that is non-merge and not related to the target"
348                % rev
349            )
350            return git_parent
351        eprint("Building git equivalent for %s on top of %s" % (rev, git_parent))
352        commit_obj = try_commit(rev, git_parent)
353        hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj)
354        debugprint("  built %s as %s" % (rev, commit_obj.oid))
355        return commit_obj.oid
356
357    git_parent_1 = build_git_commits(hg_commits[rev].parents[0])
358    git_parent_2 = build_git_commits(hg_commits[rev].parents[1])
359    if git_parent_1 is None or git_parent_2 is None or git_parent_1 == git_parent_2:
360        git_parent = git_parent_1 if git_parent_2 is None else git_parent_2
361        if not hg_commits[rev].touches_sync_code:
362            debugprint(
363                "  %s is merge with no parents or doesn't touch WR, returning %s"
364                % (rev, git_parent)
365            )
366            return git_parent
367
368        eprint(
369            "WARNING: Found merge rev %s whose parents have identical target code"
370            ", but modifies the target" % rev
371        )
372        eprint("Building git equivalent for %s on top of %s" % (rev, git_parent))
373        commit_obj = try_commit(rev, git_parent)
374        hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj)
375        debugprint("  built %s as %s" % (rev, commit_obj.oid))
376        return commit_obj.oid
377
378    # An actual merge
379    eprint(
380        "Building git equivalent for %s on top of %s, %s"
381        % (rev, git_parent_1, git_parent_2)
382    )
383    commit_obj = try_commit(rev, git_parent_1, git_parent_2)
384    hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj)
385    debugprint("  built %s as %s" % (rev, commit_obj.oid))
386    return commit_obj.oid
387
388
389def pretty_print(rev, cset):
390    desc = "  %s" % rev
391    desc += " parents: %s" % cset.parents
392    if rev in hg_to_git_commit_map:
393        desc += " git: %s" % hg_to_git_commit_map[rev].commit_obj.oid
394    if rev == hg_tip:
395        desc += " (tip)"
396    return desc
397
398
399if len(sys.argv) < 3:
400    eprint("Usage: %s <local-checkout-path> <repo-relative-path>" % sys.argv[0])
401    eprint("Current dir must be the mozilla hg repo")
402    exit(1)
403
404local_checkout_path = sys.argv[1]
405relative_path = sys.argv[2]
406mozilla_hg_path = os.getcwd()
407NULL_PARENT_REV = "0000000000000000000000000000000000000000"
408
409downstream_git_repo = pygit2.Repository(pygit2.discover_repository(local_checkout_path))
410mozilla_hg_repo = hglib.open(mozilla_hg_path)
411hg_to_git_commit_map = load_git_repository()
412base_hg_rev = get_base_hg_rev(hg_to_git_commit_map)
413if base_hg_rev is None:
414    eprint("Found no sync commits or 'mozilla-xxx' tags")
415    exit(1)
416
417hg_commits = load_hg_commits(dict(), "only(.," + base_hg_rev + ")")
418eprint("Initial set has %s changesets" % len(hg_commits))
419base_hg_rev = get_real_base_hg_rev(hg_commits, hg_to_git_commit_map)
420eprint("Using hg rev %s as common ancestor of all interesting changesets" % base_hg_rev)
421
422# Refresh hg_commits with our wider dataset
423hg_tip = get_single_rev(".")
424wider_range = "%s::%s" % (base_hg_rev, hg_tip)
425hg_commits = load_hg_commits(hg_commits, wider_range)
426eprint("Updated set has %s changesets" % len(hg_commits))
427
428if DEBUG:
429    eprint("Graph of descendants of %s" % base_hg_rev)
430    output = subprocess.check_output(
431        [
432            "hg",
433            "log",
434            "--graph",
435            "-r",
436            "descendants(" + base_hg_rev + ")",
437            "--template",
438            "{node} {desc|firstline}\\n",
439        ]
440    )
441    for line in output.splitlines():
442        eprint(line.decode("utf-8", "ignore"))
443
444# Also flag any changes that touch the project
445query = "(" + wider_range + ') & file("glob:' + relative_path + '/**")'
446for cset in get_multiple_revs(query, "{node}"):
447    debugprint("Changeset %s modifies %s" % (cset, relative_path))
448    hg_commits[cset].touches_sync_code = True
449eprint(
450    "Identified %s changesets that touch the target code"
451    % sum([1 if v.touches_sync_code else 0 for (k, v) in hg_commits.items()])
452)
453
454prune_boring(hg_tip)
455
456# hg_tip itself might be boring
457if not hg_commits[hg_tip].touches_sync_code and len(hg_commits[hg_tip].parents) == 1:
458    new_tip = hg_commits[hg_tip].parents[0]
459    eprint("Pruned tip %s as boring, using %s now" % (hg_tip, new_tip))
460    hg_tip = new_tip
461
462eprint("--- Interesting changesets ---")
463for (rev, cset) in hg_commits.items():
464    if cset.touches_sync_code or len(cset.parents) > 1 or rev in hg_to_git_commit_map:
465        eprint(pretty_print(rev, cset))
466if DEBUG:
467    eprint("--- Other changesets (not really interesting) ---")
468    for (rev, cset) in hg_commits.items():
469        if not (
470            cset.touches_sync_code
471            or len(cset.parents) > 1
472            or rev in hg_to_git_commit_map
473        ):
474            eprint(pretty_print(rev, cset))
475
476git_tip = build_git_commits(hg_tip)
477if git_tip is None:
478    eprint("No new changesets generated, exiting.")
479else:
480    downstream_git_repo.create_reference("refs/heads/github-sync", git_tip, force=True)
481    eprint("Updated github-sync branch to %s, done!" % git_tip)
482