1#!/usr/bin/env python3 2 3# This Source Code Form is subject to the terms of the Mozilla Public 4# License, v. 2.0. If a copy of the MPL was not distributed with this 5# file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 7import os 8import re 9import subprocess 10import sys 11 12import pygit2 13import hglib 14 15DEBUG = False 16 17 18def eprint(*args, **kwargs): 19 print(*args, file=sys.stderr, **kwargs) 20 21 22def debugprint(*args, **kwargs): 23 if DEBUG: 24 eprint(*args, **kwargs) 25 26 27class HgCommit: 28 def __init__(self, parent1, parent2): 29 self.parents = [] 30 if parent1 == NULL_PARENT_REV: 31 raise Exception( 32 "Encountered a hg changeset with no parents! We don't handle this...." 33 ) 34 self.parents.append(parent1) 35 if parent2 != NULL_PARENT_REV: 36 self.parents.append(parent2) 37 self.touches_sync_code = False 38 self.children = [] 39 40 def add_child(self, rev): 41 self.children.append(rev) 42 43 44class GitCommit: 45 def __init__(self, hg_rev, commit_obj): 46 self.hg_rev = hg_rev 47 self.commit_obj = commit_obj 48 49 50def load_git_repository(): 51 commit_map = dict() 52 # First, scan the tags for "mozilla-xxx" that keep track of manually synchronized changes 53 sync_tags = filter( 54 lambda ref: ref.startswith("refs/tags/mozilla-"), 55 list(downstream_git_repo.references), 56 ) 57 for desc in sync_tags: 58 commit = downstream_git_repo.lookup_reference(desc).peel() 59 # cut out the revision hash from the output 60 hg_rev = desc[18:] 61 commit_map[hg_rev] = GitCommit(hg_rev, commit) 62 debugprint("Loaded pre-existing tag hg %s -> git %s" % (hg_rev, commit.oid)) 63 64 # Next, scan the commits for a specific message format 65 re_commitmsg = re.compile( 66 r"^\[(ghsync|wrupdater)\] From https://hg.mozilla.org/mozilla-central/rev/([0-9a-fA-F]+)$", 67 re.MULTILINE, 68 ) 69 for commit in downstream_git_repo.walk(downstream_git_repo.head.target): 70 m = re_commitmsg.search(commit.message) 71 if not m: 72 continue 73 hg_rev = m.group(2) 74 commit_map[hg_rev] = GitCommit(hg_rev, commit) 75 debugprint("Loaded pre-existing commit hg %s -> git %s" % (hg_rev, commit.oid)) 76 return commit_map 77 78 79def timeof(git_commit): 80 return git_commit.commit_obj.commit_time + git_commit.commit_obj.commit_time_offset 81 82 83def find_newest_commit(commit_map): 84 newest_hg_rev = None 85 newest_commit_time = None 86 87 for hg_rev, git_commit in commit_map.items(): 88 if newest_hg_rev is None or timeof(git_commit) > newest_commit_time: 89 newest_hg_rev = hg_rev 90 newest_commit_time = timeof(git_commit) 91 92 return newest_hg_rev 93 94 95def get_single_rev(revset): 96 output = subprocess.check_output( 97 ["hg", "log", "-r", revset, "--template", "{node}"] 98 ) 99 output = str(output, "ascii") 100 return output 101 102 103def get_multiple_revs(revset, template): 104 output = subprocess.check_output( 105 ["hg", "log", "-r", revset, "--template", template + "\\n"] 106 ) 107 for line in output.splitlines(): 108 yield str(line, "ascii") 109 110 111def get_base_hg_rev(commit_map): 112 base_hg_rev = find_newest_commit(commit_map) 113 eprint("Using %s as base hg revision" % base_hg_rev) 114 return base_hg_rev 115 116 117def load_hg_commits(commits, query): 118 for cset in get_multiple_revs(query, "{node} {p1node} {p2node}"): 119 tokens = cset.split() 120 commits[tokens[0]] = HgCommit(tokens[1], tokens[2]) 121 return commits 122 123 124def get_real_base_hg_rev(hg_data, commit_map): 125 # Some of the HG commits we want to port to github may have landed on codelines 126 # that branched off central prior to base_hg_rev. So when we create the git 127 # equivalents, they will have parents that are not the HEAD of the git repo, 128 # but instead will be descendants of older commits in the git repo. In order 129 # to do this correctly, we need to find the hg-equivalents of all of those 130 # possible git parents. So first we identify all the "tail" hg revisions in 131 # our hg_data set (think "tail" as in opposite of "head" which is the tipmost 132 # commit). The "tail" hg revisions are the ones for which we don't have their 133 # ancestors in hg_data. 134 tails = [] 135 for (rev, cset) in hg_data.items(): 136 for parent in cset.parents: 137 if parent not in hg_data: 138 tails.append(rev) 139 eprint("Found hg tail revisions %s" % tails) 140 # Then we find their common ancestor, which will be some ancestor of base_hg_rev 141 # from which those codelines. 142 if len(tails) == 0: 143 common_ancestor = get_single_rev(".") 144 else: 145 common_ancestor = get_single_rev("ancestor(" + ",".join(tails) + ")") 146 eprint("Found common ancestor of tail revisions: %s" % common_ancestor) 147 148 # And then we find the newest git commit whose hg-equivalent is an ancestor of 149 # that common ancestor, to make sure we are starting from a known hg/git 150 # commit pair. 151 for git_commit in sorted(commit_map.values(), key=timeof, reverse=True): 152 new_base = get_single_rev( 153 "ancestor(" + common_ancestor + "," + git_commit.hg_rev + ")" 154 ) 155 if new_base == common_ancestor: 156 eprint( 157 "Pre-existing git commit %s from hg rev %s is descendant of common ancestor; %s" 158 % ( 159 git_commit.commit_obj.id, 160 git_commit.hg_rev, 161 "walking back further...", 162 ) 163 ) 164 continue 165 if new_base != git_commit.hg_rev: 166 eprint( 167 "Pre-existing git commit %s from hg rev %s is on sibling branch" 168 " of common ancestor; %s" 169 % ( 170 git_commit.commit_obj.id, 171 git_commit.hg_rev, 172 "walking back further...", 173 ) 174 ) 175 continue 176 eprint( 177 "Pre-existing git commit %s from hg rev %s is sufficiently old; stopping walk" 178 % (git_commit.commit_obj.id, git_commit.hg_rev) 179 ) 180 common_ancestor = new_base 181 break 182 183 return common_ancestor 184 185 186# Now we prune out all the uninteresting changesets from hg_commits. The 187# uninteresting ones are ones that don't touch the target code, are not merges, 188# and are not referenced by mozilla tags in the git repo. 189# We do this by rewriting the parents to the "interesting" ancestor. 190def prune_boring(rev): 191 while rev in hg_commits: 192 parent_pruned = False 193 for i in range(len(hg_commits[rev].parents)): 194 parent_rev = hg_commits[rev].parents[i] 195 if parent_rev not in hg_commits: 196 continue 197 if hg_commits[parent_rev].touches_sync_code: 198 continue 199 if len(hg_commits[parent_rev].parents) > 1: 200 continue 201 if parent_rev in hg_to_git_commit_map: 202 continue 203 204 # If we get here, then `parent_rev` is a boring revision and we can 205 # prune it. Connect `rev` to its grandparent, and prune the parent 206 grandparent_rev = hg_commits[parent_rev].parents[0] 207 hg_commits[rev].parents[i] = grandparent_rev 208 # eprint("Pruned %s as boring parent of %s, using %s now" % 209 # (parent_rev, rev, grandparent_rev)) 210 parent_pruned = True 211 212 if parent_pruned: 213 # If we pruned a parent, process `rev` again as we might want to 214 # prune more parents 215 continue 216 217 # Collapse identical parents, because if the parents are identical 218 # we don't need to keep multiple copies of them. 219 hg_commits[rev].parents = list(dict.fromkeys(hg_commits[rev].parents)) 220 221 # If we get here, all of `rev`s parents are interesting, so we can't 222 # prune them. Move up to the parent rev and start processing that, or 223 # if we have multiple parents then recurse on those nodes. 224 if len(hg_commits[rev].parents) == 1: 225 rev = hg_commits[rev].parents[0] 226 continue 227 228 for parent_rev in hg_commits[rev].parents: 229 prune_boring(parent_rev) 230 return 231 232 233class FakeCommit: 234 def __init__(self, oid): 235 self.oid = oid 236 237 238def fake_commit(hg_rev, parent1, parent2): 239 if parent1 is None: 240 eprint("ERROR: Trying to build on None") 241 exit(1) 242 oid = "githash_%s" % hash(parent1) 243 eprint("Fake-built %s" % oid) 244 return FakeCommit(oid) 245 246 247def build_tree(builder, treedata): 248 for (name, value) in treedata.items(): 249 if isinstance(value, dict): 250 subbuilder = downstream_git_repo.TreeBuilder() 251 build_tree(subbuilder, value) 252 builder.insert(name, subbuilder.write(), pygit2.GIT_FILEMODE_TREE) 253 else: 254 (filemode, contents) = value 255 blob_oid = downstream_git_repo.create_blob(contents) 256 builder.insert(name, blob_oid, filemode) 257 258 259def author_to_signature(author): 260 pieces = author.strip().split("<") 261 if len(pieces) != 2 or pieces[1][-1] != ">": 262 # We could probably handle this better 263 return pygit2.Signature(author, "") 264 name = pieces[0].strip() 265 email = pieces[1][:-1].strip() 266 return pygit2.Signature(name, email) 267 268 269def real_commit(hg_rev, parent1, parent2): 270 filetree = dict() 271 manifest = mozilla_hg_repo.manifest(rev=hg_rev) 272 for (nodeid, permission, executable, symlink, filename) in manifest: 273 if not filename.startswith(relative_path.encode("utf-8")): 274 continue 275 if symlink: 276 filemode = pygit2.GIT_FILEMODE_LINK 277 elif executable: 278 filemode = pygit2.GIT_FILEMODE_BLOB_EXECUTABLE 279 else: 280 filemode = pygit2.GIT_FILEMODE_BLOB 281 filecontent = mozilla_hg_repo.cat([filename], rev=hg_rev) 282 subtree = filetree 283 for component in filename.split(b"/")[2:-1]: 284 subtree = subtree.setdefault(component.decode("latin-1"), dict()) 285 filename = filename.split(b"/")[-1] 286 subtree[filename.decode("latin-1")] = (filemode, filecontent) 287 288 builder = downstream_git_repo.TreeBuilder() 289 build_tree(builder, filetree) 290 tree_oid = builder.write() 291 292 parent1_obj = downstream_git_repo.get(parent1) 293 if parent1_obj.tree_id == tree_oid: 294 eprint("Early-exit; tree matched that of parent git commit %s" % parent1) 295 return parent1_obj 296 297 if parent2 is not None: 298 parent2_obj = downstream_git_repo.get(parent2) 299 if parent2_obj.tree_id == tree_oid: 300 eprint("Early-exit; tree matched that of parent git commit %s" % parent2) 301 return parent2_obj 302 303 hg_rev_obj = mozilla_hg_repo.log(revrange=hg_rev, limit=1)[0] 304 commit_author = hg_rev_obj[4].decode("latin-1") 305 commit_message = hg_rev_obj[5].decode("latin-1") 306 commit_message += ( 307 "\n\n[ghsync] From https://hg.mozilla.org/mozilla-central/rev/%s" % hg_rev 308 + "\n" 309 ) 310 311 parents = [parent1] 312 if parent2 is not None: 313 parents.append(parent2) 314 commit_oid = downstream_git_repo.create_commit( 315 None, 316 author_to_signature(commit_author), 317 author_to_signature(commit_author), 318 commit_message, 319 tree_oid, 320 parents, 321 ) 322 eprint("Built git commit %s" % commit_oid) 323 return downstream_git_repo.get(commit_oid) 324 325 326def try_commit(hg_rev, parent1, parent2=None): 327 if False: 328 return fake_commit(hg_rev, parent1, parent2) 329 else: 330 return real_commit(hg_rev, parent1, parent2) 331 332 333def build_git_commits(rev): 334 debugprint("build_git_commit(%s)..." % rev) 335 if rev in hg_to_git_commit_map: 336 debugprint(" maps to %s" % hg_to_git_commit_map[rev].commit_obj.oid) 337 return hg_to_git_commit_map[rev].commit_obj.oid 338 339 if rev not in hg_commits: 340 debugprint(" not in hg_commits") 341 return None 342 343 if len(hg_commits[rev].parents) == 1: 344 git_parent = build_git_commits(hg_commits[rev].parents[0]) 345 if not hg_commits[rev].touches_sync_code: 346 eprint( 347 "WARNING: Found rev %s that is non-merge and not related to the target" 348 % rev 349 ) 350 return git_parent 351 eprint("Building git equivalent for %s on top of %s" % (rev, git_parent)) 352 commit_obj = try_commit(rev, git_parent) 353 hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj) 354 debugprint(" built %s as %s" % (rev, commit_obj.oid)) 355 return commit_obj.oid 356 357 git_parent_1 = build_git_commits(hg_commits[rev].parents[0]) 358 git_parent_2 = build_git_commits(hg_commits[rev].parents[1]) 359 if git_parent_1 is None or git_parent_2 is None or git_parent_1 == git_parent_2: 360 git_parent = git_parent_1 if git_parent_2 is None else git_parent_2 361 if not hg_commits[rev].touches_sync_code: 362 debugprint( 363 " %s is merge with no parents or doesn't touch WR, returning %s" 364 % (rev, git_parent) 365 ) 366 return git_parent 367 368 eprint( 369 "WARNING: Found merge rev %s whose parents have identical target code" 370 ", but modifies the target" % rev 371 ) 372 eprint("Building git equivalent for %s on top of %s" % (rev, git_parent)) 373 commit_obj = try_commit(rev, git_parent) 374 hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj) 375 debugprint(" built %s as %s" % (rev, commit_obj.oid)) 376 return commit_obj.oid 377 378 # An actual merge 379 eprint( 380 "Building git equivalent for %s on top of %s, %s" 381 % (rev, git_parent_1, git_parent_2) 382 ) 383 commit_obj = try_commit(rev, git_parent_1, git_parent_2) 384 hg_to_git_commit_map[rev] = GitCommit(rev, commit_obj) 385 debugprint(" built %s as %s" % (rev, commit_obj.oid)) 386 return commit_obj.oid 387 388 389def pretty_print(rev, cset): 390 desc = " %s" % rev 391 desc += " parents: %s" % cset.parents 392 if rev in hg_to_git_commit_map: 393 desc += " git: %s" % hg_to_git_commit_map[rev].commit_obj.oid 394 if rev == hg_tip: 395 desc += " (tip)" 396 return desc 397 398 399if len(sys.argv) < 3: 400 eprint("Usage: %s <local-checkout-path> <repo-relative-path>" % sys.argv[0]) 401 eprint("Current dir must be the mozilla hg repo") 402 exit(1) 403 404local_checkout_path = sys.argv[1] 405relative_path = sys.argv[2] 406mozilla_hg_path = os.getcwd() 407NULL_PARENT_REV = "0000000000000000000000000000000000000000" 408 409downstream_git_repo = pygit2.Repository(pygit2.discover_repository(local_checkout_path)) 410mozilla_hg_repo = hglib.open(mozilla_hg_path) 411hg_to_git_commit_map = load_git_repository() 412base_hg_rev = get_base_hg_rev(hg_to_git_commit_map) 413if base_hg_rev is None: 414 eprint("Found no sync commits or 'mozilla-xxx' tags") 415 exit(1) 416 417hg_commits = load_hg_commits(dict(), "only(.," + base_hg_rev + ")") 418eprint("Initial set has %s changesets" % len(hg_commits)) 419base_hg_rev = get_real_base_hg_rev(hg_commits, hg_to_git_commit_map) 420eprint("Using hg rev %s as common ancestor of all interesting changesets" % base_hg_rev) 421 422# Refresh hg_commits with our wider dataset 423hg_tip = get_single_rev(".") 424wider_range = "%s::%s" % (base_hg_rev, hg_tip) 425hg_commits = load_hg_commits(hg_commits, wider_range) 426eprint("Updated set has %s changesets" % len(hg_commits)) 427 428if DEBUG: 429 eprint("Graph of descendants of %s" % base_hg_rev) 430 output = subprocess.check_output( 431 [ 432 "hg", 433 "log", 434 "--graph", 435 "-r", 436 "descendants(" + base_hg_rev + ")", 437 "--template", 438 "{node} {desc|firstline}\\n", 439 ] 440 ) 441 for line in output.splitlines(): 442 eprint(line.decode("utf-8", "ignore")) 443 444# Also flag any changes that touch the project 445query = "(" + wider_range + ') & file("glob:' + relative_path + '/**")' 446for cset in get_multiple_revs(query, "{node}"): 447 debugprint("Changeset %s modifies %s" % (cset, relative_path)) 448 hg_commits[cset].touches_sync_code = True 449eprint( 450 "Identified %s changesets that touch the target code" 451 % sum([1 if v.touches_sync_code else 0 for (k, v) in hg_commits.items()]) 452) 453 454prune_boring(hg_tip) 455 456# hg_tip itself might be boring 457if not hg_commits[hg_tip].touches_sync_code and len(hg_commits[hg_tip].parents) == 1: 458 new_tip = hg_commits[hg_tip].parents[0] 459 eprint("Pruned tip %s as boring, using %s now" % (hg_tip, new_tip)) 460 hg_tip = new_tip 461 462eprint("--- Interesting changesets ---") 463for (rev, cset) in hg_commits.items(): 464 if cset.touches_sync_code or len(cset.parents) > 1 or rev in hg_to_git_commit_map: 465 eprint(pretty_print(rev, cset)) 466if DEBUG: 467 eprint("--- Other changesets (not really interesting) ---") 468 for (rev, cset) in hg_commits.items(): 469 if not ( 470 cset.touches_sync_code 471 or len(cset.parents) > 1 472 or rev in hg_to_git_commit_map 473 ): 474 eprint(pretty_print(rev, cset)) 475 476git_tip = build_git_commits(hg_tip) 477if git_tip is None: 478 eprint("No new changesets generated, exiting.") 479else: 480 downstream_git_repo.create_reference("refs/heads/github-sync", git_tip, force=True) 481 eprint("Updated github-sync branch to %s, done!" % git_tip) 482