1#!/usr/local/bin/python3.8 2 3""" 4git-filter-repo filters git repositories, similar to git filter-branch, BFG 5repo cleaner, and others. The basic idea is that it works by running 6 git fast-export <options> | filter | git fast-import <options> 7where this program not only launches the whole pipeline but also serves as 8the 'filter' in the middle. It does a few additional things on top as well 9in order to make it into a well-rounded filtering tool. 10 11git-filter-repo can also be used as a library for more involved filtering 12operations; however: 13 ***** API BACKWARD COMPATIBILITY CAVEAT ***** 14 Programs using git-filter-repo as a library can reach pretty far into its 15 internals, but I am not prepared to guarantee backward compatibility of 16 all APIs. I suspect changes will be rare, but I reserve the right to 17 change any API. Since it is assumed that repository filtering is 18 something one would do very rarely, and in particular that it's a 19 one-shot operation, this should not be a problem in practice for anyone. 20 However, if you want to re-use a program you have written that uses 21 git-filter-repo as a library (or makes use of one of its --*-callback 22 arguments), you should either make sure you are using the same version of 23 git and git-filter-repo, or make sure to re-test it. 24 25 If there are particular pieces of the API you are concerned about, and 26 there is not already a testcase for it in t9391-lib-usage.sh or 27 t9392-python-callback.sh, please contribute a testcase. That will not 28 prevent me from changing the API, but it will allow you to look at the 29 history of a testcase to see whether and how the API changed. 30 ***** END API BACKWARD COMPATIBILITY CAVEAT ***** 31""" 32 33import argparse 34import collections 35import fnmatch 36import gettext 37import io 38import os 39import platform 40import re 41import shutil 42import subprocess 43import sys 44import time 45import textwrap 46 47from datetime import tzinfo, timedelta, datetime 48 49__all__ = ["Blob", "Reset", "FileChange", "Commit", "Tag", "Progress", 50 "Checkpoint", "FastExportParser", "ProgressWriter", 51 "string_to_date", "date_to_string", 52 "record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"] 53 54deleted_hash = b'0'*40 55write_marks = True 56date_format_permissive = True 57 58def gettext_poison(msg): 59 if "GIT_TEST_GETTEXT_POISON" in os.environ: # pragma: no cover 60 return "# GETTEXT POISON #" 61 return gettext.gettext(msg) 62 63_ = gettext_poison 64 65def setup_gettext(): 66 TEXTDOMAIN="git-filter-repo" 67 podir = os.environ.get("GIT_TEXTDOMAINDIR") or "@@LOCALEDIR@@" 68 if not os.path.isdir(podir): # pragma: no cover 69 podir = None # Python has its own fallback; use that 70 71 ## This looks like the most straightforward translation of the relevant 72 ## code in git.git:gettext.c and git.git:perl/Git/I18n.pm: 73 #import locale 74 #locale.setlocale(locale.LC_MESSAGES, ""); 75 #locale.setlocale(locale.LC_TIME, ""); 76 #locale.textdomain(TEXTDOMAIN); 77 #locale.bindtextdomain(TEXTDOMAIN, podir); 78 ## but the python docs suggest using the gettext module (which doesn't 79 ## have setlocale()) instead, so: 80 gettext.textdomain(TEXTDOMAIN); 81 gettext.bindtextdomain(TEXTDOMAIN, podir); 82 83def _timedelta_to_seconds(delta): 84 """ 85 Converts timedelta to seconds 86 """ 87 offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000 88 return round(offset) 89 90class FixedTimeZone(tzinfo): 91 """ 92 Fixed offset in minutes east from UTC. 93 """ 94 95 tz_re = re.compile(br'^([-+]?)(\d\d)(\d\d)$') 96 97 def __init__(self, offset_string): 98 tzinfo.__init__(self) 99 sign, hh, mm = FixedTimeZone.tz_re.match(offset_string).groups() 100 factor = -1 if (sign and sign == b'-') else 1 101 self._offset = timedelta(minutes = factor*(60*int(hh) + int(mm))) 102 self._offset_string = offset_string 103 104 def utcoffset(self, dt): 105 return self._offset 106 107 def tzname(self, dt): 108 return self._offset_string 109 110 def dst(self, dt): 111 return timedelta(0) 112 113def string_to_date(datestring): 114 (unix_timestamp, tz_offset) = datestring.split() 115 return datetime.fromtimestamp(int(unix_timestamp), 116 FixedTimeZone(tz_offset)) 117 118def date_to_string(dateobj): 119 epoch = datetime.fromtimestamp(0, dateobj.tzinfo) 120 return(b'%d %s' % (int(_timedelta_to_seconds(dateobj - epoch)), 121 dateobj.tzinfo.tzname(0))) 122 123def decode(bytestr): 124 'Try to convert bytestr to utf-8 for outputting as an error message.' 125 return bytestr.decode('utf-8', 'backslashreplace') 126 127def glob_to_regex(glob_bytestr): 128 'Translate glob_bytestr into a regex on bytestrings' 129 130 # fnmatch.translate is idiotic and won't accept bytestrings 131 if (decode(glob_bytestr).encode() != glob_bytestr): # pragma: no cover 132 raise SystemExit(_("Error: Cannot handle glob %s").format(glob_bytestr)) 133 134 # Create regex operating on string 135 regex = fnmatch.translate(decode(glob_bytestr)) 136 137 # FIXME: This is an ugly hack... 138 # fnmatch.translate tries to do multi-line matching and wants the glob to 139 # match up to the end of the input, which isn't relevant for us, so we 140 # have to modify the regex. fnmatch.translate has used different regex 141 # constructs to achieve this with different python versions, so we have 142 # to check for each of them and then fix it up. It would be much better 143 # if fnmatch.translate could just take some flags to allow us to specify 144 # what we want rather than employing this hackery, but since it 145 # doesn't... 146 if regex.endswith(r'\Z(?ms)'): # pragma: no cover 147 regex = regex[0:-7] 148 elif regex.startswith(r'(?s:') and regex.endswith(r')\Z'): # pragma: no cover 149 regex = regex[4:-3] 150 151 # Finally, convert back to regex operating on bytestr 152 return regex.encode() 153 154class PathQuoting: 155 _unescape = {b'a': b'\a', 156 b'b': b'\b', 157 b'f': b'\f', 158 b'n': b'\n', 159 b'r': b'\r', 160 b't': b'\t', 161 b'v': b'\v', 162 b'"': b'"', 163 b'\\':b'\\'} 164 _unescape_re = re.compile(br'\\([a-z"\\]|[0-9]{3})') 165 _escape = [bytes([x]) for x in range(127)]+[ 166 b'\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)] 167 _reverse = dict(map(reversed, _unescape.items())) 168 for x in _reverse: 169 _escape[ord(x)] = b'\\'+_reverse[x] 170 _special_chars = [len(x) > 1 for x in _escape] 171 172 @staticmethod 173 def unescape_sequence(orig): 174 seq = orig.group(1) 175 return PathQuoting._unescape[seq] if len(seq) == 1 else bytes([int(seq, 8)]) 176 177 @staticmethod 178 def dequote(quoted_string): 179 if quoted_string.startswith(b'"'): 180 assert quoted_string.endswith(b'"') 181 return PathQuoting._unescape_re.sub(PathQuoting.unescape_sequence, 182 quoted_string[1:-1]) 183 return quoted_string 184 185 @staticmethod 186 def enquote(unquoted_string): 187 # Option 1: Quoting when fast-export would: 188 # pqsc = PathQuoting._special_chars 189 # if any(pqsc[x] for x in set(unquoted_string)): 190 # Option 2, perf hack: do minimal amount of quoting required by fast-import 191 if unquoted_string.startswith(b'"') or b'\n' in unquoted_string: 192 pqe = PathQuoting._escape 193 return b'"' + b''.join(pqe[x] for x in unquoted_string) + b'"' 194 return unquoted_string 195 196class AncestryGraph(object): 197 """ 198 A class that maintains a direct acycle graph of commits for the purpose of 199 determining if one commit is the ancestor of another. 200 """ 201 202 def __init__(self): 203 self.cur_value = 0 204 205 # A mapping from the external identifers given to us to the simple integers 206 # we use in self.graph 207 self.value = {} 208 209 # A tuple of (depth, list-of-ancestors). Values and keys in this graph are 210 # all integers from the self.value dict. The depth of a commit is one more 211 # than the max depth of any of its ancestors. 212 self.graph = {} 213 214 # Cached results from previous calls to is_ancestor(). 215 self._cached_is_ancestor = {} 216 217 def record_external_commits(self, external_commits): 218 """ 219 Record in graph that each commit in external_commits exists, and is 220 treated as a root commit with no parents. 221 """ 222 for c in external_commits: 223 if c not in self.value: 224 self.cur_value += 1 225 self.value[c] = self.cur_value 226 self.graph[self.cur_value] = (1, []) 227 228 def add_commit_and_parents(self, commit, parents): 229 """ 230 Record in graph that commit has the given parents. parents _MUST_ have 231 been first recorded. commit _MUST_ not have been recorded yet. 232 """ 233 assert all(p in self.value for p in parents) 234 assert commit not in self.value 235 236 # Get values for commit and parents 237 self.cur_value += 1 238 self.value[commit] = self.cur_value 239 graph_parents = [self.value[x] for x in parents] 240 241 # Determine depth for commit, then insert the info into the graph 242 depth = 1 243 if parents: 244 depth += max(self.graph[p][0] for p in graph_parents) 245 self.graph[self.cur_value] = (depth, graph_parents) 246 247 def is_ancestor(self, possible_ancestor, check): 248 """ 249 Return whether possible_ancestor is an ancestor of check 250 """ 251 a, b = self.value[possible_ancestor], self.value[check] 252 original_pair = (a,b) 253 a_depth = self.graph[a][0] 254 ancestors = [b] 255 visited = set() 256 while ancestors: 257 ancestor = ancestors.pop() 258 prev_pair = (a, ancestor) 259 if prev_pair in self._cached_is_ancestor: 260 if not self._cached_is_ancestor[prev_pair]: 261 continue 262 self._cached_is_ancestor[original_pair] = True 263 return True 264 if ancestor in visited: 265 continue 266 visited.add(ancestor) 267 depth, more_ancestors = self.graph[ancestor] 268 if ancestor == a: 269 self._cached_is_ancestor[original_pair] = True 270 return True 271 elif depth <= a_depth: 272 continue 273 ancestors.extend(more_ancestors) 274 self._cached_is_ancestor[original_pair] = False 275 return False 276 277class MailmapInfo(object): 278 def __init__(self, filename): 279 self.changes = {} 280 self._parse_file(filename) 281 282 def _parse_file(self, filename): 283 name_and_email_re = re.compile(br'(.*?)\s*<([^>]*)>\s*') 284 comment_re = re.compile(br'\s*#.*') 285 if not os.access(filename, os.R_OK): 286 raise SystemExit(_("Cannot read %s") % decode(filename)) 287 with open(filename, 'br') as f: 288 count = 0 289 for line in f: 290 count += 1 291 err = "Unparseable mailmap file: line #{} is bad: {}".format(count, line) 292 # Remove comments 293 line = comment_re.sub(b'', line) 294 # Remove leading and trailing whitespace 295 line = line.strip() 296 if not line: 297 continue 298 299 m = name_and_email_re.match(line) 300 if not m: 301 raise SystemExit(err) 302 proper_name, proper_email = m.groups() 303 if len(line) == m.end(): 304 self.changes[(None, proper_email)] = (proper_name, proper_email) 305 continue 306 rest = line[m.end():] 307 m = name_and_email_re.match(rest) 308 if m: 309 commit_name, commit_email = m.groups() 310 if len(rest) != m.end(): 311 raise SystemExit(err) 312 else: 313 commit_name, commit_email = rest, None 314 self.changes[(commit_name, commit_email)] = (proper_name, proper_email) 315 316 def translate(self, name, email): 317 ''' Given a name and email, return the expected new name and email from the 318 mailmap if there is a translation rule for it, otherwise just return 319 the given name and email.''' 320 for old, new in self.changes.items(): 321 old_name, old_email = old 322 new_name, new_email = new 323 if (not old_email or email.lower() == old_email.lower()) and ( 324 name == old_name or not old_name): 325 return (new_name or name, new_email or email) 326 return (name, email) 327 328class ProgressWriter(object): 329 def __init__(self): 330 self._last_progress_update = time.time() 331 self._last_message = None 332 333 def show(self, msg): 334 self._last_message = msg 335 now = time.time() 336 if now - self._last_progress_update > .1: 337 self._last_progress_update = now 338 sys.stdout.write("\r{}".format(msg)) 339 sys.stdout.flush() 340 341 def finish(self): 342 self._last_progress_update = 0 343 if self._last_message: 344 self.show(self._last_message) 345 sys.stdout.write("\n") 346 347class _IDs(object): 348 """ 349 A class that maintains the 'name domain' of all the 'marks' (short int 350 id for a blob/commit git object). The reason this mechanism is necessary 351 is because the text of fast-export may refer to an object using a different 352 mark than the mark that was assigned to that object using IDS.new(). This 353 class allows you to translate the fast-export marks (old) to the marks 354 assigned from IDS.new() (new). 355 356 Note that there are two reasons why the marks may differ: (1) The 357 user manually creates Blob or Commit objects (for insertion into the 358 stream) (2) We're reading the data from two different repositories 359 and trying to combine the data (git fast-export will number ids from 360 1...n, and having two 1's, two 2's, two 3's, causes issues). 361 """ 362 363 def __init__(self): 364 """ 365 Init 366 """ 367 # The id for the next created blob/commit object 368 self._next_id = 1 369 370 # A map of old-ids to new-ids (1:1 map) 371 self._translation = {} 372 373 # A map of new-ids to every old-id that points to the new-id (1:N map) 374 self._reverse_translation = {} 375 376 def has_renames(self): 377 """ 378 Return whether there have been ids remapped to new values 379 """ 380 return bool(self._translation) 381 382 def new(self): 383 """ 384 Should be called whenever a new blob or commit object is created. The 385 returned value should be used as the id/mark for that object. 386 """ 387 rv = self._next_id 388 self._next_id += 1 389 return rv 390 391 def record_rename(self, old_id, new_id, handle_transitivity = False): 392 """ 393 Record that old_id is being renamed to new_id. 394 """ 395 if old_id != new_id: 396 # old_id -> new_id 397 self._translation[old_id] = new_id 398 399 # Transitivity will be needed if new commits are being inserted mid-way 400 # through a branch. 401 if handle_transitivity: 402 # Anything that points to old_id should point to new_id 403 if old_id in self._reverse_translation: 404 for id_ in self._reverse_translation[old_id]: 405 self._translation[id_] = new_id 406 407 # Record that new_id is pointed to by old_id 408 if new_id not in self._reverse_translation: 409 self._reverse_translation[new_id] = [] 410 self._reverse_translation[new_id].append(old_id) 411 412 def translate(self, old_id): 413 """ 414 If old_id has been mapped to an alternate id, return the alternate id. 415 """ 416 if old_id in self._translation: 417 return self._translation[old_id] 418 else: 419 return old_id 420 421 def __str__(self): 422 """ 423 Convert IDs to string; used for debugging 424 """ 425 rv = "Current count: %d\nTranslation:\n" % self._next_id 426 for k in sorted(self._translation): 427 rv += " %d -> %s\n" % (k, self._translation[k]) 428 429 rv += "Reverse translation:\n" 430 for k in sorted(self._reverse_translation): 431 rv += " " + str(k) + " -> " + str(self._reverse_translation[k]) + "\n" 432 433 return rv 434 435class _GitElement(object): 436 """ 437 The base class for all git elements that we create. 438 """ 439 440 def __init__(self): 441 # A string that describes what type of Git element this is 442 self.type = None 443 444 # A flag telling us if this Git element has been dumped 445 # (i.e. printed) or skipped. Typically elements that have been 446 # dumped or skipped will not be dumped again. 447 self.dumped = 0 448 449 def dump(self, file_): 450 """ 451 This version should never be called. Derived classes need to 452 override! We should note that subclasses should implement this 453 method such that the output would match the format produced by 454 fast-export. 455 """ 456 raise SystemExit(_("Unimplemented function: %s") % type(self).__name__ 457 +".dump()") # pragma: no cover 458 459 def __bytes__(self): 460 """ 461 Convert GitElement to bytestring; used for debugging 462 """ 463 old_dumped = self.dumped 464 writeme = io.BytesIO() 465 self.dump(writeme) 466 output_lines = writeme.getvalue().splitlines() 467 writeme.close() 468 self.dumped = old_dumped 469 return b"%s:\n %s" % (type(self).__name__.encode(), 470 b"\n ".join(output_lines)) 471 472 def skip(self, new_id=None): 473 """ 474 Ensures this element will not be written to output 475 """ 476 self.dumped = 2 477 478class _GitElementWithId(_GitElement): 479 """ 480 The base class for Git elements that have IDs (commits and blobs) 481 """ 482 483 def __init__(self): 484 _GitElement.__init__(self) 485 486 # The mark (short, portable id) for this element 487 self.id = _IDS.new() 488 489 # The previous mark for this element 490 self.old_id = None 491 492 def skip(self, new_id=None): 493 """ 494 This element will no longer be automatically written to output. When a 495 commit gets skipped, it's ID will need to be translated to that of its 496 parent. 497 """ 498 self.dumped = 2 499 500 _IDS.record_rename(self.old_id or self.id, new_id) 501 502class Blob(_GitElementWithId): 503 """ 504 This class defines our representation of git blob elements (i.e. our 505 way of representing file contents). 506 """ 507 508 def __init__(self, data, original_id = None): 509 _GitElementWithId.__init__(self) 510 511 # Denote that this is a blob 512 self.type = 'blob' 513 514 # Record original id 515 self.original_id = original_id 516 517 # Stores the blob's data 518 assert(type(data) == bytes) 519 self.data = data 520 521 def dump(self, file_): 522 """ 523 Write this blob element to a file. 524 """ 525 self.dumped = 1 526 HASH_TO_ID[self.original_id] = self.id 527 ID_TO_HASH[self.id] = self.original_id 528 529 file_.write(b'blob\n') 530 file_.write(b'mark :%d\n' % self.id) 531 file_.write(b'data %d\n%s' % (len(self.data), self.data)) 532 file_.write(b'\n') 533 534 535class Reset(_GitElement): 536 """ 537 This class defines our representation of git reset elements. A reset 538 event is the creation (or recreation) of a named branch, optionally 539 starting from a specific revision). 540 """ 541 542 def __init__(self, ref, from_ref = None): 543 _GitElement.__init__(self) 544 545 # Denote that this is a reset 546 self.type = 'reset' 547 548 # The name of the branch being (re)created 549 self.ref = ref 550 551 # Some reference to the branch/commit we are resetting from 552 self.from_ref = from_ref 553 554 def dump(self, file_): 555 """ 556 Write this reset element to a file 557 """ 558 self.dumped = 1 559 560 file_.write(b'reset %s\n' % self.ref) 561 if self.from_ref: 562 if isinstance(self.from_ref, int): 563 file_.write(b'from :%d\n' % self.from_ref) 564 else: 565 file_.write(b'from %s\n' % self.from_ref) 566 file_.write(b'\n') 567 568class FileChange(_GitElement): 569 """ 570 This class defines our representation of file change elements. File change 571 elements are components within a Commit element. 572 """ 573 574 def __init__(self, type_, filename = None, id_ = None, mode = None): 575 _GitElement.__init__(self) 576 577 # Denote the type of file-change (b'M' for modify, b'D' for delete, etc) 578 # We could 579 # assert(type(type_) == bytes) 580 # here but I don't just due to worries about performance overhead... 581 self.type = type_ 582 583 # Record the name of the file being changed 584 self.filename = filename 585 586 # Record the mode (mode describes type of file entry (non-executable, 587 # executable, or symlink)). 588 self.mode = mode 589 590 # blob_id is the id (mark) of the affected blob 591 self.blob_id = id_ 592 593 if type_ == b'DELETEALL': 594 assert filename is None and id_ is None and mode is None 595 self.filename = b'' # Just so PathQuoting.enquote doesn't die 596 else: 597 assert filename is not None 598 599 if type_ == b'M': 600 assert id_ is not None and mode is not None 601 elif type_ == b'D': 602 assert id_ is None and mode is None 603 elif type_ == b'R': # pragma: no cover (now avoid fast-export renames) 604 assert mode is None 605 if id_ is None: 606 raise SystemExit(_("new name needed for rename of %s") % filename) 607 self.filename = (self.filename, id_) 608 self.blob_id = None 609 610 def dump(self, file_): 611 """ 612 Write this file-change element to a file 613 """ 614 skipped_blob = (self.type == b'M' and self.blob_id is None) 615 if skipped_blob: return 616 self.dumped = 1 617 618 quoted_filename = PathQuoting.enquote(self.filename) 619 if self.type == b'M' and isinstance(self.blob_id, int): 620 file_.write(b'M %s :%d %s\n' % (self.mode, self.blob_id, quoted_filename)) 621 elif self.type == b'M': 622 file_.write(b'M %s %s %s\n' % (self.mode, self.blob_id, quoted_filename)) 623 elif self.type == b'D': 624 file_.write(b'D %s\n' % quoted_filename) 625 elif self.type == b'DELETEALL': 626 file_.write(b'deleteall\n') 627 else: 628 raise SystemExit(_("Unhandled filechange type: %s") % self.type) # pragma: no cover 629 630class Commit(_GitElementWithId): 631 """ 632 This class defines our representation of commit elements. Commit elements 633 contain all the information associated with a commit. 634 """ 635 636 def __init__(self, branch, 637 author_name, author_email, author_date, 638 committer_name, committer_email, committer_date, 639 message, 640 file_changes, 641 parents, 642 original_id = None, 643 encoding = None, # encoding for message; None implies UTF-8 644 **kwargs): 645 _GitElementWithId.__init__(self) 646 self.old_id = self.id 647 648 # Denote that this is a commit element 649 self.type = 'commit' 650 651 # Record the affected branch 652 self.branch = branch 653 654 # Record original id 655 self.original_id = original_id 656 657 # Record author's name 658 self.author_name = author_name 659 660 # Record author's email 661 self.author_email = author_email 662 663 # Record date of authoring 664 self.author_date = author_date 665 666 # Record committer's name 667 self.committer_name = committer_name 668 669 # Record committer's email 670 self.committer_email = committer_email 671 672 # Record date the commit was made 673 self.committer_date = committer_date 674 675 # Record commit message and its encoding 676 self.encoding = encoding 677 self.message = message 678 679 # List of file-changes associated with this commit. Note that file-changes 680 # are also represented as git elements 681 self.file_changes = file_changes 682 683 self.parents = parents 684 685 def dump(self, file_): 686 """ 687 Write this commit element to a file. 688 """ 689 self.dumped = 1 690 HASH_TO_ID[self.original_id] = self.id 691 ID_TO_HASH[self.id] = self.original_id 692 693 # Make output to fast-import slightly easier for humans to read if the 694 # message has no trailing newline of its own; cosmetic, but a nice touch... 695 extra_newline = b'\n' 696 if self.message.endswith(b'\n') or not (self.parents or self.file_changes): 697 extra_newline = b'' 698 699 if not self.parents: 700 file_.write(b'reset %s\n' % self.branch) 701 file_.write((b'commit %s\n' 702 b'mark :%d\n' 703 b'author %s <%s> %s\n' 704 b'committer %s <%s> %s\n' 705 ) % ( 706 self.branch, self.id, 707 self.author_name, self.author_email, self.author_date, 708 self.committer_name, self.committer_email, self.committer_date 709 )) 710 if self.encoding: 711 file_.write(b'encoding %s\n' % self.encoding) 712 file_.write(b'data %d\n%s%s' % 713 (len(self.message), self.message, extra_newline)) 714 for i, parent in enumerate(self.parents): 715 file_.write(b'from ' if i==0 else b'merge ') 716 if isinstance(parent, int): 717 file_.write(b':%d\n' % parent) 718 else: 719 file_.write(b'%s\n' % parent) 720 for change in self.file_changes: 721 change.dump(file_) 722 if not self.parents and not self.file_changes: 723 # Workaround a bug in pre-git-2.22 versions of fast-import with 724 # the get-mark directive. 725 file_.write(b'\n') 726 file_.write(b'\n') 727 728 def first_parent(self): 729 """ 730 Return first parent commit 731 """ 732 if self.parents: 733 return self.parents[0] 734 return None 735 736 def skip(self, new_id=None): 737 _SKIPPED_COMMITS.add(self.old_id or self.id) 738 _GitElementWithId.skip(self, new_id) 739 740class Tag(_GitElementWithId): 741 """ 742 This class defines our representation of annotated tag elements. 743 """ 744 745 def __init__(self, ref, from_ref, 746 tagger_name, tagger_email, tagger_date, tag_msg, 747 original_id = None): 748 _GitElementWithId.__init__(self) 749 self.old_id = self.id 750 751 # Denote that this is a tag element 752 self.type = 'tag' 753 754 # Store the name of the tag 755 self.ref = ref 756 757 # Store the entity being tagged (this should be a commit) 758 self.from_ref = from_ref 759 760 # Record original id 761 self.original_id = original_id 762 763 # Store the name of the tagger 764 self.tagger_name = tagger_name 765 766 # Store the email of the tagger 767 self.tagger_email = tagger_email 768 769 # Store the date 770 self.tagger_date = tagger_date 771 772 # Store the tag message 773 self.message = tag_msg 774 775 def dump(self, file_): 776 """ 777 Write this tag element to a file 778 """ 779 780 self.dumped = 1 781 HASH_TO_ID[self.original_id] = self.id 782 ID_TO_HASH[self.id] = self.original_id 783 784 file_.write(b'tag %s\n' % self.ref) 785 if (write_marks and self.id): 786 file_.write(b'mark :%d\n' % self.id) 787 markfmt = b'from :%d\n' if isinstance(self.from_ref, int) else b'from %s\n' 788 file_.write(markfmt % self.from_ref) 789 if self.tagger_name: 790 file_.write(b'tagger %s <%s> ' % (self.tagger_name, self.tagger_email)) 791 file_.write(self.tagger_date) 792 file_.write(b'\n') 793 file_.write(b'data %d\n%s' % (len(self.message), self.message)) 794 file_.write(b'\n') 795 796class Progress(_GitElement): 797 """ 798 This class defines our representation of progress elements. The progress 799 element only contains a progress message, which is printed by fast-import 800 when it processes the progress output. 801 """ 802 803 def __init__(self, message): 804 _GitElement.__init__(self) 805 806 # Denote that this is a progress element 807 self.type = 'progress' 808 809 # Store the progress message 810 self.message = message 811 812 def dump(self, file_): 813 """ 814 Write this progress element to a file 815 """ 816 self.dumped = 1 817 818 file_.write(b'progress %s\n' % self.message) 819 file_.write(b'\n') 820 821class Checkpoint(_GitElement): 822 """ 823 This class defines our representation of checkpoint elements. These 824 elements represent events which force fast-import to close the current 825 packfile, start a new one, and to save out all current branch refs, tags 826 and marks. 827 """ 828 829 def __init__(self): 830 _GitElement.__init__(self) 831 832 # Denote that this is a checkpoint element 833 self.type = 'checkpoint' 834 835 def dump(self, file_): 836 """ 837 Write this checkpoint element to a file 838 """ 839 self.dumped = 1 840 841 file_.write(b'checkpoint\n') 842 file_.write(b'\n') 843 844class LiteralCommand(_GitElement): 845 """ 846 This class defines our representation of commands. The literal command 847 includes only a single line, and is not processed in any special way. 848 """ 849 850 def __init__(self, line): 851 _GitElement.__init__(self) 852 853 # Denote that this is a literal element 854 self.type = 'literal' 855 856 # Store the command 857 self.line = line 858 859 def dump(self, file_): 860 """ 861 Write this progress element to a file 862 """ 863 self.dumped = 1 864 865 file_.write(self.line) 866 867class Alias(_GitElement): 868 """ 869 This class defines our representation of fast-import alias elements. An 870 alias element is the setting of one mark to the same sha1sum as another, 871 usually because the newer mark corresponded to a pruned commit. 872 """ 873 874 def __init__(self, ref, to_ref): 875 _GitElement.__init__(self) 876 # Denote that this is a reset 877 self.type = 'alias' 878 879 self.ref = ref 880 self.to_ref = to_ref 881 882 def dump(self, file_): 883 """ 884 Write this reset element to a file 885 """ 886 self.dumped = 1 887 888 file_.write(b'alias\nmark :%d\nto :%d\n\n' % (self.ref, self.to_ref)) 889 890class FastExportParser(object): 891 """ 892 A class for parsing and handling the output from fast-export. This 893 class allows the user to register callbacks when various types of 894 data are encountered in the fast-export output. The basic idea is that, 895 FastExportParser takes fast-export output, creates the various objects 896 as it encounters them, the user gets to use/modify these objects via 897 callbacks, and finally FastExportParser outputs the modified objects 898 in fast-import format (presumably so they can be used to create a new 899 repo). 900 """ 901 902 def __init__(self, 903 tag_callback = None, commit_callback = None, 904 blob_callback = None, progress_callback = None, 905 reset_callback = None, checkpoint_callback = None, 906 done_callback = None): 907 # Members below simply store callback functions for the various git 908 # elements 909 self._tag_callback = tag_callback 910 self._blob_callback = blob_callback 911 self._reset_callback = reset_callback 912 self._commit_callback = commit_callback 913 self._progress_callback = progress_callback 914 self._checkpoint_callback = checkpoint_callback 915 self._done_callback = done_callback 916 917 # Keep track of which refs appear from the export, and which make it to 918 # the import (pruning of empty commits, renaming of refs, and creating 919 # new manual objects and inserting them can cause these to differ). 920 self._exported_refs = set() 921 self._imported_refs = set() 922 923 # A list of the branches we've seen, plus the last known commit they 924 # pointed to. An entry in latest_*commit will be deleted if we get a 925 # reset for that branch. These are used because of fast-import's weird 926 # decision to allow having an implicit parent via naming the branch 927 # instead of requiring branches to be specified via 'from' directives. 928 self._latest_commit = {} 929 self._latest_orig_commit = {} 930 931 # A handle to the input source for the fast-export data 932 self._input = None 933 934 # A handle to the output file for the output we generate (we call dump 935 # on many of the git elements we create). 936 self._output = None 937 938 # Stores the contents of the current line of input being parsed 939 self._currentline = '' 940 941 # Compile some regexes and cache those 942 self._mark_re = re.compile(br'mark :(\d+)\n$') 943 self._parent_regexes = {} 944 parent_regex_rules = (b' :(\d+)\n$', b' ([0-9a-f]{40})\n') 945 for parent_refname in (b'from', b'merge'): 946 ans = [re.compile(parent_refname+x) for x in parent_regex_rules] 947 self._parent_regexes[parent_refname] = ans 948 self._quoted_string_re = re.compile(br'"(?:[^"\\]|\\.)*"') 949 self._refline_regexes = {} 950 for refline_name in (b'reset', b'commit', b'tag', b'progress'): 951 self._refline_regexes[refline_name] = re.compile(refline_name+b' (.*)\n$') 952 self._user_regexes = {} 953 for user in (b'author', b'committer', b'tagger'): 954 self._user_regexes[user] = re.compile(user + b' (.*?) <(.*?)> (.*)\n$') 955 956 def _advance_currentline(self): 957 """ 958 Grab the next line of input 959 """ 960 self._currentline = self._input.readline() 961 962 def _parse_optional_mark(self): 963 """ 964 If the current line contains a mark, parse it and advance to the 965 next line; return None otherwise 966 """ 967 mark = None 968 matches = self._mark_re.match(self._currentline) 969 if matches: 970 mark = int(matches.group(1)) 971 self._advance_currentline() 972 return mark 973 974 def _parse_optional_parent_ref(self, refname): 975 """ 976 If the current line contains a reference to a parent commit, then 977 parse it and advance the current line; otherwise return None. Note 978 that the name of the reference ('from', 'merge') must match the 979 refname arg. 980 """ 981 orig_baseref, baseref = None, None 982 rule, altrule = self._parent_regexes[refname] 983 matches = rule.match(self._currentline) 984 if matches: 985 orig_baseref = int(matches.group(1)) 986 # We translate the parent commit mark to what it needs to be in 987 # our mark namespace 988 baseref = _IDS.translate(orig_baseref) 989 self._advance_currentline() 990 else: 991 matches = altrule.match(self._currentline) 992 if matches: 993 orig_baseref = matches.group(1) 994 baseref = orig_baseref 995 self._advance_currentline() 996 return orig_baseref, baseref 997 998 def _parse_optional_filechange(self): 999 """ 1000 If the current line contains a file-change object, then parse it 1001 and advance the current line; otherwise return None. We only care 1002 about file changes of type b'M' and b'D' (these are the only types 1003 of file-changes that fast-export will provide). 1004 """ 1005 filechange = None 1006 changetype = self._currentline[0:1] 1007 if changetype == b'M': 1008 (changetype, mode, idnum, path) = self._currentline.split(None, 3) 1009 if idnum[0:1] == b':': 1010 idnum = idnum[1:] 1011 path = path.rstrip(b'\n') 1012 # We translate the idnum to our id system 1013 if len(idnum) != 40: 1014 idnum = _IDS.translate( int(idnum) ) 1015 if idnum is not None: 1016 if path.startswith(b'"'): 1017 path = PathQuoting.dequote(path) 1018 filechange = FileChange(b'M', path, idnum, mode) 1019 else: 1020 filechange = b'skipped' 1021 self._advance_currentline() 1022 elif changetype == b'D': 1023 (changetype, path) = self._currentline.split(None, 1) 1024 path = path.rstrip(b'\n') 1025 if path.startswith(b'"'): 1026 path = PathQuoting.dequote(path) 1027 filechange = FileChange(b'D', path) 1028 self._advance_currentline() 1029 elif changetype == b'R': # pragma: no cover (now avoid fast-export renames) 1030 rest = self._currentline[2:-1] 1031 if rest.startswith(b'"'): 1032 m = self._quoted_string_re.match(rest) 1033 if not m: 1034 raise SystemExit(_("Couldn't parse rename source")) 1035 orig = PathQuoting.dequote(m.group(0)) 1036 new = rest[m.end()+1:] 1037 else: 1038 orig, new = rest.split(b' ', 1) 1039 if new.startswith(b'"'): 1040 new = PathQuoting.dequote(new) 1041 filechange = FileChange(b'R', orig, new) 1042 self._advance_currentline() 1043 return filechange 1044 1045 def _parse_original_id(self): 1046 original_id = self._currentline[len(b'original-oid '):].rstrip() 1047 self._advance_currentline() 1048 return original_id 1049 1050 def _parse_encoding(self): 1051 encoding = self._currentline[len(b'encoding '):].rstrip() 1052 self._advance_currentline() 1053 return encoding 1054 1055 def _parse_ref_line(self, refname): 1056 """ 1057 Parses string data (often a branch name) from current-line. The name of 1058 the string data must match the refname arg. The program will crash if 1059 current-line does not match, so current-line will always be advanced if 1060 this method returns. 1061 """ 1062 matches = self._refline_regexes[refname].match(self._currentline) 1063 if not matches: 1064 raise SystemExit(_("Malformed %(refname)s line: '%(line)s'") % 1065 ({'refname': refname, 'line':self._currentline}) 1066 ) # pragma: no cover 1067 ref = matches.group(1) 1068 self._advance_currentline() 1069 return ref 1070 1071 def _parse_user(self, usertype): 1072 """ 1073 Get user name, email, datestamp from current-line. Current-line will 1074 be advanced. 1075 """ 1076 user_regex = self._user_regexes[usertype] 1077 (name, email, when) = user_regex.match(self._currentline).groups() 1078 1079 self._advance_currentline() 1080 return (name, email, when) 1081 1082 def _parse_data(self): 1083 """ 1084 Reads data from _input. Current-line will be advanced until it is beyond 1085 the data. 1086 """ 1087 fields = self._currentline.split() 1088 assert fields[0] == b'data' 1089 size = int(fields[1]) 1090 data = self._input.read(size) 1091 self._advance_currentline() 1092 if self._currentline == b'\n': 1093 self._advance_currentline() 1094 return data 1095 1096 def _parse_blob(self): 1097 """ 1098 Parse input data into a Blob object. Once the Blob has been created, it 1099 will be handed off to the appropriate callbacks. Current-line will be 1100 advanced until it is beyond this blob's data. The Blob will be dumped 1101 to _output once everything else is done (unless it has been skipped by 1102 the callback). 1103 """ 1104 # Parse the Blob 1105 self._advance_currentline() 1106 id_ = self._parse_optional_mark() 1107 1108 original_id = None 1109 if self._currentline.startswith(b'original-oid'): 1110 original_id = self._parse_original_id(); 1111 1112 data = self._parse_data() 1113 if self._currentline == b'\n': 1114 self._advance_currentline() 1115 1116 # Create the blob 1117 blob = Blob(data, original_id) 1118 1119 # If fast-export text had a mark for this blob, need to make sure this 1120 # mark translates to the blob's true id. 1121 if id_: 1122 blob.old_id = id_ 1123 _IDS.record_rename(id_, blob.id) 1124 1125 # Call any user callback to allow them to use/modify the blob 1126 if self._blob_callback: 1127 self._blob_callback(blob) 1128 1129 # Now print the resulting blob 1130 if not blob.dumped: 1131 blob.dump(self._output) 1132 1133 def _parse_reset(self): 1134 """ 1135 Parse input data into a Reset object. Once the Reset has been created, 1136 it will be handed off to the appropriate callbacks. Current-line will 1137 be advanced until it is beyond the reset data. The Reset will be dumped 1138 to _output once everything else is done (unless it has been skipped by 1139 the callback). 1140 """ 1141 # Parse the Reset 1142 ref = self._parse_ref_line(b'reset') 1143 self._exported_refs.add(ref) 1144 ignoreme, from_ref = self._parse_optional_parent_ref(b'from') 1145 if self._currentline == b'\n': 1146 self._advance_currentline() 1147 1148 # fast-export likes to print extraneous resets that serve no purpose. 1149 # While we could continue processing such resets, that is a waste of 1150 # resources. Also, we want to avoid recording that this ref was 1151 # seen in such cases, since this ref could be rewritten to nothing. 1152 if not from_ref: 1153 self._latest_commit.pop(ref, None) 1154 self._latest_orig_commit.pop(ref, None) 1155 return 1156 1157 # Create the reset 1158 reset = Reset(ref, from_ref) 1159 1160 # Call any user callback to allow them to modify the reset 1161 if self._reset_callback: 1162 self._reset_callback(reset) 1163 1164 # Update metadata 1165 self._latest_commit[reset.ref] = reset.from_ref 1166 self._latest_orig_commit[reset.ref] = reset.from_ref 1167 1168 # Now print the resulting reset 1169 if not reset.dumped: 1170 self._imported_refs.add(reset.ref) 1171 reset.dump(self._output) 1172 1173 def _parse_commit(self): 1174 """ 1175 Parse input data into a Commit object. Once the Commit has been created, 1176 it will be handed off to the appropriate callbacks. Current-line will 1177 be advanced until it is beyond the commit data. The Commit will be dumped 1178 to _output once everything else is done (unless it has been skipped by 1179 the callback OR the callback has removed all file-changes from the commit). 1180 """ 1181 # Parse the Commit. This may look involved, but it's pretty simple; it only 1182 # looks bad because a commit object contains many pieces of data. 1183 branch = self._parse_ref_line(b'commit') 1184 self._exported_refs.add(branch) 1185 id_ = self._parse_optional_mark() 1186 1187 original_id = None 1188 if self._currentline.startswith(b'original-oid'): 1189 original_id = self._parse_original_id(); 1190 1191 author_name = None 1192 author_email = None 1193 if self._currentline.startswith(b'author'): 1194 (author_name, author_email, author_date) = self._parse_user(b'author') 1195 1196 (committer_name, committer_email, committer_date) = \ 1197 self._parse_user(b'committer') 1198 1199 if not author_name and not author_email: 1200 (author_name, author_email, author_date) = \ 1201 (committer_name, committer_email, committer_date) 1202 1203 encoding = None 1204 if self._currentline.startswith(b'encoding '): 1205 encoding = self._parse_encoding() 1206 1207 commit_msg = self._parse_data() 1208 1209 pinfo = [self._parse_optional_parent_ref(b'from')] 1210 # Due to empty pruning, we can have real 'from' and 'merge' lines that 1211 # due to commit rewriting map to a parent of None. We need to record 1212 # 'from' if its non-None, and we need to parse all 'merge' lines. 1213 while self._currentline.startswith(b'merge '): 1214 pinfo.append(self._parse_optional_parent_ref(b'merge')) 1215 orig_parents, parents = [list(tmp) for tmp in zip(*pinfo)] 1216 1217 # No parents is oddly represented as [None] instead of [], due to the 1218 # special 'from' handling. Convert it here to a more canonical form. 1219 if parents == [None]: 1220 parents = [] 1221 if orig_parents == [None]: 1222 orig_parents = [] 1223 1224 # fast-import format is kinda stupid in that it allows implicit parents 1225 # based on the branch name instead of requiring them to be specified by 1226 # 'from' directives. The only way to get no parent is by using a reset 1227 # directive first, which clears the latest_commit_for_this_branch tracking. 1228 if not orig_parents and self._latest_commit.get(branch): 1229 parents = [self._latest_commit[branch]] 1230 if not orig_parents and self._latest_orig_commit.get(branch): 1231 orig_parents = [self._latest_orig_commit[branch]] 1232 1233 # Get the list of file changes 1234 file_changes = [] 1235 file_change = self._parse_optional_filechange() 1236 had_file_changes = file_change is not None 1237 while file_change: 1238 if not (type(file_change) == bytes and file_change == b'skipped'): 1239 file_changes.append(file_change) 1240 file_change = self._parse_optional_filechange() 1241 if self._currentline == b'\n': 1242 self._advance_currentline() 1243 1244 # Okay, now we can finally create the Commit object 1245 commit = Commit(branch, 1246 author_name, author_email, author_date, 1247 committer_name, committer_email, committer_date, 1248 commit_msg, file_changes, parents, original_id, encoding) 1249 1250 # If fast-export text had a mark for this commit, need to make sure this 1251 # mark translates to the commit's true id. 1252 if id_: 1253 commit.old_id = id_ 1254 _IDS.record_rename(id_, commit.id) 1255 1256 # Call any user callback to allow them to modify the commit 1257 aux_info = {'orig_parents': orig_parents, 1258 'had_file_changes': had_file_changes} 1259 if self._commit_callback: 1260 self._commit_callback(commit, aux_info) 1261 1262 # Now print the resulting commit, or if prunable skip it 1263 self._latest_orig_commit[branch] = commit.id 1264 if not (commit.old_id or commit.id) in _SKIPPED_COMMITS: 1265 self._latest_commit[branch] = commit.id 1266 if not commit.dumped: 1267 self._imported_refs.add(commit.branch) 1268 commit.dump(self._output) 1269 1270 def _parse_tag(self): 1271 """ 1272 Parse input data into a Tag object. Once the Tag has been created, 1273 it will be handed off to the appropriate callbacks. Current-line will 1274 be advanced until it is beyond the tag data. The Tag will be dumped 1275 to _output once everything else is done (unless it has been skipped by 1276 the callback). 1277 """ 1278 # Parse the Tag 1279 tag = self._parse_ref_line(b'tag') 1280 self._exported_refs.add(b'refs/tags/'+tag) 1281 id_ = self._parse_optional_mark() 1282 ignoreme, from_ref = self._parse_optional_parent_ref(b'from') 1283 1284 original_id = None 1285 if self._currentline.startswith(b'original-oid'): 1286 original_id = self._parse_original_id(); 1287 1288 tagger_name, tagger_email, tagger_date = None, None, None 1289 if self._currentline.startswith(b'tagger'): 1290 (tagger_name, tagger_email, tagger_date) = self._parse_user(b'tagger') 1291 tag_msg = self._parse_data() 1292 if self._currentline == b'\n': 1293 self._advance_currentline() 1294 1295 # Create the tag 1296 tag = Tag(tag, from_ref, 1297 tagger_name, tagger_email, tagger_date, tag_msg, 1298 original_id) 1299 1300 # If fast-export text had a mark for this tag, need to make sure this 1301 # mark translates to the tag's true id. 1302 if id_: 1303 tag.old_id = id_ 1304 _IDS.record_rename(id_, tag.id) 1305 1306 # Call any user callback to allow them to modify the tag 1307 if self._tag_callback: 1308 self._tag_callback(tag) 1309 1310 # The tag might not point at anything that still exists (self.from_ref 1311 # will be None if the commit it pointed to and all its ancestors were 1312 # pruned due to being empty) 1313 if tag.from_ref: 1314 # Print out this tag's information 1315 if not tag.dumped: 1316 self._imported_refs.add(b'refs/tags/'+tag.ref) 1317 tag.dump(self._output) 1318 else: 1319 tag.skip() 1320 1321 def _parse_progress(self): 1322 """ 1323 Parse input data into a Progress object. Once the Progress has 1324 been created, it will be handed off to the appropriate 1325 callbacks. Current-line will be advanced until it is beyond the 1326 progress data. The Progress will be dumped to _output once 1327 everything else is done (unless it has been skipped by the callback). 1328 """ 1329 # Parse the Progress 1330 message = self._parse_ref_line(b'progress') 1331 if self._currentline == b'\n': 1332 self._advance_currentline() 1333 1334 # Create the progress message 1335 progress = Progress(message) 1336 1337 # Call any user callback to allow them to modify the progress messsage 1338 if self._progress_callback: 1339 self._progress_callback(progress) 1340 1341 # NOTE: By default, we do NOT print the progress message; git 1342 # fast-import would write it to fast_import_pipes which could mess with 1343 # our parsing of output from the 'ls' and 'get-mark' directives we send 1344 # to fast-import. If users want these messages, they need to process 1345 # and handle them in the appropriate callback above. 1346 1347 def _parse_checkpoint(self): 1348 """ 1349 Parse input data into a Checkpoint object. Once the Checkpoint has 1350 been created, it will be handed off to the appropriate 1351 callbacks. Current-line will be advanced until it is beyond the 1352 checkpoint data. The Checkpoint will be dumped to _output once 1353 everything else is done (unless it has been skipped by the callback). 1354 """ 1355 # Parse the Checkpoint 1356 self._advance_currentline() 1357 if self._currentline == b'\n': 1358 self._advance_currentline() 1359 1360 # Create the checkpoint 1361 checkpoint = Checkpoint() 1362 1363 # Call any user callback to allow them to drop the checkpoint 1364 if self._checkpoint_callback: 1365 self._checkpoint_callback(checkpoint) 1366 1367 # NOTE: By default, we do NOT print the checkpoint message; although it 1368 # we would only realistically get them with --stdin, the fact that we 1369 # are filtering makes me think the checkpointing is less likely to be 1370 # reasonable. In fact, I don't think it's necessary in general. If 1371 # users do want it, they should process it in the checkpoint_callback. 1372 1373 def _parse_literal_command(self): 1374 """ 1375 Parse literal command. Then just dump the line as is. 1376 """ 1377 # Create the literal command object 1378 command = LiteralCommand(self._currentline) 1379 self._advance_currentline() 1380 1381 # Now print the resulting literal command 1382 if not command.dumped: 1383 command.dump(self._output) 1384 1385 def insert(self, obj): 1386 assert not obj.dumped 1387 obj.dump(self._output) 1388 if type(obj) == Commit: 1389 self._imported_refs.add(obj.branch) 1390 elif type(obj) in (Reset, Tag): 1391 self._imported_refs.add(obj.ref) 1392 1393 def run(self, input, output): 1394 """ 1395 This method filters fast export output. 1396 """ 1397 # Set input. If no args provided, use stdin. 1398 self._input = input 1399 self._output = output 1400 1401 # Run over the input and do the filtering 1402 self._advance_currentline() 1403 while self._currentline: 1404 if self._currentline.startswith(b'blob'): 1405 self._parse_blob() 1406 elif self._currentline.startswith(b'reset'): 1407 self._parse_reset() 1408 elif self._currentline.startswith(b'commit'): 1409 self._parse_commit() 1410 elif self._currentline.startswith(b'tag'): 1411 self._parse_tag() 1412 elif self._currentline.startswith(b'progress'): 1413 self._parse_progress() 1414 elif self._currentline.startswith(b'checkpoint'): 1415 self._parse_checkpoint() 1416 elif self._currentline.startswith(b'feature'): 1417 self._parse_literal_command() 1418 elif self._currentline.startswith(b'option'): 1419 self._parse_literal_command() 1420 elif self._currentline.startswith(b'done'): 1421 if self._done_callback: 1422 self._done_callback() 1423 self._parse_literal_command() 1424 # Prevent confusion from others writing additional stuff that'll just 1425 # be ignored 1426 self._output.close() 1427 elif self._currentline.startswith(b'#'): 1428 self._parse_literal_command() 1429 elif self._currentline.startswith(b'get-mark') or \ 1430 self._currentline.startswith(b'cat-blob') or \ 1431 self._currentline.startswith(b'ls'): 1432 raise SystemExit(_("Unsupported command: '%s'") % self._currentline) 1433 else: 1434 raise SystemExit(_("Could not parse line: '%s'") % self._currentline) 1435 1436 def get_exported_and_imported_refs(self): 1437 return self._exported_refs, self._imported_refs 1438 1439def record_id_rename(old_id, new_id): 1440 """ 1441 Register a new translation 1442 """ 1443 handle_transitivity = True 1444 _IDS.record_rename(old_id, new_id, handle_transitivity) 1445 1446# Internal globals 1447_IDS = _IDs() 1448_SKIPPED_COMMITS = set() 1449HASH_TO_ID = {} 1450ID_TO_HASH = {} 1451 1452class SubprocessWrapper(object): 1453 @staticmethod 1454 def decodify(args): 1455 if type(args) == str: 1456 return args 1457 else: 1458 assert type(args) == list 1459 return [decode(x) if type(x)==bytes else x for x in args] 1460 1461 @staticmethod 1462 def call(*args, **kwargs): 1463 if 'cwd' in kwargs: 1464 kwargs['cwd'] = decode(kwargs['cwd']) 1465 return subprocess.call(SubprocessWrapper.decodify(*args), **kwargs) 1466 1467 @staticmethod 1468 def check_output(*args, **kwargs): 1469 if 'cwd' in kwargs: 1470 kwargs['cwd'] = decode(kwargs['cwd']) 1471 return subprocess.check_output(SubprocessWrapper.decodify(*args), **kwargs) 1472 1473 @staticmethod 1474 def check_call(*args, **kwargs): # pragma: no cover # used by filter-lamely 1475 if 'cwd' in kwargs: 1476 kwargs['cwd'] = decode(kwargs['cwd']) 1477 return subprocess.check_call(SubprocessWrapper.decodify(*args), **kwargs) 1478 1479 @staticmethod 1480 def Popen(*args, **kwargs): 1481 if 'cwd' in kwargs: 1482 kwargs['cwd'] = decode(kwargs['cwd']) 1483 return subprocess.Popen(SubprocessWrapper.decodify(*args), **kwargs) 1484 1485subproc = subprocess 1486if platform.system() == 'Windows' or 'PRETEND_UNICODE_ARGS' in os.environ: 1487 subproc = SubprocessWrapper 1488 1489class GitUtils(object): 1490 @staticmethod 1491 def get_commit_count(repo, *args): 1492 """ 1493 Return the number of commits that have been made on repo. 1494 """ 1495 if not args: 1496 args = ['--all'] 1497 if len(args) == 1 and isinstance(args[0], list): 1498 args = args[0] 1499 p = subproc.Popen(["git", "rev-list", "--count"] + args, 1500 stdout=subprocess.PIPE, stderr=subprocess.PIPE, 1501 cwd=repo) 1502 if p.wait() != 0: 1503 raise SystemExit(_("%s does not appear to be a valid git repository") 1504 % decode(repo)) 1505 return int(p.stdout.read()) 1506 1507 @staticmethod 1508 def get_total_objects(repo): 1509 """ 1510 Return the number of objects (both packed and unpacked) 1511 """ 1512 p1 = subproc.Popen(["git", "count-objects", "-v"], 1513 stdout=subprocess.PIPE, cwd=repo) 1514 lines = p1.stdout.read().splitlines() 1515 # Return unpacked objects + packed-objects 1516 return int(lines[0].split()[1]) + int(lines[2].split()[1]) 1517 1518 @staticmethod 1519 def is_repository_bare(repo_working_dir): 1520 out = subproc.check_output('git rev-parse --is-bare-repository'.split(), 1521 cwd=repo_working_dir) 1522 return (out.strip() == b'true') 1523 1524 @staticmethod 1525 def determine_git_dir(repo_working_dir): 1526 d = subproc.check_output('git rev-parse --git-dir'.split(), 1527 cwd=repo_working_dir).strip() 1528 if repo_working_dir==b'.' or d.startswith(b'/'): 1529 return d 1530 return os.path.join(repo_working_dir, d) 1531 1532 @staticmethod 1533 def get_refs(repo_working_dir): 1534 try: 1535 output = subproc.check_output('git show-ref'.split(), 1536 cwd=repo_working_dir) 1537 except subprocess.CalledProcessError as e: 1538 # If error code is 1, there just aren't any refs; i.e. new repo. 1539 # If error code is other than 1, some other error (e.g. not a git repo) 1540 if e.returncode != 1: 1541 raise SystemExit('fatal: {}'.format(e)) 1542 output = '' 1543 return dict(reversed(x.split()) for x in output.splitlines()) 1544 1545 @staticmethod 1546 def get_blob_sizes(quiet = False): 1547 blob_size_progress = ProgressWriter() 1548 num_blobs = 0 1549 processed_blobs_msg = _("Processed %d blob sizes") 1550 1551 # Get sizes of blobs by sha1 1552 cmd = '--batch-check=%(objectname) %(objecttype) ' + \ 1553 '%(objectsize) %(objectsize:disk)' 1554 cf = subproc.Popen(['git', 'cat-file', '--batch-all-objects', cmd], 1555 bufsize = -1, 1556 stdout = subprocess.PIPE) 1557 unpacked_size = {} 1558 packed_size = {} 1559 for line in cf.stdout: 1560 sha, objtype, objsize, objdisksize = line.split() 1561 objsize, objdisksize = int(objsize), int(objdisksize) 1562 if objtype == b'blob': 1563 unpacked_size[sha] = objsize 1564 packed_size[sha] = objdisksize 1565 num_blobs += 1 1566 if not quiet: 1567 blob_size_progress.show(processed_blobs_msg % num_blobs) 1568 cf.wait() 1569 if not quiet: 1570 blob_size_progress.finish() 1571 return unpacked_size, packed_size 1572 1573 @staticmethod 1574 def get_file_changes(repo, parent_hash, commit_hash): 1575 """ 1576 Return a FileChanges list with the differences between parent_hash 1577 and commit_hash 1578 """ 1579 file_changes = [] 1580 1581 cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash] 1582 output = subproc.check_output(cmd, cwd=repo) 1583 for line in output.splitlines(): 1584 fileinfo, path = line.split(b'\t', 1) 1585 if path.startswith(b'"'): 1586 path = PathQuoting.dequote(path) 1587 oldmode, mode, oldhash, newhash, changetype = fileinfo.split() 1588 if changetype == b'D': 1589 file_changes.append(FileChange(b'D', path)) 1590 elif changetype in (b'A', b'M', b'T'): 1591 identifier = HASH_TO_ID.get(newhash, newhash) 1592 file_changes.append(FileChange(b'M', path, identifier, mode)) 1593 else: # pragma: no cover 1594 raise SystemExit("Unknown change type for line {}".format(line)) 1595 1596 return file_changes 1597 1598 @staticmethod 1599 def print_my_version(): 1600 with open(__file__, 'br') as f: 1601 contents = f.read() 1602 # If people replaced @@LOCALEDIR@@ string to point at their local 1603 # directory, undo it so we can get original source version. 1604 contents = re.sub(br'\A#\!.*', 1605 br'#!/usr/bin/env python3', contents) 1606 contents = re.sub(br'(\("GIT_TEXTDOMAINDIR"\) or ").*"', 1607 br'\1@@LOCALEDIR@@"', contents) 1608 1609 cmd = 'git hash-object --stdin'.split() 1610 version = subproc.check_output(cmd, input=contents).strip() 1611 print(decode(version[0:12])) 1612 1613class FilteringOptions(object): 1614 default_replace_text = b'***REMOVED***' 1615 class AppendFilter(argparse.Action): 1616 def __call__(self, parser, namespace, values, option_string=None): 1617 user_path = values 1618 suffix = option_string[len('--path-'):] or 'match' 1619 if suffix.startswith('rename'): 1620 mod_type = 'rename' 1621 match_type = option_string[len('--path-rename-'):] or 'match' 1622 values = values.split(b':') 1623 if len(values) != 2: 1624 raise SystemExit(_("Error: --path-rename expects one colon in its" 1625 " argument: <old_name:new_name>.")) 1626 if values[0] and values[1] and not ( 1627 values[0].endswith(b'/') == values[1].endswith(b'/')): 1628 raise SystemExit(_("Error: With --path-rename, if OLD_NAME and " 1629 "NEW_NAME are both non-empty and either ends " 1630 "with a slash then both must.")) 1631 if any(v.startswith(b'/') for v in values): 1632 raise SystemExit(_("Error: Pathnames cannot begin with a '/'")) 1633 components = values[0].split(b'/') + values[1].split(b'/') 1634 else: 1635 mod_type = 'filter' 1636 match_type = suffix 1637 components = values.split(b'/') 1638 if values.startswith(b'/'): 1639 raise SystemExit(_("Error: Pathnames cannot begin with a '/'")) 1640 for illegal_path in [b'.', b'..']: 1641 if illegal_path in components: 1642 raise SystemExit(_("Error: Invalid path component '%s' found in '%s'") 1643 % (decode(illegal_path), decode(user_path))) 1644 if match_type == 'regex': 1645 values = re.compile(values) 1646 items = getattr(namespace, self.dest, []) or [] 1647 items.append((mod_type, match_type, values)) 1648 if (match_type, mod_type) == ('glob', 'filter'): 1649 if not values.endswith(b'*'): 1650 extension = b'*' if values.endswith(b'/') else b'/*' 1651 items.append((mod_type, match_type, values+extension)) 1652 setattr(namespace, self.dest, items) 1653 1654 class HelperFilter(argparse.Action): 1655 def __call__(self, parser, namespace, values, option_string=None): 1656 af = FilteringOptions.AppendFilter(dest='path_changes', 1657 option_strings=None) 1658 dirname = values if values[-1:] == b'/' else values+b'/' 1659 if option_string == '--subdirectory-filter': 1660 af(parser, namespace, dirname, '--path-match') 1661 af(parser, namespace, dirname+b':', '--path-rename') 1662 elif option_string == '--to-subdirectory-filter': 1663 af(parser, namespace, b':'+dirname, '--path-rename') 1664 else: 1665 raise SystemExit(_("Error: HelperFilter given invalid option_string: %s") 1666 % option_string) # pragma: no cover 1667 1668 class FileWithPathsFilter(argparse.Action): 1669 def __call__(self, parser, namespace, values, option_string=None): 1670 if not namespace.path_changes: 1671 namespace.path_changes = [] 1672 namespace.path_changes += FilteringOptions.get_paths_from_file(values) 1673 1674 @staticmethod 1675 def create_arg_parser(): 1676 # Include usage in the summary, so we can put the description first 1677 summary = _('''Rewrite (or analyze) repository history 1678 1679 git-filter-repo destructively rewrites history (unless --analyze or 1680 --dry-run are given) according to specified rules. It refuses to do any 1681 rewriting unless either run from a clean fresh clone, or --force was 1682 given. 1683 1684 Basic Usage: 1685 git-filter-repo --analyze 1686 git-filter-repo [FILTER/RENAME/CONTROL OPTIONS] 1687 1688 See EXAMPLES section for details. 1689 ''').rstrip() 1690 1691 # Provide a long helpful examples section 1692 example_text = _('''CALLBACKS 1693 1694 All callback functions are of the same general format. For a command line 1695 argument like 1696 --foo-callback 'BODY' 1697 1698 the following code will be compiled and called: 1699 def foo_callback(foo): 1700 BODY 1701 1702 Thus, to replace 'Jon' with 'John' in author/committer/tagger names: 1703 git filter-repo --name-callback 'return name.replace(b"Jon", b"John")' 1704 1705 To remove all 'Tested-by' tags in commit (or tag) messages: 1706 git filter-repo --message-callback 'return re.sub(br"\\nTested-by:.*", "", message)' 1707 1708 To remove all .DS_Store files: 1709 git filter-repo --filename-callback 'return None if os.path.basename(filename) == b".DS_Store" else filename' 1710 1711 Note that if BODY resolves to a filename, then the contents of that file 1712 will be used as the BODY in the callback function. 1713 1714 For more detailed examples and explanations AND caveats, see 1715 https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#CALLBACKS 1716 1717EXAMPLES 1718 1719 To get a bunch of reports mentioning renames that have occurred in 1720 your repo and listing sizes of objects aggregated by any of path, 1721 directory, extension, or blob-id: 1722 git filter-repo --analyze 1723 1724 (These reports can help you choose how to filter your repo; it can 1725 be useful to re-run this command after filtering to regenerate the 1726 report and verify the changes look correct.) 1727 1728 To extract the history that touched just 'guides' and 'tools/releases': 1729 git filter-repo --path guides/ --path tools/releases 1730 1731 To remove foo.zip and bar/baz/zips from every revision in history: 1732 git filter-repo --path foo.zip --path bar/baz/zips/ --invert-paths 1733 1734 To replace the text 'password' with 'p455w0rd': 1735 git filter-repo --replace-text <(echo "password==>p455w0rd") 1736 1737 To use the current version of the .mailmap file to update authors, 1738 committers, and taggers throughout history and make it permanent: 1739 git filter-repo --use-mailmap 1740 1741 To extract the history of 'src/', rename all files to have a new leading 1742 directory 'my-module' (e.g. src/foo.java -> my-module/src/foo.java), and 1743 add a 'my-module-' prefix to all tags: 1744 git filter-repo --path src/ --to-subdirectory-filter my-module --tag-rename '':'my-module-' 1745 1746 For more detailed examples and explanations, see 1747 https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#EXAMPLES''') 1748 1749 # Create the basic parser 1750 parser = argparse.ArgumentParser(description=summary, 1751 usage = argparse.SUPPRESS, 1752 add_help = False, 1753 epilog = example_text, 1754 formatter_class=argparse.RawDescriptionHelpFormatter) 1755 1756 analyze = parser.add_argument_group(title=_("Analysis")) 1757 analyze.add_argument('--analyze', action='store_true', 1758 help=_("Analyze repository history and create a report that may be " 1759 "useful in determining what to filter in a subsequent run. " 1760 "Will not modify your repo.")) 1761 analyze.add_argument('--report-dir', 1762 metavar='DIR_OR_FILE', 1763 type=os.fsencode, 1764 dest='report_dir', 1765 help=_("Directory to write report, defaults to GIT_DIR/filter_repo/analysis," 1766 "refuses to run if exists, --force delete existing dir first.")) 1767 1768 path = parser.add_argument_group(title=_("Filtering based on paths " 1769 "(see also --filename-callback)"), 1770 description=textwrap.dedent(_(""" 1771 These options specify the paths to select. Note that much like git 1772 itself, renames are NOT followed so you may need to specify multiple 1773 paths, e.g. `--path olddir/ --path newdir/` 1774 """[1:]))) 1775 1776 path.add_argument('--invert-paths', action='store_false', dest='inclusive', 1777 help=_("Invert the selection of files from the specified " 1778 "--path-{match,glob,regex} options below, i.e. only select " 1779 "files matching none of those options.")) 1780 1781 path.add_argument('--path-match', '--path', metavar='DIR_OR_FILE', 1782 type=os.fsencode, 1783 action=FilteringOptions.AppendFilter, dest='path_changes', 1784 help=_("Exact paths (files or directories) to include in filtered " 1785 "history. Multiple --path options can be specified to get " 1786 "a union of paths.")) 1787 path.add_argument('--path-glob', metavar='GLOB', type=os.fsencode, 1788 action=FilteringOptions.AppendFilter, dest='path_changes', 1789 help=_("Glob of paths to include in filtered history. Multiple " 1790 "--path-glob options can be specified to get a union of " 1791 "paths.")) 1792 path.add_argument('--path-regex', metavar='REGEX', type=os.fsencode, 1793 action=FilteringOptions.AppendFilter, dest='path_changes', 1794 help=_("Regex of paths to include in filtered history. Multiple " 1795 "--path-regex options can be specified to get a union of " 1796 "paths")) 1797 path.add_argument('--use-base-name', action='store_true', 1798 help=_("Match on file base name instead of full path from the top " 1799 "of the repo. Incompatible with --path-rename, and " 1800 "incompatible with matching against directory names.")) 1801 1802 rename = parser.add_argument_group(title=_("Renaming based on paths " 1803 "(see also --filename-callback)")) 1804 rename.add_argument('--path-rename', '--path-rename-match', 1805 metavar='OLD_NAME:NEW_NAME', dest='path_changes', type=os.fsencode, 1806 action=FilteringOptions.AppendFilter, 1807 help=_("Path to rename; if filename or directory matches OLD_NAME " 1808 "rename to NEW_NAME. Multiple --path-rename options can be " 1809 "specified. NOTE: If you combine filtering options with " 1810 "renaming ones, do not rely on a rename argument to select " 1811 "paths; you also need a filter to select them.")) 1812 1813 helpers = parser.add_argument_group(title=_("Path shortcuts")) 1814 helpers.add_argument('--paths-from-file', metavar='FILENAME', 1815 type=os.fsencode, 1816 action=FilteringOptions.FileWithPathsFilter, dest='path_changes', 1817 help=_("Specify several path filtering and renaming directives, one " 1818 "per line. Lines with '==>' in them specify path renames, " 1819 "and lines can begin with 'literal:' (the default), 'glob:', " 1820 "or 'regex:' to specify different matching styles. Blank " 1821 "lines and lines starting with a '#' are ignored.")) 1822 helpers.add_argument('--subdirectory-filter', metavar='DIRECTORY', 1823 action=FilteringOptions.HelperFilter, type=os.fsencode, 1824 help=_("Only look at history that touches the given subdirectory " 1825 "and treat that directory as the project root. Equivalent " 1826 "to using '--path DIRECTORY/ --path-rename DIRECTORY/:'")) 1827 helpers.add_argument('--to-subdirectory-filter', metavar='DIRECTORY', 1828 action=FilteringOptions.HelperFilter, type=os.fsencode, 1829 help=_("Treat the project root as instead being under DIRECTORY. " 1830 "Equivalent to using '--path-rename :DIRECTORY/'")) 1831 1832 contents = parser.add_argument_group(title=_("Content editing filters " 1833 "(see also --blob-callback)")) 1834 contents.add_argument('--replace-text', metavar='EXPRESSIONS_FILE', 1835 help=_("A file with expressions that, if found, will be replaced. " 1836 "By default, each expression is treated as literal text, " 1837 "but 'regex:' and 'glob:' prefixes are supported. You can " 1838 "end the line with '==>' and some replacement text to " 1839 "choose a replacement choice other than the default of '{}'." 1840 .format(decode(FilteringOptions.default_replace_text)))) 1841 contents.add_argument('--strip-blobs-bigger-than', metavar='SIZE', 1842 dest='max_blob_size', default=0, 1843 help=_("Strip blobs (files) bigger than specified size (e.g. '5M', " 1844 "'2G', etc)")) 1845 contents.add_argument('--strip-blobs-with-ids', metavar='BLOB-ID-FILENAME', 1846 help=_("Read git object ids from each line of the given file, and " 1847 "strip all of them from history")) 1848 1849 refrename = parser.add_argument_group(title=_("Renaming of refs " 1850 "(see also --refname-callback)")) 1851 refrename.add_argument('--tag-rename', metavar='OLD:NEW', type=os.fsencode, 1852 help=_("Rename tags starting with OLD to start with NEW. For " 1853 "example, --tag-rename foo:bar will rename tag foo-1.2.3 " 1854 "to bar-1.2.3; either OLD or NEW can be empty.")) 1855 1856 messages = parser.add_argument_group(title=_("Filtering of commit messages " 1857 "(see also --message-callback)")) 1858 messages.add_argument('--replace-message', metavar='EXPRESSIONS_FILE', 1859 help=_("A file with expressions that, if found in commit messages, " 1860 "will be replaced. This file uses the same syntax as " 1861 "--replace-text.")) 1862 messages.add_argument('--preserve-commit-hashes', action='store_true', 1863 help=_("By default, since commits are rewritten and thus gain new " 1864 "hashes, references to old commit hashes in commit messages " 1865 "are replaced with new commit hashes (abbreviated to the same " 1866 "length as the old reference). Use this flag to turn off " 1867 "updating commit hashes in commit messages.")) 1868 messages.add_argument('--preserve-commit-encoding', action='store_true', 1869 help=_("Do not reencode commit messages into UTF-8. By default, if " 1870 "the commit object specifies an encoding for the commit " 1871 "message, the message is re-encoded into UTF-8.")) 1872 1873 people = parser.add_argument_group(title=_("Filtering of names & emails " 1874 "(see also --name-callback " 1875 "and --email-callback)")) 1876 people.add_argument('--mailmap', dest='mailmap', metavar='FILENAME', 1877 type=os.fsencode, 1878 help=_("Use specified mailmap file (see git-shortlog(1) for " 1879 "details on the format) when rewriting author, committer, " 1880 "and tagger names and emails. If the specified file is " 1881 "part of git history, historical versions of the file will " 1882 "be ignored; only the current contents are consulted.")) 1883 people.add_argument('--use-mailmap', dest='mailmap', 1884 action='store_const', const=b'.mailmap', 1885 help=_("Same as: '--mailmap .mailmap' ")) 1886 1887 parents = parser.add_argument_group(title=_("Parent rewriting")) 1888 parents.add_argument('--replace-refs', default=None, 1889 choices=['delete-no-add', 'delete-and-add', 1890 'update-no-add', 'update-or-add', 1891 'update-and-add'], 1892 help=_("Replace refs (see git-replace(1)) are used to rewrite " 1893 "parents (unless turned off by the usual git mechanism); this " 1894 "flag specifies what do do with those refs afterward. " 1895 "Replace refs can either be deleted or updated to point at new " 1896 "commit hashes. Also, new replace refs can be added for each " 1897 "commit rewrite. With 'update-or-add', new replace refs are " 1898 "only added for commit rewrites that aren't used to update an " 1899 "existing replace ref. default is 'update-and-add' if " 1900 "$GIT_DIR/filter-repo/already_ran does not exist; " 1901 "'update-or-add' otherwise.")) 1902 parents.add_argument('--prune-empty', default='auto', 1903 choices=['always', 'auto', 'never'], 1904 help=_("Whether to prune empty commits. 'auto' (the default) means " 1905 "only prune commits which become empty (not commits which were " 1906 "empty in the original repo, unless their parent was pruned). " 1907 "When the parent of a commit is pruned, the first non-pruned " 1908 "ancestor becomes the new parent.")) 1909 parents.add_argument('--prune-degenerate', default='auto', 1910 choices=['always', 'auto', 'never'], 1911 help=_("Since merge commits are needed for history topology, they " 1912 "are typically exempt from pruning. However, they can become " 1913 "degenerate with the pruning of other commits (having fewer " 1914 "than two parents, having one commit serve as both parents, or " 1915 "having one parent as the ancestor of the other.) If such " 1916 "merge commits have no file changes, they can be pruned. The " 1917 "default ('auto') is to only prune empty merge commits which " 1918 "become degenerate (not which started as such).")) 1919 parents.add_argument('--no-ff', action='store_true', 1920 help=_("Even if the first parent is or becomes an ancestor of another " 1921 "parent, do not prune it. This modifies how " 1922 "--prune-degenerate behaves, and may be useful in projects who " 1923 "always use merge --no-ff.")) 1924 1925 callback = parser.add_argument_group(title=_("Generic callback code snippets")) 1926 callback.add_argument('--filename-callback', metavar="FUNCTION_BODY_OR_FILE", 1927 help=_("Python code body for processing filenames; see CALLBACKS " 1928 "sections below.")) 1929 callback.add_argument('--message-callback', metavar="FUNCTION_BODY_OR_FILE", 1930 help=_("Python code body for processing messages (both commit " 1931 "messages and tag messages); see CALLBACKS section below.")) 1932 callback.add_argument('--name-callback', metavar="FUNCTION_BODY_OR_FILE", 1933 help=_("Python code body for processing names of people; see " 1934 "CALLBACKS section below.")) 1935 callback.add_argument('--email-callback', metavar="FUNCTION_BODY_OR_FILE", 1936 help=_("Python code body for processing emails addresses; see " 1937 "CALLBACKS section below.")) 1938 callback.add_argument('--refname-callback', metavar="FUNCTION_BODY_OR_FILE", 1939 help=_("Python code body for processing refnames; see CALLBACKS " 1940 "section below.")) 1941 1942 callback.add_argument('--blob-callback', metavar="FUNCTION_BODY_OR_FILE", 1943 help=_("Python code body for processing blob objects; see " 1944 "CALLBACKS section below.")) 1945 callback.add_argument('--commit-callback', metavar="FUNCTION_BODY_OR_FILE", 1946 help=_("Python code body for processing commit objects; see " 1947 "CALLBACKS section below.")) 1948 callback.add_argument('--tag-callback', metavar="FUNCTION_BODY_OR_FILE", 1949 help=_("Python code body for processing tag objects; see CALLBACKS " 1950 "section below.")) 1951 callback.add_argument('--reset-callback', metavar="FUNCTION_BODY_OR_FILE", 1952 help=_("Python code body for processing reset objects; see " 1953 "CALLBACKS section below.")) 1954 1955 desc = _( 1956 "Specifying alternate source or target locations implies --partial,\n" 1957 "except that the normal default for --replace-refs is used. However,\n" 1958 "unlike normal uses of --partial, this doesn't risk mixing old and new\n" 1959 "history since the old and new histories are in different repositories.") 1960 location = parser.add_argument_group(title=_("Location to filter from/to"), 1961 description=desc) 1962 location.add_argument('--source', type=os.fsencode, 1963 help=_("Git repository to read from")) 1964 location.add_argument('--target', type=os.fsencode, 1965 help=_("Git repository to overwrite with filtered history")) 1966 1967 misc = parser.add_argument_group(title=_("Miscellaneous options")) 1968 misc.add_argument('--help', '-h', action='store_true', 1969 help=_("Show this help message and exit.")) 1970 misc.add_argument('--version', action='store_true', 1971 help=_("Display filter-repo's version and exit.")) 1972 misc.add_argument('--force', '-f', action='store_true', 1973 help=_("Rewrite repository history even if the current repo does not " 1974 "look like a fresh clone. History rewriting is irreversible " 1975 "(and includes immediate pruning of reflogs and old objects), " 1976 "so be cautious about using this flag.")) 1977 misc.add_argument('--partial', action='store_true', 1978 help=_("Do a partial history rewrite, resulting in the mixture of " 1979 "old and new history. This implies a default of " 1980 "update-no-add for --replace-refs, disables rewriting " 1981 "refs/remotes/origin/* to refs/heads/*, disables removing " 1982 "of the 'origin' remote, disables removing unexported refs, " 1983 "disables expiring the reflog, and disables the automatic " 1984 "post-filter gc. Also, this modifies --tag-rename and " 1985 "--refname-callback options such that instead of replacing " 1986 "old refs with new refnames, it will instead create new " 1987 "refs and keep the old ones around. Use with caution.")) 1988 # WARNING: --refs presents a problem with become-degenerate pruning: 1989 # * Excluding a commit also excludes its ancestors so when some other 1990 # commit has an excluded ancestor as a parent we have no way of 1991 # knowing what it is an ancestor of without doing a special 1992 # full-graph walk. 1993 misc.add_argument('--refs', nargs='+', 1994 help=_("Limit history rewriting to the specified refs. Implies " 1995 "--partial. In addition to the normal caveats of --partial " 1996 "(mixing old and new history, no automatic remapping of " 1997 "refs/remotes/origin/* to refs/heads/*, etc.), this also may " 1998 "cause problems for pruning of degenerate empty merge " 1999 "commits when negative revisions are specified.")) 2000 2001 misc.add_argument('--dry-run', action='store_true', 2002 help=_("Do not change the repository. Run `git fast-export` and " 2003 "filter its output, and save both the original and the " 2004 "filtered version for comparison. This also disables " 2005 "rewriting commit messages due to not knowing new commit " 2006 "IDs and disables filtering of some empty commits due to " 2007 "inability to query the fast-import backend." )) 2008 misc.add_argument('--debug', action='store_true', 2009 help=_("Print additional information about operations being " 2010 "performed and commands being run. When used together " 2011 "with --dry-run, also show extra information about what " 2012 "would be run.")) 2013 # WARNING: --state-branch has some problems: 2014 # * It does not work well with manually inserted objects (user creating 2015 # Blob() or Commit() or Tag() objects and calling 2016 # RepoFilter.insert(obj) on them). 2017 # * It does not work well with multiple source or multiple target repos 2018 # * It doesn't work so well with pruning become-empty commits (though 2019 # --refs doesn't work so well with it either) 2020 # These are probably fixable, given some work (e.g. re-importing the 2021 # graph at the beginning to get the AncestryGraph right, doing our own 2022 # export of marks instead of using fast-export --export-marks, etc.), but 2023 # for now just hide the option. 2024 misc.add_argument('--state-branch', 2025 #help=_("Enable incremental filtering by saving the mapping of old " 2026 # "to new objects to the specified branch upon exit, and" 2027 # "loading that mapping from that branch (if it exists) " 2028 # "upon startup.")) 2029 help=argparse.SUPPRESS) 2030 misc.add_argument('--stdin', action='store_true', 2031 help=_("Instead of running `git fast-export` and filtering its " 2032 "output, filter the fast-export stream from stdin. The " 2033 "stdin must be in the expected input format (e.g. it needs " 2034 "to include original-oid directives).")) 2035 misc.add_argument('--quiet', action='store_true', 2036 help=_("Pass --quiet to other git commands called")) 2037 return parser 2038 2039 @staticmethod 2040 def sanity_check_args(args): 2041 if args.analyze and args.path_changes: 2042 raise SystemExit(_("Error: --analyze is incompatible with --path* flags; " 2043 "it's a read-only operation.")) 2044 if args.analyze and args.stdin: 2045 raise SystemExit(_("Error: --analyze is incompatible with --stdin.")) 2046 # If no path_changes are found, initialize with empty list but mark as 2047 # not inclusive so that all files match 2048 if args.path_changes == None: 2049 args.path_changes = [] 2050 args.inclusive = False 2051 else: 2052 # Similarly, if we have no filtering paths, then no path should be 2053 # filtered out. Based on how newname() works, the easiest way to 2054 # achieve that is setting args.inclusive to False. 2055 if not any(x[0] == 'filter' for x in args.path_changes): 2056 args.inclusive = False 2057 # Also check for incompatible --use-base-name and --path-rename flags. 2058 if args.use_base_name: 2059 if any(x[0] == 'rename' for x in args.path_changes): 2060 raise SystemExit(_("Error: --use-base-name and --path-rename are " 2061 "incompatible.")) 2062 # Also throw some sanity checks on git version here; 2063 # PERF: remove these checks once new enough git versions are common 2064 p = subproc.Popen('git fast-export -h'.split(), 2065 stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 2066 output = p.stdout.read() 2067 if b'--anonymize-map' not in output: # pragma: no cover 2068 global date_format_permissive 2069 date_format_permissive = False 2070 if b'--mark-tags' not in output: # pragma: no cover 2071 global write_marks 2072 write_marks = False 2073 if args.state_branch: 2074 # We need a version of git-fast-export with --mark-tags 2075 raise SystemExit(_("Error: need git >= 2.24.0")) 2076 if b'--reencode' not in output: # pragma: no cover 2077 if args.preserve_commit_encoding: 2078 # We need a version of git-fast-export with --reencode 2079 raise SystemExit(_("Error: need git >= 2.23.0")) 2080 else: 2081 # Set args.preserve_commit_encoding to None which we'll check for later 2082 # to avoid passing --reencode=yes to fast-export (that option was the 2083 # default prior to git-2.23) 2084 args.preserve_commit_encoding = None 2085 # If we don't have fast-exoprt --reencode, we may also be missing 2086 # diff-tree --combined-all-paths, which is even more important... 2087 p = subproc.Popen('git diff-tree -h'.split(), 2088 stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 2089 output = p.stdout.read() 2090 if b'--combined-all-paths' not in output: 2091 # We need a version of git-diff-tree with --combined-all-paths 2092 raise SystemExit(_("Error: need git >= 2.22.0")) 2093 # End of sanity checks on git version 2094 if args.max_blob_size: 2095 suffix = args.max_blob_size[-1] 2096 if suffix not in '1234567890': 2097 mult = {'K': 1024, 'M': 1024**2, 'G': 1024**3} 2098 if suffix not in mult: 2099 raise SystemExit(_("Error: could not parse --strip-blobs-bigger-than" 2100 " argument %s") 2101 % args.max_blob_size) 2102 args.max_blob_size = int(args.max_blob_size[0:-1]) * mult[suffix] 2103 else: 2104 args.max_blob_size = int(args.max_blob_size) 2105 2106 @staticmethod 2107 def get_replace_text(filename): 2108 replace_literals = [] 2109 replace_regexes = [] 2110 with open(filename, 'br') as f: 2111 for line in f: 2112 line = line.rstrip(b'\r\n') 2113 2114 # Determine the replacement 2115 replacement = FilteringOptions.default_replace_text 2116 if b'==>' in line: 2117 line, replacement = line.rsplit(b'==>', 1) 2118 2119 # See if we need to match via regex 2120 regex = None 2121 if line.startswith(b'regex:'): 2122 regex = line[6:] 2123 elif line.startswith(b'glob:'): 2124 regex = glob_to_regex(line[5:]) 2125 if regex: 2126 replace_regexes.append((re.compile(regex), replacement)) 2127 else: 2128 # Otherwise, find the literal we need to replace 2129 if line.startswith(b'literal:'): 2130 line = line[8:] 2131 if not line: 2132 continue 2133 replace_literals.append((line, replacement)) 2134 return {'literals': replace_literals, 'regexes': replace_regexes} 2135 2136 @staticmethod 2137 def get_paths_from_file(filename): 2138 new_path_changes = [] 2139 with open(filename, 'br') as f: 2140 for line in f: 2141 line = line.rstrip(b'\r\n') 2142 2143 # Skip blank lines 2144 if not line: 2145 continue 2146 # Skip comment lines 2147 if line.startswith(b'#'): 2148 continue 2149 2150 # Determine the replacement 2151 match_type, repl = 'literal', None 2152 if b'==>' in line: 2153 line, repl = line.rsplit(b'==>', 1) 2154 2155 # See if we need to match via regex 2156 match_type = 'match' # a.k.a. 'literal' 2157 if line.startswith(b'regex:'): 2158 match_type = 'regex' 2159 match = re.compile(line[6:]) 2160 elif line.startswith(b'glob:'): 2161 match_type = 'glob' 2162 match = line[5:] 2163 if repl: 2164 raise SystemExit(_("Error: In %s, 'glob:' and '==>' are incompatible (renaming globs makes no sense)" % decode(filename))) 2165 else: 2166 if line.startswith(b'literal:'): 2167 match = line[8:] 2168 else: 2169 match = line 2170 if repl is not None: 2171 if match and repl and match.endswith(b'/') != repl.endswith(b'/'): 2172 raise SystemExit(_("Error: When rename directories, if OLDNAME " 2173 "and NEW_NAME are both non-empty and either " 2174 "ends with a slash then both must.")) 2175 2176 # Record the filter or rename 2177 if repl is not None: 2178 new_path_changes.append(['rename', match_type, (match, repl)]) 2179 else: 2180 new_path_changes.append(['filter', match_type, match]) 2181 if match_type == 'glob' and not match.endswith(b'*'): 2182 extension = b'*' if match.endswith(b'/') else b'/*' 2183 new_path_changes.append(['filter', match_type, match+extension]) 2184 return new_path_changes 2185 2186 @staticmethod 2187 def default_options(): 2188 return FilteringOptions.parse_args([], error_on_empty = False) 2189 2190 @staticmethod 2191 def parse_args(input_args, error_on_empty = True): 2192 parser = FilteringOptions.create_arg_parser() 2193 if not input_args and error_on_empty: 2194 parser.print_usage() 2195 raise SystemExit(_("No arguments specified.")) 2196 args = parser.parse_args(input_args) 2197 if args.help: 2198 parser.print_help() 2199 raise SystemExit() 2200 if args.version: 2201 GitUtils.print_my_version() 2202 raise SystemExit() 2203 FilteringOptions.sanity_check_args(args) 2204 if args.mailmap: 2205 args.mailmap = MailmapInfo(args.mailmap) 2206 if args.replace_text: 2207 args.replace_text = FilteringOptions.get_replace_text(args.replace_text) 2208 if args.replace_message: 2209 args.replace_message = FilteringOptions.get_replace_text(args.replace_message) 2210 if args.strip_blobs_with_ids: 2211 with open(args.strip_blobs_with_ids, 'br') as f: 2212 args.strip_blobs_with_ids = set(f.read().split()) 2213 else: 2214 args.strip_blobs_with_ids = set() 2215 if (args.partial or args.refs) and not args.replace_refs: 2216 args.replace_refs = 'update-no-add' 2217 args.repack = not (args.partial or args.refs) 2218 if args.refs or args.source or args.target: 2219 args.partial = True 2220 if not args.refs: 2221 args.refs = ['--all'] 2222 return args 2223 2224class RepoAnalyze(object): 2225 2226 # First, several helper functions for analyze_commit() 2227 2228 @staticmethod 2229 def equiv_class(stats, filename): 2230 return stats['equivalence'].get(filename, (filename,)) 2231 2232 @staticmethod 2233 def setup_equivalence_for_rename(stats, oldname, newname): 2234 # if A is renamed to B and B is renamed to C, then the user thinks of 2235 # A, B, and C as all being different names for the same 'file'. We record 2236 # this as an equivalence class: 2237 # stats['equivalence'][name] = (A,B,C) 2238 # for name being each of A, B, and C. 2239 old_tuple = stats['equivalence'].get(oldname, ()) 2240 if newname in old_tuple: 2241 return 2242 elif old_tuple: 2243 new_tuple = tuple(list(old_tuple)+[newname]) 2244 else: 2245 new_tuple = (oldname, newname) 2246 for f in new_tuple: 2247 stats['equivalence'][f] = new_tuple 2248 2249 @staticmethod 2250 def setup_or_update_rename_history(stats, commit, oldname, newname): 2251 rename_commits = stats['rename_history'].get(oldname, set()) 2252 rename_commits.add(commit) 2253 stats['rename_history'][oldname] = rename_commits 2254 2255 @staticmethod 2256 def handle_renames(stats, commit, change_types, filenames): 2257 for index, change_type in enumerate(change_types): 2258 if change_type == ord(b'R'): 2259 oldname, newname = filenames[index], filenames[-1] 2260 RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname) 2261 RepoAnalyze.setup_or_update_rename_history(stats, commit, 2262 oldname, newname) 2263 2264 @staticmethod 2265 def handle_file(stats, graph, commit, modes, shas, filenames): 2266 mode, sha, filename = modes[-1], shas[-1], filenames[-1] 2267 2268 # Figure out kind of deletions to undo for this file, and update lists 2269 # of all-names-by-sha and all-filenames 2270 delmode = 'tree_deletions' 2271 if mode != b'040000': 2272 delmode = 'file_deletions' 2273 stats['names'][sha].add(filename) 2274 stats['allnames'].add(filename) 2275 2276 # If the file (or equivalence class of files) was recorded as deleted, 2277 # clearly it isn't anymore 2278 equiv = RepoAnalyze.equiv_class(stats, filename) 2279 for f in equiv: 2280 stats[delmode].pop(f, None) 2281 2282 # If we get a modify/add for a path that was renamed, we may need to break 2283 # the equivalence class. However, if the modify/add was on a branch that 2284 # doesn't have the rename in its history, we are still okay. 2285 need_to_break_equivalence = False 2286 if equiv[-1] != filename: 2287 for rename_commit in stats['rename_history'][filename]: 2288 if graph.is_ancestor(rename_commit, commit): 2289 need_to_break_equivalence = True 2290 2291 if need_to_break_equivalence: 2292 for f in equiv: 2293 if f in stats['equivalence']: 2294 del stats['equivalence'][f] 2295 2296 @staticmethod 2297 def analyze_commit(stats, graph, commit, parents, date, file_changes): 2298 graph.add_commit_and_parents(commit, parents) 2299 for change in file_changes: 2300 modes, shas, change_types, filenames = change 2301 if len(parents) == 1 and change_types.startswith(b'R'): 2302 change_types = b'R' # remove the rename score; we don't care 2303 if modes[-1] == b'160000': 2304 continue 2305 elif modes[-1] == b'000000': 2306 # Track when files/directories are deleted 2307 for f in RepoAnalyze.equiv_class(stats, filenames[-1]): 2308 if any(x == b'040000' for x in modes[0:-1]): 2309 stats['tree_deletions'][f] = date 2310 else: 2311 stats['file_deletions'][f] = date 2312 elif change_types.strip(b'AMT') == b'': 2313 RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) 2314 elif modes[-1] == b'040000' and change_types.strip(b'RAM') == b'': 2315 RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) 2316 elif change_types.strip(b'RAMT') == b'': 2317 RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) 2318 RepoAnalyze.handle_renames(stats, commit, change_types, filenames) 2319 else: 2320 raise SystemExit(_("Unhandled change type(s): %(change_type)s " 2321 "(in commit %(commit)s)") 2322 % ({'change_type': change_types, 'commit': commit}) 2323 ) # pragma: no cover 2324 2325 @staticmethod 2326 def gather_data(args): 2327 unpacked_size, packed_size = GitUtils.get_blob_sizes() 2328 stats = {'names': collections.defaultdict(set), 2329 'allnames' : set(), 2330 'file_deletions': {}, 2331 'tree_deletions': {}, 2332 'equivalence': {}, 2333 'rename_history': collections.defaultdict(set), 2334 'unpacked_size': unpacked_size, 2335 'packed_size': packed_size, 2336 'num_commits': 0} 2337 2338 # Setup the rev-list/diff-tree process 2339 processed_commits_msg = _("Processed %d commits") 2340 commit_parse_progress = ProgressWriter() 2341 num_commits = 0 2342 cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) + 2343 ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' + 2344 ' --date=short -M -t -c --raw --combined-all-paths') 2345 dtp = subproc.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE) 2346 f = dtp.stdout 2347 line = f.readline() 2348 if not line: 2349 raise SystemExit(_("Nothing to analyze; repository is empty.")) 2350 cont = bool(line) 2351 graph = AncestryGraph() 2352 while cont: 2353 commit = line.rstrip() 2354 parents = f.readline().split() 2355 date = f.readline().rstrip() 2356 2357 # We expect a blank line next; if we get a non-blank line then 2358 # this commit modified no files and we need to move on to the next. 2359 # If there is no line, we've reached end-of-input. 2360 line = f.readline() 2361 if not line: 2362 cont = False 2363 line = line.rstrip() 2364 2365 # If we haven't reached end of input, and we got a blank line meaning 2366 # a commit that has modified files, then get the file changes associated 2367 # with this commit. 2368 file_changes = [] 2369 if cont and not line: 2370 cont = False 2371 for line in f: 2372 if not line.startswith(b':'): 2373 cont = True 2374 break 2375 n = 1+max(1, len(parents)) 2376 assert line.startswith(b':'*(n-1)) 2377 relevant = line[n-1:-1] 2378 splits = relevant.split(None, n) 2379 modes = splits[0:n] 2380 splits = splits[n].split(None, n) 2381 shas = splits[0:n] 2382 splits = splits[n].split(b'\t') 2383 change_types = splits[0] 2384 filenames = [PathQuoting.dequote(x) for x in splits[1:]] 2385 file_changes.append([modes, shas, change_types, filenames]) 2386 2387 # If someone is trying to analyze a subset of the history, make sure 2388 # to avoid dying on commits with parents that we haven't seen before 2389 if args.refs: 2390 graph.record_external_commits([p for p in parents 2391 if not p in graph.value]) 2392 2393 # Analyze this commit and update progress 2394 RepoAnalyze.analyze_commit(stats, graph, commit, parents, date, 2395 file_changes) 2396 num_commits += 1 2397 commit_parse_progress.show(processed_commits_msg % num_commits) 2398 2399 # Show the final commits processed message and record the number of commits 2400 commit_parse_progress.finish() 2401 stats['num_commits'] = num_commits 2402 2403 # Close the output, ensure rev-list|diff-tree pipeline completed successfully 2404 dtp.stdout.close() 2405 if dtp.wait(): 2406 raise SystemExit(_("Error: rev-list|diff-tree pipeline failed; see above.")) # pragma: no cover 2407 2408 return stats 2409 2410 @staticmethod 2411 def write_report(reportdir, stats): 2412 def datestr(datetimestr): 2413 return datetimestr if datetimestr else _('<present>').encode() 2414 2415 def dirnames(path): 2416 while True: 2417 path = os.path.dirname(path) 2418 yield path 2419 if path == b'': 2420 break 2421 2422 # Compute aggregate size information for paths, extensions, and dirs 2423 total_size = {'packed': 0, 'unpacked': 0} 2424 path_size = {'packed': collections.defaultdict(int), 2425 'unpacked': collections.defaultdict(int)} 2426 ext_size = {'packed': collections.defaultdict(int), 2427 'unpacked': collections.defaultdict(int)} 2428 dir_size = {'packed': collections.defaultdict(int), 2429 'unpacked': collections.defaultdict(int)} 2430 for sha in stats['names']: 2431 size = {'packed': stats['packed_size'][sha], 2432 'unpacked': stats['unpacked_size'][sha]} 2433 for which in ('packed', 'unpacked'): 2434 for name in stats['names'][sha]: 2435 total_size[which] += size[which] 2436 path_size[which][name] += size[which] 2437 basename, ext = os.path.splitext(name) 2438 ext_size[which][ext] += size[which] 2439 for dirname in dirnames(name): 2440 dir_size[which][dirname] += size[which] 2441 2442 # Determine if and when extensions and directories were deleted 2443 ext_deleted_data = {} 2444 for name in stats['allnames']: 2445 when = stats['file_deletions'].get(name, None) 2446 2447 # Update the extension 2448 basename, ext = os.path.splitext(name) 2449 if when is None: 2450 ext_deleted_data[ext] = None 2451 elif ext in ext_deleted_data: 2452 if ext_deleted_data[ext] is not None: 2453 ext_deleted_data[ext] = max(ext_deleted_data[ext], when) 2454 else: 2455 ext_deleted_data[ext] = when 2456 2457 dir_deleted_data = {} 2458 for name in dir_size['packed']: 2459 dir_deleted_data[name] = stats['tree_deletions'].get(name, None) 2460 2461 with open(os.path.join(reportdir, b"README"), 'bw') as f: 2462 # Give a basic overview of this file 2463 f.write(b"== %s ==\n" % _("Overall Statistics").encode()) 2464 f.write((" %s: %d\n" % (_("Number of commits"), 2465 stats['num_commits'])).encode()) 2466 f.write((" %s: %d\n" % (_("Number of filenames"), 2467 len(path_size['packed']))).encode()) 2468 f.write((" %s: %d\n" % (_("Number of directories"), 2469 len(dir_size['packed']))).encode()) 2470 f.write((" %s: %d\n" % (_("Number of file extensions"), 2471 len(ext_size['packed']))).encode()) 2472 f.write(b"\n") 2473 f.write((" %s: %d\n" % (_("Total unpacked size (bytes)"), 2474 total_size['unpacked'])).encode()) 2475 f.write((" %s: %d\n" % (_("Total packed size (bytes)"), 2476 total_size['packed'])).encode()) 2477 f.write(b"\n") 2478 2479 # Mention issues with the report 2480 f.write(("== %s ==\n" % _("Caveats")).encode()) 2481 f.write(("=== %s ===\n" % _("Sizes")).encode()) 2482 f.write(textwrap.dedent(_(""" 2483 Packed size represents what size your repository would be if no 2484 trees, commits, tags, or other metadata were included (though it may 2485 fail to represent de-duplication; see below). It also represents the 2486 current packing, which may be suboptimal if you haven't gc'ed for a 2487 while. 2488 2489 Unpacked size represents what size your repository would be if no 2490 trees, commits, tags, or other metadata were included AND if no 2491 files were packed; i.e., without delta-ing or compression. 2492 2493 Both unpacked and packed sizes can be slightly misleading. Deleting 2494 a blob from history not save as much space as the unpacked size, 2495 because it is obviously normally stored in packed form. Also, 2496 deleting a blob from history may not save as much space as its packed 2497 size either, because another blob could be stored as a delta against 2498 that blob, so when you remove one blob another blob's packed size may 2499 grow. 2500 2501 Also, the sum of the packed sizes can add up to more than the 2502 repository size; if the same contents appeared in the repository in 2503 multiple places, git will automatically de-dupe and store only one 2504 copy, while the way sizes are added in this analysis adds the size 2505 for each file path that has those contents. Further, if a file is 2506 ever reverted to a previous version's contents, the previous 2507 version's size will be counted multiple times in this analysis, even 2508 though git will only store it once. 2509 """)[1:]).encode()) 2510 f.write(b"\n") 2511 f.write(("=== %s ===\n" % _("Deletions")).encode()) 2512 f.write(textwrap.dedent(_(""" 2513 Whether a file is deleted is not a binary quality, since it can be 2514 deleted on some branches but still exist in others. Also, it might 2515 exist in an old tag, but have been deleted in versions newer than 2516 that. More thorough tracking could be done, including looking at 2517 merge commits where one side of history deleted and the other modified, 2518 in order to give a more holistic picture of deletions. However, that 2519 algorithm would not only be more complex to implement, it'd also be 2520 quite difficult to present and interpret by users. Since --analyze 2521 is just about getting a high-level rough picture of history, it instead 2522 implements the simplistic rule that is good enough for 98% of cases: 2523 A file is marked as deleted if the last commit in the fast-export 2524 stream that mentions the file lists it as deleted. 2525 This makes it dependent on topological ordering, but generally gives 2526 the "right" answer. 2527 """)[1:]).encode()) 2528 f.write(b"\n") 2529 f.write(("=== %s ===\n" % _("Renames")).encode()) 2530 f.write(textwrap.dedent(_(""" 2531 Renames share the same non-binary nature that deletions do, plus 2532 additional challenges: 2533 * If the renamed file is renamed again, instead of just two names for 2534 a path you can have three or more. 2535 * Rename pairs of the form (oldname, newname) that we consider to be 2536 different names of the "same file" might only be valid over certain 2537 commit ranges. For example, if a new commit reintroduces a file 2538 named oldname, then new versions of oldname aren't the "same file" 2539 anymore. We could try to portray this to the user, but it's easier 2540 for the user to just break the pairing and only report unbroken 2541 rename pairings to the user. 2542 * The ability for users to rename files differently in different 2543 branches means that our chains of renames will not necessarily be 2544 linear but may branch out. 2545 """)[1:]).encode()) 2546 f.write(b"\n") 2547 2548 # Equivalence classes for names, so if folks only want to keep a 2549 # certain set of paths, they know the old names they want to include 2550 # too. 2551 with open(os.path.join(reportdir, b"renames.txt"), 'bw') as f: 2552 seen = set() 2553 for pathname,equiv_group in sorted(stats['equivalence'].items(), 2554 key=lambda x:(x[1], x[0])): 2555 if equiv_group in seen: 2556 continue 2557 seen.add(equiv_group) 2558 f.write(("{} ->\n ".format(decode(equiv_group[0])) + 2559 "\n ".join(decode(x) for x in equiv_group[1:]) + 2560 "\n").encode()) 2561 2562 # List directories in reverse sorted order of unpacked size 2563 with open(os.path.join(reportdir, b"directories-deleted-sizes.txt"), 'bw') as f: 2564 msg = "=== %s ===\n" % _("Deleted directories by reverse size") 2565 f.write(msg.encode()) 2566 msg = _("Format: unpacked size, packed size, date deleted, directory name\n") 2567 f.write(msg.encode()) 2568 for dirname, size in sorted(dir_size['packed'].items(), 2569 key=lambda x:(x[1],x[0]), reverse=True): 2570 if (dir_deleted_data[dirname]): 2571 f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname], 2572 size, 2573 datestr(dir_deleted_data[dirname]), 2574 dirname or _('<toplevel>').encode())) 2575 2576 with open(os.path.join(reportdir, b"directories-all-sizes.txt"), 'bw') as f: 2577 f.write(("=== %s ===\n" % _("All directories by reverse size")).encode()) 2578 msg = _("Format: unpacked size, packed size, date deleted, directory name\n") 2579 f.write(msg.encode()) 2580 for dirname, size in sorted(dir_size['packed'].items(), 2581 key=lambda x:(x[1],x[0]), reverse=True): 2582 f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname], 2583 size, 2584 datestr(dir_deleted_data[dirname]), 2585 dirname or _("<toplevel>").encode())) 2586 2587 # List extensions in reverse sorted order of unpacked size 2588 with open(os.path.join(reportdir, b"extensions-deleted-sizes.txt"), 'bw') as f: 2589 msg = "=== %s ===\n" % _("Deleted extensions by reverse size") 2590 f.write(msg.encode()) 2591 msg = _("Format: unpacked size, packed size, date deleted, extension name\n") 2592 f.write(msg.encode()) 2593 for extname, size in sorted(ext_size['packed'].items(), 2594 key=lambda x:(x[1],x[0]), reverse=True): 2595 if (ext_deleted_data[extname]): 2596 f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname], 2597 size, 2598 datestr(ext_deleted_data[extname]), 2599 extname or _('<no extension>').encode())) 2600 2601 with open(os.path.join(reportdir, b"extensions-all-sizes.txt"), 'bw') as f: 2602 f.write(("=== %s ===\n" % _("All extensions by reverse size")).encode()) 2603 msg = _("Format: unpacked size, packed size, date deleted, extension name\n") 2604 f.write(msg.encode()) 2605 for extname, size in sorted(ext_size['packed'].items(), 2606 key=lambda x:(x[1],x[0]), reverse=True): 2607 f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname], 2608 size, 2609 datestr(ext_deleted_data[extname]), 2610 extname or _('<no extension>').encode())) 2611 2612 # List files in reverse sorted order of unpacked size 2613 with open(os.path.join(reportdir, b"path-deleted-sizes.txt"), 'bw') as f: 2614 msg = "=== %s ===\n" % _("Deleted paths by reverse accumulated size") 2615 f.write(msg.encode()) 2616 msg = _("Format: unpacked size, packed size, date deleted, path name(s)\n") 2617 f.write(msg.encode()) 2618 for pathname, size in sorted(path_size['packed'].items(), 2619 key=lambda x:(x[1],x[0]), reverse=True): 2620 when = stats['file_deletions'].get(pathname, None) 2621 if when: 2622 f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname], 2623 size, 2624 datestr(when), 2625 pathname)) 2626 2627 with open(os.path.join(reportdir, b"path-all-sizes.txt"), 'bw') as f: 2628 msg = "=== %s ===\n" % _("All paths by reverse accumulated size") 2629 f.write(msg.encode()) 2630 msg = _("Format: unpacked size, packed size, date deleted, path name\n") 2631 f.write(msg.encode()) 2632 for pathname, size in sorted(path_size['packed'].items(), 2633 key=lambda x:(x[1],x[0]), reverse=True): 2634 when = stats['file_deletions'].get(pathname, None) 2635 f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname], 2636 size, 2637 datestr(when), 2638 pathname)) 2639 2640 # List of filenames and sizes in descending order 2641 with open(os.path.join(reportdir, b"blob-shas-and-paths.txt"), 'bw') as f: 2642 f.write(("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")).encode()) 2643 f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n").encode()) 2644 for sha, size in sorted(stats['packed_size'].items(), 2645 key=lambda x:(x[1],x[0]), reverse=True): 2646 if sha not in stats['names']: 2647 # Some objects in the repository might not be referenced, or not 2648 # referenced by the branches/tags the user cares about; skip them. 2649 continue 2650 names_with_sha = stats['names'][sha] 2651 if len(names_with_sha) == 1: 2652 names_with_sha = names_with_sha.pop() 2653 else: 2654 names_with_sha = b'[' + b', '.join(sorted(names_with_sha)) + b']' 2655 f.write(b" %s %10d %10d %s\n" % (sha, 2656 stats['unpacked_size'][sha], 2657 size, 2658 names_with_sha)) 2659 2660 @staticmethod 2661 def run(args): 2662 if args.report_dir: 2663 reportdir = args.report_dir 2664 else: 2665 git_dir = GitUtils.determine_git_dir(b'.') 2666 2667 # Create the report directory as necessary 2668 results_tmp_dir = os.path.join(git_dir, b'filter-repo') 2669 if not os.path.isdir(results_tmp_dir): 2670 os.mkdir(results_tmp_dir) 2671 reportdir = os.path.join(results_tmp_dir, b"analysis") 2672 2673 if os.path.isdir(reportdir): 2674 if args.force: 2675 sys.stdout.write(_("Warning: Removing recursively: \"%s\"") % decode(reportdir)) 2676 shutil.rmtree(reportdir) 2677 else: 2678 sys.stdout.write(_("Error: dir already exists (use --force to delete): \"%s\"\n") % decode(reportdir)) 2679 sys.exit(1) 2680 2681 os.mkdir(reportdir) 2682 2683 # Gather the data we need 2684 stats = RepoAnalyze.gather_data(args) 2685 2686 # Write the reports 2687 sys.stdout.write(_("Writing reports to %s...") % decode(reportdir)) 2688 sys.stdout.flush() 2689 RepoAnalyze.write_report(reportdir, stats) 2690 sys.stdout.write(_("done.\n")) 2691 2692class InputFileBackup: 2693 def __init__(self, input_file, output_file): 2694 self.input_file = input_file 2695 self.output_file = output_file 2696 2697 def close(self): 2698 self.input_file.close() 2699 self.output_file.close() 2700 2701 def read(self, size): 2702 output = self.input_file.read(size) 2703 self.output_file.write(output) 2704 return output 2705 2706 def readline(self): 2707 line = self.input_file.readline() 2708 self.output_file.write(line) 2709 return line 2710 2711class DualFileWriter: 2712 def __init__(self, file1, file2): 2713 self.file1 = file1 2714 self.file2 = file2 2715 2716 def write(self, *args): 2717 self.file1.write(*args) 2718 self.file2.write(*args) 2719 2720 def flush(self): 2721 self.file1.flush() 2722 self.file2.flush() 2723 2724 def close(self): 2725 self.file1.close() 2726 self.file2.close() 2727 2728class RepoFilter(object): 2729 def __init__(self, 2730 args, 2731 filename_callback = None, 2732 message_callback = None, 2733 name_callback = None, 2734 email_callback = None, 2735 refname_callback = None, 2736 blob_callback = None, 2737 commit_callback = None, 2738 tag_callback = None, 2739 reset_callback = None, 2740 done_callback = None): 2741 2742 self._args = args 2743 2744 # Repo we are exporting 2745 self._repo_working_dir = None 2746 2747 # Store callbacks for acting on objects printed by FastExport 2748 self._blob_callback = blob_callback 2749 self._commit_callback = commit_callback 2750 self._tag_callback = tag_callback 2751 self._reset_callback = reset_callback 2752 self._done_callback = done_callback 2753 2754 # Store callbacks for acting on slices of FastExport objects 2755 self._filename_callback = filename_callback # filenames from commits 2756 self._message_callback = message_callback # commit OR tag message 2757 self._name_callback = name_callback # author, committer, tagger 2758 self._email_callback = email_callback # author, committer, tagger 2759 self._refname_callback = refname_callback # from commit/tag/reset 2760 self._handle_arg_callbacks() 2761 2762 # Defaults for input 2763 self._input = None 2764 self._fep = None # Fast Export Process 2765 self._fe_orig = None # Path to where original fast-export output stored 2766 self._fe_filt = None # Path to where filtered fast-export output stored 2767 self._parser = None # FastExportParser object we are working with 2768 2769 # Defaults for output 2770 self._output = None 2771 self._fip = None # Fast Import Process 2772 self._import_pipes = None 2773 self._managed_output = True 2774 2775 # A tuple of (depth, list-of-ancestors). Commits and ancestors are 2776 # identified by their id (their 'mark' in fast-export or fast-import 2777 # speak). The depth of a commit is one more than the max depth of any 2778 # of its ancestors. 2779 self._graph = AncestryGraph() 2780 # Another one, for ancestry of commits in the original repo 2781 self._orig_graph = AncestryGraph() 2782 2783 # Names of files that were tweaked in any commit; such paths could lead 2784 # to subsequent commits being empty 2785 self._files_tweaked = set() 2786 2787 # A set of commit hash pairs (oldhash, newhash) which used to be merge 2788 # commits but due to filtering were turned into non-merge commits. 2789 # The commits probably have suboptimal commit messages (e.g. "Merge branch 2790 # next into master"). 2791 self._commits_no_longer_merges = [] 2792 2793 # A dict of original_ids to new_ids; filtering commits means getting 2794 # new commit hash (sha1sums), and we record the mapping both for 2795 # diagnostic purposes and so we can rewrite commit messages. Note that 2796 # the new_id can be None rather than a commit hash if the original 2797 # commit became empty and was pruned or was otherwise dropped. 2798 self._commit_renames = {} 2799 2800 # A set of original_ids for which we have not yet gotten the 2801 # new_ids; we use OrderedDict because we need to know the order of 2802 # insertion, but the values are always ignored (and set to None). 2803 # If there was an OrderedSet class, I'd use it instead. 2804 self._pending_renames = collections.OrderedDict() 2805 2806 # A dict of commit_hash[0:7] -> set(commit_hashes with that prefix). 2807 # 2808 # It's common for commit messages to refer to commits by abbreviated 2809 # commit hashes, as short as 7 characters. To facilitate translating 2810 # such short hashes, we have a mapping of prefixes to full old hashes. 2811 self._commit_short_old_hashes = collections.defaultdict(set) 2812 2813 # A set of commit hash references appearing in commit messages which 2814 # mapped to a valid commit that was removed entirely in the filtering 2815 # process. The commit message will continue to reference the 2816 # now-missing commit hash, since there was nothing to map it to. 2817 self._commits_referenced_but_removed = set() 2818 2819 # Progress handling (number of commits parsed, etc.) 2820 self._progress_writer = ProgressWriter() 2821 self._num_commits = 0 2822 2823 # Size of blobs in the repo 2824 self._unpacked_size = {} 2825 2826 # Other vars 2827 self._sanity_checks_handled = False 2828 self._finalize_handled = False 2829 self._orig_refs = None 2830 self._newnames = {} 2831 2832 # Cache a few message translations for performance reasons 2833 self._parsed_message = _("Parsed %d commits") 2834 2835 # Compile some regexes and cache those 2836 self._hash_re = re.compile(br'(\b[0-9a-f]{7,40}\b)') 2837 2838 def _handle_arg_callbacks(self): 2839 def make_callback(argname, str): 2840 exec('def callback({}, _do_not_use_this_var = None):\n'.format(argname)+ 2841 ' '+'\n '.join(str.splitlines()), globals()) 2842 return callback #namespace['callback'] 2843 def handle(type): 2844 callback_field = '_{}_callback'.format(type) 2845 code_string = getattr(self._args, type+'_callback') 2846 if code_string: 2847 if os.path.exists(code_string): 2848 with open(code_string, 'r', encoding='utf-8') as f: 2849 code_string = f.read() 2850 if getattr(self, callback_field): 2851 raise SystemExit(_("Error: Cannot pass a %s_callback to RepoFilter " 2852 "AND pass --%s-callback" 2853 % (type, type))) 2854 if 'return ' not in code_string and \ 2855 type not in ('blob', 'commit', 'tag', 'reset'): 2856 raise SystemExit(_("Error: --%s-callback should have a return statement") 2857 % type) 2858 setattr(self, callback_field, make_callback(type, code_string)) 2859 handle('filename') 2860 handle('message') 2861 handle('name') 2862 handle('email') 2863 handle('refname') 2864 handle('blob') 2865 handle('commit') 2866 handle('tag') 2867 handle('reset') 2868 2869 def _run_sanity_checks(self): 2870 self._sanity_checks_handled = True 2871 if not self._managed_output: 2872 if not self._args.replace_refs: 2873 # If not _managed_output we don't want to make extra changes to the 2874 # repo, so set default to no-op 'update-no-add' 2875 self._args.replace_refs = 'update-no-add' 2876 return 2877 2878 if self._args.debug: 2879 print("[DEBUG] Passed arguments:\n{}".format(self._args)) 2880 2881 # Determine basic repository information 2882 target_working_dir = self._args.target or b'.' 2883 self._orig_refs = GitUtils.get_refs(target_working_dir) 2884 is_bare = GitUtils.is_repository_bare(target_working_dir) 2885 2886 # Determine if this is second or later run of filter-repo 2887 tmp_dir = self.results_tmp_dir(create_if_missing=False) 2888 already_ran = os.path.isfile(os.path.join(tmp_dir, b'already_ran')) 2889 2890 # Default for --replace-refs 2891 if not self._args.replace_refs: 2892 self._args.replace_refs = ('update-or-add' if already_ran 2893 else 'update-and-add') 2894 2895 # Do sanity checks from the correct directory 2896 if not self._args.force and not already_ran: 2897 cwd = os.getcwd() 2898 os.chdir(target_working_dir) 2899 RepoFilter.sanity_check(self._orig_refs, is_bare) 2900 os.chdir(cwd) 2901 2902 @staticmethod 2903 def sanity_check(refs, is_bare): 2904 def abort(reason): 2905 try: 2906 cmd = 'git config remote.origin.url' 2907 output = subproc.check_output(cmd.split()).strip() 2908 except subprocess.CalledProcessError as e: 2909 output = None 2910 msg = "" 2911 if output and os.path.isdir(output): 2912 msg = _("Note: when cloning local repositories, you need to pass\n" 2913 " --no-local to git clone to avoid this issue.\n") 2914 raise SystemExit( 2915 _("Aborting: Refusing to destructively overwrite repo history since\n" 2916 "this does not look like a fresh clone.\n" 2917 " (%s)\n%s" 2918 "Please operate on a fresh clone instead. If you want to proceed\n" 2919 "anyway, use --force.") % (reason, msg)) 2920 2921 # Make sure repo is fully packed, just like a fresh clone would be. 2922 # Note that transfer.unpackLimit defaults to 100, meaning that a 2923 # repository with no packs and less than 100 objects should be considered 2924 # fully packed. 2925 output = subproc.check_output('git count-objects -v'.split()) 2926 stats = dict(x.split(b': ') for x in output.splitlines()) 2927 num_packs = int(stats[b'packs']) 2928 num_loose_objects = int(stats[b'count']) 2929 if num_packs > 1 or \ 2930 (num_packs == 1 and num_loose_objects > 0) or \ 2931 num_loose_objects >= 100: 2932 abort(_("expected freshly packed repo")) 2933 2934 # Make sure there is precisely one remote, named "origin"...or that this 2935 # is a new bare repo with no packs and no remotes 2936 output = subproc.check_output('git remote'.split()).strip() 2937 if not (output == b"origin" or (num_packs == 0 and not output)): 2938 abort(_("expected one remote, origin")) 2939 2940 # Avoid letting people running with weird setups and overwriting GIT_DIR 2941 # elsewhere 2942 git_dir = GitUtils.determine_git_dir(b'.') 2943 if is_bare and git_dir != b'.': 2944 abort(_("GIT_DIR must be .")) 2945 elif not is_bare and git_dir != b'.git': 2946 abort(_("GIT_DIR must be .git")) 2947 2948 # Make sure that all reflogs have precisely one entry 2949 reflog_dir=os.path.join(git_dir, b'logs') 2950 for root, dirs, files in os.walk(reflog_dir): 2951 for filename in files: 2952 pathname = os.path.join(root, filename) 2953 with open(pathname, 'br') as f: 2954 if len(f.read().splitlines()) > 1: 2955 shortpath = pathname[len(reflog_dir)+1:] 2956 abort(_("expected at most one entry in the reflog for %s") % 2957 decode(shortpath)) 2958 2959 # Make sure there are no stashed changes 2960 if b'refs/stash' in refs: 2961 abort(_("has stashed changes")) 2962 2963 # Do extra checks in non-bare repos 2964 if not is_bare: 2965 # Avoid uncommitted, unstaged, or untracked changes 2966 if subproc.call('git diff --staged --quiet'.split()): 2967 abort(_("you have uncommitted changes")) 2968 if subproc.call('git diff --quiet'.split()): 2969 abort(_("you have unstaged changes")) 2970 if len(subproc.check_output('git ls-files -o'.split())) > 0: 2971 abort(_("you have untracked changes")) 2972 2973 # Avoid unpushed changes 2974 for refname, rev in refs.items(): 2975 if not refname.startswith(b'refs/heads/'): 2976 continue 2977 origin_ref = refname.replace(b'refs/heads/', b'refs/remotes/origin/') 2978 if origin_ref not in refs: 2979 abort(_('%s exists, but %s not found') % (decode(refname), 2980 decode(origin_ref))) 2981 if rev != refs[origin_ref]: 2982 abort(_('%s does not match %s') % (decode(refname), 2983 decode(origin_ref))) 2984 2985 # Make sure there is only one worktree 2986 output = subproc.check_output('git worktree list'.split()) 2987 if len(output.splitlines()) > 1: 2988 abort(_('you have multiple worktrees')) 2989 2990 @staticmethod 2991 def cleanup(repo, repack, reset, run_quietly=False, show_debuginfo=False): 2992 ''' cleanup repo; if repack then expire reflogs and do a gc --prune=now. 2993 if reset then do a reset --hard. Optionally also curb output if 2994 run_quietly is True, or go the opposite direction and show extra 2995 output if show_debuginfo is True. ''' 2996 assert not (run_quietly and show_debuginfo) 2997 2998 if (repack and not run_quietly and not show_debuginfo): 2999 print(_("Repacking your repo and cleaning out old unneeded objects")) 3000 quiet_flags = '--quiet' if run_quietly else '' 3001 cleanup_cmds = [] 3002 if repack: 3003 cleanup_cmds = ['git reflog expire --expire=now --all'.split(), 3004 'git gc {} --prune=now'.format(quiet_flags).split()] 3005 if reset: 3006 cleanup_cmds.insert(0, 'git reset {} --hard'.format(quiet_flags).split()) 3007 location_info = ' (in {})'.format(decode(repo)) if repo != b'.' else '' 3008 for cmd in cleanup_cmds: 3009 if show_debuginfo: 3010 print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd))) 3011 subproc.call(cmd, cwd=repo) 3012 3013 def _get_rename(self, old_hash): 3014 # If we already know the rename, just return it 3015 new_hash = self._commit_renames.get(old_hash, None) 3016 if new_hash: 3017 return new_hash 3018 3019 # If it's not in the remaining pending renames, we don't know it 3020 if old_hash is not None and old_hash not in self._pending_renames: 3021 return None 3022 3023 # Read through the pending renames until we find it or we've read them all, 3024 # and return whatever we might find 3025 self._flush_renames(old_hash) 3026 return self._commit_renames.get(old_hash, None) 3027 3028 def _flush_renames(self, old_hash=None, limit=0): 3029 # Parse through self._pending_renames until we have read enough. We have 3030 # read enough if: 3031 # self._pending_renames is empty 3032 # old_hash != None and we found a rename for old_hash 3033 # limit > 0 and len(self._pending_renames) started less than 2*limit 3034 # limit > 0 and len(self._pending_renames) < limit 3035 if limit and len(self._pending_renames) < 2 * limit: 3036 return 3037 fi_input, fi_output = self._import_pipes 3038 while self._pending_renames: 3039 orig_id, ignore = self._pending_renames.popitem(last=False) 3040 new_id = fi_output.readline().rstrip() 3041 self._commit_renames[orig_id] = new_id 3042 if old_hash == orig_id: 3043 return 3044 if limit and len(self._pending_renames) < limit: 3045 return 3046 3047 def _translate_commit_hash(self, matchobj_or_oldhash): 3048 old_hash = matchobj_or_oldhash 3049 if not isinstance(matchobj_or_oldhash, bytes): 3050 old_hash = matchobj_or_oldhash.group(1) 3051 orig_len = len(old_hash) 3052 new_hash = self._get_rename(old_hash) 3053 if new_hash is None: 3054 if old_hash[0:7] not in self._commit_short_old_hashes: 3055 self._commits_referenced_but_removed.add(old_hash) 3056 return old_hash 3057 possibilities = self._commit_short_old_hashes[old_hash[0:7]] 3058 matches = [x for x in possibilities 3059 if x[0:orig_len] == old_hash] 3060 if len(matches) != 1: 3061 self._commits_referenced_but_removed.add(old_hash) 3062 return old_hash 3063 old_hash = matches[0] 3064 new_hash = self._get_rename(old_hash) 3065 3066 assert new_hash is not None 3067 return new_hash[0:orig_len] 3068 3069 def _trim_extra_parents(self, orig_parents, parents): 3070 '''Due to pruning of empty commits, some parents could be non-existent 3071 (None) or otherwise redundant. Remove the non-existent parents, and 3072 remove redundant parents so long as that doesn't transform a merge 3073 commit into a non-merge commit. 3074 3075 Returns a tuple: 3076 (parents, new_first_parent_if_would_become_non_merge)''' 3077 3078 always_prune = (self._args.prune_degenerate == 'always') 3079 3080 # Pruning of empty commits means multiple things: 3081 # * An original parent of this commit may have been pruned causing the 3082 # need to rewrite the reported parent to the nearest ancestor. We 3083 # want to know when we're dealing with such a parent. 3084 # * Further, there may be no "nearest ancestor" if the entire history 3085 # of that parent was also pruned. (Detectable by the parent being 3086 # 'None') 3087 # Remove all parents rewritten to None, and keep track of which parents 3088 # were rewritten to an ancestor. 3089 tmp = zip(parents, 3090 orig_parents, 3091 [(x in _SKIPPED_COMMITS or always_prune) for x in orig_parents]) 3092 tmp2 = [x for x in tmp if x[0] is not None] 3093 if not tmp2: 3094 # All ancestors have been pruned; we have no parents. 3095 return [], None 3096 parents, orig_parents, is_rewritten = [list(x) for x in zip(*tmp2)] 3097 3098 # We can't have redundant parents if we don't have at least 2 parents 3099 if len(parents) < 2: 3100 return parents, None 3101 3102 # Don't remove redundant parents if user doesn't want us to 3103 if self._args.prune_degenerate == 'never': 3104 return parents, None 3105 3106 # Remove duplicate parents (if both sides of history have lots of commits 3107 # which become empty due to pruning, the most recent ancestor on both 3108 # sides may be the same commit), except only remove parents that have 3109 # been rewritten due to previous empty pruning. 3110 seen = set() 3111 seen_add = seen.add 3112 # Deleting duplicate rewritten parents means keeping parents if either 3113 # they have not been seen or they are ones that have not been rewritten. 3114 parents_copy = parents 3115 uniq = [[p, orig_parents[i], is_rewritten[i]] for i, p in enumerate(parents) 3116 if not (p in seen or seen_add(p)) or not is_rewritten[i]] 3117 parents, orig_parents, is_rewritten = [list(x) for x in zip(*uniq)] 3118 if len(parents) < 2: 3119 return parents_copy, parents[0] 3120 3121 # Flatten unnecessary merges. (If one side of history is entirely 3122 # empty commits that were pruned, we may end up attempting to 3123 # merge a commit with its ancestor. Remove parents that are an 3124 # ancestor of another parent.) 3125 num_parents = len(parents) 3126 to_remove = [] 3127 for cur in range(num_parents): 3128 if not is_rewritten[cur]: 3129 continue 3130 for other in range(num_parents): 3131 if cur == other: 3132 continue 3133 if not self._graph.is_ancestor(parents[cur], parents[other]): 3134 continue 3135 # parents[cur] is an ancestor of parents[other], so parents[cur] 3136 # seems redundant. However, if it was intentionally redundant 3137 # (e.g. a no-ff merge) in the original, then we want to keep it. 3138 if not always_prune and \ 3139 self._orig_graph.is_ancestor(orig_parents[cur], 3140 orig_parents[other]): 3141 continue 3142 # Some folks want their history to have all first parents be merge 3143 # commits (except for any root commits), and always do a merge --no-ff. 3144 # For such folks, don't remove the first parent even if it's an 3145 # ancestor of other commits. 3146 if self._args.no_ff and cur == 0: 3147 continue 3148 # Okay so the cur-th parent is an ancestor of the other-th parent, 3149 # and it wasn't that way in the original repository; mark the 3150 # cur-th parent as removable. 3151 to_remove.append(cur) 3152 break # cur removed, so skip rest of others -- i.e. check cur+=1 3153 for x in reversed(to_remove): 3154 parents.pop(x) 3155 if len(parents) < 2: 3156 return parents_copy, parents[0] 3157 3158 return parents, None 3159 3160 def _prunable(self, commit, new_1st_parent, had_file_changes, orig_parents): 3161 parents = commit.parents 3162 3163 if self._args.prune_empty == 'never': 3164 return False 3165 always_prune = (self._args.prune_empty == 'always') 3166 3167 # For merge commits, unless there are prunable (redundant) parents, we 3168 # do not want to prune 3169 if len(parents) >= 2 and not new_1st_parent: 3170 return False 3171 3172 if len(parents) < 2: 3173 # Special logic for commits that started empty... 3174 if not had_file_changes and not always_prune: 3175 had_parents_pruned = (len(parents) < len(orig_parents) or 3176 (len(orig_parents) == 1 and 3177 orig_parents[0] in _SKIPPED_COMMITS)) 3178 # If the commit remains empty and had parents which were pruned, 3179 # then prune this commit; otherwise, retain it 3180 return (not commit.file_changes and had_parents_pruned) 3181 3182 # We can only get here if the commit didn't start empty, so if it's 3183 # empty now, it obviously became empty 3184 if not commit.file_changes: 3185 return True 3186 3187 # If there are no parents of this commit and we didn't match the case 3188 # above, then this commit cannot be pruned. Since we have no parent(s) 3189 # to compare to, abort now to prevent future checks from failing. 3190 if not parents: 3191 return False 3192 3193 # Similarly, we cannot handle the hard cases if we don't have a pipe 3194 # to communicate with fast-import 3195 if not self._import_pipes: 3196 return False 3197 3198 # If there have not been renames/remappings of IDs (due to insertion of 3199 # new blobs), then we can sometimes know things aren't prunable with a 3200 # simple check 3201 if not _IDS.has_renames(): 3202 # non-merge commits can only be empty if blob/file-change editing caused 3203 # all file changes in the commit to have the same file contents as 3204 # the parent. 3205 changed_files = set(change.filename for change in commit.file_changes) 3206 if len(orig_parents) < 2 and changed_files - self._files_tweaked: 3207 return False 3208 3209 # Finally, the hard case: due to either blob rewriting, or due to pruning 3210 # of empty commits wiping out the first parent history back to the merge 3211 # base, the list of file_changes we have may not actually differ from our 3212 # (new) first parent's version of the files, i.e. this would actually be 3213 # an empty commit. Check by comparing the contents of this commit to its 3214 # (remaining) parent. 3215 # 3216 # NOTE on why this works, for the case of original first parent history 3217 # having been pruned away due to being empty: 3218 # The first parent history having been pruned away due to being 3219 # empty implies the original first parent would have a tree (after 3220 # filtering) that matched the merge base's tree. Since 3221 # file_changes has the changes needed to go from what would have 3222 # been the first parent to our new commit, and what would have been 3223 # our first parent has a tree that matches the merge base, then if 3224 # the new first parent has a tree matching the versions of files in 3225 # file_changes, then this new commit is empty and thus prunable. 3226 fi_input, fi_output = self._import_pipes 3227 self._flush_renames() # Avoid fi_output having other stuff present 3228 # Optimization note: we could have two loops over file_changes, the 3229 # first doing all the self._output.write() calls, and the second doing 3230 # the rest. But I'm worried about fast-import blocking on fi_output 3231 # buffers filling up so I instead read from it as I go. 3232 for change in commit.file_changes: 3233 parent = new_1st_parent or commit.parents[0] # exists due to above checks 3234 quoted_filename = PathQuoting.enquote(change.filename) 3235 if isinstance(parent, int): 3236 self._output.write(b"ls :%d %s\n" % (parent, quoted_filename)) 3237 else: 3238 self._output.write(b"ls %s %s\n" % (parent, quoted_filename)) 3239 self._output.flush() 3240 parent_version = fi_output.readline().split() 3241 if change.type == b'D': 3242 if parent_version != [b'missing', quoted_filename]: 3243 return False 3244 else: 3245 blob_sha = change.blob_id 3246 if isinstance(change.blob_id, int): 3247 self._output.write(b"get-mark :%d\n" % change.blob_id) 3248 self._output.flush() 3249 blob_sha = fi_output.readline().rstrip() 3250 if parent_version != [change.mode, b'blob', blob_sha, quoted_filename]: 3251 return False 3252 3253 return True 3254 3255 def _record_remapping(self, commit, orig_parents): 3256 new_id = None 3257 # Record the mapping of old commit hash to new one 3258 if commit.original_id and self._import_pipes: 3259 fi_input, fi_output = self._import_pipes 3260 self._output.write(b"get-mark :%d\n" % commit.id) 3261 self._output.flush() 3262 orig_id = commit.original_id 3263 self._commit_short_old_hashes[orig_id[0:7]].add(orig_id) 3264 # Note that we have queued up an id for later reading; flush a 3265 # few of the older ones if we have too many queued up 3266 self._pending_renames[orig_id] = None 3267 self._flush_renames(None, limit=40) 3268 # Also, record if this was a merge commit that turned into a non-merge 3269 # commit. 3270 if len(orig_parents) >= 2 and len(commit.parents) < 2: 3271 self._commits_no_longer_merges.append((commit.original_id, new_id)) 3272 3273 def callback_metadata(self, extra_items = dict()): 3274 return {'commit_rename_func': self._translate_commit_hash, 3275 'ancestry_graph': self._graph, 3276 'original_ancestry_graph': self._orig_graph, 3277 **extra_items} 3278 3279 def _tweak_blob(self, blob): 3280 if self._args.max_blob_size and len(blob.data) > self._args.max_blob_size: 3281 blob.skip() 3282 3283 if blob.original_id in self._args.strip_blobs_with_ids: 3284 blob.skip() 3285 3286 if ( self._args.replace_text 3287 # not (if blob contains zero byte in the first 8Kb, that is, if blob is binary data) 3288 and not b"\0" in blob.data[0:8192] 3289 ): 3290 for literal, replacement in self._args.replace_text['literals']: 3291 blob.data = blob.data.replace(literal, replacement) 3292 for regex, replacement in self._args.replace_text['regexes']: 3293 blob.data = regex.sub(replacement, blob.data) 3294 3295 if self._blob_callback: 3296 self._blob_callback(blob, self.callback_metadata()) 3297 3298 def _filter_files(self, commit): 3299 def filename_matches(path_expression, pathname): 3300 ''' Returns whether path_expression matches pathname or a leading 3301 directory thereof, allowing path_expression to not have a trailing 3302 slash even if it is meant to match a leading directory. ''' 3303 if path_expression == b'': 3304 return True 3305 n = len(path_expression) 3306 if (pathname.startswith(path_expression) and 3307 (path_expression[n-1:n] == b'/' or 3308 len(pathname) == n or 3309 pathname[n:n+1] == b'/')): 3310 return True 3311 return False 3312 3313 def newname(path_changes, pathname, use_base_name, filtering_is_inclusive): 3314 ''' Applies filtering and rename changes from path_changes to pathname, 3315 returning any of None (file isn't wanted), original filename (file 3316 is wanted with original name), or new filename. ''' 3317 wanted = False 3318 full_pathname = pathname 3319 if use_base_name: 3320 pathname = os.path.basename(pathname) 3321 for (mod_type, match_type, path_exp) in path_changes: 3322 if mod_type == 'filter' and not wanted: 3323 assert match_type in ('match', 'glob', 'regex') 3324 if match_type == 'match' and filename_matches(path_exp, pathname): 3325 wanted = True 3326 if match_type == 'glob' and fnmatch.fnmatch(pathname, path_exp): 3327 wanted = True 3328 if match_type == 'regex' and path_exp.search(pathname): 3329 wanted = True 3330 elif mod_type == 'rename': 3331 match, repl = path_exp 3332 assert match_type in ('match','regex') # glob was translated to regex 3333 if match_type == 'match' and filename_matches(match, full_pathname): 3334 full_pathname = full_pathname.replace(match, repl, 1) 3335 if match_type == 'regex': 3336 full_pathname = match.sub(repl, full_pathname) 3337 return full_pathname if (wanted == filtering_is_inclusive) else None 3338 3339 args = self._args 3340 new_file_changes = {} # Assumes no renames or copies, otherwise collisions 3341 for change in commit.file_changes: 3342 # NEEDSWORK: _If_ we ever want to pass `--full-tree` to fast-export and 3343 # parse that output, we'll need to modify this block; `--full-tree` 3344 # issues a deleteall directive which has no filename, and thus this 3345 # block would normally strip it. Of course, FileChange() and 3346 # _parse_optional_filechange() would need updates too. 3347 if change.type == b'DELETEALL': 3348 new_file_changes[b''] = change 3349 continue 3350 if change.filename in self._newnames: 3351 change.filename = self._newnames[change.filename] 3352 else: 3353 original_filename = change.filename 3354 change.filename = newname(args.path_changes, change.filename, 3355 args.use_base_name, args.inclusive) 3356 if self._filename_callback: 3357 change.filename = self._filename_callback(change.filename) 3358 self._newnames[original_filename] = change.filename 3359 if not change.filename: 3360 continue # Filtering criteria excluded this file; move on to next one 3361 if change.filename in new_file_changes: 3362 # Getting here means that path renaming is in effect, and caused one 3363 # path to collide with another. That's usually bad, but can be okay 3364 # under two circumstances: 3365 # 1) Sometimes people have a file named OLDFILE in old revisions of 3366 # history, and they rename to NEWFILE, and would like to rewrite 3367 # history so that all revisions refer to it as NEWFILE. As such, 3368 # we can allow a collision when (at least) one of the two paths 3369 # is a deletion. Note that if OLDFILE and NEWFILE are unrelated 3370 # this also allows the rewrite to continue, which makes sense 3371 # since OLDFILE is no longer in the way. 3372 # 2) If OLDFILE and NEWFILE are exactly equal, then writing them 3373 # both to the same location poses no problem; we only need one 3374 # file. (This could come up if someone copied a file in some 3375 # commit, then later either deleted the file or kept it exactly 3376 # in sync with the original with any changes, and then decides 3377 # they want to rewrite history to only have one of the two files) 3378 colliding_change = new_file_changes[change.filename] 3379 if change.type == b'D': 3380 # We can just throw this one away and keep the other 3381 continue 3382 elif change.type == b'M' and ( 3383 change.mode == colliding_change.mode and 3384 change.blob_id == colliding_change.blob_id): 3385 # The two are identical, so we can throw this one away and keep other 3386 continue 3387 elif new_file_changes[change.filename].type != b'D': 3388 raise SystemExit(_("File renaming caused colliding pathnames!\n") + 3389 _(" Commit: {}\n").format(commit.original_id) + 3390 _(" Filename: {}").format(change.filename)) 3391 # Strip files that are too large 3392 if self._args.max_blob_size and \ 3393 self._unpacked_size.get(change.blob_id, 0) > self._args.max_blob_size: 3394 continue 3395 if self._args.strip_blobs_with_ids and \ 3396 change.blob_id in self._args.strip_blobs_with_ids: 3397 continue 3398 # Otherwise, record the change 3399 new_file_changes[change.filename] = change 3400 commit.file_changes = [v for k,v in sorted(new_file_changes.items())] 3401 3402 def _tweak_commit(self, commit, aux_info): 3403 # Change the commit message according to callback 3404 if not self._args.preserve_commit_hashes: 3405 commit.message = self._hash_re.sub(self._translate_commit_hash, 3406 commit.message) 3407 if self._args.replace_message: 3408 for literal, replacement in self._args.replace_message['literals']: 3409 commit.message = commit.message.replace(literal, replacement) 3410 for regex, replacement in self._args.replace_message['regexes']: 3411 commit.message = regex.sub(replacement, commit.message) 3412 if self._message_callback: 3413 commit.message = self._message_callback(commit.message) 3414 3415 # Change the author & committer according to mailmap rules 3416 args = self._args 3417 if args.mailmap: 3418 commit.author_name, commit.author_email = \ 3419 args.mailmap.translate(commit.author_name, commit.author_email) 3420 commit.committer_name, commit.committer_email = \ 3421 args.mailmap.translate(commit.committer_name, commit.committer_email) 3422 # Change author & committer according to callbacks 3423 if self._name_callback: 3424 commit.author_name = self._name_callback(commit.author_name) 3425 commit.committer_name = self._name_callback(commit.committer_name) 3426 if self._email_callback: 3427 commit.author_email = self._email_callback(commit.author_email) 3428 commit.committer_email = self._email_callback(commit.committer_email) 3429 3430 # Sometimes the 'branch' given is a tag; if so, rename it as requested so 3431 # we don't get any old tagnames 3432 if self._args.tag_rename: 3433 commit.branch = RepoFilter._do_tag_rename(args.tag_rename, commit.branch) 3434 if self._refname_callback: 3435 commit.branch = self._refname_callback(commit.branch) 3436 3437 # Filter or rename the list of file changes 3438 orig_file_changes = set(commit.file_changes) 3439 self._filter_files(commit) 3440 3441 # Record ancestry graph 3442 parents, orig_parents = commit.parents, aux_info['orig_parents'] 3443 if self._args.state_branch: 3444 external_parents = parents 3445 else: 3446 external_parents = [p for p in parents if not isinstance(p, int)] 3447 self._graph.record_external_commits(external_parents) 3448 self._orig_graph.record_external_commits(external_parents) 3449 self._graph.add_commit_and_parents(commit.id, parents) 3450 self._orig_graph.add_commit_and_parents(commit.old_id, orig_parents) 3451 3452 # Prune parents (due to pruning of empty commits) if relevant 3453 old_1st_parent = parents[0] if parents else None 3454 parents, new_1st_parent = self._trim_extra_parents(orig_parents, parents) 3455 commit.parents = parents 3456 3457 # If parents were pruned, then we need our file changes to be relative 3458 # to the new first parent 3459 if parents and old_1st_parent != parents[0]: 3460 commit.file_changes = GitUtils.get_file_changes(self._repo_working_dir, 3461 ID_TO_HASH[parents[0]], 3462 commit.original_id) 3463 orig_file_changes = set(commit.file_changes) 3464 self._filter_files(commit) 3465 3466 # Find out which files were modified by the callbacks. Such paths could 3467 # lead to subsequent commits being empty (e.g. if removing a line containing 3468 # a password from every version of a file that had the password, and some 3469 # later commit did nothing more than remove that line) 3470 final_file_changes = set(commit.file_changes) 3471 if self._args.replace_text or self._blob_callback: 3472 differences = orig_file_changes.union(final_file_changes) 3473 else: 3474 differences = orig_file_changes.symmetric_difference(final_file_changes) 3475 self._files_tweaked.update(x.filename for x in differences) 3476 3477 # Call the user-defined callback, if any 3478 if self._commit_callback: 3479 self._commit_callback(commit, self.callback_metadata(aux_info)) 3480 3481 # Now print the resulting commit, or if prunable skip it 3482 if not commit.dumped: 3483 if not self._prunable(commit, new_1st_parent, 3484 aux_info['had_file_changes'], orig_parents): 3485 self._insert_into_stream(commit) 3486 self._record_remapping(commit, orig_parents) 3487 else: 3488 rewrite_to = new_1st_parent or commit.first_parent() 3489 commit.skip(new_id = rewrite_to) 3490 if self._args.state_branch: 3491 alias = Alias(commit.old_id or commit.id, rewrite_to or deleted_hash) 3492 self._insert_into_stream(alias) 3493 reset = Reset(commit.branch, rewrite_to or deleted_hash) 3494 self._insert_into_stream(reset) 3495 self._commit_renames[commit.original_id] = None 3496 3497 # Show progress 3498 self._num_commits += 1 3499 if not self._args.quiet: 3500 self._progress_writer.show(self._parsed_message % self._num_commits) 3501 3502 @staticmethod 3503 def _do_tag_rename(rename_pair, tagname): 3504 old, new = rename_pair.split(b':', 1) 3505 old, new = b'refs/tags/'+old, b'refs/tags/'+new 3506 if tagname.startswith(old): 3507 return tagname.replace(old, new, 1) 3508 return tagname 3509 3510 def _tweak_tag(self, tag): 3511 # Tweak the tag message according to callbacks 3512 if self._args.replace_message: 3513 for literal, replacement in self._args.replace_message['literals']: 3514 tag.message = tag.message.replace(literal, replacement) 3515 for regex, replacement in self._args.replace_message['regexes']: 3516 tag.message = regex.sub(replacement, tag.message) 3517 if self._message_callback: 3518 tag.message = self._message_callback(tag.message) 3519 3520 # Tweak the tag name according to tag-name-related callbacks 3521 tag_prefix = b'refs/tags/' 3522 fullref = tag_prefix+tag.ref 3523 if self._args.tag_rename: 3524 fullref = RepoFilter._do_tag_rename(self._args.tag_rename, fullref) 3525 if self._refname_callback: 3526 fullref = self._refname_callback(fullref) 3527 if not fullref.startswith(tag_prefix): 3528 msg = "Error: fast-import requires tags to be in refs/tags/ namespace." 3529 msg += "\n {} renamed to {}".format(tag_prefix+tag.ref, fullref) 3530 raise SystemExit(msg) 3531 tag.ref = fullref[len(tag_prefix):] 3532 3533 # Tweak the tagger according to callbacks 3534 if self._args.mailmap: 3535 tag.tagger_name, tag.tagger_email = \ 3536 self._args.mailmap.translate(tag.tagger_name, tag.tagger_email) 3537 if self._name_callback: 3538 tag.tagger_name = self._name_callback(tag.tagger_name) 3539 if self._email_callback: 3540 tag.tagger_email = self._email_callback(tag.tagger_email) 3541 3542 # Call general purpose tag callback 3543 if self._tag_callback: 3544 self._tag_callback(tag, self.callback_metadata()) 3545 3546 def _tweak_reset(self, reset): 3547 if self._args.tag_rename: 3548 reset.ref = RepoFilter._do_tag_rename(self._args.tag_rename, reset.ref) 3549 if self._refname_callback: 3550 reset.ref = self._refname_callback(reset.ref) 3551 if self._reset_callback: 3552 self._reset_callback(reset, self.callback_metadata()) 3553 3554 def results_tmp_dir(self, create_if_missing=True): 3555 target_working_dir = self._args.target or b'.' 3556 git_dir = GitUtils.determine_git_dir(target_working_dir) 3557 d = os.path.join(git_dir, b'filter-repo') 3558 if create_if_missing and not os.path.isdir(d): 3559 os.mkdir(d) 3560 return d 3561 3562 def _load_marks_file(self, marks_basename): 3563 full_branch = 'refs/heads/{}'.format(self._args.state_branch) 3564 marks_file = os.path.join(self.results_tmp_dir(), marks_basename) 3565 working_dir = self._args.target or b'.' 3566 cmd = ['git', '-C', working_dir, 'show-ref', full_branch] 3567 contents = b'' 3568 if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0: 3569 cmd = ['git', '-C', working_dir, 'show', 3570 '%s:%s' % (full_branch, decode(marks_basename))] 3571 try: 3572 contents = subproc.check_output(cmd) 3573 except subprocess.CalledProcessError as e: # pragma: no cover 3574 raise SystemExit(_("Failed loading %s from %s") % 3575 (decode(marks_basename), full_branch)) 3576 if contents: 3577 biggest_id = max(int(x.split()[0][1:]) for x in contents.splitlines()) 3578 _IDS._next_id = max(_IDS._next_id, biggest_id+1) 3579 with open(marks_file, 'bw') as f: 3580 f.write(contents) 3581 return marks_file 3582 3583 def _save_marks_files(self): 3584 basenames = [b'source-marks', b'target-marks'] 3585 working_dir = self._args.target or b'.' 3586 3587 # Check whether the branch exists 3588 parent = [] 3589 full_branch = 'refs/heads/{}'.format(self._args.state_branch) 3590 cmd = ['git', '-C', working_dir, 'show-ref', full_branch] 3591 if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0: 3592 parent = ['-p', full_branch] 3593 3594 # Run 'git hash-object $MARKS_FILE' for each marks file, save result 3595 blob_hashes = {} 3596 for marks_basename in basenames: 3597 marks_file = os.path.join(self.results_tmp_dir(), marks_basename) 3598 if not os.path.isfile(marks_file): # pragma: no cover 3599 raise SystemExit(_("Failed to find %s to save to %s") 3600 % (marks_file, self._args.state_branch)) 3601 cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file] 3602 blob_hashes[marks_basename] = subproc.check_output(cmd).strip() 3603 3604 # Run 'git mktree' to create a tree out of it 3605 p = subproc.Popen(['git', '-C', working_dir, 'mktree'], 3606 stdin=subprocess.PIPE, stdout=subprocess.PIPE) 3607 for b in basenames: 3608 p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b)) 3609 p.stdin.close() 3610 p.wait() 3611 tree = p.stdout.read().strip() 3612 3613 # Create the new commit 3614 cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files', 3615 tree] + parent) 3616 commit = subproc.check_output(cmd).strip() 3617 subproc.call(['git', '-C', working_dir, 'update-ref', full_branch, commit]) 3618 3619 def importer_only(self): 3620 self._run_sanity_checks() 3621 self._setup_output() 3622 3623 def set_output(self, outputRepoFilter): 3624 assert outputRepoFilter._output 3625 3626 # set_output implies this RepoFilter is doing exporting, though may not 3627 # be the only one. 3628 self._setup_input(use_done_feature = False) 3629 3630 # Set our output management up to pipe to outputRepoFilter's locations 3631 self._managed_output = False 3632 self._output = outputRepoFilter._output 3633 self._import_pipes = outputRepoFilter._import_pipes 3634 3635 # Handle sanity checks, though currently none needed for export-only cases 3636 self._run_sanity_checks() 3637 3638 def _setup_input(self, use_done_feature): 3639 if self._args.stdin: 3640 self._input = sys.stdin.detach() 3641 sys.stdin = None # Make sure no one tries to accidentally use it 3642 self._fe_orig = None 3643 else: 3644 skip_blobs = (self._blob_callback is None and 3645 self._args.replace_text is None and 3646 self._args.source == self._args.target) 3647 extra_flags = [] 3648 if skip_blobs: 3649 extra_flags.append('--no-data') 3650 if self._args.max_blob_size: 3651 self._unpacked_size, packed_size = GitUtils.get_blob_sizes() 3652 if use_done_feature: 3653 extra_flags.append('--use-done-feature') 3654 if write_marks: 3655 extra_flags.append(b'--mark-tags') 3656 if self._args.state_branch: 3657 assert(write_marks) 3658 source_marks_file = self._load_marks_file(b'source-marks') 3659 extra_flags.extend([b'--export-marks='+source_marks_file, 3660 b'--import-marks='+source_marks_file]) 3661 if self._args.preserve_commit_encoding is not None: # pragma: no cover 3662 reencode = 'no' if self._args.preserve_commit_encoding else 'yes' 3663 extra_flags.append('--reencode='+reencode) 3664 location = ['-C', self._args.source] if self._args.source else [] 3665 fep_cmd = ['git'] + location + ['fast-export', '--show-original-ids', 3666 '--signed-tags=strip', '--tag-of-filtered-object=rewrite', 3667 '--fake-missing-tagger', '--reference-excluded-parents' 3668 ] + extra_flags + self._args.refs 3669 self._fep = subproc.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE) 3670 self._input = self._fep.stdout 3671 if self._args.dry_run or self._args.debug: 3672 self._fe_orig = os.path.join(self.results_tmp_dir(), 3673 b'fast-export.original') 3674 output = open(self._fe_orig, 'bw') 3675 self._input = InputFileBackup(self._input, output) 3676 if self._args.debug: 3677 tmp = [decode(x) if isinstance(x, bytes) else x for x in fep_cmd] 3678 print("[DEBUG] Running: {}".format(' '.join(tmp))) 3679 print(" (saving a copy of the output at {})" 3680 .format(decode(self._fe_orig))) 3681 3682 def _setup_output(self): 3683 if not self._args.dry_run: 3684 location = ['-C', self._args.target] if self._args.target else [] 3685 fip_cmd = ['git'] + location + ['-c', 'core.ignorecase=false', 3686 'fast-import', '--force', '--quiet'] 3687 if date_format_permissive: 3688 fip_cmd.append('--date-format=raw-permissive') 3689 if self._args.state_branch: 3690 target_marks_file = self._load_marks_file(b'target-marks') 3691 fip_cmd.extend([b'--export-marks='+target_marks_file, 3692 b'--import-marks='+target_marks_file]) 3693 self._fip = subproc.Popen(fip_cmd, bufsize=-1, 3694 stdin=subprocess.PIPE, stdout=subprocess.PIPE) 3695 self._import_pipes = (self._fip.stdin, self._fip.stdout) 3696 if self._args.dry_run or self._args.debug: 3697 self._fe_filt = os.path.join(self.results_tmp_dir(), 3698 b'fast-export.filtered') 3699 self._output = open(self._fe_filt, 'bw') 3700 else: 3701 self._output = self._fip.stdin 3702 if self._args.debug and not self._args.dry_run: 3703 self._output = DualFileWriter(self._fip.stdin, self._output) 3704 tmp = [decode(x) if isinstance(x, bytes) else x for x in fip_cmd] 3705 print("[DEBUG] Running: {}".format(' '.join(tmp))) 3706 print(" (using the following file as input: {})" 3707 .format(decode(self._fe_filt))) 3708 3709 def _migrate_origin_to_heads(self): 3710 refs_to_migrate = set(x for x in self._orig_refs 3711 if x.startswith(b'refs/remotes/origin/')) 3712 if not refs_to_migrate: 3713 return 3714 if self._args.debug: 3715 print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*") 3716 target_working_dir = self._args.target or b'.' 3717 p = subproc.Popen('git update-ref --no-deref --stdin'.split(), 3718 stdin=subprocess.PIPE, cwd=target_working_dir) 3719 for ref in refs_to_migrate: 3720 if ref == b'refs/remotes/origin/HEAD': 3721 p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref])) 3722 del self._orig_refs[ref] 3723 continue 3724 newref = ref.replace(b'refs/remotes/origin/', b'refs/heads/') 3725 if newref not in self._orig_refs: 3726 p.stdin.write(b'create %s %s\n' % (newref, self._orig_refs[ref])) 3727 p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref])) 3728 self._orig_refs[newref] = self._orig_refs[ref] 3729 del self._orig_refs[ref] 3730 p.stdin.close() 3731 if p.wait(): 3732 raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover 3733 3734 # Now remove 3735 if self._args.debug: 3736 print("[DEBUG] Removing 'origin' remote (rewritten history will no ") 3737 print(" longer be related; consider re-pushing it elsewhere.") 3738 subproc.call('git remote rm origin'.split(), cwd=target_working_dir) 3739 3740 def _final_commands(self): 3741 self._finalize_handled = True 3742 self._done_callback and self._done_callback() 3743 3744 if not self._args.quiet: 3745 self._progress_writer.finish() 3746 3747 def _ref_update(self, target_working_dir): 3748 # Start the update-ref process 3749 p = subproc.Popen('git update-ref --no-deref --stdin'.split(), 3750 stdin=subprocess.PIPE, 3751 cwd=target_working_dir) 3752 3753 # Remove replace_refs from _orig_refs 3754 replace_refs = {k:v for k, v in self._orig_refs.items() 3755 if k.startswith(b'refs/replace/')} 3756 reverse_replace_refs = collections.defaultdict(list) 3757 for k,v in replace_refs.items(): 3758 reverse_replace_refs[v].append(k) 3759 all(map(self._orig_refs.pop, replace_refs)) 3760 3761 # Remove unused refs 3762 exported_refs, imported_refs = self.get_exported_and_imported_refs() 3763 refs_to_nuke = exported_refs - imported_refs 3764 if self._args.partial: 3765 refs_to_nuke = set() 3766 if refs_to_nuke and self._args.debug: 3767 print("[DEBUG] Deleting the following refs:\n "+ 3768 decode(b"\n ".join(refs_to_nuke))) 3769 p.stdin.write(b''.join([b"delete %s\n" % x 3770 for x in refs_to_nuke])) 3771 3772 # Delete or update and add replace_refs; note that fast-export automatically 3773 # handles 'update-no-add', we only need to take action for the other four 3774 # choices for replace_refs. 3775 self._flush_renames() 3776 actual_renames = {k:v for k,v in self._commit_renames.items() if k != v} 3777 if self._args.replace_refs in ['delete-no-add', 'delete-and-add']: 3778 # Delete old replace refs, if unwanted 3779 replace_refs_to_nuke = set(replace_refs) 3780 if self._args.replace_refs == 'delete-and-add': 3781 # git-update-ref won't allow us to update a ref twice, so be careful 3782 # to avoid deleting refs we'll later update 3783 replace_refs_to_nuke = replace_refs_to_nuke.difference( 3784 [b'refs/replace/'+x for x in actual_renames]) 3785 p.stdin.write(b''.join([b"delete %s\n" % x 3786 for x in replace_refs_to_nuke])) 3787 if self._args.replace_refs in ['delete-and-add', 'update-or-add', 3788 'update-and-add']: 3789 # Add new replace refs 3790 update_only = (self._args.replace_refs == 'update-or-add') 3791 p.stdin.write(b''.join([b"update refs/replace/%s %s\n" % (old, new) 3792 for old,new in actual_renames.items() 3793 if new and not (update_only and 3794 old in reverse_replace_refs)])) 3795 3796 # Complete the update-ref process 3797 p.stdin.close() 3798 if p.wait(): 3799 raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover 3800 3801 def _record_metadata(self, metadata_dir, orig_refs): 3802 self._flush_renames() 3803 with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f: 3804 f.write(("%-40s %s\n" % (_("old"), _("new"))).encode()) 3805 for (old,new) in self._commit_renames.items(): 3806 msg = b'%s %s\n' % (old, new if new != None else deleted_hash) 3807 f.write(msg) 3808 3809 exported_refs, imported_refs = self.get_exported_and_imported_refs() 3810 3811 batch_check_process = None 3812 batch_check_output_re = re.compile(b'^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$') 3813 with open(os.path.join(metadata_dir, b'ref-map'), 'bw') as f: 3814 for refname, old_hash in orig_refs.items(): 3815 if refname not in exported_refs: 3816 continue 3817 if refname not in imported_refs: 3818 new_hash = deleted_hash 3819 elif old_hash in self._commit_renames: 3820 new_hash = self._commit_renames[old_hash] 3821 new_hash = new_hash if new_hash != None else deleted_hash 3822 else: # Must be either an annotated tag, or a ref whose tip was pruned 3823 if not batch_check_process: 3824 cmd = 'git cat-file --batch-check'.split() 3825 target_working_dir = self._args.target or b'.' 3826 batch_check_process = subproc.Popen(cmd, 3827 stdin=subprocess.PIPE, 3828 stdout=subprocess.PIPE, 3829 cwd=target_working_dir) 3830 batch_check_process.stdin.write(refname+b"\n") 3831 batch_check_process.stdin.flush() 3832 line = batch_check_process.stdout.readline() 3833 m = batch_check_output_re.match(line) 3834 if m and m.group(2) in (b'tag', b'commit'): 3835 new_hash = m.group(1) 3836 elif line.endswith(b' missing\n'): 3837 new_hash = deleted_hash 3838 else: 3839 raise SystemExit(_("Failed to find new id for %(refname)s " 3840 "(old id was %(old_hash)s)") 3841 % ({'refname': refname, 'old_hash': old_hash}) 3842 ) # pragma: no cover 3843 f.write(b'%s %s %s\n' % (old_hash, new_hash, refname)) 3844 if self._args.source or self._args.target: 3845 new_refs = GitUtils.get_refs(self._args.target or b'.') 3846 for ref, new_hash in new_refs.items(): 3847 if ref not in orig_refs and not ref.startswith(b'refs/replace/'): 3848 old_hash = b'0'*len(new_hash) 3849 f.write(b'%s %s %s\n' % (old_hash, new_hash, ref)) 3850 if batch_check_process: 3851 batch_check_process.stdin.close() 3852 batch_check_process.wait() 3853 3854 with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f: 3855 issues_found = False 3856 if self._commits_no_longer_merges: 3857 issues_found = True 3858 3859 f.write(textwrap.dedent(_(''' 3860 The following commits used to be merge commits but due to filtering 3861 are now regular commits; they likely have suboptimal commit messages 3862 (e.g. "Merge branch next into master"). Original commit hash on the 3863 left, commit hash after filtering/rewriting on the right: 3864 ''')[1:]).encode()) 3865 for oldhash, newhash in self._commits_no_longer_merges: 3866 f.write(' {} {}\n'.format(oldhash, newhash).encode()) 3867 f.write(b'\n') 3868 3869 if self._commits_referenced_but_removed: 3870 issues_found = True 3871 f.write(textwrap.dedent(_(''' 3872 The following commits were filtered out, but referenced in another 3873 commit message. The reference to the now-nonexistent commit hash 3874 (or a substring thereof) was left as-is in any commit messages: 3875 ''')[1:]).encode()) 3876 for bad_commit_reference in self._commits_referenced_but_removed: 3877 f.write(' {}\n'.format(bad_commit_reference).encode()) 3878 f.write(b'\n') 3879 3880 if not issues_found: 3881 f.write(_("No filtering problems encountered.\n").encode()) 3882 3883 with open(os.path.join(metadata_dir, b'already_ran'), 'bw') as f: 3884 f.write(_("This file exists to allow you to filter again without --force.\n").encode()) 3885 3886 def finish(self): 3887 ''' Alternative to run() when there is no input of our own to parse, 3888 meaning that run only really needs to close the handle to fast-import 3889 and let it finish, thus making a call to "run" feel like a misnomer. ''' 3890 assert not self._input 3891 assert self._managed_output 3892 self.run() 3893 3894 def insert(self, obj, direct_insertion = False): 3895 if not direct_insertion: 3896 if type(obj) == Blob: 3897 self._tweak_blob(obj) 3898 elif type(obj) == Commit: 3899 aux_info = {'orig_parents': obj.parents, 3900 'had_file_changes': bool(obj.file_changes)} 3901 self._tweak_commit(obj, aux_info) 3902 elif type(obj) == Reset: 3903 self._tweak_reset(obj) 3904 elif type(obj) == Tag: 3905 self._tweak_tag(obj) 3906 self._insert_into_stream(obj) 3907 3908 def _insert_into_stream(self, obj): 3909 if not obj.dumped: 3910 if self._parser: 3911 self._parser.insert(obj) 3912 else: 3913 obj.dump(self._output) 3914 3915 def get_exported_and_imported_refs(self): 3916 return self._parser.get_exported_and_imported_refs() 3917 3918 def run(self): 3919 start = time.time() 3920 if not self._input and not self._output: 3921 self._run_sanity_checks() 3922 if not self._args.dry_run and not self._args.partial: 3923 self._migrate_origin_to_heads() 3924 self._setup_input(use_done_feature = True) 3925 self._setup_output() 3926 assert self._sanity_checks_handled 3927 3928 if self._input: 3929 # Create and run the filter 3930 self._repo_working_dir = self._args.source or b'.' 3931 self._parser = FastExportParser(blob_callback = self._tweak_blob, 3932 commit_callback = self._tweak_commit, 3933 tag_callback = self._tweak_tag, 3934 reset_callback = self._tweak_reset, 3935 done_callback = self._final_commands) 3936 self._parser.run(self._input, self._output) 3937 if not self._finalize_handled: 3938 self._final_commands() 3939 3940 # Make sure fast-export completed successfully 3941 if not self._args.stdin and self._fep.wait(): 3942 raise SystemExit(_("Error: fast-export failed; see above.")) # pragma: no cover 3943 self._input.close() 3944 3945 # If we're not the manager of self._output, we should avoid post-run cleanup 3946 if not self._managed_output: 3947 return 3948 3949 # Close the output and ensure fast-import successfully completes 3950 self._output.close() 3951 if not self._args.dry_run and self._fip.wait(): 3952 raise SystemExit(_("Error: fast-import failed; see above.")) # pragma: no cover 3953 3954 # With fast-export and fast-import complete, update state if requested 3955 if self._args.state_branch: 3956 self._save_marks_files() 3957 3958 # Notify user how long it took, before doing a gc and such 3959 msg = "New history written in {:.2f} seconds..." 3960 if self._args.repack: 3961 msg = "New history written in {:.2f} seconds; now repacking/cleaning..." 3962 print(msg.format(time.time()-start)) 3963 3964 # Exit early, if requested 3965 if self._args.dry_run: 3966 print(_("NOTE: Not running fast-import or cleaning up; --dry-run passed.")) 3967 if self._fe_orig: 3968 print(_(" Requested filtering can be seen by comparing:")) 3969 print(" " + decode(self._fe_orig)) 3970 else: 3971 print(_(" Requested filtering can be seen at:")) 3972 print(" " + decode(self._fe_filt)) 3973 return 3974 3975 target_working_dir = self._args.target or b'.' 3976 if self._input: 3977 self._ref_update(target_working_dir) 3978 3979 # Write out data about run 3980 self._record_metadata(self.results_tmp_dir(), self._orig_refs) 3981 3982 # Final cleanup: 3983 # If we need a repack, then nuke the reflogs and repack. 3984 # If we need a reset, do a reset --hard 3985 reset = not GitUtils.is_repository_bare(target_working_dir) 3986 RepoFilter.cleanup(target_working_dir, self._args.repack, reset, 3987 run_quietly=self._args.quiet, 3988 show_debuginfo=self._args.debug) 3989 3990 # Let user know how long it took 3991 print(_("Completely finished after {:.2f} seconds.") 3992 .format(time.time()-start)) 3993 3994def main(): 3995 setup_gettext() 3996 args = FilteringOptions.parse_args(sys.argv[1:]) 3997 if args.analyze: 3998 RepoAnalyze.run(args) 3999 else: 4000 filter = RepoFilter(args) 4001 filter.run() 4002 4003if __name__ == '__main__': 4004 main() 4005