1# Copyright (C) 2007-2011 Canonical Ltd 2# 3# This program is free software; you can redistribute it and/or modify 4# it under the terms of the GNU General Public License as published by 5# the Free Software Foundation; either version 2 of the License, or 6# (at your option) any later version. 7# 8# This program is distributed in the hope that it will be useful, 9# but WITHOUT ANY WARRANTY; without even the implied warranty of 10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11# GNU General Public License for more details. 12# 13# You should have received a copy of the GNU General Public License 14# along with this program; if not, write to the Free Software 15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 16 17import errno 18from io import ( 19 BytesIO, 20 ) 21import os 22 23from .lazy_import import lazy_import 24 25lazy_import(globals(), """ 26import gzip 27import itertools 28import patiencediff 29 30from breezy import ( 31 bencode, 32 ui, 33 ) 34""") 35from . import ( 36 errors, 37 ) 38from .i18n import gettext 39 40 41def topo_iter_keys(vf, keys=None): 42 if keys is None: 43 keys = vf.keys() 44 parents = vf.get_parent_map(keys) 45 return _topo_iter(parents, keys) 46 47 48def topo_iter(vf, versions=None): 49 if versions is None: 50 versions = vf.versions() 51 parents = vf.get_parent_map(versions) 52 return _topo_iter(parents, versions) 53 54 55def _topo_iter(parents, versions): 56 seen = set() 57 descendants = {} 58 59 def pending_parents(version): 60 if parents[version] is None: 61 return [] 62 return [v for v in parents[version] if v in versions and 63 v not in seen] 64 for version_id in versions: 65 if parents[version_id] is None: 66 # parentless 67 continue 68 for parent_id in parents[version_id]: 69 descendants.setdefault(parent_id, []).append(version_id) 70 cur = [v for v in versions if len(pending_parents(v)) == 0] 71 while len(cur) > 0: 72 next = [] 73 for version_id in cur: 74 if version_id in seen: 75 continue 76 if len(pending_parents(version_id)) != 0: 77 continue 78 next.extend(descendants.get(version_id, [])) 79 yield version_id 80 seen.add(version_id) 81 cur = next 82 83 84class MultiParent(object): 85 """A multi-parent diff""" 86 87 __slots__ = ['hunks'] 88 89 def __init__(self, hunks=None): 90 if hunks is not None: 91 self.hunks = hunks 92 else: 93 self.hunks = [] 94 95 def __repr__(self): 96 return "MultiParent(%r)" % self.hunks 97 98 def __eq__(self, other): 99 if self.__class__ is not other.__class__: 100 return False 101 return (self.hunks == other.hunks) 102 103 @staticmethod 104 def from_lines(text, parents=(), left_blocks=None): 105 """Produce a MultiParent from a list of lines and parents""" 106 def compare(parent): 107 matcher = patiencediff.PatienceSequenceMatcher(None, parent, 108 text) 109 return matcher.get_matching_blocks() 110 if len(parents) > 0: 111 if left_blocks is None: 112 left_blocks = compare(parents[0]) 113 parent_comparisons = [left_blocks] + [compare(p) for p in 114 parents[1:]] 115 else: 116 parent_comparisons = [] 117 cur_line = 0 118 new_text = NewText([]) 119 parent_text = [] 120 block_iter = [iter(i) for i in parent_comparisons] 121 diff = MultiParent([]) 122 123 def next_block(p): 124 try: 125 return next(block_iter[p]) 126 except StopIteration: 127 return None 128 cur_block = [next_block(p) for p, i in enumerate(block_iter)] 129 while cur_line < len(text): 130 best_match = None 131 for p, block in enumerate(cur_block): 132 if block is None: 133 continue 134 i, j, n = block 135 while j + n <= cur_line: 136 block = cur_block[p] = next_block(p) 137 if block is None: 138 break 139 i, j, n = block 140 if block is None: 141 continue 142 if j > cur_line: 143 continue 144 offset = cur_line - j 145 i += offset 146 j = cur_line 147 n -= offset 148 if n == 0: 149 continue 150 if best_match is None or n > best_match.num_lines: 151 best_match = ParentText(p, i, j, n) 152 if best_match is None: 153 new_text.lines.append(text[cur_line]) 154 cur_line += 1 155 else: 156 if len(new_text.lines) > 0: 157 diff.hunks.append(new_text) 158 new_text = NewText([]) 159 diff.hunks.append(best_match) 160 cur_line += best_match.num_lines 161 if len(new_text.lines) > 0: 162 diff.hunks.append(new_text) 163 return diff 164 165 def get_matching_blocks(self, parent, parent_len): 166 for hunk in self.hunks: 167 if not isinstance(hunk, ParentText) or hunk.parent != parent: 168 continue 169 yield (hunk.parent_pos, hunk.child_pos, hunk.num_lines) 170 yield parent_len, self.num_lines(), 0 171 172 def to_lines(self, parents=()): 173 """Contruct a fulltext from this diff and its parents""" 174 mpvf = MultiMemoryVersionedFile() 175 for num, parent in enumerate(parents): 176 mpvf.add_version(BytesIO(parent).readlines(), num, []) 177 mpvf.add_diff(self, 'a', list(range(len(parents)))) 178 return mpvf.get_line_list(['a'])[0] 179 180 @classmethod 181 def from_texts(cls, text, parents=()): 182 """Produce a MultiParent from a text and list of parent text""" 183 return cls.from_lines(BytesIO(text).readlines(), 184 [BytesIO(p).readlines() for p in parents]) 185 186 def to_patch(self): 187 """Yield text lines for a patch""" 188 for hunk in self.hunks: 189 for line in hunk.to_patch(): 190 yield line 191 192 def patch_len(self): 193 return len(b''.join(self.to_patch())) 194 195 def zipped_patch_len(self): 196 return len(gzip_string(self.to_patch())) 197 198 @classmethod 199 def from_patch(cls, text): 200 """Create a MultiParent from its string form""" 201 return cls._from_patch(BytesIO(text)) 202 203 @staticmethod 204 def _from_patch(lines): 205 """This is private because it is essential to split lines on \n only""" 206 line_iter = iter(lines) 207 hunks = [] 208 cur_line = None 209 while True: 210 try: 211 cur_line = next(line_iter) 212 except StopIteration: 213 break 214 first_char = cur_line[0:1] 215 if first_char == b'i': 216 num_lines = int(cur_line.split(b' ')[1]) 217 hunk_lines = [next(line_iter) for _ in range(num_lines)] 218 hunk_lines[-1] = hunk_lines[-1][:-1] 219 hunks.append(NewText(hunk_lines)) 220 elif first_char == b'\n': 221 hunks[-1].lines[-1] += b'\n' 222 else: 223 if not (first_char == b'c'): 224 raise AssertionError(first_char) 225 parent, parent_pos, child_pos, num_lines =\ 226 [int(v) for v in cur_line.split(b' ')[1:]] 227 hunks.append(ParentText(parent, parent_pos, child_pos, 228 num_lines)) 229 return MultiParent(hunks) 230 231 def range_iterator(self): 232 """Iterate through the hunks, with range indicated 233 234 kind is "new" or "parent". 235 for "new", data is a list of lines. 236 for "parent", data is (parent, parent_start, parent_end) 237 :return: a generator of (start, end, kind, data) 238 """ 239 start = 0 240 for hunk in self.hunks: 241 if isinstance(hunk, NewText): 242 kind = 'new' 243 end = start + len(hunk.lines) 244 data = hunk.lines 245 else: 246 kind = 'parent' 247 start = hunk.child_pos 248 end = start + hunk.num_lines 249 data = (hunk.parent, hunk.parent_pos, hunk.parent_pos + 250 hunk.num_lines) 251 yield start, end, kind, data 252 start = end 253 254 def num_lines(self): 255 """The number of lines in the output text""" 256 extra_n = 0 257 for hunk in reversed(self.hunks): 258 if isinstance(hunk, ParentText): 259 return hunk.child_pos + hunk.num_lines + extra_n 260 extra_n += len(hunk.lines) 261 return extra_n 262 263 def is_snapshot(self): 264 """Return true of this hunk is effectively a fulltext""" 265 if len(self.hunks) != 1: 266 return False 267 return (isinstance(self.hunks[0], NewText)) 268 269 270class NewText(object): 271 """The contents of text that is introduced by this text""" 272 273 __slots__ = ['lines'] 274 275 def __init__(self, lines): 276 self.lines = lines 277 278 def __eq__(self, other): 279 if self.__class__ is not other.__class__: 280 return False 281 return (other.lines == self.lines) 282 283 def __repr__(self): 284 return 'NewText(%r)' % self.lines 285 286 def to_patch(self): 287 yield b'i %d\n' % len(self.lines) 288 for line in self.lines: 289 yield line 290 yield b'\n' 291 292 293class ParentText(object): 294 """A reference to text present in a parent text""" 295 296 __slots__ = ['parent', 'parent_pos', 'child_pos', 'num_lines'] 297 298 def __init__(self, parent, parent_pos, child_pos, num_lines): 299 self.parent = parent 300 self.parent_pos = parent_pos 301 self.child_pos = child_pos 302 self.num_lines = num_lines 303 304 def _as_dict(self): 305 return {b'parent': self.parent, 306 b'parent_pos': self.parent_pos, 307 b'child_pos': self.child_pos, 308 b'num_lines': self.num_lines} 309 310 def __repr__(self): 311 return ('ParentText(%(parent)r, %(parent_pos)r, %(child_pos)r,' 312 ' %(num_lines)r)' % self._as_dict()) 313 314 def __eq__(self, other): 315 if self.__class__ is not other.__class__: 316 return False 317 return self._as_dict() == other._as_dict() 318 319 def to_patch(self): 320 yield (b'c %(parent)d %(parent_pos)d %(child_pos)d %(num_lines)d\n' 321 % self._as_dict()) 322 323 324class BaseVersionedFile(object): 325 """Pseudo-VersionedFile skeleton for MultiParent""" 326 327 def __init__(self, snapshot_interval=25, max_snapshots=None): 328 self._lines = {} 329 self._parents = {} 330 self._snapshots = set() 331 self.snapshot_interval = snapshot_interval 332 self.max_snapshots = max_snapshots 333 334 def versions(self): 335 return iter(self._parents) 336 337 def has_version(self, version): 338 return version in self._parents 339 340 def do_snapshot(self, version_id, parent_ids): 341 """Determine whether to perform a snapshot for this version""" 342 if self.snapshot_interval is None: 343 return False 344 if self.max_snapshots is not None and\ 345 len(self._snapshots) == self.max_snapshots: 346 return False 347 if len(parent_ids) == 0: 348 return True 349 for ignored in range(self.snapshot_interval): 350 if len(parent_ids) == 0: 351 return False 352 version_ids = parent_ids 353 parent_ids = [] 354 for version_id in version_ids: 355 if version_id not in self._snapshots: 356 parent_ids.extend(self._parents[version_id]) 357 else: 358 return True 359 360 def add_version(self, lines, version_id, parent_ids, 361 force_snapshot=None, single_parent=False): 362 """Add a version to the versionedfile 363 364 :param lines: The list of lines to add. Must be split on '\n'. 365 :param version_id: The version_id of the version to add 366 :param force_snapshot: If true, force this version to be added as a 367 snapshot version. If false, force this version to be added as a 368 diff. If none, determine this automatically. 369 :param single_parent: If true, use a single parent, rather than 370 multiple parents. 371 """ 372 if force_snapshot is None: 373 do_snapshot = self.do_snapshot(version_id, parent_ids) 374 else: 375 do_snapshot = force_snapshot 376 if do_snapshot: 377 self._snapshots.add(version_id) 378 diff = MultiParent([NewText(lines)]) 379 else: 380 if single_parent: 381 parent_lines = self.get_line_list(parent_ids[:1]) 382 else: 383 parent_lines = self.get_line_list(parent_ids) 384 diff = MultiParent.from_lines(lines, parent_lines) 385 if diff.is_snapshot(): 386 self._snapshots.add(version_id) 387 self.add_diff(diff, version_id, parent_ids) 388 self._lines[version_id] = lines 389 390 def get_parents(self, version_id): 391 return self._parents[version_id] 392 393 def make_snapshot(self, version_id): 394 snapdiff = MultiParent([NewText(self.cache_version(version_id))]) 395 self.add_diff(snapdiff, version_id, self._parents[version_id]) 396 self._snapshots.add(version_id) 397 398 def import_versionedfile(self, vf, snapshots, no_cache=True, 399 single_parent=False, verify=False): 400 """Import all revisions of a versionedfile 401 402 :param vf: The versionedfile to import 403 :param snapshots: If provided, the revisions to make snapshots of. 404 Otherwise, this will be auto-determined 405 :param no_cache: If true, clear the cache after every add. 406 :param single_parent: If true, omit all but one parent text, (but 407 retain parent metadata). 408 """ 409 if not (no_cache or not verify): 410 raise ValueError() 411 revisions = set(vf.versions()) 412 total = len(revisions) 413 with ui.ui_factory.nested_progress_bar() as pb: 414 while len(revisions) > 0: 415 added = set() 416 for revision in revisions: 417 parents = vf.get_parents(revision) 418 if [p for p in parents if p not in self._parents] != []: 419 continue 420 lines = [a + b' ' + l for a, l in 421 vf.annotate(revision)] 422 if snapshots is None: 423 force_snapshot = None 424 else: 425 force_snapshot = (revision in snapshots) 426 self.add_version(lines, revision, parents, force_snapshot, 427 single_parent) 428 added.add(revision) 429 if no_cache: 430 self.clear_cache() 431 vf.clear_cache() 432 if verify: 433 if not (lines == self.get_line_list([revision])[0]): 434 raise AssertionError() 435 self.clear_cache() 436 pb.update(gettext('Importing revisions'), 437 (total - len(revisions)) + len(added), total) 438 revisions = [r for r in revisions if r not in added] 439 440 def select_snapshots(self, vf): 441 """Determine which versions to add as snapshots""" 442 build_ancestors = {} 443 snapshots = set() 444 for version_id in topo_iter(vf): 445 potential_build_ancestors = set(vf.get_parents(version_id)) 446 parents = vf.get_parents(version_id) 447 if len(parents) == 0: 448 snapshots.add(version_id) 449 build_ancestors[version_id] = set() 450 else: 451 for parent in vf.get_parents(version_id): 452 potential_build_ancestors.update(build_ancestors[parent]) 453 if len(potential_build_ancestors) > self.snapshot_interval: 454 snapshots.add(version_id) 455 build_ancestors[version_id] = set() 456 else: 457 build_ancestors[version_id] = potential_build_ancestors 458 return snapshots 459 460 def select_by_size(self, num): 461 """Select snapshots for minimum output size""" 462 num -= len(self._snapshots) 463 new_snapshots = self.get_size_ranking()[-num:] 464 return [v for n, v in new_snapshots] 465 466 def get_size_ranking(self): 467 """Get versions ranked by size""" 468 versions = [] 469 for version_id in self.versions(): 470 if version_id in self._snapshots: 471 continue 472 diff_len = self.get_diff(version_id).patch_len() 473 snapshot_len = MultiParent([NewText( 474 self.cache_version(version_id))]).patch_len() 475 versions.append((snapshot_len - diff_len, version_id)) 476 versions.sort() 477 return versions 478 479 def import_diffs(self, vf): 480 """Import the diffs from another pseudo-versionedfile""" 481 for version_id in vf.versions(): 482 self.add_diff(vf.get_diff(version_id), version_id, 483 vf._parents[version_id]) 484 485 def get_build_ranking(self): 486 """Return revisions sorted by how much they reduce build complexity""" 487 could_avoid = {} 488 referenced_by = {} 489 for version_id in topo_iter(self): 490 could_avoid[version_id] = set() 491 if version_id not in self._snapshots: 492 for parent_id in self._parents[version_id]: 493 could_avoid[version_id].update(could_avoid[parent_id]) 494 could_avoid[version_id].update(self._parents) 495 could_avoid[version_id].discard(version_id) 496 for avoid_id in could_avoid[version_id]: 497 referenced_by.setdefault(avoid_id, set()).add(version_id) 498 available_versions = list(self.versions()) 499 ranking = [] 500 while len(available_versions) > 0: 501 available_versions.sort(key=lambda x: 502 len(could_avoid[x]) * 503 len(referenced_by.get(x, []))) 504 selected = available_versions.pop() 505 ranking.append(selected) 506 for version_id in referenced_by[selected]: 507 could_avoid[version_id].difference_update( 508 could_avoid[selected]) 509 for version_id in could_avoid[selected]: 510 referenced_by[version_id].difference_update( 511 referenced_by[selected] 512 ) 513 return ranking 514 515 def clear_cache(self): 516 self._lines.clear() 517 518 def get_line_list(self, version_ids): 519 return [self.cache_version(v) for v in version_ids] 520 521 def cache_version(self, version_id): 522 try: 523 return self._lines[version_id] 524 except KeyError: 525 pass 526 diff = self.get_diff(version_id) 527 lines = [] 528 reconstructor = _Reconstructor(self, self._lines, self._parents) 529 reconstructor.reconstruct_version(lines, version_id) 530 self._lines[version_id] = lines 531 return lines 532 533 534class MultiMemoryVersionedFile(BaseVersionedFile): 535 """Memory-backed pseudo-versionedfile""" 536 537 def __init__(self, snapshot_interval=25, max_snapshots=None): 538 BaseVersionedFile.__init__(self, snapshot_interval, max_snapshots) 539 self._diffs = {} 540 541 def add_diff(self, diff, version_id, parent_ids): 542 self._diffs[version_id] = diff 543 self._parents[version_id] = parent_ids 544 545 def get_diff(self, version_id): 546 try: 547 return self._diffs[version_id] 548 except KeyError: 549 raise errors.RevisionNotPresent(version_id, self) 550 551 def destroy(self): 552 self._diffs = {} 553 554 555class MultiVersionedFile(BaseVersionedFile): 556 """Disk-backed pseudo-versionedfile""" 557 558 def __init__(self, filename, snapshot_interval=25, max_snapshots=None): 559 BaseVersionedFile.__init__(self, snapshot_interval, max_snapshots) 560 self._filename = filename 561 self._diff_offset = {} 562 563 def get_diff(self, version_id): 564 start, count = self._diff_offset[version_id] 565 with open(self._filename + '.mpknit', 'rb') as infile: 566 infile.seek(start) 567 sio = BytesIO(infile.read(count)) 568 with gzip.GzipFile(None, mode='rb', fileobj=sio) as zip_file: 569 file_version_id = zip_file.readline() 570 content = zip_file.read() 571 return MultiParent.from_patch(content) 572 573 def add_diff(self, diff, version_id, parent_ids): 574 with open(self._filename + '.mpknit', 'ab') as outfile: 575 outfile.seek(0, 2) # workaround for windows bug: 576 # .tell() for files opened in 'ab' mode 577 # before any write returns 0 578 start = outfile.tell() 579 with gzip.GzipFile(None, mode='ab', fileobj=outfile) as zipfile: 580 zipfile.writelines(itertools.chain( 581 [b'version %s\n' % version_id], diff.to_patch())) 582 end = outfile.tell() 583 self._diff_offset[version_id] = (start, end - start) 584 self._parents[version_id] = parent_ids 585 586 def destroy(self): 587 try: 588 os.unlink(self._filename + '.mpknit') 589 except OSError as e: 590 if e.errno != errno.ENOENT: 591 raise 592 try: 593 os.unlink(self._filename + '.mpidx') 594 except OSError as e: 595 if e.errno != errno.ENOENT: 596 raise 597 598 def save(self): 599 open(self._filename + '.mpidx', 'wb').write(bencode.bencode( 600 (self._parents, list(self._snapshots), self._diff_offset))) 601 602 def load(self): 603 self._parents, snapshots, self._diff_offset = bencode.bdecode( 604 open(self._filename + '.mpidx', 'rb').read()) 605 self._snapshots = set(snapshots) 606 607 608class _Reconstructor(object): 609 """Build a text from the diffs, ancestry graph and cached lines""" 610 611 def __init__(self, diffs, lines, parents): 612 self.diffs = diffs 613 self.lines = lines 614 self.parents = parents 615 self.cursor = {} 616 617 def reconstruct(self, lines, parent_text, version_id): 618 """Append the lines referred to by a ParentText to lines""" 619 parent_id = self.parents[version_id][parent_text.parent] 620 end = parent_text.parent_pos + parent_text.num_lines 621 return self._reconstruct(lines, parent_id, parent_text.parent_pos, 622 end) 623 624 def _reconstruct(self, lines, req_version_id, req_start, req_end): 625 """Append lines for the requested version_id range""" 626 # stack of pending range requests 627 if req_start == req_end: 628 return 629 pending_reqs = [(req_version_id, req_start, req_end)] 630 while len(pending_reqs) > 0: 631 req_version_id, req_start, req_end = pending_reqs.pop() 632 # lazily allocate cursors for versions 633 if req_version_id in self.lines: 634 lines.extend(self.lines[req_version_id][req_start:req_end]) 635 continue 636 try: 637 start, end, kind, data, iterator = self.cursor[req_version_id] 638 except KeyError: 639 iterator = self.diffs.get_diff(req_version_id).range_iterator() 640 start, end, kind, data = next(iterator) 641 if start > req_start: 642 iterator = self.diffs.get_diff(req_version_id).range_iterator() 643 start, end, kind, data = next(iterator) 644 645 # find the first hunk relevant to the request 646 while end <= req_start: 647 start, end, kind, data = next(iterator) 648 self.cursor[req_version_id] = start, end, kind, data, iterator 649 # if the hunk can't satisfy the whole request, split it in two, 650 # and leave the second half for later. 651 if req_end > end: 652 pending_reqs.append((req_version_id, end, req_end)) 653 req_end = end 654 if kind == 'new': 655 lines.extend(data[req_start - start: (req_end - start)]) 656 else: 657 # If the hunk is a ParentText, rewrite it as a range request 658 # for the parent, and make it the next pending request. 659 parent, parent_start, parent_end = data 660 new_version_id = self.parents[req_version_id][parent] 661 new_start = parent_start + req_start - start 662 new_end = parent_end + req_end - end 663 pending_reqs.append((new_version_id, new_start, new_end)) 664 665 def reconstruct_version(self, lines, version_id): 666 length = self.diffs.get_diff(version_id).num_lines() 667 return self._reconstruct(lines, version_id, 0, length) 668 669 670def gzip_string(lines): 671 sio = BytesIO() 672 with gzip.GzipFile(None, mode='wb', fileobj=sio) as data_file: 673 data_file.writelines(lines) 674 return sio.getvalue() 675