1# Copyright (C) 2005, 2009 Canonical Ltd 2# 3# This program is free software; you can redistribute it and/or modify 4# it under the terms of the GNU General Public License as published by 5# the Free Software Foundation; either version 2 of the License, or 6# (at your option) any later version. 7# 8# This program is distributed in the hope that it will be useful, 9# but WITHOUT ANY WARRANTY; without even the implied warranty of 10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11# GNU General Public License for more details. 12# 13# You should have received a copy of the GNU General Public License 14# along with this program; if not, write to the Free Software 15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 16 17# Author: Martin Pool <mbp@canonical.com> 18 19"""Weave - storage of related text file versions""" 20 21# XXX: If we do weaves this way, will a merge still behave the same 22# way if it's done in a different order? That's a pretty desirable 23# property. 24 25# TODO: Nothing here so far assumes the lines are really \n newlines, 26# rather than being split up in some other way. We could accommodate 27# binaries, perhaps by naively splitting on \n or perhaps using 28# something like a rolling checksum. 29 30# TODO: End marker for each version so we can stop reading? 31 32# TODO: Check that no insertion occurs inside a deletion that was 33# active in the version of the insertion. 34 35# TODO: In addition to the SHA-1 check, perhaps have some code that 36# checks structural constraints of the weave: ie that insertions are 37# properly nested, that there is no text outside of an insertion, that 38# insertions or deletions are not repeated, etc. 39 40# TODO: Parallel-extract that passes back each line along with a 41# description of which revisions include it. Nice for checking all 42# shas or calculating stats in parallel. 43 44# TODO: Using a single _extract routine and then processing the output 45# is probably inefficient. It's simple enough that we can afford to 46# have slight specializations for different ways its used: annotate, 47# basis for add, get, etc. 48 49# TODO: Probably the API should work only in names to hide the integer 50# indexes from the user. 51 52# TODO: Is there any potential performance win by having an add() 53# variant that is passed a pre-cooked version of the single basis 54# version? 55 56# TODO: Reweave can possibly be made faster by remembering diffs 57# where the basis and destination are unchanged. 58 59# FIXME: Sometimes we will be given a parents list for a revision 60# that includes some redundant parents (i.e. already a parent of 61# something in the list.) We should eliminate them. This can 62# be done fairly efficiently because the sequence numbers constrain 63# the possible relationships. 64 65# FIXME: the conflict markers should be *7* characters 66 67from copy import copy 68from io import BytesIO 69import os 70import patiencediff 71 72from ..lazy_import import lazy_import 73lazy_import(globals(), """ 74from breezy import tsort 75""") 76from .. import ( 77 errors, 78 osutils, 79 ) 80from ..errors import ( 81 RevisionAlreadyPresent, 82 RevisionNotPresent, 83 ) 84from ..osutils import dirname, sha, sha_strings, split_lines 85from ..revision import NULL_REVISION 86from ..trace import mutter 87from .versionedfile import ( 88 AbsentContentFactory, 89 adapter_registry, 90 ContentFactory, 91 ExistingContent, 92 sort_groupcompress, 93 UnavailableRepresentation, 94 VersionedFile, 95 ) 96from .weavefile import _read_weave_v5, write_weave_v5 97 98 99class WeaveError(errors.BzrError): 100 101 _fmt = "Error in processing weave: %(msg)s" 102 103 def __init__(self, msg=None): 104 errors.BzrError.__init__(self) 105 self.msg = msg 106 107 108class WeaveRevisionAlreadyPresent(WeaveError): 109 110 _fmt = "Revision {%(revision_id)s} already present in %(weave)s" 111 112 def __init__(self, revision_id, weave): 113 114 WeaveError.__init__(self) 115 self.revision_id = revision_id 116 self.weave = weave 117 118 119class WeaveRevisionNotPresent(WeaveError): 120 121 _fmt = "Revision {%(revision_id)s} not present in %(weave)s" 122 123 def __init__(self, revision_id, weave): 124 WeaveError.__init__(self) 125 self.revision_id = revision_id 126 self.weave = weave 127 128 129class WeaveFormatError(WeaveError): 130 131 _fmt = "Weave invariant violated: %(what)s" 132 133 def __init__(self, what): 134 WeaveError.__init__(self) 135 self.what = what 136 137 138class WeaveParentMismatch(WeaveError): 139 140 _fmt = "Parents are mismatched between two revisions. %(msg)s" 141 142 143class WeaveInvalidChecksum(WeaveError): 144 145 _fmt = "Text did not match its checksum: %(msg)s" 146 147 148class WeaveTextDiffers(WeaveError): 149 150 _fmt = ("Weaves differ on text content. Revision:" 151 " {%(revision_id)s}, %(weave_a)s, %(weave_b)s") 152 153 def __init__(self, revision_id, weave_a, weave_b): 154 WeaveError.__init__(self) 155 self.revision_id = revision_id 156 self.weave_a = weave_a 157 self.weave_b = weave_b 158 159 160class WeaveContentFactory(ContentFactory): 161 """Content factory for streaming from weaves. 162 163 :seealso ContentFactory: 164 """ 165 166 def __init__(self, version, weave): 167 """Create a WeaveContentFactory for version from weave.""" 168 ContentFactory.__init__(self) 169 self.sha1 = weave.get_sha1s([version])[version] 170 self.key = (version,) 171 parents = weave.get_parent_map([version])[version] 172 self.parents = tuple((parent,) for parent in parents) 173 self.storage_kind = 'fulltext' 174 self._weave = weave 175 176 def get_bytes_as(self, storage_kind): 177 if storage_kind == 'fulltext': 178 return self._weave.get_text(self.key[-1]) 179 elif storage_kind in ('chunked', 'lines'): 180 return self._weave.get_lines(self.key[-1]) 181 else: 182 raise UnavailableRepresentation(self.key, storage_kind, 'fulltext') 183 184 def iter_bytes_as(self, storage_kind): 185 if storage_kind in ('chunked', 'lines'): 186 return iter(self._weave.get_lines(self.key[-1])) 187 else: 188 raise UnavailableRepresentation(self.key, storage_kind, 'fulltext') 189 190 191class Weave(VersionedFile): 192 """weave - versioned text file storage. 193 194 A Weave manages versions of line-based text files, keeping track 195 of the originating version for each line. 196 197 To clients the "lines" of the file are represented as a list of strings. 198 These strings will typically have terminal newline characters, but 199 this is not required. In particular files commonly do not have a newline 200 at the end of the file. 201 202 Texts can be identified in either of two ways: 203 204 * a nonnegative index number. 205 206 * a version-id string. 207 208 Typically the index number will be valid only inside this weave and 209 the version-id is used to reference it in the larger world. 210 211 The weave is represented as a list mixing edit instructions and 212 literal text. Each entry in _weave can be either a string (or 213 unicode), or a tuple. If a string, it means that the given line 214 should be output in the currently active revisions. 215 216 If a tuple, it gives a processing instruction saying in which 217 revisions the enclosed lines are active. The tuple has the form 218 (instruction, version). 219 220 The instruction can be '{' or '}' for an insertion block, and '[' 221 and ']' for a deletion block respectively. The version is the 222 integer version index. There is no replace operator, only deletes 223 and inserts. For '}', the end of an insertion, there is no 224 version parameter because it always closes the most recently 225 opened insertion. 226 227 Constraints/notes: 228 229 * A later version can delete lines that were introduced by any 230 number of ancestor versions; this implies that deletion 231 instructions can span insertion blocks without regard to the 232 insertion block's nesting. 233 234 * Similarly, deletions need not be properly nested with regard to 235 each other, because they might have been generated by 236 independent revisions. 237 238 * Insertions are always made by inserting a new bracketed block 239 into a single point in the previous weave. This implies they 240 can nest but not overlap, and the nesting must always have later 241 insertions on the inside. 242 243 * It doesn't seem very useful to have an active insertion 244 inside an inactive insertion, but it might happen. 245 246 * Therefore, all instructions are always"considered"; that 247 is passed onto and off the stack. An outer inactive block 248 doesn't disable an inner block. 249 250 * Lines are enabled if the most recent enclosing insertion is 251 active and none of the enclosing deletions are active. 252 253 * There is no point having a deletion directly inside its own 254 insertion; you might as well just not write it. And there 255 should be no way to get an earlier version deleting a later 256 version. 257 258 _weave 259 Text of the weave; list of control instruction tuples and strings. 260 261 _parents 262 List of parents, indexed by version number. 263 It is only necessary to store the minimal set of parents for 264 each version; the parent's parents are implied. 265 266 _sha1s 267 List of hex SHA-1 of each version. 268 269 _names 270 List of symbolic names for each version. Each should be unique. 271 272 _name_map 273 For each name, the version number. 274 275 _weave_name 276 Descriptive name of this weave; typically the filename if known. 277 Set by read_weave. 278 """ 279 280 __slots__ = ['_weave', '_parents', '_sha1s', '_names', '_name_map', 281 '_weave_name', '_matcher', '_allow_reserved'] 282 283 def __init__(self, weave_name=None, access_mode='w', matcher=None, 284 get_scope=None, allow_reserved=False): 285 """Create a weave. 286 287 :param get_scope: A callable that returns an opaque object to be used 288 for detecting when this weave goes out of scope (should stop 289 answering requests or allowing mutation). 290 """ 291 super(Weave, self).__init__() 292 self._weave = [] 293 self._parents = [] 294 self._sha1s = [] 295 self._names = [] 296 self._name_map = {} 297 self._weave_name = weave_name 298 if matcher is None: 299 self._matcher = patiencediff.PatienceSequenceMatcher 300 else: 301 self._matcher = matcher 302 if get_scope is None: 303 def get_scope(): 304 return None 305 self._get_scope = get_scope 306 self._scope = get_scope() 307 self._access_mode = access_mode 308 self._allow_reserved = allow_reserved 309 310 def __repr__(self): 311 return "Weave(%r)" % self._weave_name 312 313 def _check_write_ok(self): 314 """Is the versioned file marked as 'finished' ? Raise if it is.""" 315 if self._get_scope() != self._scope: 316 raise errors.OutSideTransaction() 317 if self._access_mode != 'w': 318 raise errors.ReadOnlyObjectDirtiedError(self) 319 320 def copy(self): 321 """Return a deep copy of self. 322 323 The copy can be modified without affecting the original weave.""" 324 other = Weave() 325 other._weave = self._weave[:] 326 other._parents = self._parents[:] 327 other._sha1s = self._sha1s[:] 328 other._names = self._names[:] 329 other._name_map = self._name_map.copy() 330 other._weave_name = self._weave_name 331 return other 332 333 def __eq__(self, other): 334 if not isinstance(other, Weave): 335 return False 336 return self._parents == other._parents \ 337 and self._weave == other._weave \ 338 and self._sha1s == other._sha1s 339 340 def __ne__(self, other): 341 return not self.__eq__(other) 342 343 def _idx_to_name(self, version): 344 return self._names[version] 345 346 def _lookup(self, name): 347 """Convert symbolic version name to index.""" 348 if not self._allow_reserved: 349 self.check_not_reserved_id(name) 350 try: 351 return self._name_map[name] 352 except KeyError: 353 raise RevisionNotPresent(name, self._weave_name) 354 355 def versions(self): 356 """See VersionedFile.versions.""" 357 return self._names[:] 358 359 def has_version(self, version_id): 360 """See VersionedFile.has_version.""" 361 return (version_id in self._name_map) 362 363 __contains__ = has_version 364 365 def get_record_stream(self, versions, ordering, include_delta_closure): 366 """Get a stream of records for versions. 367 368 :param versions: The versions to include. Each version is a tuple 369 (version,). 370 :param ordering: Either 'unordered' or 'topological'. A topologically 371 sorted stream has compression parents strictly before their 372 children. 373 :param include_delta_closure: If True then the closure across any 374 compression parents will be included (in the opaque data). 375 :return: An iterator of ContentFactory objects, each of which is only 376 valid until the iterator is advanced. 377 """ 378 versions = [version[-1] for version in versions] 379 if ordering == 'topological': 380 parents = self.get_parent_map(versions) 381 new_versions = tsort.topo_sort(parents) 382 new_versions.extend(set(versions).difference(set(parents))) 383 versions = new_versions 384 elif ordering == 'groupcompress': 385 parents = self.get_parent_map(versions) 386 new_versions = sort_groupcompress(parents) 387 new_versions.extend(set(versions).difference(set(parents))) 388 versions = new_versions 389 for version in versions: 390 if version in self: 391 yield WeaveContentFactory(version, self) 392 else: 393 yield AbsentContentFactory((version,)) 394 395 def get_parent_map(self, version_ids): 396 """See VersionedFile.get_parent_map.""" 397 result = {} 398 for version_id in version_ids: 399 if version_id == NULL_REVISION: 400 parents = () 401 else: 402 try: 403 parents = tuple( 404 map(self._idx_to_name, 405 self._parents[self._lookup(version_id)])) 406 except RevisionNotPresent: 407 continue 408 result[version_id] = parents 409 return result 410 411 def get_parents_with_ghosts(self, version_id): 412 raise NotImplementedError(self.get_parents_with_ghosts) 413 414 def insert_record_stream(self, stream): 415 """Insert a record stream into this versioned file. 416 417 :param stream: A stream of records to insert. 418 :return: None 419 :seealso VersionedFile.get_record_stream: 420 """ 421 adapters = {} 422 for record in stream: 423 # Raise an error when a record is missing. 424 if record.storage_kind == 'absent': 425 raise RevisionNotPresent([record.key[0]], self) 426 # adapt to non-tuple interface 427 parents = [parent[0] for parent in record.parents] 428 if record.storage_kind in ('fulltext', 'chunked', 'lines'): 429 self.add_lines( 430 record.key[0], parents, 431 record.get_bytes_as('lines')) 432 else: 433 adapter_key = record.storage_kind, 'lines' 434 try: 435 adapter = adapters[adapter_key] 436 except KeyError: 437 adapter_factory = adapter_registry.get(adapter_key) 438 adapter = adapter_factory(self) 439 adapters[adapter_key] = adapter 440 lines = adapter.get_bytes(record, 'lines') 441 try: 442 self.add_lines(record.key[0], parents, lines) 443 except RevisionAlreadyPresent: 444 pass 445 446 def _check_repeated_add(self, name, parents, text, sha1): 447 """Check that a duplicated add is OK. 448 449 If it is, return the (old) index; otherwise raise an exception. 450 """ 451 idx = self._lookup(name) 452 if sorted(self._parents[idx]) != sorted(parents) \ 453 or sha1 != self._sha1s[idx]: 454 raise RevisionAlreadyPresent(name, self._weave_name) 455 return idx 456 457 def _add_lines(self, version_id, parents, lines, parent_texts, 458 left_matching_blocks, nostore_sha, random_id, 459 check_content): 460 """See VersionedFile.add_lines.""" 461 idx = self._add(version_id, lines, list(map(self._lookup, parents)), 462 nostore_sha=nostore_sha) 463 return sha_strings(lines), sum(map(len, lines)), idx 464 465 def _add(self, version_id, lines, parents, sha1=None, nostore_sha=None): 466 """Add a single text on top of the weave. 467 468 Returns the index number of the newly added version. 469 470 version_id 471 Symbolic name for this version. 472 (Typically the revision-id of the revision that added it.) 473 If None, a name will be allocated based on the hash. (sha1:SHAHASH) 474 475 parents 476 List or set of direct parent version numbers. 477 478 lines 479 Sequence of lines to be added in the new version. 480 481 :param nostore_sha: See VersionedFile.add_lines. 482 """ 483 self._check_lines_not_unicode(lines) 484 self._check_lines_are_lines(lines) 485 if not sha1: 486 sha1 = sha_strings(lines) 487 if sha1 == nostore_sha: 488 raise ExistingContent 489 if version_id is None: 490 version_id = b"sha1:" + sha1 491 if version_id in self._name_map: 492 return self._check_repeated_add(version_id, parents, lines, sha1) 493 494 self._check_versions(parents) 495 new_version = len(self._parents) 496 497 # if we abort after here the (in-memory) weave will be corrupt because 498 # only some fields are updated 499 # XXX: FIXME implement a succeed-or-fail of the rest of this routine. 500 # - Robert Collins 20060226 501 self._parents.append(parents[:]) 502 self._sha1s.append(sha1) 503 self._names.append(version_id) 504 self._name_map[version_id] = new_version 505 506 if not parents: 507 # special case; adding with no parents revision; can do 508 # this more quickly by just appending unconditionally. 509 # even more specially, if we're adding an empty text we 510 # need do nothing at all. 511 if lines: 512 self._weave.append((b'{', new_version)) 513 self._weave.extend(lines) 514 self._weave.append((b'}', None)) 515 return new_version 516 517 if len(parents) == 1: 518 pv = list(parents)[0] 519 if sha1 == self._sha1s[pv]: 520 # special case: same as the single parent 521 return new_version 522 523 ancestors = self._inclusions(parents) 524 525 l = self._weave 526 527 # basis a list of (origin, lineno, line) 528 basis_lineno = [] 529 basis_lines = [] 530 for origin, lineno, line in self._extract(ancestors): 531 basis_lineno.append(lineno) 532 basis_lines.append(line) 533 534 # another small special case: a merge, producing the same text 535 # as auto-merge 536 if lines == basis_lines: 537 return new_version 538 539 # add a sentinel, because we can also match against the final line 540 basis_lineno.append(len(self._weave)) 541 542 # XXX: which line of the weave should we really consider 543 # matches the end of the file? the current code says it's the 544 # last line of the weave? 545 546 # print 'basis_lines:', basis_lines 547 # print 'new_lines: ', lines 548 549 s = self._matcher(None, basis_lines, lines) 550 551 # offset gives the number of lines that have been inserted 552 # into the weave up to the current point; if the original edit 553 # instruction says to change line A then we actually change (A+offset) 554 offset = 0 555 556 for tag, i1, i2, j1, j2 in s.get_opcodes(): 557 # i1,i2 are given in offsets within basis_lines; we need to map 558 # them back to offsets within the entire weave print 'raw match', 559 # tag, i1, i2, j1, j2 560 if tag == 'equal': 561 continue 562 i1 = basis_lineno[i1] 563 i2 = basis_lineno[i2] 564 # the deletion and insertion are handled separately. 565 # first delete the region. 566 if i1 != i2: 567 self._weave.insert(i1 + offset, (b'[', new_version)) 568 self._weave.insert(i2 + offset + 1, (b']', new_version)) 569 offset += 2 570 571 if j1 != j2: 572 # there may have been a deletion spanning up to 573 # i2; we want to insert after this region to make sure 574 # we don't destroy ourselves 575 i = i2 + offset 576 self._weave[i:i] = ([(b'{', new_version)] + 577 lines[j1:j2] + 578 [(b'}', None)]) 579 offset += 2 + (j2 - j1) 580 return new_version 581 582 def _inclusions(self, versions): 583 """Return set of all ancestors of given version(s).""" 584 if not len(versions): 585 return set() 586 i = set(versions) 587 for v in range(max(versions), 0, -1): 588 if v in i: 589 # include all its parents 590 i.update(self._parents[v]) 591 return i 592 593 def get_ancestry(self, version_ids, topo_sorted=True): 594 """See VersionedFile.get_ancestry.""" 595 if isinstance(version_ids, bytes): 596 version_ids = [version_ids] 597 i = self._inclusions([self._lookup(v) for v in version_ids]) 598 return set(self._idx_to_name(v) for v in i) 599 600 def _check_versions(self, indexes): 601 """Check everything in the sequence of indexes is valid""" 602 for i in indexes: 603 try: 604 self._parents[i] 605 except IndexError: 606 raise IndexError("invalid version number %r" % i) 607 608 def _compatible_parents(self, my_parents, other_parents): 609 """During join check that other_parents are joinable with my_parents. 610 611 Joinable is defined as 'is a subset of' - supersets may require 612 regeneration of diffs, but subsets do not. 613 """ 614 return len(other_parents.difference(my_parents)) == 0 615 616 def annotate(self, version_id): 617 """Return a list of (version-id, line) tuples for version_id. 618 619 The index indicates when the line originated in the weave.""" 620 incls = [self._lookup(version_id)] 621 return [(self._idx_to_name(origin), text) for origin, lineno, text in 622 self._extract(incls)] 623 624 def iter_lines_added_or_present_in_versions(self, version_ids=None, 625 pb=None): 626 """See VersionedFile.iter_lines_added_or_present_in_versions().""" 627 if version_ids is None: 628 version_ids = self.versions() 629 version_ids = set(version_ids) 630 for lineno, inserted, deletes, line in self._walk_internal( 631 version_ids): 632 if inserted not in version_ids: 633 continue 634 if not line.endswith(b'\n'): 635 yield line + b'\n', inserted 636 else: 637 yield line, inserted 638 639 def _walk_internal(self, version_ids=None): 640 """Helper method for weave actions.""" 641 642 istack = [] 643 dset = set() 644 645 lineno = 0 # line of weave, 0-based 646 647 for l in self._weave: 648 if l.__class__ == tuple: 649 c, v = l 650 if c == b'{': 651 istack.append(self._names[v]) 652 elif c == b'}': 653 istack.pop() 654 elif c == b'[': 655 dset.add(self._names[v]) 656 elif c == b']': 657 dset.remove(self._names[v]) 658 else: 659 raise WeaveFormatError('unexpected instruction %r' % v) 660 else: 661 yield lineno, istack[-1], frozenset(dset), l 662 lineno += 1 663 664 if istack: 665 raise WeaveFormatError("unclosed insertion blocks " 666 "at end of weave: %s" % istack) 667 if dset: 668 raise WeaveFormatError( 669 "unclosed deletion blocks at end of weave: %s" % dset) 670 671 def plan_merge(self, ver_a, ver_b): 672 """Return pseudo-annotation indicating how the two versions merge. 673 674 This is computed between versions a and b and their common 675 base. 676 677 Weave lines present in none of them are skipped entirely. 678 """ 679 inc_a = self.get_ancestry([ver_a]) 680 inc_b = self.get_ancestry([ver_b]) 681 inc_c = inc_a & inc_b 682 683 for lineno, insert, deleteset, line in self._walk_internal( 684 [ver_a, ver_b]): 685 if deleteset & inc_c: 686 # killed in parent; can't be in either a or b 687 # not relevant to our work 688 yield 'killed-base', line 689 elif insert in inc_c: 690 # was inserted in base 691 killed_a = bool(deleteset & inc_a) 692 killed_b = bool(deleteset & inc_b) 693 if killed_a and killed_b: 694 yield 'killed-both', line 695 elif killed_a: 696 yield 'killed-a', line 697 elif killed_b: 698 yield 'killed-b', line 699 else: 700 yield 'unchanged', line 701 elif insert in inc_a: 702 if deleteset & inc_a: 703 yield 'ghost-a', line 704 else: 705 # new in A; not in B 706 yield 'new-a', line 707 elif insert in inc_b: 708 if deleteset & inc_b: 709 yield 'ghost-b', line 710 else: 711 yield 'new-b', line 712 else: 713 # not in either revision 714 yield 'irrelevant', line 715 716 def _extract(self, versions): 717 """Yield annotation of lines in included set. 718 719 Yields a sequence of tuples (origin, lineno, text), where 720 origin is the origin version, lineno the index in the weave, 721 and text the text of the line. 722 723 The set typically but not necessarily corresponds to a version. 724 """ 725 for i in versions: 726 if not isinstance(i, int): 727 raise ValueError(i) 728 729 included = self._inclusions(versions) 730 731 istack = [] 732 iset = set() 733 dset = set() 734 735 lineno = 0 # line of weave, 0-based 736 737 isactive = None 738 739 result = [] 740 741 # wow. 742 # 449 0 4474.6820 2356.5590 breezy.weave:556(_extract) 743 # +285282 0 1676.8040 1676.8040 +<isinstance> 744 # 1.6 seconds in 'isinstance'. 745 # changing the first isinstance: 746 # 449 0 2814.2660 1577.1760 breezy.weave:556(_extract) 747 # +140414 0 762.8050 762.8050 +<isinstance> 748 # note that the inline time actually dropped (less function calls) 749 # and total processing time was halved. 750 # we're still spending ~1/4 of the method in isinstance though. 751 # so lets hard code the acceptable string classes we expect: 752 # 449 0 1202.9420 786.2930 breezy.weave:556(_extract) 753 # +71352 0 377.5560 377.5560 +<method 'append' of 'list' 754 # objects> 755 # yay, down to ~1/4 the initial extract time, and our inline time 756 # has shrunk again, with isinstance no longer dominating. 757 # tweaking the stack inclusion test to use a set gives: 758 # 449 0 1122.8030 713.0080 breezy.weave:556(_extract) 759 # +71352 0 354.9980 354.9980 +<method 'append' of 'list' 760 # objects> 761 # - a 5% win, or possibly just noise. However with large istacks that 762 # 'in' test could dominate, so I'm leaving this change in place - when 763 # its fast enough to consider profiling big datasets we can review. 764 765 for l in self._weave: 766 if l.__class__ == tuple: 767 c, v = l 768 isactive = None 769 if c == b'{': 770 istack.append(v) 771 iset.add(v) 772 elif c == b'}': 773 iset.remove(istack.pop()) 774 elif c == b'[': 775 if v in included: 776 dset.add(v) 777 elif c == b']': 778 if v in included: 779 dset.remove(v) 780 else: 781 raise AssertionError() 782 else: 783 if isactive is None: 784 isactive = (not dset) and istack and ( 785 istack[-1] in included) 786 if isactive: 787 result.append((istack[-1], lineno, l)) 788 lineno += 1 789 if istack: 790 raise WeaveFormatError("unclosed insertion blocks " 791 "at end of weave: %s" % istack) 792 if dset: 793 raise WeaveFormatError( 794 "unclosed deletion blocks at end of weave: %s" % dset) 795 return result 796 797 def _maybe_lookup(self, name_or_index): 798 """Convert possible symbolic name to index, or pass through indexes. 799 800 NOT FOR PUBLIC USE. 801 """ 802 # GZ 2017-04-01: This used to check for long as well, but I don't think 803 # there are python implementations with sys.maxsize > sys.maxint 804 if isinstance(name_or_index, int): 805 return name_or_index 806 else: 807 return self._lookup(name_or_index) 808 809 def get_lines(self, version_id): 810 """See VersionedFile.get_lines().""" 811 int_index = self._maybe_lookup(version_id) 812 result = [line for (origin, lineno, line) 813 in self._extract([int_index])] 814 expected_sha1 = self._sha1s[int_index] 815 measured_sha1 = sha_strings(result) 816 if measured_sha1 != expected_sha1: 817 raise WeaveInvalidChecksum( 818 'file %s, revision %s, expected: %s, measured %s' 819 % (self._weave_name, version_id, 820 expected_sha1, measured_sha1)) 821 return result 822 823 def get_sha1s(self, version_ids): 824 """See VersionedFile.get_sha1s().""" 825 result = {} 826 for v in version_ids: 827 result[v] = self._sha1s[self._lookup(v)] 828 return result 829 830 def num_versions(self): 831 """How many versions are in this weave?""" 832 return len(self._parents) 833 834 __len__ = num_versions 835 836 def check(self, progress_bar=None): 837 # TODO evaluate performance hit of using string sets in this routine. 838 # TODO: check no circular inclusions 839 # TODO: create a nested progress bar 840 for version in range(self.num_versions()): 841 inclusions = list(self._parents[version]) 842 if inclusions: 843 inclusions.sort() 844 if inclusions[-1] >= version: 845 raise WeaveFormatError( 846 "invalid included version %d for index %d" 847 % (inclusions[-1], version)) 848 849 # try extracting all versions; parallel extraction is used 850 nv = self.num_versions() 851 sha1s = {} 852 texts = {} 853 inclusions = {} 854 for i in range(nv): 855 # For creating the ancestry, IntSet is much faster (3.7s vs 0.17s) 856 # The problem is that set membership is much more expensive 857 name = self._idx_to_name(i) 858 sha1s[name] = sha() 859 texts[name] = [] 860 new_inc = {name} 861 for p in self._parents[i]: 862 new_inc.update(inclusions[self._idx_to_name(p)]) 863 864 if new_inc != self.get_ancestry(name): 865 raise AssertionError( 866 'failed %s != %s' 867 % (new_inc, self.get_ancestry(name))) 868 inclusions[name] = new_inc 869 870 nlines = len(self._weave) 871 872 update_text = 'checking weave' 873 if self._weave_name: 874 short_name = os.path.basename(self._weave_name) 875 update_text = 'checking %s' % (short_name,) 876 update_text = update_text[:25] 877 878 for lineno, insert, deleteset, line in self._walk_internal(): 879 if progress_bar: 880 progress_bar.update(update_text, lineno, nlines) 881 882 for name, name_inclusions in inclusions.items(): 883 # The active inclusion must be an ancestor, 884 # and no ancestors must have deleted this line, 885 # because we don't support resurrection. 886 if ((insert in name_inclusions) and 887 not (deleteset & name_inclusions)): 888 sha1s[name].update(line) 889 890 for i in range(nv): 891 version = self._idx_to_name(i) 892 hd = sha1s[version].hexdigest().encode() 893 expected = self._sha1s[i] 894 if hd != expected: 895 raise WeaveInvalidChecksum( 896 "mismatched sha1 for version %s: " 897 "got %s, expected %s" 898 % (version, hd, expected)) 899 900 # TODO: check insertions are properly nested, that there are 901 # no lines outside of insertion blocks, that deletions are 902 # properly paired, etc. 903 904 def _imported_parents(self, other, other_idx): 905 """Return list of parents in self corresponding to indexes in other.""" 906 new_parents = [] 907 for parent_idx in other._parents[other_idx]: 908 parent_name = other._names[parent_idx] 909 if parent_name not in self._name_map: 910 # should not be possible 911 raise WeaveError("missing parent {%s} of {%s} in %r" 912 % (parent_name, other._name_map[other_idx], self)) 913 new_parents.append(self._name_map[parent_name]) 914 return new_parents 915 916 def _check_version_consistent(self, other, other_idx, name): 917 """Check if a version in consistent in this and other. 918 919 To be consistent it must have: 920 921 * the same text 922 * the same direct parents (by name, not index, and disregarding 923 order) 924 925 If present & correct return True; 926 if not present in self return False; 927 if inconsistent raise error.""" 928 this_idx = self._name_map.get(name, -1) 929 if this_idx != -1: 930 if self._sha1s[this_idx] != other._sha1s[other_idx]: 931 raise WeaveTextDiffers(name, self, other) 932 self_parents = self._parents[this_idx] 933 other_parents = other._parents[other_idx] 934 n1 = {self._names[i] for i in self_parents} 935 n2 = {other._names[i] for i in other_parents} 936 if not self._compatible_parents(n1, n2): 937 raise WeaveParentMismatch( 938 "inconsistent parents " 939 "for version {%s}: %s vs %s" % (name, n1, n2)) 940 else: 941 return True # ok! 942 else: 943 return False 944 945 def _reweave(self, other, pb, msg): 946 """Reweave self with other - internal helper for join(). 947 948 :param other: The other weave to merge 949 :param pb: An optional progress bar, indicating how far done we are 950 :param msg: An optional message for the progress 951 """ 952 new_weave = _reweave(self, other, pb=pb, msg=msg) 953 self._copy_weave_content(new_weave) 954 955 def _copy_weave_content(self, otherweave): 956 """adsorb the content from otherweave.""" 957 for attr in self.__slots__: 958 if attr != '_weave_name': 959 setattr(self, attr, copy(getattr(otherweave, attr))) 960 961 962class WeaveFile(Weave): 963 """A WeaveFile represents a Weave on disk and writes on change.""" 964 965 WEAVE_SUFFIX = '.weave' 966 967 def __init__(self, name, transport, filemode=None, create=False, 968 access_mode='w', get_scope=None): 969 """Create a WeaveFile. 970 971 :param create: If not True, only open an existing knit. 972 """ 973 super(WeaveFile, self).__init__(name, access_mode, get_scope=get_scope, 974 allow_reserved=False) 975 self._transport = transport 976 self._filemode = filemode 977 try: 978 f = self._transport.get(name + WeaveFile.WEAVE_SUFFIX) 979 _read_weave_v5(BytesIO(f.read()), self) 980 except errors.NoSuchFile: 981 if not create: 982 raise 983 # new file, save it 984 self._save() 985 986 def _add_lines(self, version_id, parents, lines, parent_texts, 987 left_matching_blocks, nostore_sha, random_id, 988 check_content): 989 """Add a version and save the weave.""" 990 self.check_not_reserved_id(version_id) 991 result = super(WeaveFile, self)._add_lines( 992 version_id, parents, lines, parent_texts, left_matching_blocks, 993 nostore_sha, random_id, check_content) 994 self._save() 995 return result 996 997 def copy_to(self, name, transport): 998 """See VersionedFile.copy_to().""" 999 # as we are all in memory always, just serialise to the new place. 1000 sio = BytesIO() 1001 write_weave_v5(self, sio) 1002 sio.seek(0) 1003 transport.put_file(name + WeaveFile.WEAVE_SUFFIX, sio, self._filemode) 1004 1005 def _save(self): 1006 """Save the weave.""" 1007 self._check_write_ok() 1008 sio = BytesIO() 1009 write_weave_v5(self, sio) 1010 sio.seek(0) 1011 bytes = sio.getvalue() 1012 path = self._weave_name + WeaveFile.WEAVE_SUFFIX 1013 try: 1014 self._transport.put_bytes(path, bytes, self._filemode) 1015 except errors.NoSuchFile: 1016 self._transport.mkdir(dirname(path)) 1017 self._transport.put_bytes(path, bytes, self._filemode) 1018 1019 @staticmethod 1020 def get_suffixes(): 1021 """See VersionedFile.get_suffixes().""" 1022 return [WeaveFile.WEAVE_SUFFIX] 1023 1024 def insert_record_stream(self, stream): 1025 super(WeaveFile, self).insert_record_stream(stream) 1026 self._save() 1027 1028 1029def _reweave(wa, wb, pb=None, msg=None): 1030 """Combine two weaves and return the result. 1031 1032 This works even if a revision R has different parents in 1033 wa and wb. In the resulting weave all the parents are given. 1034 1035 This is done by just building up a new weave, maintaining ordering 1036 of the versions in the two inputs. More efficient approaches 1037 might be possible but it should only be necessary to do 1038 this operation rarely, when a new previously ghost version is 1039 inserted. 1040 1041 :param pb: An optional progress bar, indicating how far done we are 1042 :param msg: An optional message for the progress 1043 """ 1044 wr = Weave() 1045 # first determine combined parents of all versions 1046 # map from version name -> all parent names 1047 combined_parents = _reweave_parent_graphs(wa, wb) 1048 mutter("combined parents: %r", combined_parents) 1049 order = tsort.topo_sort(combined_parents.items()) 1050 mutter("order to reweave: %r", order) 1051 1052 if pb and not msg: 1053 msg = 'reweave' 1054 1055 for idx, name in enumerate(order): 1056 if pb: 1057 pb.update(msg, idx, len(order)) 1058 if name in wa._name_map: 1059 lines = wa.get_lines(name) 1060 if name in wb._name_map: 1061 lines_b = wb.get_lines(name) 1062 if lines != lines_b: 1063 mutter('Weaves differ on content. rev_id {%s}', name) 1064 mutter('weaves: %s, %s', wa._weave_name, wb._weave_name) 1065 import difflib 1066 lines = list(difflib.unified_diff(lines, lines_b, 1067 wa._weave_name, wb._weave_name)) 1068 mutter('lines:\n%s', ''.join(lines)) 1069 raise WeaveTextDiffers(name, wa, wb) 1070 else: 1071 lines = wb.get_lines(name) 1072 wr._add(name, lines, [wr._lookup(i) for i in combined_parents[name]]) 1073 return wr 1074 1075 1076def _reweave_parent_graphs(wa, wb): 1077 """Return combined parent ancestry for two weaves. 1078 1079 Returned as a list of (version_name, set(parent_names))""" 1080 combined = {} 1081 for weave in [wa, wb]: 1082 for idx, name in enumerate(weave._names): 1083 p = combined.setdefault(name, set()) 1084 p.update(map(weave._idx_to_name, weave._parents[idx])) 1085 return combined 1086