1# Copyright (C) 2005, 2009 Canonical Ltd
2#
3# This program is free software; you can redistribute it and/or modify
4# it under the terms of the GNU General Public License as published by
5# the Free Software Foundation; either version 2 of the License, or
6# (at your option) any later version.
7#
8# This program is distributed in the hope that it will be useful,
9# but WITHOUT ANY WARRANTY; without even the implied warranty of
10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11# GNU General Public License for more details.
12#
13# You should have received a copy of the GNU General Public License
14# along with this program; if not, write to the Free Software
15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
17# Author: Martin Pool <mbp@canonical.com>
18
19"""Weave - storage of related text file versions"""
20
21# XXX: If we do weaves this way, will a merge still behave the same
22# way if it's done in a different order?  That's a pretty desirable
23# property.
24
25# TODO: Nothing here so far assumes the lines are really \n newlines,
26# rather than being split up in some other way.  We could accommodate
27# binaries, perhaps by naively splitting on \n or perhaps using
28# something like a rolling checksum.
29
30# TODO: End marker for each version so we can stop reading?
31
32# TODO: Check that no insertion occurs inside a deletion that was
33# active in the version of the insertion.
34
35# TODO: In addition to the SHA-1 check, perhaps have some code that
36# checks structural constraints of the weave: ie that insertions are
37# properly nested, that there is no text outside of an insertion, that
38# insertions or deletions are not repeated, etc.
39
40# TODO: Parallel-extract that passes back each line along with a
41# description of which revisions include it.  Nice for checking all
42# shas or calculating stats in parallel.
43
44# TODO: Using a single _extract routine and then processing the output
45# is probably inefficient.  It's simple enough that we can afford to
46# have slight specializations for different ways its used: annotate,
47# basis for add, get, etc.
48
49# TODO: Probably the API should work only in names to hide the integer
50# indexes from the user.
51
52# TODO: Is there any potential performance win by having an add()
53# variant that is passed a pre-cooked version of the single basis
54# version?
55
56# TODO: Reweave can possibly be made faster by remembering diffs
57# where the basis and destination are unchanged.
58
59# FIXME: Sometimes we will be given a parents list for a revision
60# that includes some redundant parents (i.e. already a parent of
61# something in the list.)  We should eliminate them.  This can
62# be done fairly efficiently because the sequence numbers constrain
63# the possible relationships.
64
65# FIXME: the conflict markers should be *7* characters
66
67from copy import copy
68from io import BytesIO
69import os
70import patiencediff
71
72from ..lazy_import import lazy_import
73lazy_import(globals(), """
74from breezy import tsort
75""")
76from .. import (
77    errors,
78    osutils,
79    )
80from ..errors import (
81    RevisionAlreadyPresent,
82    RevisionNotPresent,
83    )
84from ..osutils import dirname, sha, sha_strings, split_lines
85from ..revision import NULL_REVISION
86from ..trace import mutter
87from .versionedfile import (
88    AbsentContentFactory,
89    adapter_registry,
90    ContentFactory,
91    ExistingContent,
92    sort_groupcompress,
93    UnavailableRepresentation,
94    VersionedFile,
95    )
96from .weavefile import _read_weave_v5, write_weave_v5
97
98
99class WeaveError(errors.BzrError):
100
101    _fmt = "Error in processing weave: %(msg)s"
102
103    def __init__(self, msg=None):
104        errors.BzrError.__init__(self)
105        self.msg = msg
106
107
108class WeaveRevisionAlreadyPresent(WeaveError):
109
110    _fmt = "Revision {%(revision_id)s} already present in %(weave)s"
111
112    def __init__(self, revision_id, weave):
113
114        WeaveError.__init__(self)
115        self.revision_id = revision_id
116        self.weave = weave
117
118
119class WeaveRevisionNotPresent(WeaveError):
120
121    _fmt = "Revision {%(revision_id)s} not present in %(weave)s"
122
123    def __init__(self, revision_id, weave):
124        WeaveError.__init__(self)
125        self.revision_id = revision_id
126        self.weave = weave
127
128
129class WeaveFormatError(WeaveError):
130
131    _fmt = "Weave invariant violated: %(what)s"
132
133    def __init__(self, what):
134        WeaveError.__init__(self)
135        self.what = what
136
137
138class WeaveParentMismatch(WeaveError):
139
140    _fmt = "Parents are mismatched between two revisions. %(msg)s"
141
142
143class WeaveInvalidChecksum(WeaveError):
144
145    _fmt = "Text did not match its checksum: %(msg)s"
146
147
148class WeaveTextDiffers(WeaveError):
149
150    _fmt = ("Weaves differ on text content. Revision:"
151            " {%(revision_id)s}, %(weave_a)s, %(weave_b)s")
152
153    def __init__(self, revision_id, weave_a, weave_b):
154        WeaveError.__init__(self)
155        self.revision_id = revision_id
156        self.weave_a = weave_a
157        self.weave_b = weave_b
158
159
160class WeaveContentFactory(ContentFactory):
161    """Content factory for streaming from weaves.
162
163    :seealso ContentFactory:
164    """
165
166    def __init__(self, version, weave):
167        """Create a WeaveContentFactory for version from weave."""
168        ContentFactory.__init__(self)
169        self.sha1 = weave.get_sha1s([version])[version]
170        self.key = (version,)
171        parents = weave.get_parent_map([version])[version]
172        self.parents = tuple((parent,) for parent in parents)
173        self.storage_kind = 'fulltext'
174        self._weave = weave
175
176    def get_bytes_as(self, storage_kind):
177        if storage_kind == 'fulltext':
178            return self._weave.get_text(self.key[-1])
179        elif storage_kind in ('chunked', 'lines'):
180            return self._weave.get_lines(self.key[-1])
181        else:
182            raise UnavailableRepresentation(self.key, storage_kind, 'fulltext')
183
184    def iter_bytes_as(self, storage_kind):
185        if storage_kind in ('chunked', 'lines'):
186            return iter(self._weave.get_lines(self.key[-1]))
187        else:
188            raise UnavailableRepresentation(self.key, storage_kind, 'fulltext')
189
190
191class Weave(VersionedFile):
192    """weave - versioned text file storage.
193
194    A Weave manages versions of line-based text files, keeping track
195    of the originating version for each line.
196
197    To clients the "lines" of the file are represented as a list of strings.
198    These strings  will typically have terminal newline characters, but
199    this is not required.  In particular files commonly do not have a newline
200    at the end of the file.
201
202    Texts can be identified in either of two ways:
203
204    * a nonnegative index number.
205
206    * a version-id string.
207
208    Typically the index number will be valid only inside this weave and
209    the version-id is used to reference it in the larger world.
210
211    The weave is represented as a list mixing edit instructions and
212    literal text.  Each entry in _weave can be either a string (or
213    unicode), or a tuple.  If a string, it means that the given line
214    should be output in the currently active revisions.
215
216    If a tuple, it gives a processing instruction saying in which
217    revisions the enclosed lines are active.  The tuple has the form
218    (instruction, version).
219
220    The instruction can be '{' or '}' for an insertion block, and '['
221    and ']' for a deletion block respectively.  The version is the
222    integer version index.  There is no replace operator, only deletes
223    and inserts.  For '}', the end of an insertion, there is no
224    version parameter because it always closes the most recently
225    opened insertion.
226
227    Constraints/notes:
228
229    * A later version can delete lines that were introduced by any
230      number of ancestor versions; this implies that deletion
231      instructions can span insertion blocks without regard to the
232      insertion block's nesting.
233
234    * Similarly, deletions need not be properly nested with regard to
235      each other, because they might have been generated by
236      independent revisions.
237
238    * Insertions are always made by inserting a new bracketed block
239      into a single point in the previous weave.  This implies they
240      can nest but not overlap, and the nesting must always have later
241      insertions on the inside.
242
243    * It doesn't seem very useful to have an active insertion
244      inside an inactive insertion, but it might happen.
245
246    * Therefore, all instructions are always"considered"; that
247      is passed onto and off the stack.  An outer inactive block
248      doesn't disable an inner block.
249
250    * Lines are enabled if the most recent enclosing insertion is
251      active and none of the enclosing deletions are active.
252
253    * There is no point having a deletion directly inside its own
254      insertion; you might as well just not write it.  And there
255      should be no way to get an earlier version deleting a later
256      version.
257
258    _weave
259        Text of the weave; list of control instruction tuples and strings.
260
261    _parents
262        List of parents, indexed by version number.
263        It is only necessary to store the minimal set of parents for
264        each version; the parent's parents are implied.
265
266    _sha1s
267        List of hex SHA-1 of each version.
268
269    _names
270        List of symbolic names for each version.  Each should be unique.
271
272    _name_map
273        For each name, the version number.
274
275    _weave_name
276        Descriptive name of this weave; typically the filename if known.
277        Set by read_weave.
278    """
279
280    __slots__ = ['_weave', '_parents', '_sha1s', '_names', '_name_map',
281                 '_weave_name', '_matcher', '_allow_reserved']
282
283    def __init__(self, weave_name=None, access_mode='w', matcher=None,
284                 get_scope=None, allow_reserved=False):
285        """Create a weave.
286
287        :param get_scope: A callable that returns an opaque object to be used
288            for detecting when this weave goes out of scope (should stop
289            answering requests or allowing mutation).
290        """
291        super(Weave, self).__init__()
292        self._weave = []
293        self._parents = []
294        self._sha1s = []
295        self._names = []
296        self._name_map = {}
297        self._weave_name = weave_name
298        if matcher is None:
299            self._matcher = patiencediff.PatienceSequenceMatcher
300        else:
301            self._matcher = matcher
302        if get_scope is None:
303            def get_scope():
304                return None
305        self._get_scope = get_scope
306        self._scope = get_scope()
307        self._access_mode = access_mode
308        self._allow_reserved = allow_reserved
309
310    def __repr__(self):
311        return "Weave(%r)" % self._weave_name
312
313    def _check_write_ok(self):
314        """Is the versioned file marked as 'finished' ? Raise if it is."""
315        if self._get_scope() != self._scope:
316            raise errors.OutSideTransaction()
317        if self._access_mode != 'w':
318            raise errors.ReadOnlyObjectDirtiedError(self)
319
320    def copy(self):
321        """Return a deep copy of self.
322
323        The copy can be modified without affecting the original weave."""
324        other = Weave()
325        other._weave = self._weave[:]
326        other._parents = self._parents[:]
327        other._sha1s = self._sha1s[:]
328        other._names = self._names[:]
329        other._name_map = self._name_map.copy()
330        other._weave_name = self._weave_name
331        return other
332
333    def __eq__(self, other):
334        if not isinstance(other, Weave):
335            return False
336        return self._parents == other._parents \
337            and self._weave == other._weave \
338            and self._sha1s == other._sha1s
339
340    def __ne__(self, other):
341        return not self.__eq__(other)
342
343    def _idx_to_name(self, version):
344        return self._names[version]
345
346    def _lookup(self, name):
347        """Convert symbolic version name to index."""
348        if not self._allow_reserved:
349            self.check_not_reserved_id(name)
350        try:
351            return self._name_map[name]
352        except KeyError:
353            raise RevisionNotPresent(name, self._weave_name)
354
355    def versions(self):
356        """See VersionedFile.versions."""
357        return self._names[:]
358
359    def has_version(self, version_id):
360        """See VersionedFile.has_version."""
361        return (version_id in self._name_map)
362
363    __contains__ = has_version
364
365    def get_record_stream(self, versions, ordering, include_delta_closure):
366        """Get a stream of records for versions.
367
368        :param versions: The versions to include. Each version is a tuple
369            (version,).
370        :param ordering: Either 'unordered' or 'topological'. A topologically
371            sorted stream has compression parents strictly before their
372            children.
373        :param include_delta_closure: If True then the closure across any
374            compression parents will be included (in the opaque data).
375        :return: An iterator of ContentFactory objects, each of which is only
376            valid until the iterator is advanced.
377        """
378        versions = [version[-1] for version in versions]
379        if ordering == 'topological':
380            parents = self.get_parent_map(versions)
381            new_versions = tsort.topo_sort(parents)
382            new_versions.extend(set(versions).difference(set(parents)))
383            versions = new_versions
384        elif ordering == 'groupcompress':
385            parents = self.get_parent_map(versions)
386            new_versions = sort_groupcompress(parents)
387            new_versions.extend(set(versions).difference(set(parents)))
388            versions = new_versions
389        for version in versions:
390            if version in self:
391                yield WeaveContentFactory(version, self)
392            else:
393                yield AbsentContentFactory((version,))
394
395    def get_parent_map(self, version_ids):
396        """See VersionedFile.get_parent_map."""
397        result = {}
398        for version_id in version_ids:
399            if version_id == NULL_REVISION:
400                parents = ()
401            else:
402                try:
403                    parents = tuple(
404                        map(self._idx_to_name,
405                            self._parents[self._lookup(version_id)]))
406                except RevisionNotPresent:
407                    continue
408            result[version_id] = parents
409        return result
410
411    def get_parents_with_ghosts(self, version_id):
412        raise NotImplementedError(self.get_parents_with_ghosts)
413
414    def insert_record_stream(self, stream):
415        """Insert a record stream into this versioned file.
416
417        :param stream: A stream of records to insert.
418        :return: None
419        :seealso VersionedFile.get_record_stream:
420        """
421        adapters = {}
422        for record in stream:
423            # Raise an error when a record is missing.
424            if record.storage_kind == 'absent':
425                raise RevisionNotPresent([record.key[0]], self)
426            # adapt to non-tuple interface
427            parents = [parent[0] for parent in record.parents]
428            if record.storage_kind in ('fulltext', 'chunked', 'lines'):
429                self.add_lines(
430                    record.key[0], parents,
431                    record.get_bytes_as('lines'))
432            else:
433                adapter_key = record.storage_kind, 'lines'
434                try:
435                    adapter = adapters[adapter_key]
436                except KeyError:
437                    adapter_factory = adapter_registry.get(adapter_key)
438                    adapter = adapter_factory(self)
439                    adapters[adapter_key] = adapter
440                lines = adapter.get_bytes(record, 'lines')
441                try:
442                    self.add_lines(record.key[0], parents, lines)
443                except RevisionAlreadyPresent:
444                    pass
445
446    def _check_repeated_add(self, name, parents, text, sha1):
447        """Check that a duplicated add is OK.
448
449        If it is, return the (old) index; otherwise raise an exception.
450        """
451        idx = self._lookup(name)
452        if sorted(self._parents[idx]) != sorted(parents) \
453                or sha1 != self._sha1s[idx]:
454            raise RevisionAlreadyPresent(name, self._weave_name)
455        return idx
456
457    def _add_lines(self, version_id, parents, lines, parent_texts,
458                   left_matching_blocks, nostore_sha, random_id,
459                   check_content):
460        """See VersionedFile.add_lines."""
461        idx = self._add(version_id, lines, list(map(self._lookup, parents)),
462                        nostore_sha=nostore_sha)
463        return sha_strings(lines), sum(map(len, lines)), idx
464
465    def _add(self, version_id, lines, parents, sha1=None, nostore_sha=None):
466        """Add a single text on top of the weave.
467
468        Returns the index number of the newly added version.
469
470        version_id
471            Symbolic name for this version.
472            (Typically the revision-id of the revision that added it.)
473            If None, a name will be allocated based on the hash. (sha1:SHAHASH)
474
475        parents
476            List or set of direct parent version numbers.
477
478        lines
479            Sequence of lines to be added in the new version.
480
481        :param nostore_sha: See VersionedFile.add_lines.
482        """
483        self._check_lines_not_unicode(lines)
484        self._check_lines_are_lines(lines)
485        if not sha1:
486            sha1 = sha_strings(lines)
487        if sha1 == nostore_sha:
488            raise ExistingContent
489        if version_id is None:
490            version_id = b"sha1:" + sha1
491        if version_id in self._name_map:
492            return self._check_repeated_add(version_id, parents, lines, sha1)
493
494        self._check_versions(parents)
495        new_version = len(self._parents)
496
497        # if we abort after here the (in-memory) weave will be corrupt because
498        # only some fields are updated
499        # XXX: FIXME implement a succeed-or-fail of the rest of this routine.
500        #      - Robert Collins 20060226
501        self._parents.append(parents[:])
502        self._sha1s.append(sha1)
503        self._names.append(version_id)
504        self._name_map[version_id] = new_version
505
506        if not parents:
507            # special case; adding with no parents revision; can do
508            # this more quickly by just appending unconditionally.
509            # even more specially, if we're adding an empty text we
510            # need do nothing at all.
511            if lines:
512                self._weave.append((b'{', new_version))
513                self._weave.extend(lines)
514                self._weave.append((b'}', None))
515            return new_version
516
517        if len(parents) == 1:
518            pv = list(parents)[0]
519            if sha1 == self._sha1s[pv]:
520                # special case: same as the single parent
521                return new_version
522
523        ancestors = self._inclusions(parents)
524
525        l = self._weave
526
527        # basis a list of (origin, lineno, line)
528        basis_lineno = []
529        basis_lines = []
530        for origin, lineno, line in self._extract(ancestors):
531            basis_lineno.append(lineno)
532            basis_lines.append(line)
533
534        # another small special case: a merge, producing the same text
535        # as auto-merge
536        if lines == basis_lines:
537            return new_version
538
539        # add a sentinel, because we can also match against the final line
540        basis_lineno.append(len(self._weave))
541
542        # XXX: which line of the weave should we really consider
543        # matches the end of the file?  the current code says it's the
544        # last line of the weave?
545
546        # print 'basis_lines:', basis_lines
547        # print 'new_lines:  ', lines
548
549        s = self._matcher(None, basis_lines, lines)
550
551        # offset gives the number of lines that have been inserted
552        # into the weave up to the current point; if the original edit
553        # instruction says to change line A then we actually change (A+offset)
554        offset = 0
555
556        for tag, i1, i2, j1, j2 in s.get_opcodes():
557            # i1,i2 are given in offsets within basis_lines; we need to map
558            # them back to offsets within the entire weave print 'raw match',
559            # tag, i1, i2, j1, j2
560            if tag == 'equal':
561                continue
562            i1 = basis_lineno[i1]
563            i2 = basis_lineno[i2]
564            # the deletion and insertion are handled separately.
565            # first delete the region.
566            if i1 != i2:
567                self._weave.insert(i1 + offset, (b'[', new_version))
568                self._weave.insert(i2 + offset + 1, (b']', new_version))
569                offset += 2
570
571            if j1 != j2:
572                # there may have been a deletion spanning up to
573                # i2; we want to insert after this region to make sure
574                # we don't destroy ourselves
575                i = i2 + offset
576                self._weave[i:i] = ([(b'{', new_version)] +
577                                    lines[j1:j2] +
578                                    [(b'}', None)])
579                offset += 2 + (j2 - j1)
580        return new_version
581
582    def _inclusions(self, versions):
583        """Return set of all ancestors of given version(s)."""
584        if not len(versions):
585            return set()
586        i = set(versions)
587        for v in range(max(versions), 0, -1):
588            if v in i:
589                # include all its parents
590                i.update(self._parents[v])
591        return i
592
593    def get_ancestry(self, version_ids, topo_sorted=True):
594        """See VersionedFile.get_ancestry."""
595        if isinstance(version_ids, bytes):
596            version_ids = [version_ids]
597        i = self._inclusions([self._lookup(v) for v in version_ids])
598        return set(self._idx_to_name(v) for v in i)
599
600    def _check_versions(self, indexes):
601        """Check everything in the sequence of indexes is valid"""
602        for i in indexes:
603            try:
604                self._parents[i]
605            except IndexError:
606                raise IndexError("invalid version number %r" % i)
607
608    def _compatible_parents(self, my_parents, other_parents):
609        """During join check that other_parents are joinable with my_parents.
610
611        Joinable is defined as 'is a subset of' - supersets may require
612        regeneration of diffs, but subsets do not.
613        """
614        return len(other_parents.difference(my_parents)) == 0
615
616    def annotate(self, version_id):
617        """Return a list of (version-id, line) tuples for version_id.
618
619        The index indicates when the line originated in the weave."""
620        incls = [self._lookup(version_id)]
621        return [(self._idx_to_name(origin), text) for origin, lineno, text in
622                self._extract(incls)]
623
624    def iter_lines_added_or_present_in_versions(self, version_ids=None,
625                                                pb=None):
626        """See VersionedFile.iter_lines_added_or_present_in_versions()."""
627        if version_ids is None:
628            version_ids = self.versions()
629        version_ids = set(version_ids)
630        for lineno, inserted, deletes, line in self._walk_internal(
631                version_ids):
632            if inserted not in version_ids:
633                continue
634            if not line.endswith(b'\n'):
635                yield line + b'\n', inserted
636            else:
637                yield line, inserted
638
639    def _walk_internal(self, version_ids=None):
640        """Helper method for weave actions."""
641
642        istack = []
643        dset = set()
644
645        lineno = 0         # line of weave, 0-based
646
647        for l in self._weave:
648            if l.__class__ == tuple:
649                c, v = l
650                if c == b'{':
651                    istack.append(self._names[v])
652                elif c == b'}':
653                    istack.pop()
654                elif c == b'[':
655                    dset.add(self._names[v])
656                elif c == b']':
657                    dset.remove(self._names[v])
658                else:
659                    raise WeaveFormatError('unexpected instruction %r' % v)
660            else:
661                yield lineno, istack[-1], frozenset(dset), l
662            lineno += 1
663
664        if istack:
665            raise WeaveFormatError("unclosed insertion blocks "
666                                   "at end of weave: %s" % istack)
667        if dset:
668            raise WeaveFormatError(
669                "unclosed deletion blocks at end of weave: %s" % dset)
670
671    def plan_merge(self, ver_a, ver_b):
672        """Return pseudo-annotation indicating how the two versions merge.
673
674        This is computed between versions a and b and their common
675        base.
676
677        Weave lines present in none of them are skipped entirely.
678        """
679        inc_a = self.get_ancestry([ver_a])
680        inc_b = self.get_ancestry([ver_b])
681        inc_c = inc_a & inc_b
682
683        for lineno, insert, deleteset, line in self._walk_internal(
684                [ver_a, ver_b]):
685            if deleteset & inc_c:
686                # killed in parent; can't be in either a or b
687                # not relevant to our work
688                yield 'killed-base', line
689            elif insert in inc_c:
690                # was inserted in base
691                killed_a = bool(deleteset & inc_a)
692                killed_b = bool(deleteset & inc_b)
693                if killed_a and killed_b:
694                    yield 'killed-both', line
695                elif killed_a:
696                    yield 'killed-a', line
697                elif killed_b:
698                    yield 'killed-b', line
699                else:
700                    yield 'unchanged', line
701            elif insert in inc_a:
702                if deleteset & inc_a:
703                    yield 'ghost-a', line
704                else:
705                    # new in A; not in B
706                    yield 'new-a', line
707            elif insert in inc_b:
708                if deleteset & inc_b:
709                    yield 'ghost-b', line
710                else:
711                    yield 'new-b', line
712            else:
713                # not in either revision
714                yield 'irrelevant', line
715
716    def _extract(self, versions):
717        """Yield annotation of lines in included set.
718
719        Yields a sequence of tuples (origin, lineno, text), where
720        origin is the origin version, lineno the index in the weave,
721        and text the text of the line.
722
723        The set typically but not necessarily corresponds to a version.
724        """
725        for i in versions:
726            if not isinstance(i, int):
727                raise ValueError(i)
728
729        included = self._inclusions(versions)
730
731        istack = []
732        iset = set()
733        dset = set()
734
735        lineno = 0         # line of weave, 0-based
736
737        isactive = None
738
739        result = []
740
741        # wow.
742        #  449       0   4474.6820   2356.5590   breezy.weave:556(_extract)
743        #  +285282   0   1676.8040   1676.8040   +<isinstance>
744        # 1.6 seconds in 'isinstance'.
745        # changing the first isinstance:
746        #  449       0   2814.2660   1577.1760   breezy.weave:556(_extract)
747        #  +140414   0    762.8050    762.8050   +<isinstance>
748        # note that the inline time actually dropped (less function calls)
749        # and total processing time was halved.
750        # we're still spending ~1/4 of the method in isinstance though.
751        # so lets hard code the acceptable string classes we expect:
752        #  449       0   1202.9420    786.2930   breezy.weave:556(_extract)
753        # +71352     0    377.5560    377.5560   +<method 'append' of 'list'
754        #                                          objects>
755        # yay, down to ~1/4 the initial extract time, and our inline time
756        # has shrunk again, with isinstance no longer dominating.
757        # tweaking the stack inclusion test to use a set gives:
758        #  449       0   1122.8030    713.0080   breezy.weave:556(_extract)
759        # +71352     0    354.9980    354.9980   +<method 'append' of 'list'
760        #                                          objects>
761        # - a 5% win, or possibly just noise. However with large istacks that
762        # 'in' test could dominate, so I'm leaving this change in place - when
763        # its fast enough to consider profiling big datasets we can review.
764
765        for l in self._weave:
766            if l.__class__ == tuple:
767                c, v = l
768                isactive = None
769                if c == b'{':
770                    istack.append(v)
771                    iset.add(v)
772                elif c == b'}':
773                    iset.remove(istack.pop())
774                elif c == b'[':
775                    if v in included:
776                        dset.add(v)
777                elif c == b']':
778                    if v in included:
779                        dset.remove(v)
780                else:
781                    raise AssertionError()
782            else:
783                if isactive is None:
784                    isactive = (not dset) and istack and (
785                        istack[-1] in included)
786                if isactive:
787                    result.append((istack[-1], lineno, l))
788            lineno += 1
789        if istack:
790            raise WeaveFormatError("unclosed insertion blocks "
791                                   "at end of weave: %s" % istack)
792        if dset:
793            raise WeaveFormatError(
794                "unclosed deletion blocks at end of weave: %s" % dset)
795        return result
796
797    def _maybe_lookup(self, name_or_index):
798        """Convert possible symbolic name to index, or pass through indexes.
799
800        NOT FOR PUBLIC USE.
801        """
802        # GZ 2017-04-01: This used to check for long as well, but I don't think
803        # there are python implementations with sys.maxsize > sys.maxint
804        if isinstance(name_or_index, int):
805            return name_or_index
806        else:
807            return self._lookup(name_or_index)
808
809    def get_lines(self, version_id):
810        """See VersionedFile.get_lines()."""
811        int_index = self._maybe_lookup(version_id)
812        result = [line for (origin, lineno, line)
813                  in self._extract([int_index])]
814        expected_sha1 = self._sha1s[int_index]
815        measured_sha1 = sha_strings(result)
816        if measured_sha1 != expected_sha1:
817            raise WeaveInvalidChecksum(
818                'file %s, revision %s, expected: %s, measured %s'
819                % (self._weave_name, version_id,
820                   expected_sha1, measured_sha1))
821        return result
822
823    def get_sha1s(self, version_ids):
824        """See VersionedFile.get_sha1s()."""
825        result = {}
826        for v in version_ids:
827            result[v] = self._sha1s[self._lookup(v)]
828        return result
829
830    def num_versions(self):
831        """How many versions are in this weave?"""
832        return len(self._parents)
833
834    __len__ = num_versions
835
836    def check(self, progress_bar=None):
837        # TODO evaluate performance hit of using string sets in this routine.
838        # TODO: check no circular inclusions
839        # TODO: create a nested progress bar
840        for version in range(self.num_versions()):
841            inclusions = list(self._parents[version])
842            if inclusions:
843                inclusions.sort()
844                if inclusions[-1] >= version:
845                    raise WeaveFormatError(
846                        "invalid included version %d for index %d"
847                        % (inclusions[-1], version))
848
849        # try extracting all versions; parallel extraction is used
850        nv = self.num_versions()
851        sha1s = {}
852        texts = {}
853        inclusions = {}
854        for i in range(nv):
855            # For creating the ancestry, IntSet is much faster (3.7s vs 0.17s)
856            # The problem is that set membership is much more expensive
857            name = self._idx_to_name(i)
858            sha1s[name] = sha()
859            texts[name] = []
860            new_inc = {name}
861            for p in self._parents[i]:
862                new_inc.update(inclusions[self._idx_to_name(p)])
863
864            if new_inc != self.get_ancestry(name):
865                raise AssertionError(
866                    'failed %s != %s'
867                    % (new_inc, self.get_ancestry(name)))
868            inclusions[name] = new_inc
869
870        nlines = len(self._weave)
871
872        update_text = 'checking weave'
873        if self._weave_name:
874            short_name = os.path.basename(self._weave_name)
875            update_text = 'checking %s' % (short_name,)
876            update_text = update_text[:25]
877
878        for lineno, insert, deleteset, line in self._walk_internal():
879            if progress_bar:
880                progress_bar.update(update_text, lineno, nlines)
881
882            for name, name_inclusions in inclusions.items():
883                # The active inclusion must be an ancestor,
884                # and no ancestors must have deleted this line,
885                # because we don't support resurrection.
886                if ((insert in name_inclusions) and
887                        not (deleteset & name_inclusions)):
888                    sha1s[name].update(line)
889
890        for i in range(nv):
891            version = self._idx_to_name(i)
892            hd = sha1s[version].hexdigest().encode()
893            expected = self._sha1s[i]
894            if hd != expected:
895                raise WeaveInvalidChecksum(
896                    "mismatched sha1 for version %s: "
897                    "got %s, expected %s"
898                    % (version, hd, expected))
899
900        # TODO: check insertions are properly nested, that there are
901        # no lines outside of insertion blocks, that deletions are
902        # properly paired, etc.
903
904    def _imported_parents(self, other, other_idx):
905        """Return list of parents in self corresponding to indexes in other."""
906        new_parents = []
907        for parent_idx in other._parents[other_idx]:
908            parent_name = other._names[parent_idx]
909            if parent_name not in self._name_map:
910                # should not be possible
911                raise WeaveError("missing parent {%s} of {%s} in %r"
912                                 % (parent_name, other._name_map[other_idx], self))
913            new_parents.append(self._name_map[parent_name])
914        return new_parents
915
916    def _check_version_consistent(self, other, other_idx, name):
917        """Check if a version in consistent in this and other.
918
919        To be consistent it must have:
920
921         * the same text
922         * the same direct parents (by name, not index, and disregarding
923           order)
924
925        If present & correct return True;
926        if not present in self return False;
927        if inconsistent raise error."""
928        this_idx = self._name_map.get(name, -1)
929        if this_idx != -1:
930            if self._sha1s[this_idx] != other._sha1s[other_idx]:
931                raise WeaveTextDiffers(name, self, other)
932            self_parents = self._parents[this_idx]
933            other_parents = other._parents[other_idx]
934            n1 = {self._names[i] for i in self_parents}
935            n2 = {other._names[i] for i in other_parents}
936            if not self._compatible_parents(n1, n2):
937                raise WeaveParentMismatch(
938                    "inconsistent parents "
939                    "for version {%s}: %s vs %s" % (name, n1, n2))
940            else:
941                return True         # ok!
942        else:
943            return False
944
945    def _reweave(self, other, pb, msg):
946        """Reweave self with other - internal helper for join().
947
948        :param other: The other weave to merge
949        :param pb: An optional progress bar, indicating how far done we are
950        :param msg: An optional message for the progress
951        """
952        new_weave = _reweave(self, other, pb=pb, msg=msg)
953        self._copy_weave_content(new_weave)
954
955    def _copy_weave_content(self, otherweave):
956        """adsorb the content from otherweave."""
957        for attr in self.__slots__:
958            if attr != '_weave_name':
959                setattr(self, attr, copy(getattr(otherweave, attr)))
960
961
962class WeaveFile(Weave):
963    """A WeaveFile represents a Weave on disk and writes on change."""
964
965    WEAVE_SUFFIX = '.weave'
966
967    def __init__(self, name, transport, filemode=None, create=False,
968                 access_mode='w', get_scope=None):
969        """Create a WeaveFile.
970
971        :param create: If not True, only open an existing knit.
972        """
973        super(WeaveFile, self).__init__(name, access_mode, get_scope=get_scope,
974                                        allow_reserved=False)
975        self._transport = transport
976        self._filemode = filemode
977        try:
978            f = self._transport.get(name + WeaveFile.WEAVE_SUFFIX)
979            _read_weave_v5(BytesIO(f.read()), self)
980        except errors.NoSuchFile:
981            if not create:
982                raise
983            # new file, save it
984            self._save()
985
986    def _add_lines(self, version_id, parents, lines, parent_texts,
987                   left_matching_blocks, nostore_sha, random_id,
988                   check_content):
989        """Add a version and save the weave."""
990        self.check_not_reserved_id(version_id)
991        result = super(WeaveFile, self)._add_lines(
992            version_id, parents, lines, parent_texts, left_matching_blocks,
993            nostore_sha, random_id, check_content)
994        self._save()
995        return result
996
997    def copy_to(self, name, transport):
998        """See VersionedFile.copy_to()."""
999        # as we are all in memory always, just serialise to the new place.
1000        sio = BytesIO()
1001        write_weave_v5(self, sio)
1002        sio.seek(0)
1003        transport.put_file(name + WeaveFile.WEAVE_SUFFIX, sio, self._filemode)
1004
1005    def _save(self):
1006        """Save the weave."""
1007        self._check_write_ok()
1008        sio = BytesIO()
1009        write_weave_v5(self, sio)
1010        sio.seek(0)
1011        bytes = sio.getvalue()
1012        path = self._weave_name + WeaveFile.WEAVE_SUFFIX
1013        try:
1014            self._transport.put_bytes(path, bytes, self._filemode)
1015        except errors.NoSuchFile:
1016            self._transport.mkdir(dirname(path))
1017            self._transport.put_bytes(path, bytes, self._filemode)
1018
1019    @staticmethod
1020    def get_suffixes():
1021        """See VersionedFile.get_suffixes()."""
1022        return [WeaveFile.WEAVE_SUFFIX]
1023
1024    def insert_record_stream(self, stream):
1025        super(WeaveFile, self).insert_record_stream(stream)
1026        self._save()
1027
1028
1029def _reweave(wa, wb, pb=None, msg=None):
1030    """Combine two weaves and return the result.
1031
1032    This works even if a revision R has different parents in
1033    wa and wb.  In the resulting weave all the parents are given.
1034
1035    This is done by just building up a new weave, maintaining ordering
1036    of the versions in the two inputs.  More efficient approaches
1037    might be possible but it should only be necessary to do
1038    this operation rarely, when a new previously ghost version is
1039    inserted.
1040
1041    :param pb: An optional progress bar, indicating how far done we are
1042    :param msg: An optional message for the progress
1043    """
1044    wr = Weave()
1045    # first determine combined parents of all versions
1046    # map from version name -> all parent names
1047    combined_parents = _reweave_parent_graphs(wa, wb)
1048    mutter("combined parents: %r", combined_parents)
1049    order = tsort.topo_sort(combined_parents.items())
1050    mutter("order to reweave: %r", order)
1051
1052    if pb and not msg:
1053        msg = 'reweave'
1054
1055    for idx, name in enumerate(order):
1056        if pb:
1057            pb.update(msg, idx, len(order))
1058        if name in wa._name_map:
1059            lines = wa.get_lines(name)
1060            if name in wb._name_map:
1061                lines_b = wb.get_lines(name)
1062                if lines != lines_b:
1063                    mutter('Weaves differ on content. rev_id {%s}', name)
1064                    mutter('weaves: %s, %s', wa._weave_name, wb._weave_name)
1065                    import difflib
1066                    lines = list(difflib.unified_diff(lines, lines_b,
1067                                                      wa._weave_name, wb._weave_name))
1068                    mutter('lines:\n%s', ''.join(lines))
1069                    raise WeaveTextDiffers(name, wa, wb)
1070        else:
1071            lines = wb.get_lines(name)
1072        wr._add(name, lines, [wr._lookup(i) for i in combined_parents[name]])
1073    return wr
1074
1075
1076def _reweave_parent_graphs(wa, wb):
1077    """Return combined parent ancestry for two weaves.
1078
1079    Returned as a list of (version_name, set(parent_names))"""
1080    combined = {}
1081    for weave in [wa, wb]:
1082        for idx, name in enumerate(weave._names):
1083            p = combined.setdefault(name, set())
1084            p.update(map(weave._idx_to_name, weave._parents[idx]))
1085    return combined
1086