1# Copyright (C) 2007-2011 Canonical Ltd
2#
3# This program is free software; you can redistribute it and/or modify
4# it under the terms of the GNU General Public License as published by
5# the Free Software Foundation; either version 2 of the License, or
6# (at your option) any later version.
7#
8# This program is distributed in the hope that it will be useful,
9# but WITHOUT ANY WARRANTY; without even the implied warranty of
10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11# GNU General Public License for more details.
12#
13# You should have received a copy of the GNU General Public License
14# along with this program; if not, write to the Free Software
15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
17import errno
18from io import (
19    BytesIO,
20    )
21import os
22
23from .lazy_import import lazy_import
24
25lazy_import(globals(), """
26import gzip
27import itertools
28import patiencediff
29
30from breezy import (
31    bencode,
32    ui,
33    )
34""")
35from . import (
36    errors,
37    )
38from .i18n import gettext
39
40
def topo_iter_keys(vf, keys=None):
    """Iterate over *keys* (default: every key of vf) in topological order."""
    if keys is None:
        keys = vf.keys()
    parent_map = vf.get_parent_map(keys)
    return _topo_iter(parent_map, keys)
46
47
def topo_iter(vf, versions=None):
    """Iterate over *versions* (default: all of vf) in topological order."""
    if versions is None:
        versions = vf.versions()
    parent_map = vf.get_parent_map(versions)
    return _topo_iter(parent_map, versions)
53
54
55def _topo_iter(parents, versions):
56    seen = set()
57    descendants = {}
58
59    def pending_parents(version):
60        if parents[version] is None:
61            return []
62        return [v for v in parents[version] if v in versions and
63                v not in seen]
64    for version_id in versions:
65        if parents[version_id] is None:
66            # parentless
67            continue
68        for parent_id in parents[version_id]:
69            descendants.setdefault(parent_id, []).append(version_id)
70    cur = [v for v in versions if len(pending_parents(v)) == 0]
71    while len(cur) > 0:
72        next = []
73        for version_id in cur:
74            if version_id in seen:
75                continue
76            if len(pending_parents(version_id)) != 0:
77                continue
78            next.extend(descendants.get(version_id, []))
79            yield version_id
80            seen.add(version_id)
81        cur = next
82
83
class MultiParent(object):
    """A multi-parent diff: an ordered list of NewText and ParentText hunks.

    NewText hunks hold lines introduced by this text; ParentText hunks
    reference runs of lines in one of the parent texts.  Hunks are stored
    in child-text order.
    """

    __slots__ = ['hunks']

    def __init__(self, hunks=None):
        """Create a MultiParent, defaulting to an empty hunk list."""
        # Allocate the default list here to avoid a shared mutable default.
        if hunks is not None:
            self.hunks = hunks
        else:
            self.hunks = []

    def __repr__(self):
        return "MultiParent(%r)" % self.hunks

    def __eq__(self, other):
        if self.__class__ is not other.__class__:
            return False
        return (self.hunks == other.hunks)

    @staticmethod
    def from_lines(text, parents=(), left_blocks=None):
        """Produce a MultiParent from a list of lines and parents

        :param text: the list of lines for the new text
        :param parents: a sequence of line lists, one per parent text
        :param left_blocks: optional pre-computed matching blocks against
            parents[0] (as returned by PatienceSequenceMatcher), to avoid
            repeating that comparison
        """
        def compare(parent):
            # Matching blocks between one parent's lines and the new text.
            matcher = patiencediff.PatienceSequenceMatcher(None, parent,
                                                           text)
            return matcher.get_matching_blocks()
        if len(parents) > 0:
            if left_blocks is None:
                left_blocks = compare(parents[0])
            parent_comparisons = [left_blocks] + [compare(p) for p in
                                                  parents[1:]]
        else:
            parent_comparisons = []
        cur_line = 0
        new_text = NewText([])
        parent_text = []  # NOTE(review): unused
        block_iter = [iter(i) for i in parent_comparisons]
        diff = MultiParent([])

        def next_block(p):
            # Next matching block for parent p, or None when exhausted.
            try:
                return next(block_iter[p])
            except StopIteration:
                return None
        cur_block = [next_block(p) for p, i in enumerate(block_iter)]
        # Greedy scan over the child text: at each line, take the longest
        # parent match that covers it; lines no parent can supply are
        # accumulated into NewText hunks.
        while cur_line < len(text):
            best_match = None
            for p, block in enumerate(cur_block):
                if block is None:
                    continue
                i, j, n = block
                # Advance past blocks that end at or before the current line.
                while j + n <= cur_line:
                    block = cur_block[p] = next_block(p)
                    if block is None:
                        break
                    i, j, n = block
                if block is None:
                    continue
                if j > cur_line:
                    # This parent's next match starts later; skip for now.
                    continue
                # Trim the block so it begins exactly at cur_line.
                offset = cur_line - j
                i += offset
                j = cur_line
                n -= offset
                if n == 0:
                    continue
                if best_match is None or n > best_match.num_lines:
                    best_match = ParentText(p, i, j, n)
            if best_match is None:
                new_text.lines.append(text[cur_line])
                cur_line += 1
            else:
                # Flush pending new lines before emitting the parent hunk.
                if len(new_text.lines) > 0:
                    diff.hunks.append(new_text)
                    new_text = NewText([])
                diff.hunks.append(best_match)
                cur_line += best_match.num_lines
        if len(new_text.lines) > 0:
            diff.hunks.append(new_text)
        return diff

    def get_matching_blocks(self, parent, parent_len):
        """Yield (parent_pos, child_pos, num_lines) blocks for one parent.

        A zero-length sentinel block (parent_len, num_lines, 0) is yielded
        last, mirroring SequenceMatcher conventions.
        """
        for hunk in self.hunks:
            if not isinstance(hunk, ParentText) or hunk.parent != parent:
                continue
            yield (hunk.parent_pos, hunk.child_pos, hunk.num_lines)
        yield parent_len, self.num_lines(), 0

    def to_lines(self, parents=()):
        """Construct a fulltext from this diff and its parents"""
        mpvf = MultiMemoryVersionedFile()
        for num, parent in enumerate(parents):
            mpvf.add_version(BytesIO(parent).readlines(), num, [])
        mpvf.add_diff(self, 'a', list(range(len(parents))))
        return mpvf.get_line_list(['a'])[0]

    @classmethod
    def from_texts(cls, text, parents=()):
        """Produce a MultiParent from a text and a list of parent texts"""
        return cls.from_lines(BytesIO(text).readlines(),
                              [BytesIO(p).readlines() for p in parents])

    def to_patch(self):
        """Yield text lines for a patch"""
        for hunk in self.hunks:
            for line in hunk.to_patch():
                yield line

    def patch_len(self):
        """Return the length in bytes of the patch serialization."""
        return len(b''.join(self.to_patch()))

    def zipped_patch_len(self):
        """Return the length in bytes of the gzipped patch serialization."""
        return len(gzip_string(self.to_patch()))

    @classmethod
    def from_patch(cls, text):
        """Create a MultiParent from its string form"""
        return cls._from_patch(BytesIO(text))

    @staticmethod
    def _from_patch(lines):
        """Parse a patch, given as an iterable of lines, into a MultiParent.

        Private because it is essential that the input is split on newline
        bytes only: the bare newline terminator that NewText.to_patch emits
        is how a final hunk line lacking a trailing newline is told apart
        from one that has it.
        """
        line_iter = iter(lines)
        hunks = []
        cur_line = None
        while True:
            try:
                cur_line = next(line_iter)
            except StopIteration:
                break
            first_char = cur_line[0:1]
            if first_char == b'i':
                # Insert hunk: b'i <count>' followed by <count> lines.
                num_lines = int(cur_line.split(b' ')[1])
                hunk_lines = [next(line_iter) for _ in range(num_lines)]
                # Strip the terminator byte; restored below if the last
                # hunk line really ended with a newline.
                hunk_lines[-1] = hunk_lines[-1][:-1]
                hunks.append(NewText(hunk_lines))
            elif first_char == b'\n':
                # Separator line: the previous hunk's last line did end
                # with a newline after all, so put it back.
                hunks[-1].lines[-1] += b'\n'
            else:
                if not (first_char == b'c'):
                    raise AssertionError(first_char)
                # Copy hunk: b'c <parent> <parent_pos> <child_pos> <n>'.
                parent, parent_pos, child_pos, num_lines =\
                    [int(v) for v in cur_line.split(b' ')[1:]]
                hunks.append(ParentText(parent, parent_pos, child_pos,
                                        num_lines))
        return MultiParent(hunks)

    def range_iterator(self):
        """Iterate through the hunks, with range indicated

        kind is "new" or "parent".
        for "new", data is a list of lines.
        for "parent", data is (parent, parent_start, parent_end)
        :return: a generator of (start, end, kind, data)
        """
        start = 0
        for hunk in self.hunks:
            if isinstance(hunk, NewText):
                kind = 'new'
                end = start + len(hunk.lines)
                data = hunk.lines
            else:
                kind = 'parent'
                # A ParentText records its own child position; resync the
                # running offset to it.
                start = hunk.child_pos
                end = start + hunk.num_lines
                data = (hunk.parent, hunk.parent_pos, hunk.parent_pos +
                        hunk.num_lines)
            yield start, end, kind, data
            start = end

    def num_lines(self):
        """The number of lines in the output text"""
        # The last ParentText fixes an absolute child position; NewText
        # hunks after it only contribute their own line counts.
        extra_n = 0
        for hunk in reversed(self.hunks):
            if isinstance(hunk, ParentText):
                return hunk.child_pos + hunk.num_lines + extra_n
            extra_n += len(hunk.lines)
        return extra_n

    def is_snapshot(self):
        """Return True if this diff is effectively a fulltext."""
        if len(self.hunks) != 1:
            return False
        return (isinstance(self.hunks[0], NewText))
268
269
class NewText(object):
    """The contents of text that is introduced by this text"""

    __slots__ = ['lines']

    def __init__(self, lines):
        self.lines = lines

    def __repr__(self):
        return 'NewText(%r)' % self.lines

    def __eq__(self, other):
        if other.__class__ is not self.__class__:
            return False
        return self.lines == other.lines

    def to_patch(self):
        """Serialize as patch lines: an 'i <count>' header, the lines
        themselves, then a terminating blank line."""
        yield b'i %d\n' % len(self.lines)
        for content in self.lines:
            yield content
        yield b'\n'
291
292
class ParentText(object):
    """A reference to text present in a parent text

    Identifies ``num_lines`` lines starting at ``parent_pos`` in parent
    number ``parent``, which appear at ``child_pos`` in the child text.
    """

    __slots__ = ['parent', 'parent_pos', 'child_pos', 'num_lines']

    def __init__(self, parent, parent_pos, child_pos, num_lines):
        self.parent = parent          # index into the child's parent list
        self.parent_pos = parent_pos  # first matching line in the parent
        self.child_pos = child_pos    # first matching line in the child
        self.num_lines = num_lines    # length of the matching run

    def _as_dict(self):
        # bytes keys so the dict can drive bytes %-formatting in to_patch
        # (PEP 461: bytes %-mapping lookups use bytes keys).
        return {b'parent': self.parent,
                b'parent_pos': self.parent_pos,
                b'child_pos': self.child_pos,
                b'num_lines': self.num_lines}

    def __repr__(self):
        # BUG FIX: previously this %-formatted a *str* pattern
        # ('...%(parent)r...') with the bytes-keyed dict from _as_dict(),
        # which raises KeyError('parent') on Python 3 because str
        # formatting looks up str keys.  Format from the attributes.
        return 'ParentText(%r, %r, %r, %r)' % (
            self.parent, self.parent_pos, self.child_pos, self.num_lines)

    def __eq__(self, other):
        if self.__class__ is not other.__class__:
            return False
        return self._as_dict() == other._as_dict()

    def to_patch(self):
        """Yield the single patch line encoding this hunk."""
        yield (b'c %(parent)d %(parent_pos)d %(child_pos)d %(num_lines)d\n'
               % self._as_dict())
322
323
class BaseVersionedFile(object):
    """Pseudo-VersionedFile skeleton for MultiParent

    Texts are stored as MultiParent diffs against their parents; every so
    often (controlled by ``snapshot_interval``) a version is stored as a
    full snapshot so reconstruction chains stay short.  Subclasses supply
    the diff storage via ``add_diff``/``get_diff``.
    """

    def __init__(self, snapshot_interval=25, max_snapshots=None):
        # Cache of reconstructed fulltexts: version_id -> list of lines.
        self._lines = {}
        # version_id -> list of parent version_ids.
        self._parents = {}
        # version_ids stored as full snapshots rather than diffs.
        self._snapshots = set()
        self.snapshot_interval = snapshot_interval
        self.max_snapshots = max_snapshots

    def versions(self):
        """Iterate over all known version ids."""
        return iter(self._parents)

    def has_version(self, version):
        """Return True if ``version`` has been added to this file."""
        return version in self._parents

    def do_snapshot(self, version_id, parent_ids):
        """Determine whether to perform a snapshot for this version"""
        if self.snapshot_interval is None:
            return False
        if self.max_snapshots is not None and\
                len(self._snapshots) == self.max_snapshots:
            return False
        if len(parent_ids) == 0:
            # A root text has no parents to diff against.
            return True
        # Walk the ancestry one generation per iteration, pruning at
        # snapshots.  If every path terminates within snapshot_interval
        # generations, no new snapshot is needed; the for/else fires only
        # when the loop exhausts its iterations first.
        for ignored in range(self.snapshot_interval):
            if len(parent_ids) == 0:
                return False
            version_ids = parent_ids
            parent_ids = []
            for version_id in version_ids:
                if version_id not in self._snapshots:
                    parent_ids.extend(self._parents[version_id])
        else:
            return True

    def add_version(self, lines, version_id, parent_ids,
                    force_snapshot=None, single_parent=False):
        """Add a version to the versionedfile

        :param lines: The list of lines to add.  Must be split on '\n'.
        :param version_id: The version_id of the version to add
        :param parent_ids: The parent version_ids of this version.
        :param force_snapshot: If true, force this version to be added as a
            snapshot version.  If false, force this version to be added as a
            diff.  If none, determine this automatically.
        :param single_parent: If true, use a single parent, rather than
            multiple parents.
        """
        if force_snapshot is None:
            do_snapshot = self.do_snapshot(version_id, parent_ids)
        else:
            do_snapshot = force_snapshot
        if do_snapshot:
            self._snapshots.add(version_id)
            diff = MultiParent([NewText(lines)])
        else:
            if single_parent:
                parent_lines = self.get_line_list(parent_ids[:1])
            else:
                parent_lines = self.get_line_list(parent_ids)
            diff = MultiParent.from_lines(lines, parent_lines)
            if diff.is_snapshot():
                # The diff degenerated to a fulltext; record it as such.
                self._snapshots.add(version_id)
        self.add_diff(diff, version_id, parent_ids)
        self._lines[version_id] = lines

    def get_parents(self, version_id):
        """Return the parent version_ids of ``version_id``."""
        return self._parents[version_id]

    def make_snapshot(self, version_id):
        """Replace the stored diff of ``version_id`` with a fulltext."""
        snapdiff = MultiParent([NewText(self.cache_version(version_id))])
        self.add_diff(snapdiff, version_id, self._parents[version_id])
        self._snapshots.add(version_id)

    def import_versionedfile(self, vf, snapshots, no_cache=True,
                             single_parent=False, verify=False):
        """Import all revisions of a versionedfile

        :param vf: The versionedfile to import
        :param snapshots: If provided, the revisions to make snapshots of.
            Otherwise, this will be auto-determined
        :param no_cache: If true, clear the cache after every add.
        :param single_parent: If true, omit all but one parent text, (but
            retain parent metadata).
        :param verify: If true, reconstruct each imported text and check it
            matches the input; requires no_cache.
        """
        if not (no_cache or not verify):
            raise ValueError()
        revisions = set(vf.versions())
        total = len(revisions)
        with ui.ui_factory.nested_progress_bar() as pb:
            while len(revisions) > 0:
                added = set()
                for revision in revisions:
                    parents = vf.get_parents(revision)
                    # Defer revisions whose parents are not yet imported.
                    if [p for p in parents if p not in self._parents] != []:
                        continue
                    # Prefix each line with its origin revision from
                    # annotate, keeping per-line provenance in the text.
                    lines = [a + b' ' + l for a, l in
                             vf.annotate(revision)]
                    if snapshots is None:
                        force_snapshot = None
                    else:
                        force_snapshot = (revision in snapshots)
                    self.add_version(lines, revision, parents, force_snapshot,
                                     single_parent)
                    added.add(revision)
                    if no_cache:
                        self.clear_cache()
                        vf.clear_cache()
                        if verify:
                            # Round-trip check: rebuild the text we just
                            # added and compare against the input.
                            if not (lines == self.get_line_list([revision])[0]):
                                raise AssertionError()
                            self.clear_cache()
                    pb.update(gettext('Importing revisions'),
                              (total - len(revisions)) + len(added), total)
                revisions = [r for r in revisions if r not in added]

    def select_snapshots(self, vf):
        """Determine which versions to add as snapshots"""
        build_ancestors = {}
        snapshots = set()
        for version_id in topo_iter(vf):
            potential_build_ancestors = set(vf.get_parents(version_id))
            parents = vf.get_parents(version_id)
            if len(parents) == 0:
                # Root texts are always snapshots.
                snapshots.add(version_id)
                build_ancestors[version_id] = set()
            else:
                for parent in vf.get_parents(version_id):
                    potential_build_ancestors.update(build_ancestors[parent])
                # Snapshot when the build chain would grow too long.
                if len(potential_build_ancestors) > self.snapshot_interval:
                    snapshots.add(version_id)
                    build_ancestors[version_id] = set()
                else:
                    build_ancestors[version_id] = potential_build_ancestors
        return snapshots

    def select_by_size(self, num):
        """Select snapshots for minimum output size"""
        num -= len(self._snapshots)
        # NOTE(review): if num <= 0 at this point the slice below no longer
        # selects the tail of the ranking — confirm callers always request
        # more snapshots than currently exist.
        new_snapshots = self.get_size_ranking()[-num:]
        return [v for n, v in new_snapshots]

    def get_size_ranking(self):
        """Get versions ranked by size"""
        versions = []
        for version_id in self.versions():
            if version_id in self._snapshots:
                continue
            diff_len = self.get_diff(version_id).patch_len()
            snapshot_len = MultiParent([NewText(
                self.cache_version(version_id))]).patch_len()
            # Rank by (snapshot_len - diff_len); the ascending sort puts
            # the largest snapshot overhead last.
            versions.append((snapshot_len - diff_len, version_id))
        versions.sort()
        return versions

    def import_diffs(self, vf):
        """Import the diffs from another pseudo-versionedfile"""
        for version_id in vf.versions():
            self.add_diff(vf.get_diff(version_id), version_id,
                          vf._parents[version_id])

    def get_build_ranking(self):
        """Return revisions sorted by how much they reduce build complexity"""
        # could_avoid[v]: versions that would not need building if v were
        # a snapshot (v's build ancestry).
        # referenced_by[v]: versions whose builds could avoid v.
        could_avoid = {}
        referenced_by = {}
        for version_id in topo_iter(self):
            could_avoid[version_id] = set()
            if version_id not in self._snapshots:
                for parent_id in self._parents[version_id]:
                    could_avoid[version_id].update(could_avoid[parent_id])
                # BUG FIX: was ``update(self._parents)``, which added every
                # version id in the whole file (dict iteration yields keys)
                # instead of just this version's own parents.
                could_avoid[version_id].update(self._parents[version_id])
                could_avoid[version_id].discard(version_id)
            for avoid_id in could_avoid[version_id]:
                referenced_by.setdefault(avoid_id, set()).add(version_id)
        available_versions = list(self.versions())
        ranking = []
        while len(available_versions) > 0:
            available_versions.sort(key=lambda x:
                                    len(could_avoid[x]) *
                                    len(referenced_by.get(x, [])))
            selected = available_versions.pop()
            ranking.append(selected)
            # ``selected`` may never have been referenced (nothing can
            # avoid it); use .get for consistency with the sort key above
            # instead of a KeyError-prone direct index.
            for version_id in referenced_by.get(selected, ()):
                could_avoid[version_id].difference_update(
                    could_avoid[selected])
            for version_id in could_avoid[selected]:
                referenced_by[version_id].difference_update(
                    referenced_by.get(selected, ())
                )
        return ranking

    def clear_cache(self):
        """Drop all cached fulltexts."""
        self._lines.clear()

    def get_line_list(self, version_ids):
        """Return a list of line lists, one per requested version_id."""
        return [self.cache_version(v) for v in version_ids]

    def cache_version(self, version_id):
        """Return the lines of version_id, reconstructing if necessary.

        Reconstructed texts are cached in self._lines.
        """
        try:
            return self._lines[version_id]
        except KeyError:
            pass
        # NOTE(review): the fetched diff is unused here; the call appears
        # to serve as an existence check before reconstruction — confirm.
        diff = self.get_diff(version_id)
        lines = []
        reconstructor = _Reconstructor(self, self._lines, self._parents)
        reconstructor.reconstruct_version(lines, version_id)
        self._lines[version_id] = lines
        return lines
532
533
class MultiMemoryVersionedFile(BaseVersionedFile):
    """A BaseVersionedFile that keeps every diff in an in-memory dict."""

    def __init__(self, snapshot_interval=25, max_snapshots=None):
        BaseVersionedFile.__init__(self, snapshot_interval, max_snapshots)
        # version_id -> MultiParent diff
        self._diffs = {}

    def add_diff(self, diff, version_id, parent_ids):
        """Record *diff* and its parent list for *version_id*."""
        self._diffs[version_id] = diff
        self._parents[version_id] = parent_ids

    def get_diff(self, version_id):
        """Return the stored diff, raising RevisionNotPresent if absent."""
        try:
            return self._diffs[version_id]
        except KeyError:
            raise errors.RevisionNotPresent(version_id, self)

    def destroy(self):
        """Discard all stored diffs."""
        self._diffs = {}
553
554
class MultiVersionedFile(BaseVersionedFile):
    """Disk-backed pseudo-versionedfile

    Diffs are stored gzip-compressed, concatenated into a single
    ``<filename>.mpknit`` file; the index mapping version ids to byte
    offsets is bencoded into ``<filename>.mpidx``.
    """

    def __init__(self, filename, snapshot_interval=25, max_snapshots=None):
        BaseVersionedFile.__init__(self, snapshot_interval, max_snapshots)
        self._filename = filename
        # version_id -> (start_offset, byte_count) within the .mpknit file.
        self._diff_offset = {}

    def get_diff(self, version_id):
        """Read, decompress and parse the stored diff for version_id."""
        start, count = self._diff_offset[version_id]
        with open(self._filename + '.mpknit', 'rb') as infile:
            infile.seek(start)
            sio = BytesIO(infile.read(count))
        with gzip.GzipFile(None, mode='rb', fileobj=sio) as zip_file:
            # The first line records the version id (written by add_diff);
            # it is not part of the patch itself.
            file_version_id = zip_file.readline()
            content = zip_file.read()
            return MultiParent.from_patch(content)

    def add_diff(self, diff, version_id, parent_ids):
        """Append *diff* to the knit file and record its byte offsets."""
        with open(self._filename + '.mpknit', 'ab') as outfile:
            outfile.seek(0, 2)      # workaround for windows bug:
            # .tell() for files opened in 'ab' mode
            # before any write returns 0
            start = outfile.tell()
            with gzip.GzipFile(None, mode='ab', fileobj=outfile) as zipfile:
                zipfile.writelines(itertools.chain(
                    [b'version %s\n' % version_id], diff.to_patch()))
            end = outfile.tell()
        self._diff_offset[version_id] = (start, end - start)
        self._parents[version_id] = parent_ids

    def destroy(self):
        """Delete both on-disk files, ignoring 'file not found'."""
        try:
            os.unlink(self._filename + '.mpknit')
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
        try:
            os.unlink(self._filename + '.mpidx')
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

    def save(self):
        """Write the index (parents, snapshots, offsets) to disk."""
        # BUG FIX: the file handle was previously leaked (open().write()
        # with no close); use a context manager so it is closed promptly.
        with open(self._filename + '.mpidx', 'wb') as index_file:
            index_file.write(bencode.bencode(
                (self._parents, list(self._snapshots), self._diff_offset)))

    def load(self):
        """Load the index (parents, snapshots, offsets) from disk."""
        # BUG FIX: close the index file deterministically (was leaked).
        with open(self._filename + '.mpidx', 'rb') as index_file:
            self._parents, snapshots, self._diff_offset = bencode.bdecode(
                index_file.read())
        self._snapshots = set(snapshots)
605        self._snapshots = set(snapshots)
606
607
class _Reconstructor(object):
    """Build a text from the diffs, ancestry graph and cached lines"""

    def __init__(self, diffs, lines, parents):
        # diffs: object providing get_diff(version_id).
        self.diffs = diffs
        # lines: cache mapping version_id -> fulltext lines.
        self.lines = lines
        # parents: mapping version_id -> list of parent version_ids.
        self.parents = parents
        # Per-version resume state: (start, end, kind, data, iterator),
        # so sequential requests continue hunk iteration where they left off.
        self.cursor = {}

    def reconstruct(self, lines, parent_text, version_id):
        """Append the lines referred to by a ParentText to lines

        :param lines: the output list to extend
        :param parent_text: a ParentText hunk from version_id's diff
        :param version_id: the child version whose parent list resolves
            parent_text.parent to an actual version id
        """
        parent_id = self.parents[version_id][parent_text.parent]
        end = parent_text.parent_pos + parent_text.num_lines
        return self._reconstruct(lines, parent_id, parent_text.parent_pos,
                                 end)

    def _reconstruct(self, lines, req_version_id, req_start, req_end):
        """Append lines for the requested version_id range"""
        # stack of pending range requests
        if req_start == req_end:
            return
        pending_reqs = [(req_version_id, req_start, req_end)]
        while len(pending_reqs) > 0:
            req_version_id, req_start, req_end = pending_reqs.pop()
            # lazily allocate cursors for versions
            if req_version_id in self.lines:
                # Fulltext already cached: copy the slice directly.
                lines.extend(self.lines[req_version_id][req_start:req_end])
                continue
            try:
                start, end, kind, data, iterator = self.cursor[req_version_id]
            except KeyError:
                iterator = self.diffs.get_diff(req_version_id).range_iterator()
                start, end, kind, data = next(iterator)
            if start > req_start:
                # The saved cursor is already past the requested range;
                # restart hunk iteration from the beginning of the diff.
                iterator = self.diffs.get_diff(req_version_id).range_iterator()
                start, end, kind, data = next(iterator)

            # find the first hunk relevant to the request
            while end <= req_start:
                start, end, kind, data = next(iterator)
            self.cursor[req_version_id] = start, end, kind, data, iterator
            # if the hunk can't satisfy the whole request, split it in two,
            # and leave the second half for later.
            if req_end > end:
                pending_reqs.append((req_version_id, end, req_end))
                req_end = end
            if kind == 'new':
                # NewText hunk: the lines are stored in the diff itself.
                lines.extend(data[req_start - start: (req_end - start)])
            else:
                # If the hunk is a ParentText, rewrite it as a range request
                # for the parent, and make it the next pending request.
                parent, parent_start, parent_end = data
                new_version_id = self.parents[req_version_id][parent]
                # Translate the remaining child range into the parent's
                # coordinates (req_end - end <= 0 here, trimming the tail).
                new_start = parent_start + req_start - start
                new_end = parent_end + req_end - end
                pending_reqs.append((new_version_id, new_start, new_end))

    def reconstruct_version(self, lines, version_id):
        """Append the complete text of version_id to lines."""
        length = self.diffs.get_diff(version_id).num_lines()
        return self._reconstruct(lines, version_id, 0, length)
668
669
def gzip_string(lines):
    """Return the gzip-compressed concatenation of *lines* as bytes."""
    buffer = BytesIO()
    with gzip.GzipFile(None, mode='wb', fileobj=buffer) as compressed:
        compressed.writelines(lines)
    return buffer.getvalue()
675