1# Copyright (C) 2009 Canonical Ltd
2#
3# This program is free software; you can redistribute it and/or modify
4# it under the terms of the GNU General Public License as published by
5# the Free Software Foundation; either version 2 of the License, or
6# (at your option) any later version.
7#
8# This program is distributed in the hope that it will be useful,
9# but WITHOUT ANY WARRANTY; without even the implied warranty of
10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11# GNU General Public License for more details.
12#
13# You should have received a copy of the GNU General Public License
14# along with this program; if not, write to the Free Software
15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
17from io import BytesIO
18
from . import (
    errors,
    osutils,
    progress,
    trace,
    )
24from .i18n import gettext
25from .ui import ui_factory
26
27
class RenameMap(object):
    """Determine a mapping of renames.

    Content similarity is estimated by hashing adjacent line pairs
    ("edges") of file text; files that share many rare edges are likely
    to be the same file under a new name.
    """

    def __init__(self, tree):
        # The tree whose unversioned files are candidate rename targets.
        self.tree = tree
        # Maps a truncated edge hash to the set of tags (normally file ids)
        # whose content contains that edge.
        self.edge_hashes = {}

    @staticmethod
    def iter_edge_hashes(lines):
        """Iterate through the hashes of line pairs (which make up an edge).

        The hash is truncated using a modulus to avoid excessive memory
        consumption by the hitscount dict.  A modulus of 10Mi means that the
        maximum number of keys is 10Mi.  (Keys are normally 32 bits, e.g.
        4 Gi)

        :param lines: A sequence of lines to hash pairwise.
        """
        modulus = 1024 * 1024 * 10
        for n in range(len(lines)):
            # The final iteration hashes a 1-tuple, because the slice
            # lines[n:n + 2] yields only the last line at the end.
            yield hash(tuple(lines[n:n + 2])) % modulus

    def add_edge_hashes(self, lines, tag):
        """Update edge_hashes to include the given lines.

        :param lines: The lines to update the hashes for.
        :param tag: A tag uniquely associated with these lines (i.e. file-id)
        """
        for my_hash in self.iter_edge_hashes(lines):
            self.edge_hashes.setdefault(my_hash, set()).add(tag)

    def add_file_edge_hashes(self, tree, file_ids):
        """Update to reflect the hashes for files in the tree.

        :param tree: The tree containing the files.
        :param file_ids: A list of file_ids to perform the updates for.
        """
        desired_files = [(tree.id2path(f), f) for f in file_ids]
        with ui_factory.nested_progress_bar() as task:
            for num, (file_id, contents) in enumerate(
                    tree.iter_files_bytes(desired_files)):
                task.update(gettext('Calculating hashes'), num, len(file_ids))
                # Buffer the chunked content so it can be re-split into
                # lines for edge hashing.
                s = BytesIO()
                s.writelines(contents)
                s.seek(0)
                self.add_edge_hashes(s.readlines(), file_id)

    def hitcounts(self, lines):
        """Count the number of hash hits for each tag, for the given lines.

        Hits are weighted according to the number of tags the hash is
        associated with; more tags means that the hash is less rare and should
        tend to be ignored.

        :param lines: The lines to calculate hashes of.
        :return: a dict of {tag: hitcount}
        """
        hits = {}
        for my_hash in self.iter_edge_hashes(lines):
            tags = self.edge_hashes.get(my_hash)
            if tags is None:
                continue
            taglen = len(tags)
            # An edge shared by many files carries less information about
            # any one of them, so each hit is down-weighted by the number
            # of tags it is associated with.
            for tag in tags:
                hits[tag] = hits.get(tag, 0) + 1.0 / taglen
        return hits

    def get_all_hits(self, paths):
        """Find all the hit counts for the listed paths in the tree.

        :param paths: The paths to look up in self.tree.
        :return: A list of tuples of count, path, file_id.
        """
        all_hits = []
        with ui_factory.nested_progress_bar() as task:
            for num, path in enumerate(paths):
                task.update(gettext('Determining hash hits'), num, len(paths))
                hits = self.hitcounts(self.tree.get_file_lines(path))
                all_hits.extend((v, path, k) for k, v in hits.items())
        return all_hits

    def file_match(self, paths):
        """Return a mapping from file_ids to the supplied paths."""
        return self._match_hits(self.get_all_hits(paths))

    @staticmethod
    def _match_hits(hit_list):
        """Using a hit list, determine a path-to-fileid map.

        The hit list is a list of (count, path, file_id), where count is a
        (possibly float) number, with higher numbers indicating stronger
        matches.

        :return: A dict of {path: file_id}; each path and each file_id
            appears at most once, greedily assigned strongest-match first.
        """
        seen_file_ids = set()
        path_map = {}
        # Strongest hits first; a path or file_id is consumed by its first
        # (best) match and skipped thereafter.
        for count, path, file_id in sorted(hit_list, reverse=True):
            if path in path_map or file_id in seen_file_ids:
                continue
            path_map[path] = file_id
            seen_file_ids.add(file_id)
        return path_map

    def get_required_parents(self, matches):
        """Return a dict of all file parents that must be versioned.

        The keys are the required parents and the values are sets of their
        children.

        :param matches: A dict of {path: file_id} as produced by file_match.
        """
        required_parents = {}
        for path in matches:
            # Walk up until we hit an already-versioned directory,
            # collecting each unversioned ancestor along the way.
            # NOTE(review): assumes the tree root is always versioned,
            # otherwise this loop would not terminate — confirm.
            while True:
                child = path
                path = osutils.dirname(path)
                if self.tree.is_versioned(path):
                    break
                required_parents.setdefault(path, []).append(child)
        require_ids = {}
        for parent, children in required_parents.items():
            child_file_ids = set()
            for child in children:
                file_id = matches.get(child)
                if file_id is not None:
                    child_file_ids.add(file_id)
            require_ids[parent] = child_file_ids
        return require_ids

    def match_parents(self, required_parents, missing_parents):
        """Map parent directories to file-ids.

        This is done by finding similarity between the file-ids of children of
        required parent directories and the file-ids of children of missing
        parent directories.

        :param required_parents: {path: set(child_file_ids)}
        :param missing_parents: {file_id: set(child_file_ids)}
        :return: A dict of {path: file_id} via _match_hits.
        """
        all_hits = []
        for file_id, file_id_children in missing_parents.items():
            for path, path_children in required_parents.items():
                # Score is the number of children the two directories share.
                hits = len(path_children.intersection(file_id_children))
                if hits > 0:
                    all_hits.append((hits, path, file_id))
        return self._match_hits(all_hits)

    def _find_missing_files(self, basis):
        """Scan for files gone missing and unversioned candidates.

        :param basis: The tree to compare self.tree against.
        :return: A tuple of (missing_files, missing_parents,
            candidate_files): file ids of versioned files whose content is
            gone, {parent_file_id: set(child_file_ids)} for parents whose
            directory is gone, and paths of unversioned candidate files.
        """
        missing_files = set()
        missing_parents = {}
        candidate_files = set()
        with ui_factory.nested_progress_bar() as task:
            iterator = self.tree.iter_changes(basis, want_unversioned=True,
                                              pb=task)
            for change in iterator:
                # Versioned but no longer present on disk.
                if change.kind[1] is None and change.versioned[1]:
                    if not self.tree.has_filename(
                            self.tree.id2path(change.parent_id[0])):
                        missing_parents.setdefault(
                            change.parent_id[0], set()).add(change.file_id)
                    if change.kind[0] == 'file':
                        missing_files.add(change.file_id)
                    else:
                        # other kinds are not handled
                        pass
                # Present on disk but not versioned in either tree.
                if change.versioned == (False, False):
                    if self.tree.is_ignored(change.path[1]):
                        continue
                    if change.kind[1] == 'file':
                        candidate_files.add(change.path[1])
                    if change.kind[1] == 'directory':
                        for _dir, children in self.tree.walkdirs(change.path[1]):
                            for child in children:
                                if child[2] == 'file':
                                    candidate_files.add(child[0])
        return missing_files, missing_parents, candidate_files

    @classmethod
    def guess_renames(klass, from_tree, to_tree, dry_run=False):
        """Guess which files to rename, and perform the rename.

        We assume that unversioned files and missing files indicate that
        versioned files have been renamed outside of Bazaar.

        :param from_tree: A tree to compare from
        :param to_tree: A write-locked working tree.
        :param dry_run: If True, report the renames but do not apply them.
        """
        required_parents = {}
        with ui_factory.nested_progress_bar() as task:
            pp = progress.ProgressPhase('Guessing renames', 4, task)
            with from_tree.lock_read():
                rn = klass(to_tree)
                pp.next_phase()
                missing_files, missing_parents, candidate_files = (
                    rn._find_missing_files(from_tree))
                pp.next_phase()
                rn.add_file_edge_hashes(from_tree, missing_files)
            pp.next_phase()
            matches = rn.file_match(candidate_files)
            # Iteratively match unversioned parent directories against
            # missing parents until no further matches are found.
            parents_matches = matches
            while len(parents_matches) > 0:
                required_parents = rn.get_required_parents(
                    parents_matches)
                parents_matches = rn.match_parents(required_parents,
                                                   missing_parents)
                matches.update(parents_matches)
            pp.next_phase()
            delta = rn._make_inventory_delta(matches)
            for old, new, file_id, entry in delta:
                trace.note(gettext("{0} => {1}").format(old, new))
            if not dry_run:
                to_tree.add(required_parents)
                to_tree.apply_inventory_delta(delta)

    def _make_inventory_delta(self, matches):
        """Build an inventory delta applying the given path matches.

        :param matches: A dict of {path: file_id} to apply.
        :return: A list of (old_path, new_path, file_id, entry) tuples.
        """
        delta = []
        file_id_matches = dict((f, p) for p, f in matches.items())
        file_id_query = []
        for f in matches.values():
            try:
                file_id_query.append(self.tree.id2path(f))
            except errors.NoSuchId:
                # File ids not present in this tree are skipped.
                pass
        for old_path, entry in self.tree.iter_entries_by_dir(
                specific_files=file_id_query):
            new_path = file_id_matches[entry.file_id]
            parent_path, new_name = osutils.split(new_path)
            parent_id = matches.get(parent_path)
            if parent_id is None:
                parent_id = self.tree.path2id(parent_path)
                if parent_id is None:
                    # Version the parent on demand; if it turns out to be
                    # ignored, skip this rename entirely.
                    added, ignored = self.tree.smart_add(
                        [parent_path], recurse=False)
                    if len(ignored) > 0 and ignored[0] == parent_path:
                        continue
                    else:
                        parent_id = self.tree.path2id(parent_path)
            if entry.name == new_name and entry.parent_id == parent_id:
                # Already in place; nothing to do.
                continue
            new_entry = entry.copy()
            new_entry.parent_id = parent_id
            new_entry.name = new_name
            delta.append((old_path, new_path, new_entry.file_id, new_entry))
        return delta
264