# Copyright (C) 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

from io import BytesIO

from . import (
    errors,  # needed by _make_inventory_delta's `except errors.NoSuchId`
    osutils,
    progress,
    trace,
    )
from .i18n import gettext
from .ui import ui_factory


class RenameMap(object):
    """Determine a mapping of renames.

    Missing versioned files and unversioned files present in the working
    tree are matched up by hashing pairs of adjacent lines ("edges") of
    their content and scoring the overlap.
    """

    def __init__(self, tree):
        # The tree whose unversioned files will be matched against the
        # edge hashes accumulated via add_edge_hashes/add_file_edge_hashes.
        self.tree = tree
        # Maps a truncated line-pair hash -> set of tags (file-ids) whose
        # content contained that line pair.
        self.edge_hashes = {}

    @staticmethod
    def iter_edge_hashes(lines):
        """Iterate through the hashes of line pairs (which make up an edge).

        The hash is truncated using a modulus to avoid excessive memory
        consumption by the hitcounts dict.  A modulus of 10Mi means that the
        maximum number of keys is 10Mi.  (Keys are normally 32 bits, e.g.
        4 Gi)

        Note: the final slice is a 1-tuple of just the last line, so a
        sequence of N lines yields N hashes.
        """
        modulus = 1024 * 1024 * 10
        for n in range(len(lines)):
            # Python's % always returns a non-negative value here, so the
            # truncated hash lies in [0, modulus).
            yield hash(tuple(lines[n:n + 2])) % modulus

    def add_edge_hashes(self, lines, tag):
        """Update edge_hashes to include the given lines.

        :param lines: The lines to update the hashes for.
        :param tag: A tag uniquely associated with these lines (i.e. file-id)
        """
        for my_hash in self.iter_edge_hashes(lines):
            self.edge_hashes.setdefault(my_hash, set()).add(tag)

    def add_file_edge_hashes(self, tree, file_ids):
        """Update to reflect the hashes for files in the tree.

        :param tree: The tree containing the files.
        :param file_ids: A list of file_ids to perform the updates for.
        """
        desired_files = [(tree.id2path(f), f) for f in file_ids]
        with ui_factory.nested_progress_bar() as task:
            # iter_files_bytes yields (identifier, bytes_iterator); the
            # identifier is the file_id we supplied in desired_files above.
            for num, (file_id, contents) in enumerate(
                    tree.iter_files_bytes(desired_files)):
                task.update(gettext('Calculating hashes'), num, len(file_ids))
                s = BytesIO()
                s.writelines(contents)
                s.seek(0)
                self.add_edge_hashes(s.readlines(), file_id)

    def hitcounts(self, lines):
        """Count the number of hash hits for each tag, for the given lines.

        Hits are weighted according to the number of tags the hash is
        associated with; more tags means that the hash is less rare and should
        tend to be ignored.

        :param lines: The lines to calculate hashes of.
        :return: a dict of {tag: hitcount}
        """
        hits = {}
        for my_hash in self.iter_edge_hashes(lines):
            tags = self.edge_hashes.get(my_hash)
            if tags is None:
                continue
            taglen = len(tags)
            for tag in tags:
                # Weight each hit inversely to how common the hash is.
                hits[tag] = hits.get(tag, 0) + 1.0 / taglen
        return hits

    def get_all_hits(self, paths):
        """Find all the hit counts for the listed paths in the tree.

        :return: A list of tuples of count, path, file_id.
        """
        all_hits = []
        with ui_factory.nested_progress_bar() as task:
            for num, path in enumerate(paths):
                task.update(gettext('Determining hash hits'), num, len(paths))
                hits = self.hitcounts(self.tree.get_file_lines(path))
                all_hits.extend((v, path, k) for k, v in hits.items())
        return all_hits

    def file_match(self, paths):
        """Return a mapping from file_ids to the supplied paths."""
        return self._match_hits(self.get_all_hits(paths))

    @staticmethod
    def _match_hits(hit_list):
        """Using a hit list, determine a path-to-fileid map.

        The hit list is a list of (count, path, file_id), where count is a
        (possibly float) number, with higher numbers indicating stronger
        matches.

        Greedy: entries are taken in descending count order, and each path
        and each file_id is used at most once.
        """
        seen_file_ids = set()
        path_map = {}
        for count, path, file_id in sorted(hit_list, reverse=True):
            if path in path_map or file_id in seen_file_ids:
                continue
            path_map[path] = file_id
            seen_file_ids.add(file_id)
        return path_map

    def get_required_parents(self, matches):
        """Return a dict of all file parents that must be versioned.

        The keys are the required parents and the values are sets of their
        children.
        """
        required_parents = {}
        for path in matches:
            # Walk up from each matched path until we hit an already
            # versioned directory, recording each unversioned ancestor.
            while True:
                child = path
                path = osutils.dirname(path)
                if self.tree.is_versioned(path):
                    break
                required_parents.setdefault(path, []).append(child)
        require_ids = {}
        for parent, children in required_parents.items():
            # Translate each parent's children into the set of file-ids the
            # children were matched to (children without a match are skipped).
            child_file_ids = set()
            for child in children:
                file_id = matches.get(child)
                if file_id is not None:
                    child_file_ids.add(file_id)
            require_ids[parent] = child_file_ids
        return require_ids

    def match_parents(self, required_parents, missing_parents):
        """Map parent directories to file-ids.

        This is done by finding similarity between the file-ids of children of
        required parent directories and the file-ids of children of missing
        parent directories.
        """
        all_hits = []
        for file_id, file_id_children in missing_parents.items():
            for path, path_children in required_parents.items():
                # Score a candidate pairing by how many child file-ids the
                # two directories have in common.
                hits = len(path_children.intersection(file_id_children))
                if hits > 0:
                    all_hits.append((hits, path, file_id))
        return self._match_hits(all_hits)

    def _find_missing_files(self, basis):
        """Scan tree changes against *basis* for rename candidates.

        :return: a tuple of (missing_files, missing_parents,
            candidate_files) where missing_files is a set of file-ids that
            are versioned but gone from disk, missing_parents maps a missing
            directory's file-id to the file-ids of its missing children, and
            candidate_files is a set of unversioned, un-ignored file paths.
        """
        missing_files = set()
        missing_parents = {}
        candidate_files = set()
        with ui_factory.nested_progress_bar() as task:
            iterator = self.tree.iter_changes(basis, want_unversioned=True,
                                              pb=task)
            for change in iterator:
                if change.kind[1] is None and change.versioned[1]:
                    # Versioned in the target but absent on disk: a missing
                    # file (or a child of a missing directory).
                    if not self.tree.has_filename(
                            self.tree.id2path(change.parent_id[0])):
                        missing_parents.setdefault(
                            change.parent_id[0], set()).add(change.file_id)
                    if change.kind[0] == 'file':
                        missing_files.add(change.file_id)
                    else:
                        # other kinds are not handled
                        pass
                if change.versioned == (False, False):
                    # Unversioned on both sides: a rename candidate, unless
                    # ignored.
                    if self.tree.is_ignored(change.path[1]):
                        continue
                    if change.kind[1] == 'file':
                        candidate_files.add(change.path[1])
                    if change.kind[1] == 'directory':
                        for _dir, children in self.tree.walkdirs(
                                change.path[1]):
                            for child in children:
                                if child[2] == 'file':
                                    candidate_files.add(child[0])
        return missing_files, missing_parents, candidate_files

    @classmethod
    def guess_renames(klass, from_tree, to_tree, dry_run=False):
        """Guess which files to rename, and perform the rename.

        We assume that unversioned files and missing files indicate that
        versioned files have been renamed outside of Bazaar.

        :param from_tree: A tree to compare from
        :param to_tree: A write-locked working tree.
        :param dry_run: If True, report the planned renames but do not
            modify to_tree.
        """
        required_parents = {}
        with ui_factory.nested_progress_bar() as task:
            pp = progress.ProgressPhase('Guessing renames', 4, task)
            with from_tree.lock_read():
                rn = klass(to_tree)
                pp.next_phase()
                missing_files, missing_parents, candidate_files = (
                    rn._find_missing_files(from_tree))
                pp.next_phase()
                rn.add_file_edge_hashes(from_tree, missing_files)
                pp.next_phase()
                matches = rn.file_match(candidate_files)
                # Iteratively match parent directories until no new parent
                # matches are found; each round may enable further matches.
                parents_matches = matches
                while len(parents_matches) > 0:
                    required_parents = rn.get_required_parents(
                        parents_matches)
                    parents_matches = rn.match_parents(required_parents,
                                                       missing_parents)
                    matches.update(parents_matches)
                pp.next_phase()
                delta = rn._make_inventory_delta(matches)
                for old, new, file_id, entry in delta:
                    trace.note(gettext("{0} => {1}").format(old, new))
                if not dry_run:
                    to_tree.add(required_parents)
                    to_tree.apply_inventory_delta(delta)

    def _make_inventory_delta(self, matches):
        """Build an inventory delta applying the path->file-id *matches*.

        :return: a list of (old_path, new_path, file_id, new_entry) tuples
            suitable for WorkingTree.apply_inventory_delta.
        """
        delta = []
        # Invert the path -> file_id mapping so entries can be looked up by
        # file_id below.
        file_id_matches = {f: p for p, f in matches.items()}
        file_id_query = []
        for f in matches.values():
            try:
                file_id_query.append(self.tree.id2path(f))
            except errors.NoSuchId:
                # The matched file-id is not present in this tree; skip it.
                pass
        for old_path, entry in self.tree.iter_entries_by_dir(
                specific_files=file_id_query):
            new_path = file_id_matches[entry.file_id]
            parent_path, new_name = osutils.split(new_path)
            parent_id = matches.get(parent_path)
            if parent_id is None:
                parent_id = self.tree.path2id(parent_path)
                if parent_id is None:
                    # The parent directory is not versioned yet; add it
                    # (non-recursively) unless it is ignored.
                    added, ignored = self.tree.smart_add(
                        [parent_path], recurse=False)
                    if len(ignored) > 0 and ignored[0] == parent_path:
                        continue
                    else:
                        parent_id = self.tree.path2id(parent_path)
            if entry.name == new_name and entry.parent_id == parent_id:
                # Already in the right place under the right name.
                continue
            new_entry = entry.copy()
            new_entry.parent_id = parent_id
            new_entry.name = new_name
            delta.append((old_path, new_path, new_entry.file_id, new_entry))
        return delta