# Copyright (C) 2006-2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Reconcilers are able to fix some potential data errors in a branch."""

__all__ = [
    'BranchReconciler',
    'KnitReconciler',
    'PackReconciler',
    'VersionedFileRepoReconciler',
    ]

from .. import (
    errors,
    revision as _mod_revision,
    ui,
    )
from ..reconcile import ReconcileResult
from ..i18n import gettext
from ..trace import mutter
from ..tsort import topo_sort
from .versionedfile import AdapterFactory, ChunkedContentFactory


class VersionedFileRepoReconciler(object):
    """Reconciler that reconciles a repository.

    The goal of repository reconciliation is to make any derived data
    consistent with the core data committed by a user. This can involve
    reindexing, or removing unreferenced data if that can interfere with
    queries in a given repository.

    Currently this consists of an inventory reweave with revision cross-checks.
    """

    def __init__(self, repo, other=None, thorough=False):
        """Construct a RepoReconciler.

        :param repo: The repository to reconcile.
        :param other: Accepted for interface compatibility; not stored or
            used by this class.
        :param thorough: perform a thorough check which may take longer but
                         will correct non-data loss issues such as incorrect
                         cached data.
        """
        # Issue counters, filled in by reconcile() and copied onto the
        # returned ReconcileResult.
        self.garbage_inventories = 0
        self.inconsistent_parents = 0
        self.aborted = False
        self.repo = repo
        self.thorough = thorough

    def reconcile(self):
        """Perform reconciliation.

        After reconciliation the following attributes document found issues:

        * `inconsistent_parents`: The number of revisions in the repository
          whose ancestry was being reported incorrectly.
        * `garbage_inventories`: The number of inventory objects without
          revisions that were garbage collected.

        :return: A ReconcileResult with ``aborted``,
            ``garbage_inventories`` and ``inconsistent_parents`` set from
            the corresponding attributes of this reconciler.
        """
        # Hold the write lock and a progress bar for the whole run; the
        # subclass hook _reconcile_steps() does the actual work.
        with self.repo.lock_write(), \
                ui.ui_factory.nested_progress_bar() as self.pb:
            self._reconcile_steps()
            ret = ReconcileResult()
            ret.aborted = self.aborted
            ret.garbage_inventories = self.garbage_inventories
            ret.inconsistent_parents = self.inconsistent_parents
            return ret

    def _reconcile_steps(self):
        """Perform the steps to reconcile this repository.

        Subclasses override this to customise the reconcile process.
        """
        self._reweave_inventory()

    def _reweave_inventory(self):
        """Regenerate the inventory weave for the repository from scratch.

        This is a smart function: it will only do the reweave if doing it
        will correct data issues. The self.thorough flag controls whether
        only data-loss causing issues (!self.thorough) or all issues
        (self.thorough) are treated as requiring the reweave.
        """
        # NOTE(review): `transaction` is never read below — presumably kept
        # for a side effect of get_transaction(); confirm before removing.
        transaction = self.repo.get_transaction()
        self.pb.update(gettext('Reading inventory data'))
        self.inventory = self.repo.inventories
        self.revisions = self.repo.revisions
        # the total set of revisions to process: the last element of each
        # revision key is the revision id.
        self.pending = {key[-1] for key in self.revisions.keys()}

        # mapping from revision_id to parents (ghosts filtered out by
        # _graph_revision below)
        self._rev_graph = {}
        # errors that we detect
        self.inconsistent_parents = 0
        # we need the revision id of each revision and its available parents list
        self._setup_steps(len(self.pending))
        for rev_id in self.pending:
            # put a revision into the graph.
            self._graph_revision(rev_id)
        self._check_garbage_inventories()
        # if there are no inconsistent_parents and
        # (no garbage inventories or we are not doing a thorough check)
        # then the reweave would change nothing: report ok and stop.
        if (not self.inconsistent_parents
                and (not self.garbage_inventories or not self.thorough)):
            ui.ui_factory.note(gettext('Inventory ok.'))
            return
        # Back up the existing inventory before writing a replacement so a
        # failure part-way through is recoverable.
        self.pb.update(gettext('Backing up inventory'), 0, 0)
        self.repo._backup_inventory()
        ui.ui_factory.note(gettext('Backup inventory created.'))
        new_inventories = self.repo._temp_inventories()

        # we have topological order of revisions and non ghost parents ready.
        self._setup_steps(len(self._rev_graph))
        revision_keys = [(rev_id,) for rev_id in topo_sort(self._rev_graph)]
        # Stream the old inventory texts through _change_inv_parents, which
        # rewrites each record's parent list to the ghost-filtered graph.
        stream = self._change_inv_parents(
            self.inventory.get_record_stream(revision_keys, 'unordered', True),
            self._new_inv_parents,
            set(revision_keys))
        new_inventories.insert_record_stream(stream)
        # if this worked, the set of new_inventories.keys should equal
        # self.pending
        if not (set(new_inventories.keys())
                == {(revid,) for revid in self.pending}):
            raise AssertionError()
        self.pb.update(gettext('Writing weave'))
        # Swap the rebuilt inventory into place and drop our stale handle.
        self.repo._activate_new_inventory()
        self.inventory = None
        ui.ui_factory.note(gettext('Inventory regenerated.'))

    def _new_inv_parents(self, revision_key):
        """Lookup ghost-filtered parents for revision_key.

        :param revision_key: A ``(revision_id,)`` key tuple.
        :return: A tuple of ``(parent_revision_id,)`` keys, with ghosts
            already excluded by _graph_revision.
        """
        # Use the filtered ghostless parents list:
        return tuple([(revid,) for revid in self._rev_graph[revision_key[-1]]])

    def _change_inv_parents(self, stream, get_parents, all_revision_keys):
        """Adapt a record stream to reconcile the parents.

        :param stream: A record stream of inventory texts.
        :param get_parents: Callable mapping a record key to the wanted
            parent key tuple.
        :param all_revision_keys: Set of every key present in the stream.
        :return: An iterator of records carrying the corrected parents.
        """
        for record in stream:
            wanted_parents = get_parents(record.key)
            if wanted_parents and wanted_parents[0] not in all_revision_keys:
                # The check for the left most parent only handles knit
                # compressors, but this code only applies to knit and weave
                # repositories anyway.
                # Re-emit the record as full chunked content, since its
                # (delta-basis) leftmost parent is not in this stream.
                chunks = record.get_bytes_as('chunked')
                yield ChunkedContentFactory(record.key, wanted_parents, record.sha1, chunks)
            else:
                # Cheap path: wrap the original record with new parents.
                adapted_record = AdapterFactory(
                    record.key, wanted_parents, record)
                yield adapted_record
            self._reweave_step('adding inventories')

    def _setup_steps(self, new_total):
        """Setup the markers we need to control the progress bar.

        :param new_total: The total number of steps for the next phase.
        """
        self.total = new_total
        self.count = 0

    def _graph_revision(self, rev_id):
        """Load a revision into the revision graph.

        Reads the revision's parent ids and records only those parents that
        are fully present (ghosts are logged and dropped).
        """
        # pick a random revision
        # analyse revision id rev_id and put it in the stack.
        self._reweave_step('loading revisions')
        rev = self.repo.get_revision_reconcile(rev_id)
        parents = []
        for parent in rev.parent_ids:
            if self._parent_is_available(parent):
                parents.append(parent)
            else:
                mutter('found ghost %s', parent)
        self._rev_graph[rev_id] = parents

    def _check_garbage_inventories(self):
        """Check for garbage inventories which we cannot trust.

        We can't trust them because their pre-requisite file data may not
        be present - all we know is that their revision was not installed.
        """
        # Garbage detection is a non-data-loss issue, so only do the work
        # in thorough mode.
        if not self.thorough:
            return
        inventories = set(self.inventory.keys())
        revisions = set(self.revisions.keys())
        # Inventories with no matching revision are garbage.
        garbage = inventories.difference(revisions)
        self.garbage_inventories = len(garbage)
        for revision_key in garbage:
            mutter('Garbage inventory {%s} found.', revision_key[-1])

    def _parent_is_available(self, parent):
        """True if parent is a fully available revision.

        A fully available revision has an inventory and a revision object in
        the repository.
        """
        # Already graphed revisions were proven available earlier.
        if parent in self._rev_graph:
            return True
        # get_parent_map returns an entry per present key, so length 1
        # means the inventory exists.
        inv_present = (1 == len(self.inventory.get_parent_map([(parent,)])))
        return (inv_present and self.repo.has_revision(parent))

    def _reweave_step(self, message):
        """Mark a single step of regeneration complete and advance the bar."""
        self.pb.update(message, self.count, self.total)
        self.count += 1


class KnitReconciler(VersionedFileRepoReconciler):
    """Reconciler that reconciles a knit format repository.

    This will detect garbage inventories and remove them in thorough mode.
    """

    def _reconcile_steps(self):
        """Perform the steps to reconcile this repository."""
        if self.thorough:
            try:
                self._load_indexes()
            except errors.BzrCheckError:
                # Indexes are inconsistent beyond repair here; abort rather
                # than reconcile on bad data.
                self.aborted = True
                return
            # knits never suffer this
            self._gc_inventory()
            self._fix_text_parents()

    def _load_indexes(self):
        """Load indexes for the reconciliation.

        :raises errors.BzrCheckError: (from the inconsistent-parents check)
            if the revision index is not usable.
        """
        self.transaction = self.repo.get_transaction()
        self.pb.update(gettext('Reading indexes'), 0, 2)
        self.inventory = self.repo.inventories
        self.pb.update(gettext('Reading indexes'), 1, 2)
        self.repo._check_for_inconsistent_revision_parents()
        self.revisions = self.repo.revisions
        self.pb.update(gettext('Reading indexes'), 2, 2)

    def _gc_inventory(self):
        """Remove inventories that are not referenced from the revision store."""
        # NOTE(review): the two progress totals below disagree (0 of 1,
        # then 1 of 3) — cosmetic only, but worth confirming the intent.
        self.pb.update(gettext('Checking unused inventories'), 0, 1)
        self._check_garbage_inventories()
        self.pb.update(gettext('Checking unused inventories'), 1, 3)
        if not self.garbage_inventories:
            ui.ui_factory.note(gettext('Inventory ok.'))
            return
        # Back up before rewriting, as in the base class reweave.
        self.pb.update(gettext('Backing up inventory'), 0, 0)
        self.repo._backup_inventory()
        ui.ui_factory.note(gettext('Backup Inventory created'))
        # asking for '' should never return a non-empty weave
        new_inventories = self.repo._temp_inventories()
        # we have topological order of revisions and non ghost parents ready.
        graph = self.revisions.get_parent_map(self.revisions.keys())
        revision_keys = topo_sort(graph)
        # NOTE(review): `revision_ids` is computed but never used below.
        revision_ids = [key[-1] for key in revision_keys]
        self._setup_steps(len(revision_keys))
        # Only revisions present in the revision store are streamed across,
        # so unreferenced (garbage) inventories are dropped.
        stream = self._change_inv_parents(
            self.inventory.get_record_stream(revision_keys, 'unordered', True),
            graph.__getitem__,
            set(revision_keys))
        new_inventories.insert_record_stream(stream)
        # if this worked, the set of new_inventory_vf.names should equal
        # the revision ids list
        if not (set(new_inventories.keys()) == set(revision_keys)):
            raise AssertionError()
        self.pb.update(gettext('Writing weave'))
        self.repo._activate_new_inventory()
        self.inventory = None
        ui.ui_factory.note(gettext('Inventory regenerated.'))

    def _fix_text_parents(self):
        """Fix bad versionedfile parent entries.

        It is possible for the parents entry in a versionedfile entry to be
        inconsistent with the values in the revision and inventory.

        This method finds entries with such inconsistencies, corrects their
        parent lists, and replaces the versionedfile with a corrected version.
        """
        # NOTE(review): `transaction` is never read below — presumably kept
        # for a side effect of get_transaction(); confirm before removing.
        transaction = self.repo.get_transaction()
        versions = [key[-1] for key in self.revisions.keys()]
        mutter('Prepopulating revision text cache with %d revisions',
               len(versions))
        vf_checker = self.repo._get_versioned_file_checker()
        bad_parents, unused_versions = vf_checker.check_file_version_parents(
            self.repo.texts, self.pb)
        text_index = vf_checker.text_index
        # per_id_bad_parents maps file_id -> {rev_id: (stored_parents,
        # correct_parents)} for every file needing a rewrite.
        per_id_bad_parents = {}
        for key in unused_versions:
            # Ensure that every file with unused versions gets rewritten.
            # NB: This is really not needed, reconcile != pack.
            per_id_bad_parents[key[0]] = {}
        # Generate per-knit/weave data.
        for key, details in bad_parents.items():
            file_id = key[0]
            rev_id = key[1]
            # details[0] holds the parents currently stored in the knit,
            # details[1] the parents the inventory/revision data implies.
            knit_parents = tuple([parent[-1] for parent in details[0]])
            correct_parents = tuple([parent[-1] for parent in details[1]])
            file_details = per_id_bad_parents.setdefault(file_id, {})
            file_details[rev_id] = (knit_parents, correct_parents)
        # Index all known text versions by file id.
        file_id_versions = {}
        for text_key in text_index:
            versions_list = file_id_versions.setdefault(text_key[0], [])
            versions_list.append(text_key[1])
        # Do the reconcile of individual weaves.
        for num, file_id in enumerate(per_id_bad_parents):
            self.pb.update(gettext('Fixing text parents'), num,
                           len(per_id_bad_parents))
            versions_with_bad_parents = per_id_bad_parents[file_id]
            id_unused_versions = set(key[-1] for key in unused_versions
                                     if key[0] == file_id)
            if file_id in file_id_versions:
                file_versions = file_id_versions[file_id]
            else:
                # This id was present in the disk store but is not referenced
                # by any revision at all.
                file_versions = []
            self._fix_text_parent(file_id, versions_with_bad_parents,
                                  id_unused_versions, file_versions)

    def _fix_text_parent(self, file_id, versions_with_bad_parents,
                         unused_versions, all_versions):
        """Fix bad versionedfile entries in a single versioned file.

        :param file_id: The file whose texts are rewritten.
        :param versions_with_bad_parents: Dict of rev_id ->
            (stored_parents, correct_parents) for versions needing fixing.
        :param unused_versions: Set of version ids to drop entirely.
        :param all_versions: Every known version id for this file.
        """
        mutter('fixing text parent: %r (%d versions)', file_id,
               len(versions_with_bad_parents))
        mutter('(%d are unused)', len(unused_versions))
        # Build the corrected content under a temporary file id, then move
        # it over the original once fully written (see the end of the
        # method) so a crash mid-way never leaves a half-written file.
        new_file_id = b'temp:%s' % file_id
        new_parents = {}
        needed_keys = set()
        for version in all_versions:
            if version in unused_versions:
                # Unused versions are simply not copied across.
                continue
            elif version in versions_with_bad_parents:
                # Take the corrected parents list.
                parents = versions_with_bad_parents[version][1]
            else:
                # Version is fine; keep its existing parents.
                pmap = self.repo.texts.get_parent_map([(file_id, version)])
                parents = [key[-1] for key in pmap[(file_id, version)]]
            new_parents[(new_file_id, version)] = [
                (new_file_id, parent) for parent in parents]
            needed_keys.add((file_id, version))

        def fix_parents(stream):
            # Re-key each record under the temporary file id with its
            # corrected parents, copying the full text.
            for record in stream:
                chunks = record.get_bytes_as('chunked')
                new_key = (new_file_id, record.key[-1])
                parents = new_parents[new_key]
                yield ChunkedContentFactory(new_key, parents, record.sha1, chunks)
        stream = self.repo.texts.get_record_stream(
            needed_keys, 'topological', True)
        # Clear any leftover temp data from a previous interrupted run.
        self.repo._remove_file_id(new_file_id)
        self.repo.texts.insert_record_stream(fix_parents(stream))
        self.repo._remove_file_id(file_id)
        if len(new_parents):
            self.repo._move_file_id(new_file_id, file_id)


class PackReconciler(VersionedFileRepoReconciler):
    """Reconciler that reconciles a pack based repository.

    Garbage inventories do not affect ancestry queries, and removal is
    considerably more expensive as there is no separate versioned file for
    them, so they are not cleaned. In short it is currently a no-op.

    In future this may be a good place to hook in annotation cache checking,
    index recreation etc.
    """

    # XXX: The index corruption that _fix_text_parents performs is needed for
    # packs, but not yet implemented. The basic approach is to:
    #  - lock the names list
    #  - perform a customised pack() that regenerates data as needed
    #  - unlock the names list
    # https://bugs.launchpad.net/bzr/+bug/154173

    def __init__(self, repo, other=None, thorough=False,
                 canonicalize_chks=False):
        """Construct a PackReconciler.

        :param canonicalize_chks: If True, use the CHK-canonicalizing pack
            method instead of the plain reconcile pack.
        """
        super(PackReconciler, self).__init__(repo, other=other,
                                             thorough=thorough)
        self.canonicalize_chks = canonicalize_chks

    def _reconcile_steps(self):
        """Perform the steps to reconcile this repository."""
        # Pack repositories only need (and only support) thorough
        # reconciliation; otherwise this is a no-op.
        if not self.thorough:
            return
        collection = self.repo._pack_collection
        collection.ensure_loaded()
        collection.lock_names()
        try:
            packs = collection.all_packs()
            all_revisions = self.repo.all_revision_ids()
            # Count inventory index entries before and after so the
            # difference gives the number of garbage inventories dropped.
            total_inventories = len(list(
                collection.inventory_index.combined_index.iter_all_entries()))
            if len(all_revisions):
                if self.canonicalize_chks:
                    reconcile_meth = self.repo._canonicalize_chks_pack
                else:
                    reconcile_meth = self.repo._reconcile_pack
                new_pack = reconcile_meth(collection, packs, ".reconcile",
                                          all_revisions, self.pb)
                if new_pack is not None:
                    self._discard_and_save(packs)
            else:
                # only make a new pack when there is data to copy.
                self._discard_and_save(packs)
            self.garbage_inventories = total_inventories - len(list(
                collection.inventory_index.combined_index.iter_all_entries()))
        finally:
            collection._unlock_names()

    def _discard_and_save(self, packs):
        """Discard some packs from the repository.

        This removes them from the memory index, saves the in-memory index
        which makes the newly reconciled pack visible and hides the packs to be
        discarded, and finally renames the packs being discarded into the
        obsolete packs directory.

        :param packs: The packs to discard.
        """
        for pack in packs:
            self.repo._pack_collection._remove_pack_from_memory(pack)
        self.repo._pack_collection._save_pack_names()
        self.repo._pack_collection._obsolete_packs(packs)


class BranchReconciler(object):
    """Reconciler that works on a branch.

    Currently this fixes a last-revision-info revno that disagrees with the
    branch's actual left-hand history length.
    """

    def __init__(self, a_branch, thorough=False):
        """Construct a BranchReconciler.

        :param a_branch: The branch to reconcile.
        :param thorough: Accepted for interface symmetry with the
            repository reconcilers; not consulted by the current steps.
        """
        self.fixed_history = None
        self.thorough = thorough
        self.branch = a_branch

    def reconcile(self):
        """Perform reconciliation on the branch.

        :return: A ReconcileResult whose ``fixed_history`` attribute is
            True if the last revision info was corrected.
        """
        with self.branch.lock_write(), \
                ui.ui_factory.nested_progress_bar() as self.pb:
            ret = ReconcileResult()
            ret.fixed_history = self._reconcile_steps()
            return ret

    def _reconcile_steps(self):
        """Perform the steps to reconcile this branch."""
        return self._reconcile_revision_history()

    def _reconcile_revision_history(self):
        """Correct the branch revno if it disagrees with real history.

        :return: True if last_revision_info was rewritten, False if it was
            already consistent.
        """
        last_revno, last_revision_id = self.branch.last_revision_info()
        real_history = []
        graph = self.branch.repository.get_graph()
        try:
            # Walk the left-hand (mainline) ancestry down to NULL_REVISION.
            for revid in graph.iter_lefthand_ancestry(
                    last_revision_id, (_mod_revision.NULL_REVISION,)):
                real_history.append(revid)
        except errors.RevisionNotPresent:
            pass    # Hit a ghost left hand parent
        real_history.reverse()
        if last_revno != len(real_history):
            # Technically for Branch5 formats, it is more efficient to use
            # set_revision_history, as this will regenerate it again.
            # Not really worth a whole BranchReconciler class just for this,
            # though.
            ui.ui_factory.note(gettext('Fixing last revision info {0} '
                                       ' => {1}').format(
                last_revno, len(real_history)))
            self.branch.set_last_revision_info(len(real_history),
                                               last_revision_id)
            return True
        else:
            ui.ui_factory.note(gettext('revision_history ok.'))
            return False