1# Copyright (C) 2008 Canonical Ltd 2# 3# This program is free software; you can redistribute it and/or modify 4# it under the terms of the GNU General Public License as published by 5# the Free Software Foundation; either version 2 of the License, or 6# (at your option) any later version. 7# 8# This program is distributed in the hope that it will be useful, 9# but WITHOUT ANY WARRANTY; without even the implied warranty of 10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11# GNU General Public License for more details. 12# 13# You should have received a copy of the GNU General Public License 14# along with this program. If not, see <http://www.gnu.org/licenses/>. 15 16"""Import processor that dump stats about the input (and doesn't import).""" 17 18from __future__ import absolute_import 19 20from .. import ( 21 commands, 22 processor, 23 reftracker, 24 ) 25from ..helpers import ( 26 invert_dict, 27 invert_dictset, 28 ) 29import stat 30 31 32class InfoProcessor(processor.ImportProcessor): 33 """An import processor that dumps statistics about the input. 34 35 No changes to the current repository are made. 36 37 As well as providing useful information about an import 38 stream before importing it, this processor is useful for 39 benchmarking the speed at which data can be extracted from 40 the source. 41 """ 42 43 def __init__(self, params=None, verbose=0, outf=None): 44 processor.ImportProcessor.__init__( 45 self, params, verbose, outf=outf) 46 47 def pre_process(self): 48 # Init statistics 49 self.cmd_counts = {} 50 for cmd in commands.COMMAND_NAMES: 51 self.cmd_counts[cmd] = 0 52 self.file_cmd_counts = {} 53 for fc in commands.FILE_COMMAND_NAMES: 54 self.file_cmd_counts[fc] = 0 55 self.parent_counts = {} 56 self.max_parent_count = 0 57 self.committers = set() 58 self.separate_authors_found = False 59 self.symlinks_found = False 60 self.executables_found = False 61 self.sha_blob_references = False 62 self.lightweight_tags = 0 63 # Blob usage tracking 64 self.blobs = {} 65 for usage in ['new', 'used', 'unknown', 'unmarked']: 66 self.blobs[usage] = set() 67 self.blob_ref_counts = {} 68 # Head tracking 69 self.reftracker = reftracker.RefTracker() 70 # Stuff to cache: a map from mark to # of times that mark is merged 71 self.merges = {} 72 # Stuff to cache: these are maps from mark to sets 73 self.rename_old_paths = {} 74 self.copy_source_paths = {} 75 76 def post_process(self): 77 # Dump statistics 78 cmd_names = commands.COMMAND_NAMES 79 fc_names = commands.FILE_COMMAND_NAMES 80 self._dump_stats_group( 81 "Command counts", 82 [(c.decode('utf-8'), self.cmd_counts[c]) for c in cmd_names], str) 83 self._dump_stats_group( 84 "File command counts", 85 [(c.decode('utf-8'), self.file_cmd_counts[c]) for c in fc_names], 86 str) 87 88 # Commit stats 89 if self.cmd_counts[b'commit']: 90 p_items = [] 91 for i in range(self.max_parent_count + 1): 92 if i in self.parent_counts: 93 count = self.parent_counts[i] 94 p_items.append(("parents-%d" % i, count)) 95 merges_count = len(self.merges) 96 p_items.append(('total revisions merged', merges_count)) 97 flags = { 98 'separate authors found': self.separate_authors_found, 99 'executables': self.executables_found, 100 'symlinks': self.symlinks_found, 101 'blobs referenced by SHA': self.sha_blob_references, 102 } 103 self._dump_stats_group("Parent counts", p_items, str) 104 self._dump_stats_group( 105 "Commit analysis", sorted(flags.items()), _found) 106 heads = invert_dictset(self.reftracker.heads) 107 self._dump_stats_group( 108 "Head analysis", 109 [(k.decode('utf-8'), 110 ', '.join([m.decode('utf-8') for m in v])) 111 for (k, v) in heads.items()], None, 112 _iterable_as_config_list) 113 # note("\t%d\t%s" % (len(self.committers), 'unique committers')) 114 self._dump_stats_group("Merges", self.merges.items(), None) 115 # We only show the rename old path and copy source paths when -vv 116 # (verbose=2) is specified. The output here for mysql's data can't 117 # be parsed currently so this bit of code needs more work anyhow .. 118 if self.verbose >= 2: 119 self._dump_stats_group( 120 "Rename old paths", 121 self.rename_old_paths.items(), len, 122 _iterable_as_config_list) 123 self._dump_stats_group( 124 "Copy source paths", 125 self.copy_source_paths.items(), len, 126 _iterable_as_config_list) 127 128 # Blob stats 129 if self.cmd_counts[b'blob']: 130 # In verbose mode, don't list every blob used 131 if self.verbose: 132 del self.blobs['used'] 133 self._dump_stats_group( 134 "Blob usage tracking", 135 [(k, set([v1.decode() for v1 in v])) 136 for (k, v) in self.blobs.items()], 137 len, _iterable_as_config_list) 138 if self.blob_ref_counts: 139 blobs_by_count = invert_dict(self.blob_ref_counts) 140 blob_items = sorted(blobs_by_count.items()) 141 self._dump_stats_group( 142 "Blob reference counts", 143 blob_items, len, _iterable_as_config_list) 144 145 # Other stats 146 if self.cmd_counts[b'reset']: 147 reset_stats = { 148 'lightweight tags': self.lightweight_tags, 149 } 150 self._dump_stats_group("Reset analysis", reset_stats.items()) 151 152 def _dump_stats_group(self, title, items, normal_formatter=None, 153 verbose_formatter=None): 154 """Dump a statistics group. 155 156 In verbose mode, do so as a config file so 157 that other processors can load the information if they want to. 158 :param normal_formatter: the callable to apply to the value 159 before displaying it in normal mode 160 :param verbose_formatter: the callable to apply to the value 161 before displaying it in verbose mode 162 """ 163 if self.verbose: 164 self.outf.write("[%s]\n" % (title,)) 165 for name, value in items: 166 if verbose_formatter is not None: 167 value = verbose_formatter(value) 168 if type(name) == str: 169 name = name.replace(' ', '-') 170 self.outf.write("%s = %s\n" % (name, value)) 171 self.outf.write("\n") 172 else: 173 self.outf.write("%s:\n" % (title,)) 174 for name, value in items: 175 if normal_formatter is not None: 176 value = normal_formatter(value) 177 self.outf.write("\t%s\t%s\n" % (value, name)) 178 179 def progress_handler(self, cmd): 180 """Process a ProgressCommand.""" 181 self.cmd_counts[cmd.name] += 1 182 183 def blob_handler(self, cmd): 184 """Process a BlobCommand.""" 185 self.cmd_counts[cmd.name] += 1 186 if cmd.mark is None: 187 self.blobs['unmarked'].add(cmd.id) 188 else: 189 self.blobs['new'].add(cmd.id) 190 # Marks can be re-used so remove it from used if already there. 191 # Note: we definitely do NOT want to remove it from multi if 192 # it's already in that set. 193 try: 194 self.blobs['used'].remove(cmd.id) 195 except KeyError: 196 pass 197 198 def checkpoint_handler(self, cmd): 199 """Process a CheckpointCommand.""" 200 self.cmd_counts[cmd.name] += 1 201 202 def commit_handler(self, cmd): 203 """Process a CommitCommand.""" 204 self.cmd_counts[cmd.name] += 1 205 self.committers.add(cmd.committer) 206 if cmd.author is not None: 207 self.separate_authors_found = True 208 for fc in cmd.iter_files(): 209 self.file_cmd_counts[fc.name] += 1 210 if isinstance(fc, commands.FileModifyCommand): 211 if fc.mode & 0o111: 212 self.executables_found = True 213 if stat.S_ISLNK(fc.mode): 214 self.symlinks_found = True 215 if fc.dataref is not None: 216 if fc.dataref[0] == ':': 217 self._track_blob(fc.dataref) 218 else: 219 self.sha_blob_references = True 220 elif isinstance(fc, commands.FileRenameCommand): 221 self.rename_old_paths.setdefault(cmd.id, set()).add( 222 fc.old_path) 223 elif isinstance(fc, commands.FileCopyCommand): 224 self.copy_source_paths.setdefault(cmd.id, set()).add( 225 fc.src_path) 226 227 # Track the heads 228 parents = self.reftracker.track_heads(cmd) 229 230 # Track the parent counts 231 parent_count = len(parents) 232 try: 233 self.parent_counts[parent_count] += 1 234 except KeyError: 235 self.parent_counts[parent_count] = 1 236 if parent_count > self.max_parent_count: 237 self.max_parent_count = parent_count 238 239 # Remember the merges 240 if cmd.merges: 241 for merge in cmd.merges: 242 if merge in self.merges: 243 self.merges[merge] += 1 244 else: 245 self.merges[merge] = 1 246 247 def reset_handler(self, cmd): 248 """Process a ResetCommand.""" 249 self.cmd_counts[cmd.name] += 1 250 if cmd.ref.startswith(b'refs/tags/'): 251 self.lightweight_tags += 1 252 else: 253 if cmd.from_ is not None: 254 self.reftracker.track_heads_for_ref( 255 cmd.ref, cmd.from_) 256 257 def tag_handler(self, cmd): 258 """Process a TagCommand.""" 259 self.cmd_counts[cmd.name] += 1 260 261 def feature_handler(self, cmd): 262 """Process a FeatureCommand.""" 263 self.cmd_counts[cmd.name] += 1 264 feature = cmd.feature_name 265 if feature not in commands.FEATURE_NAMES: 266 self.warning( 267 "feature %s is not supported - parsing may fail" 268 % (feature,)) 269 270 def _track_blob(self, mark): 271 if mark in self.blob_ref_counts: 272 self.blob_ref_counts[mark] += 1 273 pass 274 elif mark in self.blobs['used']: 275 self.blob_ref_counts[mark] = 2 276 self.blobs['used'].remove(mark) 277 elif mark in self.blobs['new']: 278 self.blobs['used'].add(mark) 279 self.blobs['new'].remove(mark) 280 else: 281 self.blobs['unknown'].add(mark) 282 283 284def _found(b): 285 """Format a found boolean as a string.""" 286 return ['no', 'found'][b] 287 288 289def _iterable_as_config_list(s): 290 """Format an iterable as a sequence of comma-separated strings. 291 292 To match what ConfigObj expects, a single item list has a trailing comma. 293 """ 294 items = sorted(s) 295 if len(items) == 1: 296 return "%s," % (items[0],) 297 else: 298 return ", ".join(items) 299