# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

16"""Import processor that dump stats about the input (and doesn't import)."""

from __future__ import absolute_import

from .. import (
    commands,
    processor,
    reftracker,
    )
from ..helpers import (
    invert_dict,
    invert_dictset,
    )
import stat
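
# Typical usage is to feed this processor a parsed fast-import stream
# (a sketch, assuming the surrounding fastimport package layout;
# ``parser.ImportParser`` and ``ImportProcessor.process`` are defined
# outside this module):
#
#   from fastimport import parser
#   proc = InfoProcessor(verbose=1)
#   proc.process(parser.ImportParser(stream).iter_commands)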


class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.
    """

    def __init__(self, params=None, verbose=0, outf=None):
        processor.ImportProcessor.__init__(
            self, params, verbose, outf=outf)

    def pre_process(self):
        # Init statistics
        self.cmd_counts = {}
        for cmd in commands.COMMAND_NAMES:
            self.cmd_counts[cmd] = 0
        self.file_cmd_counts = {}
        for fc in commands.FILE_COMMAND_NAMES:
            self.file_cmd_counts[fc] = 0
        self.parent_counts = {}
        self.max_parent_count = 0
        self.committers = set()
        self.separate_authors_found = False
        self.symlinks_found = False
        self.executables_found = False
        self.sha_blob_references = False
        self.lightweight_tags = 0
        # Blob usage tracking
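        # 'new' = defined but not yet referenced; 'used' = referenced
        # exactly once; 'unknown' = referenced but never defined;
        # 'unmarked' = defined without a mark (see _track_blob below).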
        self.blobs = {}
        for usage in ['new', 'used', 'unknown', 'unmarked']:
            self.blobs[usage] = set()
        self.blob_ref_counts = {}
        # Head tracking
        self.reftracker = reftracker.RefTracker()
        # Stuff to cache: a map from mark to # of times that mark is merged
        self.merges = {}
        # Stuff to cache: these are maps from mark to sets
        self.rename_old_paths = {}
        self.copy_source_paths = {}

    def post_process(self):
        # Dump statistics
        cmd_names = commands.COMMAND_NAMES
        fc_names = commands.FILE_COMMAND_NAMES
        self._dump_stats_group(
            "Command counts",
            [(c.decode('utf-8'), self.cmd_counts[c]) for c in cmd_names], str)
        self._dump_stats_group(
            "File command counts",
            [(c.decode('utf-8'), self.file_cmd_counts[c]) for c in fc_names],
            str)

        # Commit stats
        if self.cmd_counts[b'commit']:
            p_items = []
            for i in range(self.max_parent_count + 1):
                if i in self.parent_counts:
                    count = self.parent_counts[i]
                    p_items.append(("parents-%d" % i, count))
            merges_count = len(self.merges)
            p_items.append(('total revisions merged', merges_count))
            flags = {
                'separate authors found': self.separate_authors_found,
                'executables': self.executables_found,
                'symlinks': self.symlinks_found,
                'blobs referenced by SHA': self.sha_blob_references,
                }
            self._dump_stats_group("Parent counts", p_items, str)
            self._dump_stats_group(
                "Commit analysis", sorted(flags.items()), _found)
            heads = invert_dictset(self.reftracker.heads)
            self._dump_stats_group(
                    "Head analysis",
                    [(k.decode('utf-8'),
                        ', '.join([m.decode('utf-8') for m in v]))
                        for (k, v) in heads.items()], None,
                    _iterable_as_config_list)
            # note("\t%d\t%s" % (len(self.committers), 'unique committers'))
            self._dump_stats_group("Merges", self.merges.items(), None)
            # We only show the rename old paths and copy source paths when
            # -vv (verbose=2) is specified: the output for MySQL's data can't
            # currently be parsed, so this bit of code needs more work anyway.
            if self.verbose >= 2:
                self._dump_stats_group(
                    "Rename old paths",
                    self.rename_old_paths.items(), len,
                    _iterable_as_config_list)
                self._dump_stats_group(
                    "Copy source paths",
                    self.copy_source_paths.items(), len,
                    _iterable_as_config_list)

        # Blob stats
        if self.cmd_counts[b'blob']:
            # In verbose mode, don't list every blob used
            if self.verbose:
                del self.blobs['used']
            self._dump_stats_group(
                "Blob usage tracking",
                [(k, set([v1.decode() for v1 in v]))
                 for (k, v) in self.blobs.items()],
                len, _iterable_as_config_list)
        if self.blob_ref_counts:
            blobs_by_count = invert_dict(self.blob_ref_counts)
            blob_items = sorted(blobs_by_count.items())
            self._dump_stats_group(
                "Blob reference counts",
                blob_items, len, _iterable_as_config_list)

        # Other stats
        if self.cmd_counts[b'reset']:
            reset_stats = {
                'lightweight tags': self.lightweight_tags,
                }
            self._dump_stats_group("Reset analysis", reset_stats.items())

    def _dump_stats_group(self, title, items, normal_formatter=None,
                          verbose_formatter=None):
        """Dump a group of statistics to the output stream.

        In verbose mode, the group is written as a config file section
        so that other processors can load the information if they want to.
        :param title: the name of the group
        :param items: a sequence of (name, value) pairs to report
        :param normal_formatter: the callable to apply to the value
          before displaying it in normal mode
        :param verbose_formatter: the callable to apply to the value
          before displaying it in verbose mode
        """
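        # Illustrative rendering of a ("commit", 12) item under the
        # "Command counts" title:
        #   verbose: "[Command counts]" then "commit = 12"
        #   normal:  "Command counts:" then "\t12\tcommit"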
        if self.verbose:
            self.outf.write("[%s]\n" % (title,))
            for name, value in items:
                if verbose_formatter is not None:
                    value = verbose_formatter(value)
                if isinstance(name, str):
                    name = name.replace(' ', '-')
                self.outf.write("%s = %s\n" % (name, value))
            self.outf.write("\n")
        else:
            self.outf.write("%s:\n" % (title,))
            for name, value in items:
                if normal_formatter is not None:
                    value = normal_formatter(value)
                self.outf.write("\t%s\t%s\n" % (value, name))

    def progress_handler(self, cmd):
        """Process a ProgressCommand."""
        self.cmd_counts[cmd.name] += 1

    def blob_handler(self, cmd):
        """Process a BlobCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.mark is None:
            self.blobs['unmarked'].add(cmd.id)
        else:
            self.blobs['new'].add(cmd.id)
            # Marks can be re-used, so remove the id from 'used' if it is
            # already there. Note: we deliberately do NOT touch
            # blob_ref_counts here - a re-used mark keeps its reference
            # history.
            try:
                self.blobs['used'].remove(cmd.id)
            except KeyError:
                pass

    def checkpoint_handler(self, cmd):
        """Process a CheckpointCommand."""
        self.cmd_counts[cmd.name] += 1

    def commit_handler(self, cmd):
        """Process a CommitCommand."""
        self.cmd_counts[cmd.name] += 1
        self.committers.add(cmd.committer)
        if cmd.author is not None:
            self.separate_authors_found = True
        for fc in cmd.iter_files():
            self.file_cmd_counts[fc.name] += 1
            if isinstance(fc, commands.FileModifyCommand):
                if fc.mode & 0o111:
                    self.executables_found = True
                if stat.S_ISLNK(fc.mode):
                    self.symlinks_found = True
                if fc.dataref is not None:
                    # Mark references look like b':<n>'; anything else is
                    # a raw SHA reference. (dataref is bytes, so indexing
                    # it yields an int - compare with startswith instead.)
                    if fc.dataref.startswith(b':'):
                        self._track_blob(fc.dataref)
                    else:
                        self.sha_blob_references = True
            elif isinstance(fc, commands.FileRenameCommand):
                self.rename_old_paths.setdefault(cmd.id, set()).add(
                    fc.old_path)
            elif isinstance(fc, commands.FileCopyCommand):
                self.copy_source_paths.setdefault(cmd.id, set()).add(
                    fc.src_path)

        # Track the heads
        parents = self.reftracker.track_heads(cmd)

        # Track the parent counts
        parent_count = len(parents)
        try:
            self.parent_counts[parent_count] += 1
        except KeyError:
            self.parent_counts[parent_count] = 1
            if parent_count > self.max_parent_count:
                self.max_parent_count = parent_count

        # Remember the merges: count how often each mark is merged
        if cmd.merges:
            for merge in cmd.merges:
                self.merges[merge] = self.merges.get(merge, 0) + 1

    def reset_handler(self, cmd):
        """Process a ResetCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.ref.startswith(b'refs/tags/'):
            self.lightweight_tags += 1
        else:
            if cmd.from_ is not None:
                self.reftracker.track_heads_for_ref(
                    cmd.ref, cmd.from_)

    def tag_handler(self, cmd):
        """Process a TagCommand."""
        self.cmd_counts[cmd.name] += 1

    def feature_handler(self, cmd):
        """Process a FeatureCommand."""
        self.cmd_counts[cmd.name] += 1
        feature = cmd.feature_name
        if feature not in commands.FEATURE_NAMES:
            self.warning(
                "feature %s is not supported - parsing may fail"
                % (feature,))

    def _track_blob(self, mark):
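        # A mark's lifecycle here: it starts in 'new' when its blob is
        # defined; the first reference moves it to 'used'; the second
        # moves it into blob_ref_counts (count == 2), and each later
        # reference bumps that count. References to marks never defined
        # by a blob command land in 'unknown'.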
        if mark in self.blob_ref_counts:
            self.blob_ref_counts[mark] += 1
        elif mark in self.blobs['used']:
            self.blob_ref_counts[mark] = 2
            self.blobs['used'].remove(mark)
        elif mark in self.blobs['new']:
            self.blobs['used'].add(mark)
            self.blobs['new'].remove(mark)
        else:
            self.blobs['unknown'].add(mark)


def _found(b):
    """Format a found boolean as a string."""
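    # e.g. _found(True) -> 'found'; _found(False) -> 'no'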
    return ['no', 'found'][b]


def _iterable_as_config_list(s):
    """Format an iterable as a sequence of comma-separated strings.

    To match what ConfigObj expects, a single item list has a trailing comma.
    """
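    # e.g. _iterable_as_config_list(['a']) -> "a,"
    #      _iterable_as_config_list(['b', 'a']) -> "a, b"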
    items = sorted(s)
    if len(items) == 1:
        return "%s," % (items[0],)
    else:
        return ", ".join(items)