#!/usr/bin/env python

"""
CmpRuns - A simple tool for comparing two static analyzer runs to determine
which reports have been added, removed, or changed.

This is designed to support automated testing using the static analyzer, from
two perspectives:
  1. To monitor changes in the static analyzer's reports on real code bases,
     for regression testing.

  2. For use by end users who want to integrate regular static analyzer testing
     into a buildbot-like environment.

Usage:

    # Load the results of both runs, to obtain lists of the corresponding
    # AnalysisDiagnostic objects.
    #
    resultsA = loadResultsFromSingleRun(singleRunInfoA, deleteEmpty)
    resultsB = loadResultsFromSingleRun(singleRunInfoB, deleteEmpty)

    # Generate a relation from diagnostics in run A to diagnostics in run B,
    # as a list of pairs (a, b); opts are the parsed command line options
    # (see generate_option_parser()).
    diff = compareResults(resultsA, resultsB, opts)
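
    # Each pair holds matching diagnostics from the two runs; a is None for
    # reports that only appear in run B (added), and b is None for reports
    # that only appear in run A (removed). For example:
    for a, b in diff:
        if a is None:
            print("ADDED: " + b.getReadableName())
        elif b is None:
            print("REMOVED: " + a.getReadableName())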

"""
from __future__ import division, print_function

from collections import defaultdict

from math import log
from optparse import OptionParser
import json
import os
import plistlib
import re
import sys

STATS_REGEXP = re.compile(r"Statistics: (\{.+\})", re.MULTILINE | re.DOTALL)

class Colors(object):
    """
    Colors for terminal highlighting.
    """
    RED = '\x1b[2;30;41m'
    GREEN = '\x1b[6;30;42m'
    CLEAR = '\x1b[0m'

# Information about an analysis run:
# path - the analysis output directory
# root - the name of the root directory, which will be disregarded when
# determining the source file name
class SingleRunInfo(object):
    def __init__(self, path, root="", verboseLog=None):
        self.path = path
        self.root = root.rstrip("/\\")
        self.verboseLog = verboseLog


class AnalysisDiagnostic(object):
    def __init__(self, data, report, htmlReport):
        self._data = data
        self._loc = self._data['location']
        self._report = report
        self._htmlReport = htmlReport
        self._reportSize = len(self._data['path'])

    def getFileName(self):
        root = self._report.run.root
        fileName = self._report.files[self._loc['file']]
        if fileName.startswith(root) and len(root) > 0:
            return fileName[len(root) + 1:]
        return fileName

    def getRootFileName(self):
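        """
        Return the name of the file referenced by the first element of the
        bug path, with the run's root prefix stripped; falls back to
        getFileName() when the path is empty.
        """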
        path = self._data['path']
        if not path:
            return self.getFileName()
        p = path[0]
        if 'location' in p:
            fIdx = p['location']['file']
        else: # control edge
            fIdx = path[0]['edges'][0]['start'][0]['file']
        out = self._report.files[fIdx]
        root = self._report.run.root
        if out.startswith(root):
            return out[len(root):]
        return out

    def getLine(self):
        return self._loc['line']

    def getColumn(self):
        return self._loc['col']

    def getPathLength(self):
        return self._reportSize

    def getCategory(self):
        return self._data['category']

    def getDescription(self):
        return self._data['description']

    def getIssueIdentifier(self):
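        """
        Return a stable identifier for this issue, built from the file name,
        the issue context (typically the enclosing function or method), and
        the issue hash, so that the same issue can be matched across runs.
        """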
        id = self.getFileName() + "+"
        if 'issue_context' in self._data:
            id += self._data['issue_context'] + "+"
        if 'issue_hash_content_of_line_in_context' in self._data:
            id += str(self._data['issue_hash_content_of_line_in_context'])
        return id

    def getReport(self):
        if self._htmlReport is None:
            return " "
        return os.path.join(self._report.run.path, self._htmlReport)

    def getReadableName(self):
        if 'issue_context' in self._data:
            funcnamePostfix = "#" + self._data['issue_context']
        else:
            funcnamePostfix = ""
        rootFilename = self.getRootFileName()
        fileName = self.getFileName()
        if rootFilename != fileName:
            filePrefix = "[%s] %s" % (rootFilename, fileName)
        else:
            filePrefix = rootFilename
        return '%s%s:%d:%d, %s: %s' % (filePrefix,
                                       funcnamePostfix,
                                       self.getLine(),
                                       self.getColumn(), self.getCategory(),
                                       self.getDescription())

    # Note, the data format is not an API and may change from one analyzer
    # version to another.
    def getRawData(self):
        return self._data


class AnalysisReport(object):
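    """
    A single analysis output file: the run it belongs to, the list of source
    files it references, and the diagnostics it contains.
    """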
    def __init__(self, run, files):
        self.run = run
        self.files = files
        self.diagnostics = []


class AnalysisRun(object):
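    """
    All reports loaded from one analyzer output directory (or from a single
    plist file), together with the clang version and collected statistics.
    """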
    def __init__(self, info):
        self.path = info.path
        self.root = info.root
        self.info = info
        self.reports = []
        # Cumulative list of all diagnostics from all the reports.
        self.diagnostics = []
        self.clang_version = None
        self.stats = []

    def getClangVersion(self):
        return self.clang_version

    def readSingleFile(self, p, deleteEmpty):
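        """
        Read a single plist output file: record its statistics block and
        clang version, and append its report and diagnostics to this run.
        If deleteEmpty is true, plist files without any reports are removed
        from disk.
        """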
        data = plistlib.readPlist(p)
        if 'statistics' in data:
            self.stats.append(json.loads(data['statistics']))
            data.pop('statistics')

        # We want to retrieve the clang version even if there are no
        # reports. Assume that all reports were created using the same
        # clang version (this is always true and is more efficient).
        if 'clang_version' in data:
            if self.clang_version is None:
                self.clang_version = data.pop('clang_version')
            else:
                data.pop('clang_version')

        # Ignore/delete empty reports.
        if not data['files']:
            if deleteEmpty:
                os.remove(p)
            return

        # Extract the HTML reports, if they exist.
        if 'HTMLDiagnostics_files' in data['diagnostics'][0]:
            htmlFiles = []
            for d in data['diagnostics']:
                # FIXME: Why is this named files, when does it have multiple
                # files?
                assert len(d['HTMLDiagnostics_files']) == 1
                htmlFiles.append(d.pop('HTMLDiagnostics_files')[0])
        else:
            htmlFiles = [None] * len(data['diagnostics'])

        report = AnalysisReport(self, data.pop('files'))
        diagnostics = [AnalysisDiagnostic(d, report, h)
                       for d, h in zip(data.pop('diagnostics'), htmlFiles)]

        assert not data

        report.diagnostics.extend(diagnostics)
        self.reports.append(report)
        self.diagnostics.extend(diagnostics)


def loadResults(path, opts, root="", deleteEmpty=True):
    """
    Backwards compatibility API.
    """
    return loadResultsFromSingleRun(SingleRunInfo(path, root, opts.verboseLog),
                                    deleteEmpty)


def loadResultsFromSingleRun(info, deleteEmpty=True):
    """
    Load the results of the analyses from a given output folder.
    - info is the SingleRunInfo object
    - deleteEmpty specifies whether empty plist files should be deleted
    """
    path = info.path
    run = AnalysisRun(info)

    if os.path.isfile(path):
        run.readSingleFile(path, deleteEmpty)
    else:
        for (dirpath, dirnames, filenames) in os.walk(path):
            for f in filenames:
                if not f.endswith('plist'):
                    continue
                p = os.path.join(dirpath, f)
                run.readSingleFile(p, deleteEmpty)

    return run


def cmpAnalysisDiagnostic(d):
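    """
    Sort key for diagnostics: the issue identifier, so that diagnostics for
    the same issue compare equal across runs.
    """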
    return d.getIssueIdentifier()


def compareResults(A, B, opts):
    """
    compareResults - Generate a relation from diagnostics in run A to
    diagnostics in run B.

    The result is the relation as a list of pairs (a, b), where each element
    is either None or a matching element from the respective run.
    """

    res = []

    # Differences in path length for matched reports (as a ratio, log ratio,
    # or absolute difference, depending on the selected histogram option).
    path_difference_data = []

    # Quickly eliminate equal elements.
    neqA = []
    neqB = []
    eltsA = list(A.diagnostics)
    eltsB = list(B.diagnostics)
    eltsA.sort(key=cmpAnalysisDiagnostic)
    eltsB.sort(key=cmpAnalysisDiagnostic)
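    # Walk both sorted lists from the largest identifier down. Equal
    # identifiers are emitted as a matched pair; otherwise the diagnostic
    # with the larger identifier cannot have a match among the remaining
    # (smaller) elements of the other list, so it is recorded as unmatched
    # and the other element is pushed back for the next iteration.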
    while eltsA and eltsB:
        a = eltsA.pop()
        b = eltsB.pop()
        if a.getIssueIdentifier() == b.getIssueIdentifier():
            if a.getPathLength() != b.getPathLength():
                if opts.relative_path_histogram:
                    path_difference_data.append(
                        float(a.getPathLength()) / b.getPathLength())
                elif opts.relative_log_path_histogram:
                    path_difference_data.append(
                        log(float(a.getPathLength()) / b.getPathLength()))
                elif opts.absolute_path_histogram:
                    path_difference_data.append(
                        a.getPathLength() - b.getPathLength())

            res.append((a, b))
        elif a.getIssueIdentifier() > b.getIssueIdentifier():
            eltsB.append(b)
            neqA.append(a)
        else:
            eltsA.append(a)
            neqB.append(b)
    neqA.extend(eltsA)
    neqB.extend(eltsB)

    # FIXME: Add fuzzy matching. One simple and possibly effective idea would
    # be to bin the diagnostics, print them in a normalized form (based solely
    # on the structure of the diagnostic), compute the diff, then use that as
    # the basis for matching. This has the nice property that we don't depend
    # in any way on the diagnostic format.

    for a in neqA:
        res.append((a, None))
    for b in neqB:
        res.append((None, b))

    if opts.relative_log_path_histogram or opts.relative_path_histogram or \
            opts.absolute_path_histogram:
        from matplotlib import pyplot
        pyplot.hist(path_difference_data, bins=100)
        pyplot.show()

    return res

def computePercentile(l, percentile):
    """
    Return the given percentile of the values in l.
    """
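    # Nearest-rank style lookup: for example, with l = [1, 2, 3, 4, 5] and
    # percentile = 0.9 the index is int(round(0.9 * 5 + 0.5)) - 1 == 4, so
    # the result is 5.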
    return sorted(l)[int(round(percentile * len(l) + 0.5)) - 1]

def deriveStats(results):
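    """
    Aggregate the per-file statistics and report path lengths of a run into
    summary statistics (max, min, mean, median, percentiles, total) per key.
    """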
    # Assume all keys are the same in each statistics bucket.
    combined_data = defaultdict(list)

    # Collect data on path lengths.
    for report in results.reports:
        for diagnostic in report.diagnostics:
            combined_data['PathsLength'].append(diagnostic.getPathLength())

    for stat in results.stats:
        for key, value in stat.items():
            combined_data[key].append(value)
    combined_stats = {}
    for key, values in combined_data.items():
        combined_stats[str(key)] = {
            "max": max(values),
            "min": min(values),
            "mean": sum(values) / len(values),
            "90th %tile": computePercentile(values, 0.9),
            "95th %tile": computePercentile(values, 0.95),
            "median": sorted(values)[len(values) // 2],
            "total": sum(values)
        }
    return combined_stats


def compareStats(resultsA, resultsB):
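    """
    Print a per-key comparison of the derived statistics of two runs. When
    writing to a terminal (except on Windows), significant decreases are
    highlighted in green and significant increases in red.
    """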
    statsA = deriveStats(resultsA)
    statsB = deriveStats(resultsB)
    keys = sorted(statsA.keys())
    for key in keys:
        print(key)
        for kkey in statsA[key]:
            valA = float(statsA[key][kkey])
            valB = float(statsB[key][kkey])
            report = "%.3f -> %.3f" % (valA, valB)
            # Only apply highlighting when writing to a TTY and not on Windows
            if sys.stdout.isatty() and os.name != 'nt':
                if valB != 0:
                    ratio = (valB - valA) / valB
                    if ratio < -0.2:
                        report = Colors.GREEN + report + Colors.CLEAR
                    elif ratio > 0.2:
                        report = Colors.RED + report + Colors.CLEAR
            print("\t %s %s" % (kkey, report))

def dumpScanBuildResultsDiff(dirA, dirB, opts, deleteEmpty=True,
                             Stdout=sys.stdout):
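    """
    Compare the results in two analyzer output directories and write the
    added and removed reports to Stdout (and, optionally, to the verbose
    log). Returns a tuple (number of differences, number of reports in A,
    number of reports in B), or None when only statistics were requested.
    """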
    # Load the run results.
    resultsA = loadResults(dirA, opts, opts.rootA, deleteEmpty)
    resultsB = loadResults(dirB, opts, opts.rootB, deleteEmpty)
    if opts.show_stats:
        compareStats(resultsA, resultsB)
    if opts.stats_only:
        return

    # Open the verbose log, if given. Open in text mode, since plain strings
    # are written below.
    if opts.verboseLog:
        auxLog = open(opts.verboseLog, "w")
    else:
        auxLog = None

    diff = compareResults(resultsA, resultsB, opts)
    foundDiffs = 0
    totalAdded = 0
    totalRemoved = 0
    for res in diff:
        a, b = res
        if a is None:
            Stdout.write("ADDED: %r\n" % b.getReadableName())
            foundDiffs += 1
            totalAdded += 1
            if auxLog:
                auxLog.write("('ADDED', %r, %r)\n" % (b.getReadableName(),
                                                      b.getReport()))
        elif b is None:
            Stdout.write("REMOVED: %r\n" % a.getReadableName())
            foundDiffs += 1
            totalRemoved += 1
            if auxLog:
                auxLog.write("('REMOVED', %r, %r)\n" % (a.getReadableName(),
                                                        a.getReport()))
        else:
            pass

    TotalReports = len(resultsB.diagnostics)
    Stdout.write("TOTAL REPORTS: %r\n" % TotalReports)
    Stdout.write("TOTAL ADDED: %r\n" % totalAdded)
    Stdout.write("TOTAL REMOVED: %r\n" % totalRemoved)
    if auxLog:
        auxLog.write("('TOTAL NEW REPORTS', %r)\n" % TotalReports)
        auxLog.write("('TOTAL DIFFERENCES', %r)\n" % foundDiffs)
        auxLog.close()

    return foundDiffs, len(resultsA.diagnostics), len(resultsB.diagnostics)

def generate_option_parser():
    parser = OptionParser("usage: %prog [options] [dir A] [dir B]")
    parser.add_option("", "--rootA", dest="rootA",
                      help="Prefix to ignore on source files for directory A",
                      action="store", type=str, default="")
    parser.add_option("", "--rootB", dest="rootB",
                      help="Prefix to ignore on source files for directory B",
                      action="store", type=str, default="")
    parser.add_option("", "--verbose-log", dest="verboseLog",
                      help="Write additional information to LOG \
                           [default=None]",
                      action="store", type=str, default=None,
                      metavar="LOG")
    parser.add_option("--relative-path-differences-histogram",
                      action="store_true", dest="relative_path_histogram",
                      default=False,
                      help="Show histogram of relative path differences. \
                            Requires matplotlib")
    parser.add_option("--relative-log-path-differences-histogram",
                      action="store_true", dest="relative_log_path_histogram",
                      default=False,
                      help="Show histogram of log relative path differences. \
                            Requires matplotlib")
    parser.add_option("--absolute-path-differences-histogram",
                      action="store_true", dest="absolute_path_histogram",
                      default=False,
                      help="Show histogram of absolute path differences. \
                            Requires matplotlib")
    parser.add_option("--stats-only", action="store_true", dest="stats_only",
                      default=False, help="Only show statistics on reports")
    parser.add_option("--show-stats", action="store_true", dest="show_stats",
                      default=False, help="Show change in statistics")
    return parser


def main():
    parser = generate_option_parser()
    (opts, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("invalid number of arguments")

    dirA, dirB = args

    dumpScanBuildResultsDiff(dirA, dirB, opts)


if __name__ == '__main__':
    main()