#!/usr/bin/env python

"""
CmpRuns - A simple tool for comparing two static analyzer runs to determine
which reports have been added, removed, or changed.

This is designed to support automated testing using the static analyzer, from
two perspectives:
  1. To monitor changes in the static analyzer's reports on real code bases,
     for regression testing.

  2. For use by end users who want to integrate regular static analyzer testing
     into a buildbot like environment.

Usage:

    # Load the results of both runs, to obtain lists of the corresponding
    # AnalysisDiagnostic objects.
    #
    resultsA = loadResultsFromSingleRun(singleRunInfoA, deleteEmpty)
    resultsB = loadResultsFromSingleRun(singleRunInfoB, deleteEmpty)

    # Generate a relation from diagnostics in run A to diagnostics in run B
    # to obtain a list of pairs (a, b), where either side may be None.
    diff = compareResults(resultsA, resultsB, opts)

"""
from __future__ import division, print_function

from collections import defaultdict

from math import log
from optparse import OptionParser
import json
import os
import plistlib
import re
import sys

STATS_REGEXP = re.compile(r"Statistics: (\{.+\})", re.MULTILINE | re.DOTALL)


class Colors(object):
    """
    Color for terminal highlight.
    """
    RED = '\x1b[2;30;41m'
    GREEN = '\x1b[6;30;42m'
    CLEAR = '\x1b[0m'


# Information about analysis run:
# path - the analysis output directory
# root - the name of the root directory, which will be disregarded when
#      determining the source file name
class SingleRunInfo(object):
    def __init__(self, path, root="", verboseLog=None):
        self.path = path
        # Trailing separators are dropped so that stripping the root plus one
        # separator character yields a relative file name.
        self.root = root.rstrip("/\\")
        self.verboseLog = verboseLog


class AnalysisDiagnostic(object):
    """A single diagnostic entry taken from an analyzer plist report.

    data is the raw diagnostic dictionary, report is the owning
    AnalysisReport and htmlReport is the name of the matching HTML report
    file (or None when there is none).
    """

    def __init__(self, data, report, htmlReport):
        self._data = data
        self._loc = self._data['location']
        self._report = report
        self._htmlReport = htmlReport
        self._reportSize = len(self._data['path'])

    def _stripRoot(self, fileName):
        """Return fileName without the run's root prefix and its separator.

        The name is returned unchanged when the root is empty or is not a
        prefix of fileName.
        """
        root = self._report.run.root
        if root and fileName.startswith(root):
            return fileName[len(root) + 1:]
        return fileName

    def getFileName(self):
        """Return the root-relative name of the file the issue is located in."""
        return self._stripRoot(self._report.files[self._loc['file']])

    def getRootFileName(self):
        """Return the root-relative name of the file where the path starts.

        Falls back to getFileName() when the diagnostic has an empty path.
        """
        path = self._data['path']
        if not path:
            return self.getFileName()
        first = path[0]
        if 'location' in first:
            fIdx = first['location']['file']
        else:  # control edge
            fIdx = first['edges'][0]['start'][0]['file']
        # Strip the root exactly as getFileName() does; otherwise the two
        # names never compare equal in getReadableName() for a non-empty root.
        return self._stripRoot(self._report.files[fIdx])

    def getLine(self):
        return self._loc['line']

    def getColumn(self):
        return self._loc['col']

    def getPathLength(self):
        return self._reportSize

    def getCategory(self):
        return self._data['category']

    def getDescription(self):
        return self._data['description']

    def getIssueIdentifier(self):
        """Return the key used to match this diagnostic across two runs.

        It is built from the file name, the issue context (when present) and
        the content hash of the line in context (when present).
        """
        identifier = self.getFileName() + "+"
        if 'issue_context' in self._data:
            identifier += self._data['issue_context'] + "+"
        if 'issue_hash_content_of_line_in_context' in self._data:
            identifier += str(
                self._data['issue_hash_content_of_line_in_context'])
        return identifier

    def getReport(self):
        """Return the path of the HTML report, or " " when there is none."""
        if self._htmlReport is None:
            return " "
        return os.path.join(self._report.run.path, self._htmlReport)

    def getReadableName(self):
        """Return a one-line human readable description of the diagnostic."""
        if 'issue_context' in self._data:
            funcnamePostfix = "#" + self._data['issue_context']
        else:
            funcnamePostfix = ""
        rootFilename = self.getRootFileName()
        fileName = self.getFileName()
        # Mention the root file only when the path starts in another file.
        if rootFilename != fileName:
            filePrefix = "[%s] %s" % (rootFilename, fileName)
        else:
            filePrefix = rootFilename
        return '%s%s:%d:%d, %s: %s' % (filePrefix,
                                       funcnamePostfix,
                                       self.getLine(),
                                       self.getColumn(), self.getCategory(),
                                       self.getDescription())

    # Note, the data format is not an API and may change from one analyzer
    # version to another.
    def getRawData(self):
        return self._data


class AnalysisReport(object):
    """The contents of one plist file: its file table and its diagnostics."""

    def __init__(self, run, files):
        self.run = run
        self.files = files
        self.diagnostics = []


def _readPlist(path):
    """Parse the plist file at path with whichever plistlib API exists."""
    # plistlib.readPlist was removed in Python 3.9, while plistlib.load does
    # not exist on Python 2; support both.
    if hasattr(plistlib, 'load'):
        with open(path, 'rb') as plistFile:
            return plistlib.load(plistFile)
    return plistlib.readPlist(path)


class AnalysisRun(object):
    """All reports, diagnostics and statistics loaded for one analyzer run."""

    def __init__(self, info):
        self.path = info.path
        self.root = info.root
        self.info = info
        self.reports = []
        # Cumulative list of all diagnostics from all the reports.
        self.diagnostics = []
        self.clang_version = None
        self.stats = []

    def getClangVersion(self):
        return self.clang_version

    def readSingleFile(self, p, deleteEmpty):
        """Load the plist file p and add its contents to this run.

        A file whose 'files' list is empty is ignored, and additionally
        removed from disk when deleteEmpty is true.
        """
        data = _readPlist(p)
        if 'statistics' in data:
            self.stats.append(json.loads(data.pop('statistics')))

        # We want to retrieve the clang version even if there are no
        # reports. Assume that all reports were created using the same
        # clang version (this is always true and is more efficient).
        if 'clang_version' in data:
            version = data.pop('clang_version')
            if self.clang_version is None:
                self.clang_version = version

        # Ignore/delete empty reports.
        if not data['files']:
            if deleteEmpty:
                os.remove(p)
            return

        rawDiagnostics = data.pop('diagnostics')

        # Extract the HTML reports, if they exist. The emptiness check avoids
        # an IndexError for a report that lists files but no diagnostics.
        if rawDiagnostics and 'HTMLDiagnostics_files' in rawDiagnostics[0]:
            htmlFiles = []
            for d in rawDiagnostics:
                # FIXME: Why is this named files, when does it have multiple
                # files?
                assert len(d['HTMLDiagnostics_files']) == 1
                htmlFiles.append(d.pop('HTMLDiagnostics_files')[0])
        else:
            htmlFiles = [None] * len(rawDiagnostics)

        report = AnalysisReport(self, data.pop('files'))
        diagnostics = [AnalysisDiagnostic(d, report, h)
                       for d, h in zip(rawDiagnostics, htmlFiles)]

        # Every known top-level key has been consumed by now.
        assert not data

        report.diagnostics.extend(diagnostics)
        self.reports.append(report)
        self.diagnostics.extend(diagnostics)


def loadResults(path, opts, root="", deleteEmpty=True):
    """
    Backwards compatibility API.
    """
    return loadResultsFromSingleRun(SingleRunInfo(path, root, opts.verboseLog),
                                    deleteEmpty)


def loadResultsFromSingleRun(info, deleteEmpty=True):
    """
    Load results of the analyses from a given output folder (or single file).

    - info is the SingleRunInfo object
    - deleteEmpty specifies if the empty plist files should be deleted

    Returns the populated AnalysisRun.
    """
    path = info.path
    run = AnalysisRun(info)

    if os.path.isfile(path):
        run.readSingleFile(path, deleteEmpty)
    else:
        for (dirpath, dirnames, filenames) in os.walk(path):
            for f in filenames:
                if not f.endswith('plist'):
                    continue
                run.readSingleFile(os.path.join(dirpath, f), deleteEmpty)

    return run


def cmpAnalysisDiagnostic(d):
    """Sort key for diagnostics: their issue identifier."""
    return d.getIssueIdentifier()


def _pathLengthSample(lenA, lenB, opts):
    """Return the histogram sample for a matched pair, or None if there is none.

    None is returned when no histogram option is enabled, or when the
    requested relative measure is undefined because a path is empty.
    """
    if opts.relative_path_histogram:
        return float(lenA) / lenB if lenB else None
    if opts.relative_log_path_histogram:
        return log(float(lenA) / lenB) if lenA and lenB else None
    if opts.absolute_path_histogram:
        return lenA - lenB
    return None


def compareResults(A, B, opts):
    """
    compareResults - Generate a relation from diagnostics in run A to
    diagnostics in run B.

    The result is the relation as a list of pairs (a, b) where
    each element {a,b} is None or a matching element from the respective run.

    When one of the path histogram options in opts is set, a histogram of the
    path length differences of matched diagnostics is shown with matplotlib.
    """

    res = []

    # Samples of the path length change between matched diagnostics.
    path_difference_data = []

    # Quickly eliminate equal elements.
    neqA = []
    neqB = []
    eltsA = sorted(A.diagnostics, key=cmpAnalysisDiagnostic)
    eltsB = sorted(B.diagnostics, key=cmpAnalysisDiagnostic)
    # Both lists are sorted ascending, so the last elements are the largest
    # remaining identifiers; a larger one cannot have a match on the other
    # side.
    while eltsA and eltsB:
        a = eltsA.pop()
        b = eltsB.pop()
        idA = a.getIssueIdentifier()
        idB = b.getIssueIdentifier()
        if idA == idB:
            if a.getPathLength() != b.getPathLength():
                sample = _pathLengthSample(a.getPathLength(),
                                           b.getPathLength(), opts)
                if sample is not None:
                    path_difference_data.append(sample)
            res.append((a, b))
        elif idA > idB:
            eltsB.append(b)
            neqA.append(a)
        else:
            eltsA.append(a)
            neqB.append(b)
    neqA.extend(eltsA)
    neqB.extend(eltsB)

    # FIXME: Add fuzzy matching. One simple and possible effective idea would
    # be to bin the diagnostics, print them in a normalized form (based solely
    # on the structure of the diagnostic), compute the diff, then use that as
    # the basis for matching. This has the nice property that we don't depend
    # in any way on the diagnostic format.

    res.extend((a, None) for a in neqA)
    res.extend((None, b) for b in neqB)

    if opts.relative_log_path_histogram or opts.relative_path_histogram or \
            opts.absolute_path_histogram:
        # Imported lazily so matplotlib is only required for histograms.
        from matplotlib import pyplot
        pyplot.hist(path_difference_data, bins=100)
        pyplot.show()

    return res


def computePercentile(l, percentile):
    """
    Return computed percentile.

    l is a non-empty sequence of comparable values and percentile a fraction
    between 0 and 1. Raises IndexError when l is empty.
    """
    ordered = sorted(l)
    rank = int(round(percentile * len(ordered) + 0.5))
    # Clamp the rank to 1..len: rounding could otherwise produce 0 (which
    # would wrap around to the maximum) or len + 1 (out of range).
    rank = min(max(rank, 1), len(ordered))
    return ordered[rank - 1]


def deriveStats(results):
    """Return summary statistics for every metric collected in a run.

    The result maps each metric name to a dict with the keys "max", "min",
    "mean", "90th %tile", "95th %tile", "median" and "total".
    """
    # Assume all keys are the same in each statistics bucket.
    combined_data = defaultdict(list)

    # Collect data on paths length.
    for report in results.reports:
        for diagnostic in report.diagnostics:
            combined_data['PathsLength'].append(diagnostic.getPathLength())

    for stat in results.stats:
        for key, value in stat.items():
            combined_data[key].append(value)

    combined_stats = {}
    for key, values in combined_data.items():
        combined_stats[str(key)] = {
            "max": max(values),
            "min": min(values),
            "mean": sum(values) / len(values),
            "90th %tile": computePercentile(values, 0.9),
            "95th %tile": computePercentile(values, 0.95),
            "median": sorted(values)[len(values) // 2],
            "total": sum(values)
        }
    return combined_stats


def compareStats(resultsA, resultsB):
    """Print, for every metric of run A, how its statistics changed in run B."""
    statsA = deriveStats(resultsA)
    statsB = deriveStats(resultsB)
    # Only apply highlighting when writing to TTY and it's not Windows
    useColor = sys.stdout.isatty() and os.name != 'nt'
    for key in sorted(statsA):
        print(key)
        if key not in statsB:
            # E.g. 'PathsLength' is absent when run B has no diagnostics.
            print("\t (not present in the second run)")
            continue
        for kkey in statsA[key]:
            valA = float(statsA[key][kkey])
            valB = float(statsB[key][kkey])
            report = "%.3f -> %.3f" % (valA, valB)
            if useColor and valB != 0:
                ratio = (valB - valA) / valB
                if ratio < -0.2:
                    report = Colors.GREEN + report + Colors.CLEAR
                elif ratio > 0.2:
                    report = Colors.RED + report + Colors.CLEAR
            print("\t %s %s" % (kkey, report))


def dumpScanBuildResultsDiff(dirA, dirB, opts, deleteEmpty=True,
                             Stdout=sys.stdout):
    """Compare the runs stored in dirA and dirB and write the differences.

    Added and removed diagnostics plus totals are written to Stdout, and
    also to the file named by opts.verboseLog when that is set.

    Returns (number of differences, diagnostics in A, diagnostics in B), or
    None when opts.stats_only is set.
    """
    # Load the run results.
    resultsA = loadResults(dirA, opts, opts.rootA, deleteEmpty)
    resultsB = loadResults(dirB, opts, opts.rootB, deleteEmpty)
    if opts.show_stats:
        compareStats(resultsA, resultsB)
    if opts.stats_only:
        return

    # Open the verbose log, if given. Text mode is required because the
    # records written below are str, not bytes.
    if opts.verboseLog:
        auxLog = open(opts.verboseLog, "w")
    else:
        auxLog = None

    try:
        diff = compareResults(resultsA, resultsB, opts)
        foundDiffs = 0
        totalAdded = 0
        totalRemoved = 0
        for a, b in diff:
            if a is None:
                Stdout.write("ADDED: %r\n" % b.getReadableName())
                foundDiffs += 1
                totalAdded += 1
                if auxLog:
                    auxLog.write("('ADDED', %r, %r)\n" % (b.getReadableName(),
                                                          b.getReport()))
            elif b is None:
                Stdout.write("REMOVED: %r\n" % a.getReadableName())
                foundDiffs += 1
                totalRemoved += 1
                if auxLog:
                    auxLog.write("('REMOVED', %r, %r)\n" % (
                        a.getReadableName(), a.getReport()))
            # Diagnostics present in both runs are not reported.

        TotalReports = len(resultsB.diagnostics)
        Stdout.write("TOTAL REPORTS: %r\n" % TotalReports)
        Stdout.write("TOTAL ADDED: %r\n" % totalAdded)
        Stdout.write("TOTAL REMOVED: %r\n" % totalRemoved)
        if auxLog:
            auxLog.write("('TOTAL NEW REPORTS', %r)\n" % TotalReports)
            auxLog.write("('TOTAL DIFFERENCES', %r)\n" % foundDiffs)
    finally:
        # Close the log even when comparing or writing fails.
        if auxLog:
            auxLog.close()

    return foundDiffs, len(resultsA.diagnostics), len(resultsB.diagnostics)


def generate_option_parser():
    """Build the command line option parser of the tool."""
    parser = OptionParser("usage: %prog [options] [dir A] [dir B]")
    parser.add_option("", "--rootA", dest="rootA",
                      help="Prefix to ignore on source files for directory A",
                      action="store", type=str, default="")
    parser.add_option("", "--rootB", dest="rootB",
                      help="Prefix to ignore on source files for directory B",
                      action="store", type=str, default="")
    parser.add_option("", "--verbose-log", dest="verboseLog",
                      help="Write additional information to LOG "
                           "[default=None]",
                      action="store", type=str, default=None,
                      metavar="LOG")
    parser.add_option("--relative-path-differences-histogram",
                      action="store_true", dest="relative_path_histogram",
                      default=False,
                      help="Show histogram of relative paths differences. "
                           "Requires matplotlib")
    parser.add_option("--relative-log-path-differences-histogram",
                      action="store_true", dest="relative_log_path_histogram",
                      default=False,
                      help="Show histogram of log relative paths differences. "
                           "Requires matplotlib")
    parser.add_option("--absolute-path-differences-histogram",
                      action="store_true", dest="absolute_path_histogram",
                      default=False,
                      help="Show histogram of absolute paths differences. "
                           "Requires matplotlib")
    parser.add_option("--stats-only", action="store_true", dest="stats_only",
                      default=False, help="Only show statistics on reports")
    parser.add_option("--show-stats", action="store_true", dest="show_stats",
                      default=False, help="Show change in statistics")
    return parser


def main():
    """Parse the command line and print the diff of the two given runs."""
    parser = generate_option_parser()
    (opts, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("invalid number of arguments")

    dirA, dirB = args

    dumpScanBuildResultsDiff(dirA, dirB, opts)


if __name__ == '__main__':
    main()