#!/usr/bin/env python

"""
CmpRuns - A simple tool for comparing two static analyzer runs to determine
which reports have been added, removed, or changed.

This is designed to support automated testing using the static analyzer, from
two perspectives:
  1. To monitor changes in the static analyzer's reports on real code bases,
     for regression testing.

  2. For use by end users who want to integrate regular static analyzer testing
     into a buildbot like environment.

Usage:

    # Load the results of both runs, to obtain AnalysisRun objects holding
    # the corresponding AnalysisDiagnostic objects.
    #
    resultsA = load_results_from_single_run(singleRunInfoA, delete_empty)
    resultsB = load_results_from_single_run(singleRunInfoB, delete_empty)

    # Relate the diagnostics of run A to the diagnostics of run B. The
    # returned ComparisonResult lists common, added, removed and changed
    # diagnostics.
    diff = compare_results(resultsA, resultsB)

"""
import json
import os
import plistlib
import re
import sys

from math import log
from collections import defaultdict
from copy import copy
from enum import Enum
from typing import (Any, DefaultDict, Dict, List, NamedTuple, Optional,
                    Sequence, Set, TextIO, TypeVar, Tuple, Union)


Number = Union[int, float]
Stats = Dict[str, Dict[str, Number]]
Plist = Dict[str, Any]
JSON = Dict[str, Any]
# Diff in a form: field -> (before, after)
JSONDiff = Dict[str, Tuple[str, str]]
# Type for generics
T = TypeVar('T')

STATS_REGEXP = re.compile(r"Statistics: (\{.+\})", re.MULTILINE | re.DOTALL)


class Colors:
    """
    Color for terminal highlight.
    """
    RED = '\x1b[2;30;41m'
    GREEN = '\x1b[6;30;42m'
    CLEAR = '\x1b[0m'


class HistogramType(str, Enum):
    """
    Kinds of path-length histograms compare_results can plot.
    """
    RELATIVE = "relative"
    LOG_RELATIVE = "log-relative"
    ABSOLUTE = "absolute"


class ResultsDirectory(NamedTuple):
    """
    Location of analysis results plus the source root to strip from the
    file names mentioned in those results.
    """
    path: str
    root: str = ""


class SingleRunInfo:
    """
    Information about analysis run:
    path - the analysis output directory
    root - the name of the root directory, which will be disregarded when
      determining the source file name
    """
    def __init__(self, results: ResultsDirectory,
                 verbose_log: Optional[str] = None):
        self.path = results.path
        # Trailing separators are dropped so that stripping the root from
        # a file name only has to deal with a single separator.
        self.root = results.root.rstrip("/\\")
        self.verbose_log = verbose_log


class AnalysisDiagnostic:
    """
    A single diagnostic read from an analyzer plist report.

    Two diagnostics compare equal (and hash equally) when their issue
    identifiers, as produced by get_issue_identifier, are the same.
    """
    def __init__(self, data: Plist, report: "AnalysisReport",
                 html_report: Optional[str]):
        self._data = data
        self._loc = self._data['location']
        self._report = report
        self._html_report = html_report
        self._report_size = len(self._data['path'])

    def _strip_root(self, file_name: str) -> str:
        """
        Return file_name relative to the run's root directory.

        The root is removed only when it is non-empty and is followed by a
        path separator, so a root of "/src" does not mangle "/srcs/b.c".
        Any other name is returned unchanged.
        """
        root = self._report.run.root
        if root and file_name.startswith(root):
            remainder = file_name[len(root):]
            if remainder[:1] in ("/", "\\"):
                return remainder[1:]

        return file_name

    def get_file_name(self) -> str:
        """Return the file of the diagnostic location, root stripped."""
        return self._strip_root(self._report.files[self._loc['file']])

    def get_root_file_name(self) -> str:
        """
        Return the file in which the diagnostic path starts, root stripped.

        Falls back to get_file_name when the diagnostic has an empty path.
        """
        path = self._data['path']

        if not path:
            return self.get_file_name()

        first = path[0]
        if 'location' in first:
            file_index = first['location']['file']
        else:  # control edge
            file_index = first['edges'][0]['start'][0]['file']

        # Strip the root exactly like get_file_name does, otherwise the two
        # names never compare equal in get_readable_name.
        return self._strip_root(self._report.files[file_index])

    def get_line(self) -> int:
        return self._loc['line']

    def get_column(self) -> int:
        return self._loc['col']

    def get_path_length(self) -> int:
        return self._report_size

    def get_category(self) -> str:
        return self._data['category']

    def get_description(self) -> str:
        return self._data['description']

    def get_location(self) -> str:
        """Return the location formatted as "file:line:column"."""
        return f"{self.get_file_name()}:{self.get_line()}:{self.get_column()}"

    def get_issue_identifier(self) -> str:
        """
        Build the identifier used to match diagnostics between runs.

        It is the file name followed by "+", then the issue context and
        another "+" when a context is present, then the content hash of
        the line when that is present.
        """
        identifier = self.get_file_name() + "+"

        if "issue_context" in self._data:
            identifier += self._data["issue_context"] + "+"

        if "issue_hash_content_of_line_in_context" in self._data:
            identifier += str(
                self._data["issue_hash_content_of_line_in_context"])

        return identifier

    def get_html_report(self) -> str:
        """
        Return the HTML report path inside the run directory, or a single
        space when the diagnostic has no HTML report.
        """
        if self._html_report is None:
            return " "

        return os.path.join(self._report.run.path, self._html_report)

    def get_readable_name(self) -> str:
        """
        Return a one-line human readable description of the diagnostic.

        The file where the path starts is shown in brackets only when it
        differs from the file of the diagnostic location.
        """
        if "issue_context" in self._data:
            funcname_postfix = "#" + self._data["issue_context"]
        else:
            funcname_postfix = ""

        root_filename = self.get_root_file_name()
        file_name = self.get_file_name()

        if root_filename != file_name:
            file_prefix = f"[{root_filename}] {file_name}"
        else:
            file_prefix = root_filename

        line = self.get_line()
        col = self.get_column()
        return f"{file_prefix}{funcname_postfix}:{line}:{col}" \
            f", {self.get_category()}: {self.get_description()}"

    KEY_FIELDS = ["check_name", "category", "description"]

    def is_similar_to(self, other: "AnalysisDiagnostic") -> bool:
        # We consider two diagnostics similar only if at least one
        # of the key fields is the same in both diagnostics.
        return len(self.get_diffs(other)) != len(self.KEY_FIELDS)

    def get_diffs(self, other: "AnalysisDiagnostic") -> JSONDiff:
        """Return the key fields that differ as field -> (self, other)."""
        return {field: (self._data[field], other._data[field])
                for field in self.KEY_FIELDS
                if self._data[field] != other._data[field]}

    # Note, the data format is not an API and may change from one analyzer
    # version to another.
    def get_raw_data(self) -> Plist:
        return self._data

    def __eq__(self, other: object) -> bool:
        # Compare the identifiers themselves rather than their hashes:
        # hash values can collide and hash() raises for unhashable operands.
        if not isinstance(other, AnalysisDiagnostic):
            return NotImplemented
        return self.get_issue_identifier() == other.get_issue_identifier()

    def __ne__(self, other: object) -> bool:
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return equal
        return not equal

    def __hash__(self) -> int:
        return hash(self.get_issue_identifier())


class AnalysisRun:
    """
    All reports and diagnostics loaded for one analyzer run.
    """
    def __init__(self, info: SingleRunInfo):
        self.path = info.path
        self.root = info.root
        self.info = info
        self.reports: List[AnalysisReport] = []
        # Cumulative list of all diagnostics from all the reports.
        self.diagnostics: List[AnalysisDiagnostic] = []
        self.clang_version: Optional[str] = None
        self.raw_stats: List[JSON] = []

    def get_clang_version(self) -> Optional[str]:
        return self.clang_version

    def read_single_file(self, path: str, delete_empty: bool):
        """
        Load one plist report and append its contents to this run.

        :param path: path to the plist file.
        :param delete_empty: remove the file from disk when it lists no
                             source files.
        """
        with open(path, "rb") as plist_file:
            data = plistlib.load(plist_file)

        if 'statistics' in data:
            self.raw_stats.append(json.loads(data.pop('statistics')))

        # We want to retrieve the clang version even if there are no
        # reports. Assume that all reports were created using the same
        # clang version (this is always true and is more efficient).
        if 'clang_version' in data:
            version = data.pop('clang_version')
            if self.clang_version is None:
                self.clang_version = version

        # Ignore/delete empty reports.
        if not data['files']:
            if delete_empty:
                os.remove(path)
            return

        # Extract the HTML reports, if they exist. Diagnostics without one
        # get None so that the list stays parallel to the diagnostics.
        html_files: List[Optional[str]] = []
        for diagnostic in data['diagnostics']:
            if 'HTMLDiagnostics_files' in diagnostic:
                # FIXME: Why is this named files, when does it have multiple
                # files?
                assert len(diagnostic['HTMLDiagnostics_files']) == 1
                html_files.append(diagnostic.pop('HTMLDiagnostics_files')[0])
            else:
                html_files.append(None)

        report = AnalysisReport(self, data.pop('files'))
        # Python 3.10 offers zip(..., strict=True). The following assertion
        # mimics it.
        assert len(data['diagnostics']) == len(html_files)
        diagnostics = [AnalysisDiagnostic(d, report, h)
                       for d, h in zip(data.pop('diagnostics'), html_files)]

        # Every known top-level key has been consumed by now.
        assert not data

        report.diagnostics.extend(diagnostics)
        self.reports.append(report)
        self.diagnostics.extend(diagnostics)


class AnalysisReport:
    """
    One plist report: the files it mentions and its diagnostics.
    """
    def __init__(self, run: AnalysisRun, files: List[str]):
        self.run = run
        self.files = files
        self.diagnostics: List[AnalysisDiagnostic] = []


def load_results(results: ResultsDirectory, delete_empty: bool = True,
                 verbose_log: Optional[str] = None) -> AnalysisRun:
    """
    Backwards compatibility API.
    """
    return load_results_from_single_run(SingleRunInfo(results,
                                                      verbose_log),
                                        delete_empty)


def load_results_from_single_run(info: SingleRunInfo,
                                 delete_empty: bool = True) -> AnalysisRun:
    """
    Load results of the analyses from a given output folder.

    :param info: the SingleRunInfo object; its path may be a single plist
                 file or a directory that is searched recursively.
    :param delete_empty: specifies if the empty plist files should be
                         deleted.
    """
    path = info.path
    run = AnalysisRun(info)

    if os.path.isfile(path):
        run.read_single_file(path, delete_empty)
        return run

    for dirpath, dirnames, filenames in os.walk(path):
        # Visit directories and files in sorted order so that the order of
        # the loaded diagnostics does not depend on the file system.
        dirnames.sort()
        for file_name in sorted(filenames):
            if file_name.endswith('.plist'):
                run.read_single_file(os.path.join(dirpath, file_name),
                                     delete_empty)

    return run


def cmp_analysis_diagnostic(d: AnalysisDiagnostic) -> str:
    """Return the key by which diagnostics are ordered and matched."""
    return d.get_issue_identifier()


AnalysisDiagnosticPair = Tuple[AnalysisDiagnostic, AnalysisDiagnostic]


class ComparisonResult:
    """
    Outcome of compare_results: diagnostics split into those present in
    both runs, only in the old run, only in the new run, and changed.
    """
    def __init__(self) -> None:
        self.present_in_both: List[AnalysisDiagnostic] = []
        self.present_only_in_old: List[AnalysisDiagnostic] = []
        self.present_only_in_new: List[AnalysisDiagnostic] = []
        self.changed_between_new_and_old: List[AnalysisDiagnosticPair] = []

    def add_common(self, issue: AnalysisDiagnostic) -> None:
        self.present_in_both.append(issue)

    def add_removed(self, issue: AnalysisDiagnostic) -> None:
        self.present_only_in_old.append(issue)

    def add_added(self, issue: AnalysisDiagnostic) -> None:
        self.present_only_in_new.append(issue)

    def add_changed(self, old_issue: AnalysisDiagnostic,
                    new_issue: AnalysisDiagnostic) -> None:
        self.changed_between_new_and_old.append((old_issue, new_issue))


GroupedDiagnostics = DefaultDict[str, List[AnalysisDiagnostic]]


def get_grouped_diagnostics(diagnostics: List[AnalysisDiagnostic]
                            ) -> GroupedDiagnostics:
    """Group diagnostics by their "file:line:column" location."""
    result: GroupedDiagnostics = defaultdict(list)
    for diagnostic in diagnostics:
        result[diagnostic.get_location()].append(diagnostic)
    return result


def _path_length_difference(histogram: Optional[HistogramType],
                            old_length: int,
                            new_length: int) -> Optional[float]:
    """
    Return the histogram sample for one matched pair of diagnostics.

    None means there is nothing to record: the lengths are equal, the
    histogram type is not one of the known ones, or the requested ratio
    is undefined because one of the lengths is zero.
    """
    if old_length == new_length:
        return None

    if histogram == HistogramType.RELATIVE:
        if new_length == 0:
            return None
        return float(old_length) / new_length

    if histogram == HistogramType.LOG_RELATIVE:
        if old_length == 0 or new_length == 0:
            return None
        return log(float(old_length) / new_length)

    if histogram == HistogramType.ABSOLUTE:
        return old_length - new_length

    return None


def compare_results(results_old: AnalysisRun, results_new: AnalysisRun,
                    histogram: Optional[HistogramType] = None
                    ) -> ComparisonResult:
    """
    compare_results - Generate a relation from diagnostics in the old run
    to diagnostics in the new run.

    Diagnostics are first grouped by location. Within a location shared by
    both runs, diagnostics with the same issue identifier are reported as
    common, the remaining ones that are similar (see
    AnalysisDiagnostic.is_similar_to) as changed, and whatever is left as
    removed or added. Diagnostics at locations seen in only one run are
    removed or added respectively.

    :param histogram: when given, the differences in path length between
                      matched diagnostics are plotted with matplotlib.
    :return: the ComparisonResult describing the relation.
    """

    res = ComparisonResult()

    # Path length differences of matched diagnostics, for the histogram.
    path_difference_data: List[float] = []

    diags_old = get_grouped_diagnostics(results_old.diagnostics)
    diags_new = get_grouped_diagnostics(results_new.diagnostics)

    locations_old = set(diags_old)
    locations_new = set(diags_new)

    common_locations = locations_old & locations_new

    # Locations are visited in sorted order so that the order of the
    # results does not depend on string hash randomization.
    for location in sorted(common_locations):
        old = diags_old[location]
        new = diags_new[location]

        # Quadratic algorithms in this part are fine because 'old' and 'new'
        # are most commonly of size 1.
        common: Set[AnalysisDiagnostic] = set()
        for a in old:
            for b in new:
                if a.get_issue_identifier() != b.get_issue_identifier():
                    continue

                difference = _path_length_difference(
                    histogram, a.get_path_length(), b.get_path_length())
                if difference is not None:
                    path_difference_data.append(difference)

                res.add_common(b)
                common.add(a)

        old = filter_issues(old, common)
        new = filter_issues(new, common)
        common = set()

        for a in old:
            for b in new:
                if a.is_similar_to(b):
                    res.add_changed(a, b)
                    common.add(a)
                    common.add(b)

        old = filter_issues(old, common)
        new = filter_issues(new, common)

        # Whatever is left in 'old' doesn't have a corresponding diagnostic
        # in 'new', so we need to mark it as 'removed'.
        for a in old:
            res.add_removed(a)

        # Whatever is left in 'new' doesn't have a corresponding diagnostic
        # in 'old', so we need to mark it as 'added'.
        for b in new:
            res.add_added(b)

    # These locations have been found only in the old build, so we
    # need to mark all of them as 'removed'.
    for location in sorted(locations_old - common_locations):
        for a in diags_old[location]:
            res.add_removed(a)

    # These locations have been found only in the new build, so we
    # need to mark all of them as 'added'.
    for location in sorted(locations_new - common_locations):
        for b in diags_new[location]:
            res.add_added(b)

    # FIXME: Add fuzzy matching. One simple and possible effective idea would
    # be to bin the diagnostics, print them in a normalized form (based solely
    # on the structure of the diagnostic), compute the diff, then use that as
    # the basis for matching. This has the nice property that we don't depend
    # in any way on the diagnostic format.

    if histogram:
        # Imported lazily: matplotlib is needed only for plotting.
        from matplotlib import pyplot
        pyplot.hist(path_difference_data, bins=100)
        pyplot.show()

    return res


def filter_issues(origin: List[AnalysisDiagnostic],
                  to_remove: Set[AnalysisDiagnostic]) \
        -> List[AnalysisDiagnostic]:
    """Return the diagnostics of origin that are not in to_remove."""
    return [diag for diag in origin if diag not in to_remove]


def compute_percentile(values: Sequence[T], percentile: float) -> T:
    """
    Return computed percentile.

    :param values: non-empty sequence of comparable values.
    :param percentile: fraction between 0 and 1.
    :raises IndexError: if values is empty.
    """
    ordered = sorted(values)
    rank = int(round(percentile * len(ordered) + 0.5)) - 1
    # Clamp the rank: without this a percentile of 0 yields index -1 (the
    # maximum) and a percentile of 1 can index past the end.
    index = max(0, min(rank, len(ordered) - 1))
    return ordered[index]


def derive_stats(results: AnalysisRun) -> Stats:
    """
    Aggregate the path lengths and raw statistics of a run.

    :return: statistic name -> summary values (max, min, mean, 90th and
             95th percentile, median, total).
    """
    # Assume all keys are the same in each statistics bucket.
    combined_data = defaultdict(list)

    # Collect data on paths length.
    for report in results.reports:
        for diagnostic in report.diagnostics:
            combined_data['PathsLength'].append(diagnostic.get_path_length())

    for stat in results.raw_stats:
        for key, value in stat.items():
            combined_data[str(key)].append(value)

    combined_stats: Stats = {}

    for key, values in combined_data.items():
        # Sort and sum once per key instead of once per derived value.
        ordered = sorted(values)
        total = sum(values)
        combined_stats[key] = {
            "max": max(values),
            "min": min(values),
            "mean": total / len(values),
            "90th %tile": compute_percentile(ordered, 0.9),
            "95th %tile": compute_percentile(ordered, 0.95),
            "median": ordered[len(ordered) // 2],
            "total": total
        }

    return combined_stats


# TODO: compare_results decouples comparison from the output, we should
#       do it here as well
def compare_stats(results_old: AnalysisRun, results_new: AnalysisRun,
                  out: TextIO = sys.stdout):
    """
    Write a comparison of the statistics of two runs to out.

    Statistics present in both runs are listed value by value; the names
    of statistics present in only one run are listed afterwards.
    """
    stats_old = derive_stats(results_old)
    stats_new = derive_stats(results_new)

    old_keys = set(stats_old.keys())
    new_keys = set(stats_new.keys())

    # Only apply highlighting when writing to TTY and it's not Windows
    use_colors = out.isatty() and os.name != 'nt'

    for key in sorted(old_keys & new_keys):
        out.write(f"{key}\n")

        nested_keys = sorted(set(stats_old[key]) & set(stats_new[key]))

        for nested_key in nested_keys:
            val_old = float(stats_old[key][nested_key])
            val_new = float(stats_new[key][nested_key])

            report = f"{val_old:.3f} -> {val_new:.3f}"

            if use_colors and val_new != 0:
                # Highlight changes of more than 20% of the new value.
                ratio = (val_new - val_old) / val_new
                if ratio < -0.2:
                    report = Colors.GREEN + report + Colors.CLEAR
                elif ratio > 0.2:
                    report = Colors.RED + report + Colors.CLEAR

            out.write(f"\t {nested_key} {report}\n")

    removed_keys = old_keys - new_keys
    if removed_keys:
        out.write(f"REMOVED statistics: {removed_keys}\n")

    added_keys = new_keys - old_keys
    if added_keys:
        out.write(f"ADDED statistics: {added_keys}\n")

    out.write("\n")


def dump_scan_build_results_diff(dir_old: ResultsDirectory,
                                 dir_new: ResultsDirectory,
                                 delete_empty: bool = True,
                                 out: TextIO = sys.stdout,
                                 show_stats: bool = False,
                                 stats_only: bool = False,
                                 histogram: Optional[HistogramType] = None,
                                 verbose_log: Optional[str] = None):
    """
    Compare directories with analysis results and dump results.

    :param delete_empty: delete empty plist files
    :param out: buffer to dump comparison results to.
    :param show_stats: compare execution stats as well.
    :param stats_only: compare ONLY execution stats.
    :param histogram: optional histogram type to plot path differences.
    :param verbose_log: optional path to an additional log file.
    :return: None when stats_only is set, otherwise a tuple (number of
             differences found, number of diagnostics in the old run,
             number of diagnostics in the new run).
    """
    results_old = load_results(dir_old, delete_empty, verbose_log)
    results_new = load_results(dir_new, delete_empty, verbose_log)

    if show_stats or stats_only:
        # The stats belong to the same report, so they go to 'out' as well.
        compare_stats(results_old, results_new, out)
    if stats_only:
        return

    # Open the verbose log, if given.
    aux_log: Optional[TextIO] = open(verbose_log, "w") if verbose_log \
        else None

    # The log is closed in 'finally' so that a failure while comparing or
    # writing does not leak the file handle.
    try:
        diff = compare_results(results_old, results_new, histogram)
        found_diffs = 0
        total_added = 0
        total_removed = 0
        total_modified = 0

        for new in diff.present_only_in_new:
            out.write(f"ADDED: {new.get_readable_name()}\n\n")
            found_diffs += 1
            total_added += 1
            if aux_log:
                aux_log.write(f"('ADDED', {new.get_readable_name()}, "
                              f"{new.get_html_report()})\n")

        for old in diff.present_only_in_old:
            out.write(f"REMOVED: {old.get_readable_name()}\n\n")
            found_diffs += 1
            total_removed += 1
            if aux_log:
                aux_log.write(f"('REMOVED', {old.get_readable_name()}, "
                              f"{old.get_html_report()})\n")

        for old, new in diff.changed_between_new_and_old:
            out.write(f"MODIFIED: {old.get_readable_name()}\n")
            found_diffs += 1
            total_modified += 1
            diffs = old.get_diffs(new)
            str_diffs = [f" '{key}' changed: "
                         f"'{old_value}' -> '{new_value}'"
                         for key, (old_value, new_value) in diffs.items()]
            out.write(",\n".join(str_diffs) + "\n\n")
            if aux_log:
                aux_log.write(f"('MODIFIED', {old.get_readable_name()}, "
                              f"{old.get_html_report()})\n")

        total_reports = len(results_new.diagnostics)
        out.write(f"TOTAL REPORTS: {total_reports}\n")
        out.write(f"TOTAL ADDED: {total_added}\n")
        out.write(f"TOTAL REMOVED: {total_removed}\n")
        out.write(f"TOTAL MODIFIED: {total_modified}\n")

        if aux_log:
            aux_log.write(f"('TOTAL NEW REPORTS', {total_reports})\n")
            aux_log.write(f"('TOTAL DIFFERENCES', {found_diffs})\n")
    finally:
        if aux_log:
            aux_log.close()

    # TODO: change to NamedTuple
    return found_diffs, len(results_old.diagnostics), \
        len(results_new.diagnostics)


if __name__ == "__main__":
    print("CmpRuns.py should not be used on its own.")
    print("Please use 'SATest.py compare' instead")
    sys.exit(1)