utils/analyzer/CmpRuns.py

#!/usr/bin/env python

"""
CmpRuns - A simple tool for comparing two static analyzer runs to determine
which reports have been added, removed, or changed.

This is designed to support automated testing using the static analyzer, from
two perspectives:
  1. To monitor changes in the static analyzer's reports on real code bases,
     for regression testing.

  2. For use by end users who want to integrate regular static analyzer testing
     into a buildbot-like environment.

Usage:

    # Load the results of both runs, to obtain AnalysisRun objects holding
    # the lists of the corresponding AnalysisDiagnostic objects.
    #
    resultsA = load_results_from_single_run(singleRunInfoA, delete_empty)
    resultsB = load_results_from_single_run(singleRunInfoB, delete_empty)

    # Compare the diagnostics of run A with the diagnostics of run B to
    # obtain a ComparisonResult (reports present in both runs, only in the
    # old run, only in the new run, and changed between the runs).
    diff = compare_results(resultsA, resultsB)

"""
e5dd7070Spatrickimport json
e5dd7070Spatrickimport os
e5dd7070Spatrickimport plistlib
e5dd7070Spatrickimport re
e5dd7070Spatrickimport sys
e5dd7070Spatrick
ec727ea7Spatrickfrom math import log
ec727ea7Spatrickfrom collections import defaultdict
ec727ea7Spatrickfrom copy import copy
ec727ea7Spatrickfrom enum import Enum
a9ac8606Spatrickfrom typing import (Any, DefaultDict, Dict, List, NamedTuple, Optional,
a9ac8606Spatrick                    Sequence, Set, TextIO, TypeVar, Tuple, Union)
ec727ea7Spatrick
ec727ea7Spatrick
# A numeric value stored in a statistics bucket.
Number = Union[int, float]
# Statistic name -> {aggregate name ("max", "mean", ...) -> value}.
Stats = Dict[str, Dict[str, Number]]
# A dictionary loaded from a plist file.
Plist = Dict[str, Any]
# A dictionary loaded from a JSON document.
JSON = Dict[str, Any]
# Diff in a form: field -> (before, after)
JSONDiff = Dict[str, Tuple[str, str]]
# Type for generics
T = TypeVar('T')

# Captures the brace-delimited object that follows a "Statistics: " marker;
# DOTALL lets the object span several lines.
STATS_REGEXP = re.compile(r"Statistics: (\{.+\})", re.MULTILINE | re.DOTALL)
e5dd7070Spatrick
ec727ea7Spatrick
class Colors:
    """
    ANSI escape sequences used for terminal highlighting.
    """
    # Black text on a red background.
    RED = '\x1b[2;30;41m'
    # Black text on a green background.
    GREEN = '\x1b[6;30;42m'
    # Resets all previously applied attributes.
    CLEAR = '\x1b[0m'
e5dd7070Spatrick
ec727ea7Spatrick
class HistogramType(str, Enum):
    """
    Kind of path-length histogram that can be requested for a comparison.

    Members are also plain strings, so they compare equal to their values.
    """
    RELATIVE = "relative"
    LOG_RELATIVE = "log-relative"
    ABSOLUTE = "absolute"
e5dd7070Spatrick
e5dd7070Spatrick
class ResultsDirectory(NamedTuple):
    """
    Location of the results of one analysis run:
    path - the analysis output directory
    root - the name of the root directory, which will be disregarded when
    determining the source file name (empty by default)
    """
    path: str
    root: str = ""
ec727ea7Spatrick
ec727ea7Spatrick
class SingleRunInfo:
    """
    Information about analysis run:
    path - the analysis output directory
    root - the name of the root directory, which will be disregarded when
    determining the source file name
    verbose_log - optional path to an additional log file
    """
    def __init__(self, results: "ResultsDirectory",
                 verbose_log: Optional[str] = None):
        self.path = results.path
        # Trailing path separators are dropped so that the root can be
        # used as a plain prefix of source file names.
        self.root = results.root.rstrip("/\\")
        self.verbose_log = verbose_log
ec727ea7Spatrick
ec727ea7Spatrick
class AnalysisDiagnostic:
    """
    A single diagnostic read from an analyzer plist file.

    Wraps the raw plist dictionary of the diagnostic together with the
    report it belongs to and, optionally, the name of its HTML report.
    Two diagnostics are equal when their issue identifiers are equal.
    """

    # Fields inspected by get_diffs() and is_similar_to().
    KEY_FIELDS = ["check_name", "category", "description"]

    def __init__(self, data: "Plist", report: "AnalysisReport",
                 html_report: Optional[str]):
        self._data = data
        self._loc = self._data['location']
        self._report = report
        self._html_report = html_report
        self._report_size = len(self._data['path'])

    def _strip_root(self, file_name: str) -> str:
        """Return file_name without the leading root of the run, if any."""
        root = self._report.run.root

        if root and file_name.startswith(root):
            # '+ 1' also drops the separator that follows the root.
            return file_name[len(root) + 1:]

        return file_name

    def get_file_name(self) -> str:
        """Return the file the diagnostic is located in, root stripped."""
        return self._strip_root(self._report.files[self._loc['file']])

    def get_root_file_name(self) -> str:
        """
        Return the file of the first path piece, root stripped.

        Falls back to get_file_name() when the diagnostic has no path.
        The root is stripped exactly as in get_file_name(), so that both
        names are comparable.
        """
        path = self._data['path']

        if not path:
            return self.get_file_name()

        first_piece = path[0]
        if 'location' in first_piece:
            file_index = first_piece['location']['file']
        else:  # control edge
            file_index = first_piece['edges'][0]['start'][0]['file']

        return self._strip_root(self._report.files[file_index])

    def get_line(self) -> int:
        return self._loc['line']

    def get_column(self) -> int:
        return self._loc['col']

    def get_path_length(self) -> int:
        """Return the number of pieces in the diagnostic path."""
        return self._report_size

    def get_category(self) -> str:
        return self._data['category']

    def get_description(self) -> str:
        return self._data['description']

    def get_location(self) -> str:
        """Return the location in the form 'file:line:column'."""
        return f"{self.get_file_name()}:{self.get_line()}:{self.get_column()}"

    def get_issue_identifier(self) -> str:
        """
        Return the identifier used to match diagnostics between runs.

        It is built from the file name, the issue context and the issue
        hash; the latter two are used only if present in the raw data.
        """
        identifier = self.get_file_name() + "+"

        if "issue_context" in self._data:
            identifier += self._data["issue_context"] + "+"

        if "issue_hash_content_of_line_in_context" in self._data:
            identifier += str(
                self._data["issue_hash_content_of_line_in_context"])

        return identifier

    def get_html_report(self) -> str:
        """Return the path of the HTML report, or " " if there is none."""
        if self._html_report is None:
            return " "

        return os.path.join(self._report.run.path, self._html_report)

    def get_readable_name(self) -> str:
        """Return a one-line human-readable summary of the diagnostic."""
        if "issue_context" in self._data:
            funcname_postfix = "#" + self._data["issue_context"]
        else:
            funcname_postfix = ""

        root_filename = self.get_root_file_name()
        file_name = self.get_file_name()

        # Mention the root file only when it differs from the file the
        # diagnostic is located in.
        if root_filename != file_name:
            file_prefix = f"[{root_filename}] {file_name}"
        else:
            file_prefix = root_filename

        line = self.get_line()
        col = self.get_column()
        return f"{file_prefix}{funcname_postfix}:{line}:{col}" \
            f", {self.get_category()}: {self.get_description()}"

    def is_similar_to(self, other: "AnalysisDiagnostic") -> bool:
        # We consider two diagnostics similar only if at least one
        # of the key fields is the same in both diagnostics.
        return len(self.get_diffs(other)) != len(self.KEY_FIELDS)

    def get_diffs(self, other: "AnalysisDiagnostic") -> "JSONDiff":
        """Return the key fields that differ: field -> (self, other)."""
        return {field: (self._data[field], other._data[field])
                for field in self.KEY_FIELDS
                if self._data[field] != other._data[field]}

    # Note, the data format is not an API and may change from one analyzer
    # version to another.
    def get_raw_data(self) -> "Plist":
        return self._data

    def __eq__(self, other: object) -> bool:
        # Compare the identifiers themselves rather than their hashes, so
        # that neither hash collisions nor foreign objects that happen to
        # hash alike are treated as equal diagnostics.
        if not isinstance(other, AnalysisDiagnostic):
            return NotImplemented
        return self.get_issue_identifier() == other.get_issue_identifier()

    def __ne__(self, other: object) -> bool:
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return NotImplemented
        return not equal

    def __hash__(self) -> int:
        return hash(self.get_issue_identifier())
a9ac8606Spatrick
e5dd7070Spatrick
class AnalysisRun:
    """
    Results of a single analysis run, accumulated from its plist files.
    """
    def __init__(self, info: "SingleRunInfo"):
        self.path = info.path
        self.root = info.root
        self.info = info
        self.reports: List[AnalysisReport] = []
        # Cumulative list of all diagnostics from all the reports.
        self.diagnostics: List[AnalysisDiagnostic] = []
        self.clang_version: Optional[str] = None
        self.raw_stats: List[JSON] = []

    def get_clang_version(self) -> Optional[str]:
        return self.clang_version

    def read_single_file(self, path: str, delete_empty: bool) -> None:
        """
        Read one plist file and add its contents to this run.

        :param path: path to the plist file.
        :param delete_empty: remove the file from disk if it lists no files.
        """
        with open(path, "rb") as plist_file:
            data = plistlib.load(plist_file)

        if 'statistics' in data:
            self.raw_stats.append(json.loads(data.pop('statistics')))

        # We want to retrieve the clang version even if there are no
        # reports. Assume that all reports were created using the same
        # clang version (this is always true and is more efficient).
        if 'clang_version' in data:
            clang_version = data.pop('clang_version')
            if self.clang_version is None:
                self.clang_version = clang_version

        # Ignore/delete empty reports.
        if not data['files']:
            if delete_empty:
                os.remove(path)
            return

        report = AnalysisReport(self, data.pop('files'))

        diagnostics = []
        for diagnostic_data in data.pop('diagnostics'):
            # Extract the HTML report, if it exists.
            html_report = None
            if 'HTMLDiagnostics_files' in diagnostic_data:
                # FIXME: Why is this named files, when does it have multiple
                # files?
                html_files = diagnostic_data.pop('HTMLDiagnostics_files')
                assert len(html_files) == 1, \
                    "expected exactly one HTML report per diagnostic"
                html_report = html_files[0]

            diagnostics.append(
                AnalysisDiagnostic(diagnostic_data, report, html_report))

        # Every top-level key must have been consumed by now.
        assert not data, f"unexpected keys in plist: {sorted(data)}"

        report.diagnostics.extend(diagnostics)
        self.reports.append(report)
        self.diagnostics.extend(diagnostics)
e5dd7070Spatrick
e5dd7070Spatrick
class AnalysisReport:
    """
    Contents of a single plist file: the list of source files it refers to
    and the diagnostics read from it, linked to the owning run.
    """
    def __init__(self, run: "AnalysisRun", files: List[str]):
        self.run = run
        # Diagnostics refer to files by their index in this list.
        self.files = files
        self.diagnostics: List[AnalysisDiagnostic] = []
ec727ea7Spatrick
ec727ea7Spatrick
def load_results(results: ResultsDirectory, delete_empty: bool = True,
                 verbose_log: Optional[str] = None) -> AnalysisRun:
    """
    Backwards compatibility API.

    Wraps 'results' and 'verbose_log' into a SingleRunInfo and forwards it,
    together with 'delete_empty', to load_results_from_single_run.
    """
    info = SingleRunInfo(results, verbose_log)
    return load_results_from_single_run(info, delete_empty)
e5dd7070Spatrick
e5dd7070Spatrick
def load_results_from_single_run(info: SingleRunInfo,
                                 delete_empty: bool = True) -> AnalysisRun:
    """
    Load results of the analyses from a given output folder.

    If info.path is a file, only that file is read; otherwise every
    '*.plist' file found under that directory is read.

    :param info: the SingleRunInfo object describing the run.
    :param delete_empty: specifies if the empty plist files should be
                         deleted.
    """
    path = info.path
    run = AnalysisRun(info)

    if os.path.isfile(path):
        run.read_single_file(path, delete_empty)
        return run

    for dirpath, dirnames, filenames in os.walk(path):
        # os.walk yields entries in an unspecified order; sorting (the
        # directories in place, so that the walk itself follows it) makes
        # the order of the loaded diagnostics reproducible.
        dirnames.sort()

        for file_name in sorted(filenames):
            if not file_name.endswith('.plist'):
                continue

            run.read_single_file(os.path.join(dirpath, file_name),
                                 delete_empty)

    return run
e5dd7070Spatrick
e5dd7070Spatrick
def cmp_analysis_diagnostic(d: "AnalysisDiagnostic") -> str:
    """
    Return the issue identifier of the diagnostic 'd'.
    """
    return d.get_issue_identifier()
e5dd7070Spatrick
e5dd7070Spatrick
# A pair of matching diagnostics: (old, new).
AnalysisDiagnosticPair = Tuple["AnalysisDiagnostic", "AnalysisDiagnostic"]


class ComparisonResult:
    """
    Outcome of comparing the diagnostics of an old and a new analysis run.
    """
    def __init__(self) -> None:
        self.present_in_both: List[AnalysisDiagnostic] = []
        self.present_only_in_old: List[AnalysisDiagnostic] = []
        self.present_only_in_new: List[AnalysisDiagnostic] = []
        self.changed_between_new_and_old: List[AnalysisDiagnosticPair] = []

    def add_common(self, issue: "AnalysisDiagnostic") -> None:
        """Record an issue present in both runs."""
        self.present_in_both.append(issue)

    def add_removed(self, issue: "AnalysisDiagnostic") -> None:
        """Record an issue present only in the old run."""
        self.present_only_in_old.append(issue)

    def add_added(self, issue: "AnalysisDiagnostic") -> None:
        """Record an issue present only in the new run."""
        self.present_only_in_new.append(issue)

    def add_changed(self, old_issue: "AnalysisDiagnostic",
                    new_issue: "AnalysisDiagnostic") -> None:
        """Record an issue that changed, as an (old, new) pair."""
        self.changed_between_new_and_old.append((old_issue, new_issue))
a9ac8606Spatrick
a9ac8606Spatrick
# Location ("file:line:column") -> diagnostics reported at that location.
GroupedDiagnostics = DefaultDict[str, List["AnalysisDiagnostic"]]


def get_grouped_diagnostics(diagnostics: List["AnalysisDiagnostic"]
                            ) -> GroupedDiagnostics:
    """
    Group the diagnostics by their location.

    Within a group the diagnostics keep the order of the input list.
    """
    result: GroupedDiagnostics = defaultdict(list)
    for diagnostic in diagnostics:
        result[diagnostic.get_location()].append(diagnostic)
    return result
ec727ea7Spatrick
ec727ea7Spatrick
def _path_length_change(histogram: Optional[HistogramType],
                        old_len: int, new_len: int) -> Optional[float]:
    """
    Return the histogram data point for two differing path lengths, or
    None when no histogram is requested or the value is undefined.
    """
    if histogram == HistogramType.ABSOLUTE:
        return old_len - new_len

    if histogram == HistogramType.RELATIVE:
        # The ratio is undefined for an empty new path.
        return float(old_len) / new_len if new_len else None

    if histogram == HistogramType.LOG_RELATIVE:
        # The logarithm of the ratio is undefined if either path is empty.
        if old_len and new_len:
            return log(float(old_len) / new_len)
        return None

    return None


def compare_results(results_old: AnalysisRun, results_new: AnalysisRun,
                    histogram: Optional[HistogramType] = None
                    ) -> ComparisonResult:
    """
    compare_results - Compare the diagnostics of the old run with the
    diagnostics of the new run.

    Diagnostics are first grouped by location. Within a location present in
    both runs, diagnostics with the same issue identifier are reported as
    common, remaining ones that are similar (see
    AnalysisDiagnostic.is_similar_to) as changed, and the rest as removed
    (old) or added (new). Diagnostics at locations present in only one of
    the runs are reported as removed or added respectively.

    Locations are processed in sorted order, so the order of the entries in
    the result is reproducible.

    :param histogram: if given, a histogram of path length differences of
                      the common diagnostics is shown using matplotlib.
    """

    res = ComparisonResult()

    # Path length differences of the common diagnostics (histogram data).
    path_difference_data: List[float] = []

    diags_old = get_grouped_diagnostics(results_old.diagnostics)
    diags_new = get_grouped_diagnostics(results_new.diagnostics)

    locations_old = set(diags_old)
    locations_new = set(diags_new)

    common_locations = locations_old & locations_new

    for location in sorted(common_locations):
        old = diags_old[location]
        new = diags_new[location]

        # Quadratic algorithms in this part are fine because 'old' and 'new'
        # are most commonly of size 1.
        common: Set[AnalysisDiagnostic] = set()
        for a in old:
            for b in new:
                if a.get_issue_identifier() != b.get_issue_identifier():
                    continue

                a_path_len = a.get_path_length()
                b_path_len = b.get_path_length()

                if a_path_len != b_path_len:
                    change = _path_length_change(histogram,
                                                 a_path_len, b_path_len)
                    if change is not None:
                        path_difference_data.append(change)

                res.add_common(b)
                # Diagnostics compare equal by issue identifier, so adding
                # 'a' filters out the matching 'b' below as well.
                common.add(a)

        old = filter_issues(old, common)
        new = filter_issues(new, common)
        common = set()

        for a in old:
            for b in new:
                if a.is_similar_to(b):
                    res.add_changed(a, b)
                    common.add(a)
                    common.add(b)

        old = filter_issues(old, common)
        new = filter_issues(new, common)

        # Whatever is left in 'old' doesn't have a corresponding diagnostic
        # in 'new', so we need to mark it as 'removed'.
        for a in old:
            res.add_removed(a)

        # Whatever is left in 'new' doesn't have a corresponding diagnostic
        # in 'old', so we need to mark it as 'added'.
        for b in new:
            res.add_added(b)

    # These locations have been found only in the old build, so we
    # need to mark all of them as 'removed'
    for location in sorted(locations_old - common_locations):
        for a in diags_old[location]:
            res.add_removed(a)

    # These locations have been found only in the new build, so we
    # need to mark all of them as 'added'
    for location in sorted(locations_new - common_locations):
        for b in diags_new[location]:
            res.add_added(b)

    # FIXME: Add fuzzy matching. One simple and possible effective idea would
    # be to bin the diagnostics, print them in a normalized form (based solely
    # on the structure of the diagnostic), compute the diff, then use that as
    # the basis for matching. This has the nice property that we don't depend
    # in any way on the diagnostic format.

    if histogram:
        from matplotlib import pyplot
        pyplot.hist(path_difference_data, bins=100)
        pyplot.show()

    return res
e5dd7070Spatrick
ec727ea7Spatrick
def filter_issues(origin: List["AnalysisDiagnostic"],
                  to_remove: Set["AnalysisDiagnostic"]) \
                  -> List["AnalysisDiagnostic"]:
    """
    Return the items of 'origin' that are not contained in 'to_remove',
    in their original order. 'origin' itself is left untouched.
    """
    return [diag for diag in origin if diag not in to_remove]
a9ac8606Spatrick
a9ac8606Spatrick
def compute_percentile(values: "Sequence[T]", percentile: float) -> "T":
    """
    Return computed percentile.

    :param values: non-empty sequence of mutually comparable values.
    :param percentile: fraction in the range [0, 1], e.g. 0.9.
    :raises IndexError: if 'values' is empty.
    """
    ordered = sorted(values)
    index = int(round(percentile * len(ordered) + 0.5)) - 1
    # Keep the index inside the sequence: without clamping a percentile of
    # 0 yields -1 (i.e. the maximum) and a percentile of 1 may yield
    # len(ordered), because round() rounds halves to the nearest even number.
    index = max(0, min(index, len(ordered) - 1))
    return ordered[index]
e5dd7070Spatrick
ec727ea7Spatrick
def derive_stats(results: AnalysisRun) -> Stats:
    """
    Aggregate the statistics of an analysis run.

    For the path lengths of all diagnostics (key 'PathsLength') and for
    every key of the raw statistics of the run, the max, min, mean, 90th
    and 95th percentile, median and total of the collected values are
    computed.
    """
    # Assume all keys are the same in each statistics bucket.
    combined_data: DefaultDict[str, List[Number]] = defaultdict(list)

    # Collect data on paths length.
    for report in results.reports:
        for diagnostic in report.diagnostics:
            combined_data['PathsLength'].append(diagnostic.get_path_length())

    for stat in results.raw_stats:
        for key, value in stat.items():
            combined_data[str(key)].append(value)

    combined_stats: Stats = {}

    for key, values in combined_data.items():
        # Every list holds at least one value, because keys are created
        # only when a value is appended.
        ordered = sorted(values)
        total = sum(values)

        combined_stats[key] = {
            "max": max(values),
            "min": min(values),
            "mean": total / len(values),
            "90th %tile": compute_percentile(ordered, 0.9),
            "95th %tile": compute_percentile(ordered, 0.95),
            # For an even number of values this is the upper median.
            "median": ordered[len(ordered) // 2],
            "total": total
        }

    return combined_stats
e5dd7070Spatrick
e5dd7070Spatrick
# TODO: compare_results decouples comparison from the output, we should
#       do it here as well
def compare_stats(results_old: AnalysisRun, results_new: AnalysisRun,
                  out: TextIO = sys.stdout):
    """
    Compare the statistics of two analysis runs and write a report.

    For every statistic present in both runs, each aggregate is written in
    the form 'old -> new'. Statistics present in only one of the runs are
    listed as REMOVED or ADDED.

    :param out: stream to write the report to.
    """
    stats_old = derive_stats(results_old)
    stats_new = derive_stats(results_new)

    old_keys = set(stats_old)
    new_keys = set(stats_new)

    for key in sorted(old_keys & new_keys):
        out.write(f"{key}\n")

        nested_keys = sorted(set(stats_old[key]) & set(stats_new[key]))

        for nested_key in nested_keys:
            val_old = float(stats_old[key][nested_key])
            val_new = float(stats_new[key][nested_key])

            report = f"{val_old:.3f} -> {val_new:.3f}"

            # Only apply highlighting when writing to TTY and it's not Windows
            if out.isatty() and os.name != 'nt' and val_new != 0:
                # Change relative to the new value; changes of more than
                # 20% in either direction are highlighted.
                ratio = (val_new - val_old) / val_new
                if ratio < -0.2:
                    report = Colors.GREEN + report + Colors.CLEAR
                elif ratio > 0.2:
                    report = Colors.RED + report + Colors.CLEAR

            out.write(f"\t {nested_key} {report}\n")

    removed_keys = old_keys - new_keys
    if removed_keys:
        out.write(f"REMOVED statistics: {removed_keys}\n")

    added_keys = new_keys - old_keys
    if added_keys:
        out.write(f"ADDED statistics: {added_keys}\n")

    out.write("\n")
ec727ea7Spatrick
ec727ea7Spatrick
def dump_scan_build_results_diff(dir_old: ResultsDirectory,
                                 dir_new: ResultsDirectory,
                                 delete_empty: bool = True,
                                 out: TextIO = sys.stdout,
                                 show_stats: bool = False,
                                 stats_only: bool = False,
                                 histogram: Optional[HistogramType] = None,
                                 verbose_log: Optional[str] = None):
    """
    Compare directories with analysis results and dump results.

    :param delete_empty: delete empty plist files
    :param out: buffer to dump comparison results to.
    :param show_stats: compare execution stats as well.
    :param stats_only: compare ONLY execution stats.
    :param histogram: optional histogram type to plot path differences.
    :param verbose_log: optional path to an additional log file.
    :return: None if stats_only is set, otherwise a tuple (number of found
             differences, number of old diagnostics, number of new
             diagnostics).
    """
    results_old = load_results(dir_old, delete_empty, verbose_log)
    results_new = load_results(dir_new, delete_empty, verbose_log)

    if show_stats or stats_only:
        # The stats belong to the same report, so they go to 'out' as well.
        compare_stats(results_old, results_new, out)
    if stats_only:
        return None

    # Open the verbose log, if given.
    aux_log: Optional[TextIO] = open(verbose_log, "w") if verbose_log \
        else None

    def log_diagnostic(kind: str, diagnostic: AnalysisDiagnostic) -> None:
        if aux_log:
            aux_log.write(f"('{kind}', {diagnostic.get_readable_name()}, "
                          f"{diagnostic.get_html_report()})\n")

    # try/finally makes sure the verbose log is closed even if the
    # comparison or one of the writes fails.
    try:
        diff = compare_results(results_old, results_new, histogram)

        for new in diff.present_only_in_new:
            out.write(f"ADDED: {new.get_readable_name()}\n\n")
            log_diagnostic("ADDED", new)

        for old in diff.present_only_in_old:
            out.write(f"REMOVED: {old.get_readable_name()}\n\n")
            log_diagnostic("REMOVED", old)

        for old, new in diff.changed_between_new_and_old:
            out.write(f"MODIFIED: {old.get_readable_name()}\n")
            diffs = old.get_diffs(new)
            str_diffs = [f"          '{key}' changed: "
                         f"'{old_value}' -> '{new_value}'"
                         for key, (old_value, new_value) in diffs.items()]
            out.write(",\n".join(str_diffs) + "\n\n")
            log_diagnostic("MODIFIED", old)

        total_added = len(diff.present_only_in_new)
        total_removed = len(diff.present_only_in_old)
        total_modified = len(diff.changed_between_new_and_old)
        found_diffs = total_added + total_removed + total_modified

        total_reports = len(results_new.diagnostics)
        out.write(f"TOTAL REPORTS: {total_reports}\n")
        out.write(f"TOTAL ADDED: {total_added}\n")
        out.write(f"TOTAL REMOVED: {total_removed}\n")
        out.write(f"TOTAL MODIFIED: {total_modified}\n")

        if aux_log:
            aux_log.write(f"('TOTAL NEW REPORTS', {total_reports})\n")
            aux_log.write(f"('TOTAL DIFFERENCES', {found_diffs})\n")
    finally:
        if aux_log:
            aux_log.close()

    # TODO: change to NamedTuple
    return found_diffs, len(results_old.diagnostics), \
        len(results_new.diagnostics)
e5dd7070Spatrick
e5dd7070Spatrick
# This module is a library for SATest.py; running it directly only prints
# a hint and exits with a non-zero status.
if __name__ == "__main__":
    print("CmpRuns.py should not be used on its own.")
    print("Please use 'SATest.py compare' instead")
    sys.exit(1)