xref: /openbsd/gnu/llvm/clang/utils/analyzer/CmpRuns.py (revision 12c85518)
1e5dd7070Spatrick#!/usr/bin/env python
2e5dd7070Spatrick
3e5dd7070Spatrick"""
4e5dd7070SpatrickCmpRuns - A simple tool for comparing two static analyzer runs to determine
5e5dd7070Spatrickwhich reports have been added, removed, or changed.
6e5dd7070Spatrick
7e5dd7070SpatrickThis is designed to support automated testing using the static analyzer, from
8e5dd7070Spatricktwo perspectives:
9e5dd7070Spatrick  1. To monitor changes in the static analyzer's reports on real code bases,
10e5dd7070Spatrick     for regression testing.
11e5dd7070Spatrick
12e5dd7070Spatrick  2. For use by end users who want to integrate regular static analyzer testing
13e5dd7070Spatrick     into a buildbot like environment.
14e5dd7070Spatrick
15e5dd7070SpatrickUsage:
16e5dd7070Spatrick
17e5dd7070Spatrick    # Load the results of both runs, to obtain lists of the corresponding
18e5dd7070Spatrick    # AnalysisDiagnostic objects.
19e5dd7070Spatrick    #
20ec727ea7Spatrick    resultsA = load_results_from_single_run(singleRunInfoA, delete_empty)
21ec727ea7Spatrick    resultsB = load_results_from_single_run(singleRunInfoB, delete_empty)
22e5dd7070Spatrick
    # Compare the diagnostics of run A with the diagnostics of run B to
    # obtain a ComparisonResult (common, added, removed and changed issues).
25ec727ea7Spatrick    diff = compare_results(resultsA, resultsB)
26e5dd7070Spatrick
27e5dd7070Spatrick"""
28e5dd7070Spatrickimport json
29e5dd7070Spatrickimport os
30e5dd7070Spatrickimport plistlib
31e5dd7070Spatrickimport re
32e5dd7070Spatrickimport sys
33e5dd7070Spatrick
34ec727ea7Spatrickfrom math import log
35ec727ea7Spatrickfrom collections import defaultdict
36ec727ea7Spatrickfrom copy import copy
37ec727ea7Spatrickfrom enum import Enum
38a9ac8606Spatrickfrom typing import (Any, DefaultDict, Dict, List, NamedTuple, Optional,
39a9ac8606Spatrick                    Sequence, Set, TextIO, TypeVar, Tuple, Union)
40ec727ea7Spatrick
41ec727ea7Spatrick
# Numeric value of a single statistic.
Number = Union[int, float]
# Statistic name -> {aggregate name -> value}, as built by derive_stats.
Stats = Dict[str, Dict[str, Number]]
# Parsed content of a plist file (or of one of its nested dictionaries).
Plist = Dict[str, Any]
# Parsed JSON object.
JSON = Dict[str, Any]
# Diff in a form: field -> (before, after)
JSONDiff = Dict[str, Tuple[str, str]]
# Type for generics
T = TypeVar('T')

# Matches a "Statistics: {...}" section and captures the brace-delimited
# payload; DOTALL lets the payload span several lines.
STATS_REGEXP = re.compile(r"Statistics: (\{.+\})", re.MULTILINE | re.DOTALL)
52e5dd7070Spatrick
53ec727ea7Spatrick
class Colors:
    """
    Color for terminal highlight.

    The values are ANSI escape sequences; compare_stats wraps a report line
    in one of them (followed by CLEAR) to highlight it.
    """
    # Black text on a red background.
    RED = '\x1b[2;30;41m'
    # Black text on a green background.
    GREEN = '\x1b[6;30;42m'
    # Resets the text attributes set by the sequences above.
    CLEAR = '\x1b[0m'
61e5dd7070Spatrick
62ec727ea7Spatrick
class HistogramType(str, Enum):
    """
    Kinds of path-length histograms that compare_results can plot.
    """
    # Ratio of the old path length to the new one.
    RELATIVE = "relative"
    # Natural logarithm of that ratio.
    LOG_RELATIVE = "log-relative"
    # Difference between the old and the new path length.
    ABSOLUTE = "absolute"
67e5dd7070Spatrick
68e5dd7070Spatrick
class ResultsDirectory(NamedTuple):
    """
    Location of the results of a single analysis run.
    """
    # The analysis output directory (a single plist file is accepted too).
    path: str
    # Root directory that is stripped from the reported source file names.
    root: str = ""
72ec727ea7Spatrick
73ec727ea7Spatrick
class SingleRunInfo:
    """
    Describes one analysis run:
    path - the analysis output directory
    root - the name of the root directory, which will be disregarded when
    determining the source file name (kept without trailing separators)
    verbose_log - optional path of an additional log file
    """
    def __init__(self, results: ResultsDirectory,
                 verbose_log: Optional[str] = None):
        output_dir, root_dir = results.path, results.root

        self.verbose_log = verbose_log
        self.path = output_dir
        # Trailing '/' and '\' are dropped so that the root can be used as a
        # plain prefix of the reported file names.
        self.root = root_dir.rstrip("/\\")
86ec727ea7Spatrick
87ec727ea7Spatrick
class AnalysisDiagnostic:
    """
    A single diagnostic loaded from an analyzer plist file.

    data - the plist dictionary describing the diagnostic
    report - the AnalysisReport the diagnostic was loaded from
    html_report - name of the matching HTML report file, or None
    """
    # Fields compared by get_diffs / is_similar_to.
    KEY_FIELDS = ["check_name", "category", "description"]

    def __init__(self, data: Plist, report: "AnalysisReport",
                 html_report: Optional[str]):
        self._data = data
        self._loc = self._data['location']
        self._report = report
        self._html_report = html_report
        self._report_size = len(self._data['path'])

    def _strip_root(self, file_name: str) -> str:
        """
        Return file_name without the run's root prefix (and the separator
        that follows it); names outside of the root are returned as is.
        """
        root = self._report.run.root

        if root and file_name.startswith(root):
            return file_name[len(root) + 1:]

        return file_name

    def get_file_name(self) -> str:
        """Name of the file the diagnostic is reported in."""
        return self._strip_root(self._report.files[self._loc['file']])

    def get_root_file_name(self) -> str:
        """
        Name of the file the diagnostic path starts in.

        Falls back to get_file_name() when the path is empty.
        """
        path = self._data['path']

        if not path:
            return self.get_file_name()

        first_piece = path[0]
        if 'location' in first_piece:
            file_index = first_piece['location']['file']
        else:  # control edge
            file_index = first_piece['edges'][0]['start'][0]['file']

        # The root has to be stripped exactly like in get_file_name(),
        # otherwise get_readable_name() sees two different names for the
        # very same file whenever a root is set.
        return self._strip_root(self._report.files[file_index])

    def get_line(self) -> int:
        return self._loc['line']

    def get_column(self) -> int:
        return self._loc['col']

    def get_path_length(self) -> int:
        """Number of pieces in the diagnostic path."""
        return self._report_size

    def get_category(self) -> str:
        return self._data['category']

    def get_description(self) -> str:
        return self._data['description']

    def get_location(self) -> str:
        """Location in the form file:line:column."""
        return f"{self.get_file_name()}:{self.get_line()}:{self.get_column()}"

    def get_issue_identifier(self) -> str:
        """
        Identifier built from the file name and, when present, the issue
        context and the issue hash. It defines equality and hashing.
        """
        identifier = self.get_file_name() + "+"

        if "issue_context" in self._data:
            identifier += self._data["issue_context"] + "+"

        if "issue_hash_content_of_line_in_context" in self._data:
            identifier += str(
                self._data["issue_hash_content_of_line_in_context"])

        return identifier

    def get_html_report(self) -> str:
        """Path of the HTML report, or " " if there is none."""
        if self._html_report is None:
            return " "

        return os.path.join(self._report.run.path, self._html_report)

    def get_readable_name(self) -> str:
        """
        Human readable one-line description of the diagnostic.

        The file the path starts in is shown in brackets only when it
        differs from the file the diagnostic is reported in.
        """
        if "issue_context" in self._data:
            funcname_postfix = "#" + self._data["issue_context"]
        else:
            funcname_postfix = ""

        root_filename = self.get_root_file_name()
        file_name = self.get_file_name()

        if root_filename != file_name:
            file_prefix = f"[{root_filename}] {file_name}"
        else:
            file_prefix = root_filename

        line = self.get_line()
        col = self.get_column()
        return f"{file_prefix}{funcname_postfix}:{line}:{col}" \
            f", {self.get_category()}: {self.get_description()}"

    def is_similar_to(self, other: "AnalysisDiagnostic") -> bool:
        # We consider two diagnostics similar only if at least one
        # of the key fields is the same in both diagnostics.
        return len(self.get_diffs(other)) != len(self.KEY_FIELDS)

    def get_diffs(self, other: "AnalysisDiagnostic") -> JSONDiff:
        """Return the key fields that differ: field -> (self, other)."""
        return {field: (self._data[field], other._data[field])
                for field in self.KEY_FIELDS
                if self._data[field] != other._data[field]}

    # Note, the data format is not an API and may change from one analyzer
    # version to another.
    def get_raw_data(self) -> Plist:
        return self._data

    def __eq__(self, other: object) -> bool:
        # Compare the identifiers themselves rather than their hashes, so
        # that neither a hash collision nor an unrelated object (e.g. the
        # identifier string) compares equal to a diagnostic.
        if not isinstance(other, AnalysisDiagnostic):
            return NotImplemented

        return self.get_issue_identifier() == other.get_issue_identifier()

    def __hash__(self) -> int:
        return hash(self.get_issue_identifier())
205a9ac8606Spatrick
206e5dd7070Spatrick
class AnalysisRun:
    """
    Everything loaded from the plist files of one analysis run: the per-file
    reports, the flat list of their diagnostics, the clang version and the
    raw statistics.
    """
    def __init__(self, info: SingleRunInfo):
        self.info = info
        self.path = info.path
        self.root = info.root
        self.reports: List[AnalysisReport] = []
        # Cumulative list of all diagnostics from all the reports.
        self.diagnostics: List[AnalysisDiagnostic] = []
        self.clang_version: Optional[str] = None
        self.raw_stats: List[JSON] = []

    def get_clang_version(self) -> Optional[str]:
        return self.clang_version

    def read_single_file(self, path: str, delete_empty: bool):
        """
        Load one plist file and add its content to the run.

        path - the plist file to read
        delete_empty - remove the file from disk if it lists no files
        """
        with open(path, "rb") as plist_file:
            data = plistlib.load(plist_file)

        if 'statistics' in data:
            self.raw_stats.append(json.loads(data.pop('statistics')))

        # We want to retrieve the clang version even if there are no
        # reports. Assume that all reports were created using the same
        # clang version (this is always true and is more efficient).
        if 'clang_version' in data:
            version = data.pop('clang_version')
            if self.clang_version is None:
                self.clang_version = version

        # Ignore/delete empty reports.
        if not data['files']:
            if delete_empty:
                os.remove(path)
            return

        # Extract the HTML reports, if they exist: one entry (possibly None)
        # per diagnostic.
        html_reports = []
        for diag in data['diagnostics']:
            if 'HTMLDiagnostics_files' not in diag:
                html_reports.append(None)
                continue

            # FIXME: Why is this named files, when does it have multiple
            # files?
            html_files = diag['HTMLDiagnostics_files']
            assert len(html_files) == 1
            del diag['HTMLDiagnostics_files']
            html_reports.append(html_files[0])

        report = AnalysisReport(self, data.pop('files'))
        # Python 3.10 offers zip(..., strict=True). The following assertion
        # mimics it.
        assert len(data['diagnostics']) == len(html_reports)
        loaded = [AnalysisDiagnostic(diag, report, html)
                  for diag, html in zip(data.pop('diagnostics'),
                                        html_reports)]

        # Every known key has been consumed by now.
        assert not data

        report.diagnostics.extend(loaded)
        self.reports.append(report)
        self.diagnostics.extend(loaded)
267e5dd7070Spatrick
268e5dd7070Spatrick
class AnalysisReport:
    """
    The content of a single plist file loaded by AnalysisRun.
    """
    def __init__(self, run: AnalysisRun, files: List[str]):
        # The AnalysisRun this report belongs to.
        self.run = run
        # Source file names; diagnostics refer to them by index.
        self.files = files
        # Diagnostics loaded from this plist file.
        self.diagnostics: List[AnalysisDiagnostic] = []
273ec727ea7Spatrick        self.diagnostics: List[AnalysisDiagnostic] = []
274ec727ea7Spatrick
275ec727ea7Spatrick
def load_results(results: ResultsDirectory, delete_empty: bool = True,
                 verbose_log: Optional[str] = None) -> AnalysisRun:
    """
    Backwards compatibility API.

    Wraps the arguments into a SingleRunInfo and forwards to
    load_results_from_single_run.
    """
    run_info = SingleRunInfo(results, verbose_log)
    return load_results_from_single_run(run_info, delete_empty)
284e5dd7070Spatrick
285e5dd7070Spatrick
def load_results_from_single_run(info: SingleRunInfo,
                                 delete_empty: bool = True) -> AnalysisRun:
    """
    Load results of the analyses from a given output folder.

    - info is the SingleRunInfo object; info.path may name either a single
      plist file or a directory that is searched recursively
    - delete_empty specifies if the empty plist files should be deleted
    """
    run = AnalysisRun(info)
    target = info.path

    if os.path.isfile(target):
        run.read_single_file(target, delete_empty)
        return run

    for dirpath, _, filenames in os.walk(target):
        for name in filenames:
            if name.endswith('plist'):
                run.read_single_file(os.path.join(dirpath, name),
                                     delete_empty)

    return run
309e5dd7070Spatrick
310e5dd7070Spatrick
def cmp_analysis_diagnostic(d):
    """
    Return the issue identifier of the diagnostic d.
    """
    identifier = d.get_issue_identifier()
    return identifier
313e5dd7070Spatrick
314e5dd7070Spatrick
AnalysisDiagnosticPair = Tuple[AnalysisDiagnostic, AnalysisDiagnostic]


class ComparisonResult:
    """
    Outcome of comparing two analysis runs: the diagnostics found in both
    runs, only in the old one, only in the new one, and the (old, new)
    pairs of diagnostics that changed between the runs.
    """
    def __init__(self):
        self.changed_between_new_and_old: List[AnalysisDiagnosticPair] = []
        self.present_only_in_new: List[AnalysisDiagnostic] = []
        self.present_only_in_old: List[AnalysisDiagnostic] = []
        self.present_in_both: List[AnalysisDiagnostic] = []

    def add_common(self, issue: AnalysisDiagnostic):
        """Record an issue present in both runs."""
        self.present_in_both.append(issue)

    def add_removed(self, issue: AnalysisDiagnostic):
        """Record an issue present only in the old run."""
        self.present_only_in_old.append(issue)

    def add_added(self, issue: AnalysisDiagnostic):
        """Record an issue present only in the new run."""
        self.present_only_in_new.append(issue)

    def add_changed(self, old_issue: AnalysisDiagnostic,
                    new_issue: AnalysisDiagnostic):
        """Record an issue that changed, as an (old, new) pair."""
        pair = (old_issue, new_issue)
        self.changed_between_new_and_old.append(pair)
337a9ac8606Spatrick
338a9ac8606Spatrick
GroupedDiagnostics = DefaultDict[str, List[AnalysisDiagnostic]]


def get_grouped_diagnostics(diagnostics: List[AnalysisDiagnostic]
                            ) -> GroupedDiagnostics:
    """
    Group the diagnostics by their location (see get_location), keeping
    the original order inside of every group.
    """
    by_location: GroupedDiagnostics = defaultdict(list)
    for item in diagnostics:
        bucket = by_location[item.get_location()]
        bucket.append(item)
    return by_location
347a9ac8606Spatrick    return result
348ec727ea7Spatrick
349ec727ea7Spatrick
def _path_length_sample(histogram: Optional[HistogramType],
                        old_len: int, new_len: int) -> Optional[float]:
    """
    Return the histogram sample for a diagnostic whose path length changed
    from old_len to new_len, or None if there is nothing to record.
    """
    if histogram == HistogramType.RELATIVE:
        # The ratio is undefined for an empty new path.
        return float(old_len) / new_len if new_len else None

    if histogram == HistogramType.LOG_RELATIVE:
        # Neither the ratio nor its logarithm is defined if one of the
        # paths is empty.
        if not old_len or not new_len:
            return None
        return log(float(old_len) / new_len)

    if histogram == HistogramType.ABSOLUTE:
        return old_len - new_len

    return None


def compare_results(results_old: AnalysisRun, results_new: AnalysisRun,
                    histogram: Optional[HistogramType] = None
                    ) -> ComparisonResult:
    """
    compare_results - Generate a relation from diagnostics in run A to
    diagnostics in run B.

    Diagnostics are matched per location (file:line:column). The result is
    a ComparisonResult that lists the diagnostics present in both runs,
    only in the old run, only in the new run, and the (old, new) pairs of
    diagnostics that are reported at the same location but changed.

    If histogram is given, the path length changes of the common
    diagnostics are additionally plotted with matplotlib.
    """

    res = ComparisonResult()

    # Histogram samples: one per common diagnostic whose path length changed.
    path_difference_data: List[float] = []

    diags_old = get_grouped_diagnostics(results_old.diagnostics)
    diags_new = get_grouped_diagnostics(results_new.diagnostics)

    locations_old = set(diags_old.keys())
    locations_new = set(diags_new.keys())

    common_locations = locations_old & locations_new

    for location in common_locations:
        old = diags_old[location]
        new = diags_new[location]

        # Quadratic algorithms in this part are fine because 'old' and 'new'
        # are most commonly of size 1.
        common: Set[AnalysisDiagnostic] = set()
        for a in old:
            for b in new:
                if a.get_issue_identifier() != b.get_issue_identifier():
                    continue

                a_path_len = a.get_path_length()
                b_path_len = b.get_path_length()

                if a_path_len != b_path_len:
                    sample = _path_length_sample(histogram,
                                                 a_path_len, b_path_len)
                    if sample is not None:
                        path_difference_data.append(sample)

                res.add_common(b)
                common.add(a)

        # Diagnostics compare equal if their issue identifiers match, so
        # this drops the matched diagnostics from both lists.
        old = filter_issues(old, common)
        new = filter_issues(new, common)
        common = set()

        for a in old:
            for b in new:
                if a.is_similar_to(b):
                    res.add_changed(a, b)
                    common.add(a)
                    common.add(b)

        old = filter_issues(old, common)
        new = filter_issues(new, common)

        # Whatever is left in 'old' doesn't have a corresponding diagnostic
        # in 'new', so we need to mark it as 'removed'.
        for a in old:
            res.add_removed(a)

        # Whatever is left in 'new' doesn't have a corresponding diagnostic
        # in 'old', so we need to mark it as 'added'.
        for b in new:
            res.add_added(b)

    only_old_locations = locations_old - common_locations
    for location in only_old_locations:
        for a in diags_old[location]:
            # These locations have been found only in the old build, so we
            # need to mark all of them as 'removed'
            res.add_removed(a)

    only_new_locations = locations_new - common_locations
    for location in only_new_locations:
        for b in diags_new[location]:
            # These locations have been found only in the new build, so we
            # need to mark all of them as 'added'
            res.add_added(b)

    # FIXME: Add fuzzy matching. One simple and possible effective idea would
    # be to bin the diagnostics, print them in a normalized form (based solely
    # on the structure of the diagnostic), compute the diff, then use that as
    # the basis for matching. This has the nice property that we don't depend
    # in any way on the diagnostic format.

    if histogram:
        # Imported lazily: matplotlib is needed only for plotting.
        from matplotlib import pyplot
        pyplot.hist(path_difference_data, bins=100)
        pyplot.show()

    return res
453e5dd7070Spatrick    return res
454e5dd7070Spatrick
455ec727ea7Spatrick
def filter_issues(origin: List[AnalysisDiagnostic],
                  to_remove: Set[AnalysisDiagnostic]) \
                  -> List[AnalysisDiagnostic]:
    """
    Return the elements of origin that are not in to_remove, in their
    original order. Neither argument is modified.
    """
    kept: List[AnalysisDiagnostic] = []
    for candidate in origin:
        if candidate in to_remove:
            continue
        kept.append(candidate)
    return kept
460a9ac8606Spatrick
461a9ac8606Spatrick
def compute_percentile(values: Sequence[T], percentile: float) -> T:
    """
    Return computed percentile.

    The nearest-rank method is used: the result is the smallest element
    such that at least percentile * len(values) elements are less than or
    equal to it, so it is always one of the given values.

    :param values: non-empty sequence of comparable values (any order).
    :param percentile: fraction in the range [0, 1]; values outside of the
                       range are clamped to the smallest/largest element.
    :raises ValueError: if values is empty.
    """
    # Local import: only 'log' is imported from math at the file level.
    from math import ceil

    ordered = sorted(values)
    if not ordered:
        raise ValueError("compute_percentile() requires at least one value")

    # ceil() gives the 1-based nearest rank. round(x + 0.5) is not a
    # substitute: Python 3 rounds halves to even, so the result would
    # depend on the parity of x.
    rank = ceil(percentile * len(ordered))
    index = min(max(rank - 1, 0), len(ordered) - 1)
    return ordered[index]
467e5dd7070Spatrick
468ec727ea7Spatrick
def derive_stats(results: AnalysisRun) -> Stats:
    """
    Aggregate the statistics of a run.

    The samples are the path lengths of all diagnostics (under the
    'PathsLength' key) and the values of the raw per-file statistics. For
    every key the max, min, mean, 90th and 95th percentile, median and
    total of its samples are computed.
    """
    # Assume all keys are the same in each statistics bucket.
    samples = defaultdict(list)

    # Collect data on paths length. The key is created only if there is at
    # least one diagnostic, so that no empty sample list gets aggregated.
    path_lengths = [diagnostic.get_path_length()
                    for report in results.reports
                    for diagnostic in report.diagnostics]
    if path_lengths:
        samples['PathsLength'] = path_lengths

    for bucket in results.raw_stats:
        for name, value in bucket.items():
            samples[str(name)].append(value)

    return {
        name: {
            "max": max(values),
            "min": min(values),
            "mean": sum(values) / len(values),
            "90th %tile": compute_percentile(values, 0.9),
            "95th %tile": compute_percentile(values, 0.95),
            "median": sorted(values)[len(values) // 2],
            "total": sum(values)
        }
        for name, values in samples.items()
    }
495e5dd7070Spatrick    return combined_stats
496e5dd7070Spatrick
497e5dd7070Spatrick
# TODO: compare_results decouples comparison from the output, we should
#       do it here as well
def compare_stats(results_old: AnalysisRun, results_new: AnalysisRun,
                  out: TextIO = sys.stdout):
    """
    Write a comparison of the statistics of two runs to out.

    Every statistic present in both runs is printed as "old -> new" per
    aggregate; statistics present in only one of the runs are listed as
    REMOVED or ADDED.
    """
    stats_old = derive_stats(results_old)
    stats_new = derive_stats(results_new)

    old_keys = set(stats_old.keys())
    new_keys = set(stats_new.keys())

    for key in sorted(old_keys & new_keys):
        out.write(f"{key}\n")

        old_bucket = stats_old[key]
        new_bucket = stats_new[key]

        for nested_key in sorted(set(old_bucket) & set(new_bucket)):
            val_old = float(old_bucket[nested_key])
            val_new = float(new_bucket[nested_key])

            report = f"{val_old:.3f} -> {val_new:.3f}"

            # Only apply highlighting when writing to TTY and it's not
            # Windows; a zero new value has no relative change to show.
            if out.isatty() and os.name != 'nt' and val_new != 0:
                ratio = (val_new - val_old) / val_new
                if ratio < -0.2:
                    report = Colors.GREEN + report + Colors.CLEAR
                elif ratio > 0.2:
                    report = Colors.RED + report + Colors.CLEAR

            out.write(f"\t {nested_key} {report}\n")

    removed_keys = old_keys - new_keys
    if removed_keys:
        out.write(f"REMOVED statistics: {removed_keys}\n")

    added_keys = new_keys - old_keys
    if added_keys:
        out.write(f"ADDED statistics: {added_keys}\n")

    out.write("\n")
539ec727ea7Spatrick    out.write("\n")
540ec727ea7Spatrick
541ec727ea7Spatrick
def dump_scan_build_results_diff(dir_old: ResultsDirectory,
                                 dir_new: ResultsDirectory,
                                 delete_empty: bool = True,
                                 out: TextIO = sys.stdout,
                                 show_stats: bool = False,
                                 stats_only: bool = False,
                                 histogram: Optional[HistogramType] = None,
                                 verbose_log: Optional[str] = None):
    """
    Compare directories with analysis results and dump results.

    :param delete_empty: delete empty plist files
    :param out: buffer to dump comparison results to.
    :param show_stats: compare execution stats as well.
    :param stats_only: compare ONLY execution stats.
    :param histogram: optional histogram type to plot path differences.
    :param verbose_log: optional path to an additional log file.
    :return: None if stats_only is set, otherwise a tuple (number of
             differences, number of old reports, number of new reports).
    """
    results_old = load_results(dir_old, delete_empty, verbose_log)
    results_new = load_results(dir_new, delete_empty, verbose_log)

    if show_stats or stats_only:
        # The stats belong to the same output as the rest of the results.
        compare_stats(results_old, results_new, out)
    if stats_only:
        return

    # Open the verbose log, if given.
    aux_log: Optional[TextIO] = open(verbose_log, "w") if verbose_log \
        else None

    # The log has to be closed even if the comparison or a write fails.
    try:
        diff = compare_results(results_old, results_new, histogram)
        total_added = 0
        total_removed = 0
        total_modified = 0

        for new in diff.present_only_in_new:
            out.write(f"ADDED: {new.get_readable_name()}\n\n")
            total_added += 1
            if aux_log:
                aux_log.write(f"('ADDED', {new.get_readable_name()}, "
                              f"{new.get_html_report()})\n")

        for old in diff.present_only_in_old:
            out.write(f"REMOVED: {old.get_readable_name()}\n\n")
            total_removed += 1
            if aux_log:
                aux_log.write(f"('REMOVED', {old.get_readable_name()}, "
                              f"{old.get_html_report()})\n")

        for old, new in diff.changed_between_new_and_old:
            out.write(f"MODIFIED: {old.get_readable_name()}\n")
            total_modified += 1
            diffs = old.get_diffs(new)
            str_diffs = [f"          '{key}' changed: "
                         f"'{old_value}' -> '{new_value}'"
                         for key, (old_value, new_value) in diffs.items()]
            out.write(",\n".join(str_diffs) + "\n\n")
            if aux_log:
                aux_log.write(f"('MODIFIED', {old.get_readable_name()}, "
                              f"{old.get_html_report()})\n")

        found_diffs = total_added + total_removed + total_modified

        total_reports = len(results_new.diagnostics)
        out.write(f"TOTAL REPORTS: {total_reports}\n")
        out.write(f"TOTAL ADDED: {total_added}\n")
        out.write(f"TOTAL REMOVED: {total_removed}\n")
        out.write(f"TOTAL MODIFIED: {total_modified}\n")

        if aux_log:
            aux_log.write(f"('TOTAL NEW REPORTS', {total_reports})\n")
            aux_log.write(f"('TOTAL DIFFERENCES', {found_diffs})\n")
    finally:
        if aux_log:
            aux_log.close()

    # TODO: change to NamedTuple
    return found_diffs, len(results_old.diagnostics), \
        len(results_new.diagnostics)
622ec727ea7Spatrick        len(results_new.diagnostics)
623e5dd7070Spatrick
624e5dd7070Spatrick
# This module is meant to be imported; when it is executed as a script it
# only prints a hint and exits with a non-zero status.
if __name__ == "__main__":
    print("CmpRuns.py should not be used on its own.")
    print("Please use 'SATest.py compare' instead")
    sys.exit(1)
629