"""report.py - Utilities for reporting statistics about benchmark results
"""
import unittest
import os
import re
import copy

from scipy.stats import mannwhitneyu


class BenchmarkColor(object):
    def __init__(self, name, code):
        self.name = name
        self.code = code

    def __repr__(self):
        return '%s%r' % (self.__class__.__name__,
                         (self.name, self.code))

    def __format__(self, format_spec):
        return self.code


# Benchmark Colors Enumeration
BC_NONE = BenchmarkColor('NONE', '')
BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m')
BC_CYAN = BenchmarkColor('CYAN', '\033[96m')
BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m')
BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m')
BC_HEADER = BenchmarkColor('HEADER', '\033[92m')
BC_WARNING = BenchmarkColor('WARNING', '\033[93m')
BC_WHITE = BenchmarkColor('WHITE', '\033[97m')
BC_FAIL = BenchmarkColor('FAIL', '\033[91m')
BC_ENDC = BenchmarkColor('ENDC', '\033[0m')
BC_BOLD = BenchmarkColor('BOLD', '\033[1m')
BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')
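
# Because BenchmarkColor implements __format__, a color object can be
# interpolated straight into a format string; a hedged example using the
# names above:
#   '{}text{}'.format(BC_WARNING, BC_ENDC)
# wraps 'text' in the warning escape code and the reset code.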

UTEST_MIN_REPETITIONS = 2
UTEST_OPTIMAL_REPETITIONS = 9  # Lowest reasonable number; more is better.
UTEST_COL_NAME = "_pvalue"


def color_format(use_color, fmt_str, *args, **kwargs):
    """
    Return the result of 'fmt_str.format(*args, **kwargs)' after transforming
    'args' and 'kwargs' according to the value of 'use_color'. If 'use_color'
    is False then all color codes in 'args' and 'kwargs' are replaced with
    the empty string.
    """
    assert use_color is True or use_color is False
    if not use_color:
        args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE
                for arg in args]
        kwargs = {key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE
                  for key, arg in kwargs.items()}
    return fmt_str.format(*args, **kwargs)
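
# A hedged example (hypothetical format string): with use_color=False every
# BenchmarkColor argument degrades to BC_NONE, whose code is '', so
#   color_format(False, '{}hello{endc}', BC_OKGREEN, endc=BC_ENDC)
# returns plain 'hello', while use_color=True wraps it in ANSI escape codes.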


def find_longest_name(benchmark_list):
    """
    Return the length of the longest benchmark name in a given list of
    benchmark JSON objects.
    """
    longest_name = 1
    for bc in benchmark_list:
        if len(bc['name']) > longest_name:
            longest_name = len(bc['name'])
    return longest_name


def calculate_change(old_val, new_val):
    """
    Return a float representing the decimal change between old_val and new_val.
    """
    if old_val == 0 and new_val == 0:
        return 0.0
    if old_val == 0:
        return float(new_val - old_val) / (float(old_val + new_val) / 2)
    return float(new_val - old_val) / abs(old_val)
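
# Worked examples (hypothetical timings): calculate_change(100, 110) == +0.10
# (a 10% regression) and calculate_change(110, 100) == -0.0909... When
# old_val is 0, the symmetrized denominator (old + new) / 2 avoids a division
# by zero, e.g. calculate_change(0, 1) == 2.0.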


def filter_benchmark(json_orig, family, replacement=""):
    """
    Apply a filter to the json, keeping only the 'family' of benchmarks and
    rewriting the part of each name matched by 'family' with 'replacement'.
    """
    regex = re.compile(family)
    filtered = {'benchmarks': []}
    for be in json_orig['benchmarks']:
        if not regex.search(be['name']):
            continue
        filteredbench = copy.deepcopy(be)  # Do NOT modify the old name!
        filteredbench['name'] = regex.sub(replacement, filteredbench['name'])
        filtered['benchmarks'].append(filteredbench)
    return filtered
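
# A hedged example (hypothetical names): filter_benchmark(json, 'BM_Foo', '.')
# keeps only entries whose name matches the regex 'BM_Foo' and rewrites the
# matched portion, so 'BM_Foo/8' becomes './8'; the input json is untouched
# because every surviving entry is deep-copied first.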


def get_unique_benchmark_names(json):
    """
    Return all the unique 'names' used for benchmarks, preserving their
    first-seen order.
    """
    seen = set()
    uniqued = [x['name'] for x in json['benchmarks']
               if x['name'] not in seen and
               (seen.add(x['name']) or True)]
    return uniqued


def intersect(list1, list2):
    """
    Return a new list of the elements contained in *both* input lists,
    preserving the ordering of list1.
    """
    return [x for x in list1 if x in list2]


def is_potentially_comparable_benchmark(x):
    return ('time_unit' in x and 'real_time' in x and 'cpu_time' in x)


def partition_benchmarks(json1, json2):
    """
    While preserving the ordering, find benchmarks with the same names in
    both of the inputs, and group them.
    (i.e. partition/filter into groups with common name)
    """
    json1_unique_names = get_unique_benchmark_names(json1)
    json2_unique_names = get_unique_benchmark_names(json2)
    names = intersect(json1_unique_names, json2_unique_names)
    partitions = []
    for name in names:
        time_unit = None
        # Pick the time unit from the first entry of the lhs benchmark.
        # We should be careful not to crash with unexpected input.
        for x in json1['benchmarks']:
            if (x['name'] == name and is_potentially_comparable_benchmark(x)):
                time_unit = x['time_unit']
                break
        if time_unit is None:
            continue
        # Filter by name and time unit.
        # All the repetitions are assumed to be comparable.
        lhs = [x for x in json1['benchmarks'] if x['name'] == name and
               x['time_unit'] == time_unit]
        rhs = [x for x in json2['benchmarks'] if x['name'] == name and
               x['time_unit'] == time_unit]
        partitions.append([lhs, rhs])
    return partitions
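
# A hedged sketch of the result shape (hypothetical runs): if both inputs
# contain repetitions of 'BM_One', the result is [[lhs, rhs]] where lhs and
# rhs are the (possibly differently sized) lists of 'BM_One' entries from
# json1 and json2 that share json1's time_unit.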


def extract_field(partition, field_name):
    # The count of elements may be different. We want *all* of them.
    lhs = [x[field_name] for x in partition[0]]
    rhs = [x[field_name] for x in partition[1]]
    return [lhs, rhs]


def calc_utest(timings_cpu, timings_time):
    """
    Run the two-sided Mann-Whitney U test on both timing series and return
    (have_optimal_repetitions, cpu_pvalue, time_pvalue).
    """
    min_rep_cnt = min(len(timings_time[0]),
                      len(timings_time[1]),
                      len(timings_cpu[0]),
                      len(timings_cpu[1]))

    # Does *everything* have at least UTEST_MIN_REPETITIONS repetitions?
    if min_rep_cnt < UTEST_MIN_REPETITIONS:
        return False, None, None

    time_pvalue = mannwhitneyu(
        timings_time[0], timings_time[1], alternative='two-sided').pvalue
    cpu_pvalue = mannwhitneyu(
        timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue

    return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
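
# A hedged usage sketch (hypothetical timings): with only two repetitions per
# run, calc_utest([[90, 92], [88, 89]], [[10, 11], [9, 10]]) still computes
# both p-values but returns have_optimal_repetitions=False, since
# 2 < UTEST_OPTIMAL_REPETITIONS.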


def print_utest(partition, utest_alpha, first_col_width, use_color=True):
    def get_utest_color(pval):
        return BC_FAIL if pval >= utest_alpha else BC_OKGREEN

    timings_time = extract_field(partition, 'real_time')
    timings_cpu = extract_field(partition, 'cpu_time')
    have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(
        timings_cpu, timings_time)

    # If we failed to get even the minimum required repetitions for the
    # U test, there is nothing to print.
    if not have_optimal_repetitions and cpu_pvalue is None and time_pvalue is None:
        return []

    dsc = "U Test, Repetitions: {} vs {}".format(
        len(timings_cpu[0]), len(timings_cpu[1]))
    dsc_color = BC_OKGREEN

    # We still have some results to show, but issue a warning about them.
    if not have_optimal_repetitions:
        dsc_color = BC_WARNING
        dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
            UTEST_OPTIMAL_REPETITIONS)

    special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{}      {}"

    last_name = partition[0][0]['name']
    return [color_format(use_color,
                         special_str,
                         BC_HEADER,
                         "{}{}".format(last_name, UTEST_COL_NAME),
                         first_col_width,
                         get_utest_color(time_pvalue), time_pvalue,
                         get_utest_color(cpu_pvalue), cpu_pvalue,
                         dsc_color, dsc,
                         endc=BC_ENDC)]


def generate_difference_report(
        json1,
        json2,
        display_aggregates_only=False,
        utest=False,
        utest_alpha=0.05,
        use_color=True):
    """
    Calculate and report the difference between each test of two benchmark
    runs specified as 'json1' and 'json2'.
    """
    assert utest is True or utest is False
    first_col_width = find_longest_name(json1['benchmarks'])

    def find_test(name):
        for b in json2['benchmarks']:
            if b['name'] == name:
                return b
        return None

    first_col_width = max(
        first_col_width,
        len('Benchmark'))
    first_col_width += len(UTEST_COL_NAME)
    first_line = "{:<{}s}Time             CPU      Time Old      Time New       CPU Old       CPU New".format(
        'Benchmark', 12 + first_col_width)
    output_strs = [first_line, '-' * len(first_line)]

    partitions = partition_benchmarks(json1, json2)
    for partition in partitions:
        # Careful, we may have different repetition counts.
        for i in range(min(len(partition[0]), len(partition[1]))):
            bn = partition[0][i]
            other_bench = partition[1][i]

            # *If* we were asked to only display aggregates,
            # and this is a non-aggregate run, then skip it.
            if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench:
                assert bn['run_type'] == other_bench['run_type']
                if bn['run_type'] != 'aggregate':
                    continue

            fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"

            def get_color(res):
                if res > 0.05:
                    return BC_FAIL
                elif res > -0.07:
                    return BC_WHITE
                else:
                    return BC_CYAN

            tres = calculate_change(bn['real_time'], other_bench['real_time'])
            cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
            output_strs += [color_format(use_color,
                                         fmt_str,
                                         BC_HEADER,
                                         bn['name'],
                                         first_col_width,
                                         get_color(tres),
                                         tres,
                                         get_color(cpures),
                                         cpures,
                                         bn['real_time'],
                                         other_bench['real_time'],
                                         bn['cpu_time'],
                                         other_bench['cpu_time'],
                                         endc=BC_ENDC)]

        # After processing the whole partition, if requested, do the U test.
        if utest:
            output_strs += print_utest(partition,
                                       utest_alpha=utest_alpha,
                                       first_col_width=first_col_width,
                                       use_color=use_color)

    return output_strs
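
# A hedged usage sketch (hypothetical file names):
#   import json
#   with open('run1.json') as f1, open('run2.json') as f2:
#       report = generate_difference_report(json.load(f1), json.load(f2),
#                                           utest=True, use_color=False)
#   print('\n'.join(report))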


###############################################################################
# Unit tests


class TestGetUniqueBenchmarkNames(unittest.TestCase):
    def load_results(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput = os.path.join(testInputs, 'test3_run0.json')
        with open(testOutput, 'r') as f:
            json = json.load(f)
        return json

    def test_basic(self):
        expect_lines = [
            'BM_One',
            'BM_Two',
            'short',  # These two are not sorted
            'medium',  # These two are not sorted
        ]
        json = self.load_results()
        output_lines = get_unique_benchmark_names(json)
        print("\n")
        print("\n".join(output_lines))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            self.assertEqual(expect_lines[i], output_lines[i])


class TestReportDifference(unittest.TestCase):
    def load_results(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput1 = os.path.join(testInputs, 'test1_run1.json')
        testOutput2 = os.path.join(testInputs, 'test1_run2.json')
        with open(testOutput1, 'r') as f:
            json1 = json.load(f)
        with open(testOutput2, 'r') as f:
            json2 = json.load(f)
        return json1, json2

    def test_basic(self):
        expect_lines = [
            ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'],
            ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'],
            ['BM_2xSlower', '+1.0000', '+1.0000', '50', '100', '50', '100'],
            ['BM_1PercentFaster', '-0.0100', '-0.0100', '100', '99', '100', '99'],
            ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'],
            ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'],
            ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'],
            ['BM_100xSlower', '+99.0000', '+99.0000',
                '100', '10000', '100', '10000'],
            ['BM_100xFaster', '-0.9900', '-0.9900',
                '10000', '100', '10000', '100'],
            ['BM_10PercentCPUToTime', '+0.1000',
                '-0.1000', '100', '110', '100', '90'],
            ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
            ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
        ]
        json1, json2 = self.load_results()
        output_lines_with_header = generate_difference_report(
            json1, json2, use_color=False)
        output_lines = output_lines_with_header[2:]
        print("\n")
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            parts = [x for x in output_lines[i].split(' ') if x]
            self.assertEqual(len(parts), 7)
            self.assertEqual(expect_lines[i], parts)


class TestReportDifferenceBetweenFamilies(unittest.TestCase):
    def load_result(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput = os.path.join(testInputs, 'test2_run.json')
        with open(testOutput, 'r') as f:
            json = json.load(f)
        return json

    def test_basic(self):
        expect_lines = [
            ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'],
            ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
            ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
            ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
        ]
        json = self.load_result()
        json1 = filter_benchmark(json, "BM_Z.ro", ".")
        json2 = filter_benchmark(json, "BM_O.e", ".")
        output_lines_with_header = generate_difference_report(
            json1, json2, use_color=False)
        output_lines = output_lines_with_header[2:]
        print("\n")
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            parts = [x for x in output_lines[i].split(' ') if x]
            self.assertEqual(len(parts), 7)
            self.assertEqual(expect_lines[i], parts)


class TestReportDifferenceWithUTest(unittest.TestCase):
    def load_results(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
        with open(testOutput1, 'r') as f:
            json1 = json.load(f)
        with open(testOutput2, 'r') as f:
            json2 = json.load(f)
        return json1, json2

    def test_utest(self):
        expect_lines = [
            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
            ['BM_Two_pvalue',
             '0.6985',
             '0.6985',
             'U',
             'Test,',
             'Repetitions:',
             '2',
             'vs',
             '2.',
             'WARNING:',
             'Results',
             'unreliable!',
             '9+',
             'repetitions',
             'recommended.'],
            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
            ['short_pvalue',
             '0.7671',
             '0.1489',
             'U',
             'Test,',
             'Repetitions:',
             '2',
             'vs',
             '3.',
             'WARNING:',
             'Results',
             'unreliable!',
             '9+',
             'repetitions',
             'recommended.'],
            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
        ]
        json1, json2 = self.load_results()
        output_lines_with_header = generate_difference_report(
            json1, json2, utest=True, utest_alpha=0.05, use_color=False)
        output_lines = output_lines_with_header[2:]
        print("\n")
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            parts = [x for x in output_lines[i].split(' ') if x]
            self.assertEqual(expect_lines[i], parts)


class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
        unittest.TestCase):
    def load_results(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
        with open(testOutput1, 'r') as f:
            json1 = json.load(f)
        with open(testOutput2, 'r') as f:
            json2 = json.load(f)
        return json1, json2

    def test_utest(self):
        expect_lines = [
            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
            ['BM_Two_pvalue',
             '0.6985',
             '0.6985',
             'U',
             'Test,',
             'Repetitions:',
             '2',
             'vs',
             '2.',
             'WARNING:',
             'Results',
             'unreliable!',
             '9+',
             'repetitions',
             'recommended.'],
            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
            ['short_pvalue',
             '0.7671',
             '0.1489',
             'U',
             'Test,',
             'Repetitions:',
             '2',
             'vs',
             '3.',
             'WARNING:',
             'Results',
             'unreliable!',
             '9+',
             'repetitions',
             'recommended.'],
        ]
        json1, json2 = self.load_results()
        output_lines_with_header = generate_difference_report(
            json1, json2, display_aggregates_only=True,
            utest=True, utest_alpha=0.05, use_color=False)
        output_lines = output_lines_with_header[2:]
        print("\n")
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            parts = [x for x in output_lines[i].split(' ') if x]
            self.assertEqual(expect_lines[i], parts)


if __name__ == '__main__':
    unittest.main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
# kate: indent-mode python; remove-trailing-spaces modified;