1import unittest 2"""report.py - Utilities for reporting statistics about benchmark results 3""" 4import os 5import re 6import copy 7 8from scipy.stats import mannwhitneyu 9 10 11class BenchmarkColor(object): 12 def __init__(self, name, code): 13 self.name = name 14 self.code = code 15 16 def __repr__(self): 17 return '%s%r' % (self.__class__.__name__, 18 (self.name, self.code)) 19 20 def __format__(self, format): 21 return self.code 22 23 24# Benchmark Colors Enumeration 25BC_NONE = BenchmarkColor('NONE', '') 26BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m') 27BC_CYAN = BenchmarkColor('CYAN', '\033[96m') 28BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m') 29BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m') 30BC_HEADER = BenchmarkColor('HEADER', '\033[92m') 31BC_WARNING = BenchmarkColor('WARNING', '\033[93m') 32BC_WHITE = BenchmarkColor('WHITE', '\033[97m') 33BC_FAIL = BenchmarkColor('FAIL', '\033[91m') 34BC_ENDC = BenchmarkColor('ENDC', '\033[0m') 35BC_BOLD = BenchmarkColor('BOLD', '\033[1m') 36BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m') 37 38UTEST_MIN_REPETITIONS = 2 39UTEST_OPTIMAL_REPETITIONS = 9 # Lowest reasonable number, More is better. 40UTEST_COL_NAME = "_pvalue" 41 42 43def color_format(use_color, fmt_str, *args, **kwargs): 44 """ 45 Return the result of 'fmt_str.format(*args, **kwargs)' after transforming 46 'args' and 'kwargs' according to the value of 'use_color'. If 'use_color' 47 is False then all color codes in 'args' and 'kwargs' are replaced with 48 the empty string. 49 """ 50 assert use_color is True or use_color is False 51 if not use_color: 52 args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE 53 for arg in args] 54 kwargs = {key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE 55 for key, arg in kwargs.items()} 56 return fmt_str.format(*args, **kwargs) 57 58 59def find_longest_name(benchmark_list): 60 """ 61 Return the length of the longest benchmark name in a given list of 62 benchmark JSON objects 63 """ 64 longest_name = 1 65 for bc in benchmark_list: 66 if len(bc['name']) > longest_name: 67 longest_name = len(bc['name']) 68 return longest_name 69 70 71def calculate_change(old_val, new_val): 72 """ 73 Return a float representing the decimal change between old_val and new_val. 74 """ 75 if old_val == 0 and new_val == 0: 76 return 0.0 77 if old_val == 0: 78 return float(new_val - old_val) / (float(old_val + new_val) / 2) 79 return float(new_val - old_val) / abs(old_val) 80 81 82def filter_benchmark(json_orig, family, replacement=""): 83 """ 84 Apply a filter to the json, and only leave the 'family' of benchmarks. 85 """ 86 regex = re.compile(family) 87 filtered = {} 88 filtered['benchmarks'] = [] 89 for be in json_orig['benchmarks']: 90 if not regex.search(be['name']): 91 continue 92 filteredbench = copy.deepcopy(be) # Do NOT modify the old name! 93 filteredbench['name'] = regex.sub(replacement, filteredbench['name']) 94 filtered['benchmarks'].append(filteredbench) 95 return filtered 96 97 98def get_unique_benchmark_names(json): 99 """ 100 While *keeping* the order, give all the unique 'names' used for benchmarks. 101 """ 102 seen = set() 103 uniqued = [x['name'] for x in json['benchmarks'] 104 if x['name'] not in seen and 105 (seen.add(x['name']) or True)] 106 return uniqued 107 108 109def intersect(list1, list2): 110 """ 111 Given two lists, get a new list consisting of the elements only contained 112 in *both of the input lists*, while preserving the ordering. 113 """ 114 return [x for x in list1 if x in list2] 115 116 117def is_potentially_comparable_benchmark(x): 118 return ('time_unit' in x and 'real_time' in x and 'cpu_time' in x) 119 120 121def partition_benchmarks(json1, json2): 122 """ 123 While preserving the ordering, find benchmarks with the same names in 124 both of the inputs, and group them. 125 (i.e. partition/filter into groups with common name) 126 """ 127 json1_unique_names = get_unique_benchmark_names(json1) 128 json2_unique_names = get_unique_benchmark_names(json2) 129 names = intersect(json1_unique_names, json2_unique_names) 130 partitions = [] 131 for name in names: 132 time_unit = None 133 # Pick the time unit from the first entry of the lhs benchmark. 134 # We should be careful not to crash with unexpected input. 135 for x in json1['benchmarks']: 136 if (x['name'] == name and is_potentially_comparable_benchmark(x)): 137 time_unit = x['time_unit'] 138 break 139 if time_unit is None: 140 continue 141 # Filter by name and time unit. 142 # All the repetitions are assumed to be comparable. 143 lhs = [x for x in json1['benchmarks'] if x['name'] == name and 144 x['time_unit'] == time_unit] 145 rhs = [x for x in json2['benchmarks'] if x['name'] == name and 146 x['time_unit'] == time_unit] 147 partitions.append([lhs, rhs]) 148 return partitions 149 150 151def extract_field(partition, field_name): 152 # The count of elements may be different. We want *all* of them. 153 lhs = [x[field_name] for x in partition[0]] 154 rhs = [x[field_name] for x in partition[1]] 155 return [lhs, rhs] 156 157def calc_utest(timings_cpu, timings_time): 158 min_rep_cnt = min(len(timings_time[0]), 159 len(timings_time[1]), 160 len(timings_cpu[0]), 161 len(timings_cpu[1])) 162 163 # Does *everything* has at least UTEST_MIN_REPETITIONS repetitions? 164 if min_rep_cnt < UTEST_MIN_REPETITIONS: 165 return False, None, None 166 167 time_pvalue = mannwhitneyu( 168 timings_time[0], timings_time[1], alternative='two-sided').pvalue 169 cpu_pvalue = mannwhitneyu( 170 timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue 171 172 return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue 173 174def print_utest(partition, utest_alpha, first_col_width, use_color=True): 175 def get_utest_color(pval): 176 return BC_FAIL if pval >= utest_alpha else BC_OKGREEN 177 178 timings_time = extract_field(partition, 'real_time') 179 timings_cpu = extract_field(partition, 'cpu_time') 180 have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time) 181 182 # Check if we failed miserably with minimum required repetitions for utest 183 if not have_optimal_repetitions and cpu_pvalue is None and time_pvalue is None: 184 return [] 185 186 dsc = "U Test, Repetitions: {} vs {}".format( 187 len(timings_cpu[0]), len(timings_cpu[1])) 188 dsc_color = BC_OKGREEN 189 190 # We still got some results to show but issue a warning about it. 191 if not have_optimal_repetitions: 192 dsc_color = BC_WARNING 193 dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format( 194 UTEST_OPTIMAL_REPETITIONS) 195 196 special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{} {}" 197 198 last_name = partition[0][0]['name'] 199 return [color_format(use_color, 200 special_str, 201 BC_HEADER, 202 "{}{}".format(last_name, UTEST_COL_NAME), 203 first_col_width, 204 get_utest_color(time_pvalue), time_pvalue, 205 get_utest_color(cpu_pvalue), cpu_pvalue, 206 dsc_color, dsc, 207 endc=BC_ENDC)] 208 209 210def generate_difference_report( 211 json1, 212 json2, 213 display_aggregates_only=False, 214 utest=False, 215 utest_alpha=0.05, 216 use_color=True): 217 """ 218 Calculate and report the difference between each test of two benchmarks 219 runs specified as 'json1' and 'json2'. 220 """ 221 assert utest is True or utest is False 222 first_col_width = find_longest_name(json1['benchmarks']) 223 224 def find_test(name): 225 for b in json2['benchmarks']: 226 if b['name'] == name: 227 return b 228 return None 229 230 first_col_width = max( 231 first_col_width, 232 len('Benchmark')) 233 first_col_width += len(UTEST_COL_NAME) 234 first_line = "{:<{}s}Time CPU Time Old Time New CPU Old CPU New".format( 235 'Benchmark', 12 + first_col_width) 236 output_strs = [first_line, '-' * len(first_line)] 237 238 partitions = partition_benchmarks(json1, json2) 239 for partition in partitions: 240 # Careful, we may have different repetition count. 241 for i in range(min(len(partition[0]), len(partition[1]))): 242 bn = partition[0][i] 243 other_bench = partition[1][i] 244 245 # *If* we were asked to only display aggregates, 246 # and if it is non-aggregate, then skip it. 247 if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench: 248 assert bn['run_type'] == other_bench['run_type'] 249 if bn['run_type'] != 'aggregate': 250 continue 251 252 fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}" 253 254 def get_color(res): 255 if res > 0.05: 256 return BC_FAIL 257 elif res > -0.07: 258 return BC_WHITE 259 else: 260 return BC_CYAN 261 262 tres = calculate_change(bn['real_time'], other_bench['real_time']) 263 cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time']) 264 output_strs += [color_format(use_color, 265 fmt_str, 266 BC_HEADER, 267 bn['name'], 268 first_col_width, 269 get_color(tres), 270 tres, 271 get_color(cpures), 272 cpures, 273 bn['real_time'], 274 other_bench['real_time'], 275 bn['cpu_time'], 276 other_bench['cpu_time'], 277 endc=BC_ENDC)] 278 279 # After processing the whole partition, if requested, do the U test. 280 if utest: 281 output_strs += print_utest(partition, 282 utest_alpha=utest_alpha, 283 first_col_width=first_col_width, 284 use_color=use_color) 285 286 return output_strs 287 288 289############################################################################### 290# Unit tests 291 292 293class TestGetUniqueBenchmarkNames(unittest.TestCase): 294 def load_results(self): 295 import json 296 testInputs = os.path.join( 297 os.path.dirname( 298 os.path.realpath(__file__)), 299 'Inputs') 300 testOutput = os.path.join(testInputs, 'test3_run0.json') 301 with open(testOutput, 'r') as f: 302 json = json.load(f) 303 return json 304 305 def test_basic(self): 306 expect_lines = [ 307 'BM_One', 308 'BM_Two', 309 'short', # These two are not sorted 310 'medium', # These two are not sorted 311 ] 312 json = self.load_results() 313 output_lines = get_unique_benchmark_names(json) 314 print("\n") 315 print("\n".join(output_lines)) 316 self.assertEqual(len(output_lines), len(expect_lines)) 317 for i in range(0, len(output_lines)): 318 self.assertEqual(expect_lines[i], output_lines[i]) 319 320 321class TestReportDifference(unittest.TestCase): 322 def load_results(self): 323 import json 324 testInputs = os.path.join( 325 os.path.dirname( 326 os.path.realpath(__file__)), 327 'Inputs') 328 testOutput1 = os.path.join(testInputs, 'test1_run1.json') 329 testOutput2 = os.path.join(testInputs, 'test1_run2.json') 330 with open(testOutput1, 'r') as f: 331 json1 = json.load(f) 332 with open(testOutput2, 'r') as f: 333 json2 = json.load(f) 334 return json1, json2 335 336 def test_basic(self): 337 expect_lines = [ 338 ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'], 339 ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'], 340 ['BM_2xSlower', '+1.0000', '+1.0000', '50', '100', '50', '100'], 341 ['BM_1PercentFaster', '-0.0100', '-0.0100', '100', '99', '100', '99'], 342 ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'], 343 ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'], 344 ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'], 345 ['BM_100xSlower', '+99.0000', '+99.0000', 346 '100', '10000', '100', '10000'], 347 ['BM_100xFaster', '-0.9900', '-0.9900', 348 '10000', '100', '10000', '100'], 349 ['BM_10PercentCPUToTime', '+0.1000', 350 '-0.1000', '100', '110', '100', '90'], 351 ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'], 352 ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'], 353 ] 354 json1, json2 = self.load_results() 355 output_lines_with_header = generate_difference_report( 356 json1, json2, use_color=False) 357 output_lines = output_lines_with_header[2:] 358 print("\n") 359 print("\n".join(output_lines_with_header)) 360 self.assertEqual(len(output_lines), len(expect_lines)) 361 for i in range(0, len(output_lines)): 362 parts = [x for x in output_lines[i].split(' ') if x] 363 self.assertEqual(len(parts), 7) 364 self.assertEqual(expect_lines[i], parts) 365 366 367class TestReportDifferenceBetweenFamilies(unittest.TestCase): 368 def load_result(self): 369 import json 370 testInputs = os.path.join( 371 os.path.dirname( 372 os.path.realpath(__file__)), 373 'Inputs') 374 testOutput = os.path.join(testInputs, 'test2_run.json') 375 with open(testOutput, 'r') as f: 376 json = json.load(f) 377 return json 378 379 def test_basic(self): 380 expect_lines = [ 381 ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'], 382 ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'], 383 ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'], 384 ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'], 385 ] 386 json = self.load_result() 387 json1 = filter_benchmark(json, "BM_Z.ro", ".") 388 json2 = filter_benchmark(json, "BM_O.e", ".") 389 output_lines_with_header = generate_difference_report( 390 json1, json2, use_color=False) 391 output_lines = output_lines_with_header[2:] 392 print("\n") 393 print("\n".join(output_lines_with_header)) 394 self.assertEqual(len(output_lines), len(expect_lines)) 395 for i in range(0, len(output_lines)): 396 parts = [x for x in output_lines[i].split(' ') if x] 397 self.assertEqual(len(parts), 7) 398 self.assertEqual(expect_lines[i], parts) 399 400 401class TestReportDifferenceWithUTest(unittest.TestCase): 402 def load_results(self): 403 import json 404 testInputs = os.path.join( 405 os.path.dirname( 406 os.path.realpath(__file__)), 407 'Inputs') 408 testOutput1 = os.path.join(testInputs, 'test3_run0.json') 409 testOutput2 = os.path.join(testInputs, 'test3_run1.json') 410 with open(testOutput1, 'r') as f: 411 json1 = json.load(f) 412 with open(testOutput2, 'r') as f: 413 json2 = json.load(f) 414 return json1, json2 415 416 def test_utest(self): 417 expect_lines = [] 418 expect_lines = [ 419 ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], 420 ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'], 421 ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'], 422 ['BM_Two_pvalue', 423 '0.6985', 424 '0.6985', 425 'U', 426 'Test,', 427 'Repetitions:', 428 '2', 429 'vs', 430 '2.', 431 'WARNING:', 432 'Results', 433 'unreliable!', 434 '9+', 435 'repetitions', 436 'recommended.'], 437 ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'], 438 ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'], 439 ['short_pvalue', 440 '0.7671', 441 '0.1489', 442 'U', 443 'Test,', 444 'Repetitions:', 445 '2', 446 'vs', 447 '3.', 448 'WARNING:', 449 'Results', 450 'unreliable!', 451 '9+', 452 'repetitions', 453 'recommended.'], 454 ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'], 455 ] 456 json1, json2 = self.load_results() 457 output_lines_with_header = generate_difference_report( 458 json1, json2, utest=True, utest_alpha=0.05, use_color=False) 459 output_lines = output_lines_with_header[2:] 460 print("\n") 461 print("\n".join(output_lines_with_header)) 462 self.assertEqual(len(output_lines), len(expect_lines)) 463 for i in range(0, len(output_lines)): 464 parts = [x for x in output_lines[i].split(' ') if x] 465 self.assertEqual(expect_lines[i], parts) 466 467 468class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly( 469 unittest.TestCase): 470 def load_results(self): 471 import json 472 testInputs = os.path.join( 473 os.path.dirname( 474 os.path.realpath(__file__)), 475 'Inputs') 476 testOutput1 = os.path.join(testInputs, 'test3_run0.json') 477 testOutput2 = os.path.join(testInputs, 'test3_run1.json') 478 with open(testOutput1, 'r') as f: 479 json1 = json.load(f) 480 with open(testOutput2, 'r') as f: 481 json2 = json.load(f) 482 return json1, json2 483 484 def test_utest(self): 485 expect_lines = [] 486 expect_lines = [ 487 ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], 488 ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'], 489 ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'], 490 ['BM_Two_pvalue', 491 '0.6985', 492 '0.6985', 493 'U', 494 'Test,', 495 'Repetitions:', 496 '2', 497 'vs', 498 '2.', 499 'WARNING:', 500 'Results', 501 'unreliable!', 502 '9+', 503 'repetitions', 504 'recommended.'], 505 ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'], 506 ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'], 507 ['short_pvalue', 508 '0.7671', 509 '0.1489', 510 'U', 511 'Test,', 512 'Repetitions:', 513 '2', 514 'vs', 515 '3.', 516 'WARNING:', 517 'Results', 518 'unreliable!', 519 '9+', 520 'repetitions', 521 'recommended.'], 522 ] 523 json1, json2 = self.load_results() 524 output_lines_with_header = generate_difference_report( 525 json1, json2, display_aggregates_only=True, 526 utest=True, utest_alpha=0.05, use_color=False) 527 output_lines = output_lines_with_header[2:] 528 print("\n") 529 print("\n".join(output_lines_with_header)) 530 self.assertEqual(len(output_lines), len(expect_lines)) 531 for i in range(0, len(output_lines)): 532 parts = [x for x in output_lines[i].split(' ') if x] 533 self.assertEqual(expect_lines[i], parts) 534 535 536if __name__ == '__main__': 537 unittest.main() 538 539# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 540# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off; 541# kate: indent-mode python; remove-trailing-spaces modified; 542