1# Copyright 2019 The Chromium Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5from __future__ import print_function 6 7import argparse 8import json 9import logging 10import os 11import shutil 12import subprocess 13import sys 14import tempfile 15 16TOOLS_PERF_PATH = os.path.abspath(os.path.join( 17 os.path.dirname(__file__), '..', '..')) 18sys.path.insert(1, TOOLS_PERF_PATH) 19 20from experimental.story_clustering import similarity_calculator 21from experimental.story_clustering import cluster_stories 22from experimental.story_clustering import create_soundwave_input 23from core.external_modules import pandas 24 25 26def CalculateDistances( 27 all_bots_dataframe, 28 bots, 29 rolling_window, 30 metric_name, 31 normalize = False): 32 timeseries = [] 33 34 for bot_name, bot_group in all_bots_dataframe.groupby(bots): 35 temp_dataframe = bot_group.pivot(index='test_case', 36 columns='commit_pos', values='value') 37 temp_dataframe_with_solling_avg = temp_dataframe.rolling( 38 rolling_window, 39 min_periods=1, 40 axis=1 41 ).mean().stack().rename('value').reset_index() 42 43 temp_dataframe_with_solling_avg['bot'] = bot_name 44 timeseries.append(temp_dataframe_with_solling_avg) 45 46 all_bots = pandas.concat(timeseries) 47 distance_matrix = similarity_calculator.CalculateDistances( 48 all_bots, 49 metric=metric_name, 50 normalize=normalize, 51 ) 52 print('Similarities are calculated for', metric_name) 53 54 return distance_matrix 55 56 57def Main(argv): 58 parser = argparse.ArgumentParser( 59 description=('Gathers the values of each metric and platfrom pair in a' 60 ' csv file to be used in clustering of stories.')) 61 parser.add_argument('benchmark', type=str, help='benchmark to be used') 62 parser.add_argument('--metrics', type=str, nargs='*', 63 help='List of metrics to use') 64 parser.add_argument('--platforms', type=str, nargs='*', 65 help='List of platforms to use') 66 parser.add_argument('--testcases-path', type=str, help=('Path to the file ' 67 'containing a list of all test_cases in the benchmark that needs to ' 68 'be clustered')) 69 parser.add_argument('--days', default=180, help=('Number of days to gather' 70 ' data about')) 71 parser.add_argument('--output-path', type=str, help='Output file', 72 default='//tmp/story_clustering/clusters.json') 73 parser.add_argument('--max-cluster-count', default='10', 74 help='Number of not valid clusters needed') 75 parser.add_argument('--min-cluster-size', default='2', help=('Least number ' 76 'of members in cluster, to make cluster valied')) 77 parser.add_argument('--rolling-window', default='1', help=('Number of ' 78 'samples to take average from while calculating the moving average')) 79 parser.add_argument('--normalize', default=False, 80 help='Normalize timeseries to calculate similarity', action='store_true') 81 parser.add_argument('--processes', default='20', help=('Number of ' 82 'concurrent workers used by soundwave.')) 83 args = parser.parse_args(argv[1:]) 84 85 temp_dir = tempfile.mkdtemp('telemetry') 86 startup_timeseries = os.path.join(temp_dir, 'startup_timeseries.json') 87 soundwave_output_path = os.path.join(temp_dir, 'data.csv') 88 soundwave_path = os.path.join(TOOLS_PERF_PATH, 'soundwave') 89 90 try: 91 output_dir = os.path.dirname(args.output_path) 92 clusters_json = {} 93 94 if not os.path.isdir(output_dir): 95 os.makedirs(output_dir) 96 97 # creating the json file needed for soundwave 98 create_soundwave_input.CreateInput( 99 test_suite=args.benchmark, 100 platforms=args.platforms, 101 metrics=args.metrics, 102 test_cases_path=args.testcases_path, 103 output_dir=startup_timeseries) 104 105 subprocess.call([ 106 soundwave_path, 107 '-d', args.days, 108 '--processes', args.processes, 109 'timeseries', 110 '-i', startup_timeseries, 111 '--output-csv', soundwave_output_path 112 ]) 113 114 # Processing the data. 115 dataframe = pandas.read_csv(soundwave_output_path) 116 dataframe_per_metric = dataframe.groupby(dataframe['measurement']) 117 for metric_name, all_bots in list(dataframe_per_metric): 118 clusters_json[metric_name] = [] 119 120 distance_matrix = CalculateDistances( 121 all_bots_dataframe=all_bots, 122 bots=dataframe['bot'], 123 rolling_window=int(args.rolling_window), 124 metric_name=metric_name, 125 normalize=args.normalize) 126 127 clusters, coverage = cluster_stories.RunHierarchicalClustering( 128 distance_matrix, 129 max_cluster_count=int(args.max_cluster_count), 130 min_cluster_size=int(args.min_cluster_size), 131 ) 132 print() 133 print(metric_name, ':') 134 print(format(coverage * 100.0, '.1f'), 'percent coverage.') 135 print('Stories are grouped into', len(clusters), 'clusters.') 136 print('representatives:') 137 for cluster in clusters: 138 print (cluster.GetRepresentative()) 139 print() 140 141 for cluster in clusters: 142 clusters_json[metric_name].append(cluster.AsDict()) 143 144 with open(args.output_path, 'w') as outfile: 145 json.dump( 146 clusters_json, 147 outfile, 148 separators=(',',': '), 149 indent=4, 150 sort_keys=True 151 ) 152 153 except Exception: 154 logging.exception('The following exception may have prevented the code' 155 ' from clustering stories.') 156 finally: 157 shutil.rmtree(temp_dir, ignore_errors=True) 158 159if __name__ == '__main__': 160 sys.exit(Main(sys.argv)) 161