# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
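"""Clusters benchmark stories based on the similarity of their timeseries.

Gathers timeseries data for the given benchmark, metrics, and platforms using
soundwave, computes pairwise distances between stories for each metric, runs
hierarchical clustering on the distances, and writes the resulting clusters
to a JSON file.
"""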
from __future__ import print_function

import argparse
import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile

TOOLS_PERF_PATH = os.path.abspath(os.path.join(
  os.path.dirname(__file__), '..', '..'))
sys.path.insert(1, TOOLS_PERF_PATH)

from experimental.story_clustering import similarity_calculator
from experimental.story_clustering import cluster_stories
from experimental.story_clustering import create_soundwave_input
from core.external_modules import pandas


def CalculateDistances(
  all_bots_dataframe,
  bots,
  rolling_window,
  metric_name,
  normalize=False):
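  """Computes pairwise distances between story timeseries for one metric.

  For every bot, each story's timeseries is smoothed with a rolling mean over
  `rolling_window` samples; the smoothed series of all bots are then passed to
  similarity_calculator.CalculateDistances to obtain the distance matrix.

  Args:
    all_bots_dataframe: DataFrame holding the rows of one metric for all bots.
    bots: Series used to group the rows by bot.
    rolling_window: Number of samples in the rolling average window.
    metric_name: Name of the metric, used for logging.
    normalize: Whether to normalize the timeseries when comparing them.

  Returns:
    The distance matrix returned by similarity_calculator.CalculateDistances.
  """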
  timeseries = []

  for bot_name, bot_group in all_bots_dataframe.groupby(bots):
    temp_dataframe = bot_group.pivot(index='test_case',
      columns='commit_pos', values='value')
    temp_dataframe_with_rolling_avg = temp_dataframe.rolling(
      rolling_window,
      min_periods=1,
      axis=1
    ).mean().stack().rename('value').reset_index()

    temp_dataframe_with_rolling_avg['bot'] = bot_name
    timeseries.append(temp_dataframe_with_rolling_avg)

  all_bots = pandas.concat(timeseries)
  distance_matrix = similarity_calculator.CalculateDistances(
    all_bots,
    metric=metric_name,
    normalize=normalize,
  )
  print('Similarities are calculated for', metric_name)

  return distance_matrix


def Main(argv):
  parser = argparse.ArgumentParser(
    description=('Gathers the values of each metric and platform pair into a'
    ' CSV file to be used for clustering stories.'))
  parser.add_argument('benchmark', type=str, help='Benchmark to be used')
  parser.add_argument('--metrics', type=str, nargs='*',
    help='List of metrics to use')
  parser.add_argument('--platforms', type=str, nargs='*',
    help='List of platforms to use')
  parser.add_argument('--testcases-path', type=str, help=('Path to the file '
    'containing the list of all test_cases in the benchmark that need to '
    'be clustered'))
  parser.add_argument('--days', default='180', help=('Number of days of data'
    ' to gather'))
  parser.add_argument('--output-path', type=str, help='Output file',
    default='/tmp/story_clustering/clusters.json')
  parser.add_argument('--max-cluster-count', default='10',
    help='Maximum number of clusters to create')
  parser.add_argument('--min-cluster-size', default='2', help=('Minimum '
    'number of members a cluster needs to be considered valid'))
  parser.add_argument('--rolling-window', default='1', help=('Number of '
    'samples to average over when computing the moving average'))
  parser.add_argument('--normalize', default=False,
    help='Normalize timeseries to calculate similarity', action='store_true')
  parser.add_argument('--processes', default='20', help=('Number of '
    'concurrent workers used by soundwave.'))
  args = parser.parse_args(argv[1:])

  temp_dir = tempfile.mkdtemp('telemetry')
  startup_timeseries = os.path.join(temp_dir, 'startup_timeseries.json')
  soundwave_output_path = os.path.join(temp_dir, 'data.csv')
  soundwave_path = os.path.join(TOOLS_PERF_PATH, 'soundwave')

  try:
    output_dir = os.path.dirname(args.output_path)
    clusters_json = {}

    if not os.path.isdir(output_dir):
      os.makedirs(output_dir)

    # Create the JSON file used as input for soundwave.
    create_soundwave_input.CreateInput(
      test_suite=args.benchmark,
      platforms=args.platforms,
      metrics=args.metrics,
      test_cases_path=args.testcases_path,
      output_dir=startup_timeseries)

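    # Fetch the requested timeseries with soundwave and export them to a CSV
    # file at soundwave_output_path.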
    subprocess.call([
      soundwave_path,
      '-d', args.days,
      '--processes', args.processes,
      'timeseries',
      '-i', startup_timeseries,
      '--output-csv', soundwave_output_path
    ])

    # Process the data.
    dataframe = pandas.read_csv(soundwave_output_path)
    dataframe_per_metric = dataframe.groupby(dataframe['measurement'])
    for metric_name, all_bots in list(dataframe_per_metric):
      clusters_json[metric_name] = []

      distance_matrix = CalculateDistances(
        all_bots_dataframe=all_bots,
        bots=dataframe['bot'],
        rolling_window=int(args.rolling_window),
        metric_name=metric_name,
        normalize=args.normalize)

      clusters, coverage = cluster_stories.RunHierarchicalClustering(
        distance_matrix,
        max_cluster_count=int(args.max_cluster_count),
        min_cluster_size=int(args.min_cluster_size),
      )
      print()
      print(metric_name, ':')
      print(format(coverage * 100.0, '.1f'), 'percent coverage.')
      print('Stories are grouped into', len(clusters), 'clusters.')
      print('Representatives:')
      for cluster in clusters:
        print(cluster.GetRepresentative())
      print()

      for cluster in clusters:
        clusters_json[metric_name].append(cluster.AsDict())

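    # Write the clusters computed for every metric to the output JSON file.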
    with open(args.output_path, 'w') as outfile:
      json.dump(
        clusters_json,
        outfile,
        separators=(',', ': '),
        indent=4,
        sort_keys=True
      )

  except Exception:
    logging.exception(
      'Clustering stories failed with the following exception.')
  finally:
    shutil.rmtree(temp_dir, ignore_errors=True)


if __name__ == '__main__':
  sys.exit(Main(sys.argv))