1# Copyright 2020 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4"""Splits a XML file into smaller XMLs in subfolders.
5
6Splits nodes according to the first camelcase part of their name attribute.
7Intended to be used to split up the large histograms.xml or enums.xml file.
8"""
9
10import os
11import re
12from xml.dom import minidom
13
14import histogram_configuration_model
15import histogram_paths
16import merge_xml
17import path_util
18
# Top-level comment texts that _CreateXMLFile() inserts, in this order, as XML
# comments at the top of every split xml it writes.
# The first one is the license header and is inserted verbatim.
FIRST_TOP_LEVEL_COMMENT_TEMPLATE = """
Copyright 2020 The Chromium Authors. All rights reserved.
Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file.
"""
# The second one contains a single %s placeholder that _CreateXMLFile() fills
# with its |comment| argument (e.g. 'obsolete histograms').
SECOND_TOP_LEVEL_COMMENT_TEMPLATE = """
This file is used to generate a comprehensive list of %s
along with a detailed description for each histogram.

For best practices on writing histogram descriptions, see
https://chromium.googlesource.com/chromium/src.git/+/HEAD/tools/metrics/histograms/README.md

Please send CLs to chromium-metrics-reviews@google.com rather than to specific
individuals. These CLs will be automatically reassigned to a reviewer within
about 5 minutes. This approach helps the metrics team to load-balance incoming
reviews. Googlers can read more about this at go/gwsq-gerrit.
"""
# Number of times that splitting of histograms will be carried out:
# _BuildDocumentDict() stops recursing once its |depth| equals this value.
TARGET_DEPTH = 1
# The number of nodes below which a prefix group is not kept on its own:
# _AggregateMinorNodes() moves every group with fewer nodes than this into the
# 'others' group.
AGGREGATE_THRESHOLD = 20
# A map from a dot-separated part of a histogram name to the prefix that
# _GetCamelCaseName() returns for it. A name part that exactly matches a key
# bypasses the camelcase splitting and yields the mapped value instead, so
# several keys can share one prefix (e.g. both 'CustomTab' and 'CustomTabs' map
# to 'CustomTabs').
_PREDEFINED_NAMES_MAPPING = {
    'BackForwardCache': 'BackForwardCache',
    'ChromeOS': 'ChromeOS',
    'CustomTabs': 'CustomTabs',
    'CustomTab': 'CustomTabs',
    'DataReductionProxy': 'DataReductionProxy',
    'DataUse': 'DataUse',
    'MultiDevice': 'MultiDevice',
    'NaCl': 'NaCl',
    'SafeBrowsing': 'SafeBrowsing',
    'SafeBrowsingBinaryUploadRequest': 'SafeBrowsing',
    'SafeBrowsingFCMService': 'SafeBrowsing',
    'NewTabPage': 'NewTabPage',
    'SiteEngagementService': 'SiteEngagementService',
    'SiteIsolation': 'SiteIsolation',
    'Tabs': 'Tab',
    'TextFragmentAnchor': 'TextFragmentAnchor',
    'TextToSpeech': 'TextToSpeech',
    'UpdateEngine': 'UpdateEngine',
    'WebApk': 'WebApk',
    'WebApp': 'WebApp',
    'WebAudio': 'WebAudio',
    'WebAuthentication': 'WebAuthentication',
    'WebCore': 'WebCore',
    'WebFont': 'WebFont',
    'WebHistory': 'WebHistory',
    'WebRTC': 'WebRTC',
    'WebRtcEventLogging': 'WebRTC',
    'WebRtcTextLogging': 'WebRTC',
    'WebUI': 'WebUI',
    'WebUITabStrip': 'WebUI',
}
77
78
def _ParseMergedXML():
  """Merges the histogram XMLs and collects the node types to be split.

  The merged document is built from |histogram_paths.HISTOGRAMS_XMLS| plus
  |histogram_paths.OBSOLETE_XML|.

  Returns:
    A tuple of three node lists taken from the merged document, in this order:
    the <histogram> nodes, the <variants> nodes and the <histogram_suffixes>
    nodes.
  """
  xml_paths = histogram_paths.HISTOGRAMS_XMLS + [histogram_paths.OBSOLETE_XML]
  merged_doc = merge_xml.MergeFiles(xml_paths)
  tag_names = ('histogram', 'variants', 'histogram_suffixes')
  return tuple(merged_doc.getElementsByTagName(tag) for tag in tag_names)
88
89
def _CreateXMLFile(comment, parent_node_string, nodes, output_dir, filename):
  """Writes |nodes| into a new pretty-printed XML file.

  The resulting document starts with two top-level comments, followed by a
  <histogram-configuration> root that holds a single |parent_node_string|
  element, e.g. <histograms> or <histogram_suffixes_list>, which in turn wraps
  all the |nodes|.

  Args:
    comment: The string substituted into |SECOND_TOP_LEVEL_COMMENT_TEMPLATE|,
        which is added on top of each split xml.
    parent_node_string: The tag name of the second-level parent node, e.g.
        'histograms' or 'histogram_suffixes_list'.
    nodes: A DOM NodeList object or a list containing <histogram> or
        <histogram_suffixes> nodes to insert under the parent node.
    output_dir: The output directory.
    filename: The output filename.
  """
  doc = minidom.Document()

  # The license header comes first, then the formatted description comment.
  top_level_comments = (FIRST_TOP_LEVEL_COMMENT_TEMPLATE,
                        SECOND_TOP_LEVEL_COMMENT_TEMPLATE % comment)
  for comment_text in top_level_comments:
    doc.appendChild(doc.createComment(comment_text))

  # Build <histogram-configuration><parent_node_string/> as the skeleton.
  config_root = doc.createElement('histogram-configuration')
  doc.appendChild(config_root)
  wrapper = doc.createElement(parent_node_string)
  config_root.appendChild(wrapper)

  # Hang every given node below the wrapper element.
  for child in nodes:
    wrapper.appendChild(child)

  # Drop any previous output before writing the new file.
  target_path = os.path.join(output_dir, filename)
  if os.path.exists(target_path):
    os.remove(target_path)

  # The model produces the pretty-printed XML string that gets written out.
  with open(target_path, 'w') as xml_file:
    xml_file.write(histogram_configuration_model.PrettifyTree(doc))
131
132
def _GetCamelCaseName(node, depth=0):
  """Returns the leading camelcase word of one part of the |node|'s name.

  The name attribute is split on '.', and the part at index |depth| is used.

  Args:
    node: The node to get the name from.
    depth: The index of the dot-separated name part to look at.
        e.g. For a node of name
        'CustomTabs.DynamicModule.CreatePackageContextTime'
        The returned name for depth 0 is 'CustomTabs' (a predefined name);
        The returned name for depth 1 is 'Dynamic';
        The returned name for depth 2 is 'Create'.

        Default depth is set to 0 as this function is imported and
        used in other files, where depth used is 0.

  Returns:
    'others' if the name has no part at |depth|. Otherwise, the value from
    |_PREDEFINED_NAMES_MAPPING| if the part is one of its keys. Otherwise, the
    part cut right before the first uppercase letter that comes after a
    lowercase letter or a digit, or the whole part if there is no such letter.
  """
  name_parts = node.getAttribute('name').split('.')
  if depth >= len(name_parts):
    return 'others'

  part = name_parts[depth]
  if part in _PREDEFINED_NAMES_MAPPING:
    return _PREDEFINED_NAMES_MAPPING[part]

  # A leading run of uppercase letters must not be split letter by letter, so
  # an uppercase letter only ends the first word once a lowercase letter or a
  # digit has been seen, e.g. 'ABCDeltaFoo' is cut before 'Foo'.
  cut_index = 0
  seen_lower_or_digit = False
  for position, char in enumerate(part):
    if char.islower() or char.isnumeric():
      seen_lower_or_digit = True
    if seen_lower_or_digit and char.isupper():
      cut_index = position
      break

  # A cut index of 0 means no split point was found: keep the whole part.
  return part[:cut_index] if cut_index else part
175
176
def GetDirForNode(node):
  """Returns the directory name that the given |node| belongs to.

  The prefix from _GetCamelCaseName() is used as the directory only if it is
  listed in |histogram_paths.HISTOGRAMS_PREFIX_LIST|; any other node goes to
  the 'others' folder.
  """
  prefix = _GetCamelCaseName(node)
  known_prefixes = histogram_paths.HISTOGRAMS_PREFIX_LIST
  return prefix if prefix in known_prefixes else 'others'
185
186
def _CamelCaseToSnakeCase(name):
  """Returns the snake_case form of the CamelCase |name|.

  e.g. 'BackForwardCache' becomes 'back_forward_cache' and 'WebRTC' becomes
  'web_rtc'.
  """
  # First pass: put '_' before every capitalized word (an uppercase letter
  # followed by lowercase letters) that is not at the very start.
  with_word_breaks = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
  # Second pass: put '_' between a lowercase letter or digit and the uppercase
  # letter that directly follows it.
  fully_separated = re.sub('([a-z0-9])([A-Z])', r'\1_\2', with_word_breaks)
  return fully_separated.lower()
191
192
def _OutputToFolderAndXML(nodes, output_dir, key):
  """Writes the |nodes| of one prefix to histograms.xml in their own folder.

  The folder is the snake_case form of |key| under |output_dir| and is created
  if it does not exist yet.

  Args:
    nodes: A list of histogram/variants nodes of a prefix.
    output_dir: The output directory.
    key: The prefix of the histograms, also the name of the new folder.
  """
  # Directories are named in snake_case while the prefix is CamelCase.
  prefix_dir = os.path.join(output_dir, _CamelCaseToSnakeCase(key))
  if not os.path.exists(prefix_dir):
    os.makedirs(prefix_dir)

  file_description = key + ' histograms'
  _CreateXMLFile(file_description, 'histograms', nodes, prefix_dir,
                 'histograms.xml')
207
208
def _WriteDocumentDict(document_dict, output_dir):
  """Recursively writes |document_dict| to xmls in |output_dir|.

  A list value is written out as the xml of its prefix; any other value is
  treated as a nested dict and written below a subdirectory named after its
  key.

  Args:
    document_dict: A dictionary where the key is the prefix of the histogram and
        value is a list of nodes or another dict.
    output_dir: The output directory of the resulting folders.
  """
  for prefix, entry in document_dict.items():
    if not isinstance(entry, list):
      # A nested dict: descend one directory level.
      _WriteDocumentDict(entry, os.path.join(output_dir, prefix))
      continue
    _OutputToFolderAndXML(entry, output_dir, prefix)
222
223
def _AggregateMinorNodes(node_dict):
  """Aggregates groups of nodes below threshold number into 'others'.

  |node_dict| is modified in place: every group with fewer than
  |AGGREGATE_THRESHOLD| nodes is removed and its nodes are appended to the
  'others' group, in the iteration order of the dict. The 'others' key is only
  present afterwards if that group is non-empty.

  Args:
    node_dict: A dictionary where the key is the prefix of the histogram/variant
        and value is a list of histogram/variant nodes.
  """
  # 'others' is taken out first so that it is never aggregated into itself.
  others = node_dict.pop('others', [])

  # Collect the keys up front: removing entries while iterating over the dict
  # raises "RuntimeError: dictionary changed size during iteration" on
  # Python 3.
  minor_keys = [
      key for key, nodes in node_dict.items()
      if len(nodes) < AGGREGATE_THRESHOLD
  ]
  for key in minor_keys:
    others.extend(node_dict.pop(key))

  if others:
    node_dict['others'] = others
242
243
def _BuildDocumentDict(nodes, depth):
  """Recursively builds a document dict which will be written later.

  The nodes are grouped by the name returned from _GetCamelCaseName() at the
  given |depth|, and groups that are too small are merged into 'others' by
  _AggregateMinorNodes(). Every group except 'others' is then split again at
  the next depth until |TARGET_DEPTH| is reached.

  Args:
    nodes: A list of histogram nodes or variants node.
    depth: The current depth, starting from 0.

  Returns:
    |nodes| itself once |depth| equals |TARGET_DEPTH|. Otherwise a dict where
    the key is a name prefix and the value is the list of nodes for 'others',
    or the result of the recursive call for any other prefix.
  """
  if depth == TARGET_DEPTH:
    return nodes

  # Group the nodes by their name prefix at the current depth.
  groups = {}
  for node in nodes:
    groups.setdefault(_GetCamelCaseName(node, depth), []).append(node)

  # Aggregate keys with less than |AGGREGATE_THRESHOLD| values to 'others'.
  _AggregateMinorNodes(groups)

  # 'others' is kept as a flat list; every other group is split further.
  return {
      prefix: (members if prefix == 'others' else _BuildDocumentDict(
          members, depth + 1))
      for prefix, members in groups.items()
  }
279
280
def _SeparateObsoleteHistogram(histogram_nodes):
  """Separates a NodeList of histograms into obsolete and non-obsolete.

  A histogram counts as obsolete if it contains at least one <obsolete>
  element. The relative order of the nodes is kept in both results.

  Args:
    histogram_nodes: A NodeList object containing histogram nodes.

  Returns:
    obsolete_nodes: A list of obsolete nodes.
    non_obsolete_nodes: A list of non-obsolete nodes.
  """
  obsolete, current = [], []
  for histogram in histogram_nodes:
    has_obsolete_tag = len(histogram.getElementsByTagName('obsolete')) > 0
    destination = obsolete if has_obsolete_tag else current
    destination.append(histogram)
  return obsolete, current
300
301
def SplitIntoMultipleHistogramXMLs(output_base_dir):
  """Splits a large histograms.xml and writes out the split xmls.

  Three kinds of output are written below |output_base_dir|:
  histogram_suffixes_list.xml with all histogram suffixes,
  obsolete_histograms.xml with all obsolete histograms, and one folder per
  name prefix holding the remaining histograms and the variants.

  Args:
    output_base_dir: The output base directory.
  """
  if not os.path.exists(output_base_dir):
    os.mkdir(output_base_dir)

  histograms, variants, suffixes = _ParseMergedXML()

  # The histogram suffixes all go into one file of their own.
  _CreateXMLFile(comment='histogram suffixes',
                 parent_node_string='histogram_suffixes_list',
                 nodes=suffixes,
                 output_dir=output_base_dir,
                 filename='histogram_suffixes_list.xml')

  # So do the obsolete histograms.
  obsolete, non_obsolete = _SeparateObsoleteHistogram(histograms)
  _CreateXMLFile(comment='obsolete histograms',
                 parent_node_string='histograms',
                 nodes=obsolete,
                 output_dir=output_base_dir,
                 filename='obsolete_histograms.xml')

  # Everything else is grouped by name prefix and written folder by folder.
  split_tree = _BuildDocumentDict(non_obsolete + variants, 0)
  _WriteDocumentDict(split_tree, output_base_dir)
327
328
# When run as a script, write the split xmls into the directory that
# path_util.GetInputFile() returns for 'tools/metrics/histograms/histograms_xml'.
if __name__ == '__main__':
  SplitIntoMultipleHistogramXMLs(
      path_util.GetInputFile('tools/metrics/histograms/histograms_xml'))
332