1# Copyright 2020 The Chromium Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4"""Splits a XML file into smaller XMLs in subfolders. 5 6Splits nodes according to the first camelcase part of their name attribute. 7Intended to be used to split up the large histograms.xml or enums.xml file. 8""" 9 10import os 11import re 12from xml.dom import minidom 13 14import histogram_configuration_model 15import histogram_paths 16import merge_xml 17import path_util 18 19# The top level comment templates that will be formatted and added to each split 20# histograms xml. 21FIRST_TOP_LEVEL_COMMENT_TEMPLATE = """ 22Copyright 2020 The Chromium Authors. All rights reserved. 23Use of this source code is governed by a BSD-style license that can be 24found in the LICENSE file. 25""" 26SECOND_TOP_LEVEL_COMMENT_TEMPLATE = """ 27This file is used to generate a comprehensive list of %s 28along with a detailed description for each histogram. 29 30For best practices on writing histogram descriptions, see 31https://chromium.googlesource.com/chromium/src.git/+/HEAD/tools/metrics/histograms/README.md 32 33Please send CLs to chromium-metrics-reviews@google.com rather than to specific 34individuals. These CLs will be automatically reassigned to a reviewer within 35about 5 minutes. This approach helps the metrics team to load-balance incoming 36reviews. Googlers can read more about this at go/gwsq-gerrit. 37""" 38# Number of times that splitting of histograms will be carried out. 39TARGET_DEPTH = 1 40# The number of histograms below which they will be aggregated into 41# the histograms.xml in 'others'. 42AGGREGATE_THRESHOLD = 20 43# A map from the histogram name to the folder name these histograms should be 44# put in. 
_PREDEFINED_NAMES_MAPPING = {
    'BackForwardCache': 'BackForwardCache',
    'ChromeOS': 'ChromeOS',
    'CustomTabs': 'CustomTabs',
    'CustomTab': 'CustomTabs',
    'DataReductionProxy': 'DataReductionProxy',
    'DataUse': 'DataUse',
    'MultiDevice': 'MultiDevice',
    'NaCl': 'NaCl',
    'SafeBrowsing': 'SafeBrowsing',
    'SafeBrowsingBinaryUploadRequest': 'SafeBrowsing',
    'SafeBrowsingFCMService': 'SafeBrowsing',
    'NewTabPage': 'NewTabPage',
    'SiteEngagementService': 'SiteEngagementService',
    'SiteIsolation': 'SiteIsolation',
    'Tabs': 'Tab',
    'TextFragmentAnchor': 'TextFragmentAnchor',
    'TextToSpeech': 'TextToSpeech',
    'UpdateEngine': 'UpdateEngine',
    'WebApk': 'WebApk',
    'WebApp': 'WebApp',
    'WebAudio': 'WebAudio',
    'WebAuthentication': 'WebAuthentication',
    'WebCore': 'WebCore',
    'WebFont': 'WebFont',
    'WebHistory': 'WebHistory',
    'WebRTC': 'WebRTC',
    'WebRtcEventLogging': 'WebRTC',
    'WebRtcTextLogging': 'WebRTC',
    'WebUI': 'WebUI',
    'WebUITabStrip': 'WebUI',
}


def _ParseMergedXML():
  """Parses the merged xml into different types of nodes.

  Returns:
    A tuple (histogram_nodes, variants_nodes, histogram_suffixes_nodes) holding
    every <histogram>, <variants> and <histogram_suffixes> element found in the
    merge of all histograms xmls plus the obsolete xml.
  """
  merged_histograms = merge_xml.MergeFiles(histogram_paths.HISTOGRAMS_XMLS +
                                           [histogram_paths.OBSOLETE_XML])
  histogram_nodes = merged_histograms.getElementsByTagName('histogram')
  variants_nodes = merged_histograms.getElementsByTagName('variants')
  histogram_suffixes_nodes = merged_histograms.getElementsByTagName(
      'histogram_suffixes')
  return histogram_nodes, variants_nodes, histogram_suffixes_nodes


def _CreateXMLFile(comment, parent_node_string, nodes, output_dir, filename):
  """Creates XML file for given type of XML nodes.

  This function also creates a |parent_node_string| tag as the parent node, e.g.
  <histograms> or <histogram_suffixes_list>, that wraps all the |nodes| in the
  output XML.

  Args:
    comment: The string to be formatted into the
        |SECOND_TOP_LEVEL_COMMENT_TEMPLATE| which will then be added on top of
        each split xml.
    parent_node_string: The name of the second-level parent node, e.g.
        <histograms> or <histogram_suffixes_list>.
    nodes: A DOM NodeList object or a list containing <histogram> or
        <histogram_suffixes> that will be inserted under the parent node. The
        nodes are moved (re-parented) into the new document.
    output_dir: The output directory.
    filename: The output filename.
  """
  doc = minidom.Document()

  doc.appendChild(doc.createComment(FIRST_TOP_LEVEL_COMMENT_TEMPLATE))
  doc.appendChild(
      doc.createComment(SECOND_TOP_LEVEL_COMMENT_TEMPLATE % comment))

  # Create the <histogram-configuration> node for the new histograms.xml file.
  histogram_config_element = doc.createElement('histogram-configuration')
  doc.appendChild(histogram_config_element)
  parent_element = doc.createElement(parent_node_string)
  histogram_config_element.appendChild(parent_element)

  # Under the parent node, append the children nodes. appendChild() detaches a
  # node from its previous parent, so iterate over a snapshot in case |nodes|
  # is a live child list that would shrink while being walked.
  for node in list(nodes):
    parent_element.appendChild(node)

  # Use the model to get the pretty-printed XML string *before* touching the
  # existing file, so a serialization failure does not destroy the old output.
  pretty_xml_string = histogram_configuration_model.PrettifyTree(doc)

  output_path = os.path.join(output_dir, filename)
  if os.path.exists(output_path):
    os.remove(output_path)

  with open(output_path, 'w') as output_file:
    output_file.write(pretty_xml_string)


def _GetCamelCaseName(node, depth=0):
  """Returns the first camelcase name part of the given |node|.

  The name is split on '.', and the part at index |depth| is examined. If that
  part is a key of |_PREDEFINED_NAMES_MAPPING| the mapped folder name is
  returned. Otherwise the part is cut right before the first uppercase letter
  that follows a lowercase letter or a digit; if there is no such letter the
  whole part is returned.

  Args:
    node: The node to get name from.
    depth: The depth that specifies which name part will be returned.
        e.g. For a node of name
        'CustomTabs.DynamicModule.CreatePackageContextTime'
        The returned name for depth 0 is 'CustomTabs' (predefined mapping);
        The returned camel name for depth 1 is 'Dynamic';
        The returned camel name for depth 2 is 'Create'.

        A leading run of capitals stays attached to the first word, e.g.
        'ABCDeltaEcho' gives 'ABCDelta' rather than 'A'.

        Default depth is set to 0 as this function is imported and
        used in other files, where depth used is 0.

  Returns:
    The camelcase name part at specified depth. If the number of name parts is
    less than or equal to the depth, return 'others'.
  """
  name_parts = node.getAttribute('name').split('.')
  if len(name_parts) <= depth:
    return 'others'

  name_part = name_parts[depth]
  if name_part in _PREDEFINED_NAMES_MAPPING:
    return _PREDEFINED_NAMES_MAPPING[name_part]

  # Look for the start of the second camelcase word: the first uppercase letter
  # seen once at least one lowercase letter or digit has been encountered.
  split_index = 0
  seen_lower_or_digit = False
  for index, letter in enumerate(name_part):
    if letter.islower() or letter.isnumeric():
      seen_lower_or_digit = True
    if seen_lower_or_digit and letter.isupper():
      split_index = index
      break

  # A |split_index| of 0 means no second word was found.
  return name_part[:split_index] if split_index else name_part


def GetDirForNode(node):
  """Returns the correct directory that the given |node| should be placed in.

  Args:
    node: The node whose 'name' attribute decides the directory.

  Returns:
    The first camelcase name part of |node| if it is listed in
    |histogram_paths.HISTOGRAMS_PREFIX_LIST|, otherwise 'others'.
  """
  camel_name = _GetCamelCaseName(node)
  # Check if the directory of its prefix exists. Return the |camel_name| if the
  # folder exists. Otherwise, this |node| should be placed in 'others' folder.
  if camel_name in histogram_paths.HISTOGRAMS_PREFIX_LIST:
    return camel_name
  return 'others'


def _CamelCaseToSnakeCase(name):
  """Converts CamelCase |name| to snake_case.

  e.g. 'SafeBrowsing' becomes 'safe_browsing' and 'WebRTC' becomes 'web_rtc'.
  """
  # First separate capitalized words ('Xxx') from whatever precedes them, then
  # separate any remaining lowercase/digit -> uppercase transition.
  name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
  return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()


def _OutputToFolderAndXML(nodes, output_dir, key):
  """Creates new folder and XML file for separated histograms.

  Args:
    nodes: A list of histogram/variants nodes of a prefix.
    output_dir: The output directory.
    key: The prefix of the histograms, also the name of the new folder.
  """
  # Convert CamelCase name to snake_case when creating a directory.
  output_dir = os.path.join(output_dir, _CamelCaseToSnakeCase(key))
  if not os.path.exists(output_dir):
    os.makedirs(output_dir)
  _CreateXMLFile(key + ' histograms', 'histograms', nodes, output_dir,
                 'histograms.xml')


def _WriteDocumentDict(document_dict, output_dir):
  """Recursively writes |document_dict| to xmls in |output_dir|.

  Args:
    document_dict: A dictionary where the key is the prefix of the histogram and
        value is a list of nodes or another dict.
    output_dir: The output directory of the resulting folders.
  """
  for key, val in document_dict.items():
    if isinstance(val, list):
      _OutputToFolderAndXML(val, output_dir, key)
    else:
      # NOTE(review): the intermediate directory uses the raw CamelCase |key|
      # while leaf directories are snake_case; this branch is only reached
      # when |TARGET_DEPTH| > 1 - confirm the intended naming before raising it.
      _WriteDocumentDict(val, os.path.join(output_dir, key))


def _AggregateMinorNodes(node_dict):
  """Aggregates groups of nodes below threshold number into 'others'.

  |node_dict| is modified in place: every group with fewer than
  |AGGREGATE_THRESHOLD| nodes is removed and its nodes are appended to the
  'others' group, which is only (re-)inserted when it is non-empty.

  Args:
    node_dict: A dictionary where the key is the prefix of the histogram/variant
        and value is a list of histogram/variant nodes.
  """
  others = node_dict.pop('others', [])

  # Iterate over a snapshot: deleting keys while iterating the live items()
  # view raises RuntimeError on Python 3.
  for key, nodes in list(node_dict.items()):
    # For a prefix, if the number of histograms is fewer than threshold,
    # aggregate into others.
    if len(nodes) < AGGREGATE_THRESHOLD:
      others.extend(nodes)
      del node_dict[key]

  if others:
    node_dict['others'] = others


def _BuildDocumentDict(nodes, depth):
  """Recursively builds a document dict which will be written later.

  This function recursively builds a document dict which the key of the dict is
  the first word of the node's name at the given |depth| and the value of the
  dict is either a list of nodes that correspond to the key or another dict if
  it doesn't reach to |TARGET_DEPTH|.

  Args:
    nodes: A list of histogram nodes or variants node.
    depth: The current depth, starting from 0.

  Returns:
    The document dict, or |nodes| itself once |depth| equals |TARGET_DEPTH|.
  """
  if depth == TARGET_DEPTH:
    return nodes

  # Group the nodes by their name part at the current depth.
  grouped_nodes = {}
  for node in nodes:
    grouped_nodes.setdefault(_GetCamelCaseName(node, depth), []).append(node)

  # Aggregate keys with less than |AGGREGATE_THRESHOLD| values to 'others'.
  _AggregateMinorNodes(grouped_nodes)

  document_dict = {}
  for key, group in grouped_nodes.items():
    if key == 'others':
      # 'others' is never split any further.
      document_dict[key] = group
    else:
      document_dict[key] = _BuildDocumentDict(group, depth + 1)

  return document_dict


def _SeparateObsoleteHistogram(histogram_nodes):
  """Separates a NodeList of histograms into obsolete and non-obsolete.

  A histogram is considered obsolete when it has an <obsolete> descendant.

  Args:
    histogram_nodes: A NodeList object containing histogram nodes.

  Returns:
    obsolete_nodes: A list of obsolete nodes.
    non_obsolete_nodes: A list of non-obsolete nodes.
  """
  obsolete_nodes = []
  non_obsolete_nodes = []
  for histogram in histogram_nodes:
    if histogram.getElementsByTagName('obsolete'):
      obsolete_nodes.append(histogram)
    else:
      non_obsolete_nodes.append(histogram)
  return obsolete_nodes, non_obsolete_nodes


def SplitIntoMultipleHistogramXMLs(output_base_dir):
  """Splits a large histograms.xml and writes out the split xmls.

  Writes histogram_suffixes_list.xml and obsolete_histograms.xml directly into
  |output_base_dir| and one <prefix>/histograms.xml per histogram prefix.

  Args:
    output_base_dir: The output base directory. It is created, including any
        missing parent directories, if it does not exist.
  """
  if not os.path.exists(output_base_dir):
    os.makedirs(output_base_dir)

  histogram_nodes, variants_nodes, histogram_suffixes_nodes = _ParseMergedXML()

  # Create separate XML file for histogram suffixes.
  _CreateXMLFile('histogram suffixes', 'histogram_suffixes_list',
                 histogram_suffixes_nodes, output_base_dir,
                 'histogram_suffixes_list.xml')

  obsolete_nodes, non_obsolete_nodes = _SeparateObsoleteHistogram(
      histogram_nodes)
  # Create separate XML file for obsolete histograms.
  _CreateXMLFile('obsolete histograms', 'histograms', obsolete_nodes,
                 output_base_dir, 'obsolete_histograms.xml')

  document_dict = _BuildDocumentDict(
      non_obsolete_nodes + list(variants_nodes), 0)

  _WriteDocumentDict(document_dict, output_base_dir)


if __name__ == '__main__':
  SplitIntoMultipleHistogramXMLs(
      path_util.GetInputFile('tools/metrics/histograms/histograms_xml'))