# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Utility file for pretty printing xml file.

The function PrettyPrintXml will be used for formatting both histograms.xml
and actions.xml.
"""

import sys
import logging
import textwrap
from xml.dom import minidom
import xml.etree.ElementTree as ET

import etree_util

# Maximum line width that the pretty-printer tries to stay within.
WRAP_COLUMN = 80


class Error(Exception):
  """Raised when the XML contains unknown tags or invalid attributes."""
  pass


def LastLineLength(s):
  """Returns the length of the last line in s.

  Args:
    s: A multi-line string, including newlines.

  Returns:
    The length of the last line in s, in characters.
  """
  # rfind() returns -1 when there is no newline, in which case the expression
  # below evaluates to len(s), i.e. the whole string is the last line.
  return len(s) - s.rfind('\n') - len('\n')


def XmlEscape(s):
  """Returns escaped string for the given string |s|.

  The characters '&', '<', '>' and '"' are replaced by their predefined XML
  entities so the result can be embedded in text content or inside a
  double-quoted attribute value.

  Args:
    s: The raw string to escape.

  Returns:
    The escaped string.
  """
  # '&' must be replaced first, otherwise the ampersands introduced by the
  # other replacements would themselves be escaped a second time.
  s = s.replace('&', '&amp;').replace('<', '&lt;')
  s = s.replace('\"', '&quot;').replace('>', '&gt;')
  return s


def SplitParagraphs(text):
  """Split a block of text into paragraphs.

  Args:
    text: The text to split.

  Returns:
    A list of paragraphs as strings.
  """
  text = textwrap.dedent(text.strip('\n'))
  lines = text.split('\n')
  # Split the text into paragraphs at blank line boundaries.
  paragraphs = [[]]
  for l in lines:
    if paragraphs[-1] and not l.strip():
      paragraphs.append([])
    else:
      # Replace runs of repeated whitespace with a single space.
      transformed_line = ' '.join(l.split())
      paragraphs[-1].append(transformed_line)
  # Remove trailing empty paragraph if present.
  if paragraphs and not paragraphs[-1]:
    paragraphs = paragraphs[:-1]
  return ['\n'.join(p) for p in paragraphs]


class XmlStyle(object):
  """A class that stores all style specification for an output xml file."""

  def __init__(self, attribute_order, required_attributes,
               tags_that_have_extra_newline, tags_that_dont_indent,
               tags_that_allow_single_line, tags_alphabetization_rules):
    """Stores the style rules and sets up the text wrapper.

    Args:
      attribute_order: A mapping from tag name to the ordered sequence of
        attribute names allowed on that tag. Tags missing from this mapping
        are rejected as unrecognized.
      required_attributes: A mapping from tag name to the attribute names
        that must be present on that tag.
      tags_that_have_extra_newline: A mapping from tag name to a 3-tuple
        (newlines_after_open, newlines_before_close, newlines_after_close).
        Tags not listed use (1, 1, 0).
      tags_that_dont_indent: A collection of tag names whose children are
        printed at the same indent level as the tag itself.
      tags_that_allow_single_line: A collection of tag names that may be
        printed on a single line when their content fits.
      tags_alphabetization_rules: A mapping from tag name to a list of
        (subtag, key_function) pairs describing how child nodes are sorted.
    """
    self.attribute_order = attribute_order
    self.required_attributes = required_attributes
    self.tags_that_have_extra_newline = tags_that_have_extra_newline
    self.tags_that_dont_indent = tags_that_dont_indent
    self.tags_that_allow_single_line = tags_that_allow_single_line
    self.tags_alphabetization_rules = tags_alphabetization_rules

    # Never split a word in the middle, even if it is longer than the line.
    self.wrapper = textwrap.TextWrapper()
    self.wrapper.break_on_hyphens = False
    self.wrapper.break_long_words = False
    self.wrapper.width = WRAP_COLUMN

  def PrettyPrintXml(self, tree):
    """Pretty-prints the given XML tree.

    Args:
      tree: Either an ElementTree element or a minidom Document.

    Returns:
      The pretty-printed XML as a string.

    Raises:
      Error: if the XML has unknown tags or attributes.
    """
    # If it's not an ElementTree instance, we assume it's minidom.
    if not isinstance(tree, ET.Element):
      assert isinstance(tree, minidom.Document)
      return self._PrettyPrintMinidom(tree)

    tree = self._TransformByAlphabetizing(tree)
    return self.PrettyPrintElementTreeNode(tree)

  def _PrettyPrintMinidom(self, doc):
    """Transforms minidom to ElementTree before pretty printing it.

    Args:
      doc: The minidom Document to pretty-print.

    Returns:
      The top-level content reported by etree_util followed by the
      pretty-printed root element, as a single string.
    """
    raw_xml = doc.toxml()

    # minidom prepends a document type, so remove it.
    raw_xml = raw_xml.replace(minidom.Document().toxml(), '')

    etree_root = etree_util.ParseXMLString(raw_xml)
    top_content = etree_util.GetTopLevelContent(raw_xml)

    # Add newlines between top-level comments.
    top_content = top_content.replace('--><!--', '-->\n\n<!--')

    formatted_xml = self.PrettyPrintXml(etree_root)
    return top_content + formatted_xml

  def _TransformByAlphabetizing(self, node):
    """Transform the given XML by alphabetizing nodes.

    Args:
      node: The elementtree node to transform.

    Returns:
      The elementtree node, with children appropriately alphabetized. Note that
      the transformation is done in-place, i.e. the original tree is modified
      directly.
    """
    # Element node with a tag name that we alphabetize the children of?
    alpha_rules = self.tags_alphabetization_rules
    if node.tag in alpha_rules:
      # Put subnodes in a list of node, key pairs to allow for custom sorting.
      subtags = {}
      for index, (subtag, key_function) in enumerate(alpha_rules[node.tag]):
        subtags[subtag] = (index, key_function)

      subnodes = []
      sort_key = -1
      pending_node_indices = []
      for c in node:
        if c.tag in subtags:
          subtag_sort_index, key_function = subtags[c.tag]
          sort_key = (subtag_sort_index, key_function(c))
          # Replace sort keys for delayed nodes.
          for idx in pending_node_indices:
            subnodes[idx][1] = sort_key
          pending_node_indices = []
        else:
          # Subnodes that we don't want to rearrange use the next node's key,
          # so they stay in the same relative position.
          # Therefore we delay setting key until the next node is found.
          pending_node_indices.append(len(subnodes))
        subnodes.append([c, sort_key])

      # Sort the subnode list. The sort is stable, so nodes sharing a key
      # keep their original relative order.
      subnodes.sort(key=lambda pair: pair[1])

      # Remove the existing nodes. Iterate over a copy because the node is
      # being modified.
      for child in list(node):
        node.remove(child)

      # Re-add the sorted subnodes, transforming each recursively.
      for (c, _) in subnodes:
        node.append(self._TransformByAlphabetizing(c))
      return node

    # Recursively handle other element nodes and other node types.
    for c in node:
      self._TransformByAlphabetizing(c)
    return node

  def _PrettyPrintText(self, text, indent):
    """Pretty-prints a block of text at the given indent level.

    Args:
      text: The raw (unescaped) text to format.
      indent: The number of spaces to put in front of every output line.

    Returns:
      The escaped text, wrapped to WRAP_COLUMN, with paragraphs separated by
      a blank line. An empty string if |text| contains only whitespace.
    """
    if not text.strip():
      return ""

    self.wrapper.initial_indent = ' ' * indent
    self.wrapper.subsequent_indent = ' ' * indent
    escaped_text = XmlEscape(text)
    paragraphs = SplitParagraphs(escaped_text)

    # Wrap each paragraph and separate with two newlines.
    return '\n\n'.join(self.wrapper.fill(p) for p in paragraphs)

  def _PrettyPrintElement(self, node, indent):
    """Pretty-prints an element node, its attributes and its children.

    Args:
      node: The ElementTree element to pretty-print.
      indent: The current indent level, in spaces.

    Returns:
      The pretty-printed string (including embedded newlines).

    Raises:
      Error: if the tag is unrecognized, a required attribute is missing, or
        an attribute is not listed in the attribute order for the tag.
    """
    # Check if tag name is valid.
    if node.tag not in self.attribute_order:
      logging.error('Unrecognized tag "%s"', node.tag)
      raise Error('Unrecognized tag "%s"' % node.tag)

    # Newlines.
    newlines_after_open, newlines_before_close, newlines_after_close = (
        self.tags_that_have_extra_newline.get(node.tag, (1, 1, 0)))
    # Open the tag.
    s = ' ' * indent + '<' + node.tag

    # Calculate how much space to allow for the '>' or '/>'.
    closing_chars = 2
    if len(node) or node.text:
      closing_chars = 1

    attributes = node.keys()
    missing_attributes = [
        attribute for attribute in self.required_attributes[node.tag]
        if attribute not in attributes
    ]

    for attribute in missing_attributes:
      logging.error(
          'Missing attribute "%s" in tag "%s"', attribute, node.tag)
    if missing_attributes:
      missing_attributes_str = (
          ', '.join('"%s"' % attribute for attribute in missing_attributes))
      present_attributes = [
          ' {0}="{1}"'.format(name, value)
          for name, value in node.items()]
      node_str = '<{0}{1}>'.format(node.tag, ''.join(present_attributes))
      raise Error(
          'Missing attributes {0} in tag "{1}"'.format(
              missing_attributes_str, node_str))

    # Pretty-print the attributes.
    if attributes:
      # Reorder the attributes.
      unrecognized_attributes = [
          a for a in attributes if a not in self.attribute_order[node.tag]
      ]
      attributes = [
          a for a in self.attribute_order[node.tag] if a in attributes
      ]

      for a in unrecognized_attributes:
        logging.error('Unrecognized attribute "%s" in tag "%s"', a, node.tag)
      if unrecognized_attributes:
        raise Error('Unrecognized attributes {0} in tag "{1}"'.format(
            ', '.join('"{0}"'.format(a) for a in unrecognized_attributes),
            node.tag))

      for a in attributes:
        value = XmlEscape(node.get(a))
        # Replace sequences of whitespace with single spaces.
        words = value.split()
        a_str = ' %s="%s"' % (a, ' '.join(words))
        # Start a new line if the attribute will make this line too long.
        if LastLineLength(s) + len(a_str) + closing_chars > WRAP_COLUMN:
          s += '\n' + ' ' * (indent + 3)
        # Output everything up to the first quote.
        s += ' %s="' % (a)
        value_indent_level = LastLineLength(s)
        # Output one word at a time, splitting to the next line where
        # necessary.
        column = value_indent_level
        for i, word in enumerate(words):
          # This is slightly too conservative since not every word will be
          # followed by the closing characters...
          if i > 0 and (column + len(word) + 1 + closing_chars > WRAP_COLUMN):
            s = s.rstrip()  # remove any trailing whitespace
            s += '\n' + ' ' * value_indent_level
            column = value_indent_level
          s += word + ' '
          column += len(word) + 1
        s = s.rstrip()  # remove any trailing whitespace
        s += '"'
      s = s.rstrip()  # remove any trailing whitespace

    # Pretty-print the child nodes.
    if len(node) > 0 or node.text:  # pylint: disable=g-explicit-length-test
      s += '>'
      # Calculate the new indent level for child nodes.
      new_indent = indent
      if node.tag not in self.tags_that_dont_indent:
        new_indent += 2

      # Recursively pretty-print the child nodes.
      child_nodes = []
      if node.text:
        formatted_text = self._PrettyPrintText(node.text, new_indent)
        if formatted_text:
          child_nodes.append(formatted_text)

      for child in node:
        child_output = self.PrettyPrintElementTreeNode(child, indent=new_indent)
        if child_output.strip():
          child_nodes.append(child_output)

        # Text that follows a child element is stored on that child's tail.
        if child.tail:
          tail_text = self._PrettyPrintText(child.tail, new_indent)
          if tail_text:
            child_nodes.append(tail_text)

      # Determine whether we can fit the entire node on a single line.
      close_tag = '</%s>' % node.tag
      space_left = WRAP_COLUMN - LastLineLength(s) - len(close_tag)
      if (node.tag in self.tags_that_allow_single_line and
          len(child_nodes) == 1 and len(child_nodes[0].strip()) <= space_left):
        s += child_nodes[0].strip()
      else:
        s += '\n' * newlines_after_open + '\n'.join(child_nodes)
        s += '\n' * newlines_before_close + ' ' * indent
      s += close_tag
    else:
      s += '/>'
    s += '\n' * newlines_after_close
    return s

  def PrettyPrintElementTreeNode(self, node, indent=0):
    """Pretty-prints the given XML node at the given indent level.

    Args:
      node: The ElementTree node to pretty-print.
      indent: The current indent level.

    Returns:
      The pretty-printed string (including embedded newlines).

    Raises:
      Error: if the XML has unknown tags or attributes.
    """
    # Handle comment nodes. Note that comments are emitted without the
    # indent and with their text unmodified.
    if node.tag is ET.Comment:
      return '<!--%s-->\n' % node.text

    # Handle element nodes.
    return self._PrettyPrintElement(node, indent)