1# Copyright 2014 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
"""Utilities for pretty-printing XML files.

The method XmlStyle.PrettyPrintXml is used for formatting both
histograms.xml and actions.xml.
"""
10
11import sys
12import logging
13import textwrap
14from xml.dom import minidom
15import xml.etree.ElementTree as ET
16
17import etree_util
18
19WRAP_COLUMN = 80
20
21
class Error(Exception):
  """Exception type raised by this module when pretty-printing fails.

  XmlStyle raises it for unrecognized tags and for missing or unrecognized
  attributes.
  """
24
25
def LastLineLength(s):
  """Measures the final line of a possibly multi-line string.

  Args:
    s: A string that may contain newline characters.

  Returns:
    The number of characters following the last newline in s, or the length
    of the whole string when s contains no newline.
  """
  # rpartition() returns ('', '', s) when the separator is absent, so the
  # last item is always exactly the text after the final newline.
  _, _, final_line = s.rpartition('\n')
  return len(final_line)
38
39
def XmlEscape(s):
  """Returns a copy of |s| with XML special characters escaped.

  The characters &, <, " and > are replaced by their entity references.
  """
  # The ampersand must be handled first; otherwise the ampersands introduced
  # by the other entities would themselves get escaped.
  substitutions = (
      ('&', '&amp;'),
      ('<', '&lt;'),
      ('"', '&quot;'),
      ('>', '&gt;'),
  )
  escaped = s
  for raw_char, entity in substitutions:
    escaped = escaped.replace(raw_char, entity)
  return escaped
45
46
def SplitParagraphs(text):
  """Split a block of text into paragraphs.

  Leading and trailing newlines are stripped and common indentation is
  removed before splitting. Paragraphs are separated by one or more blank
  (empty or whitespace-only) lines. Within each line, runs of whitespace are
  collapsed to a single space.

  Args:
    text: The text to split.

  Returns:
    A list of paragraphs as strings. Lines inside a paragraph are joined with
    a single newline. No paragraph is empty or starts with a blank line; text
    with no non-blank lines yields an empty list.
  """
  text = textwrap.dedent(text.strip('\n'))
  paragraphs = []
  current = []
  for line in text.split('\n'):
    words = line.split()
    if words:
      # Replace runs of repeated whitespace with a single space.
      current.append(' '.join(words))
    elif current:
      # A blank line closes the paragraph in progress. Blank lines seen while
      # no paragraph is open (leading or consecutive blanks) are skipped, so
      # they never end up as empty lines at the start of a paragraph.
      paragraphs.append(current)
      current = []
  if current:
    paragraphs.append(current)
  return ['\n'.join(p) for p in paragraphs]
71
72
class XmlStyle(object):
  """A class that stores all style specification for an output xml file.

  Besides holding the style configuration, it provides PrettyPrintXml, which
  renders an ElementTree element (or a minidom document) as a formatted
  string according to that configuration.
  """

  def __init__(self, attribute_order, required_attributes,
               tags_that_have_extra_newline, tags_that_dont_indent,
               tags_that_allow_single_line, tags_alphabetization_rules):
    """Stores the style configuration and sets up the text wrapper.

    Args:
      attribute_order: Mapping from tag name to the ordered list of attribute
        names allowed on that tag. A tag missing from this mapping is
        rejected as unrecognized.
      required_attributes: Mapping from tag name to the attribute names that
        must be present on that tag. A tag with no entry has no required
        attributes.
      tags_that_have_extra_newline: Mapping from tag name to a 3-tuple of
        newline counts (after the opening tag, before the closing tag, after
        the closing tag). Tags without an entry use (1, 1, 0).
      tags_that_dont_indent: Collection of tag names whose children are
        printed at the same indent level as the tag itself.
      tags_that_allow_single_line: Collection of tag names that may be
        printed on a single line when their only content fits.
      tags_alphabetization_rules: Mapping from tag name to a sequence of
        (subtag, key_function) pairs describing how the children of that tag
        are sorted.
    """
    self.attribute_order = attribute_order
    self.required_attributes = required_attributes
    self.tags_that_have_extra_newline = tags_that_have_extra_newline
    self.tags_that_dont_indent = tags_that_dont_indent
    self.tags_that_allow_single_line = tags_that_allow_single_line
    self.tags_alphabetization_rules = tags_alphabetization_rules

    # Shared wrapper used for element text; its indent settings are updated
    # on every call to _PrettyPrintText.
    self.wrapper = textwrap.TextWrapper()
    self.wrapper.break_on_hyphens = False
    self.wrapper.break_long_words = False
    self.wrapper.width = WRAP_COLUMN

  def PrettyPrintXml(self, tree):
    """Pretty-prints the given XML tree.

    Args:
      tree: Either an ElementTree element or a minidom Document. An element
        is alphabetized in place before being printed.

    Returns:
      The pretty-printed XML as a string.

    Raises:
      Error: if the XML has unknown tags or attributes, or lacks required
        attributes.
    """
    # If it's not an ElementTree instance, we assume it's minidom.
    if not isinstance(tree, ET.Element):
      assert isinstance(tree, minidom.Document)
      return self._PrettyPrintMinidom(tree)

    tree = self._TransformByAlphabetizing(tree)
    return self.PrettyPrintElementTreeNode(tree)

  def _PrettyPrintMinidom(self, doc):
    """Transforms minidom to ElementTree before pretty printing it.

    Args:
      doc: The minidom Document to print.

    Returns:
      The top-level content reported by etree_util followed by the
      pretty-printed root element.
    """
    raw_xml = doc.toxml()

    # minidom prepends an XML declaration, so remove it. An empty Document
    # serializes to exactly that declaration.
    raw_xml = raw_xml.replace(minidom.Document().toxml(), '')

    etree_root = etree_util.ParseXMLString(raw_xml)
    # NOTE(review): presumably the comments that precede the root element;
    # confirm against etree_util.GetTopLevelContent.
    top_content = etree_util.GetTopLevelContent(raw_xml)

    # Add newlines between top-level comments.
    top_content = top_content.replace('--><!--', '-->\n\n<!--')

    formatted_xml = self.PrettyPrintXml(etree_root)
    return top_content + formatted_xml

  def _TransformByAlphabetizing(self, node):
    """Transform the given XML by alphabetizing nodes.

    Args:
      node: The elementtree node to transform.

    Returns:
      The elementtree node, with children appropriately alphabetized. Note that
      the transformation is done in-place, i.e. the original tree is modified
      directly.
    """
    # Element node with a tag name that we alphabetize the children of?
    alpha_rules = self.tags_alphabetization_rules
    if node.tag in alpha_rules:
      # Map each sortable subtag to its rule position and key function. The
      # rule position is the primary sort key, so subtags are grouped in the
      # order in which the rules list them.
      subtags = {}
      for index, (subtag, key_function) in enumerate(alpha_rules[node.tag]):
        subtags[subtag] = (index, key_function)

      # Put subnodes in a list of [node, key] pairs to allow for custom
      # sorting. Lists are used so that a key can be filled in later.
      subnodes = []
      sort_key = -1
      pending_node_indices = []
      for c in node:
        if c.tag in subtags:
          subtag_sort_index, key_function = subtags[c.tag]
          sort_key = (subtag_sort_index, key_function(c))
          # Replace sort keys for delayed nodes.
          for idx in pending_node_indices:
            subnodes[idx][1] = sort_key
          pending_node_indices = []
        else:
          # Subnodes that we don't want to rearrange use the next node's key,
          # so they stay in the same relative position.
          # Therefore we delay setting key until the next node is found.
          # Nodes that have no sortable node after them keep the key of the
          # last sortable node seen (or -1 when there is none at all).
          pending_node_indices.append(len(subnodes))
        subnodes.append([c, sort_key])

      # Sort the subnode list. The sort is stable, so nodes that share a key
      # keep their original relative order.
      subnodes.sort(key=lambda pair: pair[1])

      # Remove the existing nodes. Iterate over a copy because the node is
      # modified during the loop.
      for child in list(node):
        node.remove(child)

      # Re-add the sorted subnodes, transforming each recursively.
      for (c, _) in subnodes:
        node.append(self._TransformByAlphabetizing(c))
      return node

    # Recursively handle other element nodes and other node types.
    for c in node:
      self._TransformByAlphabetizing(c)
    return node

  def _PrettyPrintText(self, text, indent):
    """Escapes, re-wraps and indents a run of element text.

    Args:
      text: The raw text (an element's text or tail).
      indent: The number of spaces to put in front of every output line.

    Returns:
      The wrapped text with paragraphs separated by a blank line, or an empty
      string if the text contains only whitespace.
    """
    if not text.strip():
      return ""

    self.wrapper.initial_indent = ' ' * indent
    self.wrapper.subsequent_indent = ' ' * indent
    escaped_text = XmlEscape(text)
    paragraphs = SplitParagraphs(escaped_text)

    # Wrap each paragraph and separate with two newlines.
    return '\n\n'.join(self.wrapper.fill(p) for p in paragraphs)

  def _PrettyPrintElement(self, node, indent):
    """Pretty-prints an element node, its attributes and its children.

    Args:
      node: The ElementTree element to print.
      indent: The number of spaces to indent the opening tag by.

    Returns:
      The pretty-printed string (including embedded newlines).

    Raises:
      Error: if the tag is unrecognized, a required attribute is missing, or
        an attribute is not listed for the tag in attribute_order.
    """
    # Check if tag name is valid.
    if node.tag not in self.attribute_order:
      logging.error('Unrecognized tag "%s"', node.tag)
      raise Error('Unrecognized tag "%s"' % node.tag)

    # Newlines.
    newlines_after_open, newlines_before_close, newlines_after_close = (
        self.tags_that_have_extra_newline.get(node.tag, (1, 1, 0)))
    # Open the tag.
    s = ' ' * indent + '<' + node.tag

    # Calculate how much space to allow for the '>' or '/>'.
    closing_chars = 2
    if len(node) or node.text:
      closing_chars = 1

    attributes = node.keys()
    # A tag without an entry in required_attributes has no required
    # attributes, rather than failing with a KeyError.
    missing_attributes = [
        attribute
        for attribute in self.required_attributes.get(node.tag, ())
        if attribute not in attributes
    ]

    for attribute in missing_attributes:
      logging.error(
          'Missing attribute "%s" in tag "%s"', attribute, node.tag)
    if missing_attributes:
      missing_attributes_str = (
          ', '.join('"%s"' % attribute for attribute in missing_attributes))
      # Show the offending tag with the attributes it does have, to make it
      # easier to locate in the input.
      present_attributes = [
          ' {0}="{1}"'.format(name, value)
          for name, value in node.items()]
      node_str = '<{0}{1}>'.format(node.tag, ''.join(present_attributes))
      raise Error(
          'Missing attributes {0} in tag "{1}"'.format(
              missing_attributes_str, node_str))

    # Pretty-print the attributes.
    if attributes:
      # Reorder the attributes.
      unrecognized_attributes = [
          a for a in attributes if a not in self.attribute_order[node.tag]
      ]
      attributes = [
          a for a in self.attribute_order[node.tag] if a in attributes
      ]

      for a in unrecognized_attributes:
        logging.error('Unrecognized attribute "%s" in tag "%s"', a, node.tag)
      if unrecognized_attributes:
        raise Error('Unrecognized attributes {0} in tag "{1}"'.format(
            ', '.join('"{0}"'.format(a) for a in unrecognized_attributes),
            node.tag))

      for a in attributes:
        value = XmlEscape(node.get(a))
        # Replace sequences of whitespace with single spaces.
        words = value.split()
        a_str = ' %s="%s"' % (a, ' '.join(words))
        # Start a new line if the attribute will make this line too long.
        if LastLineLength(s) + len(a_str) + closing_chars > WRAP_COLUMN:
          s += '\n' + ' ' * (indent + 3)
        # Output everything up to the first quote.
        s += ' %s="' % (a)
        # Continuation lines of the value are aligned with its first word.
        value_indent_level = LastLineLength(s)
        # Output one word at a time, splitting to the next line where
        # necessary.
        column = value_indent_level
        for i, word in enumerate(words):
          # This is slightly too conservative since not every word will be
          # followed by the closing characters...
          if i > 0 and (column + len(word) + 1 + closing_chars > WRAP_COLUMN):
            s = s.rstrip()  # remove any trailing whitespace
            s += '\n' + ' ' * value_indent_level
            column = value_indent_level
          s += word + ' '
          column += len(word) + 1
        s = s.rstrip()  # remove any trailing whitespace
        s += '"'
      s = s.rstrip()  # remove any trailing whitespace

    # Pretty-print the child nodes.
    if len(node) > 0 or node.text:  # pylint: disable=g-explicit-length-test
      s += '>'
      # Calculate the new indent level for child nodes.
      new_indent = indent
      if node.tag not in self.tags_that_dont_indent:
        new_indent += 2

      # Recursively pretty-print the child nodes. The element's own text
      # comes first, then each child followed by its tail text.
      child_nodes = []
      if node.text:
        formatted_text = self._PrettyPrintText(node.text, new_indent)
        if formatted_text:
          child_nodes.append(formatted_text)

      for child in node:
        child_output = self.PrettyPrintElementTreeNode(child, indent=new_indent)
        if child_output.strip():
          child_nodes.append(child_output)

        if child.tail:
          tail_text = self._PrettyPrintText(child.tail, new_indent)
          if tail_text:
            child_nodes.append(tail_text)

      # Determine whether we can fit the entire node on a single line.
      close_tag = '</%s>' % node.tag
      space_left = WRAP_COLUMN - LastLineLength(s) - len(close_tag)
      if (node.tag in self.tags_that_allow_single_line and
          len(child_nodes) == 1 and len(child_nodes[0].strip()) <= space_left):
        s += child_nodes[0].strip()
      else:
        s += '\n' * newlines_after_open + '\n'.join(child_nodes)
        s += '\n' * newlines_before_close + ' ' * indent
      s += close_tag
    else:
      s += '/>'
    s += '\n' * newlines_after_close
    return s

  def PrettyPrintElementTreeNode(self, node, indent=0):
    """Pretty-prints the given XML node at the given indent level.

    Args:
      node: The ElementTree node to pretty-print.
      indent: The current indent level.

    Returns:
      The pretty-printed string (including embedded newlines).

    Raises:
      Error: if the XML has unknown tags or attributes.
    """
    # Handle comment nodes. The comment text is emitted verbatim, without
    # applying the indent, and is followed by a newline.
    if node.tag is ET.Comment:
      return '<!--%s-->\n' % node.text

    # Handle element nodes.
    return self._PrettyPrintElement(node, indent)