1# Copyright (c) 2012 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5'''A gatherer for the TotalRecall brand of HTML templates with replaceable
6portions.  We wanted to reuse extern.tclib.api.handlers.html.TCHTMLParser
7but this proved impossible due to the fact that the TotalRecall HTML templates
are in general quite far from parseable HTML and the TCHTMLParser derives
from HTMLParser.HTMLParser which requires relatively well-formed HTML.  Some
11examples of "HTML" from the TotalRecall HTML templates that wouldn't be
12parseable include things like:
13
14  <a [PARAMS]>blabla</a>  (not parseable because attributes are invalid)
15
16  <table><tr><td>[LOTSOFSTUFF]</tr></table> (not parseable because closing
17                                            </td> is in the HTML [LOTSOFSTUFF]
18                                            is replaced by)
19
20The other problem with using general parsers (such as TCHTMLParser) is that
we want to make sure we output the TotalRecall template with as few changes
22as possible in terms of whitespace characters, layout etc.  With any parser
23that generates a parse tree, and generates output by dumping the parse tree,
24we would always have little inconsistencies which could cause bugs (the
25TotalRecall template stuff is quite brittle and can break if e.g. a tab
26character is replaced with spaces).
27
28The solution, which may be applicable to some other HTML-like template
29languages floating around Google, is to create a parser with a simple state
30machine that keeps track of what kind of tag it's inside, and whether it's in
31a translateable section or not.  Translateable sections are:
32
33a) text (including [BINGO] replaceables) inside of tags that
34   can contain translateable text (which is all tags except
35   for a few)
36
b) text inside of an 'alt' attribute in an <img> element, the 'title'
   attribute of a <table> element, or the 'value' attribute of an <input>
   element whose type is button, reset, text or submit.
40
41The parser does not build up a parse tree but rather a "skeleton" which
42is a list of nontranslateable strings intermingled with grit.clique.MessageClique
43objects.  This simplifies the parser considerably compared to a regular HTML
44parser.  To output a translated document, each item in the skeleton is
printed out, with the relevant Translation from each MessageClique being used
46for the requested language.
47
48This implementation borrows some code, constants and ideas from
49extern.tclib.api.handlers.html.TCHTMLParser.
50'''
51
52from __future__ import print_function
53
54import re
55
56import six
57
58from grit import clique
59from grit import exception
60from grit import lazy_re
61from grit import util
62from grit import tclib
63
64from grit.gather import interface
65
66
# HTML tags which break (separate) chunks.  Each tag is listed exactly once;
# within this file the list is only used for membership tests.
_BLOCK_TAGS = ['script', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'br',
               'body', 'style', 'head', 'title', 'table', 'tr', 'td', 'th',
               'ul', 'ol', 'dl', 'nl', 'li', 'div', 'object', 'center',
               'html', 'link', 'form', 'select', 'textarea',
               'button', 'option', 'map', 'area', 'blockquote', 'pre',
               'meta', 'xmp', 'noscript', 'label', 'tbody', 'thead',
               'iframe', 'img', 'input', 'nowrap',
               'fieldset', 'legend']

# HTML tags which may appear within a chunk.
_INLINE_TAGS = ['b', 'i', 'u', 'tt', 'code', 'font', 'a', 'span', 'small',
                'key', 'nobr', 'url', 'em', 's', 'sup', 'strike',
                'strong']

# HTML tags within which linebreaks are significant.
_PREFORMATTED_TAGS = ['textarea', 'xmp', 'pre']

# A dict mapping some HTML tag names to more meaningful names for those tags.
# This is used when generating placeholders representing these tags.
_HTML_PLACEHOLDER_NAMES = {
  'a': 'link',
  'br': 'break',
  'b': 'bold',
  'i': 'italic',
  'li': 'item',
  'ol': 'ordered_list',
  'p': 'paragraph',
  'ul': 'unordered_list',
  'img': 'image',
  'em': 'emphasis',
}

# We append each of these characters in sequence to distinguish between
# different placeholders with basically the same name (e.g. BOLD_1, BOLD_2).
# Keep in mind that a placeholder name must not be a substring of any other
# placeholder name in the same message, so we can't simply count (BOLD_1
# would be a substring of BOLD_10).
_SUFFIXES = '123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
98
# Matches a run of "whitespace" in an HTML document: real whitespace
# characters, &nbsp; entities, the literal two-character sequences \n and \r
# (a backslash followed by a letter), and description comments of the form
# <!-- desc=... -->, which are treated as whitespace.  Other HTML comments are
# not matched here.
_WHITESPACE = lazy_re.compile(r'(\s|&nbsp;|\\n|\\r|<!--\s*desc\s*=.*?-->)+',
                              re.DOTALL)

# Matches whitespace sequences which can be folded into a single whitespace
# character.  A run of length one is matched too, so that a lone non-space
# whitespace character (e.g. a tab or a newline) is also replaced by a space.
_FOLD_WHITESPACE = lazy_re.compile(r'\s+')

# Finds a non-whitespace character
_NON_WHITESPACE = lazy_re.compile(r'\S')

# Matches two or more &nbsp; in a row (a single &nbsp; is not changed into
# placeholders because different languages require different numbers of spaces
# and placeholders must match exactly; more than one is probably a "special"
# whitespace sequence and should be turned into a placeholder).
_NBSP = lazy_re.compile(r'&nbsp;(&nbsp;)+')

# Matches nontranslateable chunks of the document: <script> and <style>
# elements including their content, HTML comments, <?IMPORT ...> tags,
# opening and closing tags whose name carries a namespace-style prefix
# (name:...), and <!KEYWORD ...> declarations.
_NONTRANSLATEABLES = lazy_re.compile(r'''
  <\s*script.+?<\s*/\s*script\s*>
  |
  <\s*style.+?<\s*/\s*style\s*>
  |
  <!--.+?-->
  |
  <\?IMPORT\s.+?>           # import tag
  |
  <\s*[a-zA-Z_]+:.+?>       # custom tag (open)
  |
  <\s*/\s*[a-zA-Z_]+:.+?>   # custom tag (close)
  |
  <!\s*[A-Z]+\s*([^>]+|"[^"]+"|'[^']+')*?>
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE)

# Matches a tag and its attributes.  Named groups: 'closing' is set for a
# closing tag (</x>), 'element' is the tag name, 'atts' is the attribute
# (and/or replaceable) text inside the tag, and 'empty' is set for an empty
# tag (<x/>).
_ELEMENT = lazy_re.compile(r'''
  # Optional closing /, element name
  <\s*(?P<closing>/)?\s*(?P<element>[a-zA-Z0-9]+)\s*
  # Attributes and/or replaceables inside the tag, if any
  (?P<atts>(
    \s*([a-zA-Z_][-:.a-zA-Z_0-9]*) # Attribute name
    (\s*=\s*(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?
    |
    \s*\[(\$?\~)?([A-Z0-9-_]+?)(\~\$?)?\]
  )*)
  \s*(?P<empty>/)?\s*> # Optional empty-tag closing /, and tag close
  ''',
  re.MULTILINE | re.DOTALL | re.VERBOSE)

# Matches elements that may have translateable attributes: the 'title' of a
# <table>, the 'alt' of an <img>, and the 'value' of an <input> whose type is
# button, reset, text or submit (with 'value' either before or after 'type').
# The value of the special attribute is given by exactly one of the groups
# 'value1' (single-quoted) or 'value2' (double-quoted), or, when 'value'
# precedes 'type' on an <input>, 'value3' (single-quoted) or 'value4'
# (double-quoted); the other groups are None.  Note that this
# regexp demands that the attribute value be quoted; this is necessary because
# the non-tree-building nature of the parser means we don't know when we're
# writing out attributes, so we wouldn't know to escape spaces.
_SPECIAL_ELEMENT = lazy_re.compile(r'''
  <\s*(
    input[^>]+?value\s*=\s*(\'(?P<value3>[^\']*)\'|"(?P<value4>[^"]*)")
    [^>]+type\s*=\s*"?'?(button|reset|text|submit)'?"?
    |
    (
      table[^>]+?title\s*=
      |
      img[^>]+?alt\s*=
      |
      input[^>]+?type\s*=\s*"?'?(button|reset|text|submit)'?"?[^>]+?value\s*=
    )
    \s*(\'(?P<value1>[^\']*)\'|"(?P<value2>[^"]*)")
  )[^>]*?>
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE)

# Matches stuff that is translateable if it occurs in the right context
# (between tags).  This includes all characters and character entities.
# Note that this also matches &nbsp; which needs to be handled as whitespace
# before this regexp is applied.
# NOTE(review): this pattern does not appear to be referenced by the code
# visible in this module -- confirm before relying on it.
_CHARACTERS = lazy_re.compile(r'''
  (
    \w
    |
    [\!\@\#\$\%\^\*\(\)\-\=\_\+\[\]\{\}\\\|\;\:\'\"\,\.\/\?\`\~]
    |
    &(\#[0-9]+|\#x[0-9a-fA-F]+|[A-Za-z0-9]+);
  )+
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE)

# Matches Total Recall's "replaceable" tags, which are just any text
# in capitals enclosed by delimiters like [] or [~~] or [$~~$] (e.g. [HELLO],
# [~HELLO~] and [$~HELLO~$]).  The text between the delimiters (capitals,
# digits, '-' and '_') is available as group 'name'.
_REPLACEABLE = lazy_re.compile(r'\[(\$?\~)?(?P<name>[A-Z0-9-_]+?)(\~\$?)?\]',
                               re.MULTILINE)


# Matches the silly [!]-prefixed "header" that is used in some TotalRecall
# templates.  The header starts with a "[!]" line followed by a
# "title<TAB>..." line (captured as group 'title') and extends up to and
# including the first blank line after that.
_SILLY_HEADER = lazy_re.compile(r'\[!\]\ntitle\t(?P<title>[^\n]+?)\n.+?\n\n',
                                re.MULTILINE | re.DOTALL)


# Matches a comment that provides a description for the message it occurs in.
# The description text is available as group 'description'.
_DESCRIPTION_COMMENT = lazy_re.compile(
  r'<!--\s*desc\s*=\s*(?P<description>.+?)\s*-->', re.DOTALL)

# Matches a comment which is used to break apart multiple messages.
_MESSAGE_BREAK_COMMENT = lazy_re.compile(r'<!--\s*message-break\s*-->',
                                         re.DOTALL)
205
# Matches a comment which is used to prevent block tags from splitting a
# message.  Compiled lazily, consistent with the other patterns in this
# module.
_MESSAGE_NO_BREAK_COMMENT = lazy_re.compile(r'<!--\s*message-no-break\s*-->',
                                            re.DOTALL)
209
210
# Set to a truthy value to make _DebugPrint() produce output.
_DEBUG = 0
def _DebugPrint(text):
  '''Prints 'text' to stdout if the module-level _DEBUG flag is truthy.

  Does nothing when _DEBUG is falsy.

  Args:
    text: The text to print.
  '''
  if not _DEBUG:
    return
  if str is bytes:
    # Python 2 only: encode explicitly, as the original code did, before
    # printing.  On Python 3 encoding would make print() emit the b'...'
    # repr of the bytes object instead of the text itself.
    text = text.encode('utf-8')
  print(text)
215
216
class HtmlChunks(object):
  '''A parser that knows how to break an HTML-like document into a list of
  chunks, where each chunk is either translateable or non-translateable.

  The chunks are consecutive sections of the original document.  Apart from
  the modifications made by AddChunk() (removal of description and
  message-break comments, and whitespace folding in translateable chunks),
  concatenating the text of all chunks results in the parsed text.

  All parser state is (re)initialized by Parse(); the other methods are
  helpers that are only meaningful while Parse() is running.
  '''

  def InTranslateable(self):
    '''Returns True if the parser is currently inside a translateable chunk.'''
    return self.last_translateable != -1

  def Rest(self):
    '''Returns the part of the document starting at the current position.'''
    return self.text_[self.current:]

  def StartTranslateable(self):
    '''Starts a translateable chunk at the current position.

    Any pending nontranslateable text (from chunk_start up to and including
    last_nontranslateable) is first emitted as a nontranslateable chunk.
    '''
    assert not self.InTranslateable()
    if self.current != 0:
      # Append a nontranslateable chunk
      chunk_text = self.text_[self.chunk_start : self.last_nontranslateable + 1]
      # Needed in the case where document starts with a translateable.
      if len(chunk_text) > 0:
        self.AddChunk(False, chunk_text)
    self.chunk_start = self.last_nontranslateable + 1
    self.last_translateable = self.current
    self.last_nontranslateable = -1

  def EndTranslateable(self):
    '''Ends the current translateable chunk and emits it.

    The chunk extends from chunk_start up to and including
    last_translateable, so trailing whitespace is not part of it.
    '''
    assert self.InTranslateable()
    # Append a translateable chunk
    self.AddChunk(True,
                  self.text_[self.chunk_start : self.last_translateable + 1])
    self.chunk_start = self.last_translateable + 1
    self.last_translateable = -1
    self.last_nontranslateable = self.current

  def AdvancePast(self, match):
    '''Moves the current position past 'match'.

    Args:
      match: A match object obtained by matching against self.Rest(), so
        that match.end() is relative to the current position.
    '''
    self.current += match.end()

  def AddChunk(self, translateable, text):
    '''Adds a chunk to self, removing linebreaks and duplicate whitespace
    if appropriate.

    Description comments are remembered (for the next translateable chunk)
    and removed from the text, as are message-break comments.  Chunks whose
    text ends up empty are not added.

    Args:
      translateable: True if the chunk is translateable.
      text: The text of the chunk.
    '''
    m = _DESCRIPTION_COMMENT.search(text)
    if m:
      self.last_description = m.group('description')
      # Remove the description from the output text
      text = _DESCRIPTION_COMMENT.sub('', text)

    m = _MESSAGE_BREAK_COMMENT.search(text)
    if m:
      # Remove the comment from the output text.  It should already effectively
      # break apart messages.
      text = _MESSAGE_BREAK_COMMENT.sub('', text)

    if translateable and not self.last_element_ in _PREFORMATTED_TAGS:
      if self.fold_whitespace_:
        # Fold whitespace sequences if appropriate.  This is optional because it
        # alters the output strings.
        text = _FOLD_WHITESPACE.sub(' ', text)
      else:
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        # This whitespace folding doesn't work in all cases, thus the
        # fold_whitespace flag to support backwards compatibility.
        text = text.replace('   ', ' ')
        text = text.replace('  ', ' ')

    if translateable:
      # A description applies to the first translateable chunk following it.
      description = self.last_description
      self.last_description = ''
    else:
      description = ''

    if text != '':
      self.chunks_.append((translateable, text, description))

  def Parse(self, text, fold_whitespace):
    '''Parses 'text' into an intermediate format stored in self.chunks_
    which is translateable and nontranslateable chunks.  Also returns
    self.chunks_

    Args:
      text: The HTML for parsing.
      fold_whitespace: Whether whitespace sequences should be folded into a
        single space.

    Return:
      [chunk1, chunk2, chunk3, ...] where each chunk is a tuple
      (is_translateable, text, description).
    '''
    #
    # Chunker state
    #

    self.text_ = text
    self.fold_whitespace_ = fold_whitespace

    # A list of tuples (is_translateable, text, description) which represents
    # the document after chunking.
    self.chunks_ = []

    # Start index of the last chunk, whether translateable or not
    self.chunk_start = 0

    # Index of the last for-sure translateable character if we are parsing
    # a translateable chunk, -1 to indicate we are not in a translateable chunk.
    # This is needed so that we don't include trailing whitespace in the
    # translateable chunk (whitespace is neutral).
    self.last_translateable = -1

    # Index of the last for-sure nontranslateable character if we are parsing
    # a nontranslateable chunk, -1 if we are not in a nontranslateable chunk.
    # This is needed to make sure we can group e.g. "<b>Hello</b> there"
    # together instead of just "Hello</b> there" which would be much worse
    # for translation.
    self.last_nontranslateable = -1

    # Index of the character we're currently looking at.
    self.current = 0

    # The name of the last block element parsed.
    self.last_element_ = ''

    # The last explicit description we found.
    self.last_description = ''

    # Whether no-break was the last chunk seen
    self.last_nobreak = False

    while self.current < len(self.text_):
      _DebugPrint('REST: %s' % self.text_[self.current:self.current+60])

      m = _MESSAGE_NO_BREAK_COMMENT.match(self.Rest())
      if m:
        self.AdvancePast(m)
        self.last_nobreak = True
        continue

      # Try to match whitespace
      m = _WHITESPACE.match(self.Rest())
      if m:
        # Whitespace is neutral, it just advances 'current' and does not switch
        # between translateable/nontranslateable.  If we are in a
        # nontranslateable section that extends to the current point, we extend
        # it to include the whitespace.  If we are in a translateable section,
        # we do not extend it until we find
        # more translateable parts, because we never want a translateable chunk
        # to end with whitespace.
        if (not self.InTranslateable() and
            self.last_nontranslateable == self.current - 1):
          self.last_nontranslateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Then we try to match nontranslateables
      m = _NONTRANSLATEABLES.match(self.Rest())
      if m:
        if self.InTranslateable():
          self.EndTranslateable()
        self.last_nontranslateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Now match all other HTML element tags (opening, closing, or empty, we
      # don't care).
      m = _ELEMENT.match(self.Rest())
      if m:
        element_name = m.group('element').lower()
        if element_name in _BLOCK_TAGS:
          self.last_element_ = element_name
          if self.InTranslateable():
            if self.last_nobreak:
              # A preceding message-no-break comment suppresses one break.
              self.last_nobreak = False
            else:
              self.EndTranslateable()

          # Check for "special" elements, i.e. ones that have a translateable
          # attribute, and handle them correctly.  Note that all of the
          # "special" elements are block tags, so no need to check for this
          # if the tag is not a block tag.
          sm = _SPECIAL_ELEMENT.match(self.Rest())
          if sm:
            # Get the name of the value group that took part in the match.
            # Exactly one of them does whenever the pattern matches.  We must
            # compare against None rather than test truthiness: an empty
            # attribute value (e.g. alt='') is a valid match, and picking a
            # group that did not participate would make sm.start(group)
            # return -1 and corrupt the chunk boundaries.
            group = next(name for name, value in sm.groupdict().items()
                         if value is not None)

            # First make a nontranslateable chunk up to and including the
            # quote before the translateable attribute value
            self.AddChunk(False, self.text_[
              self.chunk_start : self.current + sm.start(group)])
            # Then a translateable for the translateable bit
            self.AddChunk(True, self.Rest()[sm.start(group) : sm.end(group)])
            # Finally correct the data invariant for the parser
            self.chunk_start = self.current + sm.end(group)

          self.last_nontranslateable = self.current + m.end() - 1
        elif self.InTranslateable():
          # We're in a translateable and the tag is an inline tag, so we
          # need to include it in the translateable.
          self.last_translateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Anything else we find must be translateable, so we advance one character
      # at a time until one of the above matches.
      if not self.InTranslateable():
        self.StartTranslateable()
      else:
        self.last_translateable = self.current
      self.current += 1

    # Close the final chunk
    if self.InTranslateable():
      self.AddChunk(True, self.text_[self.chunk_start : ])
    else:
      self.AddChunk(False, self.text_[self.chunk_start : ])

    return self.chunks_
433
434
def HtmlToMessage(html, include_block_tags=False, description=''):
  '''Takes a bit of HTML, which must contain only "inline" HTML elements,
  and changes it into a tclib.Message.  This involves unescaping any entities
  in the text and replacing any HTML code with placeholders.

  Placeholders are created for elements (named BEGIN_<NAME>, END_<NAME> or
  just <NAME> for empty tags), for [REPLACEABLES] (named X_<NAME>_X) and for
  runs of two or more &nbsp; (named SPACE).  When a name is used more than
  once in a message, a suffix from _SUFFIXES is appended (e.g. BEGIN_BOLD_1).

  If include_block_tags is true, no error will be given if block tags (e.g.
  <p> or <br>) are included in the HTML.  A block tag immediately preceded by
  a <!-- message-no-break --> comment is also accepted.

  Args:
    html: 'Hello <b>[USERNAME]</b>, how&nbsp;<i>are</i> you?'
    include_block_tags: False
    description: Description passed on to the created tclib.Message.

  Return:
    For the example above, a tclib.Message built from the text
      'Hello BEGIN_BOLDX_USERNAME_XEND_BOLD, '
      'how&nbsp;BEGIN_ITALICareEND_ITALIC you?'
    and the placeholders
      [ Placeholder('BEGIN_BOLD', '<b>', '(HTML code)'),
        Placeholder('X_USERNAME_X', '[USERNAME]', '(HTML code)'),
        Placeholder('END_BOLD', '</b>', '(HTML code)'),
        Placeholder('BEGIN_ITALIC', '<i>', '(HTML code)'),
        Placeholder('END_ITALIC', '</i>', '(HTML code)'), ]
    whose string content has then been passed through util.UnescapeHtml
    with replace_nbsp=False.

  Raises:
    grit.exception.BlockTagInTranslateableChunk if a non-inline tag is found
    while include_block_tags is false and no message-no-break comment
    precedes it.
  '''
  # Approach is:
  # - first placeholderize, finding <elements>, [REPLACEABLES] and &nbsp;
  # - then escape all character entities in text in-between placeholders

  parts = []  # List of strings (for text chunks) and tuples (name closure,
              # original) for placeholders

  count_names = {}  # Map of base names to number of times used
  end_names = {}  # Map of base names to stack of end tags (for correct nesting)

  def MakeNameClosure(base, kind = ''):
    '''Returns a closure that can be called once all names have been allocated
    to return the final name of the placeholder.  This allows us to minimally
    number placeholders for non-overlap.

    Also ensures that END_XXX_Y placeholders have the same Y as the
    corresponding BEGIN_XXX_Y placeholder when we have nested tags of the same
    type.

    Args:
      base: 'phname'
      kind: '' | 'begin' | 'end'

    Return:
      Closure()
    '''
    name = base.upper()
    if kind != '':
      name = ('%s_%s' % (kind, base)).upper()

    count_names.setdefault(name, 0)
    count_names[name] += 1

    # The defaults capture the name and the 0-based occurrence index at the
    # time the closure is created; count_names[name] is read again only when
    # the closure is finally called, i.e. after all names were allocated.
    def MakeFinalName(name_ = name, index = count_names[name] - 1):
      if kind.lower() == 'end' and end_names.get(base):
        return end_names[base].pop(-1)  # For correct nesting
      if count_names[name_] != 1:
        name_ = '%s_%s' % (name_, _SUFFIXES[index])
        # We need to use a stack to ensure that the end-tag suffixes match
        # the begin-tag suffixes.  Only needed when more than one tag of the
        # same type.
        if kind == 'begin':
          end_name = ('END_%s_%s' % (base, _SUFFIXES[index])).upper()
          end_names.setdefault(base, []).append(end_name)

      return name_

    return MakeFinalName

  current = 0
  last_nobreak = False

  while current < len(html):
    m = _MESSAGE_NO_BREAK_COMMENT.match(html[current:])
    if m:
      last_nobreak = True
      current += m.end()
      continue

    m = _NBSP.match(html[current:])
    if m:
      parts.append((MakeNameClosure('SPACE'), m.group()))
      current += m.end()
      continue

    m = _REPLACEABLE.match(html[current:])
    if m:
      # Replaceables allow - but placeholders don't, so replace - with _
      ph_name = MakeNameClosure('X_%s_X' % m.group('name').replace('-', '_'))
      parts.append((ph_name, m.group()))
      current += m.end()
      continue

    m = _SPECIAL_ELEMENT.match(html[current:])
    if m:
      if not include_block_tags:
        if last_nobreak:
          last_nobreak = False
        else:
          raise exception.BlockTagInTranslateableChunk(html)
      element_name = 'block'  # for simplification
      # Get the name of the value group that took part in the match.  Exactly
      # one of them does whenever the pattern matches.  Compare against None
      # rather than testing truthiness: an empty attribute value (e.g.
      # alt='') is a valid match, and picking a group that did not
      # participate would make m.start(group) return -1.
      group = next(name for name, value in m.groupdict().items()
                   if value is not None)
      # Everything before the attribute value becomes the 'begin'
      # placeholder, the value itself stays text, and the rest of the tag
      # becomes the 'end' placeholder.
      parts.append((MakeNameClosure(element_name, 'begin'),
                    html[current : current + m.start(group)]))
      parts.append(m.group(group))
      parts.append((MakeNameClosure(element_name, 'end'),
                    html[current + m.end(group) : current + m.end()]))
      current += m.end()
      continue

    m = _ELEMENT.match(html[current:])
    if m:
      element_name = m.group('element').lower()
      if not include_block_tags and not element_name in _INLINE_TAGS:
        if last_nobreak:
          last_nobreak = False
        else:
          raise exception.BlockTagInTranslateableChunk(html[current:])
      if element_name in _HTML_PLACEHOLDER_NAMES:  # use meaningful names
        element_name = _HTML_PLACEHOLDER_NAMES[element_name]

      # Make a name for the placeholder; empty tags (<x/>) get neither a
      # begin nor an end prefix.
      tag_kind = ''
      if not m.group('empty'):
        if m.group('closing'):
          tag_kind = 'end'
        else:
          tag_kind = 'begin'
      parts.append((MakeNameClosure(element_name, tag_kind), m.group()))
      current += m.end()
      continue

    # Plain text: extend the previous text part, or start a new one.
    if len(parts) and isinstance(parts[-1], six.string_types):
      parts[-1] += html[current]
    else:
      parts.append(html[current])
    current += 1

  # Now that all names are allocated, resolve the final placeholder names and
  # assemble the message text.
  text_pieces = []
  placeholders = []
  for part in parts:
    if isinstance(part, tuple):
      final_name = part[0]()
      original = part[1]
      text_pieces.append(final_name)
      placeholders.append(tclib.Placeholder(final_name, original, '(HTML code)'))
    else:
      text_pieces.append(part)
  msg_text = ''.join(text_pieces)

  msg = tclib.Message(text=msg_text, placeholders=placeholders,
                      description=description)
  # Unescape the text items of the list returned by GetContent() in place.
  content = msg.GetContent()
  for ix, item in enumerate(content):
    if isinstance(item, six.string_types):
      content[ix] = util.UnescapeHtml(item, replace_nbsp=False)

  return msg
599
600
class TrHtml(interface.GathererBase):
  '''Gatherer for a document or message written in the template format used
  by Total Recall for HTML documents.'''

  def __init__(self, *args, **kwargs):
    super(TrHtml, self).__init__(*args, **kwargs)
    # The parsed document: a list of strings and MessageClique objects.
    self.skeleton_ = []
    # Set as soon as Parse() has been entered, making later calls no-ops.
    self.have_parsed_ = False
    # Passed to HtmlChunks.Parse(); see SetAttributes().
    self.fold_whitespace_ = False

  def SetAttributes(self, attrs):
    '''Sets node attributes used by the gatherer.

    Only the fold_whitespace attribute is looked at; folding is enabled
    exactly when it is present with the value 'true'.

    Args:
      attrs: The mapping of node attributes.
    '''
    enabled = False
    if 'fold_whitespace' in attrs:
      enabled = attrs['fold_whitespace'] == 'true'
    self.fold_whitespace_ = enabled

  def GetText(self):
    '''Returns the original text of the HTML document (as stored by
    Parse()).'''
    return self.text_

  def GetTextualIds(self):
    '''Returns a one-element list holding this gatherer's extkey.'''
    return [self.extkey]

  def GetCliques(self):
    '''Returns the message cliques for each translateable message in the
    document, in skeleton order.'''
    cliques = []
    for entry in self.skeleton_:
      if isinstance(entry, clique.MessageClique):
        cliques.append(entry)
    return cliques

  def Translate(self, lang, pseudo_if_not_available=True,
                skeleton_gatherer=None, fallback_to_english=False):
    '''Returns this document with translateable messages filled with
    the translation for language 'lang'.

    Args:
      lang: 'en'
      pseudo_if_not_available: True
      skeleton_gatherer: Currently ignored.
      fallback_to_english: Passed on to MessageForLanguage().

    Return:
      The document text: string items of the skeleton are output as they
      are, and each message is replaced by the content of its translation,
      with placeholders replaced by their original text and other text
      HTML-escaped.

    Raises:
      grit.exception.NotReady() if the skeleton is empty, e.g. when used
      before Parse() has been successfully called.
      grit.exception.NoSuchTranslation() if 'pseudo_if_not_available' is false
      and there is no translation for the requested language.
    '''
    if not self.skeleton_:
      raise exception.NotReady()

    # TODO(joi) Implement support for skeleton gatherers here.

    pieces = []
    for entry in self.skeleton_:
      if isinstance(entry, six.string_types):
        pieces.append(entry)
        continue
      message = entry.MessageForLanguage(lang,
                                         pseudo_if_not_available,
                                         fallback_to_english)
      # We escape " characters in text to increase the chance that
      # attributes will be properly escaped.
      pieces.extend(
        part.GetOriginal() if isinstance(part, tclib.Placeholder)
        else util.EscapeHtml(part, True)
        for part in message.GetContent())

    return ''.join(pieces)

  def Parse(self):
    '''Loads the input file and builds the skeleton from it.

    Does nothing if it has been called before.  Parsing is done in two
    phases: the document is first broken into translateable and
    nontranslateable chunks, then each translateable chunk is turned into a
    message with placeholders for any HTML elements.
    '''
    if self.have_parsed_:
      return
    self.have_parsed_ = True

    text = self._LoadInputFile()

    # Ignore the BOM character if the document starts with one.
    if text.startswith(u'\ufeff'):
      text = text[1:]

    self.text_ = text

    # The silly little [!]-prefixed header is not handled by our HTML
    # parsers, so deal with it first: only its title is translateable.
    header = _SILLY_HEADER.match(text)
    if header:
      title_start, title_end = header.span('title')
      header_end = header.end()
      self.skeleton_.append(text[:title_start])
      self.skeleton_.append(self.uberclique.MakeClique(
        tclib.Message(text=text[title_start:title_end])))
      self.skeleton_.append(text[title_end:header_end])
      text = text[header_end:]

    chunks = HtmlChunks().Parse(text, self.fold_whitespace_)

    for is_translateable, chunk_text, chunk_description in chunks:
      if is_translateable:
        self.skeleton_.append(self.uberclique.MakeClique(
          HtmlToMessage(chunk_text, description=chunk_description)))
      else:
        self.skeleton_.append(chunk_text)

    # Messages that consist solely of placeholders and whitespace have
    # nothing to translate, so turn them back into nontranslateable strings.
    for ix, entry in enumerate(self.skeleton_):
      if not isinstance(entry, clique.MessageClique):
        continue
      msg = entry.GetMessage()
      has_text = any(
        isinstance(item, six.string_types)
        and _NON_WHITESPACE.search(item) and item != '&nbsp;'
        for item in msg.GetContent())
      if not has_text:
        self.skeleton_[ix] = msg.GetRealContent()

  def SubstituteMessages(self, substituter):
    '''Applies substitutions to all messages in the tree.

    Every MessageClique in the skeleton whose message is changed by the
    substituter is replaced by a new clique for the substituted message; all
    other skeleton items are kept.

    Args:
      substituter: a grit.util.Substituter object.
    '''
    rebuilt = []
    for entry in self.skeleton_:
      replacement = entry
      if isinstance(entry, clique.MessageClique):
        original = entry.GetMessage()
        substituted = substituter.SubstituteMessage(original)
        if substituted is not original:
          replacement = self.uberclique.MakeClique(substituted)
      rebuilt.append(replacement)
    self.skeleton_ = rebuilt
744