1# Copyright (c) 2012 The Chromium Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5'''A gatherer for the TotalRecall brand of HTML templates with replaceable 6portions. We wanted to reuse extern.tclib.api.handlers.html.TCHTMLParser 7but this proved impossible due to the fact that the TotalRecall HTML templates 8are in general quite far from parseable HTML and the TCHTMLParser derives 9 10from HTMLParser.HTMLParser which requires relatively well-formed HTML. Some 11examples of "HTML" from the TotalRecall HTML templates that wouldn't be 12parseable include things like: 13 14 <a [PARAMS]>blabla</a> (not parseable because attributes are invalid) 15 16 <table><tr><td>[LOTSOFSTUFF]</tr></table> (not parseable because closing 17 </td> is in the HTML [LOTSOFSTUFF] 18 is replaced by) 19 20The other problem with using general parsers (such as TCHTMLParser) is that 21we want to make sure we output the TotalRecall template with as little changes 22as possible in terms of whitespace characters, layout etc. With any parser 23that generates a parse tree, and generates output by dumping the parse tree, 24we would always have little inconsistencies which could cause bugs (the 25TotalRecall template stuff is quite brittle and can break if e.g. a tab 26character is replaced with spaces). 27 28The solution, which may be applicable to some other HTML-like template 29languages floating around Google, is to create a parser with a simple state 30machine that keeps track of what kind of tag it's inside, and whether it's in 31a translateable section or not. Translateable sections are: 32 33a) text (including [BINGO] replaceables) inside of tags that 34 can contain translateable text (which is all tags except 35 for a few) 36 37b) text inside of an 'alt' attribute in an <image> element, or 38 the 'value' attribute of a <submit>, <button> or <text> 39 element. 
40 41The parser does not build up a parse tree but rather a "skeleton" which 42is a list of nontranslateable strings intermingled with grit.clique.MessageClique 43objects. This simplifies the parser considerably compared to a regular HTML 44parser. To output a translated document, each item in the skeleton is 45printed out, with the relevant Translation from each MessageCliques being used 46for the requested language. 47 48This implementation borrows some code, constants and ideas from 49extern.tclib.api.handlers.html.TCHTMLParser. 50''' 51 52from __future__ import print_function 53 54import re 55 56import six 57 58from grit import clique 59from grit import exception 60from grit import lazy_re 61from grit import util 62from grit import tclib 63 64from grit.gather import interface 65 66 67# HTML tags which break (separate) chunks. 68_BLOCK_TAGS = ['script', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'br', 69 'body', 'style', 'head', 'title', 'table', 'tr', 'td', 'th', 70 'ul', 'ol', 'dl', 'nl', 'li', 'div', 'object', 'center', 71 'html', 'link', 'form', 'select', 'textarea', 72 'button', 'option', 'map', 'area', 'blockquote', 'pre', 73 'meta', 'xmp', 'noscript', 'label', 'tbody', 'thead', 74 'script', 'style', 'pre', 'iframe', 'img', 'input', 'nowrap', 75 'fieldset', 'legend'] 76 77# HTML tags which may appear within a chunk. 78_INLINE_TAGS = ['b', 'i', 'u', 'tt', 'code', 'font', 'a', 'span', 'small', 79 'key', 'nobr', 'url', 'em', 's', 'sup', 'strike', 80 'strong'] 81 82# HTML tags within which linebreaks are significant. 83_PREFORMATTED_TAGS = ['textarea', 'xmp', 'pre'] 84 85# An array mapping some of the inline HTML tags to more meaningful 86# names for those tags. This will be used when generating placeholders 87# representing these tags. 
88_HTML_PLACEHOLDER_NAMES = { 'a' : 'link', 'br' : 'break', 'b' : 'bold', 89 'i' : 'italic', 'li' : 'item', 'ol' : 'ordered_list', 'p' : 'paragraph', 90 'ul' : 'unordered_list', 'img' : 'image', 'em' : 'emphasis' } 91 92# We append each of these characters in sequence to distinguish between 93# different placeholders with basically the same name (e.g. BOLD1, BOLD2). 94# Keep in mind that a placeholder name must not be a substring of any other 95# placeholder name in the same message, so we can't simply count (BOLD_1 96# would be a substring of BOLD_10). 97_SUFFIXES = '123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' 98 99# Matches whitespace in an HTML document. Also matches HTML comments, which are 100# treated as whitespace. 101_WHITESPACE = lazy_re.compile(r'(\s| |\\n|\\r|<!--\s*desc\s*=.*?-->)+', 102 re.DOTALL) 103 104# Matches whitespace sequences which can be folded into a single whitespace 105# character. This matches single characters so that non-spaces are replaced 106# with spaces. 107_FOLD_WHITESPACE = lazy_re.compile(r'\s+') 108 109# Finds a non-whitespace character 110_NON_WHITESPACE = lazy_re.compile(r'\S') 111 112# Matches two or more in a row (a single   is not changed into 113# placeholders because different languages require different numbers of spaces 114# and placeholders must match exactly; more than one is probably a "special" 115# whitespace sequence and should be turned into a placeholder). 
116_NBSP = lazy_re.compile(r' ( )+') 117 118# Matches nontranslateable chunks of the document 119_NONTRANSLATEABLES = lazy_re.compile(r''' 120 <\s*script.+?<\s*/\s*script\s*> 121 | 122 <\s*style.+?<\s*/\s*style\s*> 123 | 124 <!--.+?--> 125 | 126 <\?IMPORT\s.+?> # import tag 127 | 128 <\s*[a-zA-Z_]+:.+?> # custom tag (open) 129 | 130 <\s*/\s*[a-zA-Z_]+:.+?> # custom tag (close) 131 | 132 <!\s*[A-Z]+\s*([^>]+|"[^"]+"|'[^']+')*?> 133 ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE) 134 135# Matches a tag and its attributes 136_ELEMENT = lazy_re.compile(r''' 137 # Optional closing /, element name 138 <\s*(?P<closing>/)?\s*(?P<element>[a-zA-Z0-9]+)\s* 139 # Attributes and/or replaceables inside the tag, if any 140 (?P<atts>( 141 \s*([a-zA-Z_][-:.a-zA-Z_0-9]*) # Attribute name 142 (\s*=\s*(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))? 143 | 144 \s*\[(\$?\~)?([A-Z0-9-_]+?)(\~\$?)?\] 145 )*) 146 \s*(?P<empty>/)?\s*> # Optional empty-tag closing /, and tag close 147 ''', 148 re.MULTILINE | re.DOTALL | re.VERBOSE) 149 150# Matches elements that may have translateable attributes. The value of these 151# special attributes is given by group 'value1' or 'value2'. Note that this 152# regexp demands that the attribute value be quoted; this is necessary because 153# the non-tree-building nature of the parser means we don't know when we're 154# writing out attributes, so we wouldn't know to escape spaces. 155_SPECIAL_ELEMENT = lazy_re.compile(r''' 156 <\s*( 157 input[^>]+?value\s*=\s*(\'(?P<value3>[^\']*)\'|"(?P<value4>[^"]*)") 158 [^>]+type\s*=\s*"?'?(button|reset|text|submit)'?"? 159 | 160 ( 161 table[^>]+?title\s*= 162 | 163 img[^>]+?alt\s*= 164 | 165 input[^>]+?type\s*=\s*"?'?(button|reset|text|submit)'?"?[^>]+?value\s*= 166 ) 167 \s*(\'(?P<value1>[^\']*)\'|"(?P<value2>[^"]*)") 168 )[^>]*?> 169 ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE) 170 171# Matches stuff that is translateable if it occurs in the right context 172# (between tags). 
# Matches stuff that is translateable if it occurs in the right context
# (between tags).  This includes all characters and character entities.
# Note that this also matches &nbsp;, which needs to be handled as whitespace
# before this regexp is applied.
_CHARACTERS = lazy_re.compile(r'''
  (
  \w
  |
  [\!\@\#\$\%\^\*\(\)\-\=\_\+\[\]\{\}\\\|\;\:\'\"\,\.\/\?\`\~]
  |
  &(\#[0-9]+|\#x[0-9a-fA-F]+|[A-Za-z0-9]+);
  )+
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE)

# Matches Total Recall's "replaceable" tags, which are just any text
# in capitals enclosed by delimiters like [] or [~~] or [$~~$] (e.g. [HELLO],
# [~HELLO~] and [$~HELLO~$]).
_REPLACEABLE = lazy_re.compile(r'\[(\$?\~)?(?P<name>[A-Z0-9-_]+?)(\~\$?)?\]',
                               re.MULTILINE)


# Matches the silly [!]-prefixed "header" that is used in some TotalRecall
# templates.
_SILLY_HEADER = lazy_re.compile(r'\[!\]\ntitle\t(?P<title>[^\n]+?)\n.+?\n\n',
                                re.MULTILINE | re.DOTALL)


# Matches a comment that provides a description for the message it occurs in.
_DESCRIPTION_COMMENT = lazy_re.compile(
  r'<!--\s*desc\s*=\s*(?P<description>.+?)\s*-->', re.DOTALL)

# Matches a comment which is used to break apart multiple messages.
_MESSAGE_BREAK_COMMENT = lazy_re.compile(r'<!--\s*message-break\s*-->',
                                         re.DOTALL)

# Matches a comment which is used to prevent block tags from splitting a message
# Compiled with lazy_re for consistency with every other pattern in this file
# (previously this one alone used re.compile, compiling eagerly at import).
_MESSAGE_NO_BREAK_COMMENT = lazy_re.compile(r'<!--\s*message-no-break\s*-->',
                                            re.DOTALL)


# Set to 1 to enable debug tracing of the chunking parser.
_DEBUG = 0
def _DebugPrint(text):
  # Debug-only helper.  NOTE(review): on Python 3 this prints the b'...' repr
  # of the encoded bytes; kept as-is because it only runs when _DEBUG is set.
  if _DEBUG:
    print(text.encode('utf-8'))
class HtmlChunks(object):
  '''A parser that knows how to break an HTML-like document into a list of
  chunks, where each chunk is either translateable or non-translateable.
  The chunks are unmodified sections of the original document, so concatenating
  the text of all chunks would result in the original document.'''

  def InTranslateable(self):
    # We are inside a translateable chunk iff we have seen a for-sure
    # translateable character since the last chunk boundary.
    return self.last_translateable != -1

  def Rest(self):
    # The not-yet-consumed tail of the document.
    return self.text_[self.current:]

  def StartTranslateable(self):
    '''Closes the pending nontranslateable chunk (if any) and marks the
    current position as the start of a translateable chunk.'''
    assert not self.InTranslateable()
    if self.current != 0:
      # Append a nontranslateable chunk
      chunk_text = self.text_[self.chunk_start : self.last_nontranslateable + 1]
      # Needed in the case where document starts with a translateable.
      if len(chunk_text) > 0:
        self.AddChunk(False, chunk_text)
      self.chunk_start = self.last_nontranslateable + 1
    self.last_translateable = self.current
    self.last_nontranslateable = -1

  def EndTranslateable(self):
    '''Closes the current translateable chunk and switches the parser into
    nontranslateable mode.  Trailing neutral text (whitespace) between
    last_translateable and current is left for the next chunk.'''
    assert self.InTranslateable()
    # Append a translateable chunk
    self.AddChunk(True,
                  self.text_[self.chunk_start : self.last_translateable + 1])
    self.chunk_start = self.last_translateable + 1
    self.last_translateable = -1
    self.last_nontranslateable = self.current

  def AdvancePast(self, match):
    # 'match' was made against Rest(), so its end() is relative to current.
    self.current += match.end()

  def AddChunk(self, translateable, text):
    '''Adds a chunk to self, removing linebreaks and duplicate whitespace
    if appropriate.  Also extracts an embedded <!-- desc=... --> description
    (stored for the next translateable chunk) and strips message-break
    comments from the output text.
    '''
    m = _DESCRIPTION_COMMENT.search(text)
    if m:
      self.last_description = m.group('description')
      # Remove the description from the output text
      text = _DESCRIPTION_COMMENT.sub('', text)

    m = _MESSAGE_BREAK_COMMENT.search(text)
    if m:
      # Remove the comment from the output text.  It should already effectively
      # break apart messages.
      text = _MESSAGE_BREAK_COMMENT.sub('', text)

    # Linebreaks are only significant inside preformatted tags; everywhere
    # else they can be normalized away in translateable text.
    if translateable and not self.last_element_ in _PREFORMATTED_TAGS:
      if self.fold_whitespace_:
        # Fold whitespace sequences if appropriate.  This is optional because it
        # alters the output strings.
        text = _FOLD_WHITESPACE.sub(' ', text)
      else:
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        # This whitespace folding doesn't work in all cases, thus the
        # fold_whitespace flag to support backwards compatibility.
        text = text.replace('   ', ' ')
        text = text.replace('  ', ' ')

    if translateable:
      # Attach (and consume) the most recently seen description.
      description = self.last_description
      self.last_description = ''
    else:
      description = ''

    if text != '':
      self.chunks_.append((translateable, text, description))

  def Parse(self, text, fold_whitespace):
    '''Parses self.text_ into an intermediate format stored in self.chunks_
    which is translateable and nontranslateable chunks.  Also returns
    self.chunks_

    Args:
      text: The HTML for parsing.
      fold_whitespace: Whether whitespace sequences should be folded into a
        single space.

    Return:
      [chunk1, chunk2, chunk3, ...] where each chunk is a tuple
      (is_translateable, text, description)
    '''
    #
    # Chunker state
    #

    self.text_ = text
    self.fold_whitespace_ = fold_whitespace

    # A list of tuples (is_translateable, text) which represents the document
    # after chunking.
    self.chunks_ = []

    # Start index of the last chunk, whether translateable or not
    self.chunk_start = 0

    # Index of the last for-sure translateable character if we are parsing
    # a translateable chunk, -1 to indicate we are not in a translateable chunk.
    # This is needed so that we don't include trailing whitespace in the
    # translateable chunk (whitespace is neutral).
    self.last_translateable = -1

    # Index of the last for-sure nontranslateable character if we are parsing
    # a nontranslateable chunk, -1 if we are not in a nontranslateable chunk.
    # This is needed to make sure we can group e.g. "<b>Hello</b> there"
    # together instead of just "Hello</b> there" which would be much worse
    # for translation.
    self.last_nontranslateable = -1

    # Index of the character we're currently looking at.
    self.current = 0

    # The name of the last block element parsed.
    self.last_element_ = ''

    # The last explicit description we found.
    self.last_description = ''

    # Whether no-break was the last chunk seen
    self.last_nobreak = False

    while self.current < len(self.text_):
      _DebugPrint('REST: %s' % self.text_[self.current:self.current+60])

      # A message-no-break comment suppresses the *next* block-tag message
      # split (see the _ELEMENT branch below).
      m = _MESSAGE_NO_BREAK_COMMENT.match(self.Rest())
      if m:
        self.AdvancePast(m)
        self.last_nobreak = True
        continue

      # Try to match whitespace
      m = _WHITESPACE.match(self.Rest())
      if m:
        # Whitespace is neutral, it just advances 'current' and does not switch
        # between translateable/nontranslateable.  If we are in a
        # nontranslateable section that extends to the current point, we extend
        # it to include the whitespace.  If we are in a translateable section,
        # we do not extend it until we find
        # more translateable parts, because we never want a translateable chunk
        # to end with whitespace.
        if (not self.InTranslateable() and
            self.last_nontranslateable == self.current - 1):
          self.last_nontranslateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Then we try to match nontranslateables
      m = _NONTRANSLATEABLES.match(self.Rest())
      if m:
        if self.InTranslateable():
          self.EndTranslateable()
        self.last_nontranslateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Now match all other HTML element tags (opening, closing, or empty, we
      # don't care).
      m = _ELEMENT.match(self.Rest())
      if m:
        element_name = m.group('element').lower()
        if element_name in _BLOCK_TAGS:
          self.last_element_ = element_name
          if self.InTranslateable():
            if self.last_nobreak:
              # A preceding message-no-break comment keeps this block tag
              # from ending the current message.
              self.last_nobreak = False
            else:
              self.EndTranslateable()

          # Check for "special" elements, i.e. ones that have a translateable
          # attribute, and handle them correctly.  Note that all of the
          # "special" elements are block tags, so no need to check for this
          # if the tag is not a block tag.
          sm = _SPECIAL_ELEMENT.match(self.Rest())
          if sm:
            # Get the appropriate group name (the single valueN group that
            # actually matched; 'group' keeps its value after the loop).
            for group in sm.groupdict():
              if sm.groupdict()[group]:
                break

            # First make a nontranslateable chunk up to and including the
            # quote before the translateable attribute value
            self.AddChunk(False, self.text_[
              self.chunk_start : self.current + sm.start(group)])
            # Then a translateable for the translateable bit
            self.AddChunk(True, self.Rest()[sm.start(group) : sm.end(group)])
            # Finally correct the data invariant for the parser
            self.chunk_start = self.current + sm.end(group)

          self.last_nontranslateable = self.current + m.end() - 1
        elif self.InTranslateable():
          # We're in a translateable and the tag is an inline tag, so we
          # need to include it in the translateable.
          self.last_translateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Anything else we find must be translateable, so we advance one character
      # at a time until one of the above matches.
      if not self.InTranslateable():
        self.StartTranslateable()
      else:
        self.last_translateable = self.current
      self.current += 1

    # Close the final chunk
    if self.InTranslateable():
      self.AddChunk(True, self.text_[self.chunk_start : ])
    else:
      self.AddChunk(False, self.text_[self.chunk_start : ])

    return self.chunks_
def HtmlToMessage(html, include_block_tags=False, description=''):
  '''Takes a bit of HTML, which must contain only "inline" HTML elements,
  and changes it into a tclib.Message.  This involves escaping any entities and
  replacing any HTML code with placeholders.

  If include_block_tags is true, no error will be given if block tags (e.g.
  <p> or <br>) are included in the HTML.

  Args:
    html: 'Hello <b>[USERNAME]</b>, how <i>are</i> you?'
    include_block_tags: False

  Return:
    tclib.Message('Hello BEGIN_BOLDX_USERNAME_XEND_BOLD, '
                  'how BEGIN_ITALICareEND_ITALIC you?',
                  [ Placeholder('BEGIN_BOLD', '<b>', ''),
                    Placeholder('X_USERNAME_X', '[USERNAME]', ''),
                    Placeholder('END_BOLD', '</b>', ''),
                    Placeholder('BEGIN_ITALIC', '<i>', ''),
                    Placeholder('END_ITALIC', '</i>', ''), ])

  Raises:
    exception.BlockTagInTranslateableChunk: if a block tag occurs and
      include_block_tags is false (unless preceded by a
      <!-- message-no-break --> comment).
  '''
  # Approach is:
  # - first placeholderize, finding <elements>, [REPLACEABLES] and &nbsp;
  # - then escape all character entities in text in-between placeholders

  parts = []  # List of strings (for text chunks) and tuples (ID, original)
              # for placeholders

  count_names = {}  # Map of base names to number of times used
  end_names = {}  # Map of base names to stack of end tags (for correct nesting)

  def MakeNameClosure(base, type = ''):
    '''Returns a closure that can be called once all names have been allocated
    to return the final name of the placeholder.  This allows us to minimally
    number placeholders for non-overlap.

    Also ensures that END_XXX_Y placeholders have the same Y as the
    corresponding BEGIN_XXX_Y placeholder when we have nested tags of the same
    type.

    Args:
      base: 'phname'
      type: '' | 'begin' | 'end'

    Return:
      Closure()
    '''
    # NOTE: 'type' intentionally shadows the builtin here; renaming it would
    # touch every use in the nested closure, so it is left as-is.
    name = base.upper()
    if type != '':
      name = ('%s_%s' % (type, base)).upper()

    count_names.setdefault(name, 0)
    count_names[name] += 1

    # Default arguments bind 'name' and the current count at definition time,
    # which is what makes deferred final-name resolution work.
    def MakeFinalName(name_ = name, index = count_names[name] - 1):
      if type.lower() == 'end' and end_names.get(base):
        return end_names[base].pop(-1)  # For correct nesting
      if count_names[name_] != 1:
        name_ = '%s_%s' % (name_, _SUFFIXES[index])
        # We need to use a stack to ensure that the end-tag suffixes match
        # the begin-tag suffixes.  Only needed when more than one tag of the
        # same type.
        if type == 'begin':
          end_name = ('END_%s_%s' % (base, _SUFFIXES[index])).upper()
          if base in end_names:
            end_names[base].append(end_name)
          else:
            end_names[base] = [end_name]

      return name_

    return MakeFinalName

  current = 0
  last_nobreak = False

  while current < len(html):
    # A message-no-break comment makes the next block tag acceptable even
    # when include_block_tags is false.
    m = _MESSAGE_NO_BREAK_COMMENT.match(html[current:])
    if m:
      last_nobreak = True
      current += m.end()
      continue

    # Runs of two or more &nbsp; become a single SPACE placeholder.
    m = _NBSP.match(html[current:])
    if m:
      parts.append((MakeNameClosure('SPACE'), m.group()))
      current += m.end()
      continue

    m = _REPLACEABLE.match(html[current:])
    if m:
      # Replaceables allow - but placeholders don't, so replace - with _
      ph_name = MakeNameClosure('X_%s_X' % m.group('name').replace('-', '_'))
      parts.append((ph_name, m.group()))
      current += m.end()
      continue

    m = _SPECIAL_ELEMENT.match(html[current:])
    if m:
      if not include_block_tags:
        if last_nobreak:
          last_nobreak = False
        else:
          raise exception.BlockTagInTranslateableChunk(html)
      element_name = 'block'  # for simplification
      # Get the appropriate group name ('group' keeps its value after the
      # loop).  NOTE(review): assumes exactly one valueN group matched
      # non-empty; an empty attribute value would leave 'group' arbitrary.
      for group in m.groupdict():
        if m.groupdict()[group]:
          break
      # Tag text before the value, the translateable value itself, and the
      # tag text after the value become begin-placeholder / text / end-
      # placeholder respectively.
      parts.append((MakeNameClosure(element_name, 'begin'),
                    html[current : current + m.start(group)]))
      parts.append(m.group(group))
      parts.append((MakeNameClosure(element_name, 'end'),
                    html[current + m.end(group) : current + m.end()]))
      current += m.end()
      continue

    m = _ELEMENT.match(html[current:])
    if m:
      element_name = m.group('element').lower()
      if not include_block_tags and not element_name in _INLINE_TAGS:
        if last_nobreak:
          last_nobreak = False
        else:
          raise exception.BlockTagInTranslateableChunk(html[current:])
      if element_name in _HTML_PLACEHOLDER_NAMES:  # use meaningful names
        element_name = _HTML_PLACEHOLDER_NAMES[element_name]

      # Make a name for the placeholder
      type = ''
      if not m.group('empty'):
        if m.group('closing'):
          type = 'end'
        else:
          type = 'begin'
      parts.append((MakeNameClosure(element_name, type), m.group()))
      current += m.end()
      continue

    # Plain text: accumulate one character at a time onto the last text part.
    if len(parts) and isinstance(parts[-1], six.string_types):
      parts[-1] += html[current]
    else:
      parts.append(html[current])
    current += 1

  # Resolve the deferred placeholder names now that all counts are known,
  # and assemble the message text.
  msg_text = ''
  placeholders = []
  for part in parts:
    if isinstance(part, tuple):
      final_name = part[0]()
      original = part[1]
      msg_text += final_name
      placeholders.append(tclib.Placeholder(final_name, original, '(HTML code)'))
    else:
      msg_text += part

  msg = tclib.Message(text=msg_text, placeholders=placeholders,
                      description=description)
  content = msg.GetContent()
  # Unescape entities in the plain-text pieces (but keep &nbsp; as-is).
  for ix in range(len(content)):
    if isinstance(content[ix], six.string_types):
      content[ix] = util.UnescapeHtml(content[ix], replace_nbsp=False)

  return msg
class TrHtml(interface.GathererBase):
  '''Represents a document or message in the template format used by
  Total Recall for HTML documents.'''

  def __init__(self, *args, **kwargs):
    super(TrHtml, self).__init__(*args, **kwargs)
    self.have_parsed_ = False
    self.skeleton_ = []  # list of strings and MessageClique objects
    self.fold_whitespace_ = False  # set via SetAttributes()

  def SetAttributes(self, attrs):
    '''Sets node attributes used by the gatherer.

    This checks the fold_whitespace attribute.

    Args:
      attrs: The mapping of node attributes.
    '''
    self.fold_whitespace_ = ('fold_whitespace' in attrs and
                             attrs['fold_whitespace'] == 'true')

  def GetText(self):
    '''Returns the original text of the HTML document'''
    return self.text_

  def GetTextualIds(self):
    # Single textual ID: the external key set on the gatherer base.
    return [self.extkey]

  def GetCliques(self):
    '''Returns the message cliques for each translateable message in the
    document.'''
    return [x for x in self.skeleton_ if isinstance(x, clique.MessageClique)]

  def Translate(self, lang, pseudo_if_not_available=True,
                skeleton_gatherer=None, fallback_to_english=False):
    '''Returns this document with translateable messages filled with
    the translation for language 'lang'.

    Args:
      lang: 'en'
      pseudo_if_not_available: True

    Return:
      'ID_THIS_SECTION TYPE\n...BEGIN\n  "Translated message"\n......\nEND

    Raises:
      grit.exception.NotReady() if used before Parse() has been successfully
      called.
      grit.exception.NoSuchTranslation() if 'pseudo_if_not_available' is false
      and there is no translation for the requested language.
    '''
    if len(self.skeleton_) == 0:
      raise exception.NotReady()

    # TODO(joi) Implement support for skeleton gatherers here.

    out = []
    for item in self.skeleton_:
      if isinstance(item, six.string_types):
        # Nontranslateable text passes through unmodified.
        out.append(item)
      else:
        msg = item.MessageForLanguage(lang,
                                      pseudo_if_not_available,
                                      fallback_to_english)
        for content in msg.GetContent():
          if isinstance(content, tclib.Placeholder):
            out.append(content.GetOriginal())
          else:
            # We escape " characters to increase the chance that attributes
            # will be properly escaped.
            out.append(util.EscapeHtml(content, True))

    return ''.join(out)

  def Parse(self):
    # Idempotent: parsing happens at most once per instance.
    if self.have_parsed_:
      return
    self.have_parsed_ = True

    text = self._LoadInputFile()

    # Ignore the BOM character if the document starts with one.
    if text.startswith(u'\ufeff'):
      text = text[1:]

    self.text_ = text

    # Parsing is done in two phases: First, we break the document into
    # translateable and nontranslateable chunks.  Second, we run through each
    # translateable chunk and insert placeholders for any HTML elements,
    # unescape escaped characters, etc.

    # First handle the silly little [!]-prefixed header because it's not
    # handled by our HTML parsers.
    m = _SILLY_HEADER.match(text)
    if m:
      self.skeleton_.append(text[:m.start('title')])
      self.skeleton_.append(self.uberclique.MakeClique(
        tclib.Message(text=text[m.start('title'):m.end('title')])))
      self.skeleton_.append(text[m.end('title') : m.end()])
      text = text[m.end():]

    chunks = HtmlChunks().Parse(text, self.fold_whitespace_)

    # chunk is (is_translateable, text, description).
    for chunk in chunks:
      if chunk[0]:  # Chunk is translateable
        self.skeleton_.append(self.uberclique.MakeClique(
          HtmlToMessage(chunk[1], description=chunk[2])))
      else:
        self.skeleton_.append(chunk[1])

    # Go through the skeleton and change any messages that consist solely of
    # placeholders and whitespace into nontranslateable strings.
    for ix in range(len(self.skeleton_)):
      got_text = False
      if isinstance(self.skeleton_[ix], clique.MessageClique):
        msg = self.skeleton_[ix].GetMessage()
        for item in msg.GetContent():
          if (isinstance(item, six.string_types)
              and _NON_WHITESPACE.search(item) and item != '&nbsp;'):
            got_text = True
            break
        if not got_text:
          self.skeleton_[ix] = msg.GetRealContent()

  def SubstituteMessages(self, substituter):
    '''Applies substitutions to all messages in the tree.

    Goes through the skeleton and finds all MessageCliques.

    Args:
      substituter: a grit.util.Substituter object.
    '''
    new_skel = []
    for chunk in self.skeleton_:
      if isinstance(chunk, clique.MessageClique):
        old_message = chunk.GetMessage()
        new_message = substituter.SubstituteMessage(old_message)
        if new_message is not old_message:
          # Substitution produced a new message; wrap it in a fresh clique.
          new_skel.append(self.uberclique.MakeClique(new_message))
          continue
      new_skel.append(chunk)
    self.skeleton_ = new_skel