#
# Copyright 2004-2006,2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
#

"""module for parsing html files for translation"""

import html.parser
import re
from html.entities import html5

from translate.storage import base
from translate.storage.base import ParseError


# Override the piclose tag from simple > to ?> otherwise we consume HTML
# within the processing instructions
html.parser.piclose = re.compile(r"\?>")


class htmlunit(base.TranslationUnit):
    """A unit of translatable/localisable HTML content"""

    def __init__(self, source=None):
        """Create the unit from *source* and start with no locations."""
        super().__init__(source)
        self.locations = []

    def addlocation(self, location):
        """Record *location* as one more place this unit was found."""
        self.locations.append(location)

    def getlocations(self):
        """Return the list of locations recorded with addlocation()."""
        return self.locations


class htmlfile(html.parser.HTMLParser, base.TranslationStore):
    """Translation store that extracts translation units from HTML.

    The document is parsed with :class:`html.parser.HTMLParser`. While parsing,
    source units are added to the store and the document is rebuilt in
    ``self.filesrc``, with every extracted string passed through
    ``self.callback`` (the identity function unless a callback is supplied).
    """

    UnitClass = htmlunit

    TRANSLATABLE_ELEMENTS = [
        "address",
        "article",
        "aside",
        "blockquote",
        "caption",
        "dd",
        "dt",
        "div",
        "figcaption",
        "footer",
        "header",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "li",
        "main",
        "nav",
        "option",
        "p",
        "pre",
        "section",
        "td",
        "th",
        "title",
    ]
    """These HTML elements (tags) will be extracted as translation units, unless
    they lack translatable text content.
    In case one translatable element is embedded in another, the outer translation
    unit will be split into the parts before and after the inner translation unit."""

    TRANSLATABLE_ATTRIBUTES = [
        "abbr",  # abbreviation for a table header cell
        "alt",
        "lang",  # only for the html element -- see extract_translatable_attributes()
        "summary",
        "title",  # tooltip text for an element
        "value",
    ]
    """Text from these HTML attributes will be extracted as translation units.
    Note: the content attribute of meta tags is a special case."""

    TRANSLATABLE_METADATA = ["description", "keywords"]
    """Document metadata from meta elements with these names will be extracted as translation units.
    Reference `<https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name>`_"""

    EMPTY_HTML_ELEMENTS = [
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    ]
    """An empty element is an element that cannot have any child nodes (i.e., nested
    elements or text nodes). In HTML, using a closing tag on an empty element is
    usually invalid.
    Reference `<https://developer.mozilla.org/en-US/docs/Glossary/Empty_element>`_"""

    WHITESPACE_RE = re.compile(r"\s+")

    LEADING_WHITESPACE_RE = re.compile(r"^(\s+)")

    TRAILING_WHITESPACE_RE = re.compile(r"(\s+)$")

    ENCODING_RE = re.compile(
        br"""<meta.*
                                content.*=.*?charset.*?=\s*?
                                ([^\s]*)
                                \s*?["']\s*?>
                             """,
        re.VERBOSE | re.IGNORECASE,
    )

    def __init__(self, inputfile=None, callback=None):
        """Initialise the store and, if *inputfile* is given, parse it.

        :param inputfile: optional object providing ``read()`` and ``close()``.
            Its ``name`` attribute, when present, is used as the file name in
            unit locations. The file is closed after reading.
        :param callback: optional callable applied to every extracted string;
            its return value is what gets written to ``self.filesrc``.
            Defaults to the identity function.
        """
        # convert_charrefs=False so that character and entity references are
        # delivered through handle_charref() / handle_entityref().
        super().__init__(convert_charrefs=False)
        base.TranslationStore.__init__(self)

        # store parameters
        self.filename = getattr(inputfile, "name", None)
        self.callback = self._simple_callback if callback is None else callback

        # initialize state
        self.filesrc = ""
        self.tag_path = []
        self.tu_content = []
        self.tu_location = None

        # parse
        if inputfile is not None:
            # close the file even when reading it fails
            try:
                htmlsrc = inputfile.read()
            finally:
                inputfile.close()
            self.parse(htmlsrc)

    def _simple_callback(self, string):
        """Default callback: return *string* unchanged."""
        return string

    def guess_encoding(self, htmlsrc):
        """Returns the encoding of the html text.

        We look for 'charset=' within a meta tag to do this.
        If no charset declaration is found, ``self.encoding`` is left untouched.
        """
        result = self.ENCODING_RE.findall(htmlsrc)
        if result:
            self.encoding = result[0].decode("ascii")
        return self.encoding

    def do_encoding(self, htmlsrc):
        """Return the html text properly encoded based on a charset."""
        self.guess_encoding(htmlsrc)
        return htmlsrc.decode(self.encoding)

    def parse(self, htmlsrc):
        """Decode *htmlsrc* if necessary and feed it to the HTML parser.

        Text that is already a ``str`` is fed as is; anything else is decoded
        with do_encoding() first.
        """
        if not isinstance(htmlsrc, str):
            htmlsrc = self.do_encoding(htmlsrc)
        self.feed(htmlsrc)

    def begin_translation_unit(self):
        """Flush the queued markup and start queueing a new translation unit."""
        # at the start of a translation unit:
        # this interrupts any translation unit in progress, so process the queue
        # and prepare for the new.
        self.emit_translation_unit()
        self.tu_content = []
        line, column = self.getpos()
        self.tu_location = "%s+%s:%d-%d" % (
            self.filename,
            ".".join(self.tag_path),
            line,
            column + 1,
        )

    def end_translation_unit(self):
        """Flush the queued markup and leave translation-unit mode."""
        # at the end of a translation unit:
        # process the queue and reset state.
        self.emit_translation_unit()
        self.tu_content = []
        self.tu_location = None

    def append_markup(self, markup):
        """Queue *markup* when inside a translation unit, else output it now."""
        # if within a translation unit: add to the queue to be processed later.
        # otherwise handle immediately.
        if self.tu_location:
            self.tu_content.append(markup)
        else:
            self.emit_attribute_translation_units(markup)
            self.filesrc += markup["html_content"]

    def emit_translation_unit(self):
        """Turn the queued markup into at most one unit and append it to filesrc.

        Markup before and after the captured interval is written out as is
        (minus comments); comments anywhere in the queue become a note on the
        unit. If the queue holds no translatable content, everything is written
        out unchanged and no unit is created from the content.
        """
        # scan through the queue:
        # - find the first and last translatable markup elements: the captured
        #   interval [start, end)
        # - match start and end tags
        start = 0
        end = 0
        tagstack = []
        tagmap = {}
        tag = None
        do_normalize = True
        for pos, content in enumerate(self.tu_content):
            if content["type"] != "endtag" and tag in self.EMPTY_HTML_ELEMENTS:
                # the previous start tag was an empty element that will never
                # get an end tag: drop it from the stack of open tags
                tagstack.pop()
                tag = None

            if self.has_translatable_content(content):
                if end == 0:
                    start = pos
                end = pos + 1
            elif content["type"] == "starttag":
                tagstack.append(pos)
                tag = content["tag"]
                if tag == "pre":
                    # whitespace of the whole unit is kept once a <pre> is seen
                    do_normalize = False
            elif content["type"] == "endtag":
                if tagstack:
                    match = tagstack.pop()
                    tagmap[match] = pos
                    tagmap[pos] = match
                tag = None

        # if no translatable content found: process all the content in the queue
        # as if the translation unit didn't exist.
        if end == 0:
            for markup in self.tu_content:
                self.emit_attribute_translation_units(markup)
                self.filesrc += markup["html_content"]
            return

        # scan the start and end tags captured between translatable content;
        # extend the captured interval to include the matching tags
        for pos in range(start + 1, end - 1):
            if (
                self.tu_content[pos]["type"] == "starttag"
                or self.tu_content[pos]["type"] == "endtag"
            ) and pos in tagmap:
                match = tagmap[pos]
                start = min(start, match)
                end = max(end, match + 1)

        # emit leading uncaptured markup elements
        for markup in self.tu_content[0:start]:
            if markup["type"] != "comment":
                self.emit_attribute_translation_units(markup)
                self.filesrc += markup["html_content"]

        # emit captured markup elements
        if start < end:
            html_content = []
            for markup in self.tu_content[start:end]:
                if markup["type"] != "comment":
                    # the unit source uses the markup with untranslated attributes
                    if "untranslated_html" in markup:
                        html_content.append(markup["untranslated_html"])
                    else:
                        html_content.append(markup["html_content"])
            html_content = "".join(html_content)
            if do_normalize:
                normalized_content = self.WHITESPACE_RE.sub(" ", html_content.strip())
            else:
                normalized_content = html_content.strip()
            assert normalized_content  # shouldn't be here otherwise

            unit = self.addsourceunit(normalized_content)
            unit.addlocation(self.tu_location)
            comments = [
                markup["note"]
                for markup in self.tu_content
                if markup["type"] == "comment"
            ]
            if comments:
                unit.addnote("\n".join(comments))

            # keep the original surrounding whitespace around the callback result
            html_content = (
                self.get_leading_whitespace(html_content)
                + self.callback(normalized_content)
                + self.get_trailing_whitespace(html_content)
            )
            self.filesrc += html_content

        # emit trailing uncaptured markup elements
        for markup in self.tu_content[end : len(self.tu_content)]:
            if markup["type"] != "comment":
                self.emit_attribute_translation_units(markup)
                self.filesrc += markup["html_content"]

    @staticmethod
    def has_translatable_content(markup):
        """Return a truthy value if *markup* is non-blank data or a PI."""
        # processing instructions count as translatable content, because PHP
        return markup["type"] in {"data", "pi"} and markup["html_content"].strip()

    def extract_translatable_attributes(self, tag, attrs):
        """Return the attribute translation units found on one tag.

        :param tag: the element name.
        :param attrs: list of ``(name, value)`` pairs as passed by HTMLParser.
        :return: list of dicts with ``html_content`` and ``location`` keys.
        """
        result = []
        if tag == "meta":
            tu = self.create_metadata_attribute_tu(attrs)
            if tu:
                result.append(tu)
        else:
            for attrname, attrvalue in attrs:
                if (
                    attrname in self.TRANSLATABLE_ATTRIBUTES
                    and self.translatable_attribute_matches_tag(attrname, tag)
                ):
                    tu = self.create_attribute_tu(attrname, attrvalue)
                    if tu:
                        result.append(tu)
        return result

    def create_metadata_attribute_tu(self, attrs):
        """Return a unit for the ``content`` of a translatable meta element.

        Returns ``None`` unless the ``name`` attribute (compared in lower case)
        is listed in TRANSLATABLE_METADATA and a ``content`` attribute exists.
        """
        attrs_dict = dict(attrs)
        # a valueless ``name`` attribute is reported as None by the parser
        name = attrs_dict.get("name")
        name = name.lower() if name else None
        if name in self.TRANSLATABLE_METADATA and "content" in attrs_dict:
            return self.create_attribute_tu("content", attrs_dict["content"])
        return None

    def translatable_attribute_matches_tag(self, attrname, tag):
        """Return whether *attrname* is translatable on element *tag*."""
        if attrname == "lang":
            return tag == "html"
        return True

    def create_attribute_tu(self, attrname, attrvalue):
        """Return a unit dict for one attribute, or ``None`` if it has no text.

        :param attrname: attribute name, used to build the location.
        :param attrvalue: attribute value; ``None`` for a valueless attribute.
        """
        # valueless attributes (e.g. <img alt>) have nothing to translate
        if not attrvalue:
            return None
        normalized_value = self.WHITESPACE_RE.sub(" ", attrvalue).strip()
        if not normalized_value:
            return None
        line, column = self.getpos()
        return {
            "html_content": normalized_value,
            "location": "%s+%s:%d-%d"
            % (
                self.filename,
                ".".join(self.tag_path) + "[" + attrname + "]",
                line,
                column + 1,
            ),
        }

    def emit_attribute_translation_units(self, markup):
        """Add a source unit for every attribute unit attached to *markup*."""
        for tu in markup.get("attribute_tus", ()):
            unit = self.addsourceunit(tu["html_content"])
            unit.addlocation(tu["location"])

    def translate_attributes(self, attrs):
        """Return *attrs* with values replaced where the callback changes them.

        Each non-empty value is whitespace-normalized and passed to
        ``self.callback``; the original value is kept when the callback returns
        the normalized value unchanged.
        """
        result = []
        for attrname, attrvalue in attrs:
            if attrvalue:
                normalized_value = self.WHITESPACE_RE.sub(" ", attrvalue).strip()
                translated_value = self.callback(normalized_value)
                if translated_value != normalized_value:
                    result.append((attrname, translated_value))
                    continue
            result.append((attrname, attrvalue))
        return result

    def create_start_tag(self, tag, attrs=None, startend=False):
        """Build the text of a start tag.

        :param tag: the element name.
        :param attrs: iterable of ``(name, value)`` pairs; a value of ``None``
            produces a bare attribute name. ``None`` means no attributes.
        :param startend: when true, the tag is closed with `` />``.
        """
        attr_strings = []
        for attrname, attrvalue in attrs or ():
            if attrvalue is None:
                attr_strings.append(" " + attrname)
            else:
                attr_strings.append(f' {attrname}="{attrvalue}"')
        return "<{}{}{}>".format(tag, "".join(attr_strings), " /" if startend else "")

    def auto_close_empty_element(self):
        """Pop an empty element (e.g. ``br``) left on top of the tag path."""
        if self.tag_path and self.tag_path[-1] in self.EMPTY_HTML_ELEMENTS:
            self.tag_path.pop()

    def get_leading_whitespace(self, str):
        """Return the whitespace at the start of *str* ("" if there is none)."""
        match = self.LEADING_WHITESPACE_RE.search(str)
        return match.group(1) if match else ""

    def get_trailing_whitespace(self, str):
        """Return the whitespace at the end of *str* ("" if there is none)."""
        match = self.TRAILING_WHITESPACE_RE.search(str)
        return match.group(1) if match else ""

    # From here on below, follows the methods of the HTMLParser

    def handle_starttag(self, tag, attrs):
        """Record a start tag; begins a translation unit for translatable tags."""
        self.auto_close_empty_element()
        self.tag_path.append(tag)

        if tag in self.TRANSLATABLE_ELEMENTS:
            self.begin_translation_unit()

        translated_attrs = self.translate_attributes(attrs)
        markup = {
            "type": "starttag",
            "tag": tag,
            "html_content": self.create_start_tag(tag, translated_attrs),
            "untranslated_html": self.create_start_tag(tag, attrs),
            "attribute_tus": self.extract_translatable_attributes(tag, attrs),
        }
        self.append_markup(markup)

    def handle_endtag(self, tag):
        """Record an end tag and close the matching entry of the tag path.

        :raises ParseError: if there is no open tag, or the open tag does not
            match *tag*.
        """
        if not self.tag_path:
            raise ParseError(
                "Mismatched tags: no more tags: line %s" % self.getpos()[0]
            )
        popped = self.tag_path.pop()
        # an empty element that was never closed may still sit on top of the
        # path; skip it, but only if there is something underneath to pop
        if popped != tag and popped in self.EMPTY_HTML_ELEMENTS and self.tag_path:
            popped = self.tag_path.pop()
        if popped != tag:
            raise ParseError(
                "Mismatched closing tag: "
                "expected '%s' got '%s' at line %s" % (popped, tag, self.getpos()[0])
            )

        self.append_markup({"type": "endtag", "html_content": "</%s>" % tag})

        if tag in self.TRANSLATABLE_ELEMENTS:
            self.end_translation_unit()
            # still inside an outer translatable element: resume capturing
            if any(t in self.TRANSLATABLE_ELEMENTS for t in self.tag_path):
                self.begin_translation_unit()

    def handle_startendtag(self, tag, attrs):
        """Record a self-closing tag (``<tag ... />``)."""
        self.auto_close_empty_element()
        self.tag_path.append(tag)

        if tag in self.TRANSLATABLE_ELEMENTS:
            self.begin_translation_unit()

        translated_attrs = self.translate_attributes(attrs)
        markup = {
            "type": "startendtag",
            "html_content": self.create_start_tag(tag, translated_attrs, startend=True),
            "untranslated_html": self.create_start_tag(tag, attrs, startend=True),
            "attribute_tus": self.extract_translatable_attributes(tag, attrs),
        }
        self.append_markup(markup)

        if tag in self.TRANSLATABLE_ELEMENTS:
            self.end_translation_unit()
            # still inside an outer translatable element: resume capturing
            if any(t in self.TRANSLATABLE_ELEMENTS for t in self.tag_path):
                self.begin_translation_unit()

        self.tag_path.pop()

    def handle_data(self, data):
        """Record a run of text."""
        self.auto_close_empty_element()
        self.append_markup({"type": "data", "html_content": data})

    def handle_charref(self, name):
        """Handle entries in the form &#NNNN; e.g. &#8417;

        The reference is replaced by the character it denotes. A reference
        whose number is not a valid code point is passed through verbatim.
        """
        try:
            if name.lower().startswith("x"):
                text = chr(int(name[1:], 16))
            else:
                text = chr(int(name))
        except (ValueError, OverflowError):
            # chr() rejects numbers outside the Unicode range
            text = "&#%s;" % name
        self.handle_data(text)

    def handle_entityref(self, name):
        """Handle named entities of the form &aaaa; e.g. &rsquo;

        Known HTML5 entities are replaced by their text, except for ``gt``,
        ``lt`` and ``amp``, which are kept as references like unknown names.
        """
        converted = html5.get(name + ";", None)
        if name in ["gt", "lt", "amp"] or not converted:
            self.handle_data("&%s;" % name)
        else:
            self.handle_data(converted)

    def handle_comment(self, data):
        """Record a comment; its text is also kept as a note for the unit."""
        self.auto_close_empty_element()
        self.append_markup(
            {"type": "comment", "html_content": "<!--%s-->" % data, "note": data}
        )

    def handle_decl(self, decl):
        """Record a declaration such as a doctype."""
        self.auto_close_empty_element()
        self.append_markup({"type": "decl", "html_content": "<!%s>" % decl})

    def handle_pi(self, data):
        """Record a processing instruction (closed by ``?>``)."""
        self.auto_close_empty_element()
        self.append_markup({"type": "pi", "html_content": "<?%s?>" % data})

    def unknown_decl(self, data):
        """Record a marked section (``<![...]>``)."""
        self.auto_close_empty_element()
        self.append_markup({"type": "cdecl", "html_content": "<![%s]>" % data})


class POHTMLParser(htmlfile):
    """Subclass of :class:`htmlfile` that adds no behaviour of its own."""

    pass