# ----------------------------------------------------------------------------
# pyglet
# Copyright (c) 2006-2008 Alex Holkner
# Copyright (c) 2008-2020 pyglet contributors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
#  * Neither the name of pyglet nor the names of its
#    contributors may be used to endorse or promote products
#    derived from this software without specific prior written
#    permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# ----------------------------------------------------------------------------

"""Decode HTML into attributed text.

A subset of HTML 4.01 Transitional is implemented.  The following elements are
supported fully::

    B BLOCKQUOTE BR CENTER CODE DD DIR DL EM FONT H1 H2 H3 H4 H5 H6 I IMG KBD
    LI MENU OL P PRE Q SAMP STRONG SUB SUP TT U UL VAR

The mark (bullet or number) of a list item is separated from the body of the
list item with a tab, as the pyglet document model does not allow
out-of-stream text.  This means lists display as expected, but behave a little
oddly if edited.

No CSS styling is supported.
"""

import re

from html.parser import HTMLParser
from html import entities

import pyglet
from pyglet.text.formats import structured


def _hex_color(val):
    """Split a ``0xRRGGBB`` integer into an opaque ``[r, g, b, 255]`` list."""
    return [(val >> 16) & 0xff, (val >> 8) & 0xff, val & 0xff, 255]


# The sixteen colour names defined by HTML 4.01, mapped to RGBA lists.
_color_names = {
    'black': _hex_color(0x000000),
    'silver': _hex_color(0xc0c0c0),
    'gray': _hex_color(0x808080),
    'white': _hex_color(0xffffff),
    'maroon': _hex_color(0x800000),
    'red': _hex_color(0xff0000),
    'purple': _hex_color(0x800080),
    'fuchsia': _hex_color(0xff00ff),
    # Historical misspelling kept so existing documents still resolve.
    'fucsia': _hex_color(0xff00ff),
    'green': _hex_color(0x008000),
    'lime': _hex_color(0x00ff00),
    'olive': _hex_color(0x808000),
    'yellow': _hex_color(0xffff00),
    'navy': _hex_color(0x000080),
    'blue': _hex_color(0x0000ff),
    'teal': _hex_color(0x008080),
    'aqua': _hex_color(0x00ffff),
}


def _parse_color(value):
    """Convert an HTML colour attribute value to an ``[r, g, b, a]`` list.

    :Parameters:
        `value` : str
            Either ``#`` followed by hexadecimal digits (``#rrggbb``, or the
            ``#rgb`` shorthand) or one of the names in ``_color_names``
            (matched case-insensitively).

    :rtype: list of int
    :raises ValueError: if the hexadecimal digits cannot be parsed or the
        name is not known.
    """
    if value.startswith('#'):
        digits = value[1:]
        if len(digits) == 3:
            # Shorthand: each digit is doubled, i.e. 0xF -> 0xFF.
            return [int(digit, 16) * 17 for digit in digits] + [255]
        return _hex_color(int(digits, 16))

    try:
        # Copy so that callers cannot mutate the shared table entry.
        return list(_color_names[value.lower()])
    except KeyError:
        raise ValueError('unknown color name: %r' % (value,)) from None


def _int_or_none(value):
    """Return ``int(value)``, or None when `value` is empty or not an integer."""
    if not value:
        return None
    try:
        return int(value)
    except ValueError:
        return None


# Runs of these characters collapse to a single space outside of PRE.
_whitespace_re = re.compile(u'[\u0020\u0009\u000c\u200b\r\n]+', re.DOTALL)

# Elements whose content is ignored entirely.
_metadata_elements = ['head', 'title']

_block_elements = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                   'ul', 'ol', 'dir', 'menu',
                   'pre', 'dl', 'div', 'center',
                   'noscript', 'noframes', 'blockquote', 'form',
                   'isindex', 'hr', 'table', 'fieldset', 'address',
                   # Incorrect, but we treat list items as blocks:
                   'li', 'dd', 'dt', ]

_block_containers = ['_top_block',
                     'body', 'div', 'center', 'object', 'applet',
                     'blockquote', 'ins', 'del', 'dd', 'li', 'form',
                     'fieldset', 'button', 'th', 'td', 'iframe', 'noscript',
                     'noframes',
                     # Incorrect, but we treat list items as blocks:
                     'ul', 'ol', 'dir', 'menu', 'dl']

# Elements for which handle_starttag pushes a list builder onto list_stack.
_list_elements = ('ul', 'ol', 'dir', 'menu')


class HTMLDecoder(HTMLParser, structured.StructuredTextDecoder):
    """Decoder for HTML documents.

    Parser callbacks (``handle_*``) translate tags into style changes and
    text pushed through the ``StructuredTextDecoder`` helpers.
    """
    #: Default style attributes for unstyled text in the HTML document.
    #:
    #: :type: dict
    default_style = {
        'font_name': 'Times New Roman',
        'font_size': 12,
        'margin_bottom': '12pt',
    }

    #: Map HTML font sizes to actual font sizes, in points.
    #:
    #: :type: dict
    font_sizes = {
        1: 8,
        2: 10,
        3: 12,
        4: 14,
        5: 18,
        6: 24,
        7: 48
    }

    def decode_structured(self, text, location):
        """Reset the per-document parser state and parse `text`.

        :Parameters:
            `text` : str
                HTML source to decode.
            `location`
                Object whose ``open(filename)`` is used by `get_image` to
                read images referenced by the document.
        """
        self.location = location
        # HTML font size indices (keys of font_sizes); 3 is the base size.
        self._font_size_stack = [3]
        # Bottom-of-stack builder so that LI outside any list still has a
        # builder to talk to.  list_stack itself is not created here.
        self.list_stack.append(structured.UnorderedListBuilder({}))
        self.strip_leading_space = True
        self.block_begin = True
        self.need_block_begin = False
        self.element_stack = ['_top_block']
        self.in_metadata = False
        self.in_pre = False

        self.push_style('_default', self.default_style)

        self.feed(text)
        self.close()

    def get_image(self, filename):
        """Load the image `filename`, reading it through ``self.location``."""
        return pyglet.image.load(filename, file=self.location.open(filename))

    def prepare_for_data(self):
        """Emit the pending paragraph break, if a block was just closed."""
        if self.need_block_begin:
            self.add_text('\n')
            self.block_begin = True
            self.need_block_begin = False

    def handle_data(self, data):
        """Add character data, collapsing whitespace unless inside PRE."""
        if self.in_metadata:
            return

        if self.in_pre:
            self.add_text(data)
        else:
            data = _whitespace_re.sub(' ', data)
            if data.strip():
                self.prepare_for_data()
                if self.block_begin or self.strip_leading_space:
                    data = data.lstrip()
                    self.block_begin = False
                self.add_text(data)
            # A trailing space here makes a leading space in the next run
            # redundant.
            self.strip_leading_space = data.endswith(' ')

    def _script_font_size(self):
        """Return the point size used for SUB/SUP text.

        This is one HTML size step below the current font size, clamped to
        the range of keys in `font_sizes` so an out-of-range step never
        yields a nonsensical size.
        """
        smallest = min(self.font_sizes)
        largest = max(self.font_sizes)
        size = min(max(self._font_size_stack[-1] - 1, smallest), largest)
        return self.font_sizes.get(size, self.font_sizes[smallest])

    def handle_starttag(self, tag, case_attrs):
        """Open `tag`: update parser state and push the element's style.

        :Parameters:
            `tag` : str
                Element name, in any case.
            `case_attrs` : list of (str, str or None)
                Attribute name/value pairs as supplied by ``HTMLParser``.
        """
        if self.in_metadata:
            return

        element = tag.lower()
        # Attribute names are case-insensitive.  Attributes written without a
        # value arrive as None; normalise them to '' so that the string
        # operations below cannot fail.
        attrs = {key.lower(): value or '' for key, value in case_attrs}

        if element in _metadata_elements:
            self.in_metadata = True
        elif element in _block_elements:
            # Pop off elements until we get to a block container.
            while self.element_stack[-1] not in _block_containers:
                self.handle_endtag(self.element_stack[-1])
            if not self.block_begin:
                self.add_text('\n')
                self.block_begin = True
                self.need_block_begin = False
        self.element_stack.append(element)

        style = {}
        if element in ('b', 'strong'):
            style['bold'] = True
        elif element in ('i', 'em', 'var'):
            style['italic'] = True
        elif element in ('tt', 'code', 'samp', 'kbd'):
            style['font_name'] = 'Courier New'
        elif element == 'u':
            # Underline in the current text colour, defaulting to black.
            color = self.current_style.get('color')
            if color is None:
                color = [0, 0, 0, 255]
            style['underline'] = color
        elif element == 'font':
            if attrs.get('face'):
                # Comma-separated fallback list; drop padding around names.
                faces = [name.strip() for name in attrs['face'].split(',')]
                faces = [name for name in faces if name]
                if faces:
                    style['font_name'] = faces
            if 'size' in attrs:
                size = attrs['size']
                try:
                    if size.startswith('+'):
                        size = self._font_size_stack[-1] + int(size[1:])
                    elif size.startswith('-'):
                        size = self._font_size_stack[-1] - int(size[1:])
                    else:
                        size = int(size)
                except ValueError:
                    size = 3
                self._font_size_stack.append(size)
                if size in self.font_sizes:
                    style['font_size'] = self.font_sizes[size]
            else:
                # Every FONT pushes exactly one entry so that the end tag can
                # pop unconditionally.
                self._font_size_stack.append(self._font_size_stack[-1])
            if attrs.get('color'):
                try:
                    style['color'] = _parse_color(attrs['color'])
                except ValueError:
                    # Unrecognised colours are ignored.
                    pass
        elif element == 'sup':
            style['font_size'] = self._script_font_size()
            style['baseline'] = '3pt'
        elif element == 'sub':
            style['font_size'] = self._script_font_size()
            style['baseline'] = '-3pt'
        elif element == 'h1':
            style['font_size'] = 24
            style['bold'] = True
            style['align'] = 'center'
        elif element == 'h2':
            style['font_size'] = 18
            style['bold'] = True
        elif element == 'h3':
            style['font_size'] = 16
            style['bold'] = True
        elif element == 'h4':
            style['font_size'] = 14
            style['bold'] = True
        elif element == 'h5':
            style['font_size'] = 12
            style['bold'] = True
        elif element == 'h6':
            style['font_size'] = 12
            style['italic'] = True
        elif element == 'br':
            # U+2028 LINE SEPARATOR: a line break within the paragraph.
            self.add_text(u'\u2028')
            self.strip_leading_space = True
        elif element == 'p':
            align = attrs.get('align', '').lower()
            if align in ('left', 'center', 'right'):
                style['align'] = align
        elif element == 'center':
            style['align'] = 'center'
        elif element == 'pre':
            style['font_name'] = 'Courier New'
            style['margin_bottom'] = 0
            self.in_pre = True
        elif element == 'blockquote':
            left_margin = self.current_style.get('margin_left') or 0
            right_margin = self.current_style.get('margin_right') or 0
            style['margin_left'] = left_margin + 60
            style['margin_right'] = right_margin + 60
        elif element == 'q':
            self.handle_data(u'\u201c')
        elif element == 'ol':
            try:
                start = int(attrs.get('start') or 1)
            except ValueError:
                start = 1
            list_format = (attrs.get('type') or '1') + '.'
            builder = structured.OrderedListBuilder(start, list_format)
            builder.begin(self, style)
            self.list_stack.append(builder)
        elif element in ('ul', 'dir', 'menu'):
            mark_type = (attrs.get('type') or 'disc').lower()
            if mark_type == 'circle':
                mark = u'\u25cb'
            elif mark_type == 'square':
                mark = u'\u25a1'
            else:
                mark = u'\u25cf'
            builder = structured.UnorderedListBuilder(mark)
            builder.begin(self, style)
            self.list_stack.append(builder)
        elif element == 'li':
            self.list_stack[-1].item(self, style)
            self.strip_leading_space = True
        elif element == 'dl':
            style['margin_bottom'] = 0
        elif element == 'dd':
            left_margin = self.current_style.get('margin_left') or 0
            style['margin_left'] = left_margin + 30
        elif element == 'img':
            source = attrs.get('src')
            # Without a source there is nothing to load.
            image = self.get_image(source) if source else None
            if image:
                # Non-integer dimensions (e.g. percentages) are ignored.
                width = _int_or_none(attrs.get('width'))
                height = _int_or_none(attrs.get('height'))
                self.prepare_for_data()
                self.add_element(
                    structured.ImageElement(image, width, height))
                self.strip_leading_space = False

        self.push_style(element, style)

    def _release_element_state(self, element):
        """Undo the parser state that opening `element` set up."""
        if element == 'font':
            if len(self._font_size_stack) > 1:
                self._font_size_stack.pop()
        elif element == 'pre':
            self.in_pre = False
        elif element in _list_elements:
            # Never pop the bottom-of-stack builder.
            if len(self.list_stack) > 1:
                self.list_stack.pop()

    def handle_endtag(self, tag):
        """Close `tag`, together with any elements left open inside it.

        End tags with no matching open element are ignored.
        """
        element = tag.lower()
        if element not in self.element_stack:
            return

        self.pop_style(element)
        # Unwind to the matching start tag.  Anything above it was never
        # closed explicitly, so release its state here.
        while True:
            closed = self.element_stack.pop()
            if closed == element:
                break
            self._release_element_state(closed)

        if element in _metadata_elements:
            self.in_metadata = False
        elif element in _block_elements:
            self.block_begin = False
            self.need_block_begin = True

        self._release_element_state(element)
        if element == 'q':
            self.handle_data(u'\u201d')

    def handle_entityref(self, name):
        """Add the character for a named entity; unknown names are dropped."""
        if name in entities.name2codepoint:
            self.handle_data(chr(entities.name2codepoint[name]))

    def handle_charref(self, name):
        """Add the character for a numeric reference (decimal or ``x`` hex).

        References that are malformed or outside the Unicode range are
        dropped.
        """
        name = name.lower()
        try:
            if name.startswith('x'):
                self.handle_data(chr(int(name[1:], 16)))
            else:
                self.handle_data(chr(int(name)))
        except (ValueError, OverflowError):
            # chr() raises OverflowError for values too large for a C int.
            pass