1# ----------------------------------------------------------------------------
2# pyglet
3# Copyright (c) 2006-2008 Alex Holkner
4# Copyright (c) 2008-2020 pyglet contributors
5# All rights reserved.
6#
7# Redistribution and use in source and binary forms, with or without
8# modification, are permitted provided that the following conditions
9# are met:
10#
11#  * Redistributions of source code must retain the above copyright
12#    notice, this list of conditions and the following disclaimer.
13#  * Redistributions in binary form must reproduce the above copyright
14#    notice, this list of conditions and the following disclaimer in
15#    the documentation and/or other materials provided with the
16#    distribution.
17#  * Neither the name of pyglet nor the names of its
18#    contributors may be used to endorse or promote products
19#    derived from this software without specific prior written
20#    permission.
21#
22# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33# POSSIBILITY OF SUCH DAMAGE.
34# ----------------------------------------------------------------------------
35
36"""Decode HTML into attributed text.
37
38A subset of HTML 4.01 Transitional is implemented.  The following elements are
39supported fully::
40
41    B BLOCKQUOTE BR CENTER CODE DD DIR DL EM FONT H1 H2 H3 H4 H5 H6 I IMG KBD
42    LI MENU OL P PRE Q SAMP STRONG SUB SUP TT U UL VAR
43
44The mark (bullet or number) of a list item is separated from the body of the
45list item with a tab, as the pyglet document model does not allow
46out-of-stream text.  This means lists display as expected, but behave a little
47oddly if edited.
48
49No CSS styling is supported.
50"""
51
52import re
53
54from html.parser import HTMLParser
55from html import entities
56
57import pyglet
58from pyglet.text.formats import structured
59
60
def _hex_color(val):
    """Unpack a ``0xRRGGBB`` integer into an opaque RGBA list.

    :Parameters:
        `val` : int
            Color packed as ``0xRRGGBB``.  Only the low 24 bits are used.

    :rtype: list of int
    :return: ``[red, green, blue, 255]``; alpha is always 255.
    """
    return [(val >> 16) & 0xff, (val >> 8) & 0xff, val & 0xff, 255]


# The sixteen color names defined by HTML 4.01 (section 6.5 "Colors"), keyed
# in lower case and mapped to opaque RGBA lists.
_color_names = {
    'black': _hex_color(0x000000),
    'silver': _hex_color(0xc0c0c0),
    'gray': _hex_color(0x808080),
    'white': _hex_color(0xffffff),
    'maroon': _hex_color(0x800000),
    'red': _hex_color(0xff0000),
    'purple': _hex_color(0x800080),
    'fuchsia': _hex_color(0xff00ff),
    'green': _hex_color(0x008000),
    'lime': _hex_color(0x00ff00),
    'olive': _hex_color(0x808000),
    'yellow': _hex_color(0xffff00),
    'navy': _hex_color(0x000080),
    'blue': _hex_color(0x0000ff),
    'teal': _hex_color(0x008080),
    'aqua': _hex_color(0x00ffff),
}
# This table used to spell the name "fucsia"; keep accepting that spelling so
# existing documents and callers that relied on the key do not break.
_color_names['fucsia'] = _color_names['fuchsia']


def _parse_color(value):
    """Parse the value of an HTML color attribute.

    Accepted forms are ``#rrggbb``, the three-digit shorthand ``#rgb`` (each
    digit is doubled, so ``#f80`` means ``#ff8800``) and the names listed in
    `_color_names`, matched case-insensitively.  Surrounding whitespace is
    ignored.

    :Parameters:
        `value` : str
            Attribute value to parse.

    :rtype: list of int
    :return: ``[red, green, blue, alpha]``.  For a named color this is the
        list stored in `_color_names` itself, not a copy.

    :raises ValueError: if `value` is not a string (for example ``None``),
        is not valid hexadecimal after ``#``, or is not a known color name.
    """
    if not isinstance(value, str):
        # Raise the one exception type callers already handle instead of
        # failing with AttributeError on ``value.startswith``.
        raise ValueError('invalid color value: {!r}'.format(value))

    value = value.strip()
    if value.startswith('#'):
        digits = value[1:]
        if len(digits) == 3:
            # Expand the #rgb shorthand to #rrggbb.
            digits = ''.join(digit * 2 for digit in digits)
        # int() raises ValueError itself for an empty or non-hex string.
        return _hex_color(int(digits, 16))

    try:
        return _color_names[value.lower()]
    except KeyError:
        raise ValueError('unknown color name: {!r}'.format(value)) from None
93
94
# One or more characters that count as inter-word whitespace: space, tab,
# form feed, zero-width space, carriage return and line feed.  The decoder
# replaces each such run with a single space outside of PRE.
_whitespace_re = re.compile(
    u'[\u0020\u0009\u000c\u200b\r\n]+',
    re.DOTALL,
)

# Elements whose content is dropped: while one of these is open the decoder
# ignores character data and nested start tags.
_metadata_elements = [
    'head',
    'title',
]

# Elements that start on a new line.  Opening one first closes every open
# element that is not listed in _block_containers.
_block_elements = [
    'p',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'ul', 'ol', 'dir', 'menu',
    'pre', 'dl', 'div', 'center',
    'noscript', 'noframes', 'blockquote', 'form',
    'isindex', 'hr', 'table', 'fieldset', 'address',
    # Incorrect, but we treat list items as blocks:
    'li', 'dd', 'dt',
]

# Elements that may stay open when a block element begins inside them.
# '_top_block' is the sentinel at the bottom of the decoder's element stack.
_block_containers = [
    '_top_block',
    'body', 'div', 'center', 'object', 'applet',
    'blockquote', 'ins', 'del', 'dd', 'li', 'form',
    'fieldset', 'button', 'th', 'td', 'iframe', 'noscript',
    'noframes',
    # Incorrect, but we treat list items as blocks:
    'ul', 'ol', 'dir', 'menu', 'dl',
]
114
115
class HTMLDecoder(HTMLParser, structured.StructuredTextDecoder):
    """Decoder for HTML documents.

    Markup is tokenised by ``html.parser.HTMLParser``; the ``handle_*``
    callbacks below turn each token into text, inline elements and style
    changes by calling the inherited ``add_text``, ``add_element``,
    ``push_style`` and ``pop_style`` methods.
    """
    #: Default style attributes for unstyled text in the HTML document.
    #:
    #: :type: dict
    default_style = {
        'font_name': 'Times New Roman',
        'font_size': 12,
        'margin_bottom': '12pt',
    }

    #: Map HTML font sizes to actual font sizes, in points.
    #:
    #: :type: dict
    font_sizes = {
        1: 8,
        2: 10,
        3: 12,
        4: 14,
        5: 18,
        6: 24,
        7: 48
    }

    def decode_structured(self, text, location):
        """Reset the parsing state and decode `text` as HTML.

        :Parameters:
            `text` : str
                HTML source to decode.
            `location` : object
                Stored as ``self.location``; `get_image` calls its ``open``
                method to read the files named by IMG elements.
        """
        self.location = location
        # HTML font size (a key of `font_sizes`) in effect for each open FONT
        # element; the bottom entry is the document default.
        self._font_size_stack = [3]
        # Fallback builder used by LI elements that appear outside any list.
        # NOTE(review): ``list_stack`` is not created in this class;
        # presumably the base decoder sets it up before calling this method.
        self.list_stack.append(structured.UnorderedListBuilder({}))
        # Drop leading spaces from the next run of character data.
        self.strip_leading_space = True
        # No text has been emitted yet in the current block.
        self.block_begin = True
        # A block has just ended; a newline is owed before the next content.
        self.need_block_begin = False
        # Names of the currently open elements, innermost last.
        self.element_stack = ['_top_block']
        self.in_metadata = False
        self.in_pre = False

        self.push_style('_default', self.default_style)

        self.feed(text)
        self.close()

    def get_image(self, filename):
        """Load the image referenced by an IMG element.

        :Parameters:
            `filename` : str
                Value of the element's ``src`` attribute, resolved through
                ``self.location``.

        :return: whatever ``pyglet.image.load`` returns for the file.
        """
        # NOTE(review): the file object returned by location.open() is not
        # closed here.
        return pyglet.image.load(filename, file=self.location.open(filename))

    def prepare_for_data(self):
        """Emit the newline owed after a closed block, if there is one."""
        if self.need_block_begin:
            self.add_text('\n')
            self.block_begin = True
            self.need_block_begin = False

    def handle_data(self, data):
        """Add character data to the document.

        Inside PRE the text is added verbatim.  Elsewhere each run of
        whitespace collapses to one space, and leading space is removed at
        the start of a block or when the preceding output already ended in
        a space.  Data inside metadata elements is discarded.
        """
        if self.in_metadata:
            return

        if self.in_pre:
            self.add_text(data)
        else:
            data = _whitespace_re.sub(' ', data)
            if data.strip():
                self.prepare_for_data()
                if self.block_begin or self.strip_leading_space:
                    data = data.lstrip()
                    self.block_begin = False
                self.add_text(data)
            self.strip_leading_space = data.endswith(' ')

    def handle_starttag(self, tag, case_attrs):
        """Open an element and push the style it implies.

        :Parameters:
            `tag` : str
                Element name.
            `case_attrs` : list of (str, str or None)
                Attribute name/value pairs; names are lower-cased here and
                a later duplicate overrides an earlier one.  A value may be
                ``None`` (an attribute written without a value), which is
                treated like a missing or invalid value.
        """
        if self.in_metadata:
            return

        element = tag.lower()
        attrs = {key.lower(): value for key, value in case_attrs}

        if element in _metadata_elements:
            self.in_metadata = True
        elif element in _block_elements:
            # Pop off elements until we get to a block container.
            while self.element_stack[-1] not in _block_containers:
                self.handle_endtag(self.element_stack[-1])
            if not self.block_begin:
                self.add_text('\n')
                self.block_begin = True
                self.need_block_begin = False
        self.element_stack.append(element)

        style = {}
        if element in ('b', 'strong'):
            style['bold'] = True
        elif element in ('i', 'em', 'var'):
            style['italic'] = True
        elif element in ('tt', 'code', 'samp', 'kbd'):
            style['font_name'] = 'Courier New'
        elif element == 'u':
            # The underline takes the current text color, black if unset.
            color = self.current_style.get('color')
            if color is None:
                color = [0, 0, 0, 255]
            style['underline'] = color
        elif element == 'font':
            face = attrs.get('face')
            if face:
                # "Arial, Helvetica" must yield 'Helvetica', not ' Helvetica'.
                names = [name.strip() for name in face.split(',')]
                names = [name for name in names if name]
                if names:
                    style['font_name'] = names
            # Exactly one entry is pushed per FONT element so that closing
            # the element can pop exactly one.
            if 'size' in attrs:
                size = (attrs['size'] or '').strip()
                try:
                    if size.startswith('+'):
                        size = self._font_size_stack[-1] + int(size[1:])
                    elif size.startswith('-'):
                        size = self._font_size_stack[-1] - int(size[1:])
                    else:
                        size = int(size)
                except ValueError:
                    size = 3
                self._font_size_stack.append(size)
                # Sizes missing from the table leave the point size as is.
                if size in self.font_sizes:
                    style['font_size'] = self.font_sizes[size]
            else:
                self._font_size_stack.append(self._font_size_stack[-1])
            color = attrs.get('color')
            if color:
                try:
                    style['color'] = _parse_color(color)
                except ValueError:
                    # Unrecognised colors are ignored.
                    pass
        elif element == 'sup':
            style['font_size'] = self._script_font_size()
            style['baseline'] = '3pt'
        elif element == 'sub':
            style['font_size'] = self._script_font_size()
            style['baseline'] = '-3pt'
        elif element == 'h1':
            style['font_size'] = 24
            style['bold'] = True
            style['align'] = 'center'
        elif element == 'h2':
            style['font_size'] = 18
            style['bold'] = True
        elif element == 'h3':
            style['font_size'] = 16
            style['bold'] = True
        elif element == 'h4':
            style['font_size'] = 14
            style['bold'] = True
        elif element == 'h5':
            style['font_size'] = 12
            style['bold'] = True
        elif element == 'h6':
            style['font_size'] = 12
            style['italic'] = True
        elif element == 'br':
            # U+2028 LINE SEPARATOR: a line break that is not a block break.
            self.add_text(u'\u2028')
            self.strip_leading_space = True
        elif element == 'p':
            align = (attrs.get('align') or '').lower()
            if align in ('left', 'center', 'right'):
                style['align'] = align
        elif element == 'center':
            style['align'] = 'center'
        elif element == 'pre':
            style['font_name'] = 'Courier New'
            style['margin_bottom'] = 0
            self.in_pre = True
        elif element == 'blockquote':
            left_margin = self.current_style.get('margin_left') or 0
            right_margin = self.current_style.get('margin_right') or 0
            style['margin_left'] = left_margin + 60
            style['margin_right'] = right_margin + 60
        elif element == 'q':
            self.handle_data(u'\u201c')
        elif element == 'ol':
            try:
                start = int(attrs.get('start', 1))
            except (TypeError, ValueError):
                start = 1
            list_format = (attrs.get('type') or '1') + '.'
            builder = structured.OrderedListBuilder(start, list_format)
            builder.begin(self, style)
            self.list_stack.append(builder)
        elif element in ('ul', 'dir', 'menu'):
            list_type = (attrs.get('type') or 'disc').lower()
            if list_type == 'circle':
                mark = u'\u25cb'
            elif list_type == 'square':
                mark = u'\u25a1'
            else:
                mark = u'\u25cf'
            builder = structured.UnorderedListBuilder(mark)
            builder.begin(self, style)
            self.list_stack.append(builder)
        elif element == 'li':
            self.list_stack[-1].item(self, style)
            self.strip_leading_space = True
        elif element == 'dl':
            style['margin_bottom'] = 0
        elif element == 'dd':
            left_margin = self.current_style.get('margin_left') or 0
            style['margin_left'] = left_margin + 30
        elif element == 'img':
            src = attrs.get('src')
            # An IMG without a usable src has nothing to load; skip it.
            image = self.get_image(src) if src else None
            if image:
                width = self._pixel_length(attrs.get('width'))
                height = self._pixel_length(attrs.get('height'))
                self.prepare_for_data()
                self.add_element(structured.ImageElement(image, width, height))
                self.strip_leading_space = False

        self.push_style(element, style)

    def handle_endtag(self, tag):
        """Close `tag` together with any elements left open inside it.

        End tags for elements that are not currently open are ignored.
        """
        element = tag.lower()
        if element not in self.element_stack:
            return

        self.pop_style(element)
        # Unwind the element stack down to and including `element`, undoing
        # the state of every element closed on the way so that an unclosed
        # FONT, PRE, list or metadata element nested inside does not leak.
        while True:
            closed = self.element_stack.pop()
            self._reset_element_state(closed)
            if closed == element:
                break

        if element in _block_elements:
            self.block_begin = False
            self.need_block_begin = True
        elif element == 'q':
            self.handle_data(u'\u201d')

    def handle_entityref(self, name):
        """Add the character for a named entity; unknown names are dropped."""
        codepoint = entities.name2codepoint.get(name)
        if codepoint is not None:
            self.handle_data(chr(codepoint))

    def handle_charref(self, name):
        """Add the character for a numeric reference (``NNN`` or ``xHHH``).

        References that are not valid numbers or code points are dropped.
        """
        name = name.lower()
        try:
            if name.startswith('x'):
                self.handle_data(chr(int(name[1:], 16)))
            else:
                self.handle_data(chr(int(name)))
        except (ValueError, OverflowError):
            # int() rejects malformed digits; chr() raises ValueError or
            # OverflowError for numbers outside the Unicode range.
            pass

    def _script_font_size(self):
        """Return the point size for SUP/SUB text.

        This is the entry of `font_sizes` one HTML size below the current
        one.  When that size is not in the table, the smallest point size in
        the table is used (or 1 if the table is empty).
        """
        smallest = min(self.font_sizes.values(), default=1)
        return self.font_sizes.get(self._font_size_stack[-1] - 1, smallest)

    @staticmethod
    def _pixel_length(value):
        """Return `value` as an int, or None if it is missing or invalid."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    def _reset_element_state(self, element):
        """Undo the parser state that opening `element` set up."""
        if element in _metadata_elements:
            self.in_metadata = False

        if element == 'font':
            if len(self._font_size_stack) > 1:
                self._font_size_stack.pop()
        elif element == 'pre':
            self.in_pre = False
        elif element in ('ul', 'ol', 'dir', 'menu'):
            # Every element that pushes a list builder when opened must pop
            # it again; the bottom builder is the fallback and stays.
            if len(self.list_stack) > 1:
                self.list_stack.pop()
366