1#!/usr/local/bin/python3.8
2# -*- coding: utf-8 -*-
3# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4
5# Copyright 2015 Kevin B. Hendricks Stratford Ontario Canada
6# Copyright 2012 Google Inc. All Rights Reserved.
7#
8# Licensed under the Apache License, Version 2.0 (the "License");
9# you may not use this file except in compliance with the License.
10# You may obtain a copy of the License at
11#
12#     http://www.apache.org/licenses/LICENSE-2.0
13#
14# Unless required by applicable law or agreed to in writing, software
15# distributed under the License is distributed on an "AS IS" BASIS,
16# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17# See the License for the specific language governing permissions and
18# limitations under the License.
19#
20"""CTypes bindings for the Gumbo HTML5 parser.
21
22This exports the raw interface of the library as a set of very thin ctypes
23wrappers.  It's intended to be wrapped by other libraries to provide a more
24Pythonic API.
25"""
26import os
27import re
28import contextlib
29import ctypes
30from ctypes.util import find_library
31import sigil_gumboc_tags as gumboc_tags
32
33__author__ = 'jdtang@google.com (Jonathan Tang)'
34
35_sigilgumbolibpath = None
36
37def _remove_xml_header(data):
38    return re.sub(br'<\s*\?xml\s*[^\?>]*\?*>\s*', b'', data, flags=re.I)
39
# An explicit library location may be supplied via the SigilGumboLibPath
# environment variable; otherwise the library is located by name.
_sigilgumbolibpath = os.environ.get('SigilGumboLibPath')

if _sigilgumbolibpath is None:
    _dll = ctypes.cdll.LoadLibrary(find_library('sigilgumbo'))
else:
    try:
        _dll = ctypes.cdll.LoadLibrary(_sigilgumbolibpath)
    except OSError:
        # The configured path could not be loaded: fall back to the
        # by-name lookup.
        # NOTE(review): find_library() may return None when nothing is
        # found - confirm how that case should be reported.
        _dll = ctypes.cdll.LoadLibrary(find_library('sigilgumbo'))
51
# Some aliases for common types.
# _bitvector: unsigned int used for flag words (see Node.parse_flags).
_bitvector = ctypes.c_uint
# _Ptr: shorthand for ctypes.POINTER, used by the struct declarations below.
_Ptr = ctypes.POINTER
55
class EnumMetaclass(type(ctypes.c_uint)):
    """Metaclass that turns a ``_values_`` list into enum constants.

    For every class other than the base class named ``Enum``, each name in
    the class body's ``_values_`` list becomes a class attribute whose value
    is ``cls.from_param(index)``, where *index* is the name's position in
    the list.

    Raises:
        ValueError: if the class body has no ``_values_`` entry, or if
            creating the constants raises ``TypeError`` (for example when
            ``_values_`` is not an iterable of attribute names).
    """

    def __init__(cls, name, bases, cls_dict):
        # The constants are created in __init__ rather than __new__:
        # Python 3.13 moved the initialization of ctypes' internal
        # metaclasses into __init__, so the class is only usable (e.g. for
        # from_param) once the parent __init__ has run.  Older versions
        # behave the same either way.
        super().__init__(name, bases, cls_dict)
        if name == 'Enum':
            # The abstract base class carries no constants of its own.
            return
        try:
            for i, value in enumerate(cls_dict['_values_']):
                setattr(cls, value, cls.from_param(i))
        except KeyError as err:
            raise ValueError('No _values_ list found inside enum type.') from err
        except TypeError as err:
            raise ValueError(
                '_values_ must be a list of names of enum constants.') from err
69
def with_metaclass(mcls):
    """Return a class decorator that rebuilds the class using *mcls*.

    The decorated class is re-created by calling ``mcls`` with the original
    name, bases and a copy of the class namespace.
    """
    def decorator(cls):
        # '__dict__' and '__weakref__' are left out of the copied namespace
        # before handing it to the metaclass.
        namespace = {
            key: member
            for key, member in vars(cls).items()
            if key not in ('__dict__', '__weakref__')
        }
        return mcls(cls.__name__, cls.__bases__, namespace)
    return decorator
78
@with_metaclass(EnumMetaclass)
class Enum(ctypes.c_uint):
    """Base class for enums passed to and from the library as unsigned ints.

    Subclasses list their constant names in ``_values_``; the position of a
    name in that list is its numeric value.
    """

    @classmethod
    def from_param(cls, param):
        """Convert *param* into an instance of this enum type.

        Args:
            param: an instance of this enum class, or an integer ordinal.

        Returns:
            *param* itself if it is already an enum of this class, otherwise
            a new ``cls`` instance holding the ordinal.

        Raises:
            ValueError: if *param* is an enum of a different class, or an
                ordinal outside ``0 .. len(_values_) - 1``.
        """
        if isinstance(param, Enum):
            if param.__class__ != cls:
                raise ValueError("Can't mix enums of different types")
            return param
        # Valid ordinals are the indexes of _values_, so len(_values_)
        # itself is already out of range.
        if param < 0 or param >= len(cls._values_):
            raise ValueError('%d is out of range for enum type %s; max %d.' %
                             (param, cls.__name__, len(cls._values_) - 1))
        return cls(param)

    def __eq__(self, other):
        # Objects without a .value are "not comparable", not an error.
        try:
            return self.value == other.value
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        try:
            return self.value != other.value
        except AttributeError:
            return NotImplemented

    def __hash__(self):
        return hash(self.value)

    def __repr__(self):
        """Return the constant name for the current value."""
        try:
            return self._values_[self.value]
        except IndexError:
            raise IndexError('Value %d is out of range for %r' %
                             (self.value, self._values_))
107
108
109
class StringPiece(ctypes.Structure):
    """A run of bytes described by a char pointer plus an explicit length."""
    _fields_ = [
        ('data', _Ptr(ctypes.c_char)),
        ('length', ctypes.c_size_t),
        ]

    def __bytes__(self):
        """Return a copy of the ``length`` bytes starting at ``data``."""
        return ctypes.string_at(self.data, self.length)

    def __str__(self):
        """Return the bytes decoded as UTF-8."""
        raw = ctypes.string_at(self.data, self.length)
        return raw.decode('utf-8')

    def __len__(self):
        """Return the length in bytes."""
        return self.length
124
125
class SourcePosition(ctypes.Structure):
    """Position record made of three unsigned ints: line, column, offset.

    NOTE(review): presumably mirrors the C library's source-position struct,
    so field order and types must stay in sync with it - confirm against the
    C header.
    """
    _fields_ = [
        ('line', ctypes.c_uint),
        ('column', ctypes.c_uint),
        ('offset', ctypes.c_uint)
        ]
# Bound to the 'kGumboEmptySourcePosition' symbol exported by the library.
SourcePosition.EMPTY = SourcePosition.in_dll(_dll, 'kGumboEmptySourcePosition')
133
134
class AttributeNamespace(Enum):
    """Enum of attribute namespaces, with a URL for each constant."""
    _values_ = ['NONE', 'XLINK', 'XML', 'XMLNS']
    # URLS is indexed by the enum value, in the same order as _values_.
    URLS = [
        'http://www.w3.org/1999/xhtml',
        'http://www.w3.org/1999/xlink',
        'http://www.w3.org/XML/1998/namespace',
        'http://www.w3.org/2000/xmlns',
    ]

    def to_url(self):
        """Return the URLS entry at this constant's numeric value."""
        ordinal = self.value
        return self.URLS[ordinal]
146
147
class OutputStatus(Enum):
    """Enum of parse status codes, with a readable message for each."""
    _values_ = ['STATUS_OK', 'STATUS_TREE_TOO_DEEP', 'STATUS_OUT_OF_MEMORY']
    # STATUS_MSG is indexed by the enum value, in the same order as _values_.
    STATUS_MSG = [
        'OK',
        'Document tree depth limit exceeded',
        'System allocator returned NULL during parsing',
    ]

    def to_string(self):
        """Return the STATUS_MSG entry at this constant's numeric value."""
        ordinal = self.value
        return self.STATUS_MSG[ordinal]
158
159
class Attribute(ctypes.Structure):
    """One attribute: namespace, name/value strings and source positions.

    ``name`` and ``value`` are C strings; ``original_name`` and
    ``original_value`` are StringPiece records.
    NOTE(review): presumably mirrors the C library's attribute struct, so
    field order and types must not change - confirm against the C header.
    """
    _fields_ = [
        ('namespace', AttributeNamespace),
        ('name', ctypes.c_char_p),
        ('original_name', StringPiece),
        ('value', ctypes.c_char_p),
        ('original_value', StringPiece),
        ('name_start', SourcePosition),
        ('name_end', SourcePosition),
        ('value_start', SourcePosition),
        ('value_end', SourcePosition)
        ]
172
173
class Vector(ctypes.Structure):
    """Sequence view over a C array of element pointers.

    ``data`` is read as an array of pointers to ``_type_`` elements and
    ``length`` is the number of entries.  Subclasses override ``_type_`` to
    choose the element type.  Supports ``len()``, iteration, integer
    indexing (including negative indexes) and slicing.
    """
    _type_ = ctypes.c_void_p
    _fields_ = [
        ('data', _Ptr(ctypes.c_void_p)),
        ('length', ctypes.c_uint),
        ('capacity', ctypes.c_uint)
        ]

    class Iter(object):
        """Iterator yielding a vector's elements in index order."""

        def __init__(self, vector):
            self.current = 0
            self.vector = vector

        def __iter__(self):
            return self

        def __next__(self):
            if self.current >= self.vector.length:
                raise StopIteration
            obj = self.vector[self.current]
            self.current += 1
            return obj

        def next(self):
            # Kept as an alias for callers using the old protocol name.
            return self.__next__()

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        """Return the element at index *i*, or a list for a slice.

        Raises:
            IndexError: if an integer index falls outside
                ``-len(self) .. len(self) - 1``.
        """
        if isinstance(i, int):
            length = self.length
            if i < 0:
                i += length
            # Reject both ends: i == length would read one slot past the
            # array, and a still-negative i would read before it.
            if i < 0 or i >= length:
                raise IndexError('Vector index out of range')
            array_type = _Ptr(_Ptr(self._type_))
            return ctypes.cast(self.data, array_type)[i].contents
        # Anything else (e.g. a slice) is applied to a materialized list.
        return list(self)[i]

    def __iter__(self):
        return Vector.Iter(self)
224
225
# Bound to the 'kGumboEmptyVector' symbol exported by the library.
Vector.EMPTY = Vector.in_dll(_dll, 'kGumboEmptyVector')
227
228
class AttributeVector(Vector):
    """Vector whose elements are read as Attribute structures."""
    _type_ = Attribute
231
232
class NodeVector(Vector):
    """Vector of nodes; its element type is filled in once Node exists."""
    # _type_ assigned later, to avoid circular references with Node
    pass
236
237
class QuirksMode(Enum):
    """Enum used for Document.doc_type_quirks_mode."""
    _values_ = ['NO_QUIRKS', 'QUIRKS', 'LIMITED_QUIRKS']
240
241
class Document(ctypes.Structure):
    """Payload of a DOCUMENT node: child nodes plus doctype information.

    NOTE(review): presumably mirrors the C library's document struct, so
    field order and types must not change - confirm against the C header.
    """
    _fields_ = [
        ('children', NodeVector),
        ('has_doctype', ctypes.c_bool),
        ('name', ctypes.c_char_p),
        ('public_identifier', ctypes.c_char_p),
        ('system_identifier', ctypes.c_char_p),
        ('doc_type_quirks_mode', QuirksMode),
        ]

    def __repr__(self):
        # Fixed label; the children are not included.
        return 'Document'
254
255
class Namespace(Enum):
    """Enum of element namespaces, with a URL for each constant."""
    _values_ = ['HTML', 'SVG', 'MATHML']
    # URLS is indexed by the enum value, in the same order as _values_.
    URLS = [
        'http://www.w3.org/1999/xhtml',
        'http://www.w3.org/2000/svg',
        'http://www.w3.org/1998/Math/MathML',
    ]

    def to_url(self):
        """Return the URLS entry at this constant's numeric value."""
        ordinal = self.value
        return self.URLS[ordinal]
266
267
class Tag(Enum):
    """Enum of tag names: the generated tag list plus UNKNOWN and LAST."""
    _values_ = gumboc_tags.TagNames + ['UNKNOWN', 'LAST']

    @staticmethod
    def from_str(tagname):
        """Return the Tag the library reports for the text *tagname*.

        *tagname* is encoded as UTF-8 and passed to the library's tag lookup
        function.
        """
        encoded_name = tagname.encode('utf-8')
        return _tag_enum(ctypes.c_char_p(encoded_name))
275
class Element(ctypes.Structure):
    """Payload of an ELEMENT or TEMPLATE node.

    NOTE(review): presumably mirrors the C library's element struct, so
    field order and types must not change - confirm against the C header.
    """
    _fields_ = [
        ('children', NodeVector),
        ('tag', Tag),
        ('tag_namespace', Namespace),
        ('original_tag', StringPiece),
        ('original_end_tag', StringPiece),
        ('start_pos', SourcePosition),
        ('end_pos', SourcePosition),
        ('attributes', AttributeVector),
        ]

    @property
    def tag_name(self):
        """Return the element's tag name as bytes.

        SVG elements use the library's normalized SVG name when one is
        returned.  Tags the library reports as UNKNOWN use the lower-cased
        original tag text (empty bytes when there is none).  Every other tag
        uses the library's normalized name for the tag enum.
        """
        # Work on a copy so the library call below cannot modify the
        # StringPiece stored in the parse tree.
        original_tag = StringPiece.from_buffer_copy(self.original_tag)
        _tag_from_original_text(ctypes.byref(original_tag))
        if self.tag_namespace == Namespace.SVG:
            svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag))
            if svg_tagname is not None:
                return bytes(svg_tagname)
        if self.tag == Tag.UNKNOWN:
            # A POINTER(c_char) field is never None in ctypes; a NULL
            # pointer is detected by its false truth value.  Return bytes
            # here as well so every path yields the same type.
            if not original_tag.data:
                return b''
            return bytes(original_tag).decode('utf-8').lower().encode('utf-8')
        return _tagname(self.tag)

    def __repr__(self):
        return ('<%r>\n' % self.tag + '\n'.join(repr(child) for child in self.children) + '</%r>' % self.tag)
304
305
class Text(ctypes.Structure):
    """Payload of the node types that are neither document nor element.

    NOTE(review): presumably mirrors the C library's text struct, so field
    order and types must not change - confirm against the C header.
    """
    _fields_ = [
        ('text', ctypes.c_char_p),
        ('original_text', StringPiece),
        ('start_pos', SourcePosition)
        ]

    def __repr__(self):
        return 'Text(%r)' % self.text
315
316
class NodeType(Enum):
    """Enum stored in Node.type; selects which NodeUnion member is valid."""
    _values_ = ['DOCUMENT', 'ELEMENT', 'TEXT', 'CDATA',
                'COMMENT', 'WHITESPACE', 'TEMPLATE']
320
321
class NodeUnion(ctypes.Union):
    """Union of the three node payloads; Node.type tells which one applies."""
    _fields_ = [
        ('document', Document),
        ('element', Element),
        ('text', Text),
        ]
328
329
class Node(ctypes.Structure):
    """A parse tree node that forwards attribute access to its payload.

    Attributes that are not found on the node itself are looked up on the
    payload chosen by ``type`` (Document, Element or Text), and attribute
    assignment is forwarded to that payload as well.
    """
    # _fields_ set later to avoid a circular reference

    def _contents(self):
        # Python3 enters an infinite loop if you use an @property within
        # __getattr__, so we factor it out to a helper.
        node_type = self.type
        if node_type == NodeType.DOCUMENT:
            return self.v.document
        if node_type in (NodeType.ELEMENT, NodeType.TEMPLATE):
            return self.v.element
        # Every remaining node type uses the text payload.
        return self.v.text

    @property
    def contents(self):
        """The payload structure selected by this node's type."""
        return self._contents()

    def __getattr__(self, name):
        payload = self._contents()
        return getattr(payload, name)

    def __setattr__(self, name, value):
        payload = self._contents()
        setattr(payload, name, value)

    def __repr__(self):
        return repr(self.contents)
355
356
# Node's layout is assigned after the class statement because it refers to
# Node itself (through the 'parent' pointer).
Node._fields_ = [
    ('type', NodeType),
    # Pointer back to Node itself; this is why _fields_ is assigned here
    # rather than inside the class body.
    ('parent', _Ptr(Node)),
    ('index_within_parent', ctypes.c_uint),
    # TODO(jdtang): Make a real list of enum constants for this.
    ('parse_flags', _bitvector),
    ('v', NodeUnion)
    ]
# Now that Node is complete, NodeVector can use it as its element type.
NodeVector._type_ = Node
367
368
class Options(ctypes.Structure):
    """Parser options passed to the library by pointer.

    parse() accepts each field name below as a keyword argument.
    NOTE(review): presumably mirrors the C library's options struct, so
    field order and types must not change - confirm against the C header.
    """
    _fields_ = [
        ('tab_stop', ctypes.c_int),
        ('use_xhtml_rules', ctypes.c_bool),
        ('stop_on_first_error', ctypes.c_bool),
        ('max_tree_depth', ctypes.c_uint),
        ('max_errors', ctypes.c_int),
        ]
377
378
class Output(ctypes.Structure):
    """Result of a parse: document and root node pointers, status, errors.

    NOTE(review): presumably mirrors the C library's output struct, so field
    order and types must not change - confirm against the C header.
    """
    _fields_ = [
        ('document', _Ptr(Node)),
        ('root', _Ptr(Node)),
        ('status', OutputStatus),
        # TODO(jdtang): Error type.
        ('errors', Vector),
        ]
387
388
389# Important Note: gumbo only supports the utf-8 encoding
390# Also gumbo is an html5 parser and does not grok xml pi headers
@contextlib.contextmanager
def parse(text, **kwargs):
    """Parse *text* and yield the library's output for a ``with`` block.

    Args:
        text: the markup, as ``str`` (encoded to UTF-8 here) or as bytes.
            Any XML header is stripped before parsing.
        **kwargs: ``container`` (a Tag, default ``Tag.LAST``),
            ``container_namespace`` (a Namespace, default
            ``Namespace.HTML``), and any Options field name; fields not
            given take the library's default options.

    Yields:
        The pointer to the Output structure returned by the fragment parse
        call.  The output is destroyed when the ``with`` block exits, so
        nothing obtained from it may be used afterwards.
    """
    context_tag = kwargs.get('container', Tag.LAST)
    context_namespace = kwargs.get('container_namespace', Namespace.HTML)

    options = Options()
    for field_name, _ in Options._fields_:
        if field_name in kwargs:
            field_value = kwargs[field_name]
        else:
            field_value = getattr(_DEFAULT_OPTIONS, field_name)
        setattr(options, field_name, field_value)

    # convert string to be utf-8 encoded
    if isinstance(text, str):
        text = text.encode('utf-8')
    text = _remove_xml_header(text)

    # Hold our own reference to the input buffer so that it outlives the
    # parse output.  Letting ctypes convert the argument on the call would
    # create a temporary buffer that is destroyed when the call completes,
    # leaving the original_text pointers pointing into invalid memory.
    text_ptr = ctypes.c_char_p(text)
    output = _parse_fragment(
        ctypes.byref(options), text_ptr, len(text),
        context_tag, context_namespace)
    try:
        yield output
    finally:
        _destroy_output(output)
417
# Default option values, read from the library's 'kGumboDefaultOptions'
# symbol; parse() uses them for every option the caller does not supply.
_DEFAULT_OPTIONS = Options.in_dll(_dll, 'kGumboDefaultOptions')

# Argument and return types for the library functions used by this module.

# Whole-document parse; bound here but not called in this file.
_parse_with_options = _dll.gumbo_parse_with_options
_parse_with_options.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t]
_parse_with_options.restype = _Ptr(Output)

# Fragment parse used by parse(): options, buffer, length, context tag and
# context namespace.
_parse_fragment = _dll.gumbo_parse_fragment
_parse_fragment.argtypes = [
    _Ptr(Options), ctypes.c_char_p, ctypes.c_size_t, Tag, Namespace]
_parse_fragment.restype = _Ptr(Output)

# Takes a StringPiece by pointer and returns nothing; used by
# Element.tag_name on a copy of original_tag.
_tag_from_original_text = _dll.gumbo_tag_from_original_text
_tag_from_original_text.argtypes = [_Ptr(StringPiece)]
_tag_from_original_text.restype = None

# Returns a C string, which ctypes converts to bytes (or None for NULL).
_normalize_svg_tagname = _dll.gumbo_normalize_svg_tagname
_normalize_svg_tagname.argtypes = [_Ptr(StringPiece)]
_normalize_svg_tagname.restype = ctypes.c_char_p

# Releases a parse result; called when the parse() context exits.
_destroy_output = _dll.gumbo_destroy_output
_destroy_output.argtypes = [_Ptr(Output)]
_destroy_output.restype = None

# Maps a Tag enum to its normalized name (a C string).
_tagname = _dll.gumbo_normalized_tagname
_tagname.argtypes = [Tag]
_tagname.restype = ctypes.c_char_p

# Maps a tag name (a C string) to its Tag enum; used by Tag.from_str.
_tag_enum = _dll.gumbo_tag_enum
_tag_enum.argtypes = [ctypes.c_char_p]
_tag_enum.restype = Tag
448
# Public API.  OutputStatus is exported because it is the type of the public
# Output.status field, so callers need its constants to interpret a result.
__all__ = ['StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute',
           'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document',
           'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node',
           'Options', 'Output', 'OutputStatus', 'parse']
453