1# Copyright 2012 Google Inc. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14#
15
16"""CTypes bindings for the Gumbo HTML5 parser.
17
18This exports the raw interface of the library as a set of very thin ctypes
19wrappers.  It's intended to be wrapped by other libraries to provide a more
20Pythonic API.
21"""
22
23__author__ = 'jdtang@google.com (Jonathan Tang)'
24
25import sys
26import contextlib
27import ctypes
28import os.path
29import gumboc_tags
30
# Pick the platform-specific file name of the Gumbo shared library.
_name_of_lib = 'libgumbo.so'
if sys.platform.startswith('darwin'):
  _name_of_lib = 'libgumbo.dylib'
elif sys.platform.startswith('win'):
  _name_of_lib = "gumbo.dll"

# Try three locations in order.  The fallbacks are nested because an OSError
# raised inside an `except` handler is NOT caught by a sibling `except` clause
# of the same `try` statement; a flat try/except/except would never reach the
# last attempt.
try:
  # First look for a freshly-built .so in the .libs directory, for development.
  _dll = ctypes.cdll.LoadLibrary(os.path.join(
      os.path.dirname(__file__), '..', '..', '.libs', _name_of_lib))
except OSError:
  try:
    # PyPI or setuptools install, look in the current directory.
    _dll = ctypes.cdll.LoadLibrary(os.path.join(
        os.path.dirname(__file__), _name_of_lib))
  except OSError:
    # System library, on unix or mac osx.  If this fails too, the OSError
    # propagates to whoever imported this module.
    _dll = ctypes.cdll.LoadLibrary(_name_of_lib)
48
# Some aliases for common types.
# `_bitvector` is the ctypes type used for flag fields (see Node.parse_flags).
_bitvector = ctypes.c_uint
# Shorthand for building ctypes pointer types, e.g. _Ptr(Node).
_Ptr = ctypes.POINTER
52
class EnumMetaclass(type(ctypes.c_uint)):
  """Metaclass that publishes the names in `_values_` as class attributes.

  For every class other than the one literally named 'Enum', each entry of
  the class body's `_values_` list becomes an attribute on the new class
  whose value is `cls.from_param(index)`, with indices counted from 0 in
  list order.

  Raises:
    ValueError: if the class body has no `_values_` entry, or if building
      the constants raises a TypeError (e.g. `_values_` is not iterable).
  """

  def __new__(metaclass, name, bases, cls_dict):
    new_cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict)
    # The abstract base itself carries no constants; only subclasses do.
    if name != 'Enum':
      try:
        constant_names = cls_dict['_values_']
        for index, constant in enumerate(constant_names):
          setattr(new_cls, constant, new_cls.from_param(index))
      except KeyError:
        raise ValueError('No _values_ list found inside enum type.')
      except TypeError:
        raise ValueError('_values_ must be a list of names of enum constants.')
    return new_cls
66
def with_metaclass(mcls):
    """Return a class decorator that rebuilds the decorated class via `mcls`.

    This gives one spelling for "use this metaclass" that works on both
    Python 2 and Python 3: the decorated class is thrown away and a new class
    with the same name, bases and body is created by calling `mcls`.
    """
    def decorator(cls):
        # Copy the class body, leaving out the per-instance descriptors that
        # the default class creation adds; the new class gets its own.
        namespace = {
            key: value
            for key, value in vars(cls).items()
            if key not in ('__dict__', '__weakref__')
        }
        return mcls(cls.__name__, cls.__bases__, namespace)
    return decorator
75
@with_metaclass(EnumMetaclass)
class Enum(ctypes.c_uint):
  """Base class for C enums exposed through ctypes.

  Subclasses define `_values_`, a list of constant names; the metaclass turns
  each name into a class attribute holding the enum instance for its index.
  Instances compare and hash by their integer `value`.
  """

  @classmethod
  def from_param(cls, param):
    """Convert `param` into an instance of this enum type.

    Args:
      param: either an instance of this exact enum class, or an integer index
        into `_values_`.

    Returns:
      `param` itself if it is already an instance of `cls`, otherwise a new
      `cls` wrapping the integer.

    Raises:
      ValueError: if `param` is an enum of a different class, or an integer
        outside `0 .. len(_values_) - 1`.
    """
    if isinstance(param, Enum):
      if param.__class__ != cls:
        raise ValueError("Can't mix enums of different types")
      return param
    # Valid indices stop at len - 1; an index equal to len has no name and
    # would make __repr__ fail later.
    if param < 0 or param >= len(cls._values_):
      raise ValueError('%d is out of range for enum type %s; max %d.' %
                       (param, cls.__name__, len(cls._values_) - 1))
    return cls(param)

  def __eq__(self, other):
    # Defer to Python's default handling for objects without a `value`
    # (e.g. None) instead of raising AttributeError.
    try:
      return self.value == other.value
    except AttributeError:
      return NotImplemented

  def __ne__(self, other):
    try:
      return self.value != other.value
    except AttributeError:
      return NotImplemented

  def __hash__(self):
    return hash(self.value)

  def __repr__(self):
    """Return the constant's name, e.g. 'ELEMENT'."""
    try:
      return self._values_[self.value]
    except IndexError:
      raise IndexError('Value %d is out of range for %r' %
                       (self.value, self._values_))
104
105
class StringPiece(ctypes.Structure):
  """A (pointer, length) pair describing a run of bytes that is not
  NUL-terminated.

  `len(piece)` gives the byte length.  `str(piece)` gives the native string
  type on both Python 2 and Python 3; `bytes(piece)` gives the raw bytes.
  """
  _fields_ = [
      ('data', ctypes.POINTER(ctypes.c_char)),
      ('length', ctypes.c_size_t),
      ]

  def __len__(self):
    return self.length

  def __bytes__(self):
    """Return the raw bytes the piece points at."""
    return ctypes.string_at(self.data, self.length)

  def __str__(self):
    """Return the piece as a native `str`.

    ctypes.string_at returns a byte string.  On Python 2 that already is
    `str`; on Python 3 `__str__` must return text, so the bytes are decoded as
    UTF-8.  Undecodable bytes are replaced rather than raising, because the
    piece may cover arbitrary input.
    """
    raw = ctypes.string_at(self.data, self.length)
    if isinstance(raw, str):
      return raw
    return raw.decode('utf-8', 'replace')
117
118
class SourcePosition(ctypes.Structure):
  """ctypes structure with three unsigned-int fields: line, column, offset."""
  _fields_ = [
      ('line', ctypes.c_uint),
      ('column', ctypes.c_uint),
      ('offset', ctypes.c_uint)
      ]
# Expose the library's exported `kGumboEmptySourcePosition` symbol as a class
# attribute.
SourcePosition.EMPTY = SourcePosition.in_dll(_dll, 'kGumboEmptySourcePosition')
126
127
class AttributeNamespace(Enum):
  """Enum of attribute namespaces, with a namespace URL for each constant."""

  # URLS[i] is the namespace URL for _values_[i].  The XMLNS namespace name
  # defined by the "Namespaces in XML" recommendation ends with a slash.
  URLS = [
      'http://www.w3.org/1999/xhtml',
      'http://www.w3.org/1999/xlink',
      'http://www.w3.org/XML/1998/namespace',
      'http://www.w3.org/2000/xmlns/',
  ]
  _values_ = ['NONE', 'XLINK', 'XML', 'XMLNS']

  def to_url(self):
    """Return the namespace URL at this constant's index in URLS."""
    return self.URLS[self.value]
139
140
class Attribute(ctypes.Structure):
  """ctypes structure describing one attribute of an element.

  Elements of an AttributeVector are read as this type.
  """
  _fields_ = [
      ('namespace', AttributeNamespace),
      # NUL-terminated C strings.
      ('name', ctypes.c_char_p),
      # (pointer, length) pieces; presumably slices of the original input
      # text -- confirm against the C header.
      ('original_name', StringPiece),
      ('value', ctypes.c_char_p),
      ('original_value', StringPiece),
      # Start/end positions of the attribute name and value.
      ('name_start', SourcePosition),
      ('name_end', SourcePosition),
      ('value_start', SourcePosition),
      ('value_end', SourcePosition)
      ]
153
154
class Vector(ctypes.Structure):
  """A C array of pointers plus its length and capacity.

  Supports `len()`, iteration, integer indexing (including negative indices)
  and, via a temporary list, slicing.  Each element is returned as an instance
  of the class attribute `_type_`, which subclasses override to say what the
  stored pointers point at.
  """
  _type_ = ctypes.c_void_p
  _fields_ = [
      ('data', ctypes.POINTER(ctypes.c_void_p)),
      ('length', ctypes.c_uint),
      ('capacity', ctypes.c_uint)
      ]

  # Integer types accepted as a plain index.  Resolved once here so that
  # Python 3 does not raise and catch a NameError on every lookup.
  try:
    # Python 2
    _INDEX_TYPES = (int, long)
  except NameError:
    # Python 3
    _INDEX_TYPES = (int,)

  class Iter(object):
    """Iterator over a Vector that works on both Python 2 and Python 3."""

    def __init__(self, vector):
      self.current = 0
      self.vector = vector

    def __iter__(self):
      return self

    def __next__(self):
      # Python 3
      if self.current >= self.vector.length:
        raise StopIteration
      obj = self.vector[self.current]
      self.current += 1
      return obj

    def next(self):
      # Python 2
      return self.__next__()

  def __len__(self):
    return self.length

  def __getitem__(self, i):
    """Return element `i`, or a list of elements when `i` is a slice.

    Raises:
      IndexError: if an integer index falls outside
        `-len(self) .. len(self) - 1`.
    """
    if isinstance(i, self._INDEX_TYPES):
      length = self.length
      if i < 0:
        i += length
      # Reject both ends: an index equal to the length, or one still negative
      # after adjustment, would read memory outside the array.
      if i < 0 or i >= length:
        raise IndexError('Vector index out of range')
      array_type = ctypes.POINTER(ctypes.POINTER(self._type_))
      return ctypes.cast(self.data, array_type)[i].contents
    # Slices and other index objects are delegated to a temporary list.
    return list(self)[i]

  def __iter__(self):
    return Vector.Iter(self)
206
# Expose the library's exported `kGumboEmptyVector` symbol as a class attribute.
Vector.EMPTY = Vector.in_dll(_dll, 'kGumboEmptyVector')
208
209
class AttributeVector(Vector):
  """Vector whose elements are read as Attribute structures."""
  _type_ = Attribute
212
213
class NodeVector(Vector):
  """Vector whose elements are read as Node structures."""
  # _type_ assigned later, to avoid circular references with Node
  pass
217
218
class QuirksMode(Enum):
  """Enum used for Document.doc_type_quirks_mode."""
  # Constants are numbered from 0 in list order by the Enum metaclass.
  _values_ = ['NO_QUIRKS', 'QUIRKS', 'LIMITED_QUIRKS']
221
222
class Document(ctypes.Structure):
  """ctypes structure for the `document` member of a node's union."""
  _fields_ = [
      ('children', NodeVector),
      ('has_doctype', ctypes.c_bool),
      # NUL-terminated C strings.
      ('name', ctypes.c_char_p),
      ('public_identifier', ctypes.c_char_p),
      ('system_identifier', ctypes.c_char_p),
      ('doc_type_quirks_mode', QuirksMode),
      ]

  def __repr__(self):
    # Constant label; the children are not included.
    return 'Document'
235
236
class Namespace(Enum):
  """Enum of element namespaces, with a namespace URL for each constant."""
  # URLS[i] is the namespace URL for _values_[i].
  URLS = [
      'http://www.w3.org/1999/xhtml',
      'http://www.w3.org/2000/svg',
      'http://www.w3.org/1998/Math/MathML',
  ]
  _values_ = ['HTML', 'SVG', 'MATHML']

  def to_url(self):
    """Return the namespace URL at this constant's index in URLS."""
    return self.URLS[self.value]
247
248
class Tag(Enum):
  """Enum of tags: the names in gumboc_tags.TagNames, then UNKNOWN and LAST."""

  _values_ = gumboc_tags.TagNames + ['UNKNOWN', 'LAST']

  @staticmethod
  def from_str(tagname):
    """Look up the Tag for `tagname` by asking the library.

    The name is encoded as UTF-8 and handed to the `gumbo_tag_enum` binding.
    """
    encoded_name = tagname.encode('utf-8')
    return _tag_enum(ctypes.c_char_p(encoded_name))
256
class Element(ctypes.Structure):
  """ctypes structure for the `element` member of a node's union."""
  _fields_ = [
      ('children', NodeVector),
      ('tag', Tag),
      ('tag_namespace', Namespace),
      ('original_tag', StringPiece),
      ('original_end_tag', StringPiece),
      ('start_pos', SourcePosition),
      ('end_pos', SourcePosition),
      ('attributes', AttributeVector),
      ]

  @staticmethod
  def _native_str(raw):
    """Return byte string `raw` as the interpreter's native `str` type."""
    if isinstance(raw, str):
      # Python 2: byte strings already are `str`.
      return raw
    return raw.decode('utf-8', 'replace')

  @property
  def tag_name(self):
    """Return the element's tag name.

    For SVG elements the library's normalized SVG name is preferred when it
    has one.  For Tag.UNKNOWN the name is taken from the original tag text,
    lower-cased ('' when there is no original text).  Otherwise the library's
    normalized name for `self.tag` is returned exactly as ctypes delivers it
    (a `c_char_p` result, i.e. bytes on Python 3).
    """
    # Work on a copy: the library call below narrows the piece in place.
    original_tag = StringPiece.from_buffer_copy(self.original_tag)
    _tag_from_original_text(ctypes.byref(original_tag))
    if self.tag_namespace == Namespace.SVG:
      svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag))
      if svg_tagname is not None:
        # Decode rather than str(): on Python 3 str(bytes) gives "b'...'".
        return self._native_str(svg_tagname)
    if self.tag == Tag.UNKNOWN:
      # A NULL ctypes pointer field is a falsy pointer object, never None.
      if not original_tag.data:
        return ''
      raw_name = ctypes.string_at(original_tag.data, original_tag.length)
      return self._native_str(raw_name).lower()
    return _tagname(self.tag)

  def __repr__(self):
    return ('<%r>\n' % self.tag +
            '\n'.join(repr(child) for child in self.children) +
            '</%r>' % self.tag)
287
288
class Text(ctypes.Structure):
  """ctypes structure for the `text` member of a node's union."""
  _fields_ = [
      # NUL-terminated C string.
      ('text', ctypes.c_char_p),
      ('original_text', StringPiece),
      ('start_pos', SourcePosition)
      ]

  def __repr__(self):
    # Shows the `text` field only.
    return 'Text(%r)' % self.text
298
299
class NodeType(Enum):
  """Enum stored in Node.type; Node uses it to pick the active union member."""
  # Constants are numbered from 0 in list order by the Enum metaclass.
  _values_ = ['DOCUMENT', 'ELEMENT', 'TEXT', 'CDATA',
              'COMMENT', 'WHITESPACE', 'TEMPLATE']
303
304
class NodeUnion(ctypes.Union):
  """Union holding a node's payload: a Document, an Element or a Text."""
  # Which member is meaningful is decided by Node.type (see Node._contents).
  _fields_ = [
      ('document', Document),
      ('element', Element),
      ('text', Text),
      ]
311
312
class Node(ctypes.Structure):
  """A parse-tree node that forwards attribute access to its payload.

  Names that are not found on the node itself are looked up on the
  Document, Element or Text selected by the node's `type`; attribute
  assignment is forwarded to that payload as well.
  """
  # _fields_ set later to avoid a circular reference

  def _contents(self):
    """Return the union member that matches this node's type."""
    # Python3 enters an infinite loop if you use an @property within
    # __getattr__, so this lives in a plain method.
    node_type = self.type
    if node_type == NodeType.DOCUMENT:
      return self.v.document
    if node_type == NodeType.ELEMENT or node_type == NodeType.TEMPLATE:
      return self.v.element
    # Every remaining node type is stored as text.
    return self.v.text

  @property
  def contents(self):
    """The Document, Element or Text stored in this node."""
    return self._contents()

  def __getattr__(self, name):
    # Only reached when normal lookup on the node fails.
    payload = self._contents()
    return getattr(payload, name)

  def __setattr__(self, name, value):
    payload = self._contents()
    return setattr(payload, name, value)

  def __repr__(self):
    payload = self.contents
    return repr(payload)
338
339
# Node's fields are assigned after the class statement because one of them
# points back at Node itself.
Node._fields_ = [
    ('type', NodeType),
    # Self-referential pointer; this is why _fields_ cannot be set inside the
    # class body.
    ('parent', _Ptr(Node)),
    ('index_within_parent', ctypes.c_size_t),
    # TODO(jdtang): Make a real list of enum constants for this.
    ('parse_flags', _bitvector),
    ('v', NodeUnion)
    ]
# Now that Node is complete, NodeVector can declare its element type.
NodeVector._type_ = Node
350
351
class Options(ctypes.Structure):
  """ctypes structure of parser options.

  parse() fills one of these from its keyword arguments and the library
  defaults, and passes it by reference to the parse and destroy functions.
  """
  _fields_ = [
      # TODO(jdtang): Allow the Python API to set the allocator/deallocator
      # function.  Right now these are treated as opaque void pointers.
      ('allocator', ctypes.c_void_p),
      ('deallocator', ctypes.c_void_p),
      ('userdata', ctypes.c_void_p),
      ('tab_stop', ctypes.c_int),
      ('stop_on_first_error', ctypes.c_bool),
      ('max_errors', ctypes.c_int),
      ('fragment_context', Tag),
      ('fragment_namespace', Namespace),
      ]
365
366
class Output(ctypes.Structure):
  """ctypes structure returned (by pointer) from the parse function."""
  _fields_ = [
      # Both are pointers to Node structures.
      ('document', _Ptr(Node)),
      ('root', _Ptr(Node)),
      # TODO(jdtang): Error type.
      # Until then this is a plain Vector, so elements read as c_void_p.
      ('errors', Vector),
      ]
374
@contextlib.contextmanager
def parse(text, **kwargs):
  """Parse `text` and yield the parse output for the duration of a `with`.

  Args:
    text: the document to parse.  A text string is encoded as UTF-8; a byte
      string is passed to the parser unchanged.
    **kwargs: values for fields of `Options`, by field name.  Fields not
      given are copied from the library's default options.  Names that are
      not `Options` fields are ignored.

  Yields:
    A ctypes pointer to an `Output` structure.  It is destroyed when the
    `with` block exits, so nothing reached through it may be used afterwards.
  """
  options = Options()
  for field_name, _ in Options._fields_:
    if field_name in kwargs:
      field_value = kwargs[field_name]
    else:
      field_value = getattr(_DEFAULT_OPTIONS, field_name)
    setattr(options, field_name, field_value)
  if isinstance(text, bytes):
    encoded = text
  else:
    encoded = text.encode('utf-8')
  # We have to manually take a reference to the input text here so that it
  # outlives the parse output.  If we let ctypes do it automatically on function
  # call, it creates a temporary buffer which is destroyed when the call
  # completes, and then the original_text pointers point into invalid memory.
  text_ptr = ctypes.c_char_p(encoded)
  # The length must be the number of encoded BYTES.  The character count is
  # smaller for non-ASCII text and would cut the input short.
  output = _parse_with_options(ctypes.byref(options), text_ptr, len(encoded))
  try:
    yield output
  finally:
    _destroy_output(ctypes.byref(options), output)
393
# The library's exported default options; parse() copies unspecified fields
# from here.
_DEFAULT_OPTIONS = Options.in_dll(_dll, 'kGumboDefaultOptions')

# Typed bindings for the C functions used above.  Declaring argtypes/restype
# lets ctypes convert and check arguments and return values.

# (options*, text, length) -> Output*
_parse_with_options = _dll.gumbo_parse_with_options
_parse_with_options.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t]
_parse_with_options.restype = _Ptr(Output)

# (StringPiece*) -> nothing; works on the piece passed by reference.
_tag_from_original_text = _dll.gumbo_tag_from_original_text
_tag_from_original_text.argtypes = [_Ptr(StringPiece)]
_tag_from_original_text.restype = None

# (StringPiece*) -> C string; ctypes turns a NULL result into None.
_normalize_svg_tagname = _dll.gumbo_normalize_svg_tagname
_normalize_svg_tagname.argtypes = [_Ptr(StringPiece)]
_normalize_svg_tagname.restype = ctypes.c_char_p

# (options*, Output*) -> nothing; called when parse()'s `with` block exits.
_destroy_output = _dll.gumbo_destroy_output
_destroy_output.argtypes = [_Ptr(Options), _Ptr(Output)]
_destroy_output.restype = None

# (Tag) -> C string
_tagname = _dll.gumbo_normalized_tagname
_tagname.argtypes = [Tag]
_tagname.restype = ctypes.c_char_p

# (C string) -> Tag
_tag_enum = _dll.gumbo_tag_enum
_tag_enum.argtypes = [ctypes.c_char_p]
_tag_enum.restype = Tag

# Names exported by a star-import of this module.  The Enum base class and its
# metaclass are not in the list.
__all__ = ['StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute',
           'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document',
           'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node',
           'Options', 'Output', 'parse']
424