# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""CTypes bindings for the Gumbo HTML5 parser.

This exports the raw interface of the library as a set of very thin ctypes
wrappers.  It's intended to be wrapped by other libraries to provide a more
Pythonic API.
"""

__author__ = 'jdtang@google.com (Jonathan Tang)'

import sys
import contextlib
import ctypes
import os.path
import gumboc_tags

# Pick the platform-specific file name of the shared library.
_name_of_lib = 'libgumbo.so'
if sys.platform.startswith('darwin'):
    _name_of_lib = 'libgumbo.dylib'
elif sys.platform.startswith('win'):
    _name_of_lib = "gumbo.dll"


def _load_gumbo_library(lib_name):
    """Load the Gumbo shared library, trying each known location in order.

    The previous implementation chained two ``except OSError`` clauses on one
    ``try``; the second clause could never run, so a failure of the second
    location propagated instead of falling back to the system library.

    Args:
        lib_name: File name of the shared library for this platform.

    Returns:
        The loaded ``ctypes.CDLL`` object.

    Raises:
        OSError: If the library cannot be loaded from any location.
    """
    module_dir = os.path.dirname(__file__)
    local_candidates = [
        # A freshly-built library in the .libs directory, for development.
        os.path.join(module_dir, '..', '..', '.libs', lib_name),
        # PyPI or setuptools install: look next to this module.
        os.path.join(module_dir, lib_name),
    ]
    for path in local_candidates:
        try:
            return ctypes.cdll.LoadLibrary(path)
        except OSError:
            continue
    # System library, on unix or mac osx.  This is the last resort, so let
    # its OSError propagate to the importer.
    return ctypes.cdll.LoadLibrary(lib_name)


_dll = _load_gumbo_library(_name_of_lib)

# Some aliases for common types.
_bitvector = ctypes.c_uint
_Ptr = ctypes.POINTER


class EnumMetaclass(type(ctypes.c_uint)):
    """Metaclass that turns a ``_values_`` list into enum constants.

    For every class except the one named ``Enum``, each name in ``_values_``
    becomes a class attribute holding an instance of the class whose integer
    value is the name's position in the list.
    """

    def __new__(metaclass, name, bases, cls_dict):
        cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict)
        if name == 'Enum':
            # The abstract base has no _values_ of its own.
            return cls
        try:
            for i, value in enumerate(cls_dict['_values_']):
                setattr(cls, value, cls.from_param(i))
        except KeyError:
            raise ValueError('No _values_ list found inside enum type.')
        except TypeError:
            raise ValueError(
                '_values_ must be a list of names of enum constants.')
        return cls


def with_metaclass(mcls):
    """Class decorator that rebuilds a class using metaclass ``mcls``.

    This spelling works on both Python 2 and Python 3, whose metaclass
    syntaxes are mutually incompatible.
    """
    def decorator(cls):
        body = vars(cls).copy()
        # clean out class body
        body.pop('__dict__', None)
        body.pop('__weakref__', None)
        return mcls(cls.__name__, cls.__bases__, body)
    return decorator


@with_metaclass(EnumMetaclass)
class Enum(ctypes.c_uint):
    """Base class for C enums, stored as an unsigned int.

    Subclasses list their constant names, in value order, in ``_values_``.
    """

    @classmethod
    def from_param(cls, param):
        """Convert ``param`` into an instance of this enum type.

        Args:
            param: An instance of this enum class, or an integer index into
                ``_values_``.

        Returns:
            ``param`` itself if it already is an instance of ``cls``,
            otherwise ``cls(param)``.

        Raises:
            ValueError: If ``param`` belongs to a different enum class or is
                not a valid index into ``_values_``.
        """
        if isinstance(param, Enum):
            if param.__class__ != cls:
                raise ValueError("Can't mix enums of different types")
            return param
        # Valid values are 0 .. len(_values_) - 1; the upper bound itself is
        # not a member of the enum.
        if param < 0 or param >= len(cls._values_):
            raise ValueError('%d is out of range for enum type %s; max %d.' %
                             (param, cls.__name__, len(cls._values_) - 1))
        return cls(param)

    def __eq__(self, other):
        try:
            return self.value == other.value
        except AttributeError:
            # Not something carrying a ``value``; let Python decide.
            return NotImplemented

    def __ne__(self, other):
        try:
            return self.value != other.value
        except AttributeError:
            return NotImplemented

    def __hash__(self):
        return hash(self.value)

    def __repr__(self):
        try:
            return self._values_[self.value]
        except IndexError:
            raise IndexError('Value %d is out of range for %r' %
                             (self.value, self._values_))


class StringPiece(ctypes.Structure):
    """A (pointer, length) pair describing a run of characters."""

    _fields_ = [
        ('data', _Ptr(ctypes.c_char)),
        ('length', ctypes.c_size_t),
    ]

    def __len__(self):
        return self.length

    def __str__(self):
        """Return the referenced characters as a native ``str``.

        On Python 3 ``ctypes.string_at`` yields ``bytes``, which ``__str__``
        is not allowed to return, so the data is decoded as UTF-8 there.
        """
        if not self.data or not self.length:
            # NULL pointer or empty slice: nothing to read.
            return ''
        raw = ctypes.string_at(self.data, self.length)
        if isinstance(raw, str):
            # Python 2: bytes is the native string type.
            return raw
        return raw.decode('utf-8', 'replace')


class SourcePosition(ctypes.Structure):
    """Line/column/offset triple locating a point in the source text."""

    _fields_ = [
        ('line', ctypes.c_uint),
        ('column', ctypes.c_uint),
        ('offset', ctypes.c_uint)
    ]
SourcePosition.EMPTY = SourcePosition.in_dll(_dll, 'kGumboEmptySourcePosition')


class AttributeNamespace(Enum):
    """Namespace of an attribute; ``URLS`` is indexed by the enum value."""

    URLS = [
        'http://www.w3.org/1999/xhtml',
        'http://www.w3.org/1999/xlink',
        'http://www.w3.org/XML/1998/namespace',
        'http://www.w3.org/2000/xmlns',
    ]
    _values_ = ['NONE', 'XLINK', 'XML', 'XMLNS']

    def to_url(self):
        """Return the namespace URL for this enum value."""
        return self.URLS[self.value]


class Attribute(ctypes.Structure):
    """A single attribute: name, value, and their source positions."""

    _fields_ = [
        ('namespace', AttributeNamespace),
        ('name', ctypes.c_char_p),
        ('original_name', StringPiece),
        ('value', ctypes.c_char_p),
        ('original_value', StringPiece),
        ('name_start', SourcePosition),
        ('name_end', SourcePosition),
        ('value_start', SourcePosition),
        ('value_end', SourcePosition)
    ]


class Vector(ctypes.Structure):
    """A growable array of pointers, exposed as a read-only sequence.

    Subclasses set ``_type_`` to the structure type the elements point to.
    """

    _type_ = ctypes.c_void_p
    _fields_ = [
        ('data', _Ptr(ctypes.c_void_p)),
        ('length', ctypes.c_uint),
        ('capacity', ctypes.c_uint)
    ]

    class Iter(object):
        """Iterator over a Vector that works on Python 2 and Python 3."""

        def __init__(self, vector):
            self.current = 0
            self.vector = vector

        def __iter__(self):
            return self

        def __next__(self):
            # Python 3
            if self.current >= self.vector.length:
                raise StopIteration
            obj = self.vector[self.current]
            self.current += 1
            return obj

        def next(self):
            # Python 2
            return self.__next__()

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        """Return the element at integer index ``i``, or a list for a slice.

        Raises:
            IndexError: If an integer index falls outside the vector.
        """
        try:
            # Python 2
            numeric_types = (int, long)
        except NameError:
            # Python 3
            numeric_types = int

        if isinstance(i, numeric_types):
            if i < 0:
                i += self.length
            # Reject both i == length and indices that are still negative
            # after wrap-around; either would read outside the array.
            if not 0 <= i < self.length:
                raise IndexError('Vector index out of range')
            array_type = _Ptr(_Ptr(self._type_))
            return ctypes.cast(self.data, array_type)[i].contents
        return list(self)[i]

    def __iter__(self):
        return Vector.Iter(self)


Vector.EMPTY = Vector.in_dll(_dll, 'kGumboEmptyVector')


class AttributeVector(Vector):
    """Vector whose elements are Attribute structures."""

    _type_ = Attribute


class NodeVector(Vector):
    # _type_ assigned later, to avoid circular references with Node
    pass


class QuirksMode(Enum):
    _values_ = ['NO_QUIRKS', 'QUIRKS', 'LIMITED_QUIRKS']


class Document(ctypes.Structure):
    """Contents of a document node."""

    _fields_ = [
        ('children', NodeVector),
        ('has_doctype', ctypes.c_bool),
        ('name', ctypes.c_char_p),
        ('public_identifier', ctypes.c_char_p),
        ('system_identifier', ctypes.c_char_p),
        ('doc_type_quirks_mode', QuirksMode),
    ]

    def __repr__(self):
        return 'Document'


class Namespace(Enum):
    """Namespace of an element; ``URLS`` is indexed by the enum value."""

    URLS = [
        'http://www.w3.org/1999/xhtml',
        'http://www.w3.org/2000/svg',
        'http://www.w3.org/1998/Math/MathML',
    ]
    _values_ = ['HTML', 'SVG', 'MATHML']

    def to_url(self):
        """Return the namespace URL for this enum value."""
        return self.URLS[self.value]


class Tag(Enum):
    """Enum of tag names, followed by the UNKNOWN and LAST markers."""

    @staticmethod
    def from_str(tagname):
        """Return the Tag the library associates with the name ``tagname``."""
        text_ptr = ctypes.c_char_p(tagname.encode('utf-8'))
        return _tag_enum(text_ptr)

    _values_ = gumboc_tags.TagNames + ['UNKNOWN', 'LAST']


class Element(ctypes.Structure):
    """Contents of an element (or template) node."""

    _fields_ = [
        ('children', NodeVector),
        ('tag', Tag),
        ('tag_namespace', Namespace),
        ('original_tag', StringPiece),
        ('original_end_tag', StringPiece),
        ('start_pos', SourcePosition),
        ('end_pos', SourcePosition),
        ('attributes', AttributeVector),
    ]

    @property
    def tag_name(self):
        """Return the element's tag name as a byte string.

        Every path returns the same type that the library's normalized-name
        lookup returns (``str`` on Python 2, ``bytes`` on Python 3).  For SVG
        elements the library's normalized SVG name is preferred; for unknown
        tags the lower-cased original tag text is used.
        """
        # Work on a copy so the library can trim it in place without
        # modifying the parse tree.
        original_tag = StringPiece.from_buffer_copy(self.original_tag)
        _tag_from_original_text(ctypes.byref(original_tag))
        if self.tag_namespace == Namespace.SVG:
            svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag))
            if svg_tagname is not None:
                return svg_tagname
        if self.tag == Tag.UNKNOWN:
            # A ctypes pointer field is never None; a NULL pointer is falsy.
            if not original_tag.data:
                return b''
            return ctypes.string_at(
                original_tag.data, original_tag.length).lower()
        return _tagname(self.tag)

    def __repr__(self):
        return ('<%r>\n' % self.tag +
                '\n'.join(repr(child) for child in self.children) +
                '</%r>' % self.tag)


class Text(ctypes.Structure):
    """Contents of a text-like node."""

    _fields_ = [
        ('text', ctypes.c_char_p),
        ('original_text', StringPiece),
        ('start_pos', SourcePosition)
    ]

    def __repr__(self):
        return 'Text(%r)' % self.text


class NodeType(Enum):
    _values_ = ['DOCUMENT', 'ELEMENT', 'TEXT', 'CDATA',
                'COMMENT', 'WHITESPACE', 'TEMPLATE']


class NodeUnion(ctypes.Union):
    """Union of the possible node payloads; Node.type selects the member."""

    _fields_ = [
        ('document', Document),
        ('element', Element),
        ('text', Text),
    ]


class Node(ctypes.Structure):
    """A parse-tree node that proxies attribute access to its payload."""

    # _fields_ set later to avoid a circular reference

    def _contents(self):
        # Python3 enters an infinite loop if you use an @property within
        # __getattr__, so we factor it out to a helper.
        if self.type == NodeType.DOCUMENT:
            return self.v.document
        elif self.type in (NodeType.ELEMENT, NodeType.TEMPLATE):
            return self.v.element
        else:
            return self.v.text

    @property
    def contents(self):
        """The Document, Element or Text payload selected by ``type``."""
        return self._contents()

    def __getattr__(self, name):
        # Only reached when normal lookup fails: defer to the payload.
        return getattr(self._contents(), name)

    def __setattr__(self, name, value):
        # Attribute assignment is forwarded to the payload.
        return setattr(self._contents(), name, value)

    def __repr__(self):
        return repr(self.contents)


Node._fields_ = [
    ('type', NodeType),
    # Set the type to Node later to avoid a circular dependency.
    ('parent', _Ptr(Node)),
    ('index_within_parent', ctypes.c_size_t),
    # TODO(jdtang): Make a real list of enum constants for this.
    ('parse_flags', _bitvector),
    ('v', NodeUnion)
]
NodeVector._type_ = Node


class Options(ctypes.Structure):
    """Parser options passed to the library."""

    _fields_ = [
        # TODO(jdtang): Allow the Python API to set the allocator/deallocator
        # function.  Right now these are treated as opaque void pointers.
        ('allocator', ctypes.c_void_p),
        ('deallocator', ctypes.c_void_p),
        ('userdata', ctypes.c_void_p),
        ('tab_stop', ctypes.c_int),
        ('stop_on_first_error', ctypes.c_bool),
        ('max_errors', ctypes.c_int),
        ('fragment_context', Tag),
        ('fragment_namespace', Namespace),
    ]


class Output(ctypes.Structure):
    """Result of a parse: document node, root node, and errors."""

    _fields_ = [
        ('document', _Ptr(Node)),
        ('root', _Ptr(Node)),
        # TODO(jdtang): Error type.
        ('errors', Vector),
    ]


@contextlib.contextmanager
def parse(text, **kwargs):
    """Parse ``text`` and yield the library's output for the ``with`` body.

    The output is destroyed when the ``with`` block exits, so nothing taken
    from it may be used afterwards.

    Args:
        text: The HTML to parse.  Text is encoded as UTF-8; a byte string is
            passed through unchanged.
        **kwargs: Values for any of the fields of ``Options``.  Fields that
            are not given take their value from the library's defaults.

    Yields:
        A pointer to the ``Output`` structure.
    """
    options = Options()
    for field_name, _ in Options._fields_:
        if field_name in kwargs:
            field_value = kwargs[field_name]
        else:
            field_value = getattr(_DEFAULT_OPTIONS, field_name)
        setattr(options, field_name, field_value)
    # The length handed to the library must count bytes of the encoded
    # buffer, not characters of the original text, or non-ASCII input is
    # truncated.
    encoded = text if isinstance(text, bytes) else text.encode('utf-8')
    # We have to manually take a reference to the input text here so that it
    # outlives the parse output.  If we let ctypes do it automatically on
    # function call, it creates a temporary buffer which is destroyed when the
    # call completes, and then the original_text pointers point into invalid
    # memory.
    text_ptr = ctypes.c_char_p(encoded)
    output = _parse_with_options(
        ctypes.byref(options), text_ptr, len(encoded))
    try:
        yield output
    finally:
        _destroy_output(ctypes.byref(options), output)


_DEFAULT_OPTIONS = Options.in_dll(_dll, 'kGumboDefaultOptions')

# Prototypes for the C functions used above.
_parse_with_options = _dll.gumbo_parse_with_options
_parse_with_options.argtypes = [
    _Ptr(Options), ctypes.c_char_p, ctypes.c_size_t]
_parse_with_options.restype = _Ptr(Output)

_tag_from_original_text = _dll.gumbo_tag_from_original_text
_tag_from_original_text.argtypes = [_Ptr(StringPiece)]
_tag_from_original_text.restype = None

_normalize_svg_tagname = _dll.gumbo_normalize_svg_tagname
_normalize_svg_tagname.argtypes = [_Ptr(StringPiece)]
_normalize_svg_tagname.restype = ctypes.c_char_p

_destroy_output = _dll.gumbo_destroy_output
_destroy_output.argtypes = [_Ptr(Options), _Ptr(Output)]
_destroy_output.restype = None

_tagname = _dll.gumbo_normalized_tagname
_tagname.argtypes = [Tag]
_tagname.restype = ctypes.c_char_p

_tag_enum = _dll.gumbo_tag_enum
_tag_enum.argtypes = [ctypes.c_char_p]
_tag_enum.restype = Tag

__all__ = ['StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute',
           'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document',
           'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node',
           'Options', 'Output', 'parse']