#!/usr/local/bin/python3.8
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

# Copyright 2015 Kevin B. Hendricks Stratford Ontario Canada
# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""CTypes bindings for the Gumbo HTML5 parser.

This exports the raw interface of the library as a set of very thin ctypes
wrappers. It's intended to be wrapped by other libraries to provide a more
Pythonic API.
"""
import os
import re
import contextlib
import ctypes
from ctypes.util import find_library
import sigil_gumboc_tags as gumboc_tags

__author__ = 'jdtang@google.com (Jonathan Tang)'

# Optional explicit path to the shared library, taken from the environment.
_sigilgumbolibpath = os.environ.get('SigilGumboLibPath')

# Compiled once at import time; matches an "<?xml ... ?>" style header plus
# any whitespace that follows it, case-insensitively.
_XML_HEADER_RE = re.compile(br'<\s*\?xml\s*[^\?>]*\?*>\s*', flags=re.I)


def _remove_xml_header(data):
    """Return the bytes `data` with every xml header occurrence removed."""
    return _XML_HEADER_RE.sub(b'', data)


if _sigilgumbolibpath is not None:
    try:
        _dll = ctypes.cdll.LoadLibrary(_sigilgumbolibpath)
    except OSError:
        # The configured path could not be loaded: fall back to whatever the
        # platform's library search finds.
        _dll = ctypes.cdll.LoadLibrary(find_library('sigilgumbo'))
else:
    _dll = ctypes.cdll.LoadLibrary(find_library('sigilgumbo'))

# Some aliases for common types.
_bitvector = ctypes.c_uint
_Ptr = ctypes.POINTER

# Types accepted as a plain integer index by Vector.__getitem__.  Resolved
# once here instead of on every lookup.
try:
    # Python 2
    _INTEGER_TYPES = (int, long)  # noqa: F821
except NameError:
    # Python 3
    _INTEGER_TYPES = (int,)


class EnumMetaclass(type(ctypes.c_uint)):
    """Metaclass that turns the names in `_values_` into class attributes.

    For every class except the one named 'Enum', each entry of the class
    body's `_values_` list becomes an attribute whose value is the enum
    instance for that entry's position in the list.
    """

    def __new__(metaclass, name, bases, cls_dict):
        cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict)
        if name == 'Enum':
            # The base class itself carries no constants.
            return cls
        try:
            for i, value in enumerate(cls_dict['_values_']):
                setattr(cls, value, cls.from_param(i))
        except KeyError:
            raise ValueError('No _values_ list found inside enum type.')
        except TypeError:
            raise ValueError('_values_ must be a list of names of enum constants.')
        return cls


def with_metaclass(mcls):
    """Class decorator that rebuilds the decorated class using `mcls`."""
    def decorator(cls):
        body = vars(cls).copy()
        # clean out class body
        body.pop('__dict__', None)
        body.pop('__weakref__', None)
        return mcls(cls.__name__, cls.__bases__, body)
    return decorator


@with_metaclass(EnumMetaclass)
class Enum(ctypes.c_uint):
    """Base class for unsigned-int enums whose constants are named by `_values_`."""

    @classmethod
    def from_param(cls, param):
        """Convert `param` to an instance of this enum type.

        An existing enum instance is returned unchanged if it is of exactly
        this type.  Anything else is treated as an index into `_values_`.

        Raises:
            ValueError: if `param` is an enum of a different type, or an
                index outside `0 .. len(_values_) - 1`.
        """
        if isinstance(param, Enum):
            if param.__class__ != cls:
                raise ValueError("Can't mix enums of different types")
            return param
        # Valid indices stop at len - 1; index == len has no name and would
        # make __repr__ fail later.
        if param < 0 or param >= len(cls._values_):
            raise ValueError('%d is out of range for enum type %s; max %d.' %
                             (param, cls.__name__, len(cls._values_) - 1))
        return cls(param)

    def __eq__(self, other):
        try:
            return self.value == other.value
        except AttributeError:
            # `other` is not enum-like; let Python try the reflected
            # comparison instead of raising.
            return NotImplemented

    def __ne__(self, other):
        try:
            return self.value != other.value
        except AttributeError:
            return NotImplemented

    def __hash__(self):
        return hash(self.value)

    def __repr__(self):
        try:
            return self._values_[self.value]
        except IndexError:
            raise IndexError('Value %d is out of range for %r' %
                             (self.value, self._values_))


class StringPiece(ctypes.Structure):
    """A (pointer, length) pair describing a run of bytes."""

    _fields_ = [
        ('data', _Ptr(ctypes.c_char)),
        ('length', ctypes.c_size_t),
    ]

    def __len__(self):
        return self.length

    def __str__(self):
        return ctypes.string_at(self.data, self.length).decode('utf-8')

    def __bytes__(self):
        return ctypes.string_at(self.data, self.length)


class SourcePosition(ctypes.Structure):
    """Line / column / offset triple."""

    _fields_ = [
        ('line', ctypes.c_uint),
        ('column', ctypes.c_uint),
        ('offset', ctypes.c_uint)
    ]


SourcePosition.EMPTY = SourcePosition.in_dll(_dll, 'kGumboEmptySourcePosition')


class AttributeNamespace(Enum):
    """Namespace of an attribute; `to_url` maps the value into `URLS`."""

    URLS = [
        'http://www.w3.org/1999/xhtml',
        'http://www.w3.org/1999/xlink',
        'http://www.w3.org/XML/1998/namespace',
        'http://www.w3.org/2000/xmlns',
    ]
    _values_ = ['NONE', 'XLINK', 'XML', 'XMLNS']

    def to_url(self):
        return self.URLS[self.value]


class OutputStatus(Enum):
    """Status stored in `Output.status`; `to_string` gives its message."""

    STATUS_MSG = [
        'OK',
        'Document tree depth limit exceeded',
        'System allocator returned NULL during parsing',
    ]
    _values_ = ['STATUS_OK', 'STATUS_TREE_TOO_DEEP', 'STATUS_OUT_OF_MEMORY']

    def to_string(self):
        return self.STATUS_MSG[self.value]


class Attribute(ctypes.Structure):
    """One attribute: namespace, name/value and their source positions."""

    _fields_ = [
        ('namespace', AttributeNamespace),
        ('name', ctypes.c_char_p),
        ('original_name', StringPiece),
        ('value', ctypes.c_char_p),
        ('original_value', StringPiece),
        ('name_start', SourcePosition),
        ('name_end', SourcePosition),
        ('value_start', SourcePosition),
        ('value_end', SourcePosition)
    ]


class Vector(ctypes.Structure):
    """Array of pointers with a length and capacity.

    Subclasses set `_type_` to the type each stored pointer refers to;
    indexing dereferences the pointer and returns that object.
    """

    _type_ = ctypes.c_void_p
    _fields_ = [
        ('data', _Ptr(ctypes.c_void_p)),
        ('length', ctypes.c_uint),
        ('capacity', ctypes.c_uint)
    ]

    class Iter(object):
        """Iterator over a Vector's elements, in index order."""

        def __init__(self, vector):
            self.current = 0
            self.vector = vector

        def __iter__(self):
            return self

        def __next__(self):
            # Python 3
            if self.current >= self.vector.length:
                raise StopIteration
            obj = self.vector[self.current]
            self.current += 1
            return obj

        def next(self):
            # Python 2
            return self.__next__()

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        """Return element `i`, or a list of elements when `i` is not an integer.

        Negative integers count from the end.

        Raises:
            IndexError: if an integer index falls outside the vector.
        """
        if isinstance(i, _INTEGER_TYPES):
            if i < 0:
                i += self.length
            # Reject both ends: i == length is one past the last element,
            # and i may still be negative after the adjustment above.
            if not 0 <= i < self.length:
                raise IndexError('Vector index out of range')
            array_type = _Ptr(_Ptr(self._type_))
            return ctypes.cast(self.data, array_type)[i].contents
        # Anything else (e.g. a slice) is applied to a materialised list.
        return list(self)[i]

    def __iter__(self):
        return Vector.Iter(self)


Vector.EMPTY = Vector.in_dll(_dll, 'kGumboEmptyVector')


class AttributeVector(Vector):
    """Vector whose elements are `Attribute` structures."""

    _type_ = Attribute


class NodeVector(Vector):
    """Vector whose elements are `Node` structures."""
    # _type_ assigned later, to avoid circular references with Node


class QuirksMode(Enum):
    _values_ = ['NO_QUIRKS', 'QUIRKS', 'LIMITED_QUIRKS']


class Document(ctypes.Structure):
    """Payload of a node whose type is `NodeType.DOCUMENT`."""

    _fields_ = [
        ('children', NodeVector),
        ('has_doctype', ctypes.c_bool),
        ('name', ctypes.c_char_p),
        ('public_identifier', ctypes.c_char_p),
        ('system_identifier', ctypes.c_char_p),
        ('doc_type_quirks_mode', QuirksMode),
    ]

    def __repr__(self):
        return 'Document'


class Namespace(Enum):
    """Namespace of an element tag; `to_url` maps the value into `URLS`."""

    URLS = [
        'http://www.w3.org/1999/xhtml',
        'http://www.w3.org/2000/svg',
        'http://www.w3.org/1998/Math/MathML',
    ]
    _values_ = ['HTML', 'SVG', 'MATHML']

    def to_url(self):
        return self.URLS[self.value]


class Tag(Enum):
    """Tag enum: the names from `gumboc_tags.TagNames`, then UNKNOWN and LAST."""

    @staticmethod
    def from_str(tagname):
        """Return the `Tag` the library reports for the text `tagname`."""
        text_ptr = ctypes.c_char_p(tagname.encode('utf-8'))
        return _tag_enum(text_ptr)

    _values_ = gumboc_tags.TagNames + ['UNKNOWN', 'LAST']


class Element(ctypes.Structure):
    """Payload of a node whose type is ELEMENT or TEMPLATE."""

    _fields_ = [
        ('children', NodeVector),
        ('tag', Tag),
        ('tag_namespace', Namespace),
        ('original_tag', StringPiece),
        ('original_end_tag', StringPiece),
        ('start_pos', SourcePosition),
        ('end_pos', SourcePosition),
        ('attributes', AttributeVector),
    ]

    @property
    def tag_name(self):
        """Return this element's tag name as bytes.

        SVG elements use the library's normalized SVG name when it supplies
        one.  Unknown tags use the lower-cased text of the original tag, or
        b'' when that text is unavailable.  All other tags use the library's
        normalized name for `self.tag`.
        """
        # Work on a copy: the library call below modifies the piece in place.
        original_tag = StringPiece.from_buffer_copy(self.original_tag)
        _tag_from_original_text(ctypes.byref(original_tag))
        if self.tag_namespace == Namespace.SVG:
            svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag))
            if svg_tagname is not None:
                return bytes(svg_tagname)
        if self.tag == Tag.UNKNOWN:
            # A POINTER field is never None; a NULL pointer is detected by
            # its truth value.
            if not original_tag.data:
                return b''
            return bytes(original_tag).decode('utf-8').lower().encode('utf-8')
        return _tagname(self.tag)

    def __repr__(self):
        return ('<%r>\n' % self.tag +
                '\n'.join(repr(child) for child in self.children) +
                '</%r>' % self.tag)


class Text(ctypes.Structure):
    """Payload of every node that is neither a document nor an element."""

    _fields_ = [
        ('text', ctypes.c_char_p),
        ('original_text', StringPiece),
        ('start_pos', SourcePosition)
    ]

    def __repr__(self):
        return 'Text(%r)' % self.text


class NodeType(Enum):
    _values_ = ['DOCUMENT', 'ELEMENT', 'TEXT', 'CDATA',
                'COMMENT', 'WHITESPACE', 'TEMPLATE']


class NodeUnion(ctypes.Union):
    """Union of the three possible node payloads."""

    _fields_ = [
        ('document', Document),
        ('element', Element),
        ('text', Text),
    ]


class Node(ctypes.Structure):
    """A parse-tree node.

    Attributes that are not fields of the node itself are looked up on the
    payload selected by `type`; all attribute assignment is forwarded to
    that payload.
    """
    # _fields_ set later to avoid a circular reference

    def _contents(self):
        # Python3 enters an infinite loop if you use an @property within
        # __getattr__, so we factor it out to a helper.
        if self.type == NodeType.DOCUMENT:
            return self.v.document
        elif self.type in (NodeType.ELEMENT, NodeType.TEMPLATE):
            return self.v.element
        else:
            return self.v.text

    @property
    def contents(self):
        """The union member that matches this node's `type`."""
        return self._contents()

    def __getattr__(self, name):
        return getattr(self._contents(), name)

    def __setattr__(self, name, value):
        return setattr(self._contents(), name, value)

    def __repr__(self):
        return repr(self.contents)


Node._fields_ = [
    ('type', NodeType),
    # Set the type to Node later to avoid a circular dependency.
    ('parent', _Ptr(Node)),
    ('index_within_parent', ctypes.c_uint),
    # TODO(jdtang): Make a real list of enum constants for this.
    ('parse_flags', _bitvector),
    ('v', NodeUnion)
]
NodeVector._type_ = Node


class Options(ctypes.Structure):
    """Parser options; each field name is also accepted as a `parse` keyword."""

    _fields_ = [
        ('tab_stop', ctypes.c_int),
        ('use_xhtml_rules', ctypes.c_bool),
        ('stop_on_first_error', ctypes.c_bool),
        ('max_tree_depth', ctypes.c_uint),
        ('max_errors', ctypes.c_int),
    ]


class Output(ctypes.Structure):
    """Result structure returned (by pointer) from the parse functions."""

    _fields_ = [
        ('document', _Ptr(Node)),
        ('root', _Ptr(Node)),
        ('status', OutputStatus),
        # TODO(jdtang): Error type.
        ('errors', Vector),
    ]


# Important Note: gumbo only supports the utf-8 encoding
# Also gumbo is an html5 parser and does not grok xml pi headers
@contextlib.contextmanager
def parse(text, **kwargs):
    """Parse `text` and yield a pointer to the resulting `Output`.

    Used as a context manager; the output is destroyed when the `with`
    block exits, so nothing reached through it may be used afterwards.

    Args:
        text: the markup, as `str` (encoded to utf-8 here) or utf-8 bytes.
            Any xml header is stripped before parsing.
        **kwargs: `container` (a `Tag`, default `Tag.LAST`),
            `container_namespace` (a `Namespace`, default `Namespace.HTML`),
            and any `Options` field name.  Options not given are copied
            from the library's default options.
    """
    options = Options()
    context_tag = kwargs.get('container', Tag.LAST)
    context_namespace = kwargs.get('container_namespace', Namespace.HTML)
    for field_name, _ in Options._fields_:
        if field_name in kwargs:
            field_value = kwargs[field_name]
        else:
            field_value = getattr(_DEFAULT_OPTIONS, field_name)
        setattr(options, field_name, field_value)
    # We have to manually take a reference to the input text here so that it
    # outlives the parse output. If we let ctypes do it automatically on function
    # call, it creates a temporary buffer which is destroyed when the call
    # completes, and then the original_text pointers point into invalid memory.
    # convert string to be utf-8 encoded
    if isinstance(text, str):
        text = text.encode('utf-8')
    text = _remove_xml_header(text)
    text_ptr = ctypes.c_char_p(text)
    output = _parse_fragment(
        ctypes.byref(options), text_ptr, len(text),
        context_tag, context_namespace)
    try:
        yield output
    finally:
        _destroy_output(output)


_DEFAULT_OPTIONS = Options.in_dll(_dll, 'kGumboDefaultOptions')

# Foreign function declarations: argument and result types for each library
# entry point used above.
_parse_with_options = _dll.gumbo_parse_with_options
_parse_with_options.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t]
_parse_with_options.restype = _Ptr(Output)

_parse_fragment = _dll.gumbo_parse_fragment
_parse_fragment.argtypes = [
    _Ptr(Options), ctypes.c_char_p, ctypes.c_size_t, Tag, Namespace]
_parse_fragment.restype = _Ptr(Output)

_tag_from_original_text = _dll.gumbo_tag_from_original_text
_tag_from_original_text.argtypes = [_Ptr(StringPiece)]
_tag_from_original_text.restype = None

_normalize_svg_tagname = _dll.gumbo_normalize_svg_tagname
_normalize_svg_tagname.argtypes = [_Ptr(StringPiece)]
_normalize_svg_tagname.restype = ctypes.c_char_p

_destroy_output = _dll.gumbo_destroy_output
_destroy_output.argtypes = [_Ptr(Output)]
_destroy_output.restype = None

_tagname = _dll.gumbo_normalized_tagname
_tagname.argtypes = [Tag]
_tagname.restype = ctypes.c_char_p

_tag_enum = _dll.gumbo_tag_enum
_tag_enum.argtypes = [ctypes.c_char_p]
_tag_enum.restype = Tag

__all__ = ['StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute',
           'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document',
           'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node',
           'Options', 'Output', 'parse']