# Configurable Element class lookup

################################################################################
# Custom Element classes

cdef public class ElementBase(_Element) [ type LxmlElementBaseType,
                                          object LxmlElementBase ]:
    u"""ElementBase(*children, attrib=None, nsmap=None, **_extra)

    The public Element class.  All custom Element classes must inherit
    from this one.  To create an Element, use the `Element()` factory.

    BIG FAT WARNING: Subclasses *must not* override __init__ or
    __new__ as it is absolutely undefined when these objects will be
    created or destroyed.  All persistent state of Elements must be
    stored in the underlying XML.  If you really need to initialize
    the object after creation, you can implement an ``_init(self)``
    method that will be called directly after object creation.

    Subclasses of this class can be instantiated to create a new
    Element.  By default, the tag name will be the class name and the
    namespace will be empty.  You can modify this with the following
    class attributes:

    * TAG - the tag name, possibly containing a namespace in Clark
      notation

    * NAMESPACE - the default namespace URI, unless provided as part
      of the TAG attribute.

    * HTML - flag if the class is an HTML tag, as opposed to an XML
      tag.  This only applies to un-namespaced tags and defaults to
      false (i.e. XML).

    * PARSER - the parser that provides the configuration for the
      newly created document.  Providing an HTML parser here will
      default to creating an HTML element.

    In user code, the latter three are commonly inherited in class
    hierarchies that implement a common namespace.
    """
    def __init__(self, *children, attrib=None, nsmap=None, **_extra):
        u"""ElementBase(*children, attrib=None, nsmap=None, **_extra)

        Creates a new Element from the class attributes TAG, NAMESPACE,
        HTML and PARSER and then adds the positional ``children``:
        strings are appended as text (before the first child element)
        or as tail text (after a child element), Elements are appended
        as children, and ElementBase subclasses are instantiated
        without arguments and appended.

        Raises TypeError for any other kind of child.
        """
        cdef bint is_html = 0
        cdef _BaseParser parser
        cdef _Element last_child
        # don't use normal attribute access as it might be overridden
        _getattr = object.__getattribute__
        try:
            namespace = _utf8(_getattr(self, 'NAMESPACE'))
        except AttributeError:
            namespace = None
        try:
            ns, tag = _getNsTag(_getattr(self, 'TAG'))
            if ns is not None:
                # a namespace given in TAG takes precedence over NAMESPACE
                namespace = ns
        except AttributeError:
            # no TAG attribute: fall back to the (unqualified) class name
            tag = _utf8(_getattr(_getattr(self, '__class__'), '__name__'))
            if b'.' in tag:
                tag = tag.split(b'.')[-1]
        try:
            parser = _getattr(self, 'PARSER')
        except AttributeError:
            # no PARSER attribute: inherit the parser of the first Element child
            parser = None
            for child in children:
                if isinstance(child, _Element):
                    parser = (<_Element>child)._doc._parser
                    break
        if isinstance(parser, HTMLParser):
            is_html = 1
        if namespace is None:
            # the HTML flag is only consulted for un-namespaced tags
            try:
                is_html = _getattr(self, 'HTML')
            except AttributeError:
                pass
        _initNewElement(self, is_html, tag, namespace, parser,
                        attrib, nsmap, _extra)
        last_child = None
        for child in children:
            if _isString(child):
                if last_child is None:
                    # no child element yet: extend the text of this Element
                    _setNodeText(self._c_node,
                                 (_collectText(self._c_node.children) or '') + child)
                else:
                    # extend the tail text of the most recently added child
                    _setTailText(last_child._c_node,
                                 (_collectText(last_child._c_node.next) or '') + child)
            elif isinstance(child, _Element):
                last_child = child
                _appendChild(self, last_child)
            elif isinstance(child, type) and issubclass(child, ElementBase):
                last_child = child()
                _appendChild(self, last_child)
            else:
                raise TypeError(f"Invalid child type: {type(child)!r}")

cdef class CommentBase(_Comment):
    u"""All custom Comment classes must inherit from this one.

    To create an XML Comment instance, use the ``Comment()`` factory.

    Subclasses *must not* override __init__ or __new__ as it is
    absolutely undefined when these objects will be created or
    destroyed.  All persistent state of Comments must be stored in the
    underlying XML.  If you really need to initialize the object after
    creation, you can implement an ``_init(self)`` method that will be
    called after object creation.
    """
    def __init__(self, text):
        # copied from Comment() factory
        cdef _Document doc
        cdef xmlDoc* c_doc
        if text is None:
            text = b''
        else:
            text = _utf8(text)
        # create a fresh document, add the new comment node as its child
        # and register this object as the proxy of that node
        c_doc = _newXMLDoc()
        doc = _documentFactory(c_doc, None)
        self._c_node = _createComment(c_doc, _xcstr(text))
        if self._c_node is NULL:
            raise MemoryError()
        tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
        _registerProxy(self, doc, self._c_node)
        self._init()

cdef class PIBase(_ProcessingInstruction):
    u"""All custom Processing Instruction classes must inherit from this one.

    To create an XML ProcessingInstruction instance, use the ``PI()``
    factory.

    Subclasses *must not* override __init__ or __new__ as it is
    absolutely undefined when these objects will be created or
    destroyed.  All persistent state of PIs must be stored in the
    underlying XML.  If you really need to initialize the object after
    creation, you can implement an ``_init(self)`` method that will be
    called after object creation.
    """
    def __init__(self, target, text=None):
        # copied from PI() factory
        cdef _Document doc
        cdef xmlDoc* c_doc
        target = _utf8(target)
        if text is None:
            text = b''
        else:
            text = _utf8(text)
        # create a fresh document, add the new PI node as its child
        # and register this object as the proxy of that node
        c_doc = _newXMLDoc()
        doc = _documentFactory(c_doc, None)
        self._c_node = _createPI(c_doc, _xcstr(target), _xcstr(text))
        if self._c_node is NULL:
            raise MemoryError()
        tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
        _registerProxy(self, doc, self._c_node)
        self._init()

cdef class EntityBase(_Entity):
    u"""All custom Entity classes must inherit from this one.

    To create an XML Entity instance, use the ``Entity()`` factory.

    Subclasses *must not* override __init__ or __new__ as it is
    absolutely undefined when these objects will be created or
    destroyed.  All persistent state of Entities must be stored in the
    underlying XML.  If you really need to initialize the object after
    creation, you can implement an ``_init(self)`` method that will be
    called after object creation.
    """
    def __init__(self, name):
        cdef _Document doc
        cdef xmlDoc* c_doc
        name_utf = _utf8(name)
        c_name = _xcstr(name_utf)
        # a leading '#' marks a character reference, anything else must be
        # a valid XML name
        if c_name[0] == c'#':
            if not _characterReferenceIsValid(c_name + 1):
                raise ValueError(f"Invalid character reference: '{name}'")
        elif not _xmlNameIsValid(c_name):
            raise ValueError(f"Invalid entity reference: '{name}'")
        # create a fresh document, add the new entity node as its child
        # and register this object as the proxy of that node
        c_doc = _newXMLDoc()
        doc = _documentFactory(c_doc, None)
        self._c_node = _createEntity(c_doc, c_name)
        if self._c_node is NULL:
            raise MemoryError()
        tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
        _registerProxy(self, doc, self._c_node)
        self._init()


cdef int _validateNodeClass(xmlNode* c_node, cls) except -1:
    u"""Check that ``cls`` is a class that may represent ``c_node``.

    The required base class depends on the node type: ElementBase,
    CommentBase, EntityBase or PIBase.  Returns 0 on success and raises
    TypeError if ``cls`` is not a subclass of the required base class.
    """
    if c_node.type == tree.XML_ELEMENT_NODE:
        expected = ElementBase
    elif c_node.type == tree.XML_COMMENT_NODE:
        expected = CommentBase
    elif c_node.type == tree.XML_ENTITY_REF_NODE:
        expected = EntityBase
    elif c_node.type == tree.XML_PI_NODE:
        expected = PIBase
    else:
        assert False, f"Unknown node type: {c_node.type}"

    if not (isinstance(cls, type) and issubclass(cls, expected)):
        # report the required base class and the offending object themselves;
        # their types would only show the (uninformative) metaclass
        raise TypeError(
            f"result of class lookup must be subclass of {expected.__name__}, got {cls!r}")
    return 0


################################################################################
# Element class lookup

ctypedef public object (*_element_class_lookup_function)(object, _Document, xmlNode*)

# class to store element class lookup functions
cdef public class ElementClassLookup [ type LxmlElementClassLookupType,
                                       object LxmlElementClassLookup ]:
    u"""ElementClassLookup(self)
    Superclass of Element class lookups.
    """
    # C-level function that implements the lookup; called with the lookup
    # object itself as first ("state") argument
    cdef _element_class_lookup_function _lookup_function


cdef public class FallbackElementClassLookup(ElementClassLookup) \
         [ type LxmlFallbackElementClassLookupType,
           object LxmlFallbackElementClassLookup ]:
    u"""FallbackElementClassLookup(self, fallback=None)

    Superclass of Element class lookups with additional fallback.
    """
    cdef readonly ElementClassLookup fallback
    cdef _element_class_lookup_function _fallback_function
    def __cinit__(self):
        # fall back to default lookup
        self._fallback_function = _lookupDefaultElementClass

    def __init__(self, ElementClassLookup fallback=None):
        if fallback is not None:
            self._setFallback(fallback)
        else:
            self._fallback_function = _lookupDefaultElementClass

    cdef void _setFallback(self, ElementClassLookup lookup):
        u"""Sets the fallback scheme for this lookup method.
        """
        self.fallback = lookup
        self._fallback_function = lookup._lookup_function
        if self._fallback_function is NULL:
            # NOTE(review): in this case the default lookup function will
            # later receive ``self.fallback`` (i.e. ``lookup``) as its state
            # argument - confirm that it copes with a state object that is
            # not an ElementDefaultClassLookup.
            self._fallback_function = _lookupDefaultElementClass

    def set_fallback(self, ElementClassLookup lookup not None):
        u"""set_fallback(self, lookup)

        Sets the fallback scheme for this lookup method.
        """
        self._setFallback(lookup)

cdef inline object _callLookupFallback(FallbackElementClassLookup lookup,
                                       _Document doc, xmlNode* c_node):
    # delegate to the fallback function, passing the fallback lookup object
    # (possibly None) as its state
    return lookup._fallback_function(lookup.fallback, doc, c_node)


################################################################################
# default lookup scheme

cdef class ElementDefaultClassLookup(ElementClassLookup):
    u"""ElementDefaultClassLookup(self, element=None, comment=None, pi=None, entity=None)
    Element class lookup scheme that always returns the default Element
    class.

    The keyword arguments ``element``, ``comment``, ``pi`` and ``entity``
    accept the respective Element classes.
    """
    cdef readonly object element_class
    cdef readonly object comment_class
    cdef readonly object pi_class
    cdef readonly object entity_class
    def __cinit__(self):
        self._lookup_function = _lookupDefaultElementClass

    def __init__(self, element=None, comment=None, pi=None, entity=None):
        if element is None:
            self.element_class = _Element
        elif issubclass(element, ElementBase):
            self.element_class = element
        else:
            raise TypeError(u"element class must be subclass of ElementBase")

        if comment is None:
            self.comment_class = _Comment
        elif issubclass(comment, CommentBase):
            self.comment_class = comment
        else:
            raise TypeError(u"comment class must be subclass of CommentBase")

        if entity is None:
            self.entity_class = _Entity
        elif issubclass(entity, EntityBase):
            self.entity_class = entity
        else:
            raise TypeError(u"Entity class must be subclass of EntityBase")

        if pi is None:
            self.pi_class = None # special case, see below
        elif issubclass(pi, PIBase):
            self.pi_class = pi
        else:
            raise TypeError(u"PI class must be subclass of PIBase")

cdef object _lookupDefaultElementClass(state, _Document _doc, xmlNode* c_node):
    u"Trivial class lookup function that always returns the default class."
    # ``state`` is either None (use the built-in classes) or the lookup
    # object whose configured classes are returned
    if c_node.type == tree.XML_ELEMENT_NODE:
        if state is not None:
            return (<ElementDefaultClassLookup>state).element_class
        else:
            return _Element
    elif c_node.type == tree.XML_COMMENT_NODE:
        if state is not None:
            return (<ElementDefaultClassLookup>state).comment_class
        else:
            return _Comment
    elif c_node.type == tree.XML_ENTITY_REF_NODE:
        if state is not None:
            return (<ElementDefaultClassLookup>state).entity_class
        else:
            return _Entity
    elif c_node.type == tree.XML_PI_NODE:
        if state is None or (<ElementDefaultClassLookup>state).pi_class is None:
            # special case XSLT-PI
            if c_node.name is not NULL and c_node.content is not NULL:
                if tree.xmlStrcmp(c_node.name, <unsigned char*>"xml-stylesheet") == 0:
                    if tree.xmlStrstr(c_node.content, <unsigned char*>"text/xsl") is not NULL or \
                           tree.xmlStrstr(c_node.content, <unsigned char*>"text/xml") is not NULL:
                        return _XSLTProcessingInstruction
            return _ProcessingInstruction
        else:
            return (<ElementDefaultClassLookup>state).pi_class
    else:
        assert False, f"Unknown node type: {c_node.type}"


################################################################################
# attribute based lookup scheme

cdef class AttributeBasedElementClassLookup(FallbackElementClassLookup):
    u"""AttributeBasedElementClassLookup(self, attribute_name, class_mapping, fallback=None)
    Checks an attribute of an Element and looks up the value in a
    class dictionary.

    Arguments:
      - attribute name - '{ns}name' style string
      - class mapping  - Python dict mapping attribute values to Element classes
      - fallback       - optional fallback lookup mechanism

    A None key in the class mapping will be checked if the attribute is
    missing.
    """
    cdef object _class_mapping
    cdef tuple _pytag
    cdef const_xmlChar* _c_ns
    cdef const_xmlChar* _c_name
    def __cinit__(self):
        self._lookup_function = _attribute_class_lookup

    def __init__(self, attribute_name, class_mapping,
                 ElementClassLookup fallback=None):
        # keep the (ns, name) tuple in _pytag; the C string pointers below
        # are taken from its items
        self._pytag = _getNsTag(attribute_name)
        ns, name = self._pytag
        if ns is None:
            self._c_ns = NULL
        else:
            self._c_ns = _xcstr(ns)
        self._c_name = _xcstr(name)
        # work on a private copy of the user provided mapping
        self._class_mapping = dict(class_mapping)

        FallbackElementClassLookup.__init__(self, fallback)

cdef object _attribute_class_lookup(state, _Document doc, xmlNode* c_node):
    u"""Lookup function of AttributeBasedElementClassLookup.

    For element nodes, reads the configured attribute and looks up its
    value in the class mapping.  A class found there is validated against
    the node type and returned; in all other cases, the fallback is called.
    """
    cdef AttributeBasedElementClassLookup lookup
    cdef python.PyObject* dict_result

    lookup = <AttributeBasedElementClassLookup>state
    if c_node.type == tree.XML_ELEMENT_NODE:
        value = _attributeValueFromNsName(
            c_node, lookup._c_ns, lookup._c_name)
        dict_result = python.PyDict_GetItem(lookup._class_mapping, value)
        if dict_result is not NULL:
            cls = <object>dict_result
            _validateNodeClass(c_node, cls)
            return cls
    return _callLookupFallback(lookup, doc, c_node)


################################################################################
#  per-parser lookup scheme

cdef class ParserBasedElementClassLookup(FallbackElementClassLookup):
    u"""ParserBasedElementClassLookup(self, fallback=None)
    Element class lookup based on the XML parser.
    """
    def __cinit__(self):
        self._lookup_function = _parser_class_lookup

cdef object _parser_class_lookup(state, _Document doc, xmlNode* c_node):
    u"""Lookup function of ParserBasedElementClassLookup.

    Delegates to the class lookup of the document's parser if it has one
    and calls the fallback otherwise.
    """
    if doc._parser._class_lookup is not None:
        return doc._parser._class_lookup._lookup_function(
            doc._parser._class_lookup, doc, c_node)
    return _callLookupFallback(<FallbackElementClassLookup>state, doc, c_node)


################################################################################
#  custom class lookup based on node type, namespace, name

cdef class CustomElementClassLookup(FallbackElementClassLookup):
    u"""CustomElementClassLookup(self, fallback=None)
    Element class lookup based on a subclass method.

    You can inherit from this class and override the method::

        lookup(self, type, doc, namespace, name)

    to lookup the element class for a node. Arguments of the method:
    * type:      one of 'element', 'comment', 'PI', 'entity'
    * doc:       document that the node is in
    * namespace: namespace URI of the node (or None for comments/PIs/entities)
    * name:      name of the element/entity, None for comments, target for PIs

    If you return None from this method, the fallback will be called.
    """
    def __cinit__(self):
        self._lookup_function = _custom_class_lookup

    def lookup(self, type, doc, namespace, name):
        u"lookup(self, type, doc, namespace, name)"
        return None

cdef object _custom_class_lookup(state, _Document doc, xmlNode* c_node):
    u"""Lookup function of CustomElementClassLookup.

    Calls the ``lookup()`` method of the lookup object with the node type
    name, the document, the namespace and the name of the node.  A
    non-None result is validated against the node type and returned;
    otherwise, the fallback is called.
    """
    cdef CustomElementClassLookup lookup

    lookup = <CustomElementClassLookup>state

    if c_node.type == tree.XML_ELEMENT_NODE:
        element_type = u"element"
    elif c_node.type == tree.XML_COMMENT_NODE:
        element_type = u"comment"
    elif c_node.type == tree.XML_PI_NODE:
        element_type = u"PI"
    elif c_node.type == tree.XML_ENTITY_REF_NODE:
        element_type = u"entity"
    else:
        # any other node type is reported as "element"
        element_type = u"element"
    if c_node.name is NULL:
        name = None
    else:
        name = funicode(c_node.name)
    c_str = tree._getNs(c_node)
    ns = funicode(c_str) if c_str is not NULL else None

    cls = lookup.lookup(element_type, doc, ns, name)
    if cls is not None:
        _validateNodeClass(c_node, cls)
        return cls
    return _callLookupFallback(lookup, doc, c_node)


################################################################################
# read-only tree based class lookup

cdef class PythonElementClassLookup(FallbackElementClassLookup):
    u"""PythonElementClassLookup(self, fallback=None)
    Element class lookup based on a subclass method.

    This class lookup scheme allows access to the entire XML tree in
    read-only mode.  To use it, re-implement the ``lookup(self, doc,
    root)`` method in a subclass::

        from lxml import etree, pyclasslookup

        class MyElementClass(etree.ElementBase):
            honkey = True

        class MyLookup(pyclasslookup.PythonElementClassLookup):
            def lookup(self, doc, root):
                if root.tag == "sometag":
                    return MyElementClass
                else:
                    for child in root:
                        if child.tag == "someothertag":
                            return MyElementClass
                # delegate to default
                return None

    If you return None from this method, the fallback will be called.

    The first argument is the opaque document instance that contains
    the Element.  The second argument is a lightweight Element proxy
    implementation that is only valid during the lookup.  Do not try
    to keep a reference to it.  Once the lookup is done, the proxy
    will be invalid.

    Also, you cannot wrap such a read-only Element in an ElementTree,
    and you must take care not to keep a reference to them outside of
    the `lookup()` method.

    Note that the API of the Element objects is not complete.  It is
    purely read-only and does not support all features of the normal
    `lxml.etree` API (such as XPath, extended slicing or some
    iteration methods).

    See https://lxml.de/element_classes.html
    """
    def __cinit__(self):
        self._lookup_function = _python_class_lookup

    def lookup(self, doc, element):
        u"""lookup(self, doc, element)

        Override this method to implement your own lookup scheme.
        """
        return None

cdef object _python_class_lookup(state, _Document doc, tree.xmlNode* c_node):
    u"""Lookup function of PythonElementClassLookup.

    Wraps the node in a read-only proxy and passes it to the ``lookup()``
    method of the lookup object.  A non-None result is validated against
    the node type and returned; otherwise, the fallback is called.
    """
    cdef PythonElementClassLookup lookup
    cdef _ReadOnlyProxy proxy
    lookup = <PythonElementClassLookup>state

    proxy = _newReadOnlyProxy(None, c_node)
    try:
        cls = lookup.lookup(doc, proxy)
    finally:
        # release the proxies also when the user provided lookup() raises,
        # so that the documented "only valid during the lookup" contract
        # holds on the error path as well
        _freeReadOnlyProxies(proxy)

    if cls is not None:
        _validateNodeClass(c_node, cls)
        return cls
    return _callLookupFallback(lookup, doc, c_node)

################################################################################
# Global setup

cdef _element_class_lookup_function LOOKUP_ELEMENT_CLASS
cdef object ELEMENT_CLASS_LOOKUP_STATE

cdef void _setElementClassLookupFunction(
    _element_class_lookup_function function, object state):
    u"""Install the global lookup function and its state object.

    Passing NULL as function selects the default lookup
    (DEFAULT_ELEMENT_CLASS_LOOKUP) and ignores ``state``.
    """
    global LOOKUP_ELEMENT_CLASS, ELEMENT_CLASS_LOOKUP_STATE
    if function is NULL:
        state = DEFAULT_ELEMENT_CLASS_LOOKUP
        function = DEFAULT_ELEMENT_CLASS_LOOKUP._lookup_function

    ELEMENT_CLASS_LOOKUP_STATE = state
    LOOKUP_ELEMENT_CLASS = function

def set_element_class_lookup(ElementClassLookup lookup = None):
    u"""set_element_class_lookup(lookup = None)

    Set the global default element class lookup method.

    Passing None, or a lookup object that has no lookup function,
    selects the default lookup scheme.
    """
    if lookup is None or lookup._lookup_function is NULL:
        _setElementClassLookupFunction(NULL, None)
    else:
        _setElementClassLookupFunction(lookup._lookup_function, lookup)

# default setup: parser delegation
cdef ParserBasedElementClassLookup DEFAULT_ELEMENT_CLASS_LOOKUP
DEFAULT_ELEMENT_CLASS_LOOKUP = ParserBasedElementClassLookup()

set_element_class_lookup(DEFAULT_ELEMENT_CLASS_LOOKUP)